def reverse_complement_optimized(seq):
    """
    Generate the reverse complement of a DNA sequence.

    Purpose:
        Reverse the sequence and then swap each nucleotide for its
        base-pairing partner (A<->T, C<->G). Letter case is preserved, so
        lowercase (e.g. soft-masked) bases are complemented to lowercase.

    Input:
        seq (str): A DNA sequence made of the characters 'A', 'T', 'C', 'G'
                   in either case.

    Output:
        str: The reverse complement of the input DNA sequence. An empty
             input yields an empty string.

    Notes:
        - Uses str.maketrans and translate() so the complement is done in a
          single pass.
        - Characters that are not one of the eight mapped bases (for example
          'N' or '-') are kept as they are, only their position is reversed.
    """
    # Map both cases explicitly; translate() leaves unmapped characters
    # untouched, so lowercase bases would otherwise pass through
    # uncomplemented.
    translation_table = str.maketrans('ATCGatcg', 'TAGCtagc')
    return seq[::-1].translate(translation_table)

def find_pattern_positions(pattern, genome):
    """
    Find all starting positions of a pattern within a genome string.

    Purpose:
        Locate every occurrence of the substring `pattern` inside the larger
        string `genome`. Overlapping matches are allowed and will be reported.

    Input:
        pattern (str): The substring to search for.
        genome (str): The larger text (e.g., a DNA sequence) in which the
                      pattern will be searched.

    Output:
        str: A space-separated string of all starting indices (0-based), in
             increasing order, where the pattern occurs in the genome. The
             string is empty when there is no occurrence or when `pattern`
             is empty.
    """
    # str.find('') succeeds at every index, which would report every
    # position in the genome as a match; an empty pattern has no
    # meaningful occurrences.
    if not pattern:
        return ""

    positions = []
    pos = genome.find(pattern)
    while pos != -1:
        positions.append(str(pos))
        # Resume one character after the last hit (not after the whole
        # match) so that overlapping occurrences are found as well.
        pos = genome.find(pattern, pos + 1)

    return " ".join(positions)

def FindClumps(Text, k, L, t):
    """
    Identify all k-mers forming (L, t)-clumps within a given genome string.

    Purpose:
        A k-mer forms an (L, t)-clump when it appears at least t times inside
        some window of length L of the genome. This function reports every
        such k-mer exactly once.

    Input:
        Text (str): The genome or long DNA string to be analyzed.
        k (int): Length of the k-mer.
        L (int): Length of the sliding window.
        t (int): Minimum number of occurrences required for a k-mer to be
                 considered part of a clump.

    Output:
        list[str]: The distinct k-mers that appear at least t times in at
                   least one window of length L, in the order in which they
                   are first found while scanning windows left to right.
                   The list is empty when k is not positive, when the window
                   is shorter than k, or when Text is shorter than L.

    Algorithm Overview:
        - Build a frequency map of the k-mers in the first window.
        - Slide the window one position at a time: decrement the count of
          the k-mer that leaves on the left and increment the count of the
          k-mer that enters on the right.
        - Only the entering k-mer can newly reach the threshold, because
          every other count stays the same or drops, so it is the only one
          that needs checking after each shift.

    Notes:
        - Overlapping windows and overlapping k-mer occurrences are fully
          considered.
        - Each shift costs one dictionary update instead of recounting the
          whole window.
    """
    n = len(Text)
    # No window of length L exists, or no k-mer fits inside a window.
    if k <= 0 or L < k or n < L:
        return []

    patterns = []   # results in discovery order
    reported = set()  # same contents as `patterns`, for O(1) membership

    # Frequency map of the first window, Text[0:L].
    freq_map = {}
    for j in range(L - k + 1):
        kmer = Text[j:j + k]
        freq_map[kmer] = freq_map.get(kmer, 0) + 1

    for kmer, count in freq_map.items():
        if count >= t:
            reported.add(kmer)
            patterns.append(kmer)

    # Shift the window so that it covers Text[i:i+L].
    for i in range(1, n - L + 1):
        # The k-mer starting at i-1 is no longer fully inside the window.
        leaving = Text[i - 1:i - 1 + k]
        if freq_map[leaving] == 1:
            # Drop empty entries so the map stays bounded by the window.
            del freq_map[leaving]
        else:
            freq_map[leaving] -= 1

        # The k-mer ending at the new right edge has just entered.
        entering = Text[i + L - k:i + L]
        count = freq_map.get(entering, 0) + 1
        freq_map[entering] = count

        if count >= t and entering not in reported:
            reported.add(entering)
            patterns.append(entering)

    return patterns

# 2026年3月20日 研究日志

# 模式匹配 Pattern Matching

# 2026年3月20日 研究日志

# 模式匹配 Pattern Matching

# 2026年3月20日 研究日志