def hamming_distance(p, q):
    """
    Compute the Hamming distance between two strings.

    Parameters
    ----------
    p : str
        The first string to compare.
    q : str
        The second string to compare.

    Returns
    -------
    int
        The number of positions at which the corresponding characters differ.
        Only positions up to the length of the shorter string are compared,
        so extra trailing characters of the longer string are ignored.

    Examples
    --------
    >>> hamming_distance("GATTACA", "GACTATA")
    2
    """
    # zip() stops at the end of the shorter input; that is what limits the
    # comparison to the common length of the two strings.
    return sum(1 for a, b in zip(p, q) if a != b)

def PatternCount(Text, Pattern):
    """
    Count the occurrences of a pattern inside a text, overlaps included.

    Parameters
    ----------
    Text : str
        The string that is scanned.
    Pattern : str
        The substring to look for.

    Returns
    -------
    int
        How many start positions in Text begin a window of len(Pattern)
        characters that equals Pattern. Windows may overlap, so
        PatternCount("AAAA", "AA") is 3.
    """
    width = len(Pattern)
    last_start = len(Text) - width

    # Slide a window of the pattern's width across Text and tally the
    # windows equal to Pattern. When Text is shorter than Pattern the range
    # is empty and the result is 0.
    return sum(
        1
        for start in range(last_start + 1)
        if Text[start:start + width] == Pattern
    )

def hamming_distance(p, q):
    """
    Count the positions at which two strings differ.

    Parameters
    ----------
    p : str
        The first string to compare.
    q : str
        The second string to compare.

    Returns
    -------
    int
        The number of mismatching positions. Characters are paired with
        zip(), so the comparison stops at the end of the shorter string.
    """
    mismatches = 0
    for left, right in zip(p, q):
        if left != right:
            mismatches += 1
    return mismatches

def d(pattern, text):
    """
    Compute the minimum Hamming distance between a given pattern and
    any k-length substring of a longer text sequence.

    Parameters
    ----------
    pattern : str
        The k-mer whose similarity to the text will be evaluated.
        Must be a string of length k.
    text : str
        The longer sequence in which all possible k-length windows
        will be compared against the pattern.

    Returns
    -------
    int
        The smallest Hamming distance between `pattern` and any
        substring of `text` with the same length. This value represents
        how closely the pattern matches the best-aligned region of the text.

    Raises
    ------
    ValueError
        If `text` is shorter than `pattern`, because then no window of
        length k exists to compare against.

    Notes
    -----
    This function is a core component of the Median String algorithm.
    It measures the distance between a candidate k-mer and a single DNA
    sequence by finding the most similar window within that sequence.
    """
    k = len(pattern)
    window_count = len(text) - k + 1
    if window_count <= 0:
        # Without this guard min() would fail on an empty sequence with a
        # message that says nothing about the actual cause.
        raise ValueError(
            f"text (length {len(text)}) is shorter than pattern (length {k})"
        )
    return min(
        hamming_distance(pattern, text[i:i + k])
        for i in range(window_count)
    )

def d_pattern_dna(pattern, dna):
    """
    Add up the distances between one pattern and every sequence of a
    DNA collection. The distance to a single sequence is whatever
    d(pattern, sequence) returns, i.e. the minimum Hamming distance
    between the pattern and any k-length window of that sequence.

    Parameters
    ----------
    pattern : str
        The candidate k-mer to score against the collection.
    dna : list of str
        The DNA sequences to compare the pattern with.

    Returns
    -------
    int
        The sum of d(pattern, sequence) over all sequences in `dna`;
        0 when `dna` is empty. Lower values indicate a better match.

    Notes
    -----
    The Median String search uses this value to rank candidate k-mers
    across all sequences in the dataset.
    """
    total = 0
    for sequence in dna:
        total += d(pattern, sequence)
    return total

def median_string(dna, k):
    """
    Find the k-mer (pattern of length k) that minimizes the total distance
    to a collection of DNA sequences. This is the classical Median String
    Problem in bioinformatics.

    The function exhaustively enumerates all possible k-mers over the
    alphabet {A, C, G, T}, computes their total distance to the DNA
    dataset using d_pattern_dna(), and returns the k-mer with the
    smallest total distance.

    Parameters
    ----------
    dna : list of str
        A list of DNA sequences over which the median string will be
        computed.
    k : int
        The length of the k-mer to search for.

    Returns
    -------
    str
        The k-mer that minimizes the total distance to all sequences
        in `dna`. If multiple k-mers achieve the same minimum distance,
        the first encountered in lexicographic order is returned.

    Raises
    ------
    ValueError
        If `k` is negative (rejected by itertools.product), or if a
        sequence in `dna` is shorter than `k`, in which case the error
        comes from the distance computation.

    Notes
    -----
    This brute-force implementation tries all 4^k candidates, and each
    candidate is compared against every k-length window of every
    sequence, so the running time grows roughly as O(4^k * k * L), where
    L is the total length of all sequences. It is exact but
    computationally expensive for large k.
    """
    # Imported locally so the function works without relying on a
    # module-level import of itertools.product.
    from itertools import product

    candidates = ("".join(letters) for letters in product("ACGT", repeat=k))

    # product() emits the k-mers in lexicographic order and min() keeps the
    # first of several equally good items, so ties resolve to the
    # lexicographically smallest k-mer.
    return min(candidates, key=lambda pattern: d_pattern_dna(pattern, dna))

# Path of the file to read (presumably a placeholder; replace with a real path).
text_file = 'path/to/file'
# Pass the variable, not the literal string 'text_file', so that the path
# configured above is the file that actually gets opened.
with open(text_file, 'r', encoding='utf-8') as f:
    content = f.read()

# Raw string: backslashes stay literal, so the "\n" and "\f" inside this
# Windows path are not interpreted as newline / form-feed escapes.
path = r"C:\new\folder"

2026年3月19日 研究日志

汉明距离 Hamming distance

字符串模式计数（Pattern Counting）

中位串问题 Median String Problem

编写 Python 代码的小技巧

2026年3月19日 研究日志

汉明距离 Hamming distance

字符串模式计数（Pattern Counting）

中位串问题 Median String Problem

编写 Python 代码的小技巧

2026年3月19日 研究日志