Spaces:
Sleeping
Sleeping
| """GC content calculation — global and sliding window.""" | |
| from __future__ import annotations | |
| from typing import Optional | |
| import numpy as np | |
def gc_fraction(sequence: str) -> float:
    """Return the fraction of G/C characters in *sequence* (0.0 – 1.0).

    Matching is case-insensitive. Every character of the sequence counts
    towards the denominator, whatever it is. An empty sequence yields 0.0.
    """
    upper = sequence.upper()
    total = len(upper)
    if total == 0:
        return 0.0
    gc_count = upper.count("G") + upper.count("C")
    return gc_count / total
def gc_percent(sequence: str) -> float:
    """Return the global GC content of *sequence* as a percentage (0 – 100)."""
    fraction = gc_fraction(sequence)
    return fraction * 100.0
def gc_sliding_window(
    sequence: str,
    window: int = 100,
    step: int = 1,
) -> tuple[np.ndarray, np.ndarray]:
    """
    Compute GC% in a sliding window across the sequence.

    Parameters
    ----------
    sequence : str
        Nucleotide sequence; matching is case-insensitive. Only ``G`` and
        ``C`` count as GC; every other character still counts towards the
        window length.
    window : int
        Window width in nucleotides. Must be positive.
    step : int
        Distance between consecutive window starts. Must be positive.

    Returns
    -------
    positions : np.ndarray of int
        Centre position of each window (0-based nucleotide index),
        computed as ``start + window // 2``.
    gc_values : np.ndarray of float
        GC% (0–100) for each window.

    Both arrays are empty when the sequence is empty or shorter than
    ``window``. Only full-width windows are reported.

    Raises
    ------
    ValueError
        If ``window`` or ``step`` is not positive.
    """
    # Reject sizes that would otherwise divide by zero, wrap indices around
    # the prefix-sum array, or fail inside np.arange with an opaque error.
    if window <= 0:
        raise ValueError("window must be a positive integer.")
    if step <= 0:
        raise ValueError("step must be a positive integer.")

    seq = sequence.upper()
    n = len(seq)
    if n == 0 or window > n:
        return np.array([], dtype=int), np.array([], dtype=float)

    # Prefix sums of the GC indicator: the GC count of seq[a:b] is
    # prefix[b] - prefix[a], so every window costs O(1) after one pass.
    # int64 keeps the running total safe for very long sequences.
    gc_flags = np.array([1 if nt in "GC" else 0 for nt in seq], dtype=np.int8)
    prefix = np.zeros(n + 1, dtype=np.int64)
    prefix[1:] = np.cumsum(gc_flags, dtype=np.int64)

    starts = np.arange(0, n - window + 1, step)
    ends = starts + window
    gc_counts = prefix[ends] - prefix[starts]
    gc_values = gc_counts / window * 100.0
    positions = starts + window // 2
    return positions, gc_values
def gc_by_codon_position(cds: str) -> dict[str, float]:
    """
    Return GC% at each codon position (GC1, GC2, GC3) plus the overall GC%.

    Parameters
    ----------
    cds : str
        Coding sequence; must be in-frame with a length divisible by 3.
        Matching is case-insensitive.

    Returns
    -------
    dict of str to float
        Keys ``"GC1"``, ``"GC2"``, ``"GC3"`` and ``"GC_overall"``, each a
        percentage in 0–100. An empty sequence yields 0.0 for every key.

    Raises
    ------
    ValueError
        If the sequence length is not divisible by 3.
    """
    seq = cds.upper()
    if len(seq) % 3 != 0:
        raise ValueError("CDS length is not divisible by 3.")

    n_codons = len(seq) // 3
    if n_codons == 0:
        # No codons to average over; report 0.0 rather than dividing by zero.
        return {"GC1": 0.0, "GC2": 0.0, "GC3": 0.0, "GC_overall": 0.0}

    # seq[offset::3] holds every nucleotide at one codon position, so there
    # is no need to materialise the individual codons.
    result = {
        f"GC{offset + 1}": sum(1 for nt in seq[offset::3] if nt in "GC") / n_codons * 100
        for offset in range(3)
    }
    result["GC_overall"] = gc_percent(seq)
    return result