Spaces:
Running
Running
| """Compositional features: k-mer frequencies and codon usage. | |
| These supplement the v0 amino-acid-composition features in `genome.py`. They are | |
| computed on the same predicted-CDS set, so adding them to a v1 featurize run is | |
| ~free in network/CPU terms. | |
| Two feature groups: | |
| - tetranucleotide frequencies (256 dims) — well-known signal for thermophily, | |
| halophily, and phylum-level taxonomy | |
| - codon usage frequencies (64 dims) — informs translation efficiency, GC bias, | |
| and growth rate phenotype | |
| We use them as relative frequencies (sum to 1 across each group) rather than | |
| counts, so they're scale-invariant across genome sizes. | |
| """ | |
| from __future__ import annotations | |
| from collections import Counter | |
| from collections.abc import Iterable | |
# The four unambiguous DNA bases. Their order here fixes the ordering of the
# k-mer lists below, and therefore the ordering of the emitted feature keys.
NUCLEOTIDES = "ACGT"

# All 64 trinucleotides, enumerated with the first base varying slowest
# ("AAA", "AAC", "AAG", "AAT", "ACA", ..., "TTT").
CODONS = [
    first + second + third
    for first in NUCLEOTIDES
    for second in NUCLEOTIDES
    for third in NUCLEOTIDES
]

# All 256 tetranucleotides in the same first-base-slowest order
# ("AAAA", "AAAC", ..., "TTTT"): appending each base to each codon, with the
# codon as the outer loop, enumerates them exactly as four nested loops would.
TETRA_KMERS = [prefix + last for prefix in CODONS for last in NUCLEOTIDES]
def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
    """Compute the relative frequency of every ACGT tetranucleotide.

    Each contig is an ``(identifier, sequence)`` pair; only the sequence is
    used. Sequences are upper-cased, then scanned with an overlapping 4-base
    window that advances one position at a time. Windows that are not one of
    the 256 pure-ACGT 4-mers (e.g. those containing ``N``) are ignored and do
    not contribute to the denominator.

    Returns:
        A dict with one ``"tetra_<KMER>"`` key per entry of ``TETRA_KMERS``,
        in that order. Values are counts divided by the number of valid
        windows, or all ``0.0`` when no valid window was seen.
    """
    tally: Counter[str] = Counter()
    for _name, sequence in contigs:
        upper = sequence.upper()
        # range(len - 3) yields nothing for sequences shorter than 4 bases.
        windows = (upper[start : start + 4] for start in range(len(upper) - 3))
        tally.update(window for window in windows if window in TETRA_KMERS_SET)

    # Only valid windows were tallied, so this is the normalisation total.
    n_valid = sum(tally.values())
    if not n_valid:
        return {f"tetra_{k}": 0.0 for k in TETRA_KMERS}
    # Counter lookups yield 0 for k-mers that were never observed.
    return {f"tetra_{k}": tally[k] / n_valid for k in TETRA_KMERS}
def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
    """Compute the relative frequency of each of the 64 codons.

    Args:
        cds_nucleotides: Nucleotide CDS strings. They are expected to be
            in-frame (length a multiple of 3, starting at the ATG); this is
            not validated. Reading always starts at offset 0 of each string.

    Each string is upper-cased and split into consecutive, non-overlapping
    triplets; any trailing 1-2 leftover bases are dropped. Triplets that are
    not pure ACGT (e.g. containing ``N``) are ignored and do not contribute to
    the denominator.

    Returns:
        A dict with one ``"codon_<CODON>"`` key per entry of ``CODONS``, in
        that order. Values are counts divided by the number of valid
        triplets, or all ``0.0`` when no valid triplet was seen.
    """
    tally: Counter[str] = Counter()
    for cds in cds_nucleotides:
        upper = cds.upper()
        # Stopping at len - 2 guarantees every slice is a full 3-base codon.
        triplets = (upper[pos : pos + 3] for pos in range(0, len(upper) - 2, 3))
        tally.update(triplet for triplet in triplets if triplet in CODONS_SET)

    # Only valid codons were tallied, so this is the normalisation total.
    n_valid = sum(tally.values())
    if not n_valid:
        return {f"codon_{k}": 0.0 for k in CODONS}
    # Counter lookups yield 0 for codons that were never observed.
    return {f"codon_{k}": tally[k] / n_valid for k in CODONS}
# Set copies of the k-mer and codon lists, used by the frequency functions
# for constant-time "is this a pure-ACGT k-mer?" checks. They are resolved
# when those functions are called, so defining them after the functions works.
CODONS_SET = {*CODONS}
TETRA_KMERS_SET = {*TETRA_KMERS}