Miyu Horiuchi
Deploy app from main@a3254bf (no paper/ binaries)
0ed74db
"""Compositional features: k-mer frequencies and codon usage.
These supplement the v0 amino-acid-composition features in `genome.py`. Codon
usage is computed on the same predicted-CDS set, and tetranucleotide frequencies
on the contig sequences, so adding them to a v1 featurize run is ~free in
network/CPU terms.
Two feature groups:
- tetranucleotide frequencies (256 dims) — well-known signal for thermophily,
halophily, and phylum-level taxonomy
- codon usage frequencies (64 dims) — informs translation efficiency, GC bias,
and growth rate phenotype
We use them as relative frequencies (sum to 1 across each group) rather than
counts, so they're scale-invariant across genome sizes.
"""
from __future__ import annotations
from collections import Counter
from collections.abc import Iterable
# Alphabet used to enumerate every k-mer; anything outside it (e.g. N) is
# never a key in the feature dictionaries.
NUCLEOTIDES = "ACGT"

# All 64 trinucleotides in lexicographic (A < C < G < T) order.
CODONS = [
    first + second + third
    for first in NUCLEOTIDES
    for second in NUCLEOTIDES
    for third in NUCLEOTIDES
]

# All 256 tetranucleotides, also in lexicographic order: each codon extended
# by one trailing base.
TETRA_KMERS = [codon + last for codon in CODONS for last in NUCLEOTIDES]
def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
    """Return the relative frequency of every ACGT tetranucleotide.

    Args:
        contigs: iterable of 2-tuples; the second element is the nucleotide
            sequence and the first element is ignored.

    Returns:
        A dict with one ``tetra_<KMER>`` key per entry of ``TETRA_KMERS``.
        Values are counts divided by the number of counted windows, or all
        ``0.0`` when no window was counted.

    Sequences are upper-cased before counting. Every overlapping window of
    length 4 is examined, and windows that are not in ``TETRA_KMERS_SET``
    (e.g. those containing N) are left out of both the counts and the
    denominator.
    """
    tally: Counter[str] = Counter()
    for _name, sequence in contigs:
        upper_seq = sequence.upper()
        windows = (
            upper_seq[start : start + 4] for start in range(len(upper_seq) - 3)
        )
        tally.update(window for window in windows if window in TETRA_KMERS_SET)

    # Only valid k-mers were tallied, so the sum of the tallies is the
    # number of counted windows.
    n_valid = sum(tally.values())
    if not n_valid:
        return dict.fromkeys((f"tetra_{k}" for k in TETRA_KMERS), 0.0)
    return {f"tetra_{k}": tally[k] / n_valid for k in TETRA_KMERS}
def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
    """Return the relative frequency of every codon across the given CDS.

    Args:
        cds_nucleotides: iterable of nucleotide CDS strings. Each string is
            read in frame from its first character in steps of 3; a trailing
            partial codon (1 or 2 leftover bases) is ignored.

    Returns:
        A dict with one ``codon_<CODON>`` key per entry of ``CODONS``.
        Values are counts divided by the number of counted codons, or all
        ``0.0`` when no codon was counted.

    Sequences are upper-cased before counting. Triplets that are not in
    ``CODONS_SET`` (e.g. those containing N) are left out of both the counts
    and the denominator.
    """
    tally: Counter[str] = Counter()
    for sequence in cds_nucleotides:
        upper_seq = sequence.upper()
        triplets = (
            upper_seq[start : start + 3]
            for start in range(0, len(upper_seq) - 2, 3)
        )
        tally.update(triplet for triplet in triplets if triplet in CODONS_SET)

    # Only valid codons were tallied, so the sum of the tallies is the
    # number of counted codons.
    n_valid = sum(tally.values())
    if not n_valid:
        return dict.fromkeys((f"codon_{k}" for k in CODONS), 0.0)
    return {f"codon_{k}": tally[k] / n_valid for k in CODONS}
# Set versions of the k-mer and codon lists, used by the counting loops for
# hash-based membership tests instead of scanning the lists. They are defined
# after the functions that use them; that is safe because the names are only
# looked up when those functions are called, not when they are defined.
TETRA_KMERS_SET = {*TETRA_KMERS}
CODONS_SET = {*CODONS}