Spaces:
Running
Running
File size: 2,457 Bytes
"""Compositional features: k-mer frequencies and codon usage.

These supplement the v0 amino-acid-composition features in `genome.py`. They are
computed on the same predicted-CDS set, so adding them to a v1 featurize run is
essentially free in network and CPU terms.

Two feature groups:

- tetranucleotide frequencies (256 dims) — a well-known signal for thermophily,
  halophily, and phylum-level taxonomy
- codon usage frequencies (64 dims) — informative about translation efficiency,
  GC bias, and growth-rate phenotype

Both groups are expressed as relative frequencies (each group sums to 1) rather
than raw counts, so they are scale-invariant across genome sizes.
"""
from __future__ import annotations
from collections import Counter
from collections.abc import Iterable
# The four canonical DNA bases. Their order here fixes the ordering of both
# feature vocabularies below.
NUCLEOTIDES = "ACGT"

# All 4**3 = 64 triplets, enumerated with the first base varying slowest.
CODONS = [
    first + second + third
    for first in NUCLEOTIDES
    for second in NUCLEOTIDES
    for third in NUCLEOTIDES
]

# All 4**4 = 256 4-mers. Appending one base to each codon (codon varying
# slowest) yields the same first-base-slowest ordering as four nested loops.
TETRA_KMERS = [prefix + last for prefix in CODONS for last in NUCLEOTIDES]
def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
    """Compute the relative abundance of every ACGT 4-mer across the contigs.

    Every overlapping window of length 4 is examined on the given strand only,
    after upper-casing the sequence. Windows that are not in
    ``TETRA_KMERS_SET`` (for example, anything containing ``N``) are ignored
    and do not contribute to the denominator.

    Args:
        contigs: Pairs whose second element is a nucleotide sequence. The
            first element is not used here (presumably a contig identifier).

    Returns:
        A dict with one ``tetra_<KMER>`` entry per element of ``TETRA_KMERS``,
        in that order. Values are counts divided by the number of accepted
        windows; if no window was accepted, every value is ``0.0``.
    """
    tallies: Counter[str] = Counter()
    for _label, sequence in contigs:
        upper = sequence.upper()
        # Sequences shorter than 4 produce an empty range, hence no windows.
        windows = (upper[start : start + 4] for start in range(len(upper) - 3))
        tallies.update(window for window in windows if window in TETRA_KMERS_SET)

    # Only accepted windows were tallied, so their sum is the denominator.
    accepted = sum(tallies.values())
    if not accepted:
        return dict.fromkeys((f"tetra_{kmer}" for kmer in TETRA_KMERS), 0.0)
    # Indexing a Counter with an unseen key yields 0 without inserting it.
    return {f"tetra_{kmer}": tallies[kmer] / accepted for kmer in TETRA_KMERS}
def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
    """Compute the relative usage of each of the 64 codons over all CDS.

    Each sequence is upper-cased and read in non-overlapping triplets starting
    at offset 0. Triplets that are not in ``CODONS_SET`` (for example, anything
    containing ``N``) are ignored and do not contribute to the denominator.

    Args:
        cds_nucleotides: Nucleotide CDS strings, expected to be in frame from
            their first base (a multiple of 3 long, ATG-start). Neither
            property is validated here; trailing bases that do not fill a
            whole triplet are simply not read.

    Returns:
        A dict with one ``codon_<CODON>`` entry per element of ``CODONS``, in
        that order. Values are counts divided by the number of accepted
        triplets; if no triplet was accepted, every value is ``0.0``.
    """
    tallies: Counter[str] = Counter()
    accepted = 0
    for raw_cds in cds_nucleotides:
        sequence = raw_cds.upper()
        # The stop bound of len - 2 keeps a trailing partial codon out of range.
        for offset in range(0, len(sequence) - 2, 3):
            triplet = sequence[offset : offset + 3]
            if triplet not in CODONS_SET:
                continue
            tallies[triplet] += 1
            accepted += 1

    if accepted == 0:
        return dict.fromkeys((f"codon_{codon}" for codon in CODONS), 0.0)
    # Indexing a Counter with an unseen key yields 0 without inserting it.
    return {f"codon_{codon}": tallies[codon] / accepted for codon in CODONS}
# Set views of the two vocabularies, giving the counting loops a constant-time
# membership test instead of a linear scan of the lists. The functions above
# look these names up when they are called, not when they are defined, so
# defining the sets after them in the file is fine.
TETRA_KMERS_SET = {*TETRA_KMERS}
CODONS_SET = {*CODONS}
|