microbe-model/tests/test_composition.py
Miyu Horiuchi
Add v1 composition features (tetranucleotides + codon usage)
6d2a502
"""Tests for tetranucleotide + codon-frequency features."""
from __future__ import annotations
from microbe_model.features.composition import codon_freqs, tetranucleotide_freqs
def test_tetranucleotide_freqs_sum_to_one() -> None:
    """Check that an ACGT-only contig gives 256 features summing to 1."""
    sequence = "ACGT" * 100  # 400 nt → 397 overlapping 4-mers
    freqs = tetranucleotide_freqs([("c1", sequence)])

    assert len(freqs) == 256
    # Float accumulation over 256 values: compare with a tolerance.
    assert abs(sum(freqs.values()) - 1.0) < 1e-6
def test_tetranucleotide_freqs_handles_n() -> None:
    """Check that 4-mers overlapping an ambiguous base (N) are not counted.

    The contig has a single N at index 3, so the first four windows
    (ACGN, CGNA, GNAC, NACG) contain it. The remaining windows are
    ACGT (twice), CGTA, GTAC and TACG.
    """
    contigs = [("c1", "ACGNACGTACGT")]
    out = tetranucleotide_freqs(contigs)
    # All 4-mers containing N should be skipped; valid ones (ACGT, CGTA, GTAC, TACG) counted
    nonzero = {k: v for k, v in out.items() if v > 0}
    assert nonzero  # we should have some non-N kmers
    assert all(("N" not in k.removeprefix("tetra_")) for k in nonzero)
    # The check above is vacuous when the output only ever holds ACGT keys,
    # so also require that nothing outside the N-free windows gained weight
    # (e.g. ACGA/CGAC/GACG would appear if the N were silently dropped and
    # its neighbours joined). Assumes keys are "tetra_" + k-mer, as the
    # prefix stripping above implies.
    allowed = {"tetra_ACGT", "tetra_CGTA", "tetra_GTAC", "tetra_TACG"}
    assert set(nonzero) <= allowed
def test_tetranucleotide_freqs_empty() -> None:
    """Check that no contigs still gives all 256 features, each exactly 0.0."""
    freqs = tetranucleotide_freqs([])

    assert len(freqs) == 256
    for value in freqs.values():
        assert value == 0.0
def test_codon_freqs_sum_to_one() -> None:
    """Check codon frequencies span 64 keys, sum to 1, and give ATG 30/31."""
    n_atg = 30
    cds = "ATG" * n_atg + "TAA"  # 30 ATG codons followed by one stop codon
    freqs = codon_freqs([cds])

    assert len(freqs) == 64
    assert abs(sum(freqs.values()) - 1.0) < 1e-6

    # ATG accounts for 30 of the 31 codons in the sequence.
    expected_atg = n_atg / (n_atg + 1)
    assert abs(freqs["codon_ATG"] - expected_atg) < 1e-6
def test_codon_freqs_skips_non_acgt() -> None:
    """Check that a codon with non-ACGT bases is left out of the frequencies.

    The sequence splits into ATG, NNN, ATG. With NNN skipped, ATG is the
    only codon counted, so its frequency and the overall sum are both 1.
    """
    cds_list = ["ATGNNNATG"]  # codon NNN should be skipped
    out = codon_freqs(cds_list)
    # Use the same 1e-6 tolerance as the other frequency tests in this file
    # rather than exact float equality, so the test does not depend on how
    # the implementation represents or accumulates its floats.
    assert abs(out["codon_ATG"] - 1.0) < 1e-6  # only the two ATG codons counted
    assert abs(sum(out.values()) - 1.0) < 1e-6