Miyu Horiuchi committed on
Commit
6d2a502
·
1 Parent(s): bbbea9d

Add v1 composition features (tetranucleotides + codon usage)

Browse files

Two new feature groups, ready to plug into a v1 featurize run:
- 256 tetranucleotide frequencies (skips kmers with N)
- 64 codon-usage frequencies (skips codons with N)

Both expressed as relative frequencies (sum to 1 within each group), so they
are scale-invariant across genome sizes.

These supplement the v0 amino-acid-composition features (33 dims) — adding them
increases the feature count roughly 10× (33 → 353). Tetranucleotides are well-known to track
phylum-level taxonomy and thermophily; codon usage informs translation
efficiency and growth-rate phenotype.

Not yet wired into the streaming pipeline — that will happen in v1 once we
have the v0 baseline numbers to compare against.

5 new tests, all passing. Total: 26/26.

OVERNIGHT_SUMMARY.md CHANGED
@@ -1,6 +1,6 @@
1
  # Overnight run — summary
2
 
3
- _Written 2026-04-26T23:03+00:00_
4
 
5
  ## Pipeline status
6
 
@@ -8,10 +8,10 @@ _Written 2026-04-26T23:03+00:00_
8
  - 19,637 have genome accessions
9
  - 50,384 have optimal_temperature_c labels
10
  - **17,054** strains are training-ready (genome + T_opt)
11
- - 🟡 Featurize: in progress (32%)
12
- - Processed: 5,489 / 17,094
13
- - Successful: 5,473 (99.7%)
14
- - Failed: 16 (mostly suppressed/withdrawn NCBI assemblies)
15
  - ⏭ Training: not yet run (waits for featurize completion)
16
  - ⏭ Eval report: not yet generated
17
 
@@ -30,6 +30,12 @@ _Written 2026-04-26T23:03+00:00_
30
 
31
  ## Commits since yesterday
32
 
 
 
 
 
 
 
33
  - 82997f4 Fix classification fold bug + add end-to-end integration tests
34
  - 8d52535 Add eval report generator + training table persistence + group-col override
35
  - 33535e5 Streaming fetch+featurize pipeline + 6× pyrodigal speedup + GCA version resolution
 
1
  # Overnight run — summary
2
 
3
+ _Written 2026-04-27T01:44+00:00_
4
 
5
  ## Pipeline status
6
 
 
8
  - 19,637 have genome accessions
9
  - 50,384 have optimal_temperature_c labels
10
  - **17,054** strains are training-ready (genome + T_opt)
11
+ - 🟡 Featurize: in progress (84%)
12
+ - Processed: 14,309 / 17,094
13
+ - Successful: 14,283 (99.8%)
14
+ - Failed: 26 (mostly suppressed/withdrawn NCBI assemblies)
15
  - ⏭ Training: not yet run (waits for featurize completion)
16
  - ⏭ Eval report: not yet generated
17
 
 
30
 
31
  ## Commits since yesterday
32
 
33
+ - 316196d Fix predictions parquet type mix + plumb feature_cols through eval
34
+ - 7db9544 Add tests for explore module (correlations + class means)
35
+ - a22773f Harden post-featurize chain: each phase runs even if previous fails
36
+ - eb37476 Add feature↔target correlation analysis to eval report
37
+ - a7d692a Update README to reflect current state
38
+ - 401687e Eval report enhancements: TL;DR + per-strain predictions + per-family error
39
  - 82997f4 Fix classification fold bug + add end-to-end integration tests
40
  - 8d52535 Add eval report generator + training table persistence + group-col override
41
  - 33535e5 Streaming fetch+featurize pipeline + 6× pyrodigal speedup + GCA version resolution
src/microbe_model/features/composition.py ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Compositional features: k-mer frequencies and codon usage.
2
+
3
+ These supplement the v0 amino-acid-composition features in `genome.py`. They are
4
+ computed on the same predicted-CDS set, so adding them to a v1 featurize run is
5
+ ~free in network/CPU terms.
6
+
7
+ Two feature groups:
8
+ - tetranucleotide frequencies (256 dims) — well-known signal for thermophily,
9
+ halophily, and phylum-level taxonomy
10
+ - codon usage frequencies (64 dims) — informs translation efficiency, GC bias,
11
+ and growth rate phenotype
12
+
13
+ We use them as relative frequencies (sum to 1 across each group) rather than
14
+ counts, so they're scale-invariant across genome sizes.
15
+ """
16
+ from __future__ import annotations
17
+
18
+ from collections import Counter
19
+ from collections.abc import Iterable
20
+
21
NUCLEOTIDES = "ACGT"

# Canonical feature orderings: every ACGT 4-mer / codon in lexicographic order.
# The output dicts of both public functions follow these orderings.
TETRA_KMERS = [a + b + c + d for a in NUCLEOTIDES for b in NUCLEOTIDES
               for c in NUCLEOTIDES for d in NUCLEOTIDES]
CODONS = [a + b + c for a in NUCLEOTIDES for b in NUCLEOTIDES for c in NUCLEOTIDES]

# Lookup sets for fast membership checks. Defined next to the lists they mirror
# (and before any function) so nothing depends on late binding of module globals.
TETRA_KMERS_SET = set(TETRA_KMERS)
CODONS_SET = set(CODONS)


def _relative_freqs(counts: Counter[str], keys: list[str], prefix: str) -> dict[str, float]:
    """Normalise `counts` over `keys` and label each entry with `prefix`.

    Only the entries listed in `keys` contribute to the denominator, so any
    other key present in `counts` (e.g. a k-mer containing N) is ignored.
    Returns all-zero features when none of `keys` was observed.
    """
    total = sum(counts[k] for k in keys)
    if total == 0:
        return {f"{prefix}{k}": 0.0 for k in keys}
    return {f"{prefix}{k}": counts[k] / total for k in keys}


def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
    """Relative frequency of each of the 256 ACGT tetranucleotides.

    Argument: an iterable of 2-tuples whose second element is the nucleotide
    sequence; the first element is ignored. Sequences are upper-cased before
    counting, and overlapping windows are counted with stride 1 on the given
    strand only.

    Skips any 4-mer containing a non-ACGT character (e.g. N): such windows are
    excluded from both the numerator and the denominator.

    Returns a dict with keys `tetra_<KMER>` in `TETRA_KMERS` order. Values sum
    to 1, or are all 0.0 when no valid 4-mer was seen.
    """
    counts: Counter[str] = Counter()
    for _, seq in contigs:
        s = seq.upper()
        # Count every window in one C-level pass; windows that are not pure
        # ACGT are dropped at normalisation time.
        counts.update(s[i : i + 4] for i in range(len(s) - 3))
    return _relative_freqs(counts, TETRA_KMERS, "tetra_")


def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
    """Relative frequency of each of the 64 codons across all predicted CDS.

    Argument: an iterable of nucleotide CDS strings (multiples of 3, ATG-start).
    Each string is upper-cased and read in frame from its first character; a
    trailing partial codon (1-2 leftover bases) is ignored.

    Skips codons containing non-ACGT (e.g. N): they are excluded from both the
    numerator and the denominator.

    Returns a dict with keys `codon_<CODON>` in `CODONS` order. Values sum to 1,
    or are all 0.0 when no valid codon was seen.
    """
    counts: Counter[str] = Counter()
    for cds in cds_nucleotides:
        s = cds.upper()
        # Stride 3 keeps the reading frame; the `- 2` bound drops partial codons.
        counts.update(s[i : i + 3] for i in range(0, len(s) - 2, 3))
    return _relative_freqs(counts, CODONS, "codon_")
tests/test_composition.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """Tests for tetranucleotide + codon-frequency features."""
2
+ from __future__ import annotations
3
+
4
+ from microbe_model.features.composition import codon_freqs, tetranucleotide_freqs
5
+
6
+
7
def test_tetranucleotide_freqs_sum_to_one() -> None:
    """A repetitive contig yields all 256 features, and they sum to one."""
    # 400 nt → 397 overlapping 4-mers
    freqs = tetranucleotide_freqs([("c1", "ACGT" * 100)])
    assert len(freqs) == 256
    assert abs(sum(freqs.values()) - 1.0) < 1e-6
13
+
14
+
15
def test_tetranucleotide_freqs_handles_n() -> None:
    """4-mers overlapping an N are dropped from the counts and the denominator."""
    contigs = [("c1", "ACGNACGTACGT")]
    out = tetranucleotide_freqs(contigs)
    # 12 nt → 9 windows. The four that overlap the N are skipped, leaving
    # ACGT (x2), CGTA, GTAC and TACG — five valid windows in total.
    nonzero = {k.removeprefix("tetra_"): v for k, v in out.items() if v > 0}
    assert set(nonzero) == {"ACGT", "CGTA", "GTAC", "TACG"}
    assert abs(nonzero["ACGT"] - 2 / 5) < 1e-9
    assert abs(nonzero["CGTA"] - 1 / 5) < 1e-9
    # Skipped windows must not inflate the denominator.
    assert abs(sum(out.values()) - 1.0) < 1e-9
22
+
23
+
24
def test_tetranucleotide_freqs_empty() -> None:
    """With no contigs, all 256 features are still emitted and each is zero."""
    freqs = tetranucleotide_freqs([])
    assert len(freqs) == 256
    assert not any(value != 0.0 for value in freqs.values())
28
+
29
+
30
def test_codon_freqs_sum_to_one() -> None:
    """Codon frequencies cover all 64 codons, sum to one, and match the known ratio."""
    # 30 ATG codons followed by a single TAA stop → 31 codons in total
    sequence = "ATG" * 30 + "TAA"
    freqs = codon_freqs([sequence])
    assert len(freqs) == 64
    assert abs(sum(freqs.values()) - 1.0) < 1e-6
    expected_atg = 30 / 31
    assert abs(freqs["codon_ATG"] - expected_atg) < 1e-6
38
+
39
+
40
def test_codon_freqs_skips_non_acgt() -> None:
    """A codon made of N is ignored, so the two ATG codons account for everything."""
    # Frame reads ATG | NNN | ATG; the middle codon should be skipped
    freqs = codon_freqs(["ATGNNNATG"])
    assert freqs["codon_ATG"] == 1.0
    assert sum(freqs.values()) == 1.0