Spaces:
Running
Running
Miyu Horiuchi committed on
Commit ·
6d2a502
1
Parent(s): bbbea9d
Add v1 composition features (tetranucleotides + codon usage)
Browse files

Two new feature groups, ready to plug into a v1 featurize run:
- 256 tetranucleotide frequencies (skips kmers with N)
- 64 codon-usage frequencies (skips codons with N)
Both expressed as relative frequencies (sum to 1 within each group), so they
are scale-invariant across genome sizes.
These supplement the v0 amino-acid-composition features (33 dims) — adding them
increases the feature count roughly 10× (33 → 353 dims). Tetranucleotides are well-known to track
phylum-level taxonomy and thermophily; codon usage informs translation
efficiency and growth-rate phenotype.
Not yet wired into the streaming pipeline — that will happen in v1 once we
have the v0 baseline numbers to compare against.
5 new tests, all passing. Total: 26/26.
- OVERNIGHT_SUMMARY.md +11 -5
- src/microbe_model/features/composition.py +68 -0
- tests/test_composition.py +44 -0
OVERNIGHT_SUMMARY.md
CHANGED
|
@@ -1,6 +1,6 @@
|
|
| 1 |
# Overnight run — summary
|
| 2 |
|
| 3 |
-
_Written 2026-04-26T23:03+00:00_
|
| 4 |
|
| 5 |
## Pipeline status
|
| 6 |
|
|
@@ -8,10 +8,10 @@ _Written 2026-04-26T23:03+00:00_
|
|
| 8 |
- 19,637 have genome accessions
|
| 9 |
- 50,384 have optimal_temperature_c labels
|
| 10 |
- **17,054** strains are training-ready (genome + T_opt)
|
| 11 |
-
- 🟡 Featurize: in progress (
|
| 12 |
-
- Processed:
|
| 13 |
-
- Successful:
|
| 14 |
-
- Failed:
|
| 15 |
- ⏭ Training: not yet run (waits for featurize completion)
|
| 16 |
- ⏭ Eval report: not yet generated
|
| 17 |
|
|
@@ -30,6 +30,12 @@ _Written 2026-04-26T23:03+00:00_
|
|
| 30 |
|
| 31 |
## Commits since yesterday
|
| 32 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
- 82997f4 Fix classification fold bug + add end-to-end integration tests
|
| 34 |
- 8d52535 Add eval report generator + training table persistence + group-col override
|
| 35 |
- 33535e5 Streaming fetch+featurize pipeline + 6× pyrodigal speedup + GCA version resolution
|
|
|
|
| 1 |
# Overnight run — summary
|
| 2 |
|
| 3 |
+
_Written 2026-04-27T01:44+00:00_
|
| 4 |
|
| 5 |
## Pipeline status
|
| 6 |
|
|
|
|
| 8 |
- 19,637 have genome accessions
|
| 9 |
- 50,384 have optimal_temperature_c labels
|
| 10 |
- **17,054** strains are training-ready (genome + T_opt)
|
| 11 |
+
- 🟡 Featurize: in progress (84%)
|
| 12 |
+
- Processed: 14,309 / 17,094
|
| 13 |
+
- Successful: 14,283 (99.8%)
|
| 14 |
+
- Failed: 26 (mostly suppressed/withdrawn NCBI assemblies)
|
| 15 |
- ⏭ Training: not yet run (waits for featurize completion)
|
| 16 |
- ⏭ Eval report: not yet generated
|
| 17 |
|
|
|
|
| 30 |
|
| 31 |
## Commits since yesterday
|
| 32 |
|
| 33 |
+
- 316196d Fix predictions parquet type mix + plumb feature_cols through eval
|
| 34 |
+
- 7db9544 Add tests for explore module (correlations + class means)
|
| 35 |
+
- a22773f Harden post-featurize chain: each phase runs even if previous fails
|
| 36 |
+
- eb37476 Add feature↔target correlation analysis to eval report
|
| 37 |
+
- a7d692a Update README to reflect current state
|
| 38 |
+
- 401687e Eval report enhancements: TL;DR + per-strain predictions + per-family error
|
| 39 |
- 82997f4 Fix classification fold bug + add end-to-end integration tests
|
| 40 |
- 8d52535 Add eval report generator + training table persistence + group-col override
|
| 41 |
- 33535e5 Streaming fetch+featurize pipeline + 6× pyrodigal speedup + GCA version resolution
|
src/microbe_model/features/composition.py
ADDED
|
@@ -0,0 +1,68 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Compositional features: k-mer frequencies and codon usage.
|
| 2 |
+
|
| 3 |
+
These supplement the v0 amino-acid-composition features in `genome.py`. They are
|
| 4 |
+
computed on the same predicted-CDS set, so adding them to a v1 featurize run is
|
| 5 |
+
~free in network/CPU terms.
|
| 6 |
+
|
| 7 |
+
Two feature groups:
|
| 8 |
+
- tetranucleotide frequencies (256 dims) — well-known signal for thermophily,
|
| 9 |
+
halophily, and phylum-level taxonomy
|
| 10 |
+
- codon usage frequencies (64 dims) — informs translation efficiency, GC bias,
|
| 11 |
+
and growth rate phenotype
|
| 12 |
+
|
| 13 |
+
We use them as relative frequencies (sum to 1 across each group) rather than
|
| 14 |
+
counts, so they're scale-invariant across genome sizes.
|
| 15 |
+
"""
|
| 16 |
+
from __future__ import annotations
|
| 17 |
+
|
| 18 |
+
from collections import Counter
|
| 19 |
+
from collections.abc import Iterable
|
| 20 |
+
|
| 21 |
+
NUCLEOTIDES = "ACGT"
TETRA_KMERS = [
    a + b + c + d
    for a in NUCLEOTIDES
    for b in NUCLEOTIDES
    for c in NUCLEOTIDES
    for d in NUCLEOTIDES
]
CODONS = [a + b + c for a in NUCLEOTIDES for b in NUCLEOTIDES for c in NUCLEOTIDES]

# Lookup sets for O(1) membership checks. They live next to the lists they
# mirror, and before the functions that read them, so nothing depends on a
# global that only comes into existence further down the module.
TETRA_KMERS_SET = set(TETRA_KMERS)
CODONS_SET = set(CODONS)


def _relative_freqs(
    counts: Counter[str], keys: list[str], prefix: str
) -> dict[str, float]:
    """Turn raw counts into a ``{prefix + key: relative frequency}`` mapping.

    Every entry of ``keys`` appears in the result, in order, so the feature
    vector has a fixed layout. When none of ``keys`` was observed, all
    frequencies are ``0.0`` rather than dividing by zero.
    """
    total = sum(counts[k] for k in keys)
    if total == 0:
        return {f"{prefix}{k}": 0.0 for k in keys}
    return {f"{prefix}{k}": counts[k] / total for k in keys}


def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
    """Relative frequency of each of the 256 ACGT tetranucleotides.

    Args:
        contigs: iterable of 2-tuples; the first element is ignored and the
            second is the nucleotide sequence. Sequences are upper-cased
            before counting, so lower-case input is accepted.

    Returns:
        A dict with one ``tetra_<KMER>`` key per entry of ``TETRA_KMERS``.
        Values sum to 1 when at least one valid 4-mer was seen, and are all
        ``0.0`` otherwise.

    Overlapping windows (stride 1) are counted on the given strand only. Any
    4-mer containing a non-ACGT character (e.g. N) is skipped and does not
    contribute to the denominator.
    """
    valid = TETRA_KMERS_SET
    counts: Counter[str] = Counter()
    for _, seq in contigs:
        s = seq.upper()
        # Sequences shorter than 4 yield an empty range and add nothing.
        windows = (s[i : i + 4] for i in range(len(s) - 3))
        counts.update(kmer for kmer in windows if kmer in valid)
    return _relative_freqs(counts, TETRA_KMERS, "tetra_")


def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
    """Relative frequency of each of the 64 codons across all given CDS.

    Args:
        cds_nucleotides: iterable of nucleotide CDS strings. Each string is
            read in frame from its first character in steps of three; a
            trailing partial codon (length not a multiple of 3) is ignored.

    Returns:
        A dict with one ``codon_<CODON>`` key per entry of ``CODONS``.
        Values sum to 1 when at least one valid codon was seen, and are all
        ``0.0`` otherwise.

    Codons containing a non-ACGT character (e.g. N) are skipped and do not
    contribute to the denominator.
    """
    valid = CODONS_SET
    counts: Counter[str] = Counter()
    for cds in cds_nucleotides:
        s = cds.upper()
        # Stop at len - 2 so the final slice is always a full triplet.
        triplets = (s[i : i + 3] for i in range(0, len(s) - 2, 3))
        counts.update(codon for codon in triplets if codon in valid)
    return _relative_freqs(counts, CODONS, "codon_")
|
tests/test_composition.py
ADDED
|
@@ -0,0 +1,44 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Tests for tetranucleotide + codon-frequency features."""
|
| 2 |
+
from __future__ import annotations
|
| 3 |
+
|
| 4 |
+
from microbe_model.features.composition import codon_freqs, tetranucleotide_freqs
|
| 5 |
+
|
| 6 |
+
|
| 7 |
+
def test_tetranucleotide_freqs_sum_to_one() -> None:
    """A repeated-ACGT contig yields 256 features whose values add up to 1."""
    # 400 nt gives 397 overlapping 4-mers.
    freqs = tetranucleotide_freqs([("c1", "ACGT" * 100)])
    assert len(freqs) == 256
    deviation = abs(sum(freqs.values()) - 1.0)
    assert deviation < 1e-6
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
def test_tetranucleotide_freqs_handles_n() -> None:
    """Windows overlapping an N are skipped; only clean 4-mers are counted.

    The output keys are always drawn from the ACGT alphabet, so checking that
    no key contains "N" can never fail. Pin the exact surviving k-mers and
    their frequencies instead.
    """
    contigs = [("c1", "ACGNACGTACGT")]
    out = tetranucleotide_freqs(contigs)
    # 12 nt -> 9 windows. The first four (ACGN, CGNA, GNAC, NACG) touch the N.
    # The remaining five are ACGT, CGTA, GTAC, TACG, ACGT.
    nonzero = {k.removeprefix("tetra_"): v for k, v in out.items() if v > 0}
    assert set(nonzero) == {"ACGT", "CGTA", "GTAC", "TACG"}
    assert abs(nonzero["ACGT"] - 2 / 5) < 1e-9
    for kmer in ("CGTA", "GTAC", "TACG"):
        assert abs(nonzero[kmer] - 1 / 5) < 1e-9
    # Skipped windows must not inflate the denominator.
    assert abs(sum(out.values()) - 1.0) < 1e-9
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def test_tetranucleotide_freqs_empty() -> None:
    """With no contigs, all 256 features are present and exactly zero."""
    freqs = tetranucleotide_freqs([])
    assert len(freqs) == 256
    assert not any(value != 0.0 for value in freqs.values())
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
def test_codon_freqs_sum_to_one() -> None:
    """Codon frequencies cover all 64 codons, sum to 1, and weight ATG 30/31."""
    # One CDS: 30 ATG codons followed by a single stop codon.
    cds = "ATG" * 30 + "TAA"
    freqs = codon_freqs([cds])
    assert len(freqs) == 64
    assert abs(sum(freqs.values()) - 1.0) < 1e-6
    expected_atg = 30 / 31
    assert abs(freqs["codon_ATG"] - expected_atg) < 1e-6
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
def test_codon_freqs_skips_non_acgt() -> None:
    """An NNN codon is ignored, leaving ATG as the only counted codon."""
    freqs = codon_freqs(["ATGNNNATG"])
    # Both remaining codons are ATG, so it takes the whole probability mass.
    assert freqs["codon_ATG"] == 1.0
    assert sum(freqs.values()) == 1.0
|