Spaces:

miyuiu
/

microbe-model

Running

microbe-model / kaggle /microbe_model_code /microbe_model /features /composition.py

Miyu Horiuchi

Deploy app from main@a3254bf (no paper/ binaries)

0ed74db 4 days ago

2.46 kB

	"""Compositional features: k-mer frequencies and codon usage.

	These supplement the v0 amino-acid-composition features in `genome.py`. They are
	computed on the same predicted-CDS set, so adding them to a v1 featurize run is
	~free in network/CPU terms.

	Two feature groups:
	- tetranucleotide frequencies (256 dims) — well-known signal for thermophily,
	halophily, and phylum-level taxonomy
	- codon usage frequencies (64 dims) — informs translation efficiency, GC bias,
	and growth rate phenotype

	We use them as relative frequencies (sum to 1 across each group) rather than
	counts, so they're scale-invariant across genome sizes.
	"""
	from __future__ import annotations

	from collections import Counter
	from collections.abc import Iterable

	# Alphabet plus the two fixed feature orderings derived from it. Both lists are
	# in lexicographic ACGT order; the feature functions in this module iterate
	# them to build their output dicts, so this order is the output key order.
	NUCLEOTIDES = "ACGT"
	CODONS = [x + y + z for x in NUCLEOTIDES for y in NUCLEOTIDES for z in NUCLEOTIDES]
	# Extending every codon by one base visits the 4-mers in the same order as a
	# four-level nested loop over the alphabet.
	TETRA_KMERS = [prefix + base for prefix in CODONS for base in NUCLEOTIDES]


	def tetranucleotide_freqs(contigs: Iterable[tuple[str, str]]) -> dict[str, float]:
	    """Compute the share of each ACGT 4-mer among all counted 4-mers.

	    Args:
	        contigs: iterable of 2-tuples; only the second element (the sequence
	            string) is read, and it is upper-cased before counting.

	    Returns:
	        A dict with one ``tetra_<KMER>`` key per entry of ``TETRA_KMERS``, in
	        that order. Each value is that k-mer's count divided by the number
	        of counted windows; every value is 0.0 when nothing was counted.

	    Windows overlap (stride 1). A window that is not a member of
	    ``TETRA_KMERS_SET`` (e.g. one containing N) is ignored and does not
	    contribute to the denominator.
	    """
	    tally: Counter[str] = Counter()
	    for _name, sequence in contigs:
	        upper = sequence.upper()
	        windows = (upper[start : start + 4] for start in range(len(upper) - 3))
	        tally.update(w for w in windows if w in TETRA_KMERS_SET)

	    # Only accepted windows were tallied, so their sum is the denominator.
	    n_counted = sum(tally.values())
	    if not n_counted:
	        return dict.fromkeys((f"tetra_{k}" for k in TETRA_KMERS), 0.0)
	    return {f"tetra_{k}": tally[k] / n_counted for k in TETRA_KMERS}


	def codon_freqs(cds_nucleotides: Iterable[str]) -> dict[str, float]:
	    """Compute the share of each of the 64 codons across the given CDS strings.

	    Args:
	        cds_nucleotides: iterable of nucleotide strings. Each is upper-cased
	            and read in non-overlapping triplets starting at offset 0; a
	            trailing fragment shorter than three characters is not read.

	    Returns:
	        A dict with one ``codon_<CODON>`` key per entry of ``CODONS``, in that
	        order. Each value is that codon's count divided by the number of
	        counted codons; every value is 0.0 when nothing was counted.

	    Triplets that are not members of ``CODONS_SET`` (e.g. containing N) are
	    ignored and do not contribute to the denominator.
	    """
	    tally: Counter[str] = Counter()
	    n_counted = 0
	    for cds in cds_nucleotides:
	        upper = cds.upper()
	        for offset in range(0, len(upper) - 2, 3):
	            triplet = upper[offset : offset + 3]
	            if triplet not in CODONS_SET:
	                continue
	            tally[triplet] += 1
	            n_counted += 1

	    if n_counted == 0:
	        return {f"codon_{k}": 0.0 for k in CODONS}
	    denominator = n_counted
	    return {f"codon_{k}": tally.get(k, 0) / denominator for k in CODONS}


	# Hash-set copies of the k-mer lists, used by the counting loops for membership
	# tests. They sit below the functions that use them; that works because the
	# names are resolved when the functions are called, not when they are defined.
	TETRA_KMERS_SET = {*TETRA_KMERS}
	CODONS_SET = {*CODONS}