""" Codon Adaptation Index (CAI) calculation. Uses BioPython's CodonAdaptationIndex and codon usage tables. Supports human and a set of common lab organisms. Additional organisms can be added by providing a codon usage table as a dict. """ from __future__ import annotations import math from typing import Dict, Optional # Codon usage tables: {codon: relative_adaptiveness} # Tables below are RSCU (relative synonymous codon usage) normalized per # synonymous family to relative adaptiveness (0–1). Human table derived # from Homo sapiens Kazusa database (high-expression genes). _HUMAN_RSCU: Dict[str, float] = { # Phe "TTT": 0.55, "TTC": 1.00, # Leu "TTA": 0.07, "TTG": 0.19, "CTT": 0.42, "CTC": 0.68, "CTA": 0.16, "CTG": 1.00, # Ile "ATT": 0.71, "ATC": 1.00, "ATA": 0.31, # Met "ATG": 1.00, # Val "GTT": 0.46, "GTC": 0.62, "GTA": 0.27, "GTG": 1.00, # Ser "TCT": 0.85, "TCC": 1.00, "TCA": 0.69, "TCG": 0.27, "AGT": 0.72, "AGC": 0.97, # Pro "CCT": 0.85, "CCC": 1.00, "CCA": 0.83, "CCG": 0.22, # Thr "ACT": 0.74, "ACC": 1.00, "ACA": 0.77, "ACG": 0.27, # Ala "GCT": 0.91, "GCC": 1.00, "GCA": 0.67, "GCG": 0.19, # Tyr "TAT": 0.57, "TAC": 1.00, # Stop "TAA": 1.00, "TAG": 0.22, "TGA": 0.61, # His "CAT": 0.56, "CAC": 1.00, # Gln "CAA": 0.36, "CAG": 1.00, # Asn "AAT": 0.53, "AAC": 1.00, # Lys "AAA": 0.74, "AAG": 1.00, # Asp "GAT": 0.63, "GAC": 1.00, # Glu "GAA": 0.68, "GAG": 1.00, # Cys "TGT": 0.56, "TGC": 1.00, # Trp "TGG": 1.00, # Arg "CGT": 0.17, "CGC": 0.40, "CGA": 0.19, "CGG": 0.48, "AGA": 0.74, "AGG": 1.00, # Gly "GGT": 0.52, "GGC": 1.00, "GGA": 0.67, "GGG": 0.54, } _ECOLI_RSCU: Dict[str, float] = { # Phe "TTT": 1.00, "TTC": 0.59, # Leu "TTA": 0.49, "TTG": 0.74, "CTT": 0.68, "CTC": 0.39, "CTA": 0.24, "CTG": 1.00, # Ile "ATT": 1.00, "ATC": 0.82, "ATA": 0.19, # Met "ATG": 1.00, # Val "GTT": 1.00, "GTC": 0.60, "GTA": 0.73, "GTG": 0.72, # Ser "TCT": 0.92, "TCC": 0.52, "TCA": 0.46, "TCG": 0.46, "AGT": 0.72, "AGC": 1.00, # Pro "CCT": 0.63, "CCC": 0.27, "CCA": 0.67, "CCG": 1.00, # Thr "ACT": 0.95, "ACC": 1.00, "ACA": 0.47, "ACG": 0.81, # Ala "GCT": 0.92, "GCC": 0.70, "GCA": 0.91, "GCG": 1.00, # Tyr "TAT": 1.00, "TAC": 0.67, # Stop "TAA": 1.00, "TAG": 0.10, "TGA": 0.07, # His "CAT": 1.00, "CAC": 0.53, # Gln "CAA": 0.69, "CAG": 1.00, # Asn "AAT": 0.89, "AAC": 1.00, # Lys "AAA": 1.00, "AAG": 0.41, # Asp "GAT": 1.00, "GAC": 0.52, # Glu "GAA": 1.00, "GAG": 0.41, # Cys "TGT": 1.00, "TGC": 0.54, # Trp "TGG": 1.00, # Arg "CGT": 1.00, "CGC": 0.68, "CGA": 0.19, "CGG": 0.18, "AGA": 0.07, "AGG": 0.05, # Gly "GGT": 1.00, "GGC": 0.69, "GGA": 0.35, "GGG": 0.26, } CODON_TABLES: Dict[str, Dict[str, float]] = { "human": _HUMAN_RSCU, "ecoli": _ECOLI_RSCU, } def calculate_cai( cds: str, organism: str = "human", custom_table: Optional[Dict[str, float]] = None, ) -> float: """ Calculate the Codon Adaptation Index for a CDS. Parameters ---------- cds : str Coding sequence (DNA, T not U). Must start with ATG and be divisible by 3. Stop codon is excluded from the CAI calculation. organism : str Key into CODON_TABLES. Ignored if custom_table is provided. custom_table : dict, optional Custom {codon: relative_adaptiveness} table (values 0–1). Returns ------- float CAI value in [0, 1]. Higher is better adapted. """ seq = cds.upper().replace("U", "T") if len(seq) % 3 != 0: raise ValueError("CDS length is not divisible by 3.") table = custom_table if custom_table else CODON_TABLES.get(organism) if table is None: raise ValueError( f"Unknown organism '{organism}'. " f"Available: {list(CODON_TABLES.keys())}. " "Provide a custom_table to use another organism." ) codons = [seq[i:i+3] for i in range(0, len(seq), 3)] # Exclude stop codons from CAI stop_codons = {"TAA", "TAG", "TGA"} codons = [c for c in codons if c not in stop_codons] if not codons: return 0.0 log_sum = 0.0 unknown = [] for codon in codons: w = table.get(codon) if w is None or w <= 0: unknown.append(codon) continue log_sum += math.log(w) if unknown: # Non-standard codons (ambiguity codes, etc.) — skip gracefully n = len(codons) - len(unknown) else: n = len(codons) if n == 0: return 0.0 return math.exp(log_sum / n) def codon_usage_report(cds: str) -> Dict[str, int]: """Return a frequency count of each codon in the CDS.""" seq = cds.upper().replace("U", "T") if len(seq) % 3 != 0: raise ValueError("CDS length is not divisible by 3.") freq: Dict[str, int] = {} for i in range(0, len(seq), 3): codon = seq[i:i+3] freq[codon] = freq.get(codon, 0) + 1 return freq