Spaces:
Sleeping
Sleeping
| """ | |
| Dinucleotide frequency analysis. | |
| CpG dinucleotides are immunostimulatory in mammals (recognized by TLR9). | |
| UpA dinucleotides are associated with mRNA instability (targeted by | |
| cellular RNases). Quantifying these helps optimize mRNA therapeutics. | |
| """ | |
| from __future__ import annotations | |
| from dataclasses import dataclass, field | |
| from typing import Dict, List, Tuple | |
@dataclass
class DinucleotideReport:
    """Dinucleotide frequency analysis results.

    Declared as a dataclass so the report can be built with keyword
    arguments and so every instance gets its own fresh position lists
    (via ``field(default_factory=list)``) instead of sharing one object.
    """

    # Raw count of each dinucleotide observed, keyed by the two-character string.
    frequencies: Dict[str, int]
    # Each count divided by the total number of dinucleotides.
    normalized: Dict[str, float]
    # Number of CpG ("CG") dinucleotides reported by the analysis.
    cpg_count: int = 0
    # Number of UpA ("UA", or "TA" in DNA) dinucleotides reported by the analysis.
    upa_count: int = 0
    # Start indices of recorded CpG dinucleotides.
    cpg_positions: List[int] = field(default_factory=list)
    # Start indices of recorded UpA/TpA dinucleotides.
    upa_positions: List[int] = field(default_factory=list)
    # Total number of dinucleotide windows counted.
    total_dinucleotides: int = 0
def analyze_dinucleotides(
    sequence: str,
    flag_cpg: bool = True,
    flag_upa: bool = True,
) -> DinucleotideReport:
    """
    Analyze dinucleotide frequencies in a nucleotide sequence.

    The sequence is upper-cased and scanned with an overlapping window of
    width two, so a sequence of length ``n`` yields ``n - 1`` dinucleotides
    (none when ``n < 2``). No alphabet validation is performed: every
    two-character window is counted as-is.

    Parameters
    ----------
    sequence : str
        DNA or RNA sequence.
    flag_cpg : bool
        Track CpG positions. When False, ``cpg_positions`` is left empty.
    flag_upa : bool
        Track UpA (TpA in DNA) positions. When False, ``upa_positions`` is
        left empty.

    Returns
    -------
    DinucleotideReport
        Raw and normalized dinucleotide frequencies, the CpG and UpA/TpA
        counts, the 0-based start indices of the tracked dinucleotides,
        and the total number of dinucleotides counted.
    """
    seq = sequence.upper()
    frequencies: Dict[str, int] = {}
    cpg_positions: List[int] = []
    upa_positions: List[int] = []

    for i in range(len(seq) - 1):
        di = seq[i:i + 2]
        frequencies[di] = frequencies.get(di, 0) + 1
        if flag_cpg and di == "CG":
            cpg_positions.append(i)
        if flag_upa and di in ("TA", "UA"):
            upa_positions.append(i)

    total = sum(frequencies.values())
    # An empty table means nothing to iterate, so no division by zero occurs.
    normalized = {di: count / total for di, count in frequencies.items()}

    # Take the counts from the frequency table rather than the position
    # lists: the flags only control position tracking, so the counts must
    # stay consistent with `frequencies` even when tracking is disabled.
    cpg_count = frequencies.get("CG", 0)
    upa_count = frequencies.get("TA", 0) + frequencies.get("UA", 0)

    return DinucleotideReport(
        frequencies=frequencies,
        normalized=normalized,
        cpg_count=cpg_count,
        upa_count=upa_count,
        cpg_positions=cpg_positions,
        upa_positions=upa_positions,
        total_dinucleotides=total,
    )