mrna-design-studio / core /analysis /dinucleotide.py
offtargeteffect's picture
Deploy mRNA Design Studio (Docker SDK)
99f834c verified
Raw
History Blame Contribute Delete
2.06 kB
"""
Dinucleotide frequency analysis.
CpG dinucleotides are immunostimulatory in mammals (recognized by TLR9).
UpA dinucleotides are associated with mRNA instability (targeted by
cellular RNases). Quantifying these helps optimize mRNA therapeutics.
"""
from __future__ import annotations
from dataclasses import dataclass, field
from typing import Dict, List, Tuple
@dataclass
class DinucleotideReport:
    """Container for the outcome of a dinucleotide scan.

    Attributes
    ----------
    frequencies : Dict[str, int]
        Count of every dinucleotide that was seen.
    normalized : Dict[str, float]
        Each count divided by the total number of dinucleotides.
    cpg_count : int
        Number of recorded CpG occurrences.
    upa_count : int
        Number of recorded UpA occurrences.
    cpg_positions : List[int]
        Positions at which CpG was recorded.
    upa_positions : List[int]
        Positions at which UpA was recorded.
    total_dinucleotides : int
        Total number of dinucleotides counted.
    """

    # The two required fields come first and keep their order so that
    # positional construction continues to work.
    frequencies: Dict[str, int]
    normalized: Dict[str, float]

    cpg_count: int = 0
    upa_count: int = 0
    # default_factory hands each instance its own fresh list.
    cpg_positions: List[int] = field(default_factory=list)
    upa_positions: List[int] = field(default_factory=list)
    total_dinucleotides: int = 0
def analyze_dinucleotides(
    sequence: str,
    flag_cpg: bool = True,
    flag_upa: bool = True,
) -> DinucleotideReport:
    """
    Count every overlapping dinucleotide in a nucleotide sequence.

    The sequence is upper-cased and then scanned with a two-character
    sliding window (step 1). No alphabet validation is performed: every
    window is tallied as-is.

    Parameters
    ----------
    sequence : str
        DNA or RNA sequence.
    flag_cpg : bool
        Record the positions of "CG" windows. When False, ``cpg_positions``
        is empty and ``cpg_count`` is 0, although ``frequencies`` still
        contains the "CG" tally.
    flag_upa : bool
        Record the positions of "UA" windows ("TA" in DNA). When False,
        ``upa_positions`` is empty and ``upa_count`` is 0, although
        ``frequencies`` still contains the tallies.

    Returns
    -------
    DinucleotideReport
        Counts, normalized frequencies and the recorded CpG / UpA
        positions. Positions are 0-based indices of the first base of the
        window in the upper-cased sequence.
    """
    bases = sequence.upper()
    # Materialize every overlapping two-character window up front; a
    # sequence shorter than two characters yields no windows at all.
    windows = [bases[start:start + 2] for start in range(len(bases) - 1)]

    counts: Dict[str, int] = {}
    cpg_sites: List[int] = []
    upa_sites: List[int] = []

    for offset, pair in enumerate(windows):
        if pair in counts:
            counts[pair] += 1
        else:
            counts[pair] = 1

        if flag_cpg and pair == "CG":
            cpg_sites.append(offset)
        if flag_upa and (pair == "TA" or pair == "UA"):
            upa_sites.append(offset)

    # Every window contributed exactly one count, so the grand total is
    # simply the number of windows.
    pair_total = len(windows)

    fractions: Dict[str, float] = {}
    if pair_total > 0:
        for pair, count in counts.items():
            fractions[pair] = count / pair_total

    return DinucleotideReport(
        frequencies=counts,
        normalized=fractions,
        cpg_count=len(cpg_sites),
        upa_count=len(upa_sites),
        cpg_positions=cpg_sites,
        upa_positions=upa_sites,
        total_dinucleotides=pair_total,
    )