mrna-design-studio / core /models /sequence.py
offtargeteffect's picture
Deploy mRNA Design Studio (Docker SDK)
99f834c verified
Raw
History Blame Contribute Delete
6.64 kB
"""
Core mRNA sequence domain model.
Designed to be flexible: different databases store sequence data differently.
Some customers have a single 'mrna_sequence' field; others split into UTR/CDS/PolyA.
The SchemaMapper normalizes those into this model.
"""
from __future__ import annotations
import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional
@dataclass
class SequenceAnnotation:
    """
    Labelled interval on a sequence.

    Coordinates are 0-based and half-open: the region covers positions
    ``start`` up to, but not including, ``end``.
    """

    # Name given to the region
    label: str
    # First position of the region (inclusive)
    start: int
    # Position immediately after the region (exclusive)
    end: int
    # One of "+", "-" or "."; "+" unless specified otherwise
    strand: Literal["+", "-", "."] = "+"
    # Optional colour string; None when not set
    color: Optional[str] = None
    # Free-form extra data; each instance gets its own fresh dict
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def length(self) -> int:
        """Number of positions spanned by the region (``end - start``)."""
        first, stop = self.start, self.end
        return stop - first
@dataclass
class mRNASequence:
    """
    Core mRNA sequence model.

    Components are all optional because different databases represent
    sequence data at different granularities. ``assembled_sequence``
    concatenates whichever components are present, or returns ``full_mrna``
    if the database provides the complete sequence as a single field.
    """

    # Single source of truth for the component layout, in 5'→3' order:
    # (annotation label, attribute name, display colour).
    # Deliberately un-annotated so @dataclass does not treat it as a field.
    _COMPONENT_LAYOUT = (
        ("5'UTR", "five_prime_utr", "#4A90D9"),
        ("Kozak", "kozak", "#F5A623"),
        ("CDS", "cds", "#7ED321"),
        ("3'UTR", "three_prime_utr", "#9B59B6"),
        ("PolyA", "poly_a", "#E74C3C"),
    )

    name: str
    source: Literal["local", "database"]

    # Auto-generated unique identifier
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Which database connection this came from (None for local sequences)
    db_source: Optional[str] = None

    # ── Sequence components (all optional) ──────────────────────────────────
    # Stored as DNA (T not U) for computational convenience; displayed as RNA
    five_prime_utr: Optional[str] = None
    kozak: Optional[str] = None
    cds: Optional[str] = None
    three_prime_utr: Optional[str] = None
    poly_a: Optional[str] = None

    # Full pre-assembled sequence from DB (when component breakdown is unavailable)
    full_mrna: Optional[str] = None

    # Annotations populated by analysis or DB import
    annotations: List["SequenceAnnotation"] = field(default_factory=list)

    # Raw database record — all original fields preserved for model use
    raw_metadata: Dict[str, Any] = field(default_factory=dict)

    # Analysis cache — populated lazily by SequenceAnalyzer
    _analysis_cache: Dict[str, Any] = field(default_factory=dict, repr=False)

    # ── Derived properties ──────────────────────────────────────────────────
    def _present_components(self) -> List[tuple]:
        """Return ``(label, sequence, color)`` for each non-empty component, 5'→3'."""
        present = []
        for label, attr, color in self._COMPONENT_LAYOUT:
            seq = getattr(self, attr)
            if seq:  # skips both None and ""
                present.append((label, seq, color))
        return present

    @property
    def assembled_sequence(self) -> str:
        """
        Return the full sequence, upper-cased.

        Concatenates the components that are set, in 5'→3' order. Falls back
        to ``full_mrna`` only if no component is set.

        Raises:
            ValueError: if neither components nor ``full_mrna`` are available.
        """
        assembled = "".join(seq for _, seq, _ in self._present_components())
        if assembled:
            return assembled.upper()
        if self.full_mrna:
            return self.full_mrna.upper()
        raise ValueError(
            f"Sequence '{self.name}' has no components and no full_mrna set."
        )

    @property
    def has_components(self) -> bool:
        """True if at least one sub-component is set to a non-empty value."""
        return bool(self._present_components())

    @property
    def component_annotations(self) -> List["SequenceAnnotation"]:
        """
        Auto-derive position annotations from the component breakdown.

        Coordinates are 0-based, half-open offsets into the concatenation of
        the present components. Returns an empty list when no component is set.
        """
        derived = []
        pos = 0
        for label, seq, color in self._present_components():
            end = pos + len(seq)
            derived.append(SequenceAnnotation(
                label=label,
                start=pos,
                end=end,
                color=color,
            ))
            pos = end
        return derived

    @property
    def length(self) -> int:
        """Length of ``assembled_sequence``, or 0 when no sequence is available."""
        try:
            return len(self.assembled_sequence)
        except ValueError:
            return 0

    @property
    def cds_length(self) -> Optional[int]:
        """Length of the CDS, or None when the CDS is unset or empty."""
        return len(self.cds) if self.cds else None

    # ── Mutation helpers ────────────────────────────────────────────────────
    def with_cds(self, cds: str) -> "mRNASequence":
        """
        Return a new mRNASequence with the CDS replaced.

        The copy gets a fresh ``id``, is marked as ``source="local"`` with no
        ``db_source``, and starts with an empty analysis cache. ``annotations``
        and ``raw_metadata`` are copied into new containers so that later
        mutation of the copy cannot leak back into this instance.

        NOTE: stored ``annotations`` are carried over unchanged; their
        coordinates are not recomputed for the new CDS. Also, once a CDS is
        set, ``assembled_sequence`` is built from components and no longer
        falls back to ``full_mrna``.
        """
        from dataclasses import replace

        return replace(
            self,
            id=str(uuid.uuid4()),
            cds=cds.upper(),
            source="local",
            db_source=None,
            # New containers: dataclasses.replace would otherwise reuse the
            # very same list/dict objects in both instances.
            annotations=list(self.annotations or []),
            raw_metadata=dict(self.raw_metadata or {}),
            _analysis_cache={},
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize identity, provenance, sequence fields and raw metadata.

        ``annotations`` and the analysis cache are not included. The returned
        ``raw_metadata`` is a shallow copy, so editing the result does not
        modify this instance.
        """
        return {
            "id": self.id,
            "name": self.name,
            "source": self.source,
            "db_source": self.db_source,
            "five_prime_utr": self.five_prime_utr,
            "kozak": self.kozak,
            "cds": self.cds,
            "three_prime_utr": self.three_prime_utr,
            "poly_a": self.poly_a,
            "full_mrna": self.full_mrna,
            "raw_metadata": dict(self.raw_metadata or {}),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "mRNASequence":
        """
        Build an instance from a mapping shaped like ``to_dict()`` output.

        Only ``name`` is required. A missing or null ``id`` gets a new UUID,
        a missing or null ``source`` becomes ``"local"``, and a missing or
        null ``raw_metadata`` becomes an empty dict. ``raw_metadata`` is
        shallow-copied so the instance does not alias the caller's dict.

        Raises:
            KeyError: if ``data`` has no ``"name"`` key.
        """
        return cls(
            # `or` (not a .get default) so an explicit None is also replaced,
            # and the UUID is only generated when actually needed.
            id=data.get("id") or str(uuid.uuid4()),
            name=data["name"],
            source=data.get("source") or "local",
            db_source=data.get("db_source"),
            five_prime_utr=data.get("five_prime_utr"),
            kozak=data.get("kozak"),
            cds=data.get("cds"),
            three_prime_utr=data.get("three_prime_utr"),
            poly_a=data.get("poly_a"),
            full_mrna=data.get("full_mrna"),
            raw_metadata=dict(data.get("raw_metadata") or {}),
        )

    def __repr__(self) -> str:
        """Compact representation: name, source and assembled length only."""
        length = self.length
        return f"mRNASequence(name={self.name!r}, source={self.source!r}, length={length})"