| """ |
| Core mRNA sequence domain model. |
| |
| Designed to be flexible: different databases store sequence data differently. |
| Some customers have a single 'mrna_sequence' field; others split into UTR/CDS/PolyA. |
| The SchemaMapper normalizes those into this model. |
| """ |
| from __future__ import annotations |
|
|
| import uuid |
| from dataclasses import dataclass, field |
| from typing import Any, Dict, List, Literal, Optional |
|
|
|
|
@dataclass
class SequenceAnnotation:
    """
    Labelled interval on a sequence.

    Coordinates are 0-based and half-open: the region covers positions
    ``start`` up to, but not including, ``end``. No validation is applied
    to the coordinates, so ``length`` is negative when ``end < start``.
    """

    # Human-readable name of the region.
    label: str
    # Index of the first position inside the region.
    start: int
    # Index one past the last position inside the region.
    end: int
    # Strand marker; one of "+", "-" or ".". Defaults to "+".
    strand: Literal["+", "-", "."] = "+"
    # Optional colour string (callers in this file pass hex codes).
    color: Optional[str] = None
    # Free-form extra data; every instance gets its own dict.
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def length(self) -> int:
        """Number of positions spanned by the region (``end - start``)."""
        first, past_last = self.start, self.end
        return past_last - first
|
|
|
|
@dataclass
class mRNASequence:
    """
    Core mRNA sequence model.

    Every sequence component is optional because different databases store
    sequence data at different granularities. ``assembled_sequence``
    concatenates whichever components are present; if none are set it
    falls back to ``full_mrna``.
    """

    # (label, attribute name, display colour) in concatenation order.
    # Deliberately un-annotated so the dataclass machinery does not treat
    # it as a field. Single source of truth for assembled_sequence,
    # has_components and component_annotations, so offsets cannot drift
    # from the order in which components are joined.
    _COMPONENT_LAYOUT = (
        ("5'UTR", "five_prime_utr", "#4A90D9"),
        ("Kozak", "kozak", "#F5A623"),
        ("CDS", "cds", "#7ED321"),
        ("3'UTR", "three_prime_utr", "#9B59B6"),
        ("PolyA", "poly_a", "#E74C3C"),
    )

    # Display name; also used in the "no sequence" error message.
    name: str
    # Origin marker; with_cds() always resets this to "local".
    source: Literal["local", "database"]

    # Unique identifier; a fresh UUID4 string unless one is supplied.
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Optional identifier of the originating database; cleared by with_cds().
    db_source: Optional[str] = None

    # Individual sequence components, joined in this order when assembling.
    five_prime_utr: Optional[str] = None
    kozak: Optional[str] = None
    cds: Optional[str] = None
    three_prime_utr: Optional[str] = None
    poly_a: Optional[str] = None

    # Complete sequence as a single string; only used when no component is set.
    full_mrna: Optional[str] = None

    # Explicit annotations attached to the sequence (not serialised by to_dict).
    annotations: List[SequenceAnnotation] = field(default_factory=list)

    # Free-form metadata carried alongside the sequence.
    raw_metadata: Dict[str, Any] = field(default_factory=dict)

    # Scratch space for analysis results; hidden from the generated repr and
    # reset by with_cds(). Not serialised by to_dict().
    _analysis_cache: Dict[str, Any] = field(default_factory=dict, repr=False)

    def _component_values(self) -> List[Optional[str]]:
        """Return the raw component strings in concatenation order."""
        return [getattr(self, attr) for _, attr, _ in self._COMPONENT_LAYOUT]

    @property
    def assembled_sequence(self) -> str:
        """
        Return the full sequence, upper-cased.

        Components that are set are concatenated in layout order. If that
        yields an empty string, ``full_mrna`` is returned instead.

        Raises:
            ValueError: if no component and no ``full_mrna`` is set.
        """
        assembled = "".join(part or "" for part in self._component_values())
        if assembled:
            return assembled.upper()
        if self.full_mrna:
            return self.full_mrna.upper()
        raise ValueError(
            f"Sequence '{self.name}' has no components and no full_mrna set."
        )

    @property
    def has_components(self) -> bool:
        """True if at least one sub-component is set to a non-empty value."""
        return any(self._component_values())

    @property
    def component_annotations(self) -> List[SequenceAnnotation]:
        """
        Derive position annotations from the component breakdown.

        Each non-empty component yields one annotation whose half-open
        ``[start, end)`` range is its position within the concatenated
        components. Returns an empty list when no component is set.
        """
        derived: List[SequenceAnnotation] = []
        offset = 0
        for label, attr, color in self._COMPONENT_LAYOUT:
            seq = getattr(self, attr)
            if not seq:
                continue
            stop = offset + len(seq)
            derived.append(SequenceAnnotation(
                label=label,
                start=offset,
                end=stop,
                color=color,
            ))
            offset = stop
        return derived

    @property
    def length(self) -> int:
        """Length of the assembled sequence, or 0 when nothing is set."""
        try:
            return len(self.assembled_sequence)
        except ValueError:
            return 0

    @property
    def cds_length(self) -> Optional[int]:
        """Length of the CDS, or None when the CDS is unset or empty."""
        return len(self.cds) if self.cds else None

    def with_cds(self, cds: str) -> "mRNASequence":
        """
        Return a new mRNASequence with the CDS replaced.

        The copy gets a fresh id, ``source="local"``, no ``db_source`` and an
        empty analysis cache. All other fields, including ``full_mrna`` and
        the other components, are carried over. The original is not modified.

        Args:
            cds: replacement coding sequence; stored upper-cased.
        """
        from dataclasses import replace
        return replace(
            self,
            id=str(uuid.uuid4()),
            cds=cds.upper(),
            source="local",
            db_source=None,
            # replace() copies references, so without these shallow copies
            # the new object would share its containers with the original
            # and a mutation of one would leak into the other.
            annotations=list(self.annotations),
            raw_metadata=dict(self.raw_metadata),
            _analysis_cache={},
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialise the sequence to a plain dict.

        ``annotations`` and the analysis cache are not included. The
        returned ``raw_metadata`` is a shallow copy, so mutating the result
        does not affect this instance.
        """
        return {
            "id": self.id,
            "name": self.name,
            "source": self.source,
            "db_source": self.db_source,
            "five_prime_utr": self.five_prime_utr,
            "kozak": self.kozak,
            "cds": self.cds,
            "three_prime_utr": self.three_prime_utr,
            "poly_a": self.poly_a,
            "full_mrna": self.full_mrna,
            "raw_metadata": dict(self.raw_metadata),
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "mRNASequence":
        """
        Build an mRNASequence from a dict shaped like ``to_dict()`` output.

        Only ``name`` is required. A missing or ``None`` ``id`` gets a fresh
        UUID4, a missing or ``None`` ``source`` becomes ``"local"`` and a
        missing or ``None`` ``raw_metadata`` becomes an empty dict. The
        metadata is shallow-copied so the instance does not alias the
        caller's dict.

        Raises:
            KeyError: if ``name`` is absent.
        """
        seq_id = data.get("id")
        if seq_id is None:
            seq_id = str(uuid.uuid4())
        source = data.get("source")
        if source is None:
            source = "local"
        return cls(
            id=seq_id,
            name=data["name"],
            source=source,
            db_source=data.get("db_source"),
            five_prime_utr=data.get("five_prime_utr"),
            kozak=data.get("kozak"),
            cds=data.get("cds"),
            three_prime_utr=data.get("three_prime_utr"),
            poly_a=data.get("poly_a"),
            full_mrna=data.get("full_mrna"),
            raw_metadata=dict(data.get("raw_metadata") or {}),
        )

    def __repr__(self) -> str:
        """Compact repr showing name, source and assembled length only."""
        length = self.length
        return f"mRNASequence(name={self.name!r}, source={self.source!r}, length={length})"
|
|