File size: 6,642 Bytes
99f834c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
"""
Core mRNA sequence domain model.

Designed to be flexible: different databases store sequence data differently.
Some customers have a single 'mrna_sequence' field; others split into UTR/CDS/PolyA.
The SchemaMapper normalizes those into this model.
"""
from __future__ import annotations

import uuid
from dataclasses import dataclass, field
from typing import Any, Dict, List, Literal, Optional


@dataclass
class SequenceAnnotation:
    """
    Labelled interval on a sequence.

    Coordinates are 0-based and half-open: the region covers [start, end).
    """
    # Display name of the region
    label: str
    # Interval bounds: start is inclusive, end is exclusive
    start: int
    end: int
    # Strand marker; one of "+", "-" or "."
    strand: Literal["+", "-", "."] = "+"
    # Optional color string for display purposes
    color: Optional[str] = None
    # Free-form extra attributes; every instance gets its own dict
    metadata: Dict[str, Any] = field(default_factory=dict)

    @property
    def length(self) -> int:
        """Width of the interval, i.e. end minus start."""
        first, last = self.start, self.end
        return last - first


@dataclass
class mRNASequence:
    """
    Core mRNA sequence model.

    Components are all optional because different databases represent
    sequence data at different granularities. assembled_sequence will
    concatenate whichever components are present, or return full_mrna
    if the database provides the complete sequence as a single field.

    Serialization note: to_dict()/from_dict() cover the identity, provenance,
    sequence fields and raw_metadata only. Annotations and the analysis cache
    are not part of the serialized form.
    """
    name: str
    source: Literal["local", "database"]

    # Auto-generated unique identifier
    id: str = field(default_factory=lambda: str(uuid.uuid4()))

    # Which database connection this came from (None for local sequences)
    db_source: Optional[str] = None

    # ── Sequence components (all optional) ──────────────────────────────────
    # Stored as DNA (T not U) for computational convenience; displayed as RNA
    five_prime_utr: Optional[str] = None
    kozak: Optional[str] = None
    cds: Optional[str] = None
    three_prime_utr: Optional[str] = None
    poly_a: Optional[str] = None

    # Full pre-assembled sequence from DB (when component breakdown is unavailable)
    full_mrna: Optional[str] = None

    # Annotations populated by analysis or DB import
    annotations: List[SequenceAnnotation] = field(default_factory=list)

    # Raw database record — all original fields preserved for model use
    raw_metadata: Dict[str, Any] = field(default_factory=dict)

    # Analysis cache — populated lazily by SequenceAnalyzer
    _analysis_cache: Dict[str, Any] = field(default_factory=dict, repr=False)

    # ── Derived properties ──────────────────────────────────────────────────

    def _labelled_components(self) -> List[Any]:
        """
        Return (label, sequence) pairs in 5'→3' assembly order.

        Single source of truth for component ordering, so that the assembled
        string and the derived annotation offsets can never disagree.
        """
        return [
            ("5'UTR", self.five_prime_utr),
            ("Kozak", self.kozak),
            ("CDS", self.cds),
            ("3'UTR", self.three_prime_utr),
            ("PolyA", self.poly_a),
        ]

    @property
    def assembled_sequence(self) -> str:
        """
        Return the full sequence by concatenating present components.

        Falls back to full_mrna if no components are set. The result is
        always upper-cased.

        Raises:
            ValueError: if neither components nor full_mrna are available.
        """
        # Unset (None) and empty components contribute nothing.
        assembled = "".join(seq or "" for _, seq in self._labelled_components())
        if assembled:
            return assembled.upper()
        if self.full_mrna:
            return self.full_mrna.upper()
        raise ValueError(
            f"Sequence '{self.name}' has no components and no full_mrna set."
        )

    @property
    def has_components(self) -> bool:
        """True if at least one sub-component is explicitly set (non-empty)."""
        return any(seq for _, seq in self._labelled_components())

    @property
    def component_annotations(self) -> List[SequenceAnnotation]:
        """
        Auto-derive position annotations from the component breakdown.

        Offsets are 0-based, half-open, and relative to the concatenation of
        the present components. Returns an empty list when no component is set.
        """
        component_colors = {
            "5'UTR": "#4A90D9",
            "Kozak": "#F5A623",
            "CDS": "#7ED321",
            "3'UTR": "#9B59B6",
            "PolyA": "#E74C3C",
        }
        annotations = []
        pos = 0  # running offset into the assembled sequence
        for label, seq in self._labelled_components():
            if not seq:
                # Missing components occupy no space, so the offset is unchanged.
                continue
            end = pos + len(seq)
            annotations.append(SequenceAnnotation(
                label=label,
                start=pos,
                end=end,
                color=component_colors.get(label),
            ))
            pos = end
        return annotations

    @property
    def length(self) -> int:
        """Length of the assembled sequence, or 0 when no sequence is available."""
        try:
            return len(self.assembled_sequence)
        except ValueError:
            return 0

    @property
    def cds_length(self) -> Optional[int]:
        """Length of the CDS, or None when the CDS is unset or empty."""
        return len(self.cds) if self.cds else None

    # ── Mutation helpers ────────────────────────────────────────────────────

    def with_cds(self, cds: str) -> "mRNASequence":
        """
        Return a new mRNASequence with the CDS replaced.

        The copy gets a fresh id, is marked as a local sequence (source="local",
        db_source=None), stores the CDS upper-cased, and starts with an empty
        analysis cache. The original instance is left untouched.
        """
        from dataclasses import replace
        return replace(
            self,
            id=str(uuid.uuid4()),
            cds=cds.upper(),
            source="local",
            db_source=None,
            # dataclasses.replace copies shallowly; give the new instance its
            # own containers so later mutation does not leak into the original.
            # NOTE(review): stored annotations are carried over as-is; their
            # coordinates may be stale if the new CDS has a different length.
            annotations=list(self.annotations or []),
            raw_metadata=dict(self.raw_metadata or {}),
            _analysis_cache={},
        )

    def to_dict(self) -> Dict[str, Any]:
        """
        Serialize identity, provenance, sequence fields and raw_metadata.

        Annotations and the analysis cache are intentionally not included.
        """
        return {
            "id": self.id,
            "name": self.name,
            "source": self.source,
            "db_source": self.db_source,
            "five_prime_utr": self.five_prime_utr,
            "kozak": self.kozak,
            "cds": self.cds,
            "three_prime_utr": self.three_prime_utr,
            "poly_a": self.poly_a,
            "full_mrna": self.full_mrna,
            "raw_metadata": self.raw_metadata,
        }

    @classmethod
    def from_dict(cls, data: Dict[str, Any]) -> "mRNASequence":
        """
        Build an mRNASequence from a mapping shaped like to_dict() output.

        Only "name" is required. A missing or empty "id" gets a freshly
        generated UUID, "source" defaults to "local", and a missing or None
        "raw_metadata" becomes an empty dict.

        Raises:
            KeyError: if "name" is absent.
        """
        return cls(
            # `or` also covers an explicit None/empty id, and only generates a
            # UUID when one is actually needed.
            id=data.get("id") or str(uuid.uuid4()),
            name=data["name"],
            source=data.get("source", "local"),
            db_source=data.get("db_source"),
            five_prime_utr=data.get("five_prime_utr"),
            kozak=data.get("kozak"),
            cds=data.get("cds"),
            three_prime_utr=data.get("three_prime_utr"),
            poly_a=data.get("poly_a"),
            full_mrna=data.get("full_mrna"),
            # Copy so the model does not alias the caller's dict.
            raw_metadata=dict(data.get("raw_metadata") or {}),
        )

    def __repr__(self) -> str:
        """Compact representation: name, source and assembled length only."""
        length = self.length
        return f"mRNASequence(name={self.name!r}, source={self.source!r}, length={length})"