File size: 22,425 Bytes
060bb47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
"""Phase 1 — Sequence parsing, validation, and translation.

Accepts a raw nucleotide or amino-acid string, a FASTA file/string, or a
GenBank/SnapGene/EMBL file, and produces a :class:`SequenceRecord` carrying
the validated DNA (empty for protein input), the amino-acid sequence, and an
index map between nucleotide coordinates and AA coordinates.

The translation step uses the NCBI standard genetic code (Table 1) via
Biopython. Ambiguous IUPAC codons are rejected at translation time because
PLM scoring requires a fully determined wild-type protein.
"""

from __future__ import annotations

import io
import logging
import re
from dataclasses import dataclass, field
from pathlib import Path
from typing import Dict, List, Optional, Tuple, Union

from Bio import SeqIO
from Bio.Data.CodonTable import TranslationError
from Bio.Seq import Seq

logger = logging.getLogger(__name__)

# Nucleotide alphabets: the four unambiguous DNA bases, and the wider IUPAC
# set that additionally admits uracil and the ambiguity codes.
UNAMBIGUOUS_NT = {"A", "C", "G", "T"}
IUPAC_NT = UNAMBIGUOUS_NT | set("URYSWKMBDHVN")

# Codons accepted as the first / last codon of a CDS.
START_CODONS = {"ATG"}
STOP_CODONS = {"TAA", "TAG", "TGA"}

# The 20 canonical amino acids, and the same set plus the stop symbol. Used
# when the input is a protein sequence rather than a CDS (raw paste or
# .faa/.pep file).
AA_ALPHABET = set("ACDEFGHIKLMNPQRSTVWY")
AA_WITH_STOP = AA_ALPHABET.union("*")


class SequenceValidationError(ValueError):
    """Signal that an input sequence failed a validation check.

    Being a :class:`ValueError` subclass, it is caught by existing
    ``except ValueError`` handlers.

    Attributes:
        nt_position: 0-indexed nucleotide position of the offending character,
            or ``None`` when no position is known. Lets the UI map the error
            back to a line/column in the user's paste.
        code: Short machine-readable tag for the failure (for example
            ``"invalid_char"``), or ``None`` when the raiser supplied none.
    """

    def __init__(
        self,
        message: str,
        *,
        nt_position: Optional[int] = None,
        code: Optional[str] = None,
    ) -> None:
        # Both extras are keyword-only so call sites stay self-describing.
        self.nt_position, self.code = nt_position, code
        super().__init__(message)


@dataclass
class SequenceRecord:
    """A sequence, its protein, and the nucleotide/residue coordinate maps.

    All coordinates are 0-indexed. ``len(record)`` is the protein length.

    Attributes:
        identifier: FASTA header or user-supplied label.
        dna: Coding DNA (5' -> 3', uppercase). Empty string when the record
            was built directly from a protein sequence.
        protein: Amino-acid sequence with no trailing stop symbol.
        has_stop: Whether the source CDS ended in a canonical stop codon.
        nt_to_aa: Nucleotide position -> residue position. Stop-codon
            nucleotides map to ``None``.
        aa_to_nt: Residue position -> the three nucleotide positions of the
            codon that encodes it.
    """

    identifier: str
    dna: str
    protein: str
    has_stop: bool
    # default_factory gives every record its own dict instead of a shared one.
    nt_to_aa: Dict[int, Optional[int]] = field(default_factory=dict)
    aa_to_nt: Dict[int, Tuple[int, int, int]] = field(default_factory=dict)

    def __len__(self) -> int:
        """Return the number of residues in :attr:`protein`."""
        n_residues = len(self.protein)
        return n_residues


def _clean(seq: str) -> str:
    """Strip whitespace, normalize uracil to thymine, uppercase."""
    return re.sub(r"\s+", "", seq).upper().replace("U", "T")


def _clean_protein(seq: str) -> str:
    """Strip whitespace, drop trailing stop symbol, uppercase. No U→T mapping."""
    cleaned = re.sub(r"\s+", "", seq).upper().rstrip("*")
    return cleaned


def looks_like_protein(seq: str) -> bool:
    """Guess whether ``seq`` is a protein rather than a nucleic-acid sequence.

    Every DNA letter is also a valid amino-acid letter (e.g. "ATGCAT"), so the
    test cannot be "only AA characters". Instead the sequence is called a
    protein as soon as one character falls outside {A, C, G, T, U, N} — an E,
    F, I, L, P, Q, ... is a clear amino-acid signal. Whitespace and trailing
    ``*`` are ignored; an empty sequence is not a protein.
    """
    residues = re.sub(r"\s+", "", seq).upper().rstrip("*")
    nucleotide_letters = set("ACGTUN")
    # any() over an empty string is False, which covers the empty-input case.
    return any(ch not in nucleotide_letters for ch in residues)


def validate_dna(seq: str, *, require_start: bool = True, require_stop: bool = True) -> str:
    """Check that ``seq`` is a plausible CDS and return its cleaned form.

    The checks run in this order and the first failure is raised:

      1. The cleaned sequence is non-empty.
      2. Every character is an IUPAC nucleotide symbol (``invalid_char``).
      3. The length is a multiple of 3 (``bad_length``).
      4. The first codon is ATG, unless ``require_start=False`` (``no_start``).
      5. The last codon is a stop, unless ``require_stop=False`` (``no_stop``).
      6. No stop codon occurs in frame before the terminal stop(s)
         (``premature_stop``).

    Returns:
        The sequence with whitespace removed, uppercased, and U mapped to T.

    Raises:
        SequenceValidationError: On the first failed check, with
            ``nt_position`` set wherever a position is meaningful.
    """
    cleaned = _clean(seq)
    n_nt = len(cleaned)
    if n_nt == 0:
        raise SequenceValidationError("Empty sequence after cleaning.")

    illegal = set(cleaned).difference(IUPAC_NT)
    if illegal:
        # Report the earliest offending character so the UI can point at it.
        first_bad = min(cleaned.index(ch) for ch in illegal)
        raise SequenceValidationError(
            f"Sequence contains non-IUPAC characters: {sorted(illegal)!r}",
            nt_position=first_bad,
            code="invalid_char",
        )

    leftover = n_nt % 3
    if leftover:
        # Position of the first nucleotide of the incomplete trailing codon.
        raise SequenceValidationError(
            f"Sequence length ({n_nt} nt) is not a multiple of 3; "
            "cannot translate as a CDS.",
            nt_position=n_nt - leftover,
            code="bad_length",
        )

    first_codon = cleaned[:3]
    if require_start and first_codon not in START_CODONS:
        raise SequenceValidationError(
            f"Sequence does not begin with a start codon (saw {first_codon!r}).",
            nt_position=0,
            code="no_start",
        )

    last_codon = cleaned[-3:]
    if require_stop and last_codon not in STOP_CODONS:
        raise SequenceValidationError(
            f"Sequence does not end with a stop codon (saw {last_codon!r}).",
            nt_position=n_nt - 3,
            code="no_stop",
        )

    # Tandem terminal stops (double/triple TAA/TGA for fail-safe termination)
    # are not "premature": walk the end of the coding region back over every
    # trailing stop, but always leave at least one codon to scan.
    coding_end = n_nt
    while coding_end >= 6 and cleaned[coding_end - 3 : coding_end] in STOP_CODONS:
        coding_end -= 3

    for pos in range(0, coding_end, 3):
        codon = cleaned[pos : pos + 3]
        if codon in STOP_CODONS:
            raise SequenceValidationError(
                f"Premature stop codon at nucleotide position {pos} ({codon}).",
                nt_position=pos,
                code="premature_stop",
            )

    return cleaned


def _build_index_maps(
    n_nt: int, has_stop: bool
) -> Tuple[Dict[int, Optional[int]], Dict[int, Tuple[int, int, int]]]:
    """Construct bidirectional nt<->aa coordinate maps for a CDS of length ``n_nt``."""
    n_codons = n_nt // 3
    n_aa = n_codons - 1 if has_stop else n_codons

    nt_to_aa: Dict[int, Optional[int]] = {}
    aa_to_nt: Dict[int, Tuple[int, int, int]] = {}
    for aa_idx in range(n_aa):
        triplet = (aa_idx * 3, aa_idx * 3 + 1, aa_idx * 3 + 2)
        aa_to_nt[aa_idx] = triplet
        for nt in triplet:
            nt_to_aa[nt] = aa_idx
    if has_stop:
        for nt in range(n_aa * 3, n_nt):
            nt_to_aa[nt] = None
    return nt_to_aa, aa_to_nt


def translate_dna(
    dna: str,
    identifier: str = "query",
    *,
    require_start: bool = True,
    require_stop: bool = True,
) -> SequenceRecord:
    """Validate and translate ``dna`` into a :class:`SequenceRecord`.

    Uses NCBI Table 1 (standard) genetic code via Biopython.

    Args:
        dna: Raw CDS text; cleaned and checked by :func:`validate_dna`.
        identifier: Label stored on the returned record.
        require_start: Forwarded to :func:`validate_dna`.
        require_stop: Forwarded to :func:`validate_dna`.

    Returns:
        A record whose ``protein`` has no trailing stop and whose index maps
        cover exactly ``len(protein)`` residues; every nucleotide of the
        terminal stop codon(s) maps to ``None``.

    Raises:
        SequenceValidationError: If validation fails, if the sequence
            contains IUPAC ambiguity codes such as N or R (PLM scoring
            requires a determined protein sequence), or if Biopython cannot
            translate it.
    """
    cleaned = validate_dna(dna, require_start=require_start, require_stop=require_stop)

    # Reject ambiguity before translating, and report where it is so the UI
    # can highlight the offending base.
    first_ambiguous = next(
        (idx for idx, ch in enumerate(cleaned) if ch not in UNAMBIGUOUS_NT), None
    )
    if first_ambiguous is not None:
        raise SequenceValidationError(
            "Sequence contains ambiguous IUPAC codes; PLM scoring requires a "
            "fully determined wild-type. Resolve ambiguity before scoring.",
            nt_position=first_ambiguous,
            code="ambiguous",
        )

    # Treat a terminal stop as a stop regardless of the require_stop flag: when
    # the user pastes a CDS but disables stop checking, we still want a clean
    # protein (no trailing "*") for downstream scoring.
    has_stop = cleaned[-3:] in STOP_CODONS
    try:
        protein = str(Seq(cleaned).translate(table=1, to_stop=has_stop, cds=False))
    except TranslationError as exc:
        raise SequenceValidationError(f"Translation failed: {exc}") from exc

    # Size the maps from the protein, not from "codons minus one":
    # validate_dna accepts tandem terminal stops (e.g. ...TAATAA), and every
    # one of those codons must map to None rather than to a residue.
    coding_nt = 3 * len(protein)
    nt_to_aa, aa_to_nt = _build_index_maps(coding_nt, has_stop=False)
    for nt in range(coding_nt, len(cleaned)):
        nt_to_aa[nt] = None

    logger.info(
        "Translated %s: %d nt -> %d aa (terminal stop=%s)",
        identifier,
        len(cleaned),
        len(protein),
        has_stop,
    )

    return SequenceRecord(
        identifier=identifier,
        dna=cleaned,
        protein=protein,
        has_stop=has_stop,
        nt_to_aa=nt_to_aa,
        aa_to_nt=aa_to_nt,
    )


# Common selection-marker name patterns. When a plasmid annotation contains
# any of these substrings we deprioritize that CDS as a default pick, because
# the user almost certainly wants to evolve their gene of interest rather
# than the antibiotic resistance gene that came with the cloning vector.
_SELECTION_MARKERS = {
    "aph", "neo", "npt", "kan",      # kanamycin / G418
    "bla", "amp",                      # ampicillin
    "cat", "cmr", "chlor",            # chloramphenicol
    "tet",                              # tetracycline
    "hph", "hyg",                      # hygromycin
    "ble", "zeo",                      # zeocin / phleomycin
    "pac", "puro",                     # puromycin
    "sm", "sptr", "aada",             # streptomycin / spectinomycin
}


def _feature_label(feature) -> str:
    """Best-effort human-readable label for a SeqRecord feature."""
    for key in ("gene", "product", "label", "note"):
        vals = feature.qualifiers.get(key)
        if vals:
            return str(vals[0])
    return feature.type or "unnamed"


def _is_selection_marker(label: str) -> bool:
    """Return True when ``label`` contains any selection-marker fragment.

    Matching is a case-insensitive substring test against
    ``_SELECTION_MARKERS``.

    NOTE(review): because this is a plain substring test, short fragments
    such as "sm" or "cat" also match unrelated labels (e.g. "plasmin",
    "catalase"). Confirm that is acceptable for the default-CDS heuristic.
    """
    lowered = label.lower()
    for marker in _SELECTION_MARKERS:
        if marker in lowered:
            return True
    return False


def list_cds_features(path: Path) -> List[Tuple[str, int]]:
    """Return [(label, length_nt), ...] for every CDS feature in a structured file.

    Useful for the desktop launcher: it can present these as a chooser so the
    user explicitly picks the gene to evolve (rather than having the engine
    guess the wrong CDS, e.g. the kanamycin resistance marker).

    Only the first record of the file is inspected; a file with no records
    yields an empty list.

    Args:
        path: A SnapGene (.dna), GenBank (.gb/.gbk/.genbank) or EMBL (.embl)
            file.

    Returns:
        One ``(label, length_nt)`` pair per CDS feature, in file order.
        ``length_nt`` is the number of nucleotides the feature covers, i.e.
        the length of the sequence that ``feature.extract`` would return.

    Raises:
        KeyError: If the file suffix is not one of the supported formats.
    """
    fmt = _structured_format_for(path.suffix.lower())
    record = next(SeqIO.parse(str(path), fmt), None)
    if record is None:
        return []
    # len(location) rather than end - start: for a compound location (a join,
    # or a CDS that wraps the origin of a circular plasmid) the span between
    # the outermost coordinates overstates the coding length.
    return [
        (_feature_label(f), len(f.location))
        for f in record.features
        if f.type == "CDS"
    ]


def _structured_format_for(suffix: str) -> str:
    return {
        ".dna": "snapgene",
        ".gb": "genbank",
        ".gbk": "genbank",
        ".genbank": "genbank",
        ".embl": "embl",
    }[suffix]


def _parse_structured_file(
    path: Path, *, cds_feature: Optional[str] = None
) -> SequenceRecord:
    """Parse a GenBank, SnapGene, or EMBL file and pick one CDS to evolve.

    Only the first record of the file is used.

    Selection rules (first match wins):
      1. If ``cds_feature`` is given and the record has CDS features, find the
         CDS whose label contains that substring (case-insensitive). Raises if
         no match — better to fail loudly than silently evolve the wrong gene.
      2. Otherwise pick the longest CDS that is NOT a recognized selection
         marker (kanR/ampR/cmR/etc.). Selection markers are skipped first so
         the user's gene of interest wins by default in a typical plasmid.
      3. If every CDS looks like a selection marker, fall back to the longest.
      4. If no CDS features exist at all, scan the six reading frames for the
         longest ORF (ATG...stop). A ``cds_feature`` request cannot be
         honoured in that case; a warning is logged.

    Args:
        path: File to read; its suffix selects the parser.
        cds_feature: Optional label substring naming the CDS to use.

    Returns:
        The translated record for the chosen CDS (or for the longest ORF).

    Raises:
        SequenceValidationError: If the file holds no records, if
            ``cds_feature`` matches no CDS, or if the chosen sequence fails
            validation/translation.
        KeyError: If the file suffix is not a supported structured format.
    """
    fmt = _structured_format_for(path.suffix.lower())
    record = next(SeqIO.parse(str(path), fmt), None)
    if record is None:
        raise SequenceValidationError(f"No records found in {path}.")

    cds_features = [f for f in record.features if f.type == "CDS"]

    if not cds_features:
        if cds_feature:
            # The requested label has nothing to match against; say so
            # instead of silently picking an ORF.
            logger.warning(
                "Requested CDS feature %r but %s has no CDS annotations; "
                "falling back to a six-frame ORF scan.",
                cds_feature,
                path.name,
            )
        return _longest_orf_record(str(record.seq), identifier=record.id or path.stem)

    if cds_feature:
        wanted = cds_feature.lower()
        chosen = next(
            (f for f in cds_features if wanted in _feature_label(f).lower()),
            None,
        )
        if chosen is None:
            available = ", ".join(_feature_label(f) for f in cds_features)
            raise SequenceValidationError(
                f"No CDS feature matching {cds_feature!r} in {path.name}. "
                f"Available CDS: {available}"
            )
    else:
        non_marker = [f for f in cds_features if not _is_selection_marker(_feature_label(f))]
        pool = non_marker if non_marker else cds_features
        # Rank by len(location), the number of nucleotides actually covered.
        # end - start would overstate a compound location (a join, or a CDS
        # wrapping the origin of a circular plasmid) and could make it win.
        chosen = max(pool, key=lambda f: len(f.location))

    cds_dna = str(chosen.extract(record.seq)).upper().replace("U", "T")
    label = _feature_label(chosen)
    logger.info(
        "Selected CDS feature '%s' (%d nt) from %s.", label, len(cds_dna), path.name
    )
    # Annotated CDS features are trusted as-is: start/stop are not enforced.
    return translate_dna(
        cds_dna, identifier=str(label), require_start=False, require_stop=False
    )


@dataclass
class OrfCandidate:
    """An ATG…stop open reading frame found by a six-frame scan of raw DNA."""

    # ORF_1, ORF_2, … assigned in descending length order.
    label: str
    # +1/+2/+3 for the forward strand, -1/-2/-3 for the reverse strand.
    frame: int
    # 0-indexed start on the FORWARD strand.
    start_nt: int
    # 0-indexed exclusive end on the forward strand.
    end_nt: int
    # The ORF nucleotide sequence (ATG…stop).
    dna: str
    # Translated protein (no trailing *).
    protein: str

    def to_summary(self) -> Dict[str, object]:
        """Return the label, frame and nucleotide/protein lengths as a dict."""
        summary: Dict[str, object] = {}
        summary["label"] = self.label
        summary["length_nt"] = len(self.dna)
        summary["frame"] = self.frame
        summary["protein_length"] = len(self.protein)
        return summary


def find_orfs_in_dna(dna: str, min_aa: int = 30) -> List[OrfCandidate]:
    """Find every ATG…stop ORF of at least ``min_aa`` residues in six frames.

    This is the fallback for a pasted plasmid (or any long DNA with stop
    codons scattered across frames): the whole thing cannot be treated as one
    CDS, but the ORFs can be offered as a chooser, the same way annotated
    `.gb` files are handled.

    Within a frame an ORF opens at the first ATG seen while no ORF is open and
    closes at the next in-frame stop; an ORF that never reaches a stop is
    dropped.

    Args:
        dna: Raw nucleotide text; cleaned with :func:`_clean` first.
        min_aa: Minimum protein length for an ORF to be reported.

    Returns:
        ORFs sorted by descending protein length and labeled ``ORF_1``,
        ``ORF_2``, … Coordinates are on the forward strand. An empty list is
        returned for empty input or input with any character outside A/C/G/T.
    """
    cleaned = _clean(dna)
    if not cleaned or set(cleaned) - UNAMBIGUOUS_NT:
        return []

    total = len(cleaned)
    strands = (
        (1, cleaned),
        (-1, str(Seq(cleaned).reverse_complement())),
    )
    found: List[OrfCandidate] = []

    for sign, strand in strands:
        for phase in range(3):
            orf_start = None  # start of the ORF currently open in this frame
            for pos in range(phase, total - 2, 3):
                codon = strand[pos : pos + 3]
                if orf_start is None:
                    if codon == "ATG":
                        orf_start = pos
                    continue
                if codon not in STOP_CODONS:
                    continue

                # In-frame stop: close the open ORF.
                orf_end = pos + 3
                orf_dna = strand[orf_start:orf_end]
                orf_protein = str(Seq(orf_dna).translate(table=1, to_stop=True))
                if len(orf_protein) >= min_aa:
                    if sign > 0:
                        start, end = orf_start, orf_end
                    else:
                        # Mirror reverse-strand offsets onto the forward strand.
                        start, end = total - orf_end, total - orf_start
                    found.append(
                        OrfCandidate(
                            label="",
                            frame=sign * (phase + 1),
                            start_nt=start,
                            end_nt=end,
                            dna=orf_dna,
                            protein=orf_protein,
                        )
                    )
                orf_start = None

    # Stable sort: equally long ORFs keep their discovery order.
    found.sort(key=lambda orf: len(orf.protein), reverse=True)
    for rank, orf in enumerate(found, start=1):
        orf.label = f"ORF_{rank}"
    return found


def _longest_orf_record(dna: str, identifier: str) -> SequenceRecord:
    """Translate the longest ATG…stop ORF found in any of the six frames.

    Within a frame an ORF opens at the first ATG seen while no ORF is open and
    closes at the next in-frame stop; an ORF that never reaches a stop is
    ignored. Ties on length go to the ORF found first (forward strand before
    reverse, then frame offset 0, 1, 2).

    Raises:
        SequenceValidationError: If the sequence contains anything other than
            A/C/G/T, or if no ORF is found.
    """
    cleaned = _clean(dna)
    if set(cleaned) - UNAMBIGUOUS_NT:
        raise SequenceValidationError(
            "Sequence contains ambiguous nucleotides; cannot find a clean ORF."
        )

    strands = (cleaned, str(Seq(cleaned).reverse_complement()))
    orfs: List[str] = []
    for strand in strands:
        for phase in range(3):
            orf_start = None  # start of the ORF currently open in this frame
            for pos in range(phase, len(strand) - 2, 3):
                codon = strand[pos : pos + 3]
                if orf_start is None:
                    if codon == "ATG":
                        orf_start = pos
                elif codon in STOP_CODONS:
                    orfs.append(strand[orf_start : pos + 3])
                    orf_start = None

    if not orfs:
        raise SequenceValidationError(
            "Could not find any ORF (ATG...stop) in any of the six reading frames."
        )

    longest = max(orfs, key=len)
    logger.info(
        "No CDS feature available; selected longest ORF (%d nt) by 6-frame scan.",
        len(longest),
    )
    return translate_dna(
        longest, identifier=identifier, require_start=True, require_stop=True
    )


def parse_protein(seq: str, identifier: str = "query") -> SequenceRecord:
    """Build a :class:`SequenceRecord` from a raw amino-acid sequence.

    The input is cleaned with :func:`_clean_protein` and must then consist
    solely of the 20 standard amino acids. The record carries no DNA and empty
    index maps — Phase 4 will reverse-translate from ``.protein``.

    Raises:
        SequenceValidationError: If the cleaned sequence is empty or contains
            any other character.
    """
    residues = _clean_protein(seq)
    if len(residues) == 0:
        raise SequenceValidationError("Empty protein sequence.")

    unsupported = sorted(set(residues).difference(AA_ALPHABET))
    if unsupported:
        raise SequenceValidationError(
            f"Protein contains non-canonical residues: {unsupported!r}. "
            "Only the 20 standard amino acids (ACDEFGHIKLMNPQRSTVWY) are supported."
        )

    logger.info("Parsed protein %s: %d aa.", identifier, len(residues))
    record = SequenceRecord(
        identifier=identifier,
        dna="",
        protein=residues,
        has_stop=False,
        nt_to_aa={},
        aa_to_nt={},
    )
    return record


def _looks_like_path(s: Union[str, Path]) -> bool:
    """Cheap pre-check: should we even attempt a filesystem stat on ``s``?

    A long pasted sequence (thousands of chars, possibly multi-line) would
    blow up :meth:`pathlib.Path.is_file` with ``OSError: File name too long``
    on macOS (PATH_MAX = 1024, NAME_MAX = 255). Reject anything that obviously
    isn't a path before we hit the OS.
    """
    if isinstance(s, Path):
        return True
    if not isinstance(s, str):
        return False
    if not s or len(s) > 1024:
        return False
    if "\n" in s or "\r" in s:
        return False
    try:
        return Path(s).is_file()
    except OSError:
        return False


def _sequence_and_identifier(
    text: str, *, default_id: str, empty_message: str
) -> Tuple[str, str]:
    """Return ``(raw_sequence, identifier)`` for FASTA or raw sequence text.

    Text whose first non-whitespace character is ``>`` is parsed as FASTA and
    its first record is used; anything else is returned unchanged together
    with ``default_id``. ``empty_message`` is the error raised when the FASTA
    text yields no record.
    """
    stripped = text.lstrip()
    if not stripped.startswith(">"):
        return text, default_id
    # Parse the stripped text so the parser always sees ">" as the first
    # character.
    record = next(SeqIO.parse(io.StringIO(stripped), "fasta"), None)
    if record is None:
        raise SequenceValidationError(empty_message)
    return str(record.seq), record.id or default_id


def _parse_classified(
    raw_seq: str, identifier: str, *, require_start: bool, require_stop: bool
) -> SequenceRecord:
    """Classify ``raw_seq`` as protein or DNA and build the matching record."""
    if looks_like_protein(raw_seq):
        return parse_protein(raw_seq, identifier=identifier)
    return translate_dna(
        raw_seq,
        identifier=identifier,
        require_start=require_start,
        require_stop=require_stop,
    )


def parse_input(
    source: Union[str, Path],
    *,
    require_start: bool = True,
    require_stop: bool = True,
    cds_feature: Optional[str] = None,
) -> SequenceRecord:
    """Parse a sequence from a path, FASTA text, raw DNA, or raw protein.

    Detection order:
      1. If ``source`` is a Path or names an existing file, read it.
         SnapGene/GenBank/EMBL files go through the structured parser; any
         other file is read as text (FASTA-aware, falls back to raw text) and
         auto-classified as DNA vs protein by :func:`looks_like_protein`.
      2. Else if the string starts with ``>``, parse as inline FASTA and
         classify the first record's sequence.
      3. Otherwise treat as a raw sequence and classify.

    Args:
        source: A path, or sequence text.
        require_start: Require an ATG start codon on DNA input. Not applied
            to structured files.
        require_stop: Require a terminal stop codon on DNA input. Not applied
            to structured files.
        cds_feature: Label substring selecting a CDS; only used for
            structured files.

    Returns:
        The parsed record. For FASTA input only the first record is used.

    Raises:
        SequenceValidationError: If FASTA input holds no record or the
            sequence fails validation.
        OSError: If ``source`` is a path that cannot be read.
    """
    if _looks_like_path(source):
        path = Path(source)

        # Structured formats — SnapGene/GenBank/EMBL — go through their own
        # parser because they're either binary (.dna) or have rich metadata
        # that text-only handling would munge.
        if path.suffix.lower() in {".dna", ".gb", ".gbk", ".genbank", ".embl"}:
            return _parse_structured_file(path, cds_feature=cds_feature)

        # Decode explicitly instead of relying on the locale encoding.
        # "utf-8-sig" drops a leading byte-order mark, which would otherwise
        # sit in front of ">" and make a FASTA file look like a raw sequence;
        # undecodable bytes are replaced rather than aborting the read, and
        # are then caught by sequence validation if they fall in the sequence.
        text = path.read_text(encoding="utf-8-sig", errors="replace")
        default_id = path.stem
        empty_message = f"No FASTA records found in {path}."
    else:
        text = str(source)
        default_id = "query"
        empty_message = "Inline FASTA contained no records."

    raw_seq, ident = _sequence_and_identifier(
        text, default_id=default_id, empty_message=empty_message
    )
    return _parse_classified(
        raw_seq, ident, require_start=require_start, require_stop=require_stop
    )


def list_codons(dna: str) -> List[str]:
    """Cut ``dna`` into consecutive 3-nt chunks, starting at position 0.

    No validation is performed (call ``validate_dna`` first): if the length is
    not a multiple of 3, the final chunk is simply shorter.
    """
    codon_starts = range(0, len(dna), 3)
    return [dna[start : start + 3] for start in codon_starts]