from typing import List, Tuple

# Characters accepted in a sequence; sequences are upper-cased before the check.
VALID_AA = set("ACDEFGHIKLMNPQRSTVWYXBUZO")


def parse_fasta(fasta_text: str) -> List[Tuple[str, str]]:
    """
    Parse a FASTA string and return list of (protein_id, sequence).

    The protein id is the first whitespace-delimited token after ">" on a
    header line. All following non-blank lines up to the next header are
    concatenated and upper-cased to form the sequence. Blank lines are
    ignored.

    Raises ValueError on malformed or empty input:
      * a header line with no identifier after ">",
      * sequence data appearing before the first header,
      * a record with an empty sequence or invalid characters,
      * no records at all.
    """
    proteins: List[Tuple[str, str]] = []
    # An empty string means "no record is currently open"; a header can never
    # produce an empty id because that case raises below.
    current_id: str = ""
    current_seq: List[str] = []

    # Leading/trailing blank lines are skipped by the per-line check, so the
    # text is not stripped first; this keeps reported line numbers accurate.
    for line_no, raw_line in enumerate(fasta_text.splitlines(), start=1):
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(">"):
            if current_id:
                proteins.append(_finish_record(current_id, current_seq))
            tokens = line[1:].split()
            if not tokens:
                # Previously surfaced as an IndexError from `split()[0]`.
                raise ValueError(
                    f"FASTA header on line {line_no} has no protein identifier."
                )
            current_id = tokens[0]
            current_seq = []
        elif not current_id:
            # Previously these lines were silently dropped.
            raise ValueError(
                f"Sequence data on line {line_no} appears before any FASTA header."
            )
        else:
            current_seq.append(line)

    # Flush the last record, which has no following header to trigger it.
    if current_id:
        proteins.append(_finish_record(current_id, current_seq))

    if not proteins:
        raise ValueError("No valid protein sequences found in the FASTA file.")
    return proteins


def _finish_record(protein_id: str, seq_lines: List[str]) -> Tuple[str, str]:
    """Join, upper-case and validate one record; return (protein_id, sequence)."""
    seq = "".join(seq_lines).upper()
    _validate_sequence(protein_id, seq)
    return protein_id, seq


def _validate_sequence(protein_id: str, seq: str) -> None:
    """
    Check that a sequence is non-empty and uses only characters in VALID_AA.

    Raises ValueError naming the protein and, for invalid characters, listing
    the offending characters in sorted order.
    """
    if not seq:
        raise ValueError(f"Protein '{protein_id}' has an empty sequence.")
    invalid = set(seq) - VALID_AA
    if invalid:
        # Sorted so the message is deterministic; the brace-wrapped form
        # mirrors the set repr the message used before.
        listing = ", ".join(repr(ch) for ch in sorted(invalid))
        raise ValueError(
            f"Protein '{protein_id}' contains invalid amino acid characters: "
            f"{{{listing}}}"
        )


def sliding_window(sequence: str, window_size: int, stride: int) -> List[str]:
    """
    Split a long sequence into overlapping windows.

    If the sequence fits in one window, returns it as-is (a one-element
    list). Otherwise windows of `window_size` characters are taken every
    `stride` characters; the final window is truncated at the end of the
    sequence and always reaches the last character.

    Raises ValueError if the sequence needs more than one window and
    `window_size` or `stride` is not a positive integer (a non-positive
    stride would otherwise never terminate).
    """
    total = len(sequence)
    if total <= window_size:
        return [sequence]

    # Checked only after the shortcut above so that calls which never needed
    # to step through the sequence keep returning `[sequence]`.
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}.")
    if stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}.")

    windows: List[str] = []
    start = 0
    while start < total:
        end = min(start + window_size, total)
        windows.append(sequence[start:end])
        if end == total:
            # The tail has been covered; stepping further would only emit
            # shorter windows that are suffixes of this one.
            break
        start += stride
    return windows