Spaces:
Sleeping
Sleeping
| from typing import List, Tuple | |
# Characters accepted in an (upper-cased) protein sequence.
# NOTE(review): presumably the 20 standard one-letter amino acid codes plus
# the extended/ambiguity codes X, B, U, Z and O -- confirm against the caller's
# expectations ("J" and the "*" stop symbol are not accepted).
VALID_AA = set(
    "ACDEFGHIKLMNPQRSTVWY"  # first 20 letters of the accepted alphabet
    "XBUZO"  # remaining five accepted letters
)
def parse_fasta(fasta_text: str) -> List[Tuple[str, str]]:
    """
    Parse FASTA-formatted text into a list of (protein_id, sequence) pairs.

    A record starts at a line beginning with ">". The protein id is the first
    whitespace-delimited token after ">"; anything else on the header line is
    discarded. The following non-blank lines are stripped, concatenated and
    upper-cased to form the sequence. Every record is checked with
    ``_validate_sequence`` before it is stored, in the order the records
    appear in the input.

    Raises:
        ValueError: if a header line carries no identifier, if
            ``_validate_sequence`` rejects a record, or if the input contains
            no records at all.
    """
    proteins: List[Tuple[str, str]] = []
    current_id: str = ""
    current_seq: List[str] = []

    def finish_record() -> None:
        # Validate and store the record collected so far; a no-op until the
        # first header has been seen.
        if current_id:
            seq = "".join(current_seq).upper()
            _validate_sequence(current_id, seq)
            proteins.append((current_id, seq))

    for raw_line in fasta_text.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue

        if line.startswith(">"):
            finish_record()
            header_tokens = line[1:].split()
            if not header_tokens:
                # A bare ">" used to surface as IndexError; report it as the
                # documented ValueError instead.
                raise ValueError(
                    f"FASTA header has no protein identifier: {line!r}"
                )
            current_id = header_tokens[0]
            current_seq = []
        else:
            # NOTE(review): sequence lines that appear before the first header
            # are collected here and then dropped when that header resets
            # current_seq -- confirm whether they should be an error instead.
            current_seq.append(line)

    # The last record has no following header to trigger its storage.
    finish_record()

    if not proteins:
        raise ValueError("No valid protein sequences found in the FASTA file.")
    return proteins
def _validate_sequence(protein_id: str, seq: str) -> None:
    """
    Check that a sequence is non-empty and uses only characters in VALID_AA.

    Returns None when the sequence is acceptable.

    Raises:
        ValueError: if ``seq`` is empty, or if it contains any character that
            is not in ``VALID_AA`` (the offending characters are listed in the
            message).
    """
    if seq:
        invalid = set(seq).difference(VALID_AA)
        if not invalid:
            return
        raise ValueError(
            f"Protein '{protein_id}' contains invalid amino acid characters: {invalid}"
        )
    raise ValueError(f"Protein '{protein_id}' has an empty sequence.")
def sliding_window(sequence: str, window_size: int, stride: int) -> List[str]:
    """
    Split a sequence into windows of at most ``window_size`` characters.

    Windows start at offsets 0, ``stride``, 2 * ``stride``, ... and the last
    window is the first one that reaches the end of the sequence, so it may be
    shorter than ``window_size``. Consecutive windows overlap only when
    ``stride`` is smaller than ``window_size``; a larger stride skips the
    characters between windows.

    If the sequence fits in one window (including the empty sequence), it is
    returned as the only element and ``stride`` is not examined.

    Raises:
        ValueError: if the sequence has to be split and ``window_size`` or
            ``stride`` is not positive.
    """
    length = len(sequence)
    if length <= window_size:
        return [sequence]

    # Validate only when windowing is actually needed, so calls that already
    # worked for short sequences keep working unchanged.
    if window_size <= 0:
        raise ValueError(f"window_size must be positive, got {window_size}")
    if stride <= 0:
        # A non-positive stride never advances and previously looped forever.
        raise ValueError(f"stride must be positive, got {stride}")

    windows: List[str] = []
    for start in range(0, length, stride):
        # Slicing clamps at the end of the string, giving a short final window.
        windows.append(sequence[start:start + window_size])
        if start + window_size >= length:
            break
    return windows