ccnasef-cyber
Add protein function prediction API
3d5b11c
Raw
History Blame Contribute Delete
2 kB
from typing import List, Tuple
# Single-letter residue codes accepted by _validate_sequence. parse_fasta
# upper-cases every sequence before validating it, so only uppercase letters
# need to be listed here.
# NOTE(review): presumably the 20 standard amino acids plus the ambiguity /
# rare-residue codes X, B, U, Z and O -- confirm against the model vocabulary.
VALID_AA = set("ACDEFGHIKLMNPQRSTVWYXBUZO")
def parse_fasta(fasta_text: str) -> List[Tuple[str, str]]:
    """
    Parse a FASTA string and return a list of (protein_id, sequence) tuples.

    Each record starts with a ">" header line; the protein id is the first
    whitespace-delimited token after ">". All following non-blank lines up to
    the next header are concatenated and upper-cased to form the sequence.
    Blank lines are skipped, and lines that appear before the first header
    are ignored.

    Raises ValueError if a header carries no identifier, if a record fails
    _validate_sequence, or if no records are found at all.
    """
    proteins: List[Tuple[str, str]] = []
    current_id: str = ""  # empty string means "no record has been opened yet"
    current_seq: List[str] = []

    def _finish_record(protein_id: str, chunks: List[str]) -> None:
        # Join, normalise and validate one record, then store it.
        seq = "".join(chunks).upper()
        _validate_sequence(protein_id, seq)
        proteins.append((protein_id, seq))

    for raw_line in fasta_text.strip().splitlines():
        line = raw_line.strip()
        if not line:
            continue
        if not line.startswith(">"):
            current_seq.append(line)
            continue

        # A new header closes the record that was being built (if any).
        if current_id:
            _finish_record(current_id, current_seq)

        tokens = line[1:].split()
        if not tokens:
            # A bare ">" used to escape as IndexError; report it as the
            # documented ValueError instead.
            raise ValueError("FASTA header line has no protein identifier.")
        current_id = tokens[0]  # first token after ">"
        current_seq = []

    # The final record has no following header to close it.
    if current_id:
        _finish_record(current_id, current_seq)

    if not proteins:
        raise ValueError("No valid protein sequences found in the FASTA file.")
    return proteins
def _validate_sequence(protein_id: str, seq: str) -> None:
    """
    Check that a sequence is non-empty and uses only characters in VALID_AA.

    Raises ValueError naming the protein (and, for bad characters, listing
    the offending characters) when the check fails. Returns None otherwise.
    """
    if not seq:
        raise ValueError(f"Protein '{protein_id}' has an empty sequence.")
    invalid = set(seq) - VALID_AA
    if invalid:
        # Sort before formatting: a raw set prints in hash order, which can
        # differ between runs and makes the message unstable in logs/tests.
        listed = "{" + ", ".join(repr(ch) for ch in sorted(invalid)) + "}"
        raise ValueError(
            f"Protein '{protein_id}' contains invalid amino acid characters: {listed}"
        )
def sliding_window(sequence: str, window_size: int, stride: int) -> List[str]:
    """
    Split a long sequence into windows of at most window_size characters.

    Windows start at offsets 0, stride, 2 * stride, ... and the last window
    is the first one that reaches the end of the sequence, so it may be
    shorter than window_size. If the whole sequence fits in one window it is
    returned unchanged as a single-element list.

    Raises ValueError if the sequence has to be split and window_size or
    stride is not a positive integer (a non-positive stride would otherwise
    never advance and loop forever).
    """
    seq_len = len(sequence)
    if seq_len <= window_size:
        return [sequence]
    if window_size <= 0 or stride <= 0:
        raise ValueError(
            "window_size and stride must be positive, "
            f"got window_size={window_size}, stride={stride}."
        )

    windows: List[str] = []
    for start in range(0, seq_len, stride):
        # Slicing clamps at the end of the string, so the tail needs no min().
        windows.append(sequence[start:start + window_size])
        if start + window_size >= seq_len:
            break  # this window already covers the end of the sequence
    return windows