from Bio import SeqIO from io import StringIO from typing import Optional def detect_sequence_type(seq: str) -> str: clean = seq.upper().replace("-", "").replace(".", "").replace(" ", "") if not clean: return "unknown" protein_chars = set("ACDEFGHIKLMNPQRSTVWYUBZXOJ") dna_chars = set("ACGTN") rna_chars = set("ACGUN") seq_set = set(clean) extra_chars = seq_set - protein_chars if not extra_chars: return "protein" if seq_set.issubset(rna_chars): if "U" in seq_set and "T" not in seq_set: return "rna" if seq_set.issubset(dna_chars): return "dna" if seq_set.issubset(dna_chars | {"U"}): return "rna" return "unknown" def detect_input_format(text: str) -> str: text = text.strip() if text.startswith(">"): return "fasta" if text.startswith("LOCUS") or text.startswith("DEFINITION"): return "genbank" if text.startswith(("ATOM", "HETATM")) or (text.startswith("HEADER")): return "pdb" clean = "".join(c for c in text if c.isalpha()).upper() if not clean: return "unknown" seq_type = detect_sequence_type(clean) if seq_type != "unknown": return "raw_sequence" return "unknown" def detect_source_from_accession(accession: str) -> str: acc = accession.strip().upper() if acc.startswith(("NP_", "XP_", "YP_", "WP_", "AP_", "NM_", "XM_", "NR_", "XR_")): return "ncbi" if acc[0] in "PQO" or acc[:2] in ("A0", "A1", "B0", "B1", "C0", "C1"): return "uniprot" if acc.startswith("UPI"): return "uniparc" return "ncbi" async def map_refseq_to_uniprot(refseq_id: str) -> str | None: import httpx url = "https://rest.uniprot.org/idmapping/uniprotkb/search" try: async with httpx.AsyncClient(timeout=15) as client: resp = await client.post(url, json={ "from": "RefSeq_Protein", "to": "UniProtKB", "ids": refseq_id, }) if resp.status_code != 200: return None data = resp.json() results = data.get("results") or [] if results: return results[0].get("to", {}).get("primaryAccession", "") except Exception: pass return None def validate_sequence(sequence: str) -> dict: result = { "valid": False, "sequence_type": "unknown", "format": "unknown", "length": 0, "issues": [], } if not sequence or not sequence.strip(): result["issues"] = ["Empty sequence"] return result seq_format = detect_input_format(sequence) result["format"] = seq_format if seq_format == "fasta": try: records = list(SeqIO.parse(StringIO(sequence), "fasta")) if not records: result["issues"] = ["FASTA format detected but no records parsed"] return result concat_seq = str(records[0].seq) result["length"] = len(concat_seq) result["sequence_type"] = detect_sequence_type(concat_seq) if len(concat_seq) < 6: result["issues"] = [f"Sequence too short: {len(concat_seq)} residues"] return result result["valid"] = True except Exception as e: result["issues"] = [f"FASTA parse error: {str(e)}"] return result clean = "".join(c for c in sequence if c.isalpha()).upper() if not clean: result["issues"] = ["No valid sequence characters found"] return result result["length"] = len(clean) result["sequence_type"] = detect_sequence_type(clean) if result["length"] < 6: result["issues"] = [f"Sequence too short: {result['length']} residues"] return result valid_protein = set("ACDEFGHIKLMNPQRSTVWYUBZXOJ") extra = set(clean) - valid_protein if extra and result["sequence_type"] == "protein": invalid_chars = [c for c in sorted(extra) if c not in "BZX"] if invalid_chars: result["issues"] = [f"Unusual characters for protein sequence: {', '.join(invalid_chars)}"] result["valid"] = len(result["issues"]) == 0 return result