bio-nexus-api / app /services /sequence_utils.py
Samad14's picture
Add sequencing pipeline (FASTQ -> QC -> alignment -> variant calling -> report)
da793ed verified
Raw
History Blame Contribute Delete
4.36 kB
from Bio import SeqIO
from io import StringIO
from typing import Optional
def detect_sequence_type(seq: str) -> str:
clean = seq.upper().replace("-", "").replace(".", "").replace(" ", "")
if not clean:
return "unknown"
protein_chars = set("ACDEFGHIKLMNPQRSTVWYUBZXOJ")
dna_chars = set("ACGTN")
rna_chars = set("ACGUN")
seq_set = set(clean)
extra_chars = seq_set - protein_chars
if not extra_chars:
return "protein"
if seq_set.issubset(rna_chars):
if "U" in seq_set and "T" not in seq_set:
return "rna"
if seq_set.issubset(dna_chars):
return "dna"
if seq_set.issubset(dna_chars | {"U"}):
return "rna"
return "unknown"
def detect_input_format(text: str) -> str:
text = text.strip()
if text.startswith(">"):
return "fasta"
if text.startswith("LOCUS") or text.startswith("DEFINITION"):
return "genbank"
if text.startswith(("ATOM", "HETATM")) or (text.startswith("HEADER")):
return "pdb"
clean = "".join(c for c in text if c.isalpha()).upper()
if not clean:
return "unknown"
seq_type = detect_sequence_type(clean)
if seq_type != "unknown":
return "raw_sequence"
return "unknown"
def detect_source_from_accession(accession: str) -> str:
acc = accession.strip().upper()
if acc.startswith(("NP_", "XP_", "YP_", "WP_", "AP_", "NM_", "XM_", "NR_", "XR_")):
return "ncbi"
if acc[0] in "PQO" or acc[:2] in ("A0", "A1", "B0", "B1", "C0", "C1"):
return "uniprot"
if acc.startswith("UPI"):
return "uniparc"
return "ncbi"
async def map_refseq_to_uniprot(refseq_id: str) -> str | None:
import httpx
url = "https://rest.uniprot.org/idmapping/uniprotkb/search"
try:
async with httpx.AsyncClient(timeout=15) as client:
resp = await client.post(url, json={
"from": "RefSeq_Protein",
"to": "UniProtKB",
"ids": refseq_id,
})
if resp.status_code != 200:
return None
data = resp.json()
results = data.get("results") or []
if results:
return results[0].get("to", {}).get("primaryAccession", "")
except Exception:
pass
return None
def validate_sequence(sequence: str) -> dict:
result = {
"valid": False,
"sequence_type": "unknown",
"format": "unknown",
"length": 0,
"issues": [],
}
if not sequence or not sequence.strip():
result["issues"] = ["Empty sequence"]
return result
seq_format = detect_input_format(sequence)
result["format"] = seq_format
if seq_format == "fasta":
try:
records = list(SeqIO.parse(StringIO(sequence), "fasta"))
if not records:
result["issues"] = ["FASTA format detected but no records parsed"]
return result
concat_seq = str(records[0].seq)
result["length"] = len(concat_seq)
result["sequence_type"] = detect_sequence_type(concat_seq)
if len(concat_seq) < 6:
result["issues"] = [f"Sequence too short: {len(concat_seq)} residues"]
return result
result["valid"] = True
except Exception as e:
result["issues"] = [f"FASTA parse error: {str(e)}"]
return result
clean = "".join(c for c in sequence if c.isalpha()).upper()
if not clean:
result["issues"] = ["No valid sequence characters found"]
return result
result["length"] = len(clean)
result["sequence_type"] = detect_sequence_type(clean)
if result["length"] < 6:
result["issues"] = [f"Sequence too short: {result['length']} residues"]
return result
valid_protein = set("ACDEFGHIKLMNPQRSTVWYUBZXOJ")
extra = set(clean) - valid_protein
if extra and result["sequence_type"] == "protein":
invalid_chars = [c for c in sorted(extra) if c not in "BZX"]
if invalid_chars:
result["issues"] = [f"Unusual characters for protein sequence: {', '.join(invalid_chars)}"]
result["valid"] = len(result["issues"]) == 0
return result