Refactor fetch_refseq_sequence to improve error handling and input validation
- Updated the fetch_refseq_sequence function to handle HTTP errors more robustly by distinguishing between urllib and requests exceptions.
- Added input validation to skip empty or NaN RefSeq IDs.
- Stripped surrounding whitespace from the RefSeq ID and, when it contains comma-separated values, kept only the first one.
- Enhanced error messages for better debugging.
- Updated exception handling in _fetch_sequence_for_row to accommodate new error handling in fetch_refseq_sequence.
- Plots/TaxDistributionPSORT.svg +0 -0
- notebooks/01_EDA_Psort.ipynb +2 -2
- src/my_utils.py +41 -22
Plots/TaxDistributionPSORT.svg
CHANGED
|
|
|
|
notebooks/01_EDA_Psort.ipynb
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
-
size
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6c455bf2fd2e35c927e325cae70eb912d288e69801397c163615a7931651a3ce
|
| 3 |
+
size 15152718
|
src/my_utils.py
CHANGED
|
@@ -32,7 +32,8 @@ from sklearn.base import BaseEstimator
|
|
| 32 |
import umap
|
| 33 |
|
| 34 |
import requests
|
| 35 |
-
from
|
|
|
|
| 36 |
from Bio import Entrez
|
| 37 |
from Bio import SeqIO
|
| 38 |
from tqdm import tqdm
|
|
@@ -524,20 +525,30 @@ def fetch_uniprot_sequence(uniprot_id: str):
|
|
| 524 |
else:
|
| 525 |
print(f'URL inválido o no accesible: {url}')
|
| 526 |
|
| 527 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 528 |
"""
|
| 529 |
Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
|
| 530 |
-
Returns the raw amino-acid sequence as a string.
|
| 531 |
"""
|
| 532 |
-
|
| 533 |
-
Entrez.email
|
| 534 |
Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
|
| 535 |
-
|
| 536 |
-
|
|
|
|
|
|
|
| 537 |
return None
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
|
|
|
|
|
|
| 541 |
try:
|
| 542 |
handle = Entrez.efetch(
|
| 543 |
db="protein",
|
|
@@ -548,18 +559,26 @@ def fetch_refseq_sequence(refseq_id : str):
|
|
| 548 |
record = SeqIO.read(handle, "fasta")
|
| 549 |
handle.close()
|
| 550 |
return str(record.seq)
|
| 551 |
-
except (HTTPError, ValueError):
|
| 552 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
|
| 554 |
response = requests.get(url, timeout=60)
|
| 555 |
-
|
| 556 |
-
|
| 557 |
-
|
| 558 |
-
|
| 559 |
-
|
| 560 |
-
|
| 561 |
-
|
| 562 |
-
|
|
|
|
|
|
|
|
|
|
| 563 |
|
| 564 |
|
| 565 |
def _fetch_sequence_for_row(idx, row):
|
|
@@ -572,7 +591,7 @@ def _fetch_sequence_for_row(idx, row):
|
|
| 572 |
if swiss_id and not pd.isna(swiss_id):
|
| 573 |
try:
|
| 574 |
sequence = fetch_uniprot_sequence(swiss_id)
|
| 575 |
-
except
|
| 576 |
print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e}")
|
| 577 |
sequence = None
|
| 578 |
|
|
@@ -580,7 +599,7 @@ def _fetch_sequence_for_row(idx, row):
|
|
| 580 |
if not sequence and row.get('Refseq_Accession') and not pd.isna(row['Refseq_Accession']):
|
| 581 |
try:
|
| 582 |
sequence = fetch_refseq_sequence(row['Refseq_Accession'])
|
| 583 |
-
except
|
| 584 |
print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e}")
|
| 585 |
sequence = None
|
| 586 |
|
|
@@ -588,7 +607,7 @@ def _fetch_sequence_for_row(idx, row):
|
|
| 588 |
if not sequence and row.get('Other_Accession') and not pd.isna(row['Other_Accession']):
|
| 589 |
try:
|
| 590 |
sequence = fetch_refseq_sequence(row['Other_Accession'])
|
| 591 |
-
except
|
| 592 |
print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e}")
|
| 593 |
sequence = None
|
| 594 |
|
|
|
|
| 32 |
import umap
|
| 33 |
|
| 34 |
import requests
|
| 35 |
+
from urllib.error import HTTPError as URLLibHTTPError
|
| 36 |
+
from requests.exceptions import HTTPError as RequestsHTTPError
|
| 37 |
from Bio import Entrez
|
| 38 |
from Bio import SeqIO
|
| 39 |
from tqdm import tqdm
|
|
|
|
| 525 |
else:
|
| 526 |
print(f'URL inválido o no accesible: {url}')
|
| 527 |
|
| 528 |
+
from Bio import Entrez, SeqIO
|
| 529 |
+
|
| 530 |
+
from io import StringIO
|
| 531 |
+
import pandas as pd
|
| 532 |
+
import requests
|
| 533 |
+
|
| 534 |
+
def fetch_refseq_sequence(refseq_id: str) -> str | None:
|
| 535 |
"""
|
| 536 |
Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
|
| 537 |
+
Returns the raw amino-acid sequence as a string, or None on failure.
|
| 538 |
"""
|
| 539 |
+
# ——— NCBI credentials ———
|
| 540 |
+
Entrez.email = "puglia.jd@gmail.com"
|
| 541 |
Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
|
| 542 |
+
|
| 543 |
+
# ——— Validate input ———
|
| 544 |
+
if not refseq_id or pd.isna(refseq_id):
|
| 545 |
+
print(f"[SKIP] Empty or NaN RefSeq ID: `{refseq_id}`")
|
| 546 |
return None
|
| 547 |
+
|
| 548 |
+
# Clean up possible whitespace or comma‐separated values
|
| 549 |
+
refseq_id = str(refseq_id).strip().split(",")[0]
|
| 550 |
+
|
| 551 |
+
# ——— 1) Try NCBI Entrez ———
|
| 552 |
try:
|
| 553 |
handle = Entrez.efetch(
|
| 554 |
db="protein",
|
|
|
|
| 559 |
record = SeqIO.read(handle, "fasta")
|
| 560 |
handle.close()
|
| 561 |
return str(record.seq)
|
|
|
|
| 562 |
|
| 563 |
+
except (URLLibHTTPError, ValueError) as e:
|
| 564 |
+
# catches bad HTTP status and parsing errors
|
| 565 |
+
print(f"[Entrez] Failed for `{refseq_id}`: {e}")
|
| 566 |
+
|
| 567 |
+
# ——— 2) Fallback: RCSB FASTA page ———
|
| 568 |
+
try:
|
| 569 |
url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
|
| 570 |
response = requests.get(url, timeout=60)
|
| 571 |
+
response.raise_for_status() # raises RequestsHTTPError on 4xx/5xx
|
| 572 |
+
fasta_io = StringIO(response.text)
|
| 573 |
+
record = SeqIO.read(fasta_io, "fasta")
|
| 574 |
+
return str(record.seq)
|
| 575 |
+
|
| 576 |
+
except (RequestsHTTPError, ValueError) as e2:
|
| 577 |
+
print(f"[RCSB] Failed for `{refseq_id}`: {e2}")
|
| 578 |
+
|
| 579 |
+
# ——— All methods failed ———
|
| 580 |
+
return None
|
| 581 |
+
|
| 582 |
|
| 583 |
|
| 584 |
def _fetch_sequence_for_row(idx, row):
|
|
|
|
| 591 |
if swiss_id and not pd.isna(swiss_id):
|
| 592 |
try:
|
| 593 |
sequence = fetch_uniprot_sequence(swiss_id)
|
| 594 |
+
except (URLLibHTTPError, RequestsHTTPError) as e:
|
| 595 |
print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e}")
|
| 596 |
sequence = None
|
| 597 |
|
|
|
|
| 599 |
if not sequence and row.get('Refseq_Accession') and not pd.isna(row['Refseq_Accession']):
|
| 600 |
try:
|
| 601 |
sequence = fetch_refseq_sequence(row['Refseq_Accession'])
|
| 602 |
+
except (URLLibHTTPError, RequestsHTTPError) as e:
|
| 603 |
print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e}")
|
| 604 |
sequence = None
|
| 605 |
|
|
|
|
| 607 |
if not sequence and row.get('Other_Accession') and not pd.isna(row['Other_Accession']):
|
| 608 |
try:
|
| 609 |
sequence = fetch_refseq_sequence(row['Other_Accession'])
|
| 610 |
+
except (URLLibHTTPError, RequestsHTTPError) as e:
|
| 611 |
print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e}")
|
| 612 |
sequence = None
|
| 613 |
|