jpuglia committed on
Commit
f944901
·
1 Parent(s): 458f017

Refactor fetch_refseq_sequence to improve error handling and input validation

Browse files

- Updated the fetch_refseq_sequence function to handle HTTP errors more robustly by distinguishing between urllib and requests exceptions.
- Added input validation to skip empty or NaN RefSeq IDs.
- Stripped surrounding whitespace from the RefSeq ID and, when it contains comma-separated values, kept only the first one.
- Enhanced error messages for better debugging.
- Updated exception handling in _fetch_sequence_for_row to accommodate new error handling in fetch_refseq_sequence.

Plots/TaxDistributionPSORT.svg CHANGED
notebooks/01_EDA_Psort.ipynb CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:f99370bb677795f54b6778596ada73015381847b1838e3cc22553ede7a03dbc3
3
- size 10363061
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c455bf2fd2e35c927e325cae70eb912d288e69801397c163615a7931651a3ce
3
+ size 15152718
src/my_utils.py CHANGED
@@ -32,7 +32,8 @@ from sklearn.base import BaseEstimator
32
  import umap
33
 
34
  import requests
35
- from requests.exceptions import HTTPError
 
36
  from Bio import Entrez
37
  from Bio import SeqIO
38
  from tqdm import tqdm
@@ -524,20 +525,30 @@ def fetch_uniprot_sequence(uniprot_id: str):
524
  else:
525
  print(f'URL inválido o no accesible: {url}')
526
 
527
- def fetch_refseq_sequence(refseq_id : str):
 
 
 
 
 
 
528
  """
529
  Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
530
- Returns the raw amino-acid sequence as a string.
531
  """
532
-
533
- Entrez.email = "puglia.jd@gmail.com" # REQUIRED
534
  Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
535
- # Check if the ID is NaN or None
536
- if pd.isna(refseq_id) or refseq_id is None:
 
 
537
  return None
538
-
539
- fasta_data = None
540
-
 
 
541
  try:
542
  handle = Entrez.efetch(
543
  db="protein",
@@ -548,18 +559,26 @@ def fetch_refseq_sequence(refseq_id : str):
548
  record = SeqIO.read(handle, "fasta")
549
  handle.close()
550
  return str(record.seq)
551
- except (HTTPError, ValueError):
552
 
 
 
 
 
 
 
553
  url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
554
  response = requests.get(url, timeout=60)
555
- if response.status_code == 200:
556
- try:
557
- fasta_data = response.text
558
- fasta_io = StringIO(fasta_data)
559
- record = SeqIO.read(fasta_io, "fasta")
560
- return str(record.seq)
561
- except ValueError:
562
- print(f"No se pudo convertir {fasta_data}, id: {refseq_id}")
 
 
 
563
 
564
 
565
  def _fetch_sequence_for_row(idx, row):
@@ -572,7 +591,7 @@ def _fetch_sequence_for_row(idx, row):
572
  if swiss_id and not pd.isna(swiss_id):
573
  try:
574
  sequence = fetch_uniprot_sequence(swiss_id)
575
- except HTTPError as e:
576
  print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e}")
577
  sequence = None
578
 
@@ -580,7 +599,7 @@ def _fetch_sequence_for_row(idx, row):
580
  if not sequence and row.get('Refseq_Accession') and not pd.isna(row['Refseq_Accession']):
581
  try:
582
  sequence = fetch_refseq_sequence(row['Refseq_Accession'])
583
- except HTTPError as e:
584
  print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e}")
585
  sequence = None
586
 
@@ -588,7 +607,7 @@ def _fetch_sequence_for_row(idx, row):
588
  if not sequence and row.get('Other_Accession') and not pd.isna(row['Other_Accession']):
589
  try:
590
  sequence = fetch_refseq_sequence(row['Other_Accession'])
591
- except HTTPError as e:
592
  print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e}")
593
  sequence = None
594
 
 
32
  import umap
33
 
34
  import requests
35
+ from urllib.error import HTTPError as URLLibHTTPError
36
+ from requests.exceptions import HTTPError as RequestsHTTPError
37
  from Bio import Entrez
38
  from Bio import SeqIO
39
  from tqdm import tqdm
 
525
  else:
526
  print(f'URL inválido o no accesible: {url}')
527
 
528
+ from Bio import Entrez, SeqIO
529
+
530
+ from io import StringIO
531
+ import pandas as pd
532
+ import requests
533
+
534
+ def fetch_refseq_sequence(refseq_id: str) -> str | None:
535
  """
536
  Fetch the protein sequence for the given RefSeq ID using NCBI Entrez.
537
+ Returns the raw amino-acid sequence as a string, or None on failure.
538
  """
539
+ # ——— NCBI credentials ———
540
+ Entrez.email = "puglia.jd@gmail.com"
541
  Entrez.api_key = "d768134734612d58be85117e1ff22e243807"
542
+
543
+ # ——— Validate input ———
544
+ if not refseq_id or pd.isna(refseq_id):
545
+ print(f"[SKIP] Empty or NaN RefSeq ID: `{refseq_id}`")
546
  return None
547
+
548
+ # Clean up possible whitespace or comma‐separated values
549
+ refseq_id = str(refseq_id).strip().split(",")[0]
550
+
551
+ # ——— 1) Try NCBI Entrez ———
552
  try:
553
  handle = Entrez.efetch(
554
  db="protein",
 
559
  record = SeqIO.read(handle, "fasta")
560
  handle.close()
561
  return str(record.seq)
 
562
 
563
+ except (URLLibHTTPError, ValueError) as e:
564
+ # catches bad HTTP status and parsing errors
565
+ print(f"[Entrez] Failed for `{refseq_id}`: {e}")
566
+
567
+ # ——— 2) Fallback: RCSB FASTA page ———
568
+ try:
569
  url = f"https://www.rcsb.org/fasta/entry/{refseq_id}"
570
  response = requests.get(url, timeout=60)
571
+ response.raise_for_status() # raises RequestsHTTPError on 4xx/5xx
572
+ fasta_io = StringIO(response.text)
573
+ record = SeqIO.read(fasta_io, "fasta")
574
+ return str(record.seq)
575
+
576
+ except (RequestsHTTPError, ValueError) as e2:
577
+ print(f"[RCSB] Failed for `{refseq_id}`: {e2}")
578
+
579
+ # ——— All methods failed ———
580
+ return None
581
+
582
 
583
 
584
  def _fetch_sequence_for_row(idx, row):
 
591
  if swiss_id and not pd.isna(swiss_id):
592
  try:
593
  sequence = fetch_uniprot_sequence(swiss_id)
594
+ except (URLLibHTTPError, RequestsHTTPError) as e:
595
  print(f"Warning: SwissProt fetch failed for {swiss_id} with HTTP {e}")
596
  sequence = None
597
 
 
599
  if not sequence and row.get('Refseq_Accession') and not pd.isna(row['Refseq_Accession']):
600
  try:
601
  sequence = fetch_refseq_sequence(row['Refseq_Accession'])
602
+ except (URLLibHTTPError, RequestsHTTPError) as e:
603
  print(f"Warning: RefSeq fetch failed for {row['Refseq_Accession']} with HTTP {e}")
604
  sequence = None
605
 
 
607
  if not sequence and row.get('Other_Accession') and not pd.isna(row['Other_Accession']):
608
  try:
609
  sequence = fetch_refseq_sequence(row['Other_Accession'])
610
+ except (URLLibHTTPError, RequestsHTTPError) as e:
611
  print(f"Warning: RefSeq fetch failed for {row['Other_Accession']} with HTTP {e}")
612
  sequence = None
613