Spaces:

VyLala
/

BioMetadataAudit

Running

App Files Files Community

VyLala committed on Apr 13, 2025

Commit

538d1c5

verified ·

1 Parent(s): bd86ca3

Update mtdna_classifier.py

Browse files

Files changed (1) hide show

mtdna_classifier.py +24 -6

mtdna_classifier.py CHANGED Viewed

@@ -40,7 +40,7 @@ nltk.download('punkt_tab')
             match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
             if match2:
               isolate = match2.group(1)'''
-from Bio import Entrez
 import re
 Entrez.email = "your_email@example.com"
@@ -69,11 +69,8 @@ def get_info_from_accession(accession):
     except Exception as e:
         print("❌ Entrez error:", e)
         return "", ""
-    # Return the values, even if they are empty strings
-    return pubmedID, isolate
 # Step 2: Get doi link to access the paper
-def get_doi_from_pubmed_id(pubmed_id):
     cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     output = result.stdout
@@ -84,7 +81,28 @@ def get_doi_from_pubmed_id(pubmed_id):
     if match:
         return match.group(0)
     else:
-        return None  # or raise an Exception with a helpful message
 # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing

             match2 = re.search(r'isolate\s+([A-Za-z0-9_-]+)', line) # search on current line
             if match2:
               isolate = match2.group(1)'''
+from Bio import Entrez, Medline
 import re
 Entrez.email = "your_email@example.com"
     except Exception as e:
         print("❌ Entrez error:", e)
         return "", ""
 # Step 2: Get doi link to access the paper
+'''def get_doi_from_pubmed_id(pubmed_id):
     cmd = f'{os.environ["HOME"]}/edirect/esummary -db pubmed -id {pubmed_id} -format medline | grep -i "AID"'
     result = subprocess.run(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
     output = result.stdout
     if match:
         return match.group(0)
     else:
+        return None  # or raise an Exception with a helpful message'''
+def get_doi_from_pubmed_id(pubmed_id):
+    """Return the DOI listed in the MEDLINE record of a PubMed article.
+
+    The record is fetched with ``Entrez.efetch`` in MEDLINE text format and
+    its "AID" entries are scanned for the first one tagged ``[doi]``.
+
+    Args:
+        pubmed_id: PubMed identifier passed as ``id`` to ``Entrez.efetch``.
+
+    Returns:
+        The DOI string (the text before the first space of the matching
+        "AID" entry), or None when ``pubmed_id`` is empty, no record or no
+        DOI entry is found, or the lookup raises.
+    """
+    # An empty ID cannot match a record; skip the network request entirely.
+    if not pubmed_id:
+        return None
+    try:
+        handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="medline", retmode="text")
+        try:
+            records = list(Medline.parse(handle))
+        finally:
+            # Close the handle even when parsing fails so it is not leaked.
+            handle.close()
+        if not records:
+            return None
+        record = records[0]
+        for aid in record.get("AID", []):
+            if "[doi]" in aid:
+                return aid.split(" ")[0]  # extract the DOI
+        return None
+    except Exception as e:
+        # Best-effort lookup: report the failure and let the caller continue.
+        print(f"❌ Failed to get DOI from PubMed ID {pubmed_id}: {e}")
+        return None
 # Step 3: Extract Text: Get the paper (html text), sup. materials (pdf, doc, excel) and do text-preprocessing