Spaces:

VyLala
/

BioMetadataAudit

Build error

App Files Community

VyLala committed on Dec 21, 2025

Commit

82a0c67

verified ·

1 Parent(s): 0de9969

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +35 -3

pipeline.py CHANGED Viewed

@@ -311,7 +311,17 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
                    "time_cost":None,
                    "source":links,
                     "file_chunk":"",
-                   "file_all_output":""}
       if niche_cases:
         for niche in niche_cases:
           acc_score[niche] = {}
@@ -327,6 +337,8 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
       if pudID:
         id = str(pudID)
         saveTitle = title
       else:
         try:
           author_name = meta_expand["authors"].split(',')[0]  # Use last name only
@@ -396,6 +408,10 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         if stand_country.lower() != "not found":
           acc_score["country"][stand_country.lower()] = ["ncbi"]
         else: acc_score["country"][country.lower()] = ["ncbi"]
       if sample_type.lower() != "unknown":
         acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
       # second way: LLM model
@@ -841,7 +857,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         print("this is text for the last resort model")
         print(text)
-        predicted_outputs, method_used, total_query_cost, more_links = await model.query_document_info(
           niche_cases=niche_cases,
           query_word=primary_word, alternative_query_word=alternative_word,
           saveLinkFolder = sample_folder_id,
@@ -851,7 +867,12 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
         print("add more links from model.query document")
         if more_links:
           links += more_links
-          acc_score["source"] = links
         print("this is llm results: ")
         for pred_out in predicted_outputs:
             # only for country, we have to standardize
@@ -865,6 +886,9 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
                 stand_country = standardize_location.smart_country_lookup(country.lower())
                 if clean_country == "unknown" and stand_country.lower() == "not found":
                   country = "unknown"
                 if country.lower() != "unknown":
                   stand_country = standardize_location.smart_country_lookup(country.lower())
                   print("this is stand_country: ", stand_country)
@@ -874,6 +898,8 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
                         acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
                     else:
                       acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
                   else:
                     if country.lower() in acc_score["country"]:
                       if country_explanation:
@@ -882,6 +908,12 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
                     else:
                       if len(method_used + country_explanation) > 0:
                         acc_score["country"][country.lower()] = [method_used + country_explanation]
             # for sample type
             elif pred_out == "modern/ancient/unknown":
               sample_type = predicted_outputs[pred_out]["answer"]

                    "time_cost":None,
                    "source":links,
                     "file_chunk":"",
+                   "file_all_output":"",
+                   "signals":{ # default values
+                              "has_geo_loc_name": False,
+                              "has_pubmed": False,
+                              "accession_found_in_text": False,
+                              "predicted_country": None,
+                              "genbank_country": None,
+                              "num_publications": 0,
+                              "missing_key_fields": False,
+                              "known_failure_pattern": False,},
+                  }
       if niche_cases:
         for niche in niche_cases:
           acc_score[niche] = {}
       if pudID:
         id = str(pudID)
         saveTitle = title
+        # save in signals that pubmed exists
+        acc_score["signals"]["has_pubmed"] = True
       else:
         try:
           author_name = meta_expand["authors"].split(',')[0]  # Use last name only
         if stand_country.lower() != "not found":
           acc_score["country"][stand_country.lower()] = ["ncbi"]
         else: acc_score["country"][country.lower()] = ["ncbi"]
+        # write in a signals for existing country in ncbi
+        acc_score["signals"]["has_geo_loc_name"] = True
+        acc_score["signals"]["genbank_country"] = list(acc_score["country"].keys())[0]
+        acc_score["signals"]["num_publications"] += 1 # ncbi also counts as 1 source
       if sample_type.lower() != "unknown":
         acc_score["sample_type"][sample_type.lower()] = ["ncbi"]
       # second way: LLM model
         print("this is text for the last resort model")
         print(text)
+        predicted_outputs, method_used, total_query_cost, more_links, accession_found_in_text = await model.query_document_info(
           niche_cases=niche_cases,
           query_word=primary_word, alternative_query_word=alternative_word,
           saveLinkFolder = sample_folder_id,
         print("add more links from model.query document")
         if more_links:
           links += more_links
+          acc_score["source"] = links
+        # add into the number of publications
+        acc_score["signals"]["num_publication"] += len(acc_score["source"])
+        # add if accession_found_in_text or not
+        acc_score["signals"]["accession_found_in_text"] = accession_found_in_text
         print("this is llm results: ")
         for pred_out in predicted_outputs:
             # only for country, we have to standardize
                 stand_country = standardize_location.smart_country_lookup(country.lower())
                 if clean_country == "unknown" and stand_country.lower() == "not found":
                   country = "unknown"
+                  # predicted country is unknown
+                  acc_score["signals"]["predicted_country"] = "unknown"
+                  acc_score["signals"]["known_failure_pattern"] = True
                 if country.lower() != "unknown":
                   stand_country = standardize_location.smart_country_lookup(country.lower())
                   print("this is stand_country: ", stand_country)
                         acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
                     else:
                       acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
+                    # predicted country is non unknown
+                    acc_score["signals"]["predicted_country"] = stand_country.lower()
                   else:
                     if country.lower() in acc_score["country"]:
                       if country_explanation:
                     else:
                       if len(method_used + country_explanation) > 0:
                         acc_score["country"][country.lower()] = [method_used + country_explanation]
+                    # predicted country is non unknown
+                    acc_score["signals"]["predicted_country"] = country.lower()
+            else:
+              # predicted country is unknown
+              acc_score["signals"]["predicted_country"] = "unknown"
+              acc_score["signals"]["known_failure_pattern"] = True
             # for sample type
             elif pred_out == "modern/ancient/unknown":
               sample_type = predicted_outputs[pred_out]["answer"]