Spaces:

VyLala
/

BioMetadataAudit

Running

App Files Community

VyLala committed on Aug 9, 2025

Commit

fb8cfb6

verified ·

1 Parent(s): 20f8860

Update pipeline.py

Browse files

Files changed (1) hide show

pipeline.py +38 -3

pipeline.py CHANGED Viewed

@@ -245,10 +245,14 @@ def unique_preserve_order(seq):
     seen = set()
     return [x for x in seq if not (x in seen or seen.add(x))]
 # Main execution
-def pipeline_with_gemini(accessions,niche_cases=None):
   # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
   # there can be one accession number in the accessions
   # Prices are per 1,000 tokens
   PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
   PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
   PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
@@ -330,6 +334,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
       file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
       # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
       # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
       print(file_chunk_path)
       chunk_id = find_drive_file(chunk_filename, sample_folder_id)
       all_id = find_drive_file(all_filename, sample_folder_id)
@@ -386,6 +393,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
       accession, isolate = None, None
       if acc != "unknown":  accession = acc
       if iso != "unknown":  isolate = iso
       # check doi first
       if doi != "unknown":
         link = 'https://doi.org/' + doi
@@ -413,6 +423,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
         # filter the quality link
         print("saveLinkFolder as sample folder id: ", sample_folder_id)
         print("start the smart filter link")
         # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
         # if success_process:
         #   links = output_process
@@ -439,6 +452,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
       #   if not all_output:
       #     text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
       #     all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
       if chunk_exists:
         print("File chunk exists!")
         if not chunk:
@@ -466,6 +482,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
               else: query_kw = acc
               #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
               success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100)
               if success_process:
                 text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
                 print("yes succeed for process document")
@@ -474,6 +493,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
               if context !=  "Sample ID not found.":
                 if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
                   success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
                   if success_chunk:
                     chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                     print("yes succeed for chunk")
@@ -492,6 +514,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
               if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000:
                 print("Running merge_texts_skipping_overlap with timeout")
                 success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
                 print("Returned from timeout logic")
                 if success:
                   all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
@@ -511,7 +536,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
                   print("basic fall back")
               print("len all output after: ", len(all_output))
           #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
         else:
           chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
           all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
@@ -621,7 +648,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
           print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
           plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
           master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
       primary_word = iso
       alternative_word = acc
       print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
@@ -634,6 +663,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
       print(chunk)
       print("this is all output for the model")
       print(all_output)
       country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
           primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
           model.call_llm_api, chunk=chunk, all_output=all_output)
@@ -687,6 +719,9 @@ def pipeline_with_gemini(accessions,niche_cases=None):
           if len(method_used + sample_type_explanation)> 0:
             acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
       total_cost_title += total_query_cost
       # last resort: combine all information to give all output otherwise unknown
       if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown":
         text = ""

     seen = set()
     return [x for x in seq if not (x in seen or seen.add(x))]
 # Main execution
+def pipeline_with_gemini(accessions,stop_flag=None, niche_cases=None):
   # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
   # there can be one accession number in the accessions
   # Prices are per 1,000 tokens
+  # Before each big step:
+  if stop_flag is not None and stop_flag.value:
+    print(f"🛑 Stop detected before starting {accession}, aborting early...")
+    return {}
   PRICE_PER_1K_INPUT_LLM = 0.000075  # $0.075 per 1M tokens
   PRICE_PER_1K_OUTPUT_LLM = 0.0003   # $0.30 per 1M tokens
   PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
       file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
       # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
       # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
+      if stop_flag is not None and stop_flag.value:
+        print(f"🛑 Stop processing {accession}, aborting early...")
+        return {}
       print(file_chunk_path)
       chunk_id = find_drive_file(chunk_filename, sample_folder_id)
       all_id = find_drive_file(all_filename, sample_folder_id)
       accession, isolate = None, None
       if acc != "unknown":  accession = acc
       if iso != "unknown":  isolate = iso
+      if stop_flag is not None and stop_flag.value:
+        print(f"🛑 Stop processing {accession}, aborting early...")
+        return {}
       # check doi first
       if doi != "unknown":
         link = 'https://doi.org/' + doi
         # filter the quality link
         print("saveLinkFolder as sample folder id: ", sample_folder_id)
         print("start the smart filter link")
+        if stop_flag is not None and stop_flag.value:
+            print(f"🛑 Stop processing {accession}, aborting early...")
+            return {}
         # success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,sample_folder_id),kwargs={"accession":acc})
         # if success_process:
         #   links = output_process
       #   if not all_output:
       #     text_all, table_all, document_title_all = model.read_docx_text(file_all_path)
       #     all_output = data_preprocess.normalize_for_overlap(text_all) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table_all))
+      if stop_flag is not None and stop_flag.value:
+        print(f"🛑 Stop processing {accession}, aborting early...")
+        return {}
       if chunk_exists:
         print("File chunk exists!")
         if not chunk:
               else: query_kw = acc
               #text_link, tables_link, final_input_link = data_preprocess.preprocess_document(link,saveLinkFolder, isolate=query_kw)
               success_process, output_process = run_with_timeout(data_preprocess.preprocess_document,args=(link,sample_folder_id),kwargs={"isolate":query_kw,"accession":acc},timeout=100)
+              if stop_flag is not None and stop_flag.value:
+                print(f"🛑 Stop processing {accession}, aborting early...")
+                return {}
               if success_process:
                 text_link, tables_link, final_input_link = output_process[0], output_process[1], output_process[2]
                 print("yes succeed for process document")
               if context !=  "Sample ID not found.":
                 if len(data_preprocess.normalize_for_overlap(chunk)) < 1000*1000:
                   success_chunk, the_output_chunk = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(chunk, context))
+                  if stop_flag is not None and stop_flag.value:
+                    print(f"🛑 Stop processing {accession}, aborting early...")
+                    return {}
                   if success_chunk:
                     chunk = the_output_chunk#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                     print("yes succeed for chunk")
               if len(data_preprocess.normalize_for_overlap(all_output)) < int(100000) and len(final_input_link)<100000:
                 print("Running merge_texts_skipping_overlap with timeout")
                 success, the_output = run_with_timeout(data_preprocess.merge_texts_skipping_overlap,args=(all_output, final_input_link),timeout=30)
+                if stop_flag is not None and stop_flag.value:
+                  print(f"🛑 Stop processing {accession}, aborting early...")
+                  return {}
                 print("Returned from timeout logic")
                 if success:
                   all_output = the_output#data_preprocess.merge_texts_skipping_overlap(all_output, final_input_link)
                   print("basic fall back")
               print("len all output after: ", len(all_output))
           #country_pro, chunk, all_output = data_preprocess.process_inputToken(links, saveLinkFolder, accession=accession, isolate=isolate)
+        if stop_flag is not None and stop_flag.value:
+          print(f"🛑 Stop processing {accession}, aborting early...")
+          return {}
         else:
           chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
           all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
           print("\nRAG assets loaded from file. No re-embedding of entire document will occur.")
           plain_text_content_all, table_strings_all, document_title_all = model.read_docx_text(file_all_path)
           master_structured_lookup['document_title'] = master_structured_lookup.get('document_title', document_title_all)
+      if stop_flag is not None and stop_flag.value:
+        print(f"🛑 Stop processing {accession}, aborting early...")
+        return {}
       primary_word = iso
       alternative_word = acc
       print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
       print(chunk)
       print("this is all output for the model")
       print(all_output)
+      if stop_flag is not None and stop_flag.value:
+        print(f"🛑 Stop processing {accession}, aborting early...")
+        return {}
       country, sample_type, method_used, country_explanation, sample_type_explanation, total_query_cost =  model.query_document_info(
           primary_word, alternative_word, meta, master_structured_lookup, faiss_index, document_chunks,
           model.call_llm_api, chunk=chunk, all_output=all_output)
           if len(method_used + sample_type_explanation)> 0:
             acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
       total_cost_title += total_query_cost
+      if stop_flag is not None and stop_flag.value:
+        print(f"🛑 Stop processing {accession}, aborting early...")
+        return {}
       # last resort: combine all information to give all output otherwise unknown
       if len(acc_score["country"]) == 0 or len(acc_score["sample_type"]) == 0 or acc_score["country"] == "unknown" or acc_score["sample_type"] == "unknown":
         text = ""