VyLala committed on
Commit
d9c1837
·
verified ·
1 Parent(s): 06aa1bb

Update pipeline.py

Browse files
Files changed (1) hide show
  1. pipeline.py +226 -499
pipeline.py CHANGED
@@ -216,18 +216,18 @@ async def fetch_all(links, timeout=15):
216
  tasks = [fetch_url(session, l, timeout) for l in links]
217
  return await asyncio.gather(*tasks)
218
 
219
- async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links, all_output, chunk):
220
  print(link)
221
  if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
222
  print("break here")
223
- return "", all_output, chunk # nothing more for this link
224
 
225
  query_kw = iso if iso != "unknown" else acc
226
 
227
  # --- text extraction ---
228
- if out_links and link in out_links:
229
  print("yeah art_text available")
230
- text_link = out_links[link]
231
  else:
232
  try:
233
  print("start preprocess and extract text")
@@ -241,14 +241,15 @@ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links
241
  asyncio.to_thread(data_preprocess.extract_table, link, saveLinkFolder),
242
  timeout=10
243
  )
244
- print("this is table link: ", str(table_links))
245
-
246
  except Exception:
247
  tables_link = []
248
 
249
  # --- merge ---
250
  try:
251
  print("just merge text and tables")
 
 
252
  try:
253
  final_input_link = text_link + ", ".join(tables_link)
254
  except:
@@ -256,20 +257,113 @@ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links
256
  except Exception:
257
  print("no succeed here in preprocess docu")
258
  final_input_link = ""
259
-
260
- # --- context extraction ---
261
- context = data_preprocess.extract_context(final_input_link, query_kw)
262
- chunk += context
263
-
264
  # --- normalize output ---
265
  if len(final_input_link) > 1000000:
266
  final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
267
  if len(final_input_link) > 1000000:
268
  final_input_link = final_input_link[:1000000]
269
 
270
- all_output = all_output+ data_preprocess.normalize_for_overlap(all_output) + final_input_link
271
- print("done process link chunk_alloutput")
272
- return context, all_output, chunk
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
 
274
  from Bio import Entrez
275
  Entrez.email = "your_email@example.com" # required by NCBI
@@ -277,15 +371,11 @@ Entrez.email = "your_email@example.com" # required by NCBI
277
  # Main execution
278
  async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_cases=None):
279
  # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
280
- # there can be one accession number in the accessions
281
  # Prices are per 1,000 tokens
282
  # Before each big step:
283
  if stop_flag is not None and stop_flag.value:
284
  print(f"🛑 Stop detected before starting {accession}, aborting early...")
285
  return {}
286
- # PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
287
- # PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
288
- # PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
289
  # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
290
  PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
291
  PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
@@ -297,8 +387,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
297
  return None
298
  else:
299
  accs_output = {}
300
- #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
301
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
302
  for acc in accessions:
303
  print("start gemini: ", acc)
304
  start = time.time()
@@ -310,8 +399,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
310
  "query_cost":total_cost_title,
311
  "time_cost":None,
312
  "source":links,
313
- "file_chunk":"",
314
- "file_all_output":"",
315
  "signals":{ # default values
316
  "has_geo_loc_name": False,
317
  "has_pubmed": False,
@@ -333,7 +421,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
333
  meta_expand = smart_fallback.fetch_ncbi(acc)
334
  print("meta expand: ", meta_expand)
335
  # set up step: create the folder to save document
336
- chunk, all_output, out_links, other_explain = "","", {}, ""
337
  if pudID:
338
  id = str(pudID)
339
  saveTitle = title
@@ -353,53 +441,37 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
353
  print("sample folder id: ", sample_folder_id)
354
 
355
  safe_title = sanitize_filename(saveTitle, 50)
356
- chunk_filename = f"{safe_title}_merged_document.docx"
357
  all_filename = f"{safe_title}_all_merged_document.docx"
358
- print("chunk file name and all filename: ", chunk_filename, all_filename)
359
  # Define local temp paths for reading/writing
360
- # import tempfile
361
- # tmp_dir = tempfile.mkdtemp()
362
  LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
363
  os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
364
- file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
365
  file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
366
- # file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
367
- # file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
368
  if stop_flag is not None and stop_flag.value:
369
  print(f"🛑 Stop processing {accession}, aborting early...")
370
  return {}
371
- print("this is file chunk path: ", file_chunk_path)
372
- chunk_id = find_drive_file(chunk_filename, sample_folder_id)
373
  all_id = find_drive_file(all_filename, sample_folder_id)
374
 
375
- if chunk_id and all_id:
376
  print("✅ Files already exist in Google Drive. Downloading them...")
377
- chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
378
  all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
379
- acc_score["file_chunk"] = str(chunk_filename)
380
  acc_score["file_all_output"] = str(all_filename)
381
- print("chunk_id and all_id: ")
382
- print(chunk_id, all_id)
383
- print("file chunk and all output saved in acc score: ", acc_score["file_chunk"], acc_score["file_all_output"])
384
  file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
385
  print("📄 Name:", file["name"])
386
  print("📁 Parent folder ID:", file["parents"][0])
387
  print("🔗 View link:", file["webViewLink"])
388
-
389
-
390
- # Read and parse these into `chunk` and `all_output`
391
  else:
392
  # 🔥 Remove any stale local copies
393
- if os.path.exists(file_chunk_path):
394
- os.remove(file_chunk_path)
395
- print(f"🗑️ Removed stale: {file_chunk_path}")
396
  if os.path.exists(file_all_path):
397
  os.remove(file_all_path)
398
  print(f"🗑️ Removed stale: {file_all_path}")
399
  # Try to download if already exists on Drive
400
- chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
401
  all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
402
- print("chunk exist: ", chunk_exists)
403
  # first way: ncbi method
404
  print("country.lower: ",country.lower())
405
  if country.lower() != "unknown":
@@ -417,23 +489,10 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
417
  # second way: LLM model
418
  # Preprocess the input token
419
  print(acc_score)
420
- accession, isolate = None, None
421
- if acc != "unknown": accession = acc
422
- if iso != "unknown": isolate = iso
423
  if stop_flag is not None and stop_flag.value:
424
  print(f"🛑 Stop processing {accession}, aborting early...")
425
  return {}
426
  # check doi first
427
- print("chunk filename: ", chunk_filename)
428
- if chunk_exists:
429
- print("File chunk exists!")
430
- if not chunk:
431
- print("start to get chunk")
432
- text, table, document_title = model.read_docx_text(file_chunk_path)
433
- chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
434
- if str(chunk_filename) != "":
435
- print("first time have chunk path at chunk exist: ", str(chunk_filename))
436
- acc_score["file_chunk"] = str(chunk_filename)
437
  if all_exists:
438
  print("File all output exists!")
439
  if not all_output:
@@ -442,121 +501,22 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
442
  if str(all_filename) != "":
443
  print("first time have all path at all exist: ", str(all_filename))
444
  acc_score["file_all_output"] = str(all_filename)
445
- print("acc sscore for file all output and chunk: ", acc_score["file_all_output"], acc_score["file_chunk"])
446
- if len(acc_score["file_all_output"]) == 0 and len(acc_score["file_chunk"]) == 0:
447
- if doi != "unknown":
448
- link = 'https://doi.org/' + doi
449
- # get the file to create listOfFile for each id
450
- print("link of doi: ", link)
451
- # html = extractHTML.HTML("",link)
452
- html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
453
- jsonSM = html.getSupMaterial()
454
- article_text = await html.async_getListSection() # html.getListSection()
455
- if len(article_text) == 0:
456
- # try crossAPI
457
- metadata_text = html.fetch_crossref_metadata(link)
458
- if metadata_text:
459
- print(f"✅ CrossRef metadata fetched for {link}")
460
- other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
461
- article_text = html.mergeTextInJson(metadata_text)
462
- # also try searching pubmed with the title and extract abstract and add to article text
463
- # Step 1: Search for the paper
464
- print("search the paper's abstract on pubmed")
465
- handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
466
- record = Entrez.read(handle)
467
- id_list = record.get("IdList", [])
468
-
469
- if not id_list:
470
- print("No PubMed results found.")
471
- else:
472
- pubmed_id = id_list[0]
473
- fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
474
- fetch_record = Entrez.read(fetch_handle)
475
-
476
- # Safe extraction
477
- article = fetch_record.get("PubmedArticle", [])
478
- if not article:
479
- print("No PubmedArticle entry returned.")
480
- else:
481
- article = article[0] # the real payload
482
- try:
483
- abstract_sections = (
484
- article["MedlineCitation"]["Article"]
485
- .get("Abstract", {})
486
- .get("AbstractText", [])
487
- )
488
- full_abstract = " ".join(str(s) for s in abstract_sections)
489
-
490
- if full_abstract.strip():
491
- print("Abstract found (len={}):".format(len(full_abstract)))
492
- #print(full_abstract)
493
- article_text += full_abstract
494
- else:
495
- print("This article has **no abstract available on PubMed**.")
496
-
497
- except KeyError:
498
- print("Abstract field missing in this PubMed record.")
499
-
500
- if article_text:
501
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
502
- out_links[link] = article_text
503
- links.append(link)
504
- if jsonSM:
505
- links += sum((jsonSM[key] for key in jsonSM),[])
506
- if links:
507
- for l in links:
508
- out_links[l] = ""
509
- # no doi then google custom search api
510
- if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
511
- # might find the article
512
- print("no article text, start tem link")
513
- tem_links = smart_fallback.smart_google_search(acc, meta_expand)
514
- print("tem links: ", tem_links)
515
- tem_links = unique_preserve_order(tem_links)
516
- print("tem link before filtering: ", tem_links)
517
- print("saveLinkFolder as sample folder id: ", sample_folder_id)
518
- print("start the smart filter link")
519
- if stop_flag is not None and stop_flag.value:
520
- print(f"🛑 Stop processing {acc}, aborting early...")
521
- return {}
522
- output_process = await smart_fallback.async_filter_links_by_metadata(
523
- tem_links, sample_folder_id, accession=acc
524
- )
525
- if output_process: #success_process:
526
- out_links.update(output_process)
527
- print("yeah we have out_link and len: ", len(out_links))
528
- print("yes succeed for smart filter link")
529
- links += list(out_links.keys())
530
- print("link keys: ", links)
531
- else:
532
- print("no suceed, fallback to all tem links")
533
- links += tem_links
534
- print("this is links: ",links)
535
  links = unique_preserve_order(links)
 
536
  acc_score["source"] = links
537
  else:
538
  print("inside the try of reusing chunk or all output")
539
- #print("chunk filename: ", str(chunks_filename))
540
-
541
  try:
542
  temp_source = False
543
  if save_df is not None and not save_df.empty:
544
  print("save df not none")
545
- print("chunk file name: ",str(chunk_filename))
546
  print("all filename: ",str(all_filename))
547
- print("acc score for file chunk: ", acc_score["file_chunk"])
548
  print("acc score for file all output: ", acc_score["file_all_output"])
549
- if acc_score["file_chunk"]:
550
- link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0]
551
- #link = row["Sources"].iloc[0]
552
- if "http" in link:
553
- print("yeah http in save df source")
554
- acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
555
- else: # temporary
556
- print("tempo source")
557
- #acc_score["source"] = [str(all_filename), str(chunks_filename)]
558
- temp_source = True
559
- elif acc_score["file_all_output"]:
560
  link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
561
  #link = row["Sources"].iloc[0]
562
  print(link)
@@ -579,189 +539,16 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
579
  temp_source = True
580
  if temp_source:
581
  print("temp source is true so have to try again search link")
582
- if doi != "unknown":
583
- link = 'https://doi.org/' + doi
584
- # get the file to create listOfFile for each id
585
- print("link of doi: ", link)
586
- #html = extractHTML.HTML("",link)
587
- html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
588
- jsonSM = html.getSupMaterial()
589
- article_text = await html.async_getListSection() # html.getListSection()
590
- if len(article_text) == 0:
591
- # try crossAPI
592
- metadata_text = html.fetch_crossref_metadata(link)
593
- if metadata_text:
594
- print(f"✅ CrossRef metadata fetched for {link}")
595
- other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
596
- article_text = html.mergeTextInJson(metadata_text)
597
- # Step 1: Search for the paper
598
- print("search the paper's abstract on pubmed")
599
- handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
600
- record = Entrez.read(handle)
601
- id_list = record.get("IdList", [])
602
-
603
- if not id_list:
604
- print("No PubMed results found.")
605
- else:
606
- pubmed_id = id_list[0]
607
- fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
608
- fetch_record = Entrez.read(fetch_handle)
609
-
610
- # Safe extraction
611
- article = fetch_record.get("PubmedArticle", [])
612
- if not article:
613
- print("No PubmedArticle entry returned.")
614
- else:
615
- article = article[0] # the real payload
616
- try:
617
- abstract_sections = (
618
- article["MedlineCitation"]["Article"]
619
- .get("Abstract", {})
620
- .get("AbstractText", [])
621
- )
622
- full_abstract = " ".join(str(s) for s in abstract_sections)
623
-
624
- if full_abstract.strip():
625
- print("Abstract found (len={}):".format(len(full_abstract)))
626
- #print(full_abstract)
627
- article_text += full_abstract
628
- else:
629
- print("This article has **no abstract available on PubMed**.")
630
-
631
- except KeyError:
632
- print("Abstract field missing in this PubMed record.")
633
-
634
- if article_text:
635
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
636
- out_links[link] = article_text
637
- links.append(link)
638
- if jsonSM:
639
- links += sum((jsonSM[key] for key in jsonSM),[])
640
- if links:
641
- for l in links:
642
- out_links[l] = ""
643
- # no doi then google custom search api
644
- if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
645
- # might find the article
646
- print("no article text, start tem link")
647
- #tem_links = mtdna_classifier.search_google_custom(title, 2)
648
- tem_links = smart_fallback.smart_google_search(acc, meta_expand)
649
- print("tem links: ", tem_links)
650
- tem_links = unique_preserve_order(tem_links)
651
- print("tem link before filtering: ", tem_links)
652
- # filter the quality link
653
- print("saveLinkFolder as sample folder id: ", sample_folder_id)
654
- print("start the smart filter link")
655
- if stop_flag is not None and stop_flag.value:
656
- print(f"🛑 Stop processing {acc}, aborting early...")
657
- return {}
658
- output_process = await smart_fallback.async_filter_links_by_metadata(
659
- tem_links, sample_folder_id, accession=acc
660
- )
661
- if output_process:#success_process:
662
- out_links.update(output_process)
663
- print("yeah we have out_link and len: ", len(out_links))
664
- print("yes succeed for smart filter link")
665
- links += list(out_links.keys())
666
- print("link keys: ", links)
667
- else:
668
- print("no suceed, fallback to all tem links")
669
- links += tem_links
670
- print("this is links: ",links)
671
  links = unique_preserve_order(links)
 
672
  acc_score["source"] = links
673
  except:
674
  try:
675
  print("in the exception and start to get link")
676
- if doi != "unknown":
677
- link = 'https://doi.org/' + doi
678
- # get the file to create listOfFile for each id
679
- print("link of doi: ", link)
680
- #html = extractHTML.HTML("",link)
681
- html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
682
- jsonSM = html.getSupMaterial()
683
- article_text = await html.async_getListSection() # html.getListSection()
684
- if len(article_text) == 0:
685
- # try crossAPI
686
- metadata_text = html.fetch_crossref_metadata(link)
687
- if metadata_text:
688
- print(f"✅ CrossRef metadata fetched for {link}")
689
- other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
690
- article_text = html.mergeTextInJson(metadata_text)
691
- # Step 1: Search for the paper
692
- print("search the paper's abstract on pubmed")
693
- handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
694
- record = Entrez.read(handle)
695
- id_list = record.get("IdList", [])
696
-
697
- if not id_list:
698
- print("No PubMed results found.")
699
- else:
700
- pubmed_id = id_list[0]
701
- fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
702
- fetch_record = Entrez.read(fetch_handle)
703
-
704
- # Safe extraction
705
- article = fetch_record.get("PubmedArticle", [])
706
- if not article:
707
- print("No PubmedArticle entry returned.")
708
- else:
709
- article = article[0] # the real payload
710
- try:
711
- abstract_sections = (
712
- article["MedlineCitation"]["Article"]
713
- .get("Abstract", {})
714
- .get("AbstractText", [])
715
- )
716
- full_abstract = " ".join(str(s) for s in abstract_sections)
717
-
718
- if full_abstract.strip():
719
- print("Abstract found (len={}):".format(len(full_abstract)))
720
- #print(full_abstract)
721
- article_text += full_abstract
722
- else:
723
- print("This article has **no abstract available on PubMed**.")
724
-
725
- except KeyError:
726
- print("Abstract field missing in this PubMed record.")
727
- if article_text:
728
- if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
729
- out_links[link] = article_text
730
- links.append(link)
731
- if jsonSM:
732
- links += sum((jsonSM[key] for key in jsonSM),[])
733
- if links:
734
- for l in links:
735
- out_links[l] = ""
736
- # no doi then google custom search api
737
- if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
738
- # might find the article
739
- print("no article text, start tem link")
740
- #tem_links = mtdna_classifier.search_google_custom(title, 2)
741
- tem_links = smart_fallback.smart_google_search(acc, meta_expand)
742
- print("tem links: ", tem_links)
743
- tem_links = unique_preserve_order(tem_links)
744
- print("tem link before filtering: ", tem_links)
745
- # filter the quality link
746
- print("saveLinkFolder as sample folder id: ", sample_folder_id)
747
- print("start the smart filter link")
748
- if stop_flag is not None and stop_flag.value:
749
- print(f"🛑 Stop processing {acc}, aborting early...")
750
- return {}
751
- output_process = await smart_fallback.async_filter_links_by_metadata(
752
- tem_links, sample_folder_id, accession=acc
753
- )
754
- if output_process:#success_process:
755
- out_links.update(output_process)
756
- print("yeah we have out_link and len: ", len(out_links))
757
- print("yes succeed for smart filter link")
758
- links += list(out_links.keys())
759
- print("link keys: ", links)
760
- else:
761
- print("no suceed, fallback to all tem links")
762
- links += tem_links
763
- print("this is links: ",links)
764
  links = unique_preserve_order(links)
 
765
  acc_score["source"] = links
766
  except:
767
  print("except of except for source")
@@ -769,192 +556,132 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
769
  if stop_flag is not None and stop_flag.value:
770
  print(f"🛑 Stop processing {accession}, aborting early...")
771
  return {}
772
- if not chunk and not all_output:
773
- print("not chunk and all output")
774
- # else: check if we can reuse these chunk and all output of existed accession to find another
775
- if str(chunk_filename) != "":
776
- print("first time have chunk path: ", str(chunk_filename))
777
- acc_score["file_chunk"] = str(chunk_filename)
778
- if str(all_filename) != "":
779
- print("first time have all path: ", str(all_filename))
780
- acc_score["file_all_output"] = str(all_filename)
781
- if links:
782
- tasks = [
783
- process_link_chunk_allOutput(link, iso, acc, sample_folder_id, out_links, all_output, chunk)
784
- for link in links
785
- ]
786
- results = await asyncio.gather(*tasks)
787
-
788
- # combine results
789
- for context, new_all_output, new_chunk in results:
790
- all_output += new_all_output
791
- chunk += new_chunk
792
-
793
- else:
794
- chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
795
- all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
796
- if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
797
- if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
798
- if len(all_output) > 1*1000*1000:
799
- all_output = data_preprocess.normalize_for_overlap(all_output)
800
- if len(chunk) > 1*1000*1000:
801
- chunk = data_preprocess.normalize_for_overlap(chunk)
802
- print("chunk len: ", len(chunk))
803
- print("all output len: ", len(all_output))
804
  # use build context for llm function to reduce token
 
805
  reduce_context_for_llm = ""
806
- if len(all_output)>900000 or len(chunk)>900000:
807
  texts_reduce = []
808
  out_links_reduce = {}
809
- if links:
810
- for link in links:
811
- all_output_reduce, chunk_reduce, context_reduce = "", "",""
812
- context_reduce, all_output_reduce, chunk_reduce = await process_link_chunk_allOutput(link,
813
- iso, acc, sample_folder_id, out_links_reduce,
814
- all_output_reduce, chunk_reduce)
815
- texts_reduce.append(all_output_reduce)
816
- out_links_reduce[link] = {"all_output": all_output_reduce}
817
- input_prompt = ["country_name", "modern/ancient/unknown"]
818
- if niche_cases: input_prompt += niche_cases
819
- reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt)
820
  if reduce_context_for_llm:
821
  print("reduce context for llm")
822
  all_output = reduce_context_for_llm
823
  else:
824
  print("reduce context no succeed")
825
- all_output = all_output[:900000]
826
-
827
- data_preprocess.save_text_to_docx(chunk, file_chunk_path)
828
- data_preprocess.save_text_to_docx(all_output, file_all_path)
829
- # Later when saving new files
830
- # data_preprocess.save_text_to_docx(chunk, chunk_filename, sample_folder_id)
831
- # data_preprocess.save_text_to_docx(all_output, all_filename, sample_folder_id)
832
-
833
- # Upload to Drive
834
- result_chunk_upload = upload_file_to_drive(file_chunk_path, chunk_filename, sample_folder_id)
835
- result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
836
- print("UPLOAD RESULT FOR CHUNK: ", result_chunk_upload)
837
- print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
838
- print("here 1")
839
-
840
- print("here 2")
841
-
842
- primary_word = iso
843
- alternative_word = acc
844
- print(f"\n--- General Query: Primary='{primary_word}' (Alternative='{alternative_word}') ---")
845
- if features.lower() not in all_output.lower():
846
- all_output += ". NCBI Features: " + features
847
- print("do last resort")
848
- if True:
849
- text = ""
850
- for key in meta_expand:
851
- text += str(key) + ": " + meta_expand[key] + "\n"
852
- if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
853
- text += data_preprocess.normalize_for_overlap(all_output)
854
- if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
855
- text += data_preprocess.normalize_for_overlap(chunk)
856
- text += ". NCBI Features: " + features
857
- print("this is text for the last resort model")
858
- print(text)
859
-
860
- predicted_outputs, method_used, total_query_cost, more_links, accession_found_in_text = await model.query_document_info(
861
- niche_cases=niche_cases,
862
- query_word=primary_word, alternative_query_word=alternative_word,
863
- saveLinkFolder = sample_folder_id,
864
- metadata=meta,
865
- master_structured_lookup=None, faiss_index=None, document_chunks=None,
866
- llm_api_function=model.call_llm_api, chunk=text, all_output=text)
867
- print("add more links from model.query document")
868
- if more_links:
869
- links += more_links
870
- acc_score["source"] = links
871
- # add into the number of publications
872
- print("done adding links into source and here are number of links: ", len(acc_score["source"]))
873
- acc_score["signals"]["num_publications"] += len(acc_score["source"])
874
- # add if accession_found_in_text or not
875
- print("accession found in text: ", accession_found_in_text)
876
- acc_score["signals"]["accession_found_in_text"] = accession_found_in_text
877
-
878
- print("this is llm results: ")
879
  for pred_out in predicted_outputs:
880
  # only for country, we have to standardize
881
  if pred_out == "country_name":
882
- country = predicted_outputs[pred_out]["answer"]
883
- country_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
884
- print(pred_out, country, country_explanation)
885
- if country_explanation: country_explanation = "-" + country_explanation
886
- if country != "unknown" and len(country)>0:
887
- clean_country = model.get_country_from_text(country.lower())
888
- stand_country = standardize_location.smart_country_lookup(country.lower())
889
- if clean_country == "unknown" and stand_country.lower() == "not found":
890
- country = "unknown"
891
- # predicted country is unknown
892
- acc_score["signals"]["predicted_country"] = "unknown"
893
- acc_score["signals"]["known_failure_pattern"] = True
894
- if country.lower() != "unknown":
895
  stand_country = standardize_location.smart_country_lookup(country.lower())
896
- print("this is stand_country: ", stand_country)
897
- if stand_country.lower() != "not found":
898
- if stand_country.lower() in acc_score["country"]:
899
- if country_explanation:
900
- acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
 
 
 
 
 
 
 
 
 
 
901
  else:
902
- acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
903
- # predicted country is non unknown
904
- acc_score["signals"]["predicted_country"] = stand_country.lower()
905
- else:
906
- if country.lower() in acc_score["country"]:
907
- if country_explanation:
908
  if len(method_used + country_explanation) > 0:
909
- acc_score["country"][country.lower()].append(method_used + country_explanation)
910
- else:
911
- if len(method_used + country_explanation) > 0:
912
- acc_score["country"][country.lower()] = [method_used + country_explanation]
913
- # predicted country is non unknown
914
- acc_score["signals"]["predicted_country"] = country.lower()
915
- else:
916
- # predicted country is unknown
917
- acc_score["signals"]["predicted_country"] = "unknown"
918
- acc_score["signals"]["known_failure_pattern"] = True
919
- # for sample type
920
- elif pred_out == "modern/ancient/unknown":
921
- sample_type = predicted_outputs[pred_out]["answer"]
922
- sample_type_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
923
- print(pred_out, sample_type, sample_type_explanation)
924
- if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
925
- if sample_type.lower() != "unknown":
926
- if sample_type.lower() in acc_score["sample_type"]:
927
- if len(method_used + sample_type_explanation) > 0:
928
- acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
929
  else:
930
- if len(method_used + sample_type_explanation)> 0:
931
- acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
932
- # for niche cases
 
 
 
 
 
 
 
 
 
 
 
 
 
933
  else:
934
- print(pred_out)
935
- if pred_out in acc_score:
936
  answer = predicted_outputs[pred_out]["answer"]
937
  answer_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
938
- print(pred_out, answer, answer_explanation)
939
  if answer_explanation: answer_explanation = "-" + answer_explanation
940
  if answer.lower() != "unknown":
941
- print(acc_score[pred_out])
942
- print(method_used + answer_explanation)
943
  if answer.lower() in acc_score[pred_out]:
944
  if len(method_used + answer_explanation) > 0:
945
  acc_score[pred_out][answer.lower()].append(method_used + answer_explanation)
946
  else:
947
- print("answer not in acc score")
948
  if len(method_used + answer_explanation) > 0:
949
- acc_score[pred_out][answer.lower()] = [method_used + answer_explanation]
 
 
 
 
 
 
 
 
 
 
 
 
 
950
 
951
- total_cost_title += total_query_cost
952
  end = time.time()
953
- #total_cost_title += total_query_cost
954
- acc_score["query_cost"] = f"{total_cost_title:.6f}"
955
- elapsed = end - start
956
  acc_score["time_cost"] = f"{elapsed:.3f} seconds"
957
  accs_output[acc] = acc_score
958
- print(accs_output[acc])
959
-
960
  return accs_output
 
216
  tasks = [fetch_url(session, l, timeout) for l in links]
217
  return await asyncio.gather(*tasks)
218
 
219
+ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, linksWithTexts, all_output):
220
  print(link)
221
  if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
222
  print("break here")
223
+ return all_output # nothing more for this link
224
 
225
  query_kw = iso if iso != "unknown" else acc
226
 
227
  # --- text extraction ---
228
+ if linksWithTexts and link in linksWithTexts and linksWithTexts[link]!="":
229
  print("yeah art_text available")
230
+ text_link = linksWithTexts[link]
231
  else:
232
  try:
233
  print("start preprocess and extract text")
 
241
  asyncio.to_thread(data_preprocess.extract_table, link, saveLinkFolder),
242
  timeout=10
243
  )
244
+ print("this is len of table link: ", len(str(table_links)))
 
245
  except Exception:
246
  tables_link = []
247
 
248
  # --- merge ---
249
  try:
250
  print("just merge text and tables")
251
+ print("len of text link before mergin: ", len(text_link))
252
+ print("len of table link before merge: ", len(", ".join(tables_link)))
253
  try:
254
  final_input_link = text_link + ", ".join(tables_link)
255
  except:
 
257
  except Exception:
258
  print("no succeed here in preprocess docu")
259
  final_input_link = ""
 
 
 
 
 
260
  # --- normalize output ---
261
  if len(final_input_link) > 1000000:
262
  final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
263
  if len(final_input_link) > 1000000:
264
  final_input_link = final_input_link[:1000000]
265
 
266
+ all_output += data_preprocess.normalize_for_overlap(all_output) + final_input_link
267
+
268
+ return all_output
269
+
270
async def extractSources(doi, linksWithTexts, links, all_output, iso, acc, saveLinkFolder, niche_cases=None, title=""):
    """Collect publication text for one accession and fold it into the shared accumulators.

    Resolution order: DOI landing page -> CrossRef metadata -> PubMed abstract
    (by *title*) -> supplementary-material links -> web search fallback via
    ``model.getMoreInfoForAcc`` when nothing usable was found.

    Args:
        doi: DOI string, or the literal "unknown".
        linksWithTexts: dict mapping source URL -> extracted text (mutated in place).
        links: list of source URLs (mutated in place).
        all_output: accumulated corpus text for this accession.
        iso: isolate identifier ("unknown" if absent); used as a search keyword.
        acc: accession identifier.
        saveLinkFolder: folder (Drive folder id / path) where downloads are cached.
        niche_cases: optional extra prompt fields forwarded to context reduction.
        title: publication title used for the PubMed abstract lookup. The
            previous revision referenced an undefined ``title`` name here
            (guaranteed NameError on that path); it is now an explicit,
            backward-compatible keyword parameter and the lookup is skipped
            when it is empty.

    Returns:
        Tuple ``(linksWithTexts, links, all_output)``.

    NOTE: the body awaits coroutines (``async_getListSection``,
    ``process_link_chunk_allOutput``, ``model.getMoreInfoForAcc``), so this
    function MUST be ``async`` — the original ``def`` was a SyntaxError.
    Callers must ``await`` it.
    """
    # Markers of anti-bot / access-denied pages; such "text" must not be kept.
    BLOCKED_MARKERS = (
        "Just a moment...Enable JavaScript and cookies to continue".lower(),
        "403 Forbidden Request".lower(),
    )

    article_text = ""
    if doi != "unknown":
        link = 'https://doi.org/' + doi
        print("link of doi: ", link)
        html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
        jsonSM = html.getSupMaterial()
        article_text = await html.async_getListSection()
        if len(article_text) == 0:
            # Full text restricted by the publisher: fall back to CrossRef
            # metadata, which keeps us compliant while still giving the LLM
            # abstract-level context.
            metadata_text = html.fetch_crossref_metadata(link)
            if metadata_text:
                print(f"✅ CrossRef metadata fetched for {link}")
                article_text = html.mergeTextInJson(metadata_text)
            # Additionally try to recover the abstract from PubMed by title.
            print("search the paper's abstract on pubmed")
            if title:
                # Step 1: search for the paper.
                handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
                record = Entrez.read(handle)
                handle.close()
                id_list = record.get("IdList", [])
                if not id_list:
                    print("No PubMed results found.")
                else:
                    # Step 2: fetch the record and extract the abstract safely.
                    pubmed_id = id_list[0]
                    fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
                    fetch_record = Entrez.read(fetch_handle)
                    fetch_handle.close()
                    article = fetch_record.get("PubmedArticle", [])
                    if not article:
                        print("No PubmedArticle entry returned.")
                    else:
                        article = article[0]  # the real payload
                        try:
                            abstract_sections = (
                                article["MedlineCitation"]["Article"]
                                .get("Abstract", {})
                                .get("AbstractText", [])
                            )
                            full_abstract = " ".join(str(s) for s in abstract_sections)
                            if full_abstract.strip():
                                print("Abstract found (len={}):".format(len(full_abstract)))
                                article_text += full_abstract
                            else:
                                print("This article has **no abstract available on PubMed**.")
                        except KeyError:
                            print("Abstract field missing in this PubMed record.")
        if article_text:
            # BUGFIX: the original used `A not in text or B not in text`, which
            # is true unless BOTH markers are present; the intent ("neither
            # blocker present") requires `and` (De Morgan).
            lowered = article_text.lower()
            if all(marker not in lowered for marker in BLOCKED_MARKERS):
                linksWithTexts[link] = article_text
                links.append(link)
                all_output += article_text
        if jsonSM:
            # Flatten all supplementary-material link lists into one list.
            sup_links = sum((jsonSM[key] for key in jsonSM), [])
            if sup_links:
                links += sup_links
                for l in sup_links:
                    linksWithTexts[l] = ""
                    # BUGFIX: the defined coroutine is
                    # process_link_chunk_allOutput (the original called the
                    # nonexistent process_link_allOutput). It receives the
                    # current all_output and returns the ACCUMULATED corpus,
                    # so we assign rather than `+=` (the old `+=` duplicated
                    # everything already accumulated).
                    all_output = await process_link_chunk_allOutput(
                        l, iso, acc, saveLinkFolder, linksWithTexts, all_output
                    )
                    if len(all_output) > 10000000:
                        all_output = data_preprocess.normalize_for_overlap(all_output)
                # Shrink oversized context so it fits the LLM budget.
                print("reduce context for llm")
                reduce_context_for_llm = ""
                if len(all_output) > 1000000:
                    texts_reduce = [all_output]
                    input_prompt = ["country_name", "modern/ancient/unknown"]
                    if niche_cases:
                        input_prompt += niche_cases
                    reduce_context_for_llm = data_preprocess.build_context_for_llm(
                        texts_reduce, acc, input_prompt, 500000
                    )
                    if reduce_context_for_llm:
                        print("reduce context for llm")
                        all_output = reduce_context_for_llm
                    else:
                        # Reduction failed: hard-truncate as a last resort.
                        print("reduce context no succeed")
                        all_output = all_output[:500000]
                    print("length of context after reducing: ", len(all_output))
                print("len new all output after sup link: ", len(all_output))
    # No DOI, empty text, or a blocked page: fall back to the search pipeline.
    lowered_article = article_text.lower()
    if doi == "unknown" or len(article_text) == 0 or any(marker in lowered_article for marker in BLOCKED_MARKERS):
        # might find the article
        print("no article text, start tem link")
        more_all_output, more_linksWithTexts, more_links = await model.getMoreInfoForAcc(
            iso, acc, saveLinkFolder, niche_cases
        )
        if more_all_output: all_output += more_all_output
        if more_links: links += more_links
        if more_linksWithTexts: linksWithTexts.update(more_linksWithTexts)
    return linksWithTexts, links, all_output
367
 
368
  from Bio import Entrez
369
  Entrez.email = "your_email@example.com" # required by NCBI
 
371
  # Main execution
372
  async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_cases=None):
373
  # output: country, sample_type, ethnic, location, money_cost, time_cost, explain
 
374
  # Prices are per 1,000 tokens
375
  # Before each big step:
376
  if stop_flag is not None and stop_flag.value:
377
  print(f"🛑 Stop detected before starting {accession}, aborting early...")
378
  return {}
 
 
 
379
  # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
380
  PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
381
  PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
 
387
  return None
388
  else:
389
  accs_output = {}
390
+ genai.configure(api_key=os.getenv("NEW_GOOGLE_API_KEY"))
 
391
  for acc in accessions:
392
  print("start gemini: ", acc)
393
  start = time.time()
 
399
  "query_cost":total_cost_title,
400
  "time_cost":None,
401
  "source":links,
402
+ "file_all_output":"",
 
403
  "signals":{ # default values
404
  "has_geo_loc_name": False,
405
  "has_pubmed": False,
 
421
  meta_expand = smart_fallback.fetch_ncbi(acc)
422
  print("meta expand: ", meta_expand)
423
  # set up step: create the folder to save document
424
+ all_output, linksWithTexts = "", {}
425
  if pudID:
426
  id = str(pudID)
427
  saveTitle = title
 
441
  print("sample folder id: ", sample_folder_id)
442
 
443
  safe_title = sanitize_filename(saveTitle, 50)
 
444
  all_filename = f"{safe_title}_all_merged_document.docx"
445
+ print("all filename: ", all_filename)
446
  # Define local temp paths for reading/writing
 
 
447
  LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
448
  os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
 
449
  file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
 
 
450
  if stop_flag is not None and stop_flag.value:
451
  print(f"🛑 Stop processing {accession}, aborting early...")
452
  return {}
453
+ print("this is file all path: ", file_all_path)
 
454
  all_id = find_drive_file(all_filename, sample_folder_id)
455
 
456
+ if all_id:
457
  print("✅ Files already exist in Google Drive. Downloading them...")
 
458
  all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
 
459
  acc_score["file_all_output"] = str(all_filename)
460
+ print("all_id: ")
461
+ print(all_id)
462
+ print("file all output saved in acc score: ", acc_score["file_all_output"])
463
  file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
464
  print("📄 Name:", file["name"])
465
  print("📁 Parent folder ID:", file["parents"][0])
466
  print("🔗 View link:", file["webViewLink"])
 
 
 
467
  else:
468
  # 🔥 Remove any stale local copies
 
 
 
469
  if os.path.exists(file_all_path):
470
  os.remove(file_all_path)
471
  print(f"🗑️ Removed stale: {file_all_path}")
472
  # Try to download if already exists on Drive
 
473
  all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
474
+ print("all exist: ", all_exists)
475
  # first way: ncbi method
476
  print("country.lower: ",country.lower())
477
  if country.lower() != "unknown":
 
489
  # second way: LLM model
490
  # Preprocess the input token
491
  print(acc_score)
 
 
 
492
  if stop_flag is not None and stop_flag.value:
493
  print(f"🛑 Stop processing {accession}, aborting early...")
494
  return {}
495
  # check doi first
 
 
 
 
 
 
 
 
 
 
496
  if all_exists:
497
  print("File all output exists!")
498
  if not all_output:
 
501
  if str(all_filename) != "":
502
  print("first time have all path at all exist: ", str(all_filename))
503
  acc_score["file_all_output"] = str(all_filename)
504
+ print("acc sscore for file all output: ", acc_score["file_all_output"])
505
+ if len(acc_score["file_all_output"]) == 0 or doi!="unknown":
506
+ linksWithTexts, links, all_output = extractSources(doi, linksWithTexts, links, all_output, iso, acc, sample_folder_id, niche_cases)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
507
  links = unique_preserve_order(links)
508
+ print("this is links: ",links)
509
  acc_score["source"] = links
510
  else:
511
  print("inside the try of reusing chunk or all output")
512
+ #print("chunk filename: ", str(chunks_filename))
 
513
  try:
514
  temp_source = False
515
  if save_df is not None and not save_df.empty:
516
  print("save df not none")
 
517
  print("all filename: ",str(all_filename))
 
518
  print("acc score for file all output: ", acc_score["file_all_output"])
519
+ if acc_score["file_all_output"]:
 
 
 
 
 
 
 
 
 
 
520
  link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
521
  #link = row["Sources"].iloc[0]
522
  print(link)
 
539
  temp_source = True
540
  if temp_source:
541
  print("temp source is true so have to try again search link")
542
+ linksWithTexts, links, all_output = extractSources(doi, linksWithTexts, links, all_output, iso, acc, sample_folder_id, niche_cases)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
543
  links = unique_preserve_order(links)
544
+ print("links: ", links)
545
  acc_score["source"] = links
546
  except:
547
  try:
548
  print("in the exception and start to get link")
549
+ linksWithTexts, links, all_output = extractSources(doi, linksWithTexts, links, all_output, iso, acc, sample_folder_id, niche_cases)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
550
  links = unique_preserve_order(links)
551
+ print("this is links: ",links)
552
  acc_score["source"] = links
553
  except:
554
  print("except of except for source")
 
556
  if stop_flag is not None and stop_flag.value:
557
  print(f"🛑 Stop processing {accession}, aborting early...")
558
  return {}
559
+ all_output += "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
560
+ print("all output length: ", len(all_output))
561
+ if len(all_output) > 750000:
562
+ all_output = data_preprocess.normalize_for_overlap(all_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
  # use build context for llm function to reduce token
564
+ print("reduce context for llm")
565
  reduce_context_for_llm = ""
566
+ if len(all_output)>500000:
567
  texts_reduce = []
568
  out_links_reduce = {}
569
+ texts_reduce.append(all_output)
570
+ out_links_reduce[link] = {"all_output": all_output}
571
+ input_prompt = ["country_name", "modern/ancient/unknown"]
572
+ if niche_cases: input_prompt += niche_cases
573
+ reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt, 250000)
 
 
 
 
 
 
574
  if reduce_context_for_llm:
575
  print("reduce context for llm")
576
  all_output = reduce_context_for_llm
577
  else:
578
  print("reduce context no succeed")
579
+ all_output = all_output[:250000]
580
+ print("length of context after reducing: ", len(all_output))
581
+ text = ""
582
+ for key in meta_expand:
583
+ text += str(key) + ": " + meta_expand[key] + "\n"
584
+ if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
585
+ text += data_preprocess.normalize_for_overlap(all_output)
586
+ text += ". NCBI Features: " + features
587
+ print("start to save the all output and its length: ", len(text))
588
+ data_preprocess.save_text_to_docx(all_output, file_all_path)
589
+ result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
590
+ print("UPLOAD RESULT FOR all_output: ", result_all_upload)
591
+ print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
592
+
593
+ acc_prompts = {acc: text}
594
+ print("start model")
595
+ predicted_output_info = await model.query_document_info(
596
+ niche_cases=niche_cases,
597
+ saveLinkFolder=saveLinkFolder,
598
+ llm_api_function=model.call_llm_api,
599
+ prompts=acc_prompts)
600
+ for output_acc in predicted_output_info
601
+ # update everything from the output of model for each accession
602
+ # firstly update predicted output of an accession
603
+ predicted_outputs = predicted_output_info[output_acc]["predicted_output"]
604
+ method_used = predicted_output_info[output_acc]["method_used"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
605
  for pred_out in predicted_outputs:
606
  # only for country, we have to standardize
607
  if pred_out == "country_name":
608
+ country = predicted_outputs[pred_out]["answer"]
609
+ country_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
610
+ if country_explanation: country_explanation = "-" + country_explanation
611
+ if country != "unknown" and len(country)>0:
612
+ clean_country = model.get_country_from_text(country.lower())
 
 
 
 
 
 
 
 
613
  stand_country = standardize_location.smart_country_lookup(country.lower())
614
+ if clean_country == "unknown" and stand_country.lower() == "not found":
615
+ country = "unknown"
616
+ # predicted country is unknown
617
+ acc_score["signals"]["predicted_country"] = "unknown"
618
+ acc_score["signals"]["known_failure_pattern"] = True
619
+ if country.lower() != "unknown":
620
+ stand_country = standardize_location.smart_country_lookup(country.lower())
621
+ if stand_country.lower() != "not found":
622
+ if stand_country.lower() in acc_score["country"]:
623
+ if country_explanation:
624
+ acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
625
+ else:
626
+ acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
627
+ # predicted country is non unknown
628
+ acc_score["signals"]["predicted_country"] = stand_country.lower()
629
  else:
630
+ if country.lower() in acc_score["country"]:
631
+ if country_explanation:
632
+ if len(method_used + country_explanation) > 0:
633
+ acc_score["country"][country.lower()].append(method_used + country_explanation)
634
+ else:
 
635
  if len(method_used + country_explanation) > 0:
636
+ acc_score["country"][country.lower()] = [method_used + country_explanation]
637
+ # predicted country is non unknown
638
+ acc_score["signals"]["predicted_country"] = country.lower()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
639
  else:
640
+ # predicted country is unknown
641
+ acc_score["signals"]["predicted_country"] = "unknown"
642
+ acc_score["signals"]["known_failure_pattern"] = True
643
+ # for sample type
644
+ elif pred_out == "modern/ancient/unknown":
645
+ sample_type = predicted_outputs[pred_out]["answer"]
646
+ sample_type_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
647
+ if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
648
+ if sample_type.lower() != "unknown":
649
+ if sample_type.lower() in acc_score["sample_type"]:
650
+ if len(method_used + sample_type_explanation) > 0:
651
+ acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
652
+ else:
653
+ if len(method_used + sample_type_explanation)> 0:
654
+ acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
655
+ # for niche cases
656
  else:
657
+ if pred_out in acc_score:
 
658
  answer = predicted_outputs[pred_out]["answer"]
659
  answer_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
 
660
  if answer_explanation: answer_explanation = "-" + answer_explanation
661
  if answer.lower() != "unknown":
 
 
662
  if answer.lower() in acc_score[pred_out]:
663
  if len(method_used + answer_explanation) > 0:
664
  acc_score[pred_out][answer.lower()].append(method_used + answer_explanation)
665
  else:
 
666
  if len(method_used + answer_explanation) > 0:
667
+ acc_score[pred_out][answer.lower()] = [method_used + answer_explanation]
668
+
669
+ # update total query cost
670
+ acc_score["query_cost"] = predicted_output_info[output_acc]["total_query_cost"]
671
+ # update more links if have from model
672
+ more_model_links = predicted_output_info[output_acc]["links"]
673
+ if more_model_links:
674
+ acc_score["source"] += more_model_links
675
+ # update signals
676
+ # add if accession_found_in_text or not
677
+ acc_score["signals"]["accession_found_in_text"] = predicted_output_info[output_acc]["accession_found_in_text"]
678
+ # add into the number of publications
679
+ acc_score["signals"]["num_publications"] += len(acc_score["source"])
680
+ print(f"end of this acc {acc}")
681
 
 
682
  end = time.time()
683
+ elapsed = (end - start)
 
 
684
  acc_score["time_cost"] = f"{elapsed:.3f} seconds"
685
  accs_output[acc] = acc_score
686
+ print(accs_output)
 
687
  return accs_output