Spaces:
Running
Running
Update pipeline.py
Browse files- pipeline.py +226 -499
pipeline.py
CHANGED
|
@@ -216,18 +216,18 @@ async def fetch_all(links, timeout=15):
|
|
| 216 |
tasks = [fetch_url(session, l, timeout) for l in links]
|
| 217 |
return await asyncio.gather(*tasks)
|
| 218 |
|
| 219 |
-
async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder,
|
| 220 |
print(link)
|
| 221 |
if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
|
| 222 |
print("break here")
|
| 223 |
-
return
|
| 224 |
|
| 225 |
query_kw = iso if iso != "unknown" else acc
|
| 226 |
|
| 227 |
# --- text extraction ---
|
| 228 |
-
if
|
| 229 |
print("yeah art_text available")
|
| 230 |
-
text_link =
|
| 231 |
else:
|
| 232 |
try:
|
| 233 |
print("start preprocess and extract text")
|
|
@@ -241,14 +241,15 @@ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links
|
|
| 241 |
asyncio.to_thread(data_preprocess.extract_table, link, saveLinkFolder),
|
| 242 |
timeout=10
|
| 243 |
)
|
| 244 |
-
print("this is table link: ", str(table_links))
|
| 245 |
-
|
| 246 |
except Exception:
|
| 247 |
tables_link = []
|
| 248 |
|
| 249 |
# --- merge ---
|
| 250 |
try:
|
| 251 |
print("just merge text and tables")
|
|
|
|
|
|
|
| 252 |
try:
|
| 253 |
final_input_link = text_link + ", ".join(tables_link)
|
| 254 |
except:
|
|
@@ -256,20 +257,113 @@ async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links
|
|
| 256 |
except Exception:
|
| 257 |
print("no succeed here in preprocess docu")
|
| 258 |
final_input_link = ""
|
| 259 |
-
|
| 260 |
-
# --- context extraction ---
|
| 261 |
-
context = data_preprocess.extract_context(final_input_link, query_kw)
|
| 262 |
-
chunk += context
|
| 263 |
-
|
| 264 |
# --- normalize output ---
|
| 265 |
if len(final_input_link) > 1000000:
|
| 266 |
final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
|
| 267 |
if len(final_input_link) > 1000000:
|
| 268 |
final_input_link = final_input_link[:1000000]
|
| 269 |
|
| 270 |
-
all_output
|
| 271 |
-
|
| 272 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 273 |
|
| 274 |
from Bio import Entrez
|
| 275 |
Entrez.email = "your_email@example.com" # required by NCBI
|
|
@@ -277,15 +371,11 @@ Entrez.email = "your_email@example.com" # required by NCBI
|
|
| 277 |
# Main execution
|
| 278 |
async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_cases=None):
|
| 279 |
# output: country, sample_type, ethnic, location, money_cost, time_cost, explain
|
| 280 |
-
# there can be one accession number in the accessions
|
| 281 |
# Prices are per 1,000 tokens
|
| 282 |
# Before each big step:
|
| 283 |
if stop_flag is not None and stop_flag.value:
|
| 284 |
print(f"🛑 Stop detected before starting {accession}, aborting early...")
|
| 285 |
return {}
|
| 286 |
-
# PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
|
| 287 |
-
# PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
|
| 288 |
-
# PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
|
| 289 |
# Gemini 2.5 Flash-Lite pricing per 1,000 tokens
|
| 290 |
PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
|
| 291 |
PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
|
|
@@ -297,8 +387,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 297 |
return None
|
| 298 |
else:
|
| 299 |
accs_output = {}
|
| 300 |
-
|
| 301 |
-
genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
|
| 302 |
for acc in accessions:
|
| 303 |
print("start gemini: ", acc)
|
| 304 |
start = time.time()
|
|
@@ -310,8 +399,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 310 |
"query_cost":total_cost_title,
|
| 311 |
"time_cost":None,
|
| 312 |
"source":links,
|
| 313 |
-
"
|
| 314 |
-
"file_all_output":"",
|
| 315 |
"signals":{ # default values
|
| 316 |
"has_geo_loc_name": False,
|
| 317 |
"has_pubmed": False,
|
|
@@ -333,7 +421,7 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 333 |
meta_expand = smart_fallback.fetch_ncbi(acc)
|
| 334 |
print("meta expand: ", meta_expand)
|
| 335 |
# set up step: create the folder to save document
|
| 336 |
-
|
| 337 |
if pudID:
|
| 338 |
id = str(pudID)
|
| 339 |
saveTitle = title
|
|
@@ -353,53 +441,37 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 353 |
print("sample folder id: ", sample_folder_id)
|
| 354 |
|
| 355 |
safe_title = sanitize_filename(saveTitle, 50)
|
| 356 |
-
chunk_filename = f"{safe_title}_merged_document.docx"
|
| 357 |
all_filename = f"{safe_title}_all_merged_document.docx"
|
| 358 |
-
print("
|
| 359 |
# Define local temp paths for reading/writing
|
| 360 |
-
# import tempfile
|
| 361 |
-
# tmp_dir = tempfile.mkdtemp()
|
| 362 |
LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
|
| 363 |
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
|
| 364 |
-
file_chunk_path = os.path.join(LOCAL_TEMP_DIR, chunk_filename)
|
| 365 |
file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
|
| 366 |
-
# file_chunk_path = os.path.join(tempfile.gettempdir(), chunk_filename)
|
| 367 |
-
# file_all_path = os.path.join(tempfile.gettempdir(), all_filename)
|
| 368 |
if stop_flag is not None and stop_flag.value:
|
| 369 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
| 370 |
return {}
|
| 371 |
-
print("this is file
|
| 372 |
-
chunk_id = find_drive_file(chunk_filename, sample_folder_id)
|
| 373 |
all_id = find_drive_file(all_filename, sample_folder_id)
|
| 374 |
|
| 375 |
-
if
|
| 376 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
| 377 |
-
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
| 378 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
| 379 |
-
acc_score["file_chunk"] = str(chunk_filename)
|
| 380 |
acc_score["file_all_output"] = str(all_filename)
|
| 381 |
-
print("
|
| 382 |
-
print(
|
| 383 |
-
print("file
|
| 384 |
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
|
| 385 |
print("📄 Name:", file["name"])
|
| 386 |
print("📁 Parent folder ID:", file["parents"][0])
|
| 387 |
print("🔗 View link:", file["webViewLink"])
|
| 388 |
-
|
| 389 |
-
|
| 390 |
-
# Read and parse these into `chunk` and `all_output`
|
| 391 |
else:
|
| 392 |
# 🔥 Remove any stale local copies
|
| 393 |
-
if os.path.exists(file_chunk_path):
|
| 394 |
-
os.remove(file_chunk_path)
|
| 395 |
-
print(f"🗑️ Removed stale: {file_chunk_path}")
|
| 396 |
if os.path.exists(file_all_path):
|
| 397 |
os.remove(file_all_path)
|
| 398 |
print(f"🗑️ Removed stale: {file_all_path}")
|
| 399 |
# Try to download if already exists on Drive
|
| 400 |
-
chunk_exists = download_file_from_drive(chunk_filename, sample_folder_id, file_chunk_path)
|
| 401 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
| 402 |
-
print("
|
| 403 |
# first way: ncbi method
|
| 404 |
print("country.lower: ",country.lower())
|
| 405 |
if country.lower() != "unknown":
|
|
@@ -417,23 +489,10 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 417 |
# second way: LLM model
|
| 418 |
# Preprocess the input token
|
| 419 |
print(acc_score)
|
| 420 |
-
accession, isolate = None, None
|
| 421 |
-
if acc != "unknown": accession = acc
|
| 422 |
-
if iso != "unknown": isolate = iso
|
| 423 |
if stop_flag is not None and stop_flag.value:
|
| 424 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
| 425 |
return {}
|
| 426 |
# check doi first
|
| 427 |
-
print("chunk filename: ", chunk_filename)
|
| 428 |
-
if chunk_exists:
|
| 429 |
-
print("File chunk exists!")
|
| 430 |
-
if not chunk:
|
| 431 |
-
print("start to get chunk")
|
| 432 |
-
text, table, document_title = model.read_docx_text(file_chunk_path)
|
| 433 |
-
chunk = data_preprocess.normalize_for_overlap(text) + "\n" + data_preprocess.normalize_for_overlap(". ".join(table))
|
| 434 |
-
if str(chunk_filename) != "":
|
| 435 |
-
print("first time have chunk path at chunk exist: ", str(chunk_filename))
|
| 436 |
-
acc_score["file_chunk"] = str(chunk_filename)
|
| 437 |
if all_exists:
|
| 438 |
print("File all output exists!")
|
| 439 |
if not all_output:
|
|
@@ -442,121 +501,22 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 442 |
if str(all_filename) != "":
|
| 443 |
print("first time have all path at all exist: ", str(all_filename))
|
| 444 |
acc_score["file_all_output"] = str(all_filename)
|
| 445 |
-
print("acc sscore for file all output
|
| 446 |
-
if len(acc_score["file_all_output"]) == 0
|
| 447 |
-
|
| 448 |
-
link = 'https://doi.org/' + doi
|
| 449 |
-
# get the file to create listOfFile for each id
|
| 450 |
-
print("link of doi: ", link)
|
| 451 |
-
# html = extractHTML.HTML("",link)
|
| 452 |
-
html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
|
| 453 |
-
jsonSM = html.getSupMaterial()
|
| 454 |
-
article_text = await html.async_getListSection() # html.getListSection()
|
| 455 |
-
if len(article_text) == 0:
|
| 456 |
-
# try crossAPI
|
| 457 |
-
metadata_text = html.fetch_crossref_metadata(link)
|
| 458 |
-
if metadata_text:
|
| 459 |
-
print(f"✅ CrossRef metadata fetched for {link}")
|
| 460 |
-
other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
|
| 461 |
-
article_text = html.mergeTextInJson(metadata_text)
|
| 462 |
-
# also try searching pubmed with the title and extract abstract and add to article text
|
| 463 |
-
# Step 1: Search for the paper
|
| 464 |
-
print("search the paper's abstract on pubmed")
|
| 465 |
-
handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
|
| 466 |
-
record = Entrez.read(handle)
|
| 467 |
-
id_list = record.get("IdList", [])
|
| 468 |
-
|
| 469 |
-
if not id_list:
|
| 470 |
-
print("No PubMed results found.")
|
| 471 |
-
else:
|
| 472 |
-
pubmed_id = id_list[0]
|
| 473 |
-
fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
|
| 474 |
-
fetch_record = Entrez.read(fetch_handle)
|
| 475 |
-
|
| 476 |
-
# Safe extraction
|
| 477 |
-
article = fetch_record.get("PubmedArticle", [])
|
| 478 |
-
if not article:
|
| 479 |
-
print("No PubmedArticle entry returned.")
|
| 480 |
-
else:
|
| 481 |
-
article = article[0] # the real payload
|
| 482 |
-
try:
|
| 483 |
-
abstract_sections = (
|
| 484 |
-
article["MedlineCitation"]["Article"]
|
| 485 |
-
.get("Abstract", {})
|
| 486 |
-
.get("AbstractText", [])
|
| 487 |
-
)
|
| 488 |
-
full_abstract = " ".join(str(s) for s in abstract_sections)
|
| 489 |
-
|
| 490 |
-
if full_abstract.strip():
|
| 491 |
-
print("Abstract found (len={}):".format(len(full_abstract)))
|
| 492 |
-
#print(full_abstract)
|
| 493 |
-
article_text += full_abstract
|
| 494 |
-
else:
|
| 495 |
-
print("This article has **no abstract available on PubMed**.")
|
| 496 |
-
|
| 497 |
-
except KeyError:
|
| 498 |
-
print("Abstract field missing in this PubMed record.")
|
| 499 |
-
|
| 500 |
-
if article_text:
|
| 501 |
-
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
|
| 502 |
-
out_links[link] = article_text
|
| 503 |
-
links.append(link)
|
| 504 |
-
if jsonSM:
|
| 505 |
-
links += sum((jsonSM[key] for key in jsonSM),[])
|
| 506 |
-
if links:
|
| 507 |
-
for l in links:
|
| 508 |
-
out_links[l] = ""
|
| 509 |
-
# no doi then google custom search api
|
| 510 |
-
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
| 511 |
-
# might find the article
|
| 512 |
-
print("no article text, start tem link")
|
| 513 |
-
tem_links = smart_fallback.smart_google_search(acc, meta_expand)
|
| 514 |
-
print("tem links: ", tem_links)
|
| 515 |
-
tem_links = unique_preserve_order(tem_links)
|
| 516 |
-
print("tem link before filtering: ", tem_links)
|
| 517 |
-
print("saveLinkFolder as sample folder id: ", sample_folder_id)
|
| 518 |
-
print("start the smart filter link")
|
| 519 |
-
if stop_flag is not None and stop_flag.value:
|
| 520 |
-
print(f"🛑 Stop processing {acc}, aborting early...")
|
| 521 |
-
return {}
|
| 522 |
-
output_process = await smart_fallback.async_filter_links_by_metadata(
|
| 523 |
-
tem_links, sample_folder_id, accession=acc
|
| 524 |
-
)
|
| 525 |
-
if output_process: #success_process:
|
| 526 |
-
out_links.update(output_process)
|
| 527 |
-
print("yeah we have out_link and len: ", len(out_links))
|
| 528 |
-
print("yes succeed for smart filter link")
|
| 529 |
-
links += list(out_links.keys())
|
| 530 |
-
print("link keys: ", links)
|
| 531 |
-
else:
|
| 532 |
-
print("no suceed, fallback to all tem links")
|
| 533 |
-
links += tem_links
|
| 534 |
-
print("this is links: ",links)
|
| 535 |
links = unique_preserve_order(links)
|
|
|
|
| 536 |
acc_score["source"] = links
|
| 537 |
else:
|
| 538 |
print("inside the try of reusing chunk or all output")
|
| 539 |
-
#print("chunk filename: ", str(chunks_filename))
|
| 540 |
-
|
| 541 |
try:
|
| 542 |
temp_source = False
|
| 543 |
if save_df is not None and not save_df.empty:
|
| 544 |
print("save df not none")
|
| 545 |
-
print("chunk file name: ",str(chunk_filename))
|
| 546 |
print("all filename: ",str(all_filename))
|
| 547 |
-
print("acc score for file chunk: ", acc_score["file_chunk"])
|
| 548 |
print("acc score for file all output: ", acc_score["file_all_output"])
|
| 549 |
-
if acc_score["
|
| 550 |
-
link = save_df.loc[save_df["file_chunk"]==acc_score["file_chunk"],"Sources"].iloc[0]
|
| 551 |
-
#link = row["Sources"].iloc[0]
|
| 552 |
-
if "http" in link:
|
| 553 |
-
print("yeah http in save df source")
|
| 554 |
-
acc_score["source"] = [x for x in link.split("\n") if x.strip()]#row["Sources"].tolist()
|
| 555 |
-
else: # temporary
|
| 556 |
-
print("tempo source")
|
| 557 |
-
#acc_score["source"] = [str(all_filename), str(chunks_filename)]
|
| 558 |
-
temp_source = True
|
| 559 |
-
elif acc_score["file_all_output"]:
|
| 560 |
link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
|
| 561 |
#link = row["Sources"].iloc[0]
|
| 562 |
print(link)
|
|
@@ -579,189 +539,16 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 579 |
temp_source = True
|
| 580 |
if temp_source:
|
| 581 |
print("temp source is true so have to try again search link")
|
| 582 |
-
|
| 583 |
-
link = 'https://doi.org/' + doi
|
| 584 |
-
# get the file to create listOfFile for each id
|
| 585 |
-
print("link of doi: ", link)
|
| 586 |
-
#html = extractHTML.HTML("",link)
|
| 587 |
-
html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
|
| 588 |
-
jsonSM = html.getSupMaterial()
|
| 589 |
-
article_text = await html.async_getListSection() # html.getListSection()
|
| 590 |
-
if len(article_text) == 0:
|
| 591 |
-
# try crossAPI
|
| 592 |
-
metadata_text = html.fetch_crossref_metadata(link)
|
| 593 |
-
if metadata_text:
|
| 594 |
-
print(f"✅ CrossRef metadata fetched for {link}")
|
| 595 |
-
other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
|
| 596 |
-
article_text = html.mergeTextInJson(metadata_text)
|
| 597 |
-
# Step 1: Search for the paper
|
| 598 |
-
print("search the paper's abstract on pubmed")
|
| 599 |
-
handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
|
| 600 |
-
record = Entrez.read(handle)
|
| 601 |
-
id_list = record.get("IdList", [])
|
| 602 |
-
|
| 603 |
-
if not id_list:
|
| 604 |
-
print("No PubMed results found.")
|
| 605 |
-
else:
|
| 606 |
-
pubmed_id = id_list[0]
|
| 607 |
-
fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
|
| 608 |
-
fetch_record = Entrez.read(fetch_handle)
|
| 609 |
-
|
| 610 |
-
# Safe extraction
|
| 611 |
-
article = fetch_record.get("PubmedArticle", [])
|
| 612 |
-
if not article:
|
| 613 |
-
print("No PubmedArticle entry returned.")
|
| 614 |
-
else:
|
| 615 |
-
article = article[0] # the real payload
|
| 616 |
-
try:
|
| 617 |
-
abstract_sections = (
|
| 618 |
-
article["MedlineCitation"]["Article"]
|
| 619 |
-
.get("Abstract", {})
|
| 620 |
-
.get("AbstractText", [])
|
| 621 |
-
)
|
| 622 |
-
full_abstract = " ".join(str(s) for s in abstract_sections)
|
| 623 |
-
|
| 624 |
-
if full_abstract.strip():
|
| 625 |
-
print("Abstract found (len={}):".format(len(full_abstract)))
|
| 626 |
-
#print(full_abstract)
|
| 627 |
-
article_text += full_abstract
|
| 628 |
-
else:
|
| 629 |
-
print("This article has **no abstract available on PubMed**.")
|
| 630 |
-
|
| 631 |
-
except KeyError:
|
| 632 |
-
print("Abstract field missing in this PubMed record.")
|
| 633 |
-
|
| 634 |
-
if article_text:
|
| 635 |
-
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
|
| 636 |
-
out_links[link] = article_text
|
| 637 |
-
links.append(link)
|
| 638 |
-
if jsonSM:
|
| 639 |
-
links += sum((jsonSM[key] for key in jsonSM),[])
|
| 640 |
-
if links:
|
| 641 |
-
for l in links:
|
| 642 |
-
out_links[l] = ""
|
| 643 |
-
# no doi then google custom search api
|
| 644 |
-
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
| 645 |
-
# might find the article
|
| 646 |
-
print("no article text, start tem link")
|
| 647 |
-
#tem_links = mtdna_classifier.search_google_custom(title, 2)
|
| 648 |
-
tem_links = smart_fallback.smart_google_search(acc, meta_expand)
|
| 649 |
-
print("tem links: ", tem_links)
|
| 650 |
-
tem_links = unique_preserve_order(tem_links)
|
| 651 |
-
print("tem link before filtering: ", tem_links)
|
| 652 |
-
# filter the quality link
|
| 653 |
-
print("saveLinkFolder as sample folder id: ", sample_folder_id)
|
| 654 |
-
print("start the smart filter link")
|
| 655 |
-
if stop_flag is not None and stop_flag.value:
|
| 656 |
-
print(f"🛑 Stop processing {acc}, aborting early...")
|
| 657 |
-
return {}
|
| 658 |
-
output_process = await smart_fallback.async_filter_links_by_metadata(
|
| 659 |
-
tem_links, sample_folder_id, accession=acc
|
| 660 |
-
)
|
| 661 |
-
if output_process:#success_process:
|
| 662 |
-
out_links.update(output_process)
|
| 663 |
-
print("yeah we have out_link and len: ", len(out_links))
|
| 664 |
-
print("yes succeed for smart filter link")
|
| 665 |
-
links += list(out_links.keys())
|
| 666 |
-
print("link keys: ", links)
|
| 667 |
-
else:
|
| 668 |
-
print("no suceed, fallback to all tem links")
|
| 669 |
-
links += tem_links
|
| 670 |
-
print("this is links: ",links)
|
| 671 |
links = unique_preserve_order(links)
|
|
|
|
| 672 |
acc_score["source"] = links
|
| 673 |
except:
|
| 674 |
try:
|
| 675 |
print("in the exception and start to get link")
|
| 676 |
-
|
| 677 |
-
link = 'https://doi.org/' + doi
|
| 678 |
-
# get the file to create listOfFile for each id
|
| 679 |
-
print("link of doi: ", link)
|
| 680 |
-
#html = extractHTML.HTML("",link)
|
| 681 |
-
html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
|
| 682 |
-
jsonSM = html.getSupMaterial()
|
| 683 |
-
article_text = await html.async_getListSection() # html.getListSection()
|
| 684 |
-
if len(article_text) == 0:
|
| 685 |
-
# try crossAPI
|
| 686 |
-
metadata_text = html.fetch_crossref_metadata(link)
|
| 687 |
-
if metadata_text:
|
| 688 |
-
print(f"✅ CrossRef metadata fetched for {link}")
|
| 689 |
-
other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
|
| 690 |
-
article_text = html.mergeTextInJson(metadata_text)
|
| 691 |
-
# Step 1: Search for the paper
|
| 692 |
-
print("search the paper's abstract on pubmed")
|
| 693 |
-
handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
|
| 694 |
-
record = Entrez.read(handle)
|
| 695 |
-
id_list = record.get("IdList", [])
|
| 696 |
-
|
| 697 |
-
if not id_list:
|
| 698 |
-
print("No PubMed results found.")
|
| 699 |
-
else:
|
| 700 |
-
pubmed_id = id_list[0]
|
| 701 |
-
fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
|
| 702 |
-
fetch_record = Entrez.read(fetch_handle)
|
| 703 |
-
|
| 704 |
-
# Safe extraction
|
| 705 |
-
article = fetch_record.get("PubmedArticle", [])
|
| 706 |
-
if not article:
|
| 707 |
-
print("No PubmedArticle entry returned.")
|
| 708 |
-
else:
|
| 709 |
-
article = article[0] # the real payload
|
| 710 |
-
try:
|
| 711 |
-
abstract_sections = (
|
| 712 |
-
article["MedlineCitation"]["Article"]
|
| 713 |
-
.get("Abstract", {})
|
| 714 |
-
.get("AbstractText", [])
|
| 715 |
-
)
|
| 716 |
-
full_abstract = " ".join(str(s) for s in abstract_sections)
|
| 717 |
-
|
| 718 |
-
if full_abstract.strip():
|
| 719 |
-
print("Abstract found (len={}):".format(len(full_abstract)))
|
| 720 |
-
#print(full_abstract)
|
| 721 |
-
article_text += full_abstract
|
| 722 |
-
else:
|
| 723 |
-
print("This article has **no abstract available on PubMed**.")
|
| 724 |
-
|
| 725 |
-
except KeyError:
|
| 726 |
-
print("Abstract field missing in this PubMed record.")
|
| 727 |
-
if article_text:
|
| 728 |
-
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
|
| 729 |
-
out_links[link] = article_text
|
| 730 |
-
links.append(link)
|
| 731 |
-
if jsonSM:
|
| 732 |
-
links += sum((jsonSM[key] for key in jsonSM),[])
|
| 733 |
-
if links:
|
| 734 |
-
for l in links:
|
| 735 |
-
out_links[l] = ""
|
| 736 |
-
# no doi then google custom search api
|
| 737 |
-
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
| 738 |
-
# might find the article
|
| 739 |
-
print("no article text, start tem link")
|
| 740 |
-
#tem_links = mtdna_classifier.search_google_custom(title, 2)
|
| 741 |
-
tem_links = smart_fallback.smart_google_search(acc, meta_expand)
|
| 742 |
-
print("tem links: ", tem_links)
|
| 743 |
-
tem_links = unique_preserve_order(tem_links)
|
| 744 |
-
print("tem link before filtering: ", tem_links)
|
| 745 |
-
# filter the quality link
|
| 746 |
-
print("saveLinkFolder as sample folder id: ", sample_folder_id)
|
| 747 |
-
print("start the smart filter link")
|
| 748 |
-
if stop_flag is not None and stop_flag.value:
|
| 749 |
-
print(f"🛑 Stop processing {acc}, aborting early...")
|
| 750 |
-
return {}
|
| 751 |
-
output_process = await smart_fallback.async_filter_links_by_metadata(
|
| 752 |
-
tem_links, sample_folder_id, accession=acc
|
| 753 |
-
)
|
| 754 |
-
if output_process:#success_process:
|
| 755 |
-
out_links.update(output_process)
|
| 756 |
-
print("yeah we have out_link and len: ", len(out_links))
|
| 757 |
-
print("yes succeed for smart filter link")
|
| 758 |
-
links += list(out_links.keys())
|
| 759 |
-
print("link keys: ", links)
|
| 760 |
-
else:
|
| 761 |
-
print("no suceed, fallback to all tem links")
|
| 762 |
-
links += tem_links
|
| 763 |
-
print("this is links: ",links)
|
| 764 |
links = unique_preserve_order(links)
|
|
|
|
| 765 |
acc_score["source"] = links
|
| 766 |
except:
|
| 767 |
print("except of except for source")
|
|
@@ -769,192 +556,132 @@ async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_ca
|
|
| 769 |
if stop_flag is not None and stop_flag.value:
|
| 770 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
| 771 |
return {}
|
| 772 |
-
|
| 773 |
-
|
| 774 |
-
|
| 775 |
-
|
| 776 |
-
print("first time have chunk path: ", str(chunk_filename))
|
| 777 |
-
acc_score["file_chunk"] = str(chunk_filename)
|
| 778 |
-
if str(all_filename) != "":
|
| 779 |
-
print("first time have all path: ", str(all_filename))
|
| 780 |
-
acc_score["file_all_output"] = str(all_filename)
|
| 781 |
-
if links:
|
| 782 |
-
tasks = [
|
| 783 |
-
process_link_chunk_allOutput(link, iso, acc, sample_folder_id, out_links, all_output, chunk)
|
| 784 |
-
for link in links
|
| 785 |
-
]
|
| 786 |
-
results = await asyncio.gather(*tasks)
|
| 787 |
-
|
| 788 |
-
# combine results
|
| 789 |
-
for context, new_all_output, new_chunk in results:
|
| 790 |
-
all_output += new_all_output
|
| 791 |
-
chunk += new_chunk
|
| 792 |
-
|
| 793 |
-
else:
|
| 794 |
-
chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
| 795 |
-
all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
| 796 |
-
if not chunk: chunk = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
| 797 |
-
if not all_output: all_output = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
| 798 |
-
if len(all_output) > 1*1000*1000:
|
| 799 |
-
all_output = data_preprocess.normalize_for_overlap(all_output)
|
| 800 |
-
if len(chunk) > 1*1000*1000:
|
| 801 |
-
chunk = data_preprocess.normalize_for_overlap(chunk)
|
| 802 |
-
print("chunk len: ", len(chunk))
|
| 803 |
-
print("all output len: ", len(all_output))
|
| 804 |
# use build context for llm function to reduce token
|
|
|
|
| 805 |
reduce_context_for_llm = ""
|
| 806 |
-
if len(all_output)>
|
| 807 |
texts_reduce = []
|
| 808 |
out_links_reduce = {}
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
all_output_reduce, chunk_reduce)
|
| 815 |
-
texts_reduce.append(all_output_reduce)
|
| 816 |
-
out_links_reduce[link] = {"all_output": all_output_reduce}
|
| 817 |
-
input_prompt = ["country_name", "modern/ancient/unknown"]
|
| 818 |
-
if niche_cases: input_prompt += niche_cases
|
| 819 |
-
reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt)
|
| 820 |
if reduce_context_for_llm:
|
| 821 |
print("reduce context for llm")
|
| 822 |
all_output = reduce_context_for_llm
|
| 823 |
else:
|
| 824 |
print("reduce context no succeed")
|
| 825 |
-
all_output = all_output[:
|
| 826 |
-
|
| 827 |
-
|
| 828 |
-
|
| 829 |
-
|
| 830 |
-
|
| 831 |
-
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
print("
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
| 849 |
-
|
| 850 |
-
|
| 851 |
-
text += str(key) + ": " + meta_expand[key] + "\n"
|
| 852 |
-
if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
|
| 853 |
-
text += data_preprocess.normalize_for_overlap(all_output)
|
| 854 |
-
if len(data_preprocess.normalize_for_overlap(chunk)) > 0:
|
| 855 |
-
text += data_preprocess.normalize_for_overlap(chunk)
|
| 856 |
-
text += ". NCBI Features: " + features
|
| 857 |
-
print("this is text for the last resort model")
|
| 858 |
-
print(text)
|
| 859 |
-
|
| 860 |
-
predicted_outputs, method_used, total_query_cost, more_links, accession_found_in_text = await model.query_document_info(
|
| 861 |
-
niche_cases=niche_cases,
|
| 862 |
-
query_word=primary_word, alternative_query_word=alternative_word,
|
| 863 |
-
saveLinkFolder = sample_folder_id,
|
| 864 |
-
metadata=meta,
|
| 865 |
-
master_structured_lookup=None, faiss_index=None, document_chunks=None,
|
| 866 |
-
llm_api_function=model.call_llm_api, chunk=text, all_output=text)
|
| 867 |
-
print("add more links from model.query document")
|
| 868 |
-
if more_links:
|
| 869 |
-
links += more_links
|
| 870 |
-
acc_score["source"] = links
|
| 871 |
-
# add into the number of publications
|
| 872 |
-
print("done adding links into source and here are number of links: ", len(acc_score["source"]))
|
| 873 |
-
acc_score["signals"]["num_publications"] += len(acc_score["source"])
|
| 874 |
-
# add if accession_found_in_text or not
|
| 875 |
-
print("accession found in text: ", accession_found_in_text)
|
| 876 |
-
acc_score["signals"]["accession_found_in_text"] = accession_found_in_text
|
| 877 |
-
|
| 878 |
-
print("this is llm results: ")
|
| 879 |
for pred_out in predicted_outputs:
|
| 880 |
# only for country, we have to standardize
|
| 881 |
if pred_out == "country_name":
|
| 882 |
-
|
| 883 |
-
|
| 884 |
-
|
| 885 |
-
|
| 886 |
-
|
| 887 |
-
clean_country = model.get_country_from_text(country.lower())
|
| 888 |
-
stand_country = standardize_location.smart_country_lookup(country.lower())
|
| 889 |
-
if clean_country == "unknown" and stand_country.lower() == "not found":
|
| 890 |
-
country = "unknown"
|
| 891 |
-
# predicted country is unknown
|
| 892 |
-
acc_score["signals"]["predicted_country"] = "unknown"
|
| 893 |
-
acc_score["signals"]["known_failure_pattern"] = True
|
| 894 |
-
if country.lower() != "unknown":
|
| 895 |
stand_country = standardize_location.smart_country_lookup(country.lower())
|
| 896 |
-
|
| 897 |
-
|
| 898 |
-
|
| 899 |
-
|
| 900 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 901 |
else:
|
| 902 |
-
|
| 903 |
-
|
| 904 |
-
|
| 905 |
-
|
| 906 |
-
|
| 907 |
-
if country_explanation:
|
| 908 |
if len(method_used + country_explanation) > 0:
|
| 909 |
-
acc_score["country"][country.lower()]
|
| 910 |
-
|
| 911 |
-
|
| 912 |
-
acc_score["country"][country.lower()] = [method_used + country_explanation]
|
| 913 |
-
# predicted country is non unknown
|
| 914 |
-
acc_score["signals"]["predicted_country"] = country.lower()
|
| 915 |
-
else:
|
| 916 |
-
# predicted country is unknown
|
| 917 |
-
acc_score["signals"]["predicted_country"] = "unknown"
|
| 918 |
-
acc_score["signals"]["known_failure_pattern"] = True
|
| 919 |
-
# for sample type
|
| 920 |
-
elif pred_out == "modern/ancient/unknown":
|
| 921 |
-
sample_type = predicted_outputs[pred_out]["answer"]
|
| 922 |
-
sample_type_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
|
| 923 |
-
print(pred_out, sample_type, sample_type_explanation)
|
| 924 |
-
if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
|
| 925 |
-
if sample_type.lower() != "unknown":
|
| 926 |
-
if sample_type.lower() in acc_score["sample_type"]:
|
| 927 |
-
if len(method_used + sample_type_explanation) > 0:
|
| 928 |
-
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
|
| 929 |
else:
|
| 930 |
-
|
| 931 |
-
|
| 932 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 933 |
else:
|
| 934 |
-
|
| 935 |
-
if pred_out in acc_score:
|
| 936 |
answer = predicted_outputs[pred_out]["answer"]
|
| 937 |
answer_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
|
| 938 |
-
print(pred_out, answer, answer_explanation)
|
| 939 |
if answer_explanation: answer_explanation = "-" + answer_explanation
|
| 940 |
if answer.lower() != "unknown":
|
| 941 |
-
print(acc_score[pred_out])
|
| 942 |
-
print(method_used + answer_explanation)
|
| 943 |
if answer.lower() in acc_score[pred_out]:
|
| 944 |
if len(method_used + answer_explanation) > 0:
|
| 945 |
acc_score[pred_out][answer.lower()].append(method_used + answer_explanation)
|
| 946 |
else:
|
| 947 |
-
print("answer not in acc score")
|
| 948 |
if len(method_used + answer_explanation) > 0:
|
| 949 |
-
acc_score[pred_out][answer.lower()] = [method_used + answer_explanation]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 950 |
|
| 951 |
-
total_cost_title += total_query_cost
|
| 952 |
end = time.time()
|
| 953 |
-
|
| 954 |
-
acc_score["query_cost"] = f"{total_cost_title:.6f}"
|
| 955 |
-
elapsed = end - start
|
| 956 |
acc_score["time_cost"] = f"{elapsed:.3f} seconds"
|
| 957 |
accs_output[acc] = acc_score
|
| 958 |
-
|
| 959 |
-
|
| 960 |
return accs_output
|
|
|
|
| 216 |
tasks = [fetch_url(session, l, timeout) for l in links]
|
| 217 |
return await asyncio.gather(*tasks)
|
| 218 |
|
| 219 |
+
async def process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, linksWithTexts, all_output):
|
| 220 |
print(link)
|
| 221 |
if len(data_preprocess.normalize_for_overlap(all_output)) > 600000:
|
| 222 |
print("break here")
|
| 223 |
+
return all_output # nothing more for this link
|
| 224 |
|
| 225 |
query_kw = iso if iso != "unknown" else acc
|
| 226 |
|
| 227 |
# --- text extraction ---
|
| 228 |
+
if linksWithTexts and link in linksWithTexts and linksWithTexts[link]!="":
|
| 229 |
print("yeah art_text available")
|
| 230 |
+
text_link = linksWithTexts[link]
|
| 231 |
else:
|
| 232 |
try:
|
| 233 |
print("start preprocess and extract text")
|
|
|
|
| 241 |
asyncio.to_thread(data_preprocess.extract_table, link, saveLinkFolder),
|
| 242 |
timeout=10
|
| 243 |
)
|
| 244 |
+
print("this is len of table link: ", len(str(table_links)))
|
|
|
|
| 245 |
except Exception:
|
| 246 |
tables_link = []
|
| 247 |
|
| 248 |
# --- merge ---
|
| 249 |
try:
|
| 250 |
print("just merge text and tables")
|
| 251 |
+
print("len of text link before mergin: ", len(text_link))
|
| 252 |
+
print("len of table link before merge: ", len(", ".join(tables_link)))
|
| 253 |
try:
|
| 254 |
final_input_link = text_link + ", ".join(tables_link)
|
| 255 |
except:
|
|
|
|
| 257 |
except Exception:
|
| 258 |
print("no succeed here in preprocess docu")
|
| 259 |
final_input_link = ""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
# --- normalize output ---
|
| 261 |
if len(final_input_link) > 1000000:
|
| 262 |
final_input_link = data_preprocess.normalize_for_overlap(final_input_link)
|
| 263 |
if len(final_input_link) > 1000000:
|
| 264 |
final_input_link = final_input_link[:1000000]
|
| 265 |
|
| 266 |
+
all_output += data_preprocess.normalize_for_overlap(all_output) + final_input_link
|
| 267 |
+
|
| 268 |
+
return all_output
|
| 269 |
+
|
| 270 |
+
def extractSources(doi, linksWithTexts, links, all_output, iso, acc, saveLinkFolder, niche_cases=None):
|
| 271 |
+
article_text = ""
|
| 272 |
+
if doi != "unknown":
|
| 273 |
+
link = 'https://doi.org/' + doi
|
| 274 |
+
# get the file to create listOfFile for each id
|
| 275 |
+
print("link of doi: ", link)
|
| 276 |
+
# html = extractHTML.HTML("",link)
|
| 277 |
+
html = extractHTML.HTML(htmlContent=None, htmlLink=link, htmlFile="")
|
| 278 |
+
jsonSM = html.getSupMaterial()
|
| 279 |
+
article_text = await html.async_getListSection() # html.getListSection()
|
| 280 |
+
if len(article_text) == 0:
|
| 281 |
+
# try crossAPI
|
| 282 |
+
metadata_text = html.fetch_crossref_metadata(link)
|
| 283 |
+
if metadata_text:
|
| 284 |
+
print(f"✅ CrossRef metadata fetched for {link}")
|
| 285 |
+
#other_explain = "Because full-text is restricted by the publisher, our system uses abstracts and metadata to remain compliant while still supporting exploratory analysis, search, and literature linking."
|
| 286 |
+
article_text = html.mergeTextInJson(metadata_text)
|
| 287 |
+
# also try searching pubmed with the title and extract abstract and add to article text
|
| 288 |
+
# Step 1: Search for the paper
|
| 289 |
+
print("search the paper's abstract on pubmed")
|
| 290 |
+
handle = Entrez.esearch(db="pubmed", term=title, retmax=1)
|
| 291 |
+
record = Entrez.read(handle)
|
| 292 |
+
id_list = record.get("IdList", [])
|
| 293 |
+
|
| 294 |
+
if not id_list:
|
| 295 |
+
print("No PubMed results found.")
|
| 296 |
+
else:
|
| 297 |
+
pubmed_id = id_list[0]
|
| 298 |
+
fetch_handle = Entrez.efetch(db="pubmed", id=pubmed_id, rettype="xml", retmode="xml")
|
| 299 |
+
fetch_record = Entrez.read(fetch_handle)
|
| 300 |
+
|
| 301 |
+
# Safe extraction
|
| 302 |
+
article = fetch_record.get("PubmedArticle", [])
|
| 303 |
+
if not article:
|
| 304 |
+
print("No PubmedArticle entry returned.")
|
| 305 |
+
else:
|
| 306 |
+
article = article[0] # the real payload
|
| 307 |
+
try:
|
| 308 |
+
abstract_sections = (
|
| 309 |
+
article["MedlineCitation"]["Article"]
|
| 310 |
+
.get("Abstract", {})
|
| 311 |
+
.get("AbstractText", [])
|
| 312 |
+
)
|
| 313 |
+
full_abstract = " ".join(str(s) for s in abstract_sections)
|
| 314 |
+
|
| 315 |
+
if full_abstract.strip():
|
| 316 |
+
print("Abstract found (len={}):".format(len(full_abstract)))
|
| 317 |
+
#print(full_abstract)
|
| 318 |
+
article_text += full_abstract
|
| 319 |
+
else:
|
| 320 |
+
print("This article has **no abstract available on PubMed**.")
|
| 321 |
+
|
| 322 |
+
except KeyError:
|
| 323 |
+
print("Abstract field missing in this PubMed record.")
|
| 324 |
+
|
| 325 |
+
if article_text:
|
| 326 |
+
if "Just a moment...Enable JavaScript and cookies to continue".lower() not in article_text.lower() or "403 Forbidden Request".lower() not in article_text.lower():
|
| 327 |
+
linksWithTexts[link] = article_text
|
| 328 |
+
links.append(link)
|
| 329 |
+
all_output += article_text
|
| 330 |
+
if jsonSM:
|
| 331 |
+
sup_links = sum((jsonSM[key] for key in jsonSM),[])
|
| 332 |
+
if sup_links:
|
| 333 |
+
links += sup_links
|
| 334 |
+
for l in sup_links:
|
| 335 |
+
linksWithTexts[l] = ""
|
| 336 |
+
more_all_output=await process_link_allOutput(l, iso, acc, saveLinkFolder, linksWithTexts, all_output)
|
| 337 |
+
all_output += more_all_output
|
| 338 |
+
if len(all_output) > 10000000:
|
| 339 |
+
all_output = data_preprocess.normalize_for_overlap(all_output)
|
| 340 |
+
print("reduce context for llm")
|
| 341 |
+
reduce_context_for_llm = ""
|
| 342 |
+
if len(all_output)>1000000:
|
| 343 |
+
texts_reduce = []
|
| 344 |
+
out_links_reduce = {}
|
| 345 |
+
texts_reduce.append(all_output)
|
| 346 |
+
out_links_reduce[link] = {"all_output": all_output}
|
| 347 |
+
input_prompt = ["country_name", "modern/ancient/unknown"]
|
| 348 |
+
if niche_cases: input_prompt += niche_cases
|
| 349 |
+
reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt, 500000)
|
| 350 |
+
if reduce_context_for_llm:
|
| 351 |
+
print("reduce context for llm")
|
| 352 |
+
all_output = reduce_context_for_llm
|
| 353 |
+
else:
|
| 354 |
+
print("reduce context no succeed")
|
| 355 |
+
all_output = all_output[:500000]
|
| 356 |
+
print("length of context after reducing: ", len(all_output))
|
| 357 |
+
print("len new all output after sup link: ", len(all_output))
|
| 358 |
+
# no doi then google custom search api
|
| 359 |
+
if doi=="unknown" or len(article_text) == 0 or "Just a moment...Enable JavaScript and cookies to continue".lower() in article_text.lower() or "403 Forbidden Request".lower() in article_text.lower():
|
| 360 |
+
# might find the article
|
| 361 |
+
print("no article text, start tem link")
|
| 362 |
+
more_all_output, more_linksWithTexts, more_links = await model.getMoreInfoForAcc(iso, acc, saveLinkFolder, niche_cases)
|
| 363 |
+
if more_all_output: all_output += more_all_output
|
| 364 |
+
if more_links: links += more_links
|
| 365 |
+
if more_linksWithTexts: linksWithTexts.update(more_linksWithTexts)
|
| 366 |
+
return linksWithTexts, links, all_output
|
| 367 |
|
| 368 |
from Bio import Entrez
|
| 369 |
Entrez.email = "your_email@example.com" # required by NCBI
|
|
|
|
| 371 |
# Main execution
|
| 372 |
async def pipeline_with_gemini(accessions,stop_flag=None, save_df=None, niche_cases=None):
|
| 373 |
# output: country, sample_type, ethnic, location, money_cost, time_cost, explain
|
|
|
|
| 374 |
# Prices are per 1,000 tokens
|
| 375 |
# Before each big step:
|
| 376 |
if stop_flag is not None and stop_flag.value:
|
| 377 |
print(f"🛑 Stop detected before starting {accession}, aborting early...")
|
| 378 |
return {}
|
|
|
|
|
|
|
|
|
|
| 379 |
# Gemini 2.5 Flash-Lite pricing per 1,000 tokens
|
| 380 |
PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
|
| 381 |
PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
|
|
|
|
| 387 |
return None
|
| 388 |
else:
|
| 389 |
accs_output = {}
|
| 390 |
+
genai.configure(api_key=os.getenv("NEW_GOOGLE_API_KEY"))
|
|
|
|
| 391 |
for acc in accessions:
|
| 392 |
print("start gemini: ", acc)
|
| 393 |
start = time.time()
|
|
|
|
| 399 |
"query_cost":total_cost_title,
|
| 400 |
"time_cost":None,
|
| 401 |
"source":links,
|
| 402 |
+
"file_all_output":"",
|
|
|
|
| 403 |
"signals":{ # default values
|
| 404 |
"has_geo_loc_name": False,
|
| 405 |
"has_pubmed": False,
|
|
|
|
| 421 |
meta_expand = smart_fallback.fetch_ncbi(acc)
|
| 422 |
print("meta expand: ", meta_expand)
|
| 423 |
# set up step: create the folder to save document
|
| 424 |
+
all_output, linksWithTexts = "", {}
|
| 425 |
if pudID:
|
| 426 |
id = str(pudID)
|
| 427 |
saveTitle = title
|
|
|
|
| 441 |
print("sample folder id: ", sample_folder_id)
|
| 442 |
|
| 443 |
safe_title = sanitize_filename(saveTitle, 50)
|
|
|
|
| 444 |
all_filename = f"{safe_title}_all_merged_document.docx"
|
| 445 |
+
print("all filename: ", all_filename)
|
| 446 |
# Define local temp paths for reading/writing
|
|
|
|
|
|
|
| 447 |
LOCAL_TEMP_DIR = "/mnt/data/generated_docs"
|
| 448 |
os.makedirs(LOCAL_TEMP_DIR, exist_ok=True)
|
|
|
|
| 449 |
file_all_path = os.path.join(LOCAL_TEMP_DIR, all_filename)
|
|
|
|
|
|
|
| 450 |
if stop_flag is not None and stop_flag.value:
|
| 451 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
| 452 |
return {}
|
| 453 |
+
print("this is file all path: ", file_all_path)
|
|
|
|
| 454 |
all_id = find_drive_file(all_filename, sample_folder_id)
|
| 455 |
|
| 456 |
+
if all_id:
|
| 457 |
print("✅ Files already exist in Google Drive. Downloading them...")
|
|
|
|
| 458 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
|
|
|
| 459 |
acc_score["file_all_output"] = str(all_filename)
|
| 460 |
+
print("all_id: ")
|
| 461 |
+
print(all_id)
|
| 462 |
+
print("file all output saved in acc score: ", acc_score["file_all_output"])
|
| 463 |
file = drive_service.files().get(fileId="1LUJRTrq8yt4S4lLwCvTmlxaKqpr0nvEn", fields="id, name, parents, webViewLink").execute()
|
| 464 |
print("📄 Name:", file["name"])
|
| 465 |
print("📁 Parent folder ID:", file["parents"][0])
|
| 466 |
print("🔗 View link:", file["webViewLink"])
|
|
|
|
|
|
|
|
|
|
| 467 |
else:
|
| 468 |
# 🔥 Remove any stale local copies
|
|
|
|
|
|
|
|
|
|
| 469 |
if os.path.exists(file_all_path):
|
| 470 |
os.remove(file_all_path)
|
| 471 |
print(f"🗑️ Removed stale: {file_all_path}")
|
| 472 |
# Try to download if already exists on Drive
|
|
|
|
| 473 |
all_exists = download_file_from_drive(all_filename, sample_folder_id, file_all_path)
|
| 474 |
+
print("all exist: ", all_exists)
|
| 475 |
# first way: ncbi method
|
| 476 |
print("country.lower: ",country.lower())
|
| 477 |
if country.lower() != "unknown":
|
|
|
|
| 489 |
# second way: LLM model
|
| 490 |
# Preprocess the input token
|
| 491 |
print(acc_score)
|
|
|
|
|
|
|
|
|
|
| 492 |
if stop_flag is not None and stop_flag.value:
|
| 493 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
| 494 |
return {}
|
| 495 |
# check doi first
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 496 |
if all_exists:
|
| 497 |
print("File all output exists!")
|
| 498 |
if not all_output:
|
|
|
|
| 501 |
if str(all_filename) != "":
|
| 502 |
print("first time have all path at all exist: ", str(all_filename))
|
| 503 |
acc_score["file_all_output"] = str(all_filename)
|
| 504 |
+
print("acc sscore for file all output: ", acc_score["file_all_output"])
|
| 505 |
+
if len(acc_score["file_all_output"]) == 0 or doi!="unknown":
|
| 506 |
+
linksWithTexts, links, all_output = extractSources(doi, linksWithTexts, links, all_output, iso, acc, sample_folder_id, niche_cases)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 507 |
links = unique_preserve_order(links)
|
| 508 |
+
print("this is links: ",links)
|
| 509 |
acc_score["source"] = links
|
| 510 |
else:
|
| 511 |
print("inside the try of reusing chunk or all output")
|
| 512 |
+
#print("chunk filename: ", str(chunks_filename))
|
|
|
|
| 513 |
try:
|
| 514 |
temp_source = False
|
| 515 |
if save_df is not None and not save_df.empty:
|
| 516 |
print("save df not none")
|
|
|
|
| 517 |
print("all filename: ",str(all_filename))
|
|
|
|
| 518 |
print("acc score for file all output: ", acc_score["file_all_output"])
|
| 519 |
+
if acc_score["file_all_output"]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
link = save_df.loc[save_df["file_all_output"]==acc_score["file_all_output"],"Sources"].iloc[0]
|
| 521 |
#link = row["Sources"].iloc[0]
|
| 522 |
print(link)
|
|
|
|
| 539 |
temp_source = True
|
| 540 |
if temp_source:
|
| 541 |
print("temp source is true so have to try again search link")
|
| 542 |
+
linksWithTexts, links, all_output = extractSources(doi, linksWithTexts, links, all_output, iso, acc, sample_folder_id, niche_cases)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
links = unique_preserve_order(links)
|
| 544 |
+
print("links: ", links)
|
| 545 |
acc_score["source"] = links
|
| 546 |
except:
|
| 547 |
try:
|
| 548 |
print("in the exception and start to get link")
|
| 549 |
+
linksWithTexts, links, all_output = extractSources(doi, linksWithTexts, links, all_output, iso, acc, sample_folder_id, niche_cases)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 550 |
links = unique_preserve_order(links)
|
| 551 |
+
print("this is links: ",links)
|
| 552 |
acc_score["source"] = links
|
| 553 |
except:
|
| 554 |
print("except of except for source")
|
|
|
|
| 556 |
if stop_flag is not None and stop_flag.value:
|
| 557 |
print(f"🛑 Stop processing {accession}, aborting early...")
|
| 558 |
return {}
|
| 559 |
+
all_output += "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + features
|
| 560 |
+
print("all output length: ", len(all_output))
|
| 561 |
+
if len(all_output) > 750000:
|
| 562 |
+
all_output = data_preprocess.normalize_for_overlap(all_output)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
# use build context for llm function to reduce token
|
| 564 |
+
print("reduce context for llm")
|
| 565 |
reduce_context_for_llm = ""
|
| 566 |
+
if len(all_output)>500000:
|
| 567 |
texts_reduce = []
|
| 568 |
out_links_reduce = {}
|
| 569 |
+
texts_reduce.append(all_output)
|
| 570 |
+
out_links_reduce[link] = {"all_output": all_output}
|
| 571 |
+
input_prompt = ["country_name", "modern/ancient/unknown"]
|
| 572 |
+
if niche_cases: input_prompt += niche_cases
|
| 573 |
+
reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt, 250000)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 574 |
if reduce_context_for_llm:
|
| 575 |
print("reduce context for llm")
|
| 576 |
all_output = reduce_context_for_llm
|
| 577 |
else:
|
| 578 |
print("reduce context no succeed")
|
| 579 |
+
all_output = all_output[:250000]
|
| 580 |
+
print("length of context after reducing: ", len(all_output))
|
| 581 |
+
text = ""
|
| 582 |
+
for key in meta_expand:
|
| 583 |
+
text += str(key) + ": " + meta_expand[key] + "\n"
|
| 584 |
+
if len(data_preprocess.normalize_for_overlap(all_output)) > 0:
|
| 585 |
+
text += data_preprocess.normalize_for_overlap(all_output)
|
| 586 |
+
text += ". NCBI Features: " + features
|
| 587 |
+
print("start to save the all output and its length: ", len(text))
|
| 588 |
+
data_preprocess.save_text_to_docx(all_output, file_all_path)
|
| 589 |
+
result_all_upload = upload_file_to_drive(file_all_path, all_filename, sample_folder_id)
|
| 590 |
+
print("UPLOAD RESULT FOR all_output: ", result_all_upload)
|
| 591 |
+
print(f"🔗 Uploaded file: https://drive.google.com/file/d/{result_chunk_upload}/view")
|
| 592 |
+
|
| 593 |
+
acc_prompts = {acc: text}
|
| 594 |
+
print("start model")
|
| 595 |
+
predicted_output_info = await model.query_document_info(
|
| 596 |
+
niche_cases=niche_cases,
|
| 597 |
+
saveLinkFolder=saveLinkFolder,
|
| 598 |
+
llm_api_function=model.call_llm_api,
|
| 599 |
+
prompts=acc_prompts)
|
| 600 |
+
for output_acc in predicted_output_info
|
| 601 |
+
# update everything from the output of model for each accession
|
| 602 |
+
# firstly update predicted output of an accession
|
| 603 |
+
predicted_outputs = predicted_output_info[output_acc]["predicted_output"]
|
| 604 |
+
method_used = predicted_output_info[output_acc]["method_used"]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 605 |
for pred_out in predicted_outputs:
|
| 606 |
# only for country, we have to standardize
|
| 607 |
if pred_out == "country_name":
|
| 608 |
+
country = predicted_outputs[pred_out]["answer"]
|
| 609 |
+
country_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
|
| 610 |
+
if country_explanation: country_explanation = "-" + country_explanation
|
| 611 |
+
if country != "unknown" and len(country)>0:
|
| 612 |
+
clean_country = model.get_country_from_text(country.lower())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 613 |
stand_country = standardize_location.smart_country_lookup(country.lower())
|
| 614 |
+
if clean_country == "unknown" and stand_country.lower() == "not found":
|
| 615 |
+
country = "unknown"
|
| 616 |
+
# predicted country is unknown
|
| 617 |
+
acc_score["signals"]["predicted_country"] = "unknown"
|
| 618 |
+
acc_score["signals"]["known_failure_pattern"] = True
|
| 619 |
+
if country.lower() != "unknown":
|
| 620 |
+
stand_country = standardize_location.smart_country_lookup(country.lower())
|
| 621 |
+
if stand_country.lower() != "not found":
|
| 622 |
+
if stand_country.lower() in acc_score["country"]:
|
| 623 |
+
if country_explanation:
|
| 624 |
+
acc_score["country"][stand_country.lower()].append(method_used + country_explanation)
|
| 625 |
+
else:
|
| 626 |
+
acc_score["country"][stand_country.lower()] = [method_used + country_explanation]
|
| 627 |
+
# predicted country is non unknown
|
| 628 |
+
acc_score["signals"]["predicted_country"] = stand_country.lower()
|
| 629 |
else:
|
| 630 |
+
if country.lower() in acc_score["country"]:
|
| 631 |
+
if country_explanation:
|
| 632 |
+
if len(method_used + country_explanation) > 0:
|
| 633 |
+
acc_score["country"][country.lower()].append(method_used + country_explanation)
|
| 634 |
+
else:
|
|
|
|
| 635 |
if len(method_used + country_explanation) > 0:
|
| 636 |
+
acc_score["country"][country.lower()] = [method_used + country_explanation]
|
| 637 |
+
# predicted country is non unknown
|
| 638 |
+
acc_score["signals"]["predicted_country"] = country.lower()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 639 |
else:
|
| 640 |
+
# predicted country is unknown
|
| 641 |
+
acc_score["signals"]["predicted_country"] = "unknown"
|
| 642 |
+
acc_score["signals"]["known_failure_pattern"] = True
|
| 643 |
+
# for sample type
|
| 644 |
+
elif pred_out == "modern/ancient/unknown":
|
| 645 |
+
sample_type = predicted_outputs[pred_out]["answer"]
|
| 646 |
+
sample_type_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
|
| 647 |
+
if sample_type_explanation: sample_type_explanation = "-" + sample_type_explanation
|
| 648 |
+
if sample_type.lower() != "unknown":
|
| 649 |
+
if sample_type.lower() in acc_score["sample_type"]:
|
| 650 |
+
if len(method_used + sample_type_explanation) > 0:
|
| 651 |
+
acc_score["sample_type"][sample_type.lower()].append(method_used + sample_type_explanation)
|
| 652 |
+
else:
|
| 653 |
+
if len(method_used + sample_type_explanation)> 0:
|
| 654 |
+
acc_score["sample_type"][sample_type.lower()] = [method_used + sample_type_explanation]
|
| 655 |
+
# for niche cases
|
| 656 |
else:
|
| 657 |
+
if pred_out in acc_score:
|
|
|
|
| 658 |
answer = predicted_outputs[pred_out]["answer"]
|
| 659 |
answer_explanation = predicted_outputs[pred_out][pred_out+"_explanation"]
|
|
|
|
| 660 |
if answer_explanation: answer_explanation = "-" + answer_explanation
|
| 661 |
if answer.lower() != "unknown":
|
|
|
|
|
|
|
| 662 |
if answer.lower() in acc_score[pred_out]:
|
| 663 |
if len(method_used + answer_explanation) > 0:
|
| 664 |
acc_score[pred_out][answer.lower()].append(method_used + answer_explanation)
|
| 665 |
else:
|
|
|
|
| 666 |
if len(method_used + answer_explanation) > 0:
|
| 667 |
+
acc_score[pred_out][answer.lower()] = [method_used + answer_explanation]
|
| 668 |
+
|
| 669 |
+
# update total query cost
|
| 670 |
+
acc_score["query_cost"] = predicted_output_info[output_acc]["total_query_cost"]
|
| 671 |
+
# update more links if have from model
|
| 672 |
+
more_model_links = predicted_output_info[output_acc]["links"]
|
| 673 |
+
if more_model_links:
|
| 674 |
+
acc_score["source"] += more_model_links
|
| 675 |
+
# update signals
|
| 676 |
+
# add if accession_found_in_text or not
|
| 677 |
+
acc_score["signals"]["accession_found_in_text"] = predicted_output_info[output_acc]["accession_found_in_text"]
|
| 678 |
+
# add into the number of publications
|
| 679 |
+
acc_score["signals"]["num_publications"] += len(acc_score["source"])
|
| 680 |
+
print(f"end of this acc {acc}")
|
| 681 |
|
|
|
|
| 682 |
end = time.time()
|
| 683 |
+
elapsed = (end - start)
|
|
|
|
|
|
|
| 684 |
acc_score["time_cost"] = f"{elapsed:.3f} seconds"
|
| 685 |
accs_output[acc] = acc_score
|
| 686 |
+
print(accs_output)
|
|
|
|
| 687 |
return accs_output
|