VyLala committed on
Commit
06aa1bb
·
verified ·
1 Parent(s): ea8597c

Update model.py

Browse files
Files changed (1) hide show
  1. model.py +276 -446
model.py CHANGED
@@ -17,7 +17,8 @@ import asyncio
17
  import google.generativeai as genai
18
 
19
  #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
20
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
 
21
 
22
  import nltk
23
  from nltk.corpus import stopwords
@@ -972,7 +973,146 @@ def safe_call_llm(prompt, model="gemini-2.5-flash-lite", max_retries=5):
972
 
973
  raise RuntimeError("❌ Failed after max retries because of repeated rate limits.")
974
 
975
- async def query_document_info(niche_cases, query_word, alternative_query_word, saveLinkFolder, metadata, master_structured_lookup, faiss_index, document_chunks, llm_api_function, chunk=None, all_output=None, model_ai=None):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
976
  """
977
  Queries the document using a hybrid approach:
978
  1. Local structured lookup (fast, cheap, accurate for known patterns).
@@ -980,453 +1120,143 @@ async def query_document_info(niche_cases, query_word, alternative_query_word, s
980
  """
981
  print("inside the model.query_doc_info")
982
  outputs, links, accession_found_in_text = {}, [], False
983
- if model_ai:
984
- if model_ai == "gemini-1.5-flash-latest":
985
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
986
- PRICE_PER_1K_INPUT_LLM = 0.000075 # $0.075 per 1M tokens
987
- PRICE_PER_1K_OUTPUT_LLM = 0.0003 # $0.30 per 1M tokens
988
- PRICE_PER_1K_EMBEDDING_INPUT = 0.000025 # $0.025 per 1M tokens
989
- global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-1.5-flash-latest")#('gemini-1.5-flash-latest')
990
- else:
991
- genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
992
- # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
993
- PRICE_PER_1K_INPUT_LLM = 0.00010 # $0.10 per 1M input tokens
994
- PRICE_PER_1K_OUTPUT_LLM = 0.00040 # $0.40 per 1M output tokens
995
-
996
- # Embedding-001 pricing per 1,000 input tokens
997
- PRICE_PER_1K_EMBEDDING_INPUT = 0.00015 # $0.15 per 1M input tokens
998
- global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")#('gemini-1.5-flash-latest')
999
 
1000
- if metadata:
1001
- extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = metadata["country"], metadata["specific_location"], metadata["ethnicity"], metadata["sample_type"]
1002
- extracted_col_date, extracted_iso, extracted_title, extracted_features = metadata["collection_date"], metadata["isolate"], metadata["title"], metadata["all_features"]
1003
- else:
1004
- extracted_country, extracted_specific_location, extracted_ethnicity, extracted_type = "unknown", "unknown", "unknown", "unknown"
1005
- extracted_col_date, extracted_iso, extracted_title = "unknown", "unknown", "unknown"
1006
- # --- NEW: Pre-process alternative_query_word to remove '.X' suffix if present ---
1007
- if alternative_query_word:
1008
- alternative_query_word_cleaned = alternative_query_word.split('.')[0]
1009
- else:
1010
- alternative_query_word_cleaned = alternative_query_word
1011
- country_explanation, sample_type_explanation = None, None
1012
-
1013
- # Use the consolidated final_structured_entries for direct lookup
1014
- # final_structured_entries = master_structured_lookup.get('final_structured_entries', {})
1015
- # document_title = master_structured_lookup.get('document_title', 'Unknown Document Title') # Retrieve document title
1016
-
1017
- # Default values for all extracted fields. These will be updated.
1018
- method_used = 'unknown' # Will be updated based on the method that yields a result
1019
- population_code_from_sl = 'unknown' # To pass to RAG prompt if available
1020
- total_query_cost = 0
1021
- # Attempt 1: Try primary query_word (e.g., isolate name) with structured lookup
1022
- # try:
1023
- # print("try attempt 1 in model query")
1024
- # structured_info = final_structured_entries.get(query_word.upper())
1025
- # if structured_info:
1026
- # if extracted_country == 'unknown':
1027
- # extracted_country = structured_info['country']
1028
- # if extracted_type == 'unknown':
1029
- # extracted_type = structured_info['type']
1030
-
1031
- # # if extracted_ethnicity == 'unknown':
1032
- # # extracted_ethnicity = structured_info.get('ethnicity', 'unknown') # Get ethnicity from structured lookup
1033
- # # if extracted_specific_location == 'unknown':
1034
- # # extracted_specific_location = structured_info.get('specific_location', 'unknown') # Get specific_location from structured lookup
1035
- # population_code_from_sl = structured_info['population_code']
1036
- # method_used = "structured_lookup_direct"
1037
- # print(f"'{query_word}' found in structured lookup (direct match).")
1038
- # except:
1039
- # print("pass attempt 1 in model query")
1040
- # pass
1041
- # # Attempt 2: Try primary query_word with heuristic range lookup if direct fails (only if not already resolved)
1042
- # try:
1043
- # print("try attempt 2 in model query")
1044
- # if method_used == 'unknown':
1045
- # query_prefix, query_num_str = _parse_individual_code_parts(query_word)
1046
- # if query_prefix is not None and query_num_str is not None:
1047
- # try: query_num = int(query_num_str)
1048
- # except ValueError: query_num = None
1049
- # if query_num is not None:
1050
- # query_prefix_upper = query_prefix.upper()
1051
- # contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
1052
- # pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
1053
- # pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
1054
- # pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
1055
 
1056
- # if query_prefix_upper in contiguous_ranges:
1057
- # for start_num, end_num, pop_code_for_range in contiguous_ranges[query_prefix_upper]:
1058
- # if start_num <= query_num <= end_num:
1059
- # country_from_heuristic = pop_code_to_country.get(pop_code_for_range, 'unknown')
1060
- # if country_from_heuristic != 'unknown':
1061
- # if extracted_country == 'unknown':
1062
- # extracted_country = country_from_heuristic
1063
- # if extracted_type == 'unknown':
1064
- # extracted_type = 'modern'
1065
- # # if extracted_ethnicity == 'unknown':
1066
- # # extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
1067
- # # if extracted_specific_location == 'unknown':
1068
- # # extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
1069
- # population_code_from_sl = pop_code_for_range
1070
- # method_used = "structured_lookup_heuristic_range_match"
1071
- # print(f"'{query_word}' not direct. Heuristic: Falls within range {query_prefix_upper}{start_num}-{query_prefix_upper}{end_num}.")
1072
- # break
1073
- # else:
1074
- # print(f"'{query_word}' heuristic match found, but country unknown. Will fall to RAG below.")
1075
- # except:
1076
- # print("pass attempt 2 in model query")
1077
- # pass
1078
- # # Attempt 3: If primary query_word failed all structured lookups, try alternative_query_word (cleaned)
1079
- # try:
1080
- # print("try attempt 3 in model query")
1081
- # if method_used == 'unknown' and alternative_query_word_cleaned and alternative_query_word_cleaned != query_word:
1082
- # print(f"'{query_word}' not found in structured (or heuristic). Trying alternative '{alternative_query_word_cleaned}'.")
1083
 
1084
- # # Try direct lookup for alternative word
1085
- # structured_info_alt = final_structured_entries.get(alternative_query_word_cleaned.upper())
1086
- # if structured_info_alt:
1087
- # if extracted_country == 'unknown':
1088
- # extracted_country = structured_info_alt['country']
1089
- # if extracted_type == 'unknown':
1090
- # extracted_type = structured_info_alt['type']
1091
- # # if extracted_ethnicity == 'unknown':
1092
- # # extracted_ethnicity = structured_info_alt.get('ethnicity', 'unknown')
1093
- # # if extracted_specific_location == 'unknown':
1094
- # # extracted_specific_location = structured_info_alt.get('specific_location', 'unknown')
1095
- # population_code_from_sl = structured_info_alt['population_code']
1096
- # method_used = "structured_lookup_alt_direct"
1097
- # print(f"Alternative '{alternative_query_word_cleaned}' found in structured lookup (direct match).")
1098
- # else:
1099
- # # Try heuristic lookup for alternative word
1100
- # alt_prefix, alt_num_str = _parse_individual_code_parts(alternative_query_word_cleaned)
1101
- # if alt_prefix is not None and alt_num_str is not None:
1102
- # try: alt_num = int(alt_num_str)
1103
- # except ValueError: alt_num = None
1104
- # if alt_num is not None:
1105
- # alt_prefix_upper = alt_prefix.upper()
1106
- # contiguous_ranges = master_structured_lookup.get('contiguous_ranges', defaultdict(list))
1107
- # pop_code_to_country = master_structured_lookup.get('pop_code_to_country', {})
1108
- # pop_code_to_ethnicity = master_structured_lookup.get('pop_code_to_ethnicity', {})
1109
- # pop_code_to_specific_loc = master_structured_lookup.get('pop_code_to_specific_loc', {})
1110
- # if alt_prefix_upper in contiguous_ranges:
1111
- # for start_num, end_num, pop_code_for_range in contiguous_ranges[alt_prefix_upper]:
1112
- # if start_num <= alt_num <= end_num:
1113
- # country_from_heuristic_alt = pop_code_to_country.get(pop_code_for_range, 'unknown')
1114
- # if country_from_heuristic_alt != 'unknown':
1115
- # if extracted_country == 'unknown':
1116
- # extracted_country = country_from_heuristic_alt
1117
- # if extracted_type == 'unknown':
1118
- # extracted_type = 'modern'
1119
- # # if extracted_ethnicity == 'unknown':
1120
- # # extracted_ethnicity = pop_code_to_ethnicity.get(pop_code_for_range, 'unknown')
1121
- # # if extracted_specific_location == 'unknown':
1122
- # # extracted_specific_location = pop_code_to_specific_loc.get(pop_code_for_range, 'unknown')
1123
- # population_code_from_sl = pop_code_for_range
1124
- # method_used = "structured_lookup_alt_heuristic_range_match"
1125
- # break
1126
- # else:
1127
- # print(f"Alternative '{alternative_query_word_cleaned}' heuristic match found, but country unknown. Will fall to RAG below.")
1128
- # except:
1129
- # print("pass attempt 3 in model query")
1130
- # pass
1131
- # use the context_for_llm to detect present_ancient before using llm model
1132
- # retrieved_chunks_text = []
1133
- # if document_chunks:
1134
- # for idx in range(len(document_chunks)):
1135
- # retrieved_chunks_text.append(document_chunks[idx])
1136
- # context_for_llm = ""
1137
- # all_context = "\n".join(retrieved_chunks_text) #
1138
- # listOfcontexts = {"chunk": chunk,
1139
- # "all_output": all_output,
1140
- # "document_chunk": all_context}
1141
- # label, context_for_llm = chooseContextLLM(listOfcontexts, query_word)
1142
- # if not context_for_llm:
1143
- # label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
1144
- # if not context_for_llm:
1145
- # context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
1146
- # if context_for_llm:
1147
- # extracted_type, explain = mtdna_classifier.detect_ancient_flag(context_for_llm)
1148
- # extracted_type = extracted_type.lower()
1149
- # sample_type_explanation = explain
1150
- # 5. Execute RAG if needed (either full RAG or targeted RAG for missing fields)
1151
-
1152
- # Determine if a RAG call is necessary
1153
- # run_rag = (extracted_country == 'unknown' or extracted_type == 'unknown')# or \
1154
- # #extracted_ethnicity == 'unknown' or extracted_specific_location == 'unknown')
1155
- run_rag = True
1156
- if run_rag:
1157
- print("try run rag")
1158
- context_for_llm = ""
1159
- # Determine the phrase for LLM query
1160
- rag_query_phrase = ""
1161
- if query_word.lower() != "unknown":
1162
- rag_query_phrase += f"the mtDNA isolate name '{query_word}'"
1163
- # Accession number (alternative_query_word)
1164
- if (
1165
- alternative_query_word_cleaned
1166
- and alternative_query_word_cleaned != query_word
1167
- and alternative_query_word_cleaned.lower() != "unknown"
1168
- ):
1169
- if rag_query_phrase:
1170
- rag_query_phrase += f" or its accession number '{alternative_query_word_cleaned}'"
1171
- else:
1172
- rag_query_phrase += f"the accession number '{alternative_query_word_cleaned}'"
1173
-
1174
- # Construct a more specific semantic query phrase for embedding if structured info is available
1175
- semantic_query_for_embedding = rag_query_phrase # Default
1176
-
1177
- prompt_instruction_prefix = ""
1178
- output_format_str = ""
1179
-
1180
- # Determine if it's a full RAG or targeted RAG scenario based on what's already extracted
1181
- is_full_rag_scenario = True#(extracted_country == 'unknown')
1182
-
1183
- if is_full_rag_scenario: # Full RAG scenario
1184
- output_format_str = "country_name, modern/ancient/unknown"#, ethnicity, specific_location/unknown"
1185
- explain_list = "country or sample type (modern/ancient)"
1186
- if niche_cases:
1187
- output_format_str += ", "+ ", ".join(niche_cases)# "ethnicity, specific_location/unknown"
1188
- explain_list += " or "+ " or ".join(niche_cases)
1189
- method_used = "rag_llm"
1190
- print(f"Proceeding to FULL RAG for {rag_query_phrase}.")
1191
-
1192
- current_embedding_cost = 0
1193
-
1194
- print("direct to llm")
1195
- listOfcontexts = {"chunk": chunk,
1196
- "all_output": all_output,
1197
- "document_chunk": chunk}
1198
- label, context_for_llm = chooseContextLLM(listOfcontexts, query_word)
1199
- if not context_for_llm:
1200
- label, context_for_llm = chooseContextLLM(listOfcontexts, alternative_query_word_cleaned)
1201
- if not context_for_llm:
1202
- context_for_llm = "Collection_date: " + col_date +". Isolate: " + iso + ". Title: " + title + ". Features: " + extracted_features
1203
-
1204
- if len(context_for_llm) > 1000*1000:
1205
- context_for_llm = context_for_llm[:900000]
1206
-
1207
- # fix the prompt better:
1208
- # firstly clarify more by saying which type of organism, prioritize homo sapiens
1209
- features = metadata["all_features"]
1210
- organism = "general"
1211
- if features != "unknown":
1212
- if "organism" in features:
1213
- try:
1214
- organism = features.split("organism: ")[1].split("\n")[0]
1215
- except:
1216
- organism = features.replace("\n","; ")
1217
-
1218
- niche_prompt = ""
1219
- if niche_cases:
1220
- fields_list = ", ".join(niche_cases)
1221
- niche_prompt = (
1222
- f"Also, extract {fields_list}. "
1223
- f"If not explicitly stated, infer the most specific related or contextually relevant value. "
1224
- f"If no information is found, write 'unknown'. "
1225
- )
1226
- prompt_for_llm = (
1227
- f"{prompt_instruction_prefix}"
1228
- f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
1229
- f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
1230
- f"Identify its **primary associated geographic location**, preferring the most specific available: "
1231
- f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
1232
- f"the next most specific region, continent, island, or other clear geographic area mentioned. "
1233
- f"If no geographic clues at all are present, state 'unknown' for location. "
1234
- f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
1235
- f"or 'ancient' (prehistoric/archaeological) source. "
1236
- f"If the text does not specify ancient or archaeological context, assume 'modern'. "
1237
- f"{niche_prompt}"
1238
- f"Provide only {output_format_str}. "
1239
- f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
1240
- f"For each non-'unknown' field in {explain_list}, write one sentence explaining how it was inferred from the text "
1241
- f"(one sentence for each). "
1242
- f"Format your answer so that:\n"
1243
- f"1. The **first line** contains only the {output_format_str} values separated by commas.\n"
1244
- f"2. The **second line onward** contains the explanations based on the order of the non-unknown {output_format_str} answer.\n"
1245
- f"\nText Snippets:\n{context_for_llm}")
1246
 
1247
- print("this is prompt: ", prompt_for_llm)
1248
- # check if accession in text or not
1249
- if alternative_query_word_cleaned.lower() in prompt_for_llm.lower():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1250
  accession_found_in_text = True
1251
-
1252
- if model_ai:
1253
- print("back up to ", model_ai)
1254
- #llm_response_text, model_instance = call_llm_api(prompt_for_llm, model=model_ai)
1255
- llm_response_text, model_instance = safe_call_llm(prompt_for_llm, model=model_ai)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1256
  else:
1257
- print("still 2.5 flash gemini")
1258
- llm_response_text, model_instance = safe_call_llm(prompt_for_llm)
1259
- #llm_response_text, model_instance = call_llm_api(prompt_for_llm)
1260
- print("\n--- DEBUG INFO FOR RAG ---")
1261
- print("Retrieved Context Sent to LLM (first 500 chars):")
1262
- print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
1263
- print("\nRaw LLM Response:")
1264
- print(llm_response_text)
1265
- print("--- END DEBUG INFO ---")
1266
-
1267
- llm_cost = 0
1268
- if model_instance:
1269
- try:
1270
- input_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(prompt_for_llm).total_tokens
1271
- output_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(llm_response_text).total_tokens
1272
- print(f" DEBUG: LLM Input tokens: {input_llm_tokens}")
1273
- print(f" DEBUG: LLM Output tokens: {output_llm_tokens}")
1274
- llm_cost = (input_llm_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
1275
- (output_llm_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
1276
- print(f" DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
1277
- except Exception as e:
1278
- print(f" DEBUG: Error counting LLM tokens: {e}")
1279
- llm_cost = 0
1280
-
1281
- total_query_cost += current_embedding_cost + llm_cost
1282
- print(f" DEBUG: Total estimated cost for this RAG query: ${total_query_cost:.6f}")
1283
-
1284
- metadata_list = parse_multi_sample_llm_output(llm_response_text, output_format_str)
1285
-
1286
- print(metadata_list)
1287
- again_output_format, general_knowledge_prompt = "", ""
1288
- # if at least 1 answer is unknown, then do smart queries to get more sources besides doi
1289
- unknown_count = sum(1 for v in metadata_list.values() if v.get("answer").lower() == "unknown")
1290
- if unknown_count >= 1:
1291
- print("at least 1 unknown outputs")
1292
- out_links = {}
1293
- iso, acc = query_word, alternative_query_word
1294
- meta_expand = smart_fallback.fetch_ncbi(acc)
1295
- tem_links = smart_fallback.smart_google_search(acc, meta_expand)
1296
- tem_links = pipeline.unique_preserve_order(tem_links)
1297
- print("this is tem links with acc: ", tem_links)
1298
- # filter the quality link
1299
- print("start the smart filter link")
1300
- #success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,saveLinkFolder),kwargs={"accession":acc},timeout=90)
1301
- output_process = await smart_fallback.async_filter_links_by_metadata(
1302
- tem_links, saveLinkFolder, accession=acc
1303
- )
1304
-
1305
- if output_process:
1306
- out_links.update(output_process)
1307
- print("yeah we have out_link and len: ", len(out_links))
1308
- print("yes succeed for smart filter link")
1309
- links += list(out_links.keys())
1310
- print("link keys: ", links)
1311
- if links:
1312
- tasks = [
1313
- pipeline.process_link_chunk_allOutput(link, iso, acc, saveLinkFolder, out_links, all_output, chunk)
1314
- for link in links
1315
- ]
1316
- print(f"Number of tasks to gather: {len(tasks)}")
1317
- try:
1318
- #results = await asyncio.gather(*tasks)
1319
- results = await asyncio.gather(*tasks, return_exceptions=True)
1320
- print(f"Results: {results}")
1321
- for result in results:
1322
- if isinstance(result, Exception):
1323
- print(f"Error in task: {result}")
1324
- else:
1325
- print(f"Task completed successfully")
1326
-
1327
- print("Finished gathering results")
1328
- except Exception as e:
1329
- print(f"Error in gathering: {e}")
1330
-
1331
- #results = await asyncio.gather(*tasks)
1332
- # combine results
1333
- print("get results for context_for_llm")
1334
- for context, new_all_output, new_chunk in results:
1335
- print("inside new results")
1336
- context_for_llm += new_all_output
1337
- context_for_llm += new_chunk
1338
- print("len of context after merge all: ", len(context_for_llm))
1339
-
1340
- if len(context_for_llm) > 750000:
1341
- context_for_llm = data_preprocess.normalize_for_overlap(context_for_llm)
1342
- if len(context_for_llm) > 750000:
1343
- # use build context for llm function to reduce token
1344
- texts_reduce = []
1345
- out_links_reduce = {}
1346
- reduce_context_for_llm = ""
1347
- if links:
1348
- for link in links:
1349
- all_output_reduce, chunk_reduce, context_reduce = "", "",""
1350
- context_reduce, all_output_reduce, chunk_reduce = await pipeline.process_link_chunk_allOutput(link,
1351
- iso, acc, saveLinkFolder, out_links_reduce,
1352
- all_output_reduce, chunk_reduce)
1353
- texts_reduce.append(all_output_reduce)
1354
- out_links_reduce[link] = {"all_output": all_output_reduce}
1355
- input_prompt = ["country_name", "modern/ancient/unknown"]
1356
- if niche_cases: input_prompt += niche_cases
1357
- reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt)
1358
- if reduce_context_for_llm:
1359
- print("reduce context for llm")
1360
- context_for_llm = reduce_context_for_llm
1361
- else:
1362
- print("no reduce context for llm despite>1M")
1363
- context_for_llm = context_for_llm[:250000]
1364
-
1365
- for key in metadata_list:
1366
- answer = metadata_list[key]["answer"]
1367
- if answer.lower() in " ".join(["unknown", "unspecified","could not get response from llm api.", "undefined"]):
1368
- print("have to do again")
1369
- again_output_format = key
1370
- print("output format:", again_output_format)
1371
- general_knowledge_prompt = (
1372
- f"{prompt_instruction_prefix}"
1373
- f"Given the following text snippets, analyze the entity/concept {rag_query_phrase} "
1374
- f"or the mitochondrial DNA sample in {organism} if these identifiers are not explicitly found. "
1375
- f"Identify and extract {again_output_format}"
1376
- f"If not explicitly stated, infer the most specific related or contextually relevant value. "
1377
- f"If no information is found, write 'unknown'. "
1378
- f"Provide only {again_output_format}. "
1379
- f"For non-'unknown' field in {again_output_format}, write one sentence explaining how it was inferred from the text "
1380
- f"Format your answer so that:\n"
1381
- f"1. The **first line** contains only the {again_output_format} answer.\n"
1382
- f"2. The **second line onward** contains the explanations based on the non-unknown {again_output_format} answer.\n"
1383
- f"\nText Snippets:\n{context_for_llm}")
1384
- print("len of prompt:", len(general_knowledge_prompt))
1385
- if alternative_query_word_cleaned.lower() in general_knowledge_prompt.lower():
1386
- accession_found_in_text = True
1387
- if general_knowledge_prompt:
1388
- if model_ai:
1389
- print("back up to ", model_ai)
1390
- llm_response_text, model_instance = safe_call_llm(general_knowledge_prompt, model=model_ai)
1391
- #llm_response_text, model_instance = call_llm_api(general_knowledge_prompt, model=model_ai)
1392
- else:
1393
- print("still 2.5 flash gemini")
1394
- llm_response_text, model_instance = safe_call_llm(general_knowledge_prompt)
1395
- #llm_response_text, model_instance = call_llm_api(general_knowledge_prompt)
1396
- print("\n--- DEBUG INFO FOR RAG ---")
1397
- print("Retrieved Context Sent to LLM (first 500 chars):")
1398
- print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
1399
- print("\nRaw LLM Response:")
1400
- print(llm_response_text)
1401
- print("--- END DEBUG INFO ---")
1402
-
1403
- llm_cost = 0
1404
- if model_instance:
1405
- try:
1406
- input_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(prompt_for_llm).total_tokens
1407
- output_llm_tokens = global_llm_model_for_counting_tokens.count_tokens(llm_response_text).total_tokens
1408
- print(f" DEBUG: LLM Input tokens: {input_llm_tokens}")
1409
- print(f" DEBUG: LLM Output tokens: {output_llm_tokens}")
1410
- llm_cost = (input_llm_tokens / 1000) * PRICE_PER_1K_INPUT_LLM + \
1411
- (output_llm_tokens / 1000) * PRICE_PER_1K_OUTPUT_LLM
1412
- print(f" DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
1413
- except Exception as e:
1414
- print(f" DEBUG: Error counting LLM tokens: {e}")
1415
- llm_cost = 0
1416
-
1417
- total_query_cost += current_embedding_cost + llm_cost
1418
- print("total query cost in again: ", total_query_cost)
1419
- metadata_list_one_case = parse_multi_sample_llm_output(llm_response_text, again_output_format)
1420
- print("metadata list after running again unknown output: ", metadata_list)
1421
- for key in metadata_list_one_case:
1422
- print("keys of outputs: ", outputs.keys())
1423
- if key not in list(outputs.keys()):
1424
- print("this is key and about to be added into outputs: ", key)
1425
- outputs[key] = metadata_list_one_case[key]
1426
- else:
1427
- outputs[key] = metadata_list[key]
1428
-
1429
- print("all done and method used: ", outputs, method_used)
1430
- print("total cost: ", total_query_cost)
1431
- return outputs, method_used, total_query_cost, links, accession_found_in_text
1432
-
 
17
  import google.generativeai as genai
18
 
19
  #genai.configure(api_key=os.getenv("GOOGLE_API_KEY"))
20
+ #genai.configure(api_key=os.getenv("GOOGLE_API_KEY_BACKUP"))
21
+ genai.configure(api_key=os.getenv("NEW_GOOGLE_API_KEY"))
22
 
23
  import nltk
24
  from nltk.corpus import stopwords
 
973
 
974
  raise RuntimeError("❌ Failed after max retries because of repeated rate limits.")
975
 
976
def outputs_from_multiPrompts(raw_response: str, output_format_str, acc_prompts):
    """Split a combined multi-prompt LLM response into per-accession outputs.

    The raw response is expected to contain one section per accession,
    delimited by '**Prompt N:' markers; section i is mapped to the i-th
    key of *acc_prompts* (dict insertion order).

    Args:
        raw_response: Full LLM response text containing all prompt answers.
        output_format_str: Field-format string forwarded to
            parse_multi_sample_llm_output when parsing each section.
        acc_prompts: Dict keyed by accession number; only its keys (and
            their order) are used here.

    Returns:
        Dict mapping each accession to the parsed metadata dict for its
        response section.
    """
    # Split on the '**Prompt X:' delimiters.
    # BUG FIX: the original called re.split(..., text) with `text`
    # undefined, which raised NameError on every call.
    sections = re.split(r'\*\*Prompt \d+:', raw_response)

    # Drop empty pieces produced by the split (e.g. a leading delimiter).
    prompts = [prompt.strip() for prompt in sections if prompt.strip()]

    outputs = {}
    accs = list(acc_prompts.keys())
    # Pair each section with the matching accession.  Guard with min():
    # the original indexed accs[i] unchecked and raised IndexError when
    # the LLM emitted more sections than there are accessions.
    for i in range(min(len(prompts), len(accs))):
        prompt_header = prompts[i].strip()
        prompt_header = re.sub(r'^\*\*\n', '', prompt_header)  # drop any leading '**\n'
        accession = accs[i]
        # NOTE(review): the original also appends the *next* section as a
        # "body", so adjacent sections overlap across iterations.  Behavior
        # preserved here — verify against the LLM's actual response layout.
        if i + 1 < len(prompts):
            prompt_body = prompts[i + 1].strip()
            output = f"{prompt_header}\n\n{prompt_body}"
        else:
            # Last section: no following body, keep the header only.
            output = f"{prompt_header}\n\n"
        metadata_list = parse_multi_sample_llm_output(output, output_format_str)
        outputs[accession] = metadata_list
    return outputs
1001
+
1002
def multi_prompts(dictsAccs, output_format_str, niche_cases=None, prompt_template="default"):
    """Build one LLM extraction prompt per accession number.

    Args:
        dictsAccs: Mapping of accession number -> context text, e.g.
            {"acc1": "text1", "acc2": "text2", "acc3": "text3"}.
        output_format_str: Comma-separated field list the LLM must return
            on the first line of its answer.
        niche_cases: Optional list of extra field names to request.
        prompt_template: Prompt layout selector; only "default" is
            currently implemented.

    Returns:
        Dict mapping each accession to ``[prompt_text, accession_found_in_text]``,
        where the boolean records whether the version-stripped accession
        literally occurs in its own context text.

    Raises:
        ValueError: If *prompt_template* is not "default".  (BUG FIX: the
            original fell through with ``prompt_for_llm`` unbound and
            crashed with UnboundLocalError instead.)
    """
    if prompt_template != "default":
        raise ValueError(f"Unsupported prompt_template: {prompt_template!r}")

    # Optional extra-field instructions, shared by every prompt.
    if niche_cases:
        fields_list = ", ".join(niche_cases)
        niche_prompt = (
            f"Also, extract {fields_list}. "
            f"If not explicitly stated, infer the most specific related or contextually relevant value. "
            f"If no information is found, write 'unknown'. "
        )
    else:
        niche_prompt = ""

    prompts = {}
    # enumerate from 1 so the "Prompt N" labels match the old acc_pos+1 numbering
    for acc_pos, (acc, context_for_llm) in enumerate(dictsAccs.items(), start=1):
        # Strip a trailing version suffix ("AB123.1" -> "AB123"); falsy
        # accessions (e.g. "") pass through unchanged, as before.
        acc_cleaned = acc.split('.')[0] if acc else acc

        prompt_for_llm = (
            f"Prompt {acc_pos}: "
            f"Given the following text snippets, analyze the entity/concept of this accession number {acc_cleaned} "
            f"Identify its **primary associated geographic location**, preferring the most specific available: "
            f"first try to determine the exact country; if no country is explicitly mentioned, then provide "
            f"the next most specific region, continent, island, or other clear geographic area mentioned. "
            f"If no geographic clues at all are present, state 'unknown' for location. "
            f"Also, determine if the genetic sample is from a 'modern' (present-day living individual) "
            f"or 'ancient' (prehistoric/archaeological) source. "
            f"If the text does not specify ancient or archaeological context, assume 'modern'. "
            f"{niche_prompt}"
            f"Provide only {output_format_str}. "
            f"If any information is not explicitly present, use the fallback rules above before defaulting to 'unknown'. "
            f"For each non-'unknown' field, write one sentence explaining how it was inferred from the text "
            f"(one sentence for each). "
            f"Format your answer so that:\n"
            f"1. The **first line** contains only the {output_format_str} values separated by commas.\n"
            f"2. The **second line onward** contains the explanations based on the order of the non-unknown {output_format_str} answer.\n"
            f"\nText Snippets:\n{context_for_llm}")

        # Record whether the accession itself appears in its context text.
        accession_found_in_text = acc_cleaned.lower() in context_for_llm.lower()
        prompts[acc] = [prompt_for_llm, accession_found_in_text]
    return prompts
1052
+
1053
async def getMoreInfoForAcc(iso=None, acc=None, saveLinkFolder=None, niche_cases=None, limit_context=250000):
    """Gather and size-limit web/NCBI context text for one accession.

    Pipeline: fetch NCBI metadata for *acc*, run a smart Google search,
    de-duplicate the hits, metadata-filter them, then download/extract
    text from each surviving link and concatenate it.  Oversized context
    (> 500k chars) is normalized and, if still too large, rebuilt with
    data_preprocess.build_context_for_llm or hard-truncated to
    *limit_context* characters.

    Args:
        iso: Isolate name forwarded to pipeline.process_link_allOutput.
        acc: Accession number driving the search and filtering.
        saveLinkFolder: Folder where fetched link content is cached/saved.
        niche_cases: Optional extra field names appended to the field list
            used when rebuilding a reduced context.
        limit_context: Max characters of the returned context text.

    Returns:
        Tuple ``(context_for_llm, linksWithTexts, links)``: the merged
        (possibly reduced) context string, the dict of filtered links with
        their metadata, and the ordered list of links actually processed.
    """
    linksWithTexts, links, context_for_llm = {}, [], ""
    # Expand metadata from NCBI, then search the web with it.
    meta_expand = smart_fallback.fetch_ncbi(acc)
    raw_tem_links = smart_fallback.smart_google_search(acc, meta_expand)
    # De-duplicate while keeping first-seen order.
    tem_links = pipeline.unique_preserve_order(raw_tem_links)
    print("this is tem links with acc: ", tem_links)
    # filter the quality link
    print("start the smart filter link")
    #success_process, output_process = run_with_timeout(smart_fallback.filter_links_by_metadata,args=(tem_links,saveLinkFolder),kwargs={"accession":acc},timeout=90)
    output_process = await smart_fallback.async_filter_links_by_metadata(
        tem_links, saveLinkFolder, accession=acc
    )
    print('inside getMoreInfoForAcc and here is outputProcess: ', output_process)
    if output_process:
        # Filter succeeded: keep only the vetted links.
        linksWithTexts.update(output_process)
        print("yeah we have linksWithTexts and len: ", len(linksWithTexts))
        print("yes succeed for smart filter link")
        links += list(linksWithTexts.keys())
        print("link keys: ", links)
    else:
        # Filter returned nothing: fall back to the raw search hits.
        print("not have output_process")
        links += tem_links
    if links:
        # use build context for llm function to reduce token
        texts_reduce = []
        linksWithTexts_reduce = {}
        reduce_context_for_llm = ""
        print("links:", links)
        # Links are processed sequentially (not gathered) on purpose —
        # each call receives linksWithTexts_reduce and the running
        # context, which are mutated/accumulated across iterations.
        for link in links:
            print("link: ", link)
            new_all_output = await pipeline.process_link_allOutput(link,
                                        iso, acc, saveLinkFolder, linksWithTexts_reduce, context_for_llm)
            print("done all output")
            context_for_llm += new_all_output
            texts_reduce.append(new_all_output)
            linksWithTexts_reduce[link] = {"all_output": new_all_output}
        # tasks = [
        #     pipeline.process_link_allOutput(link, iso, acc, saveLinkFolder, linksWithTexts, all_output)
        #     for link in links
        # ]
        # results = await asyncio.gather(*tasks)
        # print("this is result:", results)
        # # combine results
        # for new_all_output in results:
        #     context_for_llm += new_all_output
        print("len of context after merge all: ", len(context_for_llm))

        # First reduction pass: cheap normalization of overlapping text.
        if len(context_for_llm) > 500000:
            context_for_llm = data_preprocess.normalize_for_overlap(context_for_llm)
        # Second pass: still too big — rebuild a focused context, else truncate.
        if len(context_for_llm) > 500000:
            if links:
                input_prompt = ["country_name", "modern/ancient/unknown"]
                if niche_cases: input_prompt += niche_cases
                reduce_context_for_llm = data_preprocess.build_context_for_llm(texts_reduce, acc, input_prompt, limit_context)
            if reduce_context_for_llm:
                print("reduce context for llm")
                context_for_llm = reduce_context_for_llm
            else:
                print("no reduce context for llm despite>1M")
                context_for_llm = context_for_llm[:limit_context]
    return context_for_llm, linksWithTexts, links
1114
+
1115
def _estimate_llm_cost(counter_model, prompt_text, response_text, price_in_per_1k, price_out_per_1k):
    """Return the estimated dollar cost of one LLM call; 0 on any token-count failure."""
    try:
        input_llm_tokens = counter_model.count_tokens(prompt_text).total_tokens
        output_llm_tokens = counter_model.count_tokens(response_text).total_tokens
        print(f" DEBUG: LLM Input tokens: {input_llm_tokens}")
        print(f" DEBUG: LLM Output tokens: {output_llm_tokens}")
        llm_cost = (input_llm_tokens / 1000) * price_in_per_1k + \
                   (output_llm_tokens / 1000) * price_out_per_1k
        print(f" DEBUG: Estimated LLM cost: ${llm_cost:.6f}")
        return llm_cost
    except Exception as e:
        print(f" DEBUG: Error counting LLM tokens: {e}")
        return 0


async def query_document_info(niche_cases, saveLinkFolder, llm_api_function, prompts):
    """
    Queries the document using a hybrid approach:
    1. One batched RAG/LLM call covering every accession's prompt.
    2. Per-field retries with freshly fetched web context (getMoreInfoForAcc)
       whenever an answer comes back unknown/unspecified.

    Args:
        niche_cases: optional list of extra field names to extract in addition
            to country_name and modern/ancient/unknown.
        saveLinkFolder: cache folder forwarded to getMoreInfoForAcc.
        llm_api_function: kept for interface compatibility.
            # NOTE(review): unused — call_llm_api is what is actually invoked.
        prompts: dict mapping accession -> raw context text used both to build
            the batched prompts and as fallback context for retries.

    Returns:
        dict mapping accession -> {"predicted_output", "method_used",
        "total_query_cost", "links", "accession_found_in_text"}.
    """
    print("inside the model.query_doc_info")
    outputs, links, accession_found_in_text = {}, [], False

    genai.configure(api_key=os.getenv("NEW_GOOGLE_API_KEY"))
    # Gemini 2.5 Flash-Lite pricing per 1,000 tokens
    PRICE_PER_1K_INPUT_LLM = 0.00010   # $0.10 per 1M input tokens
    PRICE_PER_1K_OUTPUT_LLM = 0.00040  # $0.40 per 1M output tokens
    # Embedding-001 pricing per 1,000 input tokens
    PRICE_PER_1K_EMBEDDING_INPUT = 0.00015  # $0.15 per 1M input tokens
    global_llm_model_for_counting_tokens = genai.GenerativeModel("gemini-2.5-flash-lite")

    # Determine fields to ask the LLM for and the expected output format.
    output_format_str = "country_name, modern/ancient/unknown"
    method_used = 'rag_llm'  # updated if another method yields the result
    if niche_cases:
        output_format_str += ", " + ", ".join(niche_cases)
    total_query_cost, current_embedding_cost = 0, 0
    created_prompts = multi_prompts(prompts, output_format_str, niche_cases=niche_cases, prompt_template="default")
    print("done create prompt and length: ", len(created_prompts))
    prompt_for_llm = []
    for acc in created_prompts:
        # created_prompts[acc] is [prompt_text, accession_found_in_text_flag]
        outputs[acc] = {"predicted_output": "",
                        "method_used": method_used,
                        "total_query_cost": None,
                        "links": [],
                        "accession_found_in_text": created_prompts[acc][1],
                        }
        prompt_for_llm.append(created_prompts[acc][0])

    prompt_for_llm = "\n".join(prompt_for_llm)
    print("length of prompt: ", len(prompt_for_llm))
    print("use 2.5 flash gemini")
    llm_response_text, model_instance = call_llm_api(prompt_for_llm)
    print("\n--- DEBUG INFO FOR RAG ---")
    print("Retrieved Context Sent to LLM (first 500 chars):")
    print(prompt_for_llm[:500] + "..." if len(prompt_for_llm) > 500 else prompt_for_llm)
    print("\nRaw LLM Response:")
    print(llm_response_text)
    print("--- END DEBUG INFO ---")

    llm_cost = 0
    if model_instance:
        llm_cost = _estimate_llm_cost(global_llm_model_for_counting_tokens,
                                      prompt_for_llm, llm_response_text,
                                      PRICE_PER_1K_INPUT_LLM, PRICE_PER_1K_OUTPUT_LLM)

    total_query_cost += current_embedding_cost + llm_cost
    print(f" DEBUG: Total estimated cost for this RAG query: ${total_query_cost:.6f}")

    metadata_list = parse_multi_sample_llm_output(llm_response_text, output_format_str)
    multi_metadata_lists = [metadata_list]
    list_accs = list(prompts.keys())
    # Values whose presence means the LLM failed to answer a field.
    UNANSWERED = {"unknown", "unspecified", "could not get response from llm api.", "undefined"}
    for metadata_list_pos in range(len(multi_metadata_lists)):
        metadata_list = multi_metadata_lists[metadata_list_pos]
        print(metadata_list)
        acc = list_accs[metadata_list_pos]
        # BUGFIX: acc_cleaned must be recomputed per accession; it was computed
        # once before this loop from a stale loop variable.
        acc_cleaned = acc.split(".")[0] if acc else acc
        again_output_format, general_knowledge_prompt = "", ""
        output_acc = {}
        # If at least 1 answer is unknown, do smart queries to get more
        # sources besides the DOI.
        # BUGFIX: guard against a missing "answer" key (v.get may return None).
        unknown_count = sum(1 for v in metadata_list.values()
                            if (v.get("answer") or "").lower() == "unknown")
        if unknown_count >= 1:
            print("at least 1 unknown outputs")
            context_for_llm, linksWithTexts, more_links = await getMoreInfoForAcc(
                iso=None, acc=acc, saveLinkFolder=saveLinkFolder,
                niche_cases=niche_cases, limit_context=250000)
            links += more_links
            if acc_cleaned.lower() in context_for_llm.lower():
                accession_found_in_text = True
            # Refresh the flag and links now that new context was fetched.
            outputs[acc]["accession_found_in_text"] = accession_found_in_text
            outputs[acc]["links"] = links
        else:
            context_for_llm = prompts[acc]
        for key in metadata_list:
            answer = metadata_list[key]["answer"]
            # BUGFIX: exact membership test. The original used
            # `answer.lower() in " ".join([...])`, a substring test that also
            # matched fragments such as "un" or "response".
            if answer.lower() in UNANSWERED:
                print("have to do again")
                again_output_format = key
                print("output format:", again_output_format)
                general_knowledge_prompt = (
                    f"Given the following text snippets, analyze the entity/concept of this accession number {acc_cleaned} "
                    #f"or the mitochondrial DNA sample if these identifiers are not explicitly found. "
                    f"Identify and extract {again_output_format}"
                    f"If not explicitly stated, infer the most specific related or contextually relevant value. "
                    f"If no information is found, write 'unknown'. "
                    f"Provide only {again_output_format}. "
                    f"For non-'unknown' field in {again_output_format}, write one sentence explaining how it was inferred from the text "
                    f"Format your answer so that:\n"
                    f"1. The **first line** contains only the {again_output_format} answer.\n"
                    f"2. The **second line onward** contains the explanations based on the non-unknown {again_output_format} answer.\n"
                    f"\nText Snippets:\n{context_for_llm}")
                print("len of general prompt:", len(general_knowledge_prompt))
                if general_knowledge_prompt:
                    print("use 2.5 flash gemini")
                    llm_response_text, model_instance = call_llm_api(general_knowledge_prompt)
                    print("\n--- DEBUG INFO FOR RAG ---")
                    print("Retrieved Context Sent to LLM (first 500 chars):")
                    print(context_for_llm[:500] + "..." if len(context_for_llm) > 500 else context_for_llm)
                    print("\nRaw LLM Response:")
                    print(llm_response_text)
                    print("--- END DEBUG INFO ---")
                    llm_cost = 0
                    if model_instance:
                        # BUGFIX: count tokens of the retry prompt actually
                        # sent (the original re-counted the batched prompt).
                        llm_cost = _estimate_llm_cost(
                            global_llm_model_for_counting_tokens,
                            general_knowledge_prompt, llm_response_text,
                            PRICE_PER_1K_INPUT_LLM, PRICE_PER_1K_OUTPUT_LLM)
                    total_query_cost += current_embedding_cost + llm_cost
                    print("total query cost in again: ", total_query_cost)
                    metadata_list_niche = parse_multi_sample_llm_output(llm_response_text, again_output_format)
                    print(f"metadata list output for {again_output_format}: {metadata_list}")
                    for key_niche in metadata_list_niche:
                        # NOTE(review): this guard keeps field names from
                        # colliding with accession keys in `outputs`;
                        # presumably intended — confirm against callers.
                        if key_niche not in outputs.keys():
                            output_acc[key_niche] = metadata_list_niche[key_niche]
            else:
                output_acc[key] = metadata_list[key]
        outputs[acc]["predicted_output"] = output_acc
        outputs[acc]["total_query_cost"] = total_query_cost
        print("total cost: ", total_query_cost)
        print(f"total output of {acc}: {outputs[acc]}")
    return outputs