WeMWish committed on
Commit
16b63c6
·
1 Parent(s): 022aa77

fix literature search

Browse files
Files changed (1) hide show
  1. agents/generation_agent.py +141 -61
agents/generation_agent.py CHANGED
@@ -390,7 +390,6 @@ class GenerationAgent:
390
  return {"thought": "Error: OpenAI client not initialized.", "python_code": "", "status": "ERROR"}
391
 
392
  # PHASE 2 FOR IMAGES: If we have an image file ID, transition directly to image analysis
393
- # Ensure file-ID format consistency by stripping "file-" prefix if present
394
  if image_file_id_for_prompt:
395
  if image_file_id_for_prompt.startswith("file-"):
396
  image_file_id_for_prompt = image_file_id_for_prompt[5:] # Remove "file-" prefix
@@ -400,90 +399,171 @@ class GenerationAgent:
400
  return {
401
  "thought": "I will analyze the image using the describe_image tool",
402
  "status": "AWAITING_DATA",
403
- "python_code": f"print(json.dumps({{'intermediate_data_for_llm': tools.describe_image('{image_file_id_for_prompt}')}})))",
404
- "explanation": "I'll analyze the image directly and provide my observations."
405
  }
406
 
407
  # Look for JSON blocks in conversation history
408
- for turn in reversed(conversation_history[-6:]):
409
- content = turn.get("content", "")
410
- m = re.search(r"```json\s*(.*?)\s*```", content, flags=re.DOTALL)
411
- if not m:
412
- continue
 
 
 
 
413
 
414
  try:
415
- json_data = json.loads(m.group(1))
 
 
416
 
417
  # PHASE 3 FOR IMAGES: Check for image description JSON
418
- if "description" in json_data:
419
  print(f"[GenerationAgent] Found image description JSON in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
420
  return {
421
- "thought": "I've analyzed the image and now I'll provide the description.",
422
  "status": "CODE_COMPLETE",
423
  "python_code": "",
424
- "explanation": json_data["description"]
425
  }
426
 
427
  # PHASE 3 FOR TF ANALYSIS: Check for TF analysis results
428
- elif "top_tfs" in json_data:
429
- print(f"[GenerationAgent] Found TF analysis JSON in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
430
- top_tfs = json_data.get("top_tfs", [])
431
  formatted_tfs = ", ".join(top_tfs) if isinstance(top_tfs, list) else str(top_tfs)
432
  return {
433
- "thought": "I've retrieved the top transcription factors as requested.",
434
  "status": "CODE_COMPLETE",
435
  "python_code": "",
436
  "explanation": f"The top transcription factors are: {formatted_tfs}"
437
  }
438
 
439
- # PHASE 2 FOR TF ANALYSIS: Check for raw data that needs analysis
440
- elif "intermediate_data_for_llm" in json_data:
441
- print(f"[GenerationAgent] Found raw data JSON in conversation history, proceeding to Phase 2 (AWAITING_ANALYSIS_CODE)")
442
 
443
- # extract N from the query
444
- nq = re.search(r"(?:top|first|most important)\s+(\d+)", user_query, flags=re.IGNORECASE)
445
- N = int(nq.group(1)) if nq else 10
446
-
447
- # build the instruction to slice first N items
448
- user_content = (
449
- f"You've already fetched the data (shown above). Now, *do not sort*. "
450
- f"Just write Python code that takes the first {N} entries of that list "
451
- f"and prints their TF names as a JSON object with the key 'top_tfs'.\n\n"
452
- "Output STRICTLY as JSON with keys:\n"
453
- " • thought: your step-by-step reasoning\n"
454
- " • status: \"AWAITING_ANALYSIS_CODE\"\n"
455
- " • python_code: only the slicing code that MUST use json.dumps\n"
456
- " • explanation: brief user-facing note\n"
457
- )
458
-
459
- msgs = [
460
- # Enforce JSON-only output format
461
- {
462
- "role": "system",
463
- "content": (
464
- "IMPORTANT: Respond with *only* a single valid JSON object—no plaintext, no markdown, "
465
- "no code fences. The JSON must have exactly these keys: "
466
- "`thought` (string), `status` (string), "
467
- "`python_code` (string), and `explanation` (string)."
468
- )
469
- },
470
- {"role":"system", "content": GENERATION_ASSISTANT_INSTRUCTIONS},
471
- {"role":"assistant", "content": self.available_tools_docs_static},
472
- {"role":"assistant", "content": self.excel_data_docs},
473
- {"role":"assistant", "content": self.discovered_excel_schema_str_for_prompt},
474
- {"role":"assistant", "content": self.www_file_manifest_str_for_prompt},
475
- {"role":"user", "content": user_content},
476
- ]
477
 
478
- # Extract image file ID if needed
479
- image_file_id_match = re.search(r"File ID: ['\\\"](file_[a-zA-Z0-9]+)['\\\"]", user_query)
480
- parsed_image_file_id = None
481
- if image_file_id_match:
482
- parsed_image_file_id = image_file_id_match.group(1)
483
- elif image_file_id_for_prompt:
484
- parsed_image_file_id = image_file_id_for_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
- return self._generate_with_chat_completion_raw(msgs, parsed_image_file_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  # Unknown JSON format
489
  else:
 
390
  return {"thought": "Error: OpenAI client not initialized.", "python_code": "", "status": "ERROR"}
391
 
392
  # PHASE 2 FOR IMAGES: If we have an image file ID, transition directly to image analysis
 
393
  if image_file_id_for_prompt:
394
  if image_file_id_for_prompt.startswith("file-"):
395
  image_file_id_for_prompt = image_file_id_for_prompt[5:] # Remove "file-" prefix
 
399
  return {
400
  "thought": "I will analyze the image using the describe_image tool",
401
  "status": "AWAITING_DATA",
402
+ "python_code": f"print(json.dumps({{'intermediate_data_for_llm': tools.describe_image(\'{image_file_id_for_prompt}\')}}))",
403
+ "explanation": "I\'ll analyze the image directly and provide my observations."
404
  }
405
 
406
  # Look for JSON blocks in conversation history
407
+ for turn in reversed(conversation_history[-6:]): # Check last 6 turns for relevant context
408
+ content_from_history = turn.get("content", "")
409
+ # Regex to find ```json ... ``` blocks
410
+ # Using re.DOTALL to make . match newlines within the JSON block
411
+ # Using re.IGNORECASE for ```json opening tag flexibility (though strictly lowercase is typical)
412
+ json_block_match = re.search(r"```json\\s*(.*?)\\s*```", content_from_history, flags=re.DOTALL | re.IGNORECASE)
413
+
414
+ if not json_block_match:
415
+ continue # No JSON block in this turn's content
416
 
417
  try:
418
+ # The actual JSON string is in group(1) of the match
419
+ json_string_from_history = json_block_match.group(1)
420
+ json_data_from_history = json.loads(json_string_from_history)
421
 
422
  # PHASE 3 FOR IMAGES: Check for image description JSON
423
+ if "description" in json_data_from_history and "intermediate_data_for_llm" not in json_data_from_history: # Avoid conflict if key names overlap
424
  print(f"[GenerationAgent] Found image description JSON in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
425
  return {
426
+ "thought": "I have analyzed the image based on the description found in history and will provide the summary.",
427
  "status": "CODE_COMPLETE",
428
  "python_code": "",
429
+ "explanation": json_data_from_history["description"]
430
  }
431
 
432
  # PHASE 3 FOR TF ANALYSIS: Check for TF analysis results
433
+ elif "top_tfs" in json_data_from_history:
434
+ print(f"[GenerationAgent] Found TF analysis JSON (top_tfs) in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
435
+ top_tfs = json_data_from_history.get("top_tfs", [])
436
  formatted_tfs = ", ".join(top_tfs) if isinstance(top_tfs, list) else str(top_tfs)
437
  return {
438
+ "thought": "I have retrieved the top transcription factors as requested from history and will present them.",
439
  "status": "CODE_COMPLETE",
440
  "python_code": "",
441
  "explanation": f"The top transcription factors are: {formatted_tfs}"
442
  }
443
 
444
+ # Check for 'intermediate_data_for_llm' which indicates fetched data
445
+ elif "intermediate_data_for_llm" in json_data_from_history:
446
+ intermediate_content = json_data_from_history["intermediate_data_for_llm"]
447
 
448
+ # Determine if this data is from a literature search tool
449
+ is_literature_search_data = False
450
+ if "CONTEXT_FROM_RESOURCE_FETCH" in content_from_history:
451
+ # Example history content: "CONTEXT_FROM_RESOURCE_FETCH (original_identifier: print(json.dumps({'intermediate_data_for_llm': tools.multi_source_literature_search(...)}))): ..."
452
+ if ("tools.multi_source_literature_search" in content_from_history or
453
+ "tools.fetch_text_from_urls" in content_from_history):
454
+ is_literature_search_data = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
+ if is_literature_search_data:
457
+ print(f"[GenerationAgent] Found literature search data (intermediate_data_for_llm) in history. Proceeding to summarization.")
458
+ # Instruction to summarize the literature data based on the original user query.
459
+ # The 'intermediate_content' is the actual list of papers/abstracts.
460
+ # The 'user_query' is the original question that triggered the search.
461
+
462
+ summarization_prompt_text = (
463
+ f"You have received literature search results (provided in the conversation history under 'intermediate_data_for_llm'). "
464
+ f"The original user query was: '{user_query}'.\n\n"
465
+ f"Please analyze this literature data in context of the user's query. "
466
+ f"Follow Step 3 of your literature search tool usage guidelines: 'Generate summaries using your LLM capabilities'. "
467
+ f"Provide a comprehensive answer to the user's query based on the fetched literature. "
468
+ f"Ensure your answer is human-readable and directly addresses the query.\n\n"
469
+ f"Your final output should be a JSON object with 'status': 'CODE_COMPLETE', "
470
+ f"and your summary in the 'explanation' field. The 'python_code' field should be empty."
471
+ )
472
+
473
+ # Construct messages for the LLM call
474
+ # This re-uses the standard message setup but with the specific summarization prompt.
475
+ # We are not asking it to generate code here, but to generate the final explanation.
476
+ messages_for_summarization = [
477
+ {"role": "system", "content": "IMPORTANT: Respond with *only* a single valid JSON object—no plaintext, no markdown, no code fences. The JSON must have exactly these keys: `thought` (string), `status` (string), `python_code` (string), and `explanation` (string)."},
478
+ {"role": "system", "content": GENERATION_ASSISTANT_INSTRUCTIONS},
479
+ {"role": "assistant", "content": "--- STATIC TOOL DOCUMENTATION (Reference as needed) ---\n" + self.available_tools_docs_static},
480
+ {"role": "assistant", "content": "--- EXCEL DATA DOCUMENTATION (Reference as needed) ---\n" + self.excel_data_docs},
481
+ {"role": "assistant", "content": "--- DYNAMICALLY DISCOVERED EXCEL SCHEMAS (Reference as needed) ---\n" + self.discovered_excel_schema_str_for_prompt},
482
+ {"role": "assistant", "content": "--- WWW DIRECTORY FILE MANIFEST (Reference for all other available files) ---\n" + self.www_file_manifest_str_for_prompt},
483
+ # Include relevant parts of conversation history so LLM sees the data it needs to summarize.
484
+ # The `intermediate_data_for_llm` is already part of the `conversation_history` fed to `generate_code_plan`,
485
+ # and the LLM has access to it via its own `GENERATION_ASSISTANT_INSTRUCTIONS` (step 2).
486
+ # The key is this new `user_content` string guiding its action.
487
+ {"role": "user", "content": summarization_prompt_text}
488
+ ]
489
+
490
+ # Add paper file if available (though less relevant for this specific summarization task, maintain consistency)
491
+ if self.paper_file_id:
492
+ formatted_paper_id = self.paper_file_id
493
+ if not formatted_paper_id.startswith("file-"): formatted_paper_id = f"file-{formatted_paper_id}"
494
+ # Ensure 'content' is a list if adding multiple parts (text + file)
495
+ if isinstance(messages_for_summarization[-1]["content"], str):
496
+ messages_for_summarization[-1]["content"] = [{"type": "text", "text": messages_for_summarization[-1]["content"]}]
497
+ messages_for_summarization[-1]["content"].append({"type": "file", "file": {"file_id": formatted_paper_id}})
498
+
499
+ # Call LLM to get the summarization plan (which should be status: CODE_COMPLETE)
500
+ # No image_file_id is needed here as we are processing text data.
501
+ return self._generate_with_chat_completion_raw(messages_for_summarization)
502
+
503
+ else: # It's intermediate_data_for_llm, but not identified as literature search - assume TF data or other structured data
504
+ print(f"[GenerationAgent] Found non-literature raw data (intermediate_data_for_llm) in history, proceeding to Phase 2 (AWAITING_ANALYSIS_CODE) for potential slicing/analysis.")
505
+
506
+ # Existing logic for TF-like data, trying to extract N, etc.
507
+ # This part assumes the data is a list and might need slicing.
508
+ nq = re.search(r"(?:top|first|most important|list(?: the|)|show(?: me the|))\s*(\d+)", user_query, flags=re.IGNORECASE)
509
+ N_extracted = int(nq.group(1)) if nq and nq.group(1) else 10 # Default to 10 if not specified or group is empty
510
+
511
+ # More robust N extraction, ensure N is at least 1.
512
+ # Fallback to a default if query doesn't specify N for "top N" type queries on this data.
513
+ N = max(1, N_extracted)
514
+
515
+
516
+ # build the instruction to slice first N items
517
+ # This prompt is specific to data that can be sliced like a list of records.
518
+ tf_slicing_prompt_text = (
519
+ f"You have already fetched data, which is present in the conversation history under 'intermediate_data_for_llm'. "
520
+ f"The user's query is: '{user_query}'.\n\n"
521
+ f"Based on the user query, it seems they might be interested in the first {N} items from this data. "
522
+ f"Write Python code that attempts to extract and present the first {N} items from the 'intermediate_data_for_llm' list. "
523
+ f"Assume 'intermediate_data_for_llm' contains a list of dictionaries or similar structures. "
524
+ f"Your Python code should access this list, take the first {N} elements, and then print these elements as a JSON object with a key like 'extracted_items'. "
525
+ f"If the data is not a list or not structured as expected, the code should handle potential errors gracefully (e.g., print an empty list or an error message within the JSON output).\n\n"
526
+ f"Output STRICTLY as JSON with keys:\n"
527
+ f" • thought: your step-by-step reasoning\n"
528
+ f" • status: \"AWAITING_ANALYSIS_CODE\"\n"
529
+ f" • python_code: only the slicing/extraction code that MUST use json.dumps\n"
530
+ f" • explanation: brief user-facing note like 'Preparing to extract the first {N} items from the fetched data.'\n"
531
+ )
532
+
533
+ msgs_for_slicing = [
534
+ {"role": "system", "content": "IMPORTANT: Respond with *only* a single valid JSON object—no plaintext, no markdown, no code fences. The JSON must have exactly these keys: `thought` (string), `status` (string), `python_code` (string), and `explanation` (string)."},
535
+ {"role":"system", "content": GENERATION_ASSISTANT_INSTRUCTIONS},
536
+ {"role":"assistant", "content": "--- STATIC TOOL DOCUMENTATION (Reference as needed) ---\n" + self.available_tools_docs_static},
537
+ {"role":"assistant", "content": "--- EXCEL DATA DOCUMENTATION (Reference as needed) ---\n" + self.excel_data_docs}, # Corrected typo from ASSISTANT_CONTENT_EXCEL_DATA_DOCS
538
+ {"role":"assistant", "content": "--- DYNAMICALLY DISCOVERED EXCEL SCHEMAS (Reference as needed) ---\n" + self.discovered_excel_schema_str_for_prompt},
539
+ {"role":"assistant", "content": "--- WWW DIRECTORY FILE MANIFEST (Reference for all other available files) ---\n" + self.www_file_manifest_str_for_prompt},
540
+ {"role":"user", "content": tf_slicing_prompt_text},
541
+ ]
542
 
543
+ # Add paper file if available
544
+ if self.paper_file_id:
545
+ formatted_paper_id = self.paper_file_id
546
+ if not formatted_paper_id.startswith("file-"): formatted_paper_id = f"file-{formatted_paper_id}"
547
+ # Ensure 'content' is a list if adding multiple parts (text + file)
548
+ if isinstance(msgs_for_slicing[-1]["content"], str):
549
+ msgs_for_slicing[-1]["content"] = [{"type": "text", "text": msgs_for_slicing[-1]["content"]}]
550
+ msgs_for_slicing[-1]["content"].append({"type": "file", "file": {"file_id": formatted_paper_id}})
551
+
552
+ # Extract image file ID if needed for this path too (though less likely relevant for slicing non-image data)
553
+ # This maintains consistency with the original structure if image_file_id_for_prompt was intended for this path.
554
+ # However, the primary focus here is the textual data in intermediate_data_for_llm.
555
+ parsed_image_file_id = None
556
+ if image_file_id_for_prompt: # Use the one passed to the function if available
557
+ parsed_image_file_id = image_file_id_for_prompt
558
+ if parsed_image_file_id.startswith("file-"): parsed_image_file_id = parsed_image_file_id[5:]
559
+ # Fallback: Try to parse from user_query if not directly provided (less reliable)
560
+ elif not parsed_image_file_id: # Add check to ensure we don't overwrite if already set
561
+ image_file_id_match = re.search(r"File ID: ['\"](file_[a-zA-Z0-9]+)['\"]", user_query)
562
+ if image_file_id_match:
563
+ parsed_image_file_id = image_file_id_match.group(1)
564
+ if parsed_image_file_id.startswith("file-"): parsed_image_file_id = parsed_image_file_id[5:]
565
+
566
+ return self._generate_with_chat_completion_raw(msgs_for_slicing, parsed_image_file_id)
567
 
568
  # Unknown JSON format
569
  else: