WeMWish committed on
Commit
16b63c6
·
1 Parent(s): 022aa77

fix literature search

Browse files
Files changed (1) hide show
  1. agents/generation_agent.py +141 -61
agents/generation_agent.py CHANGED
@@ -390,7 +390,6 @@ class GenerationAgent:
390
  return {"thought": "Error: OpenAI client not initialized.", "python_code": "", "status": "ERROR"}
391
 
392
  # PHASE 2 FOR IMAGES: If we have an image file ID, transition directly to image analysis
393
- # Ensure file-ID format consistency by stripping "file-" prefix if present
394
  if image_file_id_for_prompt:
395
  if image_file_id_for_prompt.startswith("file-"):
396
  image_file_id_for_prompt = image_file_id_for_prompt[5:] # Remove "file-" prefix
@@ -400,90 +399,171 @@ class GenerationAgent:
400
  return {
401
  "thought": "I will analyze the image using the describe_image tool",
402
  "status": "AWAITING_DATA",
403
- "python_code": f"print(json.dumps({{'intermediate_data_for_llm': tools.describe_image('{image_file_id_for_prompt}')}})))",
404
- "explanation": "I'll analyze the image directly and provide my observations."
405
  }
406
 
407
  # Look for JSON blocks in conversation history
408
- for turn in reversed(conversation_history[-6:]):
409
- content = turn.get("content", "")
410
- m = re.search(r"```json\s*(.*?)\s*```", content, flags=re.DOTALL)
411
- if not m:
412
- continue
 
 
 
 
413
 
414
  try:
415
- json_data = json.loads(m.group(1))
 
 
416
 
417
  # PHASE 3 FOR IMAGES: Check for image description JSON
418
- if "description" in json_data:
419
  print(f"[GenerationAgent] Found image description JSON in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
420
  return {
421
- "thought": "I've analyzed the image and now I'll provide the description.",
422
  "status": "CODE_COMPLETE",
423
  "python_code": "",
424
- "explanation": json_data["description"]
425
  }
426
 
427
  # PHASE 3 FOR TF ANALYSIS: Check for TF analysis results
428
- elif "top_tfs" in json_data:
429
- print(f"[GenerationAgent] Found TF analysis JSON in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
430
- top_tfs = json_data.get("top_tfs", [])
431
  formatted_tfs = ", ".join(top_tfs) if isinstance(top_tfs, list) else str(top_tfs)
432
  return {
433
- "thought": "I've retrieved the top transcription factors as requested.",
434
  "status": "CODE_COMPLETE",
435
  "python_code": "",
436
  "explanation": f"The top transcription factors are: {formatted_tfs}"
437
  }
438
 
439
- # PHASE 2 FOR TF ANALYSIS: Check for raw data that needs analysis
440
- elif "intermediate_data_for_llm" in json_data:
441
- print(f"[GenerationAgent] Found raw data JSON in conversation history, proceeding to Phase 2 (AWAITING_ANALYSIS_CODE)")
442
 
443
- # extract N from the query
444
- nq = re.search(r"(?:top|first|most important)\s+(\d+)", user_query, flags=re.IGNORECASE)
445
- N = int(nq.group(1)) if nq else 10
446
-
447
- # build the instruction to slice first N items
448
- user_content = (
449
- f"You've already fetched the data (shown above). Now, *do not sort*. "
450
- f"Just write Python code that takes the first {N} entries of that list "
451
- f"and prints their TF names as a JSON object with the key 'top_tfs'.\n\n"
452
- "Output STRICTLY as JSON with keys:\n"
453
- " • thought: your step-by-step reasoning\n"
454
- " • status: \"AWAITING_ANALYSIS_CODE\"\n"
455
- " • python_code: only the slicing code that MUST use json.dumps\n"
456
- " • explanation: brief user-facing note\n"
457
- )
458
-
459
- msgs = [
460
- # Enforce JSON-only output format
461
- {
462
- "role": "system",
463
- "content": (
464
- "IMPORTANT: Respond with *only* a single valid JSON object—no plaintext, no markdown, "
465
- "no code fences. The JSON must have exactly these keys: "
466
- "`thought` (string), `status` (string), "
467
- "`python_code` (string), and `explanation` (string)."
468
- )
469
- },
470
- {"role":"system", "content": GENERATION_ASSISTANT_INSTRUCTIONS},
471
- {"role":"assistant", "content": self.available_tools_docs_static},
472
- {"role":"assistant", "content": self.excel_data_docs},
473
- {"role":"assistant", "content": self.discovered_excel_schema_str_for_prompt},
474
- {"role":"assistant", "content": self.www_file_manifest_str_for_prompt},
475
- {"role":"user", "content": user_content},
476
- ]
477
 
478
- # Extract image file ID if needed
479
- image_file_id_match = re.search(r"File ID: ['\\\"](file_[a-zA-Z0-9]+)['\\\"]", user_query)
480
- parsed_image_file_id = None
481
- if image_file_id_match:
482
- parsed_image_file_id = image_file_id_match.group(1)
483
- elif image_file_id_for_prompt:
484
- parsed_image_file_id = image_file_id_for_prompt
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
485
 
486
- return self._generate_with_chat_completion_raw(msgs, parsed_image_file_id)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
487
 
488
  # Unknown JSON format
489
  else:
 
390
  return {"thought": "Error: OpenAI client not initialized.", "python_code": "", "status": "ERROR"}
391
 
392
  # PHASE 2 FOR IMAGES: If we have an image file ID, transition directly to image analysis
 
393
  if image_file_id_for_prompt:
394
  if image_file_id_for_prompt.startswith("file-"):
395
  image_file_id_for_prompt = image_file_id_for_prompt[5:] # Remove "file-" prefix
 
399
  return {
400
  "thought": "I will analyze the image using the describe_image tool",
401
  "status": "AWAITING_DATA",
402
+ "python_code": f"print(json.dumps({{'intermediate_data_for_llm': tools.describe_image(\'{image_file_id_for_prompt}\')}}))",
403
+ "explanation": "I\'ll analyze the image directly and provide my observations."
404
  }
405
 
406
  # Look for JSON blocks in conversation history
407
+ for turn in reversed(conversation_history[-6:]): # Check last 6 turns for relevant context
408
+ content_from_history = turn.get("content", "")
409
+ # Regex to find ```json ... ``` blocks
410
+ # Using re.DOTALL to make . match newlines within the JSON block
411
+ # Using re.IGNORECASE for ```json opening tag flexibility (though strictly lowercase is typical)
412
+ json_block_match = re.search(r"```json\\s*(.*?)\\s*```", content_from_history, flags=re.DOTALL | re.IGNORECASE)
413
+
414
+ if not json_block_match:
415
+ continue # No JSON block in this turn's content
416
 
417
  try:
418
+ # The actual JSON string is in group(1) of the match
419
+ json_string_from_history = json_block_match.group(1)
420
+ json_data_from_history = json.loads(json_string_from_history)
421
 
422
  # PHASE 3 FOR IMAGES: Check for image description JSON
423
+ if "description" in json_data_from_history and "intermediate_data_for_llm" not in json_data_from_history: # Avoid conflict if key names overlap
424
  print(f"[GenerationAgent] Found image description JSON in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
425
  return {
426
+ "thought": "I have analyzed the image based on the description found in history and will provide the summary.",
427
  "status": "CODE_COMPLETE",
428
  "python_code": "",
429
+ "explanation": json_data_from_history["description"]
430
  }
431
 
432
  # PHASE 3 FOR TF ANALYSIS: Check for TF analysis results
433
+ elif "top_tfs" in json_data_from_history:
434
+ print(f"[GenerationAgent] Found TF analysis JSON (top_tfs) in conversation history, proceeding to Phase 3 (CODE_COMPLETE)")
435
+ top_tfs = json_data_from_history.get("top_tfs", [])
436
  formatted_tfs = ", ".join(top_tfs) if isinstance(top_tfs, list) else str(top_tfs)
437
  return {
438
+ "thought": "I have retrieved the top transcription factors as requested from history and will present them.",
439
  "status": "CODE_COMPLETE",
440
  "python_code": "",
441
  "explanation": f"The top transcription factors are: {formatted_tfs}"
442
  }
443
 
444
+ # Check for 'intermediate_data_for_llm' which indicates fetched data
445
+ elif "intermediate_data_for_llm" in json_data_from_history:
446
+ intermediate_content = json_data_from_history["intermediate_data_for_llm"]
447
 
448
+ # Determine if this data is from a literature search tool
449
+ is_literature_search_data = False
450
+ if "CONTEXT_FROM_RESOURCE_FETCH" in content_from_history:
451
+ # Example history content: "CONTEXT_FROM_RESOURCE_FETCH (original_identifier: print(json.dumps({'intermediate_data_for_llm': tools.multi_source_literature_search(...)}))): ..."
452
+ if ("tools.multi_source_literature_search" in content_from_history or
453
+ "tools.fetch_text_from_urls" in content_from_history):
454
+ is_literature_search_data = True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
455
 
456
+ if is_literature_search_data:
457
+ print(f"[GenerationAgent] Found literature search data (intermediate_data_for_llm) in history. Proceeding to summarization.")
458
+ # Instruction to summarize the literature data based on the original user query.
459
+ # The 'intermediate_content' is the actual list of papers/abstracts.
460
+ # The 'user_query' is the original question that triggered the search.
461
+
462
+ summarization_prompt_text = (
463
+ f"You have received literature search results (provided in the conversation history under 'intermediate_data_for_llm'). "
464
+ f"The original user query was: '{user_query}'.\n\n"
465
+ f"Please analyze this literature data in context of the user's query. "
466
+ f"Follow Step 3 of your literature search tool usage guidelines: 'Generate summaries using your LLM capabilities'. "
467
+ f"Provide a comprehensive answer to the user's query based on the fetched literature. "
468
+ f"Ensure your answer is human-readable and directly addresses the query.\n\n"
469
+ f"Your final output should be a JSON object with 'status': 'CODE_COMPLETE', "
470
+ f"and your summary in the 'explanation' field. The 'python_code' field should be empty."
471
+ )
472
+
473
+ # Construct messages for the LLM call
474
+ # This re-uses the standard message setup but with the specific summarization prompt.
475
+ # We are not asking it to generate code here, but to generate the final explanation.
476
+ messages_for_summarization = [
477
+ {"role": "system", "content": "IMPORTANT: Respond with *only* a single valid JSON object—no plaintext, no markdown, no code fences. The JSON must have exactly these keys: `thought` (string), `status` (string), `python_code` (string), and `explanation` (string)."},
478
+ {"role": "system", "content": GENERATION_ASSISTANT_INSTRUCTIONS},
479
+ {"role": "assistant", "content": "--- STATIC TOOL DOCUMENTATION (Reference as needed) ---\n" + self.available_tools_docs_static},
480
+ {"role": "assistant", "content": "--- EXCEL DATA DOCUMENTATION (Reference as needed) ---\n" + self.excel_data_docs},
481
+ {"role": "assistant", "content": "--- DYNAMICALLY DISCOVERED EXCEL SCHEMAS (Reference as needed) ---\n" + self.discovered_excel_schema_str_for_prompt},
482
+ {"role": "assistant", "content": "--- WWW DIRECTORY FILE MANIFEST (Reference for all other available files) ---\n" + self.www_file_manifest_str_for_prompt},
483
+ # Include relevant parts of conversation history so LLM sees the data it needs to summarize.
484
+ # The `intermediate_data_for_llm` is already part of the `conversation_history` fed to `generate_code_plan`,
485
+ # and the LLM has access to it via its own `GENERATION_ASSISTANT_INSTRUCTIONS` (step 2).
486
+ # The key is this new `user_content` string guiding its action.
487
+ {"role": "user", "content": summarization_prompt_text}
488
+ ]
489
+
490
+ # Add paper file if available (though less relevant for this specific summarization task, maintain consistency)
491
+ if self.paper_file_id:
492
+ formatted_paper_id = self.paper_file_id
493
+ if not formatted_paper_id.startswith("file-"): formatted_paper_id = f"file-{formatted_paper_id}"
494
+ # Ensure 'content' is a list if adding multiple parts (text + file)
495
+ if isinstance(messages_for_summarization[-1]["content"], str):
496
+ messages_for_summarization[-1]["content"] = [{"type": "text", "text": messages_for_summarization[-1]["content"]}]
497
+ messages_for_summarization[-1]["content"].append({"type": "file", "file": {"file_id": formatted_paper_id}})
498
+
499
+ # Call LLM to get the summarization plan (which should be status: CODE_COMPLETE)
500
+ # No image_file_id is needed here as we are processing text data.
501
+ return self._generate_with_chat_completion_raw(messages_for_summarization)
502
+
503
+ else: # It's intermediate_data_for_llm, but not identified as literature search - assume TF data or other structured data
504
+ print(f"[GenerationAgent] Found non-literature raw data (intermediate_data_for_llm) in history, proceeding to Phase 2 (AWAITING_ANALYSIS_CODE) for potential slicing/analysis.")
505
+
506
+ # Existing logic for TF-like data, trying to extract N, etc.
507
+ # This part assumes the data is a list and might need slicing.
508
+ nq = re.search(r"(?:top|first|most important|list(?: the|)|show(?: me the|))\s*(\d+)", user_query, flags=re.IGNORECASE)
509
+ N_extracted = int(nq.group(1)) if nq and nq.group(1) else 10 # Default to 10 if not specified or group is empty
510
+
511
+ # More robust N extraction, ensure N is at least 1.
512
+ # Fallback to a default if query doesn't specify N for "top N" type queries on this data.
513
+ N = max(1, N_extracted)
514
+
515
+
516
+ # build the instruction to slice first N items
517
+ # This prompt is specific to data that can be sliced like a list of records.
518
+ tf_slicing_prompt_text = (
519
+ f"You have already fetched data, which is present in the conversation history under 'intermediate_data_for_llm'. "
520
+ f"The user's query is: '{user_query}'.\n\n"
521
+ f"Based on the user query, it seems they might be interested in the first {N} items from this data. "
522
+ f"Write Python code that attempts to extract and present the first {N} items from the 'intermediate_data_for_llm' list. "
523
+ f"Assume 'intermediate_data_for_llm' contains a list of dictionaries or similar structures. "
524
+ f"Your Python code should access this list, take the first {N} elements, and then print these elements as a JSON object with a key like 'extracted_items'. "
525
+ f"If the data is not a list or not structured as expected, the code should handle potential errors gracefully (e.g., print an empty list or an error message within the JSON output).\n\n"
526
+ f"Output STRICTLY as JSON with keys:\n"
527
+ f" • thought: your step-by-step reasoning\n"
528
+ f" • status: \"AWAITING_ANALYSIS_CODE\"\n"
529
+ f" • python_code: only the slicing/extraction code that MUST use json.dumps\n"
530
+ f" • explanation: brief user-facing note like 'Preparing to extract the first {N} items from the fetched data.'\n"
531
+ )
532
+
533
+ msgs_for_slicing = [
534
+ {"role": "system", "content": "IMPORTANT: Respond with *only* a single valid JSON object—no plaintext, no markdown, no code fences. The JSON must have exactly these keys: `thought` (string), `status` (string), `python_code` (string), and `explanation` (string)."},
535
+ {"role":"system", "content": GENERATION_ASSISTANT_INSTRUCTIONS},
536
+ {"role":"assistant", "content": "--- STATIC TOOL DOCUMENTATION (Reference as needed) ---\n" + self.available_tools_docs_static},
537
+ {"role":"assistant", "content": "--- EXCEL DATA DOCUMENTATION (Reference as needed) ---\n" + self.excel_data_docs}, # Corrected typo from ASSISTANT_CONTENT_EXCEL_DATA_DOCS
538
+ {"role":"assistant", "content": "--- DYNAMICALLY DISCOVERED EXCEL SCHEMAS (Reference as needed) ---\n" + self.discovered_excel_schema_str_for_prompt},
539
+ {"role":"assistant", "content": "--- WWW DIRECTORY FILE MANIFEST (Reference for all other available files) ---\n" + self.www_file_manifest_str_for_prompt},
540
+ {"role":"user", "content": tf_slicing_prompt_text},
541
+ ]
542
 
543
+ # Add paper file if available
544
+ if self.paper_file_id:
545
+ formatted_paper_id = self.paper_file_id
546
+ if not formatted_paper_id.startswith("file-"): formatted_paper_id = f"file-{formatted_paper_id}"
547
+ # Ensure 'content' is a list if adding multiple parts (text + file)
548
+ if isinstance(msgs_for_slicing[-1]["content"], str):
549
+ msgs_for_slicing[-1]["content"] = [{"type": "text", "text": msgs_for_slicing[-1]["content"]}]
550
+ msgs_for_slicing[-1]["content"].append({"type": "file", "file": {"file_id": formatted_paper_id}})
551
+
552
+ # Extract image file ID if needed for this path too (though less likely relevant for slicing non-image data)
553
+ # This maintains consistency with the original structure if image_file_id_for_prompt was intended for this path.
554
+ # However, the primary focus here is the textual data in intermediate_data_for_llm.
555
+ parsed_image_file_id = None
556
+ if image_file_id_for_prompt: # Use the one passed to the function if available
557
+ parsed_image_file_id = image_file_id_for_prompt
558
+ if parsed_image_file_id.startswith("file-"): parsed_image_file_id = parsed_image_file_id[5:]
559
+ # Fallback: Try to parse from user_query if not directly provided (less reliable)
560
+ elif not parsed_image_file_id: # Add check to ensure we don't overwrite if already set
561
+ image_file_id_match = re.search(r"File ID: ['\"](file_[a-zA-Z0-9]+)['\"]", user_query)
562
+ if image_file_id_match:
563
+ parsed_image_file_id = image_file_id_match.group(1)
564
+ if parsed_image_file_id.startswith("file-"): parsed_image_file_id = parsed_image_file_id[5:]
565
+
566
+ return self._generate_with_chat_completion_raw(msgs_for_slicing, parsed_image_file_id)
567
 
568
  # Unknown JSON format
569
  else: