Spaces:
Sleeping
Sleeping
ernani
commited on
Commit
·
b13a99c
1
Parent(s):
794ea68
fixing some prompts - adjusting outputs
Browse files- manage_agents.py +114 -50
- tools.py +122 -129
manage_agents.py
CHANGED
|
@@ -16,6 +16,7 @@ from tools import (
|
|
| 16 |
AudioTool,
|
| 17 |
ExcelTool,
|
| 18 |
WebSearchTool,
|
|
|
|
| 19 |
PythonTool,
|
| 20 |
ContentProcessingError
|
| 21 |
)
|
|
@@ -43,6 +44,7 @@ class ContentTypeAgent:
|
|
| 43 |
"audio": AudioTool(),
|
| 44 |
"excel": ExcelTool(),
|
| 45 |
"web": WebSearchTool(),
|
|
|
|
| 46 |
"python": PythonTool()
|
| 47 |
}
|
| 48 |
|
|
@@ -263,7 +265,7 @@ class ContentTranslateAgent:
|
|
| 263 |
For example, if asked "What is 2+2?", respond simply with "4".
|
| 264 |
If external information is needed, respond ONLY with 'TOOLS_REQUIRED'.
|
| 265 |
|
| 266 |
-
Your
|
| 267 |
)
|
| 268 |
self.chain = (
|
| 269 |
{"question": RunnablePassthrough()}
|
|
@@ -281,7 +283,7 @@ class StateGraphAgent:
|
|
| 281 |
"""Modern implementation of MainAgent for tool orchestration"""
|
| 282 |
|
| 283 |
def __init__(self):
|
| 284 |
-
self.llm = ChatOpenAI(temperature=0
|
| 285 |
# llm = HuggingFaceEndpoint(
|
| 286 |
# repo_id="Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 287 |
# #repo_id="meta-llama/Llama-3.3-70B-Instruct",
|
|
@@ -291,7 +293,6 @@ class StateGraphAgent:
|
|
| 291 |
|
| 292 |
# self.llm = ChatHuggingFace(llm=llm, verbose=True)
|
| 293 |
|
| 294 |
-
|
| 295 |
# Initialize tools
|
| 296 |
self.wikipedia_tool = WikipediaTool()
|
| 297 |
self.web_search_tool = WebSearchTool()
|
|
@@ -300,6 +301,7 @@ class StateGraphAgent:
|
|
| 300 |
self.audio_tool = AudioTool()
|
| 301 |
self.excel_tool = ExcelTool()
|
| 302 |
self.python_tool = PythonTool()
|
|
|
|
| 303 |
|
| 304 |
# Create a dictionary of tools for easy access
|
| 305 |
self.tools = {
|
|
@@ -310,6 +312,7 @@ class StateGraphAgent:
|
|
| 310 |
"audio": self.audio_tool,
|
| 311 |
"excel": self.excel_tool,
|
| 312 |
"python": self.python_tool,
|
|
|
|
| 313 |
}
|
| 314 |
|
| 315 |
# Tool usage tracking
|
|
@@ -371,6 +374,23 @@ class StateGraphAgent:
|
|
| 371 |
"required": ["question"]
|
| 372 |
}
|
| 373 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
}
|
| 375 |
]
|
| 376 |
|
|
@@ -535,6 +555,10 @@ class StateGraphAgent:
|
|
| 535 |
query = args.get("query", "")
|
| 536 |
self.last_used_tool = "web"
|
| 537 |
result = self.web_search_tool._run(query)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
elif tool_name == "analyze_youtube":
|
| 539 |
url = args.get("url", "")
|
| 540 |
question = args.get("question", "")
|
|
@@ -585,10 +609,18 @@ class StateGraphAgent:
|
|
| 585 |
Question: {question}
|
| 586 |
|
| 587 |
If you can answer this directly (like math, text reversal, etc), provide the answer.
|
|
|
|
| 588 |
Your answer should be concise and direct. Focus only on answering the question.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
No additional words or explanations.
|
| 590 |
Format:
|
| 591 |
-
|
| 592 |
Otherwise respond with 'TOOLS_REQUIRED'."""
|
| 593 |
|
| 594 |
response = self.llm.invoke(direct_query)
|
|
@@ -598,13 +630,36 @@ class StateGraphAgent:
|
|
| 598 |
|
| 599 |
def _optimize_query(self, question):
|
| 600 |
"""Create an optimized search query for the question"""
|
| 601 |
-
query_prompt = f"""
|
|
|
|
|
|
|
| 602 |
|
| 603 |
Question: {question}
|
| 604 |
|
| 605 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 606 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
Return only the optimized search query."""
|
| 609 |
|
| 610 |
response = self.llm.invoke(query_prompt)
|
|
@@ -634,16 +689,26 @@ class StateGraphAgent:
|
|
| 634 |
content = self._execute_tool(tool_name, args)
|
| 635 |
|
| 636 |
# Generate final answer
|
| 637 |
-
answer_prompt = f"""
|
| 638 |
-
|
|
|
|
| 639 |
Question: {question}
|
| 640 |
|
| 641 |
File information: {content}
|
| 642 |
|
| 643 |
Your answer should be concise and direct. Focus only on answering the question.
|
| 644 |
No additional words or explanations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
Format:
|
| 646 |
-
|
| 647 |
"""
|
| 648 |
|
| 649 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -668,7 +733,7 @@ class StateGraphAgent:
|
|
| 668 |
|
| 669 |
No additional words or explanations.
|
| 670 |
Format:
|
| 671 |
-
|
| 672 |
"""
|
| 673 |
|
| 674 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -695,8 +760,12 @@ class StateGraphAgent:
|
|
| 695 |
Example:
|
| 696 |
Question: What is the capital of France?
|
| 697 |
Answer: Paris
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
Format:
|
| 699 |
-
|
| 700 |
"""
|
| 701 |
|
| 702 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -720,8 +789,15 @@ class StateGraphAgent:
|
|
| 720 |
If asked for a city name without abbreviations, make sure to provide the full name (e.g., "Saint Petersburg" instead of "St. Petersburg").
|
| 721 |
If asked for only a first name or a code, provide only that specific information.
|
| 722 |
No additional words or explanations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
Format:
|
| 724 |
-
|
| 725 |
"""
|
| 726 |
|
| 727 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -753,7 +829,7 @@ class StateGraphAgent:
|
|
| 753 |
Use the search_wikipedia tool to find relevant information. Be concise and direct.
|
| 754 |
No additional words or explanations.
|
| 755 |
Format:
|
| 756 |
-
|
| 757 |
"""
|
| 758 |
else:
|
| 759 |
system_prompt = f"""Answer this question using web search or other appropriate tools.
|
|
@@ -765,7 +841,7 @@ class StateGraphAgent:
|
|
| 765 |
Use the most appropriate tool to find the information needed. Be concise and direct.
|
| 766 |
No additional words or explanations.
|
| 767 |
Format:
|
| 768 |
-
|
| 769 |
"""
|
| 770 |
|
| 771 |
# Get response from tool-equipped LLM
|
|
@@ -798,7 +874,7 @@ class StateGraphAgent:
|
|
| 798 |
Question: What is the capital of France?
|
| 799 |
Answer: Paris
|
| 800 |
Format:
|
| 801 |
-
|
| 802 |
"""
|
| 803 |
|
| 804 |
final_response = self.llm.invoke(answer_prompt)
|
|
@@ -829,6 +905,7 @@ class MainAgent:
|
|
| 829 |
self.tools = {
|
| 830 |
"wiki": self.wikipedia_tool,
|
| 831 |
"web": self.web_search_tool,
|
|
|
|
| 832 |
"youtube": self.youtube_tool,
|
| 833 |
"image": self.image_tool,
|
| 834 |
"audio": self.audio_tool,
|
|
@@ -997,7 +1074,7 @@ class MainAgent:
|
|
| 997 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 998 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 999 |
|
| 1000 |
-
If the question is about searching something on the web, use the search_web tool or
|
| 1001 |
|
| 1002 |
Example:
|
| 1003 |
Question: What is the capital of France?
|
|
@@ -1269,7 +1346,7 @@ class MainAgent:
|
|
| 1269 |
Your answer should be:
|
| 1270 |
"Paris"
|
| 1271 |
Format:
|
| 1272 |
-
|
| 1273 |
"""
|
| 1274 |
|
| 1275 |
response = self.llm.invoke(query)
|
|
@@ -1292,7 +1369,6 @@ class MainAgent:
|
|
| 1292 |
# Add synthesize_answer node
|
| 1293 |
def _synthesize_answer(state):
|
| 1294 |
import re
|
| 1295 |
-
# Find the original question and the latest FunctionMessage (tool output)
|
| 1296 |
question = None
|
| 1297 |
tool_output = None
|
| 1298 |
for msg in state.messages:
|
|
@@ -1301,48 +1377,36 @@ class MainAgent:
|
|
| 1301 |
if isinstance(msg, FunctionMessage):
|
| 1302 |
tool_output = msg.content
|
| 1303 |
if not question or not tool_output:
|
| 1304 |
-
return state
|
| 1305 |
-
|
| 1306 |
-
|
| 1307 |
-
|
| 1308 |
-
|
| 1309 |
-
Context: {tool_output}
|
| 1310 |
-
|
| 1311 |
-
Question: {question}
|
| 1312 |
|
| 1313 |
Instructions:
|
| 1314 |
-
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
|
| 1319 |
-
- If the answer is not present in the context, output "NOT FOUND".
|
| 1320 |
-
|
| 1321 |
-
Examples of correct answers:
|
| 1322 |
-
Q: What is the capital of France?
|
| 1323 |
-
A: Paris
|
| 1324 |
-
|
| 1325 |
-
Q: What does Teal'c say in response to the question \"Isn't that hot?\"
|
| 1326 |
-
A: extremely
|
| 1327 |
|
| 1328 |
-
|
| 1329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1330 |
|
| 1331 |
-
|
| 1332 |
-
|
| 1333 |
-
- The final numeric output is 0.
|
| 1334 |
-
- The vegetables are: acorns, bell pepper, ...
|
| 1335 |
-
- Answer: extremely
|
| 1336 |
|
| 1337 |
-
|
| 1338 |
-
|
| 1339 |
-
<answer>
|
| 1340 |
"""
|
| 1341 |
-
|
| 1342 |
response = self.llm.invoke(answer_prompt)
|
| 1343 |
answer = response.content if hasattr(response, 'content') else str(response)
|
| 1344 |
# Remove any prefix like "Final Answer:" or "Answer:" and strip whitespace
|
| 1345 |
-
answer = re.sub(r'^(
|
| 1346 |
state.messages.append(AIMessage(content=answer))
|
| 1347 |
return state
|
| 1348 |
builder.add_node("synthesize_answer", _synthesize_answer)
|
|
|
|
| 16 |
AudioTool,
|
| 17 |
ExcelTool,
|
| 18 |
WebSearchTool,
|
| 19 |
+
ArvixSearchTool,
|
| 20 |
PythonTool,
|
| 21 |
ContentProcessingError
|
| 22 |
)
|
|
|
|
| 44 |
"audio": AudioTool(),
|
| 45 |
"excel": ExcelTool(),
|
| 46 |
"web": WebSearchTool(),
|
| 47 |
+
"arvix": ArvixSearchTool(),
|
| 48 |
"python": PythonTool()
|
| 49 |
}
|
| 50 |
|
|
|
|
| 265 |
For example, if asked "What is 2+2?", respond simply with "4".
|
| 266 |
If external information is needed, respond ONLY with 'TOOLS_REQUIRED'.
|
| 267 |
|
| 268 |
+
Your output - only the answer without any additional words (or TOOLS_REQUIRED):"""
|
| 269 |
)
|
| 270 |
self.chain = (
|
| 271 |
{"question": RunnablePassthrough()}
|
|
|
|
| 283 |
"""Modern implementation of MainAgent for tool orchestration"""
|
| 284 |
|
| 285 |
def __init__(self):
|
| 286 |
+
self.llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")
|
| 287 |
# llm = HuggingFaceEndpoint(
|
| 288 |
# repo_id="Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 289 |
# #repo_id="meta-llama/Llama-3.3-70B-Instruct",
|
|
|
|
| 293 |
|
| 294 |
# self.llm = ChatHuggingFace(llm=llm, verbose=True)
|
| 295 |
|
|
|
|
| 296 |
# Initialize tools
|
| 297 |
self.wikipedia_tool = WikipediaTool()
|
| 298 |
self.web_search_tool = WebSearchTool()
|
|
|
|
| 301 |
self.audio_tool = AudioTool()
|
| 302 |
self.excel_tool = ExcelTool()
|
| 303 |
self.python_tool = PythonTool()
|
| 304 |
+
self.arvix_tool = ArvixSearchTool()
|
| 305 |
|
| 306 |
# Create a dictionary of tools for easy access
|
| 307 |
self.tools = {
|
|
|
|
| 312 |
"audio": self.audio_tool,
|
| 313 |
"excel": self.excel_tool,
|
| 314 |
"python": self.python_tool,
|
| 315 |
+
"arvix": self.arvix_tool
|
| 316 |
}
|
| 317 |
|
| 318 |
# Tool usage tracking
|
|
|
|
| 374 |
"required": ["question"]
|
| 375 |
}
|
| 376 |
}
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"type": "function",
|
| 380 |
+
"function": {
|
| 381 |
+
"name": "search_arxiv",
|
| 382 |
+
"description": "Search Arxiv for a query and return maximum 3 results as formatted string.",
|
| 383 |
+
"parameters": {
|
| 384 |
+
"type": "object",
|
| 385 |
+
"properties": {
|
| 386 |
+
"query": {
|
| 387 |
+
"type": "string",
|
| 388 |
+
"description": "The query to search Arxiv for"
|
| 389 |
+
}
|
| 390 |
+
},
|
| 391 |
+
"required": ["query"]
|
| 392 |
+
}
|
| 393 |
+
}
|
| 394 |
}
|
| 395 |
]
|
| 396 |
|
|
|
|
| 555 |
query = args.get("query", "")
|
| 556 |
self.last_used_tool = "web"
|
| 557 |
result = self.web_search_tool._run(query)
|
| 558 |
+
elif tool_name == "search_arxiv":
|
| 559 |
+
query = args.get("query", "")
|
| 560 |
+
self.last_used_tool = "arvix"
|
| 561 |
+
result = self.arvix_tool._run(query)
|
| 562 |
elif tool_name == "analyze_youtube":
|
| 563 |
url = args.get("url", "")
|
| 564 |
question = args.get("question", "")
|
|
|
|
| 609 |
Question: {question}
|
| 610 |
|
| 611 |
If you can answer this directly (like math, text reversal, etc), provide the answer.
|
| 612 |
+
Undertand the necessary skills you need before answering the question.
|
| 613 |
Your answer should be concise and direct. Focus only on answering the question.
|
| 614 |
+
|
| 615 |
+
- RULES:
|
| 616 |
+
- Understand the context of the question first.
|
| 617 |
+
- What is the main entity of the question?
|
| 618 |
+
- What is the answer to the question?
|
| 619 |
+
- If you need to use a tool, respond with 'TOOLS_REQUIRED'.
|
| 620 |
+
|
| 621 |
No additional words or explanations.
|
| 622 |
Format:
|
| 623 |
+
Output only the answer.
|
| 624 |
Otherwise respond with 'TOOLS_REQUIRED'."""
|
| 625 |
|
| 626 |
response = self.llm.invoke(direct_query)
|
|
|
|
| 630 |
|
| 631 |
def _optimize_query(self, question):
|
| 632 |
"""Create an optimized search query for the question"""
|
| 633 |
+
query_prompt = f"""
|
| 634 |
+
You are an agent specialized in researching information on the web.
|
| 635 |
+
Your task is to read the asked question and:
|
| 636 |
|
| 637 |
Question: {question}
|
| 638 |
|
| 639 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 640 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 641 |
|
| 642 |
+
- RULES:
|
| 643 |
+
- Understand the context of the question first.
|
| 644 |
+
- What is the main entity of the question?
|
| 645 |
+
- Use only the necessary keywords to search the web.
|
| 646 |
+
- Do not include any other text or comments, just the optimized search query.
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
If the question is:
|
| 650 |
+
|
| 651 |
+
Q: What is the capital of France?
|
| 652 |
+
your optimized search query should be: capital of France
|
| 653 |
+
|
| 654 |
+
Q: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.?
|
| 655 |
+
your optimized search query should be: Mercedes Sosa Musician
|
| 656 |
+
|
| 657 |
+
Based on the examples above:
|
| 658 |
+
- Understand the context of the question first.
|
| 659 |
+
- What is the main entity, if it's an actor, musician, etc, use the role and the name of the person.
|
| 660 |
+
- Remember, you are not making a question, you are retrieving information from the web.
|
| 661 |
+
|
| 662 |
+
|
| 663 |
Return only the optimized search query."""
|
| 664 |
|
| 665 |
response = self.llm.invoke(query_prompt)
|
|
|
|
| 689 |
content = self._execute_tool(tool_name, args)
|
| 690 |
|
| 691 |
# Generate final answer
|
| 692 |
+
answer_prompt = f"""You are a data analyst. You are good at analyzing data and extracting information from files.
|
| 693 |
+
Your task is to analyze the question and answer it based on the file information.
|
| 694 |
+
|
| 695 |
Question: {question}
|
| 696 |
|
| 697 |
File information: {content}
|
| 698 |
|
| 699 |
Your answer should be concise and direct. Focus only on answering the question.
|
| 700 |
No additional words or explanations.
|
| 701 |
+
|
| 702 |
+
If the question is about a table, use the table information to answer the question.
|
| 703 |
+
Understand the table and the question first before answering.
|
| 704 |
+
Categorize the table information
|
| 705 |
+
Calculate the answer based on the table information.
|
| 706 |
+
|
| 707 |
+
If the question is related to a video, audio, or image, use the content information to answer the question.
|
| 708 |
+
Understand the context of the question first before answering.
|
| 709 |
+
If you can't answer the question based on the content, respond with 'TOOLS_REQUIRED'.
|
| 710 |
Format:
|
| 711 |
+
Output only the answer.
|
| 712 |
"""
|
| 713 |
|
| 714 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 733 |
|
| 734 |
No additional words or explanations.
|
| 735 |
Format:
|
| 736 |
+
Output only the answer.
|
| 737 |
"""
|
| 738 |
|
| 739 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 760 |
Example:
|
| 761 |
Question: What is the capital of France?
|
| 762 |
Answer: Paris
|
| 763 |
+
|
| 764 |
+
Question: How many wheels does the car have?
|
| 765 |
+
Answer: 4
|
| 766 |
+
|
| 767 |
Format:
|
| 768 |
+
Output only the answer.
|
| 769 |
"""
|
| 770 |
|
| 771 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 789 |
If asked for a city name without abbreviations, make sure to provide the full name (e.g., "Saint Petersburg" instead of "St. Petersburg").
|
| 790 |
If asked for only a first name or a code, provide only that specific information.
|
| 791 |
No additional words or explanations.
|
| 792 |
+
Example:
|
| 793 |
+
Question: What is the capital of France?
|
| 794 |
+
Answer: Paris
|
| 795 |
+
|
| 796 |
+
Question: How many wheels does the car have?
|
| 797 |
+
Answer: 4
|
| 798 |
+
|
| 799 |
Format:
|
| 800 |
+
Output only the answer.
|
| 801 |
"""
|
| 802 |
|
| 803 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 829 |
Use the search_wikipedia tool to find relevant information. Be concise and direct.
|
| 830 |
No additional words or explanations.
|
| 831 |
Format:
|
| 832 |
+
Output only the answer.
|
| 833 |
"""
|
| 834 |
else:
|
| 835 |
system_prompt = f"""Answer this question using web search or other appropriate tools.
|
|
|
|
| 841 |
Use the most appropriate tool to find the information needed. Be concise and direct.
|
| 842 |
No additional words or explanations.
|
| 843 |
Format:
|
| 844 |
+
Output only the answer.
|
| 845 |
"""
|
| 846 |
|
| 847 |
# Get response from tool-equipped LLM
|
|
|
|
| 874 |
Question: What is the capital of France?
|
| 875 |
Answer: Paris
|
| 876 |
Format:
|
| 877 |
+
Output only the answer.
|
| 878 |
"""
|
| 879 |
|
| 880 |
final_response = self.llm.invoke(answer_prompt)
|
|
|
|
| 905 |
self.tools = {
|
| 906 |
"wiki": self.wikipedia_tool,
|
| 907 |
"web": self.web_search_tool,
|
| 908 |
+
"arvix": self.arvix_search_tool,
|
| 909 |
"youtube": self.youtube_tool,
|
| 910 |
"image": self.image_tool,
|
| 911 |
"audio": self.audio_tool,
|
|
|
|
| 1074 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 1075 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 1076 |
|
| 1077 |
+
If the question is about searching something on the web, use the search_web tool, wikipedia tool or search_arxiv tool.
|
| 1078 |
|
| 1079 |
Example:
|
| 1080 |
Question: What is the capital of France?
|
|
|
|
| 1346 |
Your answer should be:
|
| 1347 |
"Paris"
|
| 1348 |
Format:
|
| 1349 |
+
Output only the answer.
|
| 1350 |
"""
|
| 1351 |
|
| 1352 |
response = self.llm.invoke(query)
|
|
|
|
| 1369 |
# Add synthesize_answer node
|
| 1370 |
def _synthesize_answer(state):
|
| 1371 |
import re
|
|
|
|
| 1372 |
question = None
|
| 1373 |
tool_output = None
|
| 1374 |
for msg in state.messages:
|
|
|
|
| 1377 |
if isinstance(msg, FunctionMessage):
|
| 1378 |
tool_output = msg.content
|
| 1379 |
if not question or not tool_output:
|
| 1380 |
+
return state
|
| 1381 |
+
answer_prompt = f"""
|
| 1382 |
+
You are a helpful AI assistant.
|
| 1383 |
+
You are given a question and some context (which may be empty or incomplete).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1384 |
|
| 1385 |
Instructions:
|
| 1386 |
+
1. Carefully read the context and the question.
|
| 1387 |
+
2. If the context contains all the information needed to answer the question, answer it directly.
|
| 1388 |
+
3. If the context is missing information, identify what is missing.
|
| 1389 |
+
4. If you need more information, request the use of the available tools (such as Wikipedia, web search, or other domain-specific tools) to find the answer.
|
| 1390 |
+
5. Once you have all the necessary information, answer the question as directly and concisely as possible, following any formatting instructions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1391 |
|
| 1392 |
+
Rules:
|
| 1393 |
+
- Do not make up information not present in the context or found via tools.
|
| 1394 |
+
- If you use a tool, state which tool you are using and why, then use it and incorporate the result.
|
| 1395 |
+
- Output only the final answer, unless specifically asked for reasoning steps.
|
| 1396 |
+
- Do not include any other text or comments, just the answer.
|
| 1397 |
+
- If the answer is a list, output only the list as requested (e.g., comma-separated, one per line, etc.).
|
| 1398 |
+
- If the answer is: how many wheels does the car have?, output only the number, not a sentence.
|
| 1399 |
|
| 1400 |
+
Context:
|
| 1401 |
+
{tool_output}
|
|
|
|
|
|
|
|
|
|
| 1402 |
|
| 1403 |
+
Question:
|
| 1404 |
+
{question}
|
|
|
|
| 1405 |
"""
|
|
|
|
| 1406 |
response = self.llm.invoke(answer_prompt)
|
| 1407 |
answer = response.content if hasattr(response, 'content') else str(response)
|
| 1408 |
# Remove any prefix like "Final Answer:" or "Answer:" and strip whitespace
|
| 1409 |
+
answer = re.sub(r'^(Final Answer:|Answer:)', '', answer, flags=re.IGNORECASE).strip()
|
| 1410 |
state.messages.append(AIMessage(content=answer))
|
| 1411 |
return state
|
| 1412 |
builder.add_node("synthesize_answer", _synthesize_answer)
|
tools.py
CHANGED
|
@@ -5,8 +5,8 @@ import requests
|
|
| 5 |
from langchain.tools import BaseTool
|
| 6 |
from langchain.schema import Document
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
-
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchResults
|
| 9 |
-
from langchain_community.document_loaders import PythonLoader
|
| 10 |
from langchain_community.utilities import WikipediaAPIWrapper, DuckDuckGoSearchAPIWrapper
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
|
@@ -99,11 +99,27 @@ class WikipediaTool(BaseTool):
|
|
| 99 |
def _run(self, question: str) -> str:
|
| 100 |
"""Search Wikipedia and return the result as a string"""
|
| 101 |
try:
|
| 102 |
-
#
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
except Exception as e:
|
| 108 |
return f"Error searching Wikipedia: {str(e)}"
|
| 109 |
|
|
@@ -593,64 +609,11 @@ class ExcelTool(BaseContentTool):
|
|
| 593 |
|
| 594 |
def _dataframe_to_text(self, df: pd.DataFrame) -> str:
|
| 595 |
"""Convert DataFrame to a readable text format optimized for LLM analysis."""
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
text_parts.append(f"Total Columns: {len(df.columns)}")
|
| 602 |
-
|
| 603 |
-
# Column information with data types
|
| 604 |
-
text_parts.append("\nColumn Information:")
|
| 605 |
-
for column in df.columns:
|
| 606 |
-
dtype = df[column].dtype
|
| 607 |
-
sample_values = ", ".join(str(x) for x in df[column].head(3).tolist())
|
| 608 |
-
text_parts.append(f"- {column} (Type: {dtype}): Sample values: {sample_values}")
|
| 609 |
-
|
| 610 |
-
# Classification hints
|
| 611 |
-
text_parts.append("\nColumn Classification Hints:")
|
| 612 |
-
|
| 613 |
-
# Identify potential category columns
|
| 614 |
-
category_cols = [col for col in df.columns if any(term in str(col).lower() for term in ['category', 'type', 'item', 'product'])]
|
| 615 |
-
if category_cols:
|
| 616 |
-
text_parts.append(f"Potential category/item columns: {', '.join(category_cols)}")
|
| 617 |
-
|
| 618 |
-
# For each category column, list unique values
|
| 619 |
-
for col in category_cols:
|
| 620 |
-
unique_vals = df[col].unique()
|
| 621 |
-
if len(unique_vals) < 20: # Only if there aren't too many
|
| 622 |
-
text_parts.append(f"Unique values in {col}: {', '.join(str(x) for x in unique_vals)}")
|
| 623 |
-
|
| 624 |
-
# Identify potential price/value columns
|
| 625 |
-
value_cols = [col for col in df.columns if any(term in str(col).lower() for term in ['price', 'cost', 'sale', 'revenue', 'amount', 'total'])]
|
| 626 |
-
if value_cols:
|
| 627 |
-
text_parts.append(f"Potential value/price columns: {', '.join(value_cols)}")
|
| 628 |
-
|
| 629 |
-
# Sum of each value column
|
| 630 |
-
for col in value_cols:
|
| 631 |
-
if pd.api.types.is_numeric_dtype(df[col]):
|
| 632 |
-
text_parts.append(f"Sum of {col}: {df[col].sum()}")
|
| 633 |
-
|
| 634 |
-
# Data sample (first 10 rows in a clean tabular format)
|
| 635 |
-
text_parts.append("\nData Sample (first 10 rows):")
|
| 636 |
-
|
| 637 |
-
# Format the DataFrame as a string table
|
| 638 |
-
sample_df = df.head(10)
|
| 639 |
-
headers = sample_df.columns.tolist()
|
| 640 |
-
rows = []
|
| 641 |
-
|
| 642 |
-
# Add header row
|
| 643 |
-
header_row = " | ".join(str(h) for h in headers)
|
| 644 |
-
rows.append(header_row)
|
| 645 |
-
rows.append("-" * len(header_row))
|
| 646 |
-
|
| 647 |
-
# Add data rows
|
| 648 |
-
for _, row in sample_df.iterrows():
|
| 649 |
-
rows.append(" | ".join(str(row[h]) for h in headers))
|
| 650 |
-
|
| 651 |
-
text_parts.append("\n".join(rows))
|
| 652 |
-
|
| 653 |
-
return "\n".join(text_parts)
|
| 654 |
|
| 655 |
def _run(self, task_id: str, question: str = "") -> List[Document]:
|
| 656 |
"""Process Excel file content and return documents with extracted information."""
|
|
@@ -692,124 +655,154 @@ class ExcelTool(BaseContentTool):
|
|
| 692 |
"""Async version of _run."""
|
| 693 |
return self._run(task_id)
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
class WebSearchTool(BaseTool):
|
| 696 |
"""Tool for web search using DuckDuckGo"""
|
| 697 |
name: str = "web_search"
|
| 698 |
description: str = "Search the web for information. Useful for questions about current events, specific facts, or topics not covered in Wikipedia."
|
| 699 |
-
search_tool: DuckDuckGoSearchResults = Field(default_factory=DuckDuckGoSearchResults)
|
|
|
|
|
|
|
| 700 |
print("WebSearchTool initialized")
|
| 701 |
-
def _extract_links_from_results(self, search_result
|
| 702 |
-
"""Extract links from search results
|
| 703 |
links = []
|
| 704 |
try:
|
| 705 |
-
#
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 716 |
except Exception as e:
|
| 717 |
print(f"Error extracting links: {str(e)}")
|
| 718 |
-
|
| 719 |
return links
|
| 720 |
-
|
| 721 |
def _is_promising_link(self, link: str, query: str) -> bool:
|
| 722 |
-
"""Determine if a link is promising based on the query"""
|
| 723 |
query_terms = set(query.lower().split())
|
| 724 |
-
|
| 725 |
-
# Exclude common non-content sites
|
| 726 |
excluded_domains = [
|
| 727 |
'youtube.com', 'facebook.com', 'twitter.com', 'instagram.com',
|
| 728 |
'pinterest.com', 'reddit.com', 'tiktok.com', 'linkedin.com'
|
| 729 |
]
|
| 730 |
-
|
| 731 |
for domain in excluded_domains:
|
| 732 |
if domain in link:
|
| 733 |
return False
|
| 734 |
-
|
| 735 |
-
# Prefer certain credible domains
|
| 736 |
preferred_domains = [
|
| 737 |
'wikipedia.org', 'britannica.com', 'scholarpedia.org',
|
| 738 |
'.edu', '.gov', '.org'
|
| 739 |
]
|
| 740 |
-
|
| 741 |
for domain in preferred_domains:
|
| 742 |
if domain in link:
|
| 743 |
return True
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
def _scrape_page_content(self, url: str) -> str:
|
| 748 |
-
"""Scrape the content of a webpage"""
|
| 749 |
try:
|
| 750 |
headers = {
|
| 751 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 752 |
}
|
| 753 |
response = requests.get(url, headers=headers, timeout=10)
|
| 754 |
response.raise_for_status()
|
| 755 |
-
|
| 756 |
-
# Check if we got HTML content
|
| 757 |
content_type = response.headers.get('Content-Type', '')
|
| 758 |
if 'text/html' not in content_type:
|
| 759 |
return ""
|
| 760 |
-
|
| 761 |
-
# Use BeautifulSoup to parse the HTML
|
| 762 |
from bs4 import BeautifulSoup
|
| 763 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 764 |
-
|
| 765 |
-
# Remove script and style elements
|
| 766 |
for script in soup(["script", "style", "nav", "footer", "header"]):
|
| 767 |
script.decompose()
|
| 768 |
-
|
| 769 |
-
# Extract text content
|
| 770 |
text = soup.get_text(separator=' ', strip=True)
|
| 771 |
-
|
| 772 |
-
# Clean up the text
|
| 773 |
lines = (line.strip() for line in text.splitlines())
|
| 774 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
| 775 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
| 776 |
-
|
| 777 |
-
# Limit the length
|
| 778 |
-
return text[:5000] # Limit to 5000 chars
|
| 779 |
-
|
| 780 |
except Exception as e:
|
| 781 |
return f"Error scraping page content {str(e)}"
|
| 782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
def _run(self, query: str) -> str:
|
| 784 |
-
"""Search the web and return results as a string"""
|
| 785 |
try:
|
| 786 |
-
# First perform the DuckDuckGo search
|
| 787 |
search_result = self.search_tool.run(query, max_results=5)
|
| 788 |
-
|
| 789 |
-
|
| 790 |
links = self._extract_links_from_results(search_result)
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
additional_content = []
|
| 794 |
processed_count = 0
|
| 795 |
-
|
| 796 |
for link in links:
|
| 797 |
if processed_count >= 3:
|
| 798 |
break
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
return combined_result
|
| 813 |
-
|
| 814 |
except Exception as e:
|
| 815 |
return f"Error searching the web: {str(e)}"
|
|
|
|
| 5 |
from langchain.tools import BaseTool
|
| 6 |
from langchain.schema import Document
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchResults, TavilySearchResults
|
| 9 |
+
from langchain_community.document_loaders import PythonLoader, ArxivLoader
|
| 10 |
from langchain_community.utilities import WikipediaAPIWrapper, DuckDuckGoSearchAPIWrapper
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
|
|
|
| 99 |
def _run(self, question: str) -> str:
|
| 100 |
"""Search Wikipedia and return the result as a string"""
|
| 101 |
try:
|
| 102 |
+
# Try with optimized query
|
| 103 |
+
results = self.wikipedia_tool.api_wrapper.run(question)
|
| 104 |
+
# results is a list of dicts with 'title', 'summary', 'content', etc.
|
| 105 |
+
formatted_results = []
|
| 106 |
+
for res in results:
|
| 107 |
+
# Skip disambiguation pages
|
| 108 |
+
if 'disambiguation' in res.get('title', '').lower():
|
| 109 |
+
continue
|
| 110 |
+
summary = res.get('summary') or res.get('content') or ''
|
| 111 |
+
if not summary:
|
| 112 |
+
continue
|
| 113 |
+
formatted_results.append(
|
| 114 |
+
f'<Document source="wikipedia" title="{res.get("title", "")}">\n{summary}\n</Document>'
|
| 115 |
+
)
|
| 116 |
+
if not formatted_results:
|
| 117 |
+
# Fallback to web search if nothing found
|
| 118 |
+
from langchain_community.tools import DuckDuckGoSearchResults
|
| 119 |
+
web = DuckDuckGoSearchResults()
|
| 120 |
+
web_result = web.run(question, max_results=2)
|
| 121 |
+
return f"<Document source=\"web_fallback\">\n{web_result}\n</Document>"
|
| 122 |
+
return "\n\n---\n\n".join(formatted_results)[:8000]
|
| 123 |
except Exception as e:
|
| 124 |
return f"Error searching Wikipedia: {str(e)}"
|
| 125 |
|
|
|
|
| 609 |
|
| 610 |
def _dataframe_to_text(self, df: pd.DataFrame) -> str:
|
| 611 |
"""Convert DataFrame to a readable text format optimized for LLM analysis."""
|
| 612 |
+
# Use to_string for a clean, tabular format
|
| 613 |
+
table_str = df.to_string(index=False)
|
| 614 |
+
print("table_str")
|
| 615 |
+
print(table_str)
|
| 616 |
+
return f"Table:\n{table_str}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
|
| 618 |
def _run(self, task_id: str, question: str = "") -> List[Document]:
|
| 619 |
"""Process Excel file content and return documents with extracted information."""
|
|
|
|
| 655 |
"""Async version of _run."""
|
| 656 |
return self._run(task_id)
|
| 657 |
|
| 658 |
+
class ArvixSearchTool(BaseTool):
    """Tool for searching Arxiv for a query and returning maximum 3 results as formatted string."""
    name: str = "arvix_search"
    description: str = "Search Arxiv for a query and return maximum 3 results as formatted string."

    def _run(self, query: str) -> str:
        """Search Arxiv for a query and return maximum 3 results as formatted string."""
        try:
            docs = ArxivLoader(query=query, load_max_docs=3).load()

            # Rank candidates by how many of the query's terms appear in
            # their text, most relevant first.
            terms = set(query.lower().split())

            def overlap(doc):
                body = (doc.page_content or "").lower()
                return sum(1 for term in terms if term in body)

            entries = []
            for doc in sorted(docs, key=overlap, reverse=True):
                meta = doc.metadata
                # ArxivLoader metadata keys vary in capitalization across
                # versions, so check both spellings for each field.
                entries.append(
                    '<Document source="arxiv" title="{}" authors="{}" year="{}" link="{}">\n{}\n</Document>'.format(
                        meta.get('Title') or meta.get('title') or '',
                        meta.get('Authors') or meta.get('authors') or '',
                        meta.get('Year') or meta.get('year') or '',
                        meta.get('Entry ID') or meta.get('entry_id') or '',
                        doc.page_content[:1200],
                    )
                )

            if not entries:
                return "No relevant arXiv results found."
            # Cap combined output so it stays within the LLM context budget.
            return "\n\n---\n\n".join(entries)[:8000]
        except Exception as e:
            return f"Error searching arXiv: {str(e)}"
|
| 689 |
+
|
| 690 |
class WebSearchTool(BaseTool):
    """Tool for web search using DuckDuckGo/Tavily.

    Runs a search, scrapes up to three promising result pages, and returns
    the raw search output augmented with the most query-relevant excerpt of
    each scraped page, wrapped in ``<Document>`` tags.
    """
    name: str = "web_search"
    description: str = "Search the web for information. Useful for questions about current events, specific facts, or topics not covered in Wikipedia."
    #search_tool: DuckDuckGoSearchResults = Field(default_factory=DuckDuckGoSearchResults)
    search_tool: TavilySearchResults = Field(default_factory=TavilySearchResults)
    # Fix: removed the class-body debug print that fired at import time.

    def _extract_links_from_results(self, search_result) -> list:
        """Extract result URLs from a search response, robust to its type.

        Handles both the legacy DuckDuckGo string format ("link: <url>, ...")
        and the Tavily-style list-of-dicts format.  Returns deduplicated
        http(s) URLs in encounter order; never raises.
        """
        links = []
        try:
            # If result is a string (old DuckDuckGo style)
            if isinstance(search_result, str):
                for part in search_result.split('link:')[1:]:
                    url = part.split(',')[0].strip()
                    if url.startswith('http') and url not in links:
                        links.append(url)
            # If result is a list of dicts (Tavily or other modern search tools)
            elif isinstance(search_result, list):
                for item in search_result:
                    if isinstance(item, dict) and 'url' in item:
                        url = item['url']
                        if url.startswith('http') and url not in links:
                            links.append(url)
        except Exception as e:
            print(f"Error extracting links: {str(e)}")
        return links

    def _is_promising_link(self, link: str, query: str) -> bool:
        """Return False for social-media links; accept everything else.

        ``query`` is part of the signature for callers but is currently
        unused; the preferred-domain check is advisory only, since the final
        fallthrough accepts any non-excluded link anyway.
        """
        excluded_domains = [
            'youtube.com', 'facebook.com', 'twitter.com', 'instagram.com',
            'pinterest.com', 'reddit.com', 'tiktok.com', 'linkedin.com'
        ]
        if any(domain in link for domain in excluded_domains):
            return False
        preferred_domains = [
            'wikipedia.org', 'britannica.com', 'scholarpedia.org',
            '.edu', '.gov', '.org'
        ]
        if any(domain in link for domain in preferred_domains):
            return True
        return True

    def _scrape_page_content(self, url: str) -> str:
        """Fetch *url* and return up to 5000 chars of whitespace-normalized text.

        Returns "" for non-HTML responses or on any error, so callers can
        treat a falsy result as "nothing scraped".
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                return ""
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop boilerplate elements that would pollute the extracted text.
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()
            text = soup.get_text(separator=' ', strip=True)
            # Normalize whitespace: split on multi-space runs so each chunk
            # is a phrase (not a single word), drop the empties.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)
            return text[:5000]
        except Exception:
            # Fix: previously returned an error message string, which is
            # truthy and was then treated as real page content by _run.
            return ""

    def _extract_most_relevant_chunk(self, content: str, query: str) -> str:
        """Return the paragraph of *content* sharing the most words with *query*.

        Falls back to the first paragraph (or "") when nothing scores.
        """
        paragraphs = content.split('\n')
        query_words = set(query.lower().split())
        best_score = 0
        best_para = paragraphs[0] if paragraphs else ""
        for para in paragraphs:
            score = sum(1 for word in query_words if word in para.lower())
            if score > best_score:
                best_score = score
                best_para = para
        return best_para

    def _get_page_title(self, url: str) -> str:
        """Fetch and return the page's <title> text, or the URL itself on failure."""
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=5)
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup.title.string.strip() if soup.title and soup.title.string else url
        except Exception:
            return url

    def _run(self, query: str) -> str:
        """Search the web and return raw results plus scraped page excerpts.

        Returns at most 10000 characters; on any failure an error-message
        string is returned instead of raising.
        """
        try:
            search_result = self.search_tool.run(query, max_results=5)
            # Fix: Tavily returns a list of dicts, which the old code tried
            # to concatenate and slice as a string.  Normalize to text once.
            search_text = search_result if isinstance(search_result, str) else str(search_result)
            links = self._extract_links_from_results(search_result)
            seen = set()
            results = []
            processed_count = 0
            for link in links:
                # Scrape at most three pages to bound latency.
                if processed_count >= 3:
                    break
                if link in seen or not self._is_promising_link(link, query):
                    continue
                seen.add(link)
                content = self._scrape_page_content(link)
                if content:
                    best_chunk = self._extract_most_relevant_chunk(content, query)
                    title = self._get_page_title(link)
                    results.append(f'<Document source="{link}" title="{title}">\n{best_chunk}\n</Document>')
                    processed_count += 1
            # Fix: removed debug prints and an unreachable duplicate return.
            if results:
                combined_result = search_text + "\n\n" + "\n\n".join(results)
                return combined_result[:10000]
            return search_text[:10000]
        except Exception as e:
            return f"Error searching the web: {str(e)}"
|