Spaces:
Sleeping
Sleeping
ernani
commited on
Commit
·
b13a99c
1
Parent(s):
794ea68
fixing some prompts - adjusting outputs
Browse files- manage_agents.py +114 -50
- tools.py +122 -129
manage_agents.py
CHANGED
|
@@ -16,6 +16,7 @@ from tools import (
|
|
| 16 |
AudioTool,
|
| 17 |
ExcelTool,
|
| 18 |
WebSearchTool,
|
|
|
|
| 19 |
PythonTool,
|
| 20 |
ContentProcessingError
|
| 21 |
)
|
|
@@ -43,6 +44,7 @@ class ContentTypeAgent:
|
|
| 43 |
"audio": AudioTool(),
|
| 44 |
"excel": ExcelTool(),
|
| 45 |
"web": WebSearchTool(),
|
|
|
|
| 46 |
"python": PythonTool()
|
| 47 |
}
|
| 48 |
|
|
@@ -263,7 +265,7 @@ class ContentTranslateAgent:
|
|
| 263 |
For example, if asked "What is 2+2?", respond simply with "4".
|
| 264 |
If external information is needed, respond ONLY with 'TOOLS_REQUIRED'.
|
| 265 |
|
| 266 |
-
Your
|
| 267 |
)
|
| 268 |
self.chain = (
|
| 269 |
{"question": RunnablePassthrough()}
|
|
@@ -281,7 +283,7 @@ class StateGraphAgent:
|
|
| 281 |
"""Modern implementation of MainAgent for tool orchestration"""
|
| 282 |
|
| 283 |
def __init__(self):
|
| 284 |
-
self.llm = ChatOpenAI(temperature=0
|
| 285 |
# llm = HuggingFaceEndpoint(
|
| 286 |
# repo_id="Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 287 |
# #repo_id="meta-llama/Llama-3.3-70B-Instruct",
|
|
@@ -291,7 +293,6 @@ class StateGraphAgent:
|
|
| 291 |
|
| 292 |
# self.llm = ChatHuggingFace(llm=llm, verbose=True)
|
| 293 |
|
| 294 |
-
|
| 295 |
# Initialize tools
|
| 296 |
self.wikipedia_tool = WikipediaTool()
|
| 297 |
self.web_search_tool = WebSearchTool()
|
|
@@ -300,6 +301,7 @@ class StateGraphAgent:
|
|
| 300 |
self.audio_tool = AudioTool()
|
| 301 |
self.excel_tool = ExcelTool()
|
| 302 |
self.python_tool = PythonTool()
|
|
|
|
| 303 |
|
| 304 |
# Create a dictionary of tools for easy access
|
| 305 |
self.tools = {
|
|
@@ -310,6 +312,7 @@ class StateGraphAgent:
|
|
| 310 |
"audio": self.audio_tool,
|
| 311 |
"excel": self.excel_tool,
|
| 312 |
"python": self.python_tool,
|
|
|
|
| 313 |
}
|
| 314 |
|
| 315 |
# Tool usage tracking
|
|
@@ -371,6 +374,23 @@ class StateGraphAgent:
|
|
| 371 |
"required": ["question"]
|
| 372 |
}
|
| 373 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 374 |
}
|
| 375 |
]
|
| 376 |
|
|
@@ -535,6 +555,10 @@ class StateGraphAgent:
|
|
| 535 |
query = args.get("query", "")
|
| 536 |
self.last_used_tool = "web"
|
| 537 |
result = self.web_search_tool._run(query)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 538 |
elif tool_name == "analyze_youtube":
|
| 539 |
url = args.get("url", "")
|
| 540 |
question = args.get("question", "")
|
|
@@ -585,10 +609,18 @@ class StateGraphAgent:
|
|
| 585 |
Question: {question}
|
| 586 |
|
| 587 |
If you can answer this directly (like math, text reversal, etc), provide the answer.
|
|
|
|
| 588 |
Your answer should be concise and direct. Focus only on answering the question.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 589 |
No additional words or explanations.
|
| 590 |
Format:
|
| 591 |
-
|
| 592 |
Otherwise respond with 'TOOLS_REQUIRED'."""
|
| 593 |
|
| 594 |
response = self.llm.invoke(direct_query)
|
|
@@ -598,13 +630,36 @@ class StateGraphAgent:
|
|
| 598 |
|
| 599 |
def _optimize_query(self, question):
|
| 600 |
"""Create an optimized search query for the question"""
|
| 601 |
-
query_prompt = f"""
|
|
|
|
|
|
|
| 602 |
|
| 603 |
Question: {question}
|
| 604 |
|
| 605 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 606 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 607 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 608 |
Return only the optimized search query."""
|
| 609 |
|
| 610 |
response = self.llm.invoke(query_prompt)
|
|
@@ -634,16 +689,26 @@ class StateGraphAgent:
|
|
| 634 |
content = self._execute_tool(tool_name, args)
|
| 635 |
|
| 636 |
# Generate final answer
|
| 637 |
-
answer_prompt = f"""
|
| 638 |
-
|
|
|
|
| 639 |
Question: {question}
|
| 640 |
|
| 641 |
File information: {content}
|
| 642 |
|
| 643 |
Your answer should be concise and direct. Focus only on answering the question.
|
| 644 |
No additional words or explanations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 645 |
Format:
|
| 646 |
-
|
| 647 |
"""
|
| 648 |
|
| 649 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -668,7 +733,7 @@ class StateGraphAgent:
|
|
| 668 |
|
| 669 |
No additional words or explanations.
|
| 670 |
Format:
|
| 671 |
-
|
| 672 |
"""
|
| 673 |
|
| 674 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -695,8 +760,12 @@ class StateGraphAgent:
|
|
| 695 |
Example:
|
| 696 |
Question: What is the capital of France?
|
| 697 |
Answer: Paris
|
|
|
|
|
|
|
|
|
|
|
|
|
| 698 |
Format:
|
| 699 |
-
|
| 700 |
"""
|
| 701 |
|
| 702 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -720,8 +789,15 @@ class StateGraphAgent:
|
|
| 720 |
If asked for a city name without abbreviations, make sure to provide the full name (e.g., "Saint Petersburg" instead of "St. Petersburg").
|
| 721 |
If asked for only a first name or a code, provide only that specific information.
|
| 722 |
No additional words or explanations.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 723 |
Format:
|
| 724 |
-
|
| 725 |
"""
|
| 726 |
|
| 727 |
response = self.llm.invoke(answer_prompt)
|
|
@@ -753,7 +829,7 @@ class StateGraphAgent:
|
|
| 753 |
Use the search_wikipedia tool to find relevant information. Be concise and direct.
|
| 754 |
No additional words or explanations.
|
| 755 |
Format:
|
| 756 |
-
|
| 757 |
"""
|
| 758 |
else:
|
| 759 |
system_prompt = f"""Answer this question using web search or other appropriate tools.
|
|
@@ -765,7 +841,7 @@ class StateGraphAgent:
|
|
| 765 |
Use the most appropriate tool to find the information needed. Be concise and direct.
|
| 766 |
No additional words or explanations.
|
| 767 |
Format:
|
| 768 |
-
|
| 769 |
"""
|
| 770 |
|
| 771 |
# Get response from tool-equipped LLM
|
|
@@ -798,7 +874,7 @@ class StateGraphAgent:
|
|
| 798 |
Question: What is the capital of France?
|
| 799 |
Answer: Paris
|
| 800 |
Format:
|
| 801 |
-
|
| 802 |
"""
|
| 803 |
|
| 804 |
final_response = self.llm.invoke(answer_prompt)
|
|
@@ -829,6 +905,7 @@ class MainAgent:
|
|
| 829 |
self.tools = {
|
| 830 |
"wiki": self.wikipedia_tool,
|
| 831 |
"web": self.web_search_tool,
|
|
|
|
| 832 |
"youtube": self.youtube_tool,
|
| 833 |
"image": self.image_tool,
|
| 834 |
"audio": self.audio_tool,
|
|
@@ -997,7 +1074,7 @@ class MainAgent:
|
|
| 997 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 998 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 999 |
|
| 1000 |
-
If the question is about searching something on the web, use the search_web tool or
|
| 1001 |
|
| 1002 |
Example:
|
| 1003 |
Question: What is the capital of France?
|
|
@@ -1269,7 +1346,7 @@ class MainAgent:
|
|
| 1269 |
Your answer should be:
|
| 1270 |
"Paris"
|
| 1271 |
Format:
|
| 1272 |
-
|
| 1273 |
"""
|
| 1274 |
|
| 1275 |
response = self.llm.invoke(query)
|
|
@@ -1292,7 +1369,6 @@ class MainAgent:
|
|
| 1292 |
# Add synthesize_answer node
|
| 1293 |
def _synthesize_answer(state):
|
| 1294 |
import re
|
| 1295 |
-
# Find the original question and the latest FunctionMessage (tool output)
|
| 1296 |
question = None
|
| 1297 |
tool_output = None
|
| 1298 |
for msg in state.messages:
|
|
@@ -1301,48 +1377,36 @@ class MainAgent:
|
|
| 1301 |
if isinstance(msg, FunctionMessage):
|
| 1302 |
tool_output = msg.content
|
| 1303 |
if not question or not tool_output:
|
| 1304 |
-
return state
|
| 1305 |
-
|
| 1306 |
-
|
| 1307 |
-
|
| 1308 |
-
|
| 1309 |
-
Context: {tool_output}
|
| 1310 |
-
|
| 1311 |
-
Question: {question}
|
| 1312 |
|
| 1313 |
Instructions:
|
| 1314 |
-
|
| 1315 |
-
|
| 1316 |
-
|
| 1317 |
-
|
| 1318 |
-
|
| 1319 |
-
- If the answer is not present in the context, output "NOT FOUND".
|
| 1320 |
-
|
| 1321 |
-
Examples of correct answers:
|
| 1322 |
-
Q: What is the capital of France?
|
| 1323 |
-
A: Paris
|
| 1324 |
-
|
| 1325 |
-
Q: What does Teal'c say in response to the question \"Isn't that hot?\"
|
| 1326 |
-
A: extremely
|
| 1327 |
|
| 1328 |
-
|
| 1329 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1330 |
|
| 1331 |
-
|
| 1332 |
-
|
| 1333 |
-
- The final numeric output is 0.
|
| 1334 |
-
- The vegetables are: acorns, bell pepper, ...
|
| 1335 |
-
- Answer: extremely
|
| 1336 |
|
| 1337 |
-
|
| 1338 |
-
|
| 1339 |
-
<answer>
|
| 1340 |
"""
|
| 1341 |
-
|
| 1342 |
response = self.llm.invoke(answer_prompt)
|
| 1343 |
answer = response.content if hasattr(response, 'content') else str(response)
|
| 1344 |
# Remove any prefix like "Final Answer:" or "Answer:" and strip whitespace
|
| 1345 |
-
answer = re.sub(r'^(
|
| 1346 |
state.messages.append(AIMessage(content=answer))
|
| 1347 |
return state
|
| 1348 |
builder.add_node("synthesize_answer", _synthesize_answer)
|
|
|
|
| 16 |
AudioTool,
|
| 17 |
ExcelTool,
|
| 18 |
WebSearchTool,
|
| 19 |
+
ArvixSearchTool,
|
| 20 |
PythonTool,
|
| 21 |
ContentProcessingError
|
| 22 |
)
|
|
|
|
| 44 |
"audio": AudioTool(),
|
| 45 |
"excel": ExcelTool(),
|
| 46 |
"web": WebSearchTool(),
|
| 47 |
+
"arvix": ArvixSearchTool(),
|
| 48 |
"python": PythonTool()
|
| 49 |
}
|
| 50 |
|
|
|
|
| 265 |
For example, if asked "What is 2+2?", respond simply with "4".
|
| 266 |
If external information is needed, respond ONLY with 'TOOLS_REQUIRED'.
|
| 267 |
|
| 268 |
+
Your output - only the answer without any additional words (or TOOLS_REQUIRED):"""
|
| 269 |
)
|
| 270 |
self.chain = (
|
| 271 |
{"question": RunnablePassthrough()}
|
|
|
|
| 283 |
"""Modern implementation of MainAgent for tool orchestration"""
|
| 284 |
|
| 285 |
def __init__(self):
|
| 286 |
+
self.llm = ChatOpenAI(temperature=0, model="gpt-4o-mini")
|
| 287 |
# llm = HuggingFaceEndpoint(
|
| 288 |
# repo_id="Qwen/Qwen2.5-Coder-32B-Instruct",
|
| 289 |
# #repo_id="meta-llama/Llama-3.3-70B-Instruct",
|
|
|
|
| 293 |
|
| 294 |
# self.llm = ChatHuggingFace(llm=llm, verbose=True)
|
| 295 |
|
|
|
|
| 296 |
# Initialize tools
|
| 297 |
self.wikipedia_tool = WikipediaTool()
|
| 298 |
self.web_search_tool = WebSearchTool()
|
|
|
|
| 301 |
self.audio_tool = AudioTool()
|
| 302 |
self.excel_tool = ExcelTool()
|
| 303 |
self.python_tool = PythonTool()
|
| 304 |
+
self.arvix_tool = ArvixSearchTool()
|
| 305 |
|
| 306 |
# Create a dictionary of tools for easy access
|
| 307 |
self.tools = {
|
|
|
|
| 312 |
"audio": self.audio_tool,
|
| 313 |
"excel": self.excel_tool,
|
| 314 |
"python": self.python_tool,
|
| 315 |
+
"arvix": self.arvix_tool
|
| 316 |
}
|
| 317 |
|
| 318 |
# Tool usage tracking
|
|
|
|
| 374 |
"required": ["question"]
|
| 375 |
}
|
| 376 |
}
|
| 377 |
+
},
|
| 378 |
+
{
|
| 379 |
+
"type": "function",
|
| 380 |
+
"function": {
|
| 381 |
+
"name": "search_arxiv",
|
| 382 |
+
"description": "Search Arxiv for a query and return maximum 3 results as formatted string.",
|
| 383 |
+
"parameters": {
|
| 384 |
+
"type": "object",
|
| 385 |
+
"properties": {
|
| 386 |
+
"query": {
|
| 387 |
+
"type": "string",
|
| 388 |
+
"description": "The query to search Arxiv for"
|
| 389 |
+
}
|
| 390 |
+
},
|
| 391 |
+
"required": ["query"]
|
| 392 |
+
}
|
| 393 |
+
}
|
| 394 |
}
|
| 395 |
]
|
| 396 |
|
|
|
|
| 555 |
query = args.get("query", "")
|
| 556 |
self.last_used_tool = "web"
|
| 557 |
result = self.web_search_tool._run(query)
|
| 558 |
+
elif tool_name == "search_arxiv":
|
| 559 |
+
query = args.get("query", "")
|
| 560 |
+
self.last_used_tool = "arvix"
|
| 561 |
+
result = self.arvix_tool._run(query)
|
| 562 |
elif tool_name == "analyze_youtube":
|
| 563 |
url = args.get("url", "")
|
| 564 |
question = args.get("question", "")
|
|
|
|
| 609 |
Question: {question}
|
| 610 |
|
| 611 |
If you can answer this directly (like math, text reversal, etc), provide the answer.
|
| 612 |
+
Undertand the necessary skills you need before answering the question.
|
| 613 |
Your answer should be concise and direct. Focus only on answering the question.
|
| 614 |
+
|
| 615 |
+
- RULES:
|
| 616 |
+
- Understand the context of the question first.
|
| 617 |
+
- What is the main entity of the question?
|
| 618 |
+
- What is the answer to the question?
|
| 619 |
+
- If you need to use a tool, respond with 'TOOLS_REQUIRED'.
|
| 620 |
+
|
| 621 |
No additional words or explanations.
|
| 622 |
Format:
|
| 623 |
+
Output only the answer.
|
| 624 |
Otherwise respond with 'TOOLS_REQUIRED'."""
|
| 625 |
|
| 626 |
response = self.llm.invoke(direct_query)
|
|
|
|
| 630 |
|
| 631 |
def _optimize_query(self, question):
|
| 632 |
"""Create an optimized search query for the question"""
|
| 633 |
+
query_prompt = f"""
|
| 634 |
+
You are an agent specialized in researching information on the web.
|
| 635 |
+
Your task is to read the asked question and:
|
| 636 |
|
| 637 |
Question: {question}
|
| 638 |
|
| 639 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 640 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 641 |
|
| 642 |
+
- RULES:
|
| 643 |
+
- Understand the context of the question first.
|
| 644 |
+
- What is the main entity of the question?
|
| 645 |
+
- Use only the necessary keywords to search the web.
|
| 646 |
+
- Do not include any other text or comments, just the optimized search query.
|
| 647 |
+
|
| 648 |
+
|
| 649 |
+
If the question is:
|
| 650 |
+
|
| 651 |
+
Q: What is the capital of France?
|
| 652 |
+
your optimized search query should be: capital of France
|
| 653 |
+
|
| 654 |
+
Q: How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.?
|
| 655 |
+
your optimized search query should be: Mercedes Sosa Musician
|
| 656 |
+
|
| 657 |
+
Based on the examples above:
|
| 658 |
+
- Understand the context of the question first.
|
| 659 |
+
- What is the main entity, if it's an actor, musician, etc, use the role and the name of the person.
|
| 660 |
+
- Remember, you are not making a question, you are retrieving information from the web.
|
| 661 |
+
|
| 662 |
+
|
| 663 |
Return only the optimized search query."""
|
| 664 |
|
| 665 |
response = self.llm.invoke(query_prompt)
|
|
|
|
| 689 |
content = self._execute_tool(tool_name, args)
|
| 690 |
|
| 691 |
# Generate final answer
|
| 692 |
+
answer_prompt = f"""You are a data analyst. You are good at analyzing data and extracting information from files.
|
| 693 |
+
Your task is to analyze the question and answer it based on the file information.
|
| 694 |
+
|
| 695 |
Question: {question}
|
| 696 |
|
| 697 |
File information: {content}
|
| 698 |
|
| 699 |
Your answer should be concise and direct. Focus only on answering the question.
|
| 700 |
No additional words or explanations.
|
| 701 |
+
|
| 702 |
+
If the question is about a table, use the table information to answer the question.
|
| 703 |
+
Understand the table and the question first before answering.
|
| 704 |
+
Categorize the table information
|
| 705 |
+
Calculate the answer based on the table information.
|
| 706 |
+
|
| 707 |
+
If the question is related to a video, audio, or image, use the content information to answer the question.
|
| 708 |
+
Understand the context of the question first before answering.
|
| 709 |
+
If you can't answer the question based on the content, respond with 'TOOLS_REQUIRED'.
|
| 710 |
Format:
|
| 711 |
+
Output only the answer.
|
| 712 |
"""
|
| 713 |
|
| 714 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 733 |
|
| 734 |
No additional words or explanations.
|
| 735 |
Format:
|
| 736 |
+
Output only the answer.
|
| 737 |
"""
|
| 738 |
|
| 739 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 760 |
Example:
|
| 761 |
Question: What is the capital of France?
|
| 762 |
Answer: Paris
|
| 763 |
+
|
| 764 |
+
Question: How many wheels does the car have?
|
| 765 |
+
Answer: 4
|
| 766 |
+
|
| 767 |
Format:
|
| 768 |
+
Output only the answer.
|
| 769 |
"""
|
| 770 |
|
| 771 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 789 |
If asked for a city name without abbreviations, make sure to provide the full name (e.g., "Saint Petersburg" instead of "St. Petersburg").
|
| 790 |
If asked for only a first name or a code, provide only that specific information.
|
| 791 |
No additional words or explanations.
|
| 792 |
+
Example:
|
| 793 |
+
Question: What is the capital of France?
|
| 794 |
+
Answer: Paris
|
| 795 |
+
|
| 796 |
+
Question: How many wheels does the car have?
|
| 797 |
+
Answer: 4
|
| 798 |
+
|
| 799 |
Format:
|
| 800 |
+
Output only the answer.
|
| 801 |
"""
|
| 802 |
|
| 803 |
response = self.llm.invoke(answer_prompt)
|
|
|
|
| 829 |
Use the search_wikipedia tool to find relevant information. Be concise and direct.
|
| 830 |
No additional words or explanations.
|
| 831 |
Format:
|
| 832 |
+
Output only the answer.
|
| 833 |
"""
|
| 834 |
else:
|
| 835 |
system_prompt = f"""Answer this question using web search or other appropriate tools.
|
|
|
|
| 841 |
Use the most appropriate tool to find the information needed. Be concise and direct.
|
| 842 |
No additional words or explanations.
|
| 843 |
Format:
|
| 844 |
+
Output only the answer.
|
| 845 |
"""
|
| 846 |
|
| 847 |
# Get response from tool-equipped LLM
|
|
|
|
| 874 |
Question: What is the capital of France?
|
| 875 |
Answer: Paris
|
| 876 |
Format:
|
| 877 |
+
Output only the answer.
|
| 878 |
"""
|
| 879 |
|
| 880 |
final_response = self.llm.invoke(answer_prompt)
|
|
|
|
| 905 |
self.tools = {
|
| 906 |
"wiki": self.wikipedia_tool,
|
| 907 |
"web": self.web_search_tool,
|
| 908 |
+
"arvix": self.arvix_search_tool,
|
| 909 |
"youtube": self.youtube_tool,
|
| 910 |
"image": self.image_tool,
|
| 911 |
"audio": self.audio_tool,
|
|
|
|
| 1074 |
Your task is to create an optimized search query that will retrieve the most relevant information.
|
| 1075 |
Focus on extracting key entities, relationships, and constraints from the question.
|
| 1076 |
|
| 1077 |
+
If the question is about searching something on the web, use the search_web tool, wikipedia tool or search_arxiv tool.
|
| 1078 |
|
| 1079 |
Example:
|
| 1080 |
Question: What is the capital of France?
|
|
|
|
| 1346 |
Your answer should be:
|
| 1347 |
"Paris"
|
| 1348 |
Format:
|
| 1349 |
+
Output only the answer.
|
| 1350 |
"""
|
| 1351 |
|
| 1352 |
response = self.llm.invoke(query)
|
|
|
|
| 1369 |
# Add synthesize_answer node
|
| 1370 |
def _synthesize_answer(state):
|
| 1371 |
import re
|
|
|
|
| 1372 |
question = None
|
| 1373 |
tool_output = None
|
| 1374 |
for msg in state.messages:
|
|
|
|
| 1377 |
if isinstance(msg, FunctionMessage):
|
| 1378 |
tool_output = msg.content
|
| 1379 |
if not question or not tool_output:
|
| 1380 |
+
return state
|
| 1381 |
+
answer_prompt = f"""
|
| 1382 |
+
You are a helpful AI assistant.
|
| 1383 |
+
You are given a question and some context (which may be empty or incomplete).
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1384 |
|
| 1385 |
Instructions:
|
| 1386 |
+
1. Carefully read the context and the question.
|
| 1387 |
+
2. If the context contains all the information needed to answer the question, answer it directly.
|
| 1388 |
+
3. If the context is missing information, identify what is missing.
|
| 1389 |
+
4. If you need more information, request the use of the available tools (such as Wikipedia, web search, or other domain-specific tools) to find the answer.
|
| 1390 |
+
5. Once you have all the necessary information, answer the question as directly and concisely as possible, following any formatting instructions.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1391 |
|
| 1392 |
+
Rules:
|
| 1393 |
+
- Do not make up information not present in the context or found via tools.
|
| 1394 |
+
- If you use a tool, state which tool you are using and why, then use it and incorporate the result.
|
| 1395 |
+
- Output only the final answer, unless specifically asked for reasoning steps.
|
| 1396 |
+
- Do not include any other text or comments, just the answer.
|
| 1397 |
+
- If the answer is a list, output only the list as requested (e.g., comma-separated, one per line, etc.).
|
| 1398 |
+
- If the answer is: how many wheels does the car have?, output only the number, not a sentence.
|
| 1399 |
|
| 1400 |
+
Context:
|
| 1401 |
+
{tool_output}
|
|
|
|
|
|
|
|
|
|
| 1402 |
|
| 1403 |
+
Question:
|
| 1404 |
+
{question}
|
|
|
|
| 1405 |
"""
|
|
|
|
| 1406 |
response = self.llm.invoke(answer_prompt)
|
| 1407 |
answer = response.content if hasattr(response, 'content') else str(response)
|
| 1408 |
# Remove any prefix like "Final Answer:" or "Answer:" and strip whitespace
|
| 1409 |
+
answer = re.sub(r'^(Final Answer:|Answer:)', '', answer, flags=re.IGNORECASE).strip()
|
| 1410 |
state.messages.append(AIMessage(content=answer))
|
| 1411 |
return state
|
| 1412 |
builder.add_node("synthesize_answer", _synthesize_answer)
|
tools.py
CHANGED
|
@@ -5,8 +5,8 @@ import requests
|
|
| 5 |
from langchain.tools import BaseTool
|
| 6 |
from langchain.schema import Document
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
-
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchResults
|
| 9 |
-
from langchain_community.document_loaders import PythonLoader
|
| 10 |
from langchain_community.utilities import WikipediaAPIWrapper, DuckDuckGoSearchAPIWrapper
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
|
@@ -99,11 +99,27 @@ class WikipediaTool(BaseTool):
|
|
| 99 |
def _run(self, question: str) -> str:
|
| 100 |
"""Search Wikipedia and return the result as a string"""
|
| 101 |
try:
|
| 102 |
-
#
|
| 103 |
-
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
except Exception as e:
|
| 108 |
return f"Error searching Wikipedia: {str(e)}"
|
| 109 |
|
|
@@ -593,64 +609,11 @@ class ExcelTool(BaseContentTool):
|
|
| 593 |
|
| 594 |
def _dataframe_to_text(self, df: pd.DataFrame) -> str:
|
| 595 |
"""Convert DataFrame to a readable text format optimized for LLM analysis."""
|
| 596 |
-
|
| 597 |
-
|
| 598 |
-
|
| 599 |
-
|
| 600 |
-
|
| 601 |
-
text_parts.append(f"Total Columns: {len(df.columns)}")
|
| 602 |
-
|
| 603 |
-
# Column information with data types
|
| 604 |
-
text_parts.append("\nColumn Information:")
|
| 605 |
-
for column in df.columns:
|
| 606 |
-
dtype = df[column].dtype
|
| 607 |
-
sample_values = ", ".join(str(x) for x in df[column].head(3).tolist())
|
| 608 |
-
text_parts.append(f"- {column} (Type: {dtype}): Sample values: {sample_values}")
|
| 609 |
-
|
| 610 |
-
# Classification hints
|
| 611 |
-
text_parts.append("\nColumn Classification Hints:")
|
| 612 |
-
|
| 613 |
-
# Identify potential category columns
|
| 614 |
-
category_cols = [col for col in df.columns if any(term in str(col).lower() for term in ['category', 'type', 'item', 'product'])]
|
| 615 |
-
if category_cols:
|
| 616 |
-
text_parts.append(f"Potential category/item columns: {', '.join(category_cols)}")
|
| 617 |
-
|
| 618 |
-
# For each category column, list unique values
|
| 619 |
-
for col in category_cols:
|
| 620 |
-
unique_vals = df[col].unique()
|
| 621 |
-
if len(unique_vals) < 20: # Only if there aren't too many
|
| 622 |
-
text_parts.append(f"Unique values in {col}: {', '.join(str(x) for x in unique_vals)}")
|
| 623 |
-
|
| 624 |
-
# Identify potential price/value columns
|
| 625 |
-
value_cols = [col for col in df.columns if any(term in str(col).lower() for term in ['price', 'cost', 'sale', 'revenue', 'amount', 'total'])]
|
| 626 |
-
if value_cols:
|
| 627 |
-
text_parts.append(f"Potential value/price columns: {', '.join(value_cols)}")
|
| 628 |
-
|
| 629 |
-
# Sum of each value column
|
| 630 |
-
for col in value_cols:
|
| 631 |
-
if pd.api.types.is_numeric_dtype(df[col]):
|
| 632 |
-
text_parts.append(f"Sum of {col}: {df[col].sum()}")
|
| 633 |
-
|
| 634 |
-
# Data sample (first 10 rows in a clean tabular format)
|
| 635 |
-
text_parts.append("\nData Sample (first 10 rows):")
|
| 636 |
-
|
| 637 |
-
# Format the DataFrame as a string table
|
| 638 |
-
sample_df = df.head(10)
|
| 639 |
-
headers = sample_df.columns.tolist()
|
| 640 |
-
rows = []
|
| 641 |
-
|
| 642 |
-
# Add header row
|
| 643 |
-
header_row = " | ".join(str(h) for h in headers)
|
| 644 |
-
rows.append(header_row)
|
| 645 |
-
rows.append("-" * len(header_row))
|
| 646 |
-
|
| 647 |
-
# Add data rows
|
| 648 |
-
for _, row in sample_df.iterrows():
|
| 649 |
-
rows.append(" | ".join(str(row[h]) for h in headers))
|
| 650 |
-
|
| 651 |
-
text_parts.append("\n".join(rows))
|
| 652 |
-
|
| 653 |
-
return "\n".join(text_parts)
|
| 654 |
|
| 655 |
def _run(self, task_id: str, question: str = "") -> List[Document]:
|
| 656 |
"""Process Excel file content and return documents with extracted information."""
|
|
@@ -692,124 +655,154 @@ class ExcelTool(BaseContentTool):
|
|
| 692 |
"""Async version of _run."""
|
| 693 |
return self._run(task_id)
|
| 694 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 695 |
class WebSearchTool(BaseTool):
|
| 696 |
"""Tool for web search using DuckDuckGo"""
|
| 697 |
name: str = "web_search"
|
| 698 |
description: str = "Search the web for information. Useful for questions about current events, specific facts, or topics not covered in Wikipedia."
|
| 699 |
-
search_tool: DuckDuckGoSearchResults = Field(default_factory=DuckDuckGoSearchResults)
|
|
|
|
|
|
|
| 700 |
print("WebSearchTool initialized")
|
| 701 |
-
def _extract_links_from_results(self, search_result
|
| 702 |
-
"""Extract links from search results
|
| 703 |
links = []
|
| 704 |
try:
|
| 705 |
-
#
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
| 709 |
-
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 716 |
except Exception as e:
|
| 717 |
print(f"Error extracting links: {str(e)}")
|
| 718 |
-
|
| 719 |
return links
|
| 720 |
-
|
| 721 |
def _is_promising_link(self, link: str, query: str) -> bool:
|
| 722 |
-
"""Determine if a link is promising based on the query"""
|
| 723 |
query_terms = set(query.lower().split())
|
| 724 |
-
|
| 725 |
-
# Exclude common non-content sites
|
| 726 |
excluded_domains = [
|
| 727 |
'youtube.com', 'facebook.com', 'twitter.com', 'instagram.com',
|
| 728 |
'pinterest.com', 'reddit.com', 'tiktok.com', 'linkedin.com'
|
| 729 |
]
|
| 730 |
-
|
| 731 |
for domain in excluded_domains:
|
| 732 |
if domain in link:
|
| 733 |
return False
|
| 734 |
-
|
| 735 |
-
# Prefer certain credible domains
|
| 736 |
preferred_domains = [
|
| 737 |
'wikipedia.org', 'britannica.com', 'scholarpedia.org',
|
| 738 |
'.edu', '.gov', '.org'
|
| 739 |
]
|
| 740 |
-
|
| 741 |
for domain in preferred_domains:
|
| 742 |
if domain in link:
|
| 743 |
return True
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
def _scrape_page_content(self, url: str) -> str:
|
| 748 |
-
"""Scrape the content of a webpage"""
|
| 749 |
try:
|
| 750 |
headers = {
|
| 751 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
|
| 752 |
}
|
| 753 |
response = requests.get(url, headers=headers, timeout=10)
|
| 754 |
response.raise_for_status()
|
| 755 |
-
|
| 756 |
-
# Check if we got HTML content
|
| 757 |
content_type = response.headers.get('Content-Type', '')
|
| 758 |
if 'text/html' not in content_type:
|
| 759 |
return ""
|
| 760 |
-
|
| 761 |
-
# Use BeautifulSoup to parse the HTML
|
| 762 |
from bs4 import BeautifulSoup
|
| 763 |
soup = BeautifulSoup(response.text, 'html.parser')
|
| 764 |
-
|
| 765 |
-
# Remove script and style elements
|
| 766 |
for script in soup(["script", "style", "nav", "footer", "header"]):
|
| 767 |
script.decompose()
|
| 768 |
-
|
| 769 |
-
# Extract text content
|
| 770 |
text = soup.get_text(separator=' ', strip=True)
|
| 771 |
-
|
| 772 |
-
# Clean up the text
|
| 773 |
lines = (line.strip() for line in text.splitlines())
|
| 774 |
chunks = (phrase.strip() for line in lines for phrase in line.split(" "))
|
| 775 |
text = '\n'.join(chunk for chunk in chunks if chunk)
|
| 776 |
-
|
| 777 |
-
# Limit the length
|
| 778 |
-
return text[:5000] # Limit to 5000 chars
|
| 779 |
-
|
| 780 |
except Exception as e:
|
| 781 |
return f"Error scraping page content {str(e)}"
|
| 782 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 783 |
def _run(self, query: str) -> str:
|
| 784 |
-
"""Search the web and return results as a string"""
|
| 785 |
try:
|
| 786 |
-
# First perform the DuckDuckGo search
|
| 787 |
search_result = self.search_tool.run(query, max_results=5)
|
| 788 |
-
|
| 789 |
-
|
| 790 |
links = self._extract_links_from_results(search_result)
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
additional_content = []
|
| 794 |
processed_count = 0
|
| 795 |
-
|
| 796 |
for link in links:
|
| 797 |
if processed_count >= 3:
|
| 798 |
break
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
return combined_result
|
| 813 |
-
|
| 814 |
except Exception as e:
|
| 815 |
return f"Error searching the web: {str(e)}"
|
|
|
|
| 5 |
from langchain.tools import BaseTool
|
| 6 |
from langchain.schema import Document
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchResults, TavilySearchResults
|
| 9 |
+
from langchain_community.document_loaders import PythonLoader, ArxivLoader
|
| 10 |
from langchain_community.utilities import WikipediaAPIWrapper, DuckDuckGoSearchAPIWrapper
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
|
|
|
| 99 |
def _run(self, question: str) -> str:
|
| 100 |
"""Search Wikipedia and return the result as a string"""
|
| 101 |
try:
|
| 102 |
+
# Try with optimized query
|
| 103 |
+
results = self.wikipedia_tool.api_wrapper.run(question)
|
| 104 |
+
# results is a list of dicts with 'title', 'summary', 'content', etc.
|
| 105 |
+
formatted_results = []
|
| 106 |
+
for res in results:
|
| 107 |
+
# Skip disambiguation pages
|
| 108 |
+
if 'disambiguation' in res.get('title', '').lower():
|
| 109 |
+
continue
|
| 110 |
+
summary = res.get('summary') or res.get('content') or ''
|
| 111 |
+
if not summary:
|
| 112 |
+
continue
|
| 113 |
+
formatted_results.append(
|
| 114 |
+
f'<Document source="wikipedia" title="{res.get("title", "")}">\n{summary}\n</Document>'
|
| 115 |
+
)
|
| 116 |
+
if not formatted_results:
|
| 117 |
+
# Fallback to web search if nothing found
|
| 118 |
+
from langchain_community.tools import DuckDuckGoSearchResults
|
| 119 |
+
web = DuckDuckGoSearchResults()
|
| 120 |
+
web_result = web.run(question, max_results=2)
|
| 121 |
+
return f"<Document source=\"web_fallback\">\n{web_result}\n</Document>"
|
| 122 |
+
return "\n\n---\n\n".join(formatted_results)[:8000]
|
| 123 |
except Exception as e:
|
| 124 |
return f"Error searching Wikipedia: {str(e)}"
|
| 125 |
|
|
|
|
| 609 |
|
| 610 |
def _dataframe_to_text(self, df: pd.DataFrame) -> str:
|
| 611 |
"""Convert DataFrame to a readable text format optimized for LLM analysis."""
|
| 612 |
+
# Use to_string for a clean, tabular format
|
| 613 |
+
table_str = df.to_string(index=False)
|
| 614 |
+
print("table_str")
|
| 615 |
+
print(table_str)
|
| 616 |
+
return f"Table:\n{table_str}"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 617 |
|
| 618 |
def _run(self, task_id: str, question: str = "") -> List[Document]:
|
| 619 |
"""Process Excel file content and return documents with extracted information."""
|
|
|
|
| 655 |
"""Async version of _run."""
|
| 656 |
return self._run(task_id)
|
| 657 |
|
| 658 |
+
class ArvixSearchTool(BaseTool):
    """Tool for searching Arxiv for a query and returning maximum 3 results as formatted string."""
    name: str = "arvix_search"
    description: str = "Search Arxiv for a query and return maximum 3 results as formatted string."

    def _run(self, query: str) -> str:
        """Search Arxiv for a query and return maximum 3 results as formatted string."""
        try:
            docs = ArxivLoader(query=query, load_max_docs=3).load()

            # Rank candidates by how many of the query's terms appear in
            # their text, most relevant first.
            terms = set(query.lower().split())

            def overlap(doc):
                body = (doc.page_content or "").lower()
                return sum(1 for term in terms if term in body)

            entries = []
            for doc in sorted(docs, key=overlap, reverse=True):
                meta = doc.metadata
                # ArxivLoader metadata keys vary in capitalization across
                # versions, so check both spellings for each field.
                entries.append(
                    '<Document source="arxiv" title="{}" authors="{}" year="{}" link="{}">\n{}\n</Document>'.format(
                        meta.get('Title') or meta.get('title') or '',
                        meta.get('Authors') or meta.get('authors') or '',
                        meta.get('Year') or meta.get('year') or '',
                        meta.get('Entry ID') or meta.get('entry_id') or '',
                        doc.page_content[:1200],
                    )
                )

            if not entries:
                return "No relevant arXiv results found."
            # Cap combined output so it stays within the LLM context budget.
            return "\n\n---\n\n".join(entries)[:8000]
        except Exception as e:
            return f"Error searching arXiv: {str(e)}"
|
| 689 |
+
|
| 690 |
class WebSearchTool(BaseTool):
    """Tool for web search using DuckDuckGo/Tavily.

    Runs a search, scrapes up to three promising result pages, and returns
    the raw search output augmented with the most query-relevant excerpt of
    each scraped page, wrapped in ``<Document>`` tags.
    """
    name: str = "web_search"
    description: str = "Search the web for information. Useful for questions about current events, specific facts, or topics not covered in Wikipedia."
    #search_tool: DuckDuckGoSearchResults = Field(default_factory=DuckDuckGoSearchResults)
    search_tool: TavilySearchResults = Field(default_factory=TavilySearchResults)
    # Fix: removed the class-body debug print that fired at import time.

    def _extract_links_from_results(self, search_result) -> list:
        """Extract result URLs from a search response, robust to its type.

        Handles both the legacy DuckDuckGo string format ("link: <url>, ...")
        and the Tavily-style list-of-dicts format.  Returns deduplicated
        http(s) URLs in encounter order; never raises.
        """
        links = []
        try:
            # If result is a string (old DuckDuckGo style)
            if isinstance(search_result, str):
                for part in search_result.split('link:')[1:]:
                    url = part.split(',')[0].strip()
                    if url.startswith('http') and url not in links:
                        links.append(url)
            # If result is a list of dicts (Tavily or other modern search tools)
            elif isinstance(search_result, list):
                for item in search_result:
                    if isinstance(item, dict) and 'url' in item:
                        url = item['url']
                        if url.startswith('http') and url not in links:
                            links.append(url)
        except Exception as e:
            print(f"Error extracting links: {str(e)}")
        return links

    def _is_promising_link(self, link: str, query: str) -> bool:
        """Return False for social-media links; accept everything else.

        ``query`` is part of the signature for callers but is currently
        unused; the preferred-domain check is advisory only, since the final
        fallthrough accepts any non-excluded link anyway.
        """
        excluded_domains = [
            'youtube.com', 'facebook.com', 'twitter.com', 'instagram.com',
            'pinterest.com', 'reddit.com', 'tiktok.com', 'linkedin.com'
        ]
        if any(domain in link for domain in excluded_domains):
            return False
        preferred_domains = [
            'wikipedia.org', 'britannica.com', 'scholarpedia.org',
            '.edu', '.gov', '.org'
        ]
        if any(domain in link for domain in preferred_domains):
            return True
        return True

    def _scrape_page_content(self, url: str) -> str:
        """Fetch *url* and return up to 5000 chars of whitespace-normalized text.

        Returns "" for non-HTML responses or on any error, so callers can
        treat a falsy result as "nothing scraped".
        """
        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            }
            response = requests.get(url, headers=headers, timeout=10)
            response.raise_for_status()
            content_type = response.headers.get('Content-Type', '')
            if 'text/html' not in content_type:
                return ""
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            # Drop boilerplate elements that would pollute the extracted text.
            for element in soup(["script", "style", "nav", "footer", "header"]):
                element.decompose()
            text = soup.get_text(separator=' ', strip=True)
            # Normalize whitespace: split on multi-space runs so each chunk
            # is a phrase (not a single word), drop the empties.
            lines = (line.strip() for line in text.splitlines())
            chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
            text = '\n'.join(chunk for chunk in chunks if chunk)
            return text[:5000]
        except Exception:
            # Fix: previously returned an error message string, which is
            # truthy and was then treated as real page content by _run.
            return ""

    def _extract_most_relevant_chunk(self, content: str, query: str) -> str:
        """Return the paragraph of *content* sharing the most words with *query*.

        Falls back to the first paragraph (or "") when nothing scores.
        """
        paragraphs = content.split('\n')
        query_words = set(query.lower().split())
        best_score = 0
        best_para = paragraphs[0] if paragraphs else ""
        for para in paragraphs:
            score = sum(1 for word in query_words if word in para.lower())
            if score > best_score:
                best_score = score
                best_para = para
        return best_para

    def _get_page_title(self, url: str) -> str:
        """Fetch and return the page's <title> text, or the URL itself on failure."""
        try:
            headers = {'User-Agent': 'Mozilla/5.0'}
            response = requests.get(url, headers=headers, timeout=5)
            from bs4 import BeautifulSoup
            soup = BeautifulSoup(response.text, 'html.parser')
            return soup.title.string.strip() if soup.title and soup.title.string else url
        except Exception:
            return url

    def _run(self, query: str) -> str:
        """Search the web and return raw results plus scraped page excerpts.

        Returns at most 10000 characters; on any failure an error-message
        string is returned instead of raising.
        """
        try:
            search_result = self.search_tool.run(query, max_results=5)
            # Fix: Tavily returns a list of dicts, which the old code tried
            # to concatenate and slice as a string.  Normalize to text once.
            search_text = search_result if isinstance(search_result, str) else str(search_result)
            links = self._extract_links_from_results(search_result)
            seen = set()
            results = []
            processed_count = 0
            for link in links:
                # Scrape at most three pages to bound latency.
                if processed_count >= 3:
                    break
                if link in seen or not self._is_promising_link(link, query):
                    continue
                seen.add(link)
                content = self._scrape_page_content(link)
                if content:
                    best_chunk = self._extract_most_relevant_chunk(content, query)
                    title = self._get_page_title(link)
                    results.append(f'<Document source="{link}" title="{title}">\n{best_chunk}\n</Document>')
                    processed_count += 1
            # Fix: removed debug prints and an unreachable duplicate return.
            if results:
                combined_result = search_text + "\n\n" + "\n\n".join(results)
                return combined_result[:10000]
            return search_text[:10000]
        except Exception as e:
            return f"Error searching the web: {str(e)}"
|