Spaces:
Build error
Build error
ernani
committed on
Commit
·
14e6771
1
Parent(s):
fb60291
improving web searching - added web scraping when the search returns links - fixed content_type identification
Browse files- manage_agents.py +68 -0
- tools.py +113 -5
manage_agents.py
CHANGED
|
@@ -292,6 +292,9 @@ class MainAgent:
|
|
| 292 |
# Create LLM with tools bound for tool-using capabilities
|
| 293 |
self.general_tools = [self.wikipedia_tool, self.web_search_tool]
|
| 294 |
self.llm_with_tools = self.llm.bind_tools(self.general_tools)
|
|
|
|
|
|
|
|
|
|
| 295 |
|
| 296 |
def _format_question(self, question: str) -> str:
|
| 297 |
"""Format the question to be more specific and clear"""
|
|
@@ -349,15 +352,20 @@ class MainAgent:
|
|
| 349 |
|
| 350 |
def process_question(self, task_id: str, question: str, file_name: str = "") -> str:
|
| 351 |
try:
|
|
|
|
|
|
|
|
|
|
| 352 |
# First check if we can answer this directly without tools
|
| 353 |
direct_answer = self.content_translate.answer_or_flag(question)
|
| 354 |
if direct_answer != "TOOLS_REQUIRED":
|
|
|
|
| 355 |
return direct_answer
|
| 356 |
|
| 357 |
# If we have a file to process, use specialized tools
|
| 358 |
if file_name:
|
| 359 |
# Identify content type based on file extension
|
| 360 |
content_type, parameter, task_id = self.content_type_agent.identify_content_type(question, file_name, task_id)
|
|
|
|
| 361 |
|
| 362 |
if content_type in self.tools:
|
| 363 |
tool = self.tools[content_type]
|
|
@@ -451,6 +459,30 @@ class MainAgent:
|
|
| 451 |
|
| 452 |
response = self.llm.invoke(audio_analysis_prompt)
|
| 453 |
return response.content if hasattr(response, 'content') else str(response)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 454 |
else:
|
| 455 |
# Even for other tools, pass the question if the method accepts it
|
| 456 |
try:
|
|
@@ -489,12 +521,45 @@ class MainAgent:
|
|
| 489 |
else:
|
| 490 |
return f"Unsupported file type: {content_type}"
|
| 491 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
# For general questions (no files), use improved search strategy
|
| 493 |
question_lower = question.lower()
|
| 494 |
answer = None
|
| 495 |
|
| 496 |
# Check for Wikipedia specific questions first
|
| 497 |
if "wikipedia" in question_lower:
|
|
|
|
| 498 |
question = self._format_question(question)
|
| 499 |
wiki_result = self.wikipedia_tool._run(question)
|
| 500 |
answer = self._generate_answer_from_context(question, wiki_result)
|
|
@@ -502,6 +567,7 @@ class MainAgent:
|
|
| 502 |
return answer
|
| 503 |
|
| 504 |
# Use general web search
|
|
|
|
| 505 |
query = self._format_question(question)
|
| 506 |
web_result = self.web_search_tool._run(query)
|
| 507 |
answer = self._generate_answer_from_context(question, web_result)
|
|
@@ -511,6 +577,7 @@ class MainAgent:
|
|
| 511 |
|
| 512 |
# If no good answer from web search, try with Wikipedia as a last resource
|
| 513 |
if "wikipedia" not in question_lower: # Only if not already tried
|
|
|
|
| 514 |
question = self._format_question(question)
|
| 515 |
wiki_result = self.wikipedia_tool._run(question)
|
| 516 |
answer = self._generate_answer_from_context(question, wiki_result)
|
|
@@ -518,6 +585,7 @@ class MainAgent:
|
|
| 518 |
return answer
|
| 519 |
|
| 520 |
# If we still don't have a good answer, use the general tools approach
|
|
|
|
| 521 |
answer = self._get_answer_using_tools(question)
|
| 522 |
|
| 523 |
return answer
|
|
|
|
| 292 |
# Create LLM with tools bound for tool-using capabilities
|
| 293 |
self.general_tools = [self.wikipedia_tool, self.web_search_tool]
|
| 294 |
self.llm_with_tools = self.llm.bind_tools(self.general_tools)
|
| 295 |
+
|
| 296 |
+
# Tool usage tracking
|
| 297 |
+
self.last_used_tool = None
|
| 298 |
|
| 299 |
def _format_question(self, question: str) -> str:
|
| 300 |
"""Format the question to be more specific and clear"""
|
|
|
|
| 352 |
|
| 353 |
def process_question(self, task_id: str, question: str, file_name: str = "") -> str:
|
| 354 |
try:
|
| 355 |
+
# Reset tool tracking
|
| 356 |
+
self.last_used_tool = None
|
| 357 |
+
|
| 358 |
# First check if we can answer this directly without tools
|
| 359 |
direct_answer = self.content_translate.answer_or_flag(question)
|
| 360 |
if direct_answer != "TOOLS_REQUIRED":
|
| 361 |
+
self.last_used_tool = "direct"
|
| 362 |
return direct_answer
|
| 363 |
|
| 364 |
# If we have a file to process, use specialized tools
|
| 365 |
if file_name:
|
| 366 |
# Identify content type based on file extension
|
| 367 |
content_type, parameter, task_id = self.content_type_agent.identify_content_type(question, file_name, task_id)
|
| 368 |
+
self.last_used_tool = content_type
|
| 369 |
|
| 370 |
if content_type in self.tools:
|
| 371 |
tool = self.tools[content_type]
|
|
|
|
| 459 |
|
| 460 |
response = self.llm.invoke(audio_analysis_prompt)
|
| 461 |
return response.content if hasattr(response, 'content') else str(response)
|
| 462 |
+
elif content_type == "youtube":
|
| 463 |
+
result = tool._run(task_id, question=question)
|
| 464 |
+
|
| 465 |
+
# Use specialized prompt for YouTube analysis
|
| 466 |
+
youtube_analysis_prompt = f"""
|
| 467 |
+
Analyze this YouTube video and provide an extremely concise answer:
|
| 468 |
+
|
| 469 |
+
Question: {question}
|
| 470 |
+
|
| 471 |
+
YouTube Video:
|
| 472 |
+
{result}
|
| 473 |
+
|
| 474 |
+
Instructions:
|
| 475 |
+
1. Pay careful attention to the specific format requested in the question
|
| 476 |
+
2. Extract only the information needed to answer the question
|
| 477 |
+
|
| 478 |
+
When answering, provide ONLY the precise answer requested.
|
| 479 |
+
Do not include explanations, steps, reasoning, or additional text.
|
| 480 |
+
Be direct and specific. GAIA benchmark requires exact matching answers.
|
| 481 |
+
For example, if asked "What is the color of the sky?", respond simply with "blue".
|
| 482 |
+
"""
|
| 483 |
+
|
| 484 |
+
response = self.llm.invoke(youtube_analysis_prompt)
|
| 485 |
+
return response.content if hasattr(response, 'content') else str(response)
|
| 486 |
else:
|
| 487 |
# Even for other tools, pass the question if the method accepts it
|
| 488 |
try:
|
|
|
|
| 521 |
else:
|
| 522 |
return f"Unsupported file type: {content_type}"
|
| 523 |
|
| 524 |
+
# For general questions (no files), check for special content types first
|
| 525 |
+
# This is important for things like YouTube URLs that don't have a file
|
| 526 |
+
content_type, parameter, _ = self.content_type_agent.identify_content_type(question, "", task_id)
|
| 527 |
+
|
| 528 |
+
# Handle YouTube URLs in general questions
|
| 529 |
+
if content_type == "youtube":
|
| 530 |
+
self.last_used_tool = "youtube"
|
| 531 |
+
youtube_url = parameter if parameter.startswith("http") else question
|
| 532 |
+
result = self.youtube_tool._run(youtube_url, question=question)
|
| 533 |
+
|
| 534 |
+
# Use specialized prompt for YouTube analysis
|
| 535 |
+
youtube_analysis_prompt = f"""
|
| 536 |
+
Analyze this YouTube video and provide an extremely concise answer:
|
| 537 |
+
|
| 538 |
+
Question: {question}
|
| 539 |
+
|
| 540 |
+
YouTube Video:
|
| 541 |
+
{result}
|
| 542 |
+
|
| 543 |
+
Instructions:
|
| 544 |
+
1. Pay careful attention to the specific format requested in the question
|
| 545 |
+
2. Extract only the information needed to answer the question
|
| 546 |
+
|
| 547 |
+
When answering, provide ONLY the precise answer requested.
|
| 548 |
+
Do not include explanations, steps, reasoning, or additional text.
|
| 549 |
+
Be direct and specific. GAIA benchmark requires exact matching answers.
|
| 550 |
+
For example, if asked "What is the color of the sky?", respond simply with "blue".
|
| 551 |
+
"""
|
| 552 |
+
|
| 553 |
+
response = self.llm.invoke(youtube_analysis_prompt)
|
| 554 |
+
return response.content if hasattr(response, 'content') else str(response)
|
| 555 |
+
|
| 556 |
# For general questions (no files), use improved search strategy
|
| 557 |
question_lower = question.lower()
|
| 558 |
answer = None
|
| 559 |
|
| 560 |
# Check for Wikipedia specific questions first
|
| 561 |
if "wikipedia" in question_lower:
|
| 562 |
+
self.last_used_tool = "wiki"
|
| 563 |
question = self._format_question(question)
|
| 564 |
wiki_result = self.wikipedia_tool._run(question)
|
| 565 |
answer = self._generate_answer_from_context(question, wiki_result)
|
|
|
|
| 567 |
return answer
|
| 568 |
|
| 569 |
# Use general web search
|
| 570 |
+
self.last_used_tool = "web"
|
| 571 |
query = self._format_question(question)
|
| 572 |
web_result = self.web_search_tool._run(query)
|
| 573 |
answer = self._generate_answer_from_context(question, web_result)
|
|
|
|
| 577 |
|
| 578 |
# If no good answer from web search, try with Wikipedia as a last resource
|
| 579 |
if "wikipedia" not in question_lower: # Only if not already tried
|
| 580 |
+
self.last_used_tool = "wiki"
|
| 581 |
question = self._format_question(question)
|
| 582 |
wiki_result = self.wikipedia_tool._run(question)
|
| 583 |
answer = self._generate_answer_from_context(question, wiki_result)
|
|
|
|
| 585 |
return answer
|
| 586 |
|
| 587 |
# If we still don't have a good answer, use the general tools approach
|
| 588 |
+
self.last_used_tool = "general"
|
| 589 |
answer = self._get_answer_using_tools(question)
|
| 590 |
|
| 591 |
return answer
|
tools.py
CHANGED
|
@@ -5,9 +5,9 @@ import requests
|
|
| 5 |
from langchain.tools import BaseTool
|
| 6 |
from langchain.schema import Document
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
-
from langchain_community.tools import WikipediaQueryRun,
|
| 9 |
from langchain_community.document_loaders import PythonLoader
|
| 10 |
-
from langchain_community.utilities import WikipediaAPIWrapper
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
| 13 |
import pandas as pd
|
|
@@ -154,6 +154,8 @@ class YouTubeVideoTool(BaseContentTool):
|
|
| 154 |
for entry in transcript_list
|
| 155 |
])
|
| 156 |
|
|
|
|
|
|
|
| 157 |
return transcript_text
|
| 158 |
|
| 159 |
except Exception as e:
|
|
@@ -696,14 +698,120 @@ class WebSearchTool(BaseTool):
|
|
| 696 |
"""Tool for web search using DuckDuckGo"""
|
| 697 |
name: str = "web_search"
|
| 698 |
description: str = "Search the web for information. Useful for questions about current events, specific facts, or topics not covered in Wikipedia."
|
| 699 |
-
search_tool:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
|
| 701 |
def _run(self, query: str) -> str:
|
| 702 |
"""Search the web and return results as a string"""
|
| 703 |
try:
|
| 704 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 705 |
|
| 706 |
-
return
|
| 707 |
|
| 708 |
except Exception as e:
|
| 709 |
return f"Error searching the web: {str(e)}"
|
|
|
|
| 5 |
from langchain.tools import BaseTool
|
| 6 |
from langchain.schema import Document
|
| 7 |
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 8 |
+
from langchain_community.tools import WikipediaQueryRun, DuckDuckGoSearchResults
|
| 9 |
from langchain_community.document_loaders import PythonLoader
|
| 10 |
+
from langchain_community.utilities import WikipediaAPIWrapper, DuckDuckGoSearchAPIWrapper
|
| 11 |
import pytube
|
| 12 |
from PIL import Image
|
| 13 |
import pandas as pd
|
|
|
|
| 154 |
for entry in transcript_list
|
| 155 |
])
|
| 156 |
|
| 157 |
+
print(f"Transcript text: {transcript_text}")
|
| 158 |
+
|
| 159 |
return transcript_text
|
| 160 |
|
| 161 |
except Exception as e:
|
|
|
|
| 698 |
"""Tool for web search using DuckDuckGo"""
|
| 699 |
name: str = "web_search"
|
| 700 |
description: str = "Search the web for information. Useful for questions about current events, specific facts, or topics not covered in Wikipedia."
|
| 701 |
+
search_tool: DuckDuckGoSearchResults = Field(default_factory=DuckDuckGoSearchResults)
|
| 702 |
+
|
| 703 |
+
def _extract_links_from_results(self, search_result: str) -> list:
|
| 704 |
+
"""Extract links from search results using string splitting"""
|
| 705 |
+
links = []
|
| 706 |
+
try:
|
| 707 |
+
# Split by 'link:' and process each part except the first one
|
| 708 |
+
parts = search_result.split('link:')
|
| 709 |
+
|
| 710 |
+
# Skip the first part (before the first 'link:')
|
| 711 |
+
for part in parts[1:]:
|
| 712 |
+
# Get the URL by splitting at the first comma
|
| 713 |
+
url = part.split(',')[0].strip()
|
| 714 |
+
if url.startswith('http'):
|
| 715 |
+
links.append(url)
|
| 716 |
+
|
| 717 |
+
# Add debug output
|
| 718 |
+
except Exception as e:
|
| 719 |
+
print(f"Error extracting links: {str(e)}")
|
| 720 |
+
|
| 721 |
+
return links
|
| 722 |
+
|
| 723 |
+
def _is_promising_link(self, link: str, query: str) -> bool:
|
| 724 |
+
"""Determine if a link is promising based on the query"""
|
| 725 |
+
query_terms = set(query.lower().split())
|
| 726 |
+
|
| 727 |
+
# Exclude common non-content sites
|
| 728 |
+
excluded_domains = [
|
| 729 |
+
'youtube.com', 'facebook.com', 'twitter.com', 'instagram.com',
|
| 730 |
+
'pinterest.com', 'reddit.com', 'tiktok.com', 'linkedin.com'
|
| 731 |
+
]
|
| 732 |
+
|
| 733 |
+
for domain in excluded_domains:
|
| 734 |
+
if domain in link:
|
| 735 |
+
return False
|
| 736 |
+
|
| 737 |
+
# Prefer certain credible domains
|
| 738 |
+
preferred_domains = [
|
| 739 |
+
'wikipedia.org', 'britannica.com', 'scholarpedia.org',
|
| 740 |
+
'.edu', '.gov', '.org'
|
| 741 |
+
]
|
| 742 |
+
|
| 743 |
+
for domain in preferred_domains:
|
| 744 |
+
if domain in link:
|
| 745 |
+
return True
|
| 746 |
+
|
| 747 |
+
return True # Default to True to allow scraping
|
| 748 |
+
|
| 749 |
+
def _scrape_page_content(self, url: str) -> str:
    """Fetch a web page and return its visible text.

    The page is requested with a browser-like User-Agent and a 10 second
    timeout. Non-HTML responses are ignored. ``script``, ``style``,
    ``nav``, ``footer`` and ``header`` elements are removed before the
    text is extracted.

    Args:
        url: Address of the page to fetch.

    Returns:
        At most 5000 characters of page text with runs of whitespace
        collapsed to single spaces. An empty string is returned when the
        response is not HTML or when fetching/parsing fails, so a caller
        testing ``if content:`` never mistakes an error message for page
        content.
    """
    try:
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Only HTML is worth parsing; media types are case-insensitive
        content_type = response.headers.get('Content-Type', '')
        if 'text/html' not in content_type.lower():
            return ""

        # Use BeautifulSoup to parse the HTML
        from bs4 import BeautifulSoup
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that carry no article text
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # Extract the text and normalise all whitespace to single spaces
        text = soup.get_text(separator=' ', strip=True)
        text = ' '.join(text.split())

        # Limit the length
        return text[:5000]  # Limit to 5000 chars

    except Exception as e:
        # Best-effort: report the failure, but hand back "no content"
        print(f"Error scraping page content {str(e)}")
        return ""
|
| 784 |
|
| 785 |
def _run(self, query: str) -> str:
    """Search the web and return the results, enriched with page text.

    Runs the search tool, extracts the result links, and scrapes
    promising links until three pages have yielded content. The scraped
    text is appended after the raw search results.

    Args:
        query: The search query.

    Returns:
        The search-results text, followed by one "Additional content from
        <link>" section per successfully scraped page. If anything raises,
        a string starting with "Error searching the web:" is returned
        instead.
    """
    try:
        # First perform the DuckDuckGo search
        # NOTE(review): confirm the search tool honours max_results when it
        # is passed through run() like this.
        search_result = self.search_tool.run(query, max_results=5)

        # Extract links from the search results
        links = self._extract_links_from_results(search_result)

        # Collect content from up to 3 promising links
        max_pages = 3
        additional_content = []

        for link in links:
            if len(additional_content) >= max_pages:
                break

            if not self._is_promising_link(link, query):
                continue

            content = self._scrape_page_content(link)

            # A failed scrape may come back as an error-message string
            # rather than an empty one; never pass that on as page content
            # or let it use up one of the page slots.
            if not content or content.startswith("Error scraping page content"):
                continue

            additional_content.append(f"Additional content from {link}:\n{content}\n")

        # Combine the search results with the additional content
        if not additional_content:
            return search_result

        return search_result + "\n\n" + "\n\n".join(additional_content)

    except Exception as e:
        return f"Error searching the web: {str(e)}"
|