Spaces:
Sleeping
Sleeping
Samuel Thomas
committed on
Commit
·
d5fffa5
1
Parent(s):
9e30ca3
ddgo debug
Browse files
tools.py
CHANGED
|
@@ -785,6 +785,7 @@ class PythonExecutorTool(BaseTool):
|
|
| 785 |
"""Async version - delegates to sync implementation."""
|
| 786 |
return self._run(file_path, run_manager)
|
| 787 |
|
|
|
|
| 788 |
class EnhancedDuckDuckGoSearchTool(BaseTool):
|
| 789 |
name: str = "enhanced_search"
|
| 790 |
description: str = (
|
|
@@ -796,13 +797,10 @@ class EnhancedDuckDuckGoSearchTool(BaseTool):
|
|
| 796 |
)
|
| 797 |
max_results: int = 3
|
| 798 |
max_chars_per_page: int = 12000
|
| 799 |
-
session: Any = None
|
| 800 |
-
|
| 801 |
|
| 802 |
-
# Use model_post_init for initialization logic in Pydantic v2+
|
| 803 |
def model_post_init(self, __context: Any) -> None:
|
| 804 |
super().model_post_init(__context)
|
| 805 |
-
# Initialize HTTP session here
|
| 806 |
self.session = requests.Session()
|
| 807 |
self.session.headers.update({
|
| 808 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
@@ -812,100 +810,70 @@ class EnhancedDuckDuckGoSearchTool(BaseTool):
|
|
| 812 |
'Connection': 'keep-alive',
|
| 813 |
'Upgrade-Insecure-Requests': '1',
|
| 814 |
})
|
| 815 |
-
|
| 816 |
-
def _search_duckduckgo(self,
|
| 817 |
"""Perform DuckDuckGo search and return results."""
|
| 818 |
try:
|
| 819 |
with DDGS() as ddgs:
|
| 820 |
-
results = list(ddgs.text(
|
| 821 |
return results
|
| 822 |
except Exception as e:
|
| 823 |
logger.error(f"DuckDuckGo search failed: {e}")
|
| 824 |
return []
|
| 825 |
-
|
| 826 |
def _extract_content_from_url(self, url: str, timeout: int = 10) -> Optional[str]:
|
| 827 |
"""Extract clean text content from a web page."""
|
| 828 |
try:
|
| 829 |
-
# Skip certain file types
|
| 830 |
if any(url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']):
|
| 831 |
return "Content type not supported for extraction"
|
| 832 |
-
|
| 833 |
response = self.session.get(url, timeout=timeout, allow_redirects=True)
|
| 834 |
response.raise_for_status()
|
| 835 |
-
|
| 836 |
-
# Check content type
|
| 837 |
content_type = response.headers.get('content-type', '').lower()
|
| 838 |
if 'text/html' not in content_type:
|
| 839 |
return "Non-HTML content detected"
|
| 840 |
-
|
| 841 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
for script in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
|
| 845 |
-
script.decompose()
|
| 846 |
-
|
| 847 |
-
# Try to find main content areas
|
| 848 |
main_content = None
|
| 849 |
-
for selector in ['main', 'article', '.content', '#content', '.post', '.entry']:
|
| 850 |
main_content = soup.select_one(selector)
|
| 851 |
if main_content:
|
| 852 |
break
|
| 853 |
-
|
| 854 |
if not main_content:
|
| 855 |
main_content = soup.find('body') or soup
|
| 856 |
-
|
| 857 |
-
# Extract text
|
| 858 |
text = main_content.get_text(separator='\n', strip=True)
|
| 859 |
-
|
| 860 |
-
# Clean up the text
|
| 861 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 862 |
text = '\n'.join(lines)
|
| 863 |
-
|
| 864 |
-
# Remove excessive whitespace
|
| 865 |
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 866 |
text = re.sub(r' {2,}', ' ', text)
|
| 867 |
-
|
| 868 |
-
# Truncate if too long
|
| 869 |
if len(text) > self.max_chars_per_page:
|
| 870 |
text = text[:self.max_chars_per_page] + "\n[Content truncated...]"
|
| 871 |
-
|
| 872 |
return text
|
| 873 |
-
|
| 874 |
except requests.exceptions.Timeout:
|
|
|
|
| 875 |
return "Page loading timed out"
|
| 876 |
except requests.exceptions.RequestException as e:
|
|
|
|
| 877 |
return f"Failed to retrieve page: {str(e)}"
|
| 878 |
except Exception as e:
|
| 879 |
logger.error(f"Content extraction failed for {url}: {e}")
|
| 880 |
return "Failed to extract content from page"
|
| 881 |
-
|
| 882 |
def _format_search_result(self, result: Dict, content: str) -> str:
|
| 883 |
"""Format a single search result with its content."""
|
| 884 |
title = result.get('title', 'No title')
|
| 885 |
url = result.get('href', 'No URL')
|
| 886 |
snippet = result.get('body', 'No snippet')
|
| 887 |
-
|
| 888 |
-
|
| 889 |
-
π **{title}**
|
| 890 |
-
URL: {url}
|
| 891 |
-
Snippet: {snippet}
|
| 892 |
-
|
| 893 |
-
π **Page Content:**
|
| 894 |
-
{content}
|
| 895 |
-
---
|
| 896 |
-
"""
|
| 897 |
-
return formatted
|
| 898 |
-
|
| 899 |
def run(self, tool_input: Union[str, Dict]) -> str:
|
| 900 |
query_str: Optional[str] = None
|
| 901 |
|
| 902 |
if isinstance(tool_input, dict):
|
| 903 |
-
# Try common keys where the actual query string might be stored
|
| 904 |
if "query" in tool_input and isinstance(tool_input["query"], str):
|
| 905 |
query_str = tool_input["query"]
|
| 906 |
elif "input" in tool_input and isinstance(tool_input["input"], str):
|
| 907 |
query_str = tool_input["input"]
|
| 908 |
-
# Add more checks if other dictionary structures are possible
|
| 909 |
else:
|
| 910 |
return "Invalid input: Dictionary received, but does not contain a recognizable string query under 'query' or 'input' keys."
|
| 911 |
elif isinstance(tool_input, str):
|
|
@@ -913,20 +881,20 @@ Snippet: {snippet}
|
|
| 913 |
else:
|
| 914 |
return f"Invalid input type: Expected a string or a dictionary, but got {type(tool_input).__name__}."
|
| 915 |
|
| 916 |
-
"""Execute the enhanced search."""
|
| 917 |
-
|
|
|
|
|
|
|
| 918 |
return "Please provide a search query."
|
| 919 |
|
| 920 |
-
|
| 921 |
-
logger.info(f"Searching for: {
|
| 922 |
|
| 923 |
-
#
|
| 924 |
-
search_results = self._search_duckduckgo(query)
|
| 925 |
|
| 926 |
if not search_results:
|
| 927 |
-
return f"No search results found for query: {
|
| 928 |
|
| 929 |
-
# Process each result and extract content
|
| 930 |
enhanced_results = []
|
| 931 |
processed_count = 0
|
| 932 |
|
|
@@ -934,42 +902,36 @@ Snippet: {snippet}
|
|
| 934 |
url = result.get('href', '')
|
| 935 |
if not url:
|
| 936 |
continue
|
| 937 |
-
|
| 938 |
logger.info(f"Processing result {i+1}: {url}")
|
| 939 |
-
|
| 940 |
-
# Extract content from the page
|
| 941 |
content = self._extract_content_from_url(url)
|
| 942 |
-
|
| 943 |
-
if content and len(content.strip()) > 50: # Only include results with substantial content
|
| 944 |
formatted_result = self._format_search_result(result, content)
|
| 945 |
enhanced_results.append(formatted_result)
|
| 946 |
processed_count += 1
|
| 947 |
-
|
| 948 |
-
# Small delay to be respectful to servers
|
| 949 |
-
time.sleep(0.5)
|
| 950 |
|
| 951 |
if not enhanced_results:
|
| 952 |
-
return f"Search completed but no content could be extracted from the pages for query: {
|
| 953 |
|
| 954 |
-
|
| 955 |
-
response = f"""π **Enhanced Search Results for: "{query}"**
|
| 956 |
Found {len(search_results)} results, successfully processed {processed_count} pages with content.
|
| 957 |
|
| 958 |
{''.join(enhanced_results)}
|
| 959 |
|
| 960 |
💡 **Summary:** Retrieved and processed content from {processed_count} web pages to provide comprehensive information about your search query.
|
| 961 |
-
"""
|
| 962 |
|
| 963 |
-
#
|
| 964 |
-
if len(response) > 12000:
|
| 965 |
response = response[:12000] + "\n[Response truncated to prevent memory issues]"
|
| 966 |
|
| 967 |
return response
|
| 968 |
|
| 969 |
-
def _run(self,
|
| 970 |
-
"""Required by BaseTool interface."""
|
| 971 |
-
|
| 972 |
-
|
|
|
|
|
|
|
| 973 |
# --- Agent State Definition ---
|
| 974 |
class AgentState(TypedDict):
|
| 975 |
messages: Annotated[List[AnyMessage], lambda x, y: x + y]
|
|
|
|
| 785 |
"""Async version - delegates to sync implementation."""
|
| 786 |
return self._run(file_path, run_manager)
|
| 787 |
|
| 788 |
+
|
| 789 |
class EnhancedDuckDuckGoSearchTool(BaseTool):
|
| 790 |
name: str = "enhanced_search"
|
| 791 |
description: str = (
|
|
|
|
| 797 |
)
|
| 798 |
max_results: int = 3
|
| 799 |
max_chars_per_page: int = 12000
|
| 800 |
+
session: Any = None
|
|
|
|
| 801 |
|
|
|
|
| 802 |
def model_post_init(self, __context: Any) -> None:
|
| 803 |
super().model_post_init(__context)
|
|
|
|
| 804 |
self.session = requests.Session()
|
| 805 |
self.session.headers.update({
|
| 806 |
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
|
|
|
|
| 810 |
'Connection': 'keep-alive',
|
| 811 |
'Upgrade-Insecure-Requests': '1',
|
| 812 |
})
|
| 813 |
+
|
| 814 |
+
def _search_duckduckgo(self, query_term: str) -> List[Dict]: # Renamed 'query' to 'query_term' for clarity
|
| 815 |
"""Perform DuckDuckGo search and return results."""
|
| 816 |
try:
|
| 817 |
with DDGS() as ddgs:
|
| 818 |
+
results = list(ddgs.text(query_term, max_results=self.max_results))
|
| 819 |
return results
|
| 820 |
except Exception as e:
|
| 821 |
logger.error(f"DuckDuckGo search failed: {e}")
|
| 822 |
return []
|
| 823 |
+
|
| 824 |
def _extract_content_from_url(self, url: str, timeout: int = 10) -> Optional[str]:
|
| 825 |
"""Extract clean text content from a web page."""
|
| 826 |
try:
|
|
|
|
| 827 |
if any(url.lower().endswith(ext) for ext in ['.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx']):
|
| 828 |
return "Content type not supported for extraction"
|
|
|
|
| 829 |
response = self.session.get(url, timeout=timeout, allow_redirects=True)
|
| 830 |
response.raise_for_status()
|
|
|
|
|
|
|
| 831 |
content_type = response.headers.get('content-type', '').lower()
|
| 832 |
if 'text/html' not in content_type:
|
| 833 |
return "Non-HTML content detected"
|
|
|
|
| 834 |
soup = BeautifulSoup(response.content, 'html.parser')
|
| 835 |
+
for script_or_style in soup(["script", "style", "nav", "header", "footer", "aside", "form"]):
|
| 836 |
+
script_or_style.decompose()
|
|
|
|
|
|
|
|
|
|
|
|
|
| 837 |
main_content = None
|
| 838 |
+
for selector in ['main', 'article', '.content', '#content', '.post', '.entry-content', '.entry']: # Added .entry-content
|
| 839 |
main_content = soup.select_one(selector)
|
| 840 |
if main_content:
|
| 841 |
break
|
|
|
|
| 842 |
if not main_content:
|
| 843 |
main_content = soup.find('body') or soup
|
|
|
|
|
|
|
| 844 |
text = main_content.get_text(separator='\n', strip=True)
|
|
|
|
|
|
|
| 845 |
lines = [line.strip() for line in text.split('\n') if line.strip()]
|
| 846 |
text = '\n'.join(lines)
|
|
|
|
|
|
|
| 847 |
text = re.sub(r'\n{3,}', '\n\n', text)
|
| 848 |
text = re.sub(r' {2,}', ' ', text)
|
|
|
|
|
|
|
| 849 |
if len(text) > self.max_chars_per_page:
|
| 850 |
text = text[:self.max_chars_per_page] + "\n[Content truncated...]"
|
|
|
|
| 851 |
return text
|
|
|
|
| 852 |
except requests.exceptions.Timeout:
|
| 853 |
+
logger.warning(f"Page loading timed out for {url}")
|
| 854 |
return "Page loading timed out"
|
| 855 |
except requests.exceptions.RequestException as e:
|
| 856 |
+
logger.warning(f"Failed to retrieve page {url}: {str(e)}")
|
| 857 |
return f"Failed to retrieve page: {str(e)}"
|
| 858 |
except Exception as e:
|
| 859 |
logger.error(f"Content extraction failed for {url}: {e}")
|
| 860 |
return "Failed to extract content from page"
|
| 861 |
+
|
| 862 |
def _format_search_result(self, result: Dict, content: str) -> str:
|
| 863 |
"""Format a single search result with its content."""
|
| 864 |
title = result.get('title', 'No title')
|
| 865 |
url = result.get('href', 'No URL')
|
| 866 |
snippet = result.get('body', 'No snippet')
|
| 867 |
+
return f"π **{title}**\nURL: {url}\nSnippet: {snippet}\n\nπ **Page Content:**\n{content}\n---\n"
|
| 868 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 869 |
def run(self, tool_input: Union[str, Dict]) -> str:
|
| 870 |
query_str: Optional[str] = None
|
| 871 |
|
| 872 |
if isinstance(tool_input, dict):
|
|
|
|
| 873 |
if "query" in tool_input and isinstance(tool_input["query"], str):
|
| 874 |
query_str = tool_input["query"]
|
| 875 |
elif "input" in tool_input and isinstance(tool_input["input"], str):
|
| 876 |
query_str = tool_input["input"]
|
|
|
|
| 877 |
else:
|
| 878 |
return "Invalid input: Dictionary received, but does not contain a recognizable string query under 'query' or 'input' keys."
|
| 879 |
elif isinstance(tool_input, str):
|
|
|
|
| 881 |
else:
|
| 882 |
return f"Invalid input type: Expected a string or a dictionary, but got {type(tool_input).__name__}."
|
| 883 |
|
| 884 |
+
# The misplaced docstring """Execute the enhanced search.""" was removed from here.
|
| 885 |
+
|
| 886 |
+
# Use query_str consistently from now on
|
| 887 |
+
if not query_str or not query_str.strip():
|
| 888 |
return "Please provide a search query."
|
| 889 |
|
| 890 |
+
query_str = query_str.strip() # Apply strip to query_str
|
| 891 |
+
logger.info(f"Searching for: {query_str}") # Use query_str
|
| 892 |
|
| 893 |
+
search_results = self._search_duckduckgo(query_str) # Use query_str
|
|
|
|
| 894 |
|
| 895 |
if not search_results:
|
| 896 |
+
return f"No search results found for query: {query_str}" # Use query_str
|
| 897 |
|
|
|
|
| 898 |
enhanced_results = []
|
| 899 |
processed_count = 0
|
| 900 |
|
|
|
|
| 902 |
url = result.get('href', '')
|
| 903 |
if not url:
|
| 904 |
continue
|
|
|
|
| 905 |
logger.info(f"Processing result {i+1}: {url}")
|
|
|
|
|
|
|
| 906 |
content = self._extract_content_from_url(url)
|
| 907 |
+
if content and len(content.strip()) > 50:
|
|
|
|
| 908 |
formatted_result = self._format_search_result(result, content)
|
| 909 |
enhanced_results.append(formatted_result)
|
| 910 |
processed_count += 1
|
| 911 |
+
time.sleep(0.5) # Consider making this configurable or adjusting based on use case
|
|
|
|
|
|
|
| 912 |
|
| 913 |
if not enhanced_results:
|
| 914 |
+
return f"Search completed but no content could be extracted from the pages for query: {query_str}" # Use query_str
|
| 915 |
|
| 916 |
+
response = f"""π **Enhanced Search Results for: "{query_str}"**
|
|
|
|
| 917 |
Found {len(search_results)} results, successfully processed {processed_count} pages with content.
|
| 918 |
|
| 919 |
{''.join(enhanced_results)}
|
| 920 |
|
| 921 |
💡 **Summary:** Retrieved and processed content from {processed_count} web pages to provide comprehensive information about your search query.
|
| 922 |
+
""" # Use query_str
|
| 923 |
|
| 924 |
+
if len(response) > 12000: # This limit is arbitrary; consider if it should relate to self.max_chars_per_page
|
|
|
|
| 925 |
response = response[:12000] + "\n[Response truncated to prevent memory issues]"
|
| 926 |
|
| 927 |
return response
|
| 928 |
|
| 929 |
+
def _run(self, query_or_tool_input: Union[str, Dict]) -> str: # Updated to reflect run's input
|
| 930 |
+
"""Required by BaseTool interface. Handles various input types."""
|
| 931 |
+
# This _run method now correctly passes the input to the run method,
|
| 932 |
+
# which is designed to handle both string and dictionary inputs.
|
| 933 |
+
return self.run(query_or_tool_input)
|
| 934 |
+
|
| 935 |
# --- Agent State Definition ---
|
| 936 |
class AgentState(TypedDict):
|
| 937 |
messages: Annotated[List[AnyMessage], lambda x, y: x + y]
|