Spaces:
Sleeping
Sleeping
Improved search with url scraping
Browse files- agent.py +173 -3
- requirements.txt +3 -1
agent.py
CHANGED
|
@@ -16,8 +16,10 @@ import random
|
|
| 16 |
import json
|
| 17 |
import re
|
| 18 |
import requests
|
| 19 |
-
from urllib.parse import quote
|
| 20 |
import sys
|
|
|
|
|
|
|
| 21 |
|
| 22 |
from apify_client import ApifyClient
|
| 23 |
|
|
@@ -165,7 +167,104 @@ def apify_google_search(query: str, limit: int = 10) -> str:
|
|
| 165 |
except Exception as e:
|
| 166 |
print(f"Error using Apify: {str(e)}")
|
| 167 |
return fallback_search(query)
|
| 168 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 169 |
def format_search_results(results: List[Dict], query: str) -> str:
|
| 170 |
"""Format the search results into a readable string"""
|
| 171 |
if not results or len(results) == 0:
|
|
@@ -269,6 +368,8 @@ SYSTEM_PROMPT = """Answer the following questions as best you can. DO NOT rely o
|
|
| 269 |
|
| 270 |
web_search: Search the web for current information. Provide a specific search query.
|
| 271 |
python_code: Execute Python code. Provide the complete Python code as a string. Use this tool to calculate math problems.
|
|
|
|
|
|
|
| 272 |
|
| 273 |
The way you use the tools is by specifying a json blob.
|
| 274 |
Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
|
|
@@ -276,6 +377,7 @@ Specifically, this json should have an `action` key (with the name of the tool t
|
|
| 276 |
The only values that should be in the "action" field are:
|
| 277 |
web_search: Search the web for current information, args: {"query": {"type": "string"}}
|
| 278 |
python_code: Execute Python code, args: {"code": {"type": "string"}}
|
|
|
|
| 279 |
|
| 280 |
IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
|
| 281 |
|
|
@@ -288,6 +390,15 @@ example use:
|
|
| 288 |
}
|
| 289 |
```
|
| 290 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
|
| 292 |
|
| 293 |
Question: [the user's question]
|
|
@@ -323,6 +434,8 @@ IMPORTANT: You MUST strictly follow the ReAct pattern (Reasoning, Action, Observ
|
|
| 323 |
4. Based on the observation, continue with another thought
|
| 324 |
5. This cycle repeats until you have enough information to provide a final answer
|
| 325 |
|
|
|
|
|
|
|
| 326 |
... (this Thought/Action/Observation cycle can repeat as needed) ...
|
| 327 |
|
| 328 |
Thought: I now know the final answer
|
|
@@ -350,6 +463,11 @@ tools_config = [
|
|
| 350 |
"name": "python_code",
|
| 351 |
"description": "Execute Python code. Provide the complete Python code as a string in the format: {\"code\": \"your python code here\"}",
|
| 352 |
"func": run_python_code
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 353 |
}
|
| 354 |
]
|
| 355 |
|
|
@@ -361,6 +479,7 @@ chat_with_tools = chat
|
|
| 361 |
class ActionInput(TypedDict, total=False):
|
| 362 |
query: Optional[str]
|
| 363 |
code: Optional[str]
|
|
|
|
| 364 |
|
| 365 |
class AgentState(TypedDict, total=False):
|
| 366 |
messages: Annotated[list[AnyMessage], add_messages]
|
|
@@ -577,6 +696,51 @@ def python_code_node(state: AgentState) -> Dict[str, Any]:
|
|
| 577 |
"action_input": None # Clear the action input
|
| 578 |
}
|
| 579 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 580 |
# Router function to direct to the correct tool
|
| 581 |
def router(state: AgentState) -> str:
|
| 582 |
"""Route to the appropriate tool based on the current_tool field."""
|
|
@@ -589,6 +753,8 @@ def router(state: AgentState) -> str:
|
|
| 589 |
return "web_search"
|
| 590 |
elif tool == "python_code":
|
| 591 |
return "python_code"
|
|
|
|
|
|
|
| 592 |
else:
|
| 593 |
return "end"
|
| 594 |
|
|
@@ -601,6 +767,7 @@ def create_agent_graph() -> StateGraph:
|
|
| 601 |
builder.add_node("assistant", assistant)
|
| 602 |
builder.add_node("web_search", web_search_node)
|
| 603 |
builder.add_node("python_code", python_code_node)
|
|
|
|
| 604 |
|
| 605 |
# Define edges: these determine how the control flow moves
|
| 606 |
builder.add_edge(START, "assistant")
|
|
@@ -627,6 +794,7 @@ def create_agent_graph() -> StateGraph:
|
|
| 627 |
{
|
| 628 |
"web_search": "web_search",
|
| 629 |
"python_code": "python_code",
|
|
|
|
| 630 |
"end": END
|
| 631 |
}
|
| 632 |
)
|
|
@@ -634,6 +802,7 @@ def create_agent_graph() -> StateGraph:
|
|
| 634 |
# Tools always go back to assistant
|
| 635 |
builder.add_edge("web_search", "assistant")
|
| 636 |
builder.add_edge("python_code", "assistant")
|
|
|
|
| 637 |
|
| 638 |
# Compile with a reasonable recursion limit to prevent infinite loops
|
| 639 |
return builder.compile()
|
|
@@ -691,4 +860,5 @@ if __name__ == "__main__":
|
|
| 691 |
agent = TurboNerd(max_execution_time=60)
|
| 692 |
response = agent("How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.")
|
| 693 |
print("\nFinal Response:")
|
| 694 |
-
print(response)
|
|
|
|
|
|
| 16 |
import json
|
| 17 |
import re
|
| 18 |
import requests
|
| 19 |
+
from urllib.parse import quote, urlparse
|
| 20 |
import sys
|
| 21 |
+
from bs4 import BeautifulSoup
|
| 22 |
+
import html2text
|
| 23 |
|
| 24 |
from apify_client import ApifyClient
|
| 25 |
|
|
|
|
| 167 |
except Exception as e:
|
| 168 |
print(f"Error using Apify: {str(e)}")
|
| 169 |
return fallback_search(query)
|
| 170 |
+
def scrape_webpage(url: str, max_content_length: int = 10000) -> str:
    """
    Safely scrape content from a specified URL.

    Args:
        url: The URL to scrape (must include an http:// or https:// scheme).
        max_content_length: Maximum number of characters of extracted page
            text to return; longer content is truncated with a marker so the
            model is not overwhelmed.

    Returns:
        Formatted webpage content as text ("Title: ...\\nURL: ...\\n\\n<body>"),
        or an "Error: ..." string describing the failure.
    """
    try:
        # Parse the URL to validate it before making any request.
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Error: Invalid URL format: {url}. Please provide a valid URL with http:// or https:// prefix."

        # Block potentially dangerous internal/local URLs (basic SSRF guard).
        # NOTE(review): this is a substring check on the netloc — best-effort
        # only; it can be bypassed (e.g. decimal-encoded IPs, redirects) and
        # may also match unrelated hostnames containing these fragments.
        blocked_domains = [
            "localhost", "127.0.0.1", "0.0.0.0",
            "192.168.", "10.0.", "172.16.", "172.17.", "172.18.", "172.19.", "172.20.",
            "172.21.", "172.22.", "172.23.", "172.24.", "172.25.", "172.26.", "172.27.",
            "172.28.", "172.29.", "172.30.", "172.31."
        ]
        if any(domain in parsed_url.netloc for domain in blocked_domains):
            return f"Error: Access to internal/local URLs is blocked for security: {url}"

        print(f"Scraping URL: {url}")

        # Browser-like headers so simple bot filters don't reject the request.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

        # Set a reasonable timeout to avoid hanging the agent on slow sites.
        timeout = 10

        response = requests.get(url, headers=headers, timeout=timeout)

        if response.status_code != 200:
            return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"

        # Parse the HTML and drop elements that carry no article content.
        soup = BeautifulSoup(response.text, 'html.parser')
        for script_or_style in soup(["script", "style", "iframe", "footer", "nav"]):
            script_or_style.decompose()

        title = soup.title.string if soup.title else "No title found"

        # Prefer a dedicated main-content container; fall back to <body>.
        main_content = soup.find('main') or soup.find('article') or soup.find(id='content') or soup.find(class_='content')
        if not main_content:
            main_content = soup.body

        # Convert HTML to markdown-style plain text.
        h = html2text.HTML2Text()
        h.ignore_links = False
        h.ignore_images = True
        h.ignore_tables = False
        h.unicode_snob = True

        if main_content:
            text_content = h.handle(str(main_content))
        else:
            text_content = h.handle(response.text)

        # BUG FIX: the previous cap (99999999999) effectively disabled
        # truncation despite the stated intent; enforce a real, configurable
        # limit so huge pages don't overwhelm the model.
        if len(text_content) > max_content_length:
            text_content = text_content[:max_content_length] + "\n\n[Content truncated due to length...]"

        return f"Title: {title}\nURL: {url}\n\n{text_content}"

    except requests.exceptions.Timeout:
        return f"Error: Request timed out while trying to access {url}"
    except requests.exceptions.ConnectionError:
        return f"Error: Failed to connect to {url}. The site might be down or the URL might be incorrect."
    except requests.exceptions.RequestException as e:
        return f"Error requesting {url}: {str(e)}"
    except Exception as e:
        return f"Error scraping webpage {url}: {str(e)}"
|
| 268 |
def format_search_results(results: List[Dict], query: str) -> str:
|
| 269 |
"""Format the search results into a readable string"""
|
| 270 |
if not results or len(results) == 0:
|
|
|
|
| 368 |
|
| 369 |
web_search: Search the web for current information. Provide a specific search query.
|
| 370 |
python_code: Execute Python code. Provide the complete Python code as a string. Use this tool to calculate math problems.
|
| 371 |
+
webpage_scrape: Scrape content from a specific webpage URL. Provide a valid URL to extract information from a particular web page.
|
| 372 |
+
|
| 373 |
|
| 374 |
The way you use the tools is by specifying a json blob.
|
| 375 |
Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
|
|
|
|
| 377 |
The only values that should be in the "action" field are:
|
| 378 |
web_search: Search the web for current information, args: {"query": {"type": "string"}}
|
| 379 |
python_code: Execute Python code, args: {"code": {"type": "string"}}
|
| 380 |
+
webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
|
| 381 |
|
| 382 |
IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
|
| 383 |
|
|
|
|
| 390 |
}
|
| 391 |
```
|
| 392 |
|
| 393 |
+
Or for scraping a webpage:
|
| 394 |
+
|
| 395 |
+
```json
|
| 396 |
+
{
|
| 397 |
+
"action": "webpage_scrape",
|
| 398 |
+
"action_input": {"url": "https://en.wikipedia.org/wiki/Artificial_intelligence"}
|
| 399 |
+
}
|
| 400 |
+
```
|
| 401 |
+
|
| 402 |
ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
|
| 403 |
|
| 404 |
Question: [the user's question]
|
|
|
|
| 434 |
4. Based on the observation, continue with another thought
|
| 435 |
5. This cycle repeats until you have enough information to provide a final answer
|
| 436 |
|
| 437 |
+
NEVER fake or simulate tool output yourself.
|
| 438 |
+
|
| 439 |
... (this Thought/Action/Observation cycle can repeat as needed) ...
|
| 440 |
|
| 441 |
Thought: I now know the final answer
|
|
|
|
| 463 |
"name": "python_code",
|
| 464 |
"description": "Execute Python code. Provide the complete Python code as a string in the format: {\"code\": \"your python code here\"}",
|
| 465 |
"func": run_python_code
|
| 466 |
+
},
|
| 467 |
+
{
|
| 468 |
+
"name": "webpage_scrape",
|
| 469 |
+
"description": "Scrape content from a specific webpage URL. Provide a valid URL in the format: {\"url\": \"https://example.com\"}",
|
| 470 |
+
"func": scrape_webpage
|
| 471 |
}
|
| 472 |
]
|
| 473 |
|
|
|
|
| 479 |
class ActionInput(TypedDict, total=False):
|
| 480 |
query: Optional[str]
|
| 481 |
code: Optional[str]
|
| 482 |
+
url: Optional[str]
|
| 483 |
|
| 484 |
class AgentState(TypedDict, total=False):
|
| 485 |
messages: Annotated[list[AnyMessage], add_messages]
|
|
|
|
| 696 |
"action_input": None # Clear the action input
|
| 697 |
}
|
| 698 |
|
| 699 |
+
def webpage_scrape_node(state: AgentState) -> Dict[str, Any]:
    """Graph node that runs the webpage scraper and feeds the result back
    into the conversation as an "Observation:" message for the ReAct loop."""
    print("Webpage Scrape Tool Called...\n\n")

    raw_input = state.get("action_input", {})
    print(f"Webpage scrape action_input: {raw_input}")

    # The URL may arrive wrapped in {"url": ...} or as a bare string.
    if isinstance(raw_input, dict):
        target_url = raw_input.get("url", "")
    elif isinstance(raw_input, str):
        target_url = raw_input
    else:
        target_url = ""

    print(f"Scraping URL: '{target_url}'")

    # Guard against an empty URL before invoking the scraper.
    scrape_result = (
        scrape_webpage(target_url)
        if target_url
        else "Error: No URL provided. Please provide a valid URL to scrape."
    )

    print(f"Scraping result length: {len(scrape_result)}")

    # Wrap the tool output with the "Observation:" prefix expected by the
    # ReAct prompt format.
    observation = AIMessage(content=f"Observation: {scrape_result.strip()}")

    # Echo a (possibly truncated) preview of the observation for debugging.
    print("\n=== TOOL OBSERVATION ===")
    preview = observation.content[:500] + "..." if len(observation.content) > 500 else observation.content
    print(preview)
    print("=== END OBSERVATION ===\n")

    return {
        "messages": state["messages"] + [observation],
        "current_tool": None,   # reset so the router returns to the assistant
        "action_input": None,   # clear the consumed arguments
    }
|
| 743 |
+
|
| 744 |
# Router function to direct to the correct tool
|
| 745 |
def router(state: AgentState) -> str:
|
| 746 |
"""Route to the appropriate tool based on the current_tool field."""
|
|
|
|
| 753 |
return "web_search"
|
| 754 |
elif tool == "python_code":
|
| 755 |
return "python_code"
|
| 756 |
+
elif tool == "webpage_scrape":
|
| 757 |
+
return "webpage_scrape"
|
| 758 |
else:
|
| 759 |
return "end"
|
| 760 |
|
|
|
|
| 767 |
builder.add_node("assistant", assistant)
|
| 768 |
builder.add_node("web_search", web_search_node)
|
| 769 |
builder.add_node("python_code", python_code_node)
|
| 770 |
+
builder.add_node("webpage_scrape", webpage_scrape_node)
|
| 771 |
|
| 772 |
# Define edges: these determine how the control flow moves
|
| 773 |
builder.add_edge(START, "assistant")
|
|
|
|
| 794 |
{
|
| 795 |
"web_search": "web_search",
|
| 796 |
"python_code": "python_code",
|
| 797 |
+
"webpage_scrape": "webpage_scrape",
|
| 798 |
"end": END
|
| 799 |
}
|
| 800 |
)
|
|
|
|
| 802 |
# Tools always go back to assistant
|
| 803 |
builder.add_edge("web_search", "assistant")
|
| 804 |
builder.add_edge("python_code", "assistant")
|
| 805 |
+
builder.add_edge("webpage_scrape", "assistant")
|
| 806 |
|
| 807 |
# Compile with a reasonable recursion limit to prevent infinite loops
|
| 808 |
return builder.compile()
|
|
|
|
| 860 |
agent = TurboNerd(max_execution_time=60)
|
| 861 |
response = agent("How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.")
|
| 862 |
print("\nFinal Response:")
|
| 863 |
+
print(response)
|
| 864 |
+
|
requirements.txt
CHANGED
|
@@ -5,4 +5,6 @@ langchain
|
|
| 5 |
langchain-openai
|
| 6 |
duckduckgo-search
|
| 7 |
langchain-community
|
| 8 |
-
apify-client
|
|
|
|
|
|
|
|
|
| 5 |
langchain-openai
|
| 6 |
duckduckgo-search
|
| 7 |
langchain-community
|
| 8 |
+
apify-client
|
| 9 |
+
beautifulsoup4
|
| 10 |
+
html2text
|