Lasdw committed on
Commit
bf8f953
·
1 Parent(s): d3ebcb4

Improved search with url scraping

Browse files
Files changed (2) hide show
  1. agent.py +173 -3
  2. requirements.txt +3 -1
agent.py CHANGED
@@ -16,8 +16,10 @@ import random
16
  import json
17
  import re
18
  import requests
19
- from urllib.parse import quote
20
  import sys
 
 
21
 
22
  from apify_client import ApifyClient
23
 
@@ -165,7 +167,104 @@ def apify_google_search(query: str, limit: int = 10) -> str:
165
  except Exception as e:
166
  print(f"Error using Apify: {str(e)}")
167
  return fallback_search(query)
168
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
169
  def format_search_results(results: List[Dict], query: str) -> str:
170
  """Format the search results into a readable string"""
171
  if not results or len(results) == 0:
@@ -269,6 +368,8 @@ SYSTEM_PROMPT = """Answer the following questions as best you can. DO NOT rely o
269
 
270
  web_search: Search the web for current information. Provide a specific search query.
271
  python_code: Execute Python code. Provide the complete Python code as a string. Use this tool to calculate math problems.
 
 
272
 
273
  The way you use the tools is by specifying a json blob.
274
  Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
@@ -276,6 +377,7 @@ Specifically, this json should have an `action` key (with the name of the tool t
276
  The only values that should be in the "action" field are:
277
  web_search: Search the web for current information, args: {"query": {"type": "string"}}
278
  python_code: Execute Python code, args: {"code": {"type": "string"}}
 
279
 
280
  IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
281
 
@@ -288,6 +390,15 @@ example use:
288
  }
289
  ```
290
 
 
 
 
 
 
 
 
 
 
291
  ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
292
 
293
  Question: [the user's question]
@@ -323,6 +434,8 @@ IMPORTANT: You MUST strictly follow the ReAct pattern (Reasoning, Action, Observ
323
  4. Based on the observation, continue with another thought
324
  5. This cycle repeats until you have enough information to provide a final answer
325
 
 
 
326
  ... (this Thought/Action/Observation cycle can repeat as needed) ...
327
 
328
  Thought: I now know the final answer
@@ -350,6 +463,11 @@ tools_config = [
350
  "name": "python_code",
351
  "description": "Execute Python code. Provide the complete Python code as a string in the format: {\"code\": \"your python code here\"}",
352
  "func": run_python_code
 
 
 
 
 
353
  }
354
  ]
355
 
@@ -361,6 +479,7 @@ chat_with_tools = chat
361
  class ActionInput(TypedDict, total=False):
362
  query: Optional[str]
363
  code: Optional[str]
 
364
 
365
  class AgentState(TypedDict, total=False):
366
  messages: Annotated[list[AnyMessage], add_messages]
@@ -577,6 +696,51 @@ def python_code_node(state: AgentState) -> Dict[str, Any]:
577
  "action_input": None # Clear the action input
578
  }
579
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
580
  # Router function to direct to the correct tool
581
  def router(state: AgentState) -> str:
582
  """Route to the appropriate tool based on the current_tool field."""
@@ -589,6 +753,8 @@ def router(state: AgentState) -> str:
589
  return "web_search"
590
  elif tool == "python_code":
591
  return "python_code"
 
 
592
  else:
593
  return "end"
594
 
@@ -601,6 +767,7 @@ def create_agent_graph() -> StateGraph:
601
  builder.add_node("assistant", assistant)
602
  builder.add_node("web_search", web_search_node)
603
  builder.add_node("python_code", python_code_node)
 
604
 
605
  # Define edges: these determine how the control flow moves
606
  builder.add_edge(START, "assistant")
@@ -627,6 +794,7 @@ def create_agent_graph() -> StateGraph:
627
  {
628
  "web_search": "web_search",
629
  "python_code": "python_code",
 
630
  "end": END
631
  }
632
  )
@@ -634,6 +802,7 @@ def create_agent_graph() -> StateGraph:
634
  # Tools always go back to assistant
635
  builder.add_edge("web_search", "assistant")
636
  builder.add_edge("python_code", "assistant")
 
637
 
638
  # Compile with a reasonable recursion limit to prevent infinite loops
639
  return builder.compile()
@@ -691,4 +860,5 @@ if __name__ == "__main__":
691
  agent = TurboNerd(max_execution_time=60)
692
  response = agent("How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.")
693
  print("\nFinal Response:")
694
- print(response)
 
 
16
  import json
17
  import re
18
  import requests
19
+ from urllib.parse import quote, urlparse
20
  import sys
21
+ from bs4 import BeautifulSoup
22
+ import html2text
23
 
24
  from apify_client import ApifyClient
25
 
 
167
  except Exception as e:
168
  print(f"Error using Apify: {str(e)}")
169
  return fallback_search(query)
170
def scrape_webpage(url: str) -> str:
    """
    Safely scrape content from a specified URL.

    Args:
        url: The URL to scrape (must include an http:// or https:// scheme).

    Returns:
        Formatted webpage content as text ("Title: ...\\nURL: ...\\n\\n<content>"),
        or a human-readable "Error: ..." string on any failure. Never raises.
    """
    import ipaddress  # stdlib; local import so the module's import block is untouched

    try:
        # Parse the URL to validate it has both a scheme and a host.
        parsed_url = urlparse(url)
        if not parsed_url.scheme or not parsed_url.netloc:
            return f"Error: Invalid URL format: {url}. Please provide a valid URL with http:// or https:// prefix."

        # SSRF guard: block loopback / private / link-local targets.
        # BUG FIX: the previous check did substring matching on netloc
        # (e.g. '"10.0." in netloc'), which can be bypassed by hostnames like
        # "10.0.evil.com" and also wrongly blocks hosts merely containing a
        # blocked fragment. Use the parsed hostname and the ipaddress module.
        host = (parsed_url.hostname or "").lower()
        blocked = host == "localhost"
        if not blocked:
            try:
                ip = ipaddress.ip_address(host)
                blocked = (ip.is_private or ip.is_loopback or ip.is_link_local
                           or ip.is_reserved or ip.is_unspecified)
            except ValueError:
                # Not a literal IP address; treated as a public hostname.
                pass
        if blocked:
            return f"Error: Access to internal/local URLs is blocked for security: {url}"

        print(f"Scraping URL: {url}")

        # Browser-like headers reduce the chance of being blocked by the site.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

        # Set a reasonable timeout to avoid hanging the agent loop.
        response = requests.get(url, headers=headers, timeout=10)

        # Check if request was successful.
        if response.status_code != 200:
            return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"

        # Parse the HTML.
        soup = BeautifulSoup(response.text, 'html.parser')

        # Remove elements that carry no useful textual content.
        for script_or_style in soup(["script", "style", "iframe", "footer", "nav"]):
            script_or_style.decompose()

        # Page title. BUG FIX: a present-but-empty <title> makes
        # soup.title.string None, which previously rendered as "Title: None".
        if soup.title and soup.title.string:
            title = soup.title.string.strip() or "No title found"
        else:
            title = "No title found"

        # Prefer a dedicated main-content container; fall back to <body>.
        main_content = (soup.find('main') or soup.find('article')
                        or soup.find(id='content') or soup.find(class_='content')
                        or soup.body)

        # Convert HTML to plain text.
        h = html2text.HTML2Text()
        h.ignore_links = False
        h.ignore_images = True
        h.ignore_tables = False
        h.unicode_snob = True

        if main_content:
            text_content = h.handle(str(main_content))
        else:
            text_content = h.handle(response.text)

        # Limit content length to avoid overwhelming the model.
        # BUG FIX: the previous limit (99999999999) made this truncation
        # unreachable dead code, contradicting its own comment.
        max_content_length = 20000
        if len(text_content) > max_content_length:
            text_content = text_content[:max_content_length] + "\n\n[Content truncated due to length...]"

        # Format the response.
        return f"Title: {title}\nURL: {url}\n\n{text_content}"

    except requests.exceptions.Timeout:
        return f"Error: Request timed out while trying to access {url}"
    except requests.exceptions.ConnectionError:
        return f"Error: Failed to connect to {url}. The site might be down or the URL might be incorrect."
    except requests.exceptions.RequestException as e:
        return f"Error requesting {url}: {str(e)}"
    except Exception as e:
        return f"Error scraping webpage {url}: {str(e)}"
268
  def format_search_results(results: List[Dict], query: str) -> str:
269
  """Format the search results into a readable string"""
270
  if not results or len(results) == 0:
 
368
 
369
  web_search: Search the web for current information. Provide a specific search query.
370
  python_code: Execute Python code. Provide the complete Python code as a string. Use this tool to calculate math problems.
371
+ webpage_scrape: Scrape content from a specific webpage URL. Provide a valid URL to extract information from a particular web page.
372
+
373
 
374
  The way you use the tools is by specifying a json blob.
375
  Specifically, this json should have an `action` key (with the name of the tool to use) and an `action_input` key (with the input to the tool going here).
 
377
  The only values that should be in the "action" field are:
378
  web_search: Search the web for current information, args: {"query": {"type": "string"}}
379
  python_code: Execute Python code, args: {"code": {"type": "string"}}
380
+ webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
381
 
382
  IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
383
 
 
390
  }
391
  ```
392
 
393
+ Or for scraping a webpage:
394
+
395
+ ```json
396
+ {
397
+ "action": "webpage_scrape",
398
+ "action_input": {"url": "https://en.wikipedia.org/wiki/Artificial_intelligence"}
399
+ }
400
+ ```
401
+
402
  ALWAYS follow this specific format for your responses. Your entire response will follow this pattern:
403
 
404
  Question: [the user's question]
 
434
  4. Based on the observation, continue with another thought
435
  5. This cycle repeats until you have enough information to provide a final answer
436
 
437
+ NEVER fake or simulate tool output yourself.
438
+
439
  ... (this Thought/Action/Observation cycle can repeat as needed) ...
440
 
441
  Thought: I now know the final answer
 
463
  "name": "python_code",
464
  "description": "Execute Python code. Provide the complete Python code as a string in the format: {\"code\": \"your python code here\"}",
465
  "func": run_python_code
466
+ },
467
+ {
468
+ "name": "webpage_scrape",
469
+ "description": "Scrape content from a specific webpage URL. Provide a valid URL in the format: {\"url\": \"https://example.com\"}",
470
+ "func": scrape_webpage
471
  }
472
  ]
473
 
 
479
  class ActionInput(TypedDict, total=False):
480
  query: Optional[str]
481
  code: Optional[str]
482
+ url: Optional[str]
483
 
484
  class AgentState(TypedDict, total=False):
485
  messages: Annotated[list[AnyMessage], add_messages]
 
696
  "action_input": None # Clear the action input
697
  }
698
 
699
def webpage_scrape_node(state: AgentState) -> Dict[str, Any]:
    """Node that scrapes content from a specific webpage URL."""
    print("Webpage Scrape Tool Called...\n\n")

    # Pull the tool arguments off the graph state.
    action_input = state.get("action_input", {})
    print(f"Webpage scrape action_input: {action_input}")

    # The URL may arrive wrapped in a dict ({"url": ...}) or as a bare string.
    if isinstance(action_input, dict):
        url = action_input.get("url", "")
    elif isinstance(action_input, str):
        url = action_input
    else:
        url = ""

    print(f"Scraping URL: '{url}'")

    # Guard clause: never attempt a scrape without a URL.
    result = (
        scrape_webpage(url)
        if url
        else "Error: No URL provided. Please provide a valid URL to scrape."
    )

    print(f"Scraping result length: {len(result)}")

    # Wrap the result as an "Observation:" message so the ReAct cycle continues.
    tool_message = AIMessage(content=f"Observation: {result.strip()}")

    # Log a preview of the observation going back to the assistant.
    print("\n=== TOOL OBSERVATION ===")
    content_preview = tool_message.content[:500] + "..." if len(tool_message.content) > 500 else tool_message.content
    print(content_preview)
    print("=== END OBSERVATION ===\n")

    # Hand control back to the assistant with the tool slots cleared.
    return {
        "messages": state["messages"] + [tool_message],
        "current_tool": None,
        "action_input": None,
    }
743
+
744
  # Router function to direct to the correct tool
745
  def router(state: AgentState) -> str:
746
  """Route to the appropriate tool based on the current_tool field."""
 
753
  return "web_search"
754
  elif tool == "python_code":
755
  return "python_code"
756
+ elif tool == "webpage_scrape":
757
+ return "webpage_scrape"
758
  else:
759
  return "end"
760
 
 
767
  builder.add_node("assistant", assistant)
768
  builder.add_node("web_search", web_search_node)
769
  builder.add_node("python_code", python_code_node)
770
+ builder.add_node("webpage_scrape", webpage_scrape_node)
771
 
772
  # Define edges: these determine how the control flow moves
773
  builder.add_edge(START, "assistant")
 
794
  {
795
  "web_search": "web_search",
796
  "python_code": "python_code",
797
+ "webpage_scrape": "webpage_scrape",
798
  "end": END
799
  }
800
  )
 
802
  # Tools always go back to assistant
803
  builder.add_edge("web_search", "assistant")
804
  builder.add_edge("python_code", "assistant")
805
+ builder.add_edge("webpage_scrape", "assistant")
806
 
807
  # Compile with a reasonable recursion limit to prevent infinite loops
808
  return builder.compile()
 
860
  agent = TurboNerd(max_execution_time=60)
861
  response = agent("How many studio albums were published by Mercedes Sosa between 2000 and 2009 (included)? You can use the latest 2022 version of english wikipedia.")
862
  print("\nFinal Response:")
863
+ print(response)
864
+
requirements.txt CHANGED
@@ -5,4 +5,6 @@ langchain
5
  langchain-openai
6
  duckduckgo-search
7
  langchain-community
8
- apify-client
 
 
 
5
  langchain-openai
6
  duckduckgo-search
7
  langchain-community
8
+ apify-client
9
+ beautifulsoup4
10
+ html2text