Lasdw committed on
Commit
64d4d94
·
1 Parent(s): f4b3c44

updated extract_json function

Browse files
Files changed (2) hide show
  1. agent.py +100 -2
  2. tools.py +55 -5
agent.py CHANGED
@@ -65,7 +65,7 @@ The only values that should be in the "action" field are:
65
  python_code: Execute Python code. Use this tool to calculate math problems. make sure to use prints to be able to view the final result. args: {"code": {"type": "string"}}
66
  wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
67
  tavily_search: Search the web using Tavily. Optionally specify search_depth as 'basic' or 'comprehensive', args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
68
- arxiv_search: Search ArXiv for publications. Optionally specify max_results to control the number of papers returned, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
69
  webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
70
  supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
71
  excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
@@ -346,15 +346,41 @@ def assistant(state: AgentState) -> Dict[str, Any]:
346
  return state_update
347
 
348
  def extract_json_from_text(text: str) -> dict:
349
- """Extract JSON from text, handling markdown code blocks."""
350
  try:
351
  import re
 
 
 
 
 
 
352
 
353
  print(f"Attempting to extract JSON from text: {text[:200]}...")
354
 
355
  # First, clean up the text to handle specific patterns that might confuse parsing
356
  text = text.replace('\\n', '\n').replace('\\"', '"')
357
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
358
  # Pattern 1: Look for "Action:" followed by a markdown code block
359
  action_match = re.search(r"Action:\s*```(?:python|json)?\s*(.*?)```", text, re.DOTALL)
360
  if action_match:
@@ -424,7 +450,79 @@ def extract_json_from_text(text: str) -> dict:
424
  except json.JSONDecodeError:
425
  pass
426
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
427
  print("Could not extract valid JSON from text using any pattern")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
428
  return None
429
 
430
  except Exception as e:
 
65
  python_code: Execute Python code. Use this tool to calculate math problems. make sure to use prints to be able to view the final result. args: {"code": {"type": "string"}}
66
  wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
67
  tavily_search: Search the web using Tavily. Optionally specify search_depth as 'basic' or 'comprehensive', args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
68
+ arxiv_search: Search ArXiv for publications,news and other resources. Optionally specify max_results to control the number of papers returned, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
69
  webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
70
  supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
71
  excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
 
346
  return state_update
347
 
348
  def extract_json_from_text(text: str) -> dict:
349
+ """Extract JSON from text, handling markdown code blocks and other formats."""
350
  try:
351
  import re
352
+ import json
353
+
354
+ # Return empty if text is None or very short (less than 10 chars)
355
+ if not text or len(text.strip()) < 10:
356
+ print("Warning: Empty or very short text input to JSON extraction")
357
+ return None
358
 
359
  print(f"Attempting to extract JSON from text: {text[:200]}...")
360
 
361
  # First, clean up the text to handle specific patterns that might confuse parsing
362
  text = text.replace('\\n', '\n').replace('\\"', '"')
363
 
364
+ # Case 1: "Final Answer:" detection - if present, return None to indicate we should end
365
+ if "Final Answer:" in text:
366
+ print("Detected 'Final Answer' - no tool action needed")
367
+ return None
368
+
369
+ # Case 2: Extract direct python dictionary representation without JSON formatting
370
+ if "action_input" in text and not '{"action"' in text and not '{"action_input"' in text:
371
+ # Try regex to extract a Python dict-like structure
372
+ action_match = re.search(r"action:\s*(\w+)", text, re.IGNORECASE)
373
+ input_match = re.search(r"action_input:\s*(\{.+?\})", text, re.DOTALL | re.IGNORECASE)
374
+
375
+ if action_match and input_match:
376
+ action = action_match.group(1).strip()
377
+ try:
378
+ action_input = eval(input_match.group(1)) # Be careful with eval!
379
+ if isinstance(action_input, dict):
380
+ return {"action": action, "action_input": action_input}
381
+ except:
382
+ pass
383
+
384
  # Pattern 1: Look for "Action:" followed by a markdown code block
385
  action_match = re.search(r"Action:\s*```(?:python|json)?\s*(.*?)```", text, re.DOTALL)
386
  if action_match:
 
450
  except json.JSONDecodeError:
451
  pass
452
 
453
+ # Pattern 5: Look for simple text patterns like "I need to use tool X to search for Y"
454
+ tool_patterns = [
455
+ (r"(?:use|using|need to use|should use|will use)(?:\s+the)?\s+(\w+)(?:\s+tool)?\s+to\s+(?:search|find|look up|research)(?:\s+for)?\s+['\"](.*?)['\"]",
456
+ lambda m: {"action": m.group(1).lower(), "action_input": {"query": m.group(2)}}),
457
+ (r"(?:use|using|need to use|should use|will use)(?:\s+the)?\s+(\w+)(?:\s+tool)?\s+to\s+(?:search|find|look up|research)\s+(?:for\s+)?(.+?)(?=\.|$)",
458
+ lambda m: {"action": m.group(1).lower(), "action_input": {"query": m.group(2).strip()}}),
459
+ (r"(?:use|using|need to use|should use|will use)(?:\s+the)?\s+(\w+)(?:\s+tool)?\s+on\s+['\"](.*?)['\"]",
460
+ lambda m: {"action": m.group(1).lower(), "action_input": {"query": m.group(2)}})
461
+ ]
462
+
463
+ for pattern, formatter in tool_patterns:
464
+ match = re.search(pattern, text, re.IGNORECASE)
465
+ if match:
466
+ try:
467
+ result = formatter(match)
468
+ # Map common words to actual tool names
469
+ tool_mapping = {
470
+ "tavily": "tavily_search",
471
+ "wikipedia": "wikipedia_search",
472
+ "arxiv": "arxiv_search",
473
+ "web": "tavily_search",
474
+ "python": "python_code",
475
+ "excel": "excel_to_text",
476
+ "youtube": "process_youtube_video",
477
+ "webpage": "webpage_scrape",
478
+ "scrape": "webpage_scrape"
479
+ }
480
+
481
+ if result["action"].lower() in tool_mapping:
482
+ result["action"] = tool_mapping[result["action"].lower()]
483
+
484
+ print(f"Extracted tool action using pattern: {result}")
485
+ return result
486
+ except Exception as e:
487
+ print(f"Error formatting pattern match: {e}")
488
+
489
+ # Fallback: If we detect thinking about a specific topic, suggest a related tool
490
+ fallback_patterns = [
491
+ (r"(?:I need|I should|I will|let me|I can)(?:\s+(?:to))?\s+(?:search|find|look for)\s+(?:information|details|data)?\s+(?:about|on|regarding)?\s+(.+?)(?=\.|$)",
492
+ lambda m: {"action": "tavily_search", "action_input": {"query": m.group(1).strip()}}),
493
+ (r"(?:I will|I should|let me)(?:\s+now)?\s+(?:search|check)(?:\s+on)?\s+wikipedia\s+(?:for)?\s+(.+?)(?=\.|$)",
494
+ lambda m: {"action": "wikipedia_search", "action_input": {"query": m.group(1).strip()}}),
495
+ (r"(?:I need|I should|I will|let me)(?:\s+to)?\s+(?:execute|run|write|use)\s+(?:some|a)?\s+python\s+(?:code)?",
496
+ lambda m: {"action": "python_code", "action_input": {"code": "# Example code\nprint('Please specify Python code to execute')"}})
497
+ ]
498
+
499
+ for pattern, formatter in fallback_patterns:
500
+ match = re.search(pattern, text, re.IGNORECASE)
501
+ if match:
502
+ try:
503
+ result = formatter(match)
504
+ print(f"Used fallback pattern to suggest tool: {result}")
505
+ return result
506
+ except Exception as e:
507
+ print(f"Error in fallback pattern: {e}")
508
+
509
  print("Could not extract valid JSON from text using any pattern")
510
+
511
+ # Last resort fallback: If we've tried everything and failed, and the response is long
512
+ # enough to suggest the model is doing work but not formatting correctly, try to
513
+ # extract any query-like content and suggest a tavily search
514
+ if len(text) > 200 and "search" in text.lower():
515
+ # Extract a potential query - look for sentences with search-related terms
516
+ search_sentences = re.findall(r'[^.!?]*(?:search|find|look for|investigate|research)[^.!?]*[.!?]', text)
517
+ if search_sentences:
518
+ best_sentence = max(search_sentences, key=len)
519
+ # Clean up the query
520
+ query = re.sub(r'(?:I will|I should|I need to|I want to|Let me|I\'ll|I can)\s+(?:search|find|look for|investigate|research)(?:\s+for)?\s+', '', best_sentence)
521
+ query = query.strip('.!? \t\n')
522
+ if query and len(query) > 5:
523
+ print(f"Last resort fallback: suggesting Tavily search for: {query}")
524
+ return {"action": "tavily_search", "action_input": {"query": query}}
525
+
526
  return None
527
 
528
  except Exception as e:
tools.py CHANGED
@@ -297,24 +297,74 @@ def scrape_webpage(url: str) -> str:
297
 
298
  print(f"Scraping URL: {url}")
299
 
300
- # Set user agent to avoid being blocked
 
 
 
 
 
 
 
 
 
 
 
 
301
  headers = {
302
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
303
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
304
  'Accept-Language': 'en-US,en;q=0.5',
 
305
  'Connection': 'keep-alive',
306
  'Upgrade-Insecure-Requests': '1',
307
  'Cache-Control': 'max-age=0',
 
 
 
 
 
308
  }
309
 
310
  # Set a reasonable timeout to avoid hanging
311
- timeout = 10
312
 
313
- # Make the request
314
- response = requests.get(url, headers=headers, timeout=timeout)
 
315
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
316
  # Check if request was successful
317
  if response.status_code != 200:
 
 
318
  return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"
319
 
320
  # Use BeautifulSoup to parse the HTML
 
297
 
298
  print(f"Scraping URL: {url}")
299
 
300
+ # Rotate between different user agents to avoid being blocked
301
+ user_agents = [
302
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
303
+ 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
304
+ 'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0',
305
+ 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
306
+ 'Mozilla/5.0 (iPhone; CPU iPhone OS 15_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/96.0.4664.53 Mobile/15E148 Safari/604.1'
307
+ ]
308
+
309
+ import random
310
+ selected_user_agent = random.choice(user_agents)
311
+
312
+ # Set more comprehensive headers that mimic a real browser
313
  headers = {
314
+ 'User-Agent': selected_user_agent,
315
  'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
316
  'Accept-Language': 'en-US,en;q=0.5',
317
+ 'Accept-Encoding': 'gzip, deflate, br',
318
  'Connection': 'keep-alive',
319
  'Upgrade-Insecure-Requests': '1',
320
  'Cache-Control': 'max-age=0',
321
+ 'Sec-Fetch-Dest': 'document',
322
+ 'Sec-Fetch-Mode': 'navigate',
323
+ 'Sec-Fetch-Site': 'none',
324
+ 'Sec-Fetch-User': '?1',
325
+ 'Pragma': 'no-cache'
326
  }
327
 
328
  # Set a reasonable timeout to avoid hanging
329
+ timeout = 15
330
 
331
+ # Implement a simple retry mechanism
332
+ max_retries = 3
333
+ retry_delay = 2 # seconds
334
 
335
+ for attempt in range(max_retries):
336
+ try:
337
+ # Make the request
338
+ response = requests.get(url, headers=headers, timeout=timeout)
339
+
340
+ # If successful, break out of retry loop
341
+ if response.status_code == 200:
342
+ break
343
+
344
+ # If we got a 403 Forbidden error, try a different approach
345
+ if response.status_code == 403 and attempt < max_retries - 1:
346
+ print(f"Received 403 Forbidden. Retrying with different headers (attempt {attempt + 1})...")
347
+ # Change user agent for next attempt
348
+ headers['User-Agent'] = random.choice(user_agents)
349
+ # Add a referrer from a major site to appear more legitimate
350
+ headers['Referer'] = 'https://www.google.com/'
351
+ import time
352
+ time.sleep(retry_delay)
353
+ continue
354
+
355
+ except (requests.exceptions.RequestException, requests.exceptions.Timeout) as e:
356
+ if attempt < max_retries - 1:
357
+ print(f"Request failed: {e}. Retrying (attempt {attempt + 1})...")
358
+ import time
359
+ time.sleep(retry_delay)
360
+ continue
361
+ else:
362
+ raise
363
+
364
  # Check if request was successful
365
  if response.status_code != 200:
366
+ if response.status_code == 403:
367
+ return f"Error: Access Forbidden (403). The website is actively blocking scrapers or requires authentication. Try using a different search method like Tavily search instead."
368
  return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"
369
 
370
  # Use BeautifulSoup to parse the HTML