Spaces:
Sleeping
Sleeping
updated extract_json function
Browse files
agent.py
CHANGED
|
@@ -65,7 +65,7 @@ The only values that should be in the "action" field are:
|
|
| 65 |
python_code: Execute Python code. Use this tool to calculate math problems. make sure to use prints to be able to view the final result. args: {"code": {"type": "string"}}
|
| 66 |
wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
|
| 67 |
tavily_search: Search the web using Tavily. Optionally specify search_depth as 'basic' or 'comprehensive', args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
|
| 68 |
-
arxiv_search: Search ArXiv for publications. Optionally specify max_results to control the number of papers returned, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
|
| 69 |
webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
|
| 70 |
supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
|
| 71 |
excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
|
|
@@ -346,15 +346,41 @@ def assistant(state: AgentState) -> Dict[str, Any]:
|
|
| 346 |
return state_update
|
| 347 |
|
| 348 |
def extract_json_from_text(text: str) -> dict:
|
| 349 |
-
"""Extract JSON from text, handling markdown code blocks."""
|
| 350 |
try:
|
| 351 |
import re
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 352 |
|
| 353 |
print(f"Attempting to extract JSON from text: {text[:200]}...")
|
| 354 |
|
| 355 |
# First, clean up the text to handle specific patterns that might confuse parsing
|
| 356 |
text = text.replace('\\n', '\n').replace('\\"', '"')
|
| 357 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 358 |
# Pattern 1: Look for "Action:" followed by a markdown code block
|
| 359 |
action_match = re.search(r"Action:\s*```(?:python|json)?\s*(.*?)```", text, re.DOTALL)
|
| 360 |
if action_match:
|
|
@@ -424,7 +450,79 @@ def extract_json_from_text(text: str) -> dict:
|
|
| 424 |
except json.JSONDecodeError:
|
| 425 |
pass
|
| 426 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 427 |
print("Could not extract valid JSON from text using any pattern")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 428 |
return None
|
| 429 |
|
| 430 |
except Exception as e:
|
|
|
|
| 65 |
python_code: Execute Python code. Use this tool to calculate math problems. make sure to use prints to be able to view the final result. args: {"code": {"type": "string"}}
|
| 66 |
wikipedia_search: Search Wikipedia for information about a specific topic. Optionally specify the number of results to return, args: {"query": {"type": "string"}, "num_results": {"type": "integer", "optional": true}}
|
| 67 |
tavily_search: Search the web using Tavily. Optionally specify search_depth as 'basic' or 'comprehensive', args: {"query": {"type": "string"}, "search_depth": {"type": "string", "optional": true}}
|
| 68 |
+
arxiv_search: Search ArXiv for publications,news and other resources. Optionally specify max_results to control the number of papers returned, args: {"query": {"type": "string"}, "max_results": {"type": "integer", "optional": true}}
|
| 69 |
webpage_scrape: Scrape a specific webpage, args: {"url": {"type": "string"}}
|
| 70 |
supabase_operation: Perform database operations, args: {"operation_type": {"type": "string"}, "table": {"type": "string"}, "data": {"type": "object", "optional": true}, "filters": {"type": "object", "optional": true}}
|
| 71 |
excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_path": {"type": "string"}, "file_content": {"type": "string"}, "sheet_name": {"type": "string", "optional": true}}
|
|
|
|
| 346 |
return state_update
|
| 347 |
|
| 348 |
def extract_json_from_text(text: str) -> dict:
|
| 349 |
+
"""Extract JSON from text, handling markdown code blocks and other formats."""
|
| 350 |
try:
|
| 351 |
import re
|
| 352 |
+
import json
|
| 353 |
+
|
| 354 |
+
# Return empty if text is None or very short (less than 10 chars)
|
| 355 |
+
if not text or len(text.strip()) < 10:
|
| 356 |
+
print("Warning: Empty or very short text input to JSON extraction")
|
| 357 |
+
return None
|
| 358 |
|
| 359 |
print(f"Attempting to extract JSON from text: {text[:200]}...")
|
| 360 |
|
| 361 |
# First, clean up the text to handle specific patterns that might confuse parsing
|
| 362 |
text = text.replace('\\n', '\n').replace('\\"', '"')
|
| 363 |
|
| 364 |
+
# Case 1: "Final Answer:" detection - if present, return None to indicate we should end
|
| 365 |
+
if "Final Answer:" in text:
|
| 366 |
+
print("Detected 'Final Answer' - no tool action needed")
|
| 367 |
+
return None
|
| 368 |
+
|
| 369 |
+
# Case 2: Extract direct python dictionary representation without JSON formatting
|
| 370 |
+
if "action_input" in text and not '{"action"' in text and not '{"action_input"' in text:
|
| 371 |
+
# Try regex to extract a Python dict-like structure
|
| 372 |
+
action_match = re.search(r"action:\s*(\w+)", text, re.IGNORECASE)
|
| 373 |
+
input_match = re.search(r"action_input:\s*(\{.+?\})", text, re.DOTALL | re.IGNORECASE)
|
| 374 |
+
|
| 375 |
+
if action_match and input_match:
|
| 376 |
+
action = action_match.group(1).strip()
|
| 377 |
+
try:
|
| 378 |
+
action_input = eval(input_match.group(1)) # Be careful with eval!
|
| 379 |
+
if isinstance(action_input, dict):
|
| 380 |
+
return {"action": action, "action_input": action_input}
|
| 381 |
+
except:
|
| 382 |
+
pass
|
| 383 |
+
|
| 384 |
# Pattern 1: Look for "Action:" followed by a markdown code block
|
| 385 |
action_match = re.search(r"Action:\s*```(?:python|json)?\s*(.*?)```", text, re.DOTALL)
|
| 386 |
if action_match:
|
|
|
|
| 450 |
except json.JSONDecodeError:
|
| 451 |
pass
|
| 452 |
|
| 453 |
+
# Pattern 5: Look for simple text patterns like "I need to use tool X to search for Y"
|
| 454 |
+
tool_patterns = [
|
| 455 |
+
(r"(?:use|using|need to use|should use|will use)(?:\s+the)?\s+(\w+)(?:\s+tool)?\s+to\s+(?:search|find|look up|research)(?:\s+for)?\s+['\"](.*?)['\"]",
|
| 456 |
+
lambda m: {"action": m.group(1).lower(), "action_input": {"query": m.group(2)}}),
|
| 457 |
+
(r"(?:use|using|need to use|should use|will use)(?:\s+the)?\s+(\w+)(?:\s+tool)?\s+to\s+(?:search|find|look up|research)\s+(?:for\s+)?(.+?)(?=\.|$)",
|
| 458 |
+
lambda m: {"action": m.group(1).lower(), "action_input": {"query": m.group(2).strip()}}),
|
| 459 |
+
(r"(?:use|using|need to use|should use|will use)(?:\s+the)?\s+(\w+)(?:\s+tool)?\s+on\s+['\"](.*?)['\"]",
|
| 460 |
+
lambda m: {"action": m.group(1).lower(), "action_input": {"query": m.group(2)}})
|
| 461 |
+
]
|
| 462 |
+
|
| 463 |
+
for pattern, formatter in tool_patterns:
|
| 464 |
+
match = re.search(pattern, text, re.IGNORECASE)
|
| 465 |
+
if match:
|
| 466 |
+
try:
|
| 467 |
+
result = formatter(match)
|
| 468 |
+
# Map common words to actual tool names
|
| 469 |
+
tool_mapping = {
|
| 470 |
+
"tavily": "tavily_search",
|
| 471 |
+
"wikipedia": "wikipedia_search",
|
| 472 |
+
"arxiv": "arxiv_search",
|
| 473 |
+
"web": "tavily_search",
|
| 474 |
+
"python": "python_code",
|
| 475 |
+
"excel": "excel_to_text",
|
| 476 |
+
"youtube": "process_youtube_video",
|
| 477 |
+
"webpage": "webpage_scrape",
|
| 478 |
+
"scrape": "webpage_scrape"
|
| 479 |
+
}
|
| 480 |
+
|
| 481 |
+
if result["action"].lower() in tool_mapping:
|
| 482 |
+
result["action"] = tool_mapping[result["action"].lower()]
|
| 483 |
+
|
| 484 |
+
print(f"Extracted tool action using pattern: {result}")
|
| 485 |
+
return result
|
| 486 |
+
except Exception as e:
|
| 487 |
+
print(f"Error formatting pattern match: {e}")
|
| 488 |
+
|
| 489 |
+
# Fallback: If we detect thinking about a specific topic, suggest a related tool
|
| 490 |
+
fallback_patterns = [
|
| 491 |
+
(r"(?:I need|I should|I will|let me|I can)(?:\s+(?:to))?\s+(?:search|find|look for)\s+(?:information|details|data)?\s+(?:about|on|regarding)?\s+(.+?)(?=\.|$)",
|
| 492 |
+
lambda m: {"action": "tavily_search", "action_input": {"query": m.group(1).strip()}}),
|
| 493 |
+
(r"(?:I will|I should|let me)(?:\s+now)?\s+(?:search|check)(?:\s+on)?\s+wikipedia\s+(?:for)?\s+(.+?)(?=\.|$)",
|
| 494 |
+
lambda m: {"action": "wikipedia_search", "action_input": {"query": m.group(1).strip()}}),
|
| 495 |
+
(r"(?:I need|I should|I will|let me)(?:\s+to)?\s+(?:execute|run|write|use)\s+(?:some|a)?\s+python\s+(?:code)?",
|
| 496 |
+
lambda m: {"action": "python_code", "action_input": {"code": "# Example code\nprint('Please specify Python code to execute')"}})
|
| 497 |
+
]
|
| 498 |
+
|
| 499 |
+
for pattern, formatter in fallback_patterns:
|
| 500 |
+
match = re.search(pattern, text, re.IGNORECASE)
|
| 501 |
+
if match:
|
| 502 |
+
try:
|
| 503 |
+
result = formatter(match)
|
| 504 |
+
print(f"Used fallback pattern to suggest tool: {result}")
|
| 505 |
+
return result
|
| 506 |
+
except Exception as e:
|
| 507 |
+
print(f"Error in fallback pattern: {e}")
|
| 508 |
+
|
| 509 |
print("Could not extract valid JSON from text using any pattern")
|
| 510 |
+
|
| 511 |
+
# Last resort fallback: If we've tried everything and failed, and the response is long
|
| 512 |
+
# enough to suggest the model is doing work but not formatting correctly, try to
|
| 513 |
+
# extract any query-like content and suggest a tavily search
|
| 514 |
+
if len(text) > 200 and "search" in text.lower():
|
| 515 |
+
# Extract a potential query - look for sentences with search-related terms
|
| 516 |
+
search_sentences = re.findall(r'[^.!?]*(?:search|find|look for|investigate|research)[^.!?]*[.!?]', text)
|
| 517 |
+
if search_sentences:
|
| 518 |
+
best_sentence = max(search_sentences, key=len)
|
| 519 |
+
# Clean up the query
|
| 520 |
+
query = re.sub(r'(?:I will|I should|I need to|I want to|Let me|I\'ll|I can)\s+(?:search|find|look for|investigate|research)(?:\s+for)?\s+', '', best_sentence)
|
| 521 |
+
query = query.strip('.!? \t\n')
|
| 522 |
+
if query and len(query) > 5:
|
| 523 |
+
print(f"Last resort fallback: suggesting Tavily search for: {query}")
|
| 524 |
+
return {"action": "tavily_search", "action_input": {"query": query}}
|
| 525 |
+
|
| 526 |
return None
|
| 527 |
|
| 528 |
except Exception as e:
|
tools.py
CHANGED
|
@@ -297,24 +297,74 @@ def scrape_webpage(url: str) -> str:
|
|
| 297 |
|
| 298 |
print(f"Scraping URL: {url}")
|
| 299 |
|
| 300 |
-
#
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 301 |
headers = {
|
| 302 |
-
'User-Agent':
|
| 303 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 304 |
'Accept-Language': 'en-US,en;q=0.5',
|
|
|
|
| 305 |
'Connection': 'keep-alive',
|
| 306 |
'Upgrade-Insecure-Requests': '1',
|
| 307 |
'Cache-Control': 'max-age=0',
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 308 |
}
|
| 309 |
|
| 310 |
# Set a reasonable timeout to avoid hanging
|
| 311 |
-
timeout =
|
| 312 |
|
| 313 |
-
#
|
| 314 |
-
|
|
|
|
| 315 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 316 |
# Check if request was successful
|
| 317 |
if response.status_code != 200:
|
|
|
|
|
|
|
| 318 |
return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"
|
| 319 |
|
| 320 |
# Use BeautifulSoup to parse the HTML
|
|
|
|
| 297 |
|
| 298 |
print(f"Scraping URL: {url}")
|
| 299 |
|
| 300 |
+
# Rotate between different user agents to avoid being blocked
|
| 301 |
+
user_agents = [
|
| 302 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.110 Safari/537.36',
|
| 303 |
+
'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/15.1 Safari/605.1.15',
|
| 304 |
+
'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:94.0) Gecko/20100101 Firefox/94.0',
|
| 305 |
+
'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/96.0.4664.45 Safari/537.36',
|
| 306 |
+
'Mozilla/5.0 (iPhone; CPU iPhone OS 15_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) CriOS/96.0.4664.53 Mobile/15E148 Safari/604.1'
|
| 307 |
+
]
|
| 308 |
+
|
| 309 |
+
import random
|
| 310 |
+
selected_user_agent = random.choice(user_agents)
|
| 311 |
+
|
| 312 |
+
# Set more comprehensive headers that mimic a real browser
|
| 313 |
headers = {
|
| 314 |
+
'User-Agent': selected_user_agent,
|
| 315 |
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
| 316 |
'Accept-Language': 'en-US,en;q=0.5',
|
| 317 |
+
'Accept-Encoding': 'gzip, deflate, br',
|
| 318 |
'Connection': 'keep-alive',
|
| 319 |
'Upgrade-Insecure-Requests': '1',
|
| 320 |
'Cache-Control': 'max-age=0',
|
| 321 |
+
'Sec-Fetch-Dest': 'document',
|
| 322 |
+
'Sec-Fetch-Mode': 'navigate',
|
| 323 |
+
'Sec-Fetch-Site': 'none',
|
| 324 |
+
'Sec-Fetch-User': '?1',
|
| 325 |
+
'Pragma': 'no-cache'
|
| 326 |
}
|
| 327 |
|
| 328 |
# Set a reasonable timeout to avoid hanging
|
| 329 |
+
timeout = 15
|
| 330 |
|
| 331 |
+
# Implement a simple retry mechanism
|
| 332 |
+
max_retries = 3
|
| 333 |
+
retry_delay = 2 # seconds
|
| 334 |
|
| 335 |
+
for attempt in range(max_retries):
|
| 336 |
+
try:
|
| 337 |
+
# Make the request
|
| 338 |
+
response = requests.get(url, headers=headers, timeout=timeout)
|
| 339 |
+
|
| 340 |
+
# If successful, break out of retry loop
|
| 341 |
+
if response.status_code == 200:
|
| 342 |
+
break
|
| 343 |
+
|
| 344 |
+
# If we got a 403 Forbidden error, try a different approach
|
| 345 |
+
if response.status_code == 403 and attempt < max_retries - 1:
|
| 346 |
+
print(f"Received 403 Forbidden. Retrying with different headers (attempt {attempt + 1})...")
|
| 347 |
+
# Change user agent for next attempt
|
| 348 |
+
headers['User-Agent'] = random.choice(user_agents)
|
| 349 |
+
# Add a referrer from a major site to appear more legitimate
|
| 350 |
+
headers['Referer'] = 'https://www.google.com/'
|
| 351 |
+
import time
|
| 352 |
+
time.sleep(retry_delay)
|
| 353 |
+
continue
|
| 354 |
+
|
| 355 |
+
except (requests.exceptions.RequestException, requests.exceptions.Timeout) as e:
|
| 356 |
+
if attempt < max_retries - 1:
|
| 357 |
+
print(f"Request failed: {e}. Retrying (attempt {attempt + 1})...")
|
| 358 |
+
import time
|
| 359 |
+
time.sleep(retry_delay)
|
| 360 |
+
continue
|
| 361 |
+
else:
|
| 362 |
+
raise
|
| 363 |
+
|
| 364 |
# Check if request was successful
|
| 365 |
if response.status_code != 200:
|
| 366 |
+
if response.status_code == 403:
|
| 367 |
+
return f"Error: Access Forbidden (403). The website is actively blocking scrapers or requires authentication. Try using a different search method like Tavily search instead."
|
| 368 |
return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"
|
| 369 |
|
| 370 |
# Use BeautifulSoup to parse the HTML
|