Lasdw commited on
Commit
942e3f2
·
1 Parent(s): 4a2c0eb

updated system prompt to allow no tool to be called

Browse files
Files changed (3) hide show
  1. .gitignore +12 -1
  2. agent.py +88 -97
  3. tools.py +625 -0
.gitignore CHANGED
@@ -1,3 +1,14 @@
1
  .env
2
  .env.*
3
- image.png
 
 
 
 
 
 
 
 
 
 
 
 
1
  .env
2
  .env.*
3
+ image.png
4
+
5
+ GAIA
6
+ GAIA/*
7
+
8
+ pycache/*
9
+ __pycache__/*
10
+
11
+ *.pyc
12
+ *.pyo
13
+ *.pyd
14
+
agent.py CHANGED
@@ -4,29 +4,19 @@ from typing import TypedDict, Annotated, Dict, Any, Optional, Union, List
4
  from pathlib import Path
5
  from langgraph.graph.message import add_messages
6
  from langchain_core.messages import AnyMessage, HumanMessage, AIMessage, SystemMessage, ToolMessage
7
- from langgraph.prebuilt import ToolNode
8
- from langchain.tools import Tool
9
  from langgraph.graph import START, END, StateGraph
10
- from langgraph.prebuilt import tools_condition
11
  from langchain_openai import ChatOpenAI
12
- from langchain_community.tools import DuckDuckGoSearchRun
13
- import getpass
14
- import subprocess
15
  import tempfile
16
- import time
17
  import random
18
  import json
19
- import re
20
  import requests
21
  from urllib.parse import quote, urlparse
22
- import sys
23
  from bs4 import BeautifulSoup
24
  import html2text
25
  import pandas as pd
26
  from tabulate import tabulate
27
  import base64
28
 
29
- from apify_client import ApifyClient
30
  from langchain_community.document_loaders import WikipediaLoader
31
  from langchain_community.document_loaders import ArxivLoader
32
  from langchain_community.tools.tavily_search import TavilySearchResults
@@ -764,6 +754,8 @@ excel_to_text: Convert Excel to Markdown table with attachment, args: {"excel_pa
764
 
765
  IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
766
 
 
 
767
  Example use for tools:
768
 
769
  ```json
@@ -971,15 +963,9 @@ def assistant(state: AgentState) -> Dict[str, Any]:
971
  # Combine system message with the (potentially pruned) core messages
972
  messages_for_llm = [system_msg] + llm_input_core_messages
973
 
974
- # Log the messages being sent to LLM for debugging
975
- # print(f"Messages for LLM (count: {len(messages_for_llm)}):")
976
- # for i, msg in enumerate(messages_for_llm):
977
- # print(f" {i}: Type={type(msg).__name__}, Content='{str(msg.content)[:100].replace('\\n', ' ')}...'")
978
-
979
  # Get response from the assistant
980
  response = chat_with_tools.invoke(messages_for_llm, stop=["Observation:"])
981
  print(f"Assistant response type: {type(response)}")
982
- # print(f"Response content (first 300 chars): {response.content[:300].replace('\n', ' ')}...")
983
  content_preview = response.content[:300].replace('\n', ' ')
984
  print(f"Response content (first 300 chars): {content_preview}...")
985
 
@@ -997,113 +983,118 @@ def assistant(state: AgentState) -> Dict[str, Any]:
997
  if action_json and "action" in action_json and "action_input" in action_json:
998
  tool_name = action_json["action"]
999
  tool_input = action_json["action_input"]
1000
- print(f"Extracted tool: {tool_name}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1001
  print(f"Tool input: {tool_input}")
1002
 
1003
  tool_call_id = f"call_{random.randint(1000000, 9999999)}"
1004
 
1005
  state_update["current_tool"] = tool_name
1006
  state_update["action_input"] = tool_input
1007
- # state_update["tool_call_id"] = tool_call_id # If needed by your graph
1008
  else:
1009
  print("No tool action found or 'Final Answer' detected in response.")
1010
  state_update["current_tool"] = None
1011
  state_update["action_input"] = None
1012
-
1013
  return state_update
1014
 
1015
  def extract_json_from_text(text: str) -> dict:
1016
  """Extract JSON from text, handling markdown code blocks."""
1017
  try:
1018
- import re # Import re at the beginning of the function
1019
 
1020
  print(f"Attempting to extract JSON from text: {text[:200]}...")
1021
 
1022
- # Look for "Action:" followed by a markdown code block - common LLM output pattern
1023
- # This handles cases where the LLM outputs something like:
1024
- # Action:
1025
- # ```python
1026
- # code here
1027
- # ```
1028
  action_match = re.search(r"Action:\s*```(?:python|json)?\s*(.*?)```", text, re.DOTALL)
1029
  if action_match:
1030
  action_content = action_match.group(1).strip()
1031
  print(f"Found action content from markdown block: {action_content[:100]}...")
1032
 
1033
- # If it looks like Python code, try to create a proper JSON structure
1034
- if "=" in action_content or "import" in action_content or "print" in action_content:
1035
- print("Detected Python code, formatting as action_input")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1036
  return {
1037
- "action": "python_code",
1038
- "action_input": {"code": action_content}
1039
  }
 
 
1040
 
1041
- # Look for markdown code blocks - the most common pattern
1042
- if "```" in text:
1043
- print("Found markdown code block")
1044
- # Find all code blocks
1045
- blocks = []
1046
- lines = text.split('\n')
1047
- i = 0
1048
- while i < len(lines):
1049
- line = lines[i]
1050
- if "```" in line:
1051
- # Start of code block
1052
- start_idx = i + 1
1053
- i += 1
1054
- # Find the end of the code block
1055
- while i < len(lines) and "```" not in lines[i]:
1056
- i += 1
1057
- if i < len(lines):
1058
- # Found the end
1059
- block_content = '\n'.join(lines[start_idx:i])
1060
- blocks.append(block_content)
1061
- i += 1
1062
-
1063
- # Try to parse each block as JSON
1064
- for block in blocks:
1065
- block = block.strip()
1066
- print(f"Trying to parse block: {block[:100]}...")
1067
- try:
1068
- # Clean the block - sometimes there might be a language identifier
1069
- if block.startswith("json"):
1070
- block = block[4:].strip()
1071
-
1072
- # Validate JSON before parsing
1073
- parsed = json.loads(block)
1074
- print(f"Successfully parsed JSON: {parsed}")
1075
- return parsed
1076
- except json.JSONDecodeError as e:
1077
- print(f"JSON parse error: {e}")
1078
- continue
1079
-
1080
- # Look for JSON-like patterns in the text using a more precise regex
1081
- # Match balanced braces
1082
- # No need to import re again here
1083
-
1084
- # Try to find JSON objects with proper brace matching
1085
- brace_count = 0
1086
- start_pos = -1
1087
-
1088
- for i, char in enumerate(text):
1089
- if char == '{':
1090
- if brace_count == 0:
1091
- start_pos = i
1092
- brace_count += 1
1093
- elif char == '}':
1094
- brace_count -= 1
1095
- if brace_count == 0 and start_pos >= 0:
1096
- # Found a complete JSON object
1097
- json_candidate = text[start_pos:i+1]
1098
- try:
1099
- parsed = json.loads(json_candidate)
1100
- print(f"Found valid JSON: {parsed}")
1101
- return parsed
1102
- except json.JSONDecodeError:
1103
- continue
1104
-
1105
- # If we're here, we couldn't find a valid JSON object
1106
- print("Could not extract valid JSON from text")
1107
  return None
1108
 
1109
  except Exception as e:
 
4
  from pathlib import Path
5
  from langgraph.graph.message import add_messages
6
  from langchain_core.messages import AnyMessage, HumanMessage, AIMessage, SystemMessage, ToolMessage
 
 
7
  from langgraph.graph import START, END, StateGraph
 
8
  from langchain_openai import ChatOpenAI
 
 
 
9
  import tempfile
 
10
  import random
11
  import json
 
12
  import requests
13
  from urllib.parse import quote, urlparse
 
14
  from bs4 import BeautifulSoup
15
  import html2text
16
  import pandas as pd
17
  from tabulate import tabulate
18
  import base64
19
 
 
20
  from langchain_community.document_loaders import WikipediaLoader
21
  from langchain_community.document_loaders import ArxivLoader
22
  from langchain_community.tools.tavily_search import TavilySearchResults
 
754
 
755
  IMPORTANT: Make sure your JSON is properly formatted with double quotes around keys and string values.
756
 
757
+ If you do not want to use any tool AND have not yet arrived at a solution, call the python_code tool with an empty string as the code.
758
+
759
  Example use for tools:
760
 
761
  ```json
 
963
  # Combine system message with the (potentially pruned) core messages
964
  messages_for_llm = [system_msg] + llm_input_core_messages
965
 
 
 
 
 
 
966
  # Get response from the assistant
967
  response = chat_with_tools.invoke(messages_for_llm, stop=["Observation:"])
968
  print(f"Assistant response type: {type(response)}")
 
969
  content_preview = response.content[:300].replace('\n', ' ')
970
  print(f"Response content (first 300 chars): {content_preview}...")
971
 
 
983
  if action_json and "action" in action_json and "action_input" in action_json:
984
  tool_name = action_json["action"]
985
  tool_input = action_json["action_input"]
986
+
987
+ # Handle nested JSON issue - if action_input is a string containing JSON
988
+ if tool_name == "python_code" and isinstance(tool_input, dict) and "code" in tool_input:
989
+ code = tool_input["code"]
990
+ if code.startswith("{") and ("action" in code or "action_input" in code):
991
+ try:
992
+ # Try to see if this is a nested JSON structure
993
+ nested_json = json.loads(code)
994
+ if isinstance(nested_json, dict) and "action" in nested_json and "action_input" in nested_json:
995
+ # Replace with the nested structure
996
+ tool_name = nested_json["action"]
997
+ tool_input = nested_json["action_input"]
998
+ print(f"Unwrapped nested JSON. New tool: {tool_name}")
999
+ print(f"New tool input: {tool_input}")
1000
+ except:
1001
+ # If it fails, keep original values
1002
+ pass
1003
+
1004
+ print(f"Using tool: {tool_name}")
1005
  print(f"Tool input: {tool_input}")
1006
 
1007
  tool_call_id = f"call_{random.randint(1000000, 9999999)}"
1008
 
1009
  state_update["current_tool"] = tool_name
1010
  state_update["action_input"] = tool_input
 
1011
  else:
1012
  print("No tool action found or 'Final Answer' detected in response.")
1013
  state_update["current_tool"] = None
1014
  state_update["action_input"] = None
1015
+
1016
  return state_update
1017
 
1018
  def extract_json_from_text(text: str) -> dict:
1019
  """Extract JSON from text, handling markdown code blocks."""
1020
  try:
1021
+ import re
1022
 
1023
  print(f"Attempting to extract JSON from text: {text[:200]}...")
1024
 
1025
+ # First, clean up the text to handle specific patterns that might confuse parsing
1026
+ text = text.replace('\\n', '\n').replace('\\"', '"')
1027
+
1028
+ # Pattern 1: Look for "Action:" followed by a markdown code block
 
 
1029
  action_match = re.search(r"Action:\s*```(?:python|json)?\s*(.*?)```", text, re.DOTALL)
1030
  if action_match:
1031
  action_content = action_match.group(1).strip()
1032
  print(f"Found action content from markdown block: {action_content[:100]}...")
1033
 
1034
+ # Try to parse as JSON first
1035
+ try:
1036
+ parsed_json = json.loads(action_content)
1037
+ if "action" in parsed_json and "action_input" in parsed_json:
1038
+ return parsed_json
1039
+ except json.JSONDecodeError:
1040
+ # If it's Python code, create action structure
1041
+ if "=" in action_content or "import" in action_content or "print" in action_content:
1042
+ print("Detected Python code, formatting as action_input")
1043
+ return {
1044
+ "action": "python_code",
1045
+ "action_input": {"code": action_content}
1046
+ }
1047
+
1048
+ # Pattern 2: Look for regular markdown code blocks
1049
+ code_blocks = re.findall(r"```(?:json|python)?(.+?)```", text, re.DOTALL)
1050
+ for block in code_blocks:
1051
+ block = block.strip()
1052
+ print(f"Processing code block: {block[:100]}...")
1053
+
1054
+ # Try to parse as JSON
1055
+ try:
1056
+ parsed = json.loads(block)
1057
+ if "action" in parsed and "action_input" in parsed:
1058
+ print(f"Successfully parsed JSON block: {parsed}")
1059
+ return parsed
1060
+ except json.JSONDecodeError:
1061
+ # If it's Python code, create action structure
1062
+ if "=" in block or "import" in block or "print" in block or "def " in block:
1063
+ print("Detected Python code in block, formatting as action_input")
1064
+ return {
1065
+ "action": "python_code",
1066
+ "action_input": {"code": block}
1067
+ }
1068
+
1069
+ # Pattern 3: Direct JSON object ({...}) in the text
1070
+ json_matches = re.findall(r"\{[\s\S]*?\}", text)
1071
+ for json_str in json_matches:
1072
+ try:
1073
+ parsed = json.loads(json_str)
1074
+ if "action" in parsed and "action_input" in parsed:
1075
+ print(f"Found valid JSON object: {parsed}")
1076
+ return parsed
1077
+ except json.JSONDecodeError:
1078
+ continue
1079
+
1080
+ # Pattern 4: Look for patterns like 'action': 'tool_name', 'action_input': {...}
1081
+ action_pattern = re.search(r"['\"](action)['\"]:\s*['\"](\w+)['\"]", text)
1082
+ action_input_pattern = re.search(r"['\"](action_input)['\"]:\s*(\{.+\})", text, re.DOTALL)
1083
+
1084
+ if action_pattern and action_input_pattern:
1085
+ action = action_pattern.group(2)
1086
+ action_input_str = action_input_pattern.group(2)
1087
+
1088
+ try:
1089
+ action_input = json.loads(action_input_str)
1090
  return {
1091
+ "action": action,
1092
+ "action_input": action_input
1093
  }
1094
+ except json.JSONDecodeError:
1095
+ pass
1096
 
1097
+ print("Could not extract valid JSON from text using any pattern")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1098
  return None
1099
 
1100
  except Exception as e:
tools.py ADDED
@@ -0,0 +1,625 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from dotenv import load_dotenv
3
+ from typing import Dict, Any, Optional, Union, List
4
+ from pathlib import Path
5
+ import tempfile
6
+ import base64
7
+ import json
8
+ import requests
9
+ from urllib.parse import urlparse
10
+ from bs4 import BeautifulSoup
11
+ import html2text
12
+ import pandas as pd
13
+ from tabulate import tabulate
14
+ from langchain_community.document_loaders import WikipediaLoader
15
+ from langchain_community.document_loaders import ArxivLoader
16
+ from langchain_community.tools.tavily_search import TavilySearchResults
17
+ from supabase import create_client, Client
18
+
19
+ load_dotenv()
20
+
21
def run_python_code(code: str):
    """Execute a snippet of Python code in a restricted in-process sandbox.

    The snippet is first screened against a blacklist of dangerous calls and
    an allow-list of import statements, then executed with a reduced builtins
    table and a whitelist of safe stdlib modules (plus numpy/pandas when
    installed).

    Args:
        code: Python source code to execute.

    Returns:
        Captured stdout as a string, "Result: <value>" for a bare expression,
        a success message when there is no output, or an "Error ..." message.

    NOTE(review): substring blacklisting of exec()'d code is best-effort only
    and is not a real security boundary; it can be bypassed.
    """
    # Substrings that immediately reject the snippet.
    dangerous_operations = [
        "os.system", "os.popen", "os.unlink", "os.remove",
        "subprocess.run", "subprocess.call", "subprocess.Popen",
        "shutil.rmtree", "shutil.move", "shutil.copy",
        "open(", "file(", "eval(", "exec(",
        "__import__", "input(", "raw_input(",
        "__builtins__", "globals(", "locals(",
        "compile(", "execfile(", "reload("
    ]

    # Import statements the snippet is allowed to contain.
    safe_imports = {
        "import datetime", "import math", "import random",
        "import statistics", "import collections", "import itertools",
        "import re", "import json", "import csv", "import numpy",
        "import pandas", "from math import", "from datetime import",
        "from statistics import", "from collections import",
        "from itertools import"
    }

    for dangerous_op in dangerous_operations:
        if dangerous_op in code:
            return f"Error: Code contains potentially unsafe operations: {dangerous_op}"

    # Reject any import line that is not on the allow-list.
    for line in code.splitlines():
        line = line.strip()
        if line.startswith("import ") or line.startswith("from "):
            is_safe = any(line.startswith(safe_import) for safe_import in safe_imports)
            # Also allow basic numpy/pandas imports.
            is_safe = is_safe or line.startswith("import numpy") or line.startswith("import pandas")
            if not is_safe:
                return f"Error: Code contains potentially unsafe import: {line}"

    try:
        # Capture stdout to collect print output.
        import io
        from contextlib import redirect_stdout

        # Minimal builtins table exposed to the snippet.
        restricted_globals = {
            '__builtins__': {
                'abs': abs, 'all': all, 'any': any, 'bin': bin, 'bool': bool,
                'chr': chr, 'dict': dict, 'dir': dir, 'divmod': divmod,
                'enumerate': enumerate, 'filter': filter, 'float': float,
                'format': format, 'hex': hex, 'int': int, 'len': len,
                'list': list, 'map': map, 'max': max, 'min': min, 'oct': oct,
                'ord': ord, 'pow': pow, 'print': print, 'range': range,
                'reversed': reversed, 'round': round, 'set': set, 'slice': slice,
                'sorted': sorted, 'str': str, 'sum': sum, 'tuple': tuple,
                'type': type, 'zip': zip,
            }
        }

        # Pre-import the whitelisted stdlib modules for the snippet.
        import math
        import datetime
        import random
        import statistics
        import collections
        import itertools
        import re
        import json
        import csv

        restricted_globals['math'] = math
        restricted_globals['datetime'] = datetime
        restricted_globals['random'] = random
        restricted_globals['statistics'] = statistics
        restricted_globals['collections'] = collections
        restricted_globals['itertools'] = itertools
        restricted_globals['re'] = re
        restricted_globals['json'] = json
        restricted_globals['csv'] = csv

        # numpy and pandas are optional extras.
        try:
            import numpy as np
            restricted_globals['numpy'] = np
            restricted_globals['np'] = np
        except ImportError:
            pass

        try:
            import pandas as pd
            restricted_globals['pandas'] = pd
            restricted_globals['pd'] = pd
        except ImportError:
            pass

        local_scope = {}
        captured_output = io.StringIO()

        with redirect_stdout(captured_output):
            lines = code.strip().split('\n')
            # A single line with no statement keywords is treated as an expression.
            if len(lines) == 1 and not any(keyword in code for keyword in ['=', 'import', 'from', 'def', 'class', 'if', 'for', 'while', 'try', 'with']):
                try:
                    result = eval(code, restricted_globals, local_scope)
                    # Bug fix: only report a result when the expression produced
                    # one; previously a bare print(...) call also emitted a
                    # spurious "Result: None" line.
                    if result is not None:
                        print(f"Result: {result}")
                except Exception:  # bug fix: was a bare except
                    # Not a valid expression after all -- run it as a statement.
                    exec(code, restricted_globals, local_scope)
            else:
                # Multi-line code: execute the whole block.
                exec(code, restricted_globals, local_scope)

        output = captured_output.getvalue()

        if output.strip():
            return output.strip()

        # No stdout: try to evaluate the last line as an expression result.
        lines = code.strip().split('\n')
        last_line = lines[-1].strip() if lines else ""
        if last_line and not any(keyword in last_line for keyword in ['=', 'import', 'from', 'def', 'class', 'if', 'for', 'while', 'try', 'with', 'print']):
            try:
                result = eval(last_line, restricted_globals, local_scope)
                return f"Result: {result}"
            except Exception:  # bug fix: was a bare except
                pass

        return "Code executed successfully with no output."

    except SyntaxError as e:
        return f"Syntax Error: {str(e)}"
    except NameError as e:
        return f"Name Error: {str(e)}"
    except ZeroDivisionError as e:
        return f"Zero Division Error: {str(e)}"
    except Exception as e:
        return f"Error executing code: {str(e)}"
166
+
167
def scrape_webpage(url: str) -> str:
    """
    Fetch a web page and return its main content as readable text.

    Args:
        url: The URL to scrape.

    Returns:
        The page title, URL and extracted text content, or an error message.
    """
    try:
        # Validate the URL shape before touching the network.
        parsed = urlparse(url)
        if not parsed.scheme or not parsed.netloc:
            return f"Error: Invalid URL format: {url}. Please provide a valid URL with http:// or https:// prefix."

        # Refuse to touch loopback / private-network hosts.
        blocked_domains = [
            "localhost", "127.0.0.1", "0.0.0.0",
            "192.168.", "10.0.", "172.16.", "172.17.", "172.18.", "172.19.", "172.20.",
            "172.21.", "172.22.", "172.23.", "172.24.", "172.25.", "172.26.", "172.27.",
            "172.28.", "172.29.", "172.30.", "172.31."
        ]
        if any(blocked in parsed.netloc for blocked in blocked_domains):
            return f"Error: Access to internal/local URLs is blocked for security: {url}"

        print(f"Scraping URL: {url}")

        # Present a browser-like identity so fewer sites refuse the request.
        request_headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Cache-Control': 'max-age=0',
        }

        # Bounded timeout so the call cannot hang indefinitely.
        response = requests.get(url, headers=request_headers, timeout=10)
        if response.status_code != 200:
            return f"Error: Failed to fetch the webpage. Status code: {response.status_code}"

        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip elements that carry no article content.
        for junk in soup(["script", "style", "iframe", "footer", "nav"]):
            junk.decompose()

        title = soup.title.string if soup.title else "No title found"

        # Prefer a dedicated content container; fall back to the whole body.
        main_content = (
            soup.find('main')
            or soup.find('article')
            or soup.find(id='content')
            or soup.find(class_='content')
            or soup.body
        )

        converter = html2text.HTML2Text()
        converter.ignore_links = False
        converter.ignore_images = True
        converter.ignore_tables = False
        converter.unicode_snob = True

        source_html = str(main_content) if main_content else response.text
        text_content = converter.handle(source_html)

        # Length guard (limit is currently effectively unbounded).
        max_content_length = 99999999999
        if len(text_content) > max_content_length:
            text_content = text_content[:max_content_length] + "\n\n[Content truncated due to length...]"

        return f"Title: {title}\nURL: {url}\n\n{text_content}"

    except requests.exceptions.Timeout:
        return f"Error: Request timed out while trying to access {url}"
    except requests.exceptions.ConnectionError:
        return f"Error: Failed to connect to {url}. The site might be down or the URL might be incorrect."
    except requests.exceptions.RequestException as e:
        return f"Error requesting {url}: {str(e)}"
    except Exception as e:
        return f"Error scraping webpage {url}: {str(e)}"
265
+
266
def wikipedia_search(query: str, num_results: int = 3) -> str:
    """
    Search Wikipedia for information about a specific query.

    Args:
        query: Search query.
        num_results: Number of search results to return (default: 3).

    Returns:
        Formatted Wikipedia search results, or an error message.
    """
    try:
        # Validate input
        if not query or not isinstance(query, str):
            return "Error: Please provide a valid search query."

        # Coerce num_results, falling back to the default of 3 when it is
        # non-numeric or non-positive.
        try:
            num_results = int(num_results)
            if num_results <= 0:
                num_results = 3
        except (TypeError, ValueError):  # bug fix: was a bare except
            num_results = 3

        print(f"Searching Wikipedia for: {query}")

        # Use WikipediaLoader from LangChain
        loader = WikipediaLoader(query=query, load_max_docs=num_results)
        docs = loader.load()

        if not docs:
            return f"No Wikipedia results found for '{query}'. Try refining your search."

        # Format the results
        formatted_results = f"Wikipedia search results for '{query}':\n\n"

        for i, doc in enumerate(docs, 1):
            title = doc.metadata.get('title', 'Unknown Title')
            source = doc.metadata.get('source', 'No URL')
            content = doc.page_content

            # Keep each entry short so the model is not flooded.
            if len(content) > 500:
                content = content[:500] + "..."

            formatted_results += f"{i}. {title}\n"
            formatted_results += f" URL: {source}\n"
            formatted_results += f" {content}\n\n"

        return formatted_results

    except Exception as e:
        return f"Error searching Wikipedia: {str(e)}"
319
+
320
def tavily_search(query: str, search_depth: str = "basic") -> str:
    """
    Search the web using the Tavily Search API.

    Args:
        query: Search query.
        search_depth: Depth of search ('basic' or 'advanced'; the legacy
            value 'comprehensive' is mapped to 'advanced').

    Returns:
        Formatted search results from Tavily, or an error message.
    """
    try:
        # Check for API key
        tavily_api_key = os.environ.get("TAVILY_API_KEY")
        if not tavily_api_key:
            return "Error: Tavily API key not found. Please set the TAVILY_API_KEY environment variable."

        # Validate input
        if not query or not isinstance(query, str):
            return "Error: Please provide a valid search query."

        # Bug fix: the Tavily API accepts 'basic' or 'advanced'; the old
        # 'comprehensive' value is not valid upstream and the valid
        # 'advanced' was being downgraded to 'basic'. Map 'comprehensive'
        # to 'advanced' for backward compatibility; anything else falls
        # back to 'basic'.
        if search_depth == "comprehensive":
            search_depth = "advanced"
        elif search_depth not in ("basic", "advanced"):
            search_depth = "basic"

        print(f"Searching Tavily for: {query} (depth: {search_depth})")

        # Initialize the Tavily search tool
        search = TavilySearchResults(api_key=tavily_api_key)

        # Execute the search
        results = search.invoke({"query": query, "search_depth": search_depth})

        if not results:
            return f"No Tavily search results found for '{query}'. Try refining your search."

        # Format the results
        formatted_results = f"Tavily search results for '{query}':\n\n"

        for i, result in enumerate(results, 1):
            formatted_results += f"{i}. {result.get('title', 'No title')}\n"
            formatted_results += f" URL: {result.get('url', 'No URL')}\n"
            formatted_results += f" {result.get('content', 'No content')}\n\n"

        return formatted_results

    except Exception as e:
        return f"Error searching with Tavily: {str(e)}"
368
+
369
def arxiv_search(query: str, max_results: int = 5) -> str:
    """
    Search ArXiv for scientific papers matching the query.

    Args:
        query: Search query for ArXiv.
        max_results: Maximum number of results to return (1-10, default: 5).

    Returns:
        Formatted ArXiv search results, or an error message.
    """
    try:
        # Validate input
        if not query or not isinstance(query, str):
            return "Error: Please provide a valid search query."

        # Coerce max_results, falling back to the default of 5 when it is
        # non-numeric, non-positive, or too large.
        try:
            max_results = int(max_results)
            if max_results <= 0 or max_results > 10:
                max_results = 5
        except (TypeError, ValueError):  # bug fix: was a bare except
            max_results = 5

        print(f"Searching ArXiv for: {query}")

        # Use ArxivLoader from LangChain
        loader = ArxivLoader(
            query=query,
            load_max_docs=max_results,
            load_all_available_meta=True
        )

        docs = loader.load()

        if not docs:
            return f"No ArXiv papers found for '{query}'. Try refining your search."

        # Format the results
        formatted_results = f"ArXiv papers for '{query}':\n\n"

        for i, doc in enumerate(docs, 1):
            meta = doc.metadata
            title = meta.get('Title', 'Unknown Title')
            url = meta.get('Entry ID', 'No URL')
            authors = meta.get('Authors', 'Unknown Authors')
            published = meta.get('Published', 'Unknown Date')

            formatted_results += f"{i}. {title}\n"
            formatted_results += f" URL: {url}\n"
            formatted_results += f" Authors: {authors}\n"
            formatted_results += f" Published: {published}\n"

            # Add abstract, truncated if too long.
            abstract = doc.page_content.replace('\n', ' ')
            if len(abstract) > 300:
                abstract = abstract[:300] + "..."
            formatted_results += f" Abstract: {abstract}\n\n"

        return formatted_results

    except Exception as e:
        return f"Error searching ArXiv: {str(e)}"
432
+
433
def supabase_operation(operation_type: str, table: str, data: dict = None, filters: dict = None) -> str:
    """
    Perform operations on a Supabase database.

    Args:
        operation_type: One of 'insert', 'select', 'update', 'delete'.
        table: Name of the table to operate on.
        data: Row data for insert/update operations.
        filters: Equality filters (e.g. {"id": 1}) for select/update/delete.

    Returns:
        A human-readable result string, or an error message.
    """
    try:
        # Credentials come from the environment.
        supabase_url = os.environ.get("SUPABASE_URL")
        supabase_key = os.environ.get("SUPABASE_ANON_KEY")
        if not supabase_url or not supabase_key:
            return "Error: Supabase credentials not found. Please set SUPABASE_URL and SUPABASE_ANON_KEY environment variables."

        client = create_client(supabase_url, supabase_key)

        # Validate inputs.
        if not table:
            return "Error: Table name is required."
        if operation_type not in ('insert', 'select', 'update', 'delete'):
            return "Error: Invalid operation type. Use 'insert', 'select', 'update', or 'delete'."

        def apply_filters(query):
            # Chain an equality condition for every filter entry.
            for column, value in (filters or {}).items():
                query = query.eq(column, value)
            return query

        if operation_type == 'insert':
            if not data:
                return "Error: Data is required for insert operation."
            result = client.table(table).insert(data).execute()
            return f"Insert successful: {len(result.data)} row(s) inserted into {table}"

        if operation_type == 'select':
            result = apply_filters(client.table(table).select("*")).execute()
            return f"Select successful: Found {len(result.data)} row(s) in {table}\nData: {json.dumps(result.data, indent=2)}"

        if operation_type == 'update':
            if not data or not filters:
                return "Error: Both data and filters are required for update operation."
            result = apply_filters(client.table(table).update(data)).execute()
            return f"Update successful: {len(result.data)} row(s) updated in {table}"

        # Remaining case: delete.
        if not filters:
            return "Error: Filters are required for delete operation."
        apply_filters(client.table(table).delete()).execute()
        return f"Delete successful: Rows deleted from {table}"

    except Exception as e:
        return f"Error performing Supabase operation: {str(e)}"
511
+
512
def excel_to_text(excel_path: str, sheet_name: Optional[str] = None, file_content: Optional[bytes] = None) -> str:
    """
    Read an Excel file and return a Markdown table of the requested sheet.

    Args:
        excel_path: Path to the Excel file (.xlsx or .xls), or a display name
            when the workbook is supplied via ``file_content``.
        sheet_name: Optional sheet name or (stringified) sheet index. When
            None, the first sheet is read.
        file_content: Optional raw bytes of the workbook when provided as an
            attachment instead of a path on disk.

    Returns:
        A Markdown table representing the Excel sheet, or a human-readable
        error message if the file is not found or cannot be read.
    """
    temp_path: Optional[str] = None
    try:
        if file_content:
            # Attachment case: persist the bytes so pandas can open a real file.
            with tempfile.NamedTemporaryFile(suffix='.xlsx', delete=False) as temp_file:
                temp_file.write(file_content)
                temp_path = temp_file.name

            print(f"Saved attached Excel file to temporary location: {temp_path}")
            file_path = Path(temp_path)
        else:
            # Regular file path case.
            file_path = Path(excel_path).expanduser().resolve()
            if not file_path.is_file():
                return f"Error: Excel file not found at {file_path}"

        # Accept either a sheet name or a numeric index; tool arguments parsed
        # from JSON may deliver the index as an int, so normalize via str().
        sheet: Union[str, int] = (
            int(sheet_name)
            if sheet_name and str(sheet_name).isdigit()
            else sheet_name or 0
        )

        df = pd.read_excel(file_path, sheet_name=sheet)

        if hasattr(df, "to_markdown"):
            return df.to_markdown(index=False)

        return tabulate(df, headers="keys", tablefmt="github", showindex=False)

    except Exception as e:
        return f"Error reading Excel file: {e}"
    finally:
        # Single cleanup path for the temporary attachment file, reached on
        # both success and failure (replaces the duplicated unlink logic).
        if temp_path and os.path.exists(temp_path):
            os.unlink(temp_path)
            print(f"Deleted temporary Excel file: {temp_path}")
565
+
566
def save_attachment_to_tempfile(file_content_b64: str, file_extension: str = '.xlsx') -> Optional[str]:
    """
    Decode a base64 file content and save it to a temporary file.

    Args:
        file_content_b64: Base64 encoded file content.
        file_extension: Suffix for the temporary file (kept so downstream
            tools that sniff type by extension still work).

    Returns:
        Path to the saved temporary file, or None if decoding or writing
        failed. (Annotation fixed: the error path returns None, so the
        return type is Optional[str], not str.)
    """
    try:
        file_content = base64.b64decode(file_content_b64)

        # delete=False: the caller is responsible for removing the file once
        # the downstream tool has consumed it.
        with tempfile.NamedTemporaryFile(suffix=file_extension, delete=False) as temp_file:
            temp_file.write(file_content)
            temp_path = temp_file.name

        print(f"Saved attachment to temporary file: {temp_path}")
        return temp_path

    except Exception as e:
        # Best-effort helper: report the problem and signal failure via None
        # rather than propagating, matching how callers check the result.
        print(f"Error saving attachment: {e}")
        return None
592
+
593
# Registry of the tools exposed to the agent. Each entry pairs a tool name
# with the usage description shown to the model and the callable that
# implements it.
tools_config = [
    {"name": tool_name, "description": tool_description, "func": tool_func}
    for tool_name, tool_description, tool_func in (
        (
            "python_code",
            "Execute Python code. Provide the complete Python code as a string in the format: {\"code\": \"your python code here\"}",
            run_python_code,
        ),
        (
            "wikipedia_search",
            "Search Wikipedia for information about a specific topic. Provide a query in the format: {\"query\": \"your topic\", \"num_results\": 3}",
            wikipedia_search,
        ),
        (
            "tavily_search",
            "Search the web using Tavily for more comprehensive results. Provide a query in the format: {\"query\": \"your search query\", \"search_depth\": \"basic\"}",
            tavily_search,
        ),
        (
            "arxiv_search",
            "Search ArXiv for scientific papers. Provide a query in the format: {\"query\": \"your research topic\", \"max_results\": 5}",
            arxiv_search,
        ),
        (
            "supabase_operation",
            "Perform database operations on Supabase (insert, select, update, delete). Provide operation_type, table name, and optional data/filters. ",
            supabase_operation,
        ),
        (
            "excel_to_text",
            "Read an Excel file and return a Markdown table. You can provide either the path to an Excel file or use a file attachment. For attachments, provide a base64-encoded string of the file content and a filename.",
            excel_to_text,
        ),
    )
]