mdicio committed
Commit 8c5bbef · 1 Parent(s): 81917a3
Files changed (6)
  1. agent.py +287 -0
  2. app.py +67 -35
  3. app_template.py +196 -0
  4. requirements copy.txt +23 -0
  5. requirements.txt +22 -1
  6. tools.py +1114 -0
agent.py ADDED
@@ -0,0 +1,287 @@
+import os
+
+from dotenv import load_dotenv
+
+# Import models from SmolaAgents
+from smolagents import CodeAgent, LiteLLMModel, OpenAIServerModel
+
+# Import SmolaAgents tools
+from smolagents.default_tools import FinalAnswerTool, PythonInterpreterTool
+
+# Import custom tools
+from tools import (
+    AddDocumentToVectorStoreTool,
+    ArxivSearchTool,
+    DownloadFileFromLinkTool,
+    DuckDuckGoSearchTool,
+    QueryVectorStoreTool,
+    ReadFileContentTool,
+    TranscibeVideoFileTool,
+    TranscribeAudioTool,
+    VisitWebpageTool,
+    WikipediaSearchTool,
+    image_question_answering,
+)
+
+# Import utility functions
+from utils import extract_final_answer, replace_tool_mentions
+
+
+class BoomBot:
+    def __init__(self, provider="meta"):
+        """
+        Initialize the BoomBot with the specified provider.
+
+        Args:
+            provider (str): The model provider to use (e.g., "groq", "qwen", "gemma", "anthropic", "deepinfra", "meta")
+        """
+        load_dotenv()
+        self.provider = provider
+        self.model = self._initialize_model()
+        self.agent = self._create_agent()
+
+    def _initialize_model(self):
+        """
+        Initialize the appropriate model based on the provider.
+
+        Returns:
+            The initialized model object
+        """
+        if self.provider == "qwen":
+            qwen_model = "ollama_chat/qwen3:8b"
+            return LiteLLMModel(
+                model_id=qwen_model,
+                device="cuda",
+                num_ctx=32768,
+                temperature=0.6,
+                top_p=0.95,
+            )
+        elif self.provider == "gemma":
+            gemma_model = "ollama_chat/gemma3:12b-it-qat"
+            return LiteLLMModel(
+                model_id=gemma_model,
+                num_ctx=65536,
+                temperature=1.0,
+                device="cuda",
+                top_k=64,
+                top_p=0.95,
+                min_p=0.0,
+            )
+        elif self.provider == "anthropic":
+            model_id = "anthropic/claude-3-5-sonnet-latest"
+            return LiteLLMModel(model_id=model_id, temperature=0.6, max_tokens=8192)
+        elif self.provider == "deepinfra":
+            deepinfra_model = "Qwen/Qwen3-235B-A22B"
+            return OpenAIServerModel(
+                model_id=deepinfra_model,
+                api_base="https://api.deepinfra.com/v1/openai",
+                # api_key=os.environ["DEEPINFRA_API_KEY"],
+                flatten_messages_as_text=True,
+                max_tokens=8192,
+                temperature=0.1,
+            )
+        elif self.provider == "meta":
+            meta_model = "meta-llama/Llama-3.3-70B-Instruct-Turbo"
+            return OpenAIServerModel(
+                model_id=meta_model,
+                api_base="https://api.deepinfra.com/v1/openai",
+                # api_key=os.environ["DEEPINFRA_API_KEY"],
+                flatten_messages_as_text=True,
+                max_tokens=8192,
+                temperature=0.7,
+            )
+        elif self.provider == "groq":
+            # NOTE: this model_id routes to Anthropic's Claude 3 Opus via LiteLLM, not a Groq-hosted model.
+            model_id = "claude-3-opus-20240229"
+            return LiteLLMModel(model_id=model_id, temperature=0.7, max_tokens=8192)
+        else:
+            raise ValueError(f"Unsupported provider: {self.provider}")
+
+    def _create_agent(self):
+        """
+        Create and configure the agent with all necessary tools.
+
+        Returns:
+            The configured CodeAgent
+        """
+        # Initialize tools
+        download_file = DownloadFileFromLinkTool()
+        read_file_content = ReadFileContentTool()
+        visit_webpage = VisitWebpageTool()
+        transcribe_video = TranscibeVideoFileTool()
+        transcribe_audio = TranscribeAudioTool()
+        get_wikipedia_info = WikipediaSearchTool()
+        web_searcher = DuckDuckGoSearchTool()
+        arxiv_search = ArxivSearchTool()
+        add_doc_vectorstore = AddDocumentToVectorStoreTool()
+        retrieve_doc_vectorstore = QueryVectorStoreTool()
+
+        # SmolaAgents default tools
+        python_interpreter = PythonInterpreterTool()
+        final_answer = FinalAnswerTool()
+
+        # Combine all tools
+        agent_tools = [
+            web_searcher,
+            download_file,
+            read_file_content,
+            visit_webpage,
+            transcribe_video,
+            transcribe_audio,
+            get_wikipedia_info,
+            arxiv_search,
+            add_doc_vectorstore,
+            retrieve_doc_vectorstore,
+            image_question_answering,
+            python_interpreter,
+            final_answer,
+        ]
+
+        # Additional imports for the Python interpreter
+        additional_imports = [
+            "json",
+            "os",
+            "glob",
+            "pathlib",
+            "pandas",
+            "numpy",
+            "matplotlib",
+            "seaborn",
+            "sklearn",
+            "tqdm",
+            "argparse",
+            "pickle",
+            "io",
+            "re",
+            "datetime",
+            "collections",
+            "math",
+            "random",
+            "csv",
+            "zipfile",
+            "itertools",
+            "functools",
+        ]
+
+        # Create the agent
+        agent = CodeAgent(
+            tools=agent_tools,
+            max_steps=12,
+            model=self.model,
+            add_base_tools=False,
+            stream_outputs=True,
+            additional_authorized_imports=additional_imports,
+        )
+
+        # Modify the system prompt
+        modified_prompt = replace_tool_mentions(agent.system_prompt)
+        agent.system_prompt = modified_prompt
+
+        return agent
+
+    def _get_system_prompt(self):
+        """
+        Return the system prompt for the agent.
+
+        Returns:
+            str: The system prompt
+        """
+        return """
+YOUR BEHAVIOR GUIDELINES:
+  • Do NOT make unfounded assumptions; always ground answers in reliable sources or search results.
+  • For math or puzzles: break the problem into code/math, then solve programmatically.
+
+RESEARCH WORKFLOW (in rough priority order):
+  1. SEARCH
+     - Try web_search, wikipedia_search, or arxiv_search first.
+     - Refine your query rather than repeating the exact same terms.
+     - If one search tool yields insufficient info, switch to another before downloading.
+  2. VISIT
+     - Use visit_webpage to extract and read page content when a promising link appears in the SEARCH results.
+     - For each visited link, also download the file and add it to the vector store; you may need to query it later, especially if you have many search results.
+  3. EVALUATE
+     - ✅ If the page or search snippet fully answers the question, respond immediately.
+     - ❌ If not, move on to deeper investigation.
+  4. DOWNLOAD
+     - Use the download_file_from_link tool on relevant links you find (yes, you can download webpages as HTML).
+     - For arXiv papers, target the /pdf/ or DOI link (e.g., https://arxiv.org/pdf/2011.10672).
+  5. INDEX & QUERY
+     - Add downloaded documents to the vector store with add_document_to_vector_store.
+     - Use query_downloaded_documents for detailed answers.
+  6. READ
+     - You have access to a read_file_content tool to read most types of files. You can also interact with downloaded files directly in your Python code (do this for CSV and Excel files).
+
+FALLBACK & ADAPTATION:
+  • If a tool fails, reformulate your query or try a different search method before falling back to downloading.
+  • If a tool fails multiple times, try a different tool.
+  • For arXiv: you might discover a paper link via the web_search tool and then use the download_file_from_link tool directly.
+
+COMMON TOOL CHAINS (conceptual outlines):
+These are just guidelines; each task might require a unique workflow.
+A tool can provide useful information for the task, but it will not always contain the answer. You need to work to reach a final_answer that makes sense.
+
+  • FACTUAL Qs:
+      web_search → final_answer
+  • CURRENT EVENTS:
+      Use web_search for summary information; it may surface a promising website to visit and read content from (visit_webpage, or download_file_from_link and read_file_content).
+      web_search → visit_webpage → final_answer
+  • DOCUMENT-BASED Qs:
+      web_search → download_file_from_link → add_document_to_vector_store → query_downloaded_documents → final_answer
+  • ARXIV PAPERS:
+      The arxiv_search tool provides a list of results with summary content; to inspect the whole paper you need to download it with the download_file_from_link tool.
+      arxiv_search → download_file_from_link → read_file_content
+      If that fails:
+      arxiv_search → download_file_from_link → add_document_to_vector_store → query_downloaded_documents
+  • MEDIA ANALYSIS:
+      download_file_from_link → transcribe_video/transcribe_audio/describe_image → final_answer
+
+FINAL ANSWER FORMAT:
+  - Begin with "FINAL ANSWER: "
+  - Number → digits only (e.g., 42)
+  - String → exact text (e.g., Pope Francis)
+  - List → comma-separated, one space (e.g., 2, 3, 4)
+  - Conclude with: FINAL ANSWER: <your_answer>
+"""
+
+    def run(self, question: str, task_id: str, to_download) -> str:
+        """
+        Run the agent with the given question, task_id, and download flag.
+
+        Args:
+            question (str): The question or task for the agent to process
+            task_id (str): A unique identifier for the task
+            to_download (bool): Flag indicating whether to download resources
+
+        Returns:
+            str: The agent's response
+        """
+        prompt = self._get_system_prompt()
+        # Task introduction
+        prompt += "\nHere is the Task you need to solve:\n\n"
+        prompt += f"Task: {question}\n\n"
+
+        # Include download instructions if applicable
+        if to_download:
+            link = f"https://agents-course-unit4-scoring.hf.space/files/{task_id}"
+            prompt += (
+                "IMPORTANT: Before solving the task, you must download a required file.\n"
+                f"Use the `download_file_from_link` tool with this link: {link}\n"
+                "After downloading, use the appropriate tool to read or process the file "
+                "before attempting to solve the task.\n\n"
+            )
+
+        # Run the agent with the full prompt (system guidelines + task)
+        result = self.agent.run(prompt)
+
+        # Extract the final answer from the result
+        final_answer = extract_final_answer(result)
+
+        return final_answer
+
+
+# Example of how to use this code (commented out)
+# if __name__ == "__main__":
+#     agent = BoomBot(provider="deepinfra")
+#     response = agent.run("What is the current population of Tokyo?", "population_query", True)
+#     print(f"Response: {response}")
app.py CHANGED
@@ -1,34 +1,38 @@
+# app.py
 import os
+
 import gradio as gr
-import requests
-import inspect
 import pandas as pd
+import requests
+
+from agent import BoomBot
 
 # (Keep Constants as is)
 # --- Constants ---
 DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
 
-# --- Basic Agent Definition ---
-# ----- THIS IS WERE YOU CAN BUILD WHAT YOU WANT ------
+
+# --- Basic Agent Definition --
 class BasicAgent:
     def __init__(self):
         print("BasicAgent initialized.")
-    def __call__(self, question: str) -> str:
+        self.agent = BoomBot(provider="deepinfra")
+
+    def __call__(self, question: str, task_id: str, to_download) -> str:
         print(f"Agent received question (first 50 chars): {question[:50]}...")
-        fixed_answer = "This is a default answer."
-        print(f"Agent returning fixed answer: {fixed_answer}")
-        return fixed_answer
+        return self.agent.run(question, task_id, to_download)
+
 
-def run_and_submit_all( profile: gr.OAuthProfile | None):
+def run_and_submit_all(profile: gr.OAuthProfile | None):
     """
     Fetches all questions, runs the BasicAgent on them, submits all answers,
     and displays the results.
     """
     # --- Determine HF Space Runtime URL and Repo URL ---
-    space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
+    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code
 
     if profile:
-        username= f"{profile.username}"
+        username = f"{profile.username}"
         print(f"User logged in: {username}")
     else:
         print("User not logged in.")
@@ -55,16 +59,16 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
         response.raise_for_status()
         questions_data = response.json()
         if not questions_data:
-            print("Fetched questions list is empty.")
-            return "Fetched questions list is empty or invalid format.", None
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
         print(f"Fetched {len(questions_data)} questions.")
     except requests.exceptions.RequestException as e:
         print(f"Error fetching questions: {e}")
         return f"Error fetching questions: {e}", None
     except requests.exceptions.JSONDecodeError as e:
-        print(f"Error decoding JSON response from questions endpoint: {e}")
-        print(f"Response text: {response.text[:500]}")
-        return f"Error decoding server response for questions: {e}", None
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
     except Exception as e:
         print(f"An unexpected error occurred fetching questions: {e}")
         return f"An unexpected error occurred fetching questions: {e}", None
@@ -76,23 +80,48 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
     for item in questions_data:
         task_id = item.get("task_id")
         question_text = item.get("question")
+        file_name = item.get("file_name", "")
+
+        if file_name.strip() != "":
+            to_download = True
+        else:
+            to_download = False
+
         if not task_id or question_text is None:
             print(f"Skipping item with missing task_id or question: {item}")
             continue
         try:
-            submitted_answer = agent(question_text)
-            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+            submitted_answer = agent(question_text, task_id, to_download=to_download)
+            answers_payload.append(
+                {"task_id": task_id, "submitted_answer": submitted_answer}
+            )
+            results_log.append(
+                {
+                    "Task ID": task_id,
+                    "Question": question_text,
+                    "Submitted Answer": submitted_answer,
+                }
+            )
         except Exception as e:
-            print(f"Error running agent on task {task_id}: {e}")
-            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append(
+                {
+                    "Task ID": task_id,
+                    "Question": question_text,
+                    "Submitted Answer": f"AGENT ERROR: {e}",
+                }
+            )
 
     if not answers_payload:
         print("Agent did not produce any answers to submit.")
         return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
 
-    # 4. Prepare Submission
-    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+    # 4. Prepare Submission
+    submission_data = {
+        "username": username.strip(),
+        "agent_code": agent_code,
+        "answers": answers_payload,
+    }
     status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
     print(status_update)
@@ -162,20 +191,19 @@ with gr.Blocks() as demo:
 
     run_button = gr.Button("Run Evaluation & Submit All Answers")
 
-    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+    status_output = gr.Textbox(
+        label="Run Status / Submission Result", lines=5, interactive=False
+    )
     # Removed max_rows=10 from DataFrame constructor
     results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
 
-    run_button.click(
-        fn=run_and_submit_all,
-        outputs=[status_output, results_table]
-    )
+    run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
 
 if __name__ == "__main__":
-    print("\n" + "-"*30 + " App Starting " + "-"*30)
+    print("\n" + "-" * 30 + " App Starting " + "-" * 30)
     # Check for SPACE_HOST and SPACE_ID at startup for information
     space_host_startup = os.getenv("SPACE_HOST")
-    space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
+    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup
 
     if space_host_startup:
         print(f"✅ SPACE_HOST found: {space_host_startup}")
@@ -183,14 +211,18 @@ if __name__ == "__main__":
     else:
         print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
 
-    if space_id_startup: # Print repo URLs if SPACE_ID is found
+    if space_id_startup:  # Print repo URLs if SPACE_ID is found
         print(f"✅ SPACE_ID found: {space_id_startup}")
         print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
-        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
+        print(
+            f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
+        )
    else:
-        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
+        print(
+            "ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined."
+        )
 
-    print("-"*(60 + len(" App Starting ")) + "\n")
+    print("-" * (60 + len(" App Starting ")) + "\n")
 
     print("Launching Gradio Interface for Basic Agent Evaluation...")
-    demo.launch(debug=True, share=False)
+    demo.launch(debug=True, share=False)
app_template.py ADDED
@@ -0,0 +1,196 @@
+import os
+import gradio as gr
+import requests
+import inspect
+import pandas as pd
+
+# (Keep Constants as is)
+# --- Constants ---
+DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
+
+# --- Basic Agent Definition ---
+# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
+class BasicAgent:
+    def __init__(self):
+        print("BasicAgent initialized.")
+    def __call__(self, question: str) -> str:
+        print(f"Agent received question (first 50 chars): {question[:50]}...")
+        fixed_answer = "This is a default answer."
+        print(f"Agent returning fixed answer: {fixed_answer}")
+        return fixed_answer
+
+def run_and_submit_all(profile: gr.OAuthProfile | None):
+    """
+    Fetches all questions, runs the BasicAgent on them, submits all answers,
+    and displays the results.
+    """
+    # --- Determine HF Space Runtime URL and Repo URL ---
+    space_id = os.getenv("SPACE_ID")  # Get the SPACE_ID for sending link to the code
+
+    if profile:
+        username = f"{profile.username}"
+        print(f"User logged in: {username}")
+    else:
+        print("User not logged in.")
+        return "Please Login to Hugging Face with the button.", None
+
+    api_url = DEFAULT_API_URL
+    questions_url = f"{api_url}/questions"
+    submit_url = f"{api_url}/submit"
+
+    # 1. Instantiate Agent (modify this part to create your agent)
+    try:
+        agent = BasicAgent()
+    except Exception as e:
+        print(f"Error instantiating agent: {e}")
+        return f"Error initializing agent: {e}", None
+    # In the case of an app running as a Hugging Face Space, this link points toward your codebase (useful for others, so please keep it public)
+    agent_code = f"https://huggingface.co/spaces/{space_id}/tree/main"
+    print(agent_code)
+
+    # 2. Fetch Questions
+    print(f"Fetching questions from: {questions_url}")
+    try:
+        response = requests.get(questions_url, timeout=15)
+        response.raise_for_status()
+        questions_data = response.json()
+        if not questions_data:
+            print("Fetched questions list is empty.")
+            return "Fetched questions list is empty or invalid format.", None
+        print(f"Fetched {len(questions_data)} questions.")
+    except requests.exceptions.RequestException as e:
+        print(f"Error fetching questions: {e}")
+        return f"Error fetching questions: {e}", None
+    except requests.exceptions.JSONDecodeError as e:
+        print(f"Error decoding JSON response from questions endpoint: {e}")
+        print(f"Response text: {response.text[:500]}")
+        return f"Error decoding server response for questions: {e}", None
+    except Exception as e:
+        print(f"An unexpected error occurred fetching questions: {e}")
+        return f"An unexpected error occurred fetching questions: {e}", None
+
+    # 3. Run your Agent
+    results_log = []
+    answers_payload = []
+    print(f"Running agent on {len(questions_data)} questions...")
+    for item in questions_data:
+        task_id = item.get("task_id")
+        question_text = item.get("question")
+        if not task_id or question_text is None:
+            print(f"Skipping item with missing task_id or question: {item}")
+            continue
+        try:
+            submitted_answer = agent(question_text)
+            answers_payload.append({"task_id": task_id, "submitted_answer": submitted_answer})
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": submitted_answer})
+        except Exception as e:
+            print(f"Error running agent on task {task_id}: {e}")
+            results_log.append({"Task ID": task_id, "Question": question_text, "Submitted Answer": f"AGENT ERROR: {e}"})
+
+    if not answers_payload:
+        print("Agent did not produce any answers to submit.")
+        return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
+
+    # 4. Prepare Submission
+    submission_data = {"username": username.strip(), "agent_code": agent_code, "answers": answers_payload}
+    status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
+    print(status_update)
+
+    # 5. Submit
+    print(f"Submitting {len(answers_payload)} answers to: {submit_url}")
+    try:
+        response = requests.post(submit_url, json=submission_data, timeout=60)
+        response.raise_for_status()
+        result_data = response.json()
+        final_status = (
+            f"Submission Successful!\n"
+            f"User: {result_data.get('username')}\n"
+            f"Overall Score: {result_data.get('score', 'N/A')}% "
+            f"({result_data.get('correct_count', '?')}/{result_data.get('total_attempted', '?')} correct)\n"
+            f"Message: {result_data.get('message', 'No message received.')}"
+        )
+        print("Submission successful.")
+        results_df = pd.DataFrame(results_log)
+        return final_status, results_df
+    except requests.exceptions.HTTPError as e:
+        error_detail = f"Server responded with status {e.response.status_code}."
+        try:
+            error_json = e.response.json()
+            error_detail += f" Detail: {error_json.get('detail', e.response.text)}"
+        except requests.exceptions.JSONDecodeError:
+            error_detail += f" Response: {e.response.text[:500]}"
+        status_message = f"Submission Failed: {error_detail}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except requests.exceptions.Timeout:
+        status_message = "Submission Failed: The request timed out."
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except requests.exceptions.RequestException as e:
+        status_message = f"Submission Failed: Network error - {e}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+    except Exception as e:
+        status_message = f"An unexpected error occurred during submission: {e}"
+        print(status_message)
+        results_df = pd.DataFrame(results_log)
+        return status_message, results_df
+
+
+# --- Build Gradio Interface using Blocks ---
+with gr.Blocks() as demo:
+    gr.Markdown("# Basic Agent Evaluation Runner")
+    gr.Markdown(
+        """
+        **Instructions:**
+
+        1. Please clone this space, then modify the code to define your agent's logic, the tools, the necessary packages, etc.
+        2. Log in to your Hugging Face account using the button below. This uses your HF username for submission.
+        3. Click 'Run Evaluation & Submit All Answers' to fetch questions, run your agent, submit answers, and see the score.
+
+        ---
+        **Disclaimers:**
+        Once you click the "submit" button, it can take quite some time (this is the time it takes the agent to go through all the questions).
+        This space provides a basic setup and is intentionally sub-optimal to encourage you to develop your own, more robust solution. For instance, to avoid the long-running submit step, you could cache the answers and submit them in a separate action, or even answer the questions asynchronously.
+        """
+    )
+
+    gr.LoginButton()
+
+    run_button = gr.Button("Run Evaluation & Submit All Answers")
+
+    status_output = gr.Textbox(label="Run Status / Submission Result", lines=5, interactive=False)
+    # Removed max_rows=10 from DataFrame constructor
+    results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
+
+    run_button.click(
+        fn=run_and_submit_all,
+        outputs=[status_output, results_table]
+    )
+
+if __name__ == "__main__":
+    print("\n" + "-"*30 + " App Starting " + "-"*30)
+    # Check for SPACE_HOST and SPACE_ID at startup for information
+    space_host_startup = os.getenv("SPACE_HOST")
+    space_id_startup = os.getenv("SPACE_ID")  # Get SPACE_ID at startup
+
+    if space_host_startup:
+        print(f"✅ SPACE_HOST found: {space_host_startup}")
+        print(f" Runtime URL should be: https://{space_host_startup}.hf.space")
+    else:
+        print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
+
+    if space_id_startup:  # Print repo URLs if SPACE_ID is found
+        print(f"✅ SPACE_ID found: {space_id_startup}")
+        print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
+        print(f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main")
+    else:
+        print("ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined.")
+
+    print("-"*(60 + len(" App Starting ")) + "\n")
+
+    print("Launching Gradio Interface for Basic Agent Evaluation...")
+    demo.launch(debug=True, share=False)
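The disclaimer in the template above suggests caching answers and submitting them in a separate action instead of doing everything behind one long-running button click. A minimal, hypothetical sketch of that idea (the helper names and cache path are assumptions, not part of this commit):

import json

def cache_answers(answers_payload, path="answers_cache.json"):
    # Persist generated answers locally so submission can happen in a separate step.
    with open(path, "w", encoding="utf-8") as f:
        json.dump(answers_payload, f)

def load_cached_answers(path="answers_cache.json"):
    # Reload previously cached answers for a later submit-only action.
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)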
requirements copy.txt ADDED
@@ -0,0 +1,23 @@
+beautifulsoup4
+chromadb
+duckduckgo_search
+gradio
+huggingface_hub
+langchain
+langchain-chroma
+langchain-community
+langchain-core
+langchain-groq
+langchain-huggingface
+langchain-google-genai
+langchain-tavily
+langgraph
+markdownify
+pandas
+protobuf==3.20.*
+PyMuPDF
+python-dotenv
+requests
+sentence-transformers
+smolagents
+traitlets
requirements.txt CHANGED
@@ -1,2 +1,23 @@
+beautifulsoup4
+chromadb
+duckduckgo_search
 gradio
-requests
+huggingface_hub
+langchain
+langchain-chroma
+langchain-community
+langchain-core
+langchain-groq
+langchain-huggingface
+langchain-google-genai
+langchain-tavily
+langgraph
+markdownify
+pandas
+protobuf==3.20.*
+PyMuPDF
+python-dotenv
+requests
+sentence-transformers
+smolagents
+traitlets
tools.py ADDED
@@ -0,0 +1,1114 @@
+import html
+import json
+import mimetypes
+import os
+import re
+import time
+import traceback
+from pathlib import Path
+from typing import Dict, List
+from urllib.parse import quote_plus, urlparse
+
+import chromadb
+import chromadb.utils.embedding_functions as embedding_functions
+import fitz  # PyMuPDF
+import pandas as pd
+import requests
+from bs4 import BeautifulSoup
+from duckduckgo_search import DDGS
+from duckduckgo_search.exceptions import (
+    ConversationLimitException,
+    DuckDuckGoSearchException,
+    RatelimitException,
+    TimeoutException,
+)
+from langchain_community.document_loaders import (
+    BSHTMLLoader,
+    JSONLoader,
+    PyPDFLoader,
+    TextLoader,
+    UnstructuredFileLoader,
+)
+from langchain.text_splitter import RecursiveCharacterTextSplitter
+from langchain_community.tools import BraveSearch
+from markdownify import markdownify
+from smolagents import Tool, tool
+from smolagents.utils import truncate_content
+
+
44
+ class ReadFileContentTool(Tool):
45
+ name = "read_file_content"
46
+ description = """Reads local files in various formats (text, CSV, Excel, PDF, HTML, etc.) and returns their content as readable text. Automatically detects and processes the appropriate file format."""
47
+
48
+ inputs = {
49
+ "file_path": {
50
+ "type": "string",
51
+ "description": "The full path to the file from which the content should be read.",
52
+ }
53
+ }
54
+ output_type = "string"
55
+
56
+ def forward(self, file_path: str) -> str:
57
+ if not os.path.exists(file_path):
58
+ return f"❌ File does not exist: {file_path}"
59
+
60
+ ext = os.path.splitext(file_path)[1].lower()
61
+
62
+ try:
63
+ if ext == ".txt":
64
+ with open(file_path, "r", encoding="utf-8") as f:
65
+ return truncate_content(f.read())
66
+
67
+ elif ext == ".csv":
68
+ df = pd.read_csv(file_path)
69
+ return truncate_content(
70
+ f"CSV Content:\n{df.to_string(index=False)}\n\nColumn names: {', '.join(df.columns)}"
71
+ )
72
+
73
+ elif ext in [".xlsx", ".xls"]:
74
+ df = pd.read_excel(file_path)
75
+ return truncate_content(
76
+ f"Excel Content:\n{df.to_string(index=False)}\n\nColumn names: {', '.join(df.columns)}"
77
+ )
78
+
79
+ elif ext == ".pdf":
80
+ doc = fitz.open(file_path)
81
+ text = "".join([page.get_text() for page in doc])
82
+ doc.close()
83
+ return truncate_content(
84
+ text.strip() or "⚠️ PDF contains no readable text."
85
+ )
86
+
87
+ elif ext == ".json":
88
+ with open(file_path, "r", encoding="utf-8") as f:
89
+ return truncate_content(f.read())
90
+
91
+ elif ext == ".py":
92
+ with open(file_path, "r", encoding="utf-8") as f:
93
+ return truncate_content(f.read())
94
+
95
+ elif ext in [".html", ".htm"]:
96
+ with open(file_path, "r", encoding="utf-8") as f:
97
+ html = f.read()
98
+ try:
99
+ markdown = markdownify(html).strip()
100
+ markdown = re.sub(r"\n{3,}", "\n\n", markdown)
101
+ return f"📄 HTML content (converted to Markdown):\n\n{truncate_content(markdown)}"
102
+ except Exception:
103
+ soup = BeautifulSoup(html, "html.parser")
104
+ text = soup.get_text(separator="\n").strip()
105
+ return f"📄 HTML content (raw text fallback):\n\n{truncate_content(text)}"
106
+
107
+ elif ext in [".mp3", ".wav"]:
108
+ return f"ℹ️ Audio file detected: {os.path.basename(file_path)}. Use transcribe_audio tool to process the audio content."
109
+
110
+ elif ext in [".mp4", ".mov", ".avi"]:
111
+ return f"ℹ️ Video file detected: {os.path.basename(file_path)}. Use transcribe_video tool to process the video content."
112
+
113
+ else:
114
+ return f"ℹ️ Unsupported file type: {ext}. File saved at {file_path}"
115
+
116
+ except Exception as e:
117
+ return f"❌ Could not read {file_path}: {e}"
118
+
119
+
120
+ class WikipediaSearchTool(Tool):
121
+ name = "wikipedia_search"
122
+ description = """Searches Wikipedia for a specific topic and returns a concise summary. Useful for background information on subjects, concepts, historical events, or scientific topics."""
123
+
124
+ inputs = {
125
+ "query": {
126
+ "type": "string",
127
+ "description": "The query or subject to search for on Wikipedia.",
128
+ }
129
+ }
130
+ output_type = "string"
131
+
132
+ def forward(self, query: str) -> str:
133
+ print(f"EXECUTING TOOL: wikipedia_search(query='{query}')")
134
+ try:
135
+ search_link = f"https://en.wikipedia.org/w/api.php?action=query&list=search&srsearch={query}&format=json"
136
+ search_response = requests.get(search_link, timeout=10)
137
+ search_response.raise_for_status()
138
+ search_data = search_response.json()
139
+
140
+ if not search_data.get("query", {}).get("search", []):
141
+ return f"No Wikipedia info for '{query}'."
142
+
143
+ page_id = search_data["query"]["search"][0]["pageid"]
144
+
145
+ content_link = (
146
+ f"https://en.wikipedia.org/w/api.php?action=query&prop=extracts&"
147
+ f"exintro=1&explaintext=1&pageids={page_id}&format=json"
148
+ )
149
+ content_response = requests.get(content_link, timeout=10)
150
+ content_response.raise_for_status()
151
+ content_data = content_response.json()
152
+
153
+ extract = content_data["query"]["pages"][str(page_id)]["extract"]
154
+ if len(extract) > 1500:
155
+ extract = extract[:1500] + "..."
156
+
157
+ result = f"Wikipedia summary for '{query}':\n{extract}"
158
+ print(f"-> Tool Result (Wikipedia): {result[:100]}...")
159
+ return result
160
+
161
+ except Exception as e:
162
+ print(f"❌ Error in wikipedia_search: {e}")
163
+ traceback.print_exc()
164
+ return f"Error wiki: {e}"
165
+
166
+
167
+ class TranscribeAudioTool(Tool):
168
+ name = "transcribe_audio"
169
+ description = """Converts spoken content in audio files to text. Handles various audio formats and produces a transcript of the spoken content for analysis."""
170
+
171
+ inputs = {
172
+ "file_path": {
173
+ "type": "string",
174
+ "description": "The full path to the audio file that needs to be transcribed.",
175
+ }
176
+ }
177
+ output_type = "string"
178
+
179
+ def forward(self, file_path: str) -> str:
180
+ try:
181
+ import os
182
+ import tempfile
183
+
184
+ import speech_recognition as sr
185
+ from pydub import AudioSegment
186
+
187
+ # Verify file exists
188
+ if not os.path.exists(file_path):
189
+ return (
190
+ f"❌ Audio file not found at: {file_path}. Download the file first."
191
+ )
192
+
193
+ # Initialize recognizer
194
+ recognizer = sr.Recognizer()
195
+
196
+ # Convert to WAV if not already (needed for speech_recognition)
197
+ file_ext = os.path.splitext(file_path)[1].lower()
198
+
199
+ if file_ext != ".wav":
200
+ # Create temp WAV file
201
+ temp_wav = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
202
+
203
+ # Convert to WAV using pydub
204
+ audio = AudioSegment.from_file(file_path)
205
+ audio.export(temp_wav, format="wav")
206
+ audio_path = temp_wav
207
+ else:
208
+ audio_path = file_path
209
+
210
+ # Transcribe audio using Google's speech recognition
211
+ with sr.AudioFile(audio_path) as source:
212
+ audio_data = recognizer.record(source)
213
+ transcript = recognizer.recognize_google(audio_data)
214
+
215
+ # Clean up temp file if created
216
+ if file_ext != ".wav" and os.path.exists(temp_wav):
217
+ os.remove(temp_wav)
218
+
219
+ return transcript.strip()
220
+
221
+ except Exception as e:
222
+ return f"❌ Transcription failed: {str(e)}"
223
+
224
+
225
+ class TranscibeVideoFileTool(Tool):
226
+ name = "transcribe_video"
227
+ description = """Extracts and transcribes speech from video files. Converts the audio portion of videos into readable text for analysis or reference."""
228
+
229
+ inputs = {
230
+ "file_path": {
231
+ "type": "string",
232
+ "description": "The full path to the video file that needs to be transcribed.",
233
+ }
234
+ }
235
+ output_type = "string"
236
+
237
+ def forward(self, file_path: str) -> str:
238
+ try:
239
+ # Verify file exists
240
+ if not os.path.exists(file_path):
241
+ return (
242
+ f"❌ Video file not found at: {file_path}. Download the file first."
243
+ )
244
+
245
+ import os
246
+ import tempfile
247
+
248
+ import moviepy.editor as mp
249
+ import speech_recognition as sr
250
+
251
+ # Extract audio from video
252
+ video = mp.VideoFileClip(file_path)
253
+
254
+ # Create temporary audio file
255
+ temp_audio = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
256
+
257
+ # Extract audio to WAV format (required for speech_recognition)
258
+ video.audio.write_audiofile(temp_audio, verbose=False, logger=None)
259
+ video.close()
260
+
261
+ # Initialize recognizer
262
+ recognizer = sr.Recognizer()
263
+
264
+ # Transcribe audio
265
+ with sr.AudioFile(temp_audio) as source:
266
+ audio_data = recognizer.record(source)
267
+ transcript = recognizer.recognize_google(audio_data)
268
+
269
+ # Clean up temp file
270
+ if os.path.exists(temp_audio):
271
+ os.remove(temp_audio)
272
+
273
+ return transcript.strip()
274
+
275
+ except Exception as e:
276
+ return f"❌ Video processing failed: {str(e)}"
277
+
278
+
279
+ class BraveWebSearchTool(Tool):
280
+ name = "web_search"
281
+ description = """Performs web searches and returns content from top results. Provides real-time information from across the internet including current events, facts, and website content relevant to your query."""
282
+
283
+ inputs = {
284
+ "query": {
285
+ "type": "string",
286
+ "description": "A web search query string (e.g., a question or query).",
287
+ }
288
+ }
289
+ output_type = "string"
290
+
291
+    # Read the Brave Search API key from the environment; BraveSearch needs a valid key at class-definition time.
+    api_key = os.getenv("BRAVE_SEARCH_API_KEY")
+    count = 3
+    char_limit = 4000  # Adjust based on LLM context window
+    tool = BraveSearch.from_api_key(api_key=api_key, search_kwargs={"count": count})
296
+
297
+ def extract_main_text(self, url: str, char_limit: int) -> str:
298
+ try:
299
+ headers = {"User-Agent": "Mozilla/5.0"}
300
+ response = requests.get(url, headers=headers, timeout=10)
301
+ soup = BeautifulSoup(response.text, "html.parser")
302
+
303
+ # Remove scripts/styles
304
+ for tag in soup(["script", "style", "noscript"]):
305
+ tag.extract()
306
+
307
+ # Heuristic: extract visible text from body
308
+ body = soup.body
309
+ if not body:
310
+ return "⚠️ Could not extract content."
311
+
312
+ text = " ".join(t.strip() for t in body.stripped_strings)
313
+ return text[:char_limit].strip()
314
+ except Exception as e:
315
+ return f"⚠️ Failed to extract article: {e}"
316
+
317
+ def forward(self, query: str) -> str:
318
+ try:
319
+ results_json = self.tool.run(query)
320
+ results = (
321
+ json.loads(results_json)
322
+ if isinstance(results_json, str)
323
+ else results_json
324
+ )
325
+
326
+ output_parts = []
327
+ for i, r in enumerate(results[: self.count], start=1):
328
+ title = html.unescape(r.get("title", "").strip())
329
+ link = r.get("link", "").strip()
330
+
331
+ article_text = self.extract_main_text(link, self.char_limit)
332
+
333
+ result_block = (
334
+ f"Result {i}:\n"
335
+ f"Title: {title}\n"
336
+ f"URL: {link}\n"
337
+ f"Extracted Content:\n{article_text}\n"
338
+ )
339
+ output_parts.append(result_block)
340
+
341
+ return "\n\n".join(output_parts).strip()
342
+
343
+ except Exception as e:
344
+ return f"Search failed: {str(e)}"
345
+
346
+
347
+ class DescribeImageTool(Tool):
348
+ name = "describe_image"
349
+ description = """Analyzes images and generates detailed text descriptions. Identifies objects, scenes, text, and visual elements within the image to provide context or understanding."""
350
+
351
+ inputs = {
352
+ "image_path": {
353
+ "type": "string",
354
+ "description": "The full path to the image file to describe.",
355
+ }
356
+ }
357
+ output_type = "string"
358
+
359
+ def forward(self, image_path: str) -> str:
360
+ import os
361
+
362
+ from PIL import Image
363
+ from transformers import BlipForConditionalGeneration, BlipProcessor
364
+
365
+ if not os.path.exists(image_path):
366
+ return f"❌ Image file does not exist: {image_path}"
367
+
368
+ try:
369
+ processor = BlipProcessor.from_pretrained(
370
+ "Salesforce/blip-image-captioning-base", use_fast=True
371
+ )
372
+ model = BlipForConditionalGeneration.from_pretrained(
373
+ "Salesforce/blip-image-captioning-base"
374
+ )
375
+
376
+ image = Image.open(image_path).convert("RGB")
377
+ inputs = processor(images=image, return_tensors="pt")
378
+ output_ids = model.generate(**inputs)
379
+
380
+ caption = processor.decode(output_ids[0], skip_special_tokens=True)
381
+ return caption.strip() or "⚠️ No caption could be generated."
382
+ except Exception as e:
383
+ return f"❌ Failed to describe image: {e}"
384
+
385
+
386
+ class DownloadFileFromLinkTool(Tool):
387
+ name = "download_file_from_link"
388
+ description = "Downloads files from a URL and saves them locally. Supports various formats including PDFs, documents, images, and data files. Returns the local file path for further processing."
389
+
390
+ inputs = {
391
+ "link": {"type": "string", "description": "The URL to download the file from."},
392
+ "file_name": {
393
+ "type": "string",
394
+ "description": "Desired name of the saved file, without extension.",
395
+ "nullable": True,
396
+ },
397
+ }
398
+
399
+ output_type = "string"
400
+ SUPPORTED_EXTENSIONS = {
401
+ ".xlsx",
402
+ ".pdf",
403
+ ".txt",
404
+ ".csv",
405
+ ".json",
406
+ ".xml",
407
+ ".html",
408
+ ".jpg",
409
+ ".jpeg",
410
+ ".png",
411
+ ".mp4",
412
+ ".mp3",
413
+ ".wav",
414
+ ".zip",
415
+ }
416
+
417
+ def forward(self, link: str, file_name: str = "taskfile") -> str:
418
+ print(f"⬇️ Downloading file from: {link}")
419
+ dir_path = "./downloads"
420
+ os.makedirs(dir_path, exist_ok=True)
421
+
422
+ try:
423
+ response = requests.get(link, stream=True, timeout=30)
424
+ except requests.RequestException as e:
425
+ return f"❌ Error: Request failed - {e}"
426
+
427
+ if response.status_code != 200:
428
+ return (
429
+ f"❌ Error: Unable to fetch file. Status code: {response.status_code}"
430
+ )
431
+
432
+ # Step 1: Try extracting extension from provided filename
433
+ base_name, provided_ext = os.path.splitext(file_name)
434
+ provided_ext = provided_ext.lower()
435
+
436
+ # Step 2: Check if provided extension is supported
437
+ if provided_ext and provided_ext in self.SUPPORTED_EXTENSIONS:
438
+ ext = provided_ext
439
+ else:
440
+ # Step 3: Try to infer from Content-Type
441
+ content_type = (
442
+ response.headers.get("Content-Type", "").split(";")[0].strip()
443
+ )
444
+ guessed_ext = mimetypes.guess_extension(content_type or "") or ""
445
+
446
+ # Step 4: If mimetype returned .bin or nothing useful, try to fallback to URL
447
+ if guessed_ext in ("", ".bin"):
448
+ parsed_link = urlparse(link)
449
+ _, url_ext = os.path.splitext(parsed_link.path)
450
+ if url_ext.lower() in self.SUPPORTED_EXTENSIONS:
451
+ ext = url_ext.lower()
452
+ else:
453
+ return f"⚠️ Warning: Cannot determine a valid file extension from '{content_type}' or URL. Please retry with an explicit valid filename and extension."
454
+ else:
455
+ ext = guessed_ext
456
+
457
+ # Step 5: Final path and save
458
+ file_path = os.path.join(dir_path, base_name + ext)
459
+ downloaded = 0
460
+
461
+ with open(file_path, "wb") as f:
462
+ for chunk in response.iter_content(chunk_size=1024):
463
+ if chunk:
464
+ f.write(chunk)
465
+ downloaded += len(chunk)
466
+
467
+ return file_path
468
+
469
+
470
+ class DuckDuckGoSearchTool(Tool):
471
+ name = "web_search"
472
+ description = """Performs web searches and returns content from top results. Provides real-time information from across the internet including current events, facts, and website content relevant to your query."""
473
+
474
+ inputs = {
475
+ "query": {
476
+ "type": "string",
477
+ "description": "The search query to run on DuckDuckGo",
478
+ },
479
+ }
480
+ output_type = "string"
481
+
482
+ def _configure(self, max_retries: int = 3, retry_sleep: int = 3):
483
+ self._max_retries = max_retries
484
+ self._retry_sleep = retry_sleep
485
+
486
+    def forward(self, query: str) -> str:
+        self._configure()
+        top_results = 5  # defined before use so the log line below can reference it
+        print(
+            f"EXECUTING TOOL: duckduckgo_search(query='{query}', top_results={top_results})"
+        )
493
+
494
+ retries = 0
495
+ max_retries = getattr(self, "_max_retries", 3)
496
+ retry_sleep = getattr(self, "_retry_sleep", 2)
497
+
498
+ while retries < max_retries:
499
+ try:
500
+ results = DDGS().text(
501
+ keywords=query,
502
+ region="wt-wt",
503
+ safesearch="moderate",
504
+ max_results=top_results,
505
+ )
506
+
507
+ if not results:
508
+ return "No results found."
509
+
510
+ output_lines = []
511
+ for idx, res in enumerate(results[:top_results], start=1):
512
+ title = res.get("title", "N/A")
513
+ url = res.get("href", "N/A")
514
+ snippet = res.get("body", "N/A")
515
+
516
+ output_lines.append(
517
+ f"Result {idx}:\n"
518
+ f"Title: {title}\n"
519
+ f"URL: {url}\n"
520
+ f"Snippet: {snippet}\n"
521
+ )
522
+
523
+ output = "\n".join(output_lines)
524
+
525
+ print(f"-> Tool Result (DuckDuckGo): {output[:1500]}...")
526
+ return output
527
+
528
+ except (
529
+ DuckDuckGoSearchException,
530
+ TimeoutException,
531
+ RatelimitException,
532
+ ConversationLimitException,
533
+ ) as e:
534
+ retries += 1
535
+ print(
536
+ f"⚠️ DuckDuckGo Exception (Attempt {retries}/{max_retries}): {type(e).__name__}: {e}"
537
+ )
538
+ traceback.print_exc()
539
+ time.sleep(retry_sleep)
540
+
541
+ except Exception as e:
542
+ print(f"❌ Unexpected Error: {e}")
543
+ traceback.print_exc()
544
+ return f"Unhandled exception during DuckDuckGo search: {e}"
545
+
546
+ return f"❌ Failed to retrieve results after {max_retries} retries."
547
+
548
+ huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
549
+ model_name="sentence-transformers/all-mpnet-base-v2"
550
+ )
551
+ SUPPORTED_EXTENSIONS = [
552
+ ".txt",
553
+ ".md",
554
+ ".py",
555
+ ".pdf",
556
+ ".json",
557
+ ".jsonl",
558
+ ".html",
559
+ ".htm",
560
+ ]
561
+
562
+ class AddDocumentToVectorStoreTool(Tool):
563
+ name = "add_document_to_vector_store"
564
+ description = "Processes a document and adds it to the vector database for semantic search. Automatically chunks files and creates text embeddings to enable powerful content retrieval."
565
+
566
+ inputs = {
567
+ "file_path": {
568
+ "type": "string",
569
+ "description": "Absolute path to the file to be indexed.",
570
+ }
571
+ }
572
+
573
+ output_type = "string"
574
+
575
+ def _load_file(self, path: Path):
576
+ """Select the right loader for the file extension."""
577
+ if path.suffix == ".pdf":
578
+ return PyPDFLoader(str(path)).load()
579
+ elif path.suffix == ".json":
580
+ return JSONLoader(str(path), jq_schema=".").load()
581
+ elif path.suffix in [".md"]:
582
+ return UnstructuredFileLoader(str(path)).load()
583
+ elif path.suffix in [".html", ".htm"]:
584
+ return BSHTMLLoader(str(path)).load()
585
+ else: # fallback for .txt, .py, etc.
586
+ return TextLoader(str(path)).load()
587
+
588
+ def forward(self, file_path: str) -> str:
589
+ print(f"📄 Adding document to vector store: {file_path}")
590
+ try:
591
+ collection_name = "vectorstore"
592
+ path = Path(file_path)
593
+ if not path.exists() or path.suffix not in SUPPORTED_EXTENSIONS:
594
+ return f"Unsupported or missing file: {file_path}"
595
+
596
+ docs = self._load_file(path)
597
+ text_splitter = RecursiveCharacterTextSplitter(
598
+ chunk_size=500, chunk_overlap=50
599
+ )
600
+ split_docs = text_splitter.split_documents(docs)
601
+
602
+ client = chromadb.Client(
603
+ chromadb.config.Settings(
604
+ persist_directory="./chroma_store",
605
+ )
606
+ )
607
+
608
+ collection = client.get_or_create_collection(
609
+ name=collection_name,
610
+ configuration={"embedding_function": huggingface_ef},
611
+ )
612
+
613
+ texts = [doc.page_content for doc in split_docs]
614
+ metadatas = [doc.metadata for doc in split_docs]
615
+
616
+ collection.add(
617
+ documents=texts,
618
+ metadatas=metadatas,
619
+ ids=[f"{path.stem}_{i}" for i in range(len(texts))],
620
+ )
621
+
622
+ return f"✅ Successfully added {len(texts)} chunks from '{file_path}' to collection '{collection_name}'."
623
+
624
+ except Exception as e:
625
+ print(f"❌ Error in add_to_vector_store: {e}")
626
+ traceback.print_exc()
627
+ return f"Error: {e}"
628
+
629
+ class QueryVectorStoreTool(Tool):
630
+ name = "query_downloaded_documents"
631
+ description = "Performs semantic searches across your downloaded documents. Use detailed queries to find specific information, concepts, or answers from your collected resources."
632
+
633
+ inputs = {
634
+ "query": {
635
+ "type": "string",
636
+ "description": "The search query. Ensure this is constructed intelligently so to retrieve the most relevant outputs.",
637
+ },
638
+ "top_k": {
639
+ "type": "integer",
640
+ "description": "Number of top results to retrieve. Usually between 3 and 30",
641
+ "nullable": True,
642
+ },
643
+ }
644
+ output_type = "string"
645
+
646
+    def forward(self, query: str, top_k: int = 5) -> str:
+        collection_name = "vectorstore"
+
+        # Clamp top_k to the supported range (3-30)
+        if top_k < 3:
+            top_k = 3
+        if top_k > 30:
+            top_k = 30
653
+
654
+ print(f"🔎 Querying vector store '{collection_name}' with: '{query}'")
655
+ try:
656
+ client = chromadb.Client(
657
+ chromadb.config.Settings(
658
+ persist_directory="./chroma_store",
659
+ )
660
+ )
661
+ collection = client.get_collection(name=collection_name)
662
+
663
+ results = collection.query(
664
+ query_texts=[query],
665
+ n_results=top_k,
666
+ )
667
+
668
+ formatted = []
669
+ for i in range(len(results["documents"][0])):
670
+ doc = results["documents"][0][i]
671
+ metadata = results["metadatas"][0][i]
672
+ formatted.append(
673
+ f"Result {i+1}:\n" f"Content: {doc}\n" f"Metadata: {metadata}\n"
674
+ )
675
+
676
+ return "\n".join(formatted) or "No relevant documents found."
677
+
678
+ except Exception as e:
679
+ print(f"❌ Error in query_vector_store: {e}")
680
+ traceback.print_exc()
681
+ return f"Error querying vector store: {e}"
682
+
683
+ @tool
684
+ def image_question_answering(image_path: str, prompt: str) -> str:
685
+ """
686
+ Analyzes images and answers specific questions about their content. Can identify objects, read text, describe scenes, or interpret visual information based on your questions.
687
+
688
+ Args:
689
+ image_path: The path to the image file
690
+ prompt: The question to ask about the image
691
+
692
+ Returns:
693
+ A string answer generated by the local Ollama model
694
+ """
695
+ # Check for supported file types
696
+ file_extension = image_path.lower().split(".")[-1]
697
+ if file_extension not in ["jpg", "jpeg", "png", "bmp", "gif", "webp"]:
698
+ return "Unsupported file type. Please provide an image."
699
+
700
+ path = Path(image_path)
701
+ if not path.exists():
702
+ return f"File not found at: {image_path}"
703
+
704
+    # Send the image and prompt to Ollama's local model
+    # (assumes the `ollama` Python package is installed and an Ollama server is running locally)
+    from ollama import chat
+
+    response = chat(
+        model="llava",  # assumes a local multimodal model tagged 'llava' is available
707
+ messages=[
708
+ {
709
+ "role": "user",
710
+ "content": prompt,
711
+ "images": [path],
712
+ },
713
+ ],
714
+ options={"temperature": 0.2}, # Slight randomness for naturalness
715
+ )
716
+
717
+ return response.message.content.strip()
718
+
719
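A quick sketch of calling this tool directly (not part of the committed diff): the image path is hypothetical, and a local Ollama server with the 'llava' model already pulled is assumed.

    # Illustrative sketch only; requires a running Ollama server and `ollama pull llava`.
    answer = image_question_answering(
        image_path="downloads/figure1.png",  # hypothetical file
        prompt="What is the largest value shown in this chart?",
    )
    print(answer)
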
+ class VisitWebpageTool(Tool):
+     name = "visit_webpage"
+     description = "Loads a webpage from a URL and converts its content to markdown format. Use this to browse websites, extract information, or identify downloadable resources from a specific web address."
+     inputs = {
+         "url": {
+             "type": "string",
+             "description": "The url of the webpage to visit.",
+         }
+     }
+     output_type = "string"
+
+     def forward(self, url: str) -> str:
+         try:
+             from urllib.parse import urlparse
+
+             import requests
+             from bs4 import BeautifulSoup
+             from markdownify import markdownify
+             from requests.exceptions import RequestException
+             from smolagents.utils import truncate_content
+         except ImportError as e:
+             raise ImportError(
+                 "You must install packages `markdownify`, `requests`, and `beautifulsoup4` to run this tool: for instance run `pip install markdownify requests beautifulsoup4`."
+             ) from e
+
+         try:
+             # Get the webpage content
+             headers = {
+                 "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+             }
+             response = requests.get(url, headers=headers, timeout=20)
+             response.raise_for_status()
+
+             # Parse the HTML with BeautifulSoup
+             soup = BeautifulSoup(response.text, "html.parser")
+
+             # Extract domain name for context
+             domain = urlparse(url).netloc
+
+             # Remove common clutter elements
+             self._remove_clutter(soup)
+
+             # Try to identify and prioritize main content
+             main_content = self._extract_main_content(soup)
+
+             if main_content:
+                 # Convert the cleaned HTML to markdown
+                 markdown_content = markdownify(str(main_content)).strip()
+             else:
+                 # Fallback to full page content if main content extraction fails
+                 markdown_content = markdownify(str(soup)).strip()
+
+             # Post-process the markdown content
+             markdown_content = self._clean_markdown(markdown_content)
+
+             # Add source information
+             result = f"Content from {domain}:\n\n{markdown_content}"
+
+             return truncate_content(result, 40000)
+
+         except requests.exceptions.Timeout:
+             return "The request timed out. Please try again later or check the URL."
+         except RequestException as e:
+             return f"Error fetching the webpage: {str(e)}"
+         except Exception as e:
+             return f"An unexpected error occurred: {str(e)}"
+
+     def _remove_clutter(self, soup):
+         """Remove common elements that clutter web pages."""
+         # Common non-content elements to remove
+         clutter_selectors = [
+             "header",
+             "footer",
+             "nav",
+             ".nav",
+             ".navigation",
+             ".menu",
+             ".sidebar",
+             ".footer",
+             ".header",
+             "#footer",
+             "#header",
+             "#nav",
+             "#sidebar",
+             ".widget",
+             ".cookie",
+             ".cookies",
+             ".ad",
+             ".ads",
+             ".advertisement",
+             "script",
+             "style",
+             "noscript",
+             "iframe",
+             ".social",
+             ".share",
+             ".comment",
+             ".comments",
+             ".subscription",
+             ".newsletter",
+             '[role="banner"]',
+             '[role="navigation"]',
+             '[role="complementary"]',
+         ]
+
+         for selector in clutter_selectors:
+             for element in soup.select(selector):
+                 element.decompose()
+
+         # Remove hidden elements
+         for hidden in soup.select(
+             '[style*="display: none"], [style*="display:none"], [style*="visibility: hidden"], [style*="visibility:hidden"], [hidden]'
+         ):
+             hidden.decompose()
+
+     def _extract_main_content(self, soup):
+         """Try to identify and extract the main content of the page."""
+         # Priority order for common main content containers
+         main_content_selectors = [
+             "main",
+             '[role="main"]',
+             "article",
+             ".content",
+             ".main-content",
+             ".post-content",
+             "#content",
+             "#main",
+             "#main-content",
+             ".article",
+             ".post",
+             ".entry",
+             ".page-content",
+             ".entry-content",
+         ]
+
+         # Try to find the main content container
+         for selector in main_content_selectors:
+             main_content = soup.select(selector)
+             if main_content:
+                 # If multiple matches, find the one with the most text content
+                 if len(main_content) > 1:
+                     return max(main_content, key=lambda x: len(x.get_text()))
+                 return main_content[0]
+
+         # If no main content container found, look for the largest text block
+         paragraphs = soup.find_all("p")
+         if paragraphs:
+             # Find the parent that contains the most paragraphs
+             parents = {}
+             for p in paragraphs:
+                 if p.parent:
+                     if p.parent not in parents:
+                         parents[p.parent] = 0
+                     parents[p.parent] += 1
+
+             if parents:
+                 # Return the parent with the most paragraphs
+                 return max(parents.items(), key=lambda x: x[1])[0]
+
+         # Return None if we can't identify main content
+         return None
+
+     def _clean_markdown(self, content):
+         """Clean up the markdown content."""
+         # Normalize whitespace
+         content = re.sub(r"\n{3,}", "\n\n", content)
+
+         # Remove consecutive duplicate links
+         content = re.sub(r"(\[.*?\]\(.*?\))\s*\1+", r"\1", content)
+
+         # Remove very short lines that are likely menu items
+         lines = content.split("\n")
+         filtered_lines = []
+
+         # Skip consecutive short lines (likely menus)
+         short_line_threshold = 40  # characters
+         consecutive_short_lines = 0
+         max_consecutive_short_lines = 3
+
+         for line in lines:
+             stripped_line = line.strip()
+             if (
+                 len(stripped_line) < short_line_threshold
+                 and not stripped_line.startswith("#")
+             ):
+                 consecutive_short_lines += 1
+                 if consecutive_short_lines > max_consecutive_short_lines:
+                     continue
+             else:
+                 consecutive_short_lines = 0
+
+             filtered_lines.append(line)
+
+         content = "\n".join(filtered_lines)
+
+         # Remove duplicate headers
+         seen_headers = set()
+         lines = content.split("\n")
+         filtered_lines = []
+
+         for line in lines:
+             if line.startswith("#"):
+                 header_text = line.strip()
+                 if header_text in seen_headers:
+                     continue
+                 seen_headers.add(header_text)
+             filtered_lines.append(line)
+
+         content = "\n".join(filtered_lines)
+
+         # Remove lines containing common footer patterns
+         footer_patterns = [
+             r"^copyright",
+             r"^©",
+             r"^all rights reserved",
+             r"^terms",
+             r"^privacy policy",
+             r"^contact us",
+             r"^follow us",
+             r"^social media",
+             r"^disclaimer",
+         ]
+
+         footer_pattern = "|".join(footer_patterns)
+         lines = content.split("\n")
+         filtered_lines = []
+
+         for line in lines:
+             if not re.search(footer_pattern, line.lower()):
+                 filtered_lines.append(line)
+
+         content = "\n".join(filtered_lines)
+
+         return content
+
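A minimal sketch (not part of the committed diff) of exercising the webpage tool on its own; the URL is just an example.

    # Illustrative sketch only: fetch a page and print the start of the cleaned markdown.
    page_tool = VisitWebpageTool()
    markdown = page_tool.forward("https://example.com")
    print(markdown[:500])
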
+ class ArxivSearchTool(Tool):
+     name = "arxiv_search"
+     description = """Searches arXiv for academic papers and returns structured information including titles, authors, publication dates, abstracts, and download links."""
+
+     inputs = {
+         "query": {
+             "type": "string",
+             "description": "A research-related query (e.g., 'AI regulation')",
+         },
+         "from_date": {
+             "type": "string",
+             "description": "Optional search start date in YYYY, YYYY-MM, or YYYY-MM-DD format (e.g., '2022', '2022-06', or '2022-04-12')",
+             "nullable": True,
+         },
+         "to_date": {
+             "type": "string",
+             "description": "Optional search end date in YYYY, YYYY-MM, or YYYY-MM-DD format (e.g., '2022', '2022-06', or '2022-04-12')",
+             "nullable": True,
+         },
+     }
+
+     output_type = "string"
+
+     def forward(
+         self,
+         query: str,
+         from_date: str = None,
+         to_date: str = None,
+     ) -> str:
+         # 1) build URL
+         url = build_arxiv_url(query, from_date, to_date, size=50)
+
+         # 2) fetch & parse
+         try:
+             papers = fetch_and_parse_arxiv(url)
+         except Exception as e:
+             return f"❌ Failed to fetch or parse arXiv results: {e}"
+
+         if not papers:
+             return "No results found for your query."
+
+         # 3) format into a single string
+         output_lines = []
+         for idx, p in enumerate(papers, start=1):
+             output_lines += [
+                 f"🔍 RESULT {idx}",
+                 f"Title : {p['title']}",
+                 f"Authors : {p['authors']}",
+                 f"Published : {p['published']}",
+                 f"Summary : {p['abstract'][:500]}{'...' if len(p['abstract']) > 500 else ''}",
+                 f"Entry ID : {p['entry_link']}",
+                 f"Download link: {p['download_link']}",
+                 "",
+             ]
+
+         return "\n".join(output_lines).strip()
+
+ def fetch_and_parse_arxiv(url: str) -> List[Dict[str, str]]:
+     """
+     Fetches the given arXiv advanced-search URL, parses the HTML,
+     and returns a list of results. Each result is a dict containing:
+       - title
+       - authors
+       - published
+       - abstract
+       - entry_link
+       - download_link
+       - doi (or "[N/A]" if none)
+     """
+     resp = requests.get(url)
+     resp.raise_for_status()
+     soup = BeautifulSoup(resp.text, "html.parser")
+
+     results = []
+     for li in soup.find_all("li", class_="arxiv-result"):
+         # Title
+         t = li.find("p", class_="title")
+         title = t.get_text(strip=True) if t else ""
+
+         # Authors
+         a = li.find("p", class_="authors")
+         authors = a.get_text(strip=True).replace("Authors:", "").strip() if a else ""
+
+         # Abstract
+         ab = li.find("span", class_="abstract-full")
+         abstract = (
+             ab.get_text(strip=True).replace("Abstract:", "").strip() if ab else ""
+         )
+
+         # Published date
+         d = li.find("p", class_="is-size-7")
+         published = d.get_text(strip=True) if d else ""
+
+         # Entry link
+         lt = li.find("p", class_="list-title")
+         entry_link = lt.find("a")["href"] if lt and lt.find("a") else ""
+
+         # DOI (default to "[N/A]" when no DOI link is listed)
+         doi = "[N/A]"
+         idblock = li.find("p", class_="list-identifier")
+         if idblock:
+             for a_tag in idblock.find_all("a", href=True):
+                 if "doi.org" in a_tag["href"]:
+                     doi = a_tag["href"]
+                     break
+
+         results.append(
+             {
+                 "title": title,
+                 "authors": authors,
+                 "published": published,
+                 "abstract": abstract,
+                 "entry_link": entry_link,
+                 "download_link": (
+                     entry_link.replace("abs", "pdf") if "abs" in entry_link else "N/A"
+                 ),
+                 "doi": doi,
+             }
+         )
+
+     return results
+
+ def build_arxiv_url(
+     query: str, from_date: str = None, to_date: str = None, size: int = 50
+ ) -> str:
+     """
+     Build an arXiv advanced-search URL matching the exact segment order:
+       1) ?advanced
+       2) terms-0-operator=AND
+       3) terms-0-term=…
+       4) terms-0-field=all
+       5) classification-physics_archives=all
+       6) classification-include_cross_list=include
+       [ optional date-range block ]
+       7) abstracts=show
+       8) size=…
+       9) order=-announced_date_first
+     If from_date or to_date is None, the date-range block is omitted.
+     """
+     base = "https://arxiv.org/search/advanced?advanced="
+     parts = [
+         "&terms-0-operator=AND",
+         f"&terms-0-term={quote_plus(query)}",
+         "&terms-0-field=all",
+         "&classification-physics_archives=all",
+         "&classification-include_cross_list=include",
+     ]
+
+     # optional date-range filtering
+     if from_date and to_date:
+         parts += [
+             "&date-year=",
+             "&date-filter_by=date_range",
+             f"&date-from_date={from_date}",
+             f"&date-to_date={to_date}",
+             "&date-date_type=submitted_date",
+         ]
+
+     parts += [
+         "&abstracts=show",
+         f"&size={size}",
+         "&order=-announced_date_first",
+     ]
+
+     return base + "".join(parts)