fd
Browse files- .gitignore +9 -0
- agent.py +16 -11
- app.py +3 -0
- app_template.py +53 -28
- realreq.txt +12 -0
- requirements.txt +11 -23
- tools.py +18 -24
.gitignore
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
.env
|
| 2 |
+
ragdata/
|
| 3 |
+
chroma_store
|
| 4 |
+
.python-version
|
| 5 |
+
downloads/
|
| 6 |
+
.python_version
|
| 7 |
+
*.jsonl
|
| 8 |
+
*__pycache__/
|
| 9 |
+
*.log
|
agent.py
CHANGED
|
@@ -1,5 +1,7 @@
|
|
| 1 |
import os
|
|
|
|
| 2 |
from dotenv import load_dotenv
|
|
|
|
| 3 |
load_dotenv()
|
| 4 |
|
| 5 |
# Import models from smolagents
|
|
@@ -20,7 +22,7 @@ from tools import (
|
|
| 20 |
TranscribeAudioTool,
|
| 21 |
VisitWebpageTool,
|
| 22 |
WikipediaSearchTool,
|
| 23 |
-
image_question_answering
|
| 24 |
)
|
| 25 |
|
| 26 |
# Import utility functions
|
|
@@ -69,11 +71,13 @@ class BoomBot:
|
|
| 69 |
)
|
| 70 |
elif self.provider == "anthropic":
|
| 71 |
model_id = "anthropic/claude-3-5-haiku-latest"
|
| 72 |
-
return LiteLLMModel(
|
| 73 |
-
|
| 74 |
-
|
| 75 |
-
|
| 76 |
-
|
|
|
|
|
|
|
| 77 |
elif self.provider == "deepinfra":
|
| 78 |
deepinfra_model = "Qwen/Qwen3-235B-A22B"
|
| 79 |
return OpenAIServerModel(
|
|
@@ -277,7 +281,7 @@ class BoomBot:
|
|
| 277 |
)
|
| 278 |
|
| 279 |
# Run the agent with the given question
|
| 280 |
-
result = self.agent.
|
| 281 |
|
| 282 |
# Extract the final answer from the result
|
| 283 |
final_answer = extract_final_answer(result)
|
|
@@ -286,7 +290,8 @@ class BoomBot:
|
|
| 286 |
|
| 287 |
|
| 288 |
# Example of how to use this code (commented out)
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
|
|
|
|
|
| 1 |
import os
|
| 2 |
+
|
| 3 |
from dotenv import load_dotenv
|
| 4 |
+
|
| 5 |
load_dotenv()
|
| 6 |
|
| 7 |
# Import models from smolagents
|
|
|
|
| 22 |
TranscribeAudioTool,
|
| 23 |
VisitWebpageTool,
|
| 24 |
WikipediaSearchTool,
|
| 25 |
+
image_question_answering,
|
| 26 |
)
|
| 27 |
|
| 28 |
# Import utility functions
|
|
|
|
| 71 |
)
|
| 72 |
elif self.provider == "anthropic":
|
| 73 |
model_id = "anthropic/claude-3-5-haiku-latest"
|
| 74 |
+
return LiteLLMModel(
|
| 75 |
+
model_id=model_id,
|
| 76 |
+
temperature=0.6,
|
| 77 |
+
max_tokens=8192,
|
| 78 |
+
api_key=os.getenv("ANTHROPIC_API_KEY"),
|
| 79 |
+
)
|
| 80 |
+
|
| 81 |
elif self.provider == "deepinfra":
|
| 82 |
deepinfra_model = "Qwen/Qwen3-235B-A22B"
|
| 83 |
return OpenAIServerModel(
|
|
|
|
| 281 |
)
|
| 282 |
|
| 283 |
# Run the agent with the given question
|
| 284 |
+
result = self.agent.run(question)
|
| 285 |
|
| 286 |
# Extract the final answer from the result
|
| 287 |
final_answer = extract_final_answer(result)
|
|
|
|
| 290 |
|
| 291 |
|
| 292 |
# Example of how to use this code
|
| 293 |
+
if __name__ == "__main__":
|
| 294 |
+
agent = BoomBot(provider="gemma")
|
| 295 |
+
question = "In the year 2020, where were koi fish found in the watershed with the id 02040203? Give only the name of the pond, lake, or stream where the fish were found, and not the name of the city or county."
|
| 296 |
+
response = agent.run(question=question, task_id="1", to_download=False)
|
| 297 |
+
print(f"Response: {response}")
|
app.py
CHANGED
|
@@ -12,7 +12,10 @@ from agent import BoomBot
|
|
| 12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 13 |
|
| 14 |
from dotenv import load_dotenv
|
|
|
|
| 15 |
load_dotenv()
|
|
|
|
|
|
|
| 16 |
# --- Basic Agent Definition --
|
| 17 |
class BasicAgent:
|
| 18 |
def __init__(self):
|
|
|
|
| 12 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 13 |
|
| 14 |
from dotenv import load_dotenv
|
| 15 |
+
|
| 16 |
load_dotenv()
|
| 17 |
+
|
| 18 |
+
|
| 19 |
# --- Basic Agent Definition --
|
| 20 |
class BasicAgent:
|
| 21 |
def __init__(self):
|
app_template.py
CHANGED
|
@@ -1,34 +1,38 @@
|
|
|
|
|
| 1 |
import os
|
|
|
|
| 2 |
import gradio as gr
|
| 3 |
-
import requests
|
| 4 |
-
import inspect
|
| 5 |
import pandas as pd
|
|
|
|
| 6 |
|
| 7 |
# (Keep Constants as is)
|
| 8 |
# --- Constants ---
|
| 9 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 10 |
|
|
|
|
| 11 |
# --- Basic Agent Definition ---
|
| 12 |
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
| 13 |
class BasicAgent:
|
| 14 |
def __init__(self):
|
| 15 |
print("BasicAgent initialized.")
|
|
|
|
| 16 |
def __call__(self, question: str) -> str:
|
| 17 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 18 |
fixed_answer = "This is a default answer."
|
| 19 |
print(f"Agent returning fixed answer: {fixed_answer}")
|
| 20 |
return fixed_answer
|
| 21 |
|
| 22 |
-
|
|
|
|
| 23 |
"""
|
| 24 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 25 |
and displays the results.
|
| 26 |
"""
|
| 27 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 28 |
-
space_id = os.getenv("SPACE_ID")
|
| 29 |
|
| 30 |
if profile:
|
| 31 |
-
username= f"{profile.username}"
|
| 32 |
print(f"User logged in: {username}")
|
| 33 |
else:
|
| 34 |
print("User not logged in.")
|
|
@@ -55,16 +59,16 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 55 |
response.raise_for_status()
|
| 56 |
questions_data = response.json()
|
| 57 |
if not questions_data:
|
| 58 |
-
|
| 59 |
-
|
| 60 |
print(f"Fetched {len(questions_data)} questions.")
|
| 61 |
except requests.exceptions.RequestException as e:
|
| 62 |
print(f"Error fetching questions: {e}")
|
| 63 |
return f"Error fetching questions: {e}", None
|
| 64 |
except requests.exceptions.JSONDecodeError as e:
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
except Exception as e:
|
| 69 |
print(f"An unexpected error occurred fetching questions: {e}")
|
| 70 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
@@ -81,18 +85,36 @@ def run_and_submit_all( profile: gr.OAuthProfile | None):
|
|
| 81 |
continue
|
| 82 |
try:
|
| 83 |
submitted_answer = agent(question_text)
|
| 84 |
-
answers_payload.append(
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
except Exception as e:
|
| 87 |
-
|
| 88 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 89 |
|
| 90 |
if not answers_payload:
|
| 91 |
print("Agent did not produce any answers to submit.")
|
| 92 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 93 |
|
| 94 |
-
# 4. Prepare Submission
|
| 95 |
-
submission_data = {
|
|
|
|
|
|
|
|
|
|
|
|
|
| 96 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 97 |
print(status_update)
|
| 98 |
|
|
@@ -162,20 +184,19 @@ with gr.Blocks() as demo:
|
|
| 162 |
|
| 163 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 164 |
|
| 165 |
-
status_output = gr.Textbox(
|
|
|
|
|
|
|
| 166 |
# Removed max_rows=10 from DataFrame constructor
|
| 167 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 168 |
|
| 169 |
-
run_button.click(
|
| 170 |
-
fn=run_and_submit_all,
|
| 171 |
-
outputs=[status_output, results_table]
|
| 172 |
-
)
|
| 173 |
|
| 174 |
if __name__ == "__main__":
|
| 175 |
-
print("\n" + "-"*30 + " App Starting " + "-"*30)
|
| 176 |
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 177 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 178 |
-
space_id_startup = os.getenv("SPACE_ID")
|
| 179 |
|
| 180 |
if space_host_startup:
|
| 181 |
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
|
@@ -183,14 +204,18 @@ if __name__ == "__main__":
|
|
| 183 |
else:
|
| 184 |
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 185 |
|
| 186 |
-
if space_id_startup:
|
| 187 |
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 188 |
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 189 |
-
print(
|
|
|
|
|
|
|
| 190 |
else:
|
| 191 |
-
print(
|
|
|
|
|
|
|
| 192 |
|
| 193 |
-
print("-"*(60 + len(" App Starting ")) + "\n")
|
| 194 |
|
| 195 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
| 196 |
-
demo.launch(debug=True, share=False)
|
|
|
|
| 1 |
+
import inspect
|
| 2 |
import os
|
| 3 |
+
|
| 4 |
import gradio as gr
|
|
|
|
|
|
|
| 5 |
import pandas as pd
|
| 6 |
+
import requests
|
| 7 |
|
| 8 |
# (Keep Constants as is)
|
| 9 |
# --- Constants ---
|
| 10 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 11 |
|
| 12 |
+
|
| 13 |
# --- Basic Agent Definition ---
|
| 14 |
# ----- THIS IS WHERE YOU CAN BUILD WHAT YOU WANT ------
|
| 15 |
class BasicAgent:
|
| 16 |
def __init__(self):
|
| 17 |
print("BasicAgent initialized.")
|
| 18 |
+
|
| 19 |
def __call__(self, question: str) -> str:
|
| 20 |
print(f"Agent received question (first 50 chars): {question[:50]}...")
|
| 21 |
fixed_answer = "This is a default answer."
|
| 22 |
print(f"Agent returning fixed answer: {fixed_answer}")
|
| 23 |
return fixed_answer
|
| 24 |
|
| 25 |
+
|
| 26 |
+
def run_and_submit_all(profile: gr.OAuthProfile | None):
|
| 27 |
"""
|
| 28 |
Fetches all questions, runs the BasicAgent on them, submits all answers,
|
| 29 |
and displays the results.
|
| 30 |
"""
|
| 31 |
# --- Determine HF Space Runtime URL and Repo URL ---
|
| 32 |
+
space_id = os.getenv("SPACE_ID") # Get the SPACE_ID for sending link to the code
|
| 33 |
|
| 34 |
if profile:
|
| 35 |
+
username = f"{profile.username}"
|
| 36 |
print(f"User logged in: {username}")
|
| 37 |
else:
|
| 38 |
print("User not logged in.")
|
|
|
|
| 59 |
response.raise_for_status()
|
| 60 |
questions_data = response.json()
|
| 61 |
if not questions_data:
|
| 62 |
+
print("Fetched questions list is empty.")
|
| 63 |
+
return "Fetched questions list is empty or invalid format.", None
|
| 64 |
print(f"Fetched {len(questions_data)} questions.")
|
| 65 |
except requests.exceptions.RequestException as e:
|
| 66 |
print(f"Error fetching questions: {e}")
|
| 67 |
return f"Error fetching questions: {e}", None
|
| 68 |
except requests.exceptions.JSONDecodeError as e:
|
| 69 |
+
print(f"Error decoding JSON response from questions endpoint: {e}")
|
| 70 |
+
print(f"Response text: {response.text[:500]}")
|
| 71 |
+
return f"Error decoding server response for questions: {e}", None
|
| 72 |
except Exception as e:
|
| 73 |
print(f"An unexpected error occurred fetching questions: {e}")
|
| 74 |
return f"An unexpected error occurred fetching questions: {e}", None
|
|
|
|
| 85 |
continue
|
| 86 |
try:
|
| 87 |
submitted_answer = agent(question_text)
|
| 88 |
+
answers_payload.append(
|
| 89 |
+
{"task_id": task_id, "submitted_answer": submitted_answer}
|
| 90 |
+
)
|
| 91 |
+
results_log.append(
|
| 92 |
+
{
|
| 93 |
+
"Task ID": task_id,
|
| 94 |
+
"Question": question_text,
|
| 95 |
+
"Submitted Answer": submitted_answer,
|
| 96 |
+
}
|
| 97 |
+
)
|
| 98 |
except Exception as e:
|
| 99 |
+
print(f"Error running agent on task {task_id}: {e}")
|
| 100 |
+
results_log.append(
|
| 101 |
+
{
|
| 102 |
+
"Task ID": task_id,
|
| 103 |
+
"Question": question_text,
|
| 104 |
+
"Submitted Answer": f"AGENT ERROR: {e}",
|
| 105 |
+
}
|
| 106 |
+
)
|
| 107 |
|
| 108 |
if not answers_payload:
|
| 109 |
print("Agent did not produce any answers to submit.")
|
| 110 |
return "Agent did not produce any answers to submit.", pd.DataFrame(results_log)
|
| 111 |
|
| 112 |
+
# 4. Prepare Submission
|
| 113 |
+
submission_data = {
|
| 114 |
+
"username": username.strip(),
|
| 115 |
+
"agent_code": agent_code,
|
| 116 |
+
"answers": answers_payload,
|
| 117 |
+
}
|
| 118 |
status_update = f"Agent finished. Submitting {len(answers_payload)} answers for user '{username}'..."
|
| 119 |
print(status_update)
|
| 120 |
|
|
|
|
| 184 |
|
| 185 |
run_button = gr.Button("Run Evaluation & Submit All Answers")
|
| 186 |
|
| 187 |
+
status_output = gr.Textbox(
|
| 188 |
+
label="Run Status / Submission Result", lines=5, interactive=False
|
| 189 |
+
)
|
| 190 |
# Removed max_rows=10 from DataFrame constructor
|
| 191 |
results_table = gr.DataFrame(label="Questions and Agent Answers", wrap=True)
|
| 192 |
|
| 193 |
+
run_button.click(fn=run_and_submit_all, outputs=[status_output, results_table])
|
|
|
|
|
|
|
|
|
|
| 194 |
|
| 195 |
if __name__ == "__main__":
|
| 196 |
+
print("\n" + "-" * 30 + " App Starting " + "-" * 30)
|
| 197 |
# Check for SPACE_HOST and SPACE_ID at startup for information
|
| 198 |
space_host_startup = os.getenv("SPACE_HOST")
|
| 199 |
+
space_id_startup = os.getenv("SPACE_ID") # Get SPACE_ID at startup
|
| 200 |
|
| 201 |
if space_host_startup:
|
| 202 |
print(f"✅ SPACE_HOST found: {space_host_startup}")
|
|
|
|
| 204 |
else:
|
| 205 |
print("ℹ️ SPACE_HOST environment variable not found (running locally?).")
|
| 206 |
|
| 207 |
+
if space_id_startup: # Print repo URLs if SPACE_ID is found
|
| 208 |
print(f"✅ SPACE_ID found: {space_id_startup}")
|
| 209 |
print(f" Repo URL: https://huggingface.co/spaces/{space_id_startup}")
|
| 210 |
+
print(
|
| 211 |
+
f" Repo Tree URL: https://huggingface.co/spaces/{space_id_startup}/tree/main"
|
| 212 |
+
)
|
| 213 |
else:
|
| 214 |
+
print(
|
| 215 |
+
"ℹ️ SPACE_ID environment variable not found (running locally?). Repo URL cannot be determined."
|
| 216 |
+
)
|
| 217 |
|
| 218 |
+
print("-" * (60 + len(" App Starting ")) + "\n")
|
| 219 |
|
| 220 |
print("Launching Gradio Interface for Basic Agent Evaluation...")
|
| 221 |
+
demo.launch(debug=True, share=False)
|
realreq.txt
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
python-dotenv
|
| 2 |
+
smolagents
|
| 3 |
+
ollama
|
| 4 |
+
chromadb
|
| 5 |
+
pymupdf
|
| 6 |
+
pandas
|
| 7 |
+
bs4
|
| 8 |
+
duckduckgo-search
|
| 9 |
+
langchain_community
|
| 10 |
+
markdownify
|
| 11 |
+
smolagents[litellm]
|
| 12 |
+
smolagents[openai]
|
requirements.txt
CHANGED
|
@@ -1,25 +1,13 @@
|
|
| 1 |
-
beautifulsoup4
|
| 2 |
-
chromadb
|
| 3 |
-
duckduckgo_search
|
| 4 |
gradio
|
| 5 |
-
|
| 6 |
-
langchain
|
| 7 |
-
langchain-chroma
|
| 8 |
-
langchain-community
|
| 9 |
-
langchain-core
|
| 10 |
-
langchain-groq
|
| 11 |
-
langchain-huggingface
|
| 12 |
-
langchain-google-genai
|
| 13 |
-
langchain-tavily
|
| 14 |
-
langgraph
|
| 15 |
-
markdownify
|
| 16 |
-
pandas
|
| 17 |
-
protobuf==3.20.*
|
| 18 |
-
PyMuPDF
|
| 19 |
-
python-dotenv
|
| 20 |
-
requests
|
| 21 |
-
sentence-transformers
|
| 22 |
smolagents
|
| 23 |
-
|
| 24 |
-
|
| 25 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
+
python-dotenv
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 3 |
smolagents
|
| 4 |
+
ollama
|
| 5 |
+
chromadb
|
| 6 |
+
pymupdf
|
| 7 |
+
pandas
|
| 8 |
+
bs4
|
| 9 |
+
duckduckgo-search
|
| 10 |
+
langchain_community
|
| 11 |
+
markdownify
|
| 12 |
+
smolagents[litellm]
|
| 13 |
+
smolagents[openai]
|
tools.py
CHANGED
|
@@ -7,10 +7,7 @@ import time
|
|
| 7 |
import traceback
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Dict, List
|
| 10 |
-
from urllib.parse import urlparse
|
| 11 |
-
from pathlib import Path
|
| 12 |
-
from ollama import chat
|
| 13 |
-
from PIL import Image
|
| 14 |
|
| 15 |
import chromadb
|
| 16 |
import chromadb.utils.embedding_functions as embedding_functions
|
|
@@ -18,6 +15,7 @@ import fitz # PyMuPDF
|
|
| 18 |
import pandas as pd
|
| 19 |
import requests
|
| 20 |
from bs4 import BeautifulSoup
|
|
|
|
| 21 |
from duckduckgo_search import DDGS
|
| 22 |
from duckduckgo_search.exceptions import (
|
| 23 |
ConversationLimitException,
|
|
@@ -25,6 +23,7 @@ from duckduckgo_search.exceptions import (
|
|
| 25 |
RatelimitException,
|
| 26 |
TimeoutException,
|
| 27 |
)
|
|
|
|
| 28 |
from langchain_community.document_loaders import (
|
| 29 |
BSHTMLLoader,
|
| 30 |
JSONLoader,
|
|
@@ -32,21 +31,16 @@ from langchain_community.document_loaders import (
|
|
| 32 |
TextLoader,
|
| 33 |
UnstructuredFileLoader,
|
| 34 |
)
|
| 35 |
-
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 36 |
from langchain_community.tools import BraveSearch
|
| 37 |
from markdownify import markdownify
|
|
|
|
|
|
|
| 38 |
from smolagents import Tool, tool
|
| 39 |
from smolagents.utils import truncate_content
|
| 40 |
|
| 41 |
-
from typing import Dict, List
|
| 42 |
-
|
| 43 |
-
import requests
|
| 44 |
-
from bs4 import BeautifulSoup
|
| 45 |
-
from urllib.parse import quote_plus
|
| 46 |
-
|
| 47 |
-
from dotenv import load_dotenv
|
| 48 |
load_dotenv()
|
| 49 |
|
|
|
|
| 50 |
class ReadFileContentTool(Tool):
|
| 51 |
name = "read_file_content"
|
| 52 |
description = """Reads local files in various formats (text, CSV, Excel, PDF, HTML, etc.) and returns their content as readable text. Automatically detects and processes the appropriate file format."""
|
|
@@ -295,7 +289,7 @@ class BraveWebSearchTool(Tool):
|
|
| 295 |
output_type = "string"
|
| 296 |
|
| 297 |
# api_key = os.getenv("BRAVE_SEARCH_API_KEY")
|
| 298 |
-
api_key=None
|
| 299 |
count = 3
|
| 300 |
char_limit = 4000 # Adjust based on LLM context window
|
| 301 |
tool = BraveSearch.from_api_key(api_key=api_key, search_kwargs={"count": count})
|
|
@@ -491,9 +485,6 @@ class DuckDuckGoSearchTool(Tool):
|
|
| 491 |
|
| 492 |
def forward(self, query: str) -> str:
|
| 493 |
self._configure()
|
| 494 |
-
print(
|
| 495 |
-
f"EXECUTING TOOL: duckduckgo_search(query='{query}', top_results={top_results})"
|
| 496 |
-
)
|
| 497 |
|
| 498 |
top_results = 5
|
| 499 |
|
|
@@ -551,6 +542,7 @@ class DuckDuckGoSearchTool(Tool):
|
|
| 551 |
|
| 552 |
return f"❌ Failed to retrieve results after {max_retries} retries."
|
| 553 |
|
|
|
|
| 554 |
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
|
| 555 |
model_name="sentence-transformers/all-mpnet-base-v2"
|
| 556 |
)
|
|
@@ -565,6 +557,7 @@ SUPPORTED_EXTENSIONS = [
|
|
| 565 |
".htm",
|
| 566 |
]
|
| 567 |
|
|
|
|
| 568 |
class AddDocumentToVectorStoreTool(Tool):
|
| 569 |
name = "add_document_to_vector_store"
|
| 570 |
description = "Processes a document and adds it to the vector database for semantic search. Automatically chunks files and creates text embeddings to enable powerful content retrieval."
|
|
@@ -632,6 +625,7 @@ class AddDocumentToVectorStoreTool(Tool):
|
|
| 632 |
traceback.print_exc()
|
| 633 |
return f"Error: {e}"
|
| 634 |
|
|
|
|
| 635 |
class QueryVectorStoreTool(Tool):
|
| 636 |
name = "query_downloaded_documents"
|
| 637 |
description = "Performs semantic searches across your downloaded documents. Use detailed queries to find specific information, concepts, or answers from your collected resources."
|
|
@@ -640,16 +634,11 @@ class QueryVectorStoreTool(Tool):
|
|
| 640 |
"query": {
|
| 641 |
"type": "string",
|
| 642 |
"description": "The search query. Ensure this is constructed intelligently so to retrieve the most relevant outputs.",
|
| 643 |
-
}
|
| 644 |
-
"top_k": {
|
| 645 |
-
"type": "integer",
|
| 646 |
-
"description": "Number of top results to retrieve. Usually between 3 and 30",
|
| 647 |
-
"nullable": True,
|
| 648 |
-
},
|
| 649 |
}
|
| 650 |
output_type = "string"
|
| 651 |
|
| 652 |
-
def forward(self, query: str
|
| 653 |
collection_name = "vectorstore"
|
| 654 |
|
| 655 |
if k < 3:
|
|
@@ -668,7 +657,7 @@ class QueryVectorStoreTool(Tool):
|
|
| 668 |
|
| 669 |
results = collection.query(
|
| 670 |
query_texts=[query],
|
| 671 |
-
n_results=
|
| 672 |
)
|
| 673 |
|
| 674 |
formatted = []
|
|
@@ -686,6 +675,7 @@ class QueryVectorStoreTool(Tool):
|
|
| 686 |
traceback.print_exc()
|
| 687 |
return f"Error querying vector store: {e}"
|
| 688 |
|
|
|
|
| 689 |
@tool
|
| 690 |
def image_question_answering(image_path: str, prompt: str) -> str:
|
| 691 |
"""
|
|
@@ -722,6 +712,7 @@ def image_question_answering(image_path: str, prompt: str) -> str:
|
|
| 722 |
|
| 723 |
return response.message.content.strip()
|
| 724 |
|
|
|
|
| 725 |
class VisitWebpageTool(Tool):
|
| 726 |
name = "visit_webpage"
|
| 727 |
description = "Loads a webpage from a URL and converts its content to markdown format. Use this to browse websites, extract information, or identify downloadable resources from a specific web address."
|
|
@@ -956,6 +947,7 @@ class VisitWebpageTool(Tool):
|
|
| 956 |
|
| 957 |
return content
|
| 958 |
|
|
|
|
| 959 |
class ArxivSearchTool(Tool):
|
| 960 |
name = "arxiv_search"
|
| 961 |
description = """Searches arXiv for academic papers and returns structured information including titles, authors, publication dates, abstracts, and download links."""
|
|
@@ -1013,6 +1005,7 @@ class ArxivSearchTool(Tool):
|
|
| 1013 |
|
| 1014 |
return "\n".join(output_lines).strip()
|
| 1015 |
|
|
|
|
| 1016 |
def fetch_and_parse_arxiv(url: str) -> List[Dict[str, str]]:
|
| 1017 |
"""
|
| 1018 |
Fetches the given arXiv advanced‐search URL, parses the HTML,
|
|
@@ -1075,6 +1068,7 @@ def fetch_and_parse_arxiv(url: str) -> List[Dict[str, str]]:
|
|
| 1075 |
|
| 1076 |
return results
|
| 1077 |
|
|
|
|
| 1078 |
def build_arxiv_url(
|
| 1079 |
query: str, from_date: str = None, to_date: str = None, size: int = 50
|
| 1080 |
) -> str:
|
|
|
|
| 7 |
import traceback
|
| 8 |
from pathlib import Path
|
| 9 |
from typing import Dict, List
|
| 10 |
+
from urllib.parse import quote_plus, urlparse
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
import chromadb
|
| 13 |
import chromadb.utils.embedding_functions as embedding_functions
|
|
|
|
| 15 |
import pandas as pd
|
| 16 |
import requests
|
| 17 |
from bs4 import BeautifulSoup
|
| 18 |
+
from dotenv import load_dotenv
|
| 19 |
from duckduckgo_search import DDGS
|
| 20 |
from duckduckgo_search.exceptions import (
|
| 21 |
ConversationLimitException,
|
|
|
|
| 23 |
RatelimitException,
|
| 24 |
TimeoutException,
|
| 25 |
)
|
| 26 |
+
from langchain.text_splitter import RecursiveCharacterTextSplitter
|
| 27 |
from langchain_community.document_loaders import (
|
| 28 |
BSHTMLLoader,
|
| 29 |
JSONLoader,
|
|
|
|
| 31 |
TextLoader,
|
| 32 |
UnstructuredFileLoader,
|
| 33 |
)
|
|
|
|
| 34 |
from langchain_community.tools import BraveSearch
|
| 35 |
from markdownify import markdownify
|
| 36 |
+
from ollama import chat
|
| 37 |
+
from PIL import Image
|
| 38 |
from smolagents import Tool, tool
|
| 39 |
from smolagents.utils import truncate_content
|
| 40 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
load_dotenv()
|
| 42 |
|
| 43 |
+
|
| 44 |
class ReadFileContentTool(Tool):
|
| 45 |
name = "read_file_content"
|
| 46 |
description = """Reads local files in various formats (text, CSV, Excel, PDF, HTML, etc.) and returns their content as readable text. Automatically detects and processes the appropriate file format."""
|
|
|
|
| 289 |
output_type = "string"
|
| 290 |
|
| 291 |
# api_key = os.getenv("BRAVE_SEARCH_API_KEY")
|
| 292 |
+
api_key = None
|
| 293 |
count = 3
|
| 294 |
char_limit = 4000 # Adjust based on LLM context window
|
| 295 |
tool = BraveSearch.from_api_key(api_key=api_key, search_kwargs={"count": count})
|
|
|
|
| 485 |
|
| 486 |
def forward(self, query: str) -> str:
|
| 487 |
self._configure()
|
|
|
|
|
|
|
|
|
|
| 488 |
|
| 489 |
top_results = 5
|
| 490 |
|
|
|
|
| 542 |
|
| 543 |
return f"❌ Failed to retrieve results after {max_retries} retries."
|
| 544 |
|
| 545 |
+
|
| 546 |
huggingface_ef = embedding_functions.HuggingFaceEmbeddingFunction(
|
| 547 |
model_name="sentence-transformers/all-mpnet-base-v2"
|
| 548 |
)
|
|
|
|
| 557 |
".htm",
|
| 558 |
]
|
| 559 |
|
| 560 |
+
|
| 561 |
class AddDocumentToVectorStoreTool(Tool):
|
| 562 |
name = "add_document_to_vector_store"
|
| 563 |
description = "Processes a document and adds it to the vector database for semantic search. Automatically chunks files and creates text embeddings to enable powerful content retrieval."
|
|
|
|
| 625 |
traceback.print_exc()
|
| 626 |
return f"Error: {e}"
|
| 627 |
|
| 628 |
+
|
| 629 |
class QueryVectorStoreTool(Tool):
|
| 630 |
name = "query_downloaded_documents"
|
| 631 |
description = "Performs semantic searches across your downloaded documents. Use detailed queries to find specific information, concepts, or answers from your collected resources."
|
|
|
|
| 634 |
"query": {
|
| 635 |
"type": "string",
|
| 636 |
"description": "The search query. Ensure this is constructed intelligently so to retrieve the most relevant outputs.",
|
| 637 |
+
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
}
|
| 639 |
output_type = "string"
|
| 640 |
|
| 641 |
+
def forward(self, query: str) -> str:
|
| 642 |
collection_name = "vectorstore"
|
| 643 |
|
| 644 |
if k < 3:
|
|
|
|
| 657 |
|
| 658 |
results = collection.query(
|
| 659 |
query_texts=[query],
|
| 660 |
+
n_results=k,
|
| 661 |
)
|
| 662 |
|
| 663 |
formatted = []
|
|
|
|
| 675 |
traceback.print_exc()
|
| 676 |
return f"Error querying vector store: {e}"
|
| 677 |
|
| 678 |
+
|
| 679 |
@tool
|
| 680 |
def image_question_answering(image_path: str, prompt: str) -> str:
|
| 681 |
"""
|
|
|
|
| 712 |
|
| 713 |
return response.message.content.strip()
|
| 714 |
|
| 715 |
+
|
| 716 |
class VisitWebpageTool(Tool):
|
| 717 |
name = "visit_webpage"
|
| 718 |
description = "Loads a webpage from a URL and converts its content to markdown format. Use this to browse websites, extract information, or identify downloadable resources from a specific web address."
|
|
|
|
| 947 |
|
| 948 |
return content
|
| 949 |
|
| 950 |
+
|
| 951 |
class ArxivSearchTool(Tool):
|
| 952 |
name = "arxiv_search"
|
| 953 |
description = """Searches arXiv for academic papers and returns structured information including titles, authors, publication dates, abstracts, and download links."""
|
|
|
|
| 1005 |
|
| 1006 |
return "\n".join(output_lines).strip()
|
| 1007 |
|
| 1008 |
+
|
| 1009 |
def fetch_and_parse_arxiv(url: str) -> List[Dict[str, str]]:
|
| 1010 |
"""
|
| 1011 |
Fetches the given arXiv advanced‐search URL, parses the HTML,
|
|
|
|
| 1068 |
|
| 1069 |
return results
|
| 1070 |
|
| 1071 |
+
|
| 1072 |
def build_arxiv_url(
|
| 1073 |
query: str, from_date: str = None, to_date: str = None, size: int = 50
|
| 1074 |
) -> str:
|