Spaces:
Sleeping
Sleeping
Arxiv
Browse files
- requirements.txt +11 -14
- tools.py +33 -55
requirements.txt
CHANGED
|
@@ -1,19 +1,16 @@
|
|
|
|
|
|
|
|
| 1 |
gradio
|
| 2 |
-
requests
|
| 3 |
-
pillow
|
| 4 |
-
pytesseract
|
| 5 |
-
langgraph
|
| 6 |
langchain
|
| 7 |
-
openai
|
| 8 |
-
pandas
|
| 9 |
-
langchain_openai
|
| 10 |
langchain_community
|
|
|
|
|
|
|
| 11 |
openai
|
| 12 |
-
duckduckgo-search
|
| 13 |
-
regex
|
| 14 |
-
pytesseract
|
| 15 |
openpyxl
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
| 19 |
-
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
arxiv
|
| 2 |
+
duckduckgo-search
|
| 3 |
gradio
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
langchain
|
|
|
|
|
|
|
|
|
|
| 5 |
langchain_community
|
| 6 |
+
langchain_openai
|
| 7 |
+
langgraph
|
| 8 |
openai
|
|
|
|
|
|
|
|
|
|
| 9 |
openpyxl
|
| 10 |
+
pandas
|
| 11 |
+
pillow
|
| 12 |
+
PyMuPDF
|
| 13 |
+
pytesseract
|
| 14 |
+
regex
|
| 15 |
+
requests
|
| 16 |
+
wikipedia
|
tools.py
CHANGED
|
@@ -10,6 +10,9 @@ import os
|
|
| 10 |
from duckduckgo_search import DDGS
|
| 11 |
from langchain_core.tools import tool
|
| 12 |
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
|
|
|
|
|
|
|
|
|
|
| 13 |
|
| 14 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 15 |
|
|
@@ -272,7 +275,7 @@ def wikipedia_search_tool(wiki_query: str) -> str:
|
|
| 272 |
return error_msg
|
| 273 |
|
| 274 |
@tool
|
| 275 |
-
def arxiv_search_tool(
|
| 276 |
"""
|
| 277 |
TOOL NAME: ArXiv Academic Search Tool
|
| 278 |
|
|
@@ -285,62 +288,37 @@ def arxiv_search_tool(arxiv_query: str) -> str:
|
|
| 285 |
- "What are recent studies on climate change?"
|
| 286 |
- "Search for papers on quantum computing"
|
| 287 |
"""
|
| 288 |
-
print(f"DEBUG: reached arxiv_search_tool with query: {arxiv_query}")
|
| 289 |
try:
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
|
| 293 |
-
|
| 294 |
-
|
| 295 |
-
|
| 296 |
-
|
| 297 |
-
|
| 298 |
-
|
| 299 |
-
|
| 300 |
-
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
| 304 |
-
|
| 305 |
-
|
| 306 |
-
|
| 307 |
-
|
| 308 |
-
|
| 309 |
-
|
| 310 |
-
|
| 311 |
-
|
| 312 |
-
|
| 313 |
-
|
| 314 |
-
|
| 315 |
-
|
| 316 |
-
|
| 317 |
-
|
| 318 |
-
print(f"DEBUG: Using title: {title}")
|
| 319 |
-
|
| 320 |
-
# Trim content to key information only (reduced from 2000 to 800 characters)
|
| 321 |
-
content = doc.page_content[:800] if len(doc.page_content) > 800 else doc.page_content
|
| 322 |
-
|
| 323 |
-
# Add document but keep it concise
|
| 324 |
-
result += f"\n\nArXiv Result {counter}: {title}\nAbstract/Summary: {content}..."
|
| 325 |
-
counter += 1
|
| 326 |
-
|
| 327 |
-
# Stop after 2 documents to keep response manageable
|
| 328 |
-
if counter > 2:
|
| 329 |
-
break
|
| 330 |
-
|
| 331 |
-
if not result.strip():
|
| 332 |
-
return "No ArXiv results found for the given query. [END_OF_SEARCH]"
|
| 333 |
-
|
| 334 |
-
# Add clear end marker
|
| 335 |
-
result += "\n\n[END_OF_SEARCH] - ArXiv search complete. Use this information to answer the question."
|
| 336 |
-
|
| 337 |
-
print(f"DEBUG: Final ArXiv result length: {len(result)}")
|
| 338 |
-
return result
|
| 339 |
-
|
| 340 |
except Exception as e:
|
| 341 |
-
|
| 342 |
-
print(f"DEBUG: {error_msg}")
|
| 343 |
-
return error_msg
|
| 344 |
|
| 345 |
|
| 346 |
from langchain_openai import ChatOpenAI
|
|
|
|
| 10 |
from duckduckgo_search import DDGS
|
| 11 |
from langchain_core.tools import tool
|
| 12 |
from langchain_community.document_loaders import WikipediaLoader, ArxivLoader
|
| 13 |
+
import arxiv
|
| 14 |
+
import fitz # PyMuPDF
|
| 15 |
+
import tempfile
|
| 16 |
|
| 17 |
DEFAULT_API_URL = "https://agents-course-unit4-scoring.hf.space"
|
| 18 |
|
|
|
|
| 275 |
return error_msg
|
| 276 |
|
| 277 |
@tool
|
| 278 |
+
def arxiv_search_tool(query: str) -> str:
|
| 279 |
"""
|
| 280 |
TOOL NAME: ArXiv Academic Search Tool
|
| 281 |
|
|
|
|
| 288 |
- "What are recent studies on climate change?"
|
| 289 |
- "Search for papers on quantum computing"
|
| 290 |
"""
|
|
|
|
| 291 |
try:
|
| 292 |
+
# Search arXiv for the top result
|
| 293 |
+
search = arxiv.Search(query=query, max_results=1, sort_by=arxiv.SortCriterion.Relevance)
|
| 294 |
+
result = next(search.results(), None)
|
| 295 |
+
|
| 296 |
+
if not result:
|
| 297 |
+
return "No results found. [END_OF_SEARCH]"
|
| 298 |
+
|
| 299 |
+
# Download PDF
|
| 300 |
+
pdf_url = result.pdf_url
|
| 301 |
+
response = requests.get(pdf_url)
|
| 302 |
+
response.raise_for_status()
|
| 303 |
+
|
| 304 |
+
# Save and open PDF
|
| 305 |
+
with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as tmp:
|
| 306 |
+
tmp.write(response.content)
|
| 307 |
+
tmp.flush()
|
| 308 |
+
|
| 309 |
+
doc = fitz.open(tmp.name)
|
| 310 |
+
text = ""
|
| 311 |
+
for page in doc:
|
| 312 |
+
text += page.get_text()
|
| 313 |
+
|
| 314 |
+
# Clean and trim text
|
| 315 |
+
text = " ".join(text.split())
|
| 316 |
+
summary = text[:3000] + "..." if len(text) > 3000 else text
|
| 317 |
+
|
| 318 |
+
return f"Title: {result.title}\n\nSummary:\n{summary}\n\n[END_OF_SEARCH]"
|
| 319 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 320 |
except Exception as e:
|
| 321 |
+
return f"Error fetching arXiv content: {e} [END_OF_SEARCH]"
|
|
|
|
|
|
|
| 322 |
|
| 323 |
|
| 324 |
from langchain_openai import ChatOpenAI
|