# Spaces: Sleeping
# File size: 6,609 Bytes
# 038ed0c
from langchain_community.document_loaders import WikipediaLoader
from langchain_community.document_loaders import ArxivLoader
from langchain_core.tools import tool
from youtube_transcript_api import YouTubeTranscriptApi
import os
@tool
def multiply(a: int, b: int) -> int:
    """Multiply two numbers.

    Args:
        a: first int
        b: second int

    Returns:
        The product of ``a`` and ``b``.
    """
    product = a * b
    return product
@tool
def wiki_search(query: str) -> str:
    """Search Wikipedia for a query and return up to 4 articles.

    Args:
        query: The search query.

    Returns:
        The matching articles, each introduced by a ``<Document ...>`` header
        and separated by ``---`` lines. Returns ``"(empty query)"`` for a
        blank query, ``"(no Wikipedia results)"`` when nothing is loaded, and
        a ``"Wikipedia search failed: ..."`` message if the lookup raises.
    """
    q = (query or "").strip()
    if not q:
        # Nothing to look up; answer immediately instead of issuing a request.
        return "(empty query)"
    try:
        import wikipedia

        # Configure the `wikipedia` package (English HTTPS endpoint, rate
        # limiting on) before the loader runs.
        wikipedia.API_URL = "https://en.wikipedia.org/w/api.php"
        wikipedia.set_rate_limiting(True)
        search_docs = WikipediaLoader(query=q, load_max_docs=4).load()
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"Wikipedia search failed: {e}"
    # `.get` keeps formatting from raising if a document lacks a metadata key.
    formatted_search_docs = "\n\n---\n\n".join(
        f'<Document source="{doc.metadata.get("source", "")}" '
        f'page="{doc.metadata.get("page", "")}"/>\n{doc.page_content}\n</Document>'
        for doc in search_docs
    )
    return formatted_search_docs or "(no Wikipedia results)"
@tool
def web_search(query: str) -> str:
    """Search the public web via DuckDuckGo (no API key). Returns titles, URLs and short snippets.

    The result count and request timeout are read from the ``DDG_MAX_RESULTS``
    (default 8) and ``DDG_TIMEOUT`` (default 25) environment variables; values
    that are not positive integers fall back to the defaults.

    Args:
        query: The search query.

    Returns:
        One ``<Document ...>`` entry per hit (title, then a snippet capped at
        1500 characters), separated by ``---`` lines. Returns
        ``"(empty query)"`` for a blank query, ``"(no web results)"`` when the
        search yields nothing, and an explanatory message if ``ddgs`` is not
        installed or the search raises.
    """
    q = (query or "").strip()
    if not q:
        # Checked first so a blank query never depends on the optional import
        # or on the environment settings.
        return "(empty query)"
    try:
        from ddgs import DDGS
    except ImportError as e:
        return f"Web search unavailable (install ddgs): {e}"
    max_results = _web_search_env_int("DDG_MAX_RESULTS", 8)
    timeout = _web_search_env_int("DDG_TIMEOUT", 25)
    try:
        with DDGS(timeout=timeout) as ddgs:
            hits = list(ddgs.text(q, max_results=max_results))
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"DuckDuckGo search failed: {e}"
    if not hits:
        return "(no web results)"
    parts: list[str] = []
    for r in hits:
        title = (r.get("title") or "").strip()
        # The URL is looked up under "href" first, then "url".
        url = (r.get("href") or r.get("url") or "").strip()
        body = (r.get("body") or "")[:1500]
        parts.append(f'<Document source="{url}" page=""/>\n{title}\n{body}\n</Document>')
    return "\n\n---\n\n".join(parts)


# Defined below web_search on purpose: the decorator line that precedes
# web_search in the file must keep applying to the tool, not to this helper.
def _web_search_env_int(name: str, default: int) -> int:
    """Return the positive integer stored in env var ``name``, else ``default``."""
    raw = os.getenv(name)
    if raw is None:
        return default
    try:
        value = int(raw.strip())
    except ValueError:
        # A malformed setting must not crash the tool; use the default.
        return default
    return value if value > 0 else default
@tool
def arvix_search(query: str) -> str:
    """Search Arxiv for a query and return at most 3 results.

    Args:
        query: The search query.

    Returns:
        One ``<Document ...>`` entry per paper with the first 1000 characters
        of its content, separated by ``---`` lines. Returns
        ``"(empty query)"`` for a blank query, ``"(no Arxiv results)"`` when
        nothing is loaded, and an ``"Arxiv search failed: ..."`` message if
        the lookup raises.
    """
    q = (query or "").strip()
    if not q:
        # Nothing to look up; answer immediately instead of issuing a request.
        return "(empty query)"
    try:
        search_docs = ArxivLoader(query=q, load_max_docs=3).load()
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"Arxiv search failed: {e}"
    formatted_search_docs = "\n\n---\n\n".join(
        # The source falls back to the "entry_id" metadata key, then to "".
        f'<Document source="{doc.metadata.get("source", doc.metadata.get("entry_id", ""))}" '
        f'page="{doc.metadata.get("page", "")}"/>\n{doc.page_content[:1000]}\n</Document>'
        for doc in search_docs
    )
    return formatted_search_docs or "(no Arxiv results)"
@tool
def execute_python_code(source: str) -> str:
    """Run Python source in a separate subprocess (same interpreter) and return its output.

    Use when the question embeds or attaches Python code and you need the actual printed/numeric output.

    The time limit comes from the ``PYTHON_TOOL_TIMEOUT`` environment variable
    (seconds, default 45); values that are not positive finite numbers fall
    back to the default.

    Args:
        source: Python source code to execute as a single string.

    Returns:
        On exit code 0: the stripped stdout (or ``"(empty stdout)"``), followed
        by a ``STDERR:`` section when stderr is non-empty. On a non-zero exit:
        a report with the exit code, stdout and stderr. If the time limit is
        hit or the interpreter cannot be started, a short message saying so.
        The result is capped at 8000 characters.
    """
    import os
    import subprocess
    import sys

    # NOTE(security): this is a separate process, not a sandbox. The code runs
    # with this process's privileges and environment, so only pass trusted code.
    default_timeout = 45.0
    try:
        timeout = float(os.getenv("PYTHON_TOOL_TIMEOUT", "45"))
    except ValueError:
        timeout = default_timeout
    if not 0 < timeout < float("inf"):
        # Rejects zero, negatives, inf and NaN (NaN fails every comparison).
        timeout = default_timeout

    try:
        proc = subprocess.run(
            [sys.executable, "-c", source],
            capture_output=True,
            text=True,
            errors="replace",  # undecodable child output must not raise here
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        # subprocess.run kills the child before re-raising; report as text.
        return f"Execution timed out after {timeout:g} seconds"
    except (OSError, ValueError, TypeError) as e:
        # E.g. argument list too long, embedded NUL byte, or a non-string source.
        return f"Failed to run Python code: {e}"

    out = (proc.stdout or "").strip()
    err = (proc.stderr or "").strip()
    if proc.returncode != 0:
        combined = f"exit={proc.returncode}\nSTDOUT:\n{out}\nSTDERR:\n{err}".strip()
        return combined[:8000]
    text = out if out else "(empty stdout)"
    if err:
        text = f"{text}\nSTDERR:\n{err}"
    return text[:8000]
@tool
def read_excel_format(file_path: str) -> str:
    """Read an Excel (.xlsx) file and return all its sheets as Markdown tables.

    Use this tool whenever the question references a spreadsheet or .xlsx file.
    Prefer this over execute_python_code when you just need to read and reason about
    tabular data — no need to write any code.

    Args:
        file_path: Absolute path to the .xlsx file as provided in the 'file_path' field of the question.

    Returns:
        A header naming the file, then one section per sheet with its row and
        column counts and its contents as a table. A sheet that cannot be
        parsed gets an inline error note instead of a table. Returns a short
        message if the file is missing, pandas is not installed, or the
        workbook cannot be opened.
    """
    # Cheap path check first; it needs no optional dependency.
    if not file_path or not os.path.exists(file_path):
        return f"File not found: {file_path}"
    try:
        import pandas as pd
    except ImportError:
        return "pandas is not installed. Run: pip install pandas openpyxl"
    try:
        xl = pd.ExcelFile(file_path)
    except Exception as e:
        return f"Failed to open Excel file: {e}"
    filename = os.path.basename(file_path)
    parts: list[str] = [f"**File:** `{filename}`\n"]
    # The context manager closes the workbook handle even if a sheet fails.
    with xl:
        for sheet_name in xl.sheet_names:
            try:
                df = xl.parse(sheet_name)
            except Exception as e:
                parts.append(f"### Sheet: {sheet_name}\n(error reading sheet: {e})\n")
                continue
            parts.append(f"### Sheet: `{sheet_name}` — {df.shape[0]} rows × {df.shape[1]} columns\n")
            try:
                table = df.to_markdown(index=False)
            except ImportError:
                # to_markdown needs the optional `tabulate` package; fall back
                # to a plain-text table rather than failing the whole read.
                table = df.to_string(index=False)
            parts.append(table)
            parts.append("")
    return "\n".join(parts)
@tool
def YouTubeVideoAnalysisTool(video_id: str) -> str:
    """Fetch the transcript of a YouTube video and return it as plain text.

    Args:
        video_id: The ID of the YouTube video. A full video URL (watch,
            youtu.be, shorts, embed or live link) is also accepted; the ID is
            extracted from it.

    Returns:
        The transcript snippets joined with spaces, prefixed by a fixed
        sentence. Returns ``"(empty video id)"`` for a blank argument and an
        error message if fetching the transcript raises.
    """
    vid = _youtube_video_id(video_id)
    if not vid:
        return "(empty video id)"
    try:
        fetched = YouTubeTranscriptApi().fetch(vid)
        full_transcript = " ".join(snippet.text for snippet in fetched)
    except Exception as e:
        # Tool boundary: report the failure as text rather than raising.
        return f"An error occurred while fetching the YouTube transcript: {e}"
    return "the transcript of the youtube video is the following: " + full_transcript


# Defined below the tool on purpose: the decorator line that precedes
# YouTubeVideoAnalysisTool in the file must keep applying to the tool itself.
def _youtube_video_id(value) -> str:
    """Return the video ID contained in ``value`` (a bare ID or a video URL)."""
    from urllib.parse import parse_qs, urlparse

    text = (value or "").strip()
    # A bare ID is not expected to contain "/" or "."; anything that does is
    # treated as a URL candidate.
    if "/" not in text and "." not in text:
        return text
    try:
        parsed = urlparse(text if "://" in text else "https://" + text)
        host = (parsed.hostname or "").lower()
    except ValueError:
        # Malformed URL: hand the text through unchanged and let fetch report it.
        return text
    segments = [segment for segment in parsed.path.split("/") if segment]
    if host in ("youtu.be", "www.youtu.be"):
        return segments[0] if segments else text
    if host == "youtube.com" or host.endswith((".youtube.com", "youtube-nocookie.com")):
        from_query = parse_qs(parsed.query).get("v")
        if from_query:
            return from_query[0]
        if len(segments) >= 2 and segments[0] in ("shorts", "embed", "live", "v"):
            return segments[1]
    return text
@tool
def transcribe_mp3(file_path: str) -> str:
    """Transcribe an MP3 audio file to text using Whisper (Hugging Face Inference API).

    Use this tool when the question references an .mp3 audio file.

    Args:
        file_path: Absolute path to the .mp3 file.

    Returns:
        The transcription text, or ``"(empty transcription)"`` when the
        service returns no text. Returns a short message if the file is
        missing, the ``HF_TOKEN`` environment variable is unset or blank, or
        the request fails.
    """
    if not file_path or not os.path.exists(file_path):
        return f"File not found: {file_path}"
    # A whitespace-only token is as unusable as a missing one.
    token = (os.getenv("HF_TOKEN") or "").strip()
    if not token:
        return "HF_TOKEN is not set in the environment."
    try:
        from huggingface_hub import InferenceClient

        client = InferenceClient(api_key=token)
        with open(file_path, "rb") as f:
            audio = f.read()
        output = client.automatic_speech_recognition(
            audio,
            model="openai/whisper-large-v3",
        )
        return output.text or "(empty transcription)"
    except Exception as e:
        # Tool boundary: covers a missing huggingface_hub, I/O errors and API
        # failures; report as text rather than raising.
        return f"Transcription failed: {e}"