Spaces:
Runtime error
Runtime error
| from __future__ import annotations | |
| import os | |
| import json | |
| import csv | |
| from typing import Optional | |
| import pdfplumber | |
| import httpx | |
| from bs4 import BeautifulSoup | |
| from langchain_core.tools import tool | |
| from langchain_community.tools import DuckDuckGoSearchRun | |
| # ------------------------- | |
| # 1) DuckDuckGo search tool | |
| # ------------------------- | |
# One DuckDuckGo runner is created at module level and shared by web_search.
_ddg = DuckDuckGoSearchRun()


def web_search(query: str) -> str:
    """Run *query* through DuckDuckGo and return the textual results.

    Args:
        query: Search phrase, forwarded unchanged to the shared runner.

    Returns:
        The string produced by the runner's ``run`` call for this query.
    """
    search_results = _ddg.run(query)
    return search_results
| # # ------------------------- | |
| # # 2) Local file reader tool | |
| # # ------------------------- | |
| # def _read_pdf(path: str) -> str: | |
| # text = [] | |
| # with pdfplumber.open(path) as pdf: | |
| # for page in pdf.pages: | |
| # page_text = page.extract_text() | |
| # if page_text: | |
| # text.append(page_text) | |
| # return "\n".join(text) | |
| # def _read_json(path: str) -> str: | |
| # with open(path, "r", encoding="utf-8") as f: | |
| # data = json.load(f) | |
| # return json.dumps(data, indent=2, ensure_ascii=False) | |
| # def _read_txt(path: str) -> str: | |
| # with open(path, "r", encoding="utf-8") as f: | |
| # return f.read() | |
| # def _read_csv(path: str) -> str: | |
| # rows = [] | |
| # with open(path, newline="", encoding="utf-8") as f: | |
| # reader = csv.reader(f) | |
| # for row in reader: | |
| # rows.append(", ".join(row)) | |
| # return "\n".join(rows) | |
| # @tool("file_reader") | |
| # def file_reader(path: str) -> str: | |
| # """ | |
| # Read local files and return extracted text. | |
| # Supports PDF, JSON, TXT, and CSV. | |
| # """ | |
| # if not os.path.exists(path): | |
| # return f"Error: file not found at {path}" | |
| # ext = os.path.splitext(path)[1].lower() | |
| # try: | |
| # if ext == ".pdf": | |
| # return _read_pdf(path) | |
| # if ext == ".json": | |
| # return _read_json(path) | |
| # if ext == ".txt": | |
| # return _read_txt(path) | |
| # if ext == ".csv": | |
| # return _read_csv(path) | |
| # return f"Unsupported file type: {ext}" | |
| # except Exception as e: | |
| # return f"Error reading file: {e}" | |
| # ------------------------- | |
| # 3) Web fetch tool | |
| # ------------------------- | |
def _clean_html_to_text(html: str, max_lines: int = 5000) -> str:
    """Reduce an HTML document to plain text made of non-blank lines.

    Args:
        html: Raw HTML markup to convert.
        max_lines: Upper bound on how many cleaned lines are kept.

    Returns:
        The stripped, non-empty text lines joined with newlines, limited to
        the first ``max_lines`` lines.
    """
    document = BeautifulSoup(html, "html.parser")

    # Drop script/style payloads and page chrome before extracting text.
    noisy_tags = ["script", "style", "noscript", "nav", "footer", "header", "aside"]
    for element in document(noisy_tags):
        element.decompose()

    raw_text = document.get_text(separator="\n")

    kept = []
    for raw_line in raw_text.splitlines():
        stripped = raw_line.strip()
        if stripped:
            kept.append(stripped)

    return "\n".join(kept[:max_lines])
def web_fetch(url: str) -> str:
    """
    Retrieves and reads the text content of a specific URL.
    Use this to read articles, documentation, or static webpages.
    Do NOT use this tool for YouTube URLs (use 'youtube_transcript' instead).

    Limitations:
    - Returns cleaned plain text, not raw HTML.
    - Cannot execute JavaScript (may fail on heavy SPAs or dynamic sites).
    - Content is truncated at 5000 lines.

    Args:
        url: Address of the page to download.

    Returns:
        The cleaned page text, or a string starting with
        "Error fetching page:" when the request or parsing fails.
    """
    # Some sites block requests with an empty User-Agent, so send one.
    request_headers = {"User-Agent": "Mozilla/5.0 (compatible; LangChainTool/1.0)"}

    try:
        with httpx.Client(follow_redirects=True, timeout=20) as session:
            response = session.get(url, headers=request_headers)
            # Turn 4xx/5xx responses into exceptions so they are reported below.
            response.raise_for_status()
            page_text = _clean_html_to_text(response.text, max_lines=5000)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error fetching page: {e}"
    return page_text
| from langchain_core.tools import tool | |
| from youtube_transcript_api import YouTubeTranscriptApi | |
| def _extract_video_id(url: str) -> str: | |
| # handles https://www.youtube.com/watch?v=VIDEOID | |
| import urllib.parse as up | |
| q = up.urlparse(url) | |
| if q.hostname in ("www.youtube.com", "youtube.com"): | |
| return up.parse_qs(q.query).get("v", [""])[0] | |
| if q.hostname == "youtu.be": | |
| return q.path.lstrip("/") | |
| return "" | |
def youtube_transcript(url: str) -> str:
    """
    Retrieves the full English transcript text from a YouTube video URL.
    Use this tool when a user asks questions about a video's content, wants a summary,
    or needs specific quotes.
    Note: This tool only supports videos with English captions/subtitles.

    Args:
        url: A YouTube video URL.

    Returns:
        The transcript text with one caption chunk per line, or a string
        starting with "Error" when the video id cannot be parsed or the
        transcript cannot be fetched.
    """
    vid = _extract_video_id(url)
    if not vid:
        return "Error: could not parse video id"
    try:
        if hasattr(YouTubeTranscriptApi, "fetch"):
            # youtube-transcript-api >= 1.0 exposes an instance API whose
            # result iterates over snippet objects with a ``text`` attribute;
            # the static get_transcript() is deprecated/removed there.
            fetched = YouTubeTranscriptApi().fetch(vid, languages=["en"])
            texts = [snippet.text for snippet in fetched]
        else:
            # Older releases: static call returning a list of dicts.
            chunks = YouTubeTranscriptApi.get_transcript(vid, languages=["en"])
            texts = [c["text"] for c in chunks]
        return "\n".join(texts)
    except Exception as e:
        # Tool boundary: report the failure as text instead of raising.
        return f"Error fetching transcript: {e}"