Spaces:
Running
Running
"""
Web scraping tools for fetching content from URLs.
"""
import logging

import requests
from bs4 import BeautifulSoup

from .base import register_tool

# Module-level logger, named after this module per logging convention.
logger = logging.getLogger(__name__)
def tool_read_link(url: str) -> dict:
    """
    Fetch a URL and extract its title and main textual content.

    Args:
        url: Absolute URL to fetch.

    Returns:
        dict: ``{"status": "success", "data": {"url", "title", "content"}}``
        on success, or ``{"status": "error", "message": ...}`` on any failure
        (timeout, HTTP error, parse error). Never raises to the caller.
    """
    try:
        # Spoof a browser User-Agent: many sites reject default client UAs.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        soup = BeautifulSoup(response.text, 'html.parser')

        # Strip non-content elements before extracting text.
        for element in soup(["script", "style", "nav", "footer", "header"]):
            element.decompose()

        # BUGFIX: soup.title.string is None when <title> contains nested
        # markup; the original called .strip() on it unguarded and raised
        # AttributeError, turning a successful fetch into an error result.
        if soup.title and soup.title.string:
            title = soup.title.string.strip()
        else:
            title = "No Title"

        # Simple extraction: join all non-empty paragraph texts.
        # get_text() is called once per paragraph (the original called it twice).
        paragraph_texts = (p.get_text().strip() for p in soup.find_all('p'))
        text_content = "\n".join(t for t in paragraph_texts if t)

        # Truncate so the result fits comfortably in an LLM context window.
        if len(text_content) > 3000:
            text_content = text_content[:3000] + "..."

        return {
            "status": "success",
            "data": {
                "url": url,
                "title": title,
                "content": text_content,
            },
        }
    except requests.exceptions.Timeout:
        return {"status": "error", "message": "Yêu cầu quá hạn (timeout) khi truy cập URL."}
    except Exception as e:
        # Lazy %-style args avoid formatting work when the log level is off.
        logger.error("Error in read_link: %s", e)
        return {"status": "error", "message": f"Không thể đọc nội dung từ link: {str(e)}"}