"""Web scraping tools for fetching content from URLs."""

import logging

import requests
from bs4 import BeautifulSoup

from .base import register_tool

logger = logging.getLogger(__name__)

# Seconds to wait on the HTTP request before giving up.
_REQUEST_TIMEOUT_SECONDS = 10

# Upper bound on returned body text so the result stays small enough to be
# placed in an LLM context window.
_MAX_CONTENT_CHARS = 3000

# Elements removed from the parsed document before text extraction.
_BOILERPLATE_TAGS = ["script", "style", "nav", "footer", "header"]

# Browser-like User-Agent sent with every request.
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/91.0.4472.124 Safari/537.36"
    )
}

_DEFAULT_TITLE = "No Title"


def _extract_title(soup) -> str:
    """Return the stripped text of the document's <title>, or "No Title"."""
    if soup.title is None:
        return _DEFAULT_TITLE
    # ``.string`` is None for an empty <title> or one with nested markup,
    # which previously made ``.strip()`` raise; ``get_text()`` always
    # returns a str.
    return soup.title.get_text().strip() or _DEFAULT_TITLE


@register_tool(
    name="read_link",
    description="Truy cập và bóc tách nội dung chính từ một đường dẫn (URL). Giúp Agent hiểu nội dung bài báo, tài liệu online.",
    parameters=[
        {
            "name": "url",
            "type": "string",
            "description": "Đường dẫn (URL) cần đọc nội dung.",
            "required": True,
        }
    ],
)
def tool_read_link(url: str) -> dict:
    """Fetch a URL and extract its title and paragraph text.

    The page is downloaded with a browser-like User-Agent and a 10 second
    timeout, parsed with BeautifulSoup's ``html.parser``, stripped of
    script/style/nav/footer/header elements, and the non-empty ``<p>``
    texts are joined with newlines. Text longer than 3000 characters is
    cut to 3000 characters followed by ``"..."``.

    Args:
        url: Address of the page to read.

    Returns:
        On success, ``{"status": "success", "data": {"url", "title",
        "content"}}``. On any failure, ``{"status": "error", "message":
        ...}``; no exception is propagated to the caller.
    """
    try:
        # 1. Fetch the page.
        response = requests.get(
            url, headers=_REQUEST_HEADERS, timeout=_REQUEST_TIMEOUT_SECONDS
        )
        response.raise_for_status()

        # 2. Parse from raw bytes. ``response.text`` falls back to
        # ISO-8859-1 when the server sends no charset, which garbles UTF-8
        # pages; handing bytes to BeautifulSoup lets it honour the
        # document's own <meta charset>. A charset explicitly declared in
        # the HTTP header is still passed through as the preferred encoding.
        content_type = response.headers.get("Content-Type", "")
        declared_encoding = (
            response.encoding if "charset" in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, "html.parser", from_encoding=declared_encoding
        )

        # Drop non-content elements so their text does not leak into the
        # result.
        for element in soup(_BOILERPLATE_TAGS):
            element.decompose()

        # 3. Extract the title and the paragraph text.
        title = _extract_title(soup)

        stripped = (p.get_text().strip() for p in soup.find_all("p"))
        text_content = "\n".join(text for text in stripped if text)

        # Truncate content for LLM context.
        if len(text_content) > _MAX_CONTENT_CHARS:
            text_content = text_content[:_MAX_CONTENT_CHARS] + "..."

        return {
            "status": "success",
            "data": {
                "url": url,
                "title": title,
                "content": text_content,
            },
        }
    except requests.exceptions.Timeout:
        return {
            "status": "error",
            "message": "Yêu cầu quá hạn (timeout) khi truy cập URL.",
        }
    except Exception as e:
        # Tool boundary: report every failure to the caller as an error
        # result rather than raising, but keep the traceback in the log.
        logger.exception("Error in read_link: %s", e)
        return {
            "status": "error",
            "message": f"Không thể đọc nội dung từ link: {str(e)}",
        }