Final_Assignment_Template

Sleeping

App Files Files Community

Final_Assignment_Template / tools /webpage.py

jwlee-ai

Upload folder using huggingface_hub

c2446d5 verified 21 days ago

raw

history blame contribute delete

2.85 kB

	"""일반 웹페이지 fetch + 마크다운 변환 툴.

	Content-Type 분기:
	- text/html → BeautifulSoup → markdownify
	- application/pdf → pypdf로 페이지별 텍스트 추출 (arxiv·NASA TR 등 외부 PDF용)
	"""
	import io
	import re
	import requests
	from bs4 import BeautifulSoup
	from markdownify import markdownify as md
	from smolagents import tool


	def _handle_pdf_url(content: bytes) -> str:
	"""외부 PDF URL 본문을 페이지별 텍스트로 변환. attachments._handle_pdf와 동일 패턴."""
	try:
	from pypdf import PdfReader
	reader = PdfReader(io.BytesIO(content))
	parts = []
	for i, page in enumerate(reader.pages):
	try:
	txt = page.extract_text() or ""
	except Exception as pe:
	txt = f"(extraction failed: {pe})"
	parts.append(f"--- Page {i+1} ---\n{txt}")
	combined = "\n\n".join(parts)
	if len(combined) > 12000:
	combined = combined[:12000] + "\n...[truncated]"
	return f"[PDF, {len(reader.pages)} pages]\n{combined}"
	except Exception as e:
	return f"PDF parse error: {e}"


	@tool
	def visit_webpage(url: str) -> str:
	    """Fetch a web page (HTML or PDF) and return its readable text (truncated to ~12k chars).

	    HTML pages are converted to markdown. PDF URLs are parsed page-by-page via pypdf —
	    useful for arxiv papers, NASA technical reports, and other linked PDF documents.
	    If anything goes wrong, a message beginning with "visit_webpage error" is returned
	    instead of raising an exception.

	    Args:
	        url: The full URL of the webpage or PDF to fetch.
	    """
	    try:
	        from urllib.parse import urlsplit

	        max_chars = 12000

	        # Some sites block requests with an empty User-Agent, so send one explicitly.
	        headers = {"User-Agent": "Mozilla/5.0 (compatible; GAIA-Agent/1.0)"}
	        resp = requests.get(url, headers=headers, timeout=20)
	        resp.raise_for_status()
	        content_type = resp.headers.get("Content-Type", "").lower()
	        payload = resp.content

	        # PDF detection. The Content-Type header and the "%PDF-" magic bytes are the
	        # reliable signals. The URL suffix is only a fallback: it is checked on the
	        # path alone (so "paper.pdf?download=1" still matches) and is ignored when
	        # the server says the body is HTML (e.g. a landing page at a ".pdf" URL).
	        url_path = urlsplit(url).path.lower()
	        is_pdf = (
	            "application/pdf" in content_type
	            or payload[:1024].lstrip().startswith(b"%PDF-")
	            or (url_path.endswith(".pdf") and "html" not in content_type)
	        )
	        if is_pdf:
	            return _handle_pdf_url(payload)

	        # HTML. When a text/* response declares no charset, requests decodes
	        # resp.text as ISO-8859-1, which garbles UTF-8 pages. In that case pass the
	        # raw bytes so BeautifulSoup can detect the encoding itself (for example
	        # from a <meta charset> tag).
	        html_source = resp.text if "charset" in content_type else payload
	        soup = BeautifulSoup(html_source, "html.parser")

	        # Drop blocks that are noise for reading: script, style and noscript.
	        for tag in soup(["script", "style", "noscript"]):
	            tag.decompose()
	        markdown = md(str(soup))

	        # markdownify tends to emit long runs of blank lines; collapse them to save tokens.
	        markdown = re.sub(r"\n{3,}", "\n\n", markdown).strip()

	        # Protect the LLM context: cut very large pages.
	        if len(markdown) > max_chars:
	            markdown = markdown[:max_chars] + "\n...[truncated]"
	        return markdown
	    except Exception as e:
	        # Tool boundary: report the failure as text so the calling agent can react.
	        return f"visit_webpage error: {e}"