Yash030's picture
Deploy claude-code-nvidia proxy to Hugging Face Spaces
0157ac7
"""HTML parsing for web_search / web_fetch."""
from __future__ import annotations
import html
import re
from html.parser import HTMLParser
from typing import Any
from urllib.parse import parse_qs, unquote, urlparse
class SearchResultParser(HTMLParser):
"""DuckDuckGo lite HTML: extract result links and titles."""
def __init__(self) -> None:
super().__init__()
self.results: list[dict[str, str]] = []
self._href: str | None = None
self._title_parts: list[str] = []
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag != "a":
return
href = dict(attrs).get("href")
if not href or "uddg=" not in href:
return
parsed = urlparse(href)
query = parse_qs(parsed.query)
uddg = query.get("uddg", [""])[0]
if not uddg:
return
self._href = unquote(uddg)
self._title_parts = []
def handle_data(self, data: str) -> None:
if self._href is not None:
self._title_parts.append(data)
def handle_endtag(self, tag: str) -> None:
if tag != "a" or self._href is None:
return
title = " ".join("".join(self._title_parts).split())
if title and not any(result["url"] == self._href for result in self.results):
self.results.append({"title": html.unescape(title), "url": self._href})
self._href = None
self._title_parts = []
class HTMLTextParser(HTMLParser):
"""Strip scripts/styles and collect visible text + title for fetch previews."""
def __init__(self) -> None:
super().__init__()
self.title = ""
self.text_parts: list[str] = []
self._in_title = False
self._skip_depth = 0
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
if tag in {"script", "style", "noscript"}:
self._skip_depth += 1
elif tag == "title":
self._in_title = True
def handle_endtag(self, tag: str) -> None:
if tag in {"script", "style", "noscript"} and self._skip_depth:
self._skip_depth -= 1
elif tag == "title":
self._in_title = False
def handle_data(self, data: str) -> None:
text = " ".join(data.split())
if not text:
return
if self._in_title:
self.title = f"{self.title} {text}".strip()
elif not self._skip_depth:
self.text_parts.append(text)
def content_text(content: Any) -> str:
if isinstance(content, str):
return content
if isinstance(content, list):
parts = []
for item in content:
if isinstance(item, dict):
parts.append(str(item.get("text", "")))
else:
parts.append(str(getattr(item, "text", "")))
return "\n".join(part for part in parts if part)
return str(content)
def extract_query(text: str) -> str:
match = re.search(r"query:\s*(.+)", text, flags=re.IGNORECASE | re.DOTALL)
if match:
return match.group(1).strip().strip("\"'")
return text.strip()
def extract_url(text: str) -> str:
match = re.search(r"https?://\S+", text)
return match.group(0).rstrip(").,]") if match else text.strip()