Spaces:
Running
Running
| """HTML parsing for web_search / web_fetch.""" | |
| from __future__ import annotations | |
| import html | |
| import re | |
| from html.parser import HTMLParser | |
| from typing import Any | |
| from urllib.parse import parse_qs, unquote, urlparse | |
| class SearchResultParser(HTMLParser): | |
| """DuckDuckGo lite HTML: extract result links and titles.""" | |
| def __init__(self) -> None: | |
| super().__init__() | |
| self.results: list[dict[str, str]] = [] | |
| self._href: str | None = None | |
| self._title_parts: list[str] = [] | |
| def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: | |
| if tag != "a": | |
| return | |
| href = dict(attrs).get("href") | |
| if not href or "uddg=" not in href: | |
| return | |
| parsed = urlparse(href) | |
| query = parse_qs(parsed.query) | |
| uddg = query.get("uddg", [""])[0] | |
| if not uddg: | |
| return | |
| self._href = unquote(uddg) | |
| self._title_parts = [] | |
| def handle_data(self, data: str) -> None: | |
| if self._href is not None: | |
| self._title_parts.append(data) | |
| def handle_endtag(self, tag: str) -> None: | |
| if tag != "a" or self._href is None: | |
| return | |
| title = " ".join("".join(self._title_parts).split()) | |
| if title and not any(result["url"] == self._href for result in self.results): | |
| self.results.append({"title": html.unescape(title), "url": self._href}) | |
| self._href = None | |
| self._title_parts = [] | |
| class HTMLTextParser(HTMLParser): | |
| """Strip scripts/styles and collect visible text + title for fetch previews.""" | |
| def __init__(self) -> None: | |
| super().__init__() | |
| self.title = "" | |
| self.text_parts: list[str] = [] | |
| self._in_title = False | |
| self._skip_depth = 0 | |
| def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: | |
| if tag in {"script", "style", "noscript"}: | |
| self._skip_depth += 1 | |
| elif tag == "title": | |
| self._in_title = True | |
| def handle_endtag(self, tag: str) -> None: | |
| if tag in {"script", "style", "noscript"} and self._skip_depth: | |
| self._skip_depth -= 1 | |
| elif tag == "title": | |
| self._in_title = False | |
| def handle_data(self, data: str) -> None: | |
| text = " ".join(data.split()) | |
| if not text: | |
| return | |
| if self._in_title: | |
| self.title = f"{self.title} {text}".strip() | |
| elif not self._skip_depth: | |
| self.text_parts.append(text) | |
| def content_text(content: Any) -> str: | |
| if isinstance(content, str): | |
| return content | |
| if isinstance(content, list): | |
| parts = [] | |
| for item in content: | |
| if isinstance(item, dict): | |
| parts.append(str(item.get("text", ""))) | |
| else: | |
| parts.append(str(getattr(item, "text", ""))) | |
| return "\n".join(part for part in parts if part) | |
| return str(content) | |
| def extract_query(text: str) -> str: | |
| match = re.search(r"query:\s*(.+)", text, flags=re.IGNORECASE | re.DOTALL) | |
| if match: | |
| return match.group(1).strip().strip("\"'") | |
| return text.strip() | |
| def extract_url(text: str) -> str: | |
| match = re.search(r"https?://\S+", text) | |
| return match.group(0).rstrip(").,]") if match else text.strip() | |