Spaces:

vROMjs
/

vrom-hub

Sleeping

App Files Files Community

philipp-zettl committed on Apr 25

Commit

da365b2

verified ·

1 Parent(s): 7afed3f

Add vrom_hub/fetcher.py

Browse files

Files changed (1) hide show

vrom_hub/fetcher.py +199 -0

vrom_hub/fetcher.py ADDED Viewed

	@@ -0,0 +1,199 @@

+"""
+Documentation page fetcher.
+Supports:
+- HF documentation pages (explore_hf_docs style)
+- Raw markdown content
+- URLs to fetch markdown/HTML
+"""
+from __future__ import annotations
+import logging
+import re
+from typing import Optional
+from urllib.parse import urlparse
+import requests
+from vrom_hub.chunker import DocPage
+logger = logging.getLogger(__name__)
+def _url_to_source_file(url: str) -> str:
+    """Convert a URL to a source_file path (e.g. 'trl/index.md')."""
+    parsed = urlparse(url)
+    path = parsed.path.rstrip("/")
+    # HF docs pattern: /docs/{lib}/{page}
+    hf_match = re.match(r'/docs/([^/]+)/(.*)', path)
+    if hf_match:
+        lib = hf_match.group(1)
+        page = hf_match.group(2) or "index"
+        return f"{lib}/{page}.md"
+    # Generic: use last path segments
+    segments = [s for s in path.split("/") if s]
+    if segments:
+        return "/".join(segments[-2:]) + ".md" if len(segments) >= 2 else segments[-1] + ".md"
+    return "unknown.md"
+def _extract_title_from_markdown(content: str) -> str:
+    """Extract the first heading from markdown content."""
+    match = re.search(r'^#\s+(.+)$', content, re.MULTILINE)
+    if match:
+        return match.group(1).strip()
+    # Fallback: first non-empty line
+    for line in content.split('\n'):
+        line = line.strip()
+        if line:
+            return line[:100]
+    return "Untitled"
+def _html_to_markdown(html: str) -> str:
+    """Basic HTML to markdown conversion (strip tags, preserve structure)."""
+    # Remove script and style tags
+    html = re.sub(r'<script[\s\S]*?</script>', '', html, flags=re.IGNORECASE)
+    html = re.sub(r'<style[\s\S]*?</style>', '', html, flags=re.IGNORECASE)
+    # Convert common tags
+    html = re.sub(r'<h1[^>]*>(.*?)</h1>', r'# \1', html, flags=re.IGNORECASE | re.DOTALL)
+    html = re.sub(r'<h2[^>]*>(.*?)</h2>', r'## \1', html, flags=re.IGNORECASE | re.DOTALL)
+    html = re.sub(r'<h3[^>]*>(.*?)</h3>', r'### \1', html, flags=re.IGNORECASE | re.DOTALL)
+    html = re.sub(r'<h4[^>]*>(.*?)</h4>', r'#### \1', html, flags=re.IGNORECASE | re.DOTALL)
+    html = re.sub(r'<code[^>]*>(.*?)</code>', r'`\1`', html, flags=re.IGNORECASE | re.DOTALL)
+    html = re.sub(r'<pre[^>]*>(.*?)</pre>', r'```\n\1\n```', html, flags=re.IGNORECASE | re.DOTALL)
+    html = re.sub(r'<br\s*/?>', '\n', html, flags=re.IGNORECASE)
+    html = re.sub(r'<p[^>]*>', '\n\n', html, flags=re.IGNORECASE)
+    html = re.sub(r'</p>', '', html, flags=re.IGNORECASE)
+    html = re.sub(r'<li[^>]*>', '\n- ', html, flags=re.IGNORECASE)
+    # Strip remaining tags
+    html = re.sub(r'<[^>]+>', '', html)
+    # Clean up whitespace
+    html = re.sub(r'\n{3,}', '\n\n', html)
+    return html.strip()
+class DocFetcher:
+    """
+    Fetches documentation pages and converts them to DocPage objects.
+    """
+    def __init__(self, timeout: int = 30):
+        self.timeout = timeout
+        self.session = requests.Session()
+        self.session.headers.update({
+            "User-Agent": "vROM-Hub-Backend/0.1.0"
+        })
+    def fetch_url(self, url: str, title: str | None = None) -> DocPage:
+        """
+        Fetch a documentation page from a URL.
+        Handles both markdown and HTML responses.
+        """
+        logger.info(f"Fetching: {url}")
+        resp = self.session.get(url, timeout=self.timeout)
+        resp.raise_for_status()
+        content_type = resp.headers.get("content-type", "")
+        content = resp.text
+        # Convert HTML to markdown if needed
+        if "html" in content_type.lower():
+            content = _html_to_markdown(content)
+        if title is None:
+            title = _extract_title_from_markdown(content)
+        source_file = _url_to_source_file(url)
+        return DocPage(
+            content=content,
+            source_file=source_file,
+            url=url,
+            title=title,
+        )
+    def from_markdown(
+        self,
+        content: str,
+        url: str = "",
+        source_file: str = "doc.md",
+        title: str | None = None,
+    ) -> DocPage:
+        """Create a DocPage from raw markdown content."""
+        if title is None:
+            title = _extract_title_from_markdown(content)
+        return DocPage(
+            content=content,
+            source_file=source_file,
+            url=url,
+            title=title,
+        )
+    def fetch_hf_docs(self, endpoint: str, pages: list[str] | None = None) -> list[DocPage]:
+        """
+        Fetch documentation pages from Hugging Face docs.
+        Args:
+            endpoint: Library name (e.g. "trl", "transformers", "peft")
+            pages: Specific page paths to fetch (e.g. ["index", "sft_trainer"]).
+                   If None, fetches the index page.
+        Returns:
+            List of DocPage objects
+        """
+        if pages is None:
+            pages = ["index"]
+        doc_pages = []
+        base_url = f"https://huggingface.co/docs/{endpoint}"
+        for page in pages:
+            url = f"{base_url}/{page}"
+            md_url = f"https://huggingface.co/docs/{endpoint}/{page}.md"
+            try:
+                # Try markdown version first
+                doc_page = self.fetch_url(md_url, title=None)
+                doc_page.url = url  # Use clean URL
+                doc_pages.append(doc_page)
+            except requests.HTTPError:
+                try:
+                    # Fallback to HTML
+                    doc_page = self.fetch_url(url, title=None)
+                    doc_pages.append(doc_page)
+                except requests.HTTPError as e:
+                    logger.warning(f"Failed to fetch {url}: {e}")
+        return doc_pages
+    def from_pages(self, pages: list[dict]) -> list[DocPage]:
+        """
+        Convert a list of page dicts to DocPage objects.
+        Each dict should have:
+            - content: str (markdown)
+            - url: str (optional)
+            - source_file: str (optional)
+            - title: str (optional)
+        """
+        doc_pages = []
+        for p in pages:
+            content = p["content"]
+            url = p.get("url", "")
+            source_file = p.get("source_file", _url_to_source_file(url) if url else "doc.md")
+            title = p.get("title", _extract_title_from_markdown(content))
+            doc_pages.append(DocPage(
+                content=content,
+                source_file=source_file,
+                url=url,
+                title=title,
+            ))
+        return doc_pages