"""HTML processing utilities for ScrapeRL backend."""

import re
from typing import Any, Optional
from bs4 import BeautifulSoup, Comment, NavigableString, Tag

from app.utils.logging import get_logger

logger = get_logger(__name__)


def parse_html(html: str, parser: str = "html.parser") -> BeautifulSoup:
    """
    Parse HTML string into a BeautifulSoup object.

    Args:
        html: Raw HTML string
        parser: Parser to use (html.parser, lxml, html5lib)

    Returns:
        Parsed BeautifulSoup object
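
    Example:
        A minimal doctest sketch; the markup is illustrative:

        >>> soup = parse_html("<p>Hi</p>")
        >>> soup.p.get_text()
        'Hi'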
    """
    return BeautifulSoup(html, parser)


def clean_html(
    html: str,
    remove_scripts: bool = True,
    remove_styles: bool = True,
    remove_comments: bool = True,
    remove_tags: Optional[list[str]] = None,
) -> str:
    """
    Clean HTML by removing unwanted elements.

    Args:
        html: Raw HTML string
        remove_scripts: Remove <script> tags
        remove_styles: Remove <style> tags
        remove_comments: Remove HTML comments
        remove_tags: Additional tags to remove

    Returns:
        Cleaned HTML string
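
    Example:
        A minimal doctest sketch; the markup is illustrative:

        >>> clean_html("<div><script>x()</script><p>Hi</p></div>")
        '<div><p>Hi</p></div>'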
    """
    soup = parse_html(html)

    # Remove script tags
    if remove_scripts:
        for script in soup.find_all("script"):
            script.decompose()

    # Remove style tags
    if remove_styles:
        for style in soup.find_all("style"):
            style.decompose()

    # Remove comments
    if remove_comments:
        for comment in soup.find_all(string=lambda text: isinstance(text, Comment)):
            comment.extract()

    # Remove additional specified tags
    if remove_tags:
        for tag_name in remove_tags:
            for tag in soup.find_all(tag_name):
                tag.decompose()

    return str(soup)


def extract_text(
    html: str,
    separator: str = " ",
    strip: bool = True,
) -> str:
    """
    Extract plain text from HTML.

    Args:
        html: Raw HTML string
        separator: String to join text segments
        strip: Strip whitespace from result

    Returns:
        Extracted plain text
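
    Example:
        A minimal doctest sketch; the markup is illustrative:

        >>> extract_text("<p>Hello</p><p>world</p>")
        'Hello world'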
    """
    soup = parse_html(html)

    # Remove script and style elements
    for element in soup(["script", "style", "noscript"]):
        element.decompose()

    text = soup.get_text(separator=separator)

    if strip:
        # Normalize whitespace
        text = re.sub(r"\s+", " ", text).strip()

    return text


def semantic_chunk(
    html: str,
    max_chunk_size: int = 4000,
    overlap: int = 200,
) -> list[dict[str, Any]]:
    """
    Split HTML content into semantic chunks based on structure.

    Args:
        html: Raw HTML string
        max_chunk_size: Maximum characters per chunk
        overlap: Number of characters to overlap between chunks

    Returns:
        List of chunk dictionaries with text and metadata
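
    Example:
        A minimal doctest sketch with illustrative markup, showing the
        shape of a returned chunk:

        >>> chunks = semantic_chunk("<h1>T</h1><p>Body text.</p>")
        >>> chunks[0]["text"]
        'T Body text.'
        >>> chunks[0]["metadata"]["headings"]
        ['T']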
    """
    soup = parse_html(html)
    chunks: list[dict[str, Any]] = []

    # Remove non-content elements
    for element in soup(["script", "style", "noscript", "nav", "footer", "header"]):
        element.decompose()

    # Find semantic boundaries
    semantic_tags = ["article", "section", "div", "p", "h1", "h2", "h3", "h4", "h5", "h6"]

    def get_text_content(element: Tag | NavigableString) -> str:
        if isinstance(element, NavigableString):
            return str(element).strip()
        return element.get_text(separator=" ", strip=True)

    current_chunk = ""
    current_metadata: dict[str, Any] = {"tags": [], "headings": []}

    for element in soup.find_all(semantic_tags):
        # Skip containers that hold other semantic elements: their text
        # already includes every descendant's text, so emitting both the
        # container and its children would duplicate content across chunks.
        if element.find(semantic_tags) is not None:
            continue

        text = get_text_content(element)
        if not text:
            continue

        tag_name = element.name if isinstance(element, Tag) else "text"

        # Check if adding this would exceed max size
        if len(current_chunk) + len(text) + 1 > max_chunk_size:
            if current_chunk:
                chunks.append({
                    "text": current_chunk.strip(),
                    "metadata": current_metadata.copy(),
                    "char_count": len(current_chunk),
                })
            # Start new chunk with overlap
            if overlap > 0 and current_chunk:
                current_chunk = current_chunk[-overlap:] + " " + text
            else:
                current_chunk = text
            current_metadata = {"tags": [tag_name], "headings": []}
        else:
            current_chunk += " " + text if current_chunk else text
            current_metadata["tags"].append(tag_name)

        # Track headings
        if tag_name in ["h1", "h2", "h3", "h4", "h5", "h6"]:
            current_metadata["headings"].append(text[:100])

    # Add remaining content
    if current_chunk.strip():
        chunks.append({
            "text": current_chunk.strip(),
            "metadata": current_metadata,
            "char_count": len(current_chunk),
        })

    # If no semantic chunks found, fall back to simple chunking
    if not chunks:
        text = extract_text(html)
        # Guard against a non-positive step when overlap >= max_chunk_size,
        # which would make range() raise ValueError.
        step = max(1, max_chunk_size - overlap)
        for i in range(0, len(text), step):
            chunk_text = text[i : i + max_chunk_size]
            if chunk_text.strip():
                chunks.append({
                    "text": chunk_text.strip(),
                    "metadata": {"tags": [], "headings": []},
                    "char_count": len(chunk_text),
                })

    return chunks


def extract_links(
    html: str,
    base_url: Optional[str] = None,
    include_text: bool = True,
) -> list[dict[str, str]]:
    """
    Extract all links from HTML.

    Args:
        html: Raw HTML string
        base_url: Base URL for resolving relative links
        include_text: Include link text in results

    Returns:
        List of link dictionaries with href and optionally text
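
    Example:
        A minimal doctest sketch; the URL and markup are placeholders:

        >>> extract_links('<a href="/a">A</a>', base_url="https://example.com")
        [{'href': 'https://example.com/a', 'text': 'A'}]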
    """
    from urllib.parse import urljoin

    soup = parse_html(html)
    links: list[dict[str, str]] = []

    for anchor in soup.find_all("a", href=True):
        href = anchor.get("href", "")
        if not href or href.startswith("#") or href.startswith("javascript:"):
            continue

        # Resolve relative URLs
        if base_url and not href.startswith(("http://", "https://", "//")):
            href = urljoin(base_url, href)

        link_data: dict[str, str] = {"href": href}

        if include_text:
            link_data["text"] = anchor.get_text(strip=True)

        # Include title if present
        title = anchor.get("title")
        if title:
            link_data["title"] = title

        links.append(link_data)

    return links


def extract_tables(
    html: str,
    include_headers: bool = True,
) -> list[dict[str, Any]]:
    """
    Extract tables from HTML as structured data.

    Args:
        html: Raw HTML string
        include_headers: Try to identify and include header rows

    Returns:
        List of table dictionaries with headers and rows
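
    Example:
        A minimal doctest sketch; the markup is illustrative:

        >>> extract_tables("<table><tr><th>A</th></tr><tr><td>1</td></tr></table>")
        [{'headers': ['A'], 'rows': [['1']]}]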
    """
    soup = parse_html(html)
    tables: list[dict[str, Any]] = []

    for table in soup.find_all("table"):
        table_data: dict[str, Any] = {
            "headers": [],
            "rows": [],
        }

        # Extract headers from thead or first row
        if include_headers:
            thead = table.find("thead")
            if thead:
                header_row = thead.find("tr")
                if header_row:
                    table_data["headers"] = [
                        th.get_text(strip=True)
                        for th in header_row.find_all(["th", "td"])
                    ]

        # Extract body rows
        tbody = table.find("tbody") or table
        for row in tbody.find_all("tr"):
            # When the table has no <tbody>, find_all("tr") on the table
            # itself also returns <thead> rows; skip those here, since they
            # were already consumed as headers above.
            if include_headers and row.find_parent("thead"):
                continue

            cells = row.find_all(["td", "th"])
            row_data = [cell.get_text(strip=True) for cell in cells]

            # If no headers yet and this looks like a header row
            if include_headers and not table_data["headers"] and row.find("th"):
                table_data["headers"] = row_data
            else:
                if row_data:  # Skip empty rows
                    table_data["rows"].append(row_data)

        if table_data["rows"] or table_data["headers"]:
            tables.append(table_data)

    return tables