Spaces:

NitishStark
/

INIclaw

Configuration error

File size: 13,021 Bytes

0722e92

# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Structured content extraction functions for headings, code blocks, links, and images."""

import re
from typing import TYPE_CHECKING, Any

from docutils import nodes
from sphinx import addnodes
from sphinx.util import logging

if TYPE_CHECKING:
    from sphinx.environment import BuildEnvironment

logger = logging.getLogger(__name__)


def extract_headings(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract headings from document tree."""
    headings = []

    # Extract headings from section nodes
    for node in doctree.traverse(nodes.section):
        # Get the title node
        title_node = node.next_node(nodes.title)
        if title_node:
            title_text = title_node.astext().strip()
            if title_text:
                # Determine heading level based on nesting
                level = 1
                parent = node.parent
                while parent and isinstance(parent, nodes.section):
                    level += 1
                    parent = parent.parent

                # Generate ID (similar to how Sphinx does it)
                heading_id = re.sub(r"[^\w\-_]", "", title_text.lower().replace(" ", "-"))

                headings.append({"text": title_text, "level": level, "id": heading_id})

    # Also check for standalone title nodes (like document title)
    for node in doctree.traverse(nodes.title):
        if node.parent and not isinstance(node.parent, nodes.section):
            title_text = node.astext().strip()
            if title_text:
                heading_id = re.sub(r"[^\w\-_]", "", title_text.lower().replace(" ", "-"))
                headings.append({"text": title_text, "level": 1, "id": heading_id})

    # Remove duplicates while preserving order
    seen = set()
    unique_headings = []
    for heading in headings:
        heading_key = (heading["text"], heading["level"])
        if heading_key not in seen:
            seen.add(heading_key)
            unique_headings.append(heading)

    return unique_headings


def extract_code_blocks(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract code blocks from document tree."""
    code_blocks = []

    for node in doctree.traverse(nodes.literal_block):
        code_content = node.astext().strip()
        if code_content:
            # Try to determine language from classes or attributes
            language = "text"  # default

            if hasattr(node, "attributes") and "classes" in node.attributes:
                classes = node.attributes["classes"]
                for cls in classes:
                    if cls.startswith("language-"):
                        language = cls[9:]  # Remove 'language-' prefix
                        break
                    elif cls in [
                        "python",
                        "bash",
                        "javascript",
                        "json",
                        "yaml",
                        "sql",
                        "html",
                        "css",
                        "cpp",
                        "c",
                        "java",
                        "rust",
                        "go",
                    ]:
                        language = cls
                        break

            # Also check for highlight language
            if hasattr(node, "attributes") and "highlight_args" in node.attributes:
                highlight_args = node.attributes["highlight_args"]
                if "language" in highlight_args:
                    language = highlight_args["language"]

            code_blocks.append({"content": code_content, "language": language})

    return code_blocks


def extract_links(
    doctree: nodes.document,
    env: "BuildEnvironment | None" = None,
    docname: str = "",
) -> list[dict[str, Any]]:
    """Extract links from document tree with enhanced metadata.

    Args:
        doctree: The document tree to extract links from
        env: Optional Sphinx build environment for title resolution
        docname: Current document name for relative URL resolution

    Returns:
        List of link dictionaries with text, url, type, and optional metadata
    """
    links = []

    # Extract standard reference nodes
    for node in doctree.traverse(nodes.reference):
        link = _extract_reference_node(node, env, docname)
        if link:
            links.append(link)

    # Extract download reference nodes
    for node in doctree.traverse(addnodes.download_reference):
        link = _extract_download_reference(node)
        if link:
            links.append(link)

    return links


def _extract_reference_node(
    node: nodes.reference,
    env: "BuildEnvironment | None",
    current_docname: str,
) -> dict[str, Any] | None:
    """Extract metadata from a reference node."""
    link_text = node.astext().strip()
    if not link_text:
        return None

    attrs = getattr(node, "attributes", {})
    link: dict[str, Any] = {"text": link_text, "type": "internal"}

    # Extract URL from various attributes
    if "refuri" in attrs:
        link["url"] = attrs["refuri"]
        # Classify link type
        if attrs["refuri"].startswith(("http://", "https://", "ftp://", "mailto:")):
            link["type"] = "external"
        elif attrs["refuri"].startswith("#"):
            link["type"] = "anchor"
        else:
            link["type"] = "internal"
            # Normalize internal URLs
            link["url"] = _normalize_internal_url(attrs["refuri"], current_docname)
    elif "refid" in attrs:
        link["url"] = f"#{attrs['refid']}"
        link["type"] = "anchor"
    elif "reftarget" in attrs:
        link["url"] = attrs["reftarget"]
        link["type"] = "internal"

    # Extract cross-reference metadata (from :ref:, :doc:, {ref}, {doc}, etc.)
    if "refdoc" in attrs:
        link["target_doc"] = attrs["refdoc"]
        if link["type"] == "internal":
            link["type"] = "cross_reference"

    if "reftype" in attrs:
        link["ref_type"] = attrs["reftype"]

    # Try to improve link text if it looks like a filename
    if env and _looks_like_filename(link_text):
        better_text = _resolve_link_text(link_text, attrs, env)
        if better_text and better_text != link_text:
            link["text"] = better_text
            link["original_text"] = link_text  # Keep original for debugging

    # Only return if we have a URL or target_doc
    if link.get("url") or link.get("target_doc"):
        return link
    return None


def _extract_download_reference(node: addnodes.download_reference) -> dict[str, Any] | None:
    """Extract metadata from a download reference node."""
    link_text = node.astext().strip()
    attrs = getattr(node, "attributes", {})

    if not link_text:
        return None

    link: dict[str, Any] = {
        "text": link_text,
        "type": "download",
    }

    if "reftarget" in attrs:
        link["url"] = attrs["reftarget"]
    if "filename" in attrs:
        link["filename"] = attrs["filename"]

    return link if link.get("url") else None


def _normalize_internal_url(url: str, current_docname: str) -> str:
    """Normalize internal URLs to consistent format.

    Converts .md/.rst extensions to .html and resolves relative paths.
    """
    if not url:
        return url

    # Already absolute or external
    if url.startswith(("/", "http://", "https://", "#")):
        # Just normalize extension for absolute internal paths
        if url.startswith("/"):
            return _normalize_extension(url)
        return url

    # Relative URL - resolve against current document
    if current_docname:
        # Get directory of current document
        if "/" in current_docname:
            base_dir = current_docname.rsplit("/", 1)[0]
            url = f"{base_dir}/{url}"

    return _normalize_extension(url)


def _normalize_extension(url: str) -> str:
    """Normalize file extensions to .html."""
    # Split off anchor if present
    anchor = ""
    if "#" in url:
        url, anchor = url.rsplit("#", 1)
        anchor = f"#{anchor}"

    # Replace source extensions with .html
    for ext in (".md", ".rst", ".txt"):
        if url.endswith(ext):
            url = url[: -len(ext)] + ".html"
            break

    # Add .html if no extension
    if url and not url.endswith(".html") and "." not in url.rsplit("/", 1)[-1]:
        url = url + ".html"

    return url + anchor


def _looks_like_filename(text: str) -> bool:
    """Check if text looks like a filename/docname rather than readable text."""
    if not text:
        return False

    # Single word with no spaces, possibly with path separators
    if " " not in text and ("/" in text or text == text.lower()):
        # But not if it's a reasonable title-like word
        if len(text) > 2 and text[0].isupper() and text[1:].islower():
            return False
        return True

    # Contains path separators
    if "/" in text or "\\" in text:
        return True

    # Ends with file extension
    if re.search(r"\.(md|rst|html|txt)$", text, re.IGNORECASE):
        return True

    return False


def _resolve_link_text(
    text: str,
    attrs: dict[str, Any],
    env: "BuildEnvironment",
) -> str:
    """Try to resolve a filename-like link text to a proper title."""
    # Try to get the target document name
    target_doc = attrs.get("refdoc") or attrs.get("reftarget", "")

    # Clean up the target
    target_doc = target_doc.replace(".html", "").replace(".md", "").replace(".rst", "")

    if target_doc and hasattr(env, "titles") and target_doc in env.titles:
        title_node = env.titles[target_doc]
        if title_node:
            return title_node.astext().strip()

    # Fallback: humanize the filename
    return _humanize_filename(text)


def _humanize_filename(filename: str) -> str:
    """Convert a filename to human-readable text."""
    # Get just the filename part
    if "/" in filename:
        filename = filename.rsplit("/", 1)[-1]

    # Remove extension
    for ext in (".md", ".rst", ".html", ".txt"):
        if filename.endswith(ext):
            filename = filename[: -len(ext)]
            break

    # Replace separators with spaces
    filename = filename.replace("-", " ").replace("_", " ")

    # Title case
    return filename.title()


def extract_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract images from document tree."""
    images = []

    # Extract standalone images
    images.extend(_extract_standalone_images(doctree))

    # Extract images within figures
    images.extend(_extract_figure_images(doctree))

    return images


def _extract_standalone_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract standalone image nodes."""
    images = []

    for node in doctree.traverse(nodes.image):
        if hasattr(node, "attributes"):
            image_info = _build_image_info(node.attributes)
            if image_info:
                images.append(image_info)

    return images


def _extract_figure_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract images from figure nodes."""
    images = []

    for node in doctree.traverse(nodes.figure):
        for img_node in node.traverse(nodes.image):
            if hasattr(img_node, "attributes"):
                image_info = _build_image_info(img_node.attributes)
                if image_info:
                    # Add caption from figure
                    caption = _extract_figure_caption(node)
                    if caption:
                        image_info["caption"] = caption
                    images.append(image_info)

    return images


def _build_image_info(attrs: dict[str, Any]) -> dict[str, Any] | None:
    """Build image info dictionary from attributes."""
    image_src = attrs.get("uri", "")
    if not image_src:
        return None

    image_info = {"src": image_src, "alt": attrs.get("alt", "")}

    # Add optional attributes
    for attr_name in ["title", "width", "height"]:
        if attr_name in attrs:
            image_info[attr_name] = attrs[attr_name]

    return image_info


def _extract_figure_caption(figure_node: nodes.figure) -> str:
    """Extract caption text from figure node."""
    for caption_node in figure_node.traverse(nodes.caption):
        return caption_node.astext().strip()
    return ""