# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Structured content extraction functions for headings, code blocks, links, and images."""

import re
from typing import TYPE_CHECKING, Any

from docutils import nodes
from sphinx import addnodes
from sphinx.util import logging

if TYPE_CHECKING:
    from sphinx.environment import BuildEnvironment

logger = logging.getLogger(__name__)


def extract_headings(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract headings from document tree."""
    headings = []

    # Extract headings from section nodes
    for node in doctree.traverse(nodes.section):
        # Get the title node
        title_node = node.next_node(nodes.title)
        if title_node:
            title_text = title_node.astext().strip()
            if title_text:
                # Determine heading level based on nesting
                level = 1
                parent = node.parent
                while parent and isinstance(parent, nodes.section):
                    level += 1
                    parent = parent.parent

                # Generate ID (similar to how Sphinx does it)
                heading_id = re.sub(r"[^\w\-_]", "", title_text.lower().replace(" ", "-"))

                headings.append({"text": title_text, "level": level, "id": heading_id})

    # Also check for standalone title nodes (like document title)
    for node in doctree.traverse(nodes.title):
        if node.parent and not isinstance(node.parent, nodes.section):
            title_text = node.astext().strip()
            if title_text:
                heading_id = re.sub(r"[^\w\-_]", "", title_text.lower().replace(" ", "-"))
                headings.append({"text": title_text, "level": 1, "id": heading_id})

    # Remove duplicates while preserving order
    seen = set()
    unique_headings = []
    for heading in headings:
        heading_key = (heading["text"], heading["level"])
        if heading_key not in seen:
            seen.add(heading_key)
            unique_headings.append(heading)

    return unique_headings


def extract_code_blocks(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract code blocks from document tree."""
    code_blocks = []

    for node in doctree.traverse(nodes.literal_block):
        code_content = node.astext().strip()
        if code_content:
            # Try to determine language from classes or attributes
            language = "text"  # default
            if hasattr(node, "attributes") and "classes" in node.attributes:
                classes = node.attributes["classes"]
                for cls in classes:
                    if cls.startswith("language-"):
                        language = cls[9:]  # Remove 'language-' prefix
                        break
                    elif cls in [
                        "python",
                        "bash",
                        "javascript",
                        "json",
                        "yaml",
                        "sql",
                        "html",
                        "css",
                        "cpp",
                        "c",
                        "java",
                        "rust",
                        "go",
                    ]:
                        language = cls
                        break

            # Also check for highlight language
            if hasattr(node, "attributes") and "highlight_args" in node.attributes:
                highlight_args = node.attributes["highlight_args"]
                if "language" in highlight_args:
                    language = highlight_args["language"]

            code_blocks.append({"content": code_content, "language": language})

    return code_blocks


def extract_links(
    doctree: nodes.document,
    env: "BuildEnvironment | None" = None,
    docname: str = "",
) -> list[dict[str, Any]]:
    """Extract links from document tree with enhanced metadata.

    Args:
        doctree: The document tree to extract links from
        env: Optional Sphinx build environment for title resolution
        docname: Current document name for relative URL resolution

    Returns:
        List of link dictionaries with text, url, type, and optional metadata
    """
    links = []

    # Extract standard reference nodes
    for node in doctree.traverse(nodes.reference):
        link = _extract_reference_node(node, env, docname)
        if link:
            links.append(link)

    # Extract download reference nodes
    for node in doctree.traverse(addnodes.download_reference):
        link = _extract_download_reference(node)
        if link:
            links.append(link)

    return links


def _extract_reference_node(
    node: nodes.reference,
    env: "BuildEnvironment | None",
    current_docname: str,
) -> dict[str, Any] | None:
    """Extract metadata from a reference node."""
    link_text = node.astext().strip()
    if not link_text:
        return None

    attrs = getattr(node, "attributes", {})
    link: dict[str, Any] = {"text": link_text, "type": "internal"}

    # Extract URL from various attributes
    if "refuri" in attrs:
        link["url"] = attrs["refuri"]
        # Classify link type
        if attrs["refuri"].startswith(("http://", "https://", "ftp://", "mailto:")):
            link["type"] = "external"
        elif attrs["refuri"].startswith("#"):
            link["type"] = "anchor"
        else:
            link["type"] = "internal"
            # Normalize internal URLs
            link["url"] = _normalize_internal_url(attrs["refuri"], current_docname)
    elif "refid" in attrs:
        link["url"] = f"#{attrs['refid']}"
        link["type"] = "anchor"
    elif "reftarget" in attrs:
        link["url"] = attrs["reftarget"]
        link["type"] = "internal"

    # Extract cross-reference metadata (from :ref:, :doc:, {ref}, {doc}, etc.)
    if "refdoc" in attrs:
        link["target_doc"] = attrs["refdoc"]
        if link["type"] == "internal":
            link["type"] = "cross_reference"
    if "reftype" in attrs:
        link["ref_type"] = attrs["reftype"]

    # Try to improve link text if it looks like a filename
    if env and _looks_like_filename(link_text):
        better_text = _resolve_link_text(link_text, attrs, env)
        if better_text and better_text != link_text:
            link["text"] = better_text
            link["original_text"] = link_text  # Keep original for debugging

    # Only return if we have a URL or target_doc
    if link.get("url") or link.get("target_doc"):
        return link
    return None


def _extract_download_reference(node: addnodes.download_reference) -> dict[str, Any] | None:
    """Extract metadata from a download reference node."""
    link_text = node.astext().strip()
    attrs = getattr(node, "attributes", {})

    if not link_text:
        return None

    link: dict[str, Any] = {
        "text": link_text,
        "type": "download",
    }

    if "reftarget" in attrs:
        link["url"] = attrs["reftarget"]
    if "filename" in attrs:
        link["filename"] = attrs["filename"]

    return link if link.get("url") else None


def _normalize_internal_url(url: str, current_docname: str) -> str:
    """Normalize internal URLs to consistent format.

    Converts .md/.rst extensions to .html and resolves relative paths.
    """
    if not url:
        return url

    # Already absolute or external
    if url.startswith(("/", "http://", "https://", "#")):
        # Just normalize extension for absolute internal paths
        if url.startswith("/"):
            return _normalize_extension(url)
        return url

    # Relative URL - resolve against current document
    if current_docname:
        # Get directory of current document
        if "/" in current_docname:
            base_dir = current_docname.rsplit("/", 1)[0]
            url = f"{base_dir}/{url}"

    return _normalize_extension(url)


def _normalize_extension(url: str) -> str:
    """Normalize file extensions to .html."""
    # Split off anchor if present
    anchor = ""
    if "#" in url:
        url, anchor = url.rsplit("#", 1)
        anchor = f"#{anchor}"

    # Replace source extensions with .html
    for ext in (".md", ".rst", ".txt"):
        if url.endswith(ext):
            url = url[: -len(ext)] + ".html"
            break

    # Add .html if no extension
    if url and not url.endswith(".html") and "." not in url.rsplit("/", 1)[-1]:
        url = url + ".html"

    return url + anchor


def _looks_like_filename(text: str) -> bool:
    """Check if text looks like a filename/docname rather than readable text."""
    if not text:
        return False

    # Single word with no spaces, possibly with path separators
    if " " not in text and ("/" in text or text == text.lower()):
        # But not if it's a reasonable title-like word
        if len(text) > 2 and text[0].isupper() and text[1:].islower():
            return False
        return True

    # Contains path separators
    if "/" in text or "\\" in text:
        return True

    # Ends with file extension
    if re.search(r"\.(md|rst|html|txt)$", text, re.IGNORECASE):
        return True

    return False


def _resolve_link_text(
    text: str,
    attrs: dict[str, Any],
    env: "BuildEnvironment",
) -> str:
    """Try to resolve a filename-like link text to a proper title."""
    # Try to get the target document name
    target_doc = attrs.get("refdoc") or attrs.get("reftarget", "")

    # Clean up the target
    target_doc = target_doc.replace(".html", "").replace(".md", "").replace(".rst", "")

    if target_doc and hasattr(env, "titles") and target_doc in env.titles:
        title_node = env.titles[target_doc]
        if title_node:
            return title_node.astext().strip()

    # Fallback: humanize the filename
    return _humanize_filename(text)


def _humanize_filename(filename: str) -> str:
    """Convert a filename to human-readable text."""
    # Get just the filename part
    if "/" in filename:
        filename = filename.rsplit("/", 1)[-1]

    # Remove extension
    for ext in (".md", ".rst", ".html", ".txt"):
        if filename.endswith(ext):
            filename = filename[: -len(ext)]
            break

    # Replace separators with spaces
    filename = filename.replace("-", " ").replace("_", " ")

    # Title case
    return filename.title()


def extract_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract images from document tree."""
    images = []

    # Extract standalone images
    images.extend(_extract_standalone_images(doctree))

    # Extract images within figures
    images.extend(_extract_figure_images(doctree))

    return images


def _extract_standalone_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract standalone image nodes."""
    images = []
    for node in doctree.traverse(nodes.image):
        if hasattr(node, "attributes"):
            image_info = _build_image_info(node.attributes)
            if image_info:
                images.append(image_info)
    return images


def _extract_figure_images(doctree: nodes.document) -> list[dict[str, Any]]:
    """Extract images from figure nodes."""
    images = []
    for node in doctree.traverse(nodes.figure):
        for img_node in node.traverse(nodes.image):
            if hasattr(img_node, "attributes"):
                image_info = _build_image_info(img_node.attributes)
                if image_info:
                    # Add caption from figure
                    caption = _extract_figure_caption(node)
                    if caption:
                        image_info["caption"] = caption
                    images.append(image_info)
    return images


def _build_image_info(attrs: dict[str, Any]) -> dict[str, Any] | None:
    """Build image info dictionary from attributes."""
    image_src = attrs.get("uri", "")
    if not image_src:
        return None

    image_info = {"src": image_src, "alt": attrs.get("alt", "")}

    # Add optional attributes
    for attr_name in ["title", "width", "height"]:
        if attr_name in attrs:
            image_info[attr_name] = attrs[attr_name]

    return image_info


def _extract_figure_caption(figure_node: nodes.figure) -> str:
    """Extract caption text from figure node."""
    for caption_node in figure_node.traverse(nodes.caption):
        return caption_node.astext().strip()
    return ""
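

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): the extractors above operate on a resolved
# doctree, so a natural caller is a Sphinx "doctree-resolved" event handler.
# The handler name, the ``structured_content`` attribute, and the setup()
# wiring below are hypothetical placeholders, not part of this module's API.
#
#     def _on_doctree_resolved(app, doctree, docname):
#         if not hasattr(app.env, "structured_content"):
#             app.env.structured_content = {}
#         app.env.structured_content[docname] = {
#             "headings": extract_headings(doctree),
#             "code_blocks": extract_code_blocks(doctree),
#             "links": extract_links(doctree, env=app.env, docname=docname),
#             "images": extract_images(doctree),
#         }
#
#     def setup(app):
#         app.connect("doctree-resolved", _on_doctree_resolved)
#         return {"parallel_read_safe": True}
# ---------------------------------------------------------------------------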