# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Main content extraction orchestration."""

from typing import Any

from docutils import nodes
from sphinx.environment import BuildEnvironment
from sphinx.util import logging

from .structured import extract_code_blocks, extract_headings, extract_images, extract_links
from .text import (
    clean_text_for_llm,
    extract_clean_text_content,
    extract_keywords,
    extract_raw_markdown,
    extract_summary,
    extract_text_content,
)

logger = logging.getLogger(__name__)


def extract_document_content(
    env: BuildEnvironment, docname: str, content_cache: dict[str, dict[str, Any]]
) -> dict[str, Any]:
    """Extract content from document optimized for LLM/search use cases.

    Results are memoized in ``content_cache`` keyed by ``docname``. On any
    unexpected failure the exception is logged and an empty content dict is
    cached and returned instead, so this function does not propagate
    ``Exception`` subclasses to the caller.

    Args:
        env: Sphinx build environment used to load the doctree and settings.
        docname: Name of the document to extract.
        content_cache: Mutable mapping used as the per-document cache; it is
            updated in place.

    Returns:
        The content dict for ``docname`` (the same object stored in the cache).
    """
    if docname in content_cache:
        return content_cache[docname]

    try:
        logger.debug(f"Starting content extraction for {docname}")
        doctree = env.get_doctree(docname)

        # Get extraction settings
        extraction_settings = _get_extraction_settings(env)

        # Extract main content
        content = _extract_main_content(doctree, env, docname, extraction_settings)

        # Extract additional features based on settings (pass env for link resolution)
        _extract_additional_features(content, doctree, docname, extraction_settings, env)

        # Cache and return result
        content_cache[docname] = content
        logger.debug(f"Successfully extracted content for {docname}")

    except Exception:
        # Top-level boundary: log with traceback and cache an empty result so
        # the same failing document is not re-processed on later calls.
        logger.exception(f"Critical error extracting content from {docname}")
        content = _get_empty_content_dict()
        content_cache[docname] = content

    return content_cache[docname]


def _get_extraction_settings(env: BuildEnvironment) -> dict[str, bool]:
    """Extract all extraction-related settings from environment config.

    Reads the ``json_output_settings`` config value and maps it onto the
    internal setting names. A missing config object, a missing config value,
    or a value of ``None`` all result in the defaults being used.
    """
    config = getattr(env.app, "config", None)
    # ``or {}`` keeps a config value explicitly set to None from breaking the
    # ``.get`` lookups below.
    json_settings = (getattr(config, "json_output_settings", None) or {}) if config else {}

    return {
        "fast_extraction": json_settings.get("fast_text_extraction", False),
        "lazy_extraction": json_settings.get("lazy_extraction", False),
        "skip_complex": json_settings.get("skip_complex_parsing", False),
        "filter_clutter": json_settings.get("filter_search_clutter", True),
    }


def _extract_main_content(
    doctree: nodes.document, env: BuildEnvironment, docname: str, settings: dict[str, bool]
) -> dict[str, Any]:
    """Extract main text content with appropriate strategy.

    Uses the fast text extractor when ``settings["fast_extraction"]`` is set,
    otherwise the multi-step fallback chain. When ``settings["filter_clutter"]``
    is set and the extracted content is non-empty, the content is filtered.

    Returns:
        A dict with ``"content"`` and ``"format"`` keys. If extraction raises,
        a warning is logged and empty text content is returned.
    """
    content: dict[str, Any] = {}

    try:
        if settings["fast_extraction"]:
            content["content"] = extract_text_content(doctree)
            content["format"] = "text"
            logger.debug(f"Fast text extraction for {docname}: {len(content['content'])} chars")
        else:
            content = _extract_with_fallbacks(doctree, env, docname)

        # Apply content filtering if enabled
        if settings["filter_clutter"] and content.get("content"):
            _apply_content_filtering(content, docname)

    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting main content from {docname}: {e}")
        content = {"content": "", "format": "text"}

    return content


def _extract_with_fallbacks(doctree: nodes.document, env: BuildEnvironment, docname: str) -> dict[str, Any]:
    """Extract content with multiple fallback strategies.

    Tries, in order: clean text, raw markdown, basic text. The first strategy
    that yields a truthy result wins; the basic-text result is returned
    unconditionally as the last resort.
    """
    # Try clean text first (pass env for link title resolution)
    clean_text = extract_clean_text_content(doctree, env)
    if clean_text:
        logger.debug(f"Extracted clean text content for {docname}: {len(clean_text)} chars")
        return {"content": clean_text, "format": "text"}

    # Fallback to raw markdown
    raw_markdown = extract_raw_markdown(env, docname)
    if raw_markdown:
        logger.debug(f"Fallback to raw markdown for {docname}: {len(raw_markdown)} chars")
        return {"content": raw_markdown, "format": "markdown"}

    # Final fallback to basic text
    logger.debug(f"Fallback to basic text extraction for {docname}")
    return {"content": extract_text_content(doctree), "format": "text"}


def _apply_content_filtering(content: dict[str, Any], docname: str) -> None:
    """Apply content filtering to remove clutter.

    Replaces ``content["content"]`` in place with the filtered text and logs
    the before/after lengths when they differ.
    """
    original_length = len(content["content"])
    content["content"] = clean_text_for_llm(content["content"])
    filtered_length = len(content["content"])

    if original_length != filtered_length:
        logger.debug(f"Content filtering for {docname}: {original_length} -> {filtered_length} chars")


def _extract_additional_features(
    content: dict[str, Any],
    doctree: nodes.document,
    docname: str,
    settings: dict[str, bool],
    env: BuildEnvironment | None = None,
) -> None:
    """Extract additional features based on extraction settings.

    Populates ``content`` in place with ``headings``, ``summary``,
    ``code_blocks``, ``links``, ``images`` and ``keywords``. With
    ``settings["lazy_extraction"]`` all of them are set to empty values; with
    ``settings["skip_complex"]`` only the code block, link and image features
    are left empty.
    """
    if settings["lazy_extraction"]:
        _set_empty_additional_features(content)
        return

    # Extract basic features
    _extract_basic_features(content, doctree, docname)

    # Extract complex features if not skipped
    if not settings["skip_complex"]:
        _extract_complex_features(content, doctree, docname, env)
    else:
        _set_empty_complex_features(content)

    # Lazy mode already returned above, so keywords are always extracted here.
    # This runs after the basic features because keyword extraction reads
    # content["headings"].
    _extract_keywords_feature(content, docname)


def _extract_basic_features(content: dict[str, Any], doctree: nodes.document, docname: str) -> None:
    """Extract basic features: headings and summary.

    Each feature is extracted independently; a failure in one logs a warning
    and stores that feature's default value without affecting the other.
    """
    features = [
        ("headings", extract_headings, []),
        ("summary", extract_summary, ""),
    ]

    for feature_name, extract_func, default_value in features:
        try:
            result = extract_func(doctree)
            content[feature_name] = result
            if feature_name == "headings":
                logger.debug(f"Extracted {len(result)} headings from {docname}")
        except Exception as e:  # noqa: BLE001, PERF203
            logger.warning(f"Error extracting {feature_name} from {docname}: {e}")
            content[feature_name] = default_value


def _extract_complex_features(
    content: dict[str, Any],
    doctree: nodes.document,
    docname: str,
    env: BuildEnvironment | None = None,
) -> None:
    """Extract complex features: code blocks, links, and images.

    Each feature is extracted independently; a failure logs a warning and
    stores an empty list for that feature only.
    """
    # Code blocks and images don't need env
    simple_features = [
        ("code_blocks", extract_code_blocks),
        ("images", extract_images),
    ]

    for feature_name, extract_func in simple_features:
        try:
            result = extract_func(doctree)
            content[feature_name] = result
            logger.debug(f"Extracted {len(result)} {feature_name} from {docname}")
        except Exception as e:  # noqa: BLE001, PERF203
            logger.warning(f"Error extracting {feature_name} from {docname}: {e}")
            content[feature_name] = []

    # Links need env for title resolution
    try:
        content["links"] = extract_links(doctree, env, docname)
        logger.debug(f"Extracted {len(content['links'])} links from {docname}")
    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting links from {docname}: {e}")
        content["links"] = []


def _extract_keywords_feature(content: dict[str, Any], docname: str) -> None:
    """Extract keywords from content and headings.

    Reads ``content["content"]`` and ``content["headings"]`` (defaulting to an
    empty string / empty list when absent) and stores the result under
    ``content["keywords"]``; on failure a warning is logged and an empty list
    is stored.
    """
    try:
        content["keywords"] = extract_keywords(content.get("content", ""), content.get("headings", []))
        logger.debug(f"Extracted {len(content['keywords'])} keywords from {docname}")
    except Exception as e:  # noqa: BLE001
        logger.warning(f"Error extracting keywords from {docname}: {e}")
        content["keywords"] = []


def _set_empty_additional_features(content: dict[str, Any]) -> None:
    """Set empty values for all additional features (lazy extraction).

    ``summary`` becomes an empty string; every other feature becomes a fresh
    empty list.
    """
    features = ["headings", "summary", "code_blocks", "links", "images", "keywords"]
    for feature in features:
        content[feature] = [] if feature != "summary" else ""


def _set_empty_complex_features(content: dict[str, Any]) -> None:
    """Set empty values for complex features only."""
    for feature in ["code_blocks", "links", "images"]:
        content[feature] = []


def _get_empty_content_dict() -> dict[str, Any]:
    """Get empty content dictionary for error cases.

    Returns a new dict on every call so callers can mutate it safely.
    """
    return {
        "content": "",
        "format": "text",
        "headings": [],
        "summary": "",
        "code_blocks": [],
        "links": [],
        "images": [],
        "keywords": [],
    }