Spaces:
Configuration error
Configuration error
| # SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. | |
| # SPDX-License-Identifier: Apache-2.0 | |
| # | |
| # Licensed under the Apache License, Version 2.0 (the "License"); | |
| # you may not use this file except in compliance with the License. | |
| # You may obtain a copy of the License at | |
| # | |
| # http://www.apache.org/licenses/LICENSE-2.0 | |
| # | |
| # Unless required by applicable law or agreed to in writing, software | |
| # distributed under the License is distributed on an "AS IS" BASIS, | |
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
| # See the License for the specific language governing permissions and | |
| # limitations under the License. | |
| """Main content extraction orchestration.""" | |
| from typing import Any | |
| from docutils import nodes | |
| from sphinx.environment import BuildEnvironment | |
| from sphinx.util import logging | |
| from .structured import extract_code_blocks, extract_headings, extract_images, extract_links | |
| from .text import ( | |
| clean_text_for_llm, | |
| extract_clean_text_content, | |
| extract_keywords, | |
| extract_raw_markdown, | |
| extract_summary, | |
| extract_text_content, | |
| ) | |
| logger = logging.getLogger(__name__) | |
def extract_document_content(env: BuildEnvironment, docname: str, content_cache: dict) -> dict[str, Any]:
    """Return the extracted content for ``docname``, computing it at most once.

    A result already stored in ``content_cache`` under ``docname`` is returned
    unchanged. Otherwise the doctree is loaded from ``env``, the main text and
    the additional features are extracted, and the result is stored in
    ``content_cache`` before being returned.

    Any exception raised during extraction is logged with its traceback, and
    an empty content dictionary is cached and returned in its place.
    """
    if docname in content_cache:
        return content_cache[docname]

    try:
        logger.debug(f"Starting content extraction for {docname}")
        doctree = env.get_doctree(docname)
        settings = _get_extraction_settings(env)

        # Main text first; the feature pass fills further keys in place
        # (env is forwarded for link resolution).
        extracted = _extract_main_content(doctree, env, docname, settings)
        _extract_additional_features(extracted, doctree, docname, settings, env)

        logger.debug(f"Successfully extracted content for {docname}")
    except Exception:
        logger.exception(f"Critical error extracting content from {docname}")
        extracted = _get_empty_content_dict()

    # Both the successful result and the empty fallback are cached.
    content_cache[docname] = extracted
    return extracted
def _get_extraction_settings(env: BuildEnvironment) -> dict[str, bool]:
    """Collect the extraction-related flags from the Sphinx configuration.

    Reads ``json_output_settings`` from ``env.app.config`` and maps its keys
    onto the internal flag names used by the extraction helpers.

    Args:
        env: Build environment whose ``app.config`` is inspected.

    Returns:
        A dict with the keys ``fast_extraction``, ``lazy_extraction``,
        ``skip_complex`` (all defaulting to ``False``) and ``filter_clutter``
        (defaulting to ``True``).
    """
    config = getattr(env.app, "config", None)
    # A missing config, a missing attribute, or an explicit ``None`` / empty
    # value all fall back to an empty mapping so the defaults below apply
    # instead of ``.get`` raising on a non-mapping.
    json_settings = (getattr(config, "json_output_settings", None) if config else None) or {}
    return {
        "fast_extraction": json_settings.get("fast_text_extraction", False),
        "lazy_extraction": json_settings.get("lazy_extraction", False),
        "skip_complex": json_settings.get("skip_complex_parsing", False),
        "filter_clutter": json_settings.get("filter_search_clutter", True),
    }
def _extract_main_content(
    doctree: nodes.document, env: BuildEnvironment, docname: str, settings: dict[str, bool]
) -> dict[str, Any]:
    """Build the ``content``/``format`` pair for a document.

    With ``settings["fast_extraction"]`` set, the plain text extractor is used
    directly; otherwise the fallback chain in ``_extract_with_fallbacks`` is
    used. When ``settings["filter_clutter"]`` is set and the extracted content
    is non-empty, it is filtered in place afterwards.

    Any exception is logged as a warning and replaced by an empty text result.
    """
    try:
        if not settings["fast_extraction"]:
            content = _extract_with_fallbacks(doctree, env, docname)
        else:
            text = extract_text_content(doctree)
            content = {"content": text, "format": "text"}
            logger.debug(f"Fast text extraction for {docname}: {len(text)} chars")

        # Filtering is skipped when there is nothing to filter.
        wants_filtering = settings["filter_clutter"]
        if wants_filtering and content.get("content"):
            _apply_content_filtering(content, docname)
    except Exception as exc:  # noqa: BLE001
        logger.warning(f"Error extracting main content from {docname}: {exc}")
        content = {"content": "", "format": "text"}
    return content
def _extract_with_fallbacks(doctree: nodes.document, env: BuildEnvironment, docname: str) -> dict[str, Any]:
    """Return the first non-empty result from a chain of extractors.

    Order of preference:
      1. clean text (``env`` is passed along for link title resolution),
      2. raw markdown,
      3. basic text extraction, returned even when empty.

    The ``format`` key of the result is ``"markdown"`` for the second
    strategy and ``"text"`` for the other two.
    """
    if clean_text := extract_clean_text_content(doctree, env):
        logger.debug(f"Extracted clean text content for {docname}: {len(clean_text)} chars")
        return {"content": clean_text, "format": "text"}

    if raw_markdown := extract_raw_markdown(env, docname):
        logger.debug(f"Fallback to raw markdown for {docname}: {len(raw_markdown)} chars")
        return {"content": raw_markdown, "format": "markdown"}

    # Last resort: whatever the basic extractor yields.
    logger.debug(f"Fallback to basic text extraction for {docname}")
    basic_text = extract_text_content(doctree)
    return {"content": basic_text, "format": "text"}
def _apply_content_filtering(content: dict[str, Any], docname: str) -> None:
    """Replace ``content["content"]`` with its ``clean_text_for_llm`` result.

    The dictionary is modified in place. A debug line is logged only when
    the length of the text changed.
    """
    unfiltered = content["content"]
    size_before = len(unfiltered)

    filtered = clean_text_for_llm(unfiltered)
    content["content"] = filtered
    size_after = len(filtered)

    if size_before == size_after:
        return
    logger.debug(f"Content filtering for {docname}: {size_before} -> {size_after} chars")
def _extract_additional_features(
    content: dict[str, Any],
    doctree: nodes.document,
    docname: str,
    settings: dict[str, bool],
    env: BuildEnvironment | None = None,
) -> None:
    """Populate ``content`` in place with the features beyond the main text.

    Args:
        content: Result dictionary to extend; modified in place.
        doctree: Parsed document the features are extracted from.
        docname: Document name, passed on to the feature helpers.
        settings: Flags as returned by ``_get_extraction_settings``; the
            ``lazy_extraction`` and ``skip_complex`` keys are read.
        env: Optional build environment, forwarded to the complex feature
            extraction.

    Behavior:
        * ``lazy_extraction`` set: every additional feature is set to its
          empty value and nothing is extracted.
        * otherwise: basic features are extracted, complex features are
          extracted or set empty depending on ``skip_complex``, and keywords
          are extracted last.
    """
    if settings["lazy_extraction"]:
        _set_empty_additional_features(content)
        return

    # Extract basic features
    _extract_basic_features(content, doctree, docname)

    # Extract complex features unless they are explicitly skipped
    if settings["skip_complex"]:
        _set_empty_complex_features(content)
    else:
        _extract_complex_features(content, doctree, docname, env)

    # The lazy case returned above, so keywords are always extracted here;
    # they run last because they read the content and headings keys.
    _extract_keywords_feature(content, docname)
def _extract_basic_features(content: dict[str, Any], doctree: nodes.document, docname: str) -> None:
    """Fill in ``content["headings"]`` and ``content["summary"]``.

    Each feature is extracted independently. If an extractor raises, a
    warning is logged and that key falls back to its empty value (``[]``
    for headings, ``""`` for the summary).
    """
    # feature name -> (extractor, value used when the extractor fails)
    extractors = {
        "headings": (extract_headings, []),
        "summary": (extract_summary, ""),
    }
    for name, (extractor, fallback) in extractors.items():
        try:
            extracted = extractor(doctree)
            content[name] = extracted
            if name == "headings":
                logger.debug(f"Extracted {len(extracted)} headings from {docname}")
        except Exception as exc:  # noqa: BLE001, PERF203
            logger.warning(f"Error extracting {name} from {docname}: {exc}")
            content[name] = fallback
def _extract_complex_features(
    content: dict[str, Any],
    doctree: nodes.document,
    docname: str,
    env: BuildEnvironment | None = None,
) -> None:
    """Fill in ``code_blocks``, ``images`` and ``links`` on ``content``.

    The three features are extracted independently and in that order. If an
    extractor raises, a warning is logged and that key is set to an empty
    list.
    """
    # The link extractor is wrapped so all three share one call shape; it is
    # the only one that takes env (for title resolution) and docname.
    extractors = (
        ("code_blocks", extract_code_blocks),
        ("images", extract_images),
        ("links", lambda tree: extract_links(tree, env, docname)),
    )
    for name, extractor in extractors:
        try:
            found = extractor(doctree)
            content[name] = found
            logger.debug(f"Extracted {len(found)} {name} from {docname}")
        except Exception as exc:  # noqa: BLE001, PERF203
            logger.warning(f"Error extracting {name} from {docname}: {exc}")
            content[name] = []
def _extract_keywords_feature(content: dict[str, Any], docname: str) -> None:
    """Set ``content["keywords"]`` from the content text and headings.

    Missing ``content``/``headings`` keys are treated as ``""`` and ``[]``.
    If keyword extraction raises, a warning is logged and the keywords are
    set to an empty list.
    """
    try:
        text = content.get("content", "")
        headings = content.get("headings", [])
        keywords = extract_keywords(text, headings)
        content["keywords"] = keywords
        logger.debug(f"Extracted {len(keywords)} keywords from {docname}")
    except Exception as exc:  # noqa: BLE001
        logger.warning(f"Error extracting keywords from {docname}: {exc}")
        content["keywords"] = []
def _set_empty_additional_features(content: dict[str, Any]) -> None:
    """Reset every additional feature on ``content`` to its empty value.

    Used for lazy extraction: ``summary`` becomes ``""`` and each of the
    other feature keys gets its own fresh empty list.
    """
    content.update(
        {
            "headings": [],
            "summary": "",
            "code_blocks": [],
            "links": [],
            "images": [],
            "keywords": [],
        }
    )
def _set_empty_complex_features(content: dict[str, Any]) -> None:
    """Reset only the complex features on ``content`` to empty lists.

    Affects ``code_blocks``, ``links`` and ``images``; all other keys are
    left untouched.
    """
    complex_keys = ("code_blocks", "links", "images")
    content.update({key: [] for key in complex_keys})
def _get_empty_content_dict() -> dict[str, Any]:
    """Build a fresh, fully-keyed but empty content dictionary.

    Used as the result when extraction fails. Text fields are empty
    strings (with ``format`` set to ``"text"``) and every collection field
    is a new empty list.
    """
    empty: dict[str, Any] = {"content": "", "format": "text"}
    empty["headings"] = []
    empty["summary"] = ""
    for list_key in ("code_blocks", "links", "images", "keywords"):
        empty[list_key] = []
    return empty