Spaces:

Nitishkumar-ai
/

INIclaw

Configuration error

INIclaw / docs /_ext /json_output /content /extractor.py

Nitish kumar

Upload folder using huggingface_hub

0722e92 verified 18 days ago

9.17 kB

	# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Main content extraction orchestration."""

	from typing import Any

	from docutils import nodes
	from sphinx.environment import BuildEnvironment
	from sphinx.util import logging

	from .structured import extract_code_blocks, extract_headings, extract_images, extract_links
	from .text import (
	clean_text_for_llm,
	extract_clean_text_content,
	extract_keywords,
	extract_raw_markdown,
	extract_summary,
	extract_text_content,
	)

	# Module-level logger; note that ``logging`` here is ``sphinx.util.logging``
	# (see the imports above), not the standard-library module.
	logger = logging.getLogger(__name__)


	def extract_document_content(env: BuildEnvironment, docname: str, content_cache: dict) -> dict[str, Any]:
	    """Return the extracted content for ``docname``, memoised in ``content_cache``.

	    On a cache miss the doctree is loaded from ``env``, the main content and
	    the additional features are extracted, and the result is stored in
	    ``content_cache``. If anything in that pipeline raises, the exception is
	    logged and an empty content dictionary is cached instead, so the caller
	    always receives a dictionary.

	    Args:
	        env: Sphinx build environment used to load the doctree and settings.
	        docname: Name of the document to extract.
	        content_cache: Mutable mapping of docname -> extracted content.

	    Returns:
	        The cached content dictionary for ``docname``.
	    """
	    if docname not in content_cache:
	        try:
	            logger.debug(f"Starting content extraction for {docname}")
	            tree = env.get_doctree(docname)
	            settings = _get_extraction_settings(env)

	            # Main text first, then the optional features (env is passed for link resolution).
	            extracted = _extract_main_content(tree, env, docname, settings)
	            _extract_additional_features(extracted, tree, docname, settings, env)

	            content_cache[docname] = extracted
	            logger.debug(f"Successfully extracted content for {docname}")
	        except Exception:
	            logger.exception(f"Critical error extracting content from {docname}")
	            # Cache the empty result too, so a failing document is not retried.
	            content_cache[docname] = _get_empty_content_dict()

	    return content_cache[docname]


	def _get_extraction_settings(env: BuildEnvironment) -> dict[str, bool]:
	    """Read the extraction-related flags from the Sphinx configuration.

	    The flags are looked up in ``env.app.config.json_output_settings``. A
	    missing config object, a missing ``json_output_settings`` attribute, or a
	    value of ``None`` all result in the default flags being returned.

	    Args:
	        env: Sphinx build environment whose ``app.config`` is inspected.

	    Returns:
	        A dictionary with the keys ``fast_extraction``, ``lazy_extraction``,
	        ``skip_complex`` (all defaulting to ``False``) and ``filter_clutter``
	        (defaulting to ``True``).
	    """
	    config = getattr(env.app, "config", None)
	    raw_settings = getattr(config, "json_output_settings", None) if config else None
	    # ``or {}`` also covers an explicit ``json_output_settings = None``, which
	    # would otherwise fail on the ``.get`` calls below.
	    json_settings = raw_settings or {}

	    return {
	        "fast_extraction": json_settings.get("fast_text_extraction", False),
	        "lazy_extraction": json_settings.get("lazy_extraction", False),
	        "skip_complex": json_settings.get("skip_complex_parsing", False),
	        "filter_clutter": json_settings.get("filter_search_clutter", True),
	    }


	def _extract_main_content(
	    doctree: nodes.document, env: BuildEnvironment, docname: str, settings: dict[str, bool]
	) -> dict[str, Any]:
	    """Extract the main text of a document using the configured strategy.

	    With ``settings["fast_extraction"]`` the plain text extractor is used
	    directly; otherwise the fallback chain in ``_extract_with_fallbacks`` is
	    tried. Non-empty content is then filtered when
	    ``settings["filter_clutter"]`` is set.

	    Returns:
	        A dictionary with ``content`` and ``format`` keys. If any step raises,
	        a warning is logged and ``{"content": "", "format": "text"}`` is
	        returned.
	    """
	    try:
	        if not settings["fast_extraction"]:
	            extracted = _extract_with_fallbacks(doctree, env, docname)
	        else:
	            extracted = {"content": extract_text_content(doctree), "format": "text"}
	            logger.debug(f"Fast text extraction for {docname}: {len(extracted['content'])} chars")

	        # Filtering only makes sense when there is something to filter.
	        if settings["filter_clutter"] and extracted.get("content"):
	            _apply_content_filtering(extracted, docname)
	    except Exception as exc:  # noqa: BLE001
	        logger.warning(f"Error extracting main content from {docname}: {exc}")
	        return {"content": "", "format": "text"}

	    return extracted


	def _extract_with_fallbacks(doctree: nodes.document, env: BuildEnvironment, docname: str) -> dict[str, Any]:
	    """Extract document content, trying progressively simpler strategies.

	    Order of attempts: clean text (``format`` "text"), raw markdown
	    (``format`` "markdown"), and finally basic text extraction
	    (``format`` "text"). The first strategy that yields a truthy result wins;
	    the last one is returned unconditionally.
	    """
	    # Preferred: cleaned text (env is passed for link title resolution).
	    if cleaned := extract_clean_text_content(doctree, env):
	        logger.debug(f"Extracted clean text content for {docname}: {len(cleaned)} chars")
	        return {"content": cleaned, "format": "text"}

	    # Second choice: raw markdown.
	    if markdown := extract_raw_markdown(env, docname):
	        logger.debug(f"Fallback to raw markdown for {docname}: {len(markdown)} chars")
	        return {"content": markdown, "format": "markdown"}

	    # Last resort: basic text, returned even if empty.
	    logger.debug(f"Fallback to basic text extraction for {docname}")
	    basic_text = extract_text_content(doctree)
	    return {"content": basic_text, "format": "text"}


	def _apply_content_filtering(content: dict[str, Any], docname: str) -> None:
	    """Replace ``content["content"]`` with its ``clean_text_for_llm`` result.

	    The dictionary is modified in place. A debug message is logged only when
	    the length of the text changed.
	    """
	    size_before = len(content["content"])
	    content["content"] = clean_text_for_llm(content["content"])
	    size_after = len(content["content"])

	    if size_before == size_after:
	        return
	    logger.debug(f"Content filtering for {docname}: {size_before} -> {size_after} chars")


	def _extract_additional_features(
	    content: dict[str, Any],
	    doctree: nodes.document,
	    docname: str,
	    settings: dict[str, bool],
	    env: BuildEnvironment | None = None,
	) -> None:
	    """Add headings, summary, code blocks, links, images and keywords to ``content``.

	    ``content`` is modified in place. Behaviour depends on ``settings``:

	    * ``lazy_extraction``: every additional feature is set to its empty value
	      and nothing is extracted.
	    * ``skip_complex``: basic features and keywords are extracted, while code
	      blocks, links and images are set to empty lists.
	    * otherwise: all features are extracted.

	    Args:
	        content: Content dictionary to populate.
	        doctree: Parsed document tree to extract from.
	        docname: Document name, used in log messages and link extraction.
	        settings: Flags as produced by ``_get_extraction_settings``.
	        env: Optional build environment, forwarded to the complex-feature
	            extraction.
	    """
	    if settings["lazy_extraction"]:
	        _set_empty_additional_features(content)
	        return

	    # Basic features: headings and summary.
	    _extract_basic_features(content, doctree, docname)

	    # Complex features: code blocks, links, images (unless skipped).
	    if settings["skip_complex"]:
	        _set_empty_complex_features(content)
	    else:
	        _extract_complex_features(content, doctree, docname, env)

	    # Lazy mode has already returned above, so keywords are always extracted here.
	    _extract_keywords_feature(content, docname)


	def _extract_basic_features(content: dict[str, Any], doctree: nodes.document, docname: str) -> None:
	    """Store the document's headings and summary in ``content``.

	    Each feature is extracted independently. If an extractor raises, a
	    warning is logged and that feature falls back to its empty value
	    (``[]`` for headings, ``""`` for summary).
	    """
	    # feature name -> (extractor, value used when extraction fails)
	    extractors = {
	        "headings": (extract_headings, []),
	        "summary": (extract_summary, ""),
	    }

	    for name, (extractor, fallback) in extractors.items():
	        try:
	            value = extractor(doctree)
	            content[name] = value
	            if name == "headings":
	                logger.debug(f"Extracted {len(value)} headings from {docname}")
	        except Exception as exc:  # noqa: BLE001, PERF203
	            logger.warning(f"Error extracting {name} from {docname}: {exc}")
	            content[name] = fallback


	def _extract_complex_features(
	    content: dict[str, Any],
	    doctree: nodes.document,
	    docname: str,
	    env: BuildEnvironment | None = None,
	) -> None:
	    """Store code blocks, images and links in ``content``.

	    ``content`` is modified in place. Each extractor runs independently: if
	    one raises, a warning is logged and that feature is set to an empty list,
	    so the remaining features are still attempted.

	    Args:
	        content: Content dictionary to populate.
	        doctree: Parsed document tree to extract from.
	        docname: Document name, used in log messages and passed to the link
	            extractor.
	        env: Optional build environment, passed to the link extractor.
	    """
	    # Code blocks and images don't need env
	    simple_features = [
	        ("code_blocks", extract_code_blocks),
	        ("images", extract_images),
	    ]

	    for feature_name, extract_func in simple_features:
	        try:
	            result = extract_func(doctree)
	            content[feature_name] = result
	            logger.debug(f"Extracted {len(result)} {feature_name} from {docname}")
	        except Exception as e:  # noqa: BLE001, PERF203
	            logger.warning(f"Error extracting {feature_name} from {docname}: {e}")
	            content[feature_name] = []

	    # Links need env for title resolution
	    try:
	        content["links"] = extract_links(doctree, env, docname)
	        logger.debug(f"Extracted {len(content['links'])} links from {docname}")
	    except Exception as e:  # noqa: BLE001
	        logger.warning(f"Error extracting links from {docname}: {e}")
	        content["links"] = []


	def _extract_keywords_feature(content: dict[str, Any], docname: str) -> None:
	    """Derive keywords from the already-extracted text and headings.

	    Reads ``content["content"]`` and ``content["headings"]`` (defaulting to
	    ``""`` and ``[]`` when absent) and stores the result under
	    ``content["keywords"]``. On failure a warning is logged and an empty list
	    is stored instead.
	    """
	    try:
	        body_text = content.get("content", "")
	        heading_list = content.get("headings", [])
	        content["keywords"] = extract_keywords(body_text, heading_list)
	        logger.debug(f"Extracted {len(content['keywords'])} keywords from {docname}")
	    except Exception as exc:  # noqa: BLE001
	        logger.warning(f"Error extracting keywords from {docname}: {exc}")
	        content["keywords"] = []


	def _set_empty_additional_features(content: dict[str, Any]) -> None:
	    """Reset every additional feature in ``content`` to its empty value.

	    Used for lazy extraction: ``summary`` becomes ``""`` and all other
	    features become fresh empty lists. ``content`` is modified in place.
	    """
	    content.update(
	        headings=[],
	        summary="",
	        code_blocks=[],
	        links=[],
	        images=[],
	        keywords=[],
	    )


	def _set_empty_complex_features(content: dict[str, Any]) -> None:
	    """Reset only the complex features (code blocks, links, images) to empty lists.

	    ``content`` is modified in place; every key receives its own new list.
	    """
	    content.update({name: [] for name in ("code_blocks", "links", "images")})


	def _get_empty_content_dict() -> dict[str, Any]:
	    """Build the placeholder content dictionary used when extraction fails.

	    Returns:
	        A new dictionary with empty text (``format`` "text"), an empty
	        summary, and empty lists for every list-valued feature.
	    """
	    empty: dict[str, Any] = {"content": "", "format": "text"}
	    for key in ("headings", "summary", "code_blocks", "links", "images", "keywords"):
	        empty[key] = "" if key == "summary" else []
	    return empty