Spaces:

Nitishkumar-ai
/

INIclaw

Configuration error

INIclaw / docs /_ext /json_output /config.py

Nitish kumar

Upload folder using huggingface_hub

0722e92 verified 24 days ago

9.61 kB

	# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
	# SPDX-License-Identifier: Apache-2.0
	#
	# Licensed under the Apache License, Version 2.0 (the "License");
	# you may not use this file except in compliance with the License.
	# You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.

	"""Configuration management for JSON output extension."""

	from typing import Any

	from sphinx.application import Sphinx
	from sphinx.config import Config
	from sphinx.util import logging

	# Module-level logger, obtained through the `logging` name imported from sphinx.util.
	logger = logging.getLogger(__name__)

	# Constants
	# Largest integer accepted for the "parallel_workers" setting during validation.
	MAX_PARALLEL_WORKERS = 32


	def get_default_settings() -> dict[str, Any]:
	    """Build a fresh dictionary of default json_output extension settings.

	    The options are assembled group by group and merged in declaration
	    order. Every call creates new container objects, so the returned
	    dictionary (and its nested list/dict values) can be mutated freely.
	    """
	    core = {
	        "enabled": True,
	        "exclude_patterns": ["_build", "_templates", "_static"],
	        "verbose": True,  # On by default for better user feedback
	        "parallel": True,  # Parallel processing on by default for speed
	        "include_children": True,
	        "include_child_content": True,
	        "main_index_mode": "full",  # One of 'disabled', 'metadata_only', 'full'
	        "max_main_index_docs": 0,  # 0 = no limit, for comprehensive search
	    }
	    # Search optimization features
	    search_features = {
	        "extract_code_blocks": True,  # Code blocks go into the search data
	        "extract_links": True,  # Internal/external links
	        "extract_images": True,  # Image references
	        "extract_keywords": True,  # Auto-extracted technical keywords
	        "include_doc_type": True,  # Auto-detected document types
	        "include_section_path": True,  # Hierarchical section paths
	    }
	    # Link extraction options
	    link_options = {
	        "link_normalization": True,  # Normalize internal URLs to absolute paths
	        "link_include_ref_type": True,  # Keep ref_type metadata (ref, doc, etc.)
	        "link_include_target_doc": True,  # Keep target_doc for cross-references
	        "link_resolve_titles": True,  # Resolve filename-like link text to document titles
	    }
	    # Performance controls
	    limits = {
	        "content_max_length": 50000,  # Per-document content cap (0 = no limit)
	        "summary_max_length": 500,  # Summary length cap
	        "keywords_max_count": 50,  # Keywords per document cap
	    }
	    # Output format options
	    output_format = {
	        "minify_json": True,  # Minified JSON by default for better performance
	        "separate_content": False,  # Content in separate .content.json files
	    }
	    # Speed optimizations
	    speed = {
	        "parallel_workers": "auto",  # Number of parallel workers
	        "batch_size": 50,  # Documents are processed in batches
	        "cache_aggressive": True,  # Aggressive caching
	        "lazy_extraction": False,  # Skip feature extraction (keywords, links, etc.) for faster builds
	        "skip_large_files": 100000,  # Skip files larger than N bytes
	        "incremental_build": True,  # Only process changed files
	        "memory_limit_mb": 512,  # Memory limit per worker
	        "fast_text_extraction": True,  # Faster text extraction
	        "skip_complex_parsing": False,  # Skip complex parsing features
	    }
	    # Content filtering
	    filtering = {
	        "filter_search_clutter": True,  # Drop SVG, toctree, and other non-searchable content
	    }
	    # Global metadata from conf.py
	    metadata = {
	        "global_metadata": {},  # User-defined global fields (book, product, site)
	        "infer_global_metadata": True,  # Auto-infer from Sphinx config (project, release)
	    }

	    return {
	        **core,
	        **search_features,
	        **link_options,
	        **limits,
	        **output_format,
	        **speed,
	        **filtering,
	        **metadata,
	    }


	def apply_config_defaults(settings: dict[str, Any]) -> dict[str, Any]:
	    """Fill every option missing from ``settings`` with its default value.

	    The dictionary is updated in place: keys that are already present keep
	    their current value, absent keys receive the value from
	    ``get_default_settings()``. The same dictionary object is returned.
	    """
	    for option, fallback in get_default_settings().items():
	        settings.setdefault(option, fallback)
	    return settings


	def validate_config(_app: Sphinx, config: Config) -> None:
	    """Normalize and validate ``config.json_output_settings``.

	    The settings are coerced to a dictionary, completed with defaults,
	    stored back on ``config`` and then passed through each validator in
	    turn. ``_app`` is not used by this function.
	    """
	    resolved = apply_config_defaults(_ensure_settings_dict(config))
	    config.json_output_settings = resolved

	    # Validators run in this fixed order; each one receives the same dict.
	    validators = (
	        _validate_core_settings,
	        _validate_content_limits,
	        _validate_boolean_settings,
	        _validate_integer_settings,
	        _validate_parallel_workers,
	        _validate_global_metadata,
	    )
	    for validator in validators:
	        validator(resolved)


	def _ensure_settings_dict(config: Config) -> dict[str, Any]:
	    """Return ``config.json_output_settings`` as a dictionary.

	    An existing dictionary is returned unchanged. When the attribute is
	    missing, an empty dictionary is returned without being stored on
	    ``config``. Any other value is reported with a warning and replaced on
	    ``config`` by a new empty dictionary, which is then returned.
	    """
	    current = getattr(config, "json_output_settings", {})
	    if isinstance(current, dict):
	        return current

	    logger.warning("json_output_settings must be a dictionary. Using defaults.")
	    replacement = {}
	    config.json_output_settings = replacement
	    return replacement


	def _validate_core_settings(settings: dict[str, Any]) -> None:
	    """Check ``main_index_mode`` and ``exclude_patterns`` in place.

	    An unrecognized ``main_index_mode`` is reset to ``"full"``; an
	    ``exclude_patterns`` value that is not a list is reset to the standard
	    three directories. Each reset is reported with a warning. Keys that
	    are absent are treated as valid and are not added.
	    """
	    # Main index mode must be one of the known choices.
	    allowed_modes = ["disabled", "metadata_only", "full"]
	    current_mode = settings.get("main_index_mode", "full")
	    if current_mode not in allowed_modes:
	        logger.warning(
	            f"Invalid main_index_mode '{current_mode}'. Using 'full'. Valid options: {allowed_modes}"
	        )
	        settings["main_index_mode"] = "full"

	    # Exclude patterns must be a list.
	    if isinstance(settings.get("exclude_patterns", []), list):
	        return
	    logger.warning("exclude_patterns must be a list. Using default.")
	    settings["exclude_patterns"] = ["_build", "_templates", "_static"]


	def _validate_content_limits(settings: dict[str, Any]) -> None:
	    """Validate the content-related limit settings in place.

	    ``max_main_index_docs``, ``content_max_length``, ``summary_max_length``
	    and ``keywords_max_count`` must each be a non-negative integer. An
	    invalid value is reported with a warning and replaced by the default
	    listed below. A key that is absent is treated as valid and is not
	    added to ``settings``.

	    Booleans are rejected even though ``bool`` is a subclass of ``int``;
	    otherwise ``True``/``False`` would silently pass as the limits 1/0.
	    """
	    # setting name -> (default value, text shown in the warning)
	    limit_settings = {
	        "max_main_index_docs": (0, "0 (no limit)"),
	        "content_max_length": (50000, "50000 (0 = no limit)"),
	        "summary_max_length": (500, "500"),
	        "keywords_max_count": (50, "50"),
	    }

	    for setting, (default_val, description) in limit_settings.items():
	        value = settings.get(setting, default_val)
	        # isinstance(True, int) is True, so bool has to be excluded explicitly.
	        is_valid = isinstance(value, int) and not isinstance(value, bool) and value >= 0
	        if not is_valid:
	            logger.warning(f"Invalid {setting} '{value}'. Using {description}.")
	            settings[setting] = default_val


	def _validate_boolean_settings(settings: dict[str, Any]) -> None:
	    """Check that every on/off option present in ``settings`` is a ``bool``.

	    A listed option whose value is not a boolean is reported with a warning
	    and reset in place to the value from ``get_default_settings()``.
	    Options that are absent from ``settings`` are left alone.
	    """
	    flag_names = (
	        "enabled",
	        "verbose",
	        "parallel",
	        "include_children",
	        "include_child_content",
	        "extract_code_blocks",
	        "extract_links",
	        "extract_images",
	        "extract_keywords",
	        "include_doc_type",
	        "include_section_path",
	        "link_normalization",
	        "link_include_ref_type",
	        "link_include_target_doc",
	        "link_resolve_titles",
	        "minify_json",
	        "separate_content",
	        "cache_aggressive",
	        "lazy_extraction",
	        "incremental_build",
	        "fast_text_extraction",
	        "skip_complex_parsing",
	        "filter_search_clutter",
	        "infer_global_metadata",
	    )

	    fallback_values = get_default_settings()
	    for name in flag_names:
	        if name not in settings:
	            continue
	        if isinstance(settings[name], bool):
	            continue
	        logger.warning(f"Setting '{name}' must be boolean. Using default.")
	        settings[name] = fallback_values[name]

	def _validate_integer_settings(settings: dict[str, Any]) -> None:
	    """Validate the ranged integer settings in place.

	    ``batch_size``, ``skip_large_files`` and ``memory_limit_mb`` must each
	    be an integer inside the inclusive range listed below; an upper bound
	    of ``None`` means there is no maximum. A value that fails the check is
	    reported with a warning and replaced by the value from
	    ``get_default_settings()``. Settings that are absent are left alone.

	    Booleans are rejected even though ``bool`` is a subclass of ``int``;
	    otherwise ``True`` would be accepted as the integer 1.
	    """
	    # setting name -> (min, max); max of None means unbounded
	    int_settings = {
	        "batch_size": (1, 1000),  # min, max
	        "skip_large_files": (0, None),  # 0 = disabled
	        "memory_limit_mb": (64, 8192),  # reasonable memory limits
	    }

	    defaults = get_default_settings()
	    for setting, (min_val, max_val) in int_settings.items():
	        if setting not in settings:
	            continue

	        value = settings[setting]
	        # isinstance(True, int) is True, so bool has to be excluded explicitly.
	        is_int = isinstance(value, int) and not isinstance(value, bool)
	        # Compare the bound against None so an upper bound of 0 would still apply.
	        in_range = is_int and value >= min_val and (max_val is None or value <= max_val)
	        if not in_range:
	            upper_text = "unlimited" if max_val is None else max_val
	            logger.warning(
	                f"Setting '{setting}' must be integer between {min_val} and {upper_text}. Using default."
	            )
	            settings[setting] = defaults[setting]


	def _validate_parallel_workers(settings: dict[str, Any]) -> None:
	    """Validate the ``parallel_workers`` setting in place.

	    The value must be the string ``"auto"`` or an integer from 1 to
	    ``MAX_PARALLEL_WORKERS`` inclusive. Anything else is reported with a
	    warning and replaced by the value from ``get_default_settings()``.
	    Nothing happens when the key is absent.

	    Booleans are rejected even though ``bool`` is a subclass of ``int``;
	    otherwise ``True`` would be accepted as a worker count of 1.
	    """
	    if "parallel_workers" not in settings:
	        return

	    value = settings["parallel_workers"]
	    if value == "auto":
	        return

	    # isinstance(True, int) is True, so bool has to be excluded explicitly.
	    is_int = isinstance(value, int) and not isinstance(value, bool)
	    if is_int and 1 <= value <= MAX_PARALLEL_WORKERS:
	        return

	    logger.warning(
	        f"Setting 'parallel_workers' must be 'auto' or integer between 1 and {MAX_PARALLEL_WORKERS}. Using default."
	    )
	    defaults = get_default_settings()
	    settings["parallel_workers"] = defaults["parallel_workers"]


	def _validate_global_metadata(settings: dict[str, Any]) -> None:
	    """Check the shape of the ``global_metadata`` setting in place.

	    A ``global_metadata`` value that is not a dictionary is reported with a
	    warning and replaced by an empty dictionary. Otherwise each of the
	    known sections (``book``, ``product``, ``site``) that is present must
	    itself be a dictionary; a section that is not is reported with a
	    warning and deleted from ``global_metadata``. Other keys are ignored.
	    """
	    metadata = settings.get("global_metadata", {})

	    if isinstance(metadata, dict):
	        # Known top-level sections must map to dictionaries.
	        for name in ("book", "product", "site"):
	            if name not in metadata:
	                continue
	            if isinstance(metadata[name], dict):
	                continue
	            logger.warning(f"global_metadata.{name} must be a dictionary. Removing invalid value.")
	            del metadata[name]
	    else:
	        logger.warning("global_metadata must be a dictionary. Using empty default.")
	        settings["global_metadata"] = {}