# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
# SPDX-License-Identifier: Apache-2.0
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Text content extraction functions."""
import re
from typing import Any
from docutils import nodes
from sphinx.environment import BuildEnvironment
from sphinx.util import logging
logger = logging.getLogger(__name__)
# Constants
MIN_SUBSTANTIAL_CONTENT_LENGTH = 50
MAX_SUMMARY_LENGTH = 300
MIN_KEYWORD_LENGTH = 3
MAX_KEYWORDS_RETURNED = 50


def extract_raw_markdown(env: BuildEnvironment, docname: str) -> str | None:
    """Extract raw markdown from the source file."""
    try:
        source_path = env.doc2path(docname)
        if not source_path or not source_path.exists():
            return None
        with open(source_path, encoding="utf-8") as f:
            content = f.read()
        # Strip YAML frontmatter if present
        if content.startswith("---"):
            end_marker = content.find("\n---\n", 3)
            if end_marker != -1:
                content = content[end_marker + 5 :]  # Skip past the closing "---\n"
        return content.strip()
    except Exception as e:  # noqa: BLE001
        logger.debug(f"Could not extract raw markdown from {docname}: {e}")
        return None
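
# Illustrative example (hypothetical file contents): a source file beginning with
#
#     ---
#     title: Overview
#     ---
#     # Overview
#
# yields "# Overview" -- the frontmatter block is stripped and surrounding
# whitespace trimmed.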


def extract_text_content(doctree: nodes.document) -> str:
    """Extract plain text content from a document tree."""
    text_parts = [node.astext() for node in doctree.traverse(nodes.Text)]
    return " ".join(text_parts).strip()


def extract_clean_text_content(doctree: nodes.document, env: BuildEnvironment | None = None) -> str:
    """Extract clean text content, filtering out navigation elements.

    Args:
        doctree: The document tree to extract text from
        env: Optional Sphinx environment for resolving link titles

    Returns:
        Cleaned text content suitable for search/LLM consumption
    """
    text_parts = []
    # Track references we've already processed (to avoid duplicating their text)
    processed_refs = set()
    for node in doctree.traverse():
        # Skip node types that carry no content
        if isinstance(node, (nodes.target, nodes.substitution_definition)):
            continue
        # Skip toctree and other directive nodes
        if hasattr(node, "tagname") and node.tagname in ["toctree", "index", "meta"]:
            continue
        # Handle reference nodes specially: extract and potentially improve link text
        if isinstance(node, nodes.reference):
            ref_id = id(node)
            if ref_id not in processed_refs:
                processed_refs.add(ref_id)
                link_text = _get_improved_link_text(node, env)
                if link_text:
                    text_parts.append(link_text)
            continue
        # Extract text from text nodes (unless inside an already-processed reference)
        if isinstance(node, nodes.Text):
            parent = node.parent
            if isinstance(parent, nodes.reference) and id(parent) in processed_refs:
                continue  # Already handled by the reference branch above
            text = node.astext().strip()
            if text and not text.startswith("¶"):  # Skip permalink symbols
                text_parts.append(text)
    # Join and collapse whitespace
    full_text = " ".join(text_parts)
    full_text = re.sub(r"\s+", " ", full_text)
    return full_text.strip()
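
# Rough usage sketch (names are illustrative; assumes a handler connected to
# Sphinx's "doctree-resolved" event):
#
#     def on_doctree_resolved(app, doctree, docname):
#         clean = extract_clean_text_content(doctree, app.env)
#         # ...feed `clean` into a search index or LLM pipeline...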


def _get_improved_link_text(node: nodes.reference, env: BuildEnvironment | None) -> str:
    """Get improved link text, resolving filenames to titles where possible."""
    text = node.astext().strip()
    if not text:
        return ""
    # If the text doesn't look like a filename, use it as-is
    if not _text_looks_like_filename(text):
        return text
    # Try to resolve to a better title: prefer refdoc (target document for
    # cross-references), fall back to reftarget
    attrs = getattr(node, "attributes", {})
    target_doc = attrs.get("refdoc", "") or attrs.get("reftarget", "")
    # Strip any file extension from the target
    target_doc = target_doc.replace(".html", "").replace(".md", "").replace(".rst", "")
    # Look up the title in env.titles
    if target_doc and env and hasattr(env, "titles") and target_doc in env.titles:
        title_node = env.titles[target_doc]
        if title_node:
            resolved_title = title_node.astext().strip()
            if resolved_title:
                return resolved_title
    # Fallback: humanize the filename
    return _humanize_link_text(text)
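
# Illustrative example: a reference whose visible text is "setup.md" and whose
# "refdoc"/"reftarget" attribute points at a document with an entry in
# env.titles (say "Setup Guide") returns that title; without a title entry it
# falls back to the humanized filename, "Setup".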


def _text_looks_like_filename(text: str) -> bool:
    """Check if text looks like a filename rather than readable prose."""
    if not text:
        return False
    # Contains path separators
    if "/" in text or "\\" in text:
        return True
    # Ends with a known file extension
    if re.search(r"\.(md|rst|html|txt)$", text, re.IGNORECASE):
        return True
    # A single all-lowercase word (like "index", "readme", "configuration") is
    # treated as a filename; note this can misclassify ordinary lowercase words
    # that appear alone as link text.
    if " " not in text and text == text.lower() and len(text) > 2:
        return True
    return False
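
# Expected classifications (illustrative):
#     "docs/setup.md"   -> True   (path separator)
#     "README.rst"      -> True   (known extension)
#     "configuration"   -> True   (single lowercase word)
#     "Getting Started" -> False  (multi-word prose)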


def _humanize_link_text(text: str) -> str:
    """Convert filename-like text to a human-readable form."""
    # Keep just the filename part
    if "/" in text:
        text = text.rsplit("/", 1)[-1]
    # Remove a known extension
    for ext in (".md", ".rst", ".html", ".txt"):
        if text.endswith(ext):
            text = text[: -len(ext)]
            break
    # Replace separators with spaces and title-case the result
    text = text.replace("-", " ").replace("_", " ")
    return text.title()
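
# Illustrative examples:
#     "docs/getting-started.md" -> "Getting Started"
#     "api_reference.rst"       -> "Api Reference"  (str.title() is naive about acronyms)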


def clean_text_for_llm(text: str) -> str:
    """Clean text content to make it more suitable for LLM processing and search indexing."""
    if not text:
        return ""
    # Remove SVG content (common in documentation)
    text = re.sub(r"<svg[^>]*>.*?</svg>", "", text, flags=re.DOTALL | re.IGNORECASE)
    # Remove HTML comments
    text = re.sub(r"<!--.*?-->", "", text, flags=re.DOTALL)
    # Remove empty directive blocks (common MyST artifacts)
    text = re.sub(r"^\s*```\{[^}]+\}\s*```\s*$", "", text, flags=re.MULTILINE)
    # Remove toctree artifacts
    text = re.sub(r"^\s*:caption:.*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*:hidden:\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*:glob:\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*:maxdepth:\s*\d+\s*$", "", text, flags=re.MULTILINE)
    # Remove MyST directive markers that aren't useful for search
    text = re.sub(r"^\s*:::\{[^}]+\}\s*$", "", text, flags=re.MULTILINE)
    text = re.sub(r"^\s*:::\s*$", "", text, flags=re.MULTILINE)
    # Drop code-block language indicators
    text = re.sub(r"```(\w+)\s*\n", "```\n", text)
    # Collapse excessive whitespace while preserving paragraph breaks
    text = re.sub(r"\n\s*\n\s*\n+", "\n\n", text)  # Multiple blank lines -> one
    text = re.sub(r"[ \t]+", " ", text)  # Runs of spaces/tabs -> single space
    # Drop lines that are only punctuation or symbols
    lines = text.split("\n")
    cleaned_lines = []
    for line in lines:
        stripped = line.strip()
        # Keep the line only if it contains actual words or digits
        if stripped and re.search(r"[a-zA-Z0-9]", stripped):
            # Trim standalone punctuation at the start and end
            stripped = re.sub(r"^[^\w\s]+\s*", "", stripped)
            stripped = re.sub(r"\s*[^\w\s]+$", "", stripped)
            if stripped:
                cleaned_lines.append(stripped)
    text = "\n".join(cleaned_lines)
    # Final cleanup
    return text.strip()
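
# Illustrative before/after (hypothetical input): the MyST snippet
#
#     :::{note}
#     Use `pip` to install.
#     :::
#
# becomes "Use `pip` to install" -- the fence lines are removed and the
# trailing period is stripped as terminal punctuation.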


def extract_directive_content(directive_block: str) -> str:
    """Extract meaningful content from MyST directive blocks."""
    if not directive_block:
        return ""
    # Drop the directive syntax but keep the content
    content_lines = []
    in_content = False
    for line in directive_block.split("\n"):
        stripped = line.strip()
        # Skip fence lines: ":::{note}", "```{note}", and bare ":::" / "```" closers
        if stripped.startswith(":::") or stripped.startswith("```"):
            continue
        # Skip directive options (":caption: ...", ":hidden:") before content starts
        if stripped.startswith(":") and not in_content:
            continue
        if stripped:
            in_content = True
        content_lines.append(line)
    return "\n".join(content_lines).strip()


def extract_summary(doctree: nodes.document) -> str:
    """Extract a summary from the document (first substantial paragraph, with a fallback)."""
    # Try to find the first substantial paragraph
    for node in doctree.traverse(nodes.paragraph):
        text = node.astext().strip()
        if text and len(text) > MIN_SUBSTANTIAL_CONTENT_LENGTH:
            # Clean and truncate, leaving room for the ellipsis
            text = re.sub(r"\s+", " ", text)
            if len(text) > MAX_SUMMARY_LENGTH:
                text = text[: MAX_SUMMARY_LENGTH - 3] + "..."
            return text
    # Fallback: use the first MAX_SUMMARY_LENGTH characters of any text
    text = extract_text_content(doctree)
    if text:
        text = re.sub(r"\s+", " ", text)
        if len(text) > MAX_SUMMARY_LENGTH:
            text = text[: MAX_SUMMARY_LENGTH - 3] + "..."
        return text
    return ""


def extract_keywords(content: str, headings: list[dict[str, Any]]) -> list[str]:
    """Extract relevant keywords from content for search optimization."""
    if not content:
        return []
    keywords = set()
    # Add heading text as keywords
    for heading in headings:
        if "text" in heading:
            # Split the heading into words and keep the significant ones
            words = re.findall(r"\b[a-zA-Z]{3,}\b", heading["text"].lower())
            keywords.update(words)
    # Extract technical terms (API, class, and function names often mix case)
    tech_terms = re.findall(r"\b[A-Z][a-zA-Z0-9_]*[a-z][a-zA-Z0-9_]*\b", content)
    keywords.update(term.lower() for term in tech_terms)
    # Extract quoted terms (often important concepts)
    quoted_terms = re.findall(r'["`]([^"`]{3,20})["`]', content)
    for term in quoted_terms:
        if re.match(r"^[a-zA-Z][a-zA-Z0-9_\-\s]*$", term):
            keywords.add(term.lower().strip())
    # Configuration keys ("key: value" or "key = value" patterns)
    config_keys = re.findall(r"\b[a-z_]+[a-z0-9_]*\s*[:=]", content)
    keywords.update(key.rstrip(":=").strip() for key in config_keys)
    # File extensions
    extensions = re.findall(r"\.[a-z]{2,4}\b", content.lower())
    keywords.update(ext.lstrip(".") for ext in extensions)
    # Remove common stop words and very short terms
    stop_words = {
        "the", "and", "for", "are", "but", "not", "you", "all", "can", "had",
        "her", "was", "one", "our", "out", "day", "get", "has", "him", "his",
        "how", "its", "may", "new", "now", "old", "see", "two", "who", "boy",
        "did", "she", "use", "way", "what", "when", "will",
    }
    keywords = {kw for kw in keywords if len(kw) >= MIN_KEYWORD_LENGTH and kw not in stop_words}
    # Return a sorted list, capped at a reasonable size
    return sorted(keywords)[:MAX_KEYWORDS_RETURNED]
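
# Rough usage sketch (hypothetical content):
#
#     headings = [{"text": "Installation Guide"}]
#     extract_keywords("Run `pip install` and set log_level: debug.", headings)
#
# would yield keywords such as "installation", "guide", "pip install", and
# "log_level", sorted alphabetically and capped at MAX_KEYWORDS_RETURNED.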