# SmokeScan — rag/index_builder.py
# Last commit 0699c5f (KinetoLabs): "Reduce thinking model max_new_tokens to fix slow inference"
"""Index builder for FDAM RAG knowledge base.
Processes markdown documents from RAG-KB/ and indexes them in ChromaDB.
Usage:
python -m rag.index_builder [--rebuild]
"""
import argparse
from pathlib import Path
from rag.chunker import SemanticChunker
from rag.vectorstore import ChromaVectorStore
# Document configuration: filename -> (category, priority)
# Only files listed here are indexed; everything else in RAG-KB/ is skipped.
# `category` groups chunks for filtered retrieval; `priority` ranks how
# authoritative a source is when results are merged.
DOCUMENT_CONFIG = {
# PRIMARY - FDAM Methodology (authoritative source)
"FDAM_v4_METHODOLOGY.md": ("methodology", "primary"),
# REFERENCE - Threshold Tables (critical for metals clearance)
"Metals clearance criteria-QVC.md": ("thresholds", "reference-threshold"),
# REFERENCE - Narrative (supporting documentation)
"air-o-cell-method-guide-atlas.md": ("lab-methods", "reference-narrative"),
"Industrial Hygiene Lab Services Guide.md": ("lab-methods", "reference-narrative"),
"Fire Remediation Processes and Methodologies_ A Review of Industry-Endorsed Standards.md": (
"cleaning-procedures",
"reference-narrative",
),
"Technical Guide for Wildfire Restoration - Key Information.md": (
"wildfire",
"reference-narrative",
),
"wildfire_soot_particulate_removal_full_text_extraction.md": (
"wildfire",
"reference-narrative",
),
}
# Files to skip (per user decision)
# Note: this PDF is also not convertible/indexed, so it is excluded from the
# "PDF needs conversion" report in build_index().
SKIP_FILES = {
"Lead Contamination in Indoor Firing_Gun Ranges _ Atlantic Environmental.pdf",
}
def get_rag_kb_path() -> Path:
    """Locate the RAG-KB documents directory.

    Looks next to this package first, then in the current working
    directory.

    Returns:
        Path to the first existing RAG-KB directory.

    Raises:
        FileNotFoundError: If neither candidate location exists.
    """
    candidates = (
        Path(__file__).parent.parent / "RAG-KB",  # sibling of this package
        Path("RAG-KB"),                           # relative to CWD
    )
    for candidate in candidates:
        if candidate.exists():
            return candidate
    raise FileNotFoundError("Could not find RAG-KB directory")
def get_chroma_path() -> Path:
    """Return the ChromaDB persistence directory (sibling of this package).

    The directory is not created here; ChromaVectorStore handles that.
    """
    return Path(__file__).parent.parent / "chroma_db"
def build_index(rebuild: bool = False) -> dict:
    """Build the RAG index from RAG-KB documents.

    Reads each configured markdown file from RAG-KB/, chunks it with
    SemanticChunker, and stores the chunks in ChromaDB. Re-indexing a
    file replaces its previously stored chunks (incremental update).

    Args:
        rebuild: If True, clear existing index before building.

    Returns:
        Statistics about the indexing operation with keys
        ``documents_processed``, ``documents_skipped``, ``chunks_created``,
        and ``errors`` (list of error message strings).
    """
    rag_kb_path = get_rag_kb_path()
    chroma_path = get_chroma_path()
    print(f"RAG-KB path: {rag_kb_path}")
    print(f"ChromaDB path: {chroma_path}")
    # Initialize components
    chunker = SemanticChunker()
    vectorstore = ChromaVectorStore(persist_directory=str(chroma_path))
    if rebuild:
        print("Rebuilding index - clearing existing data...")
        vectorstore.clear()
    stats = {
        "documents_processed": 0,
        "documents_skipped": 0,
        "chunks_created": 0,
        "errors": [],
    }
    # Process markdown files
    for md_file in rag_kb_path.glob("*.md"):
        filename = md_file.name
        # Skip files not in config or in skip list
        if filename in SKIP_FILES:
            # BUGFIX: message previously printed the literal "(unknown)"
            # instead of interpolating the file name.
            print(f"Skipping (excluded): {filename}")
            stats["documents_skipped"] += 1
            continue
        if filename not in DOCUMENT_CONFIG:
            print(f"Skipping (not configured): {filename}")
            stats["documents_skipped"] += 1
            continue
        category, priority = DOCUMENT_CONFIG[filename]
        print(f"Processing: {filename} ({category}, {priority})")
        try:
            # Read and chunk document
            text = md_file.read_text(encoding="utf-8")
            chunks = chunker.chunk_document(
                text=text,
                source=filename,
                category=category,
                priority=priority,
            )
            # Check if source already indexed (for incremental updates)
            existing_count = vectorstore.delete_by_source(filename)
            if existing_count > 0:
                print(f"  Replaced {existing_count} existing chunks")
            # Add to vectorstore
            added = vectorstore.add_chunks(chunks)
            print(f"  Added {added} chunks")
            stats["documents_processed"] += 1
            stats["chunks_created"] += added
        except Exception as e:
            # Best-effort: record the failure and keep indexing the rest.
            error_msg = f"Error processing {filename}: {e}"
            print(f"  ERROR: {e}")
            stats["errors"].append(error_msg)
    # Report on PDFs that need conversion
    for pdf_file in rag_kb_path.glob("*.pdf"):
        if pdf_file.name not in SKIP_FILES:
            print(f"Note: PDF needs conversion to .md: {pdf_file.name}")
    # Print summary
    print("\n" + "=" * 50)
    print("Index Build Complete")
    print("=" * 50)
    print(f"Documents processed: {stats['documents_processed']}")
    print(f"Documents skipped: {stats['documents_skipped']}")
    print(f"Total chunks created: {stats['chunks_created']}")
    if stats["errors"]:
        print(f"Errors: {len(stats['errors'])}")
        for err in stats["errors"]:
            print(f"  - {err}")
    # Print collection stats
    collection_stats = vectorstore.get_stats()
    print("\nCollection stats:")
    print(f"  Total chunks in DB: {collection_stats['total_chunks']}")
    print(f"  Categories: {collection_stats['categories']}")
    print(f"  Priorities: {collection_stats['priorities']}")
    return stats
def main():
    """CLI entry point: parse arguments and run the index build."""
    arg_parser = argparse.ArgumentParser(
        description="Build FDAM RAG knowledge base index"
    )
    arg_parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Clear existing index and rebuild from scratch",
    )
    cli_args = arg_parser.parse_args()
    build_index(rebuild=cli_args.rebuild)


if __name__ == "__main__":
    main()