# SmokeScan — rag/index_builder.py
# Last commit 0699c5f (KinetoLabs): "Reduce thinking model max_new_tokens to fix slow inference"
"""Index builder for FDAM RAG knowledge base.
Processes markdown documents from RAG-KB/ and indexes them in ChromaDB.
Usage:
python -m rag.index_builder [--rebuild]
"""
import argparse
from pathlib import Path
from rag.chunker import SemanticChunker
from rag.vectorstore import ChromaVectorStore
# Document configuration: filename -> (category, priority)
# Only files listed here are indexed; everything else in RAG-KB/ is skipped.
# `category` groups chunks for filtered retrieval; `priority` ranks how
# authoritative a source is when results are merged.
DOCUMENT_CONFIG = {
# PRIMARY - FDAM Methodology (authoritative source)
"FDAM_v4_METHODOLOGY.md": ("methodology", "primary"),
# REFERENCE - Threshold Tables (critical for metals clearance)
"Metals clearance criteria-QVC.md": ("thresholds", "reference-threshold"),
# REFERENCE - Narrative (supporting documentation)
"air-o-cell-method-guide-atlas.md": ("lab-methods", "reference-narrative"),
"Industrial Hygiene Lab Services Guide.md": ("lab-methods", "reference-narrative"),
"Fire Remediation Processes and Methodologies_ A Review of Industry-Endorsed Standards.md": (
"cleaning-procedures",
"reference-narrative",
),
"Technical Guide for Wildfire Restoration - Key Information.md": (
"wildfire",
"reference-narrative",
),
"wildfire_soot_particulate_removal_full_text_extraction.md": (
"wildfire",
"reference-narrative",
),
}
# Files to skip (per user decision)
# Note: this PDF is also not convertible/indexed, so it is excluded from the
# "PDF needs conversion" report in build_index().
SKIP_FILES = {
"Lead Contamination in Indoor Firing_Gun Ranges _ Atlantic Environmental.pdf",
}
def get_rag_kb_path() -> Path:
    """Locate the RAG-KB documents directory.

    Looks next to this package first, then in the current working
    directory.

    Returns:
        Path to the first existing RAG-KB directory.

    Raises:
        FileNotFoundError: If neither candidate location exists.
    """
    candidates = (
        Path(__file__).parent.parent / "RAG-KB",  # sibling of this package
        Path("RAG-KB"),                           # relative to CWD
    )
    for candidate in candidates:
        if candidate.exists():
            return candidate
    raise FileNotFoundError("Could not find RAG-KB directory")
def get_chroma_path() -> Path:
    """Return the ChromaDB persistence directory (sibling of this package).

    The directory is not created here; ChromaVectorStore handles that.
    """
    return Path(__file__).parent.parent / "chroma_db"
def build_index(rebuild: bool = False) -> dict:
    """Build the RAG index from RAG-KB documents.

    Reads each configured markdown file from RAG-KB/, chunks it with
    SemanticChunker, and stores the chunks in ChromaDB. Re-indexing a
    file replaces its previously stored chunks (incremental update).

    Args:
        rebuild: If True, clear existing index before building.

    Returns:
        Statistics about the indexing operation with keys
        ``documents_processed``, ``documents_skipped``, ``chunks_created``,
        and ``errors`` (list of error message strings).
    """
    rag_kb_path = get_rag_kb_path()
    chroma_path = get_chroma_path()
    print(f"RAG-KB path: {rag_kb_path}")
    print(f"ChromaDB path: {chroma_path}")
    # Initialize components
    chunker = SemanticChunker()
    vectorstore = ChromaVectorStore(persist_directory=str(chroma_path))
    if rebuild:
        print("Rebuilding index - clearing existing data...")
        vectorstore.clear()
    stats = {
        "documents_processed": 0,
        "documents_skipped": 0,
        "chunks_created": 0,
        "errors": [],
    }
    # Process markdown files
    for md_file in rag_kb_path.glob("*.md"):
        filename = md_file.name
        # Skip files not in config or in skip list
        if filename in SKIP_FILES:
            # BUGFIX: message previously printed the literal "(unknown)"
            # instead of interpolating the file name.
            print(f"Skipping (excluded): {filename}")
            stats["documents_skipped"] += 1
            continue
        if filename not in DOCUMENT_CONFIG:
            print(f"Skipping (not configured): {filename}")
            stats["documents_skipped"] += 1
            continue
        category, priority = DOCUMENT_CONFIG[filename]
        print(f"Processing: {filename} ({category}, {priority})")
        try:
            # Read and chunk document
            text = md_file.read_text(encoding="utf-8")
            chunks = chunker.chunk_document(
                text=text,
                source=filename,
                category=category,
                priority=priority,
            )
            # Check if source already indexed (for incremental updates)
            existing_count = vectorstore.delete_by_source(filename)
            if existing_count > 0:
                print(f"  Replaced {existing_count} existing chunks")
            # Add to vectorstore
            added = vectorstore.add_chunks(chunks)
            print(f"  Added {added} chunks")
            stats["documents_processed"] += 1
            stats["chunks_created"] += added
        except Exception as e:
            # Best-effort: record the failure and keep indexing the rest.
            error_msg = f"Error processing {filename}: {e}"
            print(f"  ERROR: {e}")
            stats["errors"].append(error_msg)
    # Report on PDFs that need conversion
    for pdf_file in rag_kb_path.glob("*.pdf"):
        if pdf_file.name not in SKIP_FILES:
            print(f"Note: PDF needs conversion to .md: {pdf_file.name}")
    # Print summary
    print("\n" + "=" * 50)
    print("Index Build Complete")
    print("=" * 50)
    print(f"Documents processed: {stats['documents_processed']}")
    print(f"Documents skipped: {stats['documents_skipped']}")
    print(f"Total chunks created: {stats['chunks_created']}")
    if stats["errors"]:
        print(f"Errors: {len(stats['errors'])}")
        for err in stats["errors"]:
            print(f"  - {err}")
    # Print collection stats
    collection_stats = vectorstore.get_stats()
    print("\nCollection stats:")
    print(f"  Total chunks in DB: {collection_stats['total_chunks']}")
    print(f"  Categories: {collection_stats['categories']}")
    print(f"  Priorities: {collection_stats['priorities']}")
    return stats
def main():
    """CLI entry point: parse arguments and run the index build."""
    arg_parser = argparse.ArgumentParser(
        description="Build FDAM RAG knowledge base index"
    )
    arg_parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Clear existing index and rebuild from scratch",
    )
    cli_args = arg_parser.parse_args()
    build_index(rebuild=cli_args.rebuild)


if __name__ == "__main__":
    main()