"""Index builder for FDAM RAG knowledge base.

Processes markdown documents from RAG-KB/ and indexes them in ChromaDB.

Usage:
    python -m rag.index_builder [--rebuild]
"""

import argparse
from pathlib import Path

from rag.chunker import SemanticChunker
from rag.vectorstore import ChromaVectorStore

# Document configuration: filename -> (category, priority)
DOCUMENT_CONFIG = {
    # PRIMARY - FDAM Methodology (authoritative source)
    "FDAM_v4_METHODOLOGY.md": ("methodology", "primary"),
    # REFERENCE - Threshold Tables (critical for metals clearance)
    "Metals clearance criteria-QVC.md": ("thresholds", "reference-threshold"),
    # REFERENCE - Narrative (supporting documentation)
    "air-o-cell-method-guide-atlas.md": ("lab-methods", "reference-narrative"),
    "Industrial Hygiene Lab Services Guide.md": ("lab-methods", "reference-narrative"),
    "Fire Remediation Processes and Methodologies_ A Review of Industry-Endorsed Standards.md": (
        "cleaning-procedures",
        "reference-narrative",
    ),
    "Technical Guide for Wildfire Restoration - Key Information.md": (
        "wildfire",
        "reference-narrative",
    ),
    "wildfire_soot_particulate_removal_full_text_extraction.md": (
        "wildfire",
        "reference-narrative",
    ),
}

# Files to skip (per user decision)
SKIP_FILES = {
    "Lead Contamination in Indoor Firing_Gun Ranges _ Atlantic Environmental.pdf",
}


def get_rag_kb_path() -> Path:
    """Get path to RAG-KB directory.

    The directory is looked up first next to this package
    (``<parent of this file's directory>/RAG-KB``) and then relative to the
    current working directory.

    Returns:
        Path of the first candidate that is an existing directory.

    Raises:
        FileNotFoundError: If neither candidate is a directory.
    """
    candidates = (
        Path(__file__).parent.parent / "RAG-KB",  # relative to this file
        Path("RAG-KB"),  # relative to the current working directory
    )
    for candidate in candidates:
        # is_dir() rather than exists(): a stray regular file called
        # "RAG-KB" must not be mistaken for the knowledge-base directory.
        if candidate.is_dir():
            return candidate

    raise FileNotFoundError("Could not find RAG-KB directory")


def get_chroma_path() -> Path:
    """Get path to ChromaDB persistence directory.

    Returns:
        ``<parent of this file's directory>/chroma_db``. The path is only
        computed here; this function does not check or create it.
    """
    return Path(__file__).parent.parent / "chroma_db"


def build_index(rebuild: bool = False) -> dict:
    """Build the RAG index from RAG-KB documents.

    Every ``*.md`` file in the RAG-KB directory that is listed in
    ``DOCUMENT_CONFIG`` (and not in ``SKIP_FILES``) is read, chunked and
    stored in the vector store, replacing any chunks previously stored for
    the same source filename. Progress, a summary and collection statistics
    are printed to stdout.

    Args:
        rebuild: If True, clear existing index before building

    Returns:
        Statistics about the indexing operation, with keys
        ``documents_processed``, ``documents_skipped``, ``chunks_created``
        (ints) and ``errors`` (list of message strings).

    Raises:
        FileNotFoundError: If the RAG-KB directory cannot be located.
    """
    rag_kb_path = get_rag_kb_path()
    chroma_path = get_chroma_path()

    print(f"RAG-KB path: {rag_kb_path}")
    print(f"ChromaDB path: {chroma_path}")

    # Initialize components
    chunker = SemanticChunker()
    vectorstore = ChromaVectorStore(persist_directory=str(chroma_path))

    if rebuild:
        print("Rebuilding index - clearing existing data...")
        vectorstore.clear()

    stats = {
        "documents_processed": 0,
        "documents_skipped": 0,
        "chunks_created": 0,
        "errors": [],
    }

    # Process markdown files. glob() yields entries in an unspecified,
    # platform-dependent order; sorting keeps runs and logs reproducible.
    for md_file in sorted(rag_kb_path.glob("*.md")):
        filename = md_file.name

        # Skip files in the skip list or not in config
        if filename in SKIP_FILES:
            print(f"Skipping (excluded): {filename}")
            stats["documents_skipped"] += 1
            continue

        if filename not in DOCUMENT_CONFIG:
            print(f"Skipping (not configured): {filename}")
            stats["documents_skipped"] += 1
            continue

        category, priority = DOCUMENT_CONFIG[filename]
        print(f"Processing: {filename} ({category}, {priority})")

        # Best-effort per document: a failure is recorded in stats["errors"]
        # and the remaining documents are still processed.
        try:
            # Read and chunk document. This happens before the delete below,
            # so a read/chunk failure leaves previously stored chunks alone.
            text = md_file.read_text(encoding="utf-8")
            chunks = chunker.chunk_document(
                text=text,
                source=filename,
                category=category,
                priority=priority,
            )

            # Remove chunks already stored for this source (incremental updates)
            existing_count = vectorstore.delete_by_source(filename)
            if existing_count > 0:
                print(f"  Replaced {existing_count} existing chunks")

            # Add to vectorstore
            added = vectorstore.add_chunks(chunks)
            print(f"  Added {added} chunks")

            stats["documents_processed"] += 1
            stats["chunks_created"] += added

        except Exception as e:
            error_msg = f"Error processing {filename}: {e}"
            print(f"  ERROR: {e}")
            stats["errors"].append(error_msg)

    # Report configured documents that are absent from RAG-KB; otherwise a
    # renamed or missing source would silently drop out of the index.
    for configured_name in sorted(DOCUMENT_CONFIG):
        if configured_name in SKIP_FILES:
            continue
        if not (rag_kb_path / configured_name).is_file():
            print(f"Warning: configured document not found: {configured_name}")

    # Report on PDFs that need conversion
    for pdf_file in sorted(rag_kb_path.glob("*.pdf")):
        if pdf_file.name not in SKIP_FILES:
            print(f"Note: PDF needs conversion to .md: {pdf_file.name}")

    # Print summary
    print("\n" + "=" * 50)
    print("Index Build Complete")
    print("=" * 50)
    print(f"Documents processed: {stats['documents_processed']}")
    print(f"Documents skipped: {stats['documents_skipped']}")
    print(f"Total chunks created: {stats['chunks_created']}")

    if stats["errors"]:
        print(f"Errors: {len(stats['errors'])}")
        for err in stats["errors"]:
            print(f"  - {err}")

    # Print collection stats
    collection_stats = vectorstore.get_stats()
    print("\nCollection stats:")
    print(f"  Total chunks in DB: {collection_stats['total_chunks']}")
    print(f"  Categories: {collection_stats['categories']}")
    print(f"  Priorities: {collection_stats['priorities']}")

    return stats


def main():
    """CLI entry point.

    Parses the ``--rebuild`` flag and runs :func:`build_index` with it.
    """
    parser = argparse.ArgumentParser(
        description="Build FDAM RAG knowledge base index"
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Clear existing index and rebuild from scratch",
    )

    args = parser.parse_args()
    build_index(rebuild=args.rebuild)


if __name__ == "__main__":
    main()