| """Index builder for FDAM RAG knowledge base. | |
| Processes markdown documents from RAG-KB/ and indexes them in ChromaDB. | |
| Usage: | |
| python -m rag.index_builder [--rebuild] | |
| """ | |
| import argparse | |
| from pathlib import Path | |
| from rag.chunker import SemanticChunker | |
| from rag.vectorstore import ChromaVectorStore | |
# Document configuration: filename -> (category, priority).
# `category` tags the knowledge domain for retrieval filtering;
# `priority` ranks authority (primary > reference-threshold > reference-narrative).
DOCUMENT_CONFIG = {
    # PRIMARY - FDAM Methodology (authoritative source)
    "FDAM_v4_METHODOLOGY.md": ("methodology", "primary"),
    # REFERENCE - Threshold Tables (critical for metals clearance)
    "Metals clearance criteria-QVC.md": ("thresholds", "reference-threshold"),
    # REFERENCE - Narrative (supporting documentation)
    "air-o-cell-method-guide-atlas.md": ("lab-methods", "reference-narrative"),
    "Industrial Hygiene Lab Services Guide.md": ("lab-methods", "reference-narrative"),
    "Fire Remediation Processes and Methodologies_ A Review of Industry-Endorsed Standards.md": (
        "cleaning-procedures",
        "reference-narrative",
    ),
    "Technical Guide for Wildfire Restoration - Key Information.md": (
        "wildfire",
        "reference-narrative",
    ),
    "wildfire_soot_particulate_removal_full_text_extraction.md": (
        "wildfire",
        "reference-narrative",
    ),
}
# Files to skip (per user decision); also suppresses the PDF-conversion notice.
SKIP_FILES = {
    "Lead Contamination in Indoor Firing_Gun Ranges _ Atlantic Environmental.pdf",
}
def get_rag_kb_path() -> Path:
    """Locate the RAG-KB documents directory.

    Looks next to this package first, then in the current working
    directory, returning the first location that exists.

    Returns:
        Path to the RAG-KB directory.

    Raises:
        FileNotFoundError: If neither candidate location exists.
    """
    candidates = (
        Path(__file__).parent.parent / "RAG-KB",  # sibling of this package
        Path("RAG-KB"),                           # relative to CWD
    )
    for candidate in candidates:
        if candidate.exists():
            return candidate
    raise FileNotFoundError("Could not find RAG-KB directory")
def get_chroma_path() -> Path:
    """Return the ChromaDB persistence directory (sibling of this package).

    The directory is not created or checked for existence here; the
    vector store is responsible for initializing it.
    """
    return Path(__file__).parent.parent / "chroma_db"
def build_index(rebuild: bool = False) -> dict:
    """Build the RAG index from RAG-KB documents.

    Reads each configured markdown file from the RAG-KB directory,
    chunks it with SemanticChunker, and stores the chunks in ChromaDB.
    Previously indexed chunks for a source are replaced, so repeated
    runs perform incremental updates.

    Args:
        rebuild: If True, clear existing index before building.

    Returns:
        Statistics about the indexing operation with keys
        "documents_processed", "documents_skipped", "chunks_created",
        and "errors" (list of error message strings).

    Raises:
        FileNotFoundError: If the RAG-KB directory cannot be located.
    """
    rag_kb_path = get_rag_kb_path()
    chroma_path = get_chroma_path()
    print(f"RAG-KB path: {rag_kb_path}")
    print(f"ChromaDB path: {chroma_path}")

    # Initialize components
    chunker = SemanticChunker()
    vectorstore = ChromaVectorStore(persist_directory=str(chroma_path))

    if rebuild:
        print("Rebuilding index - clearing existing data...")
        vectorstore.clear()

    stats = {
        "documents_processed": 0,
        "documents_skipped": 0,
        "chunks_created": 0,
        "errors": [],
    }

    # Process markdown files
    for md_file in rag_kb_path.glob("*.md"):
        filename = md_file.name
        # Skip files in the explicit skip list or missing from the config.
        if filename in SKIP_FILES:
            # FIX: f-strings previously printed the literal "(unknown)"
            # instead of the filename (lost placeholder).
            print(f"Skipping (excluded): {filename}")
            stats["documents_skipped"] += 1
            continue
        if filename not in DOCUMENT_CONFIG:
            print(f"Skipping (not configured): {filename}")
            stats["documents_skipped"] += 1
            continue

        category, priority = DOCUMENT_CONFIG[filename]
        print(f"Processing: {filename} ({category}, {priority})")
        try:
            # Read and chunk document
            text = md_file.read_text(encoding="utf-8")
            chunks = chunker.chunk_document(
                text=text,
                source=filename,
                category=category,
                priority=priority,
            )
            # Remove any previously indexed chunks for this source
            # (incremental update: replace rather than duplicate).
            existing_count = vectorstore.delete_by_source(filename)
            if existing_count > 0:
                print(f"  Replaced {existing_count} existing chunks")
            # Add to vectorstore
            added = vectorstore.add_chunks(chunks)
            print(f"  Added {added} chunks")
            stats["documents_processed"] += 1
            stats["chunks_created"] += added
        except Exception as e:
            # Record the error and keep going so one bad document
            # does not abort the whole build.
            error_msg = f"Error processing {filename}: {e}"
            print(f"  ERROR: {e}")
            stats["errors"].append(error_msg)

    # Report on PDFs that need conversion to markdown before indexing.
    for pdf_file in rag_kb_path.glob("*.pdf"):
        if pdf_file.name not in SKIP_FILES:
            print(f"Note: PDF needs conversion to .md: {pdf_file.name}")

    # Print summary
    print("\n" + "=" * 50)
    print("Index Build Complete")
    print("=" * 50)
    print(f"Documents processed: {stats['documents_processed']}")
    print(f"Documents skipped: {stats['documents_skipped']}")
    print(f"Total chunks created: {stats['chunks_created']}")
    if stats["errors"]:
        print(f"Errors: {len(stats['errors'])}")
        for err in stats["errors"]:
            print(f"  - {err}")

    # Print collection stats
    collection_stats = vectorstore.get_stats()
    print("\nCollection stats:")
    print(f"  Total chunks in DB: {collection_stats['total_chunks']}")
    print(f"  Categories: {collection_stats['categories']}")
    print(f"  Priorities: {collection_stats['priorities']}")
    return stats
def main():
    """CLI entry point."""
    parser = argparse.ArgumentParser(
        description="Build FDAM RAG knowledge base index"
    )
    parser.add_argument(
        "--rebuild",
        action="store_true",
        help="Clear existing index and rebuild from scratch",
    )
    options = parser.parse_args()
    build_index(rebuild=options.rebuild)
# Allow running as a script: `python -m rag.index_builder [--rebuild]`.
if __name__ == "__main__":
    main()