#!/usr/bin/env python3
"""Script to ingest documents and save to pickle for in-memory use.

Workflow: load markdown documents from the configured path, split them into
chunks, index the chunks with ``MemoryDocumentIndexer``, and pickle the
indexer's ``document_store`` to ``data/document_store.pkl`` (relative to the
current working directory).
"""

import sys
import logging
import pickle
from pathlib import Path

# Put the parent of this script's directory on sys.path so the ``src.``
# imports below resolve when the script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer


def setup_logging() -> None:
    """Configure root logging at INFO level with a timestamped format."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )


def _save_document_store(document_store, output_file: Path) -> None:
    """Pickle ``document_store`` to ``output_file`` without clobbering on failure.

    The data is first written to a sibling temporary file and then moved over
    ``output_file``. Opening ``output_file`` directly in ``"wb"`` mode would
    truncate an existing store before pickling starts, so a failure midway
    would leave a corrupt file where a good one used to be.

    Args:
        document_store: The object to serialize with ``pickle.dump``.
        output_file: Destination path; its parent directory must exist.

    Raises:
        Whatever ``open``, ``pickle.dump`` or ``Path.replace`` raise; the
        temporary file is removed before the exception propagates.
    """
    # Same directory as the target so the final rename stays on one filesystem.
    temp_file = output_file.with_name(output_file.name + ".tmp")
    try:
        with open(temp_file, "wb") as f:
            pickle.dump(document_store, f)
        temp_file.replace(output_file)
    finally:
        # After a successful replace the temp file no longer exists, so this
        # only cleans up after a failed write.
        if temp_file.exists():
            temp_file.unlink()


def main() -> None:
    """Main ingestion workflow.

    Exits the process with status 1 if the loader returns no documents.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    logger.info("Starting document ingestion process (in-memory)...")

    # Load configuration
    config = get_config()
    logger.info(f"Using documents path: {config.document_processing.documents_path}")

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()

    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)

    logger.info(f"Loaded {len(documents)} documents")

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)
    logger.info(f"Created {len(chunked_documents)} chunks")

    # Index documents in memory
    logger.info("Indexing documents in memory...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)
    indexed_count = indexer.index_documents(chunked_documents)
    logger.info(f"Successfully indexed {indexed_count} document chunks")

    # Save document store to pickle for later use.
    # NOTE(review): pickle files execute code on load; consumers should only
    # unpickle this file from a trusted location.
    output_file = Path("data/document_store.pkl")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Saving document store to {output_file}...")
    _save_document_store(indexer.document_store, output_file)

    logger.info("✅ Document ingestion completed successfully!")
    logger.info(f"Document store saved to: {output_file}")
    logger.info(f"Total documents indexed: {indexed_count}")


if __name__ == "__main__":
    main()