Spaces:
Running on CPU Upgrade
| #!/usr/bin/env python3 | |
| """Script to ingest documents and save to pickle for in-memory use.""" | |
| import sys | |
| import logging | |
| import pickle | |
| from pathlib import Path | |
| # Add src to path | |
| sys.path.insert(0, str(Path(__file__).parent.parent)) | |
| from src.config import get_config | |
| from src.document_processing.loader import MarkdownDocumentLoader | |
| from src.document_processing.chunker import SemanticChunker | |
| from src.indexing.memory_indexer import MemoryDocumentIndexer | |
def setup_logging():
    """Set up root logging at INFO level with a timestamped record format."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
def main(output_path="data/document_store.pkl"):
    """Run the full ingestion workflow: load, chunk, index, persist.

    Loads markdown documents from the configured documents path, splits
    them into semantic chunks, indexes the chunks in memory, and pickles
    the resulting document store so later runs can reuse it without
    re-indexing.

    Args:
        output_path: Destination file for the pickled document store.
            Defaults to ``data/document_store.pkl`` (previous hard-coded
            behavior), so existing callers are unaffected.

    Raises:
        SystemExit: With status 1 if no documents are found.
    """
    setup_logging()
    logger = logging.getLogger(__name__)
    logger.info("Starting document ingestion process (in-memory)...")

    # Load configuration
    config = get_config()
    # Lazy %-style args: interpolation is skipped if the level is disabled.
    logger.info("Using documents path: %s", config.document_processing.documents_path)

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()
    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)
    logger.info("Loaded %d documents", len(documents))

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)
    logger.info("Created %d chunks", len(chunked_documents))

    # Index documents in memory
    logger.info("Indexing documents in memory...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)
    indexed_count = indexer.index_documents(chunked_documents)
    logger.info("Successfully indexed %d document chunks", indexed_count)

    # Persist the in-memory document store for later use.
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Saving document store to %s...", output_file)
    with open(output_file, "wb") as f:
        pickle.dump(indexer.document_store, f)

    logger.info("✅ Document ingestion completed successfully!")
    logger.info("Document store saved to: %s", output_file)
    logger.info("Total documents indexed: %d", indexed_count)
# Script entry point: run ingestion only when executed directly, not on import.
if __name__ == "__main__":
    main()