#!/usr/bin/env python3
"""Script to ingest documents and save to JSON (pickle doesn't work properly)."""

import sys
import logging
import json
from pathlib import Path

# Make the project root importable when this script is run directly.
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer


def setup_logging():
    """Configure root logging at INFO level with a timestamped format."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )


def main():
    """Main ingestion workflow.

    Loads markdown documents, chunks them, generates embeddings via the
    in-memory indexer, and serializes the embedded documents to
    ``data/embedded_documents.json``.

    Exits the process with status 1 when no documents are found.
    """
    setup_logging()
    logger = logging.getLogger(__name__)
    logger.info("Starting document ingestion process...")

    # Load configuration
    config = get_config()
    logger.info(f"Using documents path: {config.document_processing.documents_path}")

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()
    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)
    logger.info(f"Loaded {len(documents)} documents")

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)
    logger.info(f"Created {len(chunked_documents)} chunks")

    # Index documents in memory (this is where embeddings are generated)
    logger.info("Indexing documents (generating embeddings)...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)
    indexed_count = indexer.index_documents(chunked_documents)
    logger.info(f"Successfully indexed {indexed_count} document chunks")

    # Save embedded documents to JSON
    output_file = Path("data/embedded_documents.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info(f"Saving embedded documents to {output_file}...")

    # Serialize documents.
    # NOTE(review): assumes doc.embedding is a JSON-serializable list of
    # floats — confirm the indexer does not return e.g. numpy arrays.
    docs_data = [
        {
            "id": doc.id,
            "content": doc.content,
            "embedding": doc.embedding,
            "meta": doc.meta or {},
        }
        for doc in indexer.document_store.filter_documents()
    ]

    # Write as UTF-8 explicitly so output is portable across platforms.
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(docs_data, f)

    logger.info("✅ Document ingestion completed successfully!")
    logger.info(f"Embedded documents saved to: {output_file}")
    logger.info(f"Total documents indexed: {len(docs_data)}")


if __name__ == "__main__":
    main()