# bfh-studadmin-assist / scripts / ingest_documents_memory.py
# Origin: commit 0021e2f (author: awellis) — "Refactor RAG Email Assistant for
# in-memory processing; update configurations, implement memory indexing and
# retrieval, enhance Gradio UI, and streamline document ingestion."
#!/usr/bin/env python3
"""Script to ingest documents and save to pickle for in-memory use."""
import sys
import logging
import pickle
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer
def setup_logging():
    """Set up root logging at INFO level with a timestamped message format."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
def main():
    """Run the full ingestion workflow: load, chunk, index, and persist.

    Steps:
        1. Load markdown documents from the configured documents path.
        2. Split them into chunks via :class:`SemanticChunker`.
        3. Index the chunks in memory with :class:`MemoryDocumentIndexer`.
        4. Pickle the resulting document store to ``data/document_store.pkl``.

    Exits with status 1 if no documents are found.
    """
    setup_logging()
    logger = logging.getLogger(__name__)
    logger.info("Starting document ingestion process (in-memory)...")

    # Load configuration
    config = get_config()
    # NOTE: lazy %-style args so formatting is skipped when the level is off.
    logger.info("Using documents path: %s", config.document_processing.documents_path)

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()
    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)
    logger.info("Loaded %d documents", len(documents))

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)
    logger.info("Created %d chunks", len(chunked_documents))

    # Index documents in memory
    logger.info("Indexing documents in memory...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)
    indexed_count = indexer.index_documents(chunked_documents)
    logger.info("Successfully indexed %d document chunks", indexed_count)

    # Persist the document store for later in-memory loading.
    # NOTE(review): pickle is only safe because this file is produced and
    # consumed by this project — never unpickle untrusted data.
    output_file = Path("data/document_store.pkl")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Saving document store to %s...", output_file)
    with output_file.open("wb") as f:
        # HIGHEST_PROTOCOL: smaller/faster; pickle.load reads any protocol.
        pickle.dump(indexer.document_store, f, protocol=pickle.HIGHEST_PROTOCOL)

    logger.info("✅ Document ingestion completed successfully!")
    logger.info("Document store saved to: %s", output_file)
    logger.info("Total documents indexed: %d", indexed_count)
# Script entry point: run the ingestion workflow when executed directly.
if __name__ == "__main__":
    main()