# bfh-studadmin-assist / scripts / ingest_documents_memory.py
# Origin: commit 0021e2f (author: awellis) — "Refactor RAG Email Assistant for
# in-memory processing; update configurations, implement memory indexing and
# retrieval, enhance Gradio UI, and streamline document ingestion."
#!/usr/bin/env python3
"""Script to ingest documents and save to pickle for in-memory use."""
import sys
import logging
import pickle
from pathlib import Path
# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer
def setup_logging():
    """Set up root logging at INFO level with a timestamped message format."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
def main():
    """Run the full ingestion workflow: load, chunk, index, and persist.

    Steps:
        1. Load markdown documents from the configured documents path.
        2. Split them into chunks via :class:`SemanticChunker`.
        3. Index the chunks in memory with :class:`MemoryDocumentIndexer`.
        4. Pickle the resulting document store to ``data/document_store.pkl``.

    Exits with status 1 if no documents are found.
    """
    setup_logging()
    logger = logging.getLogger(__name__)
    logger.info("Starting document ingestion process (in-memory)...")

    # Load configuration
    config = get_config()
    # NOTE: lazy %-style args so formatting is skipped when the level is off.
    logger.info("Using documents path: %s", config.document_processing.documents_path)

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()
    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)
    logger.info("Loaded %d documents", len(documents))

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)
    logger.info("Created %d chunks", len(chunked_documents))

    # Index documents in memory
    logger.info("Indexing documents in memory...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)
    indexed_count = indexer.index_documents(chunked_documents)
    logger.info("Successfully indexed %d document chunks", indexed_count)

    # Persist the document store for later in-memory loading.
    # NOTE(review): pickle is only safe because this file is produced and
    # consumed by this project — never unpickle untrusted data.
    output_file = Path("data/document_store.pkl")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    logger.info("Saving document store to %s...", output_file)
    with output_file.open("wb") as f:
        # HIGHEST_PROTOCOL: smaller/faster; pickle.load reads any protocol.
        pickle.dump(indexer.document_store, f, protocol=pickle.HIGHEST_PROTOCOL)

    logger.info("✅ Document ingestion completed successfully!")
    logger.info("Document store saved to: %s", output_file)
    logger.info("Total documents indexed: %d", indexed_count)
# Script entry point: run the ingestion workflow when executed directly.
if __name__ == "__main__":
    main()