bfh-studadmin-assist / scripts /ingest_documents_json.py
awellis's picture
Refactor document ingestion and processing; update configurations for chunking and retrieval, enhance error logging, and implement markdown-aware chunking
78a356b
#!/usr/bin/env python3
"""Script to ingest documents and save to JSON (pickle doesn't work properly)."""
import sys
import logging
import json
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer
def setup_logging() -> None:
    """Initialize root logging at INFO level with a timestamped format."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
def main():
"""Main ingestion workflow."""
setup_logging()
logger = logging.getLogger(__name__)
logger.info("Starting document ingestion process...")
# Load configuration
config = get_config()
logger.info(f"Using documents path: {config.document_processing.documents_path}")
# Load documents
logger.info("Loading markdown documents...")
loader = MarkdownDocumentLoader(config.document_processing.documents_path)
documents = loader.load_documents()
if not documents:
logger.error("No documents loaded. Exiting.")
sys.exit(1)
logger.info(f"Loaded {len(documents)} documents")
# Chunk documents
logger.info("Chunking documents...")
chunker = SemanticChunker(
chunk_size=config.document_processing.chunk_size,
chunk_overlap=config.document_processing.chunk_overlap,
min_chunk_size=config.document_processing.min_chunk_size,
)
chunked_documents = chunker.chunk_documents(documents)
logger.info(f"Created {len(chunked_documents)} chunks")
# Index documents in memory
logger.info("Indexing documents (generating embeddings)...")
indexer = MemoryDocumentIndexer(llm_config=config.llm)
indexed_count = indexer.index_documents(chunked_documents)
logger.info(f"Successfully indexed {indexed_count} document chunks")
# Save embedded documents to JSON
output_file = Path("data/embedded_documents.json")
output_file.parent.mkdir(parents=True, exist_ok=True)
logger.info(f"Saving embedded documents to {output_file}...")
# Serialize documents
docs_data = []
for doc in indexer.document_store.filter_documents():
docs_data.append({
"id": doc.id,
"content": doc.content,
"embedding": doc.embedding,
"meta": doc.meta or {}
})
with open(output_file, "w") as f:
json.dump(docs_data, f)
logger.info("✅ Document ingestion completed successfully!")
logger.info(f"Embedded documents saved to: {output_file}")
logger.info(f"Total documents indexed: {len(docs_data)}")
if __name__ == "__main__":
main()