bfh-studadmin-assist / scripts /ingest_documents_json.py
awellis's picture
Refactor document ingestion and processing; update configurations for chunking and retrieval, enhance error logging, and implement markdown-aware chunking
78a356b
#!/usr/bin/env python3
"""Script to ingest documents and save to JSON (pickle doesn't work properly)."""
import sys
import logging
import json
from pathlib import Path
sys.path.insert(0, str(Path(__file__).parent.parent))
from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer
def setup_logging() -> None:
    """Initialize root logging at INFO level with a timestamped format."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)
def main():
"""Main ingestion workflow."""
setup_logging()
logger = logging.getLogger(__name__)
logger.info("Starting document ingestion process...")
# Load configuration
config = get_config()
logger.info(f"Using documents path: {config.document_processing.documents_path}")
# Load documents
logger.info("Loading markdown documents...")
loader = MarkdownDocumentLoader(config.document_processing.documents_path)
documents = loader.load_documents()
if not documents:
logger.error("No documents loaded. Exiting.")
sys.exit(1)
logger.info(f"Loaded {len(documents)} documents")
# Chunk documents
logger.info("Chunking documents...")
chunker = SemanticChunker(
chunk_size=config.document_processing.chunk_size,
chunk_overlap=config.document_processing.chunk_overlap,
min_chunk_size=config.document_processing.min_chunk_size,
)
chunked_documents = chunker.chunk_documents(documents)
logger.info(f"Created {len(chunked_documents)} chunks")
# Index documents in memory
logger.info("Indexing documents (generating embeddings)...")
indexer = MemoryDocumentIndexer(llm_config=config.llm)
indexed_count = indexer.index_documents(chunked_documents)
logger.info(f"Successfully indexed {indexed_count} document chunks")
# Save embedded documents to JSON
output_file = Path("data/embedded_documents.json")
output_file.parent.mkdir(parents=True, exist_ok=True)
logger.info(f"Saving embedded documents to {output_file}...")
# Serialize documents
docs_data = []
for doc in indexer.document_store.filter_documents():
docs_data.append({
"id": doc.id,
"content": doc.content,
"embedding": doc.embedding,
"meta": doc.meta or {}
})
with open(output_file, "w") as f:
json.dump(docs_data, f)
logger.info("✅ Document ingestion completed successfully!")
logger.info(f"Embedded documents saved to: {output_file}")
logger.info(f"Total documents indexed: {len(docs_data)}")
if __name__ == "__main__":
main()