#!/usr/bin/env python3
"""Script to ingest documents and save to JSON (pickle doesn't work properly)."""

import sys
import logging
import json
from pathlib import Path

sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer


def setup_logging():
    """Configure logging."""
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    )
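

# Companion helper (a sketch, not part of the original ingestion flow):
# reads back the JSON layout written by main() below, one record per
# document with id/content/embedding/meta keys.
def load_embedded_documents(path: Path) -> list:
    """Load embedded-document records saved by this script."""
    with open(path, encoding="utf-8") as f:
        return json.load(f)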


def main():
    """Main ingestion workflow."""
    setup_logging()
    logger = logging.getLogger(__name__)

    logger.info("Starting document ingestion process...")

    # Load configuration
    config = get_config()
    logger.info(f"Using documents path: {config.document_processing.documents_path}")

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()

    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)

    logger.info(f"Loaded {len(documents)} documents")

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)

    logger.info(f"Created {len(chunked_documents)} chunks")

    # Index documents in memory
    logger.info("Indexing documents (generating embeddings)...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)
    indexed_count = indexer.index_documents(chunked_documents)

    logger.info(f"Successfully indexed {indexed_count} document chunks")

    # Save embedded documents to JSON
    output_file = Path("data/embedded_documents.json")
    output_file.parent.mkdir(parents=True, exist_ok=True)

    logger.info(f"Saving embedded documents to {output_file}...")

    # Serialize documents into JSON-safe records
    docs_data = []
    for doc in indexer.document_store.filter_documents():
        docs_data.append({
            "id": doc.id,
            "content": doc.content,
            # Embeddings may come back as numpy arrays, whose elements
            # json.dump cannot serialize; coerce to plain floats first.
            "embedding": [float(x) for x in doc.embedding] if doc.embedding is not None else None,
            "meta": doc.meta or {},
        })

    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(docs_data, f)
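
    # Sanity check (sketch): verify the file round-trips through json.load
    # before downstream code relies on it.
    assert len(load_embedded_documents(output_file)) == len(docs_data)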

    logger.info("✅ Document ingestion completed successfully!")
    logger.info(f"Embedded documents saved to: {output_file}")
    logger.info(f"Total documents indexed: {len(docs_data)}")


if __name__ == "__main__":
    main()