File size: 2,524 Bytes
0021e2f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#!/usr/bin/env python3
"""Script to ingest documents and save to pickle for in-memory use."""

import sys
import logging
import pickle
from pathlib import Path

# Add src to path
sys.path.insert(0, str(Path(__file__).parent.parent))

from src.config import get_config
from src.document_processing.loader import MarkdownDocumentLoader
from src.document_processing.chunker import SemanticChunker
from src.indexing.memory_indexer import MemoryDocumentIndexer


def setup_logging():
    """Set up root-logger output: INFO level, timestamped message format."""
    log_format = "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    logging.basicConfig(level=logging.INFO, format=log_format)


def main(output_path="data/document_store.pkl"):
    """Run the full in-memory ingestion workflow.

    Loads markdown documents from the configured path, chunks them,
    indexes the chunks in memory, and pickles the resulting document
    store to *output_path*.

    Args:
        output_path: Destination for the pickled document store.
            Defaults to ``data/document_store.pkl``; parent directories
            are created as needed.

    Raises:
        SystemExit: With status 1 if no documents could be loaded.
    """
    setup_logging()
    logger = logging.getLogger(__name__)

    logger.info("Starting document ingestion process (in-memory)...")

    # Load configuration
    config = get_config()
    # Lazy %-style args avoid string formatting when the level is disabled.
    logger.info("Using documents path: %s", config.document_processing.documents_path)

    # Load documents
    logger.info("Loading markdown documents...")
    loader = MarkdownDocumentLoader(config.document_processing.documents_path)
    documents = loader.load_documents()

    if not documents:
        logger.error("No documents loaded. Exiting.")
        sys.exit(1)

    logger.info("Loaded %d documents", len(documents))

    # Chunk documents
    logger.info("Chunking documents...")
    chunker = SemanticChunker(
        chunk_size=config.document_processing.chunk_size,
        chunk_overlap=config.document_processing.chunk_overlap,
        min_chunk_size=config.document_processing.min_chunk_size,
    )
    chunked_documents = chunker.chunk_documents(documents)

    logger.info("Created %d chunks", len(chunked_documents))

    # Index documents in memory
    logger.info("Indexing documents in memory...")
    indexer = MemoryDocumentIndexer(llm_config=config.llm)

    indexed_count = indexer.index_documents(chunked_documents)

    logger.info("Successfully indexed %d document chunks", indexed_count)

    # Save document store to pickle for later use
    output_file = Path(output_path)
    output_file.parent.mkdir(parents=True, exist_ok=True)

    logger.info("Saving document store to %s...", output_file)
    with open(output_file, "wb") as f:
        pickle.dump(indexer.document_store, f)

    logger.info("✅ Document ingestion completed successfully!")
    logger.info("Document store saved to: %s", output_file)
    logger.info("Total documents indexed: %d", indexed_count)


if __name__ == "__main__":
    main()