File size: 1,313 Bytes
f866820
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
# RAG-document-assistant/scripts/test_ingestion.py
"""
CLI utility for testing the ingestion pipeline.

Purpose:
    Tests the document ingestion pipeline by loading documents, chunking them,
    and printing summary statistics. Does not generate embeddings or save files.

Inputs:
    docs_dir (str): Path to directory containing markdown documents

Outputs:
    Prints document and chunk counts to stdout
    Displays a sample chunk for verification

Usage:
    python scripts/test_ingestion.py <docs_dir>

Example:
    python scripts/test_ingestion.py ./sample_docs
"""

from src.ingestion.load_docs import load_markdown_docs
from src.ingestion.chunker import chunk_documents

import sys
import os

def main() -> None:
    """Run the ingestion smoke test for the directory named on the command line.

    Reads the documents directory from ``sys.argv[1]``, loads it with
    ``load_markdown_docs``, chunks the result with ``chunk_documents``
    (``max_tokens=300``, ``overlap=50``), and prints to stdout:

    * the number of loaded documents whose ``"status"`` is ``"OK"``,
    * the number of chunks generated,
    * when at least one chunk exists, up to the first 300 characters of the
      first chunk's ``"text"`` value (presumably a string).

    Raises:
        SystemExit: With exit code 1 when no directory argument is given or
            when the given path is not an existing directory.
    """
    if len(sys.argv) < 2:
        # Diagnostics go to stderr so stdout carries only the statistics.
        print("Usage: python scripts/test_ingestion.py <docs_dir>", file=sys.stderr)
        sys.exit(1)

    docs_dir = sys.argv[1]
    # Fail fast with a readable message rather than depending on how the
    # loader reacts to a missing or non-directory path.
    if not os.path.isdir(docs_dir):
        print(f"Error: not a directory: {docs_dir}", file=sys.stderr)
        sys.exit(1)

    docs = load_markdown_docs(docs_dir)
    chunks = chunk_documents(docs, max_tokens=300, overlap=50)

    # Only documents whose status is "OK" are counted as loaded.
    total_docs = sum(1 for d in docs if d.get("status") == "OK")
    total_chunks = len(chunks)

    print(f"Docs loaded: {total_docs}")
    print(f"Chunks generated: {total_chunks}")

    # Print a sample chunk for verification
    if chunks:
        print("\nSample chunk:")
        print(chunks[0]["text"][:300])

# Run the smoke test only when this file is executed as a script, so that
# importing the module does not trigger it.
if __name__ == "__main__":
    main()