Spaces:
Sleeping
Sleeping
File size: 1,313 Bytes
f866820 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 |
# RAG-document-assistant/scripts/test_ingestion.py
"""
CLI utility for testing ingestion pipeline.
Purpose:
Tests the document ingestion pipeline by loading documents, chunking them,
and printing summary statistics. Does not generate embeddings or save files.
Inputs:
docs_dir (str): Path to directory containing markdown documents
Outputs:
Prints document and chunk counts to stdout
Displays a sample chunk for verification
Usage:
python scripts/test_ingestion.py <docs_dir>
Example:
python scripts/test_ingestion.py ./sample_docs
"""
from src.ingestion.load_docs import load_markdown_docs
from src.ingestion.chunker import chunk_documents
import sys
import os
def main():
    """Run the ingestion smoke test for the directory given on the command line.

    Takes the documents directory from ``sys.argv[1]``, passes it to
    ``load_markdown_docs``, and feeds the result to ``chunk_documents`` with
    ``max_tokens=300`` and ``overlap=50``. Prints the number of documents whose
    ``"status"`` is ``"OK"``, the number of chunks, and, when at least one
    chunk exists, the first 300 characters of the first chunk's ``"text"``.

    Prints a usage message and exits with status 1 when no directory argument
    is supplied.
    """
    cli_args = sys.argv[1:]
    if not cli_args:
        print("Usage: python scripts/test_ingestion.py <docs_dir>")
        sys.exit(1)

    source_dir = cli_args[0]
    loaded = load_markdown_docs(source_dir)
    pieces = chunk_documents(loaded, max_tokens=300, overlap=50)

    # Only documents whose status is "OK" are counted as loaded.
    ok_count = sum(1 for doc in loaded if doc.get("status") == "OK")
    piece_count = len(pieces)

    print(f"Docs loaded: {ok_count}")
    print(f"Chunks generated: {piece_count}")

    # Nothing to preview when chunking produced no output.
    if not pieces:
        return

    # Show the start of the first chunk so the output can be checked by eye.
    print("\nSample chunk:")
    print(pieces[0]["text"][:300])
# Run the ingestion check only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()