File size: 2,663 Bytes
cff1a2a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
"""
Ingestion CLI for the Insurance RAG System.
Supports incremental and force re-ingestion.
"""

import os
import sys
import argparse

# Ensure project root is in path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

from ingestion.pipeline import IngestionPipeline
from rag.vector_store import VectorStoreManager
from ingestion.state import IngestionState
from tqdm import tqdm


def main():
    parser = argparse.ArgumentParser(description="Insurance Document Ingestion")
    parser.add_argument("--force", action="store_true", 
                       help="Force full re-ingestion of all documents")
    args = parser.parse_args()

    DOCS_DIR = "docs"
    state = IngestionState()
    pipeline = IngestionPipeline(DOCS_DIR)
    vector_manager = VectorStoreManager()

    # 1. Identify files to process
    all_files = []
    for root, _, files in os.walk(DOCS_DIR):
        for file in files:
            if file.lower().endswith(('.pdf', '.docx')):
                all_files.append(os.path.join(root, file))

    files_to_process = []
    if args.force:
        files_to_process = all_files
        print("FORCE mode: Re-indexing all files with enhanced metadata.")
    else:
        for f in all_files:
            if state.is_file_changed(f):
                files_to_process.append(f)
        
    if not files_to_process:
        print("Everything is up to date. No new or modified documents found.")
        return

    print(f"Found {len(files_to_process)} documents to process.")

    # 2. Process files using enhanced pipeline with progress percentage
    all_chunks = []
    total_files = len(files_to_process)
    for idx, file_path in enumerate(files_to_process, 1):
        try:
            # Show percentage progress
            percent = (idx / total_files) * 100
            print(f"\r[{percent:5.1f}%] Processing ({idx}/{total_files}): {os.path.basename(file_path)[:40]:<40}", end="", flush=True)
            
            chunks = pipeline.process_single_file(file_path)
            all_chunks.extend(chunks)
            state.update_file(file_path)
        except Exception as e:
            print(f"\nError on {file_path}: {e}")

    print(f"Total chunks created: {len(all_chunks)}")

    # 3. Update Vector Store
    if all_chunks:
        if args.force:
            print("Creating new vector store...")
            vector_manager.create_vector_store(all_chunks)
        else:
            print("Updating existing vector store...")
            vector_manager.update_vector_store(all_chunks)
        
        state.save_state()
        
        
        print("\nIngestion complete!")


if __name__ == "__main__":
    main()