"""
ingest.py
---------
One-time CLI script to load all documents from data/raw/,
sanitize them, split into chunks, embed, and persist the FAISS index.

Pipeline:
  1. Load documents from directory
  2. Sanitize text (remove noise, normalize encoding)
  3. Split into structured chunks with validation
  4. Load embedding model
  5. Build and persist FAISS vector store

Usage
-----
    python scripts/ingest.py
    python scripts/ingest.py --data-dir /path/to/docs
    python scripts/ingest.py --chunk-size 600 --chunk-overlap 60
    python scripts/ingest.py --skip-sanitize  # Skip sanitization step
"""

import argparse
import logging
import sys
import time
from pathlib import Path

# Ensure project root is importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from app.config import CHUNK_OVERLAP, CHUNK_SIZE, DATA_RAW_DIR, VECTOR_DB_PATH
from components.document_loader import load_documents_from_directory
from components.embedder import HuggingFaceEmbedder
from components.sanitizer import sanitize_documents
from components.text_splitter import split_documents
from components.vector_store import VectorStore

# Root-logger setup for the CLI: INFO and above, with an HH:MM:SS timestamp
# and a left-aligned, 8-character-wide level name.
_LOG_FORMAT = "%(asctime)s  %(levelname)-8s  %(message)s"
_LOG_DATE_FORMAT = "%H:%M:%S"

logging.basicConfig(
    format=_LOG_FORMAT,
    datefmt=_LOG_DATE_FORMAT,
    level=logging.INFO,
)

# Module-level logger used for error reporting in this script.
logger = logging.getLogger(__name__)


def parse_args() -> argparse.Namespace:
    """Parse the ingestion script's command-line options.

    Returns:
        Namespace with ``data_dir`` (str), ``chunk_size`` (int),
        ``chunk_overlap`` (int) and ``skip_sanitize`` (bool). Defaults come
        from ``DATA_RAW_DIR``, ``CHUNK_SIZE`` and ``CHUNK_OVERLAP``.
    """
    # (flag, add_argument keyword arguments), registered in this order so the
    # generated --help output keeps the same layout.
    option_table = (
        (
            "--data-dir",
            {
                "type": str,
                "default": str(DATA_RAW_DIR),
                "help": "Directory containing source documents (default: data/raw/)",
            },
        ),
        (
            "--chunk-size",
            {
                "type": int,
                "default": CHUNK_SIZE,
                "help": f"Characters per chunk (default: {CHUNK_SIZE})",
            },
        ),
        (
            "--chunk-overlap",
            {
                "type": int,
                "default": CHUNK_OVERLAP,
                "help": f"Overlap between chunks (default: {CHUNK_OVERLAP})",
            },
        ),
        (
            "--skip-sanitize",
            {
                "action": "store_true",
                "help": "Skip document sanitization (not recommended)",
            },
        ),
    )

    cli = argparse.ArgumentParser(description="Ingest documents into FAISS vector store.")
    for flag, options in option_table:
        cli.add_argument(flag, **options)
    return cli.parse_args()


def main() -> None:
    """Run the five-step ingestion pipeline from the command line.

    Steps: load documents from the data directory, sanitize them (unless
    ``--skip-sanitize`` is given), split them into chunks, load the embedding
    model, then build the vector store at ``VECTOR_DB_PATH``. Progress is
    printed to stdout; fatal conditions are reported through ``logger``.

    Raises:
        SystemExit: With status 1 when the data path does not exist or is not
            a directory, when no documents are loaded, when sanitization
            leaves no documents, or when splitting yields no chunks.
    """
    args = parse_args()

    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        logger.error("Data directory not found: %s", data_dir)
        sys.exit(1)
    if not data_dir.is_dir():
        # exists() is also true for a regular file; reject that here instead
        # of handing a non-directory to the loader.
        logger.error("Data path is not a directory: %s", data_dir)
        sys.exit(1)

    sanitize_state = "OFF" if args.skip_sanitize else "ON"
    print("\n" + "=" * 60)
    print("  RAG Chatbot — Document Ingestion")
    print("=" * 60)
    print(f"  Source dir     : {data_dir}")
    print(f"  Chunk size     : {args.chunk_size} chars")
    print(f"  Overlap        : {args.chunk_overlap} chars")
    print(f"  Sanitization   : {sanitize_state}")
    print(f"  Index path     : {VECTOR_DB_PATH}")
    print("=" * 60 + "\n")

    # perf_counter() is monotonic, so the reported durations cannot go
    # negative or jump if the system clock is adjusted mid-run.

    # ── Step 1: Load documents ────────────────────────────────────────────────
    t0 = time.perf_counter()
    print("📄 Step 1/5  Loading documents …")
    docs = load_documents_from_directory(data_dir)
    if not docs:
        logger.error("No supported documents found in '%s'.", data_dir)
        sys.exit(1)
    print(f"   Loaded {len(docs)} page(s) in {time.perf_counter() - t0:.1f}s\n")

    # ── Step 2: Sanitize ──────────────────────────────────────────────────────
    if not args.skip_sanitize:
        print("🧹 Step 2/5  Sanitizing documents …")
        t_san = time.perf_counter()
        docs = sanitize_documents(docs)
        if not docs:
            logger.error("All documents were invalid after sanitization.")
            sys.exit(1)
        print(f"   Sanitized in {time.perf_counter() - t_san:.1f}s\n")
    else:
        print("⊘  Step 2/5  Skipped sanitization\n")

    # ── Step 3: Split ─────────────────────────────────────────────────────────
    print("✂️  Step 3/5  Splitting into chunks …")
    t1 = time.perf_counter()
    chunks = split_documents(docs, args.chunk_size, args.chunk_overlap)
    if not chunks:
        logger.error("No valid chunks created from documents.")
        sys.exit(1)
    print(f"   Created {len(chunks)} chunks in {time.perf_counter() - t1:.1f}s\n")

    # ── Step 4: Load embedding model ──────────────────────────────────────────
    print("🔒 Step 4/5  Loading embedding model …")
    t2 = time.perf_counter()
    embedder = HuggingFaceEmbedder()
    print(f"   Model ready in {time.perf_counter() - t2:.1f}s\n")

    # ── Step 5: Build & persist vector store ──────────────────────────────────
    print("🗄️  Step 5/5  Building FAISS index …")
    t3 = time.perf_counter()
    store = VectorStore(embedder=embedder, index_path=VECTOR_DB_PATH)
    store.build(chunks)
    print(f"   Index saved in {time.perf_counter() - t3:.1f}s\n")

    # Total is measured from the start of step 1, so it covers all five steps.
    total = time.perf_counter() - t0
    print("=" * 60)
    print(f"  ✅ Ingestion complete in {total:.1f}s")
    print(f"     {len(chunks)} chunks indexed and saved to '{VECTOR_DB_PATH}'")
    print("=" * 60 + "\n")


# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()