rag-chatbot / scripts /ingest.py
Mobiworks's picture
Sync from GitHub via hub-sync
c302758 verified
Raw
History Blame Contribute Delete
5.33 kB
"""
ingest.py
---------
One-time CLI script to load all documents from data/raw/,
sanitize them, split into chunks, embed, and persist the FAISS index.
Pipeline:
1. Load documents from directory
2. Sanitize text (remove noise, normalize encoding)
3. Split into structured chunks with validation
4. Load embedding model
5. Build and persist FAISS vector store
Usage
-----
python scripts/ingest.py
python scripts/ingest.py --data-dir /path/to/docs
python scripts/ingest.py --chunk-size 600 --chunk-overlap 60
python scripts/ingest.py --skip-sanitize # Skip sanitization step
"""
import argparse
import logging
import sys
import time
from pathlib import Path
# Ensure project root is importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.config import CHUNK_OVERLAP, CHUNK_SIZE, DATA_RAW_DIR, VECTOR_DB_PATH
from components.document_loader import load_documents_from_directory
from components.embedder import HuggingFaceEmbedder
from components.sanitizer import sanitize_documents
from components.text_splitter import split_documents
from components.vector_store import VectorStore
# Configure root logging for this CLI run: INFO and above, each record
# prefixed with a time-of-day stamp and a padded level name.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used by the error paths in this script.
logger = logging.getLogger(__name__)
def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line options for an ingestion run.

    Returns:
        A namespace with ``data_dir`` (str), ``chunk_size`` (int),
        ``chunk_overlap`` (int) and ``skip_sanitize`` (bool).

    Invalid chunking values are rejected through ``parser.error``, which
    prints the usage message to stderr and exits with status 2. Rejected
    are a non-positive chunk size, a negative overlap, and an overlap that
    is not strictly smaller than the chunk size.
    """
    parser = argparse.ArgumentParser(description="Ingest documents into FAISS vector store.")
    parser.add_argument(
        "--data-dir",
        type=str,
        default=str(DATA_RAW_DIR),
        # %(default)s keeps the help text in sync with the configured default
        # instead of hard-coding a path that may differ from DATA_RAW_DIR.
        help="Directory containing source documents (default: %(default)s)",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=CHUNK_SIZE,
        help="Characters per chunk (default: %(default)s)",
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=CHUNK_OVERLAP,
        help="Overlap between chunks (default: %(default)s)",
    )
    parser.add_argument(
        "--skip-sanitize",
        action="store_true",
        help="Skip document sanitization (not recommended)",
    )
    args = parser.parse_args()

    # Fail fast on chunking parameters that cannot produce a sensible split,
    # rather than letting them surface later inside the splitter.
    if args.chunk_size <= 0:
        parser.error(f"--chunk-size must be a positive integer (got {args.chunk_size})")
    if args.chunk_overlap < 0:
        parser.error(f"--chunk-overlap must not be negative (got {args.chunk_overlap})")
    if args.chunk_overlap >= args.chunk_size:
        parser.error(
            f"--chunk-overlap ({args.chunk_overlap}) must be smaller than "
            f"--chunk-size ({args.chunk_size})"
        )
    return args
def main() -> None:
    """Run the five-step ingestion pipeline from the command line.

    Steps: load documents from the data directory, sanitize them (unless
    ``--skip-sanitize`` was given), split them into chunks, load the
    embedding model, and build the vector store at ``VECTOR_DB_PATH``
    through ``VectorStore.build``.

    Progress and timings are printed to stdout. The process exits with
    status 1 when the data directory is missing or is not a directory, or
    when loading, sanitizing or splitting leaves nothing to process.
    """
    args = parse_args()
    data_dir = Path(args.data_dir)

    # is_dir() rather than exists(): a path to a regular file is not a
    # usable document directory either.
    if not data_dir.is_dir():
        logger.error("Data directory not found or not a directory: %s", data_dir)
        sys.exit(1)

    print("\n" + "=" * 60)
    print(" RAG Chatbot — Document Ingestion")
    print("=" * 60)
    print(f" Source dir : {data_dir}")
    print(f" Chunk size : {args.chunk_size} chars")
    print(f" Overlap : {args.chunk_overlap} chars")
    print(f" Sanitization : {'ON' if not args.skip_sanitize else 'OFF'}")
    print(f" Index path : {VECTOR_DB_PATH}")
    print("=" * 60 + "\n")

    # ── Step 1: Load documents ──
    # perf_counter() is monotonic, so the reported durations cannot be
    # skewed by wall-clock adjustments during a long run.
    t0 = time.perf_counter()
    print("📄 Step 1/5 Loading documents …")
    docs = load_documents_from_directory(data_dir)
    if not docs:
        logger.error("No supported documents found in '%s'.", data_dir)
        sys.exit(1)
    print(f" Loaded {len(docs)} page(s) in {time.perf_counter() - t0:.1f}s\n")

    # ── Step 2: Sanitize ──
    if not args.skip_sanitize:
        print("🧹 Step 2/5 Sanitizing documents …")
        t_san = time.perf_counter()
        docs = sanitize_documents(docs)
        if not docs:
            logger.error("All documents were invalid after sanitization.")
            sys.exit(1)
        print(f" Sanitized in {time.perf_counter() - t_san:.1f}s\n")
    else:
        print("⊘ Step 2/5 Skipped sanitization\n")

    # ── Step 3: Split ──
    print("✂️ Step 3/5 Splitting into chunks …")
    t1 = time.perf_counter()
    chunks = split_documents(docs, args.chunk_size, args.chunk_overlap)
    if not chunks:
        logger.error("No valid chunks created from documents.")
        sys.exit(1)
    print(f" Created {len(chunks)} chunks in {time.perf_counter() - t1:.1f}s\n")

    # ── Step 4: Load embedding model ──
    print("🔢 Step 4/5 Loading embedding model …")
    t2 = time.perf_counter()
    embedder = HuggingFaceEmbedder()
    print(f" Model ready in {time.perf_counter() - t2:.1f}s\n")

    # ── Step 5: Build & persist vector store ──
    print("🗄️ Step 5/5 Building FAISS index …")
    t3 = time.perf_counter()
    store = VectorStore(embedder=embedder, index_path=VECTOR_DB_PATH)
    store.build(chunks)
    print(f" Index saved in {time.perf_counter() - t3:.1f}s\n")

    # Total is measured from the start of step 1, so it excludes argument
    # parsing and the banner.
    total = time.perf_counter() - t0
    print("=" * 60)
    print(f" ✅ Ingestion complete in {total:.1f}s")
    print(f" {len(chunks)} chunks indexed and saved to '{VECTOR_DB_PATH}'")
    print("=" * 60 + "\n")
# Run the ingestion pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()