Spaces:
Running
Running
"""
ingest.py
---------
One-time CLI script to load all documents from data/raw/,
sanitize them, split into chunks, embed, and persist the FAISS index.

Pipeline:
    1. Load documents from directory
    2. Sanitize text (remove noise, normalize encoding)
    3. Split into structured chunks with validation
    4. Load embedding model
    5. Build and persist FAISS vector store

Usage
-----
    python scripts/ingest.py
    python scripts/ingest.py --data-dir /path/to/docs
    python scripts/ingest.py --chunk-size 600 --chunk-overlap 60
    python scripts/ingest.py --skip-sanitize  # Skip sanitization step
"""
| import argparse | |
| import logging | |
| import sys | |
| import time | |
| from pathlib import Path | |
| # Ensure project root is importable | |
| sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) | |
| from app.config import CHUNK_OVERLAP, CHUNK_SIZE, DATA_RAW_DIR, VECTOR_DB_PATH | |
| from components.document_loader import load_documents_from_directory | |
| from components.embedder import HuggingFaceEmbedder | |
| from components.sanitizer import sanitize_documents | |
| from components.text_splitter import split_documents | |
| from components.vector_store import VectorStore | |
# Console logging for the CLI run: INFO and above, clock-time stamps,
# level names left-aligned to eight columns.
logging.basicConfig(
    format="%(asctime)s %(levelname)-8s %(message)s",
    datefmt="%H:%M:%S",
    level=logging.INFO,
)

# Module-level logger used for every error report in this script.
logger = logging.getLogger(__name__)
def parse_args() -> argparse.Namespace:
    """Parse and validate the command-line options of the ingestion script.

    Returns:
        Namespace with ``data_dir`` (str), ``chunk_size`` (int),
        ``chunk_overlap`` (int) and ``skip_sanitize`` (bool).

    Raises:
        SystemExit: Raised by argparse (status 2) on malformed options, when
            ``--chunk-size`` is not positive, or when ``--chunk-overlap`` is
            negative or not smaller than the chunk size.
    """
    parser = argparse.ArgumentParser(description="Ingest documents into FAISS vector store.")
    parser.add_argument(
        "--data-dir",
        type=str,
        default=str(DATA_RAW_DIR),
        help="Directory containing source documents (default: data/raw/)",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=CHUNK_SIZE,
        help=f"Characters per chunk (default: {CHUNK_SIZE})",
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=CHUNK_OVERLAP,
        help=f"Overlap between chunks (default: {CHUNK_OVERLAP})",
    )
    parser.add_argument(
        "--skip-sanitize",
        action="store_true",
        help="Skip document sanitization (not recommended)",
    )
    args = parser.parse_args()

    # Fail fast with a usage error instead of handing nonsensical chunk
    # geometry to the splitter after the documents have already been loaded.
    if args.chunk_size <= 0:
        parser.error("--chunk-size must be a positive integer")
    if not 0 <= args.chunk_overlap < args.chunk_size:
        parser.error("--chunk-overlap must be >= 0 and smaller than --chunk-size")

    return args
def main() -> None:
    """Run the five-step ingestion pipeline from the command line.

    Steps: load documents from ``--data-dir``, sanitize them (unless
    ``--skip-sanitize`` is given), split them into chunks, load the
    embedding model, and build the index through ``VectorStore.build``
    using ``VECTOR_DB_PATH`` as the index path.

    Progress and per-step timings are printed to stdout; failures are
    logged through the module logger.

    Raises:
        SystemExit: With status 1 when the data directory is missing or is
            not a directory, or when loading, sanitizing, or splitting
            yields nothing to work with.
    """
    args = parse_args()
    data_dir = Path(args.data_dir)

    # is_dir() rather than exists(): a path that points at a regular file
    # must be rejected here, not handed to the directory loader.
    if not data_dir.is_dir():
        logger.error("Data directory not found: %s", data_dir)
        sys.exit(1)

    print("\n" + "=" * 60)
    print(" RAG Chatbot — Document Ingestion")
    print("=" * 60)
    print(f" Source dir : {data_dir}")
    print(f" Chunk size : {args.chunk_size} chars")
    print(f" Overlap : {args.chunk_overlap} chars")
    print(f" Sanitization : {'ON' if not args.skip_sanitize else 'OFF'}")
    print(f" Index path : {VECTOR_DB_PATH}")
    print("=" * 60 + "\n")

    # All durations use perf_counter(): it is monotonic, so a system clock
    # adjustment during a long run cannot produce negative or inflated times.

    # ── Step 1: Load documents ────────────────────────────────────────────
    t0 = time.perf_counter()
    print("📂 Step 1/5 Loading documents …")
    docs = load_documents_from_directory(data_dir)
    if not docs:
        logger.error("No supported documents found in '%s'.", data_dir)
        sys.exit(1)
    print(f" Loaded {len(docs)} page(s) in {time.perf_counter() - t0:.1f}s\n")

    # ── Step 2: Sanitize ──────────────────────────────────────────────────
    if not args.skip_sanitize:
        print("🧹 Step 2/5 Sanitizing documents …")
        t_san = time.perf_counter()
        docs = sanitize_documents(docs)
        if not docs:
            logger.error("All documents were invalid after sanitization.")
            sys.exit(1)
        print(f" Sanitized in {time.perf_counter() - t_san:.1f}s\n")
    else:
        print("⏭ Step 2/5 Skipped sanitization\n")

    # ── Step 3: Split ─────────────────────────────────────────────────────
    print("✂️ Step 3/5 Splitting into chunks …")
    t1 = time.perf_counter()
    chunks = split_documents(docs, args.chunk_size, args.chunk_overlap)
    if not chunks:
        logger.error("No valid chunks created from documents.")
        sys.exit(1)
    print(f" Created {len(chunks)} chunks in {time.perf_counter() - t1:.1f}s\n")

    # ── Step 4: Load embedding model ──────────────────────────────────────
    print("🔢 Step 4/5 Loading embedding model …")
    t2 = time.perf_counter()
    embedder = HuggingFaceEmbedder()
    print(f" Model ready in {time.perf_counter() - t2:.1f}s\n")

    # ── Step 5: Build & persist vector store ──────────────────────────────
    print("🗄️ Step 5/5 Building FAISS index …")
    t3 = time.perf_counter()
    store = VectorStore(embedder=embedder, index_path=VECTOR_DB_PATH)
    store.build(chunks)
    print(f" Index saved in {time.perf_counter() - t3:.1f}s\n")

    # Total covers steps 1-5; the banner above is printed before t0 is taken.
    total = time.perf_counter() - t0
    print("=" * 60)
    print(f" ✅ Ingestion complete in {total:.1f}s")
    print(f" {len(chunks)} chunks indexed and saved to '{VECTOR_DB_PATH}'")
    print("=" * 60 + "\n")
# Script entry point: run the pipeline only when this file is executed
# directly, so importing the module has no ingestion side effects.
if __name__ == "__main__":
    main()