"""
ingest.py
---------
One-time CLI script to load all documents from data/raw/,
sanitize them, split into chunks, embed, and persist the FAISS index.
Pipeline:
1. Load documents from directory
2. Sanitize text (remove noise, normalize encoding)
3. Split into structured chunks with validation
4. Load embedding model
5. Build and persist FAISS vector store
Usage
-----
python scripts/ingest.py
python scripts/ingest.py --data-dir /path/to/docs
python scripts/ingest.py --chunk-size 600 --chunk-overlap 60
python scripts/ingest.py --skip-sanitize # Skip sanitization step
"""
import argparse
import logging
import sys
import time
from pathlib import Path
# Ensure project root is importable
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from app.config import CHUNK_OVERLAP, CHUNK_SIZE, DATA_RAW_DIR, VECTOR_DB_PATH
from components.document_loader import load_documents_from_directory
from components.embedder import HuggingFaceEmbedder
from components.sanitizer import sanitize_documents
from components.text_splitter import split_documents
from components.vector_store import VectorStore
# Root logging setup: INFO and above, each record stamped with a clock
# time (HH:MM:SS) and a level name left-justified to eight columns.
_LOG_FORMAT = "%(asctime)s %(levelname)-8s %(message)s"
_LOG_DATEFMT = "%H:%M:%S"
logging.basicConfig(format=_LOG_FORMAT, datefmt=_LOG_DATEFMT, level=logging.INFO)

# Module-level logger used by this script for error reporting.
logger = logging.getLogger(__name__)
def parse_args(argv: "list[str] | None" = None) -> argparse.Namespace:
    """Parse and validate the command-line options of the ingestion script.

    Args:
        argv: Argument strings to parse. When ``None`` (the default),
            argparse reads them from ``sys.argv[1:]``.

    Returns:
        Namespace with ``data_dir`` (str), ``chunk_size`` (int),
        ``chunk_overlap`` (int) and ``skip_sanitize`` (bool).

    Raises:
        SystemExit: Raised by argparse (status 2, after printing usage) when
            an option is malformed, ``--chunk-size`` is not positive, or
            ``--chunk-overlap`` is negative or not smaller than the chunk
            size.
    """
    parser = argparse.ArgumentParser(description="Ingest documents into FAISS vector store.")
    parser.add_argument(
        "--data-dir",
        type=str,
        default=str(DATA_RAW_DIR),
        help="Directory containing source documents (default: data/raw/)",
    )
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=CHUNK_SIZE,
        help=f"Characters per chunk (default: {CHUNK_SIZE})",
    )
    parser.add_argument(
        "--chunk-overlap",
        type=int,
        default=CHUNK_OVERLAP,
        help=f"Overlap between chunks (default: {CHUNK_OVERLAP})",
    )
    parser.add_argument(
        "--skip-sanitize",
        action="store_true",
        help="Skip document sanitization (not recommended)",
    )
    args = parser.parse_args(argv)

    # Fail fast on sizes that cannot describe a usable chunking scheme,
    # instead of handing them to the splitter later in the pipeline.
    if args.chunk_size <= 0:
        parser.error("--chunk-size must be a positive integer")
    if not 0 <= args.chunk_overlap < args.chunk_size:
        parser.error("--chunk-overlap must be >= 0 and smaller than --chunk-size")
    return args
def main() -> None:
    """Run the five-step ingestion pipeline from the command line.

    Steps: load documents from the data directory, sanitize them (unless
    ``--skip-sanitize`` was given), split them into chunks, create the
    embedder, then build the vector store configured with
    ``VECTOR_DB_PATH`` as its index path.

    Progress is printed to stdout. Failures are reported through the
    module logger and end the process with exit status 1: the data path
    is missing or is not a directory, no documents were loaded,
    sanitization left no documents, or splitting produced no chunks.
    """
    args = parse_args()
    data_dir = Path(args.data_dir)
    if not data_dir.exists():
        logger.error("Data directory not found: %s", data_dir)
        sys.exit(1)
    if not data_dir.is_dir():
        # An existing regular file would pass exists() but is not a
        # directory of source documents.
        logger.error("Data path is not a directory: %s", data_dir)
        sys.exit(1)

    # NOTE(review): the non-ASCII glyphs in the status strings below look
    # mis-decoded (mojibake); confirm against the original file encoding.
    rule = "=" * 60
    sanitize_label = "OFF" if args.skip_sanitize else "ON"
    print("\n" + rule)
    print(" RAG Chatbot β Document Ingestion")
    print(rule)
    print(f" Source dir : {data_dir}")
    print(f" Chunk size : {args.chunk_size} chars")
    print(f" Overlap : {args.chunk_overlap} chars")
    print(f" Sanitization : {sanitize_label}")
    print(f" Index path : {VECTOR_DB_PATH}")
    print(rule + "\n")

    # perf_counter() is monotonic, so the reported durations cannot be
    # skewed by wall-clock adjustments during a long run.
    run_start = time.perf_counter()

    # -- Step 1: Load documents --
    print("π Step 1/5 Loading documents β¦")
    docs = load_documents_from_directory(data_dir)
    if not docs:
        logger.error("No supported documents found in '%s'.", data_dir)
        sys.exit(1)
    print(f" Loaded {len(docs)} page(s) in {time.perf_counter() - run_start:.1f}s\n")

    # -- Step 2: Sanitize --
    if not args.skip_sanitize:
        print("π§Ή Step 2/5 Sanitizing documents β¦")
        step_start = time.perf_counter()
        docs = sanitize_documents(docs)
        if not docs:
            logger.error("All documents were invalid after sanitization.")
            sys.exit(1)
        print(f" Sanitized in {time.perf_counter() - step_start:.1f}s\n")
    else:
        print("β Step 2/5 Skipped sanitization\n")

    # -- Step 3: Split --
    print("βοΈ Step 3/5 Splitting into chunks β¦")
    step_start = time.perf_counter()
    chunks = split_documents(docs, args.chunk_size, args.chunk_overlap)
    if not chunks:
        logger.error("No valid chunks created from documents.")
        sys.exit(1)
    print(f" Created {len(chunks)} chunks in {time.perf_counter() - step_start:.1f}s\n")

    # -- Step 4: Load embedding model --
    print("π’ Step 4/5 Loading embedding model β¦")
    step_start = time.perf_counter()
    embedder = HuggingFaceEmbedder()
    print(f" Model ready in {time.perf_counter() - step_start:.1f}s\n")

    # -- Step 5: Build & persist vector store --
    print("ποΈ Step 5/5 Building FAISS index β¦")
    step_start = time.perf_counter()
    store = VectorStore(embedder=embedder, index_path=VECTOR_DB_PATH)
    store.build(chunks)
    print(f" Index saved in {time.perf_counter() - step_start:.1f}s\n")

    total = time.perf_counter() - run_start
    print(rule)
    # This literal was split across two physical lines, which is a
    # syntax error for a single-quoted f-string; it is one line again.
    print(f" β Ingestion complete in {total:.1f}s")
    print(f" {len(chunks)} chunks indexed and saved to '{VECTOR_DB_PATH}'")
    print(rule + "\n")
# Run the ingestion pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|