"""
Insta-AutoApp — OEM Manual Ingestion Pipeline

Converts the 2023 Ford Bronco Owner's Manual (PDF) into a FAISS vector
index for semantic retrieval. This script runs ONCE before the app can
be used.

Usage:
    1. Place the PDF in data/
    2. Run: python ingest.py
    3. Output: data/index.faiss + data/index.pkl
"""

import logging
import os
import pickle
import sys
import time

import faiss
import fitz  # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer

from config import (
    MANUAL_PDF_PATH,
    MANUAL_PDF_FILENAME,
    FAISS_INDEX_PATH,
    FAISS_DOCSTORE_PATH,
    EMBEDDING_MODEL,
    CHUNK_SIZE,
    CHUNK_OVERLAP,
)

logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
logger = logging.getLogger(__name__)


# ── Low-value content filter ─────────────────────────────────────────
LOW_VALUE_MARKERS = [
    "table of contents",
    "all rights reserved",
    "edition date",
    "visual search",
    "copyright",
    "printed in",
]


def is_low_value(text: str) -> bool:
    """Filter out TOC, copyright, legal, and navigation-only pages.

    A page is considered low-value when any of the following holds:

    * its stripped text is shorter than 50 characters;
    * it contains one of ``LOW_VALUE_MARKERS`` and more than 5% of its
      characters are dots (typical of dotted table-of-contents leaders);
    * it contains one of ``LOW_VALUE_MARKERS``, mentions "copyright", and
      is shorter than 300 characters.

    Args:
        text: Raw text extracted from a single page.

    Returns:
        True if the page should be skipped, False if it should be indexed.
    """
    lowered = text.lower().strip()
    if len(lowered) < 50:
        return True

    if not any(marker in lowered for marker in LOW_VALUE_MARKERS):
        return False

    # A marker alone is not enough to drop a page; it must also look like
    # a TOC (dot leaders) or a short copyright notice.
    dots_ratio = lowered.count(".") / max(len(lowered), 1)
    if dots_ratio > 0.05:
        return True
    return "copyright" in lowered and len(lowered) < 300


# ── Chunking ─────────────────────────────────────────────────────────
def chunk_text(text: str, page_num: int, chunk_size: int = CHUNK_SIZE,
               overlap: int = CHUNK_OVERLAP) -> list:
    """Split text into overlapping word-based chunks with page metadata.

    The text is split on whitespace and walked with a sliding window of
    ``chunk_size`` words that advances ``chunk_size - overlap`` words per
    step. Chunks of 30 characters or fewer are dropped. Windowing stops as
    soon as a window reaches the end of the text, so no trailing chunk is
    emitted that consists solely of words already covered by the previous
    chunk.

    Args:
        text: Page text to split.
        page_num: Page number stored in each chunk's ``"page"`` field.
        chunk_size: Window size in words; must be positive.
        overlap: Words shared between consecutive windows; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts with keys ``"text"``, ``"page"`` and ``"source"``.

    Raises:
        ValueError: If ``chunk_size``/``overlap`` would make the window
            fail to advance (previously an infinite loop) or skip words.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    step = chunk_size - overlap
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        # Words come from str.split(), so the joined piece has no
        # leading/trailing whitespace to strip.
        piece = " ".join(words[start:end])
        if len(piece) > 30:
            chunks.append({
                "text": piece,
                "page": page_num,
                "source": MANUAL_PDF_FILENAME,
            })
        if end >= len(words):
            # This window already reached the last word; any further
            # window would only repeat the overlap tail.
            break
    return chunks


# ── Main pipeline ────────────────────────────────────────────────────
def _extract_chunks(pdf_path: str) -> tuple:
    """Read the PDF, drop low-value pages, and chunk the remaining ones.

    Args:
        pdf_path: Path of the PDF to open with PyMuPDF.

    Returns:
        A ``(chunks, pages_kept, pages_skipped)`` tuple, where ``chunks``
        is the concatenation of ``chunk_text`` results for every kept
        page (pages are numbered from 1).
    """
    logger.info(f"Loading PDF: {pdf_path}")
    doc = fitz.open(pdf_path)
    try:
        logger.info(f"PDF loaded: {len(doc)} pages")

        all_chunks = []
        pages_kept = 0
        pages_skipped = 0
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if is_low_value(text):
                pages_skipped += 1
                continue
            pages_kept += 1
            all_chunks.extend(chunk_text(text, page_num))
    finally:
        # Release the document even if text extraction or chunking fails.
        doc.close()

    return all_chunks, pages_kept, pages_skipped


def main():
    """Build the FAISS index and chunk docstore from the manual PDF.

    Steps: verify the PDF exists, extract and filter page text, chunk it,
    embed the chunks with the configured SentenceTransformer model, add
    the normalized embeddings to an inner-product FAISS index, and write
    the index plus the pickled chunk list to disk.

    Exits the process with status 1 if the PDF is missing or if no chunks
    could be produced.
    """
    logger.info("=" * 60)
    logger.info("Insta-AutoApp — FAISS Index Builder")
    logger.info("=" * 60)

    # Check PDF exists
    if not os.path.exists(MANUAL_PDF_PATH):
        logger.error(
            f"PDF not found: {MANUAL_PDF_PATH}\n"
            f"Please place '{MANUAL_PDF_FILENAME}' in the data/ directory."
        )
        sys.exit(1)

    # Steps 1-2: Extract text from PDF, filter pages, and chunk
    all_chunks, pages_kept, pages_skipped = _extract_chunks(MANUAL_PDF_PATH)

    logger.info(f"Pages processed: {pages_kept} kept, {pages_skipped} filtered out")
    logger.info(f"Chunks created: {len(all_chunks)}")

    if not all_chunks:
        logger.error("No chunks were created. Check the PDF file.")
        sys.exit(1)

    # Step 3: Embed chunks
    logger.info(f"Loading embedding model: {EMBEDDING_MODEL}")
    embedder = SentenceTransformer(EMBEDDING_MODEL)

    logger.info("Embedding chunks (this may take 1-3 minutes)...")
    start_time = time.time()
    texts = [c["text"] for c in all_chunks]
    embeddings = embedder.encode(texts, show_progress_bar=True, normalize_embeddings=True)
    embeddings = np.array(embeddings, dtype=np.float32)
    elapsed = time.time() - start_time
    logger.info(f"Embedding complete: {embeddings.shape[0]} vectors, {embeddings.shape[1]} dimensions ({elapsed:.1f}s)")

    # Step 4: Build FAISS index
    logger.info("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine sim with normalized vecs)
    index.add(embeddings)
    logger.info(f"FAISS index built: {index.ntotal} vectors")

    # Step 5: Save to disk
    # Both outputs may live in different directories; make sure each exists.
    for out_path in (FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)

    faiss.write_index(index, FAISS_INDEX_PATH)
    # NOTE: the docstore is a pickle; whatever loads it must only read it
    # from a trusted location, since unpickling can execute arbitrary code.
    with open(FAISS_DOCSTORE_PATH, "wb") as f:
        pickle.dump(all_chunks, f)

    logger.info(f"Index saved: {FAISS_INDEX_PATH}")
    logger.info(f"Metadata saved: {FAISS_DOCSTORE_PATH}")
    logger.info("=" * 60)
    logger.info("DONE. You can now run: python app.py")
    logger.info("=" * 60)


if __name__ == "__main__":
    main()