| """ |
| Insta-AutoApp — OEM Manual Ingestion Pipeline |
| |
| Converts the 2023 Ford Bronco Owner's Manual (PDF) into a FAISS vector index |
| for semantic retrieval. This script runs ONCE before the app can be used. |
| |
| Usage: |
| 1. Place the PDF in data/ |
| 2. Run: python ingest.py |
| 3. Output: data/index.faiss + data/index.pkl |
| """ |
|
|
| import logging |
| import os |
| import pickle |
| import sys |
| import time |
|
|
| import faiss |
| import fitz |
| import numpy as np |
| from sentence_transformers import SentenceTransformer |
|
|
| from config import ( |
| MANUAL_PDF_PATH, MANUAL_PDF_FILENAME, |
| FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH, |
| EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP, |
| ) |
|
|
# Configure the root logger: INFO and above, each record prefixed with its
# timestamp and severity.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-scoped logger shared by the functions in this script.
logger = logging.getLogger(__name__)
|
|
|
|
| |
|
|
# Lowercase phrases whose presence makes a page a candidate for filtering
# (table-of-contents, copyright/legal, and navigation pages).
LOW_VALUE_MARKERS = [
    "table of contents",
    "all rights reserved",
    "edition date",
    "visual search",
    "copyright",
    "printed in",
]


def is_low_value(text: str) -> bool:
    """Decide whether a page's text should be excluded from the index.

    A page is treated as low value when, after lowercasing and trimming:

    * it is shorter than 50 characters, or
    * it contains one of ``LOW_VALUE_MARKERS`` and either more than 5% of
      its characters are dots (dot-leader lines) or it mentions
      "copyright" and is shorter than 300 characters.

    Args:
        text: Raw text of one page.

    Returns:
        True if the page should be skipped, False if it should be kept.
    """
    body = text.lower().strip()
    size = len(body)

    # Nearly empty pages carry nothing worth indexing.
    if size < 50:
        return True

    # Without any marker phrase the page is always kept.
    has_marker = False
    for marker in LOW_VALUE_MARKERS:
        if marker in body:
            has_marker = True
            break
    if not has_marker:
        return False

    # A high share of "." characters indicates dot-leader (TOC-style) lines.
    dot_share = body.count(".") / max(size, 1)
    if dot_share > 0.05:
        return True

    # Short pages mentioning copyright are treated as legal boilerplate.
    return "copyright" in body and size < 300
|
|
|
|
| |
|
|
def chunk_text(text: str, page_num: int, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list:
    """Split text into overlapping word-based chunks with page metadata.

    The text is split on whitespace and walked with a sliding window of
    ``chunk_size`` words that advances ``chunk_size - overlap`` words at a
    time. Iteration stops as soon as a window reaches the last word, so no
    trailing chunk that is wholly contained in the previous one is emitted.

    Args:
        text: Raw text of a single page.
        page_num: Page number recorded on every chunk.
        chunk_size: Maximum number of words per chunk; must be positive.
        overlap: Number of words shared by consecutive chunks; must satisfy
            ``0 <= overlap < chunk_size``.

    Returns:
        A list of dicts with keys ``"text"``, ``"page"`` and ``"source"``.
        Chunks of 30 characters or fewer are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
            outside ``[0, chunk_size)``. Without this check a non-positive
            step would make the window never advance.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= overlap < chunk_size:
        raise ValueError(
            f"overlap must satisfy 0 <= overlap < chunk_size, "
            f"got overlap={overlap}, chunk_size={chunk_size}"
        )

    words = text.split()
    total = len(words)
    step = chunk_size - overlap
    chunks = []
    for start in range(0, total, step):
        end = start + chunk_size
        # Joining already-split words yields no leading/trailing whitespace.
        piece = " ".join(words[start:end])
        if len(piece) > 30:
            chunks.append({
                "text": piece,
                "page": page_num,
                "source": MANUAL_PDF_FILENAME,
            })
        # This window already covers the tail; any further window would only
        # repeat words that are part of this chunk.
        if end >= total:
            break
    return chunks
|
|
|
|
| |
|
|
def main():
    """Build the FAISS index and chunk metadata from the manual PDF.

    Pipeline: verify the PDF exists, extract text page by page, drop pages
    flagged by ``is_low_value``, split the rest with ``chunk_text``, embed the
    chunks, add them to an inner-product FAISS index, and write the index to
    ``FAISS_INDEX_PATH`` and the pickled chunk list to ``FAISS_DOCSTORE_PATH``.

    Exits the process with status 1 when the PDF is missing or when no chunks
    were produced.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Insta-AutoApp β FAISS Index Builder")
    logger.info(banner)

    # --- 1. Verify the input PDF is present ---
    if not os.path.exists(MANUAL_PDF_PATH):
        logger.error(
            f"PDF not found: {MANUAL_PDF_PATH}\n"
            f"Please place '{MANUAL_PDF_FILENAME}' in the data/ directory."
        )
        sys.exit(1)

    # --- 2. Load the PDF and extract/filter/chunk its pages ---
    logger.info(f"Loading PDF: {MANUAL_PDF_PATH}")
    all_chunks = []
    pages_kept = 0
    pages_skipped = 0
    # The context manager closes the document even if extraction raises.
    with fitz.open(MANUAL_PDF_PATH) as doc:
        logger.info(f"PDF loaded: {len(doc)} pages")
        # Page numbers stored in the metadata are 1-based.
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if is_low_value(text):
                pages_skipped += 1
                continue
            pages_kept += 1
            all_chunks.extend(chunk_text(text, page_num))

    logger.info(f"Pages processed: {pages_kept} kept, {pages_skipped} filtered out")
    logger.info(f"Chunks created: {len(all_chunks)}")

    if not all_chunks:
        logger.error("No chunks were created. Check the PDF file.")
        sys.exit(1)

    # --- 3. Embed the chunks ---
    logger.info(f"Loading embedding model: {EMBEDDING_MODEL}")
    embedder = SentenceTransformer(EMBEDDING_MODEL)

    logger.info("Embedding chunks (this may take 1-3 minutes)...")
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments.
    start_time = time.perf_counter()
    texts = [c["text"] for c in all_chunks]
    embeddings = embedder.encode(texts, show_progress_bar=True, normalize_embeddings=True)
    embeddings = np.array(embeddings, dtype=np.float32)
    elapsed = time.perf_counter() - start_time
    logger.info(f"Embedding complete: {embeddings.shape[0]} vectors, {embeddings.shape[1]} dimensions ({elapsed:.1f}s)")

    # --- 4. Build the index ---
    # Embeddings were normalized above, so inner product equals cosine
    # similarity.
    logger.info("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)
    index.add(embeddings)
    logger.info(f"FAISS index built: {index.ntotal} vectors")

    # --- 5. Persist the index and the chunk metadata ---
    # Both outputs may live in different directories; make sure each exists.
    for out_path in (FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH):
        out_dir = os.path.dirname(out_path)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    faiss.write_index(index, FAISS_INDEX_PATH)
    with open(FAISS_DOCSTORE_PATH, "wb") as f:
        pickle.dump(all_chunks, f)

    logger.info(f"Index saved: {FAISS_INDEX_PATH}")
    logger.info(f"Metadata saved: {FAISS_DOCSTORE_PATH}")
    logger.info(banner)
    logger.info("DONE. You can now run: python app.py")
    logger.info(banner)


if __name__ == "__main__":
    main()
|
|