# Repository page residue (not part of the program):
# NeilDriscoll - "Upload 14 files" - commit 661743e (verified)
"""
Insta-AutoApp — OEM Manual Ingestion Pipeline
Converts the 2023 Ford Bronco Owner's Manual (PDF) into a FAISS vector index
for semantic retrieval. This script runs ONCE before the app can be used.
Usage:
1. Place the PDF in data/
2. Run: python ingest.py
3. Output: data/index.faiss + data/index.pkl
"""
import logging
import os
import pickle
import sys
import time
import faiss
import fitz # PyMuPDF
import numpy as np
from sentence_transformers import SentenceTransformer
from config import (
MANUAL_PDF_PATH, MANUAL_PDF_FILENAME,
FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH,
EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP,
)
# Root logging: INFO level, each record rendered as "timestamp - LEVEL - message".
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-level logger used by every step of the pipeline below.
logger = logging.getLogger(__name__)
# ── Low-value content filter ─────────────────────────────────────────
# Lower-case phrases that hint a page is front matter / navigation rather than
# manual content. A marker alone never rejects a page; see is_low_value().
LOW_VALUE_MARKERS = [
    "table of contents",
    "all rights reserved",
    "edition date",
    "visual search",
    "copyright",
    "printed in",
]


def is_low_value(text: str) -> bool:
    """Return True when a page's text is not worth indexing.

    The text is lower-cased and stripped, then rejected when any of these hold:

    * it is shorter than 50 characters;
    * it contains one of ``LOW_VALUE_MARKERS`` and more than 5% of its
      characters are periods (dot-leader layouts such as a table of contents);
    * it mentions "copyright" and is shorter than 300 characters.
    """
    body = text.lower().strip()
    size = len(body)

    # Near-empty pages carry nothing useful.
    if size < 50:
        return True

    # Marker phrase plus a high share of periods.
    dot_heavy = body.count(".") / max(size, 1) > 0.05
    if dot_heavy and any(phrase in body for phrase in LOW_VALUE_MARKERS):
        return True

    # Short page that mentions copyright.
    return "copyright" in body and size < 300
# ── Chunking ─────────────────────────────────────────────────────────
def chunk_text(text: str, page_num: int, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list:
    """Split page text into overlapping word-based chunks with page metadata.

    Args:
        text: Text of a single page; it is split on whitespace into words.
        page_num: Page number stored with every chunk produced from ``text``.
        chunk_size: Maximum number of words per chunk.
        overlap: Number of words shared by two consecutive chunks.

    Returns:
        A list of dicts with the keys ``"text"``, ``"page"`` and ``"source"``
        (``source`` is always ``MANUAL_PDF_FILENAME``). Chunks of 30 characters
        or fewer are dropped.

    Raises:
        ValueError: If ``chunk_size`` is not positive or ``overlap`` is not
            smaller than ``chunk_size``; the window would never advance.
    """
    step = chunk_size - overlap
    if chunk_size <= 0 or step <= 0:
        # A non-positive step used to spin forever, appending the same chunk.
        raise ValueError(
            "chunk_size must be positive and overlap must be smaller than "
            f"chunk_size (got chunk_size={chunk_size}, overlap={overlap})"
        )

    words = text.split()
    chunks = []
    for start in range(0, len(words), step):
        end = start + chunk_size
        # Joined split() output has no outer whitespace, so no strip is needed.
        piece = " ".join(words[start:end])
        if len(piece) > 30:
            chunks.append({
                "text": piece,
                "page": page_num,
                "source": MANUAL_PDF_FILENAME,
            })
        # This window already reached the last word; a further window would
        # only repeat words contained in the chunk just emitted.
        if end >= len(words):
            break
    return chunks
# ── Main pipeline ────────────────────────────────────────────────────
def main():
    """Build the FAISS index and chunk docstore from the manual PDF.

    Pipeline:
        1. Open the PDF at ``MANUAL_PDF_PATH`` and read each page's text.
        2. Skip pages flagged by ``is_low_value`` and chunk the rest with
           ``chunk_text`` (pages are numbered from 1).
        3. Embed all chunk texts with ``EMBEDDING_MODEL`` (normalized vectors).
        4. Add the vectors to an inner-product FAISS index.
        5. Write the index to ``FAISS_INDEX_PATH`` and pickle the chunk list
           to ``FAISS_DOCSTORE_PATH``.

    Exits the process with status 1 when the PDF is missing or when no chunks
    were produced.
    """
    banner = "=" * 60
    logger.info(banner)
    logger.info("Insta-AutoApp — FAISS Index Builder")
    logger.info(banner)

    # Check PDF exists
    if not os.path.exists(MANUAL_PDF_PATH):
        logger.error(
            "PDF not found: %s\nPlease place '%s' in the data/ directory.",
            MANUAL_PDF_PATH,
            MANUAL_PDF_FILENAME,
        )
        sys.exit(1)

    # Steps 1-2: Extract text from the PDF, filter pages, and chunk them.
    logger.info("Loading PDF: %s", MANUAL_PDF_PATH)
    all_chunks = []
    pages_kept = 0
    pages_skipped = 0
    # The context manager closes the document even if extraction raises.
    with fitz.open(MANUAL_PDF_PATH) as doc:
        logger.info("PDF loaded: %d pages", len(doc))
        for page_num, page in enumerate(doc, start=1):
            text = page.get_text()
            if is_low_value(text):
                pages_skipped += 1
                continue
            pages_kept += 1
            all_chunks.extend(chunk_text(text, page_num))

    logger.info("Pages processed: %d kept, %d filtered out", pages_kept, pages_skipped)
    logger.info("Chunks created: %d", len(all_chunks))
    if not all_chunks:
        logger.error("No chunks were created. Check the PDF file.")
        sys.exit(1)

    # Step 3: Embed chunks
    logger.info("Loading embedding model: %s", EMBEDDING_MODEL)
    embedder = SentenceTransformer(EMBEDDING_MODEL)
    logger.info("Embedding chunks (this may take 1-3 minutes)...")
    started = time.perf_counter()  # monotonic, unaffected by wall-clock changes
    texts = [chunk["text"] for chunk in all_chunks]
    embeddings = embedder.encode(texts, show_progress_bar=True, normalize_embeddings=True)
    # The index is fed a C-contiguous float32 matrix.
    embeddings = np.ascontiguousarray(embeddings, dtype=np.float32)
    elapsed = time.perf_counter() - started
    logger.info(
        "Embedding complete: %d vectors, %d dimensions (%.1fs)",
        embeddings.shape[0],
        embeddings.shape[1],
        elapsed,
    )

    # Step 4: Build FAISS index
    logger.info("Building FAISS index...")
    dimension = embeddings.shape[1]
    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine sim with normalized vecs)
    index.add(embeddings)
    logger.info("FAISS index built: %d vectors", index.ntotal)

    # Step 5: Save to disk
    # The index and the docstore may be configured in different directories,
    # so make sure both parents exist before writing.
    for target in (FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH):
        out_dir = os.path.dirname(target)
        if out_dir:
            os.makedirs(out_dir, exist_ok=True)
    faiss.write_index(index, FAISS_INDEX_PATH)
    # NOTE: the docstore is a pickle; only unpickle files produced by this script.
    with open(FAISS_DOCSTORE_PATH, "wb") as f:
        pickle.dump(all_chunks, f)
    logger.info("Index saved: %s", FAISS_INDEX_PATH)
    logger.info("Metadata saved: %s", FAISS_DOCSTORE_PATH)
    logger.info(banner)
    logger.info("DONE. You can now run: python app.py")
    logger.info(banner)
# Script entry point: run the ingestion pipeline only when this file is
# executed directly, not when it is imported.
if __name__ == "__main__":
    main()