Spaces:

NeilDriscoll
/

InstaAutoApp_v3_TeamDataMavericks

Sleeping

App Files Files Community

InstaAutoApp_v3_TeamDataMavericks / ingest.py

NeilDriscoll

Upload 14 files

661743e verified about 2 months ago

raw

history blame contribute delete

5.15 kB

	"""
	Insta-AutoApp — OEM Manual Ingestion Pipeline

	Converts the 2023 Ford Bronco Owner's Manual (PDF) into a FAISS vector index
	for semantic retrieval. This script runs ONCE before the app can be used.

	Usage:
	1. Place the PDF in data/
	2. Run: python ingest.py
	3. Output: data/index.faiss + data/index.pkl
	"""

	import logging
	import os
	import pickle
	import sys
	import time

	import faiss
	import fitz # PyMuPDF
	import numpy as np
	from sentence_transformers import SentenceTransformer

	from config import (
	MANUAL_PDF_PATH, MANUAL_PDF_FILENAME,
	FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH,
	EMBEDDING_MODEL, CHUNK_SIZE, CHUNK_OVERLAP,
	)

	# Emit INFO-and-above records, each prefixed with a timestamp and severity.
	logging.basicConfig(
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)
	# Module-scoped logger shared by the functions in this script.
	logger = logging.getLogger(__name__)


	# ── Low-value content filter ─────────────────────────────────────────

	# Lower-case phrases that flag a page as a candidate for filtering.
	# is_low_value() matches them as substrings of the lower-cased page text.
	LOW_VALUE_MARKERS = [
	    "table of contents",
	    "all rights reserved",
	    "edition date",
	    "visual search",
	    "copyright",
	    "printed in",
	]


	def is_low_value(text: str) -> bool:
	    """Return True when a page should be left out of the index.

	    The text is lower-cased and stripped, then judged as follows:

	    * fewer than 50 characters -> low value;
	    * otherwise, only pages containing one of ``LOW_VALUE_MARKERS`` can be
	      rejected, and only if either
	        - more than 5% of the characters are periods (presumably aimed at
	          table-of-contents dot leaders), or
	        - the page mentions "copyright" and is shorter than 300 characters.

	    Everything else is kept.
	    """
	    normalized = text.lower().strip()
	    length = len(normalized)

	    # Near-empty pages carry nothing worth retrieving.
	    if length < 50:
	        return True

	    # Pages without any marker phrase are always kept.
	    if not any(marker in normalized for marker in LOW_VALUE_MARKERS):
	        return False

	    period_share = normalized.count(".") / max(length, 1)
	    if period_share > 0.05:
	        return True

	    # Short pages that mention copyright are treated as legal boilerplate.
	    return "copyright" in normalized and length < 300


	# ── Chunking ─────────────────────────────────────────────────────────

	def chunk_text(text: str, page_num: int, chunk_size: int = CHUNK_SIZE, overlap: int = CHUNK_OVERLAP) -> list:
	    """Split page text into overlapping word windows with page metadata.

	    Args:
	        text: Page text; it is split on whitespace into words.
	        page_num: Page number stored in each chunk's ``"page"`` field.
	        chunk_size: Maximum number of words per chunk. Must be positive.
	        overlap: Number of words shared by consecutive chunks. Must satisfy
	            ``0 <= overlap < chunk_size``.

	    Returns:
	        A list of dicts with keys ``"text"``, ``"page"`` and ``"source"``
	        (the latter set to ``MANUAL_PDF_FILENAME``). Chunks whose text is
	        30 characters or shorter are dropped.

	    Raises:
	        ValueError: If ``chunk_size`` is not positive or ``overlap`` is
	            outside ``[0, chunk_size)``. Without this check a step of zero
	            or less would make the windowing loop run forever.
	    """
	    if chunk_size <= 0:
	        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
	    if not 0 <= overlap < chunk_size:
	        raise ValueError(
	            f"overlap must satisfy 0 <= overlap < chunk_size, "
	            f"got overlap={overlap}, chunk_size={chunk_size}"
	        )

	    words = text.split()
	    step = chunk_size - overlap
	    chunks = []
	    for start in range(0, len(words), step):
	        end = start + chunk_size
	        # Joined words never carry leading/trailing whitespace, so no strip
	        # is needed before measuring or storing the piece.
	        piece = " ".join(words[start:end])
	        if len(piece) > 30:
	            chunks.append({
	                "text": piece,
	                "page": page_num,
	                "source": MANUAL_PDF_FILENAME,
	            })
	        # Once a window reaches the last word, any later window would only
	        # repeat a suffix of this one; stop to avoid duplicate chunks.
	        if end >= len(words):
	            break
	    return chunks


	# ── Main pipeline ────────────────────────────────────────────────────

	def _extract_chunks(pdf_path: str) -> tuple:
	    """Open the PDF, filter low-value pages, and chunk the remaining text.

	    Returns a ``(chunks, pages_kept, pages_skipped)`` tuple. The document is
	    closed even if extraction raises.
	    """
	    chunks = []
	    pages_kept = 0
	    pages_skipped = 0
	    doc = fitz.open(pdf_path)
	    try:
	        logger.info(f"PDF loaded: {len(doc)} pages")
	        # Page numbers stored in the chunk metadata are 1-based.
	        for page_num, page in enumerate(doc, start=1):
	            text = page.get_text()
	            if is_low_value(text):
	                pages_skipped += 1
	                continue
	            pages_kept += 1
	            chunks.extend(chunk_text(text, page_num))
	    finally:
	        doc.close()
	    return chunks, pages_kept, pages_skipped


	def _save_artifacts(index, chunks: list) -> None:
	    """Write the FAISS index and the pickled chunk list to their paths.

	    Parent directories of both output paths are created when missing.
	    """
	    for path in (FAISS_INDEX_PATH, FAISS_DOCSTORE_PATH):
	        parent = os.path.dirname(path)
	        if parent:
	            os.makedirs(parent, exist_ok=True)
	    faiss.write_index(index, FAISS_INDEX_PATH)
	    # NOTE: the docstore is a pickle; only load it back from trusted files.
	    with open(FAISS_DOCSTORE_PATH, "wb") as f:
	        pickle.dump(chunks, f)


	def main():
	    """Build the FAISS index and chunk docstore from the manual PDF.

	    Pipeline: check that the PDF exists, extract and filter its pages,
	    chunk the kept text, embed the chunks with normalized vectors, add them
	    to an inner-product FAISS index, and write the index plus the pickled
	    chunk list to disk.

	    Exits with status 1 (via ``sys.exit``) when the PDF is missing or when
	    no chunks could be produced from it.
	    """
	    logger.info("=" * 60)
	    logger.info("Insta-AutoApp — FAISS Index Builder")
	    logger.info("=" * 60)

	    # Check PDF exists
	    if not os.path.exists(MANUAL_PDF_PATH):
	        logger.error(
	            f"PDF not found: {MANUAL_PDF_PATH}\n"
	            f"Please place '{MANUAL_PDF_FILENAME}' in the data/ directory."
	        )
	        sys.exit(1)

	    # Steps 1-2: Extract text from the PDF, filter pages, and chunk
	    logger.info(f"Loading PDF: {MANUAL_PDF_PATH}")
	    all_chunks, pages_kept, pages_skipped = _extract_chunks(MANUAL_PDF_PATH)
	    logger.info(f"Pages processed: {pages_kept} kept, {pages_skipped} filtered out")
	    logger.info(f"Chunks created: {len(all_chunks)}")

	    if not all_chunks:
	        logger.error("No chunks were created. Check the PDF file.")
	        sys.exit(1)

	    # Step 3: Embed chunks
	    logger.info(f"Loading embedding model: {EMBEDDING_MODEL}")
	    embedder = SentenceTransformer(EMBEDDING_MODEL)

	    logger.info("Embedding chunks (this may take 1-3 minutes)...")
	    # perf_counter is monotonic, so the elapsed time cannot be skewed by
	    # system clock adjustments.
	    start_time = time.perf_counter()
	    texts = [c["text"] for c in all_chunks]
	    embeddings = embedder.encode(texts, show_progress_bar=True, normalize_embeddings=True)
	    embeddings = np.array(embeddings, dtype=np.float32)
	    elapsed = time.perf_counter() - start_time
	    logger.info(f"Embedding complete: {embeddings.shape[0]} vectors, {embeddings.shape[1]} dimensions ({elapsed:.1f}s)")

	    # Step 4: Build FAISS index
	    logger.info("Building FAISS index...")
	    dimension = embeddings.shape[1]
	    index = faiss.IndexFlatIP(dimension)  # Inner product (cosine sim with normalized vecs)
	    index.add(embeddings)
	    logger.info(f"FAISS index built: {index.ntotal} vectors")

	    # Step 5: Save to disk
	    _save_artifacts(index, all_chunks)

	    logger.info(f"Index saved: {FAISS_INDEX_PATH}")
	    logger.info(f"Metadata saved: {FAISS_DOCSTORE_PATH}")
	    logger.info("=" * 60)
	    logger.info("DONE. You can now run: python app.py")
	    logger.info("=" * 60)


	# Run the ingestion pipeline only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main()