# pdf_parser.py
import pickle

import cv2
import faiss
import fitz  # PyMuPDF
import numpy as np
import pytesseract
import torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Load CLIP model on CPU (openai/clip-vit-base-patch32 via Hugging Face transformers)
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model.eval().to("cpu")


def parse_pdf(pdf_path):
    """Render each PDF page, OCR it, embed it with CLIP, and persist a FAISS index plus page metadata."""
    doc = fitz.open(pdf_path)
    metadata = []
    all_embeddings = []

    for i, page in enumerate(doc):
        # Step 1: Render the page to a PNG image
        pix = page.get_pixmap(dpi=150)
        image_path = f"page_{i+1}.png"
        with open(image_path, "wb") as f:
            f.write(pix.tobytes("png"))

        # Step 2: OCR the rendered page
        image = Image.open(image_path).convert("RGB")
        ocr_text = pytesseract.image_to_string(image)

        # Step 3: Thumbnail for previews
        thumbnail_path = f"thumb_page_{i+1}.jpg"
        cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
        thumb = cv2.resize(cv_img, (250, 350))
        cv2.imwrite(thumbnail_path, thumb)

        # Step 4: Vision embedding (L2-normalized CLIP image features)
        inputs = processor(images=image, return_tensors="pt").to("cpu")
        with torch.no_grad():
            embedding = model.get_image_features(**inputs)[0].cpu().numpy()
        embedding /= np.linalg.norm(embedding)
        all_embeddings.append(embedding)

        metadata.append({
            "page": i + 1,
            "thumbnail": thumbnail_path,
            "ocr": ocr_text.strip()
        })

    # Step 5: Save FAISS index (L2 distance over normalized vectors)
    dim = all_embeddings[0].shape[0]
    index = faiss.IndexFlatL2(dim)
    index.add(np.array(all_embeddings).astype("float32"))
    faiss.write_index(index, "vision_index.faiss")

    # Step 6: Save metadata
    with open("metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    print(f"✅ Processed {len(doc)} pages.")
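

# --- Example usage: an illustrative sketch, not part of the original script. ---
# "sample.pdf" and the text prompt below are placeholders; substitute your own.
if __name__ == "__main__":
    parse_pdf("sample.pdf")

    # Reload the artifacts parse_pdf just wrote and run a quick sanity check.
    index = faiss.read_index("vision_index.faiss")
    with open("metadata.pkl", "rb") as f:
        pages = pickle.load(f)
    print(f"Index holds {index.ntotal} vectors for {len(pages)} pages.")

    # Optional: query the page images with a text prompt through CLIP's text tower,
    # normalized the same way as the image embeddings above.
    query = processor(text=["a page with a bar chart"], return_tensors="pt", padding=True)
    with torch.no_grad():
        q = model.get_text_features(**query)[0].cpu().numpy()
    q /= np.linalg.norm(q)

    k = min(3, index.ntotal)
    distances, ids = index.search(q.reshape(1, -1).astype("float32"), k)
    for rank, idx in enumerate(ids[0]):
        print(f"{rank + 1}. page {pages[idx]['page']} (L2 distance {distances[0][rank]:.3f})")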