# VisLanRAG / src / pdf_parser.py
# Author: zach9111 — commit 72a3bea: "Fix import paths and user permissions for Streamlit"
# pdf_parser.py
import fitz # PyMuPDF
import os
import torch
import pickle
import numpy as np
from PIL import Image
import cv2
import pytesseract
from transformers import CLIPProcessor, CLIPModel
import faiss
# Load the CLIP ViT-B/32 vision-language model for page-image embeddings.
# NOTE(review): the old comment said "OpenCLIP", but this checkpoint is
# OpenAI's CLIP ("openai/clip-vit-base-patch32"), loaded via transformers.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
model.eval().to("cpu")  # inference mode, CPU-only deployment
def parse_pdf(pdf_path, output_dir=".", dpi=150, thumbnail_size=(250, 350)):
    """Parse a PDF into per-page OCR text, thumbnails, and CLIP embeddings.

    For each page: renders it to a PNG, runs Tesseract OCR on the render,
    saves a fixed-size JPEG thumbnail, and computes an L2-normalized CLIP
    image embedding. All embeddings go into a FAISS flat-L2 index written
    to ``vision_index.faiss``; per-page metadata is pickled to
    ``metadata.pkl`` (both inside ``output_dir``).

    Args:
        pdf_path: Path to the input PDF file.
        output_dir: Directory for page images, thumbnails, the FAISS index,
            and the metadata pickle. Defaults to the current directory,
            matching the original hard-coded behavior.
        dpi: Rasterization resolution for page rendering.
        thumbnail_size: ``(width, height)`` of the saved thumbnails.

    Returns:
        The list of per-page metadata dicts
        (``{"page", "thumbnail", "ocr"}``).

    Raises:
        ValueError: If the PDF contains no pages (previously this crashed
            later with an opaque ``IndexError``).
    """
    doc = fitz.open(pdf_path)
    try:
        if len(doc) == 0:
            raise ValueError(f"PDF has no pages: {pdf_path}")

        metadata = []
        all_embeddings = []
        for i, page in enumerate(doc):
            page_no = i + 1

            # Step 1: rasterize the page; keep a PIL copy for OCR and CLIP.
            pix = page.get_pixmap(dpi=dpi)
            image_path = os.path.join(output_dir, f"page_{page_no}.png")
            with open(image_path, "wb") as f:
                f.write(pix.tobytes())  # PyMuPDF emits PNG bytes by default
            image = Image.open(image_path).convert("RGB")

            # Step 2: OCR the rendered page.
            ocr_text = pytesseract.image_to_string(image)

            # Step 3: save a fixed-size JPEG thumbnail for the UI.
            thumbnail_path = os.path.join(output_dir, f"thumb_page_{page_no}.jpg")
            cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            cv2.imwrite(thumbnail_path, cv2.resize(cv_img, thumbnail_size))

            # Step 4: CLIP vision embedding, L2-normalized so the flat-L2
            # FAISS index behaves like cosine similarity.
            inputs = processor(images=image, return_tensors="pt").to("cpu")
            with torch.no_grad():
                embedding = model.get_image_features(**inputs)[0].cpu().numpy()
            norm = np.linalg.norm(embedding)
            if norm > 0:  # guard against divide-by-zero on a degenerate embedding
                embedding = embedding / norm
            all_embeddings.append(embedding)

            metadata.append({
                "page": page_no,
                "thumbnail": thumbnail_path,
                "ocr": ocr_text.strip(),
            })

        # Step 5: build and persist the FAISS index.
        dim = all_embeddings[0].shape[0]
        index = faiss.IndexFlatL2(dim)
        index.add(np.array(all_embeddings).astype("float32"))
        faiss.write_index(index, os.path.join(output_dir, "vision_index.faiss"))

        # Step 6: persist per-page metadata.
        with open(os.path.join(output_dir, "metadata.pkl"), "wb") as f:
            pickle.dump(metadata, f)

        print(f"✅ Processed {len(doc)} pages.")
        return metadata
    finally:
        doc.close()  # release the document handle (was leaked before)