|
|
|
|
|
|
|
|
import fitz |
|
|
import os |
|
|
import torch |
|
|
import pickle |
|
|
import numpy as np |
|
|
from PIL import Image |
|
|
import cv2 |
|
|
import pytesseract |
|
|
from transformers import CLIPProcessor, CLIPModel |
|
|
import faiss |
|
|
|
|
|
|
|
|
# Load the CLIP ViT-B/32 checkpoint once at import time so every call to
# parse_pdf reuses the same weights and preprocessing pipeline.
model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# Inference only: switch to evaluation mode and keep the weights on the CPU.
model.eval()
model.to("cpu")
|
|
|
|
|
def parse_pdf(pdf_path: str) -> None:
    """Build a visual search index over every page of a PDF.

    For each page, in order, this function:

    1. renders the page at 150 DPI and writes it to ``page_<n>.png``;
    2. runs Tesseract OCR on the rendered image;
    3. writes a 250x350 (width x height) thumbnail to ``thumb_page_<n>.jpg``;
    4. computes a CLIP image embedding and scales it to unit L2 norm.

    The embeddings are then added to a flat L2 FAISS index saved as
    ``vision_index.faiss``, and the per-page metadata (1-based page number,
    thumbnail path, stripped OCR text) is pickled to ``metadata.pkl``.
    Every output file is written relative to the current working directory,
    so a second call overwrites the results of the first.

    Args:
        pdf_path: Location of the PDF; passed unchanged to ``fitz.open``.

    Raises:
        ValueError: If the document has no pages, since no index can be
            built from zero embeddings.
    """
    metadata = []
    all_embeddings = []

    # The context manager guarantees the document is closed even when
    # rendering, OCR or embedding fails part-way through.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            # --- Render the page to a PNG on disk ---
            pix = page.get_pixmap(dpi=150)
            image_path = f"page_{page_number}.png"
            with open(image_path, "wb") as f:
                f.write(pix.tobytes("png"))

            # --- Load it back as RGB; convert() returns an independent
            # copy, so the file handle can be released immediately ---
            with Image.open(image_path) as rendered:
                image = rendered.convert("RGB")

            ocr_text = pytesseract.image_to_string(image)

            # --- Thumbnail (OpenCV expects BGR channel order) ---
            thumbnail_path = f"thumb_page_{page_number}.jpg"
            cv_img = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
            thumb = cv2.resize(cv_img, (250, 350))
            cv2.imwrite(thumbnail_path, thumb)

            # --- CLIP image embedding, normalised to unit length ---
            inputs = processor(images=image, return_tensors="pt").to("cpu")
            with torch.no_grad():
                embedding = model.get_image_features(**inputs)[0].cpu().numpy()
            embedding /= np.linalg.norm(embedding)

            all_embeddings.append(embedding)
            metadata.append({
                "page": page_number,
                "thumbnail": thumbnail_path,
                "ocr": ocr_text.strip(),
            })

    # Fail with a clear message instead of an IndexError on all_embeddings[0],
    # and before any index or metadata file is written.
    if not all_embeddings:
        raise ValueError(f"No pages found in {pdf_path!r}; nothing to index.")

    # --- Build and persist the FAISS index ---
    dim = all_embeddings[0].shape[0]
    index = faiss.IndexFlatL2(dim)
    index.add(np.asarray(all_embeddings, dtype="float32"))
    faiss.write_index(index, "vision_index.faiss")

    # --- Persist the per-page metadata alongside the index ---
    with open("metadata.pkl", "wb") as f:
        pickle.dump(metadata, f)

    # One metadata entry is appended per page, so this equals the page count.
    print(f"✅ Processed {len(metadata)} pages.")
|
|
|