# core.py
"""Cover-image de-duplication backed by CLIP image embeddings.

Embeddings of previously seen cover images are kept in a JSON file
(``JSON_PATH``) that maps a string key to the embedding stored as a list of
floats. ``analyze_or_save`` compares a new image against every stored
embedding and either reports it as a duplicate or stores it.
"""
import json
import os

import numpy as np
import torch
from PIL import Image
from sklearn.metrics.pairwise import cosine_similarity
from transformers import CLIPModel, CLIPProcessor

from ilia3 import extract_text_from_pdf, find_jeld_param

MODEL_NAME = "openai/clip-vit-base-patch32"
# NOTE: the model and processor are loaded once, at import time.
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

JSON_PATH = "covers_embeddings.json"


def _load_db():
    """Return the stored embeddings dict, or an empty dict if no file exists."""
    if not os.path.exists(JSON_PATH):
        return {}
    # Context manager so the file handle is closed deterministically.
    with open(JSON_PATH, encoding="utf-8") as fh:
        return json.load(fh)


def _save_db(db):
    """Write the embeddings dict *db* to ``JSON_PATH`` as JSON."""
    # Context manager guarantees the data is flushed and the handle closed.
    with open(JSON_PATH, "w", encoding="utf-8") as fh:
        json.dump(db, fh)


def _get_embedding(pil_image):
    """Return the L2-normalised CLIP image embedding of *pil_image*.

    The image is run through the module-level ``processor`` and ``model``
    without gradient tracking; the result is moved to the CPU, converted to a
    NumPy array and squeezed.
    """
    inputs = processor(images=pil_image, return_tensors="pt")
    with torch.no_grad():
        emb = model.get_image_features(**inputs)
    # Scale to unit L2 norm along the last dimension.
    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
    return emb.cpu().numpy().squeeze()


def analyze_or_save(pdf_path, pil_image, custom_name=None, threshold=0.90):
    """Report *pil_image* as a duplicate of a stored cover, or store it.

    Args:
        pdf_path: Path of the PDF the image belongs to. Its file name without
            extension is the default storage key.
        pil_image: Image to embed and compare.
        custom_name: Optional key to use instead of the PDF base name. It is
            stripped of surrounding whitespace; an empty or whitespace-only
            value falls back to the PDF base name.
        threshold: Cosine similarity (not a percentage) strictly above which
            the image is treated as a duplicate.

    Returns:
        A dict with ``"status"`` (``"duplicate"`` or ``"new"``),
        ``"similarity"`` (highest cosine similarity times 100, or ``0.0`` when
        the store was empty) and either ``"similar_path"`` (key of the closest
        stored entry) or ``"saved_path"`` (key the embedding was saved under).
    """
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    stripped_name = custom_name.strip() if custom_name else ""
    key = stripped_name or base_name

    # Extract the text of pages 2 to 5 (page-range semantics are defined by
    # the ilia3 helper) and append the parameter found in it, if any.
    text = extract_text_from_pdf(pdf_path, pages=(2, 5))
    jeld_param = find_jeld_param(text)
    if jeld_param:
        key += f"_{jeld_param}"

    db = _load_db()
    new_emb = _get_embedding(pil_image)

    if not db:
        # Nothing to compare against: store unconditionally.
        db[key] = new_emb.tolist()
        _save_db(db)
        return {"status": "new", "similarity": 0.0, "saved_path": key}

    keys = list(db)
    embeddings = np.array(list(db.values()))
    sims = cosine_similarity(new_emb.reshape(1, -1), embeddings)[0]
    max_idx = int(sims.argmax())
    # Plain Python float rather than a NumPy scalar, so callers can serialise
    # the result without special handling.
    max_sim = float(sims[max_idx])

    # Honour the caller-supplied threshold instead of a hard-coded constant.
    if max_sim > threshold:
        return {
            "status": "duplicate",
            "similarity": max_sim * 100,
            "similar_path": keys[max_idx],
        }

    # NOTE(review): if *key* is already present, its embedding is overwritten
    # here -- confirm that this is the intended behaviour.
    db[key] = new_emb.tolist()
    _save_db(db)
    return {
        "status": "new",
        "similarity": max_sim * 100,
        "saved_path": key,
    }