Spaces:
Runtime error
Runtime error
| # core.py | |
| from ilia3 import extract_text_from_pdf, find_jeld_param | |
| import os | |
| from PIL import Image | |
| import torch | |
| from transformers import CLIPProcessor, CLIPModel | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| import numpy as np | |
| import json | |
# Hugging Face checkpoint identifier shared by the CLIP model and its processor.
MODEL_NAME = "openai/clip-vit-base-patch32"

# Both objects are created once, at import time, and reused by every call in
# this module.
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
model = CLIPModel.from_pretrained(MODEL_NAME)

# JSON file (resolved against the current working directory) that maps cover
# keys to their stored embedding vectors.
JSON_PATH = "covers_embeddings.json"
def _load_db(path=None):
    """Load the embeddings database from disk.

    Args:
        path: Optional path of the JSON file to read. When omitted, the
            module-level ``JSON_PATH`` is used.

    Returns:
        The parsed JSON content, or an empty dict when the file does not
        exist.
    """
    db_path = JSON_PATH if path is None else path
    if not os.path.exists(db_path):
        return {}
    # The context manager guarantees the handle is closed even if parsing
    # fails, instead of leaving that to the garbage collector.
    with open(db_path, encoding="utf-8") as fh:
        return json.load(fh)
def _save_db(db, path=None):
    """Write the embeddings database to disk as JSON.

    Args:
        db: JSON-serialisable object to persist (the callers in this module
            pass a dict mapping keys to lists of floats).
        path: Optional path of the JSON file to write. When omitted, the
            module-level ``JSON_PATH`` is used.

    Raises:
        TypeError: If ``db`` contains a value that ``json`` cannot serialise.
            The target file is left untouched in that case.
    """
    db_path = JSON_PATH if path is None else path
    # Serialise before opening the file: opening with "w" truncates it, so a
    # serialisation failure after that point would wipe the existing database.
    payload = json.dumps(db)
    # The context manager flushes and closes the handle deterministically.
    with open(db_path, "w", encoding="utf-8") as fh:
        fh.write(payload)
def _get_embedding(pil_image):
    """Compute the L2-normalised CLIP image embedding of ``pil_image``.

    The image is preprocessed with the module-level ``processor`` and encoded
    with the module-level ``model`` while gradient tracking is disabled.

    Args:
        pil_image: Image passed to the processor's ``images`` argument.

    Returns:
        A NumPy array holding the unit-length image features, with
        singleton dimensions squeezed out.
    """
    batch = processor(images=pil_image, return_tensors="pt")
    with torch.no_grad():
        features = model.get_image_features(**batch)
    # Scale each feature vector to unit L2 norm along the last dimension.
    lengths = features.norm(p=2, dim=-1, keepdim=True)
    unit_features = features / lengths
    return unit_features.cpu().numpy().squeeze()
def analyze_or_save(pdf_path, pil_image, custom_name=None, threshold=0.90):
    """Store a cover embedding, or report it as a duplicate of a stored one.

    The embedding of ``pil_image`` is compared (cosine similarity) against
    every embedding in the database. If the best match exceeds ``threshold``
    nothing is saved and the match is reported; otherwise the embedding is
    saved under a key derived from the names and the PDF text.

    Args:
        pdf_path: Path of the PDF; its base name (without extension) is the
            default key, and its text is scanned via ``find_jeld_param``.
        pil_image: Cover image to embed.
        custom_name: Optional key to use instead of the PDF base name.
            ``None``, empty, or whitespace-only values fall back to the
            base name.
        threshold: Cosine similarity (0-1 scale) above which the image is
            treated as a duplicate.

    Returns:
        A dict with ``"status"`` (``"duplicate"`` or ``"new"``),
        ``"similarity"`` (best similarity multiplied by 100, ``0.0`` when the
        database was empty) and either ``"similar_path"`` (key of the best
        match) or ``"saved_path"`` (key the embedding was stored under).
    """
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    # A whitespace-only custom name would otherwise produce an empty key.
    key = (custom_name or "").strip() or base_name

    # Extract the text of pages 2 to 5.
    # NOTE(review): whether the range is 1-based/inclusive depends on
    # extract_text_from_pdf -- confirm against that helper.
    text = extract_text_from_pdf(pdf_path, pages=(2, 5))
    # presumably the volume ("jeld") marker found in the text; verify in ilia3
    jeld_param = find_jeld_param(text)
    if jeld_param:
        key += f"_{jeld_param}"

    db = _load_db()
    new_emb = _get_embedding(pil_image)

    max_sim = 0.0
    if db:
        keys = list(db)
        embeddings = np.array(list(db.values()))
        sims = cosine_similarity(new_emb.reshape(1, -1), embeddings)[0]
        max_idx = int(sims.argmax())
        # Cast to a plain float so the returned dict holds no NumPy scalars.
        max_sim = float(sims[max_idx])
        # Honour the caller-supplied threshold rather than a fixed constant.
        if max_sim > threshold:
            return {
                "status": "duplicate",
                "similarity": max_sim * 100,
                "similar_path": keys[max_idx],
            }

    db[key] = new_emb.tolist()
    _save_db(db)
    return {
        "status": "new",
        "similarity": max_sim * 100,
        "saved_path": key,
    }