Spaces:

F-allahmoradi
/

BookCover

Runtime error

App Files Files Community

F-allahmoradi committed on Oct 21, 2025

Commit

64367bb

verified ·

1 Parent(s): eee453b

Upload core.py

Browse files

Files changed (1) hide show

core.py +68 -0

core.py ADDED Viewed

	@@ -0,0 +1,68 @@

+# core.py
+from ilia3 import extract_text_from_pdf, find_jeld_param
+import os
+from PIL import Image
+import torch
+from transformers import CLIPProcessor, CLIPModel
+from sklearn.metrics.pairwise import cosine_similarity
+import numpy as np
+import json
+MODEL_NAME = "openai/clip-vit-base-patch32"  # checkpoint identifier passed to both from_pretrained calls below
+model = CLIPModel.from_pretrained(MODEL_NAME)  # loaded once at import time; used by _get_embedding
+processor = CLIPProcessor.from_pretrained(MODEL_NAME)  # image preprocessing loaded from the same checkpoint
+JSON_PATH = "covers_embeddings.json"  # relative path of the JSON embedding store used by _load_db / _save_db
+def _load_db():
+    return json.load(open(JSON_PATH)) if os.path.exists(JSON_PATH) else {}
+def _save_db(db):
+    json.dump(db, open(JSON_PATH, "w"))
+def _get_embedding(pil_image):
+    inputs = processor(images=pil_image, return_tensors="pt")
+    with torch.no_grad():
+        emb = model.get_image_features(**inputs)
+    emb = emb / emb.norm(p=2, dim=-1, keepdim=True)
+    return emb.cpu().numpy().squeeze()
+def analyze_or_save(pdf_path, pil_image, custom_name=None, threshold=0.90):
+    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
+    key = custom_name.strip() if custom_name else base_name
+    # استخراج متن صفحات ۲ تا ۵
+    text = extract_text_from_pdf(pdf_path, pages=(2, 5))
+    jeld_param = find_jeld_param(text)
+    if jeld_param:
+        key += f"_{jeld_param}"
+    db = _load_db()
+    new_emb = _get_embedding(pil_image)
+    if not db:
+        db[key] = new_emb.tolist()
+        _save_db(db)
+        return {"status": "new", "similarity": 0.0, "saved_path": key}
+    keys = list(db.keys())
+    embeddings = np.array([np.array(v) for v in db.values()])
+    sims = cosine_similarity(new_emb.reshape(1, -1), embeddings)[0]
+    max_sim = sims.max()
+    max_idx = sims.argmax()
+    most_similar_key = keys[max_idx]
+    if max_sim > 0.90:
+        return {
+            "status": "duplicate",
+            "similarity": max_sim * 100,
+            "similar_path": most_similar_key
+        }
+    db[key] = new_emb.tolist()
+    _save_db(db)
+    return {
+        "status": "new",
+        "similarity": max_sim * 100,
+        "saved_path": key
+    }