File size: 2,020 Bytes
64367bb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
# core.py
from ilia3 import extract_text_from_pdf, find_jeld_param
import os
from PIL import Image
import torch
from transformers import CLIPProcessor, CLIPModel
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
import json

# Identifier of the CLIP checkpoint passed to `from_pretrained` below.
MODEL_NAME = "openai/clip-vit-base-patch32"
# The model and its processor are created once, at import time, and reused by
# `_get_embedding` for every image.
# NOTE(review): `from_pretrained` presumably fetches/caches the weights on
# first use, which would make importing this module slow — confirm.
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)
# JSON file mapping a cover key to its embedding (stored as a list of numbers).
# The path is relative, so it resolves against the current working directory.
JSON_PATH = "covers_embeddings.json"

def _load_db():
    """Return the embeddings database stored at ``JSON_PATH``.

    Returns:
        The decoded JSON content of ``JSON_PATH``, or an empty dict when the
        file does not exist.

    Raises:
        json.JSONDecodeError: If the file exists but is not valid JSON.
    """
    if not os.path.exists(JSON_PATH):
        return {}
    # Use a context manager so the handle is closed deterministically instead
    # of whenever the garbage collector gets to it.
    with open(JSON_PATH, encoding="utf-8") as fh:
        return json.load(fh)

def _save_db(db):
    """Write ``db`` to ``JSON_PATH`` as JSON, replacing any previous content.

    Args:
        db: JSON-serializable object to persist (callers in this file pass a
            dict mapping keys to lists of numbers).
    """
    # The context manager guarantees the data is flushed and the handle is
    # closed before returning, even if serialization raises part-way through.
    with open(JSON_PATH, "w", encoding="utf-8") as fh:
        json.dump(db, fh)

def _get_embedding(pil_image):
    """Compute the L2-normalized CLIP image embedding of ``pil_image``.

    The image is run through the module-level ``processor`` and ``model``
    without gradient tracking; the resulting feature tensor is divided by its
    L2 norm along the last axis and returned as a NumPy array with
    singleton dimensions squeezed out.
    """
    batch = processor(images=pil_image, return_tensors="pt")
    with torch.no_grad():
        features = model.get_image_features(**batch)
    # Scale to unit length along the feature axis.
    length = features.norm(p=2, dim=-1, keepdim=True)
    unit_features = features / length
    return unit_features.cpu().numpy().squeeze()

def analyze_or_save(pdf_path, pil_image, custom_name=None, threshold=0.90):
    """Store a cover embedding, or report the stored cover it duplicates.

    The embedding of ``pil_image`` is compared (cosine similarity) against
    every embedding in the JSON database. If the best match exceeds
    ``threshold`` nothing is saved and the match is reported; otherwise the
    new embedding is saved under a key derived from the PDF/custom name.

    Args:
        pdf_path: Path of the PDF the cover belongs to. Its base name (without
            extension) is the default key, and its text is scanned with
            ``find_jeld_param`` for a suffix to append to the key.
        pil_image: Cover image passed to ``_get_embedding``.
        custom_name: Optional key to use instead of the PDF base name. A
            missing or whitespace-only value falls back to the base name.
        threshold: Cosine similarity (0-1 scale) above which the image is
            treated as a duplicate of an existing entry.

    Returns:
        A dict with ``"status"`` (``"duplicate"`` or ``"new"``),
        ``"similarity"`` (best similarity as a percentage, ``0.0`` when the
        database was empty) and either ``"similar_path"`` (key of the matching
        entry) or ``"saved_path"`` (key the new embedding was stored under).
    """
    base_name = os.path.splitext(os.path.basename(pdf_path))[0]
    # A whitespace-only custom name would otherwise yield an empty key.
    key = (custom_name or "").strip() or base_name

    # Extract the text of pages 2 to 5.
    # NOTE(review): whether the range is inclusive / 1-based depends on
    # extract_text_from_pdf — confirm.
    text = extract_text_from_pdf(pdf_path, pages=(2, 5))
    jeld_param = find_jeld_param(text)
    if jeld_param:
        key += f"_{jeld_param}"

    db = _load_db()
    new_emb = _get_embedding(pil_image)

    best_sim = 0.0
    if db:
        keys = list(db)
        embeddings = np.asarray(list(db.values()))
        sims = cosine_similarity(new_emb.reshape(1, -1), embeddings)[0]
        best_idx = int(sims.argmax())
        # Plain float so the result dict stays JSON-serializable.
        best_sim = float(sims[best_idx])

        # Honor the caller-supplied threshold (previously hard-coded to 0.90).
        if best_sim > threshold:
            return {
                "status": "duplicate",
                "similarity": best_sim * 100,
                "similar_path": keys[best_idx],
            }

    # NOTE(review): if `key` is already present, its embedding is overwritten.
    db[key] = new_emb.tolist()
    _save_db(db)
    return {
        "status": "new",
        "similarity": best_sim * 100,
        "saved_path": key,
    }