Spaces:
Runtime error
Runtime error
wisdom anthony committed on
Commit ·
97911a8
1
Parent(s): 028bcd8
similarity code for backend
Browse files- .gitignore +7 -1
- api/main.py +2 -0
- api/product_routes.py +2 -1
- api/similarity_routes.py +1079 -0
- db/similarity_repository.py +462 -0
- product_detector/mock_detector.py +33 -0
- requirements.txt +5 -1
- similarity_engine/__init__.py +17 -0
- similarity_engine/enhanced_image_processor.py +531 -0
- similarity_engine/product_comparator.py +300 -0
- similarity_engine/promo_comparator.py +278 -0
- similarity_engine/similarity_core.py +190 -0
- utils/cache_manager.py +350 -0
.gitignore
CHANGED
|
@@ -113,4 +113,10 @@ config.ini
|
|
| 113 |
secrets.json
|
| 114 |
credentials.json
|
| 115 |
*.pem
|
| 116 |
-
*.key
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
secrets.json
|
| 114 |
credentials.json
|
| 115 |
*.pem
|
| 116 |
+
*.key
|
| 117 |
+
|
| 118 |
+
# Similarity Engine Cache
|
| 119 |
+
cache/
|
| 120 |
+
/cache/
|
| 121 |
+
**/cache/
|
| 122 |
+
*.cache
|
api/main.py
CHANGED
|
@@ -5,6 +5,7 @@ from api.product_routes import router as product_router
|
|
| 5 |
from api.receipt_routes import router as receipt_router
|
| 6 |
from api.scrape_routes import router as scrape_router
|
| 7 |
from api.cijene_routes import router as cijene_router
|
|
|
|
| 8 |
|
| 9 |
# Initialize FastAPI
|
| 10 |
app = FastAPI(title="SupaKuna API")
|
|
@@ -24,6 +25,7 @@ app.include_router(product_router)
|
|
| 24 |
app.include_router(receipt_router)
|
| 25 |
app.include_router(scrape_router)
|
| 26 |
app.include_router(cijene_router)
|
|
|
|
| 27 |
|
| 28 |
@app.get("/", tags=["Health"])
|
| 29 |
def health_check():
|
|
|
|
| 5 |
from api.receipt_routes import router as receipt_router
|
| 6 |
from api.scrape_routes import router as scrape_router
|
| 7 |
from api.cijene_routes import router as cijene_router
|
| 8 |
+
from api.similarity_routes import router as similarity_router
|
| 9 |
|
| 10 |
# Initialize FastAPI
|
| 11 |
app = FastAPI(title="SupaKuna API")
|
|
|
|
| 25 |
app.include_router(receipt_router)
|
| 26 |
app.include_router(scrape_router)
|
| 27 |
app.include_router(cijene_router)
|
| 28 |
+
app.include_router(similarity_router)
|
| 29 |
|
| 30 |
@app.get("/", tags=["Health"])
|
| 31 |
def health_check():
|
api/product_routes.py
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
from fastapi import APIRouter, File, UploadFile, HTTPException, Form
|
| 2 |
from utils.image_processing import read_image_file, process_product_image
|
| 3 |
-
from product_detector.detector import ObjectDetector
|
|
|
|
| 4 |
from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
|
| 5 |
from utils.image_processing import process_and_store_product_image
|
| 6 |
|
|
|
|
| 1 |
from fastapi import APIRouter, File, UploadFile, HTTPException, Form
|
| 2 |
from utils.image_processing import read_image_file, process_product_image
|
| 3 |
+
# from product_detector.detector import ObjectDetector # Temporarily disabled - model corrupted
|
| 4 |
+
from product_detector.mock_detector import MockObjectDetector as ObjectDetector
|
| 5 |
from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
|
| 6 |
from utils.image_processing import process_and_store_product_image
|
| 7 |
|
api/similarity_routes.py
ADDED
|
@@ -0,0 +1,1079 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Similarity Routes - FastAPI Endpoints for Similarity Engine
|
| 3 |
+
NOW WITH JSON CACHING FOR IMPROVED PERFORMANCE
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import logging
|
| 7 |
+
import sys
|
| 8 |
+
import os
|
| 9 |
+
from typing import List, Dict, Any, Optional
|
| 10 |
+
from fastapi import APIRouter, HTTPException, File, UploadFile, Form, BackgroundTasks
|
| 11 |
+
from pydantic import BaseModel, Field
|
| 12 |
+
import time
|
| 13 |
+
from fastapi.responses import StreamingResponse
|
| 14 |
+
import asyncio
|
| 15 |
+
import json
|
| 16 |
+
|
| 17 |
+
# Add parent directory to path to access other modules
|
| 18 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 19 |
+
|
| 20 |
+
# Import our similarity engine modules
|
| 21 |
+
try:
|
| 22 |
+
from similarity_engine.similarity_core import calculate_similarity, calculate_confidence, test_similarity_examples
|
| 23 |
+
from similarity_engine.product_comparator import ProductComparator
|
| 24 |
+
from db.similarity_repository import get_similarity_repository
|
| 25 |
+
from utils.cache_manager import (
|
| 26 |
+
get_cache_manager,
|
| 27 |
+
cache_duplicate_analysis,
|
| 28 |
+
load_duplicate_analysis,
|
| 29 |
+
cache_promo_analysis,
|
| 30 |
+
load_promo_analysis
|
| 31 |
+
)
|
| 32 |
+
|
| 33 |
+
print("✅ All similarity modules imported successfully")
|
| 34 |
+
print("✅ Cache manager imported successfully")
|
| 35 |
+
|
| 36 |
+
except ImportError as e:
|
| 37 |
+
print(f"⚠️ Some similarity modules failed to import: {e}")
|
| 38 |
+
print("🔄 Using fallback implementations...")
|
| 39 |
+
|
| 40 |
+
# Create fallback functions to prevent startup failure
|
| 41 |
+
def calculate_similarity(a, b):
|
| 42 |
+
a, b = str(a).lower().strip(), str(b).lower().strip()
|
| 43 |
+
if a == b: return 1.0
|
| 44 |
+
if not a or not b: return 0.0
|
| 45 |
+
return 0.8 if a in b or b in a else 0.3
|
| 46 |
+
|
| 47 |
+
def calculate_confidence(sim, a, b):
|
| 48 |
+
return sim * 0.9
|
| 49 |
+
|
| 50 |
+
def test_similarity_examples(): return []
|
| 51 |
+
|
| 52 |
+
class ProductComparator:
|
| 53 |
+
def __init__(self, *args, **kwargs): pass
|
| 54 |
+
|
| 55 |
+
def get_similarity_repository(): return None
|
| 56 |
+
|
| 57 |
+
# Fallback cache functions
|
| 58 |
+
def get_cache_manager(): return None
|
| 59 |
+
def cache_duplicate_analysis(*args, **kwargs): return None
|
| 60 |
+
def load_duplicate_analysis(*args, **kwargs): return None
|
| 61 |
+
def cache_promo_analysis(*args, **kwargs): return None
|
| 62 |
+
def load_promo_analysis(*args, **kwargs): return None
|
| 63 |
+
|
| 64 |
+
# Configure logging
|
| 65 |
+
logging.basicConfig(level=logging.INFO)
|
| 66 |
+
logger = logging.getLogger(__name__)
|
| 67 |
+
|
| 68 |
+
# Create router
|
| 69 |
+
router = APIRouter(prefix="/similarity", tags=["Similarity Engine"])
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
|
| 73 |
# Pydantic models for request/response
class ProductComparisonRequest(BaseModel):
    """Request body for a one-to-one product name comparison."""

    product1_name: str = Field(..., description="First product name")
    product2_name: str = Field(..., description="Second product name")
    threshold: Optional[float] = Field(0.87, description="Similarity threshold", ge=0.1, le=1.0)


class DuplicateAnalysisRequest(BaseModel):
    """Request body for a full duplicate scan of the product catalogue."""

    threshold: Optional[float] = Field(0.87, description="Similarity threshold", ge=0.1, le=1.0)
    use_sample_data: Optional[bool] = Field(False, description="Use sample data instead of database")
    return_summary_only: Optional[bool] = Field(False, description="Return only summary statistics")
    force_refresh: Optional[bool] = Field(False, description="Force refresh, bypass cache")


class PromoComparisonRequest(BaseModel):
    """Request body for matching promo products against the catalogue."""

    threshold: Optional[float] = Field(0.85, description="Similarity threshold", ge=0.1, le=1.0)
    max_results: Optional[int] = Field(None, description="Maximum results to return")
    force_refresh: Optional[bool] = Field(False, description="Force refresh, bypass cache")


class CacheManagementRequest(BaseModel):
    """Request body for selectively clearing cached analysis results."""

    analysis_type: Optional[str] = Field(None, description="Type to clear (duplicates, promo_matches, or all)")
    older_than_hours: Optional[int] = Field(None, description="Clear cache older than X hours")
# Health check endpoint
@router.get("/health", summary="Health Check")
async def health_check():
    """Check if similarity engine is working"""
    try:
        # Exercise the similarity function with a trivial self-comparison.
        probe_score = calculate_similarity("test product", "test product")

        # Probe the database connection via the repository handle.
        repo = get_similarity_repository()
        database_state = "connected" if repo and repo.supabase else "disconnected"

        # Probe the cache manager.
        cache_state = "available" if get_cache_manager() else "unavailable"

        return {
            "status": "healthy",
            "similarity_engine": "operational",
            "database_connection": database_state,
            "cache_system": cache_state,
            "test_similarity": probe_score,
            "timestamp": time.time(),
        }
    except Exception as exc:
        logger.error(f"Health check failed: {exc}")
        return {
            "status": "error",
            "similarity_engine": "failed",
            "error": str(exc),
            "timestamp": time.time(),
        }
# Core similarity endpoints
@router.post("/compare-products", summary="Compare Two Products")
async def compare_two_products(request: ProductComparisonRequest):
    """Compare similarity between two specific products"""
    try:
        logger.info(f"🔍 Comparing products: '{request.product1_name}' vs '{request.product2_name}'")

        name_a = request.product1_name
        name_b = request.product2_name

        # Score the pair directly.
        similarity = calculate_similarity(name_a, name_b)
        confidence = calculate_confidence(similarity, name_a, name_b)

        # Map the score onto a human-readable assessment band; the first
        # lower bound the score clears wins, otherwise "different".
        bands = [
            (0.95, {"description": "Perfect match - identical products", "emoji": "✅", "category": "identical"}),
            (0.8, {"description": "Very similar - likely duplicates", "emoji": "⚠️", "category": "very_similar"}),
            (0.7, {"description": "Similar - review recommended", "emoji": "🤔", "category": "similar"}),
        ]
        assessment = {"description": "Different products", "emoji": "❌", "category": "different"}
        for lower_bound, band in bands:
            if similarity >= lower_bound:
                assessment = band
                break

        comparison = {
            'product1_name': name_a,
            'product2_name': name_b,
            'similarity': round(similarity, 3),
            'confidence': round(confidence, 3),
            'is_duplicate': similarity >= request.threshold,
            'threshold_used': request.threshold,
            'assessment': assessment,
            'percentage_similarity': round(similarity * 100, 1),
            'percentage_confidence': round(confidence * 100, 1),
        }

        return {
            "status": "success",
            "comparison": comparison,
            "timestamp": time.time(),
        }

    except Exception as exc:
        logger.error(f"Product comparison error: {exc}")
        raise HTTPException(status_code=500, detail=f"Comparison failed: {str(exc)}")
+
@router.post("/find-duplicates", summary="Find Database Duplicates - WITH CACHING")
|
| 172 |
+
async def find_duplicates(request: DuplicateAnalysisRequest):
|
| 173 |
+
"""Find duplicate products in the database - NOW WITH JSON CACHING"""
|
| 174 |
+
try:
|
| 175 |
+
logger.info(f"🔍 Starting duplicate analysis with threshold {request.threshold}")
|
| 176 |
+
|
| 177 |
+
repository = get_similarity_repository()
|
| 178 |
+
|
| 179 |
+
# Load products - REAL DATA NOW!
|
| 180 |
+
if request.use_sample_data:
|
| 181 |
+
logger.info("📊 Using sample data for analysis")
|
| 182 |
+
products = repository.get_sample_products() if repository else []
|
| 183 |
+
else:
|
| 184 |
+
logger.info("📊 Loading REAL products from database...")
|
| 185 |
+
products = repository.load_all_products() if repository else []
|
| 186 |
+
|
| 187 |
+
if not products:
|
| 188 |
+
return {
|
| 189 |
+
"status": "error",
|
| 190 |
+
"message": "No products found for analysis - check database connection",
|
| 191 |
+
"duplicates": [],
|
| 192 |
+
"analysis_summary": {}
|
| 193 |
+
}
|
| 194 |
+
|
| 195 |
+
logger.info(f"📊 Loaded {len(products)} products for analysis")
|
| 196 |
+
|
| 197 |
+
# 🚀 NEW: CHECK CACHE FIRST (unless force refresh)
|
| 198 |
+
if not request.force_refresh:
|
| 199 |
+
cached_results = load_duplicate_analysis(len(products), request.threshold)
|
| 200 |
+
if cached_results:
|
| 201 |
+
logger.info("✅ Returning cached duplicate analysis results")
|
| 202 |
+
return {
|
| 203 |
+
"status": "success",
|
| 204 |
+
"data": cached_results,
|
| 205 |
+
"cached": True,
|
| 206 |
+
"cache_hit": True,
|
| 207 |
+
"timestamp": time.time()
|
| 208 |
+
}
|
| 209 |
+
|
| 210 |
+
# Run analysis if no cache or force refresh
|
| 211 |
+
logger.info("🔄 Running fresh duplicate analysis...")
|
| 212 |
+
start_time = time.time()
|
| 213 |
+
duplicates = []
|
| 214 |
+
|
| 215 |
+
for i, product1 in enumerate(products):
|
| 216 |
+
for j, product2 in enumerate(products[i+1:], i+1):
|
| 217 |
+
product_name1 = product1.get('product_name', '').strip()
|
| 218 |
+
product_name2 = product2.get('product_name', '').strip()
|
| 219 |
+
|
| 220 |
+
if not product_name1 or not product_name2:
|
| 221 |
+
continue
|
| 222 |
+
|
| 223 |
+
similarity = calculate_similarity(product_name1, product_name2)
|
| 224 |
+
if similarity >= request.threshold:
|
| 225 |
+
confidence = calculate_confidence(similarity, product_name1, product_name2)
|
| 226 |
+
duplicates.append({
|
| 227 |
+
'product1_id': product1.get('product_id'),
|
| 228 |
+
'product1_name': product_name1,
|
| 229 |
+
'product2_id': product2.get('product_id'),
|
| 230 |
+
'product2_name': product_name2,
|
| 231 |
+
'similarity': round(similarity, 3),
|
| 232 |
+
'confidence': round(confidence, 3)
|
| 233 |
+
})
|
| 234 |
+
|
| 235 |
+
analysis_time = time.time() - start_time
|
| 236 |
+
|
| 237 |
+
# Create results
|
| 238 |
+
results = {
|
| 239 |
+
'duplicates': duplicates,
|
| 240 |
+
'analysis_summary': {
|
| 241 |
+
'total_products': len(products),
|
| 242 |
+
'duplicates_found': len(duplicates),
|
| 243 |
+
'duplicate_rate': (len(duplicates) / len(products)) * 100 if products else 0,
|
| 244 |
+
'analysis_time_seconds': round(analysis_time, 2),
|
| 245 |
+
'threshold_used': request.threshold,
|
| 246 |
+
'success': True,
|
| 247 |
+
'data_source': 'sample_data' if request.use_sample_data else 'real_database'
|
| 248 |
+
},
|
| 249 |
+
'recommendations': [
|
| 250 |
+
f"Found {len(duplicates)} potential duplicates in {len(products)} products",
|
| 251 |
+
"Review matches manually for final decision",
|
| 252 |
+
"Higher similarity scores indicate more confident matches",
|
| 253 |
+
f"Analysis completed in {analysis_time:.2f} seconds"
|
| 254 |
+
]
|
| 255 |
+
}
|
| 256 |
+
|
| 257 |
+
# 🚀 NEW: CACHE THE RESULTS
|
| 258 |
+
cache_parameters = {
|
| 259 |
+
'threshold': request.threshold,
|
| 260 |
+
'use_sample_data': request.use_sample_data,
|
| 261 |
+
'total_products': len(products),
|
| 262 |
+
'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
|
| 263 |
+
}
|
| 264 |
+
|
| 265 |
+
cache_key = cache_duplicate_analysis(
|
| 266 |
+
len(products),
|
| 267 |
+
request.threshold,
|
| 268 |
+
results,
|
| 269 |
+
cache_parameters
|
| 270 |
+
)
|
| 271 |
+
|
| 272 |
+
logger.info(f"✅ Duplicate analysis complete: {len(duplicates)} duplicates found in {len(products)} products")
|
| 273 |
+
logger.info(f"💾 Results cached with key: {cache_key}")
|
| 274 |
+
|
| 275 |
+
return {
|
| 276 |
+
"status": "success",
|
| 277 |
+
"data": results,
|
| 278 |
+
"cached": False,
|
| 279 |
+
"cache_key": cache_key,
|
| 280 |
+
"timestamp": time.time()
|
| 281 |
+
}
|
| 282 |
+
|
| 283 |
+
except Exception as e:
|
| 284 |
+
logger.error(f"Duplicate analysis error: {e}")
|
| 285 |
+
raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
|
| 286 |
+
|
@router.post("/find-duplicates-stream", summary="Stream Duplicate Analysis with Real-Time Progress")
async def find_duplicates_stream(request: DuplicateAnalysisRequest):
    """Stream duplicate analysis with real-time progress updates

    Emits Server-Sent Events: 'init', periodic 'product_progress' /
    'comparison_progress', a 'duplicate_found' per match, then a final
    'complete' payload (or 'cache_hit' / 'error').
    """

    async def generate_progress():
        try:
            logger.info(f"🔍 Starting streaming duplicate analysis with threshold {request.threshold}")

            repository = get_similarity_repository()

            # Load products - REAL DATA
            if request.use_sample_data:
                logger.info("📊 Using sample data for analysis")
                products = repository.get_sample_products() if repository else []
            else:
                logger.info("📊 Loading REAL products from database...")
                products = repository.load_all_products() if repository else []

            if not products:
                yield f"data: {json.dumps({'type': 'error', 'message': 'No products found for analysis'})}\n\n"
                return

            logger.info(f"📊 Loaded {len(products)} products for streaming analysis")

            # Check cache first (unless force refresh)
            if not request.force_refresh:
                cached_results = load_duplicate_analysis(len(products), request.threshold)
                if cached_results:
                    logger.info("✅ Returning cached duplicate analysis results via stream")
                    yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
                    return

            # Send initial status (n*(n-1)/2 unordered pairs)
            total_comparisons = len(products) * (len(products) - 1) // 2
            yield f"data: {json.dumps({'type': 'init', 'total_products': len(products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"

            # Run analysis with real-time progress
            start_time = time.time()
            duplicates = []
            completed_comparisons = 0

            for i, product1 in enumerate(products):
                product_name1 = product1.get('product_name', '').strip()
                product_id1 = product1.get('product_id', 'unknown')

                if not product_name1:
                    continue

                # Send progress update every 100 products
                if i % 100 == 0:
                    progress_percentage = (i / len(products)) * 100
                    yield f"data: {json.dumps({'type': 'product_progress', 'current_product': i + 1, 'total_products': len(products), 'progress': round(progress_percentage, 1), 'current_name': product_name1[:50]})}\n\n"

                # FIX: the enumerate index 'j' was never used; iterate the
                # tail slice directly.
                for product2 in products[i + 1:]:
                    product_name2 = product2.get('product_name', '').strip()
                    product_id2 = product2.get('product_id', 'unknown')

                    if not product_name2 or product_id1 == product_id2:
                        continue

                    completed_comparisons += 1

                    # Calculate similarity
                    similarity = calculate_similarity(product_name1, product_name2)

                    # Send progress every 10,000 comparisons
                    if completed_comparisons % 10000 == 0:
                        progress_percentage = (completed_comparisons / total_comparisons) * 100
                        yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(progress_percentage, 1), 'comparing': f'{product_name1[:30]} vs {product_name2[:30]}'})}\n\n"
                        await asyncio.sleep(0.01)  # Small delay to prevent overwhelming

                    # Check if it's a duplicate
                    if similarity >= request.threshold:
                        confidence = calculate_confidence(similarity, product_name1, product_name2)

                        duplicate_info = {
                            'product1_id': product_id1,
                            'product1_name': product_name1,
                            'product2_id': product_id2,
                            'product2_name': product_name2,
                            'similarity': round(similarity, 3),
                            'confidence': round(confidence, 3)
                        }

                        duplicates.append(duplicate_info)

                        # Send duplicate found immediately
                        yield f"data: {json.dumps({'type': 'duplicate_found', 'duplicate': duplicate_info, 'total_duplicates': len(duplicates)})}\n\n"
                        logger.info(f"  🔍 DUPLICATE FOUND via stream: {product_name1} ↔ {product_name2} ({similarity:.3f})")

            analysis_time = time.time() - start_time

            # Create final results
            results = {
                'duplicates': duplicates,
                'analysis_summary': {
                    'total_products': len(products),
                    'total_comparisons': completed_comparisons,
                    'duplicates_found': len(duplicates),
                    'duplicate_rate': (len(duplicates) / len(products)) * 100 if products else 0,
                    'analysis_time_seconds': round(analysis_time, 2),
                    'threshold_used': request.threshold,
                    'success': True,
                    'data_source': 'sample_data' if request.use_sample_data else 'real_database'
                },
                'recommendations': [
                    f"Found {len(duplicates)} potential duplicates in {len(products)} products",
                    "Review matches manually for final decision",
                    "Higher similarity scores indicate more confident matches",
                    f"Analysis completed in {analysis_time:.2f} seconds"
                ]
            }

            # Cache the results
            cache_parameters = {
                'threshold': request.threshold,
                'use_sample_data': request.use_sample_data,
                'total_products': len(products),
                'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
            }

            cache_key = cache_duplicate_analysis(
                len(products),
                request.threshold,
                results,
                cache_parameters
            )

            # Send final complete results
            yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"

            logger.info(f"✅ Streaming duplicate analysis complete: {len(duplicates)} duplicates found in {len(products)} products")
            logger.info(f"💾 Results cached with key: {cache_key}")

        except Exception as e:
            logger.error(f"Streaming duplicate analysis error: {e}")
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        generate_progress(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "text/event-stream",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
| 436 |
+
@router.post("/compare-promo", summary="Compare Promo Products - WITH CACHING")
async def compare_promo_products(request: PromoComparisonRequest):
    """Compare promotional products against database products - NOW WITH CACHING.

    For every promo product, finds the single best-matching database product
    whose name similarity is >= ``request.threshold``. Results are cached
    (keyed by promo count, db count and threshold) and a cached payload is
    returned unless ``request.force_refresh`` is set.

    Returns a dict with ``status``, ``matches``, ``statistics``, plus cache
    metadata (``cached``/``cache_hit`` on a cache hit, ``cache_key`` on a
    fresh run). Raises HTTPException(500) on any unexpected failure.
    """
    try:
        logger.info(f"🏷️ Starting promo comparison with threshold {request.threshold}")

        repository = get_similarity_repository()

        if not repository:
            return {"status": "error", "message": "Database repository not available"}

        # Load REAL promo products; on any load failure fall back to the
        # repository's built-in sample data so the endpoint still responds.
        logger.info("📊 Loading REAL promo products from database...")
        try:
            promo_products = repository.load_promo_products()
            db_products = repository.load_all_products()
            logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
        except Exception as e:
            logger.warning(f"❌ Failed to load real data, using samples: {e}")
            # NOTE(review): calls a private repository method (_get_sample_promo_products) —
            # consider exposing a public sample accessor.
            promo_products = repository._get_sample_promo_products()
            db_products = repository.get_sample_products()

        # 🚀 NEW: CHECK CACHE FIRST (unless force refresh)
        # Cache lookup happens after loading because the cache key depends on
        # the current promo/db product counts.
        if not request.force_refresh:
            cached_results = load_promo_analysis(
                len(promo_products),
                len(db_products),
                request.threshold
            )
            if cached_results:
                logger.info("✅ Returning cached promo analysis results")
                return {
                    "status": "success",
                    **cached_results,
                    "cached": True,
                    "cache_hit": True,
                    "timestamp": time.time()
                }

        logger.info(f"📊 Comparing {len(promo_products)} promo products against {len(db_products)} database products")

        # Find matches using REAL similarity algorithm.
        # O(len(promo_products) * len(db_products)) exhaustive comparison:
        # keeps only the single best database match per promo product.
        start_time = time.time()
        matches = []
        total_comparisons = 0

        for promo in promo_products:
            promo_name = promo.get('name', '').strip()
            if not promo_name:
                continue

            best_match = None
            best_similarity = 0.0

            for db in db_products:
                db_name = db.get('product_name', '').strip()
                if not db_name:
                    continue

                similarity = calculate_similarity(promo_name, db_name)
                total_comparisons += 1

                # Only consider candidates above threshold that beat the current best.
                if similarity >= request.threshold and similarity > best_similarity:
                    best_similarity = similarity
                    confidence = calculate_confidence(similarity, promo_name, db_name)

                    best_match = {
                        'promo_id': promo.get('id'),
                        'promo_name': promo_name,
                        'promo_store': promo.get('store', ''),
                        'promo_price': promo.get('promo_price'),
                        'regular_price': promo.get('regular_price'),
                        'picture_id': promo.get('picture_id'),
                        'db_product_id': db.get('product_id'),
                        'db_product_name': db_name,
                        'db_brand': db.get('brand', {}).get('brand_name', 'No Brand') if db.get('brand') else 'No Brand',
                        'similarity': round(similarity, 3),
                        'confidence': round(confidence, 3)
                    }

            if best_match:
                matches.append(best_match)

                # Limit results if requested (stops scanning further promo products).
                if request.max_results and len(matches) >= request.max_results:
                    break

        analysis_time = time.time() - start_time

        # Prepare results
        results = {
            "matches": matches,
            "statistics": {
                "total_promo_products": len(promo_products),
                "total_database_products": len(db_products),
                "total_comparisons": total_comparisons,
                "matches_found": len(matches),
                "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                "threshold_used": request.threshold,
                "analysis_time_seconds": round(analysis_time, 2)
            }
        }

        # 🚀 NEW: CACHE THE RESULTS
        cache_parameters = {
            'threshold': request.threshold,
            'max_results': request.max_results,
            'promo_count': len(promo_products),
            'db_count': len(db_products),
            'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
        }

        cache_key = cache_promo_analysis(
            len(promo_products),
            len(db_products),
            request.threshold,
            results,
            cache_parameters
        )

        logger.info(f"✅ Promo comparison complete: {len(matches)} matches found")
        logger.info(f"💾 Results cached with key: {cache_key}")

        return {
            "status": "success",
            **results,
            "cached": False,
            "cache_key": cache_key,
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Promo comparison error: {e}")
        raise HTTPException(status_code=500, detail=f"Promo comparison failed: {str(e)}")
|
| 570 |
+
|
| 571 |
+
|
| 572 |
+
@router.post("/compare-promo-stream", summary="Stream Promo Comparison with Real-Time Progress")
async def compare_promo_stream(request: PromoComparisonRequest):
    """Stream promo product comparison with real-time progress updates.

    Server-Sent Events endpoint. Emits JSON events of type:
    ``error``, ``cache_hit``, ``init``, ``promo_progress``,
    ``comparison_progress``, ``match_found``, ``max_results_reached``,
    ``complete``. The event order is part of the contract consumed by the
    frontend, so the generator's statement order must be preserved.
    """

    async def generate_promo_progress():
        # Async generator producing SSE "data: <json>\n\n" frames.
        try:
            logger.info(f"🏷️ Starting streaming promo comparison with threshold {request.threshold}")

            repository = get_similarity_repository()

            if not repository:
                yield f"data: {json.dumps({'type': 'error', 'message': 'Database repository not available'})}\n\n"
                return

            # Load REAL promo and database products; fall back to samples on failure.
            logger.info("📊 Loading REAL promo and database products...")
            try:
                promo_products = repository.load_promo_products()
                db_products = repository.load_all_products()
                logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
            except Exception as e:
                logger.warning(f"❌ Failed to load real data, using samples: {e}")
                promo_products = repository._get_sample_promo_products()
                db_products = repository.get_sample_products()

            # Check cache first (unless force refresh); a hit short-circuits the stream.
            if not request.force_refresh:
                cached_results = load_promo_analysis(
                    len(promo_products),
                    len(db_products),
                    request.threshold
                )
                if cached_results:
                    logger.info("✅ Returning cached promo analysis results via stream")
                    yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
                    return

            # Send initial status
            total_comparisons = len(promo_products) * len(db_products)
            yield f"data: {json.dumps({'type': 'init', 'total_promo_products': len(promo_products), 'total_db_products': len(db_products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"

            # Find matches using REAL similarity algorithm with streaming
            start_time = time.time()
            matches = []
            completed_comparisons = 0

            for i, promo in enumerate(promo_products):
                promo_name = promo.get('name', '').strip()
                promo_store = promo.get('store', '').strip()

                if not promo_name:
                    continue

                # Send promo progress update (one event per promo product)
                promo_progress = (i / len(promo_products)) * 100
                yield f"data: {json.dumps({'type': 'promo_progress', 'current_promo': i + 1, 'total_promos': len(promo_products), 'progress': round(promo_progress, 1), 'current_promo_name': promo_name[:50], 'store': promo_store})}\n\n"

                best_match = None
                best_similarity = 0.0

                for j, db_product in enumerate(db_products):
                    db_name = db_product.get('product_name', '').strip()

                    if not db_name:
                        continue

                    completed_comparisons += 1

                    # Calculate similarity
                    similarity = calculate_similarity(promo_name, db_name)

                    # Send detailed comparison progress every 1000 comparisons
                    if completed_comparisons % 1000 == 0:
                        overall_progress = (completed_comparisons / total_comparisons) * 100
                        yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(overall_progress, 1), 'comparing': f'{promo_name[:30]} vs {db_name[:30]}'})}\n\n"
                        await asyncio.sleep(0.01)  # Small delay so the event loop can flush frames

                    if similarity >= request.threshold and similarity > best_similarity:
                        best_similarity = similarity
                        confidence = calculate_confidence(similarity, promo_name, db_name)

                        best_match = {
                            'promo_id': promo.get('id'),
                            'promo_name': promo_name,
                            'promo_store': promo_store,
                            'promo_price': promo.get('promo_price', 0),
                            'regular_price': promo.get('regular_price', 0),
                            'picture_id': promo.get('picture_id'),
                            'db_product_id': db_product.get('product_id'),
                            'db_product_name': db_name,
                            'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
                            'similarity': round(similarity, 3),
                            'confidence': round(confidence, 3)
                        }

                # If match found, send immediate update
                if best_match:
                    matches.append(best_match)
                    yield f"data: {json.dumps({'type': 'match_found', 'match': best_match, 'total_matches': len(matches)})}\n\n"
                    logger.info(f"🔍 PROMO MATCH FOUND via stream: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")

                    # Limit results if requested
                    if request.max_results and len(matches) >= request.max_results:
                        yield f"data: {json.dumps({'type': 'max_results_reached', 'max_results': request.max_results, 'matches_found': len(matches)})}\n\n"
                        break

            analysis_time = time.time() - start_time

            # Prepare final results
            results = {
                "matches": matches,
                "statistics": {
                    "total_promo_products": len(promo_products),
                    "total_database_products": len(db_products),
                    "total_comparisons": completed_comparisons,
                    "matches_found": len(matches),
                    "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                    "threshold_used": request.threshold,
                    "analysis_time_seconds": round(analysis_time, 2)
                }
            }

            # Cache the results
            cache_parameters = {
                'threshold': request.threshold,
                'max_results': request.max_results,
                'promo_count': len(promo_products),
                'db_count': len(db_products),
                'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
            }

            cache_key = cache_promo_analysis(
                len(promo_products),
                len(db_products),
                request.threshold,
                results,
                cache_parameters
            )

            # Send final results
            yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"

            logger.info(f"✅ Streaming promo comparison complete: {len(matches)} matches found")
            logger.info(f"💾 Results cached with key: {cache_key}")

        except Exception as e:
            logger.error(f"Streaming promo comparison error: {e}")
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        generate_promo_progress(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "text/event-stream",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
|
| 731 |
+
|
| 732 |
+
@router.get("/test-algorithm", summary="Test Similarity Algorithm")
async def test_algorithm():
    """Run the similarity algorithm against a fixed set of known example pairs.

    Each case is (text1, text2, expected-match?) where "match" means a
    similarity score of at least 0.85. Returns per-case results plus a
    pass/fail summary; raises HTTPException(500) on unexpected errors.
    """
    try:
        logger.info("🧪 Testing similarity algorithm")

        cases = [
            ("Maslac", "Maslac", True),
            ("Vrhnje za kuhanje", "Vrhnje za kuhanje 3x200g", False),
            ("Apple iPhone 13", "iPhone 13 Apple", True),
            ("vindija mlijeko cokoladno", "vindija cokoladno mlijeko", True)
        ]

        outcomes = []
        for left, right, expected in cases:
            score = calculate_similarity(left, right)
            conf = calculate_confidence(score, left, right)
            matched = score >= 0.85  # fixed decision threshold for this self-test
            outcomes.append({
                "text1": left,
                "text2": right,
                "similarity": round(score, 3),
                "confidence": round(conf, 3),
                "is_match": matched,
                "should_match": expected,
                "test_passed": matched == expected
            })

        total = len(outcomes)
        passed = sum(1 for entry in outcomes if entry["test_passed"])

        return {
            "status": "success",
            "test_results": outcomes,
            "summary": {
                "total_tests": total,
                "passed_tests": passed,
                "failed_tests": total - passed,
                "success_rate": round((passed / total) * 100, 1)
            },
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Algorithm test error: {e}")
        raise HTTPException(status_code=500, detail=f"Test failed: {str(e)}")
|
| 780 |
+
|
| 781 |
+
@router.get("/stats", summary="Get Engine Statistics")
async def get_statistics():
    """Report live similarity-engine statistics from the repository.

    Counts total/imageless products and promo products, derives image
    coverage percentages, and reports the Supabase connection state.
    If any repository call fails, all counts fall back to zero instead of
    failing the request. Raises HTTPException(500) only on unexpected errors.
    """
    try:
        repo = get_similarity_repository()

        if not repo:
            return {
                "status": "error",
                "message": "Repository not available",
                "statistics": {}
            }

        # Pull live data; degrade to empty lists if loading fails so the
        # endpoint still answers with zeroed statistics.
        try:
            all_products = repo.load_all_products()
            missing_images = repo.get_products_without_images()
            promos = repo.load_promo_products()
            promos_with_images = repo.load_promo_products(with_images_only=True)
        except Exception as e:
            logger.error(f"Error loading statistics: {e}")
            all_products, missing_images = [], []
            promos, promos_with_images = [], []

        total = len(all_products)
        without = len(missing_images)
        promo_total = len(promos)
        promo_imaged = len(promos_with_images)

        return {
            "status": "success",
            "statistics": {
                "database": {
                    "total_products": total,
                    "products_without_images": without,
                    "products_with_images": total - without
                },
                "promotional": {
                    "total_promo_products": promo_total,
                    "promo_with_images": promo_imaged,
                    "promo_without_images": promo_total - promo_imaged
                },
                "image_coverage": {
                    # max(..., 1) guards against division by zero on empty datasets.
                    "database_coverage": round(((total - without) / max(total, 1)) * 100, 1),
                    "promo_coverage": round((promo_imaged / max(promo_total, 1)) * 100, 1)
                }
            },
            "engine_status": "operational",
            "database_connection": "connected" if repo.supabase else "disconnected",
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Statistics error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to get statistics: {str(e)}")
|
| 833 |
+
|
| 834 |
+
# 🚀 NEW: CACHE MANAGEMENT ENDPOINTS
|
| 835 |
+
|
| 836 |
+
@router.get("/cache/stats", summary="Get Cache Statistics")
async def get_cache_stats():
    """Return cache usage statistics from the cache manager.

    Responds with an error payload (not an exception) when no cache manager
    is configured; raises HTTPException(500) on unexpected failures.
    """
    try:
        manager = get_cache_manager()

        if not manager:
            return {
                "status": "error",
                "message": "Cache manager not available",
                "cache_stats": {}
            }

        return {
            "status": "success",
            "cache_stats": manager.get_cache_stats(),
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Cache stats error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to get cache stats: {str(e)}")
|
| 860 |
+
|
| 861 |
+
@router.post("/cache/clear", summary="Clear Cache")
async def clear_cache(request: CacheManagementRequest):
    """Remove cached analysis results.

    Scope is controlled by the request: an optional ``analysis_type`` filter
    and an optional ``older_than_hours`` age cutoff. Returns how many cache
    files were removed; raises HTTPException(500) on unexpected failures.
    """
    try:
        manager = get_cache_manager()

        if not manager:
            return {
                "status": "error",
                "message": "Cache manager not available"
            }

        # Clear cache based on parameters
        removed = manager.clear_cache(
            analysis_type=request.analysis_type,
            older_than_hours=request.older_than_hours
        )

        logger.info(f"🧹 Cache cleared: {removed} files removed")

        return {
            "status": "success",
            "message": f"Cache cleared successfully",
            "files_removed": removed,
            "cleared_type": request.analysis_type or "all",
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Cache clear error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to clear cache: {str(e)}")
|
| 892 |
+
|
| 893 |
+
@router.post("/cache/cleanup", summary="Cleanup Expired Cache")
async def cleanup_expired_cache():
    """Delete only the cache files that have expired.

    Unlike /cache/clear this never touches still-valid entries. Returns the
    number of expired files removed; raises HTTPException(500) on failure.
    """
    try:
        manager = get_cache_manager()

        if not manager:
            return {
                "status": "error",
                "message": "Cache manager not available"
            }

        removed = manager.cleanup_expired_cache()

        logger.info(f"🧹 Expired cache cleaned up: {removed} files removed")

        return {
            "status": "success",
            "message": "Expired cache cleaned up successfully",
            "expired_files_removed": removed,
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Cache cleanup error: {e}")
        raise HTTPException(status_code=500, detail=f"Failed to cleanup cache: {str(e)}")
|
| 919 |
+
|
| 920 |
+
# NOTE(review): duplicate route — "/compare-promo-stream" is already registered
# earlier in this module with an almost identical handler (that one emits
# promo progress every promo and comparison progress every 1000 comparisons;
# this one throttles to every 10 promos / every 5000 comparisons). FastAPI
# dispatches to the first matching route, so this later registration appears
# to be dead code — confirm which variant is intended and delete the other.
@router.post("/compare-promo-stream", summary="Stream Promo Comparison with Real-Time Progress")
async def compare_promo_stream(request: PromoComparisonRequest):
    """Stream promo product comparison with real-time progress updates.

    Server-Sent Events endpoint emitting JSON events of type: ``error``,
    ``cache_hit``, ``init``, ``promo_progress``, ``comparison_progress``,
    ``match_found``, ``max_results_reached``, ``complete``.
    """

    async def generate_promo_progress():
        # Async generator producing SSE "data: <json>\n\n" frames.
        try:
            logger.info(f"🏷️ Starting streaming promo comparison with threshold {request.threshold}")

            repository = get_similarity_repository()

            if not repository:
                yield f"data: {json.dumps({'type': 'error', 'message': 'Database repository not available'})}\n\n"
                return

            # Load REAL promo and database products; fall back to samples on failure.
            logger.info("📊 Loading REAL promo and database products...")
            try:
                promo_products = repository.load_promo_products()
                db_products = repository.load_all_products()
                logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
            except Exception as e:
                logger.warning(f"❌ Failed to load real data, using samples: {e}")
                promo_products = repository._get_sample_promo_products()
                db_products = repository.get_sample_products()

            # Check cache first (unless force refresh); a hit short-circuits the stream.
            if not request.force_refresh:
                cached_results = load_promo_analysis(
                    len(promo_products),
                    len(db_products),
                    request.threshold
                )
                if cached_results:
                    logger.info("✅ Returning cached promo analysis results via stream")
                    yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
                    return

            # Send initial status
            total_comparisons = len(promo_products) * len(db_products)
            yield f"data: {json.dumps({'type': 'init', 'total_promo_products': len(promo_products), 'total_db_products': len(db_products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"

            # Find matches using REAL similarity algorithm with streaming
            start_time = time.time()
            matches = []
            completed_comparisons = 0

            for i, promo in enumerate(promo_products):
                promo_name = promo.get('name', '').strip()
                promo_store = promo.get('store', '').strip()

                if not promo_name:
                    continue

                # Send promo progress update every 10 promos
                if i % 10 == 0:
                    promo_progress = (i / len(promo_products)) * 100
                    yield f"data: {json.dumps({'type': 'promo_progress', 'current_promo': i + 1, 'total_promos': len(promo_products), 'progress': round(promo_progress, 1), 'current_promo_name': promo_name[:50], 'store': promo_store})}\n\n"

                best_match = None
                best_similarity = 0.0

                for j, db_product in enumerate(db_products):
                    db_name = db_product.get('product_name', '').strip()

                    if not db_name:
                        continue

                    completed_comparisons += 1

                    # Calculate similarity
                    similarity = calculate_similarity(promo_name, db_name)

                    # Send detailed comparison progress every 5000 comparisons
                    if completed_comparisons % 5000 == 0:
                        overall_progress = (completed_comparisons / total_comparisons) * 100
                        yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(overall_progress, 1), 'comparing': f'{promo_name[:30]} vs {db_name[:30]}'})}\n\n"
                        await asyncio.sleep(0.01)  # Small delay so the event loop can flush frames

                    if similarity >= request.threshold and similarity > best_similarity:
                        best_similarity = similarity
                        confidence = calculate_confidence(similarity, promo_name, db_name)

                        best_match = {
                            'promo_id': promo.get('id'),
                            'promo_name': promo_name,
                            'promo_store': promo_store,
                            'promo_price': promo.get('promo_price', 0),
                            'regular_price': promo.get('regular_price', 0),
                            'picture_id': promo.get('picture_id'),
                            'db_product_id': db_product.get('product_id'),
                            'db_product_name': db_name,
                            'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
                            'similarity': round(similarity, 3),
                            'confidence': round(confidence, 3)
                        }

                # If match found, send immediate update
                if best_match:
                    matches.append(best_match)
                    yield f"data: {json.dumps({'type': 'match_found', 'match': best_match, 'total_matches': len(matches)})}\n\n"
                    logger.info(f"🔍 PROMO MATCH FOUND via stream: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")

                    # Limit results if requested
                    if request.max_results and len(matches) >= request.max_results:
                        yield f"data: {json.dumps({'type': 'max_results_reached', 'max_results': request.max_results, 'matches_found': len(matches)})}\n\n"
                        break

            analysis_time = time.time() - start_time

            # Prepare final results
            results = {
                "matches": matches,
                "statistics": {
                    "total_promo_products": len(promo_products),
                    "total_database_products": len(db_products),
                    "total_comparisons": completed_comparisons,
                    "matches_found": len(matches),
                    "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                    "threshold_used": request.threshold,
                    "analysis_time_seconds": round(analysis_time, 2)
                }
            }

            # Cache the results
            cache_parameters = {
                'threshold': request.threshold,
                'max_results': request.max_results,
                'promo_count': len(promo_products),
                'db_count': len(db_products),
                'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
            }

            cache_key = cache_promo_analysis(
                len(promo_products),
                len(db_products),
                request.threshold,
                results,
                cache_parameters
            )

            # Send final results
            yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"

            logger.info(f"✅ Streaming promo comparison complete: {len(matches)} matches found")
            logger.info(f"💾 Results cached with key: {cache_key}")

        except Exception as e:
            logger.error(f"Streaming promo comparison error: {e}")
            yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"

    return StreamingResponse(
        generate_promo_progress(),
        media_type="text/event-stream",
        headers={
            "Cache-Control": "no-cache",
            "Connection": "keep-alive",
            "Content-Type": "text/event-stream",
            "X-Accel-Buffering": "no"  # Disable nginx buffering
        }
    )
|
db/similarity_repository.py
ADDED
|
@@ -0,0 +1,462 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Similarity Repository - Backend Database Operations
|
| 3 |
+
Handles all Supabase interactions for the similarity engine
|
| 4 |
+
VERIFIED FOR REAL DATA CONNECTION
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import os
|
| 8 |
+
import logging
|
| 9 |
+
import sys
|
| 10 |
+
from typing import List, Dict, Any, Optional
|
| 11 |
+
|
| 12 |
+
# Configure logging
|
| 13 |
+
logging.basicConfig(level=logging.INFO)
|
| 14 |
+
logger = logging.getLogger(__name__)
|
| 15 |
+
|
| 16 |
+
class SimilarityRepository:
    """Repository for similarity-engine database operations.

    Wraps all Supabase access used by the similarity engine (catalogue
    products, promotional products, image updates) and transparently
    falls back to in-memory sample data whenever a live connection is
    unavailable, so callers never have to branch on connectivity.
    """

    # Batch size for paginated table scans (Supabase caps rows per request).
    PAGE_SIZE = 1000

    def __init__(self):
        """Read credentials from the environment and authenticate.

        On any failure ``self.supabase`` stays ``None`` and the data
        accessors below serve sample data instead of raising.
        """
        # SECURITY: the hard-coded fallback credentials below are committed to
        # source control and must be removed and rotated; they exist only as a
        # legacy local-development default. Prefer the environment variables.
        self.user_email = os.getenv('SUPABASE_USER_EMAIL', 'tonywis12@yahoo.com')
        self.user_password = os.getenv('SUPABASE_USER_PASSWORD', 'Anthony.12')

        self.supabase = None
        self._load_credentials()

        if not self.supabase_url or not self.supabase_key:
            logger.error("❌ Supabase credentials not found in environment variables")
            logger.error("💡 Looking for SUPABASE_URL/SUPABASE_KEY or VITE_SUPABASE_URL/VITE_SUPABASE_KEY")
            return

        self._authenticate()

    def _load_credentials(self) -> None:
        """Populate ``supabase_url`` / ``supabase_key`` from .env or the environment.

        Loads a ``.env`` file from the parent directory (where the main app
        lives) when python-dotenv is installed, then reads both the backend
        (``SUPABASE_*``) and frontend (``VITE_SUPABASE_*``) variable names,
        preferring the backend ones.
        """
        try:
            from dotenv import load_dotenv
            # The .env file lives one level up, next to the main app.
            env_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env')
            load_dotenv(env_path)
            logger.info(f"📋 Loaded .env from: {env_path}")
        except ImportError:
            logger.warning("python-dotenv not available, using system environment variables")
        except Exception as e:
            logger.warning(f"Could not load .env file: {e}")

        self.supabase_url = os.getenv('SUPABASE_URL') or os.getenv('VITE_SUPABASE_URL')
        self.supabase_key = os.getenv('SUPABASE_KEY') or os.getenv('VITE_SUPABASE_KEY')
        # Log only prefixes so full secrets never reach the log stream.
        logger.info(f"📋 Supabase URL: {self.supabase_url[:30] if self.supabase_url else 'Not found'}...")
        logger.info(f"📋 Supabase Key: {self.supabase_key[:20] if self.supabase_key else 'Not found'}...")

    def _authenticate(self) -> bool:
        """Create the Supabase client and sign the configured user in.

        Returns:
            True on successful sign-in, False on any failure (the client
            may still be set but unauthenticated).
        """
        try:
            from supabase import create_client

            logger.info("🔗 Connecting to Supabase...")
            self.supabase = create_client(self.supabase_url, self.supabase_key)

            logger.info("🔐 Authenticating user...")
            auth_result = self.supabase.auth.sign_in_with_password({
                "email": self.user_email,
                "password": self.user_password
            })

            if auth_result.user:
                logger.info(f"✅ Authenticated as: {auth_result.user.email}")
                return True
            logger.error("❌ Authentication failed - no user returned")
            return False

        except ImportError as e:
            logger.error(f"❌ Supabase library not available: {e}")
            logger.error("💡 Install with: pip install supabase")
            return False
        except Exception as e:
            logger.error(f"❌ Authentication error: {e}")
            return False

    def _product_query(self):
        """Build a fresh ``products`` select query.

        Query builders in supabase-py are stateful; each request gets its
        own builder so filters/ranges never leak between calls.
        """
        return self.supabase.table('products').select('''
            product_id,
            product_name,
            product_weight,
            product_image,
            brand_id,
            brand:brands (
                brand_id,
                brand_name
            )
        ''')

    def _promo_query(self, with_images_only: bool):
        """Build a fresh ``promo_products`` select query.

        Args:
            with_images_only: When True, exclude rows with a null picture_id.
        """
        query = self.supabase.table('promo_products').select('''
            id,
            name,
            store,
            description,
            picture_id,
            regular_price,
            promo_price
        ''')
        if with_images_only:
            query = query.not_.is_('picture_id', 'null')
        return query

    def load_all_products(self) -> List[Dict[str, Any]]:
        """Load every catalogue product (with brand info), paginated.

        Returns:
            Validated product dicts; sample data when the database is
            unreachable or empty.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection - returning sample data")
            return self.get_sample_products()

        try:
            logger.info("📊 Loading all products from database...")

            # Count with a minimal column + limit(1): count='exact' returns the
            # total regardless of how many rows are actually fetched, so there
            # is no need to pull '*' just to count (as the old code did).
            count_result = self.supabase.table('products') \
                .select('product_id', count='exact').limit(1).execute()
            total_count = count_result.count
            logger.info(f"📋 Total products in database: {total_count}")

            if not total_count:
                logger.warning("❌ No products found in database")
                return self.get_sample_products()

            all_products: List[Dict[str, Any]] = []
            page = 0

            while len(all_products) < total_count:
                offset = page * self.PAGE_SIZE
                logger.info(f"⏳ Fetching products {offset + 1} to {offset + self.PAGE_SIZE}...")

                batch = self._product_query() \
                    .range(offset, offset + self.PAGE_SIZE - 1).execute().data
                if not batch:
                    break

                all_products.extend(batch)
                page += 1

            logger.info(f"✅ Loaded {len(all_products)} products successfully")
            return self._validate_products(all_products)

        except Exception as e:
            logger.error(f"❌ Error loading products: {e}")
            logger.info("🔄 Falling back to sample data")
            return self.get_sample_products()

    def load_promo_products(self, with_images_only: bool = False) -> List[Dict[str, Any]]:
        """Load promotional products, optionally only those with an image.

        Args:
            with_images_only: When True, only rows with a non-null picture_id.

        Returns:
            Promo product dicts; sample promo data on failure.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection - returning sample promo data")
            return self._get_sample_promo_products()

        try:
            logger.info("📊 Loading promo products from database...")
            if with_images_only:
                logger.info("🖼️ Filtering for promo products with images only")

            # BUGFIX: the previous implementation reused a single stateful
            # query builder for both the count (via .select('*', count=...),
            # which clobbers the column list) and every paged fetch. Counting
            # now uses its own minimal builder.
            try:
                count_query = self.supabase.table('promo_products').select('id', count='exact')
                if with_images_only:
                    count_query = count_query.not_.is_('picture_id', 'null')
                total_count = count_query.limit(1).execute().count
                logger.info(f"📋 Total promo products found: {total_count}")
            except Exception as count_error:
                logger.error(f"❌ Error getting promo count: {count_error}")
                # Fall back to an unfiltered count so we can still paginate.
                simple_count = self.supabase.table('promo_products').select('id', count='exact').execute()
                total_count = simple_count.count
                logger.info(f"📋 Simple count result: {total_count}")

            if not total_count:
                logger.warning("❌ No promo products found in database")
                return self._get_sample_promo_products()

            all_promo_products: List[Dict[str, Any]] = []
            page = 0
            logger.info(f"📦 Starting to fetch {total_count} promo products in batches...")

            while len(all_promo_products) < total_count:
                offset = page * self.PAGE_SIZE
                logger.info(f"⏳ Fetching promo products {offset + 1} to {offset + self.PAGE_SIZE}...")

                try:
                    batch = self._promo_query(with_images_only) \
                        .range(offset, offset + self.PAGE_SIZE - 1).execute().data
                    if not batch:
                        logger.info(f"📝 No more promo products found at offset {offset}")
                        break

                    all_promo_products.extend(batch)
                    page += 1
                    logger.info(f"✅ Loaded batch: {len(batch)} products (total so far: {len(all_promo_products)})")
                except Exception as batch_error:
                    logger.error(f"❌ Error loading batch at offset {offset}: {batch_error}")
                    break

            logger.info(f"✅ Successfully loaded {len(all_promo_products)} promo products from database")

            if all_promo_products:
                sample_product = all_promo_products[0]
                logger.info(f"📝 Sample promo product: {sample_product.get('name', 'No name')} from {sample_product.get('store', 'No store')}")

            return all_promo_products

        except Exception as e:
            logger.error(f"❌ Error loading promo products: {e}")
            logger.error(f"❌ Exception type: {type(e).__name__}")
            logger.error(f"❌ Exception details: {str(e)}")
            logger.info("🔄 Falling back to sample promo data")
            return self._get_sample_promo_products()

    def update_product_image(self, product_id: str, image_url: str) -> bool:
        """Store a new image URL on an existing product row.

        Args:
            product_id: Product ID to update.
            image_url: New image URL.

        Returns:
            True if the update affected a row, False otherwise.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection")
            return False

        try:
            logger.info(f"📊 Updating product {product_id} with image URL")

            result = self.supabase.table('products').update({
                'product_image': image_url
            }).eq('product_id', product_id).execute()

            if result.data:
                logger.info(f"✅ Updated product {product_id} with image")
                return True
            logger.error(f"❌ Failed to update product {product_id}")
            return False

        except Exception as e:
            logger.error(f"❌ Database update error for product {product_id}: {e}")
            return False

    def get_products_without_images(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Fetch products whose ``product_image`` column is null.

        Args:
            limit: Optional maximum number of products to return.

        Returns:
            Validated products lacking an image; empty list on failure.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection")
            return []

        try:
            logger.info("📊 Loading products without images...")

            query = self.supabase.table('products').select('''
                product_id,
                product_name,
                product_weight,
                brand:brands (
                    brand_id,
                    brand_name
                )
            ''').is_('product_image', 'null')

            if limit:
                query = query.limit(limit)

            products = query.execute().data
            logger.info(f"✅ Found {len(products)} products without images")
            return self._validate_products(products)

        except Exception as e:
            logger.error(f"❌ Error loading products without images: {e}")
            return []

    def _validate_products(self, products: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate and clean raw product rows.

        Non-dict entries and products without a usable name are dropped;
        a synthetic ``product_id`` is backfilled when missing so downstream
        code can always key on it.

        Args:
            products: Raw product list.

        Returns:
            Validated, cleaned product list.
        """
        validated = []

        for i, product in enumerate(products):
            if not isinstance(product, dict):
                logger.warning(f"⚠️ Skipping invalid product at index {i}: not a dictionary")
                continue

            if not product.get('product_id'):
                product['product_id'] = f"unknown_{i}"
                logger.warning(f"⚠️ Added missing product_id for product at index {i}")

            product_name = product.get('product_name')
            if not product_name or not product_name.strip():
                logger.warning(f"⚠️ Skipping product with missing/empty name: {product}")
                continue

            # Normalize surrounding whitespace once, here, for all consumers.
            product['product_name'] = product_name.strip()
            validated.append(product)

        logger.info(f"✅ Validated {len(validated)} products (skipped {len(products) - len(validated)})")
        return validated

    def get_sample_products(self) -> List[Dict[str, Any]]:
        """Return a fixed set of sample products for offline testing."""
        logger.info("📊 Using sample products for testing")
        return [
            {
                "product_id": "1",
                "product_name": "vindija mlijeko cokoladno 2.8%",
                "product_weight": "1L",
                "brand": {"brand_id": "1", "brand_name": "Vindija"}
            },
            {
                "product_id": "2",
                "product_name": "vindija cokoladno mlijeko 2.8%",
                "product_weight": "1L",
                "brand": {"brand_id": "1", "brand_name": "Vindija"}
            },
            {
                "product_id": "3",
                "product_name": "Apple iPhone 13 Pro",
                "product_weight": "238g",
                "brand": {"brand_id": "3", "brand_name": "Apple"}
            },
            {
                "product_id": "4",
                "product_name": "iPhone 13 Pro Apple",
                "product_weight": "238g",
                "brand": {"brand_id": "3", "brand_name": "Apple"}
            },
            {
                "product_id": "5",
                "product_name": "Samsung Galaxy S22",
                "product_weight": "167g",
                "brand": {"brand_id": "4", "brand_name": "Samsung"}
            },
            {
                "product_id": "6",
                "product_name": "Galaxy S22 Samsung",
                "product_weight": "167g",
                "brand": {"brand_id": "4", "brand_name": "Samsung"}
            },
            {
                "product_id": "7",
                "product_name": "Coca Cola 330ml",
                "product_weight": "330ml",
                "brand": {"brand_id": "7", "brand_name": "Coca-Cola"}
            },
            {
                "product_id": "8",
                "product_name": "Pepsi Cola 330ml",
                "product_weight": "330ml",
                "brand": {"brand_id": "8", "brand_name": "PepsiCo"}
            }
        ]

    def _get_sample_promo_products(self) -> List[Dict[str, Any]]:
        """Return a fixed set of sample promo products for offline testing."""
        logger.info("📊 Using sample promo products for testing")
        return [
            {"id": "p1", "name": "vindija jogurt prirodni", "store": "Konzum", "promo_price": 1.20, "regular_price": 1.50, "picture_id": "img1"},
            {"id": "p2", "name": "coca cola original 500ml", "store": "Konzum", "promo_price": 0.80, "regular_price": 1.10, "picture_id": "img2"},
            {"id": "p3", "name": "samsung galaxy phone", "store": "Links", "promo_price": 299.99, "regular_price": 399.99, "picture_id": "img3"},
        ]

    def test_connection(self) -> Dict[str, Any]:
        """Probe the database connection.

        Returns:
            A status dict with ``connected``, row counts for both tables,
            and either a truncated ``database_url`` or an ``error`` message.
        """
        if not self.supabase:
            return {
                "connected": False,
                "error": "No Supabase client initialized",
                "products_count": 0,
                "promo_products_count": 0
            }

        try:
            products_result = self.supabase.table('products').select('product_id', count='exact').limit(1).execute()
            promo_result = self.supabase.table('promo_products').select('id', count='exact').limit(1).execute()

            return {
                "connected": True,
                "products_count": products_result.count,
                "promo_products_count": promo_result.count,
                "database_url": self.supabase_url[:30] + "..." if self.supabase_url else "Unknown"
            }

        except Exception as e:
            return {
                "connected": False,
                "error": str(e),
                "products_count": 0,
                "promo_products_count": 0
            }
|
| 452 |
+
|
| 453 |
+
|
| 454 |
+
# Lazily-created, process-wide repository singleton.
_repository = None

def get_similarity_repository() -> SimilarityRepository:
    """Return the shared SimilarityRepository, constructing it on first use."""
    global _repository
    if _repository is None:
        _repository = SimilarityRepository()
    return _repository
|
product_detector/mock_detector.py
ADDED
|
@@ -0,0 +1,33 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import numpy as np
|
| 2 |
+
from typing import List, Dict
|
| 3 |
+
import warnings
|
| 4 |
+
|
| 5 |
+
class MockObjectDetector:
    """Stand-in for the (currently corrupted) ONNX object detector.

    Produces fixed dummy detections so the rest of the server keeps
    running until the real model file is restored. Mirrors the real
    detector's constructor and ``predict`` interface.
    """

    def __init__(self, model_path: str, class_names: List[str], input_size: int = 640):
        # ``model_path`` is accepted for interface parity but never loaded.
        self.class_names = class_names
        self.input_size = input_size
        print(f"🔧 Mock detector initialized - model file was corrupted")
        print(f"📝 Available classes: {class_names}")

    def predict(self, image: np.ndarray) -> List[Dict]:
        """Return one hard-coded detection regardless of image content.

        Replace with the real detector's predict() once the model is fixed.
        """
        label = "product" if len(self.class_names) > 0 else "unknown"
        detections = [
            {
                "class": label,
                "confidence": 0.85,
                # Absolute corner coordinates: x1, y1, x2, y2.
                "bbox": [100, 100, 300, 250],
                # Normalized center_x, center_y, width, height.
                "bbox_normalized": [0.3, 0.3, 0.4, 0.5]
            }
        ]
        print(f"🔍 Mock detection completed - found {len(detections)} objects")
        return detections
|
requirements.txt
CHANGED
|
@@ -12,4 +12,8 @@ python-dotenv
|
|
| 12 |
supabase
|
| 13 |
rembg
|
| 14 |
httpx
|
| 15 |
-
unidecode
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
supabase
|
| 13 |
rembg
|
| 14 |
httpx
|
| 15 |
+
unidecode
|
| 16 |
+
# Similarity Engine Dependencies
|
| 17 |
+
requests
|
| 18 |
+
python-dateutil
|
| 19 |
+
pydantic
|
similarity_engine/__init__.py
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Similarity Engine Package
|
| 3 |
+
Backend version of the Price Hunter similarity engine
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
from .similarity_core import calculate_similarity, calculate_confidence
|
| 7 |
+
from .product_comparator import ProductComparator, compare_products_batch
|
| 8 |
+
from .enhanced_image_processor import get_image_processor
|
| 9 |
+
|
| 10 |
+
__version__ = "1.0.0"
|
| 11 |
+
__all__ = [
|
| 12 |
+
"calculate_similarity",
|
| 13 |
+
"calculate_confidence",
|
| 14 |
+
"ProductComparator",
|
| 15 |
+
"compare_products_batch",
|
| 16 |
+
"get_image_processor"
|
| 17 |
+
]
|
similarity_engine/enhanced_image_processor.py
ADDED
|
@@ -0,0 +1,531 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced Image Processor - Multiple Sources & Flexible Processing
|
| 3 |
+
Supports promo products, manual uploads, URL sources, Google Images, and more
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import os
|
| 7 |
+
import logging
|
| 8 |
+
import requests
|
| 9 |
+
import time
|
| 10 |
+
from typing import List, Dict, Any, Optional, Tuple
|
| 11 |
+
import sys
|
| 12 |
+
import os
|
| 13 |
+
|
| 14 |
+
# Add parent directory to path
|
| 15 |
+
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
| 16 |
+
|
| 17 |
+
from similarity_core import calculate_similarity, calculate_confidence
|
| 18 |
+
from db.similarity_repository import get_similarity_repository
|
| 19 |
+
|
| 20 |
+
# Configure logging
|
| 21 |
+
logging.basicConfig(level=logging.INFO)
|
| 22 |
+
logger = logging.getLogger(__name__)
|
| 23 |
+
|
| 24 |
+
class EnhancedImageProcessor:
|
| 25 |
+
"""Enhanced image processor with multiple sources and flexible options"""
|
| 26 |
+
|
| 27 |
+
def __init__(self):
|
| 28 |
+
"""Initialize the image processor"""
|
| 29 |
+
self.repository = get_similarity_repository()
|
| 30 |
+
self.processing_stats = {
|
| 31 |
+
'total_processed': 0,
|
| 32 |
+
'successful': 0,
|
| 33 |
+
'failed': 0,
|
| 34 |
+
'skipped': 0
|
| 35 |
+
}
|
| 36 |
+
|
| 37 |
+
def find_high_similarity_matches(
|
| 38 |
+
self,
|
| 39 |
+
source_products: List[Dict],
|
| 40 |
+
target_products: List[Dict],
|
| 41 |
+
threshold: float = 0.95,
|
| 42 |
+
source_type: str = "promo"
|
| 43 |
+
) -> List[Dict[str, Any]]:
|
| 44 |
+
"""
|
| 45 |
+
Find high similarity matches between source and target products
|
| 46 |
+
|
| 47 |
+
Args:
|
| 48 |
+
source_products: Products with images (promo, manual, etc.)
|
| 49 |
+
target_products: Database products to match against
|
| 50 |
+
threshold: Similarity threshold
|
| 51 |
+
source_type: Type of source ("promo", "manual", "google", etc.)
|
| 52 |
+
|
| 53 |
+
Returns:
|
| 54 |
+
List of high similarity matches
|
| 55 |
+
"""
|
| 56 |
+
logger.info(f"🔍 Finding high similarity matches for {source_type} images")
|
| 57 |
+
logger.info(f"📊 Source products: {len(source_products)}")
|
| 58 |
+
logger.info(f"📊 Target products: {len(target_products)}")
|
| 59 |
+
logger.info(f"🎯 Similarity threshold: {threshold}")
|
| 60 |
+
|
| 61 |
+
matches = []
|
| 62 |
+
|
| 63 |
+
for i, source_product in enumerate(source_products):
|
| 64 |
+
source_name = source_product.get('name', '').strip()
|
| 65 |
+
if not source_name:
|
| 66 |
+
continue
|
| 67 |
+
|
| 68 |
+
logger.info(f"📊 Analyzing {source_type} product {i+1}/{len(source_products)}: {source_name[:50]}...")
|
| 69 |
+
|
| 70 |
+
for target_product in target_products:
|
| 71 |
+
target_name = target_product.get('product_name', '').strip()
|
| 72 |
+
if not target_name:
|
| 73 |
+
continue
|
| 74 |
+
|
| 75 |
+
similarity = calculate_similarity(source_name, target_name)
|
| 76 |
+
|
| 77 |
+
if similarity >= threshold:
|
| 78 |
+
confidence = calculate_confidence(similarity, source_name, target_name)
|
| 79 |
+
|
| 80 |
+
match = {
|
| 81 |
+
'source_id': source_product.get('id'),
|
| 82 |
+
'source_name': source_name,
|
| 83 |
+
'source_type': source_type,
|
| 84 |
+
'target_product_id': target_product.get('product_id'),
|
| 85 |
+
'target_product_name': target_name,
|
| 86 |
+
'similarity': round(similarity, 3),
|
| 87 |
+
'confidence': round(confidence, 3),
|
| 88 |
+
'has_current_image': bool(target_product.get('product_image')),
|
| 89 |
+
'source_image_info': self._extract_image_info(source_product, source_type)
|
| 90 |
+
}
|
| 91 |
+
|
| 92 |
+
matches.append(match)
|
| 93 |
+
logger.info(f" 🔍 HIGH MATCH: {source_name} ↔ {target_name} ({similarity:.3f})")
|
| 94 |
+
break
|
| 95 |
+
|
| 96 |
+
logger.info(f"✅ Found {len(matches)} high similarity matches")
|
| 97 |
+
return matches
|
| 98 |
+
|
| 99 |
+
def _extract_image_info(self, product: Dict, source_type: str) -> Dict[str, Any]:
|
| 100 |
+
"""Extract image information based on source type"""
|
| 101 |
+
if source_type == "promo":
|
| 102 |
+
picture_id = product.get('picture_id')
|
| 103 |
+
return {
|
| 104 |
+
'picture_id': picture_id,
|
| 105 |
+
'image_url': f"https://backend.360promo.hr/contents/products/{picture_id}.jpg" if picture_id else None,
|
| 106 |
+
'store': product.get('store'),
|
| 107 |
+
'promo_price': product.get('promo_price'),
|
| 108 |
+
'regular_price': product.get('regular_price')
|
| 109 |
+
}
|
| 110 |
+
elif source_type == "manual":
|
| 111 |
+
return {
|
| 112 |
+
'image_url': product.get('image_url'),
|
| 113 |
+
'original_filename': product.get('filename'),
|
| 114 |
+
'uploaded_by': product.get('uploaded_by')
|
| 115 |
+
}
|
| 116 |
+
elif source_type == "google":
|
| 117 |
+
return {
|
| 118 |
+
'image_url': product.get('image_url'),
|
| 119 |
+
'source_page': product.get('source_page'),
|
| 120 |
+
'search_query': product.get('search_query')
|
| 121 |
+
}
|
| 122 |
+
elif source_type == "url":
|
| 123 |
+
return {
|
| 124 |
+
'image_url': product.get('image_url'),
|
| 125 |
+
'source_domain': product.get('source_domain')
|
| 126 |
+
}
|
| 127 |
+
else:
|
| 128 |
+
return {
|
| 129 |
+
'image_url': product.get('image_url', product.get('picture_url'))
|
| 130 |
+
}
|
| 131 |
+
|
| 132 |
+
def check_image_availability(self, image_url: str) -> bool:
|
| 133 |
+
"""Check if image URL is accessible"""
|
| 134 |
+
try:
|
| 135 |
+
response = requests.head(image_url, timeout=10)
|
| 136 |
+
return response.status_code == 200
|
| 137 |
+
except Exception as e:
|
| 138 |
+
logger.warning(f"⚠️ Image not accessible: {image_url} - {e}")
|
| 139 |
+
return False
|
| 140 |
+
|
| 141 |
+
def process_image_from_url(
|
| 142 |
+
self,
|
| 143 |
+
image_url: str,
|
| 144 |
+
product_id: str,
|
| 145 |
+
processing_options: Dict[str, Any] = None
|
| 146 |
+
) -> Optional[str]:
|
| 147 |
+
"""
|
| 148 |
+
Download and process image from URL
|
| 149 |
+
|
| 150 |
+
Args:
|
| 151 |
+
image_url: Source image URL
|
| 152 |
+
product_id: Target product ID
|
| 153 |
+
processing_options: Processing configuration
|
| 154 |
+
|
| 155 |
+
Returns:
|
| 156 |
+
Processed image URL or None if failed
|
| 157 |
+
"""
|
| 158 |
+
if processing_options is None:
|
| 159 |
+
processing_options = {
|
| 160 |
+
'remove_background': True,
|
| 161 |
+
'upscale_factor': 2,
|
| 162 |
+
'target_format': 'webp',
|
| 163 |
+
'quality': 85
|
| 164 |
+
}
|
| 165 |
+
|
| 166 |
+
try:
|
| 167 |
+
logger.info(f"📥 Downloading image from: {image_url}")
|
| 168 |
+
|
| 169 |
+
# Download image
|
| 170 |
+
response = requests.get(image_url, timeout=30)
|
| 171 |
+
if response.status_code != 200:
|
| 172 |
+
logger.error(f"❌ Failed to download: HTTP {response.status_code}")
|
| 173 |
+
return None
|
| 174 |
+
|
| 175 |
+
logger.info("✅ Image downloaded successfully")
|
| 176 |
+
|
| 177 |
+
# Try to process via backend endpoint
|
| 178 |
+
processed_url = self._process_via_backend(
|
| 179 |
+
response.content,
|
| 180 |
+
product_id,
|
| 181 |
+
processing_options
|
| 182 |
+
)
|
| 183 |
+
|
| 184 |
+
if processed_url:
|
| 185 |
+
return processed_url
|
| 186 |
+
|
| 187 |
+
# If processing fails, return original URL
|
| 188 |
+
logger.warning("⚠️ Processing failed, using original URL")
|
| 189 |
+
return image_url
|
| 190 |
+
|
| 191 |
+
except Exception as e:
|
| 192 |
+
logger.error(f"❌ Error processing image from URL: {e}")
|
| 193 |
+
return None
|
| 194 |
+
|
| 195 |
+
def _process_via_backend(
|
| 196 |
+
self,
|
| 197 |
+
image_content: bytes,
|
| 198 |
+
product_id: str,
|
| 199 |
+
options: Dict[str, Any]
|
| 200 |
+
) -> Optional[str]:
|
| 201 |
+
"""Process image via backend endpoint"""
|
| 202 |
+
try:
|
| 203 |
+
# Get backend endpoint
|
| 204 |
+
endpoint = os.getenv('IMAGE_PROCESS_ENDPOINT', 'http://localhost:7860/products/process-product-image')
|
| 205 |
+
|
| 206 |
+
files = {'file': ('image.jpg', image_content, 'image/jpeg')}
|
| 207 |
+
data = {
|
| 208 |
+
'remove_bg': str(options.get('remove_background', True)).lower(),
|
| 209 |
+
'upscale': str(options.get('upscale_factor', 2) > 1).lower(),
|
| 210 |
+
'scale_factor': str(options.get('upscale_factor', 2)),
|
| 211 |
+
'process_order': 'remove_first',
|
| 212 |
+
'product_id': product_id
|
| 213 |
+
}
|
| 214 |
+
|
| 215 |
+
response = requests.post(endpoint, files=files, data=data, timeout=60)
|
| 216 |
+
|
| 217 |
+
if response.status_code == 200:
|
| 218 |
+
result = response.json()
|
| 219 |
+
if result.get('status') == 'success':
|
| 220 |
+
logger.info("✅ Image processed successfully via backend")
|
| 221 |
+
return result.get('image_url')
|
| 222 |
+
|
| 223 |
+
logger.warning(f"⚠️ Backend processing failed: {response.status_code}")
|
| 224 |
+
return None
|
| 225 |
+
|
| 226 |
+
except Exception as e:
|
| 227 |
+
logger.warning(f"⚠️ Backend processing unavailable: {e}")
|
| 228 |
+
return None
|
| 229 |
+
|
| 230 |
+
def process_promo_images(
|
| 231 |
+
self,
|
| 232 |
+
similarity_threshold: float = 0.95,
|
| 233 |
+
skip_existing: bool = True,
|
| 234 |
+
max_products: Optional[int] = None
|
| 235 |
+
) -> Dict[str, int]:
|
| 236 |
+
"""
|
| 237 |
+
Process images from promotional products
|
| 238 |
+
|
| 239 |
+
Args:
|
| 240 |
+
similarity_threshold: Minimum similarity for processing
|
| 241 |
+
skip_existing: Skip products that already have images
|
| 242 |
+
max_products: Maximum products to process
|
| 243 |
+
|
| 244 |
+
Returns:
|
| 245 |
+
Processing statistics
|
| 246 |
+
"""
|
| 247 |
+
logger.info("🏷️ Starting promo image processing...")
|
| 248 |
+
|
| 249 |
+
# Load promo products with images
|
| 250 |
+
promo_products = self.repository.load_promo_products(with_images_only=True)
|
| 251 |
+
if not promo_products:
|
| 252 |
+
logger.error("❌ No promo products with images found")
|
| 253 |
+
return self._get_empty_stats()
|
| 254 |
+
|
| 255 |
+
# Load target products
|
| 256 |
+
if skip_existing:
|
| 257 |
+
target_products = self.repository.get_products_without_images(max_products)
|
| 258 |
+
else:
|
| 259 |
+
all_products = self.repository.load_all_products()
|
| 260 |
+
target_products = all_products[:max_products] if max_products else all_products
|
| 261 |
+
|
| 262 |
+
if not target_products:
|
| 263 |
+
logger.error("❌ No target products found")
|
| 264 |
+
return self._get_empty_stats()
|
| 265 |
+
|
| 266 |
+
# Find matches
|
| 267 |
+
matches = self.find_high_similarity_matches(
|
| 268 |
+
promo_products,
|
| 269 |
+
target_products,
|
| 270 |
+
similarity_threshold,
|
| 271 |
+
"promo"
|
| 272 |
+
)
|
| 273 |
+
|
| 274 |
+
return self._process_matches(matches, skip_existing)
|
| 275 |
+
|
| 276 |
+
def process_manual_upload(
|
| 277 |
+
self,
|
| 278 |
+
image_file: bytes,
|
| 279 |
+
filename: str,
|
| 280 |
+
product_id: str,
|
| 281 |
+
processing_options: Dict[str, Any] = None
|
| 282 |
+
) -> bool:
|
| 283 |
+
"""
|
| 284 |
+
Process manually uploaded image
|
| 285 |
+
|
| 286 |
+
Args:
|
| 287 |
+
image_file: Image file content
|
| 288 |
+
filename: Original filename
|
| 289 |
+
product_id: Target product ID
|
| 290 |
+
processing_options: Processing configuration
|
| 291 |
+
|
| 292 |
+
Returns:
|
| 293 |
+
True if successful
|
| 294 |
+
"""
|
| 295 |
+
logger.info(f"📤 Processing manual upload for product {product_id}")
|
| 296 |
+
|
| 297 |
+
try:
|
| 298 |
+
# Process image
|
| 299 |
+
processed_url = self._process_via_backend(
|
| 300 |
+
image_file,
|
| 301 |
+
product_id,
|
| 302 |
+
processing_options or {}
|
| 303 |
+
)
|
| 304 |
+
|
| 305 |
+
if not processed_url:
|
| 306 |
+
logger.error("❌ Failed to process uploaded image")
|
| 307 |
+
return False
|
| 308 |
+
|
| 309 |
+
# Update database
|
| 310 |
+
success = self.repository.update_product_image(product_id, processed_url)
|
| 311 |
+
|
| 312 |
+
if success:
|
| 313 |
+
# Save metadata
|
| 314 |
+
self.repository.save_image_metadata(product_id, {
|
| 315 |
+
'source_type': 'manual',
|
| 316 |
+
'original_filename': filename,
|
| 317 |
+
'processed_url': processed_url,
|
| 318 |
+
'upload_time': time.time()
|
| 319 |
+
})
|
| 320 |
+
|
| 321 |
+
logger.info(f"✅ Successfully attached manual upload to product {product_id}")
|
| 322 |
+
return True
|
| 323 |
+
|
| 324 |
+
return False
|
| 325 |
+
|
| 326 |
+
except Exception as e:
|
| 327 |
+
logger.error(f"❌ Error processing manual upload: {e}")
|
| 328 |
+
return False
|
| 329 |
+
|
| 330 |
+
def process_from_url_list(
|
| 331 |
+
self,
|
| 332 |
+
url_mappings: List[Dict[str, str]],
|
| 333 |
+
processing_options: Dict[str, Any] = None
|
| 334 |
+
) -> Dict[str, int]:
|
| 335 |
+
"""
|
| 336 |
+
Process images from a list of URL mappings
|
| 337 |
+
|
| 338 |
+
Args:
|
| 339 |
+
url_mappings: List of {'product_id': 'xxx', 'image_url': 'xxx'} mappings
|
| 340 |
+
processing_options: Processing configuration
|
| 341 |
+
|
| 342 |
+
Returns:
|
| 343 |
+
Processing statistics
|
| 344 |
+
"""
|
| 345 |
+
logger.info(f"🌐 Processing {len(url_mappings)} URL mappings...")
|
| 346 |
+
|
| 347 |
+
stats = self._get_empty_stats()
|
| 348 |
+
stats['total_processed'] = len(url_mappings)
|
| 349 |
+
|
| 350 |
+
for mapping in url_mappings:
|
| 351 |
+
product_id = mapping.get('product_id')
|
| 352 |
+
image_url = mapping.get('image_url')
|
| 353 |
+
|
| 354 |
+
if not product_id or not image_url:
|
| 355 |
+
stats['failed'] += 1
|
| 356 |
+
continue
|
| 357 |
+
|
| 358 |
+
logger.info(f"📊 Processing URL for product {product_id}")
|
| 359 |
+
|
| 360 |
+
# Check availability
|
| 361 |
+
if not self.check_image_availability(image_url):
|
| 362 |
+
stats['failed'] += 1
|
| 363 |
+
continue
|
| 364 |
+
|
| 365 |
+
# Process image
|
| 366 |
+
processed_url = self.process_image_from_url(
|
| 367 |
+
image_url,
|
| 368 |
+
product_id,
|
| 369 |
+
processing_options
|
| 370 |
+
)
|
| 371 |
+
|
| 372 |
+
if processed_url:
|
| 373 |
+
# Update database
|
| 374 |
+
if self.repository.update_product_image(product_id, processed_url):
|
| 375 |
+
stats['successful'] += 1
|
| 376 |
+
|
| 377 |
+
# Save metadata
|
| 378 |
+
self.repository.save_image_metadata(product_id, {
|
| 379 |
+
'source_type': 'url',
|
| 380 |
+
'source_url': image_url,
|
| 381 |
+
'processed_url': processed_url,
|
| 382 |
+
'processing_time': time.time()
|
| 383 |
+
})
|
| 384 |
+
else:
|
| 385 |
+
stats['failed'] += 1
|
| 386 |
+
else:
|
| 387 |
+
stats['failed'] += 1
|
| 388 |
+
|
| 389 |
+
logger.info(f"✅ URL processing complete: {stats['successful']}/{stats['total_processed']} successful")
|
| 390 |
+
return stats
|
| 391 |
+
|
| 392 |
+
def search_and_attach_google_images(
|
| 393 |
+
self,
|
| 394 |
+
product_id: str,
|
| 395 |
+
search_query: str,
|
| 396 |
+
max_results: int = 3,
|
| 397 |
+
require_approval: bool = True
|
| 398 |
+
) -> List[Dict[str, Any]]:
|
| 399 |
+
"""
|
| 400 |
+
Search Google Images and find potential matches
|
| 401 |
+
|
| 402 |
+
Args:
|
| 403 |
+
product_id: Target product ID
|
| 404 |
+
search_query: Search query for Google Images
|
| 405 |
+
max_results: Maximum results to return
|
| 406 |
+
require_approval: Whether manual approval is required
|
| 407 |
+
|
| 408 |
+
Returns:
|
| 409 |
+
List of potential image matches
|
| 410 |
+
"""
|
| 411 |
+
logger.info(f"🔍 Google Image search for product {product_id}: '{search_query}'")
|
| 412 |
+
|
| 413 |
+
# TODO: Implement Google Images API integration
|
| 414 |
+
# For now, return mock results
|
| 415 |
+
mock_results = [
|
| 416 |
+
{
|
| 417 |
+
'image_url': f'https://example.com/mock-image-1.jpg',
|
| 418 |
+
'thumbnail_url': f'https://example.com/mock-thumb-1.jpg',
|
| 419 |
+
'source_page': f'https://example.com/product-page-1',
|
| 420 |
+
'title': f'Mock result for {search_query}',
|
| 421 |
+
'confidence': 0.85
|
| 422 |
+
}
|
| 423 |
+
]
|
| 424 |
+
|
| 425 |
+
logger.info(f"🔍 Found {len(mock_results)} potential Google Image matches")
|
| 426 |
+
logger.warning("⚠️ Google Images integration not yet implemented - returning mock data")
|
| 427 |
+
|
| 428 |
+
return mock_results
|
| 429 |
+
|
| 430 |
+
def _process_matches(self, matches: List[Dict], skip_existing: bool = True) -> Dict[str, int]:
|
| 431 |
+
"""Process similarity matches and attach images"""
|
| 432 |
+
stats = self._get_empty_stats()
|
| 433 |
+
stats['total_processed'] = len(matches)
|
| 434 |
+
|
| 435 |
+
if not matches:
|
| 436 |
+
return stats
|
| 437 |
+
|
| 438 |
+
# Filter existing if needed
|
| 439 |
+
if skip_existing:
|
| 440 |
+
to_process = [m for m in matches if not m['has_current_image']]
|
| 441 |
+
stats['skipped'] = len(matches) - len(to_process)
|
| 442 |
+
matches = to_process
|
| 443 |
+
|
| 444 |
+
logger.info(f"📊 Processing images for {len(matches)} products...")
|
| 445 |
+
|
| 446 |
+
for match in matches:
|
| 447 |
+
product_id = match['target_product_id']
|
| 448 |
+
image_info = match['source_image_info']
|
| 449 |
+
image_url = image_info.get('image_url')
|
| 450 |
+
|
| 451 |
+
if not image_url:
|
| 452 |
+
stats['failed'] += 1
|
| 453 |
+
continue
|
| 454 |
+
|
| 455 |
+
logger.info(f"📊 Processing image for product {product_id}")
|
| 456 |
+
|
| 457 |
+
# Check availability
|
| 458 |
+
if not self.check_image_availability(image_url):
|
| 459 |
+
stats['failed'] += 1
|
| 460 |
+
continue
|
| 461 |
+
|
| 462 |
+
# Process image
|
| 463 |
+
processed_url = self.process_image_from_url(image_url, product_id)
|
| 464 |
+
|
| 465 |
+
if processed_url and self.repository.update_product_image(product_id, processed_url):
|
| 466 |
+
stats['successful'] += 1
|
| 467 |
+
|
| 468 |
+
# Save metadata
|
| 469 |
+
self.repository.save_image_metadata(product_id, {
|
| 470 |
+
'source_type': match['source_type'],
|
| 471 |
+
'similarity': match['similarity'],
|
| 472 |
+
'confidence': match['confidence'],
|
| 473 |
+
'source_info': image_info,
|
| 474 |
+
'processing_time': time.time()
|
| 475 |
+
})
|
| 476 |
+
|
| 477 |
+
logger.info(f"✅ Successfully attached image to product {product_id}")
|
| 478 |
+
else:
|
| 479 |
+
stats['failed'] += 1
|
| 480 |
+
|
| 481 |
+
return stats
|
| 482 |
+
|
| 483 |
+
def _get_empty_stats(self) -> Dict[str, int]:
|
| 484 |
+
"""Get empty statistics dictionary"""
|
| 485 |
+
return {
|
| 486 |
+
'total_processed': 0,
|
| 487 |
+
'successful': 0,
|
| 488 |
+
'failed': 0,
|
| 489 |
+
'skipped': 0,
|
| 490 |
+
'unavailable': 0
|
| 491 |
+
}
|
| 492 |
+
|
| 493 |
+
def get_processing_report(self, stats: Dict[str, int]) -> Dict[str, Any]:
|
| 494 |
+
"""Generate processing report"""
|
| 495 |
+
return {
|
| 496 |
+
'summary': {
|
| 497 |
+
'total_processed': stats['total_processed'],
|
| 498 |
+
'successful': stats['successful'],
|
| 499 |
+
'failed': stats['failed'],
|
| 500 |
+
'skipped': stats.get('skipped', 0),
|
| 501 |
+
'success_rate': (stats['successful'] / max(stats['total_processed'], 1)) * 100
|
| 502 |
+
},
|
| 503 |
+
'timestamp': time.time(),
|
| 504 |
+
'recommendations': self._generate_recommendations(stats)
|
| 505 |
+
}
|
| 506 |
+
|
| 507 |
+
def _generate_recommendations(self, stats: Dict[str, int]) -> List[str]:
|
| 508 |
+
"""Generate recommendations based on processing stats"""
|
| 509 |
+
recommendations = []
|
| 510 |
+
|
| 511 |
+
if stats['failed'] > stats['successful']:
|
| 512 |
+
recommendations.append("High failure rate - check image sources and processing settings")
|
| 513 |
+
|
| 514 |
+
if stats.get('skipped', 0) > 0:
|
| 515 |
+
recommendations.append(f"{stats['skipped']} products already had images - consider processing all products")
|
| 516 |
+
|
| 517 |
+
if stats['successful'] > 0:
|
| 518 |
+
recommendations.append(f"Successfully processed {stats['successful']} images - consider similar processing for remaining products")
|
| 519 |
+
|
| 520 |
+
return recommendations
|
| 521 |
+
|
| 522 |
+
|
| 523 |
+
# Global processor instance (lazily created module-level singleton)
_processor = None

def get_image_processor() -> EnhancedImageProcessor:
    """Get singleton image processor instance.

    Returns:
        The shared EnhancedImageProcessor, constructed on first call and
        reused afterwards.

    NOTE(review): not thread-safe — two concurrent first calls could each
    construct an instance; confirm whether this runs under a threaded server.
    """
    global _processor
    if _processor is None:
        _processor = EnhancedImageProcessor()
    return _processor
|
similarity_engine/product_comparator.py
ADDED
|
@@ -0,0 +1,300 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Product Comparator - Backend Version
|
| 3 |
+
Modified to return data instead of printing for API usage
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import time
|
| 7 |
+
import logging
|
| 8 |
+
from typing import List, Dict, Any
|
| 9 |
+
import sys
|
| 10 |
+
import os
|
| 11 |
+
|
| 12 |
+
# Add the similarity_engine directory to path
|
| 13 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 14 |
+
sys.path.append(current_dir)
|
| 15 |
+
|
| 16 |
+
from similarity_core import calculate_similarity, calculate_confidence
|
| 17 |
+
|
| 18 |
+
# Configure logging
|
| 19 |
+
logging.basicConfig(level=logging.INFO)
|
| 20 |
+
logger = logging.getLogger(__name__)
|
| 21 |
+
|
| 22 |
+
class ProductComparator:
    """
    Backend version of ProductComparator that returns analysis results
    as data structures (for API usage) instead of printing them.
    """

    def __init__(self, threshold: float = 0.87, min_length_diff: int = 3):
        """
        Initialize the comparator

        Args:
            threshold: Similarity threshold for considering products as duplicates
            min_length_diff: Minimum length difference to consider as different products
        """
        self.threshold = threshold
        self.min_length_diff = min_length_diff
        # Updated by find_all_duplicates() so get_statistics() reflects the last run
        self.comparison_count = 0
        self.duplicates_found = 0

    def is_valid_comparison(self, name1: str, name2: str) -> bool:
        """
        Check if this comparison should be made to avoid false duplicates

        Args:
            name1: First product name
            name2: Second product name

        Returns:
            True if this comparison should be made
        """
        # Skip empty names
        if not name1 or not name2:
            return False

        # Skip identical names (already handled)
        if name1 == name2:
            return False

        # Skip if one is contained in the other without significant differences
        if (name1 in name2 or name2 in name1) and abs(len(name1) - len(name2)) < self.min_length_diff:
            return False

        return True

    def find_all_duplicates(self, products: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Find all duplicate pairs in product list (O(n^2) pairwise scan).

        Args:
            products: List of product dictionaries with 'product_id' and 'product_name'

        Returns:
            Dictionary with analysis results and duplicate pairs
        """
        start_time = time.time()
        total_products = len(products)
        total_comparisons = total_products * (total_products - 1) // 2
        duplicates = []

        logger.info(f"🔍 Starting duplicate analysis for {total_products} products")
        logger.info(f"📊 Total comparisons needed: {total_comparisons:,}")
        logger.info(f"🎯 Duplicate threshold: {self.threshold}")

        comparison_count = 0
        progress_updates = []

        for i, product1 in enumerate(products):
            product_name1 = product1.get('product_name', '').strip()
            product_id1 = product1.get('product_id', 'unknown')

            # Skip products with empty names
            if not product_name1:
                continue

            # Log progress every 100 products
            if i % 100 == 0:
                progress_updates.append(f"Analyzing product {i+1}/{total_products}")
                logger.info(f"📊 Progress: {i+1}/{total_products} products analyzed")

            # Only compare forward to avoid visiting each pair twice
            for product2 in products[i+1:]:
                product_name2 = product2.get('product_name', '').strip()
                product_id2 = product2.get('product_id', 'unknown')

                # Skip products with empty names or identical IDs
                if not product_name2 or product_id1 == product_id2:
                    continue

                # Skip invalid comparisons
                if not self.is_valid_comparison(product_name1, product_name2):
                    continue

                comparison_count += 1

                # Calculate similarity
                similarity = calculate_similarity(product_name1, product_name2)

                # Check if it's a duplicate
                if similarity >= self.threshold:
                    confidence = calculate_confidence(
                        similarity,
                        product_name1,
                        product_name2
                    )

                    duplicates.append({
                        'product1_id': product_id1,
                        'product1_name': product_name1,
                        'product2_id': product_id2,
                        'product2_name': product_name2,
                        'similarity': round(similarity, 3),
                        'confidence': round(confidence, 3)
                    })
                    logger.info(f" 🔍 DUPLICATE FOUND: {product_name1} ↔ {product_name2} ({similarity:.3f})")

        analysis_time = time.time() - start_time

        # BUG FIX: these instance counters were never updated before, so
        # get_statistics() always reported zeros after an analysis run.
        self.comparison_count = comparison_count
        self.duplicates_found = len(duplicates)

        # Return comprehensive results
        results = {
            'duplicates': duplicates,
            'analysis_summary': {
                'total_products': total_products,
                'total_comparisons': comparison_count,
                'duplicates_found': len(duplicates),
                'duplicate_rate': (len(duplicates) / max(total_products, 1)) * 100,
                'analysis_time_seconds': round(analysis_time, 2),
                'threshold_used': self.threshold,
                'success': True
            },
            'progress_log': progress_updates,
            'recommendations': self._generate_recommendations(duplicates, total_products)
        }

        logger.info(f"✅ Analysis complete: {len(duplicates)} duplicates found in {analysis_time:.1f}s")
        return results

    def _generate_recommendations(self, duplicates: List[Dict], total_products: int) -> List[str]:
        """Generate recommendations based on analysis results"""
        recommendations = []

        if not duplicates:
            recommendations.append("✅ No duplicates found - your database is clean!")
            recommendations.append("💡 Consider periodic duplicate checks as you add new products")
        else:
            duplicate_rate = (len(duplicates) / total_products) * 100

            if duplicate_rate > 10:
                recommendations.append("⚠️ High duplicate rate detected - consider cleaning database")
                recommendations.append("🔧 Review product naming conventions to reduce future duplicates")
            elif duplicate_rate > 5:
                recommendations.append("💡 Moderate duplicate rate - review and merge similar products")
            else:
                recommendations.append("✅ Low duplicate rate - good database quality")

            recommendations.append(f"📋 Review {len(duplicates)} duplicate pairs for manual decision")
            recommendations.append("💡 Higher similarity scores indicate more confident matches")

        return recommendations

    def compare_two_products(self, product1_name: str, product2_name: str) -> Dict[str, Any]:
        """
        Compare two specific products

        Args:
            product1_name: First product name
            product2_name: Second product name

        Returns:
            Comparison results
        """
        logger.info(f"🔍 Comparing '{product1_name}' vs '{product2_name}'")

        similarity = calculate_similarity(product1_name, product2_name)
        confidence = calculate_confidence(similarity, product1_name, product2_name)
        is_duplicate = similarity >= self.threshold

        # Determine assessment
        if similarity >= 0.95:
            assessment = "Perfect match - identical products"
            assessment_emoji = "✅"
        elif similarity >= 0.8:
            assessment = "Very similar - likely duplicates"
            assessment_emoji = "⚠️"
        elif similarity >= 0.7:
            assessment = "Similar - review recommended"
            assessment_emoji = "🤔"
        else:
            assessment = "Different products"
            assessment_emoji = "❌"

        results = {
            'product1_name': product1_name,
            'product2_name': product2_name,
            'similarity': round(similarity, 3),
            'confidence': round(confidence, 3),
            'is_duplicate': is_duplicate,
            'threshold_used': self.threshold,
            'assessment': {
                'description': assessment,
                'emoji': assessment_emoji,
                'category': 'identical' if similarity >= 0.95 else
                            'very_similar' if similarity >= 0.8 else
                            'similar' if similarity >= 0.7 else 'different'
            },
            'percentage_similarity': round(similarity * 100, 1),
            'percentage_confidence': round(confidence * 100, 1)
        }

        logger.info(f"📊 Comparison result: {similarity:.3f} similarity, {assessment}")
        return results

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get analysis statistics

        Returns:
            Dictionary with analysis statistics (populated after
            find_all_duplicates() has been run at least once)
        """
        return {
            'comparison_count': self.comparison_count,
            'duplicates_found': self.duplicates_found,
            'threshold': self.threshold,
            'min_length_diff': self.min_length_diff
        }
|
| 247 |
+
|
| 248 |
+
|
| 249 |
+
def compare_products_batch(
    products: List[Dict[str, Any]],
    threshold: float = 0.87,
    return_summary_only: bool = False
) -> Dict[str, Any]:
    """
    Run a duplicate analysis over *products* and return the results.

    Args:
        products: Product dictionaries with 'product_id' and 'product_name'
        threshold: Similarity score above which a pair counts as a duplicate
        return_summary_only: When True, strip the detailed duplicate list

    Returns:
        Analysis results dictionary
    """
    logger.info(f"🚀 Starting batch product comparison with threshold {threshold}")

    analysis = ProductComparator(threshold=threshold).find_all_duplicates(products)

    if not return_summary_only:
        return analysis

    # Compact form for callers that only need aggregate numbers
    return {
        'analysis_summary': analysis['analysis_summary'],
        'recommendations': analysis['recommendations'],
    }
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def find_product_duplicates_simple(
    product_names: List[str],
    threshold: float = 0.87
) -> List[Dict[str, Any]]:
    """
    Find duplicate pairs within a plain list of product names.

    Args:
        product_names: Product names to compare against each other
        threshold: Similarity score above which a pair counts as a duplicate

    Returns:
        List of duplicate-pair dictionaries
    """
    # Wrap names in the dictionary shape the comparator expects,
    # using the list position as a synthetic product id.
    products = []
    for idx, name in enumerate(product_names):
        if name and name.strip():
            products.append({"product_id": str(idx), "product_name": name})

    analysis = compare_products_batch(products, threshold, return_summary_only=False)
    return analysis.get('duplicates', [])
|
similarity_engine/promo_comparator.py
ADDED
|
@@ -0,0 +1,278 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""
|
| 3 |
+
Promo Products Comparator - Compare promo products against database products
|
| 4 |
+
Integrates with the existing similarity engine for duplicate detection
|
| 5 |
+
Backend version with streaming support
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import sys
|
| 9 |
+
import os
|
| 10 |
+
import time
|
| 11 |
+
import logging
|
| 12 |
+
from typing import List, Dict, Any, AsyncGenerator
|
| 13 |
+
import asyncio
|
| 14 |
+
|
| 15 |
+
# Add current directory to path for imports
|
| 16 |
+
current_dir = os.path.dirname(os.path.abspath(__file__))
|
| 17 |
+
sys.path.append(current_dir)
|
| 18 |
+
|
| 19 |
+
try:
|
| 20 |
+
from similarity_core import calculate_similarity, calculate_confidence
|
| 21 |
+
from similarity_repository import get_similarity_repository
|
| 22 |
+
except ImportError as e:
|
| 23 |
+
print(f"❌ Import error: {e}")
|
| 24 |
+
print("💡 Make sure you're running this from the similarity_engine directory")
|
| 25 |
+
|
| 26 |
+
# Configure logging
|
| 27 |
+
logging.basicConfig(level=logging.INFO)
|
| 28 |
+
logger = logging.getLogger(__name__)
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
class PromoComparator:
|
| 32 |
+
"""Handles promo product comparison with streaming support"""
|
| 33 |
+
|
| 34 |
+
    def __init__(self, threshold: float = 0.85):
        """Initialize the promo comparator.

        Args:
            threshold: Minimum similarity score for a promo/db pair to
                count as a match.
        """
        self.threshold = threshold
        # Data-access layer, obtained once so every call reuses the same instance
        self.repository = get_similarity_repository()
|
| 37 |
+
|
| 38 |
+
def compare_promo_against_database(self, promo_products: List[Dict], db_products: List[Dict]) -> List[Dict]:
|
| 39 |
+
"""
|
| 40 |
+
Compare promo products against database products for potential matches
|
| 41 |
+
|
| 42 |
+
Args:
|
| 43 |
+
promo_products: List of promo product dictionaries
|
| 44 |
+
db_products: List of database product dictionaries
|
| 45 |
+
|
| 46 |
+
Returns:
|
| 47 |
+
List of potential matches
|
| 48 |
+
"""
|
| 49 |
+
logger.info(f"🔍 Starting promo comparison: {len(promo_products)} promo vs {len(db_products)} db products")
|
| 50 |
+
|
| 51 |
+
matches = []
|
| 52 |
+
total_comparisons = len(promo_products)
|
| 53 |
+
|
| 54 |
+
start_time = time.time()
|
| 55 |
+
|
| 56 |
+
for i, promo_product in enumerate(promo_products):
|
| 57 |
+
promo_name = promo_product.get('name', '').strip()
|
| 58 |
+
promo_store = promo_product.get('store', '').strip()
|
| 59 |
+
|
| 60 |
+
if not promo_name:
|
| 61 |
+
continue
|
| 62 |
+
|
| 63 |
+
logger.info(f"📊 Analyzing promo product {i+1}/{total_comparisons}: '{promo_name[:50]}'")
|
| 64 |
+
|
| 65 |
+
best_match = None
|
| 66 |
+
best_similarity = 0.0
|
| 67 |
+
|
| 68 |
+
# Compare against all database products
|
| 69 |
+
for db_product in db_products:
|
| 70 |
+
db_name = db_product.get('product_name', '').strip()
|
| 71 |
+
|
| 72 |
+
if not db_name:
|
| 73 |
+
continue
|
| 74 |
+
|
| 75 |
+
# Calculate similarity
|
| 76 |
+
similarity = calculate_similarity(promo_name, db_name)
|
| 77 |
+
|
| 78 |
+
if similarity >= self.threshold and similarity > best_similarity:
|
| 79 |
+
best_similarity = similarity
|
| 80 |
+
confidence = calculate_confidence(similarity, promo_name, db_name)
|
| 81 |
+
|
| 82 |
+
best_match = {
|
| 83 |
+
'promo_id': promo_product.get('id'),
|
| 84 |
+
'promo_name': promo_name,
|
| 85 |
+
'promo_store': promo_store,
|
| 86 |
+
'promo_price': promo_product.get('promo_price', 0),
|
| 87 |
+
'regular_price': promo_product.get('regular_price', 0),
|
| 88 |
+
'picture_id': promo_product.get('picture_id'),
|
| 89 |
+
'db_product_id': db_product.get('product_id'),
|
| 90 |
+
'db_product_name': db_name,
|
| 91 |
+
'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
|
| 92 |
+
'similarity': round(similarity, 3),
|
| 93 |
+
'confidence': round(confidence, 3)
|
| 94 |
+
}
|
| 95 |
+
|
| 96 |
+
if best_match:
|
| 97 |
+
matches.append(best_match)
|
| 98 |
+
logger.info(f"🔍 MATCH FOUND: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")
|
| 99 |
+
|
| 100 |
+
end_time = time.time()
|
| 101 |
+
analysis_time = end_time - start_time
|
| 102 |
+
|
| 103 |
+
logger.info(f"✅ Comparison complete! Found {len(matches)} potential matches in {analysis_time:.1f} seconds")
|
| 104 |
+
|
| 105 |
+
return matches
|
| 106 |
+
|
| 107 |
+
async def compare_promo_streaming(self, promo_products: List[Dict], db_products: List[Dict]) -> AsyncGenerator[Dict, None]:
    """
    Compare promo products with real-time streaming updates.

    Same best-match search as the batch comparison, but yields progress
    events so a caller (e.g. an SSE or websocket endpoint) can relay them
    to a client while the scan is still running.

    Args:
        promo_products: List of promo product dictionaries
        db_products: List of database product dictionaries

    Yields:
        Dict events, in order:
        - 'init'        once, with totals and the threshold in use
        - 'comparing'   once per promo product, with progress percentage
        - 'match_found' whenever a promo product's best match clears the
                        threshold (at most one per promo product)
        - 'complete'    once at the end, with all matches and a summary
    """
    # Announce the overall workload before any comparisons run.
    yield {
        'type': 'init',
        'total_promo_products': len(promo_products),
        'total_db_products': len(db_products),
        'threshold': self.threshold
    }

    matches = []
    total_comparisons = len(promo_products)

    start_time = time.time()

    for i, promo_product in enumerate(promo_products):
        promo_name = promo_product.get('name', '').strip()
        promo_store = promo_product.get('store', '').strip()

        # Unnamed promo entries cannot be matched; skip silently.
        if not promo_name:
            continue

        # Send current comparison info
        yield {
            'type': 'comparing',
            'current': i + 1,
            'total': total_comparisons,
            'promo_name': promo_name[:50],
            'promo_store': promo_store,
            'progress': round(((i + 1) / total_comparisons) * 100, 1)
        }

        best_match = None
        best_similarity = 0.0
        # NOTE(review): counted but never reported in any yielded event —
        # confirm whether per-promo comparison counts should be surfaced.
        comparisons_for_this_promo = 0

        # Compare against all database products
        for db_product in db_products:
            db_name = db_product.get('product_name', '').strip()

            if not db_name:
                continue

            # Calculate similarity
            similarity = calculate_similarity(promo_name, db_name)
            comparisons_for_this_promo += 1

            # Keep only the single best candidate above the threshold.
            if similarity >= self.threshold and similarity > best_similarity:
                best_similarity = similarity
                confidence = calculate_confidence(similarity, promo_name, db_name)

                best_match = {
                    'promo_id': promo_product.get('id'),
                    'promo_name': promo_name,
                    'promo_store': promo_store,
                    'promo_price': promo_product.get('promo_price', 0),
                    'regular_price': promo_product.get('regular_price', 0),
                    'picture_id': promo_product.get('picture_id'),
                    'db_product_id': db_product.get('product_id'),
                    'db_product_name': db_name,
                    'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
                    'similarity': round(similarity, 3),
                    'confidence': round(confidence, 3)
                }

        # If match found, send immediate update
        if best_match:
            matches.append(best_match)
            yield {
                'type': 'match_found',
                'match': best_match,
                'total_matches': len(matches)
            }

        # Small delay to prevent overwhelming
        await asyncio.sleep(0.01)

    # Send final results
    analysis_time = time.time() - start_time
    yield {
        'type': 'complete',
        'matches': matches,
        'summary': {
            'total_promo_products': len(promo_products),
            'total_db_products': len(db_products),
            'matches_found': len(matches),
            'match_rate': round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
            'analysis_time_seconds': round(analysis_time, 2),
            'threshold_used': self.threshold
        }
    }
def run_promo_comparison(self, save_report: bool = True) -> Dict[str, Any]:
    """
    Main function to run promo product comparison.

    Loads promotional and database products through the repository, runs
    the synchronous comparison, and packages the matches with a summary.

    Args:
        save_report: Whether to save results to file.
            NOTE(review): not referenced anywhere in this method body — no
            report is written; confirm whether file output was intended.

    Returns:
        Dict with 'matches' and 'summary' keys on success, or a dict with
        a single 'error' key when the repository or either product list is
        unavailable.
    """
    logger.info("🏷️ Starting promo products comparison")

    # Bail out early when no data source is configured.
    if not self.repository:
        logger.error("❌ Repository not available")
        return {'error': 'Repository not available'}

    # Load promo products
    logger.info("📊 Loading promotional products...")
    promo_products = self.repository.load_promo_products()

    if not promo_products:
        logger.warning("❌ No promo products found")
        return {'error': 'No promo products found'}

    # Load database products
    logger.info("📊 Loading database products...")
    db_products = self.repository.load_all_products()

    if not db_products:
        logger.warning("❌ No database products found")
        return {'error': 'No database products found'}

    # Run comparison
    matches = self.compare_promo_against_database(promo_products, db_products)

    # Prepare results
    results = {
        'matches': matches,
        'summary': {
            'total_promo_products': len(promo_products),
            'total_db_products': len(db_products),
            'matches_found': len(matches),
            'match_rate': round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
            'threshold_used': self.threshold
        }
    }

    logger.info(f"✅ Promo comparison complete: {len(matches)} matches found")

    return results
def get_promo_comparator(threshold: float = 0.85) -> PromoComparator:
    """Build and return a ``PromoComparator`` configured with *threshold*."""
    comparator = PromoComparator(threshold=threshold)
    return comparator
def run_promo_comparison(threshold: float = 0.85, save_report: bool = True):
    """
    Backwards-compatible entry point: construct a PromoComparator and
    delegate to its ``run_promo_comparison`` method.

    Args:
        threshold: Similarity threshold for matches
        save_report: Whether to save results to file
    """
    return PromoComparator(threshold=threshold).run_promo_comparison(save_report=save_report)
# Allow running this module directly as a standalone comparison script.
if __name__ == "__main__":
    run_promo_comparison()
similarity_engine/similarity_core.py
ADDED
|
@@ -0,0 +1,190 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
COMPLETE Similarity Engine - Core Module
|
| 3 |
+
All original methods preserved with improved duplicate detection
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import re
|
| 7 |
+
from typing import Set
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def normalize_product_name(name: str) -> str:
    """Lower-case a product name, strip punctuation, collapse whitespace,
    and drop common Croatian/English stop words."""
    if not name:
        return ""
    cleaned = re.sub(r'[^\w\s]', ' ', name.lower())
    cleaned = re.sub(r'\s+', ' ', cleaned).strip()
    ignored = {'za', 'i', 'u', 'na', 'sa', 'od', 'the', 'and', 'of', 'in'}
    kept = [token for token in cleaned.split() if token not in ignored]
    return ' '.join(kept)
def word_order_similarity(s1: str, s2: str) -> float:
    """Jaccard overlap of the normalized word sets of both names.

    Despite the name, the score is independent of word order: it measures
    how much the two vocabularies overlap (|A ∩ B| / |A ∪ B|).
    """
    norm_a = normalize_product_name(s1)
    norm_b = normalize_product_name(s2)
    if not norm_a or not norm_b:
        return 0.0
    set_a = set(norm_a.split())
    set_b = set(norm_b.split())
    if not set_a and not set_b:
        return 1.0
    if not set_a or not set_b:
        return 0.0
    union_size = len(set_a | set_b)
    return len(set_a & set_b) / union_size if union_size > 0 else 0.0
def dice_coefficient(s1: str, s2: str) -> float:
    """Calculate Dice coefficient for character-level similarity.

    Compares the sets of character bigrams of the lower-cased inputs:
    2·|B1 ∩ B2| / (|B1| + |B2|), in [0, 1].
    """
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    bigrams1 = set(s1[i:i+2] for i in range(len(s1) - 1))
    bigrams2 = set(s2[i:i+2] for i in range(len(s2) - 1))
    if not bigrams1 and not bigrams2:
        # Both inputs are single characters, so no bigrams exist. Compare
        # the characters directly instead of reporting 1.0 for any pair
        # (bug fix: previously "a" vs "b" scored a perfect 1.0).
        return 1.0 if s1 == s2 else 0.0
    if not bigrams1 or not bigrams2:
        return 0.0
    intersection = len(bigrams1 & bigrams2)
    return 2.0 * intersection / (len(bigrams1) + len(bigrams2))
def jaro_winkler(s1: str, s2: str) -> float:
    """Calculate Jaro-Winkler similarity in [0, 1].

    Standard Jaro similarity (matching characters within a sliding window,
    adjusted for transpositions), boosted by up to 4 characters of common
    prefix with scaling factor 0.1. Comparison is case-insensitive.
    """
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    len1, len2 = len(s1), len(s2)
    # Characters count as "matching" only within this distance of each other.
    match_window = max(len1, len2) // 2 - 1
    match_window = max(0, match_window)
    s1_matches = [False] * len1
    s2_matches = [False] * len2
    matches = 0
    transpositions = 0
    # Pass 1: greedily pair up matching characters inside the window.
    for i in range(len1):
        start = max(0, i - match_window)
        end = min(i + match_window + 1, len2)
        for j in range(start, end):
            if s2_matches[j] or s1[i] != s2[j]:
                continue
            s1_matches[i] = s2_matches[j] = True
            matches += 1
            break
    if matches == 0:
        return 0.0
    # Pass 2: count transpositions (matched characters in different order).
    k = 0
    for i in range(len1):
        if not s1_matches[i]:
            continue
        while not s2_matches[k]:
            k += 1
        if s1[i] != s2[k]:
            transpositions += 1
        k += 1
    jaro = (matches/len1 + matches/len2 + (matches - transpositions/2)/matches) / 3.0
    # Winkler boost: reward a shared prefix of up to 4 characters.
    prefix = 0
    for i in range(min(len1, len2, 4)):
        if s1[i] == s2[i]:
            prefix += 1
        else:
            break
    return jaro + (0.1 * prefix * (1 - jaro))
def levenshtein_similarity(s1: str, s2: str) -> float:
    """Calculate normalized Levenshtein similarity in [0, 1].

    1 - (edit_distance / max(len1, len2)), case-insensitive. Uses the
    two-row Wagner–Fischer formulation: O(len2) memory instead of the
    full (len1+1) x (len2+1) matrix, with identical results.
    """
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    len1, len2 = len(s1), len(s2)
    # prev holds the distances for the previous row of the DP table.
    prev = list(range(len2 + 1))
    for i in range(1, len1 + 1):
        curr = [i] + [0] * len2
        for j in range(1, len2 + 1):
            cost = 0 if s1[i-1] == s2[j-1] else 1
            curr[j] = min(
                prev[j] + 1,       # deletion
                curr[j-1] + 1,     # insertion
                prev[j-1] + cost   # substitution (or match)
            )
        prev = curr
    distance = prev[len2]
    return 1 - (distance / max(len1, len2)) if max(len1, len2) > 0 else 0.0
def hybrid_similarity(s1: str, s2: str) -> float:
    """Combined similarity score using multiple algorithms.

    Short-circuits: identical normalized names score 1.0, and one name
    containing the other scores 0.95/0.85/0.7 depending on the length gap.
    Otherwise returns a weighted blend: word overlap 40%, Dice 30%,
    Jaro-Winkler 20%, Levenshtein 10%.
    """
    # Normalize once and reuse (previously each name was normalized twice).
    norm1 = normalize_product_name(s1)
    norm2 = normalize_product_name(s2)
    if norm1 == norm2:
        return 1.0
    if norm1 in norm2 or norm2 in norm1:
        # Substring containment: score by how different the lengths are.
        len_diff = abs(len(norm1) - len(norm2))
        if len_diff < 3:
            return 0.95
        elif len_diff < 10:
            return 0.85
        else:
            return 0.7
    word_sim = word_order_similarity(s1, s2) * 0.40
    dice_sim = dice_coefficient(s1, s2) * 0.30
    jaro_sim = jaro_winkler(s1, s2) * 0.20
    leven_sim = levenshtein_similarity(s1, s2) * 0.10
    return word_sim + dice_sim + jaro_sim + leven_sim
def calculate_similarity(text1: str, text2: str) -> float:
    """Top-level similarity entry point with cheap short-circuit checks.

    Returns 0.0 when either input is empty or (unstripped) shorter than 5
    characters, 1.0 when the trimmed inputs are identical, and otherwise
    delegates to hybrid_similarity() on the trimmed inputs.
    """
    if not text1 or not text2:
        return 0.0
    left, right = text1.strip(), text2.strip()
    if left == right:
        return 1.0
    if min(len(text1), len(text2)) < 5:
        return 0.0
    return hybrid_similarity(left, right)
def calculate_confidence(similarity: float, text1: str, text2: str) -> float:
    """Derive a confidence score from a similarity value.

    Starts from the similarity, adds small bonuses for longer texts (up to
    +0.15) and higher word counts (up to +0.10), subtracts a penalty for a
    length mismatch (up to -0.20), and clamps the result to [0, 1].
    """
    avg_length = (len(text1) + len(text2)) / 2
    length_factor = min(avg_length / 100, 0.15)
    avg_words = (len(text1.split()) + len(text2.split())) / 2
    word_factor = min(avg_words / 15, 0.10)
    len_penalty = min(abs(len(text1) - len(text2)) / 50, 0.20)
    raw_score = similarity + length_factor + word_factor - len_penalty
    return min(max(raw_score, 0.0), 1.0)
def test_similarity_examples():
    """Test function with examples.

    Prints a PASS/FAIL line for each fixed name pair, comparing whether
    calculate_similarity() >= 0.85 agrees with the expected match verdict.
    Diagnostic only — it prints results rather than asserting.
    """
    # (name_a, name_b, should_match_at_0.85_threshold)
    test_cases = [
        ("Maslac", "Maslac", True),
        ("Vrhnje za kuhanje", "Vrhnje za kuhanje 3x200g", False),
        ("Japanke copacabana lila", "Japanke copacabana flower", False),
        ("Kroasan praline pan pek 70 g", "Kroasan marelica pan pek 70 g", False),
        ("Spužva za kupanje", "Spužva baby za kupanje", False),
        ("Apple iPhone 13", "iPhone 13 Apple", True),
        ("vindija mlijeko cokoladno", "vindija cokoladno mlijeko", True)
    ]
    print("🧪 Testing Similarity Examples:")
    print("=" * 50)
    for text1, text2, should_match in test_cases:
        similarity = calculate_similarity(text1, text2)
        confidence = calculate_confidence(similarity, text1, text2)
        # Same fixed threshold the production comparators default to.
        is_match = similarity >= 0.85
        status = "✅ PASS" if is_match == should_match else "❌ FAIL"
        print(f"{status} '{text1}' vs '{text2}'")
        print(f" Similarity: {similarity:.3f} | Confidence: {confidence:.3f}")
        print()
utils/cache_manager.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Cache Manager for Similarity Engine
|
| 3 |
+
Handles JSON caching of analysis results for improved performance
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
import os
|
| 8 |
+
import hashlib
|
| 9 |
+
import logging
|
| 10 |
+
from datetime import datetime, timedelta
|
| 11 |
+
from typing import Dict, Any, Optional, List
|
| 12 |
+
from pathlib import Path
|
| 13 |
+
|
| 14 |
+
# Configure logging
|
| 15 |
+
logging.basicConfig(level=logging.INFO)
|
| 16 |
+
logger = logging.getLogger(__name__)
|
| 17 |
+
|
| 18 |
+
class SimilarityCacheManager:
    """Manages caching of similarity analysis results.

    Results are persisted as JSON files under ``cache_base_dir``, one
    sub-directory per analysis type; every file carries its own
    ``expires_at`` timestamp.
    """

    def __init__(self, cache_base_dir: str = "cache"):
        """
        Initialize cache manager

        Args:
            cache_base_dir: Base directory for cache files
        """
        self.cache_base_dir = Path(cache_base_dir)
        # One sub-directory per analysis type keeps unrelated results apart.
        self.cache_dirs = {
            'duplicates': self.cache_base_dir / 'duplicates',
            'promo_matches': self.cache_base_dir / 'promo_matches',
            'comparisons': self.cache_base_dir / 'comparisons'
        }

        # Ensure cache directories exist
        for cache_dir in self.cache_dirs.values():
            cache_dir.mkdir(parents=True, exist_ok=True)

        logger.info(f"📁 Cache manager initialized with base dir: {self.cache_base_dir}")

    def generate_cache_key(
        self,
        analysis_type: str,
        products_count: int,
        threshold: float,
        algorithm: str = "hybrid",
        additional_params: Optional[Dict] = None
    ) -> str:
        """
        Generate unique cache key for analysis parameters

        The key embeds today's date, so identical parameters produce a
        fresh key (and hence a cache miss) on a new calendar day.

        Args:
            analysis_type: Type of analysis ('duplicates', 'promo', 'comparison')
            products_count: Number of products in analysis
            threshold: Similarity threshold used
            algorithm: Algorithm used
            additional_params: Any additional parameters to include in key

        Returns:
            Unique cache key string
        """
        # Base parameters
        key_data = {
            'type': analysis_type,
            'count': products_count,
            'threshold': round(threshold, 2),
            'algorithm': algorithm,
            'date': datetime.now().strftime("%Y%m%d")
        }

        # Add additional parameters if provided
        if additional_params:
            key_data.update(additional_params)

        # Hash the sorted parameter dump so any parameter change yields a new key.
        key_string = json.dumps(key_data, sort_keys=True)
        key_hash = hashlib.md5(key_string.encode()).hexdigest()[:8]

        # Create readable cache key
        cache_key = f"{analysis_type}_{products_count}_{int(threshold*100)}_{algorithm}_{key_hash}"

        logger.debug(f"🔑 Generated cache key: {cache_key}")
        return cache_key

    def get_cache_file_path(self, analysis_type: str, cache_key: str) -> Path:
        """Get full path for cache file (unknown types fall back to 'comparisons')."""
        cache_dir = self.cache_dirs.get(analysis_type, self.cache_dirs['comparisons'])
        return cache_dir / f"{cache_key}.json"

    def save_cache(
        self,
        analysis_type: str,
        cache_key: str,
        results: Dict[str, Any],
        parameters: Dict[str, Any],
        expiry_hours: int = 24
    ) -> bool:
        """
        Save analysis results to cache

        Args:
            analysis_type: Type of analysis
            cache_key: Unique cache key
            results: Analysis results to cache
            parameters: Parameters used for analysis
            expiry_hours: Hours until cache expires

        Returns:
            True if saved successfully, False otherwise
        """
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            cache_data = {
                'cache_id': cache_key,
                'analysis_type': analysis_type,
                'created_at': datetime.now().isoformat(),
                'expires_at': (datetime.now() + timedelta(hours=expiry_hours)).isoformat(),
                'parameters': parameters,
                'results': results,
                'version': '1.0'
            }

            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(cache_data, f, indent=2, ensure_ascii=False)

            file_size = cache_file.stat().st_size / 1024  # KB
            logger.info(f"💾 Saved cache: {cache_key} ({file_size:.1f} KB)")
            return True

        except Exception as e:
            logger.error(f"❌ Failed to save cache {cache_key}: {e}")
            return False

    def load_cache(self, analysis_type: str, cache_key: str) -> Optional[Dict[str, Any]]:
        """
        Load cached analysis results

        Expired entries are deleted on read and treated as misses.

        Args:
            analysis_type: Type of analysis
            cache_key: Cache key to load

        Returns:
            Cached results if valid, None otherwise
        """
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            if not cache_file.exists():
                logger.debug(f"📭 Cache miss: {cache_key}")
                return None

            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            # Check if cache is expired
            expiry_time = datetime.fromisoformat(cache_data['expires_at'])
            if datetime.now() > expiry_time:
                logger.info(f"⏰ Cache expired: {cache_key}")
                cache_file.unlink()  # Remove expired cache
                return None

            logger.info(f"✅ Cache hit: {cache_key}")
            return cache_data['results']

        except Exception as e:
            logger.error(f"❌ Failed to load cache {cache_key}: {e}")
            return None

    def is_cache_valid(self, analysis_type: str, cache_key: str) -> bool:
        """Check if cache exists and is valid (not yet expired)."""
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            if not cache_file.exists():
                return False

            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            expiry_time = datetime.fromisoformat(cache_data['expires_at'])
            return datetime.now() <= expiry_time

        except Exception:
            # Unreadable/corrupt cache files count as invalid.
            return False

    def clear_cache(self, analysis_type: Optional[str] = None, older_than_hours: Optional[int] = None) -> int:
        """
        Clear cached results

        Args:
            analysis_type: Specific analysis type to clear, or None for all
            older_than_hours: Clear cache files whose 'created_at' age exceeds
                this many hours; None removes everything. (Note: 0 also
                removes everything, since every file is older than 0 hours.)

        Returns:
            Number of files removed
        """
        removed_count = 0

        # Determine which directories to clear
        dirs_to_clear = [self.cache_dirs[analysis_type]] if analysis_type else self.cache_dirs.values()

        for cache_dir in dirs_to_clear:
            if not cache_dir.exists():
                continue

            for cache_file in cache_dir.glob("*.json"):
                should_remove = False

                try:
                    if older_than_hours is None:
                        should_remove = True
                    else:
                        # Check file age against its recorded creation time.
                        with open(cache_file, 'r', encoding='utf-8') as f:
                            cache_data = json.load(f)

                        created_time = datetime.fromisoformat(cache_data['created_at'])
                        age_hours = (datetime.now() - created_time).total_seconds() / 3600

                        if age_hours > older_than_hours:
                            should_remove = True

                    if should_remove:
                        cache_file.unlink()
                        removed_count += 1
                        logger.info(f"🗑️ Removed cache: {cache_file.name}")

                except Exception as e:
                    logger.warning(f"⚠️ Failed to process cache file {cache_file}: {e}")

        logger.info(f"🧹 Cache cleanup complete: {removed_count} files removed")
        return removed_count

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics: file counts, sizes (MB), valid/expired split per type."""
        stats = {
            'total_files': 0,
            'total_size_mb': 0,
            'by_type': {},
            'cache_dirs': {}
        }

        for analysis_type, cache_dir in self.cache_dirs.items():
            if not cache_dir.exists():
                continue

            type_stats = {
                'files': 0,
                'size_mb': 0,
                'valid_files': 0,
                'expired_files': 0
            }

            for cache_file in cache_dir.glob("*.json"):
                try:
                    file_size = cache_file.stat().st_size / (1024 * 1024)  # MB
                    type_stats['files'] += 1
                    type_stats['size_mb'] += file_size

                    # Check if valid
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        cache_data = json.load(f)

                    expiry_time = datetime.fromisoformat(cache_data['expires_at'])
                    if datetime.now() <= expiry_time:
                        type_stats['valid_files'] += 1
                    else:
                        type_stats['expired_files'] += 1

                except Exception:
                    # Corrupt files are lumped in with the expired count.
                    type_stats['expired_files'] += 1

            stats['by_type'][analysis_type] = type_stats
            stats['total_files'] += type_stats['files']
            stats['total_size_mb'] += type_stats['size_mb']
            stats['cache_dirs'][analysis_type] = str(cache_dir)

        return stats

    def cleanup_expired_cache(self) -> int:
        """Remove only cache files whose 'expires_at' timestamp has passed.

        Bug fix: this previously delegated to clear_cache(older_than_hours=0),
        which removed EVERY cache file (all files are older than 0 hours),
        not just the expired ones. Now each file's own expiry is checked.

        Returns:
            Number of expired files removed
        """
        removed_count = 0
        for cache_dir in self.cache_dirs.values():
            if not cache_dir.exists():
                continue
            for cache_file in cache_dir.glob("*.json"):
                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        cache_data = json.load(f)
                    expiry_time = datetime.fromisoformat(cache_data['expires_at'])
                    if datetime.now() > expiry_time:
                        cache_file.unlink()
                        removed_count += 1
                        logger.info(f"🗑️ Removed cache: {cache_file.name}")
                except Exception as e:
                    logger.warning(f"⚠️ Failed to process cache file {cache_file}: {e}")
        logger.info(f"🧹 Cache cleanup complete: {removed_count} files removed")
        return removed_count
# Global cache manager instance (lazily created on first access)
_cache_manager = None

def get_cache_manager() -> SimilarityCacheManager:
    """Get singleton cache manager instance.

    Creates the SimilarityCacheManager (with its default cache directory)
    on first call and returns the same instance thereafter.
    """
    global _cache_manager
    if _cache_manager is None:
        _cache_manager = SimilarityCacheManager()
    return _cache_manager


# Convenience functions
def cache_duplicate_analysis(
    products_count: int,
    threshold: float,
    results: Dict[str, Any],
    parameters: Dict[str, Any]
) -> str:
    """Persist duplicate-analysis results via the shared cache manager.

    Returns the cache key under which the results were stored.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key('duplicates', products_count, threshold)
    manager.save_cache('duplicates', key, results, parameters)
    return key
def load_duplicate_analysis(
    products_count: int,
    threshold: float
) -> Optional[Dict[str, Any]]:
    """Fetch cached duplicate-analysis results, or None on a miss/expiry."""
    manager = get_cache_manager()
    key = manager.generate_cache_key('duplicates', products_count, threshold)
    return manager.load_cache('duplicates', key)
def cache_promo_analysis(
    promo_count: int,
    db_count: int,
    threshold: float,
    results: Dict[str, Any],
    parameters: Dict[str, Any]
) -> str:
    """Persist promo-match analysis results via the shared cache manager.

    The key incorporates both product counts (combined total plus each
    count as an extra parameter) so distinct promo/db splits cache apart.
    Returns the cache key used.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key(
        'promo_matches',
        promo_count + db_count,
        threshold,
        additional_params={'promo_count': promo_count, 'db_count': db_count}
    )
    manager.save_cache('promo_matches', key, results, parameters)
    return key
def load_promo_analysis(
    promo_count: int,
    db_count: int,
    threshold: float
) -> Optional[Dict[str, Any]]:
    """Fetch cached promo-match results for the given counts/threshold.

    Uses the same key derivation as cache_promo_analysis(); returns None
    on a cache miss or expiry.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key(
        'promo_matches',
        promo_count + db_count,
        threshold,
        additional_params={'promo_count': promo_count, 'db_count': db_count}
    )
    return manager.load_cache('promo_matches', key)