wisdom anthony committed on
Commit
97911a8
·
1 Parent(s): 028bcd8

similarity code for backend

Browse files
.gitignore CHANGED
@@ -113,4 +113,10 @@ config.ini
113
  secrets.json
114
  credentials.json
115
  *.pem
116
- *.key
 
 
 
 
 
 
 
113
  secrets.json
114
  credentials.json
115
  *.pem
116
+ *.key
117
+
118
+ # Similarity Engine Cache
119
+ cache/
120
+ /cache/
121
+ **/cache/
122
+ *.cache
api/main.py CHANGED
@@ -5,6 +5,7 @@ from api.product_routes import router as product_router
5
  from api.receipt_routes import router as receipt_router
6
  from api.scrape_routes import router as scrape_router
7
  from api.cijene_routes import router as cijene_router
 
8
 
9
  # Initialize FastAPI
10
  app = FastAPI(title="SupaKuna API")
@@ -24,6 +25,7 @@ app.include_router(product_router)
24
  app.include_router(receipt_router)
25
  app.include_router(scrape_router)
26
  app.include_router(cijene_router)
 
27
 
28
  @app.get("/", tags=["Health"])
29
  def health_check():
 
5
  from api.receipt_routes import router as receipt_router
6
  from api.scrape_routes import router as scrape_router
7
  from api.cijene_routes import router as cijene_router
8
+ from api.similarity_routes import router as similarity_router
9
 
10
  # Initialize FastAPI
11
  app = FastAPI(title="SupaKuna API")
 
25
  app.include_router(receipt_router)
26
  app.include_router(scrape_router)
27
  app.include_router(cijene_router)
28
+ app.include_router(similarity_router)
29
 
30
  @app.get("/", tags=["Health"])
31
  def health_check():
api/product_routes.py CHANGED
@@ -1,6 +1,7 @@
1
  from fastapi import APIRouter, File, UploadFile, HTTPException, Form
2
  from utils.image_processing import read_image_file, process_product_image
3
- from product_detector.detector import ObjectDetector
 
4
  from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
5
  from utils.image_processing import process_and_store_product_image
6
 
 
1
  from fastapi import APIRouter, File, UploadFile, HTTPException, Form
2
  from utils.image_processing import read_image_file, process_product_image
3
+ # from product_detector.detector import ObjectDetector # Temporarily disabled - model corrupted
4
+ from product_detector.mock_detector import MockObjectDetector as ObjectDetector
5
  from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
6
  from utils.image_processing import process_and_store_product_image
7
 
api/similarity_routes.py ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Similarity Routes - FastAPI Endpoints for Similarity Engine
3
+ NOW WITH JSON CACHING FOR IMPROVED PERFORMANCE
4
+ """
5
+
6
+ import logging
7
+ import sys
8
+ import os
9
+ from typing import List, Dict, Any, Optional
10
+ from fastapi import APIRouter, HTTPException, File, UploadFile, Form, BackgroundTasks
11
+ from pydantic import BaseModel, Field
12
+ import time
13
+ from fastapi.responses import StreamingResponse
14
+ import asyncio
15
+ import json
16
+
17
+ # Add parent directory to path to access other modules
18
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ # Import our similarity engine modules
21
+ try:
22
+ from similarity_engine.similarity_core import calculate_similarity, calculate_confidence, test_similarity_examples
23
+ from similarity_engine.product_comparator import ProductComparator
24
+ from db.similarity_repository import get_similarity_repository
25
+ from utils.cache_manager import (
26
+ get_cache_manager,
27
+ cache_duplicate_analysis,
28
+ load_duplicate_analysis,
29
+ cache_promo_analysis,
30
+ load_promo_analysis
31
+ )
32
+
33
+ print("✅ All similarity modules imported successfully")
34
+ print("✅ Cache manager imported successfully")
35
+
36
+ except ImportError as e:
37
+ print(f"⚠️ Some similarity modules failed to import: {e}")
38
+ print("🔄 Using fallback implementations...")
39
+
40
+ # Create fallback functions to prevent startup failure
41
+ def calculate_similarity(a, b):
42
+ a, b = str(a).lower().strip(), str(b).lower().strip()
43
+ if a == b: return 1.0
44
+ if not a or not b: return 0.0
45
+ return 0.8 if a in b or b in a else 0.3
46
+
47
+ def calculate_confidence(sim, a, b):
48
+ return sim * 0.9
49
+
50
+ def test_similarity_examples(): return []
51
+
52
+ class ProductComparator:
53
+ def __init__(self, *args, **kwargs): pass
54
+
55
+ def get_similarity_repository(): return None
56
+
57
+ # Fallback cache functions
58
+ def get_cache_manager(): return None
59
+ def cache_duplicate_analysis(*args, **kwargs): return None
60
+ def load_duplicate_analysis(*args, **kwargs): return None
61
+ def cache_promo_analysis(*args, **kwargs): return None
62
+ def load_promo_analysis(*args, **kwargs): return None
63
+
64
+ # Configure logging
65
+ logging.basicConfig(level=logging.INFO)
66
+ logger = logging.getLogger(__name__)
67
+
68
+ # Create router
69
+ router = APIRouter(prefix="/similarity", tags=["Similarity Engine"])
70
+
71
+
72
+
73
# Pydantic models for request/response
class ProductComparisonRequest(BaseModel):
    """Body for POST /similarity/compare-products: two product names and an
    optional duplicate threshold (defaults to 0.87, clamped to [0.1, 1.0])."""
    product1_name: str = Field(..., description="First product name")
    product2_name: str = Field(..., description="Second product name")
    threshold: Optional[float] = Field(0.87, description="Similarity threshold", ge=0.1, le=1.0)
78
+
79
class DuplicateAnalysisRequest(BaseModel):
    """Body for the duplicate-analysis endpoints (plain and streaming).

    NOTE(review): `return_summary_only` is accepted but not read by any
    endpoint visible in this file — confirm whether it is still needed.
    """
    threshold: Optional[float] = Field(0.87, description="Similarity threshold", ge=0.1, le=1.0)
    use_sample_data: Optional[bool] = Field(False, description="Use sample data instead of database")
    return_summary_only: Optional[bool] = Field(False, description="Return only summary statistics")
    force_refresh: Optional[bool] = Field(False, description="Force refresh, bypass cache")
84
+
85
class PromoComparisonRequest(BaseModel):
    """Body for the promo-comparison endpoints (plain and streaming):
    match threshold, optional result cap, and a cache-bypass flag."""
    threshold: Optional[float] = Field(0.85, description="Similarity threshold", ge=0.1, le=1.0)
    max_results: Optional[int] = Field(None, description="Maximum results to return")
    force_refresh: Optional[bool] = Field(False, description="Force refresh, bypass cache")
89
+
90
class CacheManagementRequest(BaseModel):
    """Body for cache-management operations (clear by type and/or age).

    NOTE(review): no endpoint in this file consumes this model — presumably
    used by a cache-management route defined elsewhere; verify.
    """
    analysis_type: Optional[str] = Field(None, description="Type to clear (duplicates, promo_matches, or all)")
    older_than_hours: Optional[int] = Field(None, description="Clear cache older than X hours")
93
+
94
# Health check endpoint
@router.get("/health", summary="Health Check")
async def health_check():
    """Check if similarity engine is working.

    Probes three layers — the similarity function, the database repository,
    and the cache manager — and reports their states. Never raises: any
    failure is converted into a 200 response with status == "error" so
    monitors can always read the payload.
    """
    try:
        # Test basic similarity calculation with a trivial identical pair.
        test_similarity = calculate_similarity("test product", "test product")

        # Test database connection: a usable repository exposes a truthy
        # `supabase` client attribute.
        repository = get_similarity_repository()
        db_status = "connected" if repository and repository.supabase else "disconnected"

        # Test cache manager (None means the fallback no-op cache is active).
        cache_mgr = get_cache_manager()
        cache_status = "available" if cache_mgr else "unavailable"

        return {
            "status": "healthy",
            "similarity_engine": "operational",
            "database_connection": db_status,
            "cache_system": cache_status,
            "test_similarity": test_similarity,
            "timestamp": time.time()
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        return {
            "status": "error",
            "similarity_engine": "failed",
            "error": str(e),
            "timestamp": time.time()
        }
126
+
127
# Core similarity endpoints
@router.post("/compare-products", summary="Compare Two Products")
async def compare_two_products(request: ProductComparisonRequest):
    """Compare two product names and return score, confidence and a verdict.

    The duplicate flag uses the caller-supplied threshold; the human-readable
    assessment uses fixed bands independent of that threshold.
    """
    try:
        logger.info(f"🔍 Comparing products: '{request.product1_name}' vs '{request.product2_name}'")

        name_a = request.product1_name
        name_b = request.product2_name

        score = calculate_similarity(name_a, name_b)
        trust = calculate_confidence(score, name_a, name_b)

        # Map the raw score onto a verdict; bands are ordered strictest
        # first, and the first band the score clears wins.
        bands = (
            (0.95, {"description": "Perfect match - identical products", "emoji": "✅", "category": "identical"}),
            (0.8, {"description": "Very similar - likely duplicates", "emoji": "⚠️", "category": "very_similar"}),
            (0.7, {"description": "Similar - review recommended", "emoji": "🤔", "category": "similar"}),
        )
        assessment = {"description": "Different products", "emoji": "❌", "category": "different"}
        for cutoff, verdict in bands:
            if score >= cutoff:
                assessment = verdict
                break

        comparison = {
            'product1_name': name_a,
            'product2_name': name_b,
            'similarity': round(score, 3),
            'confidence': round(trust, 3),
            'is_duplicate': score >= request.threshold,
            'threshold_used': request.threshold,
            'assessment': assessment,
            'percentage_similarity': round(score * 100, 1),
            'percentage_confidence': round(trust * 100, 1)
        }

        return {
            "status": "success",
            "comparison": comparison,
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Product comparison error: {e}")
        raise HTTPException(status_code=500, detail=f"Comparison failed: {str(e)}")
170
+
171
+ @router.post("/find-duplicates", summary="Find Database Duplicates - WITH CACHING")
172
+ async def find_duplicates(request: DuplicateAnalysisRequest):
173
+ """Find duplicate products in the database - NOW WITH JSON CACHING"""
174
+ try:
175
+ logger.info(f"🔍 Starting duplicate analysis with threshold {request.threshold}")
176
+
177
+ repository = get_similarity_repository()
178
+
179
+ # Load products - REAL DATA NOW!
180
+ if request.use_sample_data:
181
+ logger.info("📊 Using sample data for analysis")
182
+ products = repository.get_sample_products() if repository else []
183
+ else:
184
+ logger.info("📊 Loading REAL products from database...")
185
+ products = repository.load_all_products() if repository else []
186
+
187
+ if not products:
188
+ return {
189
+ "status": "error",
190
+ "message": "No products found for analysis - check database connection",
191
+ "duplicates": [],
192
+ "analysis_summary": {}
193
+ }
194
+
195
+ logger.info(f"📊 Loaded {len(products)} products for analysis")
196
+
197
+ # 🚀 NEW: CHECK CACHE FIRST (unless force refresh)
198
+ if not request.force_refresh:
199
+ cached_results = load_duplicate_analysis(len(products), request.threshold)
200
+ if cached_results:
201
+ logger.info("✅ Returning cached duplicate analysis results")
202
+ return {
203
+ "status": "success",
204
+ "data": cached_results,
205
+ "cached": True,
206
+ "cache_hit": True,
207
+ "timestamp": time.time()
208
+ }
209
+
210
+ # Run analysis if no cache or force refresh
211
+ logger.info("🔄 Running fresh duplicate analysis...")
212
+ start_time = time.time()
213
+ duplicates = []
214
+
215
+ for i, product1 in enumerate(products):
216
+ for j, product2 in enumerate(products[i+1:], i+1):
217
+ product_name1 = product1.get('product_name', '').strip()
218
+ product_name2 = product2.get('product_name', '').strip()
219
+
220
+ if not product_name1 or not product_name2:
221
+ continue
222
+
223
+ similarity = calculate_similarity(product_name1, product_name2)
224
+ if similarity >= request.threshold:
225
+ confidence = calculate_confidence(similarity, product_name1, product_name2)
226
+ duplicates.append({
227
+ 'product1_id': product1.get('product_id'),
228
+ 'product1_name': product_name1,
229
+ 'product2_id': product2.get('product_id'),
230
+ 'product2_name': product_name2,
231
+ 'similarity': round(similarity, 3),
232
+ 'confidence': round(confidence, 3)
233
+ })
234
+
235
+ analysis_time = time.time() - start_time
236
+
237
+ # Create results
238
+ results = {
239
+ 'duplicates': duplicates,
240
+ 'analysis_summary': {
241
+ 'total_products': len(products),
242
+ 'duplicates_found': len(duplicates),
243
+ 'duplicate_rate': (len(duplicates) / len(products)) * 100 if products else 0,
244
+ 'analysis_time_seconds': round(analysis_time, 2),
245
+ 'threshold_used': request.threshold,
246
+ 'success': True,
247
+ 'data_source': 'sample_data' if request.use_sample_data else 'real_database'
248
+ },
249
+ 'recommendations': [
250
+ f"Found {len(duplicates)} potential duplicates in {len(products)} products",
251
+ "Review matches manually for final decision",
252
+ "Higher similarity scores indicate more confident matches",
253
+ f"Analysis completed in {analysis_time:.2f} seconds"
254
+ ]
255
+ }
256
+
257
+ # 🚀 NEW: CACHE THE RESULTS
258
+ cache_parameters = {
259
+ 'threshold': request.threshold,
260
+ 'use_sample_data': request.use_sample_data,
261
+ 'total_products': len(products),
262
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
263
+ }
264
+
265
+ cache_key = cache_duplicate_analysis(
266
+ len(products),
267
+ request.threshold,
268
+ results,
269
+ cache_parameters
270
+ )
271
+
272
+ logger.info(f"✅ Duplicate analysis complete: {len(duplicates)} duplicates found in {len(products)} products")
273
+ logger.info(f"💾 Results cached with key: {cache_key}")
274
+
275
+ return {
276
+ "status": "success",
277
+ "data": results,
278
+ "cached": False,
279
+ "cache_key": cache_key,
280
+ "timestamp": time.time()
281
+ }
282
+
283
+ except Exception as e:
284
+ logger.error(f"Duplicate analysis error: {e}")
285
+ raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
286
+
287
+ @router.post("/find-duplicates-stream", summary="Stream Duplicate Analysis with Real-Time Progress")
288
+ async def find_duplicates_stream(request: DuplicateAnalysisRequest):
289
+ """Stream duplicate analysis with real-time progress updates"""
290
+
291
+ async def generate_progress():
292
+ try:
293
+ logger.info(f"🔍 Starting streaming duplicate analysis with threshold {request.threshold}")
294
+
295
+ repository = get_similarity_repository()
296
+
297
+ # Load products - REAL DATA
298
+ if request.use_sample_data:
299
+ logger.info("📊 Using sample data for analysis")
300
+ products = repository.get_sample_products() if repository else []
301
+ else:
302
+ logger.info("📊 Loading REAL products from database...")
303
+ products = repository.load_all_products() if repository else []
304
+
305
+ if not products:
306
+ yield f"data: {json.dumps({'type': 'error', 'message': 'No products found for analysis'})}\n\n"
307
+ return
308
+
309
+ logger.info(f"📊 Loaded {len(products)} products for streaming analysis")
310
+
311
+ # Check cache first (unless force refresh)
312
+ if not request.force_refresh:
313
+ cached_results = load_duplicate_analysis(len(products), request.threshold)
314
+ if cached_results:
315
+ logger.info("✅ Returning cached duplicate analysis results via stream")
316
+ yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
317
+ return
318
+
319
+ # Send initial status
320
+ total_comparisons = len(products) * (len(products) - 1) // 2
321
+ yield f"data: {json.dumps({'type': 'init', 'total_products': len(products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"
322
+
323
+ # Run analysis with real-time progress
324
+ start_time = time.time()
325
+ duplicates = []
326
+ completed_comparisons = 0
327
+
328
+ for i, product1 in enumerate(products):
329
+ product_name1 = product1.get('product_name', '').strip()
330
+ product_id1 = product1.get('product_id', 'unknown')
331
+
332
+ if not product_name1:
333
+ continue
334
+
335
+ # Send progress update every 100 products
336
+ if i % 100 == 0:
337
+ progress_percentage = (i / len(products)) * 100
338
+ yield f"data: {json.dumps({'type': 'product_progress', 'current_product': i + 1, 'total_products': len(products), 'progress': round(progress_percentage, 1), 'current_name': product_name1[:50]})}\n\n"
339
+
340
+ for j, product2 in enumerate(products[i+1:], i+1):
341
+ product_name2 = product2.get('product_name', '').strip()
342
+ product_id2 = product2.get('product_id', 'unknown')
343
+
344
+ if not product_name2 or product_id1 == product_id2:
345
+ continue
346
+
347
+ completed_comparisons += 1
348
+
349
+ # Calculate similarity
350
+ similarity = calculate_similarity(product_name1, product_name2)
351
+
352
+ # Send progress every 10,000 comparisons
353
+ if completed_comparisons % 10000 == 0:
354
+ progress_percentage = (completed_comparisons / total_comparisons) * 100
355
+ yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(progress_percentage, 1), 'comparing': f'{product_name1[:30]} vs {product_name2[:30]}'})}\n\n"
356
+ await asyncio.sleep(0.01) # Small delay to prevent overwhelming
357
+
358
+ # Check if it's a duplicate
359
+ if similarity >= request.threshold:
360
+ confidence = calculate_confidence(similarity, product_name1, product_name2)
361
+
362
+ duplicate_info = {
363
+ 'product1_id': product_id1,
364
+ 'product1_name': product_name1,
365
+ 'product2_id': product_id2,
366
+ 'product2_name': product_name2,
367
+ 'similarity': round(similarity, 3),
368
+ 'confidence': round(confidence, 3)
369
+ }
370
+
371
+ duplicates.append(duplicate_info)
372
+
373
+ # Send duplicate found immediately
374
+ yield f"data: {json.dumps({'type': 'duplicate_found', 'duplicate': duplicate_info, 'total_duplicates': len(duplicates)})}\n\n"
375
+ logger.info(f" 🔍 DUPLICATE FOUND via stream: {product_name1} ↔ {product_name2} ({similarity:.3f})")
376
+
377
+ analysis_time = time.time() - start_time
378
+
379
+ # Create final results
380
+ results = {
381
+ 'duplicates': duplicates,
382
+ 'analysis_summary': {
383
+ 'total_products': len(products),
384
+ 'total_comparisons': completed_comparisons,
385
+ 'duplicates_found': len(duplicates),
386
+ 'duplicate_rate': (len(duplicates) / len(products)) * 100 if products else 0,
387
+ 'analysis_time_seconds': round(analysis_time, 2),
388
+ 'threshold_used': request.threshold,
389
+ 'success': True,
390
+ 'data_source': 'sample_data' if request.use_sample_data else 'real_database'
391
+ },
392
+ 'recommendations': [
393
+ f"Found {len(duplicates)} potential duplicates in {len(products)} products",
394
+ "Review matches manually for final decision",
395
+ "Higher similarity scores indicate more confident matches",
396
+ f"Analysis completed in {analysis_time:.2f} seconds"
397
+ ]
398
+ }
399
+
400
+ # Cache the results
401
+ cache_parameters = {
402
+ 'threshold': request.threshold,
403
+ 'use_sample_data': request.use_sample_data,
404
+ 'total_products': len(products),
405
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
406
+ }
407
+
408
+ cache_key = cache_duplicate_analysis(
409
+ len(products),
410
+ request.threshold,
411
+ results,
412
+ cache_parameters
413
+ )
414
+
415
+ # Send final complete results
416
+ yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"
417
+
418
+ logger.info(f"✅ Streaming duplicate analysis complete: {len(duplicates)} duplicates found in {len(products)} products")
419
+ logger.info(f"💾 Results cached with key: {cache_key}")
420
+
421
+ except Exception as e:
422
+ logger.error(f"Streaming duplicate analysis error: {e}")
423
+ yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
424
+
425
+ return StreamingResponse(
426
+ generate_progress(),
427
+ media_type="text/event-stream",
428
+ headers={
429
+ "Cache-Control": "no-cache",
430
+ "Connection": "keep-alive",
431
+ "Content-Type": "text/event-stream",
432
+ "X-Accel-Buffering": "no" # Disable nginx buffering
433
+ }
434
+ )
435
+
436
+ @router.post("/compare-promo", summary="Compare Promo Products - WITH CACHING")
437
+ async def compare_promo_products(request: PromoComparisonRequest):
438
+ """Compare promotional products against database products - NOW WITH CACHING"""
439
+ try:
440
+ logger.info(f"🏷️ Starting promo comparison with threshold {request.threshold}")
441
+
442
+ repository = get_similarity_repository()
443
+
444
+ if not repository:
445
+ return {"status": "error", "message": "Database repository not available"}
446
+
447
+ # Load REAL promo products
448
+ logger.info("📊 Loading REAL promo products from database...")
449
+ try:
450
+ promo_products = repository.load_promo_products()
451
+ db_products = repository.load_all_products()
452
+ logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
453
+ except Exception as e:
454
+ logger.warning(f"❌ Failed to load real data, using samples: {e}")
455
+ promo_products = repository._get_sample_promo_products()
456
+ db_products = repository.get_sample_products()
457
+
458
+ # 🚀 NEW: CHECK CACHE FIRST (unless force refresh)
459
+ if not request.force_refresh:
460
+ cached_results = load_promo_analysis(
461
+ len(promo_products),
462
+ len(db_products),
463
+ request.threshold
464
+ )
465
+ if cached_results:
466
+ logger.info("✅ Returning cached promo analysis results")
467
+ return {
468
+ "status": "success",
469
+ **cached_results,
470
+ "cached": True,
471
+ "cache_hit": True,
472
+ "timestamp": time.time()
473
+ }
474
+
475
+ logger.info(f"📊 Comparing {len(promo_products)} promo products against {len(db_products)} database products")
476
+
477
+ # Find matches using REAL similarity algorithm
478
+ start_time = time.time()
479
+ matches = []
480
+ total_comparisons = 0
481
+
482
+ for promo in promo_products:
483
+ promo_name = promo.get('name', '').strip()
484
+ if not promo_name:
485
+ continue
486
+
487
+ best_match = None
488
+ best_similarity = 0.0
489
+
490
+ for db in db_products:
491
+ db_name = db.get('product_name', '').strip()
492
+ if not db_name:
493
+ continue
494
+
495
+ similarity = calculate_similarity(promo_name, db_name)
496
+ total_comparisons += 1
497
+
498
+ if similarity >= request.threshold and similarity > best_similarity:
499
+ best_similarity = similarity
500
+ confidence = calculate_confidence(similarity, promo_name, db_name)
501
+
502
+ best_match = {
503
+ 'promo_id': promo.get('id'),
504
+ 'promo_name': promo_name,
505
+ 'promo_store': promo.get('store', ''),
506
+ 'promo_price': promo.get('promo_price'),
507
+ 'regular_price': promo.get('regular_price'),
508
+ 'picture_id': promo.get('picture_id'),
509
+ 'db_product_id': db.get('product_id'),
510
+ 'db_product_name': db_name,
511
+ 'db_brand': db.get('brand', {}).get('brand_name', 'No Brand') if db.get('brand') else 'No Brand',
512
+ 'similarity': round(similarity, 3),
513
+ 'confidence': round(confidence, 3)
514
+ }
515
+
516
+ if best_match:
517
+ matches.append(best_match)
518
+
519
+ # Limit results if requested
520
+ if request.max_results and len(matches) >= request.max_results:
521
+ break
522
+
523
+ analysis_time = time.time() - start_time
524
+
525
+ # Prepare results
526
+ results = {
527
+ "matches": matches,
528
+ "statistics": {
529
+ "total_promo_products": len(promo_products),
530
+ "total_database_products": len(db_products),
531
+ "total_comparisons": total_comparisons,
532
+ "matches_found": len(matches),
533
+ "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
534
+ "threshold_used": request.threshold,
535
+ "analysis_time_seconds": round(analysis_time, 2)
536
+ }
537
+ }
538
+
539
+ # 🚀 NEW: CACHE THE RESULTS
540
+ cache_parameters = {
541
+ 'threshold': request.threshold,
542
+ 'max_results': request.max_results,
543
+ 'promo_count': len(promo_products),
544
+ 'db_count': len(db_products),
545
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
546
+ }
547
+
548
+ cache_key = cache_promo_analysis(
549
+ len(promo_products),
550
+ len(db_products),
551
+ request.threshold,
552
+ results,
553
+ cache_parameters
554
+ )
555
+
556
+ logger.info(f"✅ Promo comparison complete: {len(matches)} matches found")
557
+ logger.info(f"💾 Results cached with key: {cache_key}")
558
+
559
+ return {
560
+ "status": "success",
561
+ **results,
562
+ "cached": False,
563
+ "cache_key": cache_key,
564
+ "timestamp": time.time()
565
+ }
566
+
567
+ except Exception as e:
568
+ logger.error(f"Promo comparison error: {e}")
569
+ raise HTTPException(status_code=500, detail=f"Promo comparison failed: {str(e)}")
570
+
571
+
572
+ @router.post("/compare-promo-stream", summary="Stream Promo Comparison with Real-Time Progress")
573
+ async def compare_promo_stream(request: PromoComparisonRequest):
574
+ """Stream promo product comparison with real-time progress updates"""
575
+
576
+ async def generate_promo_progress():
577
+ try:
578
+ logger.info(f"🏷️ Starting streaming promo comparison with threshold {request.threshold}")
579
+
580
+ repository = get_similarity_repository()
581
+
582
+ if not repository:
583
+ yield f"data: {json.dumps({'type': 'error', 'message': 'Database repository not available'})}\n\n"
584
+ return
585
+
586
+ # Load REAL promo and database products
587
+ logger.info("📊 Loading REAL promo and database products...")
588
+ try:
589
+ promo_products = repository.load_promo_products()
590
+ db_products = repository.load_all_products()
591
+ logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
592
+ except Exception as e:
593
+ logger.warning(f"❌ Failed to load real data, using samples: {e}")
594
+ promo_products = repository._get_sample_promo_products()
595
+ db_products = repository.get_sample_products()
596
+
597
+ # Check cache first (unless force refresh)
598
+ if not request.force_refresh:
599
+ cached_results = load_promo_analysis(
600
+ len(promo_products),
601
+ len(db_products),
602
+ request.threshold
603
+ )
604
+ if cached_results:
605
+ logger.info("✅ Returning cached promo analysis results via stream")
606
+ yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
607
+ return
608
+
609
+ # Send initial status
610
+ total_comparisons = len(promo_products) * len(db_products)
611
+ yield f"data: {json.dumps({'type': 'init', 'total_promo_products': len(promo_products), 'total_db_products': len(db_products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"
612
+
613
+ # Find matches using REAL similarity algorithm with streaming
614
+ start_time = time.time()
615
+ matches = []
616
+ completed_comparisons = 0
617
+
618
+ for i, promo in enumerate(promo_products):
619
+ promo_name = promo.get('name', '').strip()
620
+ promo_store = promo.get('store', '').strip()
621
+
622
+ if not promo_name:
623
+ continue
624
+
625
+ # Send promo progress update
626
+ promo_progress = (i / len(promo_products)) * 100
627
+ yield f"data: {json.dumps({'type': 'promo_progress', 'current_promo': i + 1, 'total_promos': len(promo_products), 'progress': round(promo_progress, 1), 'current_promo_name': promo_name[:50], 'store': promo_store})}\n\n"
628
+
629
+ best_match = None
630
+ best_similarity = 0.0
631
+
632
+ for j, db_product in enumerate(db_products):
633
+ db_name = db_product.get('product_name', '').strip()
634
+
635
+ if not db_name:
636
+ continue
637
+
638
+ completed_comparisons += 1
639
+
640
+ # Calculate similarity
641
+ similarity = calculate_similarity(promo_name, db_name)
642
+
643
+ # Send detailed comparison progress every 1000 comparisons
644
+ if completed_comparisons % 1000 == 0:
645
+ overall_progress = (completed_comparisons / total_comparisons) * 100
646
+ yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(overall_progress, 1), 'comparing': f'{promo_name[:30]} vs {db_name[:30]}'})}\n\n"
647
+ await asyncio.sleep(0.01) # Small delay
648
+
649
+ if similarity >= request.threshold and similarity > best_similarity:
650
+ best_similarity = similarity
651
+ confidence = calculate_confidence(similarity, promo_name, db_name)
652
+
653
+ best_match = {
654
+ 'promo_id': promo.get('id'),
655
+ 'promo_name': promo_name,
656
+ 'promo_store': promo_store,
657
+ 'promo_price': promo.get('promo_price', 0),
658
+ 'regular_price': promo.get('regular_price', 0),
659
+ 'picture_id': promo.get('picture_id'),
660
+ 'db_product_id': db_product.get('product_id'),
661
+ 'db_product_name': db_name,
662
+ 'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
663
+ 'similarity': round(similarity, 3),
664
+ 'confidence': round(confidence, 3)
665
+ }
666
+
667
+ # If match found, send immediate update
668
+ if best_match:
669
+ matches.append(best_match)
670
+ yield f"data: {json.dumps({'type': 'match_found', 'match': best_match, 'total_matches': len(matches)})}\n\n"
671
+ logger.info(f"🔍 PROMO MATCH FOUND via stream: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")
672
+
673
+ # Limit results if requested
674
+ if request.max_results and len(matches) >= request.max_results:
675
+ yield f"data: {json.dumps({'type': 'max_results_reached', 'max_results': request.max_results, 'matches_found': len(matches)})}\n\n"
676
+ break
677
+
678
+ analysis_time = time.time() - start_time
679
+
680
+ # Prepare final results
681
+ results = {
682
+ "matches": matches,
683
+ "statistics": {
684
+ "total_promo_products": len(promo_products),
685
+ "total_database_products": len(db_products),
686
+ "total_comparisons": completed_comparisons,
687
+ "matches_found": len(matches),
688
+ "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
689
+ "threshold_used": request.threshold,
690
+ "analysis_time_seconds": round(analysis_time, 2)
691
+ }
692
+ }
693
+
694
+ # Cache the results
695
+ cache_parameters = {
696
+ 'threshold': request.threshold,
697
+ 'max_results': request.max_results,
698
+ 'promo_count': len(promo_products),
699
+ 'db_count': len(db_products),
700
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
701
+ }
702
+
703
+ cache_key = cache_promo_analysis(
704
+ len(promo_products),
705
+ len(db_products),
706
+ request.threshold,
707
+ results,
708
+ cache_parameters
709
+ )
710
+
711
+ # Send final results
712
+ yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"
713
+
714
+ logger.info(f"✅ Streaming promo comparison complete: {len(matches)} matches found")
715
+ logger.info(f"💾 Results cached with key: {cache_key}")
716
+
717
+ except Exception as e:
718
+ logger.error(f"Streaming promo comparison error: {e}")
719
+ yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
720
+
721
+ return StreamingResponse(
722
+ generate_promo_progress(),
723
+ media_type="text/event-stream",
724
+ headers={
725
+ "Cache-Control": "no-cache",
726
+ "Connection": "keep-alive",
727
+ "Content-Type": "text/event-stream",
728
+ "X-Accel-Buffering": "no" # Disable nginx buffering
729
+ }
730
+ )
731
+
732
+ @router.get("/test-algorithm", summary="Test Similarity Algorithm")
733
+ async def test_algorithm():
734
+ """Test the similarity algorithm with known examples"""
735
+ try:
736
+ logger.info("🧪 Testing similarity algorithm")
737
+
738
+ test_cases = [
739
+ ("Maslac", "Maslac", True),
740
+ ("Vrhnje za kuhanje", "Vrhnje za kuhanje 3x200g", False),
741
+ ("Apple iPhone 13", "iPhone 13 Apple", True),
742
+ ("vindija mlijeko cokoladno", "vindija cokoladno mlijeko", True)
743
+ ]
744
+
745
+ results = []
746
+ for text1, text2, should_match in test_cases:
747
+ similarity = calculate_similarity(text1, text2)
748
+ confidence = calculate_confidence(similarity, text1, text2)
749
+ is_match = similarity >= 0.85
750
+ passed = is_match == should_match
751
+
752
+ results.append({
753
+ "text1": text1,
754
+ "text2": text2,
755
+ "similarity": round(similarity, 3),
756
+ "confidence": round(confidence, 3),
757
+ "is_match": is_match,
758
+ "should_match": should_match,
759
+ "test_passed": passed
760
+ })
761
+
762
+ passed_tests = sum(1 for r in results if r["test_passed"])
763
+ total_tests = len(results)
764
+
765
+ return {
766
+ "status": "success",
767
+ "test_results": results,
768
+ "summary": {
769
+ "total_tests": total_tests,
770
+ "passed_tests": passed_tests,
771
+ "failed_tests": total_tests - passed_tests,
772
+ "success_rate": round((passed_tests / total_tests) * 100, 1)
773
+ },
774
+ "timestamp": time.time()
775
+ }
776
+
777
+ except Exception as e:
778
+ logger.error(f"Algorithm test error: {e}")
779
+ raise HTTPException(status_code=500, detail=f"Test failed: {str(e)}")
780
+
781
+ @router.get("/stats", summary="Get Engine Statistics")
782
+ async def get_statistics():
783
+ """Get overall similarity engine statistics - REAL DATA"""
784
+ try:
785
+ repository = get_similarity_repository()
786
+
787
+ if not repository:
788
+ return {
789
+ "status": "error",
790
+ "message": "Repository not available",
791
+ "statistics": {}
792
+ }
793
+
794
+ # Get REAL statistics
795
+ try:
796
+ all_products = repository.load_all_products()
797
+ products_without_images = repository.get_products_without_images()
798
+ promo_products = repository.load_promo_products()
799
+ promo_with_images = repository.load_promo_products(with_images_only=True)
800
+ except Exception as e:
801
+ logger.error(f"Error loading statistics: {e}")
802
+ all_products = []
803
+ products_without_images = []
804
+ promo_products = []
805
+ promo_with_images = []
806
+
807
+ return {
808
+ "status": "success",
809
+ "statistics": {
810
+ "database": {
811
+ "total_products": len(all_products),
812
+ "products_without_images": len(products_without_images),
813
+ "products_with_images": len(all_products) - len(products_without_images)
814
+ },
815
+ "promotional": {
816
+ "total_promo_products": len(promo_products),
817
+ "promo_with_images": len(promo_with_images),
818
+ "promo_without_images": len(promo_products) - len(promo_with_images)
819
+ },
820
+ "image_coverage": {
821
+ "database_coverage": round(((len(all_products) - len(products_without_images)) / max(len(all_products), 1)) * 100, 1),
822
+ "promo_coverage": round((len(promo_with_images) / max(len(promo_products), 1)) * 100, 1)
823
+ }
824
+ },
825
+ "engine_status": "operational",
826
+ "database_connection": "connected" if repository.supabase else "disconnected",
827
+ "timestamp": time.time()
828
+ }
829
+
830
+ except Exception as e:
831
+ logger.error(f"Statistics error: {e}")
832
+ raise HTTPException(status_code=500, detail=f"Failed to get statistics: {str(e)}")
833
+
834
+ # 🚀 NEW: CACHE MANAGEMENT ENDPOINTS
835
+
836
+ @router.get("/cache/stats", summary="Get Cache Statistics")
837
+ async def get_cache_stats():
838
+ """Get cache usage statistics"""
839
+ try:
840
+ cache_mgr = get_cache_manager()
841
+
842
+ if not cache_mgr:
843
+ return {
844
+ "status": "error",
845
+ "message": "Cache manager not available",
846
+ "cache_stats": {}
847
+ }
848
+
849
+ stats = cache_mgr.get_cache_stats()
850
+
851
+ return {
852
+ "status": "success",
853
+ "cache_stats": stats,
854
+ "timestamp": time.time()
855
+ }
856
+
857
+ except Exception as e:
858
+ logger.error(f"Cache stats error: {e}")
859
+ raise HTTPException(status_code=500, detail=f"Failed to get cache stats: {str(e)}")
860
+
861
+ @router.post("/cache/clear", summary="Clear Cache")
862
+ async def clear_cache(request: CacheManagementRequest):
863
+ """Clear cached results"""
864
+ try:
865
+ cache_mgr = get_cache_manager()
866
+
867
+ if not cache_mgr:
868
+ return {
869
+ "status": "error",
870
+ "message": "Cache manager not available"
871
+ }
872
+
873
+ # Clear cache based on parameters
874
+ removed_count = cache_mgr.clear_cache(
875
+ analysis_type=request.analysis_type,
876
+ older_than_hours=request.older_than_hours
877
+ )
878
+
879
+ logger.info(f"🧹 Cache cleared: {removed_count} files removed")
880
+
881
+ return {
882
+ "status": "success",
883
+ "message": f"Cache cleared successfully",
884
+ "files_removed": removed_count,
885
+ "cleared_type": request.analysis_type or "all",
886
+ "timestamp": time.time()
887
+ }
888
+
889
+ except Exception as e:
890
+ logger.error(f"Cache clear error: {e}")
891
+ raise HTTPException(status_code=500, detail=f"Failed to clear cache: {str(e)}")
892
+
893
+ @router.post("/cache/cleanup", summary="Cleanup Expired Cache")
894
+ async def cleanup_expired_cache():
895
+ """Remove only expired cache files"""
896
+ try:
897
+ cache_mgr = get_cache_manager()
898
+
899
+ if not cache_mgr:
900
+ return {
901
+ "status": "error",
902
+ "message": "Cache manager not available"
903
+ }
904
+
905
+ removed_count = cache_mgr.cleanup_expired_cache()
906
+
907
+ logger.info(f"🧹 Expired cache cleaned up: {removed_count} files removed")
908
+
909
+ return {
910
+ "status": "success",
911
+ "message": "Expired cache cleaned up successfully",
912
+ "expired_files_removed": removed_count,
913
+ "timestamp": time.time()
914
+ }
915
+
916
+ except Exception as e:
917
+ logger.error(f"Cache cleanup error: {e}")
918
+ raise HTTPException(status_code=500, detail=f"Failed to cleanup cache: {str(e)}")
919
+
920
+ @router.post("/compare-promo-stream", summary="Stream Promo Comparison with Real-Time Progress")
921
+ async def compare_promo_stream(request: PromoComparisonRequest):
922
+ """Stream promo product comparison with real-time progress updates"""
923
+
924
+ async def generate_promo_progress():
925
+ try:
926
+ logger.info(f"🏷️ Starting streaming promo comparison with threshold {request.threshold}")
927
+
928
+ repository = get_similarity_repository()
929
+
930
+ if not repository:
931
+ yield f"data: {json.dumps({'type': 'error', 'message': 'Database repository not available'})}\n\n"
932
+ return
933
+
934
+ # Load REAL promo and database products
935
+ logger.info("📊 Loading REAL promo and database products...")
936
+ try:
937
+ promo_products = repository.load_promo_products()
938
+ db_products = repository.load_all_products()
939
+ logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
940
+ except Exception as e:
941
+ logger.warning(f"❌ Failed to load real data, using samples: {e}")
942
+ promo_products = repository._get_sample_promo_products()
943
+ db_products = repository.get_sample_products()
944
+
945
+ # Check cache first (unless force refresh)
946
+ if not request.force_refresh:
947
+ cached_results = load_promo_analysis(
948
+ len(promo_products),
949
+ len(db_products),
950
+ request.threshold
951
+ )
952
+ if cached_results:
953
+ logger.info("✅ Returning cached promo analysis results via stream")
954
+ yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
955
+ return
956
+
957
+ # Send initial status
958
+ total_comparisons = len(promo_products) * len(db_products)
959
+ yield f"data: {json.dumps({'type': 'init', 'total_promo_products': len(promo_products), 'total_db_products': len(db_products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"
960
+
961
+ # Find matches using REAL similarity algorithm with streaming
962
+ start_time = time.time()
963
+ matches = []
964
+ completed_comparisons = 0
965
+
966
+ for i, promo in enumerate(promo_products):
967
+ promo_name = promo.get('name', '').strip()
968
+ promo_store = promo.get('store', '').strip()
969
+
970
+ if not promo_name:
971
+ continue
972
+
973
+ # Send promo progress update every 10 promos
974
+ if i % 10 == 0:
975
+ promo_progress = (i / len(promo_products)) * 100
976
+ yield f"data: {json.dumps({'type': 'promo_progress', 'current_promo': i + 1, 'total_promos': len(promo_products), 'progress': round(promo_progress, 1), 'current_promo_name': promo_name[:50], 'store': promo_store})}\n\n"
977
+
978
+ best_match = None
979
+ best_similarity = 0.0
980
+
981
+ for j, db_product in enumerate(db_products):
982
+ db_name = db_product.get('product_name', '').strip()
983
+
984
+ if not db_name:
985
+ continue
986
+
987
+ completed_comparisons += 1
988
+
989
+ # Calculate similarity
990
+ similarity = calculate_similarity(promo_name, db_name)
991
+
992
+ # Send detailed comparison progress every 5000 comparisons
993
+ if completed_comparisons % 5000 == 0:
994
+ overall_progress = (completed_comparisons / total_comparisons) * 100
995
+ yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(overall_progress, 1), 'comparing': f'{promo_name[:30]} vs {db_name[:30]}'})}\n\n"
996
+ await asyncio.sleep(0.01) # Small delay
997
+
998
+ if similarity >= request.threshold and similarity > best_similarity:
999
+ best_similarity = similarity
1000
+ confidence = calculate_confidence(similarity, promo_name, db_name)
1001
+
1002
+ best_match = {
1003
+ 'promo_id': promo.get('id'),
1004
+ 'promo_name': promo_name,
1005
+ 'promo_store': promo_store,
1006
+ 'promo_price': promo.get('promo_price', 0),
1007
+ 'regular_price': promo.get('regular_price', 0),
1008
+ 'picture_id': promo.get('picture_id'),
1009
+ 'db_product_id': db_product.get('product_id'),
1010
+ 'db_product_name': db_name,
1011
+ 'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
1012
+ 'similarity': round(similarity, 3),
1013
+ 'confidence': round(confidence, 3)
1014
+ }
1015
+
1016
+ # If match found, send immediate update
1017
+ if best_match:
1018
+ matches.append(best_match)
1019
+ yield f"data: {json.dumps({'type': 'match_found', 'match': best_match, 'total_matches': len(matches)})}\n\n"
1020
+ logger.info(f"🔍 PROMO MATCH FOUND via stream: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")
1021
+
1022
+ # Limit results if requested
1023
+ if request.max_results and len(matches) >= request.max_results:
1024
+ yield f"data: {json.dumps({'type': 'max_results_reached', 'max_results': request.max_results, 'matches_found': len(matches)})}\n\n"
1025
+ break
1026
+
1027
+ analysis_time = time.time() - start_time
1028
+
1029
+ # Prepare final results
1030
+ results = {
1031
+ "matches": matches,
1032
+ "statistics": {
1033
+ "total_promo_products": len(promo_products),
1034
+ "total_database_products": len(db_products),
1035
+ "total_comparisons": completed_comparisons,
1036
+ "matches_found": len(matches),
1037
+ "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
1038
+ "threshold_used": request.threshold,
1039
+ "analysis_time_seconds": round(analysis_time, 2)
1040
+ }
1041
+ }
1042
+
1043
+ # Cache the results
1044
+ cache_parameters = {
1045
+ 'threshold': request.threshold,
1046
+ 'max_results': request.max_results,
1047
+ 'promo_count': len(promo_products),
1048
+ 'db_count': len(db_products),
1049
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
1050
+ }
1051
+
1052
+ cache_key = cache_promo_analysis(
1053
+ len(promo_products),
1054
+ len(db_products),
1055
+ request.threshold,
1056
+ results,
1057
+ cache_parameters
1058
+ )
1059
+
1060
+ # Send final results
1061
+ yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"
1062
+
1063
+ logger.info(f"✅ Streaming promo comparison complete: {len(matches)} matches found")
1064
+ logger.info(f"💾 Results cached with key: {cache_key}")
1065
+
1066
+ except Exception as e:
1067
+ logger.error(f"Streaming promo comparison error: {e}")
1068
+ yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
1069
+
1070
+ return StreamingResponse(
1071
+ generate_promo_progress(),
1072
+ media_type="text/event-stream",
1073
+ headers={
1074
+ "Cache-Control": "no-cache",
1075
+ "Connection": "keep-alive",
1076
+ "Content-Type": "text/event-stream",
1077
+ "X-Accel-Buffering": "no" # Disable nginx buffering
1078
+ }
1079
+ )
db/similarity_repository.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Similarity Repository - Backend Database Operations
3
+ Handles all Supabase interactions for the similarity engine
4
+ VERIFIED FOR REAL DATA CONNECTION
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ import sys
10
+ from typing import List, Dict, Any, Optional
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
class SimilarityRepository:
    """Repository for similarity-engine database operations.

    Wraps all Supabase access used by the similarity engine: loading the
    product catalogue, loading promotional products, updating product images
    and running connection diagnostics.  Every loader degrades gracefully to
    bundled sample data when the database is unreachable.

    Fixes over the previous revision:
      * Removed hardcoded Supabase user credentials (security leak) — the
        email/password must now come from SUPABASE_USER_EMAIL /
        SUPABASE_USER_PASSWORD.
      * ``load_promo_products`` builds a fresh query per request instead of
        reusing one mutable PostgREST builder for the count query and every
        paged ``.range()`` fetch (builder state stacked across calls).
      * ``load_all_products`` counts rows with a lightweight query instead of
        downloading the whole table just to read ``count``.
      * ``.env`` is loaded before the environment variables are read.
    """

    # Column selection shared by the paged product loaders.
    _PRODUCT_COLUMNS = '''
        product_id,
        product_name,
        product_weight,
        product_image,
        brand_id,
        brand:brands (
            brand_id,
            brand_name
        )
    '''

    # Column selection for promotional products.
    _PROMO_COLUMNS = '''
        id,
        name,
        store,
        description,
        picture_id,
        regular_price,
        promo_price
    '''

    def __init__(self):
        """Initialize the Supabase connection from environment variables."""
        # Load .env (from the parent directory, where the main app lives)
        # BEFORE reading any variables, so both sources are visible.
        try:
            from dotenv import load_dotenv
            env_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env')
            load_dotenv(env_path)
            logger.info(f"📋 Loaded .env from: {env_path}")
        except ImportError:
            logger.warning("python-dotenv not available, using system environment variables")
        except Exception as e:
            logger.warning(f"Could not load .env file: {e}")

        # Backend variables take precedence; fall back to the VITE_ names
        # shared with the frontend build.
        self.supabase_url = os.getenv('SUPABASE_URL') or os.getenv('VITE_SUPABASE_URL')
        self.supabase_key = os.getenv('SUPABASE_KEY') or os.getenv('VITE_SUPABASE_KEY')
        # SECURITY: user credentials come exclusively from the environment.
        # Never hardcode fallback values here — they end up in source control.
        self.user_email = os.getenv('SUPABASE_USER_EMAIL')
        self.user_password = os.getenv('SUPABASE_USER_PASSWORD')

        logger.info(f"📋 Supabase URL: {self.supabase_url[:30] if self.supabase_url else 'Not found'}...")
        logger.info(f"📋 Supabase Key: {self.supabase_key[:20] if self.supabase_key else 'Not found'}...")

        self.supabase = None
        if not self.supabase_url or not self.supabase_key:
            logger.error("❌ Supabase credentials not found in environment variables")
            logger.error("💡 Looking for SUPABASE_URL/SUPABASE_KEY or VITE_SUPABASE_URL/VITE_SUPABASE_KEY")
            return

        self._authenticate()

    def _authenticate(self) -> bool:
        """Create the Supabase client and sign the configured user in.

        Returns:
            True on successful authentication, False otherwise.  On failure
            ``self.supabase`` may still hold an unauthenticated client.
        """
        try:
            from supabase import create_client

            logger.info("🔗 Connecting to Supabase...")
            self.supabase = create_client(self.supabase_url, self.supabase_key)

            if not self.user_email or not self.user_password:
                logger.error("❌ SUPABASE_USER_EMAIL / SUPABASE_USER_PASSWORD not configured")
                return False

            logger.info("🔐 Authenticating user...")
            auth_result = self.supabase.auth.sign_in_with_password({
                "email": self.user_email,
                "password": self.user_password
            })

            if auth_result.user:
                logger.info(f"✅ Authenticated as: {auth_result.user.email}")
                return True
            logger.error("❌ Authentication failed - no user returned")
            return False

        except ImportError as e:
            logger.error(f"❌ Supabase library not available: {e}")
            logger.error("💡 Install with: pip install supabase")
            return False
        except Exception as e:
            logger.error(f"❌ Authentication error: {e}")
            return False

    def load_all_products(self) -> List[Dict[str, Any]]:
        """Load every product (with brand join) using paged queries.

        Returns:
            List of validated product dictionaries; sample data on failure.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection - returning sample data")
            return self.get_sample_products()

        try:
            logger.info("📊 Loading all products from database...")

            # Lightweight count: request a single id with an exact count
            # header instead of downloading every row just to count them.
            count_result = self.supabase.table('products').select(
                'product_id', count='exact').limit(1).execute()
            total_count = count_result.count
            logger.info(f"📋 Total products in database: {total_count}")

            if not total_count:
                logger.warning("❌ No products found in database")
                return self.get_sample_products()

            # Fetch all products in batches.
            page_size = 1000
            all_products = []
            page = 0

            while True:
                offset = page * page_size
                logger.info(f"⏳ Fetching products {offset + 1} to {offset + page_size}...")

                # Build a fresh query per page: supabase-py builders are
                # mutable, so reusing one across .range() calls stacks state.
                batch_result = self.supabase.table('products').select(
                    self._PRODUCT_COLUMNS).range(offset, offset + page_size - 1).execute()
                batch_products = batch_result.data

                if not batch_products:
                    break

                all_products.extend(batch_products)
                page += 1

                if len(all_products) >= total_count:
                    break

            logger.info(f"✅ Loaded {len(all_products)} products successfully")
            return self._validate_products(all_products)

        except Exception as e:
            logger.error(f"❌ Error loading products: {e}")
            logger.info("🔄 Falling back to sample data")
            return self.get_sample_products()

    def load_promo_products(self, with_images_only: bool = False) -> List[Dict[str, Any]]:
        """Load promotional products, optionally only those with an image.

        Args:
            with_images_only: When True, only rows with a non-null picture_id.

        Returns:
            List of promo product dictionaries; sample data on failure.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection - returning sample promo data")
            return self._get_sample_promo_products()

        def build_query(columns: str, count: Optional[str] = None):
            # Fresh builder per request — the previous implementation reused
            # one builder for the count query and for every page, which
            # mutated the select list and stacked .range() state.
            if count:
                query = self.supabase.table('promo_products').select(columns, count=count)
            else:
                query = self.supabase.table('promo_products').select(columns)
            if with_images_only:
                query = query.not_.is_('picture_id', 'null')
            return query

        try:
            logger.info("📊 Loading promo products from database...")
            if with_images_only:
                logger.info("🖼️ Filtering for promo products with images only")

            # Get total count first (lightweight: one id + exact count header).
            try:
                count_result = build_query('id', count='exact').limit(1).execute()
                total_count = count_result.count
                logger.info(f"📋 Total promo products found: {total_count}")
            except Exception as count_error:
                logger.error(f"❌ Error getting promo count: {count_error}")
                # Fall back to an unfiltered simple count.
                simple_count = self.supabase.table('promo_products').select('id', count='exact').execute()
                total_count = simple_count.count
                logger.info(f"📋 Simple count result: {total_count}")

            if not total_count:
                logger.warning("❌ No promo products found in database")
                return self._get_sample_promo_products()

            # Fetch all promo products in batches.
            page_size = 1000
            all_promo_products = []
            page = 0

            logger.info(f"📦 Starting to fetch {total_count} promo products in batches...")

            while True:
                offset = page * page_size
                logger.info(f"⏳ Fetching promo products {offset + 1} to {offset + page_size}...")

                try:
                    batch_result = build_query(self._PROMO_COLUMNS).range(
                        offset, offset + page_size - 1).execute()
                    batch_products = batch_result.data

                    if not batch_products:
                        logger.info(f"📝 No more promo products found at offset {offset}")
                        break

                    all_promo_products.extend(batch_products)
                    page += 1

                    logger.info(f"✅ Loaded batch: {len(batch_products)} products (total so far: {len(all_promo_products)})")

                    if len(all_promo_products) >= total_count:
                        break

                except Exception as batch_error:
                    logger.error(f"❌ Error loading batch at offset {offset}: {batch_error}")
                    break

            logger.info(f"✅ Successfully loaded {len(all_promo_products)} promo products from database")

            # Log a sample of what we loaded for quick sanity checks.
            if all_promo_products:
                sample_product = all_promo_products[0]
                logger.info(f"📝 Sample promo product: {sample_product.get('name', 'No name')} from {sample_product.get('store', 'No store')}")

            return all_promo_products

        except Exception as e:
            logger.error(f"❌ Error loading promo products: {e}")
            logger.error(f"❌ Exception type: {type(e).__name__}")
            logger.error(f"❌ Exception details: {str(e)}")
            logger.info("🔄 Falling back to sample promo data")
            return self._get_sample_promo_products()

    def update_product_image(self, product_id: str, image_url: str) -> bool:
        """Update a product's image URL in the database.

        Args:
            product_id: Product ID to update.
            image_url: New image URL.

        Returns:
            True if the row was updated, False otherwise.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection")
            return False

        try:
            logger.info(f"📊 Updating product {product_id} with image URL")

            result = self.supabase.table('products').update({
                'product_image': image_url
            }).eq('product_id', product_id).execute()

            if result.data:
                logger.info(f"✅ Updated product {product_id} with image")
                return True
            logger.error(f"❌ Failed to update product {product_id}")
            return False

        except Exception as e:
            logger.error(f"❌ Database update error for product {product_id}: {e}")
            return False

    def get_products_without_images(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Get products that don't have an image yet.

        Args:
            limit: Maximum number of products to return (None = no limit).

        Returns:
            Validated list of products whose product_image is null.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection")
            return []

        try:
            logger.info("📊 Loading products without images...")

            query = self.supabase.table('products').select('''
                product_id,
                product_name,
                product_weight,
                brand:brands (
                    brand_id,
                    brand_name
                )
            ''').is_('product_image', 'null')

            if limit:
                query = query.limit(limit)

            result = query.execute()
            products = result.data

            logger.info(f"✅ Found {len(products)} products without images")
            return self._validate_products(products)

        except Exception as e:
            logger.error(f"❌ Error loading products without images: {e}")
            return []

    def _validate_products(self, products: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate and clean raw product rows.

        Drops non-dict entries and rows without a usable name; backfills a
        synthetic product_id when missing; strips whitespace from names.

        Args:
            products: Raw product list.

        Returns:
            Validated product list.
        """
        validated = []

        for i, product in enumerate(products):
            if not isinstance(product, dict):
                logger.warning(f"⚠️ Skipping invalid product at index {i}: not a dictionary")
                continue

            product_id = product.get('product_id')
            product_name = product.get('product_name')

            if not product_id:
                product['product_id'] = f"unknown_{i}"
                logger.warning(f"⚠️ Added missing product_id for product at index {i}")

            if not product_name or not product_name.strip():
                logger.warning(f"⚠️ Skipping product with missing/empty name: {product}")
                continue

            # Clean product name
            product['product_name'] = product_name.strip()

            validated.append(product)

        logger.info(f"✅ Validated {len(validated)} products (skipped {len(products) - len(validated)})")
        return validated

    def get_sample_products(self) -> List[Dict[str, Any]]:
        """Return a fixed set of sample products for offline testing."""
        logger.info("📊 Using sample products for testing")
        return [
            {
                "product_id": "1",
                "product_name": "vindija mlijeko cokoladno 2.8%",
                "product_weight": "1L",
                "brand": {"brand_id": "1", "brand_name": "Vindija"}
            },
            {
                "product_id": "2",
                "product_name": "vindija cokoladno mlijeko 2.8%",
                "product_weight": "1L",
                "brand": {"brand_id": "1", "brand_name": "Vindija"}
            },
            {
                "product_id": "3",
                "product_name": "Apple iPhone 13 Pro",
                "product_weight": "238g",
                "brand": {"brand_id": "3", "brand_name": "Apple"}
            },
            {
                "product_id": "4",
                "product_name": "iPhone 13 Pro Apple",
                "product_weight": "238g",
                "brand": {"brand_id": "3", "brand_name": "Apple"}
            },
            {
                "product_id": "5",
                "product_name": "Samsung Galaxy S22",
                "product_weight": "167g",
                "brand": {"brand_id": "4", "brand_name": "Samsung"}
            },
            {
                "product_id": "6",
                "product_name": "Galaxy S22 Samsung",
                "product_weight": "167g",
                "brand": {"brand_id": "4", "brand_name": "Samsung"}
            },
            {
                "product_id": "7",
                "product_name": "Coca Cola 330ml",
                "product_weight": "330ml",
                "brand": {"brand_id": "7", "brand_name": "Coca-Cola"}
            },
            {
                "product_id": "8",
                "product_name": "Pepsi Cola 330ml",
                "product_weight": "330ml",
                "brand": {"brand_id": "8", "brand_name": "PepsiCo"}
            }
        ]

    def _get_sample_promo_products(self) -> List[Dict[str, Any]]:
        """Return a fixed set of sample promo products for offline testing."""
        logger.info("📊 Using sample promo products for testing")
        return [
            {"id": "p1", "name": "vindija jogurt prirodni", "store": "Konzum", "promo_price": 1.20, "regular_price": 1.50, "picture_id": "img1"},
            {"id": "p2", "name": "coca cola original 500ml", "store": "Konzum", "promo_price": 0.80, "regular_price": 1.10, "picture_id": "img2"},
            {"id": "p3", "name": "samsung galaxy phone", "store": "Links", "promo_price": 299.99, "regular_price": 399.99, "picture_id": "img3"},
        ]

    def test_connection(self) -> Dict[str, Any]:
        """Probe both tables and report connection status.

        Returns:
            Status dictionary with connection flag, table counts and (on
            success) a truncated database URL.
        """
        if not self.supabase:
            return {
                "connected": False,
                "error": "No Supabase client initialized",
                "products_count": 0,
                "promo_products_count": 0
            }

        try:
            # Lightweight probes: one row each, exact count from the header.
            products_result = self.supabase.table('products').select('product_id', count='exact').limit(1).execute()
            products_count = products_result.count

            promo_result = self.supabase.table('promo_products').select('id', count='exact').limit(1).execute()
            promo_count = promo_result.count

            return {
                "connected": True,
                "products_count": products_count,
                "promo_products_count": promo_count,
                "database_url": self.supabase_url[:30] + "..." if self.supabase_url else "Unknown"
            }

        except Exception as e:
            return {
                "connected": False,
                "error": str(e),
                "products_count": 0,
                "promo_products_count": 0
            }
452
+
453
+
454
# Global repository instance (lazily created singleton)
_repository = None

def get_similarity_repository() -> SimilarityRepository:
    """Return the process-wide SimilarityRepository, creating it on first use."""
    global _repository
    if _repository is not None:
        return _repository
    _repository = SimilarityRepository()
    return _repository
product_detector/mock_detector.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import List, Dict
3
+ import warnings
4
+
5
class MockObjectDetector:
    """
    Mock Object Detector to temporarily replace the broken ONNX model.

    Produces a single dummy detection so the rest of the pipeline (and the
    server) keeps working while the real ONNX model is repaired.

    Fix over the previous revision: the reported class label is now taken
    from the configured class list (first entry) instead of being hardcoded
    to "product", which might not be a valid class name at all.
    """

    def __init__(self, model_path: str, class_names: List[str], input_size: int = 640):
        """Mirror the real detector's constructor signature.

        Args:
            model_path: Path to the (corrupted) model file — accepted for
                interface compatibility, intentionally unused.
            class_names: Labels the real model would predict.
            input_size: Expected square input size in pixels.
        """
        self.class_names = class_names
        self.input_size = input_size
        print(f"🔧 Mock detector initialized - model file was corrupted")
        print(f"📝 Available classes: {class_names}")

    def predict(self, image: np.ndarray) -> List[Dict]:
        """
        Mock prediction method - returns one sample detection.

        Replace this with the real detector once the model is fixed.

        Args:
            image: Input image array (ignored by the mock).

        Returns:
            List with a single detection dict: class label, confidence,
            pixel bbox [x1, y1, x2, y2] and normalized bbox [cx, cy, w, h].
        """
        mock_detections = [
            {
                # Use the first configured class so the label is always valid.
                "class": self.class_names[0] if self.class_names else "unknown",
                "confidence": 0.85,
                "bbox": [100, 100, 300, 250],  # x1, y1, x2, y2 (pixels)
                "bbox_normalized": [0.3, 0.3, 0.4, 0.5]  # center_x, center_y, width, height (normalized)
            }
        ]

        print(f"🔍 Mock detection completed - found {len(mock_detections)} objects")
        return mock_detections
requirements.txt CHANGED
@@ -12,4 +12,8 @@ python-dotenv
12
  supabase
13
  rembg
14
  httpx
15
- unidecode
 
 
 
 
 
12
  supabase
13
  rembg
14
  httpx
15
+ unidecode
16
+ # Similarity Engine Dependencies
17
+ requests
18
+ python-dateutil
19
+ pydantic
similarity_engine/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Similarity Engine Package
3
+ Backend version of the Price Hunter similarity engine
4
+ """
5
+
6
+ from .similarity_core import calculate_similarity, calculate_confidence
7
+ from .product_comparator import ProductComparator, compare_products_batch
8
+ from .enhanced_image_processor import get_image_processor
9
+
10
+ __version__ = "1.0.0"
11
+ __all__ = [
12
+ "calculate_similarity",
13
+ "calculate_confidence",
14
+ "ProductComparator",
15
+ "compare_products_batch",
16
+ "get_image_processor"
17
+ ]
similarity_engine/enhanced_image_processor.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Image Processor - Multiple Sources & Flexible Processing
3
+ Supports promo products, manual uploads, URL sources, Google Images, and more
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import requests
9
+ import time
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ import sys
12
+ import os
13
+
14
+ # Add parent directory to path
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from similarity_core import calculate_similarity, calculate_confidence
18
+ from db.similarity_repository import get_similarity_repository
19
+
20
+ # Configure logging
21
+ logging.basicConfig(level=logging.INFO)
22
+ logger = logging.getLogger(__name__)
23
+
24
class EnhancedImageProcessor:
    """Enhanced image processor with multiple sources and flexible options.

    Matches image-bearing source products (promo feeds, manual uploads,
    URL lists, Google results) to database products by name similarity,
    then downloads/processes the images via the backend endpoint and
    attaches them through the similarity repository.
    """

    def __init__(self):
        """Initialize the image processor with the shared similarity repository."""
        self.repository = get_similarity_repository()
        # NOTE(review): most methods build and return fresh stats dicts via
        # _get_empty_stats(); this attribute is initialized but not updated here.
        self.processing_stats = {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0
        }

    def find_high_similarity_matches(
        self,
        source_products: List[Dict],
        target_products: List[Dict],
        threshold: float = 0.95,
        source_type: str = "promo"
    ) -> List[Dict[str, Any]]:
        """
        Find high similarity matches between source and target products

        Args:
            source_products: Products with images (promo, manual, etc.)
            target_products: Database products to match against
            threshold: Similarity threshold
            source_type: Type of source ("promo", "manual", "google", etc.)

        Returns:
            List of high similarity matches (at most one per source product)
        """
        logger.info(f"🔍 Finding high similarity matches for {source_type} images")
        logger.info(f"📊 Source products: {len(source_products)}")
        logger.info(f"📊 Target products: {len(target_products)}")
        logger.info(f"🎯 Similarity threshold: {threshold}")

        matches = []

        for i, source_product in enumerate(source_products):
            source_name = source_product.get('name', '').strip()
            if not source_name:
                continue

            logger.info(f"📊 Analyzing {source_type} product {i+1}/{len(source_products)}: {source_name[:50]}...")

            for target_product in target_products:
                target_name = target_product.get('product_name', '').strip()
                if not target_name:
                    continue

                similarity = calculate_similarity(source_name, target_name)

                if similarity >= threshold:
                    confidence = calculate_confidence(similarity, source_name, target_name)

                    match = {
                        'source_id': source_product.get('id'),
                        'source_name': source_name,
                        'source_type': source_type,
                        'target_product_id': target_product.get('product_id'),
                        'target_product_name': target_name,
                        'similarity': round(similarity, 3),
                        'confidence': round(confidence, 3),
                        'has_current_image': bool(target_product.get('product_image')),
                        'source_image_info': self._extract_image_info(source_product, source_type)
                    }

                    matches.append(match)
                    logger.info(f" 🔍 HIGH MATCH: {source_name} ↔ {target_name} ({similarity:.3f})")
                    # NOTE(review): keeps the FIRST target over threshold, not
                    # the best-scoring one — confirm this is intended.
                    break

        logger.info(f"✅ Found {len(matches)} high similarity matches")
        return matches

    def _extract_image_info(self, product: Dict, source_type: str) -> Dict[str, Any]:
        """Extract image information based on source type.

        Each source type carries its image URL and provenance under different
        keys; this normalizes them into one dictionary for downstream use.
        """
        if source_type == "promo":
            picture_id = product.get('picture_id')
            return {
                'picture_id': picture_id,
                # Promo feed images live on the 360promo CDN, keyed by picture_id.
                'image_url': f"https://backend.360promo.hr/contents/products/{picture_id}.jpg" if picture_id else None,
                'store': product.get('store'),
                'promo_price': product.get('promo_price'),
                'regular_price': product.get('regular_price')
            }
        elif source_type == "manual":
            return {
                'image_url': product.get('image_url'),
                'original_filename': product.get('filename'),
                'uploaded_by': product.get('uploaded_by')
            }
        elif source_type == "google":
            return {
                'image_url': product.get('image_url'),
                'source_page': product.get('source_page'),
                'search_query': product.get('search_query')
            }
        elif source_type == "url":
            return {
                'image_url': product.get('image_url'),
                'source_domain': product.get('source_domain')
            }
        else:
            # Unknown source types: fall back to whichever URL key is present.
            return {
                'image_url': product.get('image_url', product.get('picture_url'))
            }

    def check_image_availability(self, image_url: str) -> bool:
        """Check if image URL is accessible (HEAD request, 10s timeout).

        NOTE(review): some servers reject HEAD while serving GET fine —
        a False here may be a false negative; confirm against real hosts.
        """
        try:
            response = requests.head(image_url, timeout=10)
            return response.status_code == 200
        except Exception as e:
            logger.warning(f"⚠️ Image not accessible: {image_url} - {e}")
            return False

    def process_image_from_url(
        self,
        image_url: str,
        product_id: str,
        processing_options: Dict[str, Any] = None
    ) -> Optional[str]:
        """
        Download and process image from URL

        Args:
            image_url: Source image URL
            product_id: Target product ID
            processing_options: Processing configuration (defaults to
                background removal + 2x upscale when omitted)

        Returns:
            Processed image URL, the original URL if backend processing
            failed, or None if the download itself failed
        """
        if processing_options is None:
            processing_options = {
                'remove_background': True,
                'upscale_factor': 2,
                'target_format': 'webp',
                'quality': 85
            }

        try:
            logger.info(f"📥 Downloading image from: {image_url}")

            # Download image
            response = requests.get(image_url, timeout=30)
            if response.status_code != 200:
                logger.error(f"❌ Failed to download: HTTP {response.status_code}")
                return None

            logger.info("✅ Image downloaded successfully")

            # Try to process via backend endpoint
            processed_url = self._process_via_backend(
                response.content,
                product_id,
                processing_options
            )

            if processed_url:
                return processed_url

            # If processing fails, return original URL (best-effort fallback)
            logger.warning("⚠️ Processing failed, using original URL")
            return image_url

        except Exception as e:
            logger.error(f"❌ Error processing image from URL: {e}")
            return None

    def _process_via_backend(
        self,
        image_content: bytes,
        product_id: str,
        options: Dict[str, Any]
    ) -> Optional[str]:
        """Process image via the backend image-processing endpoint.

        Posts the raw bytes as multipart form data; endpoint is taken from
        the IMAGE_PROCESS_ENDPOINT env var with a localhost default.
        Returns the processed image URL, or None on any failure.
        """
        try:
            # Get backend endpoint
            endpoint = os.getenv('IMAGE_PROCESS_ENDPOINT', 'http://localhost:7860/products/process-product-image')

            files = {'file': ('image.jpg', image_content, 'image/jpeg')}
            # Form fields are stringified booleans/ints as the endpoint expects.
            data = {
                'remove_bg': str(options.get('remove_background', True)).lower(),
                'upscale': str(options.get('upscale_factor', 2) > 1).lower(),
                'scale_factor': str(options.get('upscale_factor', 2)),
                'process_order': 'remove_first',
                'product_id': product_id
            }

            response = requests.post(endpoint, files=files, data=data, timeout=60)

            if response.status_code == 200:
                result = response.json()
                if result.get('status') == 'success':
                    logger.info("✅ Image processed successfully via backend")
                    return result.get('image_url')

            logger.warning(f"⚠️ Backend processing failed: {response.status_code}")
            return None

        except Exception as e:
            logger.warning(f"⚠️ Backend processing unavailable: {e}")
            return None

    def process_promo_images(
        self,
        similarity_threshold: float = 0.95,
        skip_existing: bool = True,
        max_products: Optional[int] = None
    ) -> Dict[str, int]:
        """
        Process images from promotional products

        Args:
            similarity_threshold: Minimum similarity for processing
            skip_existing: Skip products that already have images
            max_products: Maximum products to process

        Returns:
            Processing statistics
        """
        logger.info("🏷️ Starting promo image processing...")

        # Load promo products with images
        promo_products = self.repository.load_promo_products(with_images_only=True)
        if not promo_products:
            logger.error("❌ No promo products with images found")
            return self._get_empty_stats()

        # Load target products
        if skip_existing:
            target_products = self.repository.get_products_without_images(max_products)
        else:
            all_products = self.repository.load_all_products()
            target_products = all_products[:max_products] if max_products else all_products

        if not target_products:
            logger.error("❌ No target products found")
            return self._get_empty_stats()

        # Find matches
        matches = self.find_high_similarity_matches(
            promo_products,
            target_products,
            similarity_threshold,
            "promo"
        )

        return self._process_matches(matches, skip_existing)

    def process_manual_upload(
        self,
        image_file: bytes,
        filename: str,
        product_id: str,
        processing_options: Dict[str, Any] = None
    ) -> bool:
        """
        Process manually uploaded image

        Args:
            image_file: Image file content
            filename: Original filename
            product_id: Target product ID
            processing_options: Processing configuration

        Returns:
            True if successful
        """
        logger.info(f"📤 Processing manual upload for product {product_id}")

        try:
            # Process image
            processed_url = self._process_via_backend(
                image_file,
                product_id,
                processing_options or {}
            )

            if not processed_url:
                logger.error("❌ Failed to process uploaded image")
                return False

            # Update database
            success = self.repository.update_product_image(product_id, processed_url)

            if success:
                # Save metadata (provenance record for the attached image)
                self.repository.save_image_metadata(product_id, {
                    'source_type': 'manual',
                    'original_filename': filename,
                    'processed_url': processed_url,
                    'upload_time': time.time()
                })

                logger.info(f"✅ Successfully attached manual upload to product {product_id}")
                return True

            return False

        except Exception as e:
            logger.error(f"❌ Error processing manual upload: {e}")
            return False

    def process_from_url_list(
        self,
        url_mappings: List[Dict[str, str]],
        processing_options: Dict[str, Any] = None
    ) -> Dict[str, int]:
        """
        Process images from a list of URL mappings

        Args:
            url_mappings: List of {'product_id': 'xxx', 'image_url': 'xxx'} mappings
            processing_options: Processing configuration

        Returns:
            Processing statistics
        """
        logger.info(f"🌐 Processing {len(url_mappings)} URL mappings...")

        stats = self._get_empty_stats()
        stats['total_processed'] = len(url_mappings)

        for mapping in url_mappings:
            product_id = mapping.get('product_id')
            image_url = mapping.get('image_url')

            # Both keys are required for a usable mapping.
            if not product_id or not image_url:
                stats['failed'] += 1
                continue

            logger.info(f"📊 Processing URL for product {product_id}")

            # Check availability
            if not self.check_image_availability(image_url):
                stats['failed'] += 1
                continue

            # Process image
            processed_url = self.process_image_from_url(
                image_url,
                product_id,
                processing_options
            )

            if processed_url:
                # Update database
                if self.repository.update_product_image(product_id, processed_url):
                    stats['successful'] += 1

                    # Save metadata
                    self.repository.save_image_metadata(product_id, {
                        'source_type': 'url',
                        'source_url': image_url,
                        'processed_url': processed_url,
                        'processing_time': time.time()
                    })
                else:
                    stats['failed'] += 1
            else:
                stats['failed'] += 1

        logger.info(f"✅ URL processing complete: {stats['successful']}/{stats['total_processed']} successful")
        return stats

    def search_and_attach_google_images(
        self,
        product_id: str,
        search_query: str,
        max_results: int = 3,
        require_approval: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Search Google Images and find potential matches

        Args:
            product_id: Target product ID
            search_query: Search query for Google Images
            max_results: Maximum results to return
            require_approval: Whether manual approval is required

        Returns:
            List of potential image matches (currently hard-coded mock data;
            Google Images API integration is not implemented)
        """
        logger.info(f"🔍 Google Image search for product {product_id}: '{search_query}'")

        # TODO: Implement Google Images API integration
        # For now, return mock results
        mock_results = [
            {
                'image_url': f'https://example.com/mock-image-1.jpg',
                'thumbnail_url': f'https://example.com/mock-thumb-1.jpg',
                'source_page': f'https://example.com/product-page-1',
                'title': f'Mock result for {search_query}',
                'confidence': 0.85
            }
        ]

        logger.info(f"🔍 Found {len(mock_results)} potential Google Image matches")
        logger.warning("⚠️ Google Images integration not yet implemented - returning mock data")

        return mock_results

    def _process_matches(self, matches: List[Dict], skip_existing: bool = True) -> Dict[str, int]:
        """Process similarity matches and attach images.

        For each match: verify the source image URL is reachable, process it
        via the backend, write the resulting URL to the product record, and
        save provenance metadata. Returns per-outcome counters.
        """
        stats = self._get_empty_stats()
        stats['total_processed'] = len(matches)

        if not matches:
            return stats

        # Filter existing if needed
        if skip_existing:
            to_process = [m for m in matches if not m['has_current_image']]
            stats['skipped'] = len(matches) - len(to_process)
            matches = to_process

        logger.info(f"📊 Processing images for {len(matches)} products...")

        for match in matches:
            product_id = match['target_product_id']
            image_info = match['source_image_info']
            image_url = image_info.get('image_url')

            if not image_url:
                stats['failed'] += 1
                continue

            logger.info(f"📊 Processing image for product {product_id}")

            # Check availability
            if not self.check_image_availability(image_url):
                stats['failed'] += 1
                continue

            # Process image
            processed_url = self.process_image_from_url(image_url, product_id)

            if processed_url and self.repository.update_product_image(product_id, processed_url):
                stats['successful'] += 1

                # Save metadata
                self.repository.save_image_metadata(product_id, {
                    'source_type': match['source_type'],
                    'similarity': match['similarity'],
                    'confidence': match['confidence'],
                    'source_info': image_info,
                    'processing_time': time.time()
                })

                logger.info(f"✅ Successfully attached image to product {product_id}")
            else:
                stats['failed'] += 1

        return stats

    def _get_empty_stats(self) -> Dict[str, int]:
        """Get empty statistics dictionary (all counters zeroed)."""
        return {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'unavailable': 0
        }

    def get_processing_report(self, stats: Dict[str, int]) -> Dict[str, Any]:
        """Generate processing report with summary, timestamp and recommendations."""
        return {
            'summary': {
                'total_processed': stats['total_processed'],
                'successful': stats['successful'],
                'failed': stats['failed'],
                'skipped': stats.get('skipped', 0),
                # max(..., 1) guards against division by zero on empty runs.
                'success_rate': (stats['successful'] / max(stats['total_processed'], 1)) * 100
            },
            'timestamp': time.time(),
            'recommendations': self._generate_recommendations(stats)
        }

    def _generate_recommendations(self, stats: Dict[str, int]) -> List[str]:
        """Generate human-readable recommendations based on processing stats."""
        recommendations = []

        if stats['failed'] > stats['successful']:
            recommendations.append("High failure rate - check image sources and processing settings")

        if stats.get('skipped', 0) > 0:
            recommendations.append(f"{stats['skipped']} products already had images - consider processing all products")

        if stats['successful'] > 0:
            recommendations.append(f"Successfully processed {stats['successful']} images - consider similar processing for remaining products")

        return recommendations
521
+
522
+
523
# Module-level cache for the lazily created singleton processor.
_processor = None


def get_image_processor() -> EnhancedImageProcessor:
    """Return the shared EnhancedImageProcessor, creating it on first use."""
    global _processor
    if _processor is not None:
        return _processor
    _processor = EnhancedImageProcessor()
    return _processor
similarity_engine/product_comparator.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Product Comparator - Backend Version
3
+ Modified to return data instead of printing for API usage
4
+ """
5
+
6
+ import time
7
+ import logging
8
+ from typing import List, Dict, Any
9
+ import sys
10
+ import os
11
+
12
+ # Add the similarity_engine directory to path
13
+ current_dir = os.path.dirname(os.path.abspath(__file__))
14
+ sys.path.append(current_dir)
15
+
16
+ from similarity_core import calculate_similarity, calculate_confidence
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
class ProductComparator:
    """
    Backend version of ProductComparator that returns analysis data
    instead of printing it, so results can be served through the API.

    Attributes:
        threshold: Similarity score at or above which two names are duplicates.
        min_length_diff: Minimum length difference before a name contained in
            another is treated as a genuinely different product.
        comparison_count: Running total of comparisons performed by this instance.
        duplicates_found: Running total of duplicate pairs found by this instance.
    """

    def __init__(self, threshold: float = 0.87, min_length_diff: int = 3):
        """
        Initialize the comparator

        Args:
            threshold: Similarity threshold for considering products as duplicates
            min_length_diff: Minimum length difference to consider as different products
        """
        self.threshold = threshold
        self.min_length_diff = min_length_diff
        self.comparison_count = 0
        self.duplicates_found = 0

    def is_valid_comparison(self, name1: str, name2: str) -> bool:
        """
        Check if this comparison should be made to avoid false duplicates

        Args:
            name1: First product name
            name2: Second product name

        Returns:
            True if this comparison should be made
        """
        # Skip empty names
        if not name1 or not name2:
            return False

        # Skip identical names (already handled)
        if name1 == name2:
            return False

        # Skip near-containment: one name inside the other with only a tiny
        # length difference is almost certainly the same listing.
        if (name1 in name2 or name2 in name1) and abs(len(name1) - len(name2)) < self.min_length_diff:
            return False

        return True

    def find_all_duplicates(self, products: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Find all duplicate pairs in product list

        Args:
            products: List of product dictionaries with 'product_id' and 'product_name'

        Returns:
            Dictionary with analysis results and duplicate pairs
        """
        start_time = time.time()
        total_products = len(products)
        total_comparisons = total_products * (total_products - 1) // 2
        duplicates = []

        logger.info(f"🔍 Starting duplicate analysis for {total_products} products")
        logger.info(f"📊 Total comparisons needed: {total_comparisons:,}")
        logger.info(f"🎯 Duplicate threshold: {self.threshold}")

        comparison_count = 0
        progress_updates = []

        for i, product1 in enumerate(products):
            product_name1 = product1.get('product_name', '').strip()
            product_id1 = product1.get('product_id', 'unknown')

            # Skip products with empty names
            if not product_name1:
                continue

            # Log progress every 100 products
            if i % 100 == 0:
                progress_updates.append(f"Analyzing product {i+1}/{total_products}")
                logger.info(f"📊 Progress: {i+1}/{total_products} products analyzed")

            # Only compare each unordered pair once (j starts at i+1).
            for j, product2 in enumerate(products[i+1:], i+1):
                product_name2 = product2.get('product_name', '').strip()
                product_id2 = product2.get('product_id', 'unknown')

                # Skip products with empty names or identical IDs
                if not product_name2 or product_id1 == product_id2:
                    continue

                # Skip invalid comparisons
                if not self.is_valid_comparison(product_name1, product_name2):
                    continue

                comparison_count += 1

                # Calculate similarity
                similarity = calculate_similarity(product_name1, product_name2)

                # Check if it's a duplicate
                if similarity >= self.threshold:
                    confidence = calculate_confidence(
                        similarity,
                        product_name1,
                        product_name2
                    )

                    duplicate_info = {
                        'product1_id': product_id1,
                        'product1_name': product_name1,
                        'product2_id': product_id2,
                        'product2_name': product_name2,
                        'similarity': round(similarity, 3),
                        'confidence': round(confidence, 3)
                    }

                    duplicates.append(duplicate_info)
                    logger.info(f" 🔍 DUPLICATE FOUND: {product_name1} ↔ {product_name2} ({similarity:.3f})")

        end_time = time.time()
        analysis_time = end_time - start_time

        # BUGFIX: previously only the local counter was updated, so
        # get_statistics() always reported zeros. Fold this run's results
        # into the instance-level counters.
        self.comparison_count += comparison_count
        self.duplicates_found += len(duplicates)

        # Return comprehensive results
        results = {
            'duplicates': duplicates,
            'analysis_summary': {
                'total_products': total_products,
                'total_comparisons': comparison_count,
                'duplicates_found': len(duplicates),
                'duplicate_rate': (len(duplicates) / max(total_products, 1)) * 100,
                'analysis_time_seconds': round(analysis_time, 2),
                'threshold_used': self.threshold,
                'success': True
            },
            'progress_log': progress_updates,
            'recommendations': self._generate_recommendations(duplicates, total_products)
        }

        logger.info(f"✅ Analysis complete: {len(duplicates)} duplicates found in {analysis_time:.1f}s")
        return results

    def _generate_recommendations(self, duplicates: List[Dict], total_products: int) -> List[str]:
        """Generate recommendations based on analysis results"""
        recommendations = []

        if not duplicates:
            recommendations.append("✅ No duplicates found - your database is clean!")
            recommendations.append("💡 Consider periodic duplicate checks as you add new products")
        else:
            duplicate_rate = (len(duplicates) / total_products) * 100

            if duplicate_rate > 10:
                recommendations.append("⚠️ High duplicate rate detected - consider cleaning database")
                recommendations.append("🔧 Review product naming conventions to reduce future duplicates")
            elif duplicate_rate > 5:
                recommendations.append("💡 Moderate duplicate rate - review and merge similar products")
            else:
                recommendations.append("✅ Low duplicate rate - good database quality")

            recommendations.append(f"📋 Review {len(duplicates)} duplicate pairs for manual decision")
            recommendations.append("💡 Higher similarity scores indicate more confident matches")

        return recommendations

    def compare_two_products(self, product1_name: str, product2_name: str) -> Dict[str, Any]:
        """
        Compare two specific products

        Args:
            product1_name: First product name
            product2_name: Second product name

        Returns:
            Comparison results
        """
        logger.info(f"🔍 Comparing '{product1_name}' vs '{product2_name}'")

        similarity = calculate_similarity(product1_name, product2_name)
        confidence = calculate_confidence(similarity, product1_name, product2_name)
        is_duplicate = similarity >= self.threshold

        # Keep the instance counters in sync for get_statistics().
        self.comparison_count += 1
        if is_duplicate:
            self.duplicates_found += 1

        # Determine assessment
        if similarity >= 0.95:
            assessment = "Perfect match - identical products"
            assessment_emoji = "✅"
        elif similarity >= 0.8:
            assessment = "Very similar - likely duplicates"
            assessment_emoji = "⚠️"
        elif similarity >= 0.7:
            assessment = "Similar - review recommended"
            assessment_emoji = "🤔"
        else:
            assessment = "Different products"
            assessment_emoji = "❌"

        results = {
            'product1_name': product1_name,
            'product2_name': product2_name,
            'similarity': round(similarity, 3),
            'confidence': round(confidence, 3),
            'is_duplicate': is_duplicate,
            'threshold_used': self.threshold,
            'assessment': {
                'description': assessment,
                'emoji': assessment_emoji,
                'category': 'identical' if similarity >= 0.95 else
                           'very_similar' if similarity >= 0.8 else
                           'similar' if similarity >= 0.7 else 'different'
            },
            'percentage_similarity': round(similarity * 100, 1),
            'percentage_confidence': round(confidence * 100, 1)
        }

        logger.info(f"📊 Comparison result: {similarity:.3f} similarity, {assessment}")
        return results

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get analysis statistics

        Returns:
            Dictionary with the totals accumulated across all analyses run
            on this instance, plus its configuration.
        """
        return {
            'comparison_count': self.comparison_count,
            'duplicates_found': self.duplicates_found,
            'threshold': self.threshold,
            'min_length_diff': self.min_length_diff
        }
+ }
247
+
248
+
249
+ def compare_products_batch(
250
+ products: List[Dict[str, Any]],
251
+ threshold: float = 0.87,
252
+ return_summary_only: bool = False
253
+ ) -> Dict[str, Any]:
254
+ """
255
+ Convenience function to compare products and return results
256
+
257
+ Args:
258
+ products: List of product dictionaries
259
+ threshold: Similarity threshold for duplicates
260
+ return_summary_only: If True, only return summary stats
261
+
262
+ Returns:
263
+ Analysis results dictionary
264
+ """
265
+ logger.info(f"🚀 Starting batch product comparison with threshold {threshold}")
266
+
267
+ comparator = ProductComparator(threshold=threshold)
268
+ results = comparator.find_all_duplicates(products)
269
+
270
+ if return_summary_only:
271
+ return {
272
+ 'analysis_summary': results['analysis_summary'],
273
+ 'recommendations': results['recommendations']
274
+ }
275
+
276
+ return results
277
+
278
+
279
+ def find_product_duplicates_simple(
280
+ product_names: List[str],
281
+ threshold: float = 0.87
282
+ ) -> List[Dict[str, Any]]:
283
+ """
284
+ Simple function to find duplicates in a list of product names
285
+
286
+ Args:
287
+ product_names: List of product names
288
+ threshold: Similarity threshold
289
+
290
+ Returns:
291
+ List of duplicate pairs
292
+ """
293
+ # Convert to product format
294
+ products = [
295
+ {"product_id": str(i), "product_name": name}
296
+ for i, name in enumerate(product_names) if name and name.strip()
297
+ ]
298
+
299
+ results = compare_products_batch(products, threshold, return_summary_only=False)
300
+ return results.get('duplicates', [])
similarity_engine/promo_comparator.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Promo Products Comparator - Compare promo products against database products
4
+ Integrates with the existing similarity engine for duplicate detection
5
+ Backend version with streaming support
6
+ """
7
+
8
+ import sys
9
+ import os
10
+ import time
11
+ import logging
12
+ from typing import List, Dict, Any, AsyncGenerator
13
+ import asyncio
14
+
15
+ # Add current directory to path for imports
16
+ current_dir = os.path.dirname(os.path.abspath(__file__))
17
+ sys.path.append(current_dir)
18
+
19
+ try:
20
+ from similarity_core import calculate_similarity, calculate_confidence
21
+ from similarity_repository import get_similarity_repository
22
+ except ImportError as e:
23
+ print(f"❌ Import error: {e}")
24
+ print("💡 Make sure you're running this from the similarity_engine directory")
25
+
26
+ # Configure logging
27
+ logging.basicConfig(level=logging.INFO)
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class PromoComparator:
    """Handles promo product comparison with streaming support.

    Compares promotional products against database products using the
    module-level ``calculate_similarity`` / ``calculate_confidence`` helpers
    and reports every promo product whose best database match scores at or
    above a configurable similarity threshold.
    """

    def __init__(self, threshold: float = 0.85):
        # Minimum similarity score for a db product to count as a match.
        self.threshold = threshold
        # Data access layer; run_promo_comparison checks it for None.
        self.repository = get_similarity_repository()

    def _best_match_for(self, promo_product: Dict, promo_name: str, promo_store: str, db_products: List[Dict]):
        """Return the best-scoring match payload for one promo product, or None.

        Scans every database product, keeps the highest similarity that is
        >= self.threshold, and builds the match dict for it. Shared by the
        batch and streaming comparison paths so their results stay identical.
        """
        best_match = None
        best_similarity = 0.0

        for db_product in db_products:
            db_name = db_product.get('product_name', '').strip()

            if not db_name:
                continue

            similarity = calculate_similarity(promo_name, db_name)

            if similarity >= self.threshold and similarity > best_similarity:
                best_similarity = similarity
                confidence = calculate_confidence(similarity, promo_name, db_name)

                best_match = {
                    'promo_id': promo_product.get('id'),
                    'promo_name': promo_name,
                    'promo_store': promo_store,
                    'promo_price': promo_product.get('promo_price', 0),
                    'regular_price': promo_product.get('regular_price', 0),
                    'picture_id': promo_product.get('picture_id'),
                    'db_product_id': db_product.get('product_id'),
                    'db_product_name': db_name,
                    'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
                    'similarity': round(similarity, 3),
                    'confidence': round(confidence, 3)
                }

        return best_match

    def compare_promo_against_database(self, promo_products: List[Dict], db_products: List[Dict]) -> List[Dict]:
        """
        Compare promo products against database products for potential matches

        Args:
            promo_products: List of promo product dictionaries
            db_products: List of database product dictionaries

        Returns:
            List of potential matches
        """
        logger.info(f"🔍 Starting promo comparison: {len(promo_products)} promo vs {len(db_products)} db products")

        matches = []
        total_comparisons = len(promo_products)

        start_time = time.time()

        for i, promo_product in enumerate(promo_products):
            promo_name = promo_product.get('name', '').strip()
            promo_store = promo_product.get('store', '').strip()

            # Skip promo rows without a usable name.
            if not promo_name:
                continue

            logger.info(f"📊 Analyzing promo product {i+1}/{total_comparisons}: '{promo_name[:50]}'")

            best_match = self._best_match_for(promo_product, promo_name, promo_store, db_products)

            if best_match:
                matches.append(best_match)
                logger.info(f"🔍 MATCH FOUND: {promo_name} ↔ {best_match['db_product_name']} ({best_match['similarity']:.3f})")

        end_time = time.time()
        analysis_time = end_time - start_time

        logger.info(f"✅ Comparison complete! Found {len(matches)} potential matches in {analysis_time:.1f} seconds")

        return matches

    async def compare_promo_streaming(self, promo_products: List[Dict], db_products: List[Dict]) -> AsyncGenerator[Dict, None]:
        """
        Compare promo products with real-time streaming updates

        Args:
            promo_products: List of promo product dictionaries
            db_products: List of database product dictionaries

        Yields:
            Progress updates and results ('init', 'comparing', 'match_found',
            then a final 'complete' event carrying all matches and a summary)
        """
        yield {
            'type': 'init',
            'total_promo_products': len(promo_products),
            'total_db_products': len(db_products),
            'threshold': self.threshold
        }

        matches = []
        total_comparisons = len(promo_products)

        start_time = time.time()

        for i, promo_product in enumerate(promo_products):
            promo_name = promo_product.get('name', '').strip()
            promo_store = promo_product.get('store', '').strip()

            if not promo_name:
                continue

            # Send current comparison info
            yield {
                'type': 'comparing',
                'current': i + 1,
                'total': total_comparisons,
                'promo_name': promo_name[:50],
                'promo_store': promo_store,
                'progress': round(((i + 1) / total_comparisons) * 100, 1)
            }

            best_match = self._best_match_for(promo_product, promo_name, promo_store, db_products)

            # If match found, send immediate update
            if best_match:
                matches.append(best_match)
                yield {
                    'type': 'match_found',
                    'match': best_match,
                    'total_matches': len(matches)
                }

            # Small delay to prevent overwhelming
            await asyncio.sleep(0.01)

        # Send final results
        analysis_time = time.time() - start_time
        yield {
            'type': 'complete',
            'matches': matches,
            'summary': {
                'total_promo_products': len(promo_products),
                'total_db_products': len(db_products),
                'matches_found': len(matches),
                'match_rate': round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                'analysis_time_seconds': round(analysis_time, 2),
                'threshold_used': self.threshold
            }
        }

    def run_promo_comparison(self, save_report: bool = True) -> Dict[str, Any]:
        """
        Main function to run promo product comparison

        Args:
            save_report: Whether to save results to file
                TODO(review): currently unused — no report is written; confirm
                whether file output should be implemented or the flag removed.

        Returns:
            Comparison results, or a dict with an 'error' key on failure
        """
        logger.info("🏷️ Starting promo products comparison")

        if not self.repository:
            logger.error("❌ Repository not available")
            return {'error': 'Repository not available'}

        # Load promo products
        logger.info("📊 Loading promotional products...")
        promo_products = self.repository.load_promo_products()

        if not promo_products:
            logger.warning("❌ No promo products found")
            return {'error': 'No promo products found'}

        # Load database products
        logger.info("📊 Loading database products...")
        db_products = self.repository.load_all_products()

        if not db_products:
            logger.warning("❌ No database products found")
            return {'error': 'No database products found'}

        # Run comparison
        matches = self.compare_promo_against_database(promo_products, db_products)

        # Prepare results
        results = {
            'matches': matches,
            'summary': {
                'total_promo_products': len(promo_products),
                'total_db_products': len(db_products),
                'matches_found': len(matches),
                'match_rate': round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                'threshold_used': self.threshold
            }
        }

        logger.info(f"✅ Promo comparison complete: {len(matches)} matches found")

        return results
257
+
258
+
259
def get_promo_comparator(threshold: float = 0.85) -> PromoComparator:
    """Factory: build a PromoComparator configured with *threshold*."""
    comparator = PromoComparator(threshold=threshold)
    return comparator
262
+
263
+
264
+ # For backwards compatibility
265
def run_promo_comparison(threshold: float = 0.85, save_report: bool = True):
    """
    Main function to run promo product comparison (backwards compatibility)

    Args:
        threshold: Similarity threshold for matches
        save_report: Whether to save results to file
    """
    return PromoComparator(threshold=threshold).run_promo_comparison(save_report=save_report)
275
+
276
+
277
+ if __name__ == "__main__":
278
+ run_promo_comparison()
similarity_engine/similarity_core.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ COMPLETE Similarity Engine - Core Module
3
+ All original methods preserved with improved duplicate detection
4
+ """
5
+
6
+ import re
7
+ from typing import Set
8
+
9
+
10
def normalize_product_name(name: str) -> str:
    """Lower-case *name*, replace punctuation with spaces, collapse runs of
    whitespace, and drop common Croatian/English stop words."""
    if not name:
        return ""
    cleaned = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', name.lower())).strip()
    stop_words = {'za', 'i', 'u', 'na', 'sa', 'od', 'the', 'and', 'of', 'in'}
    kept = (token for token in cleaned.split() if token not in stop_words)
    return ' '.join(kept)
20
+
21
+
22
def word_order_similarity(s1: str, s2: str) -> float:
    """Jaccard overlap of the normalized word sets of *s1* and *s2*.

    Word order itself is ignored (sets are compared), so reordered names
    score 1.0. Returns 0.0 when either normalized string is empty.
    """
    norm_a = normalize_product_name(s1)
    norm_b = normalize_product_name(s2)
    if not norm_a or not norm_b:
        return 0.0
    tokens_a = set(norm_a.split())
    tokens_b = set(norm_b.split())
    if not tokens_a and not tokens_b:
        return 1.0
    if not tokens_a or not tokens_b:
        return 0.0
    union_size = len(tokens_a | tokens_b)
    shared = len(tokens_a & tokens_b)
    return shared / union_size if union_size > 0 else 0.0
37
+
38
+
39
def dice_coefficient(s1: str, s2: str) -> float:
    """Sørensen–Dice similarity over character bigrams (case-insensitive).

    Returns 1.0 when neither string yields any bigram (both shorter than two
    characters), and 0.0 when only one of them does.
    """
    if not s1 or not s2:
        return 0.0
    a, b = s1.lower(), s2.lower()
    grams_a = {a[i:i + 2] for i in range(len(a) - 1)}
    grams_b = {b[i:i + 2] for i in range(len(b) - 1)}
    if not grams_a and not grams_b:
        return 1.0
    if not grams_a or not grams_b:
        return 0.0
    shared = grams_a & grams_b
    return 2.0 * len(shared) / (len(grams_a) + len(grams_b))
52
+
53
+
54
def jaro_winkler(s1: str, s2: str) -> float:
    """Calculate Jaro-Winkler similarity.

    Returns a score in [0, 1]; 1.0 for identical strings (case-insensitive),
    0.0 when either input is empty or no characters match. The Winkler
    adjustment boosts the base Jaro score for strings sharing a common
    prefix (capped at 4 characters, scaling factor 0.1).
    """
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    len1, len2 = len(s1), len(s2)
    # Characters only count as matching when within this distance of each
    # other (standard Jaro matching window).
    match_window = max(len1, len2) // 2 - 1
    match_window = max(0, match_window)
    s1_matches = [False] * len1
    s2_matches = [False] * len2
    matches = 0
    transpositions = 0
    # First pass: greedily flag matching characters within the window.
    for i in range(len1):
        start = max(0, i - match_window)
        end = min(i + match_window + 1, len2)
        for j in range(start, end):
            if s2_matches[j] or s1[i] != s2[j]:
                continue
            s1_matches[i] = s2_matches[j] = True
            matches += 1
            break
    if matches == 0:
        return 0.0
    # Second pass: walk both matched sequences in order and count positions
    # where the matched characters differ (transpositions).
    k = 0
    for i in range(len1):
        if not s1_matches[i]:
            continue
        while not s2_matches[k]:
            k += 1
        if s1[i] != s2[k]:
            transpositions += 1
        k += 1
    # Standard Jaro formula; transpositions are counted in halves.
    jaro = (matches/len1 + matches/len2 + (matches - transpositions/2)/matches) / 3.0
    # Winkler prefix bonus: length of the common prefix, capped at 4.
    prefix = 0
    for i in range(min(len1, len2, 4)):
        if s1[i] == s2[i]:
            prefix += 1
        else:
            break
    return jaro + (0.1 * prefix * (1 - jaro))
96
+
97
+
98
def levenshtein_similarity(s1: str, s2: str) -> float:
    """Normalized Levenshtein similarity in [0, 1] (case-insensitive).

    1.0 means identical strings; the edit distance is divided by the length
    of the longer string. Uses a rolling two-row DP table instead of the
    full matrix — same result, O(min) memory.
    """
    if not s1 or not s2:
        return 0.0
    a, b = s1.lower(), s2.lower()
    if a == b:
        return 1.0
    previous = list(range(len(b) + 1))
    for row, ch_a in enumerate(a, start=1):
        current = [row]
        for col, ch_b in enumerate(b, start=1):
            substitution_cost = 0 if ch_a == ch_b else 1
            current.append(min(
                previous[col] + 1,            # deletion
                current[col - 1] + 1,         # insertion
                previous[col - 1] + substitution_cost,  # substitution
            ))
        previous = current
    distance = previous[len(b)]
    longest = max(len(a), len(b))
    return 1 - (distance / longest)
121
+
122
+
123
def hybrid_similarity(s1: str, s2: str) -> float:
    """Combined similarity score using multiple algorithms.

    Short-circuits on normalized equality (1.0) and on containment (0.7–0.95
    depending on the length gap); otherwise returns a weighted blend of
    word overlap (40%), Dice bigrams (30%), Jaro-Winkler (20%) and
    Levenshtein (10%).
    """
    # Normalize each input exactly once (the original recomputed both
    # normalizations for the equality check and again for the locals).
    norm1 = normalize_product_name(s1)
    norm2 = normalize_product_name(s2)
    if norm1 == norm2:
        return 1.0
    if norm1 in norm2 or norm2 in norm1:
        # One name contains the other: score by how much extra text remains.
        len_diff = abs(len(norm1) - len(norm2))
        if len_diff < 3:
            return 0.95
        elif len_diff < 10:
            return 0.85
        else:
            return 0.7
    word_sim = word_order_similarity(s1, s2) * 0.40
    dice_sim = dice_coefficient(s1, s2) * 0.30
    jaro_sim = jaro_winkler(s1, s2) * 0.20
    leven_sim = levenshtein_similarity(s1, s2) * 0.10
    return word_sim + dice_sim + jaro_sim + leven_sim
142
+
143
+
144
def calculate_similarity(text1: str, text2: str) -> float:
    """Main entry point: similarity in [0, 1] between two product names.

    Empty inputs score 0.0; exact matches (after stripping) score 1.0;
    names shorter than 5 characters are considered too ambiguous to match.
    """
    if not text1 or not text2:
        return 0.0
    left, right = text1.strip(), text2.strip()
    if left == right:
        return 1.0
    if len(text1) < 5 or len(text2) < 5:
        return 0.0
    return hybrid_similarity(left, right)
153
+
154
+
155
def calculate_confidence(similarity: float, text1: str, text2: str) -> float:
    """Adjust *similarity* into a confidence score clamped to [0, 1].

    Longer names (up to +0.15) and more words (up to +0.10) raise
    confidence; a large length gap between the names lowers it (up to -0.20).
    """
    avg_length = (len(text1) + len(text2)) / 2
    length_bonus = min(avg_length / 100, 0.15)
    avg_words = (len(text1.split()) + len(text2.split())) / 2
    word_bonus = min(avg_words / 15, 0.10)
    length_gap_penalty = min(abs(len(text1) - len(text2)) / 50, 0.20)
    adjusted = similarity + length_bonus + word_bonus - length_gap_penalty
    return min(max(adjusted, 0.0), 1.0)
168
+
169
+
170
def test_similarity_examples():
    """Print pass/fail results for a fixed set of example name pairs.

    A pair "passes" when (similarity >= 0.85) agrees with the expected
    should-match flag in the fixture table.
    """
    test_cases = [
        ("Maslac", "Maslac", True),
        ("Vrhnje za kuhanje", "Vrhnje za kuhanje 3x200g", False),
        ("Japanke copacabana lila", "Japanke copacabana flower", False),
        ("Kroasan praline pan pek 70 g", "Kroasan marelica pan pek 70 g", False),
        ("Spužva za kupanje", "Spužva baby za kupanje", False),
        ("Apple iPhone 13", "iPhone 13 Apple", True),
        ("vindija mlijeko cokoladno", "vindija cokoladno mlijeko", True)
    ]
    print("🧪 Testing Similarity Examples:")
    print("=" * 50)
    for left, right, expected_match in test_cases:
        similarity = calculate_similarity(left, right)
        confidence = calculate_confidence(similarity, left, right)
        matched = similarity >= 0.85
        status = "✅ PASS" if matched == expected_match else "❌ FAIL"
        print(f"{status} '{left}' vs '{right}'")
        print(f" Similarity: {similarity:.3f} | Confidence: {confidence:.3f}")
        print()
utils/cache_manager.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cache Manager for Similarity Engine
3
+ Handles JSON caching of analysis results for improved performance
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import hashlib
9
+ import logging
10
+ from datetime import datetime, timedelta
11
+ from typing import Dict, Any, Optional, List
12
+ from pathlib import Path
13
+
14
# Configure module-wide logging once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger (PEP 282 convention)
17
+
18
class SimilarityCacheManager:
    """Manages caching of similarity analysis results.

    Results are stored as JSON files under ``cache_base_dir``, one
    subdirectory per analysis type ('duplicates', 'promo_matches',
    'comparisons'). Every cache entry embeds its own creation and expiry
    timestamps, so entries can be validated and garbage-collected
    independently.
    """

    def __init__(self, cache_base_dir: str = "cache"):
        """
        Initialize cache manager

        Args:
            cache_base_dir: Base directory for cache files
        """
        self.cache_base_dir = Path(cache_base_dir)
        self.cache_dirs = {
            'duplicates': self.cache_base_dir / 'duplicates',
            'promo_matches': self.cache_base_dir / 'promo_matches',
            'comparisons': self.cache_base_dir / 'comparisons'
        }
        # Same underlying logger as the module-level one; kept on the
        # instance so the class is self-contained.
        self._log = logging.getLogger(__name__)

        # Ensure cache directories exist
        for cache_dir in self.cache_dirs.values():
            cache_dir.mkdir(parents=True, exist_ok=True)

        self._log.info(f"📁 Cache manager initialized with base dir: {self.cache_base_dir}")

    def generate_cache_key(
        self,
        analysis_type: str,
        products_count: int,
        threshold: float,
        algorithm: str = "hybrid",
        additional_params: Dict = None
    ) -> str:
        """
        Generate unique cache key for analysis parameters

        Args:
            analysis_type: Type of analysis ('duplicates', 'promo', 'comparison')
            products_count: Number of products in analysis
            threshold: Similarity threshold used
            algorithm: Algorithm used
            additional_params: Any additional parameters to include in key

        Returns:
            Unique cache key string (keys include today's date, so entries
            naturally roll over daily)
        """
        # Base parameters
        key_data = {
            'type': analysis_type,
            'count': products_count,
            'threshold': round(threshold, 2),
            'algorithm': algorithm,
            'date': datetime.now().strftime("%Y%m%d")
        }

        # Add additional parameters if provided
        if additional_params:
            key_data.update(additional_params)

        # Create hash from parameters for uniqueness (md5 is fine here:
        # this is a cache key, not a security boundary)
        key_string = json.dumps(key_data, sort_keys=True)
        key_hash = hashlib.md5(key_string.encode()).hexdigest()[:8]

        # Create readable cache key
        cache_key = f"{analysis_type}_{products_count}_{int(threshold*100)}_{algorithm}_{key_hash}"

        self._log.debug(f"🔑 Generated cache key: {cache_key}")
        return cache_key

    def get_cache_file_path(self, analysis_type: str, cache_key: str) -> Path:
        """Get full path for cache file (unknown types fall back to 'comparisons')"""
        cache_dir = self.cache_dirs.get(analysis_type, self.cache_dirs['comparisons'])
        return cache_dir / f"{cache_key}.json"

    def save_cache(
        self,
        analysis_type: str,
        cache_key: str,
        results: Dict[str, Any],
        parameters: Dict[str, Any],
        expiry_hours: int = 24
    ) -> bool:
        """
        Save analysis results to cache

        Args:
            analysis_type: Type of analysis
            cache_key: Unique cache key
            results: Analysis results to cache
            parameters: Parameters used for analysis
            expiry_hours: Hours until cache expires

        Returns:
            True if saved successfully, False otherwise
        """
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            cache_data = {
                'cache_id': cache_key,
                'analysis_type': analysis_type,
                'created_at': datetime.now().isoformat(),
                'expires_at': (datetime.now() + timedelta(hours=expiry_hours)).isoformat(),
                'parameters': parameters,
                'results': results,
                'version': '1.0'
            }

            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(cache_data, f, indent=2, ensure_ascii=False)

            file_size = cache_file.stat().st_size / 1024  # KB
            self._log.info(f"💾 Saved cache: {cache_key} ({file_size:.1f} KB)")
            return True

        except Exception as e:
            self._log.error(f"❌ Failed to save cache {cache_key}: {e}")
            return False

    def load_cache(self, analysis_type: str, cache_key: str) -> Optional[Dict[str, Any]]:
        """
        Load cached analysis results

        Args:
            analysis_type: Type of analysis
            cache_key: Cache key to load

        Returns:
            Cached results if valid, None otherwise (expired entries are
            deleted on read)
        """
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            if not cache_file.exists():
                self._log.debug(f"📭 Cache miss: {cache_key}")
                return None

            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            # Check if cache is expired
            expiry_time = datetime.fromisoformat(cache_data['expires_at'])
            if datetime.now() > expiry_time:
                self._log.info(f"⏰ Cache expired: {cache_key}")
                cache_file.unlink()  # Remove expired cache
                return None

            self._log.info(f"✅ Cache hit: {cache_key}")
            return cache_data['results']

        except Exception as e:
            self._log.error(f"❌ Failed to load cache {cache_key}: {e}")
            return None

    def is_cache_valid(self, analysis_type: str, cache_key: str) -> bool:
        """Check if cache exists and is not yet expired (no side effects)"""
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            if not cache_file.exists():
                return False

            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            expiry_time = datetime.fromisoformat(cache_data['expires_at'])
            return datetime.now() <= expiry_time

        except Exception:
            return False

    def clear_cache(self, analysis_type: str = None, older_than_hours: int = None) -> int:
        """
        Clear cached results

        Args:
            analysis_type: Specific analysis type to clear, or None for all
            older_than_hours: Clear cache *created* more than X hours ago,
                or None to clear everything regardless of age

        Returns:
            Number of files removed
        """
        removed_count = 0

        # Determine which directories to clear
        dirs_to_clear = [self.cache_dirs[analysis_type]] if analysis_type else self.cache_dirs.values()

        for cache_dir in dirs_to_clear:
            if not cache_dir.exists():
                continue

            for cache_file in cache_dir.glob("*.json"):
                should_remove = False

                try:
                    if older_than_hours is None:
                        should_remove = True
                    else:
                        # Check file age (based on the entry's own created_at)
                        with open(cache_file, 'r') as f:
                            cache_data = json.load(f)

                        created_time = datetime.fromisoformat(cache_data['created_at'])
                        age_hours = (datetime.now() - created_time).total_seconds() / 3600

                        if age_hours > older_than_hours:
                            should_remove = True

                    if should_remove:
                        cache_file.unlink()
                        removed_count += 1
                        self._log.info(f"🗑️ Removed cache: {cache_file.name}")

                except Exception as e:
                    self._log.warning(f"⚠️ Failed to process cache file {cache_file}: {e}")

        self._log.info(f"🧹 Cache cleanup complete: {removed_count} files removed")
        return removed_count

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics (file counts and sizes, split by analysis type)"""
        stats = {
            'total_files': 0,
            'total_size_mb': 0,
            'by_type': {},
            'cache_dirs': {}
        }

        for analysis_type, cache_dir in self.cache_dirs.items():
            if not cache_dir.exists():
                continue

            type_stats = {
                'files': 0,
                'size_mb': 0,
                'valid_files': 0,
                'expired_files': 0
            }

            for cache_file in cache_dir.glob("*.json"):
                try:
                    file_size = cache_file.stat().st_size / (1024 * 1024)  # MB
                    type_stats['files'] += 1
                    type_stats['size_mb'] += file_size

                    # Check if valid
                    with open(cache_file, 'r') as f:
                        cache_data = json.load(f)

                    expiry_time = datetime.fromisoformat(cache_data['expires_at'])
                    if datetime.now() <= expiry_time:
                        type_stats['valid_files'] += 1
                    else:
                        type_stats['expired_files'] += 1

                except Exception:
                    # Unreadable/corrupt entries are counted as expired
                    type_stats['expired_files'] += 1

            stats['by_type'][analysis_type] = type_stats
            stats['total_files'] += type_stats['files']
            stats['total_size_mb'] += type_stats['size_mb']
            stats['cache_dirs'][analysis_type] = str(cache_dir)

        return stats

    def cleanup_expired_cache(self) -> int:
        """Remove only cache files whose own 'expires_at' timestamp has passed.

        Bug fix: this previously delegated to clear_cache(older_than_hours=0),
        whose `age_hours > 0` test removed every cache file ever written (all
        files are older than zero hours), not just expired ones.

        Returns:
            Number of expired files removed
        """
        removed_count = 0
        now = datetime.now()

        for cache_dir in self.cache_dirs.values():
            if not cache_dir.exists():
                continue

            for cache_file in cache_dir.glob("*.json"):
                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        cache_data = json.load(f)

                    if now > datetime.fromisoformat(cache_data['expires_at']):
                        cache_file.unlink()
                        removed_count += 1
                        self._log.info(f"🗑️ Removed cache: {cache_file.name}")

                except Exception as e:
                    self._log.warning(f"⚠️ Failed to process cache file {cache_file}: {e}")

        self._log.info(f"🧹 Cache cleanup complete: {removed_count} files removed")
        return removed_count
284
+
285
+
286
# Global cache manager instance
_cache_manager = None

def get_cache_manager() -> SimilarityCacheManager:
    """Return the process-wide SimilarityCacheManager, creating it lazily."""
    global _cache_manager
    if _cache_manager is not None:
        return _cache_manager
    _cache_manager = SimilarityCacheManager()
    return _cache_manager
295
+
296
+
297
+ # Convenience functions
298
def cache_duplicate_analysis(
    products_count: int,
    threshold: float,
    results: Dict[str, Any],
    parameters: Dict[str, Any]
) -> str:
    """Persist duplicate-analysis results and return the cache key used."""
    manager = get_cache_manager()
    key = manager.generate_cache_key('duplicates', products_count, threshold)
    manager.save_cache('duplicates', key, results, parameters)
    return key
309
+
310
def load_duplicate_analysis(
    products_count: int,
    threshold: float
) -> Optional[Dict[str, Any]]:
    """Return cached duplicate-analysis results, or None on a cache miss."""
    manager = get_cache_manager()
    key = manager.generate_cache_key('duplicates', products_count, threshold)
    return manager.load_cache('duplicates', key)
318
+
319
def cache_promo_analysis(
    promo_count: int,
    db_count: int,
    threshold: float,
    results: Dict[str, Any],
    parameters: Dict[str, Any]
) -> str:
    """Persist promo-match results and return the cache key used.

    The key is derived from the combined product count plus the individual
    promo/db counts, so runs over different splits do not collide.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key(
        'promo_matches',
        promo_count + db_count,
        threshold,
        additional_params={'promo_count': promo_count, 'db_count': db_count}
    )
    manager.save_cache('promo_matches', key, results, parameters)
    return key
336
+
337
def load_promo_analysis(
    promo_count: int,
    db_count: int,
    threshold: float
) -> Optional[Dict[str, Any]]:
    """Return cached promo-match results, or None on a cache miss.

    Key derivation mirrors cache_promo_analysis exactly so lookups hit the
    entries that function wrote.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key(
        'promo_matches',
        promo_count + db_count,
        threshold,
        additional_params={'promo_count': promo_count, 'db_count': db_count}
    )
    return manager.load_cache('promo_matches', key)