wisdom anthony committed on
Commit
97911a8
·
1 Parent(s): 028bcd8

similarity code for backend

Browse files
.gitignore CHANGED
@@ -113,4 +113,10 @@ config.ini
113
  secrets.json
114
  credentials.json
115
  *.pem
116
- *.key
 
 
 
 
 
 
 
113
  secrets.json
114
  credentials.json
115
  *.pem
116
+ *.key
117
+
118
+ # Similarity Engine Cache
119
+ cache/
120
+ /cache/
121
+ **/cache/
122
+ *.cache
api/main.py CHANGED
@@ -5,6 +5,7 @@ from api.product_routes import router as product_router
5
  from api.receipt_routes import router as receipt_router
6
  from api.scrape_routes import router as scrape_router
7
  from api.cijene_routes import router as cijene_router
 
8
 
9
  # Initialize FastAPI
10
  app = FastAPI(title="SupaKuna API")
@@ -24,6 +25,7 @@ app.include_router(product_router)
24
  app.include_router(receipt_router)
25
  app.include_router(scrape_router)
26
  app.include_router(cijene_router)
 
27
 
28
  @app.get("/", tags=["Health"])
29
  def health_check():
 
5
  from api.receipt_routes import router as receipt_router
6
  from api.scrape_routes import router as scrape_router
7
  from api.cijene_routes import router as cijene_router
8
+ from api.similarity_routes import router as similarity_router
9
 
10
  # Initialize FastAPI
11
  app = FastAPI(title="SupaKuna API")
 
25
  app.include_router(receipt_router)
26
  app.include_router(scrape_router)
27
  app.include_router(cijene_router)
28
+ app.include_router(similarity_router)
29
 
30
  @app.get("/", tags=["Health"])
31
  def health_check():
api/product_routes.py CHANGED
@@ -1,6 +1,7 @@
1
  from fastapi import APIRouter, File, UploadFile, HTTPException, Form
2
  from utils.image_processing import read_image_file, process_product_image
3
- from product_detector.detector import ObjectDetector
 
4
  from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
5
  from utils.image_processing import process_and_store_product_image
6
 
 
1
  from fastapi import APIRouter, File, UploadFile, HTTPException, Form
2
  from utils.image_processing import read_image_file, process_product_image
3
+ # from product_detector.detector import ObjectDetector # Temporarily disabled - model corrupted
4
+ from product_detector.mock_detector import MockObjectDetector as ObjectDetector
5
  from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
6
  from utils.image_processing import process_and_store_product_image
7
 
api/similarity_routes.py ADDED
@@ -0,0 +1,1079 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Similarity Routes - FastAPI Endpoints for Similarity Engine
3
+ NOW WITH JSON CACHING FOR IMPROVED PERFORMANCE
4
+ """
5
+
6
+ import logging
7
+ import sys
8
+ import os
9
+ from typing import List, Dict, Any, Optional
10
+ from fastapi import APIRouter, HTTPException, File, UploadFile, Form, BackgroundTasks
11
+ from pydantic import BaseModel, Field
12
+ import time
13
+ from fastapi.responses import StreamingResponse
14
+ import asyncio
15
+ import json
16
+
17
+ # Add parent directory to path to access other modules
18
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ # Import our similarity engine modules
21
+ try:
22
+ from similarity_engine.similarity_core import calculate_similarity, calculate_confidence, test_similarity_examples
23
+ from similarity_engine.product_comparator import ProductComparator
24
+ from db.similarity_repository import get_similarity_repository
25
+ from utils.cache_manager import (
26
+ get_cache_manager,
27
+ cache_duplicate_analysis,
28
+ load_duplicate_analysis,
29
+ cache_promo_analysis,
30
+ load_promo_analysis
31
+ )
32
+
33
+ print("✅ All similarity modules imported successfully")
34
+ print("✅ Cache manager imported successfully")
35
+
36
+ except ImportError as e:
37
+ print(f"⚠️ Some similarity modules failed to import: {e}")
38
+ print("🔄 Using fallback implementations...")
39
+
40
+ # Create fallback functions to prevent startup failure
41
+ def calculate_similarity(a, b):
42
+ a, b = str(a).lower().strip(), str(b).lower().strip()
43
+ if a == b: return 1.0
44
+ if not a or not b: return 0.0
45
+ return 0.8 if a in b or b in a else 0.3
46
+
47
+ def calculate_confidence(sim, a, b):
48
+ return sim * 0.9
49
+
50
+ def test_similarity_examples(): return []
51
+
52
+ class ProductComparator:
53
+ def __init__(self, *args, **kwargs): pass
54
+
55
+ def get_similarity_repository(): return None
56
+
57
+ # Fallback cache functions
58
+ def get_cache_manager(): return None
59
+ def cache_duplicate_analysis(*args, **kwargs): return None
60
+ def load_duplicate_analysis(*args, **kwargs): return None
61
+ def cache_promo_analysis(*args, **kwargs): return None
62
+ def load_promo_analysis(*args, **kwargs): return None
63
+
64
+ # Configure logging
65
+ logging.basicConfig(level=logging.INFO)
66
+ logger = logging.getLogger(__name__)
67
+
68
+ # Create router
69
+ router = APIRouter(prefix="/similarity", tags=["Similarity Engine"])
70
+
71
+
72
+
73
# Pydantic models for request/response
class ProductComparisonRequest(BaseModel):
    """Body for POST /similarity/compare-products: two product names and an
    optional duplicate threshold (defaults to 0.87, clamped to [0.1, 1.0])."""
    product1_name: str = Field(..., description="First product name")
    product2_name: str = Field(..., description="Second product name")
    threshold: Optional[float] = Field(0.87, description="Similarity threshold", ge=0.1, le=1.0)
78
+
79
class DuplicateAnalysisRequest(BaseModel):
    """Body for the duplicate-analysis endpoints (plain and streaming).

    NOTE(review): `return_summary_only` is accepted but not read by any
    endpoint visible in this file — confirm whether it is still needed.
    """
    threshold: Optional[float] = Field(0.87, description="Similarity threshold", ge=0.1, le=1.0)
    use_sample_data: Optional[bool] = Field(False, description="Use sample data instead of database")
    return_summary_only: Optional[bool] = Field(False, description="Return only summary statistics")
    force_refresh: Optional[bool] = Field(False, description="Force refresh, bypass cache")
84
+
85
class PromoComparisonRequest(BaseModel):
    """Body for the promo-comparison endpoints (plain and streaming):
    match threshold, optional result cap, and a cache-bypass flag."""
    threshold: Optional[float] = Field(0.85, description="Similarity threshold", ge=0.1, le=1.0)
    max_results: Optional[int] = Field(None, description="Maximum results to return")
    force_refresh: Optional[bool] = Field(False, description="Force refresh, bypass cache")
89
+
90
class CacheManagementRequest(BaseModel):
    """Body for cache-management operations (clear by type and/or age).

    NOTE(review): no endpoint in this file consumes this model — presumably
    used by a cache-management route defined elsewhere; verify.
    """
    analysis_type: Optional[str] = Field(None, description="Type to clear (duplicates, promo_matches, or all)")
    older_than_hours: Optional[int] = Field(None, description="Clear cache older than X hours")
93
+
94
# Health check endpoint
@router.get("/health", summary="Health Check")
async def health_check():
    """Check if similarity engine is working.

    Probes three layers — the similarity function, the database repository,
    and the cache manager — and reports their states. Never raises: any
    failure is converted into a 200 response with status == "error" so
    monitors can always read the payload.
    """
    try:
        # Test basic similarity calculation with a trivial identical pair.
        test_similarity = calculate_similarity("test product", "test product")

        # Test database connection: a usable repository exposes a truthy
        # `supabase` client attribute.
        repository = get_similarity_repository()
        db_status = "connected" if repository and repository.supabase else "disconnected"

        # Test cache manager (None means the fallback no-op cache is active).
        cache_mgr = get_cache_manager()
        cache_status = "available" if cache_mgr else "unavailable"

        return {
            "status": "healthy",
            "similarity_engine": "operational",
            "database_connection": db_status,
            "cache_system": cache_status,
            "test_similarity": test_similarity,
            "timestamp": time.time()
        }
    except Exception as e:
        logger.error(f"Health check failed: {e}")
        return {
            "status": "error",
            "similarity_engine": "failed",
            "error": str(e),
            "timestamp": time.time()
        }
126
+
127
# Core similarity endpoints
@router.post("/compare-products", summary="Compare Two Products")
async def compare_two_products(request: ProductComparisonRequest):
    """Compare two product names and return score, confidence and a verdict.

    The duplicate flag uses the caller-supplied threshold; the human-readable
    assessment uses fixed bands independent of that threshold.
    """
    try:
        logger.info(f"🔍 Comparing products: '{request.product1_name}' vs '{request.product2_name}'")

        name_a = request.product1_name
        name_b = request.product2_name

        score = calculate_similarity(name_a, name_b)
        trust = calculate_confidence(score, name_a, name_b)

        # Map the raw score onto a verdict; bands are ordered strictest
        # first, and the first band the score clears wins.
        bands = (
            (0.95, {"description": "Perfect match - identical products", "emoji": "✅", "category": "identical"}),
            (0.8, {"description": "Very similar - likely duplicates", "emoji": "⚠️", "category": "very_similar"}),
            (0.7, {"description": "Similar - review recommended", "emoji": "🤔", "category": "similar"}),
        )
        assessment = {"description": "Different products", "emoji": "❌", "category": "different"}
        for cutoff, verdict in bands:
            if score >= cutoff:
                assessment = verdict
                break

        comparison = {
            'product1_name': name_a,
            'product2_name': name_b,
            'similarity': round(score, 3),
            'confidence': round(trust, 3),
            'is_duplicate': score >= request.threshold,
            'threshold_used': request.threshold,
            'assessment': assessment,
            'percentage_similarity': round(score * 100, 1),
            'percentage_confidence': round(trust * 100, 1)
        }

        return {
            "status": "success",
            "comparison": comparison,
            "timestamp": time.time()
        }

    except Exception as e:
        logger.error(f"Product comparison error: {e}")
        raise HTTPException(status_code=500, detail=f"Comparison failed: {str(e)}")
170
+
171
+ @router.post("/find-duplicates", summary="Find Database Duplicates - WITH CACHING")
172
+ async def find_duplicates(request: DuplicateAnalysisRequest):
173
+ """Find duplicate products in the database - NOW WITH JSON CACHING"""
174
+ try:
175
+ logger.info(f"🔍 Starting duplicate analysis with threshold {request.threshold}")
176
+
177
+ repository = get_similarity_repository()
178
+
179
+ # Load products - REAL DATA NOW!
180
+ if request.use_sample_data:
181
+ logger.info("📊 Using sample data for analysis")
182
+ products = repository.get_sample_products() if repository else []
183
+ else:
184
+ logger.info("📊 Loading REAL products from database...")
185
+ products = repository.load_all_products() if repository else []
186
+
187
+ if not products:
188
+ return {
189
+ "status": "error",
190
+ "message": "No products found for analysis - check database connection",
191
+ "duplicates": [],
192
+ "analysis_summary": {}
193
+ }
194
+
195
+ logger.info(f"📊 Loaded {len(products)} products for analysis")
196
+
197
+ # 🚀 NEW: CHECK CACHE FIRST (unless force refresh)
198
+ if not request.force_refresh:
199
+ cached_results = load_duplicate_analysis(len(products), request.threshold)
200
+ if cached_results:
201
+ logger.info("✅ Returning cached duplicate analysis results")
202
+ return {
203
+ "status": "success",
204
+ "data": cached_results,
205
+ "cached": True,
206
+ "cache_hit": True,
207
+ "timestamp": time.time()
208
+ }
209
+
210
+ # Run analysis if no cache or force refresh
211
+ logger.info("🔄 Running fresh duplicate analysis...")
212
+ start_time = time.time()
213
+ duplicates = []
214
+
215
+ for i, product1 in enumerate(products):
216
+ for j, product2 in enumerate(products[i+1:], i+1):
217
+ product_name1 = product1.get('product_name', '').strip()
218
+ product_name2 = product2.get('product_name', '').strip()
219
+
220
+ if not product_name1 or not product_name2:
221
+ continue
222
+
223
+ similarity = calculate_similarity(product_name1, product_name2)
224
+ if similarity >= request.threshold:
225
+ confidence = calculate_confidence(similarity, product_name1, product_name2)
226
+ duplicates.append({
227
+ 'product1_id': product1.get('product_id'),
228
+ 'product1_name': product_name1,
229
+ 'product2_id': product2.get('product_id'),
230
+ 'product2_name': product_name2,
231
+ 'similarity': round(similarity, 3),
232
+ 'confidence': round(confidence, 3)
233
+ })
234
+
235
+ analysis_time = time.time() - start_time
236
+
237
+ # Create results
238
+ results = {
239
+ 'duplicates': duplicates,
240
+ 'analysis_summary': {
241
+ 'total_products': len(products),
242
+ 'duplicates_found': len(duplicates),
243
+ 'duplicate_rate': (len(duplicates) / len(products)) * 100 if products else 0,
244
+ 'analysis_time_seconds': round(analysis_time, 2),
245
+ 'threshold_used': request.threshold,
246
+ 'success': True,
247
+ 'data_source': 'sample_data' if request.use_sample_data else 'real_database'
248
+ },
249
+ 'recommendations': [
250
+ f"Found {len(duplicates)} potential duplicates in {len(products)} products",
251
+ "Review matches manually for final decision",
252
+ "Higher similarity scores indicate more confident matches",
253
+ f"Analysis completed in {analysis_time:.2f} seconds"
254
+ ]
255
+ }
256
+
257
+ # 🚀 NEW: CACHE THE RESULTS
258
+ cache_parameters = {
259
+ 'threshold': request.threshold,
260
+ 'use_sample_data': request.use_sample_data,
261
+ 'total_products': len(products),
262
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
263
+ }
264
+
265
+ cache_key = cache_duplicate_analysis(
266
+ len(products),
267
+ request.threshold,
268
+ results,
269
+ cache_parameters
270
+ )
271
+
272
+ logger.info(f"✅ Duplicate analysis complete: {len(duplicates)} duplicates found in {len(products)} products")
273
+ logger.info(f"💾 Results cached with key: {cache_key}")
274
+
275
+ return {
276
+ "status": "success",
277
+ "data": results,
278
+ "cached": False,
279
+ "cache_key": cache_key,
280
+ "timestamp": time.time()
281
+ }
282
+
283
+ except Exception as e:
284
+ logger.error(f"Duplicate analysis error: {e}")
285
+ raise HTTPException(status_code=500, detail=f"Analysis failed: {str(e)}")
286
+
287
+ @router.post("/find-duplicates-stream", summary="Stream Duplicate Analysis with Real-Time Progress")
288
+ async def find_duplicates_stream(request: DuplicateAnalysisRequest):
289
+ """Stream duplicate analysis with real-time progress updates"""
290
+
291
+ async def generate_progress():
292
+ try:
293
+ logger.info(f"🔍 Starting streaming duplicate analysis with threshold {request.threshold}")
294
+
295
+ repository = get_similarity_repository()
296
+
297
+ # Load products - REAL DATA
298
+ if request.use_sample_data:
299
+ logger.info("📊 Using sample data for analysis")
300
+ products = repository.get_sample_products() if repository else []
301
+ else:
302
+ logger.info("📊 Loading REAL products from database...")
303
+ products = repository.load_all_products() if repository else []
304
+
305
+ if not products:
306
+ yield f"data: {json.dumps({'type': 'error', 'message': 'No products found for analysis'})}\n\n"
307
+ return
308
+
309
+ logger.info(f"📊 Loaded {len(products)} products for streaming analysis")
310
+
311
+ # Check cache first (unless force refresh)
312
+ if not request.force_refresh:
313
+ cached_results = load_duplicate_analysis(len(products), request.threshold)
314
+ if cached_results:
315
+ logger.info("✅ Returning cached duplicate analysis results via stream")
316
+ yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
317
+ return
318
+
319
+ # Send initial status
320
+ total_comparisons = len(products) * (len(products) - 1) // 2
321
+ yield f"data: {json.dumps({'type': 'init', 'total_products': len(products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"
322
+
323
+ # Run analysis with real-time progress
324
+ start_time = time.time()
325
+ duplicates = []
326
+ completed_comparisons = 0
327
+
328
+ for i, product1 in enumerate(products):
329
+ product_name1 = product1.get('product_name', '').strip()
330
+ product_id1 = product1.get('product_id', 'unknown')
331
+
332
+ if not product_name1:
333
+ continue
334
+
335
+ # Send progress update every 100 products
336
+ if i % 100 == 0:
337
+ progress_percentage = (i / len(products)) * 100
338
+ yield f"data: {json.dumps({'type': 'product_progress', 'current_product': i + 1, 'total_products': len(products), 'progress': round(progress_percentage, 1), 'current_name': product_name1[:50]})}\n\n"
339
+
340
+ for j, product2 in enumerate(products[i+1:], i+1):
341
+ product_name2 = product2.get('product_name', '').strip()
342
+ product_id2 = product2.get('product_id', 'unknown')
343
+
344
+ if not product_name2 or product_id1 == product_id2:
345
+ continue
346
+
347
+ completed_comparisons += 1
348
+
349
+ # Calculate similarity
350
+ similarity = calculate_similarity(product_name1, product_name2)
351
+
352
+ # Send progress every 10,000 comparisons
353
+ if completed_comparisons % 10000 == 0:
354
+ progress_percentage = (completed_comparisons / total_comparisons) * 100
355
+ yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(progress_percentage, 1), 'comparing': f'{product_name1[:30]} vs {product_name2[:30]}'})}\n\n"
356
+ await asyncio.sleep(0.01) # Small delay to prevent overwhelming
357
+
358
+ # Check if it's a duplicate
359
+ if similarity >= request.threshold:
360
+ confidence = calculate_confidence(similarity, product_name1, product_name2)
361
+
362
+ duplicate_info = {
363
+ 'product1_id': product_id1,
364
+ 'product1_name': product_name1,
365
+ 'product2_id': product_id2,
366
+ 'product2_name': product_name2,
367
+ 'similarity': round(similarity, 3),
368
+ 'confidence': round(confidence, 3)
369
+ }
370
+
371
+ duplicates.append(duplicate_info)
372
+
373
+ # Send duplicate found immediately
374
+ yield f"data: {json.dumps({'type': 'duplicate_found', 'duplicate': duplicate_info, 'total_duplicates': len(duplicates)})}\n\n"
375
+ logger.info(f" 🔍 DUPLICATE FOUND via stream: {product_name1} ↔ {product_name2} ({similarity:.3f})")
376
+
377
+ analysis_time = time.time() - start_time
378
+
379
+ # Create final results
380
+ results = {
381
+ 'duplicates': duplicates,
382
+ 'analysis_summary': {
383
+ 'total_products': len(products),
384
+ 'total_comparisons': completed_comparisons,
385
+ 'duplicates_found': len(duplicates),
386
+ 'duplicate_rate': (len(duplicates) / len(products)) * 100 if products else 0,
387
+ 'analysis_time_seconds': round(analysis_time, 2),
388
+ 'threshold_used': request.threshold,
389
+ 'success': True,
390
+ 'data_source': 'sample_data' if request.use_sample_data else 'real_database'
391
+ },
392
+ 'recommendations': [
393
+ f"Found {len(duplicates)} potential duplicates in {len(products)} products",
394
+ "Review matches manually for final decision",
395
+ "Higher similarity scores indicate more confident matches",
396
+ f"Analysis completed in {analysis_time:.2f} seconds"
397
+ ]
398
+ }
399
+
400
+ # Cache the results
401
+ cache_parameters = {
402
+ 'threshold': request.threshold,
403
+ 'use_sample_data': request.use_sample_data,
404
+ 'total_products': len(products),
405
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
406
+ }
407
+
408
+ cache_key = cache_duplicate_analysis(
409
+ len(products),
410
+ request.threshold,
411
+ results,
412
+ cache_parameters
413
+ )
414
+
415
+ # Send final complete results
416
+ yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"
417
+
418
+ logger.info(f"✅ Streaming duplicate analysis complete: {len(duplicates)} duplicates found in {len(products)} products")
419
+ logger.info(f"💾 Results cached with key: {cache_key}")
420
+
421
+ except Exception as e:
422
+ logger.error(f"Streaming duplicate analysis error: {e}")
423
+ yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
424
+
425
+ return StreamingResponse(
426
+ generate_progress(),
427
+ media_type="text/event-stream",
428
+ headers={
429
+ "Cache-Control": "no-cache",
430
+ "Connection": "keep-alive",
431
+ "Content-Type": "text/event-stream",
432
+ "X-Accel-Buffering": "no" # Disable nginx buffering
433
+ }
434
+ )
435
+
436
+ @router.post("/compare-promo", summary="Compare Promo Products - WITH CACHING")
437
+ async def compare_promo_products(request: PromoComparisonRequest):
438
+ """Compare promotional products against database products - NOW WITH CACHING"""
439
+ try:
440
+ logger.info(f"🏷️ Starting promo comparison with threshold {request.threshold}")
441
+
442
+ repository = get_similarity_repository()
443
+
444
+ if not repository:
445
+ return {"status": "error", "message": "Database repository not available"}
446
+
447
+ # Load REAL promo products
448
+ logger.info("📊 Loading REAL promo products from database...")
449
+ try:
450
+ promo_products = repository.load_promo_products()
451
+ db_products = repository.load_all_products()
452
+ logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
453
+ except Exception as e:
454
+ logger.warning(f"❌ Failed to load real data, using samples: {e}")
455
+ promo_products = repository._get_sample_promo_products()
456
+ db_products = repository.get_sample_products()
457
+
458
+ # 🚀 NEW: CHECK CACHE FIRST (unless force refresh)
459
+ if not request.force_refresh:
460
+ cached_results = load_promo_analysis(
461
+ len(promo_products),
462
+ len(db_products),
463
+ request.threshold
464
+ )
465
+ if cached_results:
466
+ logger.info("✅ Returning cached promo analysis results")
467
+ return {
468
+ "status": "success",
469
+ **cached_results,
470
+ "cached": True,
471
+ "cache_hit": True,
472
+ "timestamp": time.time()
473
+ }
474
+
475
+ logger.info(f"📊 Comparing {len(promo_products)} promo products against {len(db_products)} database products")
476
+
477
+ # Find matches using REAL similarity algorithm
478
+ start_time = time.time()
479
+ matches = []
480
+ total_comparisons = 0
481
+
482
+ for promo in promo_products:
483
+ promo_name = promo.get('name', '').strip()
484
+ if not promo_name:
485
+ continue
486
+
487
+ best_match = None
488
+ best_similarity = 0.0
489
+
490
+ for db in db_products:
491
+ db_name = db.get('product_name', '').strip()
492
+ if not db_name:
493
+ continue
494
+
495
+ similarity = calculate_similarity(promo_name, db_name)
496
+ total_comparisons += 1
497
+
498
+ if similarity >= request.threshold and similarity > best_similarity:
499
+ best_similarity = similarity
500
+ confidence = calculate_confidence(similarity, promo_name, db_name)
501
+
502
+ best_match = {
503
+ 'promo_id': promo.get('id'),
504
+ 'promo_name': promo_name,
505
+ 'promo_store': promo.get('store', ''),
506
+ 'promo_price': promo.get('promo_price'),
507
+ 'regular_price': promo.get('regular_price'),
508
+ 'picture_id': promo.get('picture_id'),
509
+ 'db_product_id': db.get('product_id'),
510
+ 'db_product_name': db_name,
511
+ 'db_brand': db.get('brand', {}).get('brand_name', 'No Brand') if db.get('brand') else 'No Brand',
512
+ 'similarity': round(similarity, 3),
513
+ 'confidence': round(confidence, 3)
514
+ }
515
+
516
+ if best_match:
517
+ matches.append(best_match)
518
+
519
+ # Limit results if requested
520
+ if request.max_results and len(matches) >= request.max_results:
521
+ break
522
+
523
+ analysis_time = time.time() - start_time
524
+
525
+ # Prepare results
526
+ results = {
527
+ "matches": matches,
528
+ "statistics": {
529
+ "total_promo_products": len(promo_products),
530
+ "total_database_products": len(db_products),
531
+ "total_comparisons": total_comparisons,
532
+ "matches_found": len(matches),
533
+ "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
534
+ "threshold_used": request.threshold,
535
+ "analysis_time_seconds": round(analysis_time, 2)
536
+ }
537
+ }
538
+
539
+ # 🚀 NEW: CACHE THE RESULTS
540
+ cache_parameters = {
541
+ 'threshold': request.threshold,
542
+ 'max_results': request.max_results,
543
+ 'promo_count': len(promo_products),
544
+ 'db_count': len(db_products),
545
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
546
+ }
547
+
548
+ cache_key = cache_promo_analysis(
549
+ len(promo_products),
550
+ len(db_products),
551
+ request.threshold,
552
+ results,
553
+ cache_parameters
554
+ )
555
+
556
+ logger.info(f"✅ Promo comparison complete: {len(matches)} matches found")
557
+ logger.info(f"💾 Results cached with key: {cache_key}")
558
+
559
+ return {
560
+ "status": "success",
561
+ **results,
562
+ "cached": False,
563
+ "cache_key": cache_key,
564
+ "timestamp": time.time()
565
+ }
566
+
567
+ except Exception as e:
568
+ logger.error(f"Promo comparison error: {e}")
569
+ raise HTTPException(status_code=500, detail=f"Promo comparison failed: {str(e)}")
570
+
571
+
572
+ @router.post("/compare-promo-stream", summary="Stream Promo Comparison with Real-Time Progress")
573
+ async def compare_promo_stream(request: PromoComparisonRequest):
574
+ """Stream promo product comparison with real-time progress updates"""
575
+
576
+ async def generate_promo_progress():
577
+ try:
578
+ logger.info(f"🏷️ Starting streaming promo comparison with threshold {request.threshold}")
579
+
580
+ repository = get_similarity_repository()
581
+
582
+ if not repository:
583
+ yield f"data: {json.dumps({'type': 'error', 'message': 'Database repository not available'})}\n\n"
584
+ return
585
+
586
+ # Load REAL promo and database products
587
+ logger.info("📊 Loading REAL promo and database products...")
588
+ try:
589
+ promo_products = repository.load_promo_products()
590
+ db_products = repository.load_all_products()
591
+ logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
592
+ except Exception as e:
593
+ logger.warning(f"❌ Failed to load real data, using samples: {e}")
594
+ promo_products = repository._get_sample_promo_products()
595
+ db_products = repository.get_sample_products()
596
+
597
+ # Check cache first (unless force refresh)
598
+ if not request.force_refresh:
599
+ cached_results = load_promo_analysis(
600
+ len(promo_products),
601
+ len(db_products),
602
+ request.threshold
603
+ )
604
+ if cached_results:
605
+ logger.info("✅ Returning cached promo analysis results via stream")
606
+ yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
607
+ return
608
+
609
+ # Send initial status
610
+ total_comparisons = len(promo_products) * len(db_products)
611
+ yield f"data: {json.dumps({'type': 'init', 'total_promo_products': len(promo_products), 'total_db_products': len(db_products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"
612
+
613
+ # Find matches using REAL similarity algorithm with streaming
614
+ start_time = time.time()
615
+ matches = []
616
+ completed_comparisons = 0
617
+
618
+ for i, promo in enumerate(promo_products):
619
+ promo_name = promo.get('name', '').strip()
620
+ promo_store = promo.get('store', '').strip()
621
+
622
+ if not promo_name:
623
+ continue
624
+
625
+ # Send promo progress update
626
+ promo_progress = (i / len(promo_products)) * 100
627
+ yield f"data: {json.dumps({'type': 'promo_progress', 'current_promo': i + 1, 'total_promos': len(promo_products), 'progress': round(promo_progress, 1), 'current_promo_name': promo_name[:50], 'store': promo_store})}\n\n"
628
+
629
+ best_match = None
630
+ best_similarity = 0.0
631
+
632
+ for j, db_product in enumerate(db_products):
633
+ db_name = db_product.get('product_name', '').strip()
634
+
635
+ if not db_name:
636
+ continue
637
+
638
+ completed_comparisons += 1
639
+
640
+ # Calculate similarity
641
+ similarity = calculate_similarity(promo_name, db_name)
642
+
643
+ # Send detailed comparison progress every 1000 comparisons
644
+ if completed_comparisons % 1000 == 0:
645
+ overall_progress = (completed_comparisons / total_comparisons) * 100
646
+ yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(overall_progress, 1), 'comparing': f'{promo_name[:30]} vs {db_name[:30]}'})}\n\n"
647
+ await asyncio.sleep(0.01) # Small delay
648
+
649
+ if similarity >= request.threshold and similarity > best_similarity:
650
+ best_similarity = similarity
651
+ confidence = calculate_confidence(similarity, promo_name, db_name)
652
+
653
+ best_match = {
654
+ 'promo_id': promo.get('id'),
655
+ 'promo_name': promo_name,
656
+ 'promo_store': promo_store,
657
+ 'promo_price': promo.get('promo_price', 0),
658
+ 'regular_price': promo.get('regular_price', 0),
659
+ 'picture_id': promo.get('picture_id'),
660
+ 'db_product_id': db_product.get('product_id'),
661
+ 'db_product_name': db_name,
662
+ 'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
663
+ 'similarity': round(similarity, 3),
664
+ 'confidence': round(confidence, 3)
665
+ }
666
+
667
+ # If match found, send immediate update
668
+ if best_match:
669
+ matches.append(best_match)
670
+ yield f"data: {json.dumps({'type': 'match_found', 'match': best_match, 'total_matches': len(matches)})}\n\n"
671
+ logger.info(f"🔍 PROMO MATCH FOUND via stream: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")
672
+
673
+ # Limit results if requested
674
+ if request.max_results and len(matches) >= request.max_results:
675
+ yield f"data: {json.dumps({'type': 'max_results_reached', 'max_results': request.max_results, 'matches_found': len(matches)})}\n\n"
676
+ break
677
+
678
+ analysis_time = time.time() - start_time
679
+
680
+ # Prepare final results
681
+ results = {
682
+ "matches": matches,
683
+ "statistics": {
684
+ "total_promo_products": len(promo_products),
685
+ "total_database_products": len(db_products),
686
+ "total_comparisons": completed_comparisons,
687
+ "matches_found": len(matches),
688
+ "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
689
+ "threshold_used": request.threshold,
690
+ "analysis_time_seconds": round(analysis_time, 2)
691
+ }
692
+ }
693
+
694
+ # Cache the results
695
+ cache_parameters = {
696
+ 'threshold': request.threshold,
697
+ 'max_results': request.max_results,
698
+ 'promo_count': len(promo_products),
699
+ 'db_count': len(db_products),
700
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
701
+ }
702
+
703
+ cache_key = cache_promo_analysis(
704
+ len(promo_products),
705
+ len(db_products),
706
+ request.threshold,
707
+ results,
708
+ cache_parameters
709
+ )
710
+
711
+ # Send final results
712
+ yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"
713
+
714
+ logger.info(f"✅ Streaming promo comparison complete: {len(matches)} matches found")
715
+ logger.info(f"💾 Results cached with key: {cache_key}")
716
+
717
+ except Exception as e:
718
+ logger.error(f"Streaming promo comparison error: {e}")
719
+ yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
720
+
721
+ return StreamingResponse(
722
+ generate_promo_progress(),
723
+ media_type="text/event-stream",
724
+ headers={
725
+ "Cache-Control": "no-cache",
726
+ "Connection": "keep-alive",
727
+ "Content-Type": "text/event-stream",
728
+ "X-Accel-Buffering": "no" # Disable nginx buffering
729
+ }
730
+ )
731
+
732
+ @router.get("/test-algorithm", summary="Test Similarity Algorithm")
733
+ async def test_algorithm():
734
+ """Test the similarity algorithm with known examples"""
735
+ try:
736
+ logger.info("🧪 Testing similarity algorithm")
737
+
738
+ test_cases = [
739
+ ("Maslac", "Maslac", True),
740
+ ("Vrhnje za kuhanje", "Vrhnje za kuhanje 3x200g", False),
741
+ ("Apple iPhone 13", "iPhone 13 Apple", True),
742
+ ("vindija mlijeko cokoladno", "vindija cokoladno mlijeko", True)
743
+ ]
744
+
745
+ results = []
746
+ for text1, text2, should_match in test_cases:
747
+ similarity = calculate_similarity(text1, text2)
748
+ confidence = calculate_confidence(similarity, text1, text2)
749
+ is_match = similarity >= 0.85
750
+ passed = is_match == should_match
751
+
752
+ results.append({
753
+ "text1": text1,
754
+ "text2": text2,
755
+ "similarity": round(similarity, 3),
756
+ "confidence": round(confidence, 3),
757
+ "is_match": is_match,
758
+ "should_match": should_match,
759
+ "test_passed": passed
760
+ })
761
+
762
+ passed_tests = sum(1 for r in results if r["test_passed"])
763
+ total_tests = len(results)
764
+
765
+ return {
766
+ "status": "success",
767
+ "test_results": results,
768
+ "summary": {
769
+ "total_tests": total_tests,
770
+ "passed_tests": passed_tests,
771
+ "failed_tests": total_tests - passed_tests,
772
+ "success_rate": round((passed_tests / total_tests) * 100, 1)
773
+ },
774
+ "timestamp": time.time()
775
+ }
776
+
777
+ except Exception as e:
778
+ logger.error(f"Algorithm test error: {e}")
779
+ raise HTTPException(status_code=500, detail=f"Test failed: {str(e)}")
780
+
781
+ @router.get("/stats", summary="Get Engine Statistics")
782
+ async def get_statistics():
783
+ """Get overall similarity engine statistics - REAL DATA"""
784
+ try:
785
+ repository = get_similarity_repository()
786
+
787
+ if not repository:
788
+ return {
789
+ "status": "error",
790
+ "message": "Repository not available",
791
+ "statistics": {}
792
+ }
793
+
794
+ # Get REAL statistics
795
+ try:
796
+ all_products = repository.load_all_products()
797
+ products_without_images = repository.get_products_without_images()
798
+ promo_products = repository.load_promo_products()
799
+ promo_with_images = repository.load_promo_products(with_images_only=True)
800
+ except Exception as e:
801
+ logger.error(f"Error loading statistics: {e}")
802
+ all_products = []
803
+ products_without_images = []
804
+ promo_products = []
805
+ promo_with_images = []
806
+
807
+ return {
808
+ "status": "success",
809
+ "statistics": {
810
+ "database": {
811
+ "total_products": len(all_products),
812
+ "products_without_images": len(products_without_images),
813
+ "products_with_images": len(all_products) - len(products_without_images)
814
+ },
815
+ "promotional": {
816
+ "total_promo_products": len(promo_products),
817
+ "promo_with_images": len(promo_with_images),
818
+ "promo_without_images": len(promo_products) - len(promo_with_images)
819
+ },
820
+ "image_coverage": {
821
+ "database_coverage": round(((len(all_products) - len(products_without_images)) / max(len(all_products), 1)) * 100, 1),
822
+ "promo_coverage": round((len(promo_with_images) / max(len(promo_products), 1)) * 100, 1)
823
+ }
824
+ },
825
+ "engine_status": "operational",
826
+ "database_connection": "connected" if repository.supabase else "disconnected",
827
+ "timestamp": time.time()
828
+ }
829
+
830
+ except Exception as e:
831
+ logger.error(f"Statistics error: {e}")
832
+ raise HTTPException(status_code=500, detail=f"Failed to get statistics: {str(e)}")
833
+
834
+ # 🚀 NEW: CACHE MANAGEMENT ENDPOINTS
835
+
836
+ @router.get("/cache/stats", summary="Get Cache Statistics")
837
+ async def get_cache_stats():
838
+ """Get cache usage statistics"""
839
+ try:
840
+ cache_mgr = get_cache_manager()
841
+
842
+ if not cache_mgr:
843
+ return {
844
+ "status": "error",
845
+ "message": "Cache manager not available",
846
+ "cache_stats": {}
847
+ }
848
+
849
+ stats = cache_mgr.get_cache_stats()
850
+
851
+ return {
852
+ "status": "success",
853
+ "cache_stats": stats,
854
+ "timestamp": time.time()
855
+ }
856
+
857
+ except Exception as e:
858
+ logger.error(f"Cache stats error: {e}")
859
+ raise HTTPException(status_code=500, detail=f"Failed to get cache stats: {str(e)}")
860
+
861
+ @router.post("/cache/clear", summary="Clear Cache")
862
+ async def clear_cache(request: CacheManagementRequest):
863
+ """Clear cached results"""
864
+ try:
865
+ cache_mgr = get_cache_manager()
866
+
867
+ if not cache_mgr:
868
+ return {
869
+ "status": "error",
870
+ "message": "Cache manager not available"
871
+ }
872
+
873
+ # Clear cache based on parameters
874
+ removed_count = cache_mgr.clear_cache(
875
+ analysis_type=request.analysis_type,
876
+ older_than_hours=request.older_than_hours
877
+ )
878
+
879
+ logger.info(f"🧹 Cache cleared: {removed_count} files removed")
880
+
881
+ return {
882
+ "status": "success",
883
+ "message": f"Cache cleared successfully",
884
+ "files_removed": removed_count,
885
+ "cleared_type": request.analysis_type or "all",
886
+ "timestamp": time.time()
887
+ }
888
+
889
+ except Exception as e:
890
+ logger.error(f"Cache clear error: {e}")
891
+ raise HTTPException(status_code=500, detail=f"Failed to clear cache: {str(e)}")
892
+
893
+ @router.post("/cache/cleanup", summary="Cleanup Expired Cache")
894
+ async def cleanup_expired_cache():
895
+ """Remove only expired cache files"""
896
+ try:
897
+ cache_mgr = get_cache_manager()
898
+
899
+ if not cache_mgr:
900
+ return {
901
+ "status": "error",
902
+ "message": "Cache manager not available"
903
+ }
904
+
905
+ removed_count = cache_mgr.cleanup_expired_cache()
906
+
907
+ logger.info(f"🧹 Expired cache cleaned up: {removed_count} files removed")
908
+
909
+ return {
910
+ "status": "success",
911
+ "message": "Expired cache cleaned up successfully",
912
+ "expired_files_removed": removed_count,
913
+ "timestamp": time.time()
914
+ }
915
+
916
+ except Exception as e:
917
+ logger.error(f"Cache cleanup error: {e}")
918
+ raise HTTPException(status_code=500, detail=f"Failed to cleanup cache: {str(e)}")
919
+
920
+ @router.post("/compare-promo-stream", summary="Stream Promo Comparison with Real-Time Progress")
921
+ async def compare_promo_stream(request: PromoComparisonRequest):
922
+ """Stream promo product comparison with real-time progress updates"""
923
+
924
+ async def generate_promo_progress():
925
+ try:
926
+ logger.info(f"🏷️ Starting streaming promo comparison with threshold {request.threshold}")
927
+
928
+ repository = get_similarity_repository()
929
+
930
+ if not repository:
931
+ yield f"data: {json.dumps({'type': 'error', 'message': 'Database repository not available'})}\n\n"
932
+ return
933
+
934
+ # Load REAL promo and database products
935
+ logger.info("📊 Loading REAL promo and database products...")
936
+ try:
937
+ promo_products = repository.load_promo_products()
938
+ db_products = repository.load_all_products()
939
+ logger.info(f"✅ Loaded {len(promo_products)} promo products and {len(db_products)} database products")
940
+ except Exception as e:
941
+ logger.warning(f"❌ Failed to load real data, using samples: {e}")
942
+ promo_products = repository._get_sample_promo_products()
943
+ db_products = repository.get_sample_products()
944
+
945
+ # Check cache first (unless force refresh)
946
+ if not request.force_refresh:
947
+ cached_results = load_promo_analysis(
948
+ len(promo_products),
949
+ len(db_products),
950
+ request.threshold
951
+ )
952
+ if cached_results:
953
+ logger.info("✅ Returning cached promo analysis results via stream")
954
+ yield f"data: {json.dumps({'type': 'cache_hit', 'data': cached_results})}\n\n"
955
+ return
956
+
957
+ # Send initial status
958
+ total_comparisons = len(promo_products) * len(db_products)
959
+ yield f"data: {json.dumps({'type': 'init', 'total_promo_products': len(promo_products), 'total_db_products': len(db_products), 'total_comparisons': total_comparisons, 'threshold': request.threshold})}\n\n"
960
+
961
+ # Find matches using REAL similarity algorithm with streaming
962
+ start_time = time.time()
963
+ matches = []
964
+ completed_comparisons = 0
965
+
966
+ for i, promo in enumerate(promo_products):
967
+ promo_name = promo.get('name', '').strip()
968
+ promo_store = promo.get('store', '').strip()
969
+
970
+ if not promo_name:
971
+ continue
972
+
973
+ # Send promo progress update every 10 promos
974
+ if i % 10 == 0:
975
+ promo_progress = (i / len(promo_products)) * 100
976
+ yield f"data: {json.dumps({'type': 'promo_progress', 'current_promo': i + 1, 'total_promos': len(promo_products), 'progress': round(promo_progress, 1), 'current_promo_name': promo_name[:50], 'store': promo_store})}\n\n"
977
+
978
+ best_match = None
979
+ best_similarity = 0.0
980
+
981
+ for j, db_product in enumerate(db_products):
982
+ db_name = db_product.get('product_name', '').strip()
983
+
984
+ if not db_name:
985
+ continue
986
+
987
+ completed_comparisons += 1
988
+
989
+ # Calculate similarity
990
+ similarity = calculate_similarity(promo_name, db_name)
991
+
992
+ # Send detailed comparison progress every 5000 comparisons
993
+ if completed_comparisons % 5000 == 0:
994
+ overall_progress = (completed_comparisons / total_comparisons) * 100
995
+ yield f"data: {json.dumps({'type': 'comparison_progress', 'completed_comparisons': completed_comparisons, 'total_comparisons': total_comparisons, 'progress': round(overall_progress, 1), 'comparing': f'{promo_name[:30]} vs {db_name[:30]}'})}\n\n"
996
+ await asyncio.sleep(0.01) # Small delay
997
+
998
+ if similarity >= request.threshold and similarity > best_similarity:
999
+ best_similarity = similarity
1000
+ confidence = calculate_confidence(similarity, promo_name, db_name)
1001
+
1002
+ best_match = {
1003
+ 'promo_id': promo.get('id'),
1004
+ 'promo_name': promo_name,
1005
+ 'promo_store': promo_store,
1006
+ 'promo_price': promo.get('promo_price', 0),
1007
+ 'regular_price': promo.get('regular_price', 0),
1008
+ 'picture_id': promo.get('picture_id'),
1009
+ 'db_product_id': db_product.get('product_id'),
1010
+ 'db_product_name': db_name,
1011
+ 'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
1012
+ 'similarity': round(similarity, 3),
1013
+ 'confidence': round(confidence, 3)
1014
+ }
1015
+
1016
+ # If match found, send immediate update
1017
+ if best_match:
1018
+ matches.append(best_match)
1019
+ yield f"data: {json.dumps({'type': 'match_found', 'match': best_match, 'total_matches': len(matches)})}\n\n"
1020
+ logger.info(f"🔍 PROMO MATCH FOUND via stream: {promo_name} ↔ {best_match['db_product_name']} ({best_similarity:.3f})")
1021
+
1022
+ # Limit results if requested
1023
+ if request.max_results and len(matches) >= request.max_results:
1024
+ yield f"data: {json.dumps({'type': 'max_results_reached', 'max_results': request.max_results, 'matches_found': len(matches)})}\n\n"
1025
+ break
1026
+
1027
+ analysis_time = time.time() - start_time
1028
+
1029
+ # Prepare final results
1030
+ results = {
1031
+ "matches": matches,
1032
+ "statistics": {
1033
+ "total_promo_products": len(promo_products),
1034
+ "total_database_products": len(db_products),
1035
+ "total_comparisons": completed_comparisons,
1036
+ "matches_found": len(matches),
1037
+ "match_rate_percentage": round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
1038
+ "threshold_used": request.threshold,
1039
+ "analysis_time_seconds": round(analysis_time, 2)
1040
+ }
1041
+ }
1042
+
1043
+ # Cache the results
1044
+ cache_parameters = {
1045
+ 'threshold': request.threshold,
1046
+ 'max_results': request.max_results,
1047
+ 'promo_count': len(promo_products),
1048
+ 'db_count': len(db_products),
1049
+ 'analysis_date': time.strftime('%Y-%m-%d %H:%M:%S')
1050
+ }
1051
+
1052
+ cache_key = cache_promo_analysis(
1053
+ len(promo_products),
1054
+ len(db_products),
1055
+ request.threshold,
1056
+ results,
1057
+ cache_parameters
1058
+ )
1059
+
1060
+ # Send final results
1061
+ yield f"data: {json.dumps({'type': 'complete', 'data': results, 'cache_key': cache_key})}\n\n"
1062
+
1063
+ logger.info(f"✅ Streaming promo comparison complete: {len(matches)} matches found")
1064
+ logger.info(f"💾 Results cached with key: {cache_key}")
1065
+
1066
+ except Exception as e:
1067
+ logger.error(f"Streaming promo comparison error: {e}")
1068
+ yield f"data: {json.dumps({'type': 'error', 'message': str(e)})}\n\n"
1069
+
1070
+ return StreamingResponse(
1071
+ generate_promo_progress(),
1072
+ media_type="text/event-stream",
1073
+ headers={
1074
+ "Cache-Control": "no-cache",
1075
+ "Connection": "keep-alive",
1076
+ "Content-Type": "text/event-stream",
1077
+ "X-Accel-Buffering": "no" # Disable nginx buffering
1078
+ }
1079
+ )
db/similarity_repository.py ADDED
@@ -0,0 +1,462 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Similarity Repository - Backend Database Operations
3
+ Handles all Supabase interactions for the similarity engine
4
+ VERIFIED FOR REAL DATA CONNECTION
5
+ """
6
+
7
+ import os
8
+ import logging
9
+ import sys
10
+ from typing import List, Dict, Any, Optional
11
+
12
+ # Configure logging
13
+ logging.basicConfig(level=logging.INFO)
14
+ logger = logging.getLogger(__name__)
15
+
16
class SimilarityRepository:
    """Repository for similarity-engine database operations.

    Wraps all Supabase access used by the similarity engine: loading the
    product catalogue, loading promotional products, updating product images
    and running connection diagnostics.  Every loader degrades gracefully to
    bundled sample data when the database is unreachable.

    Fixes over the previous revision:
      * Removed hardcoded Supabase user credentials (security leak) — the
        email/password must now come from SUPABASE_USER_EMAIL /
        SUPABASE_USER_PASSWORD.
      * ``load_promo_products`` builds a fresh query per request instead of
        reusing one mutable PostgREST builder for the count query and every
        paged ``.range()`` fetch (builder state stacked across calls).
      * ``load_all_products`` counts rows with a lightweight query instead of
        downloading the whole table just to read ``count``.
      * ``.env`` is loaded before the environment variables are read.
    """

    # Column selection shared by the paged product loaders.
    _PRODUCT_COLUMNS = '''
        product_id,
        product_name,
        product_weight,
        product_image,
        brand_id,
        brand:brands (
            brand_id,
            brand_name
        )
    '''

    # Column selection for promotional products.
    _PROMO_COLUMNS = '''
        id,
        name,
        store,
        description,
        picture_id,
        regular_price,
        promo_price
    '''

    def __init__(self):
        """Initialize the Supabase connection from environment variables."""
        # Load .env (from the parent directory, where the main app lives)
        # BEFORE reading any variables, so both sources are visible.
        try:
            from dotenv import load_dotenv
            env_path = os.path.join(os.path.dirname(os.path.dirname(__file__)), '.env')
            load_dotenv(env_path)
            logger.info(f"📋 Loaded .env from: {env_path}")
        except ImportError:
            logger.warning("python-dotenv not available, using system environment variables")
        except Exception as e:
            logger.warning(f"Could not load .env file: {e}")

        # Backend variables take precedence; fall back to the VITE_ names
        # shared with the frontend build.
        self.supabase_url = os.getenv('SUPABASE_URL') or os.getenv('VITE_SUPABASE_URL')
        self.supabase_key = os.getenv('SUPABASE_KEY') or os.getenv('VITE_SUPABASE_KEY')
        # SECURITY: user credentials come exclusively from the environment.
        # Never hardcode fallback values here — they end up in source control.
        self.user_email = os.getenv('SUPABASE_USER_EMAIL')
        self.user_password = os.getenv('SUPABASE_USER_PASSWORD')

        logger.info(f"📋 Supabase URL: {self.supabase_url[:30] if self.supabase_url else 'Not found'}...")
        logger.info(f"📋 Supabase Key: {self.supabase_key[:20] if self.supabase_key else 'Not found'}...")

        self.supabase = None
        if not self.supabase_url or not self.supabase_key:
            logger.error("❌ Supabase credentials not found in environment variables")
            logger.error("💡 Looking for SUPABASE_URL/SUPABASE_KEY or VITE_SUPABASE_URL/VITE_SUPABASE_KEY")
            return

        self._authenticate()

    def _authenticate(self) -> bool:
        """Create the Supabase client and sign the configured user in.

        Returns:
            True on successful authentication, False otherwise.  On failure
            ``self.supabase`` may still hold an unauthenticated client.
        """
        try:
            from supabase import create_client

            logger.info("🔗 Connecting to Supabase...")
            self.supabase = create_client(self.supabase_url, self.supabase_key)

            if not self.user_email or not self.user_password:
                logger.error("❌ SUPABASE_USER_EMAIL / SUPABASE_USER_PASSWORD not configured")
                return False

            logger.info("🔐 Authenticating user...")
            auth_result = self.supabase.auth.sign_in_with_password({
                "email": self.user_email,
                "password": self.user_password
            })

            if auth_result.user:
                logger.info(f"✅ Authenticated as: {auth_result.user.email}")
                return True
            logger.error("❌ Authentication failed - no user returned")
            return False

        except ImportError as e:
            logger.error(f"❌ Supabase library not available: {e}")
            logger.error("💡 Install with: pip install supabase")
            return False
        except Exception as e:
            logger.error(f"❌ Authentication error: {e}")
            return False

    def load_all_products(self) -> List[Dict[str, Any]]:
        """Load every product (with brand join) using paged queries.

        Returns:
            List of validated product dictionaries; sample data on failure.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection - returning sample data")
            return self.get_sample_products()

        try:
            logger.info("📊 Loading all products from database...")

            # Lightweight count: request a single id with an exact count
            # header instead of downloading every row just to count them.
            count_result = self.supabase.table('products').select(
                'product_id', count='exact').limit(1).execute()
            total_count = count_result.count
            logger.info(f"📋 Total products in database: {total_count}")

            if not total_count:
                logger.warning("❌ No products found in database")
                return self.get_sample_products()

            # Fetch all products in batches.
            page_size = 1000
            all_products = []
            page = 0

            while True:
                offset = page * page_size
                logger.info(f"⏳ Fetching products {offset + 1} to {offset + page_size}...")

                # Build a fresh query per page: supabase-py builders are
                # mutable, so reusing one across .range() calls stacks state.
                batch_result = self.supabase.table('products').select(
                    self._PRODUCT_COLUMNS).range(offset, offset + page_size - 1).execute()
                batch_products = batch_result.data

                if not batch_products:
                    break

                all_products.extend(batch_products)
                page += 1

                if len(all_products) >= total_count:
                    break

            logger.info(f"✅ Loaded {len(all_products)} products successfully")
            return self._validate_products(all_products)

        except Exception as e:
            logger.error(f"❌ Error loading products: {e}")
            logger.info("🔄 Falling back to sample data")
            return self.get_sample_products()

    def load_promo_products(self, with_images_only: bool = False) -> List[Dict[str, Any]]:
        """Load promotional products, optionally only those with an image.

        Args:
            with_images_only: When True, only rows with a non-null picture_id.

        Returns:
            List of promo product dictionaries; sample data on failure.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection - returning sample promo data")
            return self._get_sample_promo_products()

        def build_query(columns: str, count: Optional[str] = None):
            # Fresh builder per request — the previous implementation reused
            # one builder for the count query and for every page, which
            # mutated the select list and stacked .range() state.
            if count:
                query = self.supabase.table('promo_products').select(columns, count=count)
            else:
                query = self.supabase.table('promo_products').select(columns)
            if with_images_only:
                query = query.not_.is_('picture_id', 'null')
            return query

        try:
            logger.info("📊 Loading promo products from database...")
            if with_images_only:
                logger.info("🖼️ Filtering for promo products with images only")

            # Get total count first (lightweight: one id + exact count header).
            try:
                count_result = build_query('id', count='exact').limit(1).execute()
                total_count = count_result.count
                logger.info(f"📋 Total promo products found: {total_count}")
            except Exception as count_error:
                logger.error(f"❌ Error getting promo count: {count_error}")
                # Fall back to an unfiltered simple count.
                simple_count = self.supabase.table('promo_products').select('id', count='exact').execute()
                total_count = simple_count.count
                logger.info(f"📋 Simple count result: {total_count}")

            if not total_count:
                logger.warning("❌ No promo products found in database")
                return self._get_sample_promo_products()

            # Fetch all promo products in batches.
            page_size = 1000
            all_promo_products = []
            page = 0

            logger.info(f"📦 Starting to fetch {total_count} promo products in batches...")

            while True:
                offset = page * page_size
                logger.info(f"⏳ Fetching promo products {offset + 1} to {offset + page_size}...")

                try:
                    batch_result = build_query(self._PROMO_COLUMNS).range(
                        offset, offset + page_size - 1).execute()
                    batch_products = batch_result.data

                    if not batch_products:
                        logger.info(f"📝 No more promo products found at offset {offset}")
                        break

                    all_promo_products.extend(batch_products)
                    page += 1

                    logger.info(f"✅ Loaded batch: {len(batch_products)} products (total so far: {len(all_promo_products)})")

                    if len(all_promo_products) >= total_count:
                        break

                except Exception as batch_error:
                    logger.error(f"❌ Error loading batch at offset {offset}: {batch_error}")
                    break

            logger.info(f"✅ Successfully loaded {len(all_promo_products)} promo products from database")

            # Log a sample of what we loaded for quick sanity checks.
            if all_promo_products:
                sample_product = all_promo_products[0]
                logger.info(f"📝 Sample promo product: {sample_product.get('name', 'No name')} from {sample_product.get('store', 'No store')}")

            return all_promo_products

        except Exception as e:
            logger.error(f"❌ Error loading promo products: {e}")
            logger.error(f"❌ Exception type: {type(e).__name__}")
            logger.error(f"❌ Exception details: {str(e)}")
            logger.info("🔄 Falling back to sample promo data")
            return self._get_sample_promo_products()

    def update_product_image(self, product_id: str, image_url: str) -> bool:
        """Update a product's image URL in the database.

        Args:
            product_id: Product ID to update.
            image_url: New image URL.

        Returns:
            True if the row was updated, False otherwise.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection")
            return False

        try:
            logger.info(f"📊 Updating product {product_id} with image URL")

            result = self.supabase.table('products').update({
                'product_image': image_url
            }).eq('product_id', product_id).execute()

            if result.data:
                logger.info(f"✅ Updated product {product_id} with image")
                return True
            logger.error(f"❌ Failed to update product {product_id}")
            return False

        except Exception as e:
            logger.error(f"❌ Database update error for product {product_id}: {e}")
            return False

    def get_products_without_images(self, limit: Optional[int] = None) -> List[Dict[str, Any]]:
        """Get products that don't have an image yet.

        Args:
            limit: Maximum number of products to return (None = no limit).

        Returns:
            Validated list of products whose product_image is null.
        """
        if not self.supabase:
            logger.error("❌ No Supabase connection")
            return []

        try:
            logger.info("📊 Loading products without images...")

            query = self.supabase.table('products').select('''
                product_id,
                product_name,
                product_weight,
                brand:brands (
                    brand_id,
                    brand_name
                )
            ''').is_('product_image', 'null')

            if limit:
                query = query.limit(limit)

            result = query.execute()
            products = result.data

            logger.info(f"✅ Found {len(products)} products without images")
            return self._validate_products(products)

        except Exception as e:
            logger.error(f"❌ Error loading products without images: {e}")
            return []

    def _validate_products(self, products: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """Validate and clean raw product rows.

        Drops non-dict entries and rows without a usable name; backfills a
        synthetic product_id when missing; strips whitespace from names.

        Args:
            products: Raw product list.

        Returns:
            Validated product list.
        """
        validated = []

        for i, product in enumerate(products):
            if not isinstance(product, dict):
                logger.warning(f"⚠️ Skipping invalid product at index {i}: not a dictionary")
                continue

            product_id = product.get('product_id')
            product_name = product.get('product_name')

            if not product_id:
                product['product_id'] = f"unknown_{i}"
                logger.warning(f"⚠️ Added missing product_id for product at index {i}")

            if not product_name or not product_name.strip():
                logger.warning(f"⚠️ Skipping product with missing/empty name: {product}")
                continue

            # Clean product name
            product['product_name'] = product_name.strip()

            validated.append(product)

        logger.info(f"✅ Validated {len(validated)} products (skipped {len(products) - len(validated)})")
        return validated

    def get_sample_products(self) -> List[Dict[str, Any]]:
        """Return a fixed set of sample products for offline testing."""
        logger.info("📊 Using sample products for testing")
        return [
            {
                "product_id": "1",
                "product_name": "vindija mlijeko cokoladno 2.8%",
                "product_weight": "1L",
                "brand": {"brand_id": "1", "brand_name": "Vindija"}
            },
            {
                "product_id": "2",
                "product_name": "vindija cokoladno mlijeko 2.8%",
                "product_weight": "1L",
                "brand": {"brand_id": "1", "brand_name": "Vindija"}
            },
            {
                "product_id": "3",
                "product_name": "Apple iPhone 13 Pro",
                "product_weight": "238g",
                "brand": {"brand_id": "3", "brand_name": "Apple"}
            },
            {
                "product_id": "4",
                "product_name": "iPhone 13 Pro Apple",
                "product_weight": "238g",
                "brand": {"brand_id": "3", "brand_name": "Apple"}
            },
            {
                "product_id": "5",
                "product_name": "Samsung Galaxy S22",
                "product_weight": "167g",
                "brand": {"brand_id": "4", "brand_name": "Samsung"}
            },
            {
                "product_id": "6",
                "product_name": "Galaxy S22 Samsung",
                "product_weight": "167g",
                "brand": {"brand_id": "4", "brand_name": "Samsung"}
            },
            {
                "product_id": "7",
                "product_name": "Coca Cola 330ml",
                "product_weight": "330ml",
                "brand": {"brand_id": "7", "brand_name": "Coca-Cola"}
            },
            {
                "product_id": "8",
                "product_name": "Pepsi Cola 330ml",
                "product_weight": "330ml",
                "brand": {"brand_id": "8", "brand_name": "PepsiCo"}
            }
        ]

    def _get_sample_promo_products(self) -> List[Dict[str, Any]]:
        """Return a fixed set of sample promo products for offline testing."""
        logger.info("📊 Using sample promo products for testing")
        return [
            {"id": "p1", "name": "vindija jogurt prirodni", "store": "Konzum", "promo_price": 1.20, "regular_price": 1.50, "picture_id": "img1"},
            {"id": "p2", "name": "coca cola original 500ml", "store": "Konzum", "promo_price": 0.80, "regular_price": 1.10, "picture_id": "img2"},
            {"id": "p3", "name": "samsung galaxy phone", "store": "Links", "promo_price": 299.99, "regular_price": 399.99, "picture_id": "img3"},
        ]

    def test_connection(self) -> Dict[str, Any]:
        """Probe both tables and report connection status.

        Returns:
            Status dictionary with connection flag, table counts and (on
            success) a truncated database URL.
        """
        if not self.supabase:
            return {
                "connected": False,
                "error": "No Supabase client initialized",
                "products_count": 0,
                "promo_products_count": 0
            }

        try:
            # Lightweight probes: one row each, exact count from the header.
            products_result = self.supabase.table('products').select('product_id', count='exact').limit(1).execute()
            products_count = products_result.count

            promo_result = self.supabase.table('promo_products').select('id', count='exact').limit(1).execute()
            promo_count = promo_result.count

            return {
                "connected": True,
                "products_count": products_count,
                "promo_products_count": promo_count,
                "database_url": self.supabase_url[:30] + "..." if self.supabase_url else "Unknown"
            }

        except Exception as e:
            return {
                "connected": False,
                "error": str(e),
                "products_count": 0,
                "promo_products_count": 0
            }
452
+
453
+
454
# Global repository instance (lazily created singleton)
_repository = None

def get_similarity_repository() -> SimilarityRepository:
    """Return the process-wide SimilarityRepository, creating it on first use."""
    global _repository
    if _repository is not None:
        return _repository
    _repository = SimilarityRepository()
    return _repository
product_detector/mock_detector.py ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from typing import List, Dict
3
+ import warnings
4
+
5
class MockObjectDetector:
    """
    Mock Object Detector to temporarily replace the broken ONNX model.

    Produces a single dummy detection so the rest of the pipeline (and the
    server) keeps working while the real ONNX model is repaired.

    Fix over the previous revision: the reported class label is now taken
    from the configured class list (first entry) instead of being hardcoded
    to "product", which might not be a valid class name at all.
    """

    def __init__(self, model_path: str, class_names: List[str], input_size: int = 640):
        """Mirror the real detector's constructor signature.

        Args:
            model_path: Path to the (corrupted) model file — accepted for
                interface compatibility, intentionally unused.
            class_names: Labels the real model would predict.
            input_size: Expected square input size in pixels.
        """
        self.class_names = class_names
        self.input_size = input_size
        print(f"🔧 Mock detector initialized - model file was corrupted")
        print(f"📝 Available classes: {class_names}")

    def predict(self, image: np.ndarray) -> List[Dict]:
        """
        Mock prediction method - returns one sample detection.

        Replace this with the real detector once the model is fixed.

        Args:
            image: Input image array (ignored by the mock).

        Returns:
            List with a single detection dict: class label, confidence,
            pixel bbox [x1, y1, x2, y2] and normalized bbox [cx, cy, w, h].
        """
        mock_detections = [
            {
                # Use the first configured class so the label is always valid.
                "class": self.class_names[0] if self.class_names else "unknown",
                "confidence": 0.85,
                "bbox": [100, 100, 300, 250],  # x1, y1, x2, y2 (pixels)
                "bbox_normalized": [0.3, 0.3, 0.4, 0.5]  # center_x, center_y, width, height (normalized)
            }
        ]

        print(f"🔍 Mock detection completed - found {len(mock_detections)} objects")
        return mock_detections
requirements.txt CHANGED
@@ -12,4 +12,8 @@ python-dotenv
12
  supabase
13
  rembg
14
  httpx
15
- unidecode
 
 
 
 
 
12
  supabase
13
  rembg
14
  httpx
15
+ unidecode
16
+ # Similarity Engine Dependencies
17
+ requests
18
+ python-dateutil
19
+ pydantic
similarity_engine/__init__.py ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Similarity Engine Package
3
+ Backend version of the Price Hunter similarity engine
4
+ """
5
+
6
+ from .similarity_core import calculate_similarity, calculate_confidence
7
+ from .product_comparator import ProductComparator, compare_products_batch
8
+ from .enhanced_image_processor import get_image_processor
9
+
10
+ __version__ = "1.0.0"
11
+ __all__ = [
12
+ "calculate_similarity",
13
+ "calculate_confidence",
14
+ "ProductComparator",
15
+ "compare_products_batch",
16
+ "get_image_processor"
17
+ ]
similarity_engine/enhanced_image_processor.py ADDED
@@ -0,0 +1,531 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced Image Processor - Multiple Sources & Flexible Processing
3
+ Supports promo products, manual uploads, URL sources, Google Images, and more
4
+ """
5
+
6
+ import os
7
+ import logging
8
+ import requests
9
+ import time
10
+ from typing import List, Dict, Any, Optional, Tuple
11
+ import sys
12
+ import os
13
+
14
+ # Add parent directory to path
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from similarity_core import calculate_similarity, calculate_confidence
18
+ from db.similarity_repository import get_similarity_repository
19
+
20
+ # Configure logging
21
+ logging.basicConfig(level=logging.INFO)
22
+ logger = logging.getLogger(__name__)
23
+
24
class EnhancedImageProcessor:
    """Enhanced image processor with multiple sources and flexible options.

    Matches image-bearing source products (promo feeds, manual uploads,
    URL lists, Google results) to database products by name similarity,
    then downloads/processes the images via the backend endpoint and
    attaches them through the similarity repository.
    """

    def __init__(self):
        """Initialize the image processor with the shared similarity repository."""
        self.repository = get_similarity_repository()
        # NOTE(review): most methods build and return fresh stats dicts via
        # _get_empty_stats(); this attribute is initialized but not updated here.
        self.processing_stats = {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0
        }

    def find_high_similarity_matches(
        self,
        source_products: List[Dict],
        target_products: List[Dict],
        threshold: float = 0.95,
        source_type: str = "promo"
    ) -> List[Dict[str, Any]]:
        """
        Find high similarity matches between source and target products

        Args:
            source_products: Products with images (promo, manual, etc.)
            target_products: Database products to match against
            threshold: Similarity threshold
            source_type: Type of source ("promo", "manual", "google", etc.)

        Returns:
            List of high similarity matches (at most one per source product)
        """
        logger.info(f"🔍 Finding high similarity matches for {source_type} images")
        logger.info(f"📊 Source products: {len(source_products)}")
        logger.info(f"📊 Target products: {len(target_products)}")
        logger.info(f"🎯 Similarity threshold: {threshold}")

        matches = []

        for i, source_product in enumerate(source_products):
            source_name = source_product.get('name', '').strip()
            if not source_name:
                continue

            logger.info(f"📊 Analyzing {source_type} product {i+1}/{len(source_products)}: {source_name[:50]}...")

            for target_product in target_products:
                target_name = target_product.get('product_name', '').strip()
                if not target_name:
                    continue

                similarity = calculate_similarity(source_name, target_name)

                if similarity >= threshold:
                    confidence = calculate_confidence(similarity, source_name, target_name)

                    match = {
                        'source_id': source_product.get('id'),
                        'source_name': source_name,
                        'source_type': source_type,
                        'target_product_id': target_product.get('product_id'),
                        'target_product_name': target_name,
                        'similarity': round(similarity, 3),
                        'confidence': round(confidence, 3),
                        'has_current_image': bool(target_product.get('product_image')),
                        'source_image_info': self._extract_image_info(source_product, source_type)
                    }

                    matches.append(match)
                    logger.info(f" 🔍 HIGH MATCH: {source_name} ↔ {target_name} ({similarity:.3f})")
                    # NOTE(review): keeps the FIRST target over threshold, not
                    # the best-scoring one — confirm this is intended.
                    break

        logger.info(f"✅ Found {len(matches)} high similarity matches")
        return matches

    def _extract_image_info(self, product: Dict, source_type: str) -> Dict[str, Any]:
        """Extract image information based on source type.

        Each source type carries its image URL and provenance under different
        keys; this normalizes them into one dictionary for downstream use.
        """
        if source_type == "promo":
            picture_id = product.get('picture_id')
            return {
                'picture_id': picture_id,
                # Promo feed images live on the 360promo CDN, keyed by picture_id.
                'image_url': f"https://backend.360promo.hr/contents/products/{picture_id}.jpg" if picture_id else None,
                'store': product.get('store'),
                'promo_price': product.get('promo_price'),
                'regular_price': product.get('regular_price')
            }
        elif source_type == "manual":
            return {
                'image_url': product.get('image_url'),
                'original_filename': product.get('filename'),
                'uploaded_by': product.get('uploaded_by')
            }
        elif source_type == "google":
            return {
                'image_url': product.get('image_url'),
                'source_page': product.get('source_page'),
                'search_query': product.get('search_query')
            }
        elif source_type == "url":
            return {
                'image_url': product.get('image_url'),
                'source_domain': product.get('source_domain')
            }
        else:
            # Unknown source types: fall back to whichever URL key is present.
            return {
                'image_url': product.get('image_url', product.get('picture_url'))
            }

    def check_image_availability(self, image_url: str) -> bool:
        """Check if image URL is accessible (HEAD request, 10s timeout).

        NOTE(review): some servers reject HEAD while serving GET fine —
        a False here may be a false negative; confirm against real hosts.
        """
        try:
            response = requests.head(image_url, timeout=10)
            return response.status_code == 200
        except Exception as e:
            logger.warning(f"⚠️ Image not accessible: {image_url} - {e}")
            return False

    def process_image_from_url(
        self,
        image_url: str,
        product_id: str,
        processing_options: Dict[str, Any] = None
    ) -> Optional[str]:
        """
        Download and process image from URL

        Args:
            image_url: Source image URL
            product_id: Target product ID
            processing_options: Processing configuration (defaults to
                background removal + 2x upscale when omitted)

        Returns:
            Processed image URL, the original URL if backend processing
            failed, or None if the download itself failed
        """
        if processing_options is None:
            processing_options = {
                'remove_background': True,
                'upscale_factor': 2,
                'target_format': 'webp',
                'quality': 85
            }

        try:
            logger.info(f"📥 Downloading image from: {image_url}")

            # Download image
            response = requests.get(image_url, timeout=30)
            if response.status_code != 200:
                logger.error(f"❌ Failed to download: HTTP {response.status_code}")
                return None

            logger.info("✅ Image downloaded successfully")

            # Try to process via backend endpoint
            processed_url = self._process_via_backend(
                response.content,
                product_id,
                processing_options
            )

            if processed_url:
                return processed_url

            # If processing fails, return original URL (best-effort fallback)
            logger.warning("⚠️ Processing failed, using original URL")
            return image_url

        except Exception as e:
            logger.error(f"❌ Error processing image from URL: {e}")
            return None

    def _process_via_backend(
        self,
        image_content: bytes,
        product_id: str,
        options: Dict[str, Any]
    ) -> Optional[str]:
        """Process image via the backend image-processing endpoint.

        Posts the raw bytes as multipart form data; endpoint is taken from
        the IMAGE_PROCESS_ENDPOINT env var with a localhost default.
        Returns the processed image URL, or None on any failure.
        """
        try:
            # Get backend endpoint
            endpoint = os.getenv('IMAGE_PROCESS_ENDPOINT', 'http://localhost:7860/products/process-product-image')

            files = {'file': ('image.jpg', image_content, 'image/jpeg')}
            # Form fields are stringified booleans/ints as the endpoint expects.
            data = {
                'remove_bg': str(options.get('remove_background', True)).lower(),
                'upscale': str(options.get('upscale_factor', 2) > 1).lower(),
                'scale_factor': str(options.get('upscale_factor', 2)),
                'process_order': 'remove_first',
                'product_id': product_id
            }

            response = requests.post(endpoint, files=files, data=data, timeout=60)

            if response.status_code == 200:
                result = response.json()
                if result.get('status') == 'success':
                    logger.info("✅ Image processed successfully via backend")
                    return result.get('image_url')

            logger.warning(f"⚠️ Backend processing failed: {response.status_code}")
            return None

        except Exception as e:
            logger.warning(f"⚠️ Backend processing unavailable: {e}")
            return None

    def process_promo_images(
        self,
        similarity_threshold: float = 0.95,
        skip_existing: bool = True,
        max_products: Optional[int] = None
    ) -> Dict[str, int]:
        """
        Process images from promotional products

        Args:
            similarity_threshold: Minimum similarity for processing
            skip_existing: Skip products that already have images
            max_products: Maximum products to process

        Returns:
            Processing statistics
        """
        logger.info("🏷️ Starting promo image processing...")

        # Load promo products with images
        promo_products = self.repository.load_promo_products(with_images_only=True)
        if not promo_products:
            logger.error("❌ No promo products with images found")
            return self._get_empty_stats()

        # Load target products
        if skip_existing:
            target_products = self.repository.get_products_without_images(max_products)
        else:
            all_products = self.repository.load_all_products()
            target_products = all_products[:max_products] if max_products else all_products

        if not target_products:
            logger.error("❌ No target products found")
            return self._get_empty_stats()

        # Find matches
        matches = self.find_high_similarity_matches(
            promo_products,
            target_products,
            similarity_threshold,
            "promo"
        )

        return self._process_matches(matches, skip_existing)

    def process_manual_upload(
        self,
        image_file: bytes,
        filename: str,
        product_id: str,
        processing_options: Dict[str, Any] = None
    ) -> bool:
        """
        Process manually uploaded image

        Args:
            image_file: Image file content
            filename: Original filename
            product_id: Target product ID
            processing_options: Processing configuration

        Returns:
            True if successful
        """
        logger.info(f"📤 Processing manual upload for product {product_id}")

        try:
            # Process image
            processed_url = self._process_via_backend(
                image_file,
                product_id,
                processing_options or {}
            )

            if not processed_url:
                logger.error("❌ Failed to process uploaded image")
                return False

            # Update database
            success = self.repository.update_product_image(product_id, processed_url)

            if success:
                # Save metadata (provenance record for the attached image)
                self.repository.save_image_metadata(product_id, {
                    'source_type': 'manual',
                    'original_filename': filename,
                    'processed_url': processed_url,
                    'upload_time': time.time()
                })

                logger.info(f"✅ Successfully attached manual upload to product {product_id}")
                return True

            return False

        except Exception as e:
            logger.error(f"❌ Error processing manual upload: {e}")
            return False

    def process_from_url_list(
        self,
        url_mappings: List[Dict[str, str]],
        processing_options: Dict[str, Any] = None
    ) -> Dict[str, int]:
        """
        Process images from a list of URL mappings

        Args:
            url_mappings: List of {'product_id': 'xxx', 'image_url': 'xxx'} mappings
            processing_options: Processing configuration

        Returns:
            Processing statistics
        """
        logger.info(f"🌐 Processing {len(url_mappings)} URL mappings...")

        stats = self._get_empty_stats()
        stats['total_processed'] = len(url_mappings)

        for mapping in url_mappings:
            product_id = mapping.get('product_id')
            image_url = mapping.get('image_url')

            # Both keys are required for a usable mapping.
            if not product_id or not image_url:
                stats['failed'] += 1
                continue

            logger.info(f"📊 Processing URL for product {product_id}")

            # Check availability
            if not self.check_image_availability(image_url):
                stats['failed'] += 1
                continue

            # Process image
            processed_url = self.process_image_from_url(
                image_url,
                product_id,
                processing_options
            )

            if processed_url:
                # Update database
                if self.repository.update_product_image(product_id, processed_url):
                    stats['successful'] += 1

                    # Save metadata
                    self.repository.save_image_metadata(product_id, {
                        'source_type': 'url',
                        'source_url': image_url,
                        'processed_url': processed_url,
                        'processing_time': time.time()
                    })
                else:
                    stats['failed'] += 1
            else:
                stats['failed'] += 1

        logger.info(f"✅ URL processing complete: {stats['successful']}/{stats['total_processed']} successful")
        return stats

    def search_and_attach_google_images(
        self,
        product_id: str,
        search_query: str,
        max_results: int = 3,
        require_approval: bool = True
    ) -> List[Dict[str, Any]]:
        """
        Search Google Images and find potential matches

        Args:
            product_id: Target product ID
            search_query: Search query for Google Images
            max_results: Maximum results to return
            require_approval: Whether manual approval is required

        Returns:
            List of potential image matches (currently hard-coded mock data;
            Google Images API integration is not implemented)
        """
        logger.info(f"🔍 Google Image search for product {product_id}: '{search_query}'")

        # TODO: Implement Google Images API integration
        # For now, return mock results
        mock_results = [
            {
                'image_url': f'https://example.com/mock-image-1.jpg',
                'thumbnail_url': f'https://example.com/mock-thumb-1.jpg',
                'source_page': f'https://example.com/product-page-1',
                'title': f'Mock result for {search_query}',
                'confidence': 0.85
            }
        ]

        logger.info(f"🔍 Found {len(mock_results)} potential Google Image matches")
        logger.warning("⚠️ Google Images integration not yet implemented - returning mock data")

        return mock_results

    def _process_matches(self, matches: List[Dict], skip_existing: bool = True) -> Dict[str, int]:
        """Process similarity matches and attach images.

        For each match: verify the source image URL is reachable, process it
        via the backend, write the resulting URL to the product record, and
        save provenance metadata. Returns per-outcome counters.
        """
        stats = self._get_empty_stats()
        stats['total_processed'] = len(matches)

        if not matches:
            return stats

        # Filter existing if needed
        if skip_existing:
            to_process = [m for m in matches if not m['has_current_image']]
            stats['skipped'] = len(matches) - len(to_process)
            matches = to_process

        logger.info(f"📊 Processing images for {len(matches)} products...")

        for match in matches:
            product_id = match['target_product_id']
            image_info = match['source_image_info']
            image_url = image_info.get('image_url')

            if not image_url:
                stats['failed'] += 1
                continue

            logger.info(f"📊 Processing image for product {product_id}")

            # Check availability
            if not self.check_image_availability(image_url):
                stats['failed'] += 1
                continue

            # Process image
            processed_url = self.process_image_from_url(image_url, product_id)

            if processed_url and self.repository.update_product_image(product_id, processed_url):
                stats['successful'] += 1

                # Save metadata
                self.repository.save_image_metadata(product_id, {
                    'source_type': match['source_type'],
                    'similarity': match['similarity'],
                    'confidence': match['confidence'],
                    'source_info': image_info,
                    'processing_time': time.time()
                })

                logger.info(f"✅ Successfully attached image to product {product_id}")
            else:
                stats['failed'] += 1

        return stats

    def _get_empty_stats(self) -> Dict[str, int]:
        """Get empty statistics dictionary (all counters zeroed)."""
        return {
            'total_processed': 0,
            'successful': 0,
            'failed': 0,
            'skipped': 0,
            'unavailable': 0
        }

    def get_processing_report(self, stats: Dict[str, int]) -> Dict[str, Any]:
        """Generate processing report with summary, timestamp and recommendations."""
        return {
            'summary': {
                'total_processed': stats['total_processed'],
                'successful': stats['successful'],
                'failed': stats['failed'],
                'skipped': stats.get('skipped', 0),
                # max(..., 1) guards against division by zero on empty runs.
                'success_rate': (stats['successful'] / max(stats['total_processed'], 1)) * 100
            },
            'timestamp': time.time(),
            'recommendations': self._generate_recommendations(stats)
        }

    def _generate_recommendations(self, stats: Dict[str, int]) -> List[str]:
        """Generate human-readable recommendations based on processing stats."""
        recommendations = []

        if stats['failed'] > stats['successful']:
            recommendations.append("High failure rate - check image sources and processing settings")

        if stats.get('skipped', 0) > 0:
            recommendations.append(f"{stats['skipped']} products already had images - consider processing all products")

        if stats['successful'] > 0:
            recommendations.append(f"Successfully processed {stats['successful']} images - consider similar processing for remaining products")

        return recommendations
521
+
522
+
523
# Module-level cache for the lazily created singleton processor.
_processor = None


def get_image_processor() -> EnhancedImageProcessor:
    """Return the shared EnhancedImageProcessor, creating it on first use."""
    global _processor
    if _processor is not None:
        return _processor
    _processor = EnhancedImageProcessor()
    return _processor
similarity_engine/product_comparator.py ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Product Comparator - Backend Version
3
+ Modified to return data instead of printing for API usage
4
+ """
5
+
6
+ import time
7
+ import logging
8
+ from typing import List, Dict, Any
9
+ import sys
10
+ import os
11
+
12
+ # Add the similarity_engine directory to path
13
+ current_dir = os.path.dirname(os.path.abspath(__file__))
14
+ sys.path.append(current_dir)
15
+
16
+ from similarity_core import calculate_similarity, calculate_confidence
17
+
18
+ # Configure logging
19
+ logging.basicConfig(level=logging.INFO)
20
+ logger = logging.getLogger(__name__)
21
+
22
class ProductComparator:
    """
    Backend version of ProductComparator that returns analysis data
    instead of printing it, so results can be served through the API.

    Attributes:
        threshold: Similarity score at or above which two names are duplicates.
        min_length_diff: Minimum length difference before a name contained in
            another is treated as a genuinely different product.
        comparison_count: Running total of comparisons performed by this instance.
        duplicates_found: Running total of duplicate pairs found by this instance.
    """

    def __init__(self, threshold: float = 0.87, min_length_diff: int = 3):
        """
        Initialize the comparator

        Args:
            threshold: Similarity threshold for considering products as duplicates
            min_length_diff: Minimum length difference to consider as different products
        """
        self.threshold = threshold
        self.min_length_diff = min_length_diff
        self.comparison_count = 0
        self.duplicates_found = 0

    def is_valid_comparison(self, name1: str, name2: str) -> bool:
        """
        Check if this comparison should be made to avoid false duplicates

        Args:
            name1: First product name
            name2: Second product name

        Returns:
            True if this comparison should be made
        """
        # Skip empty names
        if not name1 or not name2:
            return False

        # Skip identical names (already handled)
        if name1 == name2:
            return False

        # Skip near-containment: one name inside the other with only a tiny
        # length difference is almost certainly the same listing.
        if (name1 in name2 or name2 in name1) and abs(len(name1) - len(name2)) < self.min_length_diff:
            return False

        return True

    def find_all_duplicates(self, products: List[Dict[str, Any]]) -> Dict[str, Any]:
        """
        Find all duplicate pairs in product list

        Args:
            products: List of product dictionaries with 'product_id' and 'product_name'

        Returns:
            Dictionary with analysis results and duplicate pairs
        """
        start_time = time.time()
        total_products = len(products)
        total_comparisons = total_products * (total_products - 1) // 2
        duplicates = []

        logger.info(f"🔍 Starting duplicate analysis for {total_products} products")
        logger.info(f"📊 Total comparisons needed: {total_comparisons:,}")
        logger.info(f"🎯 Duplicate threshold: {self.threshold}")

        comparison_count = 0
        progress_updates = []

        for i, product1 in enumerate(products):
            product_name1 = product1.get('product_name', '').strip()
            product_id1 = product1.get('product_id', 'unknown')

            # Skip products with empty names
            if not product_name1:
                continue

            # Log progress every 100 products
            if i % 100 == 0:
                progress_updates.append(f"Analyzing product {i+1}/{total_products}")
                logger.info(f"📊 Progress: {i+1}/{total_products} products analyzed")

            # Only compare each unordered pair once (j starts at i+1).
            for j, product2 in enumerate(products[i+1:], i+1):
                product_name2 = product2.get('product_name', '').strip()
                product_id2 = product2.get('product_id', 'unknown')

                # Skip products with empty names or identical IDs
                if not product_name2 or product_id1 == product_id2:
                    continue

                # Skip invalid comparisons
                if not self.is_valid_comparison(product_name1, product_name2):
                    continue

                comparison_count += 1

                # Calculate similarity
                similarity = calculate_similarity(product_name1, product_name2)

                # Check if it's a duplicate
                if similarity >= self.threshold:
                    confidence = calculate_confidence(
                        similarity,
                        product_name1,
                        product_name2
                    )

                    duplicate_info = {
                        'product1_id': product_id1,
                        'product1_name': product_name1,
                        'product2_id': product_id2,
                        'product2_name': product_name2,
                        'similarity': round(similarity, 3),
                        'confidence': round(confidence, 3)
                    }

                    duplicates.append(duplicate_info)
                    logger.info(f" 🔍 DUPLICATE FOUND: {product_name1} ↔ {product_name2} ({similarity:.3f})")

        end_time = time.time()
        analysis_time = end_time - start_time

        # BUGFIX: previously only the local counter was updated, so
        # get_statistics() always reported zeros. Fold this run's results
        # into the instance-level counters.
        self.comparison_count += comparison_count
        self.duplicates_found += len(duplicates)

        # Return comprehensive results
        results = {
            'duplicates': duplicates,
            'analysis_summary': {
                'total_products': total_products,
                'total_comparisons': comparison_count,
                'duplicates_found': len(duplicates),
                'duplicate_rate': (len(duplicates) / max(total_products, 1)) * 100,
                'analysis_time_seconds': round(analysis_time, 2),
                'threshold_used': self.threshold,
                'success': True
            },
            'progress_log': progress_updates,
            'recommendations': self._generate_recommendations(duplicates, total_products)
        }

        logger.info(f"✅ Analysis complete: {len(duplicates)} duplicates found in {analysis_time:.1f}s")
        return results

    def _generate_recommendations(self, duplicates: List[Dict], total_products: int) -> List[str]:
        """Generate recommendations based on analysis results"""
        recommendations = []

        if not duplicates:
            recommendations.append("✅ No duplicates found - your database is clean!")
            recommendations.append("💡 Consider periodic duplicate checks as you add new products")
        else:
            duplicate_rate = (len(duplicates) / total_products) * 100

            if duplicate_rate > 10:
                recommendations.append("⚠️ High duplicate rate detected - consider cleaning database")
                recommendations.append("🔧 Review product naming conventions to reduce future duplicates")
            elif duplicate_rate > 5:
                recommendations.append("💡 Moderate duplicate rate - review and merge similar products")
            else:
                recommendations.append("✅ Low duplicate rate - good database quality")

            recommendations.append(f"📋 Review {len(duplicates)} duplicate pairs for manual decision")
            recommendations.append("💡 Higher similarity scores indicate more confident matches")

        return recommendations

    def compare_two_products(self, product1_name: str, product2_name: str) -> Dict[str, Any]:
        """
        Compare two specific products

        Args:
            product1_name: First product name
            product2_name: Second product name

        Returns:
            Comparison results
        """
        logger.info(f"🔍 Comparing '{product1_name}' vs '{product2_name}'")

        similarity = calculate_similarity(product1_name, product2_name)
        confidence = calculate_confidence(similarity, product1_name, product2_name)
        is_duplicate = similarity >= self.threshold

        # Keep the instance counters in sync for get_statistics().
        self.comparison_count += 1
        if is_duplicate:
            self.duplicates_found += 1

        # Determine assessment
        if similarity >= 0.95:
            assessment = "Perfect match - identical products"
            assessment_emoji = "✅"
        elif similarity >= 0.8:
            assessment = "Very similar - likely duplicates"
            assessment_emoji = "⚠️"
        elif similarity >= 0.7:
            assessment = "Similar - review recommended"
            assessment_emoji = "🤔"
        else:
            assessment = "Different products"
            assessment_emoji = "❌"

        results = {
            'product1_name': product1_name,
            'product2_name': product2_name,
            'similarity': round(similarity, 3),
            'confidence': round(confidence, 3),
            'is_duplicate': is_duplicate,
            'threshold_used': self.threshold,
            'assessment': {
                'description': assessment,
                'emoji': assessment_emoji,
                'category': 'identical' if similarity >= 0.95 else
                           'very_similar' if similarity >= 0.8 else
                           'similar' if similarity >= 0.7 else 'different'
            },
            'percentage_similarity': round(similarity * 100, 1),
            'percentage_confidence': round(confidence * 100, 1)
        }

        logger.info(f"📊 Comparison result: {similarity:.3f} similarity, {assessment}")
        return results

    def get_statistics(self) -> Dict[str, Any]:
        """
        Get analysis statistics

        Returns:
            Dictionary with the totals accumulated across all analyses run
            on this instance, plus its configuration.
        """
        return {
            'comparison_count': self.comparison_count,
            'duplicates_found': self.duplicates_found,
            'threshold': self.threshold,
            'min_length_diff': self.min_length_diff
        }
+ }
247
+
248
+
249
+ def compare_products_batch(
250
+ products: List[Dict[str, Any]],
251
+ threshold: float = 0.87,
252
+ return_summary_only: bool = False
253
+ ) -> Dict[str, Any]:
254
+ """
255
+ Convenience function to compare products and return results
256
+
257
+ Args:
258
+ products: List of product dictionaries
259
+ threshold: Similarity threshold for duplicates
260
+ return_summary_only: If True, only return summary stats
261
+
262
+ Returns:
263
+ Analysis results dictionary
264
+ """
265
+ logger.info(f"🚀 Starting batch product comparison with threshold {threshold}")
266
+
267
+ comparator = ProductComparator(threshold=threshold)
268
+ results = comparator.find_all_duplicates(products)
269
+
270
+ if return_summary_only:
271
+ return {
272
+ 'analysis_summary': results['analysis_summary'],
273
+ 'recommendations': results['recommendations']
274
+ }
275
+
276
+ return results
277
+
278
+
279
+ def find_product_duplicates_simple(
280
+ product_names: List[str],
281
+ threshold: float = 0.87
282
+ ) -> List[Dict[str, Any]]:
283
+ """
284
+ Simple function to find duplicates in a list of product names
285
+
286
+ Args:
287
+ product_names: List of product names
288
+ threshold: Similarity threshold
289
+
290
+ Returns:
291
+ List of duplicate pairs
292
+ """
293
+ # Convert to product format
294
+ products = [
295
+ {"product_id": str(i), "product_name": name}
296
+ for i, name in enumerate(product_names) if name and name.strip()
297
+ ]
298
+
299
+ results = compare_products_batch(products, threshold, return_summary_only=False)
300
+ return results.get('duplicates', [])
similarity_engine/promo_comparator.py ADDED
@@ -0,0 +1,278 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python3
2
+ """
3
+ Promo Products Comparator - Compare promo products against database products
4
+ Integrates with the existing similarity engine for duplicate detection
5
+ Backend version with streaming support
6
+ """
7
+
8
+ import sys
9
+ import os
10
+ import time
11
+ import logging
12
+ from typing import List, Dict, Any, AsyncGenerator
13
+ import asyncio
14
+
15
+ # Add current directory to path for imports
16
+ current_dir = os.path.dirname(os.path.abspath(__file__))
17
+ sys.path.append(current_dir)
18
+
19
+ try:
20
+ from similarity_core import calculate_similarity, calculate_confidence
21
+ from similarity_repository import get_similarity_repository
22
+ except ImportError as e:
23
+ print(f"❌ Import error: {e}")
24
+ print("💡 Make sure you're running this from the similarity_engine directory")
25
+
26
+ # Configure logging
27
+ logging.basicConfig(level=logging.INFO)
28
+ logger = logging.getLogger(__name__)
29
+
30
+
31
class PromoComparator:
    """Handles promo product comparison with streaming support.

    Compares promotional products against database products using the
    module-level ``calculate_similarity`` / ``calculate_confidence`` helpers
    and reports every promo product whose best database match scores at or
    above a configurable similarity threshold.
    """

    def __init__(self, threshold: float = 0.85):
        # Minimum similarity score for a db product to count as a match.
        self.threshold = threshold
        # Data access layer; run_promo_comparison checks it for None.
        self.repository = get_similarity_repository()

    def _best_match_for(self, promo_product: Dict, promo_name: str, promo_store: str, db_products: List[Dict]):
        """Return the best-scoring match payload for one promo product, or None.

        Scans every database product, keeps the highest similarity that is
        >= self.threshold, and builds the match dict for it. Shared by the
        batch and streaming comparison paths so their results stay identical.
        """
        best_match = None
        best_similarity = 0.0

        for db_product in db_products:
            db_name = db_product.get('product_name', '').strip()

            if not db_name:
                continue

            similarity = calculate_similarity(promo_name, db_name)

            if similarity >= self.threshold and similarity > best_similarity:
                best_similarity = similarity
                confidence = calculate_confidence(similarity, promo_name, db_name)

                best_match = {
                    'promo_id': promo_product.get('id'),
                    'promo_name': promo_name,
                    'promo_store': promo_store,
                    'promo_price': promo_product.get('promo_price', 0),
                    'regular_price': promo_product.get('regular_price', 0),
                    'picture_id': promo_product.get('picture_id'),
                    'db_product_id': db_product.get('product_id'),
                    'db_product_name': db_name,
                    'db_brand': db_product.get('brand', {}).get('brand_name', 'No Brand') if db_product.get('brand') else 'No Brand',
                    'similarity': round(similarity, 3),
                    'confidence': round(confidence, 3)
                }

        return best_match

    def compare_promo_against_database(self, promo_products: List[Dict], db_products: List[Dict]) -> List[Dict]:
        """
        Compare promo products against database products for potential matches

        Args:
            promo_products: List of promo product dictionaries
            db_products: List of database product dictionaries

        Returns:
            List of potential matches
        """
        logger.info(f"🔍 Starting promo comparison: {len(promo_products)} promo vs {len(db_products)} db products")

        matches = []
        total_comparisons = len(promo_products)

        start_time = time.time()

        for i, promo_product in enumerate(promo_products):
            promo_name = promo_product.get('name', '').strip()
            promo_store = promo_product.get('store', '').strip()

            # Skip promo rows without a usable name.
            if not promo_name:
                continue

            logger.info(f"📊 Analyzing promo product {i+1}/{total_comparisons}: '{promo_name[:50]}'")

            best_match = self._best_match_for(promo_product, promo_name, promo_store, db_products)

            if best_match:
                matches.append(best_match)
                logger.info(f"🔍 MATCH FOUND: {promo_name} ↔ {best_match['db_product_name']} ({best_match['similarity']:.3f})")

        end_time = time.time()
        analysis_time = end_time - start_time

        logger.info(f"✅ Comparison complete! Found {len(matches)} potential matches in {analysis_time:.1f} seconds")

        return matches

    async def compare_promo_streaming(self, promo_products: List[Dict], db_products: List[Dict]) -> AsyncGenerator[Dict, None]:
        """
        Compare promo products with real-time streaming updates

        Args:
            promo_products: List of promo product dictionaries
            db_products: List of database product dictionaries

        Yields:
            Progress updates and results ('init', 'comparing', 'match_found',
            then a final 'complete' event carrying all matches and a summary)
        """
        yield {
            'type': 'init',
            'total_promo_products': len(promo_products),
            'total_db_products': len(db_products),
            'threshold': self.threshold
        }

        matches = []
        total_comparisons = len(promo_products)

        start_time = time.time()

        for i, promo_product in enumerate(promo_products):
            promo_name = promo_product.get('name', '').strip()
            promo_store = promo_product.get('store', '').strip()

            if not promo_name:
                continue

            # Send current comparison info
            yield {
                'type': 'comparing',
                'current': i + 1,
                'total': total_comparisons,
                'promo_name': promo_name[:50],
                'promo_store': promo_store,
                'progress': round(((i + 1) / total_comparisons) * 100, 1)
            }

            best_match = self._best_match_for(promo_product, promo_name, promo_store, db_products)

            # If match found, send immediate update
            if best_match:
                matches.append(best_match)
                yield {
                    'type': 'match_found',
                    'match': best_match,
                    'total_matches': len(matches)
                }

            # Small delay to prevent overwhelming
            await asyncio.sleep(0.01)

        # Send final results
        analysis_time = time.time() - start_time
        yield {
            'type': 'complete',
            'matches': matches,
            'summary': {
                'total_promo_products': len(promo_products),
                'total_db_products': len(db_products),
                'matches_found': len(matches),
                'match_rate': round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                'analysis_time_seconds': round(analysis_time, 2),
                'threshold_used': self.threshold
            }
        }

    def run_promo_comparison(self, save_report: bool = True) -> Dict[str, Any]:
        """
        Main function to run promo product comparison

        Args:
            save_report: Whether to save results to file
                TODO(review): currently unused — no report is written; confirm
                whether file output should be implemented or the flag removed.

        Returns:
            Comparison results, or a dict with an 'error' key on failure
        """
        logger.info("🏷️ Starting promo products comparison")

        if not self.repository:
            logger.error("❌ Repository not available")
            return {'error': 'Repository not available'}

        # Load promo products
        logger.info("📊 Loading promotional products...")
        promo_products = self.repository.load_promo_products()

        if not promo_products:
            logger.warning("❌ No promo products found")
            return {'error': 'No promo products found'}

        # Load database products
        logger.info("📊 Loading database products...")
        db_products = self.repository.load_all_products()

        if not db_products:
            logger.warning("❌ No database products found")
            return {'error': 'No database products found'}

        # Run comparison
        matches = self.compare_promo_against_database(promo_products, db_products)

        # Prepare results
        results = {
            'matches': matches,
            'summary': {
                'total_promo_products': len(promo_products),
                'total_db_products': len(db_products),
                'matches_found': len(matches),
                'match_rate': round((len(matches) / len(promo_products)) * 100, 1) if promo_products else 0,
                'threshold_used': self.threshold
            }
        }

        logger.info(f"✅ Promo comparison complete: {len(matches)} matches found")

        return results
257
+
258
+
259
def get_promo_comparator(threshold: float = 0.85) -> PromoComparator:
    """Factory: build a PromoComparator configured with *threshold*."""
    comparator = PromoComparator(threshold=threshold)
    return comparator
262
+
263
+
264
+ # For backwards compatibility
265
def run_promo_comparison(threshold: float = 0.85, save_report: bool = True):
    """
    Main function to run promo product comparison (backwards compatibility)

    Args:
        threshold: Similarity threshold for matches
        save_report: Whether to save results to file
    """
    return PromoComparator(threshold=threshold).run_promo_comparison(save_report=save_report)
275
+
276
+
277
+ if __name__ == "__main__":
278
+ run_promo_comparison()
similarity_engine/similarity_core.py ADDED
@@ -0,0 +1,190 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ COMPLETE Similarity Engine - Core Module
3
+ All original methods preserved with improved duplicate detection
4
+ """
5
+
6
+ import re
7
+ from typing import Set
8
+
9
+
10
def normalize_product_name(name: str) -> str:
    """Lower-case *name*, replace punctuation with spaces, collapse runs of
    whitespace, and drop common Croatian/English stop words."""
    if not name:
        return ""
    cleaned = re.sub(r'\s+', ' ', re.sub(r'[^\w\s]', ' ', name.lower())).strip()
    stop_words = {'za', 'i', 'u', 'na', 'sa', 'od', 'the', 'and', 'of', 'in'}
    kept = (token for token in cleaned.split() if token not in stop_words)
    return ' '.join(kept)
20
+
21
+
22
def word_order_similarity(s1: str, s2: str) -> float:
    """Jaccard overlap of the normalized word sets of *s1* and *s2*.

    Word order itself is ignored (sets are compared), so reordered names
    score 1.0. Returns 0.0 when either normalized string is empty.
    """
    norm_a = normalize_product_name(s1)
    norm_b = normalize_product_name(s2)
    if not norm_a or not norm_b:
        return 0.0
    tokens_a = set(norm_a.split())
    tokens_b = set(norm_b.split())
    if not tokens_a and not tokens_b:
        return 1.0
    if not tokens_a or not tokens_b:
        return 0.0
    union_size = len(tokens_a | tokens_b)
    shared = len(tokens_a & tokens_b)
    return shared / union_size if union_size > 0 else 0.0
37
+
38
+
39
def dice_coefficient(s1: str, s2: str) -> float:
    """Sørensen–Dice similarity over character bigrams (case-insensitive).

    Returns 1.0 when neither string yields any bigram (both shorter than two
    characters), and 0.0 when only one of them does.
    """
    if not s1 or not s2:
        return 0.0
    a, b = s1.lower(), s2.lower()
    grams_a = {a[i:i + 2] for i in range(len(a) - 1)}
    grams_b = {b[i:i + 2] for i in range(len(b) - 1)}
    if not grams_a and not grams_b:
        return 1.0
    if not grams_a or not grams_b:
        return 0.0
    shared = grams_a & grams_b
    return 2.0 * len(shared) / (len(grams_a) + len(grams_b))
52
+
53
+
54
def jaro_winkler(s1: str, s2: str) -> float:
    """Calculate Jaro-Winkler similarity.

    Returns a score in [0, 1]; 1.0 for identical strings (case-insensitive),
    0.0 when either input is empty or no characters match. The Winkler
    adjustment boosts the base Jaro score for strings sharing a common
    prefix (capped at 4 characters, scaling factor 0.1).
    """
    if not s1 or not s2:
        return 0.0
    s1, s2 = s1.lower(), s2.lower()
    if s1 == s2:
        return 1.0
    len1, len2 = len(s1), len(s2)
    # Characters only count as matching when within this distance of each
    # other (standard Jaro matching window).
    match_window = max(len1, len2) // 2 - 1
    match_window = max(0, match_window)
    s1_matches = [False] * len1
    s2_matches = [False] * len2
    matches = 0
    transpositions = 0
    # First pass: greedily flag matching characters within the window.
    for i in range(len1):
        start = max(0, i - match_window)
        end = min(i + match_window + 1, len2)
        for j in range(start, end):
            if s2_matches[j] or s1[i] != s2[j]:
                continue
            s1_matches[i] = s2_matches[j] = True
            matches += 1
            break
    if matches == 0:
        return 0.0
    # Second pass: walk both matched sequences in order and count positions
    # where the matched characters differ (transpositions).
    k = 0
    for i in range(len1):
        if not s1_matches[i]:
            continue
        while not s2_matches[k]:
            k += 1
        if s1[i] != s2[k]:
            transpositions += 1
        k += 1
    # Standard Jaro formula; transpositions are counted in halves.
    jaro = (matches/len1 + matches/len2 + (matches - transpositions/2)/matches) / 3.0
    # Winkler prefix bonus: length of the common prefix, capped at 4.
    prefix = 0
    for i in range(min(len1, len2, 4)):
        if s1[i] == s2[i]:
            prefix += 1
        else:
            break
    return jaro + (0.1 * prefix * (1 - jaro))
96
+
97
+
98
def levenshtein_similarity(s1: str, s2: str) -> float:
    """Normalized Levenshtein similarity in [0, 1] (case-insensitive).

    1.0 means identical strings; the edit distance is divided by the length
    of the longer string. Uses a rolling two-row DP table instead of the
    full matrix — same result, O(min) memory.
    """
    if not s1 or not s2:
        return 0.0
    a, b = s1.lower(), s2.lower()
    if a == b:
        return 1.0
    previous = list(range(len(b) + 1))
    for row, ch_a in enumerate(a, start=1):
        current = [row]
        for col, ch_b in enumerate(b, start=1):
            substitution_cost = 0 if ch_a == ch_b else 1
            current.append(min(
                previous[col] + 1,            # deletion
                current[col - 1] + 1,         # insertion
                previous[col - 1] + substitution_cost,  # substitution
            ))
        previous = current
    distance = previous[len(b)]
    longest = max(len(a), len(b))
    return 1 - (distance / longest)
121
+
122
+
123
def hybrid_similarity(s1: str, s2: str) -> float:
    """Combined similarity score using multiple algorithms.

    Short-circuits on normalized equality (1.0) and on containment (0.7–0.95
    depending on the length gap); otherwise returns a weighted blend of
    word overlap (40%), Dice bigrams (30%), Jaro-Winkler (20%) and
    Levenshtein (10%).
    """
    # Normalize each input exactly once (the original recomputed both
    # normalizations for the equality check and again for the locals).
    norm1 = normalize_product_name(s1)
    norm2 = normalize_product_name(s2)
    if norm1 == norm2:
        return 1.0
    if norm1 in norm2 or norm2 in norm1:
        # One name contains the other: score by how much extra text remains.
        len_diff = abs(len(norm1) - len(norm2))
        if len_diff < 3:
            return 0.95
        elif len_diff < 10:
            return 0.85
        else:
            return 0.7
    word_sim = word_order_similarity(s1, s2) * 0.40
    dice_sim = dice_coefficient(s1, s2) * 0.30
    jaro_sim = jaro_winkler(s1, s2) * 0.20
    leven_sim = levenshtein_similarity(s1, s2) * 0.10
    return word_sim + dice_sim + jaro_sim + leven_sim
142
+
143
+
144
def calculate_similarity(text1: str, text2: str) -> float:
    """Main entry point: similarity in [0, 1] between two product names.

    Empty inputs score 0.0; exact matches (after stripping) score 1.0;
    names shorter than 5 characters are considered too ambiguous to match.
    """
    if not text1 or not text2:
        return 0.0
    left, right = text1.strip(), text2.strip()
    if left == right:
        return 1.0
    if len(text1) < 5 or len(text2) < 5:
        return 0.0
    return hybrid_similarity(left, right)
153
+
154
+
155
def calculate_confidence(similarity: float, text1: str, text2: str) -> float:
    """Adjust *similarity* into a confidence score clamped to [0, 1].

    Longer names (up to +0.15) and more words (up to +0.10) raise
    confidence; a large length gap between the names lowers it (up to -0.20).
    """
    avg_length = (len(text1) + len(text2)) / 2
    length_bonus = min(avg_length / 100, 0.15)
    avg_words = (len(text1.split()) + len(text2.split())) / 2
    word_bonus = min(avg_words / 15, 0.10)
    length_gap_penalty = min(abs(len(text1) - len(text2)) / 50, 0.20)
    adjusted = similarity + length_bonus + word_bonus - length_gap_penalty
    return min(max(adjusted, 0.0), 1.0)
168
+
169
+
170
def test_similarity_examples():
    """Print pass/fail results for a fixed set of example name pairs.

    A pair "passes" when (similarity >= 0.85) agrees with the expected
    should-match flag in the fixture table.
    """
    test_cases = [
        ("Maslac", "Maslac", True),
        ("Vrhnje za kuhanje", "Vrhnje za kuhanje 3x200g", False),
        ("Japanke copacabana lila", "Japanke copacabana flower", False),
        ("Kroasan praline pan pek 70 g", "Kroasan marelica pan pek 70 g", False),
        ("Spužva za kupanje", "Spužva baby za kupanje", False),
        ("Apple iPhone 13", "iPhone 13 Apple", True),
        ("vindija mlijeko cokoladno", "vindija cokoladno mlijeko", True)
    ]
    print("🧪 Testing Similarity Examples:")
    print("=" * 50)
    for left, right, expected_match in test_cases:
        similarity = calculate_similarity(left, right)
        confidence = calculate_confidence(similarity, left, right)
        matched = similarity >= 0.85
        status = "✅ PASS" if matched == expected_match else "❌ FAIL"
        print(f"{status} '{left}' vs '{right}'")
        print(f" Similarity: {similarity:.3f} | Confidence: {confidence:.3f}")
        print()
utils/cache_manager.py ADDED
@@ -0,0 +1,350 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Cache Manager for Similarity Engine
3
+ Handles JSON caching of analysis results for improved performance
4
+ """
5
+
6
+ import json
7
+ import os
8
+ import hashlib
9
+ import logging
10
+ from datetime import datetime, timedelta
11
+ from typing import Dict, Any, Optional, List
12
+ from pathlib import Path
13
+
14
# Configure module-wide logging once at import time.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)  # module-level logger (PEP 282 convention)
17
+
18
class SimilarityCacheManager:
    """Manages caching of similarity analysis results.

    Results are stored as JSON files under ``cache_base_dir``, one
    subdirectory per analysis type ('duplicates', 'promo_matches',
    'comparisons'). Every cache entry embeds its own creation and expiry
    timestamps, so entries can be validated and garbage-collected
    independently.
    """

    def __init__(self, cache_base_dir: str = "cache"):
        """
        Initialize cache manager

        Args:
            cache_base_dir: Base directory for cache files
        """
        self.cache_base_dir = Path(cache_base_dir)
        self.cache_dirs = {
            'duplicates': self.cache_base_dir / 'duplicates',
            'promo_matches': self.cache_base_dir / 'promo_matches',
            'comparisons': self.cache_base_dir / 'comparisons'
        }
        # Same underlying logger as the module-level one; kept on the
        # instance so the class is self-contained.
        self._log = logging.getLogger(__name__)

        # Ensure cache directories exist
        for cache_dir in self.cache_dirs.values():
            cache_dir.mkdir(parents=True, exist_ok=True)

        self._log.info(f"📁 Cache manager initialized with base dir: {self.cache_base_dir}")

    def generate_cache_key(
        self,
        analysis_type: str,
        products_count: int,
        threshold: float,
        algorithm: str = "hybrid",
        additional_params: Dict = None
    ) -> str:
        """
        Generate unique cache key for analysis parameters

        Args:
            analysis_type: Type of analysis ('duplicates', 'promo', 'comparison')
            products_count: Number of products in analysis
            threshold: Similarity threshold used
            algorithm: Algorithm used
            additional_params: Any additional parameters to include in key

        Returns:
            Unique cache key string (keys include today's date, so entries
            naturally roll over daily)
        """
        # Base parameters
        key_data = {
            'type': analysis_type,
            'count': products_count,
            'threshold': round(threshold, 2),
            'algorithm': algorithm,
            'date': datetime.now().strftime("%Y%m%d")
        }

        # Add additional parameters if provided
        if additional_params:
            key_data.update(additional_params)

        # Create hash from parameters for uniqueness (md5 is fine here:
        # this is a cache key, not a security boundary)
        key_string = json.dumps(key_data, sort_keys=True)
        key_hash = hashlib.md5(key_string.encode()).hexdigest()[:8]

        # Create readable cache key
        cache_key = f"{analysis_type}_{products_count}_{int(threshold*100)}_{algorithm}_{key_hash}"

        self._log.debug(f"🔑 Generated cache key: {cache_key}")
        return cache_key

    def get_cache_file_path(self, analysis_type: str, cache_key: str) -> Path:
        """Get full path for cache file (unknown types fall back to 'comparisons')"""
        cache_dir = self.cache_dirs.get(analysis_type, self.cache_dirs['comparisons'])
        return cache_dir / f"{cache_key}.json"

    def save_cache(
        self,
        analysis_type: str,
        cache_key: str,
        results: Dict[str, Any],
        parameters: Dict[str, Any],
        expiry_hours: int = 24
    ) -> bool:
        """
        Save analysis results to cache

        Args:
            analysis_type: Type of analysis
            cache_key: Unique cache key
            results: Analysis results to cache
            parameters: Parameters used for analysis
            expiry_hours: Hours until cache expires

        Returns:
            True if saved successfully, False otherwise
        """
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            cache_data = {
                'cache_id': cache_key,
                'analysis_type': analysis_type,
                'created_at': datetime.now().isoformat(),
                'expires_at': (datetime.now() + timedelta(hours=expiry_hours)).isoformat(),
                'parameters': parameters,
                'results': results,
                'version': '1.0'
            }

            with open(cache_file, 'w', encoding='utf-8') as f:
                json.dump(cache_data, f, indent=2, ensure_ascii=False)

            file_size = cache_file.stat().st_size / 1024  # KB
            self._log.info(f"💾 Saved cache: {cache_key} ({file_size:.1f} KB)")
            return True

        except Exception as e:
            self._log.error(f"❌ Failed to save cache {cache_key}: {e}")
            return False

    def load_cache(self, analysis_type: str, cache_key: str) -> Optional[Dict[str, Any]]:
        """
        Load cached analysis results

        Args:
            analysis_type: Type of analysis
            cache_key: Cache key to load

        Returns:
            Cached results if valid, None otherwise (expired entries are
            deleted on read)
        """
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            if not cache_file.exists():
                self._log.debug(f"📭 Cache miss: {cache_key}")
                return None

            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            # Check if cache is expired
            expiry_time = datetime.fromisoformat(cache_data['expires_at'])
            if datetime.now() > expiry_time:
                self._log.info(f"⏰ Cache expired: {cache_key}")
                cache_file.unlink()  # Remove expired cache
                return None

            self._log.info(f"✅ Cache hit: {cache_key}")
            return cache_data['results']

        except Exception as e:
            self._log.error(f"❌ Failed to load cache {cache_key}: {e}")
            return None

    def is_cache_valid(self, analysis_type: str, cache_key: str) -> bool:
        """Check if cache exists and is not yet expired (no side effects)"""
        try:
            cache_file = self.get_cache_file_path(analysis_type, cache_key)

            if not cache_file.exists():
                return False

            with open(cache_file, 'r', encoding='utf-8') as f:
                cache_data = json.load(f)

            expiry_time = datetime.fromisoformat(cache_data['expires_at'])
            return datetime.now() <= expiry_time

        except Exception:
            return False

    def clear_cache(self, analysis_type: str = None, older_than_hours: int = None) -> int:
        """
        Clear cached results

        Args:
            analysis_type: Specific analysis type to clear, or None for all
            older_than_hours: Clear cache *created* more than X hours ago,
                or None to clear everything regardless of age

        Returns:
            Number of files removed
        """
        removed_count = 0

        # Determine which directories to clear
        dirs_to_clear = [self.cache_dirs[analysis_type]] if analysis_type else self.cache_dirs.values()

        for cache_dir in dirs_to_clear:
            if not cache_dir.exists():
                continue

            for cache_file in cache_dir.glob("*.json"):
                should_remove = False

                try:
                    if older_than_hours is None:
                        should_remove = True
                    else:
                        # Check file age (based on the entry's own created_at)
                        with open(cache_file, 'r') as f:
                            cache_data = json.load(f)

                        created_time = datetime.fromisoformat(cache_data['created_at'])
                        age_hours = (datetime.now() - created_time).total_seconds() / 3600

                        if age_hours > older_than_hours:
                            should_remove = True

                    if should_remove:
                        cache_file.unlink()
                        removed_count += 1
                        self._log.info(f"🗑️ Removed cache: {cache_file.name}")

                except Exception as e:
                    self._log.warning(f"⚠️ Failed to process cache file {cache_file}: {e}")

        self._log.info(f"🧹 Cache cleanup complete: {removed_count} files removed")
        return removed_count

    def get_cache_stats(self) -> Dict[str, Any]:
        """Get cache statistics (file counts and sizes, split by analysis type)"""
        stats = {
            'total_files': 0,
            'total_size_mb': 0,
            'by_type': {},
            'cache_dirs': {}
        }

        for analysis_type, cache_dir in self.cache_dirs.items():
            if not cache_dir.exists():
                continue

            type_stats = {
                'files': 0,
                'size_mb': 0,
                'valid_files': 0,
                'expired_files': 0
            }

            for cache_file in cache_dir.glob("*.json"):
                try:
                    file_size = cache_file.stat().st_size / (1024 * 1024)  # MB
                    type_stats['files'] += 1
                    type_stats['size_mb'] += file_size

                    # Check if valid
                    with open(cache_file, 'r') as f:
                        cache_data = json.load(f)

                    expiry_time = datetime.fromisoformat(cache_data['expires_at'])
                    if datetime.now() <= expiry_time:
                        type_stats['valid_files'] += 1
                    else:
                        type_stats['expired_files'] += 1

                except Exception:
                    # Unreadable/corrupt entries are counted as expired
                    type_stats['expired_files'] += 1

            stats['by_type'][analysis_type] = type_stats
            stats['total_files'] += type_stats['files']
            stats['total_size_mb'] += type_stats['size_mb']
            stats['cache_dirs'][analysis_type] = str(cache_dir)

        return stats

    def cleanup_expired_cache(self) -> int:
        """Remove only cache files whose own 'expires_at' timestamp has passed.

        Bug fix: this previously delegated to clear_cache(older_than_hours=0),
        whose `age_hours > 0` test removed every cache file ever written (all
        files are older than zero hours), not just expired ones.

        Returns:
            Number of expired files removed
        """
        removed_count = 0
        now = datetime.now()

        for cache_dir in self.cache_dirs.values():
            if not cache_dir.exists():
                continue

            for cache_file in cache_dir.glob("*.json"):
                try:
                    with open(cache_file, 'r', encoding='utf-8') as f:
                        cache_data = json.load(f)

                    if now > datetime.fromisoformat(cache_data['expires_at']):
                        cache_file.unlink()
                        removed_count += 1
                        self._log.info(f"🗑️ Removed cache: {cache_file.name}")

                except Exception as e:
                    self._log.warning(f"⚠️ Failed to process cache file {cache_file}: {e}")

        self._log.info(f"🧹 Cache cleanup complete: {removed_count} files removed")
        return removed_count
284
+
285
+
286
# Global cache manager instance
_cache_manager = None

def get_cache_manager() -> SimilarityCacheManager:
    """Return the process-wide SimilarityCacheManager, creating it lazily."""
    global _cache_manager
    if _cache_manager is not None:
        return _cache_manager
    _cache_manager = SimilarityCacheManager()
    return _cache_manager
295
+
296
+
297
+ # Convenience functions
298
def cache_duplicate_analysis(
    products_count: int,
    threshold: float,
    results: Dict[str, Any],
    parameters: Dict[str, Any]
) -> str:
    """Persist duplicate-analysis results and return the cache key used."""
    manager = get_cache_manager()
    key = manager.generate_cache_key('duplicates', products_count, threshold)
    manager.save_cache('duplicates', key, results, parameters)
    return key
309
+
310
def load_duplicate_analysis(
    products_count: int,
    threshold: float
) -> Optional[Dict[str, Any]]:
    """Return cached duplicate-analysis results, or None on a cache miss."""
    manager = get_cache_manager()
    key = manager.generate_cache_key('duplicates', products_count, threshold)
    return manager.load_cache('duplicates', key)
318
+
319
def cache_promo_analysis(
    promo_count: int,
    db_count: int,
    threshold: float,
    results: Dict[str, Any],
    parameters: Dict[str, Any]
) -> str:
    """Persist promo-match results and return the cache key used.

    The key is derived from the combined product count plus the individual
    promo/db counts, so runs over different splits do not collide.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key(
        'promo_matches',
        promo_count + db_count,
        threshold,
        additional_params={'promo_count': promo_count, 'db_count': db_count}
    )
    manager.save_cache('promo_matches', key, results, parameters)
    return key
336
+
337
def load_promo_analysis(
    promo_count: int,
    db_count: int,
    threshold: float
) -> Optional[Dict[str, Any]]:
    """Return cached promo-match results, or None on a cache miss.

    Key derivation mirrors cache_promo_analysis exactly so lookups hit the
    entries that function wrote.
    """
    manager = get_cache_manager()
    key = manager.generate_cache_key(
        'promo_matches',
        promo_count + db_count,
        threshold,
        additional_params={'promo_count': promo_count, 'db_count': db_count}
    )
    return manager.load_cache('promo_matches', key)