Franko Fišter committed
Commit e245176 · 1 Parent(s): 4c7debd

Working scraping endpoint, WIP product image scraping and processing

api/main.py CHANGED
@@ -3,6 +3,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from config.settings import API_HOST, API_PORT
 from api.product_routes import router as product_router
 from api.receipt_routes import router as receipt_router
+from api.scrape_routes import router as scrape_router
 
 # Initialize FastAPI
 app = FastAPI(title="Product and Receipt API")
@@ -20,6 +21,7 @@ app.add_middleware(
 # Add routers
 app.include_router(product_router)
 app.include_router(receipt_router)
+app.include_router(scrape_router)
 
 @app.get("/", tags=["Health"])
 def health_check():
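Reviewer note: this is all the wiring the new feature needs; once the scrape router is included, its /scrape endpoints are served alongside the product and receipt routes. A minimal sketch of running the app locally, assuming a conventional uvicorn launcher (the launcher itself is not part of this diff; only the API_HOST and API_PORT names come from the imported settings):

    # Hypothetical launcher; the repository may wire this up differently.
    import uvicorn
    from config.settings import API_HOST, API_PORT

    if __name__ == "__main__":
        uvicorn.run("api.main:app", host=API_HOST, port=API_PORT)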
api/product_routes.py CHANGED
@@ -1,7 +1,10 @@
-from fastapi import APIRouter, File, UploadFile, HTTPException
-from utils.image_processing import read_image_file
+from fastapi import APIRouter, File, UploadFile, HTTPException, Form
+from utils.image_processing import read_image_file, process_product_image
 from product_detector.detector import ObjectDetector
 from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
+from typing import Optional
+from db.supabase_client import SupabaseClient
+import uuid
 
 # Initialize the detector
 detector = ObjectDetector(
@@ -31,3 +34,126 @@ async def detect_objects(file: UploadFile = File(...)):
         raise
     except Exception as e:
         raise HTTPException(500, f"Processing error: {str(e)}")
+
+@router.post("/process-image")
+async def process_image(
+    file: UploadFile = File(...),
+    remove_bg: bool = Form(True),
+    upscale: bool = Form(True),
+    scale_factor: int = Form(2),
+    process_order: str = Form("remove_first")
+):
+    """
+    Process product images by removing background and/or upscaling
+
+    - remove_bg: Whether to remove the white background
+    - upscale: Whether to upscale the image
+    - scale_factor: Scale factor for upscaling (2, 3, or 4)
+    - process_order: Order of operations ('remove_first' or 'upscale_first')
+    """
+    try:
+        # Validate inputs
+        if scale_factor not in [2, 3, 4]:
+            raise HTTPException(400, "Scale factor must be 2, 3, or 4")
+
+        if process_order not in ["remove_first", "upscale_first"]:
+            raise HTTPException(400, "Process order must be 'remove_first' or 'upscale_first'")
+
+        if not file.content_type.startswith("image/"):
+            raise HTTPException(400, "File must be an image")
+
+        # Process the image
+        processed_image, filename = await process_product_image(
+            file,
+            remove_bg=remove_bg,
+            upscale=upscale,
+            scale_factor=scale_factor,
+            process_order=process_order
+        )
+
+        # Generate a unique ID for the image
+        image_id = str(uuid.uuid4())
+        image_path = f"{image_id}_{filename}"
+
+        # Upload the processed image to Supabase Storage
+        supabase.storage.from_("product-images").upload(
+            file=processed_image,
+            path=image_path,
+            file_options={"content-type": "image/png", "upsert": "true"}
+        )
+
+        # Get the public URL for the uploaded image
+        image_url = supabase.storage.from_("product-images").get_public_url(image_path)
+
+        return {
+            "status": "success",
+            "message": "Image processed successfully",
+            "image_url": image_url,
+            "image_path": image_path,
+            "processing": {
+                "background_removed": remove_bg,
+                "upscaled": upscale,
+                "scale_factor": scale_factor if upscale else None,
+                "process_order": process_order
+            }
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(500, f"Image processing error: {str(e)}")
+
+
+@router.post("/process-product-image")
+async def process_product_image_endpoint(
+    file: UploadFile = File(...),
+    remove_bg: bool = Form(True),
+    upscale: bool = Form(True),
+    scale_factor: int = Form(2),
+    process_order: str = Form("remove_first"),
+    product_id: str = Form(None)
+):
+    """
+    Process a product image by removing background and upscaling,
+    then save to Supabase Storage
+    """
+    try:
+        # Process the image
+        processed_image, filename = await process_product_image(
+            file,
+            remove_bg=remove_bg,
+            upscale=upscale,
+            scale_factor=scale_factor,
+            process_order=process_order
+        )
+
+        # Generate a unique ID for the image
+        image_id = str(uuid.uuid4())
+        image_path = f"{image_id}_{filename}"
+
+        # Upload the processed image to Supabase Storage
+        supabase.storage.from_("product-images").upload(
+            file=processed_image,
+            path=image_path,
+            file_options={"content-type": "image/png", "upsert": "true"}
+        )
+
+        # Get the public URL for the uploaded image
+        image_url = supabase.storage.from_("product-images").get_public_url(image_path)
+
+        # If product_id is provided, update the product record
+        if product_id:
+            # Update the product_image column in the database
+            result = supabase.table("products").update({
+                "product_image": image_url
+            }).eq("id", product_id).execute()
+
+        return {
+            "status": "success",
+            "message": "Image processed successfully",
+            "image_url": image_url,
+            "image_path": image_path
+        }
+
+    except Exception as e:
+        raise HTTPException(500, f"Image processing error: {str(e)}")
api/scrape_routes.py ADDED
@@ -0,0 +1,158 @@
+from fastapi import APIRouter, HTTPException, Request, Depends, BackgroundTasks
+from db.scrape_repository import PromoProductRepository
+from utils.rate_limiter import RateLimiter
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Initialize rate limiter and repository
+rate_limiter = RateLimiter(max_requests=5)  # Lower limit for scraping operations
+promo_repository = PromoProductRepository()
+
+router = APIRouter(prefix="/scrape", tags=["Data Scraping"])
+
+RETAILER_MAPPING = {
+    8: "Studenac",
+    4: "Konzum",
+    3: "Kaufland",
+    9: "Tommy",
+    109: "Spar",
+    6: "Plodine",
+    5: "Lidl"
+}
+
+def fetch_page(session, page):
+    """Fetch a single page of products"""
+    url = f"https://backend.360promo.hr/api/promotions/products?pageNumber={page}&sortBySalePercentage=False"
+    try:
+        print(f"📄 Fetching page {page}...")
+        response = session.get(url, timeout=10)
+        response.raise_for_status()
+        return page, response.json()
+    except Exception as e:
+        print(f"❌ Error on page {page}: {str(e)}")
+        return page, []
+
+def fetch_all_products():
+    products = []
+    max_workers = 8  # Adjust based on API capacity
+
+    with requests.Session() as session:
+        # First, fetch page 1 to see if there's data
+        _, page1_data = fetch_page(session, 1)
+        if not page1_data:
+            print("No data found on first page")
+            return []
+
+        products.extend(page1_data)
+
+        # Set up for concurrent fetching of subsequent pages
+        last_page_with_data = 1
+        while True:
+            # Determine the next batch of pages to fetch
+            start_page = last_page_with_data + 1
+            end_page = start_page + max_workers - 1
+
+            if start_page > 1000:  # Safety limit
+                print("Reached maximum page limit")
+                break
+
+            pages_to_fetch = list(range(start_page, end_page + 1))
+
+            # Fetch pages concurrently
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = [executor.submit(fetch_page, session, page) for page in pages_to_fetch]
+
+                # Process results
+                new_data_found = False
+                highest_page_with_data = 0
+
+                for future in as_completed(futures):
+                    page, data = future.result()
+                    if data:  # If we got data
+                        products.extend(data)
+                        new_data_found = True
+                        highest_page_with_data = max(highest_page_with_data, page)
+
+            # If no new data was found in this batch, we're done
+            if not new_data_found:
+                break
+
+            # Update the last page with data
+            last_page_with_data = highest_page_with_data
+
+    print(f"\n✅ Total products collected: {len(products)}")
+    return products
+
+def process_products(products):
+    unified_products = []
+
+    for product in products:
+        retailer_id = product.get('retailerId')
+        if not retailer_id or retailer_id not in RETAILER_MAPPING:
+            continue
+
+        store_name = RETAILER_MAPPING[retailer_id]
+        price = product.get('promoPrice') or product.get('regularPrice')
+
+        if price is None:
+            continue
+
+        item = {
+            "store": store_name,
+            "pictureId": product.get('id'),
+            "name": product.get('name', 'Unknown Product'),
+            "description": product.get('description', ''),
+            "promoStartDate": product.get('promoStartDate'),
+            "promoEndDate": product.get('promoEndDate'),
+            "regularPrice": product.get('regularPrice'),
+            "promoPrice": product.get('promoPrice')
+        }
+
+        unified_products.append(item)
+
+    return unified_products
+
+async def scrape_and_store_products():
+    """Background task to scrape products and store them in the database"""
+    try:
+        # Fetch products from the API
+        products = fetch_all_products()
+        if not products:
+            print("No products found to scrape")
+            return 0
+
+        # Process products into a standardized format
+        unified_products = process_products(products)
+        if not unified_products:
+            print("No valid products found to store")
+            return 0
+
+        # Store products in Supabase
+        stored_count = promo_repository.upsert_multiple_products(unified_products)
+        print(f"Successfully stored {stored_count} products")
+        return stored_count
+
+    except Exception as e:
+        print(f"Error during scraping: {str(e)}")
+        return 0
+
+@router.post("/promo")
+async def trigger_promo_scrape(
+    background_tasks: BackgroundTasks,
+    request: Request
+):
+    """
+    Admin only: Trigger a promotional product scraping operation.
+    This runs in the background to avoid timeout issues.
+    """
+    try:
+        # Add the scraping task to background tasks
+        background_tasks.add_task(scrape_and_store_products)
+
+        return {
+            "status": "success",
+            "message": "Promotional product scraping started. Results will be stored in the database."
+        }
+    except Exception as e:
+        print(f"ERROR: {str(e)}")
+        raise HTTPException(500, f"Failed to start scraping operation: {str(e)}")
db/scrape_repository.py ADDED
@@ -0,0 +1,69 @@
+from typing import Dict, Any, List
+from datetime import datetime
+from .supabase_client import SupabaseClient
+
+class PromoProductRepository:
+    def __init__(self):
+        self.supabase = SupabaseClient().get_client()
+
+    def upsert_multiple_products(self, products: List[Dict[str, Any]]) -> int:
+        """
+        Upsert multiple promo products in batches
+        Returns the number of successfully upserted products
+        """
+        batch_size = 100  # Adjust based on Supabase capacity
+        successfully_upserted = 0
+        timestamp = datetime.now().isoformat()
+
+        # Process in batches to avoid request size limitations
+        for i in range(0, len(products), batch_size):
+            batch = products[i:i+batch_size]
+
+            for product in batch:
+                store = product.get("store")
+                name = product.get("name")
+
+                formatted_product = {
+                    "store": store,
+                    "picture_id": product.get("pictureId"),
+                    "name": name,
+                    "description": product.get("description", ""),
+                    "promo_start_date": product.get("promoStartDate"),
+                    "promo_end_date": product.get("promoEndDate"),
+                    "regular_price": product.get("regularPrice"),
+                    "promo_price": product.get("promoPrice"),
+                    "last_updated": timestamp
+                }
+
+                try:
+                    # Check if a product exists with the same store and name
+                    result = self.supabase.table("promo_products").select("*") \
+                        .eq("store", store) \
+                        .eq("name", name) \
+                        .execute()
+
+                    if result.data and len(result.data) > 0:
+                        # Product exists, update it
+                        record_id = result.data[0]["id"]
+                        self.supabase.table("promo_products") \
+                            .update(formatted_product) \
+                            .eq("id", record_id) \
+                            .execute()
+                    else:
+                        # Product doesn't exist, insert it
+                        self.supabase.table("promo_products") \
+                            .insert(formatted_product) \
+                            .execute()
+
+                    successfully_upserted += 1
+
+                    # Print progress periodically
+                    if successfully_upserted % 50 == 0:
+                        print(f"Processed {successfully_upserted} products so far...")
+
+                except Exception as e:
+                    print(f"Failed to upsert product '{name}' from '{store}': {str(e)}")
+                    # Continue with the next product instead of failing completely
+
+        print(f"Successfully upserted {successfully_upserted} products")
+        return successfully_upserted
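Reviewer note: the select-then-update/insert path costs two round trips per product and can race if two scrapes overlap, despite the batching. If the promo_products table has (or can be given) a unique constraint on (store, name), PostgREST's native upsert collapses each 100-row batch into a single request; a sketch under that schema assumption:

    # Assumes a UNIQUE (store, name) constraint on promo_products.
    def upsert_batch(self, batch: List[Dict[str, Any]]) -> int:
        self.supabase.table("promo_products") \
            .upsert(batch, on_conflict="store,name") \
            .execute()
        return len(batch)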
requirements.txt CHANGED
@@ -9,4 +9,5 @@ ultralytics
 python-multipart
 google-cloud-vision
 python-dotenv
-supabase
+supabase
+rembg
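Dependency note: rembg pulls in onnxruntime and, on first use, downloads its U²-Net model weights to a local cache, so the first background-removal request after a fresh deploy can be slow. A small warm-up at startup avoids paying that cost on a user request; a sketch, assuming a recent rembg version that exposes new_session:

    from rembg import new_session

    # Download/load the default model once at startup ("u2net" is rembg's default).
    u2net_session = new_session("u2net")

remove(data, session=u2net_session) can then reuse the loaded model across requests.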
utils/image_processing.py CHANGED
@@ -1,6 +1,12 @@
 import numpy as np
 import cv2
 from fastapi import UploadFile, HTTPException
+from PIL import Image
+import io
+from rembg import remove
+import time
+import uuid
+from typing import Tuple, Optional
 
 async def read_image_file(file: UploadFile) -> np.ndarray:
     """Read and process an image file from FastAPI UploadFile"""
@@ -14,3 +20,91 @@ async def read_image_file(file: UploadFile) -> np.ndarray:
         raise HTTPException(400, "Invalid image data")
 
     return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+def remove_background(image_bytes: bytes) -> bytes:
+    """Remove white background from image using rembg"""
+    try:
+        return remove(image_bytes)
+    except Exception as e:
+        print(f"Error removing background: {str(e)}")
+        raise Exception(f"Background removal error: {str(e)}")
+
+def upscale_image(image_bytes: bytes, scale_factor: int = 2) -> bytes:
+    """Upscale image using OpenCV"""
+    try:
+        # Create a numpy array from the image bytes
+        nparr = np.frombuffer(image_bytes, np.uint8)
+        img = cv2.imdecode(nparr, cv2.IMREAD_UNCHANGED)
+
+        # Handle images with an alpha channel
+        if len(img.shape) > 2 and img.shape[2] == 4:
+            # Split channels
+            b, g, r, a = cv2.split(img)
+
+            # Scale the RGB channels
+            rgb_channels = cv2.merge([b, g, r])
+            scaled_rgb = cv2.resize(rgb_channels, None, fx=scale_factor, fy=scale_factor,
+                                    interpolation=cv2.INTER_CUBIC)
+
+            # Scale the alpha channel separately
+            scaled_alpha = cv2.resize(a, None, fx=scale_factor, fy=scale_factor,
+                                      interpolation=cv2.INTER_CUBIC)
+
+            # Merge the channels back together
+            scaled_img = cv2.merge([
+                scaled_rgb[:, :, 0],
+                scaled_rgb[:, :, 1],
+                scaled_rgb[:, :, 2],
+                scaled_alpha
+            ])
+        else:
+            # Regular RGB image
+            scaled_img = cv2.resize(img, None, fx=scale_factor, fy=scale_factor,
+                                    interpolation=cv2.INTER_CUBIC)
+
+        # Encode the image back to bytes
+        success, buffer = cv2.imencode('.png', scaled_img)
+        if not success:
+            raise Exception("Failed to encode upscaled image")
+
+        return buffer.tobytes()
+    except Exception as e:
+        print(f"Error upscaling image: {str(e)}")
+        raise Exception(f"Image upscaling error: {str(e)}")
+
+async def process_product_image(
+    file: UploadFile,
+    remove_bg: bool = True,
+    upscale: bool = True,
+    scale_factor: int = 2,
+    process_order: str = "remove_first"
+) -> Tuple[bytes, str]:
+    """Process a product image with background removal and upscaling"""
+    # Read the file content
+    content = await file.read()
+    file.file.seek(0)  # Reset the file pointer for potential reuse
+
+    # Create a descriptive filename with a timestamp for uniqueness
+    timestamp = int(time.time())
+    original_filename = file.filename.split('.')
+    base_name = original_filename[0] if len(original_filename) > 0 else 'product'
+    extension = 'png'  # Always use PNG to preserve transparency
+
+    # Process the image based on the parameters and order
+    processed_content = content
+
+    if process_order == "remove_first" and remove_bg and upscale:
+        processed_content = remove_background(processed_content)
+        processed_content = upscale_image(processed_content, scale_factor)
+    elif process_order == "upscale_first" and remove_bg and upscale:
+        processed_content = upscale_image(processed_content, scale_factor)
+        processed_content = remove_background(processed_content)
+    elif remove_bg:
+        processed_content = remove_background(processed_content)
+    elif upscale:
+        processed_content = upscale_image(processed_content, scale_factor)
+
+    # Create a descriptive filename with processing info
+    processed_filename = f"{base_name}_{'nobg' if remove_bg else ''}_{'upx' + str(scale_factor) if upscale else ''}_{timestamp}.{extension}"
+
+    return processed_content, processed_filename
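Reviewer note: two small quirks worth flagging while this is WIP: the filename template leaves double underscores when either flag is off (e.g. name__upx2_1700000000.png), and Image, io, uuid, and Optional are imported but unused here. For ad-hoc testing outside FastAPI, the helper can be driven directly; a sketch assuming a local sample image and that Starlette's UploadFile accepts a plain binary file object:

    import asyncio
    from fastapi import UploadFile
    from utils.image_processing import process_product_image

    async def main():
        with open("sample.jpg", "rb") as f:  # hypothetical test image
            upload = UploadFile(file=f, filename="sample.jpg")
            data, name = await process_product_image(
                upload, remove_bg=True, upscale=True, scale_factor=2
            )
        with open(name, "wb") as out:
            out.write(data)
        print(f"wrote {name}")

    asyncio.run(main())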