Franko Fišter committed
Commit e245176 · 1 Parent(s): 4c7debd

Working scraping endpoint, WIP product image scraping and processing

api/main.py CHANGED
@@ -3,6 +3,7 @@ from fastapi.middleware.cors import CORSMiddleware
 from config.settings import API_HOST, API_PORT
 from api.product_routes import router as product_router
 from api.receipt_routes import router as receipt_router
+from api.scrape_routes import router as scrape_router
 
 # Initialize FastAPI
 app = FastAPI(title="Product and Receipt API")
@@ -20,6 +21,7 @@ app.add_middleware(
 # Add routers
 app.include_router(product_router)
 app.include_router(receipt_router)
+app.include_router(scrape_router)
 
 @app.get("/", tags=["Health"])
 def health_check():
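Reviewer note: this is all the wiring the new feature needs; once the scrape router is included, its /scrape endpoints are served alongside the product and receipt routes. A minimal sketch of running the app locally, assuming a conventional uvicorn launcher (the launcher itself is not part of this diff; only the API_HOST and API_PORT names come from the imported settings):

    # Hypothetical launcher; the repository may wire this up differently.
    import uvicorn
    from config.settings import API_HOST, API_PORT

    if __name__ == "__main__":
        uvicorn.run("api.main:app", host=API_HOST, port=API_PORT)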
api/product_routes.py CHANGED
@@ -1,7 +1,10 @@
-from fastapi import APIRouter, File, UploadFile, HTTPException
-from utils.image_processing import read_image_file
+from fastapi import APIRouter, File, UploadFile, HTTPException, Form
+from utils.image_processing import read_image_file, process_product_image
 from product_detector.detector import ObjectDetector
 from config.settings import MODEL_ONNX_PATH, CLASS_NAMES, INPUT_SIZE
+from typing import Optional
+from db.supabase_client import SupabaseClient
+import uuid
 
 # Initialize the detector
 detector = ObjectDetector(
@@ -31,3 +34,126 @@ async def detect_objects(file: UploadFile = File(...)):
         raise
     except Exception as e:
         raise HTTPException(500, f"Processing error: {str(e)}")
+
+@router.post("/process-image")
+async def process_image(
+    file: UploadFile = File(...),
+    remove_bg: bool = Form(True),
+    upscale: bool = Form(True),
+    scale_factor: int = Form(2),
+    process_order: str = Form("remove_first")
+):
+    """
+    Process product images by removing background and/or upscaling
+
+    - remove_bg: Whether to remove the white background
+    - upscale: Whether to upscale the image
+    - scale_factor: Scale factor for upscaling (2, 3, or 4)
+    - process_order: Order of operations ('remove_first' or 'upscale_first')
+    """
+    try:
+        # Validate inputs
+        if scale_factor not in [2, 3, 4]:
+            raise HTTPException(400, "Scale factor must be 2, 3, or 4")
+
+        if process_order not in ["remove_first", "upscale_first"]:
+            raise HTTPException(400, "Process order must be 'remove_first' or 'upscale_first'")
+
+        if not file.content_type.startswith("image/"):
+            raise HTTPException(400, "File must be an image")
+
+        # Process the image
+        processed_image, filename = await process_product_image(
+            file,
+            remove_bg=remove_bg,
+            upscale=upscale,
+            scale_factor=scale_factor,
+            process_order=process_order
+        )
+
+        # Generate a unique ID for the image
+        image_id = str(uuid.uuid4())
+        image_path = f"{image_id}_{filename}"
+
+        # Upload the processed image to Supabase Storage
+        supabase.storage.from_("product-images").upload(
+            file=processed_image,
+            path=image_path,
+            file_options={"content-type": "image/png", "upsert": "true"}
+        )
+
+        # Get the public URL for the uploaded image
+        image_url = supabase.storage.from_("product-images").get_public_url(image_path)
+
+        return {
+            "status": "success",
+            "message": "Image processed successfully",
+            "image_url": image_url,
+            "image_path": image_path,
+            "processing": {
+                "background_removed": remove_bg,
+                "upscaled": upscale,
+                "scale_factor": scale_factor if upscale else None,
+                "process_order": process_order
+            }
+        }
+
+    except HTTPException:
+        raise
+    except Exception as e:
+        raise HTTPException(500, f"Image processing error: {str(e)}")
+
+
+@router.post("/process-product-image")
+async def process_product_image_endpoint(
+    file: UploadFile = File(...),
+    remove_bg: bool = Form(True),
+    upscale: bool = Form(True),
+    scale_factor: int = Form(2),
+    process_order: str = Form("remove_first"),
+    product_id: str = Form(None)
+):
+    """
+    Process a product image by removing background and upscaling,
+    then save to Supabase Storage
+    """
+    try:
+        # Process the image
+        processed_image, filename = await process_product_image(
+            file,
+            remove_bg=remove_bg,
+            upscale=upscale,
+            scale_factor=scale_factor,
+            process_order=process_order
+        )
+
+        # Generate a unique ID for the image
+        image_id = str(uuid.uuid4())
+        image_path = f"{image_id}_{filename}"
+
+        # Upload the processed image to Supabase Storage
+        supabase.storage.from_("product-images").upload(
+            file=processed_image,
+            path=image_path,
+            file_options={"content-type": "image/png", "upsert": "true"}
+        )
+
+        # Get the public URL for the uploaded image
+        image_url = supabase.storage.from_("product-images").get_public_url(image_path)
+
+        # If product_id is provided, update the product record
+        if product_id:
+            # Update the product_image column in the database
+            result = supabase.table("products").update({
+                "product_image": image_url
+            }).eq("id", product_id).execute()
+
+        return {
+            "status": "success",
+            "message": "Image processed successfully",
+            "image_url": image_url,
+            "image_path": image_path
+        }
+
+    except Exception as e:
+        raise HTTPException(500, f"Image processing error: {str(e)}")
api/scrape_routes.py ADDED
@@ -0,0 +1,158 @@
+from fastapi import APIRouter, HTTPException, Request, Depends, BackgroundTasks
+from db.scrape_repository import PromoProductRepository
+from utils.rate_limiter import RateLimiter
+import requests
+from concurrent.futures import ThreadPoolExecutor, as_completed
+
+# Initialize rate limiter and repository
+rate_limiter = RateLimiter(max_requests=5)  # Lower limit for scraping operations
+promo_repository = PromoProductRepository()
+
+router = APIRouter(prefix="/scrape", tags=["Data Scraping"])
+
+RETAILER_MAPPING = {
+    8: "Studenac",
+    4: "Konzum",
+    3: "Kaufland",
+    9: "Tommy",
+    109: "Spar",
+    6: "Plodine",
+    5: "Lidl"
+}
+
+def fetch_page(session, page):
+    """Fetch a single page of products"""
+    url = f"https://backend.360promo.hr/api/promotions/products?pageNumber={page}&sortBySalePercentage=False"
+    try:
+        print(f"📄 Fetching page {page}...")
+        response = session.get(url, timeout=10)
+        response.raise_for_status()
+        return page, response.json()
+    except Exception as e:
+        print(f"❌ Error on page {page}: {str(e)}")
+        return page, []
+
+def fetch_all_products():
+    products = []
+    max_workers = 8  # Adjust based on API capacity
+
+    with requests.Session() as session:
+        # First, fetch page 1 to see if there's data
+        _, page1_data = fetch_page(session, 1)
+        if not page1_data:
+            print("No data found on first page")
+            return []
+
+        products.extend(page1_data)
+
+        # Set up for concurrent fetching of subsequent pages
+        last_page_with_data = 1
+        while True:
+            # Determine the next batch of pages to fetch
+            start_page = last_page_with_data + 1
+            end_page = start_page + max_workers - 1
+
+            if start_page > 1000:  # Safety limit
+                print("Reached maximum page limit")
+                break
+
+            pages_to_fetch = list(range(start_page, end_page + 1))
+
+            # Fetch pages concurrently
+            with ThreadPoolExecutor(max_workers=max_workers) as executor:
+                futures = [executor.submit(fetch_page, session, page) for page in pages_to_fetch]
+
+                # Process results
+                new_data_found = False
+                highest_page_with_data = 0
+
+                for future in as_completed(futures):
+                    page, data = future.result()
+                    if data:  # If we got data
+                        products.extend(data)
+                        new_data_found = True
+                        highest_page_with_data = max(highest_page_with_data, page)
+
+            # If no new data was found in this batch, we're done
+            if not new_data_found:
+                break
+
+            # Update the last page with data
+            last_page_with_data = highest_page_with_data
+
+    print(f"\n✅ Total products collected: {len(products)}")
+    return products
+
+def process_products(products):
+    unified_products = []
+
+    for product in products:
+        retailer_id = product.get('retailerId')
+        if not retailer_id or retailer_id not in RETAILER_MAPPING:
+            continue
+
+        store_name = RETAILER_MAPPING[retailer_id]
+        price = product.get('promoPrice') or product.get('regularPrice')
+
+        if price is None:
+            continue
+
+        item = {
+            "store": store_name,
+            "pictureId": product.get('id'),
+            "name": product.get('name', 'Unknown Product'),
+            "description": product.get('description', ''),
+            "promoStartDate": product.get('promoStartDate'),
+            "promoEndDate": product.get('promoEndDate'),
+            "regularPrice": product.get('regularPrice'),
+            "promoPrice": product.get('promoPrice')
+        }
+
+        unified_products.append(item)
+
+    return unified_products
+
+async def scrape_and_store_products():
+    """Background task to scrape products and store them in the database"""
+    try:
+        # Fetch products from the API
+        products = fetch_all_products()
+        if not products:
+            print("No products found to scrape")
+            return 0
+
+        # Process products into a standardized format
+        unified_products = process_products(products)
+        if not unified_products:
+            print("No valid products found to store")
+            return 0
+
+        # Store products in Supabase
+        stored_count = promo_repository.upsert_multiple_products(unified_products)
+        print(f"Successfully stored {stored_count} products")
+        return stored_count
+
+    except Exception as e:
+        print(f"Error during scraping: {str(e)}")
+        return 0
+
+@router.post("/promo")
+async def trigger_promo_scrape(
+    background_tasks: BackgroundTasks,
+    request: Request
+):
+    """
+    Admin only: Trigger a promotional product scraping operation.
+    This runs in the background to avoid timeout issues.
+    """
+    try:
+        # Add the scraping task to background tasks
+        background_tasks.add_task(scrape_and_store_products)
+
+        return {
+            "status": "success",
+            "message": "Promotional product scraping started. Results will be stored in the database."
+        }
+    except Exception as e:
+        print(f"ERROR: {str(e)}")
+        raise HTTPException(500, f"Failed to start scraping operation: {str(e)}")
db/scrape_repository.py ADDED
@@ -0,0 +1,69 @@
+from typing import Dict, Any, List
+from datetime import datetime
+from .supabase_client import SupabaseClient
+
+class PromoProductRepository:
+    def __init__(self):
+        self.supabase = SupabaseClient().get_client()
+
+    def upsert_multiple_products(self, products: List[Dict[str, Any]]) -> int:
+        """
+        Upsert multiple promo products in batches
+        Returns the number of successfully upserted products
+        """
+        batch_size = 100  # Adjust based on Supabase capacity
+        successfully_upserted = 0
+        timestamp = datetime.now().isoformat()
+
+        # Process in batches to avoid request size limitations
+        for i in range(0, len(products), batch_size):
+            batch = products[i:i+batch_size]
+
+            for product in batch:
+                store = product.get("store")
+                name = product.get("name")
+
+                formatted_product = {
+                    "store": store,
+                    "picture_id": product.get("pictureId"),
+                    "name": name,
+                    "description": product.get("description", ""),
+                    "promo_start_date": product.get("promoStartDate"),
+                    "promo_end_date": product.get("promoEndDate"),
+                    "regular_price": product.get("regularPrice"),
+                    "promo_price": product.get("promoPrice"),
+                    "last_updated": timestamp
+                }
+
+                try:
+                    # Check if a product exists with the same store and name
+                    result = self.supabase.table("promo_products").select("*") \
+                        .eq("store", store) \
+                        .eq("name", name) \
+                        .execute()
+
+                    if result.data and len(result.data) > 0:
+                        # Product exists, update it
+                        record_id = result.data[0]["id"]
+                        self.supabase.table("promo_products") \
+                            .update(formatted_product) \
+                            .eq("id", record_id) \
+                            .execute()
+                    else:
+                        # Product doesn't exist, insert it
+                        self.supabase.table("promo_products") \
+                            .insert(formatted_product) \
+                            .execute()
+
+                    successfully_upserted += 1
+
+                    # Print progress periodically
+                    if successfully_upserted % 50 == 0:
+                        print(f"Processed {successfully_upserted} products so far...")
+
+                except Exception as e:
+                    print(f"Failed to upsert product '{name}' from '{store}': {str(e)}")
+                    # Continue with the next product instead of failing completely
+
+        print(f"Successfully upserted {successfully_upserted} products")
+        return successfully_upserted
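Reviewer note: the select-then-update/insert path costs two round trips per product and can race if two scrapes overlap, despite the batching. If the promo_products table has (or can be given) a unique constraint on (store, name), PostgREST's native upsert collapses each 100-row batch into a single request; a sketch under that schema assumption:

    # Assumes a UNIQUE (store, name) constraint on promo_products.
    def upsert_batch(self, batch: List[Dict[str, Any]]) -> int:
        self.supabase.table("promo_products") \
            .upsert(batch, on_conflict="store,name") \
            .execute()
        return len(batch)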
requirements.txt CHANGED
@@ -9,4 +9,5 @@ ultralytics
 python-multipart
 google-cloud-vision
 python-dotenv
-supabase
+supabase
+rembg
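Dependency note: rembg pulls in onnxruntime and, on first use, downloads its U²-Net model weights to a local cache, so the first background-removal request after a fresh deploy can be slow. A small warm-up at startup avoids paying that cost on a user request; a sketch, assuming a recent rembg version that exposes new_session:

    from rembg import new_session

    # Download/load the default model once at startup ("u2net" is rembg's default).
    u2net_session = new_session("u2net")

remove(data, session=u2net_session) can then reuse the loaded model across requests.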
utils/image_processing.py CHANGED
@@ -1,6 +1,12 @@
 import numpy as np
 import cv2
 from fastapi import UploadFile, HTTPException
+from PIL import Image
+import io
+from rembg import remove
+import time
+import uuid
+from typing import Tuple, Optional
 
 async def read_image_file(file: UploadFile) -> np.ndarray:
     """Read and process an image file from FastAPI UploadFile"""
@@ -14,3 +20,91 @@ async def read_image_file(file: UploadFile) -> np.ndarray:
         raise HTTPException(400, "Invalid image data")
 
     return cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
+
+def remove_background(image_bytes: bytes) -> bytes:
+    """Remove white background from image using rembg"""
+    try:
+        return remove(image_bytes)
+    except Exception as e:
+        print(f"Error removing background: {str(e)}")
+        raise Exception(f"Background removal error: {str(e)}")
+
+def upscale_image(image_bytes: bytes, scale_factor: int = 2) -> bytes:
+    """Upscale image using OpenCV"""
+    try:
+        # Create a numpy array from the image bytes
+        nparr = np.frombuffer(image_bytes, np.uint8)
+        img = cv2.imdecode(nparr, cv2.IMREAD_UNCHANGED)
+
+        # Handle images with an alpha channel
+        if len(img.shape) > 2 and img.shape[2] == 4:
+            # Split channels
+            b, g, r, a = cv2.split(img)
+
+            # Scale the RGB channels
+            rgb_channels = cv2.merge([b, g, r])
+            scaled_rgb = cv2.resize(rgb_channels, None, fx=scale_factor, fy=scale_factor,
+                                    interpolation=cv2.INTER_CUBIC)
+
+            # Scale the alpha channel separately
+            scaled_alpha = cv2.resize(a, None, fx=scale_factor, fy=scale_factor,
+                                      interpolation=cv2.INTER_CUBIC)
+
+            # Merge the channels back together
+            scaled_img = cv2.merge([
+                scaled_rgb[:, :, 0],
+                scaled_rgb[:, :, 1],
+                scaled_rgb[:, :, 2],
+                scaled_alpha
+            ])
+        else:
+            # Regular RGB image
+            scaled_img = cv2.resize(img, None, fx=scale_factor, fy=scale_factor,
+                                    interpolation=cv2.INTER_CUBIC)
+
+        # Encode the image back to bytes
+        success, buffer = cv2.imencode('.png', scaled_img)
+        if not success:
+            raise Exception("Failed to encode upscaled image")
+
+        return buffer.tobytes()
+    except Exception as e:
+        print(f"Error upscaling image: {str(e)}")
+        raise Exception(f"Image upscaling error: {str(e)}")
+
+async def process_product_image(
+    file: UploadFile,
+    remove_bg: bool = True,
+    upscale: bool = True,
+    scale_factor: int = 2,
+    process_order: str = "remove_first"
+) -> Tuple[bytes, str]:
+    """Process a product image with background removal and upscaling"""
+    # Read the file content
+    content = await file.read()
+    file.file.seek(0)  # Reset the file pointer for potential reuse
+
+    # Create a descriptive filename with a timestamp for uniqueness
+    timestamp = int(time.time())
+    original_filename = file.filename.split('.')
+    base_name = original_filename[0] if len(original_filename) > 0 else 'product'
+    extension = 'png'  # Always use PNG to preserve transparency
+
+    # Process the image based on the parameters and order
+    processed_content = content
+
+    if process_order == "remove_first" and remove_bg and upscale:
+        processed_content = remove_background(processed_content)
+        processed_content = upscale_image(processed_content, scale_factor)
+    elif process_order == "upscale_first" and remove_bg and upscale:
+        processed_content = upscale_image(processed_content, scale_factor)
+        processed_content = remove_background(processed_content)
+    elif remove_bg:
+        processed_content = remove_background(processed_content)
+    elif upscale:
+        processed_content = upscale_image(processed_content, scale_factor)
+
+    # Create a descriptive filename with processing info
+    processed_filename = f"{base_name}_{'nobg' if remove_bg else ''}_{'upx' + str(scale_factor) if upscale else ''}_{timestamp}.{extension}"
+
+    return processed_content, processed_filename
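Reviewer note: two small quirks worth flagging while this is WIP: the filename template leaves double underscores when either flag is off (e.g. name__upx2_1700000000.png), and Image, io, uuid, and Optional are imported but unused here. For ad-hoc testing outside FastAPI, the helper can be driven directly; a sketch assuming a local sample image and that Starlette's UploadFile accepts a plain binary file object:

    import asyncio
    from fastapi import UploadFile
    from utils.image_processing import process_product_image

    async def main():
        with open("sample.jpg", "rb") as f:  # hypothetical test image
            upload = UploadFile(file=f, filename="sample.jpg")
            data, name = await process_product_image(
                upload, remove_bg=True, upscale=True, scale_factor=2
            )
        with open(name, "wb") as out:
            out.write(data)
        print(f"wrote {name}")

    asyncio.run(main())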