dramp77 commited on
Commit
30ad672
·
verified ·
1 Parent(s): 23be87a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +502 -501
app.py CHANGED
@@ -1,501 +1,502 @@
1
- import requests
2
- import re
3
- import csv
4
- import datetime
5
- import gradio as gr
6
- import os
7
- from openai import OpenAI
8
- from PIL import Image
9
- from io import BytesIO
10
- from dotenv import load_dotenv
11
- import json
12
-
13
- # Load environment variables
14
- load_dotenv()
15
-
16
- # Initialize OpenAI client
17
- client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
18
-
19
- # Define reference images directory
20
- REFERENCE_IMAGES_DIR = 'reference_images'
21
- os.makedirs(REFERENCE_IMAGES_DIR, exist_ok=True)
22
-
23
- def load_reference_images():
24
- """Load all reference images from the reference directory"""
25
- reference_data = {}
26
- for category in os.listdir(REFERENCE_IMAGES_DIR):
27
- category_path = os.path.join(REFERENCE_IMAGES_DIR, category)
28
- if os.path.isdir(category_path):
29
- reference_data[category] = []
30
- for img_file in os.listdir(category_path):
31
- if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
32
- img_path = os.path.join(category_path, img_file)
33
- reference_data[category].append(img_path)
34
- return reference_data
35
-
36
- def compare_with_reference(image_url, product_category):
37
- """Compare product image with reference images using OpenAI Vision"""
38
- reference_images = load_reference_images().get(product_category, [])
39
-
40
- if not reference_images:
41
- return "Error: No reference images found for this category", 0
42
-
43
- try:
44
- messages = [
45
- {
46
- "role": "user",
47
- "content": [
48
- {
49
- "type": "text",
50
- "text": """Compare these images and determine if the product appears to be authentic.
51
- Consider:
52
- 1. Logo placement and quality
53
- 2. Product design details
54
- 3. Material quality appearance
55
- 4. Color accuracy
56
- 5. Overall build quality
57
-
58
- The first image is the reference (authentic product).
59
- The second image is the product to verify.
60
-
61
- Respond with 'Pass' if it appears authentic or 'Not Pass' if it shows signs of being counterfeit.
62
- """
63
- },
64
- {
65
- "type": "image_url",
66
- "image_url": {"url": reference_images[0]} # Using first reference image
67
- },
68
- {
69
- "type": "image_url",
70
- "image_url": {"url": image_url}
71
- }
72
- ]
73
- }
74
- ]
75
-
76
- response = client.chat.completions.create(
77
- model="gpt-4o-mini",
78
- messages=messages,
79
- max_tokens=10
80
- )
81
-
82
- result = response.choices[0].message.content.strip()
83
- confidence = 1.0 if result == "Pass" else 0.0
84
-
85
- return result, confidence
86
-
87
- except Exception as e:
88
- print(f"Error in comparison: {e}")
89
- return "Error", 0
90
-
91
- def scrape_tokopedia(product_url, product_category):
92
- """Scrape product data from Tokopedia"""
93
- try:
94
- # Validasi URL Tokopedia
95
- match = re.search(r'tokopedia\.com/([^/]+)/([^/?]+)', product_url)
96
- if not match:
97
- return "Error: Invalid Tokopedia URL format.", None
98
-
99
- headers = {
100
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
101
- 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
102
- 'Accept-Language': 'en-US,en;q=0.9',
103
- 'Accept-Encoding': 'gzip, deflate, br',
104
- 'Connection': 'keep-alive',
105
- 'Upgrade-Insecure-Requests': '1',
106
- 'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
107
- 'sec-ch-ua-platform': '"Windows"'
108
- }
109
-
110
- session = requests.Session()
111
- print(f"Fetching product page: {product_url}")
112
-
113
- # Langsung mengakses halaman produk
114
- response = session.get(product_url, headers=headers, timeout=10)
115
- response.raise_for_status() # Raise exception for bad status codes
116
-
117
- print(f"Response status: {response.status_code}")
118
-
119
- # Multiple patterns untuk mencari URL gambar
120
- image_patterns = [
121
- r'https://images\.tokopedia\.net/img/[^"\']+\.(jpg|jpeg|png)',
122
- r'https://[^"\']+\.tokopedia\.net/[^"\']+\.(jpg|jpeg|png)',
123
- r'"imageUrl":"(https://[^"]+)"',
124
- r'"url":"(https://images[^"]+)"',
125
- r'content="(https://images\.tokopedia\.net[^"]+)"'
126
- ]
127
-
128
- all_images = []
129
- for pattern in image_patterns:
130
- matches = re.findall(pattern, response.text)
131
- if matches:
132
- if isinstance(matches[0], tuple):
133
- # If the pattern contains groups, take the full match
134
- images = [m[0] if isinstance(m, tuple) else m for m in matches]
135
- else:
136
- images = matches
137
- all_images.extend(images)
138
-
139
- # Remove duplicates and clean URLs
140
- unique_images = list(set(all_images))
141
- print(f"Found {len(unique_images)} unique images")
142
-
143
- if not unique_images:
144
- # Try to extract from JSON-LD
145
- json_ld_pattern = r'<script type="application/ld\+json">(.*?)</script>'
146
- json_matches = re.findall(json_ld_pattern, response.text, re.DOTALL)
147
- for json_str in json_matches:
148
- try:
149
- json_data = json.loads(json_str)
150
- if 'image' in json_data:
151
- if isinstance(json_data['image'], list):
152
- unique_images.extend(json_data['image'])
153
- else:
154
- unique_images.append(json_data['image'])
155
- except:
156
- continue
157
-
158
- if not unique_images:
159
- return "Error: No product images found.", None
160
-
161
- # Filter and verify images
162
- valid_images = []
163
- for img_url in unique_images[:10]: # Try first 10 images
164
- try:
165
- print(f"Verifying image URL: {img_url}")
166
- img_response = session.head(img_url, headers=headers, timeout=5)
167
- content_type = img_response.headers.get('content-type', '')
168
-
169
- if img_response.status_code == 200 and 'image' in content_type.lower():
170
- valid_images.append(img_url)
171
- if len(valid_images) >= 5: # Stop after getting 5 valid images
172
- break
173
- except Exception as e:
174
- print(f"Error verifying image {img_url}: {str(e)}")
175
- continue
176
-
177
- if not valid_images:
178
- return "Error: Could not verify any product images.", None
179
-
180
- results = []
181
- for img_url in valid_images:
182
- try:
183
- print(f"Processing image: {img_url}")
184
- classification_result, confidence = compare_with_reference(img_url, product_category)
185
- results.append({
186
- 'image_url': img_url,
187
- 'classification': classification_result,
188
- 'confidence': confidence
189
- })
190
- except Exception as e:
191
- print(f"Error processing image {img_url}: {str(e)}")
192
- continue
193
-
194
- if not results:
195
- return "Error: Could not process any product images.", None
196
-
197
- output_file = 'tokopedia_authenticity_check.csv'
198
- with open(output_file, 'w', newline='', encoding='utf-8') as file:
199
- writer = csv.writer(file)
200
- writer.writerow(['image_url', 'authenticity_result', 'confidence'])
201
- for result in results:
202
- writer.writerow([
203
- result['image_url'],
204
- result['classification'],
205
- f"{result['confidence']:.2%}"
206
- ])
207
-
208
- pass_count = sum(1 for r in results if r['classification'] == 'Pass')
209
- total_images = len(results)
210
- summary = f"""
211
- Tokopedia Authenticity Check Results:
212
- Total Images Analyzed: {total_images}
213
- Appears Authentic: {pass_count}
214
- Potentially Counterfeit: {total_images - pass_count}
215
-
216
- Detailed results saved to {output_file}
217
- """
218
-
219
- return summary, results[0]['image_url']
220
-
221
- except Exception as e:
222
- print(f"Error in scrape_tokopedia: {str(e)}")
223
- return f"Error scraping Tokopedia: {str(e)}", None
224
-
225
- def scrape_shopee(product_url, product_category):
226
- """Scrape product data from Shopee"""
227
- try:
228
- # Extract shop_id and item_id from URL
229
- match = re.search(r'i\.(\d+)\.(\d+)', product_url)
230
- if not match:
231
- return "Error: Invalid Shopee URL format.", None
232
-
233
- shop_id, item_id = match.groups()
234
- api_url = f'https://shopee.co.id/api/v4/item/get?itemid={item_id}&shopid={shop_id}'
235
-
236
- headers = {
237
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
238
- 'Accept': 'application/json',
239
- 'X-Requested-With': 'XMLHttpRequest',
240
- 'Referer': 'https://shopee.co.id/',
241
- 'AF-AC-Encoding-Version': '3',
242
- }
243
-
244
- session = requests.Session()
245
- # First visit the main page to get cookies
246
- session.get(f'https://shopee.co.id/product/{shop_id}/{item_id}', headers=headers)
247
-
248
- response = session.get(api_url, headers=headers)
249
-
250
- if response.status_code != 200:
251
- return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None
252
-
253
- product_data = response.json()
254
- images = product_data.get('data', {}).get('images', [])
255
-
256
- if not images:
257
- return "Error: No product images found.", None
258
-
259
- results = []
260
- for img_id in images[:5]:
261
- image_url = f"https://cf.shopee.co.id/file/{img_id}"
262
- classification_result, confidence = compare_with_reference(image_url, product_category)
263
- results.append({
264
- 'image_url': image_url,
265
- 'classification': classification_result,
266
- 'confidence': confidence
267
- })
268
-
269
- output_file = 'shopee_authenticity_check.csv'
270
- with open(output_file, 'w', newline='', encoding='utf-8') as file:
271
- writer = csv.writer(file)
272
- writer.writerow(['image_url', 'authenticity_result', 'confidence'])
273
- for result in results:
274
- writer.writerow([
275
- result['image_url'],
276
- result['classification'],
277
- f"{result['confidence']:.2%}"
278
- ])
279
-
280
- pass_count = sum(1 for r in results if r['classification'] == 'Pass')
281
- total_images = len(results)
282
- summary = f"""
283
- Shopee Authenticity Check Results:
284
- Total Images Analyzed: {total_images}
285
- Appears Authentic: {pass_count}
286
- Potentially Counterfeit: {total_images - pass_count}
287
-
288
- Detailed results saved to {output_file}
289
- """
290
-
291
- return summary, results[0]['image_url']
292
-
293
- except Exception as e:
294
- return f"Error scraping Shopee: {str(e)}", None
295
-
296
- def scrape_blibli(product_url, product_category):
297
- """Scrape product data from Blibli"""
298
- try:
299
- # Extract product ID from URL
300
- match = re.search(r'p/([^/\?]+)', product_url)
301
- if not match:
302
- return "Error: Invalid Blibli URL format.", None
303
-
304
- product_id = match.group(1)
305
- api_url = f"https://www.blibli.com/backend/product-detail/products/{product_id}"
306
-
307
- headers = {
308
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
309
- 'Accept': 'application/json',
310
- 'X-Requested-With': 'XMLHttpRequest',
311
- 'Referer': 'https://www.blibli.com/',
312
- }
313
-
314
- session = requests.Session()
315
- response = session.get(api_url, headers=headers)
316
-
317
- if response.status_code != 200:
318
- return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None
319
-
320
- product_data = response.json()
321
- images = product_data.get('data', {}).get('images', [])
322
-
323
- if not images:
324
- return "Error: No product images found.", None
325
-
326
- results = []
327
- for img_url in images[:5]:
328
- classification_result, confidence = compare_with_reference(img_url, product_category)
329
- results.append({
330
- 'image_url': img_url,
331
- 'classification': classification_result,
332
- 'confidence': confidence
333
- })
334
-
335
- output_file = 'blibli_authenticity_check.csv'
336
- with open(output_file, 'w', newline='', encoding='utf-8') as file:
337
- writer = csv.writer(file)
338
- writer.writerow(['image_url', 'authenticity_result', 'confidence'])
339
- for result in results:
340
- writer.writerow([
341
- result['image_url'],
342
- result['classification'],
343
- f"{result['confidence']:.2%}"
344
- ])
345
-
346
- pass_count = sum(1 for r in results if r['classification'] == 'Pass')
347
- total_images = len(results)
348
- summary = f"""
349
- Blibli Authenticity Check Results:
350
- Total Images Analyzed: {total_images}
351
- Appears Authentic: {pass_count}
352
- Potentially Counterfeit: {total_images - pass_count}
353
-
354
- Detailed results saved to {output_file}
355
- """
356
-
357
- return summary, results[0]['image_url']
358
-
359
- except Exception as e:
360
- return f"Error scraping Blibli: {str(e)}", None
361
-
362
- def scrape_bukalapak(product_url, product_category):
363
- """Scrape product data from Bukalapak"""
364
- try:
365
- # Extract product ID from URL
366
- match = re.search(r'p/([^/\?]+)', product_url)
367
- if not match:
368
- return "Error: Invalid Bukalapak URL format.", None
369
-
370
- product_slug = match.group(1)
371
- api_url = f"https://api.bukalapak.com/products/{product_slug}"
372
-
373
- headers = {
374
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
375
- 'Accept': 'application/json',
376
- 'X-Requested-With': 'XMLHttpRequest',
377
- 'Referer': 'https://www.bukalapak.com/',
378
- }
379
-
380
- session = requests.Session()
381
- response = session.get(api_url, headers=headers)
382
-
383
- if response.status_code != 200:
384
- return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None
385
-
386
- product_data = response.json()
387
- images = product_data.get('data', {}).get('images', [])
388
-
389
- if not images:
390
- return "Error: No product images found.", None
391
-
392
- results = []
393
- for img_data in images[:5]:
394
- img_url = img_data.get('large_url')
395
- if img_url:
396
- classification_result, confidence = compare_with_reference(img_url, product_category)
397
- results.append({
398
- 'image_url': img_url,
399
- 'classification': classification_result,
400
- 'confidence': confidence
401
- })
402
-
403
- output_file = 'bukalapak_authenticity_check.csv'
404
- with open(output_file, 'w', newline='', encoding='utf-8') as file:
405
- writer = csv.writer(file)
406
- writer.writerow(['image_url', 'authenticity_result', 'confidence'])
407
- for result in results:
408
- writer.writerow([
409
- result['image_url'],
410
- result['classification'],
411
- f"{result['confidence']:.2%}"
412
- ])
413
-
414
- pass_count = sum(1 for r in results if r['classification'] == 'Pass')
415
- total_images = len(results)
416
- summary = f"""
417
- Bukalapak Authenticity Check Results:
418
- Total Images Analyzed: {total_images}
419
- Appears Authentic: {pass_count}
420
- Potentially Counterfeit: {total_images - pass_count}
421
-
422
- Detailed results saved to {output_file}
423
- """
424
-
425
- return summary, results[0]['image_url']
426
-
427
- except Exception as e:
428
- return f"Error scraping Bukalapak: {str(e)}", None
429
-
430
- def gradio_scrape(marketplace_choice, product_url, product_category):
431
- """Updated gradio function with direct marketplace selection"""
432
- if not product_url:
433
- return "Error: Please enter a product URL", None
434
-
435
- # Validate URL based on selected marketplace
436
- url_patterns = {
437
- 'Shopee': r'shopee\.co\.id',
438
- 'Tokopedia': r'tokopedia\.com',
439
- 'Blibli': r'blibli\.com',
440
- 'Bukalapak': r'bukalapak\.com'
441
- }
442
-
443
- if not re.search(url_patterns[marketplace_choice], product_url):
444
- return f"Error: URL doesn't match selected marketplace ({marketplace_choice}). Please check your URL.", None
445
-
446
- # Call appropriate scraping function based on marketplace
447
- scraping_functions = {
448
- 'Shopee': scrape_shopee,
449
- 'Tokopedia': scrape_tokopedia,
450
- 'Blibli': scrape_blibli,
451
- 'Bukalapak': scrape_bukalapak
452
- }
453
-
454
- result, image_url = scraping_functions[marketplace_choice](product_url, product_category)
455
-
456
- if image_url:
457
- img = Image.open(BytesIO(requests.get(image_url).content))
458
- return result, img
459
- return result, None
460
-
461
- # Get available categories from reference_images directory
462
- categories = [d for d in os.listdir(REFERENCE_IMAGES_DIR)
463
- if os.path.isdir(os.path.join(REFERENCE_IMAGES_DIR, d))]
464
-
465
- # Define marketplace choices
466
- marketplace_choices = ['Shopee', 'Tokopedia', 'Blibli', 'Bukalapak']
467
-
468
- # Update Gradio Interface
469
- interface = gr.Interface(
470
- fn=gradio_scrape,
471
- inputs=[
472
- gr.Dropdown(
473
- choices=marketplace_choices,
474
- label="Select Marketplace",
475
- value="Shopee"
476
- ),
477
- gr.Textbox(
478
- label="Product URL",
479
- placeholder="Paste your product URL here"
480
- ),
481
- gr.Dropdown(
482
- choices=categories,
483
- label="Product Category"
484
- )
485
- ],
486
- outputs=[
487
- gr.Textbox(label="Authenticity Check Results"),
488
- gr.Image(label="Product Image Sample")
489
- ],
490
- title="E-commerce Product Authenticity Checker",
491
- description="""
492
- How to use:
493
- 1. Select your marketplace (Shopee/Tokopedia/Blibli/Bukalapak)
494
- 2. Paste the product URL
495
- 3. Select the product category
496
- 4. Click submit to check authenticity
497
- """,
498
- )
499
-
500
- if __name__ == "__main__":
501
- interface.launch()
 
 
1
import base64
import csv
import datetime
import json
import os
import re
from io import BytesIO

import gradio as gr
import openai
import requests
from dotenv import load_dotenv
from openai import OpenAI
from PIL import Image
13
+
14
# Load variables from a local .env file into the process environment
# (expects OPENAI_API_KEY to be defined there or in the real environment).
load_dotenv()

# Module-level OpenAI client used by compare_with_reference().
client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))

# Directory holding authentic reference images, one subfolder per product
# category; created on startup so first run does not crash on listdir.
REFERENCE_IMAGES_DIR = 'reference_images'
os.makedirs(REFERENCE_IMAGES_DIR, exist_ok=True)
23
+
24
def load_reference_images():
    """Build a mapping of category name -> list of reference image paths.

    Each subdirectory of REFERENCE_IMAGES_DIR is treated as one product
    category; its .png/.jpg/.jpeg files are collected as that category's
    authentic reference images.
    """
    catalog = {}
    for entry in os.listdir(REFERENCE_IMAGES_DIR):
        folder = os.path.join(REFERENCE_IMAGES_DIR, entry)
        if not os.path.isdir(folder):
            continue
        catalog[entry] = [
            os.path.join(folder, name)
            for name in os.listdir(folder)
            if name.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]
    return catalog
36
+
37
def encode_image_as_data_url(path):
    """Base64-encode a local image file into a ``data:`` URL.

    The OpenAI vision API can only consume URLs (or base64 data URLs), not
    local filesystem paths, so reference images stored on disk must be
    inlined before being sent.
    """
    ext = os.path.splitext(path)[1].lower().lstrip('.')
    # .jpg files use the image/jpeg MIME type; default to png when unknown.
    mime = 'image/jpeg' if ext in ('jpg', 'jpeg') else f"image/{ext or 'png'}"
    with open(path, 'rb') as fh:
        payload = base64.b64encode(fh.read()).decode('ascii')
    return f"data:{mime};base64,{payload}"

def compare_with_reference(image_url, product_category):
    """Compare a product image with a reference image using OpenAI Vision.

    Sends the first reference image (authentic product) plus the candidate
    image to the model and asks for a Pass / Not Pass verdict.

    Args:
        image_url: Publicly reachable URL of the product image to verify.
        product_category: Category folder name under REFERENCE_IMAGES_DIR.

    Returns:
        (result, confidence): result is 'Pass', 'Not Pass', or an error
        string; confidence is 1.0 only for an exact 'Pass' verdict.
    """
    reference_images = load_reference_images().get(product_category, [])

    if not reference_images:
        return "Error: No reference images found for this category", 0

    try:
        # BUG FIX: reference_images[0] is a local path; the API cannot fetch
        # local files, so inline it as a base64 data URL instead.
        reference_url = encode_image_as_data_url(reference_images[0])

        messages = [
            {
                "role": "user",
                "content": [
                    {
                        "type": "text",
                        "text": """Compare these images and determine if the product appears to be authentic.
                        Consider:
                        1. Logo placement and quality
                        2. Product design details
                        3. Material quality appearance
                        4. Color accuracy
                        5. Overall build quality

                        The first image is the reference (authentic product).
                        The second image is the product to verify.

                        Respond with 'Pass' if it appears authentic or 'Not Pass' if it shows signs of being counterfeit.
                        """
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": reference_url}  # Using first reference image
                    },
                    {
                        "type": "image_url",
                        "image_url": {"url": image_url}
                    }
                ]
            }
        ]

        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=10
        )

        result = response.choices[0].message.content.strip()
        # Binary confidence: only an exact 'Pass' counts as authentic.
        confidence = 1.0 if result == "Pass" else 0.0

        return result, confidence

    except Exception as e:
        print(f"Error in comparison: {e}")
        return "Error", 0
91
+
92
def scrape_tokopedia(product_url, product_category):
    """Scrape product images from a Tokopedia page and run authenticity checks.

    Args:
        product_url: Full Tokopedia product URL (tokopedia.com/<shop>/<item>).
        product_category: Category used to pick reference images.

    Returns:
        (summary_text, first_image_url) on success, or (error_string, None).
    """
    try:
        # Validate the Tokopedia URL format
        match = re.search(r'tokopedia\.com/([^/]+)/([^/?]+)', product_url)
        if not match:
            return "Error: Invalid Tokopedia URL format.", None

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-platform': '"Windows"'
        }

        session = requests.Session()
        print(f"Fetching product page: {product_url}")

        # Fetch the product page directly
        response = session.get(product_url, headers=headers, timeout=10)
        response.raise_for_status()  # Raise exception for bad status codes

        print(f"Response status: {response.status_code}")

        # Multiple patterns to locate image URLs.
        # BUG FIX: the extension alternation must be non-capturing (?:...),
        # otherwise re.findall returns just the extension ('jpg') instead of
        # the full matched URL.
        image_patterns = [
            r'https://images\.tokopedia\.net/img/[^"\']+\.(?:jpg|jpeg|png)',
            r'https://[^"\']+\.tokopedia\.net/[^"\']+\.(?:jpg|jpeg|png)',
            r'"imageUrl":"(https://[^"]+)"',
            r'"url":"(https://images[^"]+)"',
            r'content="(https://images\.tokopedia\.net[^"]+)"'
        ]

        # With at most one capture group per pattern, findall always yields
        # plain URL strings, so no tuple handling is needed.
        all_images = []
        for pattern in image_patterns:
            all_images.extend(re.findall(pattern, response.text))

        # Remove duplicates
        unique_images = list(set(all_images))
        print(f"Found {len(unique_images)} unique images")

        if not unique_images:
            # Fall back to JSON-LD metadata embedded in the page
            json_ld_pattern = r'<script type="application/ld\+json">(.*?)</script>'
            json_matches = re.findall(json_ld_pattern, response.text, re.DOTALL)
            for json_str in json_matches:
                try:
                    json_data = json.loads(json_str)
                    if 'image' in json_data:
                        if isinstance(json_data['image'], list):
                            unique_images.extend(json_data['image'])
                        else:
                            unique_images.append(json_data['image'])
                except (json.JSONDecodeError, TypeError):
                    # Malformed JSON-LD block; skip it
                    continue

        if not unique_images:
            return "Error: No product images found.", None

        # Verify candidate URLs actually serve images (HEAD request only)
        valid_images = []
        for img_url in unique_images[:10]:  # Try first 10 images
            try:
                print(f"Verifying image URL: {img_url}")
                img_response = session.head(img_url, headers=headers, timeout=5)
                content_type = img_response.headers.get('content-type', '')

                if img_response.status_code == 200 and 'image' in content_type.lower():
                    valid_images.append(img_url)
                    if len(valid_images) >= 5:  # Stop after getting 5 valid images
                        break
            except Exception as e:
                print(f"Error verifying image {img_url}: {str(e)}")
                continue

        if not valid_images:
            return "Error: Could not verify any product images.", None

        results = []
        for img_url in valid_images:
            try:
                print(f"Processing image: {img_url}")
                classification_result, confidence = compare_with_reference(img_url, product_category)
                results.append({
                    'image_url': img_url,
                    'classification': classification_result,
                    'confidence': confidence
                })
            except Exception as e:
                print(f"Error processing image {img_url}: {str(e)}")
                continue

        if not results:
            return "Error: Could not process any product images.", None

        # Persist per-image results as CSV
        output_file = 'tokopedia_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            for result in results:
                writer.writerow([
                    result['image_url'],
                    result['classification'],
                    f"{result['confidence']:.2%}"
                ])

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = f"""
Tokopedia Authenticity Check Results:
Total Images Analyzed: {total_images}
Appears Authentic: {pass_count}
Potentially Counterfeit: {total_images - pass_count}

Detailed results saved to {output_file}
"""

        return summary, results[0]['image_url']

    except Exception as e:
        print(f"Error in scrape_tokopedia: {str(e)}")
        return f"Error scraping Tokopedia: {str(e)}", None
225
+
226
def scrape_shopee(product_url, product_category):
    """Scrape product images from Shopee's item API and run authenticity checks.

    Args:
        product_url: Shopee product URL containing 'i.<shopid>.<itemid>'.
        product_category: Category used to pick reference images.

    Returns:
        (summary_text, first_image_url) on success, or (error_string, None).
    """
    try:
        # Extract shop_id and item_id from URL
        match = re.search(r'i\.(\d+)\.(\d+)', product_url)
        if not match:
            return "Error: Invalid Shopee URL format.", None

        shop_id, item_id = match.groups()
        api_url = f'https://shopee.co.id/api/v4/item/get?itemid={item_id}&shopid={shop_id}'

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://shopee.co.id/',
            'AF-AC-Encoding-Version': '3',
        }

        session = requests.Session()
        # First visit the product page to pick up session cookies.
        # Timeouts prevent the Gradio callback from hanging indefinitely.
        session.get(f'https://shopee.co.id/product/{shop_id}/{item_id}', headers=headers, timeout=10)

        response = session.get(api_url, headers=headers, timeout=10)

        if response.status_code != 200:
            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None

        product_data = response.json()
        images = product_data.get('data', {}).get('images', [])

        if not images:
            return "Error: No product images found.", None

        results = []
        for img_id in images[:5]:
            # API returns image hashes; build the CDN URL from each hash.
            image_url = f"https://cf.shopee.co.id/file/{img_id}"
            classification_result, confidence = compare_with_reference(image_url, product_category)
            results.append({
                'image_url': image_url,
                'classification': classification_result,
                'confidence': confidence
            })

        # Guard before indexing results[0] below (consistent with scrape_tokopedia)
        if not results:
            return "Error: Could not process any product images.", None

        # Persist per-image results as CSV
        output_file = 'shopee_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            for result in results:
                writer.writerow([
                    result['image_url'],
                    result['classification'],
                    f"{result['confidence']:.2%}"
                ])

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = f"""
Shopee Authenticity Check Results:
Total Images Analyzed: {total_images}
Appears Authentic: {pass_count}
Potentially Counterfeit: {total_images - pass_count}

Detailed results saved to {output_file}
"""

        return summary, results[0]['image_url']

    except Exception as e:
        return f"Error scraping Shopee: {str(e)}", None
296
+
297
def scrape_blibli(product_url, product_category):
    """Scrape product images from Blibli's backend API and run authenticity checks.

    Args:
        product_url: Blibli product URL containing 'p/<product-id>'.
        product_category: Category used to pick reference images.

    Returns:
        (summary_text, first_image_url) on success, or (error_string, None).
    """
    try:
        # Extract product ID from URL
        match = re.search(r'p/([^/\?]+)', product_url)
        if not match:
            return "Error: Invalid Blibli URL format.", None

        product_id = match.group(1)
        api_url = f"https://www.blibli.com/backend/product-detail/products/{product_id}"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.blibli.com/',
        }

        session = requests.Session()
        # Timeout prevents the Gradio callback from hanging indefinitely.
        response = session.get(api_url, headers=headers, timeout=10)

        if response.status_code != 200:
            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None

        product_data = response.json()
        images = product_data.get('data', {}).get('images', [])

        if not images:
            return "Error: No product images found.", None

        results = []
        for img_url in images[:5]:
            classification_result, confidence = compare_with_reference(img_url, product_category)
            results.append({
                'image_url': img_url,
                'classification': classification_result,
                'confidence': confidence
            })

        # Guard before indexing results[0] below (consistent with scrape_tokopedia)
        if not results:
            return "Error: Could not process any product images.", None

        # Persist per-image results as CSV
        output_file = 'blibli_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            for result in results:
                writer.writerow([
                    result['image_url'],
                    result['classification'],
                    f"{result['confidence']:.2%}"
                ])

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = f"""
Blibli Authenticity Check Results:
Total Images Analyzed: {total_images}
Appears Authentic: {pass_count}
Potentially Counterfeit: {total_images - pass_count}

Detailed results saved to {output_file}
"""

        return summary, results[0]['image_url']

    except Exception as e:
        return f"Error scraping Blibli: {str(e)}", None
362
+
363
def scrape_bukalapak(product_url, product_category):
    """Scrape product images from Bukalapak's API and run authenticity checks.

    Args:
        product_url: Bukalapak product URL containing 'p/<product-slug>'.
        product_category: Category used to pick reference images.

    Returns:
        (summary_text, first_image_url) on success, or (error_string, None).
    """
    try:
        # Extract product slug from URL
        match = re.search(r'p/([^/\?]+)', product_url)
        if not match:
            return "Error: Invalid Bukalapak URL format.", None

        product_slug = match.group(1)
        api_url = f"https://api.bukalapak.com/products/{product_slug}"

        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.bukalapak.com/',
        }

        session = requests.Session()
        # Timeout prevents the Gradio callback from hanging indefinitely.
        response = session.get(api_url, headers=headers, timeout=10)

        if response.status_code != 200:
            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None

        product_data = response.json()
        images = product_data.get('data', {}).get('images', [])

        if not images:
            return "Error: No product images found.", None

        results = []
        for img_data in images[:5]:
            # Each entry is a dict; only process entries that expose a large_url.
            img_url = img_data.get('large_url')
            if img_url:
                classification_result, confidence = compare_with_reference(img_url, product_category)
                results.append({
                    'image_url': img_url,
                    'classification': classification_result,
                    'confidence': confidence
                })

        # BUG FIX: results can be empty when no image entry had a 'large_url';
        # without this guard, results[0] below raises IndexError.
        if not results:
            return "Error: Could not process any product images.", None

        # Persist per-image results as CSV
        output_file = 'bukalapak_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            for result in results:
                writer.writerow([
                    result['image_url'],
                    result['classification'],
                    f"{result['confidence']:.2%}"
                ])

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = f"""
Bukalapak Authenticity Check Results:
Total Images Analyzed: {total_images}
Appears Authentic: {pass_count}
Potentially Counterfeit: {total_images - pass_count}

Detailed results saved to {output_file}
"""

        return summary, results[0]['image_url']

    except Exception as e:
        return f"Error scraping Bukalapak: {str(e)}", None
430
+
431
def gradio_scrape(marketplace_choice, product_url, product_category):
    """Gradio callback: validate the URL, dispatch to the marketplace scraper,
    and download a preview image for the UI.

    Args:
        marketplace_choice: One of 'Shopee', 'Tokopedia', 'Blibli', 'Bukalapak'.
        product_url: Product page URL pasted by the user.
        product_category: Reference-image category selected by the user.

    Returns:
        (result_text, PIL.Image or None).
    """
    if not product_url:
        return "Error: Please enter a product URL", None

    # Validate URL based on selected marketplace
    url_patterns = {
        'Shopee': r'shopee\.co\.id',
        'Tokopedia': r'tokopedia\.com',
        'Blibli': r'blibli\.com',
        'Bukalapak': r'bukalapak\.com'
    }

    if not re.search(url_patterns[marketplace_choice], product_url):
        return f"Error: URL doesn't match selected marketplace ({marketplace_choice}). Please check your URL.", None

    # Call appropriate scraping function based on marketplace
    scraping_functions = {
        'Shopee': scrape_shopee,
        'Tokopedia': scrape_tokopedia,
        'Blibli': scrape_blibli,
        'Bukalapak': scrape_bukalapak
    }

    result, image_url = scraping_functions[marketplace_choice](product_url, product_category)

    if image_url:
        # BUG FIX: the preview download had no timeout and no error handling,
        # so a slow or broken CDN link crashed the whole Gradio callback.
        try:
            img_response = requests.get(image_url, timeout=10)
            img_response.raise_for_status()
            img = Image.open(BytesIO(img_response.content))
            return result, img
        except Exception as e:
            print(f"Error downloading preview image {image_url}: {e}")
            return result, None
    return result, None
461
+
462
# Product categories come from the subdirectories of the reference-image
# folder; each subdirectory name becomes a dropdown choice.
categories = [d for d in os.listdir(REFERENCE_IMAGES_DIR)
              if os.path.isdir(os.path.join(REFERENCE_IMAGES_DIR, d))]

# Marketplaces supported by gradio_scrape(); keys must match its dispatch table.
marketplace_choices = ['Shopee', 'Tokopedia', 'Blibli', 'Bukalapak']

# Gradio UI: (marketplace, URL, category) in -> (summary text, sample image) out.
interface = gr.Interface(
    fn=gradio_scrape,
    inputs=[
        gr.Dropdown(
            choices=marketplace_choices,
            label="Select Marketplace",
            value="Shopee"
        ),
        gr.Textbox(
            label="Product URL",
            placeholder="Paste your product URL here"
        ),
        gr.Dropdown(
            choices=categories,
            label="Product Category"
        )
    ],
    outputs=[
        gr.Textbox(label="Authenticity Check Results"),
        gr.Image(label="Product Image Sample")
    ],
    title="E-commerce Product Authenticity Checker",
    description="""
    How to use:
    1. Select your marketplace (Shopee/Tokopedia/Blibli/Bukalapak)
    2. Paste the product URL
    3. Select the product category
    4. Click submit to check authenticity
    """,
)

# Launch the web UI only when run as a script (not on import).
if __name__ == "__main__":
    interface.launch()