Spaces:
Sleeping
Sleeping
| import requests | |
| import re | |
| import csv | |
| import datetime | |
| import gradio as gr | |
| import os | |
| import openai | |
| from openai import OpenAI | |
| from PIL import Image | |
| from io import BytesIO | |
| from dotenv import load_dotenv | |
| import json | |
# Read variables from a .env file into the process environment.
load_dotenv()

# Shared OpenAI client; the key comes from the OPENAI_API_KEY environment variable.
client = OpenAI(api_key=os.environ.get('OPENAI_API_KEY'))

# Root folder holding the reference images. It is created at import time so
# that later directory listings do not fail on a fresh checkout.
REFERENCE_IMAGES_DIR = 'reference_images'
os.makedirs(REFERENCE_IMAGES_DIR, exist_ok=True)
def load_reference_images(directory=None):
    """Collect reference image paths grouped by category.

    Every sub-directory of the reference directory is treated as a category.
    Files inside it whose names end in .png, .jpg or .jpeg (case-insensitive)
    are that category's reference images.

    Args:
        directory: Folder to scan. Defaults to ``REFERENCE_IMAGES_DIR``.

    Returns:
        A dict mapping each category name to a sorted list of image paths.
        Categories without any image map to an empty list. A missing
        directory yields an empty dict.
    """
    root = REFERENCE_IMAGES_DIR if directory is None else directory
    if not os.path.isdir(root):
        return {}

    reference_data = {}
    # os.listdir returns entries in arbitrary order; sorting keeps "the first
    # reference image" of a category stable between runs and machines.
    for category in sorted(os.listdir(root)):
        category_path = os.path.join(root, category)
        if not os.path.isdir(category_path):
            continue
        reference_data[category] = [
            os.path.join(category_path, img_file)
            for img_file in sorted(os.listdir(category_path))
            if img_file.lower().endswith(('.png', '.jpg', '.jpeg'))
        ]
    return reference_data
def _image_reference_for_api(image_ref):
    """Return an image reference in a form that can be sent as an ``image_url``.

    http(s) and data: URLs are passed through unchanged. Anything else is
    treated as a local file path and inlined as a base64 data URL.
    """
    import base64
    import mimetypes

    if image_ref.lower().startswith(('http://', 'https://', 'data:')):
        return image_ref
    mime_type = mimetypes.guess_type(image_ref)[0] or 'image/jpeg'
    with open(image_ref, 'rb') as image_file:
        encoded = base64.b64encode(image_file.read()).decode('ascii')
    return f"data:{mime_type};base64,{encoded}"


def _normalize_verdict(raw_answer):
    """Map the model's free-text answer onto 'Pass' / 'Not Pass' when possible."""
    stripped = raw_answer.strip()
    cleaned = stripped.strip('\'"`*.! ').lower()
    # Check the negative form first: 'not pass' must not be read as 'pass'.
    if cleaned.startswith('not pass'):
        return 'Not Pass'
    if cleaned.startswith('pass'):
        return 'Pass'
    return stripped


def compare_with_reference(image_url, product_category):
    """Compare a product image with a reference image using OpenAI Vision.

    The first reference image of ``product_category`` (as returned by
    ``load_reference_images``) is sent together with ``image_url`` and the
    model is asked to answer 'Pass' or 'Not Pass'.

    Args:
        image_url: URL of the product image to verify.
        product_category: Name of the reference category to compare against.

    Returns:
        A ``(result, confidence)`` tuple. ``result`` is 'Pass', 'Not Pass',
        the model's raw answer when it matches neither, or an error string.
        ``confidence`` is 1.0 for 'Pass' and 0 / 0.0 otherwise; it is a flag
        derived from the verdict, not a probability.
    """
    reference_images = load_reference_images().get(product_category, [])
    if not reference_images:
        return "Error: No reference images found for this category", 0
    try:
        # Reference images are local files, so they cannot be sent as plain
        # paths; they are inlined as data URLs instead.
        reference_url = _image_reference_for_api(reference_images[0])
        prompt = (
            "Compare these images and determine if the product appears to be authentic.\n"
            "Consider:\n"
            "1. Logo placement and quality\n"
            "2. Product design details\n"
            "3. Material quality appearance\n"
            "4. Color accuracy\n"
            "5. Overall build quality\n"
            "The first image is the reference (authentic product).\n"
            "The second image is the product to verify.\n"
            "Respond with 'Pass' if it appears authentic or 'Not Pass' if it shows signs of being counterfeit.\n"
        )
        messages = [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": prompt},
                    # Order matters: the prompt calls the first image the reference.
                    {"type": "image_url", "image_url": {"url": reference_url}},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ]
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=messages,
            max_tokens=10,
        )
        answer = response.choices[0].message.content
        if not answer:
            print("Error in comparison: empty response from model")
            return "Error", 0
        # Downstream code counts results that equal 'Pass' exactly, so
        # variants such as 'Pass.' or 'pass' are normalised here.
        result = _normalize_verdict(answer)
        confidence = 1.0 if result == "Pass" else 0.0
        return result, confidence
    except Exception as e:
        # Best-effort: any API or file failure is reported as an "Error" result.
        print(f"Error in comparison: {e}")
        return "Error", 0
def _extract_tokopedia_image_urls(html):
    """Pull candidate product-image URLs out of a Tokopedia product page.

    The regex patterns are tried first. Only when none of them matches are the
    page's JSON-LD ``<script>`` blocks consulted. The result keeps first-seen
    order with duplicates removed.
    """
    # The extension alternatives must be non-capturing: with a single
    # capturing group re.findall returns only that group (e.g. 'jpg'), not
    # the URL.
    image_patterns = [
        r'https://images\.tokopedia\.net/img/[^"\']+\.(?:jpg|jpeg|png)',
        r'https://[^"\']+\.tokopedia\.net/[^"\']+\.(?:jpg|jpeg|png)',
        r'"imageUrl":"(https://[^"]+)"',
        r'"url":"(https://images[^"]+)"',
        r'content="(https://images\.tokopedia\.net[^"]+)"',
    ]
    candidates = []
    for pattern in image_patterns:
        candidates.extend(re.findall(pattern, html))

    if not candidates:
        # Fallback: look for an "image" field in JSON-LD metadata.
        json_ld_pattern = r'<script type="application/ld\+json">(.*?)</script>'
        for json_str in re.findall(json_ld_pattern, html, re.DOTALL):
            try:
                json_data = json.loads(json_str)
            except ValueError:
                continue  # malformed JSON-LD block; try the next one
            if not isinstance(json_data, dict) or 'image' not in json_data:
                continue
            image_field = json_data['image']
            entries = image_field if isinstance(image_field, list) else [image_field]
            candidates.extend(entry for entry in entries if isinstance(entry, str))

    # dict.fromkeys de-duplicates while preserving order (set() would not).
    return list(dict.fromkeys(candidates))


def scrape_tokopedia(product_url, product_category):
    """Scrape product images from a Tokopedia page and check their authenticity.

    Fetches the product page, extracts image URLs, keeps up to 5 of the first
    10 candidates that answer a HEAD request with status 200 and an image
    content type, classifies each with ``compare_with_reference`` and writes
    the per-image results to ``tokopedia_authenticity_check.csv``.

    Args:
        product_url: Tokopedia product URL (``tokopedia.com/<shop>/<product>``).
        product_category: Reference category passed to ``compare_with_reference``.

    Returns:
        ``(summary, first_image_url)`` on success, or ``(error_message, None)``
        on failure. Exceptions are caught and reported in the message.
    """
    try:
        # Validate the Tokopedia URL shape before doing any network work.
        if not re.search(r'tokopedia\.com/([^/]+)/([^/?]+)', product_url):
            return "Error: Invalid Tokopedia URL format.", None

        # NOTE(review): 'br' in Accept-Encoding is only decodable when a
        # brotli package is installed alongside requests; confirm for the
        # deployment.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.9',
            'Accept-Encoding': 'gzip, deflate, br',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
            'sec-ch-ua-platform': '"Windows"',
        }

        # The context manager releases pooled connections on every exit path.
        with requests.Session() as session:
            print(f"Fetching product page: {product_url}")
            # Access the product page directly.
            response = session.get(product_url, headers=headers, timeout=10)
            response.raise_for_status()  # Raise exception for bad status codes
            print(f"Response status: {response.status_code}")

            unique_images = _extract_tokopedia_image_urls(response.text)
            print(f"Found {len(unique_images)} unique images")
            if not unique_images:
                return "Error: No product images found.", None

            # Keep only URLs that really serve an image.
            valid_images = []
            for img_url in unique_images[:10]:  # Try first 10 images
                try:
                    print(f"Verifying image URL: {img_url}")
                    img_response = session.head(img_url, headers=headers, timeout=5)
                    content_type = img_response.headers.get('content-type', '')
                    if img_response.status_code == 200 and 'image' in content_type.lower():
                        valid_images.append(img_url)
                        if len(valid_images) >= 5:  # Stop after getting 5 valid images
                            break
                except Exception as e:
                    print(f"Error verifying image {img_url}: {str(e)}")
                    continue

        if not valid_images:
            return "Error: Could not verify any product images.", None

        results = []
        for img_url in valid_images:
            try:
                print(f"Processing image: {img_url}")
                classification_result, confidence = compare_with_reference(img_url, product_category)
                results.append({
                    'image_url': img_url,
                    'classification': classification_result,
                    'confidence': confidence,
                })
            except Exception as e:
                print(f"Error processing image {img_url}: {str(e)}")
                continue
        if not results:
            return "Error: Could not process any product images.", None

        output_file = 'tokopedia_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            writer.writerows(
                [r['image_url'], r['classification'], f"{r['confidence']:.2%}"]
                for r in results
            )

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = (
            "\n"
            "Tokopedia Authenticity Check Results:\n"
            f"Total Images Analyzed: {total_images}\n"
            f"Appears Authentic: {pass_count}\n"
            f"Potentially Counterfeit: {total_images - pass_count}\n"
            f"Detailed results saved to {output_file}\n"
        )
        return summary, results[0]['image_url']
    except Exception as e:
        print(f"Error in scrape_tokopedia: {str(e)}")
        return f"Error scraping Tokopedia: {str(e)}", None
def scrape_shopee(product_url, product_category):
    """Fetch product images from Shopee and check their authenticity.

    Extracts the shop and item ids from the URL (``i.<shop_id>.<item_id>``),
    queries the item API, classifies up to 5 images with
    ``compare_with_reference`` and writes the per-image results to
    ``shopee_authenticity_check.csv``.

    Args:
        product_url: Shopee product URL containing ``i.<shop_id>.<item_id>``.
        product_category: Reference category passed to ``compare_with_reference``.

    Returns:
        ``(summary, first_image_url)`` on success, or ``(error_message, None)``
        on failure. Exceptions are caught and reported in the message.
    """
    request_timeout = 10  # seconds; without it a stalled server blocks the UI forever
    try:
        # Extract shop_id and item_id from URL
        match = re.search(r'i\.(\d+)\.(\d+)', product_url)
        if not match:
            return "Error: Invalid Shopee URL format.", None
        shop_id, item_id = match.groups()
        api_url = f'https://shopee.co.id/api/v4/item/get?itemid={item_id}&shopid={shop_id}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://shopee.co.id/',
            'AF-AC-Encoding-Version': '3',
        }

        # The context manager releases pooled connections on every exit path.
        with requests.Session() as session:
            # First visit the product page so the session picks up cookies.
            session.get(
                f'https://shopee.co.id/product/{shop_id}/{item_id}',
                headers=headers,
                timeout=request_timeout,
            )
            response = session.get(api_url, headers=headers, timeout=request_timeout)

        if response.status_code != 200:
            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None

        product_data = response.json()
        # "data" may be present but null; treat that like a missing key.
        images = (product_data.get('data') or {}).get('images') or []
        if not images:
            return "Error: No product images found.", None

        results = []
        for img_id in images[:5]:
            image_url = f"https://cf.shopee.co.id/file/{img_id}"
            classification_result, confidence = compare_with_reference(image_url, product_category)
            results.append({
                'image_url': image_url,
                'classification': classification_result,
                'confidence': confidence,
            })

        output_file = 'shopee_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            writer.writerows(
                [r['image_url'], r['classification'], f"{r['confidence']:.2%}"]
                for r in results
            )

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = (
            "\n"
            "Shopee Authenticity Check Results:\n"
            f"Total Images Analyzed: {total_images}\n"
            f"Appears Authentic: {pass_count}\n"
            f"Potentially Counterfeit: {total_images - pass_count}\n"
            f"Detailed results saved to {output_file}\n"
        )
        return summary, results[0]['image_url']
    except Exception as e:
        return f"Error scraping Shopee: {str(e)}", None
def scrape_blibli(product_url, product_category):
    """Fetch product images from Blibli and check their authenticity.

    Takes the path segment following ``p/`` in the URL as the product id,
    queries the product-detail endpoint, classifies up to 5 images with
    ``compare_with_reference`` and writes the per-image results to
    ``blibli_authenticity_check.csv``.

    Args:
        product_url: Blibli product URL containing ``p/<id>``.
        product_category: Reference category passed to ``compare_with_reference``.

    Returns:
        ``(summary, first_image_url)`` on success, or ``(error_message, None)``
        on failure. Exceptions are caught and reported in the message.
    """
    request_timeout = 10  # seconds; without it a stalled server blocks the UI forever
    try:
        # Extract product ID from URL
        match = re.search(r'p/([^/\?]+)', product_url)
        if not match:
            return "Error: Invalid Blibli URL format.", None
        product_id = match.group(1)
        api_url = f"https://www.blibli.com/backend/product-detail/products/{product_id}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.blibli.com/',
        }

        # The context manager releases pooled connections on every exit path.
        with requests.Session() as session:
            response = session.get(api_url, headers=headers, timeout=request_timeout)

        if response.status_code != 200:
            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None

        product_data = response.json()
        # "data" may be present but null; treat that like a missing key.
        images = (product_data.get('data') or {}).get('images') or []
        if not images:
            return "Error: No product images found.", None

        results = []
        for img_url in images[:5]:
            classification_result, confidence = compare_with_reference(img_url, product_category)
            results.append({
                'image_url': img_url,
                'classification': classification_result,
                'confidence': confidence,
            })

        output_file = 'blibli_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            writer.writerows(
                [r['image_url'], r['classification'], f"{r['confidence']:.2%}"]
                for r in results
            )

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = (
            "\n"
            "Blibli Authenticity Check Results:\n"
            f"Total Images Analyzed: {total_images}\n"
            f"Appears Authentic: {pass_count}\n"
            f"Potentially Counterfeit: {total_images - pass_count}\n"
            f"Detailed results saved to {output_file}\n"
        )
        return summary, results[0]['image_url']
    except Exception as e:
        return f"Error scraping Blibli: {str(e)}", None
def scrape_bukalapak(product_url, product_category):
    """Fetch product images from Bukalapak and check their authenticity.

    Takes the path segment following ``p/`` in the URL as the product slug,
    queries the products API, classifies up to 5 images (using each entry's
    ``large_url``) with ``compare_with_reference`` and writes the per-image
    results to ``bukalapak_authenticity_check.csv``.

    Args:
        product_url: Bukalapak product URL containing ``p/<slug>``.
        product_category: Reference category passed to ``compare_with_reference``.

    Returns:
        ``(summary, first_image_url)`` on success, or ``(error_message, None)``
        on failure. Exceptions are caught and reported in the message.
    """
    request_timeout = 10  # seconds; without it a stalled server blocks the UI forever
    try:
        # Extract product slug from URL
        match = re.search(r'p/([^/\?]+)', product_url)
        if not match:
            return "Error: Invalid Bukalapak URL format.", None
        product_slug = match.group(1)
        api_url = f"https://api.bukalapak.com/products/{product_slug}"
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'application/json',
            'X-Requested-With': 'XMLHttpRequest',
            'Referer': 'https://www.bukalapak.com/',
        }

        # The context manager releases pooled connections on every exit path.
        with requests.Session() as session:
            response = session.get(api_url, headers=headers, timeout=request_timeout)

        if response.status_code != 200:
            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None

        product_data = response.json()
        # "data" may be present but null; treat that like a missing key.
        images = (product_data.get('data') or {}).get('images') or []
        if not images:
            return "Error: No product images found.", None

        results = []
        for img_data in images[:5]:
            # Entries without a usable 'large_url' are skipped.
            img_url = img_data.get('large_url') if isinstance(img_data, dict) else None
            if not img_url:
                continue
            classification_result, confidence = compare_with_reference(img_url, product_category)
            results.append({
                'image_url': img_url,
                'classification': classification_result,
                'confidence': confidence,
            })

        # Without this guard results[0] below raises IndexError when no entry
        # carried a 'large_url'.
        if not results:
            return "Error: Could not process any product images.", None

        output_file = 'bukalapak_authenticity_check.csv'
        with open(output_file, 'w', newline='', encoding='utf-8') as file:
            writer = csv.writer(file)
            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
            writer.writerows(
                [r['image_url'], r['classification'], f"{r['confidence']:.2%}"]
                for r in results
            )

        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
        total_images = len(results)
        summary = (
            "\n"
            "Bukalapak Authenticity Check Results:\n"
            f"Total Images Analyzed: {total_images}\n"
            f"Appears Authentic: {pass_count}\n"
            f"Potentially Counterfeit: {total_images - pass_count}\n"
            f"Detailed results saved to {output_file}\n"
        )
        return summary, results[0]['image_url']
    except Exception as e:
        return f"Error scraping Bukalapak: {str(e)}", None
def gradio_scrape(marketplace_choice, product_url, product_category):
    """Gradio callback: validate the input, run the matching scraper, load a preview.

    Args:
        marketplace_choice: One of 'Shopee', 'Tokopedia', 'Blibli', 'Bukalapak'.
        product_url: Product page URL; must belong to the selected marketplace.
        product_category: Reference category forwarded to the scraper.

    Returns:
        ``(text, image)`` where ``text`` is the scraper's summary or an error
        message and ``image`` is a PIL image of the first analysed product
        picture, or ``None`` when there is none or it could not be loaded.
    """
    if not product_url or not product_url.strip():
        return "Error: Please enter a product URL", None

    # Validate URL based on selected marketplace
    url_patterns = {
        'Shopee': r'shopee\.co\.id',
        'Tokopedia': r'tokopedia\.com',
        'Blibli': r'blibli\.com',
        'Bukalapak': r'bukalapak\.com',
    }
    expected_host = url_patterns.get(marketplace_choice)
    if expected_host is None:
        # Covers a cleared dropdown (None) or an unexpected value.
        return f"Error: Unsupported marketplace ({marketplace_choice}).", None
    if not re.search(expected_host, product_url):
        return f"Error: URL doesn't match selected marketplace ({marketplace_choice}). Please check your URL.", None

    # Call appropriate scraping function based on marketplace
    scraping_functions = {
        'Shopee': scrape_shopee,
        'Tokopedia': scrape_tokopedia,
        'Blibli': scrape_blibli,
        'Bukalapak': scrape_bukalapak,
    }
    result, image_url = scraping_functions[marketplace_choice](product_url, product_category)
    if not image_url:
        return result, None

    # The preview is optional: a failed download must not discard the report.
    try:
        image_response = requests.get(image_url, timeout=10)
        image_response.raise_for_status()
        img = Image.open(BytesIO(image_response.content))
    except Exception as e:
        print(f"Error loading preview image {image_url}: {e}")
        return result, None
    return result, img
# Categories offered in the UI: every sub-directory of the reference folder.
categories = list(filter(
    lambda name: os.path.isdir(os.path.join(REFERENCE_IMAGES_DIR, name)),
    os.listdir(REFERENCE_IMAGES_DIR),
))

# Marketplaces selectable in the UI.
marketplace_choices = ['Shopee', 'Tokopedia', 'Blibli', 'Bukalapak']
def _build_interface():
    """Assemble the Gradio UI: marketplace, URL and category in; report and sample image out."""
    input_components = [
        gr.Dropdown(
            label="Select Marketplace",
            choices=marketplace_choices,
            value="Shopee",
        ),
        gr.Textbox(
            label="Product URL",
            placeholder="Paste your product URL here",
        ),
        gr.Dropdown(
            label="Product Category",
            choices=categories,
        ),
    ]
    output_components = [
        gr.Textbox(label="Authenticity Check Results"),
        gr.Image(label="Product Image Sample"),
    ]
    usage_notes = (
        "\n"
        "How to use:\n"
        "1. Select your marketplace (Shopee/Tokopedia/Blibli/Bukalapak)\n"
        "2. Paste the product URL\n"
        "3. Select the product category\n"
        "4. Click submit to check authenticity\n"
    )
    return gr.Interface(
        fn=gradio_scrape,
        inputs=input_components,
        outputs=output_components,
        title="E-commerce Product Authenticity Checker",
        description=usage_notes,
    )


# Built at import time so the module exposes ``interface``.
interface = _build_interface()

if __name__ == "__main__":
    interface.launch()