Spaces:

dramp77
/

classification-test

Sleeping

App Files Files Community

dramp77 committed on Nov 21, 2024

Commit

8b114cb

1 Parent(s): 8fdd88f

push code

Browse files

Files changed (8) hide show

.env +1 -0
app.py +501 -0
cek.py +61 -0
classification-test/.gitattributes +35 -0
classification-test/README.md +13 -0
data.csv +1 -0
shopee_reviews.csv +1 -0
tokopedia_authenticity_check.csv +3 -0

.env ADDED Viewed

	@@ -0,0 +1 @@


1	+ OPENAI_API_KEY = "<REDACTED - secret removed; revoke this key and provide it through the Space's secret settings instead of committing .env>"

app.py ADDED Viewed

	@@ -0,0 +1,501 @@

+import requests
+import re
+import csv
+import datetime
+import gradio as gr
+import os
+from openai import OpenAI
+from PIL import Image
+from io import BytesIO
+from dotenv import load_dotenv
+import json
+# Load environment variables (e.g. from a local .env file) before reading them below.
+load_dotenv()
+# Initialize the OpenAI client with the key read from the OPENAI_API_KEY variable.
+# NOTE(review): the key must come from the environment / secret store and must
+# never be committed to the repository.
+client = OpenAI(api_key=os.getenv('OPENAI_API_KEY'))
+# Directory holding the reference images; each sub-directory is treated as one
+# product category by load_reference_images().
+REFERENCE_IMAGES_DIR = 'reference_images'
+# Create the directory at import time so later os.listdir() calls do not fail.
+os.makedirs(REFERENCE_IMAGES_DIR, exist_ok=True)
+def load_reference_images():
+    """Load all reference images from the reference directory"""
+    reference_data = {}
+    for category in os.listdir(REFERENCE_IMAGES_DIR):
+        category_path = os.path.join(REFERENCE_IMAGES_DIR, category)
+        if os.path.isdir(category_path):
+            reference_data[category] = []
+            for img_file in os.listdir(category_path):
+                if img_file.lower().endswith(('.png', '.jpg', '.jpeg')):
+                    img_path = os.path.join(category_path, img_file)
+                    reference_data[category].append(img_path)
+    return reference_data
+def compare_with_reference(image_url, product_category):
+    """Compare product image with reference images using OpenAI Vision"""
+    reference_images = load_reference_images().get(product_category, [])
+    if not reference_images:
+        return "Error: No reference images found for this category", 0
+    try:
+        messages = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "text",
+                        "text": """Compare these images and determine if the product appears to be authentic.
+                        Consider:
+                        1. Logo placement and quality
+                        2. Product design details
+                        3. Material quality appearance
+                        4. Color accuracy
+                        5. Overall build quality
+                        The first image is the reference (authentic product).
+                        The second image is the product to verify.
+                        Respond with 'Pass' if it appears authentic or 'Not Pass' if it shows signs of being counterfeit.
+                        """
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": reference_images[0]}  # Using first reference image
+                    },
+                    {
+                        "type": "image_url",
+                        "image_url": {"url": image_url}
+                    }
+                ]
+            }
+        ]
+        response = client.chat.completions.create(
+            model="gpt-4o-mini",
+            messages=messages,
+            max_tokens=10
+        )
+        result = response.choices[0].message.content.strip()
+        confidence = 1.0 if result == "Pass" else 0.0
+        return result, confidence
+    except Exception as e:
+        print(f"Error in comparison: {e}")
+        return "Error", 0
+def scrape_tokopedia(product_url, product_category):
+    """Scrape product data from Tokopedia"""
+    try:
+        # Validasi URL Tokopedia
+        match = re.search(r'tokopedia\.com/([^/]+)/([^/?]+)', product_url)
+        if not match:
+            return "Error: Invalid Tokopedia URL format.", None
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
+            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
+            'Accept-Language': 'en-US,en;q=0.9',
+            'Accept-Encoding': 'gzip, deflate, br',
+            'Connection': 'keep-alive',
+            'Upgrade-Insecure-Requests': '1',
+            'sec-ch-ua': '"Not_A Brand";v="8", "Chromium";v="120", "Google Chrome";v="120"',
+            'sec-ch-ua-platform': '"Windows"'
+        }
+        session = requests.Session()
+        print(f"Fetching product page: {product_url}")
+        # Langsung mengakses halaman produk
+        response = session.get(product_url, headers=headers, timeout=10)
+        response.raise_for_status()  # Raise exception for bad status codes
+        print(f"Response status: {response.status_code}")
+        # Multiple patterns untuk mencari URL gambar
+        image_patterns = [
+            r'https://images\.tokopedia\.net/img/[^"\']+\.(jpg|jpeg|png)',
+            r'https://[^"\']+\.tokopedia\.net/[^"\']+\.(jpg|jpeg|png)',
+            r'"imageUrl":"(https://[^"]+)"',
+            r'"url":"(https://images[^"]+)"',
+            r'content="(https://images\.tokopedia\.net[^"]+)"'
+        ]
+        all_images = []
+        for pattern in image_patterns:
+            matches = re.findall(pattern, response.text)
+            if matches:
+                if isinstance(matches[0], tuple):
+                    # If the pattern contains groups, take the full match
+                    images = [m[0] if isinstance(m, tuple) else m for m in matches]
+                else:
+                    images = matches
+                all_images.extend(images)
+        # Remove duplicates and clean URLs
+        unique_images = list(set(all_images))
+        print(f"Found {len(unique_images)} unique images")
+        if not unique_images:
+            # Try to extract from JSON-LD
+            json_ld_pattern = r'<script type="application/ld\+json">(.*?)</script>'
+            json_matches = re.findall(json_ld_pattern, response.text, re.DOTALL)
+            for json_str in json_matches:
+                try:
+                    json_data = json.loads(json_str)
+                    if 'image' in json_data:
+                        if isinstance(json_data['image'], list):
+                            unique_images.extend(json_data['image'])
+                        else:
+                            unique_images.append(json_data['image'])
+                except:
+                    continue
+        if not unique_images:
+            return "Error: No product images found.", None
+        # Filter and verify images
+        valid_images = []
+        for img_url in unique_images[:10]:  # Try first 10 images
+            try:
+                print(f"Verifying image URL: {img_url}")
+                img_response = session.head(img_url, headers=headers, timeout=5)
+                content_type = img_response.headers.get('content-type', '')
+                if img_response.status_code == 200 and 'image' in content_type.lower():
+                    valid_images.append(img_url)
+                    if len(valid_images) >= 5:  # Stop after getting 5 valid images
+                        break
+            except Exception as e:
+                print(f"Error verifying image {img_url}: {str(e)}")
+                continue
+        if not valid_images:
+            return "Error: Could not verify any product images.", None
+        results = []
+        for img_url in valid_images:
+            try:
+                print(f"Processing image: {img_url}")
+                classification_result, confidence = compare_with_reference(img_url, product_category)
+                results.append({
+                    'image_url': img_url,
+                    'classification': classification_result,
+                    'confidence': confidence
+                })
+            except Exception as e:
+                print(f"Error processing image {img_url}: {str(e)}")
+                continue
+        if not results:
+            return "Error: Could not process any product images.", None
+        output_file = 'tokopedia_authenticity_check.csv'
+        with open(output_file, 'w', newline='', encoding='utf-8') as file:
+            writer = csv.writer(file)
+            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
+            for result in results:
+                writer.writerow([
+                    result['image_url'],
+                    result['classification'],
+                    f"{result['confidence']:.2%}"
+                ])
+        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
+        total_images = len(results)
+        summary = f"""
+        Tokopedia Authenticity Check Results:
+        Total Images Analyzed: {total_images}
+        Appears Authentic: {pass_count}
+        Potentially Counterfeit: {total_images - pass_count}
+        Detailed results saved to {output_file}
+        """
+        return summary, results[0]['image_url']
+    except Exception as e:
+        print(f"Error in scrape_tokopedia: {str(e)}")
+        return f"Error scraping Tokopedia: {str(e)}", None
+def scrape_shopee(product_url, product_category):
+    """Scrape product data from Shopee"""
+    try:
+        # Extract shop_id and item_id from URL
+        match = re.search(r'i\.(\d+)\.(\d+)', product_url)
+        if not match:
+            return "Error: Invalid Shopee URL format.", None
+        shop_id, item_id = match.groups()
+        api_url = f'https://shopee.co.id/api/v4/item/get?itemid={item_id}&shopid={shop_id}'
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Accept': 'application/json',
+            'X-Requested-With': 'XMLHttpRequest',
+            'Referer': 'https://shopee.co.id/',
+            'AF-AC-Encoding-Version': '3',
+        }
+        session = requests.Session()
+        # First visit the main page to get cookies
+        session.get(f'https://shopee.co.id/product/{shop_id}/{item_id}', headers=headers)
+        response = session.get(api_url, headers=headers)
+        if response.status_code != 200:
+            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None
+        product_data = response.json()
+        images = product_data.get('data', {}).get('images', [])
+        if not images:
+            return "Error: No product images found.", None
+        results = []
+        for img_id in images[:5]:
+            image_url = f"https://cf.shopee.co.id/file/{img_id}"
+            classification_result, confidence = compare_with_reference(image_url, product_category)
+            results.append({
+                'image_url': image_url,
+                'classification': classification_result,
+                'confidence': confidence
+            })
+        output_file = 'shopee_authenticity_check.csv'
+        with open(output_file, 'w', newline='', encoding='utf-8') as file:
+            writer = csv.writer(file)
+            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
+            for result in results:
+                writer.writerow([
+                    result['image_url'],
+                    result['classification'],
+                    f"{result['confidence']:.2%}"
+                ])
+        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
+        total_images = len(results)
+        summary = f"""
+        Shopee Authenticity Check Results:
+        Total Images Analyzed: {total_images}
+        Appears Authentic: {pass_count}
+        Potentially Counterfeit: {total_images - pass_count}
+        Detailed results saved to {output_file}
+        """
+        return summary, results[0]['image_url']
+    except Exception as e:
+        return f"Error scraping Shopee: {str(e)}", None
+def scrape_blibli(product_url, product_category):
+    """Scrape product data from Blibli"""
+    try:
+        # Extract product ID from URL
+        match = re.search(r'p/([^/\?]+)', product_url)
+        if not match:
+            return "Error: Invalid Blibli URL format.", None
+        product_id = match.group(1)
+        api_url = f"https://www.blibli.com/backend/product-detail/products/{product_id}"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Accept': 'application/json',
+            'X-Requested-With': 'XMLHttpRequest',
+            'Referer': 'https://www.blibli.com/',
+        }
+        session = requests.Session()
+        response = session.get(api_url, headers=headers)
+        if response.status_code != 200:
+            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None
+        product_data = response.json()
+        images = product_data.get('data', {}).get('images', [])
+        if not images:
+            return "Error: No product images found.", None
+        results = []
+        for img_url in images[:5]:
+            classification_result, confidence = compare_with_reference(img_url, product_category)
+            results.append({
+                'image_url': img_url,
+                'classification': classification_result,
+                'confidence': confidence
+            })
+        output_file = 'blibli_authenticity_check.csv'
+        with open(output_file, 'w', newline='', encoding='utf-8') as file:
+            writer = csv.writer(file)
+            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
+            for result in results:
+                writer.writerow([
+                    result['image_url'],
+                    result['classification'],
+                    f"{result['confidence']:.2%}"
+                ])
+        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
+        total_images = len(results)
+        summary = f"""
+        Blibli Authenticity Check Results:
+        Total Images Analyzed: {total_images}
+        Appears Authentic: {pass_count}
+        Potentially Counterfeit: {total_images - pass_count}
+        Detailed results saved to {output_file}
+        """
+        return summary, results[0]['image_url']
+    except Exception as e:
+        return f"Error scraping Blibli: {str(e)}", None
+def scrape_bukalapak(product_url, product_category):
+    """Scrape product data from Bukalapak"""
+    try:
+        # Extract product ID from URL
+        match = re.search(r'p/([^/\?]+)', product_url)
+        if not match:
+            return "Error: Invalid Bukalapak URL format.", None
+        product_slug = match.group(1)
+        api_url = f"https://api.bukalapak.com/products/{product_slug}"
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
+            'Accept': 'application/json',
+            'X-Requested-With': 'XMLHttpRequest',
+            'Referer': 'https://www.bukalapak.com/',
+        }
+        session = requests.Session()
+        response = session.get(api_url, headers=headers)
+        if response.status_code != 200:
+            return f"Error: Failed to fetch product data (HTTP {response.status_code}).", None
+        product_data = response.json()
+        images = product_data.get('data', {}).get('images', [])
+        if not images:
+            return "Error: No product images found.", None
+        results = []
+        for img_data in images[:5]:
+            img_url = img_data.get('large_url')
+            if img_url:
+                classification_result, confidence = compare_with_reference(img_url, product_category)
+                results.append({
+                    'image_url': img_url,
+                    'classification': classification_result,
+                    'confidence': confidence
+                })
+        output_file = 'bukalapak_authenticity_check.csv'
+        with open(output_file, 'w', newline='', encoding='utf-8') as file:
+            writer = csv.writer(file)
+            writer.writerow(['image_url', 'authenticity_result', 'confidence'])
+            for result in results:
+                writer.writerow([
+                    result['image_url'],
+                    result['classification'],
+                    f"{result['confidence']:.2%}"
+                ])
+        pass_count = sum(1 for r in results if r['classification'] == 'Pass')
+        total_images = len(results)
+        summary = f"""
+        Bukalapak Authenticity Check Results:
+        Total Images Analyzed: {total_images}
+        Appears Authentic: {pass_count}
+        Potentially Counterfeit: {total_images - pass_count}
+        Detailed results saved to {output_file}
+        """
+        return summary, results[0]['image_url']
+    except Exception as e:
+        return f"Error scraping Bukalapak: {str(e)}", None
+def gradio_scrape(marketplace_choice, product_url, product_category):
+    """Updated gradio function with direct marketplace selection"""
+    if not product_url:
+        return "Error: Please enter a product URL", None
+    # Validate URL based on selected marketplace
+    url_patterns = {
+        'Shopee': r'shopee\.co\.id',
+        'Tokopedia': r'tokopedia\.com',
+        'Blibli': r'blibli\.com',
+        'Bukalapak': r'bukalapak\.com'
+    }
+    if not re.search(url_patterns[marketplace_choice], product_url):
+        return f"Error: URL doesn't match selected marketplace ({marketplace_choice}). Please check your URL.", None
+    # Call appropriate scraping function based on marketplace
+    scraping_functions = {
+        'Shopee': scrape_shopee,
+        'Tokopedia': scrape_tokopedia,
+        'Blibli': scrape_blibli,
+        'Bukalapak': scrape_bukalapak
+    }
+    result, image_url = scraping_functions[marketplace_choice](product_url, product_category)
+    if image_url:
+        img = Image.open(BytesIO(requests.get(image_url).content))
+        return result, img
+    return result, None
+# Get available categories from reference_images directory: every
+# sub-directory name becomes one entry of the "Product Category" dropdown.
+# This runs once at import time, so directories added later require a restart.
+categories = [d for d in os.listdir(REFERENCE_IMAGES_DIR)
+             if os.path.isdir(os.path.join(REFERENCE_IMAGES_DIR, d))]
+# Define marketplace choices (must match the keys used inside gradio_scrape)
+marketplace_choices = ['Shopee', 'Tokopedia', 'Blibli', 'Bukalapak']
+# Build the Gradio interface: three inputs (marketplace, URL, category) mapped
+# onto gradio_scrape, which returns a text summary and a sample image.
+interface = gr.Interface(
+    fn=gradio_scrape,
+    inputs=[
+        gr.Dropdown(
+            choices=marketplace_choices,
+            label="Select Marketplace",
+            value="Shopee"
+        ),
+        gr.Textbox(
+            label="Product URL",
+            placeholder="Paste your product URL here"
+        ),
+        gr.Dropdown(
+            choices=categories,
+            label="Product Category"
+        )
+    ],
+    outputs=[
+        gr.Textbox(label="Authenticity Check Results"),
+        gr.Image(label="Product Image Sample")
+    ],
+    title="E-commerce Product Authenticity Checker",
+    description="""
+    How to use:
+    1. Select your marketplace (Shopee/Tokopedia/Blibli/Bukalapak)
+    2. Paste the product URL
+    3. Select the product category
+    4. Click submit to check authenticity
+    """,
+)
+# Start the web UI only when the file is executed as a script.
+if __name__ == "__main__":
+    interface.launch()

cek.py ADDED Viewed

	@@ -0,0 +1,61 @@

+import requests
+import pandas as pd
+from datetime import datetime
+# URL API Shopee
+base_url = "https://shopee.co.id/api/v2/item/get_ratings"
+params = {
+    "exclude_filter": 1,
+    "filter": 0,
+    "filter_size": 0,
+    "flag": 1,
+    "fold_filter": 0,
+    "itemid": 5283031042,  # Ganti dengan item ID yang ingin di-scrape
+    "limit": 20,           # Jumlah review per permintaan
+    "offset": 0,           # Awal pagination
+    "relevant_reviews": "false",
+    "request_source": 2,
+    "shopid": 52733860,    # Shop ID terkait
+    "tag_filter": "",
+    "type": 0,
+    "variation_filters": ""
+}
+# Dataframe untuk menyimpan hasil
+reviews = {"username": [], "rating": [], "comment": [], "date": [], "images": []}
+# Loop untuk iterasi pagination
+while True:
+    response = requests.get(base_url, params=params)
+    if response.status_code != 200:
+        print("Error: Failed to fetch data.")
+        break
+    data = response.json()
+    # Periksa apakah ada data dalam "ratings"
+    if "data" not in data or "ratings" not in data["data"]:
+        print("No more ratings found.")
+        break
+    for rating in data["data"]["ratings"]:
+        reviews["username"].append(rating.get("author_username", "Unknown"))
+        reviews["rating"].append(rating.get("rating_star", "N/A"))
+        reviews["comment"].append(rating.get("comment", "No comment"))
+        reviews["date"].append(
+            datetime.utcfromtimestamp(rating.get("ctime", 0)).strftime("%Y-%m-%d %H:%M")
+        )
+        reviews["images"].append(", ".join(rating.get("images", [])))
+    # Jika jumlah data kurang dari limit, berarti sudah di halaman terakhir
+    if len(data["data"]["ratings"]) < params["limit"]:
+        break
+    # Tambah offset untuk pagination
+    params["offset"] += params["limit"]
+# Simpan data ke dalam file CSV
+df = pd.DataFrame(reviews)
+print(df)
+df.to_csv("shopee_reviews.csv", index=False)

classification-test/.gitattributes ADDED Viewed

	@@ -0,0 +1,35 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tar filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

classification-test/README.md ADDED Viewed

	@@ -0,0 +1,13 @@

+---
+title: Classification Test
+emoji: 🐨
+colorFrom: indigo
+colorTo: pink
+sdk: gradio
+sdk_version: 5.6.0
+app_file: app.py
+pinned: false
+license: gpl-3.0
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

data.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ username,rating,comment

shopee_reviews.csv ADDED Viewed

	@@ -0,0 +1 @@


1	+ username,rating,comment,date,images

tokopedia_authenticity_check.csv ADDED Viewed

	@@ -0,0 +1,3 @@

+image_url,authenticity_result,confidence
+https://images.tokopedia.net/img/cache/700/VqbcmM/2024/10/23/d7bd79c8-3614-4d71-8aba-5e9a28aae180.jpg,Error,0.00%
+https://images.tokopedia.net/img/cache/500-square/VqbcmM/2024/10/23/d7bd79c8-3614-4d71-8aba-5e9a28aae180.jpg,Error,0.00%