hatamo commited on
Commit
56e046e
·
1 Parent(s): 0e2ce87

Added new version of scrapers

Browse files
code/web_scraper_allegro.py CHANGED
@@ -1,16 +1,33 @@
1
- from apify_client import ApifyClient
2
  import os
3
- import re
 
 
 
 
 
 
 
4
 
 
5
 
6
- def sanitize_folder_name(text):
7
- """Helper function to sanitize folder names"""
 
8
  polish_chars = {
9
- "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
10
- "ó": "o", "ś": "s", "ź": "z", "ż": "z"
 
 
 
 
 
 
 
11
  }
 
12
  text = text.lower()
13
  result = ""
 
14
  for char in text:
15
  if char in polish_chars:
16
  result += polish_chars[char]
@@ -18,125 +35,117 @@ def sanitize_folder_name(text):
18
  result += char
19
  else:
20
  result += "_"
 
 
21
  while "__" in result:
22
  result = result.replace("__", "_")
23
- return result.strip("_")
24
 
 
25
 
26
- def extract_price(price_str):
27
- """Extract numeric price from various formats"""
28
- if not price_str:
29
- return None
30
- match = re.search(r'(\d+[.,]\d{2}|\d+)', str(price_str))
31
- if match:
32
- return match.group(1).replace(',', '.')
33
- return price_str
34
-
35
 
36
- def extract_images_from_apify(item_data):
37
- """Extract and normalize image URLs from Apify response"""
38
- unique_links = set()
39
- allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]
 
 
 
40
 
41
- image_sources = []
42
- if 'images' in item_data and item_data['images']:
43
- if isinstance(item_data['images'], list):
44
- image_sources.extend(item_data['images'])
45
- else:
46
- image_sources.append(item_data['images'])
47
-
48
- if 'image' in item_data and item_data['image']:
49
- image_sources.append(item_data['image'])
50
-
51
- if 'imageUrl' in item_data and item_data['imageUrl']:
52
- image_sources.append(item_data['imageUrl'])
53
-
54
- for img_url in image_sources:
55
- if img_url and isinstance(img_url, str):
56
- if "allegroimg.com" in img_url or "img" in img_url:
57
- for size in allowed_sizes:
58
- img_url = img_url.replace(size, "/original/")
59
- unique_links.add(img_url)
60
 
61
- return list(unique_links)
 
 
62
 
63
-
64
- def scrape_allegro_offer(url: str):
65
- """Scrape single Allegro product using Apify E-commerce Tool"""
66
-
67
- api_token = os.getenv('APIFY_API_TOKEN')
68
- if not api_token:
69
- raise ValueError("APIFY_API_TOKEN environment variable not set")
70
-
71
- client = ApifyClient(api_token)
72
-
73
- # Correct input format for E-commerce Scraping Tool
74
- run_input = {
75
- "startUrls": [
76
- url
77
- ]
78
- }
79
 
80
- print(f"🔍 Scraping: {url}")
 
 
 
 
81
 
 
 
82
  try:
83
- actor_call = client.actor("e-commerce/allegro-product-detail-scraper").call(
84
- run_input=run_input
85
- )
86
- dataset_client = client.dataset(actor_call['defaultDatasetId'])
87
- items = list(dataset_client.iterate_items())
88
 
89
- if not items:
90
- print("⚠️ No data returned from Apify")
91
- return {
92
- "platform": "allegro",
93
- "url": url,
94
- "title": "untitled",
95
- "description": "No description",
96
- "price": None,
97
- "image_urls": []
98
- }
99
 
100
- item = items[0]
101
- print(f"✅ Success! Found: {item.get('productTitle', 'untitled')}")
 
 
 
 
 
102
 
103
- image_urls = extract_images_from_apify(item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- if not image_urls:
106
- thumbnail = item.get("thumbnail")
107
- if thumbnail:
108
- image_urls = [thumbnail]
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  return {
111
- "platform": "allegro",
112
- "url": item.get('url', url),
113
- "title": item.get('productTitle', 'untitled').strip(),
114
- "description": item.get('description', 'No description'),
115
- "price": extract_price(item.get('price', item.get('currentPrice'))),
116
- "image_urls": image_urls
117
- }
118
-
 
 
119
  except Exception as e:
120
- print(f" Error: {e}")
121
- return {
122
- "platform": "allegro",
123
- "url": url,
124
- "title": "error",
125
- "description": str(e),
126
- "price": None,
127
- "image_urls": []
128
- }
129
-
130
-
131
- # Example usage
132
- if __name__ == "__main__":
133
- url = input("Allegro URL: ")
134
- result = scrape_allegro_offer(url)
135
-
136
- print("\n✅ Scraping result:")
137
- print(f"Title: {result['title']}")
138
- print(f"Price: {result['price']}")
139
- print(f"Description: {result['description'][:100]}..." if len(result['description']) > 100 else f"Description: {result['description']}")
140
- print(f"Images: {len(result['image_urls'])} found")
141
- for img in result['image_urls'][:3]:
142
- print(f" - {img}")
 
 
1
  import os
2
+ import requests
3
+ from apify_client import ApifyClient
4
+ from dotenv import load_dotenv
5
+ import json
6
+
7
+ # --- CONFIGURATION ---
8
+ # Load environment variables from the .env file (if it exists)
9
+ load_dotenv()
10
 
11
+ ACTOR_ID = "e-commerce/allegro-product-detail-scraper"
12
 
13
+ # --- HELPER FUNCTIONS ---
14
def sanitize_name(text):
    """Sanitize *text* into a safe folder name.

    Lowercases the input, transliterates Polish diacritics to ASCII,
    replaces every non-alphanumeric character with an underscore,
    collapses runs of underscores, and strips leading/trailing ones.
    """
    translit = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
    }

    pieces = []
    for ch in text.lower():
        if ch in translit:
            pieces.append(translit[ch])
        elif ch.isalnum():
            pieces.append(ch)
        else:
            pieces.append("_")
    sanitized = "".join(pieces)

    # Collapse consecutive underscores into a single one.
    while "__" in sanitized:
        sanitized = sanitized.replace("__", "_")

    return sanitized.strip("_")
44
 
45
def get_high_res_image(url):
    """Convert a resized Allegro image link to its /original/ variant.

    Args:
        url: Image URL string, or a falsy value.

    Returns:
        The URL with its size segment replaced by "/original/", the URL
        unchanged when no known size segment is present, or None for
        falsy input.
    """
    if not url:
        return None
    # "/s512/" restored — the previous scraper version normalized it too,
    # and dropping it left those thumbnails un-upgraded.
    sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/"]
    for size in sizes:
        if size in url:
            return url.replace(size, "/original/")
    return url
 
53
 
54
def get_api_token():
    """Retrieve the Apify API token.

    Checks the APIFY_TOKEN environment variable first (populated from
    the .env file by load_dotenv); if it is missing, prompts on the
    console instead.
    """
    env_token = os.getenv("APIFY_TOKEN")
    if env_token:
        print("Info: API Token loaded from .env file.")
        return env_token

    print("Warning: APIFY_TOKEN not found in .env file.")
    return input("Please enter your Apify API Token: ").strip()
69
 
70
def get_allegro_data(url):
    """Scrape a single Allegro offer through the Apify actor.

    Args:
        url: Full Allegro offer URL.

    Returns:
        A dict with title, sanitized_title, url, description, parameters,
        image_urls, image_count and price — or None when the token is
        missing, the actor returns no data, or an error occurs.
    """
    apify_token = get_api_token()

    if not apify_token:
        print("ERROR: API Token is required to run the script.")
        return

    client = ApifyClient(apify_token)

    run_input = {"startUrls": [url]}

    try:
        print("--- GATHERING DATA ---")

        run = client.actor(ACTOR_ID).call(run_input=run_input)

        dataset_items = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if not dataset_items:
            print("Apify finished the job but returned no data.")
            return

        item = dataset_items[0]

        # --- DATA MAPPING ---

        # TITLE
        title = item.get("productTitle") or item.get("title") or "untitled"

        # DESCRIPTION
        description = item.get("description", "No description")

        # PARAMETERS: prefer the dict-shaped specifications. The default
        # {} is itself a dict, so the old `isinstance` check always won
        # and the parameters/attributes fallback was dead code — require
        # a NON-empty dict before using it.
        parameter_list = []
        specs = item.get("productSpecifications", {})

        if isinstance(specs, dict) and specs:
            for key, value in specs.items():
                parameter_list.append(f"{key}: {value}")
        elif not specs:
            raw_params = item.get("parameters") or item.get("attributes", [])
            for p in raw_params:
                name = p.get("name") or p.get("key")
                val = p.get("value")
                if name and val:
                    parameter_list.append(f"{name}: {val}")

        # IMAGES: normalize every entry to high-res and drop entries
        # without a usable URL (dict entries may lack "url", which would
        # otherwise put None into the set and inflate image_count).
        unique_links = set()

        raw_images = item.get("images") or []
        for img in raw_images:
            if isinstance(img, str):
                unique_links.add(get_high_res_image(img))
            elif isinstance(img, dict):
                unique_links.add(get_high_res_image(img.get("url")))
        unique_links.discard(None)

        if not unique_links:
            thumb = item.get("thumbnail")
            if thumb:
                unique_links.add(get_high_res_image(thumb))
                print("Info: Retrieved main image from thumbnail (gallery was empty in API).")

        print(f"Found {len(unique_links)} images.")

        # PRICE: avoid emitting the literal string "None None" when the
        # actor returned no price data.
        price = item.get("price")
        price_str = f"{price} {item.get('currency')}" if price is not None else None

        return {
            "title": title,
            "sanitized_title": sanitize_name(title),
            "url": url,
            "description": description,
            "parameters": parameter_list,
            "image_urls": list(unique_links),
            "image_count": len(unique_links),
            "price": price_str,
        }

    except Exception as e:
        # Best-effort scraper: report the failure and return None
        # rather than crash the caller.
        print(f"Main error occurred: {e}")
147
+
148
# --- USAGE ---
if __name__ == "__main__":
    # Guarded so importing this module does not trigger the console
    # prompt and a live Apify run (the previous script version had
    # this guard; it was lost in the rewrite).
    link = input("Enter the Allegro offer link: ")
    data = get_allegro_data(link)
    print(json.dumps(data, indent=4, ensure_ascii=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/web_scraper_ebay.py CHANGED
@@ -1,93 +1,153 @@
1
- # scrape_ebay_offer.py
2
- import undetected_chromedriver as uc
3
- from selenium.webdriver.common.by import By
4
- from webdriver_manager.chrome import ChromeDriverManager
5
- from selenium.webdriver.chrome.service import Service
6
- import time
7
- import requests
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- def scrape_ebay_offer(url: str):
11
- """Zwraca dane aukcji bez zapisywania na dysk"""
12
- print(f"🔍 eBay: {url}")
13
- options = uc.ChromeOptions()
14
- options.add_argument("--window-position=-3000,0")
15
- options.add_argument("--headless")
16
- options.add_argument("--no-sandbox")
17
- options.add_argument("--disable-dev-shm-usage")
 
 
18
 
19
- # Ustawienie binarki Chrome'a
20
- if os.path.exists('/usr/bin/google-chrome'):
21
- options.binary_location = '/usr/bin/google-chrome'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- driver = uc.Chrome(
24
- service=Service(ChromeDriverManager().install()),
25
- options=options,
26
- use_subprocess=True
27
- )
 
 
 
 
 
 
28
 
 
 
29
  try:
30
- driver.get(url)
31
- time.sleep(4)
 
 
 
 
 
 
 
 
 
32
 
33
  # TITLE
34
- try:
35
- title_element = driver.find_element(By.CSS_SELECTOR, "h1.x-item-title__mainTitle")
36
- title_str = title_element.text.strip()
37
- except:
38
- title_str = "untitled_ebay"
39
 
 
 
 
40
  # PARAMETERS
41
  parameter_list = []
42
- try:
43
- rows = driver.find_elements(By.CSS_SELECTOR, ".ux-labels-values")
44
- for row in rows:
45
- try:
46
- label = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__labels").text.strip()
47
- value = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__values").text.strip()
48
- if label and value:
49
- parameter_list.append(f"{label}: {value}")
50
- except:
51
- continue
52
- except:
53
- pass
54
-
55
- # DESCRIPTION
56
- description_content = "No description"
57
- try:
58
- frame = driver.find_element(By.ID, "desc_ifr")
59
- driver.switch_to.frame(frame)
60
- description_content = driver.find_element(By.TAG_NAME, "body").text.strip()
61
- driver.switch_to.default_content()
62
- except:
63
- pass
64
 
 
 
65
  # IMAGES
66
  unique_links = set()
67
- try:
68
- thumbnails = driver.find_elements(By.CSS_SELECTOR, ".ux-image-grid-item img")
69
- for img in thumbnails:
70
- src = img.get_attribute("src") or img.get_attribute("data-src")
71
- if src and "ebayimg.com" in src:
72
- # Zamień na HD
73
- hd_link = src.replace("/s-l64/", "/s-l1600").replace("/s-l140/", "/s-l1600")
74
- unique_links.add(hd_link)
75
- except:
76
- pass
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  return {
79
- "platform": "ebay",
 
80
  "url": url,
81
- "title": title_str,
82
- "description": description_content,
83
  "parameters": parameter_list,
84
- "image_urls": list(unique_links)
 
 
85
  }
86
-
87
- finally:
88
- driver.quit()
89
 
90
- if __name__ == "__main__":
91
- url = input("eBay URL: ")
92
- result = scrape_ebay_offer(url)
93
- print(result)
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import requests
3
+ from apify_client import ApifyClient
4
+ from dotenv import load_dotenv
5
+ import json
6
+
7
+ # --- CONFIGURATION ---
8
+ # Load environment variables from the .env file (if it exists)
9
+ load_dotenv()
10
+
11
+ ACTOR_ID = "vulnv/ebay-product-scraper"
12
+
13
+ # --- HELPER FUNCTIONS ---
14
def sanitize_name(text):
    """Sanitize *text* into a safe folder name.

    Lowercases the input, maps Polish diacritics to their ASCII
    equivalents, turns every other non-alphanumeric character into an
    underscore, collapses repeated underscores, and trims underscores
    from both ends.
    """
    translit = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
    }

    chars = []
    for ch in text.lower():
        if ch in translit:
            chars.append(translit[ch])
        elif ch.isalnum():
            chars.append(ch)
        else:
            chars.append("_")
    cleaned = "".join(chars)

    # Squash runs of underscores down to a single one.
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")

    return cleaned.strip("_")
44
+
45
def get_high_res_ebay_image(url):
    """Replace the size code in an eBay image link with s-l1600 (max quality).

    Args:
        url: Image URL string, or a falsy value.

    Returns:
        The upgraded URL, the URL unchanged when no size code can be
        identified, or None for falsy input.
    """
    if not url:
        return None

    sizes = ["s-l64", "s-l140", "s-l300", "s-l400", "s-l500", "s-l960"]
    for size in sizes:
        if size in url:
            return url.replace(size, "s-l1600")

    # Fallback for unknown s-lNNN codes in the file name, e.g. ".../s-l225.jpg".
    if "ebayimg.com" in url and "s-l1600" not in url:
        last_part = url.rsplit("/", 1)[-1]
        dot = last_part.find(".")
        # Only rewrite when a file extension exists: previously a missing
        # dot made find() return -1, slicing off the final character and
        # corrupting the URL.
        if "s-l" in last_part and dot != -1:
            return url.replace(last_part[:dot], "s-l1600")

    return url
61
+
62
def get_api_token():
    """Retrieve the Apify API token from the environment, else prompt for it."""
    env_token = os.getenv("APIFY_TOKEN")
    if env_token:
        print("Info: API Token loaded from .env file.")
        return env_token

    print("Warning: APIFY_TOKEN not found in .env file.")
    return input("Please enter your Apify API Token: ").strip()
71
+
72
def get_ebay_data(url):
    """Scrape a single eBay offer through the Apify actor.

    Args:
        url: Full eBay offer URL.

    Returns:
        A dict with title, sanitized_title, url, description, parameters,
        image_urls, image_count and price — or None when the token is
        missing, the actor returns no data, or an error occurs.
    """
    apify_token = get_api_token()
    if not apify_token:
        print("ERROR: API Token is required.")
        return

    print("\n--- SENDING REQUEST TO APIFY ---")
    client = ApifyClient(apify_token)

    run_input = {"product_urls": [url]}

    try:
        run = client.actor(ACTOR_ID).call(run_input=run_input)

        dataset_items = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if not dataset_items:
            print("Apify finished the job but returned no data.")
            return

        item = dataset_items[0]

        # --- DATA MAPPING ---

        # TITLE
        title = item.get("name") or item.get("title") or "untitled_ebay"

        # PRICE
        price = item.get("price", "N/A")
        currency = item.get("currency", "")

        # DESCRIPTION
        description = item.get("description", "No text description available.")

        # PARAMETERS
        parameter_list = []
        raw_props = item.get("additionalProperties", [])
        if isinstance(raw_props, list):
            for prop in raw_props:
                p_name = prop.get("name")
                p_val = prop.get("value")
                if p_name and p_val:
                    parameter_list.append(f"{p_name}: {p_val}")

        if item.get("sku"):
            parameter_list.insert(0, f"SKU: {item.get('sku')}")

        # IMAGES — "mainImage" may be absent OR explicitly null in the
        # payload; `or {}` avoids an AttributeError from calling .get
        # on None (the old `item.get("mainImage", {})` only covered the
        # missing-key case).
        unique_links = set()

        main_img = (item.get("mainImage") or {}).get("url")
        if main_img:
            unique_links.add(get_high_res_ebay_image(main_img))

        raw_images = item.get("images") or []
        for img_entry in raw_images:
            if isinstance(img_entry, dict):
                raw_url = img_entry.get("url")
                if raw_url:
                    unique_links.add(get_high_res_ebay_image(raw_url))
            elif isinstance(img_entry, str):
                unique_links.add(get_high_res_ebay_image(img_entry))
        # Drop entries without a usable URL so image_count stays honest.
        unique_links.discard(None)

        print(f"Found {len(unique_links)} unique images (High-Res).")

        return {
            "title": title,
            "sanitized_title": sanitize_name(title),
            "url": url,
            "description": description,
            "parameters": parameter_list,
            "image_urls": list(unique_links),
            "image_count": len(unique_links),
            "price": f"{price} {currency}",
        }

    except Exception as e:
        # Best-effort scraper: report the failure and return None
        # rather than crash the caller.
        print(f"Critical error occurred: {e}")
149
+
150
# --- USAGE ---
if __name__ == "__main__":
    # Guarded so importing this module does not trigger the console
    # prompt and a live Apify run (the previous script version had
    # this guard; it was lost in the rewrite).
    link = input("Enter the eBay offer link: ")
    data = get_ebay_data(link)
    print(json.dumps(data, indent=4, ensure_ascii=False))
requirements.txt CHANGED
@@ -8,8 +8,6 @@ tqdm
8
  fastapi
9
  uvicorn
10
  python-multipart
11
- undetected_chromedriver
12
- webdriver-manager
13
  bs4
14
  requests
15
  flask
 
8
  fastapi
9
  uvicorn
10
  python-multipart
 
 
11
  bs4
12
  requests
13
  flask