Spaces:

hatamo
/

Antique_Auth_API

Running

App Files Community

hatamo committed 21 days ago

Commit

f8a8e76

1 Parent(s): 761b1f2

Modified scraper for Allegro

Browse files

Files changed (1) hide show

code/web_scraper_allegro.py +85 -121

code/web_scraper_allegro.py CHANGED Viewed

@@ -1,162 +1,126 @@
-# scrape_allegro_offer.py
 import undetected_chromedriver as uc
 from selenium.webdriver.common.by import By
-from webdriver_manager.chrome import ChromeDriverManager
-from selenium.webdriver.chrome.service import Service
 import time
-import requests
-import os
-def sanitize_folder_name(text):  # helper: turn arbitrary text into a lowercase, underscore-separated name
-    polish_chars = {  # lowercase Polish diacritics -> ASCII replacements
-        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
-        "ó": "o", "ś": "s", "ź": "z", "ż": "z"
-    }
-    text = text.lower()  # lowercase first so uppercase diacritics also hit the map
-    result = ""
-    for char in text:
-        if char in polish_chars:
-            result += polish_chars[char]
-        elif char.isalnum():  # any other alphanumeric character (per str.isalnum) is kept unchanged
-            result += char
-        else:
-            result += "_"  # everything else becomes a separator
-    while "__" in result:  # collapse runs of underscores into a single one
-        result = result.replace("__", "_")
-    return result.strip("_")  # drop leading/trailing underscores
-def scrape_allegro_offer(url: str, headless: bool = True):
-    """Return the auction data without saving anything to disk.
-    headless: if False, run the browser in visible mode (useful for solving a CAPTCHA manually).
-    """
     options = uc.ChromeOptions()
-    options.add_argument("--window-position=-3000,0")  # presumably moves the window off-screen — TODO confirm
     if headless:
         options.add_argument("--headless=new")
-    options.add_argument("--no-sandbox")
-    options.add_argument("--disable-dev-shm-usage")
-    options.add_argument("--disable-blink-features=AutomationControlled")
-    options.add_argument("--start-maximized")
-    options.add_argument("--disable-extensions")
-    # set a realistic user agent
-    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")
-    # Set the Chrome binary location (only when it exists at this path)
-    if os.path.exists('/usr/bin/google-chrome'):
-        options.binary_location = '/usr/bin/google-chrome'
-    driver = uc.Chrome(
-        service=Service(ChromeDriverManager().install()),
-        options=options,
-        use_subprocess=True
-    )
     try:
-        print(f"🔍 Allegro: {url}")
         driver.get(url)
-        # Wait for dynamic content (images / title) to load
-        from selenium.webdriver.support.ui import WebDriverWait
-        from selenium.webdriver.support import expected_conditions as EC
-        try:
-            WebDriverWait(driver, 15).until(
-                EC.presence_of_element_located((By.TAG_NAME, "img"))
-            )
-        except Exception:
-            # fallback short sleep
             time.sleep(5)
         # TITLE
         try:
-            title_element = driver.find_element(By.TAG_NAME, "h1")
-            title_str = title_element.text.strip()
         except:
-            title_str = "untitled"
         # PARAMETERS
-        parameter_list = []
         try:
-            rows = driver.find_elements(By.CSS_SELECTOR, "tr")
-            for row in rows:
                 cells = row.find_elements(By.TAG_NAME, "td")
                 if len(cells) == 2:
                     name = cells[0].text.strip()
                     value = cells[1].text.strip()
                     if name and value:
-                        parameter_list.append(f"{name}: {value}")
         except:
             pass
         # DESCRIPTION
         try:
-            # try a few common selectors
-            description_element = None
-            for sel in ["div._0d3bd_am0a-", "div[itemprop='description']", "div#description"]:
-                try:
-                    description_element = driver.find_element(By.CSS_SELECTOR, sel)
-                    if description_element:
-                        break
-                except:
-                    continue
-            description_content = description_element.text if description_element else "No description"
         except:
-            description_content = "No description"
-        # IMAGES
-        unique_links = set()  # a set, so duplicate image URLs are stored once
-        try:
-            # more resilient image selector: look for any img with allegro domains
-            imgs = driver.find_elements(By.XPATH, "//img[contains(@src,'allegroimg.com') or contains(@src,'allegrostatic')]")
-            # also try thumbnails / data-src
-            if not imgs:
-                imgs = driver.find_elements(By.TAG_NAME, "img")
-            allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]
-            for img in imgs:
-                try:
-                    src = img.get_attribute("src") or img.get_attribute("data-src") or img.get_attribute("data-lazy-src")
-                    if not src:
-                        # try srcset
-                        srcset = img.get_attribute("srcset")
-                        if srcset:
-                            src = srcset.split()[0]  # first whitespace-separated token of srcset
-                    if src and ("allegroimg.com" in src or "allegrostatic" in src):
-                        # normalize to original size when possible
-                        for size in allowed_sizes:
-                            if size in src:
-                                src = src.replace(size, "/original/")
-                                break
-                        unique_links.add(src)
-                except Exception:
-                    continue
-        except Exception as e:
-            print(f"Image error: {e}")
-        # If site served a captcha / anti-bot page, include a preview for debugging
-        blocked_preview = None
-        try:
-            page_src = driver.page_source
-            if "captcha-delivery" in page_src or "Please enable JS" in page_src or "ad blocker" in page_src.lower():
-                blocked_preview = page_src[:2000]  # first 2000 characters of the page source
-        except Exception:
-            blocked_preview = None
-        result = {
             "platform": "allegro",
             "url": url,
-            "title": title_str,
-            "description": description_content,
-            "parameters": parameter_list,
-            "image_urls": list(unique_links)
         }
-        if blocked_preview:
-            result["blocked_preview"] = blocked_preview
-        return result
     finally:
         driver.quit()
 if __name__ == "__main__":
     url = input("Allegro URL: ")
-    result = scrape_allegro_offer(url)  # headless is not passed, so the function's default applies
-    print(result)

 import undetected_chromedriver as uc
 from selenium.webdriver.common.by import By
+from selenium.webdriver.support.ui import WebDriverWait
+from selenium.webdriver.support import expected_conditions as EC
 import time
+def scrape_allegro_offer(url: str, headless: bool = False):
+    """Scrape an Allegro offer and return a dict with platform, url, title, description, parameters and image_urls; opens the home page first and pauses between steps (Cloudflare-aware)."""
     options = uc.ChromeOptions()
+    # Realistic settings
+    options.add_argument("--disable-blink-features=AutomationControlled")
+    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
+    # Anti-detection
+    options.add_argument("--disable-dev-shm-usage")
+    options.add_argument("--no-sandbox")
+    options.add_argument("--disable-gpu")
+    # IMPORTANT: do not hide the window (headless mode is easy to detect)
     if headless:
         options.add_argument("--headless=new")
+    # Stealth mode
+    options.add_argument("--disable-extensions")
+    options.add_argument("--disable-plugins")
+    driver = uc.Chrome(options=options, version_main=None, use_subprocess=False)
     try:
+        print(f"🔍 Scraping: {url}")
+        # 1. Open the Allegro home page first (not the offer directly)
+        driver.get("https://allegro.pl")
+        time.sleep(2)  # Wait for Cloudflare
+        # 2. Now open the specific offer
         driver.get(url)
+        # 3. Wait for the page to load
+        time.sleep(3)
+        # Check whether Cloudflare/bot detection is blocking the page
+        if "Please enable JavaScript" in driver.page_source or "captcha" in driver.page_source.lower():
+            print("⚠️  CAPTCHA/Bot detection! Czekam 5 sekund...")
             time.sleep(5)
+            driver.refresh()  # single retry; scraping continues whatever the reloaded page contains
+            time.sleep(3)
         # TITLE
         try:
+            title = WebDriverWait(driver, 10).until(
+                EC.presence_of_element_located((By.TAG_NAME, "h1"))
+            ).text.strip()
+        except:
+            title = "untitled"
+        # IMAGES (wait with a timeout so they have a chance to load)
+        try:
+            WebDriverWait(driver, 10).until(
+                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "img"))
+            )
         except:
+            pass
+        time.sleep(1)  # Extra wait for lazy-loaded content
+        # Collect image URLs
+        image_urls = set()  # a set, so duplicate URLs are stored once
+        try:
+            # Match on src or data-src (lazy-loaded images)
+            imgs = driver.find_elements(By.CSS_SELECTOR, "img[src*='allegroimg'], img[data-src*='allegroimg']")
+            for img in imgs:
+                src = img.get_attribute("src") or img.get_attribute("data-src")
+                if src and "allegroimg.com" in src:
+                    # Normalize to original size (NOTE(review): only /s360/, /s128/ and /s720/ are rewritten)
+                    src = src.replace("/s360/", "/original/").replace("/s128/", "/original/").replace("/s720/", "/original/")
+                    image_urls.add(src)
+        except Exception as e:
+            print(f"Błąd zdjęć: {e}")
         # PARAMETERS
+        params = []
         try:
+            # NOTE(review): the original comment said "Allegro uses various structures — try several", but only <tr> rows with exactly two <td> cells are read
+            for row in driver.find_elements(By.CSS_SELECTOR, "tr"):
                 cells = row.find_elements(By.TAG_NAME, "td")
                 if len(cells) == 2:
                     name = cells[0].text.strip()
                     value = cells[1].text.strip()
                     if name and value:
+                        params.append(f"{name}: {value}")
         except:
             pass
         # DESCRIPTION
+        description = "No description"  # default kept when neither lookup below succeeds
         try:
+            desc_elem = driver.find_element(By.CSS_SELECTOR, "div[itemprop='description']")
+            description = desc_elem.text.strip()
         except:
+            try:
+                # Fallback
+                description = driver.find_element(By.CLASS_NAME, "description").text
+            except:
+                pass
+        return {
             "platform": "allegro",
             "url": url,
+            "title": title,
+            "description": description,
+            "parameters": params,
+            "image_urls": list(image_urls)
         }
     finally:
         driver.quit()  # runs on both the return path and on errors
 if __name__ == "__main__":
     url = input("Allegro URL: ")
+    data = scrape_allegro_offer(url, headless=False)  # headless=False so you can watch what is happening
+    print(data)