Spaces:

hatamo
/

Antique_Auth_API

Running

App Files Files Community

hatamo committed 21 days ago

Commit

948dcae

1 Parent(s): f8a8e76

Modified scraper for Allegro

Browse files

Files changed (1) hide show

code/web_scraper_allegro.py +61 -76

code/web_scraper_allegro.py CHANGED Viewed

@@ -1,95 +1,72 @@
-import undetected_chromedriver as uc
-from selenium.webdriver.common.by import By
-from selenium.webdriver.support.ui import WebDriverWait
-from selenium.webdriver.support import expected_conditions as EC
-import time
-def scrape_allegro_offer(url: str, headless: bool = False):
-    """Cloudflare-aware scraping z realistic delays"""
-    options = uc.ChromeOptions()
-    # Realistyczne ustawienia
-    options.add_argument("--disable-blink-features=AutomationControlled")
-    options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
-    # Anti-detection
-    options.add_argument("--disable-dev-shm-usage")
-    options.add_argument("--no-sandbox")
-    options.add_argument("--disable-gpu")
-    # WAŻNE: Nie ukrywaj okna (headless = łatwo jest do wykrycia)
-    if headless:
-        options.add_argument("--headless=new")
-    # Stealth mode
-    options.add_argument("--disable-extensions")
-    options.add_argument("--disable-plugins")
-    driver = uc.Chrome(options=options, version_main=None, use_subprocess=False)
     try:
         print(f"🔍 Scraping: {url}")
-        # 1. Otwórz Allegro (nie bezpośrednio ofertę)
-        driver.get("https://allegro.pl")
-        time.sleep(2)  # Czekaj na Cloudflare
-        # 2. Teraz otwórz konkretną ofertę
-        driver.get(url)
-        # 3. Czekaj na załadowanie
-        time.sleep(3)
-        # Sprawdź czy Cloudflare/bot detection blokuje
-        if "Please enable JavaScript" in driver.page_source or "captcha" in driver.page_source.lower():
-            print("⚠️  CAPTCHA/Bot detection! Czekam 5 sekund...")
-            time.sleep(5)
-            driver.refresh()
-            time.sleep(3)
         # TITLE
         try:
-            title = WebDriverWait(driver, 10).until(
-                EC.presence_of_element_located((By.TAG_NAME, "h1"))
-            ).text.strip()
-        except:
-            title = "untitled"
-        # IMAGES (timeout aby się załadowały)
-        try:
-            WebDriverWait(driver, 10).until(
-                EC.presence_of_all_elements_located((By.CSS_SELECTOR, "img"))
-            )
         except:
             pass
-        time.sleep(1)  # Extra czekanie na lazy-load
-        # Zbierz zdjęcia
         image_urls = set()
         try:
-            # Szukaj po data-src (lazy-loaded obrazy)
-            imgs = driver.find_elements(By.CSS_SELECTOR, "img[src*='allegroimg'], img[data-src*='allegroimg']")
-            for img in imgs:
-                src = img.get_attribute("src") or img.get_attribute("data-src")
                 if src and "allegroimg.com" in src:
-                    # Normalize to original size
-                    src = src.replace("/s360/", "/original/").replace("/s128/", "/original/").replace("/s720/", "/original/")
                     image_urls.add(src)
         except Exception as e:
             print(f"Błąd zdjęć: {e}")
-        # PARAMETERS
         params = []
         try:
-            # Allegro używa różnych struktur — spróbuj kilka
-            for row in driver.find_elements(By.CSS_SELECTOR, "tr"):
-                cells = row.find_elements(By.TAG_NAME, "td")
                 if len(cells) == 2:
-                    name = cells[0].text.strip()
-                    value = cells[1].text.strip()
                     if name and value:
                         params.append(f"{name}: {value}")
         except:
@@ -98,12 +75,15 @@ def scrape_allegro_offer(url: str, headless: bool = False):
         # DESCRIPTION
         description = "No description"
         try:
-            desc_elem = driver.find_element(By.CSS_SELECTOR, "div[itemprop='description']")
-            description = desc_elem.text.strip()
         except:
             try:
-                # Fallback
-                description = driver.find_element(By.CLASS_NAME, "description").text
             except:
                 pass
@@ -116,11 +96,16 @@ def scrape_allegro_offer(url: str, headless: bool = False):
             "image_urls": list(image_urls)
         }
-    finally:
-        driver.quit()
 if __name__ == "__main__":
     url = input("Allegro URL: ")
-    data = scrape_allegro_offer(url, headless=False)  # headless=False aby widzieć co się dzieje
     print(data)

+import requests
+from bs4 import BeautifulSoup
+import re
+def scrape_allegro_with_bs4(url: str):
+    """BeautifulSoup bez Selenium — działa na HF Spaces"""
+    headers = {
+        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+        "Accept-Language": "pl-PL,pl;q=0.9",
+        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+        "Referer": "https://allegro.pl/",
+        "DNT": "1"
+    }
     try:
         print(f"🔍 Scraping: {url}")
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        soup = BeautifulSoup(response.content, "html.parser")
         # TITLE
+        title = "untitled"
         try:
+            title_tag = soup.find("h1")
+            if title_tag:
+                title = title_tag.get_text(strip=True)
         except:
             pass
+        # IMAGES (szukaj w JSON-LD lub img tags)
         image_urls = set()
         try:
+            # Metoda 1: Szukaj w script tag (JSON-LD)
+            scripts = soup.find_all("script", type="application/ld+json")
+            for script in scripts:
+                try:
+                    import json
+                    data = json.loads(script.string)
+                    if isinstance(data, list):
+                        data = data[0]
+                    if "image" in data:
+                        images = data["image"]
+                        if isinstance(images, list):
+                            image_urls.update(images)
+                        else:
+                            image_urls.add(images)
+                except:
+                    pass
+            # Metoda 2: Szukaj img tags z allegroimg
+            for img in soup.find_all("img"):
+                src = img.get("src") or img.get("data-src")
                 if src and "allegroimg.com" in src:
+                    # Normalize to original
+                    src = re.sub(r"/s\d+/", "/original/", src)
                     image_urls.add(src)
         except Exception as e:
             print(f"Błąd zdjęć: {e}")
+        # PARAMETERS (zwykle w tabeli)
         params = []
         try:
+            for row in soup.find_all("tr"):
+                cells = row.find_all("td")
                 if len(cells) == 2:
+                    name = cells[0].get_text(strip=True)
+                    value = cells[1].get_text(strip=True)
                     if name and value:
                         params.append(f"{name}: {value}")
         except:
         # DESCRIPTION
         description = "No description"
         try:
+            desc_div = soup.find("div", {"itemprop": "description"})
+            if desc_div:
+                description = desc_div.get_text(strip=True)[:500]  # Limit
         except:
+            # Fallback
             try:
+                desc_div = soup.find("div", class_=re.compile("description"))
+                if desc_div:
+                    description = desc_div.get_text(strip=True)[:500]
             except:
                 pass
             "image_urls": list(image_urls)
         }
+    except requests.exceptions.RequestException as e:
+        return {
+            "status": "error",
+            "error": f"Request failed: {str(e)}",
+            "platform": "allegro",
+            "url": url
+        }
 if __name__ == "__main__":
     url = input("Allegro URL: ")
+    data = scrape_allegro_with_bs4(url)
     print(data)