hatamo committed on
Commit
948dcae
·
1 Parent(s): f8a8e76

Modified scraper for Allegro

Browse files
Files changed (1) hide show
  1. code/web_scraper_allegro.py +61 -76
code/web_scraper_allegro.py CHANGED
@@ -1,95 +1,72 @@
1
- import undetected_chromedriver as uc
2
- from selenium.webdriver.common.by import By
3
- from selenium.webdriver.support.ui import WebDriverWait
4
- from selenium.webdriver.support import expected_conditions as EC
5
- import time
6
 
7
- def scrape_allegro_offer(url: str, headless: bool = False):
8
- """Cloudflare-aware scraping z realistic delays"""
9
 
10
- options = uc.ChromeOptions()
11
-
12
- # Realistyczne ustawienia
13
- options.add_argument("--disable-blink-features=AutomationControlled")
14
- options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
15
-
16
- # Anti-detection
17
- options.add_argument("--disable-dev-shm-usage")
18
- options.add_argument("--no-sandbox")
19
- options.add_argument("--disable-gpu")
20
-
21
- # WAŻNE: Nie ukrywaj okna (headless = łatwo jest do wykrycia)
22
- if headless:
23
- options.add_argument("--headless=new")
24
-
25
- # Stealth mode
26
- options.add_argument("--disable-extensions")
27
- options.add_argument("--disable-plugins")
28
-
29
- driver = uc.Chrome(options=options, version_main=None, use_subprocess=False)
30
 
31
  try:
32
  print(f"🔍 Scraping: {url}")
 
 
33
 
34
- # 1. Otwórz Allegro (nie bezpośrednio ofertę)
35
- driver.get("https://allegro.pl")
36
- time.sleep(2) # Czekaj na Cloudflare
37
-
38
- # 2. Teraz otwórz konkretną ofertę
39
- driver.get(url)
40
-
41
- # 3. Czekaj na załadowanie
42
- time.sleep(3)
43
-
44
- # Sprawdź czy Cloudflare/bot detection blokuje
45
- if "Please enable JavaScript" in driver.page_source or "captcha" in driver.page_source.lower():
46
- print("⚠️ CAPTCHA/Bot detection! Czekam 5 sekund...")
47
- time.sleep(5)
48
- driver.refresh()
49
- time.sleep(3)
50
 
51
  # TITLE
 
52
  try:
53
- title = WebDriverWait(driver, 10).until(
54
- EC.presence_of_element_located((By.TAG_NAME, "h1"))
55
- ).text.strip()
56
- except:
57
- title = "untitled"
58
-
59
- # IMAGES (timeout aby się załadowały)
60
- try:
61
- WebDriverWait(driver, 10).until(
62
- EC.presence_of_all_elements_located((By.CSS_SELECTOR, "img"))
63
- )
64
  except:
65
  pass
66
 
67
- time.sleep(1) # Extra czekanie na lazy-load
68
-
69
- # Zbierz zdjęcia
70
  image_urls = set()
71
  try:
72
- # Szukaj po data-src (lazy-loaded obrazy)
73
- imgs = driver.find_elements(By.CSS_SELECTOR, "img[src*='allegroimg'], img[data-src*='allegroimg']")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
- for img in imgs:
76
- src = img.get_attribute("src") or img.get_attribute("data-src")
 
77
  if src and "allegroimg.com" in src:
78
- # Normalize to original size
79
- src = src.replace("/s360/", "/original/").replace("/s128/", "/original/").replace("/s720/", "/original/")
80
  image_urls.add(src)
81
  except Exception as e:
82
  print(f"Błąd zdjęć: {e}")
83
 
84
- # PARAMETERS
85
  params = []
86
  try:
87
- # Allegro używa różnych struktur — spróbuj kilka
88
- for row in driver.find_elements(By.CSS_SELECTOR, "tr"):
89
- cells = row.find_elements(By.TAG_NAME, "td")
90
  if len(cells) == 2:
91
- name = cells[0].text.strip()
92
- value = cells[1].text.strip()
93
  if name and value:
94
  params.append(f"{name}: {value}")
95
  except:
@@ -98,12 +75,15 @@ def scrape_allegro_offer(url: str, headless: bool = False):
98
  # DESCRIPTION
99
  description = "No description"
100
  try:
101
- desc_elem = driver.find_element(By.CSS_SELECTOR, "div[itemprop='description']")
102
- description = desc_elem.text.strip()
 
103
  except:
 
104
  try:
105
- # Fallback
106
- description = driver.find_element(By.CLASS_NAME, "description").text
 
107
  except:
108
  pass
109
 
@@ -116,11 +96,16 @@ def scrape_allegro_offer(url: str, headless: bool = False):
116
  "image_urls": list(image_urls)
117
  }
118
 
119
- finally:
120
- driver.quit()
 
 
 
 
 
121
 
122
 
123
  if __name__ == "__main__":
124
  url = input("Allegro URL: ")
125
- data = scrape_allegro_offer(url, headless=False) # headless=False aby widzieć co się dzieje
126
  print(data)
 
1
+ import requests
2
+ from bs4 import BeautifulSoup
3
+ import re
 
 
4
 
5
+ def scrape_allegro_with_bs4(url: str):
6
+ """BeautifulSoup bez Selenium działa na HF Spaces"""
7
 
8
+ headers = {
9
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
10
+ "Accept-Language": "pl-PL,pl;q=0.9",
11
+ "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
12
+ "Referer": "https://allegro.pl/",
13
+ "DNT": "1"
14
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  try:
17
  print(f"🔍 Scraping: {url}")
18
+ response = requests.get(url, headers=headers, timeout=10)
19
+ response.raise_for_status()
20
 
21
+ soup = BeautifulSoup(response.content, "html.parser")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
  # TITLE
24
+ title = "untitled"
25
  try:
26
+ title_tag = soup.find("h1")
27
+ if title_tag:
28
+ title = title_tag.get_text(strip=True)
 
 
 
 
 
 
 
 
29
  except:
30
  pass
31
 
32
+ # IMAGES (szukaj w JSON-LD lub img tags)
 
 
33
  image_urls = set()
34
  try:
35
+ # Metoda 1: Szukaj w script tag (JSON-LD)
36
+ scripts = soup.find_all("script", type="application/ld+json")
37
+ for script in scripts:
38
+ try:
39
+ import json
40
+ data = json.loads(script.string)
41
+ if isinstance(data, list):
42
+ data = data[0]
43
+ if "image" in data:
44
+ images = data["image"]
45
+ if isinstance(images, list):
46
+ image_urls.update(images)
47
+ else:
48
+ image_urls.add(images)
49
+ except:
50
+ pass
51
 
52
+ # Metoda 2: Szukaj img tags z allegroimg
53
+ for img in soup.find_all("img"):
54
+ src = img.get("src") or img.get("data-src")
55
  if src and "allegroimg.com" in src:
56
+ # Normalize to original
57
+ src = re.sub(r"/s\d+/", "/original/", src)
58
  image_urls.add(src)
59
  except Exception as e:
60
  print(f"Błąd zdjęć: {e}")
61
 
62
+ # PARAMETERS (zwykle w tabeli)
63
  params = []
64
  try:
65
+ for row in soup.find_all("tr"):
66
+ cells = row.find_all("td")
 
67
  if len(cells) == 2:
68
+ name = cells[0].get_text(strip=True)
69
+ value = cells[1].get_text(strip=True)
70
  if name and value:
71
  params.append(f"{name}: {value}")
72
  except:
 
75
  # DESCRIPTION
76
  description = "No description"
77
  try:
78
+ desc_div = soup.find("div", {"itemprop": "description"})
79
+ if desc_div:
80
+ description = desc_div.get_text(strip=True)[:500] # Limit
81
  except:
82
+ # Fallback
83
  try:
84
+ desc_div = soup.find("div", class_=re.compile("description"))
85
+ if desc_div:
86
+ description = desc_div.get_text(strip=True)[:500]
87
  except:
88
  pass
89
 
 
96
  "image_urls": list(image_urls)
97
  }
98
 
99
+ except requests.exceptions.RequestException as e:
100
+ return {
101
+ "status": "error",
102
+ "error": f"Request failed: {str(e)}",
103
+ "platform": "allegro",
104
+ "url": url
105
+ }
106
 
107
 
108
  if __name__ == "__main__":
109
  url = input("Allegro URL: ")
110
+ data = scrape_allegro_with_bs4(url)
111
  print(data)