hatamo committed on
Commit
f8a8e76
·
1 Parent(s): 761b1f2

Modified scraper for Allegro

Browse files
Files changed (1) hide show
  1. code/web_scraper_allegro.py +85 -121
code/web_scraper_allegro.py CHANGED
@@ -1,162 +1,126 @@
1
- # scrape_allegro_offer.py
2
  import undetected_chromedriver as uc
3
  from selenium.webdriver.common.by import By
4
- from webdriver_manager.chrome import ChromeDriverManager
5
- from selenium.webdriver.chrome.service import Service
6
  import time
7
- import requests
8
- import os
9
 
10
- def sanitize_folder_name(text): # helper function
11
- polish_chars = {
12
- "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
13
- "ó": "o", "ś": "s", "ź": "z", "ż": "z"
14
- }
15
- text = text.lower()
16
- result = ""
17
- for char in text:
18
- if char in polish_chars:
19
- result += polish_chars[char]
20
- elif char.isalnum():
21
- result += char
22
- else:
23
- result += "_"
24
- while "__" in result:
25
- result = result.replace("__", "_")
26
- return result.strip("_")
27
-
28
- def scrape_allegro_offer(url: str, headless: bool = True):
29
- """Zwraca dane aukcji bez zapisywania na dysk
30
-
31
- headless: jeśli False, uruchom przeglądarkę w trybie widocznym (przydatne do ręcznego rozwiązania CAPTCHA).
32
- """
33
  options = uc.ChromeOptions()
34
- options.add_argument("--window-position=-3000,0")
 
 
 
 
 
 
 
 
 
 
35
  if headless:
36
  options.add_argument("--headless=new")
37
- options.add_argument("--no-sandbox")
38
- options.add_argument("--disable-dev-shm-usage")
39
- options.add_argument("--disable-blink-features=AutomationControlled")
40
- options.add_argument("--start-maximized")
41
- options.add_argument("--disable-extensions")
42
- # set a realistic user agent
43
- options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")
44
 
45
- # Ustawienie binarki Chrome'a
46
- if os.path.exists('/usr/bin/google-chrome'):
47
- options.binary_location = '/usr/bin/google-chrome'
48
 
49
- driver = uc.Chrome(
50
- service=Service(ChromeDriverManager().install()),
51
- options=options,
52
- use_subprocess=True
53
- )
54
 
55
  try:
56
- print(f"🔍 Allegro: {url}")
 
 
 
 
 
 
57
  driver.get(url)
58
- # Wait for dynamic content (images / title) to load
59
- from selenium.webdriver.support.ui import WebDriverWait
60
- from selenium.webdriver.support import expected_conditions as EC
61
- try:
62
- WebDriverWait(driver, 15).until(
63
- EC.presence_of_element_located((By.TAG_NAME, "img"))
64
- )
65
- except Exception:
66
- # fallback short sleep
67
  time.sleep(5)
 
 
68
 
69
  # TITLE
70
  try:
71
- title_element = driver.find_element(By.TAG_NAME, "h1")
72
- title_str = title_element.text.strip()
 
 
 
 
 
 
 
 
 
73
  except:
74
- title_str = "untitled"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
  # PARAMETERS
77
- parameter_list = []
78
  try:
79
- rows = driver.find_elements(By.CSS_SELECTOR, "tr")
80
- for row in rows:
81
  cells = row.find_elements(By.TAG_NAME, "td")
82
  if len(cells) == 2:
83
  name = cells[0].text.strip()
84
  value = cells[1].text.strip()
85
  if name and value:
86
- parameter_list.append(f"{name}: {value}")
87
  except:
88
  pass
89
 
90
  # DESCRIPTION
 
91
  try:
92
- # try a few common selectors
93
- description_element = None
94
- for sel in ["div._0d3bd_am0a-", "div[itemprop='description']", "div#description"]:
95
- try:
96
- description_element = driver.find_element(By.CSS_SELECTOR, sel)
97
- if description_element:
98
- break
99
- except:
100
- continue
101
- description_content = description_element.text if description_element else "No description"
102
  except:
103
- description_content = "No description"
 
 
 
 
104
 
105
- # IMAGES
106
- unique_links = set()
107
- try:
108
- # more resilient image selector: look for any img with allegro domains
109
- imgs = driver.find_elements(By.XPATH, "//img[contains(@src,'allegroimg.com') or contains(@src,'allegrostatic')]")
110
- # also try thumbnails / data-src
111
- if not imgs:
112
- imgs = driver.find_elements(By.TAG_NAME, "img")
113
-
114
- allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]
115
- for img in imgs:
116
- try:
117
- src = img.get_attribute("src") or img.get_attribute("data-src") or img.get_attribute("data-lazy-src")
118
- if not src:
119
- # try srcset
120
- srcset = img.get_attribute("srcset")
121
- if srcset:
122
- src = srcset.split()[0]
123
- if src and ("allegroimg.com" in src or "allegrostatic" in src):
124
- # normalize to original size when possible
125
- for size in allowed_sizes:
126
- if size in src:
127
- src = src.replace(size, "/original/")
128
- break
129
- unique_links.add(src)
130
- except Exception:
131
- continue
132
- except Exception as e:
133
- print(f"Image error: {e}")
134
-
135
- # If site served a captcha / anti-bot page, include a preview for debugging
136
- blocked_preview = None
137
- try:
138
- page_src = driver.page_source
139
- if "captcha-delivery" in page_src or "Please enable JS" in page_src or "ad blocker" in page_src.lower():
140
- blocked_preview = page_src[:2000]
141
- except Exception:
142
- blocked_preview = None
143
-
144
- result = {
145
  "platform": "allegro",
146
  "url": url,
147
- "title": title_str,
148
- "description": description_content,
149
- "parameters": parameter_list,
150
- "image_urls": list(unique_links)
151
  }
152
- if blocked_preview:
153
- result["blocked_preview"] = blocked_preview
154
- return result
155
 
156
  finally:
157
  driver.quit()
158
 
 
159
  if __name__ == "__main__":
160
  url = input("Allegro URL: ")
161
- result = scrape_allegro_offer(url)
162
- print(result)
 
 
1
  import undetected_chromedriver as uc
2
  from selenium.webdriver.common.by import By
3
+ from selenium.webdriver.support.ui import WebDriverWait
4
+ from selenium.webdriver.support import expected_conditions as EC
5
  import time
 
 
6
 
7
+ def scrape_allegro_offer(url: str, headless: bool = False):
8
+ """Cloudflare-aware scraping z realistic delays"""
9
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  options = uc.ChromeOptions()
11
+
12
+ # Realistyczne ustawienia
13
+ options.add_argument("--disable-blink-features=AutomationControlled")
14
+ options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")
15
+
16
+ # Anti-detection
17
+ options.add_argument("--disable-dev-shm-usage")
18
+ options.add_argument("--no-sandbox")
19
+ options.add_argument("--disable-gpu")
20
+
21
+ # WAŻNE: Nie ukrywaj okna (headless = łatwo jest do wykrycia)
22
  if headless:
23
  options.add_argument("--headless=new")
 
 
 
 
 
 
 
24
 
25
+ # Stealth mode
26
+ options.add_argument("--disable-extensions")
27
+ options.add_argument("--disable-plugins")
28
 
29
+ driver = uc.Chrome(options=options, version_main=None, use_subprocess=False)
 
 
 
 
30
 
31
  try:
32
+ print(f"🔍 Scraping: {url}")
33
+
34
+ # 1. Otwórz Allegro (nie bezpośrednio ofertę)
35
+ driver.get("https://allegro.pl")
36
+ time.sleep(2) # Czekaj na Cloudflare
37
+
38
+ # 2. Teraz otwórz konkretną ofertę
39
  driver.get(url)
40
+
41
+ # 3. Czekaj na załadowanie
42
+ time.sleep(3)
43
+
44
+ # Sprawdź czy Cloudflare/bot detection blokuje
45
+ if "Please enable JavaScript" in driver.page_source or "captcha" in driver.page_source.lower():
46
+ print("⚠️ CAPTCHA/Bot detection! Czekam 5 sekund...")
 
 
47
  time.sleep(5)
48
+ driver.refresh()
49
+ time.sleep(3)
50
 
51
  # TITLE
52
  try:
53
+ title = WebDriverWait(driver, 10).until(
54
+ EC.presence_of_element_located((By.TAG_NAME, "h1"))
55
+ ).text.strip()
56
+ except:
57
+ title = "untitled"
58
+
59
+ # IMAGES (timeout aby się załadowały)
60
+ try:
61
+ WebDriverWait(driver, 10).until(
62
+ EC.presence_of_all_elements_located((By.CSS_SELECTOR, "img"))
63
+ )
64
  except:
65
+ pass
66
+
67
+ time.sleep(1) # Extra czekanie na lazy-load
68
+
69
+ # Zbierz zdjęcia
70
+ image_urls = set()
71
+ try:
72
+ # Szukaj po data-src (lazy-loaded obrazy)
73
+ imgs = driver.find_elements(By.CSS_SELECTOR, "img[src*='allegroimg'], img[data-src*='allegroimg']")
74
+
75
+ for img in imgs:
76
+ src = img.get_attribute("src") or img.get_attribute("data-src")
77
+ if src and "allegroimg.com" in src:
78
+ # Normalize to original size
79
+ src = src.replace("/s360/", "/original/").replace("/s128/", "/original/").replace("/s720/", "/original/")
80
+ image_urls.add(src)
81
+ except Exception as e:
82
+ print(f"Błąd zdjęć: {e}")
83
 
84
  # PARAMETERS
85
+ params = []
86
  try:
87
+ # Allegro używa różnych struktur — spróbuj kilka
88
+ for row in driver.find_elements(By.CSS_SELECTOR, "tr"):
89
  cells = row.find_elements(By.TAG_NAME, "td")
90
  if len(cells) == 2:
91
  name = cells[0].text.strip()
92
  value = cells[1].text.strip()
93
  if name and value:
94
+ params.append(f"{name}: {value}")
95
  except:
96
  pass
97
 
98
  # DESCRIPTION
99
+ description = "No description"
100
  try:
101
+ desc_elem = driver.find_element(By.CSS_SELECTOR, "div[itemprop='description']")
102
+ description = desc_elem.text.strip()
 
 
 
 
 
 
 
 
103
  except:
104
+ try:
105
+ # Fallback
106
+ description = driver.find_element(By.CLASS_NAME, "description").text
107
+ except:
108
+ pass
109
 
110
+ return {
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  "platform": "allegro",
112
  "url": url,
113
+ "title": title,
114
+ "description": description,
115
+ "parameters": params,
116
+ "image_urls": list(image_urls)
117
  }
 
 
 
118
 
119
  finally:
120
  driver.quit()
121
 
122
+
123
  if __name__ == "__main__":
124
  url = input("Allegro URL: ")
125
+ data = scrape_allegro_offer(url, headless=False) # headless=False aby widzieć co się dzieje
126
+ print(data)