hatamo committed on
Commit
86e2ad3
·
1 Parent(s): 4e49379

Modified scraper settings

Browse files
Files changed (1) hide show
  1. code/web_scraper_allegro.py +62 -14
code/web_scraper_allegro.py CHANGED
@@ -29,9 +29,14 @@ def scrape_allegro_offer(url: str):
29
  """Zwraca dane aukcji bez zapisywania na dysk"""
30
  options = uc.ChromeOptions()
31
  options.add_argument("--window-position=-3000,0")
32
- options.add_argument("--headless")
33
  options.add_argument("--no-sandbox")
34
  options.add_argument("--disable-dev-shm-usage")
 
 
 
 
 
35
 
36
  # Ustawienie binarki Chrome'a
37
  if os.path.exists('/usr/bin/google-chrome'):
@@ -46,7 +51,16 @@ def scrape_allegro_offer(url: str):
46
  try:
47
  print(f"🔍 Allegro: {url}")
48
  driver.get(url)
49
- time.sleep(10)
 
 
 
 
 
 
 
 
 
50
 
51
  # TITLE
52
  try:
@@ -71,28 +85,59 @@ def scrape_allegro_offer(url: str):
71
 
72
  # DESCRIPTION
73
  try:
74
- description_element = driver.find_element(By.CSS_SELECTOR, "div._0d3bd_am0a-")
75
- description_content = description_element.text
 
 
 
 
 
 
 
 
76
  except:
77
  description_content = "No description"
78
 
79
  # IMAGES
80
  unique_links = set()
81
  try:
82
- images = driver.find_elements(By.CSS_SELECTOR, ".msub_80.m9tr_5r._07951_IOf8s")
 
 
 
 
 
83
  allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]
84
- for img in images:
85
- src = img.get_attribute("src")
86
- if src and "allegroimg.com" in src:
87
- if not any(size in src for size in allowed_sizes):
88
- continue
89
- for size in allowed_sizes:
90
- src = src.replace(size, "/original/")
91
- unique_links.add(src)
 
 
 
 
 
 
 
 
 
92
  except Exception as e:
93
  print(f"Image error: {e}")
 
 
 
 
 
 
 
 
 
94
 
95
- return {
96
  "platform": "allegro",
97
  "url": url,
98
  "title": title_str,
@@ -100,6 +145,9 @@ def scrape_allegro_offer(url: str):
100
  "parameters": parameter_list,
101
  "image_urls": list(unique_links)
102
  }
 
 
 
103
 
104
  finally:
105
  driver.quit()
 
29
  """Zwraca dane aukcji bez zapisywania na dysk"""
30
  options = uc.ChromeOptions()
31
  options.add_argument("--window-position=-3000,0")
32
+ options.add_argument("--headless=new")
33
  options.add_argument("--no-sandbox")
34
  options.add_argument("--disable-dev-shm-usage")
35
+ options.add_argument("--disable-blink-features=AutomationControlled")
36
+ options.add_argument("--start-maximized")
37
+ options.add_argument("--disable-extensions")
38
+ # set a realistic user agent
39
+ options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/115.0.0.0 Safari/537.36")
40
 
41
  # Ustawienie binarki Chrome'a
42
  if os.path.exists('/usr/bin/google-chrome'):
 
51
  try:
52
  print(f"🔍 Allegro: {url}")
53
  driver.get(url)
54
+ # Wait for dynamic content (images / title) to load
55
+ from selenium.webdriver.support.ui import WebDriverWait
56
+ from selenium.webdriver.support import expected_conditions as EC
57
+ try:
58
+ WebDriverWait(driver, 15).until(
59
+ EC.presence_of_element_located((By.TAG_NAME, "img"))
60
+ )
61
+ except Exception:
62
+ # fallback short sleep
63
+ time.sleep(5)
64
 
65
  # TITLE
66
  try:
 
85
 
86
  # DESCRIPTION
87
  try:
88
+ # try a few common selectors
89
+ description_element = None
90
+ for sel in ["div._0d3bd_am0a-", "div[itemprop='description']", "div#description"]:
91
+ try:
92
+ description_element = driver.find_element(By.CSS_SELECTOR, sel)
93
+ if description_element:
94
+ break
95
+ except:
96
+ continue
97
+ description_content = description_element.text if description_element else "No description"
98
  except:
99
  description_content = "No description"
100
 
101
  # IMAGES
102
  unique_links = set()
103
  try:
104
+ # more resilient image selector: look for any img with allegro domains
105
+ imgs = driver.find_elements(By.XPATH, "//img[contains(@src,'allegroimg.com') or contains(@src,'allegrostatic')]")
106
+ # also try thumbnails / data-src
107
+ if not imgs:
108
+ imgs = driver.find_elements(By.TAG_NAME, "img")
109
+
110
  allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]
111
+ for img in imgs:
112
+ try:
113
+ src = img.get_attribute("src") or img.get_attribute("data-src") or img.get_attribute("data-lazy-src")
114
+ if not src:
115
+ # try srcset
116
+ srcset = img.get_attribute("srcset")
117
+ if srcset:
118
+ src = srcset.split()[0]
119
+ if src and ("allegroimg.com" in src or "allegrostatic" in src):
120
+ # normalize to original size when possible
121
+ for size in allowed_sizes:
122
+ if size in src:
123
+ src = src.replace(size, "/original/")
124
+ break
125
+ unique_links.add(src)
126
+ except Exception:
127
+ continue
128
  except Exception as e:
129
  print(f"Image error: {e}")
130
+
131
+ # If site served a captcha / anti-bot page, include a preview for debugging
132
+ blocked_preview = None
133
+ try:
134
+ page_src = driver.page_source
135
+ if "captcha-delivery" in page_src or "Please enable JS" in page_src or "ad blocker" in page_src.lower():
136
+ blocked_preview = page_src[:2000]
137
+ except Exception:
138
+ blocked_preview = None
139
 
140
+ result = {
141
  "platform": "allegro",
142
  "url": url,
143
  "title": title_str,
 
145
  "parameters": parameter_list,
146
  "image_urls": list(unique_links)
147
  }
148
+ if blocked_preview:
149
+ result["blocked_preview"] = blocked_preview
150
+ return result
151
 
152
  finally:
153
  driver.quit()