hatamo committed on
Commit
761b1f2
·
1 Parent(s): 86e2ad3

Modified scraper settings

Browse files
Files changed (2) hide show
  1. code/app.py +2 -2
  2. code/web_scraper_allegro.py +7 -3
code/app.py CHANGED
@@ -278,7 +278,7 @@ async def validate_url(
278
 
279
 
280
  @app.post("/debug_scrape")
281
- async def debug_scrape(url: str = Form(...)):
282
  """Run scraper for a URL and return the raw auction dict and a small HTML preview.
283
  This endpoint is for debugging only."""
284
  try:
@@ -286,7 +286,7 @@ async def debug_scrape(url: str = Form(...)):
286
  # Choose scraper
287
  if "allegro.pl" in url:
288
  from web_scraper_allegro import scrape_allegro_offer
289
- auction = scrape_allegro_offer(url)
290
  elif "olx.pl" in url:
291
  from web_scraper_olx import scrape_olx_offer
292
  auction = scrape_olx_offer(url)
 
278
 
279
 
280
  @app.post("/debug_scrape")
281
+ async def debug_scrape(url: str = Form(...), headless: bool = Form(True)):
282
  """Run scraper for a URL and return the raw auction dict and a small HTML preview.
283
  This endpoint is for debugging only."""
284
  try:
 
286
  # Choose scraper
287
  if "allegro.pl" in url:
288
  from web_scraper_allegro import scrape_allegro_offer
289
+ auction = scrape_allegro_offer(url, headless=headless)
290
  elif "olx.pl" in url:
291
  from web_scraper_olx import scrape_olx_offer
292
  auction = scrape_olx_offer(url)
code/web_scraper_allegro.py CHANGED
@@ -25,11 +25,15 @@ def sanitize_folder_name(text): # helper function
25
  result = result.replace("__", "_")
26
  return result.strip("_")
27
 
28
- def scrape_allegro_offer(url: str):
29
- """Zwraca dane aukcji bez zapisywania na dysk"""
 
 
 
30
  options = uc.ChromeOptions()
31
  options.add_argument("--window-position=-3000,0")
32
- options.add_argument("--headless=new")
 
33
  options.add_argument("--no-sandbox")
34
  options.add_argument("--disable-dev-shm-usage")
35
  options.add_argument("--disable-blink-features=AutomationControlled")
 
25
  result = result.replace("__", "_")
26
  return result.strip("_")
27
 
28
+ def scrape_allegro_offer(url: str, headless: bool = True):
29
+ """Zwraca dane aukcji bez zapisywania na dysk
30
+
31
+ headless: jeśli False, uruchom przeglądarkę w trybie widocznym (przydatne do ręcznego rozwiązania CAPTCHA).
32
+ """
33
  options = uc.ChromeOptions()
34
  options.add_argument("--window-position=-3000,0")
35
+ if headless:
36
+ options.add_argument("--headless=new")
37
  options.add_argument("--no-sandbox")
38
  options.add_argument("--disable-dev-shm-usage")
39
  options.add_argument("--disable-blink-features=AutomationControlled")