Spaces:
Running
Running
Modified scraper settings
Browse files
- code/app.py +2 -2
- code/web_scraper_allegro.py +7 -3
code/app.py
CHANGED
|
@@ -278,7 +278,7 @@ async def validate_url(
|
|
| 278 |
|
| 279 |
|
| 280 |
@app.post("/debug_scrape")
|
| 281 |
-
async def debug_scrape(url: str = Form(...)):
|
| 282 |
"""Run scraper for a URL and return the raw auction dict and a small HTML preview.
|
| 283 |
This endpoint is for debugging only."""
|
| 284 |
try:
|
|
@@ -286,7 +286,7 @@ async def debug_scrape(url: str = Form(...)):
|
|
| 286 |
# Choose scraper
|
| 287 |
if "allegro.pl" in url:
|
| 288 |
from web_scraper_allegro import scrape_allegro_offer
|
| 289 |
-
auction = scrape_allegro_offer(url)
|
| 290 |
elif "olx.pl" in url:
|
| 291 |
from web_scraper_olx import scrape_olx_offer
|
| 292 |
auction = scrape_olx_offer(url)
|
|
|
|
| 278 |
|
| 279 |
|
| 280 |
@app.post("/debug_scrape")
|
| 281 |
+
async def debug_scrape(url: str = Form(...), headless: bool = Form(True)):
|
| 282 |
"""Run scraper for a URL and return the raw auction dict and a small HTML preview.
|
| 283 |
This endpoint is for debugging only."""
|
| 284 |
try:
|
|
|
|
| 286 |
# Choose scraper
|
| 287 |
if "allegro.pl" in url:
|
| 288 |
from web_scraper_allegro import scrape_allegro_offer
|
| 289 |
+
auction = scrape_allegro_offer(url, headless=headless)
|
| 290 |
elif "olx.pl" in url:
|
| 291 |
from web_scraper_olx import scrape_olx_offer
|
| 292 |
auction = scrape_olx_offer(url)
|
code/web_scraper_allegro.py
CHANGED
|
@@ -25,11 +25,15 @@ def sanitize_folder_name(text): # helper function
|
|
| 25 |
result = result.replace("__", "_")
|
| 26 |
return result.strip("_")
|
| 27 |
|
| 28 |
-
def scrape_allegro_offer(url: str):
|
| 29 |
-
"""Zwraca dane aukcji bez zapisywania na dysk
|
|
|
|
|
|
|
|
|
|
| 30 |
options = uc.ChromeOptions()
|
| 31 |
options.add_argument("--window-position=-3000,0")
|
| 32 |
-
|
|
|
|
| 33 |
options.add_argument("--no-sandbox")
|
| 34 |
options.add_argument("--disable-dev-shm-usage")
|
| 35 |
options.add_argument("--disable-blink-features=AutomationControlled")
|
|
|
|
| 25 |
result = result.replace("__", "_")
|
| 26 |
return result.strip("_")
|
| 27 |
|
| 28 |
+
def scrape_allegro_offer(url: str, headless: bool = True):
|
| 29 |
+
"""Zwraca dane aukcji bez zapisywania na dysk
|
| 30 |
+
|
| 31 |
+
headless: jeśli False, uruchom przeglądarkę w trybie widocznym (przydatne do ręcznego rozwiązania CAPTCHA).
|
| 32 |
+
"""
|
| 33 |
options = uc.ChromeOptions()
|
| 34 |
options.add_argument("--window-position=-3000,0")
|
| 35 |
+
if headless:
|
| 36 |
+
options.add_argument("--headless=new")
|
| 37 |
options.add_argument("--no-sandbox")
|
| 38 |
options.add_argument("--disable-dev-shm-usage")
|
| 39 |
options.add_argument("--disable-blink-features=AutomationControlled")
|