Antique_Auth_API / code /web_scraper_allegro.py
hatamo's picture
Modified scraper for Allegro
948dcae
import json
import re

import requests
from bs4 import BeautifulSoup
# Browser-like headers sent with every listing request.
_ALLEGRO_HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
    "Accept-Language": "pl-PL,pl;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://allegro.pl/",
    "DNT": "1",
}
_REQUEST_TIMEOUT_S = 10
_DESCRIPTION_LIMIT = 500
# A "/s<digits>/" path segment in an image URL; rewritten to "/original/".
_SIZE_SEGMENT_RE = re.compile(r"/s\d+/")


def _jsonld_image_urls(soup):
    """Return image URLs declared in the page's JSON-LD ``<script>`` tags."""
    urls = []
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # Nothing to parse (``.string`` is None or empty for this tag).
            continue
        try:
            data = json.loads(raw)
        except ValueError:
            # Malformed JSON-LD: skip this tag but keep processing the others.
            continue
        # A top-level array is reduced to its first entry; an empty array
        # stays a list and is rejected by the dict check below.
        node = data[0] if isinstance(data, list) and data else data
        if not isinstance(node, dict):
            continue
        images = node.get("image")
        if images is None:
            continue
        if not isinstance(images, list):
            images = [images]
        for image in images:
            if isinstance(image, dict):
                # Image given as an object rather than a plain URL string.
                image = image.get("url") or image.get("contentUrl")
            if isinstance(image, str) and image:
                urls.append(image)
    return urls


def _gallery_image_urls(soup):
    """Return allegroimg.com URLs from ``<img>`` tags, normalized to original size."""
    urls = []
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src")
        if src and "allegroimg.com" in src:
            urls.append(_SIZE_SEGMENT_RE.sub("/original/", src))
    return urls


def _table_parameters(soup):
    """Return ``"name: value"`` strings for every two-cell table row with both cells non-empty."""
    params = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) != 2:
            continue
        name = cells[0].get_text(strip=True)
        value = cells[1].get_text(strip=True)
        if name and value:
            params.append(f"{name}: {value}")
    return params


def _description_text(soup):
    """Return the listing description (truncated), or ``"No description"``."""
    node = soup.find("div", {"itemprop": "description"})
    if node is None:
        # Fallback: any div whose class contains "description". This must run
        # when the first lookup finds nothing, not only when it raises.
        node = soup.find("div", class_=re.compile("description"))
    if node is None:
        return "No description"
    return node.get_text(strip=True)[:_DESCRIPTION_LIMIT]


def scrape_allegro_with_bs4(url: str) -> dict:
    """Scrape an Allegro listing with BeautifulSoup (no Selenium; works on HF Spaces).

    Args:
        url: Address of the listing page to download.

    Returns:
        On success, a dict with the keys ``platform``, ``url``, ``title``,
        ``description``, ``parameters`` (list of ``"name: value"`` strings)
        and ``image_urls`` (deduplicated, in first-seen order).
        If the HTTP request fails, a dict with ``status`` set to ``"error"``,
        an ``error`` message, ``platform`` and ``url``.
    """
    try:
        print(f"🔍 Scraping: {url}")
        response = requests.get(
            url, headers=_ALLEGRO_HEADERS, timeout=_REQUEST_TIMEOUT_S
        )
        response.raise_for_status()
        html = response.content
    except requests.exceptions.RequestException as e:
        return {
            "status": "error",
            "error": f"Request failed: {str(e)}",
            "platform": "allegro",
            "url": url,
        }

    soup = BeautifulSoup(html, "html.parser")

    # TITLE
    title = "untitled"
    title_tag = soup.find("h1")
    if title_tag is not None:
        title = title_tag.get_text(strip=True)

    # IMAGES (look in JSON-LD first, then in <img> tags)
    found_images = []
    try:
        found_images.extend(_jsonld_image_urls(soup))
        found_images.extend(_gallery_image_urls(soup))
    except Exception as e:
        # Deliberately best-effort: an image problem is reported but must not
        # discard the rest of the scraped listing.
        print(f"Błąd zdjęć: {e}")
    # dict.fromkeys removes duplicates while keeping a stable order.
    image_urls = list(dict.fromkeys(found_images))

    return {
        "platform": "allegro",
        "url": url,
        "title": title,
        "description": _description_text(soup),
        # PARAMETERS (usually rendered as a table)
        "parameters": _table_parameters(soup),
        "image_urls": image_urls,
    }
if __name__ == "__main__":
    # Interactive entry point: prompt for a listing URL, scrape it, and print
    # the resulting dict.
    print(scrape_allegro_with_bs4(input("Allegro URL: ")))