import json
import re

import requests
from bs4 import BeautifulSoup

# Browser-like headers sent with every request.
_REQUEST_HEADERS = {
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/120.0.0.0 Safari/537.36"
    ),
    "Accept-Language": "pl-PL,pl;q=0.9",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
    "Referer": "https://allegro.pl/",
    "DNT": "1",
}

# Matches a size segment such as "/s128/" inside an image URL.
_SIZE_SEGMENT_RE = re.compile(r"/s\d+/")
# Fallback matcher for a <div> whose class contains "description".
_DESCRIPTION_CLASS_RE = re.compile("description")


def _best_effort(label, fallback, extract, *args):
    """Return ``extract(*args)``; on any error print it and return *fallback*.

    Scraping is deliberately best-effort: one failing section must not
    discard the sections that were extracted successfully.
    """
    try:
        return extract(*args)
    except Exception as exc:  # noqa: BLE001 - intentional catch-all, but logged
        print(f"{label}: {exc}")
        return fallback


def _extract_title(soup) -> str:
    """Return the text of the first <h1>, or "untitled" when there is none."""
    heading = soup.find("h1")
    if heading is None:
        return "untitled"
    return heading.get_text(strip=True)


def _image_field_urls(value) -> list:
    """Flatten a JSON-LD ``image`` value into a list of URL strings.

    Accepts a single string, a single object carrying ``url`` or
    ``contentUrl``, or a list mixing both. Anything else is skipped.
    """
    items = value if isinstance(value, list) else [value]
    urls = []
    for item in items:
        if isinstance(item, dict):
            item = item.get("url") or item.get("contentUrl")
        if isinstance(item, str) and item:
            urls.append(item)
    return urls


def _jsonld_image_urls(soup) -> list:
    """Collect image URLs from ``<script type="application/ld+json">`` tags."""
    urls = []
    for script in soup.find_all("script", type="application/ld+json"):
        raw = script.string
        if not raw:
            # .string is None for an empty tag or one with several children.
            continue
        try:
            payload = json.loads(raw)
        except ValueError:
            # Malformed JSON in one tag must not stop the others.
            continue
        if isinstance(payload, list):
            # Only the first entry of a top-level list is inspected.
            payload = payload[0] if payload else None
        if isinstance(payload, dict):
            urls.extend(_image_field_urls(payload.get("image")))
    return urls


def _img_tag_urls(soup) -> list:
    """Collect allegroimg.com URLs from <img> tags, rewritten to /original/."""
    urls = []
    for img in soup.find_all("img"):
        src = img.get("src") or img.get("data-src")
        if isinstance(src, str) and "allegroimg.com" in src:
            # Replace the size segment so the URL points at the original.
            urls.append(_SIZE_SEGMENT_RE.sub("/original/", src))
    return urls


def _extract_parameters(soup) -> list:
    """Return "name: value" strings for every two-cell table row."""
    params = []
    for row in soup.find_all("tr"):
        cells = row.find_all("td")
        if len(cells) != 2:
            continue
        name = cells[0].get_text(strip=True)
        value = cells[1].get_text(strip=True)
        if name and value:
            params.append(f"{name}: {value}")
    return params


def _extract_description(soup, limit: int) -> str:
    """Return the description text truncated to *limit* characters.

    Looks for ``<div itemprop="description">`` first and, when that is
    absent, for any <div> whose class matches "description".
    """
    node = soup.find("div", {"itemprop": "description"})
    if node is None:
        node = soup.find("div", class_=_DESCRIPTION_CLASS_RE)
    if node is None:
        return "No description"
    return node.get_text(strip=True)[:limit]


def scrape_allegro_with_bs4(
    url: str, *, timeout: float = 10, description_limit: int = 500
) -> dict:
    """Scrape an Allegro offer page with requests + BeautifulSoup.

    No Selenium/browser is used; the original note says this is what
    lets it run on HF Spaces.

    Args:
        url: Address of the page to fetch.
        timeout: Timeout in seconds passed to ``requests.get``.
        description_limit: Maximum number of description characters kept.

    Returns:
        On success, a dict with the keys ``platform``, ``url``, ``title``,
        ``description``, ``parameters`` (list of "name: value" strings) and
        ``image_urls`` (de-duplicated, in document order).
        When the HTTP request fails, a dict with ``status`` set to
        "error", plus ``error``, ``platform`` and ``url``.

    Each section is extracted independently; a failure in one is printed
    and replaced by its default instead of aborting the whole scrape.
    """
    try:
        print(f"🔍 Scraping: {url}")
        response = requests.get(url, headers=_REQUEST_HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        return {
            "status": "error",
            "error": f"Request failed: {e}",
            "platform": "allegro",
            "url": url,
        }

    soup = BeautifulSoup(response.content, "html.parser")

    title = _best_effort("Błąd tytułu", "untitled", _extract_title, soup)

    # Images: JSON-LD metadata first, then <img> tags hosted on allegroimg.
    found_images = _best_effort("Błąd zdjęć", [], _jsonld_image_urls, soup)
    found_images = found_images + _best_effort(
        "Błąd zdjęć", [], _img_tag_urls, soup
    )
    # dict.fromkeys de-duplicates while keeping first-seen order.
    image_urls = list(dict.fromkeys(found_images))

    # Parameters are read from two-cell table rows.
    params = _best_effort("Błąd parametrów", [], _extract_parameters, soup)

    description = _best_effort(
        "Błąd opisu",
        "No description",
        _extract_description,
        soup,
        description_limit,
    )

    return {
        "platform": "allegro",
        "url": url,
        "title": title,
        "description": description,
        "parameters": params,
        "image_urls": image_urls,
    }


if __name__ == "__main__":
    url = input("Allegro URL: ")
    data = scrape_allegro_with_bs4(url)
    print(data)