# Extracted from a hosted file view (Space status: Running).
# File size: 4,444 bytes · revision 422c1f3
from apify_client import ApifyClient
import os
import re
def sanitize_folder_name(text):
    """Convert arbitrary text into a lowercase, underscore-separated folder name.

    Polish diacritics are transliterated to their ASCII base letters, any
    other alphanumeric character is kept as-is, and every remaining character
    becomes an underscore. Runs of underscores are collapsed to one and
    leading/trailing underscores are removed.

    Args:
        text: The text to sanitize. ``None`` yields an empty string; other
            non-string values are converted with ``str()`` first.

    Returns:
        The sanitized name. May be empty when the input holds no
        alphanumeric characters.
    """
    if text is None:
        return ""

    polish_chars = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
    }

    def convert(char):
        # The transliteration table wins over isalnum(): Polish letters are
        # alphanumeric too and would otherwise be kept unchanged.
        mapped = polish_chars.get(char)
        if mapped is not None:
            return mapped
        return char if char.isalnum() else "_"

    joined = "".join(convert(char) for char in str(text).lower())
    # One regex pass collapses every run of underscores into a single one.
    return re.sub(r"_{2,}", "_", joined).strip("_")
# Matches either a number written with space-like thousands separators
# ("1 299,00", also with U+00A0 / U+202F) or a plain number with an optional
# decimal part of any length ("12,99", "12.5", "100").
_PRICE_PATTERN = re.compile(
    r"\d{1,3}(?:[ \u00a0\u202f]\d{3}(?!\d))+(?:[.,]\d+)?"
    r"|\d+(?:[.,]\d+)?"
)
# Characters removed from a matched number: space, no-break space and
# narrow no-break space.
_PRICE_GROUP_SEPARATORS = {0x0020: None, 0x00A0: None, 0x202F: None}


def extract_price(price_str):
    """Extract the first numeric price from a value as a dot-decimal string.

    Handles a decimal comma or dot with any number of fraction digits and
    thousands groups separated by a space, no-break space or narrow no-break
    space (e.g. ``"1 299,00 zł"`` becomes ``"1299.00"``).

    Args:
        price_str: The raw price; any value is accepted and converted with
            ``str()`` before matching.

    Returns:
        ``None`` when the input is falsy, the normalized number as a string
        when one is found, otherwise the input value unchanged.
    """
    if not price_str:
        return None

    match = _PRICE_PATTERN.search(str(price_str))
    if match is None:
        # No digits at all: hand the original value back to the caller.
        return price_str

    number = match.group(0).translate(_PRICE_GROUP_SEPARATORS)
    return number.replace(",", ".")
def extract_images_from_apify(item_data):
    """Collect de-duplicated, full-size image URLs from one scraped item.

    Candidates are read from the ``images`` (list or single value), ``image``
    and ``imageUrl`` keys, in that order. Only string values containing
    ``"img"`` are kept, and known size segments such as ``/s512/`` are
    rewritten to ``/original/``.

    Args:
        item_data: Mapping for a single item, as returned by the scraper.

    Returns:
        A list of unique URLs in first-seen order, so the first image the
        item lists stays first.
    """
    size_segments = ("/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/")

    candidates = []
    images = item_data.get("images")
    if images:
        if isinstance(images, list):
            candidates.extend(images)
        else:
            candidates.append(images)
    for key in ("image", "imageUrl"):
        value = item_data.get(key)
        if value:
            candidates.append(value)

    # A dict keeps insertion order, giving order-preserving de-duplication
    # (a set would return the URLs in arbitrary order).
    normalized = {}
    for candidate in candidates:
        # "img" also covers every "allegroimg.com" URL.
        if not isinstance(candidate, str) or "img" not in candidate:
            continue
        for segment in size_segments:
            candidate = candidate.replace(segment, "/original/")
        normalized[candidate] = None
    return list(normalized)
def _allegro_result(url, title, description, price=None, image_urls=None):
    """Build the result dict shape shared by every return path of the scraper."""
    return {
        "platform": "allegro",
        "url": url,
        "title": title,
        "description": description,
        "price": price,
        "image_urls": image_urls if image_urls is not None else [],
    }


def scrape_allegro_offer(url: str):
    """Scrape a single Allegro offer through an Apify actor.

    Runs the ``e-commerce/allegro-product-detail-scraper`` actor for ``url``
    and maps the first item of its default dataset to a flat dict.

    Args:
        url: URL of the Allegro offer to scrape.

    Returns:
        A dict with the keys ``platform``, ``url``, ``title``,
        ``description``, ``price`` and ``image_urls``. When the dataset is
        empty the title is ``"untitled"``; when the run fails the title is
        ``"error"`` and the description holds the error message.

    Raises:
        ValueError: If the ``APIFY_API_TOKEN`` environment variable is not set.
    """
    api_token = os.getenv('APIFY_API_TOKEN')
    if not api_token:
        raise ValueError("APIFY_API_TOKEN environment variable not set")

    client = ApifyClient(api_token)
    # Correct input format for E-commerce Scraping Tool
    run_input = {
        "startUrls": [
            url
        ]
    }
    print(f"🔍 Scraping: {url}")
    try:
        actor_call = client.actor("e-commerce/allegro-product-detail-scraper").call(
            run_input=run_input
        )
        if not actor_call:
            # Without a run object there is no dataset id to read from.
            raise RuntimeError("Apify actor run returned no run details")

        dataset_client = client.dataset(actor_call['defaultDatasetId'])
        # Only the first item is used, so stop iterating after it.
        item = next(iter(dataset_client.iterate_items()), None)
        if item is None:
            print("⚠️ No data returned from Apify")
            return _allegro_result(url, "untitled", "No description")

        # `or` fallbacks also cover keys that are present but null, which
        # dict.get(key, default) would pass through as None.
        title = (item.get('productTitle') or 'untitled').strip()
        print(f"✅ Success! Found: {title}")

        image_urls = extract_images_from_apify(item)
        if not image_urls:
            thumbnail = item.get("thumbnail")
            if thumbnail:
                image_urls = [thumbnail]

        return _allegro_result(
            item.get('url') or url,
            title,
            item.get('description') or 'No description',
            price=extract_price(item.get('price') or item.get('currentPrice')),
            image_urls=image_urls,
        )
    except Exception as e:
        # Best-effort boundary: report the failure in the result instead of
        # raising to the caller.
        print(f"❌ Error: {e}")
        return _allegro_result(url, "error", str(e))
# Example usage: prompt for an offer URL, scrape it and print a short summary.
if __name__ == "__main__":
    # Pasted URLs often carry stray whitespace or a trailing newline.
    url = input("Allegro URL: ").strip()
    result = scrape_allegro_offer(url)

    # The description may be None; normalize it so the preview never raises.
    description = str(result['description'] or "")
    preview = f"{description[:100]}..." if len(description) > 100 else description

    print("\n✅ Scraping result:")
    print(f"Title: {result['title']}")
    print(f"Price: {result['price']}")
    print(f"Description: {preview}")
    print(f"Images: {len(result['image_urls'])} found")
    # Show at most the first three image URLs.
    for img in result['image_urls'][:3]:
        print(f" - {img}")
|