File size: 4,444 Bytes
422c1f3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
from apify_client import ApifyClient
import os
import re


def sanitize_folder_name(text):
    """Turn arbitrary text into a lowercase, underscore-separated folder name.

    The text is lowercased, Polish diacritics are mapped to their plain
    Latin letters, other alphanumeric characters are kept as they are and
    every remaining character becomes an underscore. Runs of underscores
    are collapsed to one and none are left at either end of the result.
    """
    transliteration = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
    }

    def convert(symbol):
        # Diacritics take precedence over the generic alphanumeric check.
        replacement = transliteration.get(symbol)
        if replacement is not None:
            return replacement
        return symbol if symbol.isalnum() else "_"

    mapped = "".join(convert(symbol) for symbol in text.lower())
    # Dropping the empty pieces between underscores both collapses repeated
    # underscores and trims leading/trailing ones.
    return "_".join(piece for piece in mapped.split("_") if piece)


def extract_price(price_str):
    """Extract a numeric price from various formats.

    The first number found in the input is returned as a string that uses
    "." as the decimal separator and contains no grouping characters, e.g.
    "1 299,00 zł" -> "1299.00" and "49,99 zł" -> "49.99".

    Args:
        price_str: Raw price value; anything that ``str()`` can render
            (string, int, float, ...).

    Returns:
        ``None`` when the input is falsy (``None``, ``""``, ``0``), the
        normalized number as a string when one is found, or the input
        unchanged when it contains no digits at all.
    """
    if not price_str:
        return None

    # Either a space-grouped integer ("1 299", also with non-breaking or
    # narrow non-breaking spaces) or a plain digit run, followed by any
    # number of "."/"," separated digit groups.
    match = re.search(
        r'(?:\d{1,3}(?:[ \u00a0\u202f]\d{3})+(?!\d)|\d+)(?:[.,]\d+)*',
        str(price_str),
    )
    if not match:
        return price_str

    number = re.sub(r'[ \u00a0\u202f]', '', match.group(0))
    separators = [char for char in number if char in '.,']
    if not separators:
        return number

    decimal_mark = separators[-1]
    if separators.count(decimal_mark) > 1:
        # A decimal mark cannot occur twice, so every separator here is a
        # thousands separator ("1.234.567").
        return number.replace('.', '').replace(',', '')

    # The last separator is the decimal mark; anything before it is
    # grouping ("1.299,00" and "1,299.00" both become "1299.00"). A lone
    # separator is always read as the decimal mark.
    whole, _, fraction = number.rpartition(decimal_mark)
    whole = whole.replace('.', '').replace(',', '')
    return f"{whole}.{fraction}"


def extract_images_from_apify(item_data):
    """Extract and normalize image URLs from an Apify response item.

    Candidates are read from the ``images`` key (a list, or a single value)
    followed by the ``image`` and ``imageUrl`` keys. Only string values that
    contain the substring "img" are kept (this also covers
    "allegroimg.com" hosts). Known size segments such as ``/s512/`` are
    rewritten to ``/original/`` so that differently sized variants of the
    same picture collapse into one entry.

    Args:
        item_data: Dict-like dataset item.

    Returns:
        A list of unique URLs in the order they were first encountered.
    """
    size_segments = ("/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/")

    candidates = []
    images = item_data.get('images')
    if images:
        if isinstance(images, list):
            candidates.extend(images)
        else:
            candidates.append(images)
    for key in ('image', 'imageUrl'):
        value = item_data.get(key)
        if value:
            candidates.append(value)

    # A dict is used as an ordered set: a plain set would lose the source
    # order and make the result vary between runs.
    normalized = {}
    for candidate in candidates:
        if not isinstance(candidate, str) or "img" not in candidate:
            continue
        for segment in size_segments:
            candidate = candidate.replace(segment, "/original/")
        normalized[candidate] = None

    return list(normalized)


def scrape_allegro_offer(url: str):
    """Scrape a single Allegro product through an Apify actor.

    Runs the ``e-commerce/allegro-product-detail-scraper`` actor for ``url``
    and converts the first item of the run's default dataset into a plain
    result dict.

    Args:
        url: Offer URL passed to the actor as its only start URL.

    Returns:
        A dict with the keys ``platform``, ``url``, ``title``,
        ``description``, ``price`` and ``image_urls``. When the dataset is
        empty, ``title`` is "untitled". When the actor call or the
        processing raises, ``title`` is "error" and ``description`` holds
        the exception text.

    Raises:
        ValueError: If the ``APIFY_API_TOKEN`` environment variable is not
            set.
    """
    api_token = os.getenv('APIFY_API_TOKEN')
    if not api_token:
        raise ValueError("APIFY_API_TOKEN environment variable not set")

    client = ApifyClient(api_token)

    # Input format for the actor; NOTE(review): some Apify actors expect
    # start URLs as {"url": ...} objects - confirm against the actor's schema.
    run_input = {
        "startUrls": [
            url
        ]
    }

    print(f"🔍 Scraping: {url}")

    try:
        actor_call = client.actor("e-commerce/allegro-product-detail-scraper").call(
            run_input=run_input
        )
        dataset_client = client.dataset(actor_call['defaultDatasetId'])
        items = list(dataset_client.iterate_items())

        if not items:
            print("⚠️  No data returned from Apify")
            return {
                "platform": "allegro",
                "url": url,
                "title": "untitled",
                "description": "No description",
                "price": None,
                "image_urls": []
            }

        item = items[0]

        # A key that is present with a None/empty value must be treated like
        # a missing key; dict.get's default only covers the latter.
        title = str(item.get('productTitle') or 'untitled').strip()
        print(f"✅ Success! Found: {title}")

        image_urls = extract_images_from_apify(item)

        if not image_urls:
            thumbnail = item.get("thumbnail")
            if thumbnail:
                image_urls = [thumbnail]

        raw_price = item.get('price')
        if raw_price is None:
            raw_price = item.get('currentPrice')

        return {
            "platform": "allegro",
            "url": item.get('url') or url,
            "title": title,
            "description": item.get('description') or 'No description',
            "price": extract_price(raw_price),
            "image_urls": image_urls
        }

    except Exception as e:
        # Deliberate best-effort boundary: any failure of the actor run or
        # of the result processing is reported as an "error" result.
        print(f"❌ Error: {e}")
        return {
            "platform": "allegro",
            "url": url,
            "title": "error",
            "description": str(e),
            "price": None,
            "image_urls": []
        }


# Example usage: prompt for an offer URL, scrape it and print a short summary.
if __name__ == "__main__":
    # Pasted URLs often carry stray whitespace or a trailing newline.
    url = input("Allegro URL: ").strip()
    result = scrape_allegro_offer(url)

    # The description is passed through from the scraped item, so it may be
    # None or a non-string value; normalize it before measuring/slicing.
    description = result['description']
    description = "" if description is None else str(description)

    print("\n✅ Scraping result:")
    print(f"Title: {result['title']}")
    print(f"Price: {result['price']}")
    if len(description) > 100:
        print(f"Description: {description[:100]}...")
    else:
        print(f"Description: {description}")
    print(f"Images: {len(result['image_urls'])} found")
    # Only the first three image URLs are listed.
    for img in result['image_urls'][:3]:
        print(f"  - {img}")