hatamo commited on
Commit
56e046e
·
1 Parent(s): 0e2ce87

Added new version of scrapers

Browse files
code/web_scraper_allegro.py CHANGED
@@ -1,16 +1,33 @@
1
- from apify_client import ApifyClient
2
  import os
3
- import re
 
 
 
 
 
 
 
4
 
 
5
 
6
- def sanitize_folder_name(text):
7
- """Helper function to sanitize folder names"""
 
8
  polish_chars = {
9
- "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
10
- "ó": "o", "ś": "s", "ź": "z", "ż": "z"
 
 
 
 
 
 
 
11
  }
 
12
  text = text.lower()
13
  result = ""
 
14
  for char in text:
15
  if char in polish_chars:
16
  result += polish_chars[char]
@@ -18,125 +35,117 @@ def sanitize_folder_name(text):
18
  result += char
19
  else:
20
  result += "_"
 
 
21
  while "__" in result:
22
  result = result.replace("__", "_")
23
- return result.strip("_")
24
 
 
25
 
26
- def extract_price(price_str):
27
- """Extract numeric price from various formats"""
28
- if not price_str:
29
- return None
30
- match = re.search(r'(\d+[.,]\d{2}|\d+)', str(price_str))
31
- if match:
32
- return match.group(1).replace(',', '.')
33
- return price_str
34
-
35
 
36
- def extract_images_from_apify(item_data):
37
- """Extract and normalize image URLs from Apify response"""
38
- unique_links = set()
39
- allowed_sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/", "/original/"]
 
 
 
40
 
41
- image_sources = []
42
- if 'images' in item_data and item_data['images']:
43
- if isinstance(item_data['images'], list):
44
- image_sources.extend(item_data['images'])
45
- else:
46
- image_sources.append(item_data['images'])
47
-
48
- if 'image' in item_data and item_data['image']:
49
- image_sources.append(item_data['image'])
50
-
51
- if 'imageUrl' in item_data and item_data['imageUrl']:
52
- image_sources.append(item_data['imageUrl'])
53
-
54
- for img_url in image_sources:
55
- if img_url and isinstance(img_url, str):
56
- if "allegroimg.com" in img_url or "img" in img_url:
57
- for size in allowed_sizes:
58
- img_url = img_url.replace(size, "/original/")
59
- unique_links.add(img_url)
60
 
61
- return list(unique_links)
 
 
62
 
63
-
64
- def scrape_allegro_offer(url: str):
65
- """Scrape single Allegro product using Apify E-commerce Tool"""
66
-
67
- api_token = os.getenv('APIFY_API_TOKEN')
68
- if not api_token:
69
- raise ValueError("APIFY_API_TOKEN environment variable not set")
70
-
71
- client = ApifyClient(api_token)
72
-
73
- # Correct input format for E-commerce Scraping Tool
74
- run_input = {
75
- "startUrls": [
76
- url
77
- ]
78
- }
79
 
80
- print(f"🔍 Scraping: {url}")
 
 
 
 
81
 
 
 
82
  try:
83
- actor_call = client.actor("e-commerce/allegro-product-detail-scraper").call(
84
- run_input=run_input
85
- )
86
- dataset_client = client.dataset(actor_call['defaultDatasetId'])
87
- items = list(dataset_client.iterate_items())
88
 
89
- if not items:
90
- print("⚠️ No data returned from Apify")
91
- return {
92
- "platform": "allegro",
93
- "url": url,
94
- "title": "untitled",
95
- "description": "No description",
96
- "price": None,
97
- "image_urls": []
98
- }
99
 
100
- item = items[0]
101
- print(f"✅ Success! Found: {item.get('productTitle', 'untitled')}")
 
 
 
 
 
102
 
103
- image_urls = extract_images_from_apify(item)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- if not image_urls:
106
- thumbnail = item.get("thumbnail")
107
- if thumbnail:
108
- image_urls = [thumbnail]
109
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
110
  return {
111
- "platform": "allegro",
112
- "url": item.get('url', url),
113
- "title": item.get('productTitle', 'untitled').strip(),
114
- "description": item.get('description', 'No description'),
115
- "price": extract_price(item.get('price', item.get('currentPrice'))),
116
- "image_urls": image_urls
117
- }
118
-
 
 
119
  except Exception as e:
120
- print(f" Error: {e}")
121
- return {
122
- "platform": "allegro",
123
- "url": url,
124
- "title": "error",
125
- "description": str(e),
126
- "price": None,
127
- "image_urls": []
128
- }
129
-
130
-
131
- # Example usage
132
- if __name__ == "__main__":
133
- url = input("Allegro URL: ")
134
- result = scrape_allegro_offer(url)
135
-
136
- print("\n✅ Scraping result:")
137
- print(f"Title: {result['title']}")
138
- print(f"Price: {result['price']}")
139
- print(f"Description: {result['description'][:100]}..." if len(result['description']) > 100 else f"Description: {result['description']}")
140
- print(f"Images: {len(result['image_urls'])} found")
141
- for img in result['image_urls'][:3]:
142
- print(f" - {img}")
 
 
1
  import os
2
+ import requests
3
+ from apify_client import ApifyClient
4
+ from dotenv import load_dotenv
5
+ import json
6
+
7
+ # --- CONFIGURATION ---
8
+ # Load environment variables from the .env file (if it exists)
9
+ load_dotenv()
10
 
11
+ ACTOR_ID = "e-commerce/allegro-product-detail-scraper"
12
 
13
+ # --- HELPER FUNCTIONS ---
14
def sanitize_name(text):
    """Sanitize *text* into a safe folder name.

    Lowercases the input, transliterates Polish diacritics to ASCII,
    replaces every non-alphanumeric character with an underscore,
    collapses runs of underscores, and strips leading/trailing ones.
    """
    translit = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
    }

    pieces = []
    for ch in text.lower():
        if ch in translit:
            pieces.append(translit[ch])
        elif ch.isalnum():
            pieces.append(ch)
        else:
            pieces.append("_")
    sanitized = "".join(pieces)

    # Collapse consecutive underscores into a single one.
    while "__" in sanitized:
        sanitized = sanitized.replace("__", "_")

    return sanitized.strip("_")
44
 
45
def get_high_res_image(url):
    """Convert a resized Allegro image link to its /original/ variant.

    Args:
        url: Image URL string, or a falsy value.

    Returns:
        The URL with its size segment replaced by "/original/", the URL
        unchanged when no known size segment is present, or None for
        falsy input.
    """
    if not url:
        return None
    # "/s512/" restored — the previous scraper version normalized it too,
    # and dropping it left those thumbnails un-upgraded.
    sizes = ["/s128/", "/s360/", "/s512/", "/s720/", "/s1024/", "/s1440/"]
    for size in sizes:
        if size in url:
            return url.replace(size, "/original/")
    return url
 
53
 
54
def get_api_token():
    """Retrieve the Apify API token.

    Checks the APIFY_TOKEN environment variable first (populated from
    the .env file by load_dotenv); if it is missing, prompts on the
    console instead.
    """
    env_token = os.getenv("APIFY_TOKEN")
    if env_token:
        print("Info: API Token loaded from .env file.")
        return env_token

    print("Warning: APIFY_TOKEN not found in .env file.")
    return input("Please enter your Apify API Token: ").strip()
69
 
70
def get_allegro_data(url):
    """Scrape a single Allegro offer through the Apify actor.

    Args:
        url: Full Allegro offer URL.

    Returns:
        A dict with title, sanitized_title, url, description, parameters,
        image_urls, image_count and price — or None when the token is
        missing, the actor returns no data, or an error occurs.
    """
    apify_token = get_api_token()

    if not apify_token:
        print("ERROR: API Token is required to run the script.")
        return

    client = ApifyClient(apify_token)

    run_input = {"startUrls": [url]}

    try:
        print("--- GATHERING DATA ---")

        run = client.actor(ACTOR_ID).call(run_input=run_input)

        dataset_items = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if not dataset_items:
            print("Apify finished the job but returned no data.")
            return

        item = dataset_items[0]

        # --- DATA MAPPING ---

        # TITLE
        title = item.get("productTitle") or item.get("title") or "untitled"

        # DESCRIPTION
        description = item.get("description", "No description")

        # PARAMETERS: prefer the dict-shaped specifications. The default
        # {} is itself a dict, so the old `isinstance` check always won
        # and the parameters/attributes fallback was dead code — require
        # a NON-empty dict before using it.
        parameter_list = []
        specs = item.get("productSpecifications", {})

        if isinstance(specs, dict) and specs:
            for key, value in specs.items():
                parameter_list.append(f"{key}: {value}")
        elif not specs:
            raw_params = item.get("parameters") or item.get("attributes", [])
            for p in raw_params:
                name = p.get("name") or p.get("key")
                val = p.get("value")
                if name and val:
                    parameter_list.append(f"{name}: {val}")

        # IMAGES: normalize every entry to high-res and drop entries
        # without a usable URL (dict entries may lack "url", which would
        # otherwise put None into the set and inflate image_count).
        unique_links = set()

        raw_images = item.get("images") or []
        for img in raw_images:
            if isinstance(img, str):
                unique_links.add(get_high_res_image(img))
            elif isinstance(img, dict):
                unique_links.add(get_high_res_image(img.get("url")))
        unique_links.discard(None)

        if not unique_links:
            thumb = item.get("thumbnail")
            if thumb:
                unique_links.add(get_high_res_image(thumb))
                print("Info: Retrieved main image from thumbnail (gallery was empty in API).")

        print(f"Found {len(unique_links)} images.")

        # PRICE: avoid emitting the literal string "None None" when the
        # actor returned no price data.
        price = item.get("price")
        price_str = f"{price} {item.get('currency')}" if price is not None else None

        return {
            "title": title,
            "sanitized_title": sanitize_name(title),
            "url": url,
            "description": description,
            "parameters": parameter_list,
            "image_urls": list(unique_links),
            "image_count": len(unique_links),
            "price": price_str,
        }

    except Exception as e:
        # Best-effort scraper: report the failure and return None
        # rather than crash the caller.
        print(f"Main error occurred: {e}")
147
+
148
# --- USAGE ---
if __name__ == "__main__":
    # Guarded so importing this module does not trigger the console
    # prompt and a live Apify run (the previous script version had
    # this guard; it was lost in the rewrite).
    link = input("Enter the Allegro offer link: ")
    data = get_allegro_data(link)
    print(json.dumps(data, indent=4, ensure_ascii=False))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
code/web_scraper_ebay.py CHANGED
@@ -1,93 +1,153 @@
1
- # scrape_ebay_offer.py
2
- import undetected_chromedriver as uc
3
- from selenium.webdriver.common.by import By
4
- from webdriver_manager.chrome import ChromeDriverManager
5
- from selenium.webdriver.chrome.service import Service
6
- import time
7
- import requests
8
  import os
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
 
10
- def scrape_ebay_offer(url: str):
11
- """Zwraca dane aukcji bez zapisywania na dysk"""
12
- print(f"🔍 eBay: {url}")
13
- options = uc.ChromeOptions()
14
- options.add_argument("--window-position=-3000,0")
15
- options.add_argument("--headless")
16
- options.add_argument("--no-sandbox")
17
- options.add_argument("--disable-dev-shm-usage")
 
 
18
 
19
- # Ustawienie binarki Chrome'a
20
- if os.path.exists('/usr/bin/google-chrome'):
21
- options.binary_location = '/usr/bin/google-chrome'
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
 
23
- driver = uc.Chrome(
24
- service=Service(ChromeDriverManager().install()),
25
- options=options,
26
- use_subprocess=True
27
- )
 
 
 
 
 
 
28
 
 
 
29
  try:
30
- driver.get(url)
31
- time.sleep(4)
 
 
 
 
 
 
 
 
 
32
 
33
  # TITLE
34
- try:
35
- title_element = driver.find_element(By.CSS_SELECTOR, "h1.x-item-title__mainTitle")
36
- title_str = title_element.text.strip()
37
- except:
38
- title_str = "untitled_ebay"
39
 
 
 
 
40
  # PARAMETERS
41
  parameter_list = []
42
- try:
43
- rows = driver.find_elements(By.CSS_SELECTOR, ".ux-labels-values")
44
- for row in rows:
45
- try:
46
- label = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__labels").text.strip()
47
- value = row.find_element(By.CSS_SELECTOR, ".ux-labels-values__values").text.strip()
48
- if label and value:
49
- parameter_list.append(f"{label}: {value}")
50
- except:
51
- continue
52
- except:
53
- pass
54
-
55
- # DESCRIPTION
56
- description_content = "No description"
57
- try:
58
- frame = driver.find_element(By.ID, "desc_ifr")
59
- driver.switch_to.frame(frame)
60
- description_content = driver.find_element(By.TAG_NAME, "body").text.strip()
61
- driver.switch_to.default_content()
62
- except:
63
- pass
64
 
 
 
65
  # IMAGES
66
  unique_links = set()
67
- try:
68
- thumbnails = driver.find_elements(By.CSS_SELECTOR, ".ux-image-grid-item img")
69
- for img in thumbnails:
70
- src = img.get_attribute("src") or img.get_attribute("data-src")
71
- if src and "ebayimg.com" in src:
72
- # Zamień na HD
73
- hd_link = src.replace("/s-l64/", "/s-l1600").replace("/s-l140/", "/s-l1600")
74
- unique_links.add(hd_link)
75
- except:
76
- pass
77
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
78
  return {
79
- "platform": "ebay",
 
80
  "url": url,
81
- "title": title_str,
82
- "description": description_content,
83
  "parameters": parameter_list,
84
- "image_urls": list(unique_links)
 
 
85
  }
86
-
87
- finally:
88
- driver.quit()
89
 
90
- if __name__ == "__main__":
91
- url = input("eBay URL: ")
92
- result = scrape_ebay_offer(url)
93
- print(result)
 
 
 
 
 
 
 
 
 
 
 
1
  import os
2
+ import requests
3
+ from apify_client import ApifyClient
4
+ from dotenv import load_dotenv
5
+ import json
6
+
7
+ # --- CONFIGURATION ---
8
+ # Load environment variables from the .env file (if it exists)
9
+ load_dotenv()
10
+
11
+ ACTOR_ID = "vulnv/ebay-product-scraper"
12
+
13
+ # --- HELPER FUNCTIONS ---
14
def sanitize_name(text):
    """Sanitize *text* into a safe folder name.

    Lowercases the input, maps Polish diacritics to their ASCII
    equivalents, turns every other non-alphanumeric character into an
    underscore, collapses repeated underscores, and trims underscores
    from both ends.
    """
    translit = {
        "ą": "a", "ć": "c", "ę": "e", "ł": "l", "ń": "n",
        "ó": "o", "ś": "s", "ź": "z", "ż": "z",
    }

    chars = []
    for ch in text.lower():
        if ch in translit:
            chars.append(translit[ch])
        elif ch.isalnum():
            chars.append(ch)
        else:
            chars.append("_")
    cleaned = "".join(chars)

    # Squash runs of underscores down to a single one.
    while "__" in cleaned:
        cleaned = cleaned.replace("__", "_")

    return cleaned.strip("_")
44
+
45
def get_high_res_ebay_image(url):
    """Replace the size code in an eBay image link with s-l1600 (max quality).

    Args:
        url: Image URL string, or a falsy value.

    Returns:
        The upgraded URL, the URL unchanged when no size code can be
        identified, or None for falsy input.
    """
    if not url:
        return None

    sizes = ["s-l64", "s-l140", "s-l300", "s-l400", "s-l500", "s-l960"]
    for size in sizes:
        if size in url:
            return url.replace(size, "s-l1600")

    # Fallback for unknown s-lNNN codes in the file name, e.g. ".../s-l225.jpg".
    if "ebayimg.com" in url and "s-l1600" not in url:
        last_part = url.rsplit("/", 1)[-1]
        dot = last_part.find(".")
        # Only rewrite when a file extension exists: previously a missing
        # dot made find() return -1, slicing off the final character and
        # corrupting the URL.
        if "s-l" in last_part and dot != -1:
            return url.replace(last_part[:dot], "s-l1600")

    return url
61
+
62
def get_api_token():
    """Retrieve the Apify API token from the environment, else prompt for it."""
    env_token = os.getenv("APIFY_TOKEN")
    if env_token:
        print("Info: API Token loaded from .env file.")
        return env_token

    print("Warning: APIFY_TOKEN not found in .env file.")
    return input("Please enter your Apify API Token: ").strip()
71
+
72
def get_ebay_data(url):
    """Scrape a single eBay offer through the Apify actor.

    Args:
        url: Full eBay offer URL.

    Returns:
        A dict with title, sanitized_title, url, description, parameters,
        image_urls, image_count and price — or None when the token is
        missing, the actor returns no data, or an error occurs.
    """
    apify_token = get_api_token()
    if not apify_token:
        print("ERROR: API Token is required.")
        return

    print("\n--- SENDING REQUEST TO APIFY ---")
    client = ApifyClient(apify_token)

    run_input = {"product_urls": [url]}

    try:
        run = client.actor(ACTOR_ID).call(run_input=run_input)

        dataset_items = list(client.dataset(run["defaultDatasetId"]).iterate_items())

        if not dataset_items:
            print("Apify finished the job but returned no data.")
            return

        item = dataset_items[0]

        # --- DATA MAPPING ---

        # TITLE
        title = item.get("name") or item.get("title") or "untitled_ebay"

        # PRICE
        price = item.get("price", "N/A")
        currency = item.get("currency", "")

        # DESCRIPTION
        description = item.get("description", "No text description available.")

        # PARAMETERS
        parameter_list = []
        raw_props = item.get("additionalProperties", [])
        if isinstance(raw_props, list):
            for prop in raw_props:
                p_name = prop.get("name")
                p_val = prop.get("value")
                if p_name and p_val:
                    parameter_list.append(f"{p_name}: {p_val}")

        if item.get("sku"):
            parameter_list.insert(0, f"SKU: {item.get('sku')}")

        # IMAGES — "mainImage" may be absent OR explicitly null in the
        # payload; `or {}` avoids an AttributeError from calling .get
        # on None (the old `item.get("mainImage", {})` only covered the
        # missing-key case).
        unique_links = set()

        main_img = (item.get("mainImage") or {}).get("url")
        if main_img:
            unique_links.add(get_high_res_ebay_image(main_img))

        raw_images = item.get("images") or []
        for img_entry in raw_images:
            if isinstance(img_entry, dict):
                raw_url = img_entry.get("url")
                if raw_url:
                    unique_links.add(get_high_res_ebay_image(raw_url))
            elif isinstance(img_entry, str):
                unique_links.add(get_high_res_ebay_image(img_entry))
        # Drop entries without a usable URL so image_count stays honest.
        unique_links.discard(None)

        print(f"Found {len(unique_links)} unique images (High-Res).")

        return {
            "title": title,
            "sanitized_title": sanitize_name(title),
            "url": url,
            "description": description,
            "parameters": parameter_list,
            "image_urls": list(unique_links),
            "image_count": len(unique_links),
            "price": f"{price} {currency}",
        }

    except Exception as e:
        # Best-effort scraper: report the failure and return None
        # rather than crash the caller.
        print(f"Critical error occurred: {e}")
149
+
150
# --- USAGE ---
if __name__ == "__main__":
    # Guarded so importing this module does not trigger the console
    # prompt and a live Apify run (the previous script version had
    # this guard; it was lost in the rewrite).
    link = input("Enter the eBay offer link: ")
    data = get_ebay_data(link)
    print(json.dumps(data, indent=4, ensure_ascii=False))
requirements.txt CHANGED
@@ -8,8 +8,6 @@ tqdm
8
  fastapi
9
  uvicorn
10
  python-multipart
11
- undetected_chromedriver
12
- webdriver-manager
13
  bs4
14
  requests
15
  flask
 
8
  fastapi
9
  uvicorn
10
  python-multipart
 
 
11
  bs4
12
  requests
13
  flask