petertulip86 committed on
Commit
f007e8d
·
verified ·
1 Parent(s): 44593ac

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +68 -68
app.py CHANGED
@@ -17,7 +17,7 @@ import subprocess
17
  import sys
18
 
19
  def setup_chrome_driver():
20
- """设置Chrome WebDriver,适用于Hugging Face Spaces"""
21
  chrome_options = Options()
22
  chrome_options.add_argument("--headless")
23
  chrome_options.add_argument("--no-sandbox")
@@ -27,7 +27,7 @@ def setup_chrome_driver():
27
  chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
28
 
29
  try:
30
- # 尝试使用系统中的Chrome
31
  driver = webdriver.Chrome(options=chrome_options)
32
  return driver
33
  except Exception as e:
@@ -35,10 +35,10 @@ def setup_chrome_driver():
35
  return None
36
 
37
  def extract_images_from_pinterest(url, max_images, progress_callback=None):
38
- """Pinterest页面提取图片URL"""
39
  driver = setup_chrome_driver()
40
  if not driver:
41
- return [], "无法启动Chrome WebDriver"
42
 
43
  try:
44
  driver.get(url)
@@ -49,12 +49,12 @@ def extract_images_from_pinterest(url, max_images, progress_callback=None):
49
  no_new_images_count = 0
50
 
51
  while len(imgList) < max_images:
52
- # 滚动页面
53
  scroll += 800
54
  driver.execute_script(f'window.scrollTo(0, {scroll})')
55
  time.sleep(1)
56
 
57
- # 获取图片元素
58
  imgs = driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id="pin"] img')
59
  new_images = 0
60
 
@@ -68,9 +68,9 @@ def extract_images_from_pinterest(url, max_images, progress_callback=None):
68
  continue
69
 
70
  if progress_callback:
71
- progress_callback(f"已找到 {len(imgList)} 张图片")
72
 
73
- # 检查是否没有新图片
74
  if new_images == 0:
75
  no_new_images_count += 1
76
  else:
@@ -82,15 +82,15 @@ def extract_images_from_pinterest(url, max_images, progress_callback=None):
82
  return imgList, None
83
 
84
  except Exception as e:
85
- return [], f"提取图片时出错: {str(e)}"
86
  finally:
87
  driver.quit()
88
 
89
  def download_image(args):
90
- """下载单张图片"""
91
  index, url, temp_dir = args
92
  try:
93
- # 转换为高清图片URL
94
  if '236x' in url:
95
  url = url.replace('236x', 'originals')
96
  elif '474x' in url:
@@ -99,7 +99,7 @@ def download_image(args):
99
  # 生成文件名
100
  filename = f"pinterest_img_{index+1:04d}"
101
 
102
- # URL提取原始文件名
103
  url_parts = url.split('/')
104
  if len(url_parts) > 0:
105
  original_name = url_parts[-1].split('?')[0]
@@ -107,13 +107,13 @@ def download_image(args):
107
  clean_name = re.sub(r'[^\w\-_\.]', '_', original_name)
108
  filename = f"pinterest_img_{index+1:04d}_{clean_name}"
109
 
110
- # 确保文件扩展名
111
  if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
112
  filename += '.jpg'
113
 
114
  filepath = os.path.join(temp_dir, filename)
115
 
116
- # 下载图片
117
  headers = {
118
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
119
  }
@@ -127,30 +127,30 @@ def download_image(args):
127
  return True, filename
128
 
129
  except Exception as e:
130
- return False, f"下载失败: {str(e)}"
131
 
132
  def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
133
- """主要的爬虫函数"""
134
  if not pinterest_url:
135
- return None, "请输入Pinterest URL"
136
 
137
  if num_images <= 0:
138
- return None, "图片数量必须大于0"
139
 
140
- if num_images > 500: # 限制最大数量
141
- return None, "图片数量不能超过500"
142
 
143
- # 验证URL
144
  try:
145
  parsed_url = urlparse(pinterest_url)
146
  if 'pinterest.com' not in parsed_url.netloc:
147
- return None, "请输入有效的Pinterest URL"
148
  except:
149
- return None, "URL格式无效"
150
 
151
- progress(0, desc="开始提取图片URL...")
152
 
153
- # 提取图片URL
154
  def update_progress(msg):
155
  progress(0.3, desc=msg)
156
 
@@ -160,40 +160,40 @@ def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
160
  return None, error
161
 
162
  if not img_urls:
163
- return None, "未找到任何图片"
164
 
165
- progress(0.5, desc=f"开始下载 {len(img_urls)} 张图片...")
166
 
167
- # 创建临时目录
168
  temp_dir = tempfile.mkdtemp()
169
 
170
  try:
171
- # 准备下载参数
172
  download_args = [(i, url, temp_dir) for i, url in enumerate(img_urls)]
173
 
174
- # 多线程下载
175
  successful_downloads = []
176
  failed_downloads = []
177
 
178
- with ThreadPoolExecutor(max_workers=3) as executor: # 降低并发数
179
  results = list(executor.map(download_image, download_args))
180
 
181
  for i, (success, info) in enumerate(results):
182
  if success:
183
  successful_downloads.append(info)
184
  else:
185
- failed_downloads.append(f"图片 {i+1}: {info}")
186
 
187
- # 更新进度
188
  progress((0.5 + 0.4 * (i + 1) / len(results)),
189
- desc=f"已下载 {len(successful_downloads)} / {len(img_urls)} 张图片")
190
 
191
  if not successful_downloads:
192
- return None, "所有图片下载失败"
193
 
194
- progress(0.9, desc="创建ZIP文件...")
195
 
196
- # 创建ZIP文件
197
  zip_path = os.path.join(tempfile.gettempdir(), "pinterest_images.zip")
198
  with zipfile.ZipFile(zip_path, 'w') as zipf:
199
  for filename in os.listdir(temp_dir):
@@ -203,49 +203,49 @@ def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
203
 
204
  progress(1.0, desc="完成!")
205
 
206
- # 准备结果信息
207
  result_info = f"""
208
- 下载完成!
209
 
210
- 成功下载: {len(successful_downloads)} 张图片
211
- 失败: {len(failed_downloads)} 张图片
212
- 总计: {len(img_urls)} 张图片
213
 
214
- 请点击下方链接下载ZIP文件。
215
  """
216
 
217
  if failed_downloads:
218
- result_info += f"\n\n失败详情:\n" + "\n".join(failed_downloads[:10]) # 只显示前10个错误
219
 
220
  return zip_path, result_info
221
 
222
  except Exception as e:
223
- return None, f"处理过程中出错: {str(e)}"
224
  finally:
225
- # 清理临时目录
226
  try:
227
  shutil.rmtree(temp_dir)
228
  except:
229
  pass
230
 
231
- # 创建Gradio界面
232
  def create_interface():
233
- with gr.Blocks(title="Pinterest图片下载器", theme=gr.themes.Soft()) as interface:
234
  gr.Markdown("""
235
- # 🖼️ Pinterest 图片下载器
236
 
237
- 输入Pinterest搜索页面的URL,批量下载图片。
238
 
239
- **使用说明:**
240
- 1. 输入Pinterest搜索页面或板块的完整URL
241
- 2. 设置要下载的图片数量(建议不超过100张)
242
- 3. 点击"开始下载"按钮
243
- 4. 等待处理完成后下载ZIP文件
244
 
245
- **注意事项:**
246
- - 请确保输入的是有效的Pinterest URL
247
- - 下载速度取决于网络状况和图片大小
248
- - 建议单次下载不超过100张图片
249
  """)
250
 
251
  with gr.Row():
@@ -254,7 +254,7 @@ def create_interface():
254
  label="Pinterest URL",
255
  placeholder="https://www.pinterest.com/search/pins/?q=your-search-term",
256
  lines=2,
257
- info="输入Pinterest搜索页面或板块的完整URL"
258
  )
259
 
260
  num_images = gr.Slider(
@@ -262,22 +262,22 @@ def create_interface():
262
  maximum=500,
263
  value=20,
264
  step=1,
265
- label="图片数量",
266
- info="要下载的图片数量(建议不超过100张)"
267
  )
268
 
269
- download_btn = gr.Button("🚀 开始下载", variant="primary", size="lg")
270
 
271
  with gr.Column():
272
  result_info = gr.Textbox(
273
- label="下载结果",
274
  lines=10,
275
  interactive=False,
276
- info="显示下载进度和结果信息"
277
  )
278
 
279
  download_file = gr.File(
280
- label="下载文件",
281
  interactive=False,
282
  visible=False
283
  )
@@ -292,7 +292,7 @@ def create_interface():
292
  ```
293
  """)
294
 
295
- # 绑定事件
296
  def handle_download(url, num):
297
  zip_path, info = scrape_pinterest_images(url, int(num))
298
  if zip_path:
@@ -309,7 +309,7 @@ def create_interface():
309
 
310
  return interface
311
 
312
- # 启动应用
313
  if __name__ == "__main__":
314
  interface = create_interface()
315
  interface.launch(
 
17
  import sys
18
 
19
  def setup_chrome_driver():
20
+ """設置Chrome WebDriver,適用於Hugging Face Spaces"""
21
  chrome_options = Options()
22
  chrome_options.add_argument("--headless")
23
  chrome_options.add_argument("--no-sandbox")
 
27
  chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
28
 
29
  try:
30
+ # 嘗試使用系統中的Chrome
31
  driver = webdriver.Chrome(options=chrome_options)
32
  return driver
33
  except Exception as e:
 
35
  return None
36
 
37
  def extract_images_from_pinterest(url, max_images, progress_callback=None):
38
+ """Pinterest頁面提取圖片URL"""
39
  driver = setup_chrome_driver()
40
  if not driver:
41
+ return [], "無法啟動Chrome WebDriver"
42
 
43
  try:
44
  driver.get(url)
 
49
  no_new_images_count = 0
50
 
51
  while len(imgList) < max_images:
52
+ # 滾動頁面
53
  scroll += 800
54
  driver.execute_script(f'window.scrollTo(0, {scroll})')
55
  time.sleep(1)
56
 
57
+ # 獲取圖片元素
58
  imgs = driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id="pin"] img')
59
  new_images = 0
60
 
 
68
  continue
69
 
70
  if progress_callback:
71
+ progress_callback(f"已找到 {len(imgList)} 張圖片")
72
 
73
+ # 檢查是否沒有新圖片
74
  if new_images == 0:
75
  no_new_images_count += 1
76
  else:
 
82
  return imgList, None
83
 
84
  except Exception as e:
85
+ return [], f"提取圖片時出錯: {str(e)}"
86
  finally:
87
  driver.quit()
88
 
89
  def download_image(args):
90
+ """下載單張圖片"""
91
  index, url, temp_dir = args
92
  try:
93
+ # 轉換為高清圖片URL
94
  if '236x' in url:
95
  url = url.replace('236x', 'originals')
96
  elif '474x' in url:
 
99
  # 生成文件名
100
  filename = f"pinterest_img_{index+1:04d}"
101
 
102
+ # URL提取原始文件名
103
  url_parts = url.split('/')
104
  if len(url_parts) > 0:
105
  original_name = url_parts[-1].split('?')[0]
 
107
  clean_name = re.sub(r'[^\w\-_\.]', '_', original_name)
108
  filename = f"pinterest_img_{index+1:04d}_{clean_name}"
109
 
110
+ # 確保文件擴展名
111
  if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
112
  filename += '.jpg'
113
 
114
  filepath = os.path.join(temp_dir, filename)
115
 
116
+ # 下載圖片
117
  headers = {
118
  'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
119
  }
 
127
  return True, filename
128
 
129
  except Exception as e:
130
+ return False, f"下載失敗: {str(e)}"
131
 
132
  def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
133
+ """主要的爬蟲函數"""
134
  if not pinterest_url:
135
+ return None, "請輸入Pinterest URL"
136
 
137
  if num_images <= 0:
138
+ return None, "圖片數量必須大於0"
139
 
140
+ if num_images > 500: # 限制最大數量
141
+ return None, "圖片數量不能超過500"
142
 
143
+ # 驗證URL
144
  try:
145
  parsed_url = urlparse(pinterest_url)
146
  if 'pinterest.com' not in parsed_url.netloc:
147
+ return None, "請輸入有效的Pinterest URL"
148
  except:
149
+ return None, "URL格式無效"
150
 
151
+ progress(0, desc="開始提取圖片URL...")
152
 
153
+ # 提取圖片URL
154
  def update_progress(msg):
155
  progress(0.3, desc=msg)
156
 
 
160
  return None, error
161
 
162
  if not img_urls:
163
+ return None, "未找到任何圖片"
164
 
165
+ progress(0.5, desc=f"開始下載 {len(img_urls)} 張圖片...")
166
 
167
+ # 創建臨時目錄
168
  temp_dir = tempfile.mkdtemp()
169
 
170
  try:
171
+ # 準備下載參數
172
  download_args = [(i, url, temp_dir) for i, url in enumerate(img_urls)]
173
 
174
+ # 多線程下載
175
  successful_downloads = []
176
  failed_downloads = []
177
 
178
+ with ThreadPoolExecutor(max_workers=3) as executor: # 降低並發數
179
  results = list(executor.map(download_image, download_args))
180
 
181
  for i, (success, info) in enumerate(results):
182
  if success:
183
  successful_downloads.append(info)
184
  else:
185
+ failed_downloads.append(f"圖片 {i+1}: {info}")
186
 
187
+ # 更新進度
188
  progress((0.5 + 0.4 * (i + 1) / len(results)),
189
+ desc=f"已下載 {len(successful_downloads)} / {len(img_urls)} 張圖片")
190
 
191
  if not successful_downloads:
192
+ return None, "所有圖片下載失敗"
193
 
194
+ progress(0.9, desc="創建ZIP文件...")
195
 
196
+ # 創建ZIP文件
197
  zip_path = os.path.join(tempfile.gettempdir(), "pinterest_images.zip")
198
  with zipfile.ZipFile(zip_path, 'w') as zipf:
199
  for filename in os.listdir(temp_dir):
 
203
 
204
  progress(1.0, desc="完成!")
205
 
206
+ # 準備結果信息
207
  result_info = f"""
208
+ 下載完成!
209
 
210
+ 成功下載: {len(successful_downloads)} 張圖片
211
+ 失敗: {len(failed_downloads)} 張圖片
212
+ 總計: {len(img_urls)} 張圖片
213
 
214
+ 請點擊下方鏈接下載ZIP文件。
215
  """
216
 
217
  if failed_downloads:
218
+ result_info += f"\n\n失敗詳情:\n" + "\n".join(failed_downloads[:10]) # 只顯示前10個錯誤
219
 
220
  return zip_path, result_info
221
 
222
  except Exception as e:
223
+ return None, f"處理過程中出錯: {str(e)}"
224
  finally:
225
+ # 清理臨時目錄
226
  try:
227
  shutil.rmtree(temp_dir)
228
  except:
229
  pass
230
 
231
+ # 創建Gradio界面
232
  def create_interface():
233
+ with gr.Blocks(title="Pinterest圖片下載器", theme=gr.themes.Soft()) as interface:
234
  gr.Markdown("""
235
+ # 🖼️ Pinterest 圖片下載器
236
 
237
+ 輸入Pinterest搜索頁面的URL,批量下載圖片。
238
 
239
+ **使用說明:**
240
+ 1. 輸入Pinterest搜索頁面或板塊的完整URL
241
+ 2. 設置要下載的圖片數量(建議不超過100張)
242
+ 3. 點擊"開始下載"按鈕
243
+ 4. 等待處理完成後下載ZIP文件
244
 
245
+ **注意事項:**
246
+ - 請確保輸入的是有效的Pinterest URL
247
+ - 下載速度取決於網絡狀況和圖片大小
248
+ - 建議單次下載不超過100張圖片
249
  """)
250
 
251
  with gr.Row():
 
254
  label="Pinterest URL",
255
  placeholder="https://www.pinterest.com/search/pins/?q=your-search-term",
256
  lines=2,
257
+ info="輸入Pinterest搜索頁面或板塊的完整URL"
258
  )
259
 
260
  num_images = gr.Slider(
 
262
  maximum=500,
263
  value=20,
264
  step=1,
265
+ label="圖片數量",
266
+ info="要下載的圖片數量(建議不超過100張)"
267
  )
268
 
269
+ download_btn = gr.Button("🚀 開始下載", variant="primary", size="lg")
270
 
271
  with gr.Column():
272
  result_info = gr.Textbox(
273
+ label="下載結果",
274
  lines=10,
275
  interactive=False,
276
+ info="顯示下載進度和結果信息"
277
  )
278
 
279
  download_file = gr.File(
280
+ label="下載文件",
281
  interactive=False,
282
  visible=False
283
  )
 
292
  ```
293
  """)
294
 
295
+ # 綁定事件
296
  def handle_download(url, num):
297
  zip_path, info = scrape_pinterest_images(url, int(num))
298
  if zip_path:
 
309
 
310
  return interface
311
 
312
+ # 啟動應用
313
  if __name__ == "__main__":
314
  interface = create_interface()
315
  interface.launch(