petertulip86 committed on
Commit
1e50eb2
·
verified ·
1 Parent(s): 0785ed1

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +319 -0
app.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import requests
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ import os
5
+ import re
6
+ import zipfile
7
+ import tempfile
8
+ import shutil
9
+ from urllib.parse import urlparse
10
+ import time
11
+ from selenium import webdriver
12
+ from selenium.webdriver.chrome.options import Options
13
+ from selenium.webdriver.chrome.service import Service
14
+ from selenium.webdriver.common.by import By
15
+ from selenium.common.exceptions import TimeoutException, WebDriverException
16
+ import subprocess
17
+ import sys
18
+
19
def setup_chrome_driver():
    """Create a headless Chrome WebDriver, intended for Hugging Face Spaces.

    Returns:
        A ``webdriver.Chrome`` instance, or ``None`` when the driver could not
        be started (the failure reason is printed to stdout).
    """
    launch_flags = (
        "--headless",
        "--no-sandbox",
        "--disable-dev-shm-usage",
        "--disable-gpu",
        "--window-size=1920,1080",
        "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    )

    opts = Options()
    for flag in launch_flags:
        opts.add_argument(flag)

    # Try the Chrome installation that is available on the system.
    try:
        return webdriver.Chrome(options=opts)
    except Exception as exc:
        print(f"Chrome WebDriver setup failed: {exc}")
    return None
36
+
37
def extract_images_from_pinterest(url, max_images, progress_callback=None):
    """Collect image URLs from a Pinterest page by scrolling it in Chrome.

    The page is scrolled in 800px steps; after each step the ``src`` of every
    ``div[data-test-id="pin"] img`` element is collected. Scrolling stops once
    ``max_images`` distinct URLs were found or five consecutive scroll steps
    produced no new URL.

    Args:
        url: Address of the page to open.
        max_images: Maximum number of distinct image URLs to collect.
        progress_callback: Optional callable that receives a status message
            (str) after every scroll step.

    Returns:
        A ``(urls, error)`` tuple. On success ``urls`` is the list of image
        URLs in discovery order and ``error`` is ``None``; on failure ``urls``
        is empty and ``error`` is a human-readable message.
    """
    driver = setup_chrome_driver()
    if not driver:
        return [], "无法启动Chrome WebDriver"

    try:
        driver.get(url)
        time.sleep(3)  # presumably lets the initial page content render

        img_list = []
        seen = set()  # O(1) duplicate check; img_list keeps discovery order
        scroll = 0
        no_new_images_count = 0

        while len(img_list) < max_images:
            # Scroll further down the page.
            scroll += 800
            driver.execute_script(f'window.scrollTo(0, {scroll})')
            time.sleep(1)

            # Fetch the image elements currently in the DOM.
            imgs = driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id="pin"] img')
            new_images = 0

            for img in imgs:
                if len(img_list) >= max_images:
                    break
                try:
                    img_url = img.get_attribute('src')
                except WebDriverException:
                    # Best effort: skip elements that can no longer be read
                    # (e.g. they went stale), but let KeyboardInterrupt and
                    # SystemExit propagate.
                    continue
                if img_url and img_url not in seen:
                    seen.add(img_url)
                    img_list.append(img_url)
                    new_images += 1

            if progress_callback:
                progress_callback(f"已找到 {len(img_list)} 张图片")

            # Track consecutive scroll steps that yielded nothing new.
            if new_images == 0:
                no_new_images_count += 1
            else:
                no_new_images_count = 0

            if no_new_images_count >= 5:
                break

        return img_list, None

    except Exception as e:
        return [], f"提取图片时出错: {str(e)}"
    finally:
        driver.quit()
88
+
89
def download_image(args):
    """Download a single image into a directory.

    Args:
        args: ``(index, url, temp_dir)`` tuple -- the zero-based position of
            the image, its URL, and the directory the file is written to.

    Returns:
        ``(True, filename)`` when the file was saved, otherwise
        ``(False, message)`` describing the failure.
    """
    index, url, temp_dir = args
    try:
        # Switch a thumbnail size marker to the high-resolution variant.
        for size_marker in ('236x', '474x'):
            if size_marker in url:
                url = url.replace(size_marker, 'originals')
                break

        # Default name, extended with the sanitized original name if usable.
        prefix = f"pinterest_img_{index+1:04d}"
        filename = prefix
        original_name = url.rsplit('/', 1)[-1].partition('?')[0]
        if '.' in original_name and len(original_name) < 100:
            clean_name = re.sub(r'[^\w\-_\.]', '_', original_name)
            filename = f"{prefix}_{clean_name}"

        # Make sure the file carries an image extension.
        known_extensions = ('.jpg', '.jpeg', '.png', '.gif', '.webp')
        if not filename.lower().endswith(known_extensions):
            filename += '.jpg'

        target_path = os.path.join(temp_dir, filename)

        # Fetch the image and store it.
        response = requests.get(
            url,
            headers={
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
            },
            timeout=30,
        )
        response.raise_for_status()

        with open(target_path, 'wb') as out_file:
            out_file.write(response.content)

        return True, filename

    except Exception as exc:
        return False, f"下载失败: {str(exc)}"
131
+
132
def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
    """Scrape images from a Pinterest page and bundle them into a ZIP file.

    Validates the input, collects image URLs with
    ``extract_images_from_pinterest``, downloads them with three worker
    threads via ``download_image``, and zips whatever was downloaded.

    Args:
        pinterest_url: URL of the Pinterest page; its network location must
            contain ``pinterest.com``.
        num_images: Number of images to fetch; must be between 1 and 500.
        progress: Gradio progress tracker used to report status.

    Returns:
        A ``(zip_path, message)`` tuple. ``zip_path`` is the path of the
        created archive, or ``None`` when validation or processing failed;
        ``message`` is the text to show to the user.
    """
    if not pinterest_url:
        return None, "请输入Pinterest URL"

    if num_images <= 0:
        return None, "图片数量必须大于0"

    if num_images > 500:  # upper bound on a single request
        return None, "图片数量不能超过500张"

    # Validate the URL.
    try:
        parsed_url = urlparse(pinterest_url)
    except (ValueError, TypeError, AttributeError):
        return None, "URL格式无效"
    if 'pinterest.com' not in parsed_url.netloc:
        return None, "请输入有效的Pinterest URL"

    progress(0, desc="开始提取图片URL...")

    # Forward extraction status messages to the progress tracker.
    def update_progress(msg):
        progress(0.3, desc=msg)

    img_urls, error = extract_images_from_pinterest(pinterest_url, num_images, update_progress)

    if error:
        return None, error

    if not img_urls:
        return None, "未找到任何图片"

    total = len(img_urls)
    progress(0.5, desc=f"开始下载 {total} 张图片...")

    # Scratch directory for the downloaded files; removed in ``finally``.
    temp_dir = tempfile.mkdtemp()
    zip_dir = None

    try:
        download_args = [(i, url, temp_dir) for i, url in enumerate(img_urls)]

        successful_downloads = []
        failed_downloads = []

        with ThreadPoolExecutor(max_workers=3) as executor:  # low concurrency on purpose
            # Consume the results lazily so the progress bar advances while
            # downloads are still running instead of after all have finished.
            for i, (success, info) in enumerate(executor.map(download_image, download_args)):
                if success:
                    successful_downloads.append(info)
                else:
                    failed_downloads.append(f"图片 {i+1}: {info}")

                progress(
                    0.5 + 0.4 * (i + 1) / total,
                    desc=f"已下载 {len(successful_downloads)} / {total} 张图片",
                )

        if not successful_downloads:
            return None, "所有图片下载失败"

        progress(0.9, desc="创建ZIP文件...")

        # Each request gets its own directory so that concurrent requests do
        # not overwrite each other's archive; the file name stays the same.
        zip_dir = tempfile.mkdtemp(prefix="pinterest_zip_")
        zip_path = os.path.join(zip_dir, "pinterest_images.zip")
        with zipfile.ZipFile(zip_path, 'w') as zipf:
            for filename in sorted(os.listdir(temp_dir)):
                file_path = os.path.join(temp_dir, filename)
                if os.path.isfile(file_path):
                    zipf.write(file_path, filename)

        progress(1.0, desc="完成!")

        # Summary shown to the user.
        result_info = (
            "\n下载完成!\n"
            "\n"
            f"成功下载: {len(successful_downloads)} 张图片\n"
            f"失败: {len(failed_downloads)} 张图片\n"
            f"总计: {total} 张图片\n"
            "\n"
            "请点击下方链接下载ZIP文件。\n"
        )

        if failed_downloads:
            # Only the first ten failures are listed.
            result_info += "\n\n失败详情:\n" + "\n".join(failed_downloads[:10])

        # The archive is handed to the caller, so its directory must survive.
        zip_dir = None
        return zip_path, result_info

    except Exception as e:
        return None, f"处理过程中出错: {str(e)}"
    finally:
        # Best-effort cleanup of the scratch directory and of an archive
        # directory that was created but never returned.
        shutil.rmtree(temp_dir, ignore_errors=True)
        if zip_dir is not None:
            shutil.rmtree(zip_dir, ignore_errors=True)
230
+
231
+ # 创建Gradio界面
232
def create_interface():
    """Build the Gradio Blocks UI of the Pinterest image downloader.

    The layout has an input column (URL textbox, image-count slider, start
    button) and an output column (result textbox, initially hidden file
    download). Clicking the button runs ``scrape_pinterest_images``.

    Returns:
        The assembled ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(title="Pinterest图片下载器", theme=gr.themes.Soft()) as interface:
        gr.Markdown("""
        # 🖼️ Pinterest 图片下载器

        输入Pinterest搜索页面的URL,批量下载图片。

        **使用说明:**
        1. 输入Pinterest搜索页面或板块的完整URL
        2. 设置要下载的图片数量(建议不超过100张)
        3. 点击"开始下载"按钮
        4. 等待处理完成后下载ZIP文件

        **注意事项:**
        - 请确保输入的是有效的Pinterest URL
        - 下载速度取决于网络状况和图片大小
        - 建议单次下载不超过100张图片
        """)

        with gr.Row():
            with gr.Column():
                pinterest_url = gr.Textbox(
                    label="Pinterest URL",
                    placeholder="https://www.pinterest.com/search/pins/?q=your-search-term",
                    lines=2,
                    info="输入Pinterest搜索页面或板块的完整URL"
                )

                num_images = gr.Slider(
                    minimum=1,
                    maximum=500,
                    value=20,
                    step=1,
                    label="图片数量",
                    info="要下载的图片数量(建议不超过100张)"
                )

                download_btn = gr.Button("🚀 开始下载", variant="primary", size="lg")

            with gr.Column():
                result_info = gr.Textbox(
                    label="下载结果",
                    lines=10,
                    interactive=False,
                    info="显示下载进度和结果信息"
                )

                download_file = gr.File(
                    label="下载文件",
                    interactive=False,
                    visible=False
                )

        # Example URLs
        gr.Markdown("""
        ### 示例URL:
        ```
        https://www.pinterest.com/search/pins/?q=landscape%20photography
        https://www.pinterest.com/search/pins/?q=interior%20design
        https://www.pinterest.com/search/pins/?q=food%20photography
        ```
        """)

        # Event wiring. The handler declares a ``gr.Progress`` default so that
        # Gradio injects its tracker for this event; the tracker is forwarded
        # so the progress calls made while scraping reach the UI.
        def handle_download(url, num, progress=gr.Progress()):
            zip_path, info = scrape_pinterest_images(url, int(num), progress=progress)
            if zip_path:
                return info, gr.File(value=zip_path, visible=True)
            # Nothing to offer: keep the download component hidden.
            return info, gr.File(visible=False)

        download_btn.click(
            fn=handle_download,
            inputs=[pinterest_url, num_images],
            outputs=[result_info, download_file],
            show_progress=True
        )

    return interface
311
+
312
# Launch the application when run as a script.
if __name__ == "__main__":
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": True,
    }
    interface = create_interface()
    interface.launch(**launch_options)