Spaces:

petertulip86
/

Pinterest_Scraping_selenium

Sleeping

App Files Community

petertulip86 committed on May 26, 2025

Commit

f007e8d

verified ·

1 Parent(s): 44593ac

Update app.py

Browse files

Files changed (1) hide show

app.py +68 -68

app.py CHANGED Viewed

@@ -17,7 +17,7 @@ import subprocess
 import sys
 def setup_chrome_driver():
-    """设置Chrome WebDriver，适用于Hugging Face Spaces"""
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
@@ -27,7 +27,7 @@ def setup_chrome_driver():
     chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
     try:
-        # 尝试使用系统中的Chrome
         driver = webdriver.Chrome(options=chrome_options)
         return driver
     except Exception as e:
@@ -35,10 +35,10 @@ def setup_chrome_driver():
         return None
 def extract_images_from_pinterest(url, max_images, progress_callback=None):
-    """从Pinterest页面提取图片URL"""
     driver = setup_chrome_driver()
     if not driver:
-        return [], "无法启动Chrome WebDriver"
     try:
         driver.get(url)
@@ -49,12 +49,12 @@ def extract_images_from_pinterest(url, max_images, progress_callback=None):
         no_new_images_count = 0
         while len(imgList) < max_images:
-            # 滚动页面
             scroll += 800
             driver.execute_script(f'window.scrollTo(0, {scroll})')
             time.sleep(1)
-            # 获取图片元素
             imgs = driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id="pin"] img')
             new_images = 0
@@ -68,9 +68,9 @@ def extract_images_from_pinterest(url, max_images, progress_callback=None):
                     continue
             if progress_callback:
-                progress_callback(f"已找到 {len(imgList)} 张图片")
-            # 检查是否没有新图片
             if new_images == 0:
                 no_new_images_count += 1
             else:
@@ -82,15 +82,15 @@ def extract_images_from_pinterest(url, max_images, progress_callback=None):
         return imgList, None
     except Exception as e:
-        return [], f"提取图片时出错: {str(e)}"
     finally:
         driver.quit()
 def download_image(args):
-    """下载单张图片"""
     index, url, temp_dir = args
     try:
-        # 转换为高清图片URL
         if '236x' in url:
             url = url.replace('236x', 'originals')
         elif '474x' in url:
@@ -99,7 +99,7 @@ def download_image(args):
         # 生成文件名
         filename = f"pinterest_img_{index+1:04d}"
-        # 从URL提取原始文件名
         url_parts = url.split('/')
         if len(url_parts) > 0:
             original_name = url_parts[-1].split('?')[0]
@@ -107,13 +107,13 @@ def download_image(args):
                 clean_name = re.sub(r'[^\w\-_\.]', '_', original_name)
                 filename = f"pinterest_img_{index+1:04d}_{clean_name}"
-        # 确保文件扩展名
         if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
             filename += '.jpg'
         filepath = os.path.join(temp_dir, filename)
-        # 下载图片
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
@@ -127,30 +127,30 @@ def download_image(args):
         return True, filename
     except Exception as e:
-        return False, f"下载失败: {str(e)}"
 def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
-    """主要的爬虫函数"""
     if not pinterest_url:
-        return None, "请输入Pinterest URL"
     if num_images <= 0:
-        return None, "图片数量必须大于0"
-    if num_images > 500:  # 限制最大数量
-        return None, "图片数量不能超过500张"
-    # 验证URL
     try:
         parsed_url = urlparse(pinterest_url)
         if 'pinterest.com' not in parsed_url.netloc:
-            return None, "请输入有效的Pinterest URL"
     except:
-        return None, "URL格式无效"
-    progress(0, desc="开始提取图片URL...")
-    # 提取图片URL
     def update_progress(msg):
         progress(0.3, desc=msg)
@@ -160,40 +160,40 @@ def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
         return None, error
     if not img_urls:
-        return None, "未找到任何图片"
-    progress(0.5, desc=f"开始下载 {len(img_urls)} 张图片...")
-    # 创建临时目录
     temp_dir = tempfile.mkdtemp()
     try:
-        # 准备下载参数
         download_args = [(i, url, temp_dir) for i, url in enumerate(img_urls)]
-        # 多线程下载
         successful_downloads = []
         failed_downloads = []
-        with ThreadPoolExecutor(max_workers=3) as executor:  # 降低并发数
             results = list(executor.map(download_image, download_args))
         for i, (success, info) in enumerate(results):
             if success:
                 successful_downloads.append(info)
             else:
-                failed_downloads.append(f"图片 {i+1}: {info}")
-            # 更新进度
             progress((0.5 + 0.4 * (i + 1) / len(results)),
-                    desc=f"已下载 {len(successful_downloads)} / {len(img_urls)} 张图片")
         if not successful_downloads:
-            return None, "所有图片下载失败"
-        progress(0.9, desc="创建ZIP文件...")
-        # 创建ZIP文件
         zip_path = os.path.join(tempfile.gettempdir(), "pinterest_images.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
             for filename in os.listdir(temp_dir):
@@ -203,49 +203,49 @@ def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
         progress(1.0, desc="完成!")
-        # 准备结果信息
         result_info = f"""
-下载完成！
-成功下载: {len(successful_downloads)} 张图片
-失败: {len(failed_downloads)} 张图片
-总计: {len(img_urls)} 张图片
-请点击下方链接下载ZIP文件。
         """
         if failed_downloads:
-            result_info += f"\n\n失败详情:\n" + "\n".join(failed_downloads[:10])  # 只显示前10个错误
         return zip_path, result_info
     except Exception as e:
-        return None, f"处理过程中出错: {str(e)}"
     finally:
-        # 清理临时目录
         try:
             shutil.rmtree(temp_dir)
         except:
             pass
-# 创建Gradio界面
 def create_interface():
-    with gr.Blocks(title="Pinterest图片下载器", theme=gr.themes.Soft()) as interface:
         gr.Markdown("""
-        # 🖼️ Pinterest 图片下载器
-        输入Pinterest搜索页面的URL，批量下载图片。
-        **使用说明:**
-        1. 输入Pinterest搜索页面或板块的完整URL
-        2. 设置要下载的图片数量（建议不超过100张）
-        3. 点击"开始下载"按钮
-        4. 等待处理完成后下载ZIP文件
-        **注意事项:**
-        - 请确保输入的是有效的Pinterest URL
-        - 下载速度取决于网络状况和图片大小
-        - 建议单次下载不超过100张图片
         """)
         with gr.Row():
@@ -254,7 +254,7 @@ def create_interface():
                     label="Pinterest URL",
                     placeholder="https://www.pinterest.com/search/pins/?q=your-search-term",
                     lines=2,
-                    info="输入Pinterest搜索页面或板块的完整URL"
                 )
                 num_images = gr.Slider(
@@ -262,22 +262,22 @@ def create_interface():
                     maximum=500,
                     value=20,
                     step=1,
-                    label="图片数量",
-                    info="要下载的图片数量（建议不超过100张）"
                 )
-                download_btn = gr.Button("🚀 开始下载", variant="primary", size="lg")
             with gr.Column():
                 result_info = gr.Textbox(
-                    label="下载结果",
                     lines=10,
                     interactive=False,
-                    info="显示下载进度和结果信息"
                 )
                 download_file = gr.File(
-                    label="下载文件",
                     interactive=False,
                     visible=False
                 )
@@ -292,7 +292,7 @@ def create_interface():
         ```
         """)
-        # 绑定事件
         def handle_download(url, num):
             zip_path, info = scrape_pinterest_images(url, int(num))
             if zip_path:
@@ -309,7 +309,7 @@ def create_interface():
     return interface
-# 启动应用
 if __name__ == "__main__":
     interface = create_interface()
     interface.launch(

 import sys
 def setup_chrome_driver():
+    """設置Chrome WebDriver，適用於Hugging Face Spaces"""
     chrome_options = Options()
     chrome_options.add_argument("--headless")
     chrome_options.add_argument("--no-sandbox")
     chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
     try:
+        # 嘗試使用系統中的Chrome
         driver = webdriver.Chrome(options=chrome_options)
         return driver
     except Exception as e:
         return None
 def extract_images_from_pinterest(url, max_images, progress_callback=None):
+    """從Pinterest頁面提取圖片URL"""
     driver = setup_chrome_driver()
     if not driver:
+        return [], "無法啟動Chrome WebDriver"
     try:
         driver.get(url)
         no_new_images_count = 0
         while len(imgList) < max_images:
+            # 滾動頁面
             scroll += 800
             driver.execute_script(f'window.scrollTo(0, {scroll})')
             time.sleep(1)
+            # 獲取圖片元素
             imgs = driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id="pin"] img')
             new_images = 0
                     continue
             if progress_callback:
+                progress_callback(f"已找到 {len(imgList)} 張圖片")
+            # 檢查是否沒有新圖片
             if new_images == 0:
                 no_new_images_count += 1
             else:
         return imgList, None
     except Exception as e:
+        return [], f"提取圖片時出錯: {str(e)}"
     finally:
         driver.quit()
 def download_image(args):
+    """下載單張圖片"""
     index, url, temp_dir = args
     try:
+        # 轉換為高清圖片URL
         if '236x' in url:
             url = url.replace('236x', 'originals')
         elif '474x' in url:
         # 生成文件名
         filename = f"pinterest_img_{index+1:04d}"
+        # 從URL提取原始文件名
         url_parts = url.split('/')
         if len(url_parts) > 0:
             original_name = url_parts[-1].split('?')[0]
                 clean_name = re.sub(r'[^\w\-_\.]', '_', original_name)
                 filename = f"pinterest_img_{index+1:04d}_{clean_name}"
+        # 確保文件擴展名
         if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
             filename += '.jpg'
         filepath = os.path.join(temp_dir, filename)
+        # 下載圖片
         headers = {
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
         return True, filename
     except Exception as e:
+        return False, f"下載失敗: {str(e)}"
 def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
+    """主要的爬蟲函數"""
     if not pinterest_url:
+        return None, "請輸入Pinterest URL"
     if num_images <= 0:
+        return None, "圖片數量必須大於0"
+    if num_images > 500:  # 限制最大數量
+        return None, "圖片數量不能超過500張"
+    # 驗證URL
     try:
         parsed_url = urlparse(pinterest_url)
         if 'pinterest.com' not in parsed_url.netloc:
+            return None, "請輸入有效的Pinterest URL"
     except:
+        return None, "URL格式無效"
+    progress(0, desc="開始提取圖片URL...")
+    # 提取圖片URL
     def update_progress(msg):
         progress(0.3, desc=msg)
         return None, error
     if not img_urls:
+        return None, "未找到任何圖片"
+    progress(0.5, desc=f"開始下載 {len(img_urls)} 張圖片...")
+    # 創建臨時目錄
     temp_dir = tempfile.mkdtemp()
     try:
+        # 準備下載參數
         download_args = [(i, url, temp_dir) for i, url in enumerate(img_urls)]
+        # 多線程下載
         successful_downloads = []
         failed_downloads = []
+        with ThreadPoolExecutor(max_workers=3) as executor:  # 降低並發數
             results = list(executor.map(download_image, download_args))
         for i, (success, info) in enumerate(results):
             if success:
                 successful_downloads.append(info)
             else:
+                failed_downloads.append(f"圖片 {i+1}: {info}")
+            # 更新進度
             progress((0.5 + 0.4 * (i + 1) / len(results)),
+                    desc=f"已下載 {len(successful_downloads)} / {len(img_urls)} 張圖片")
         if not successful_downloads:
+            return None, "所有圖片下載失敗"
+        progress(0.9, desc="創建ZIP文件...")
+        # 創建ZIP文件
         zip_path = os.path.join(tempfile.gettempdir(), "pinterest_images.zip")
         with zipfile.ZipFile(zip_path, 'w') as zipf:
             for filename in os.listdir(temp_dir):
         progress(1.0, desc="完成!")
+        # 準備結果信息
         result_info = f"""
+下載完成！
+成功下載: {len(successful_downloads)} 張圖片
+失敗: {len(failed_downloads)} 張圖片
+總計: {len(img_urls)} 張圖片
+請點擊下方鏈接下載ZIP文件。
         """
         if failed_downloads:
+            result_info += f"\n\n失敗詳情:\n" + "\n".join(failed_downloads[:10])  # 只顯示前10個錯誤
         return zip_path, result_info
     except Exception as e:
+        return None, f"處理過程中出錯: {str(e)}"
     finally:
+        # 清理臨時目錄
         try:
             shutil.rmtree(temp_dir)
         except:
             pass
+# 創建Gradio界面
 def create_interface():
+    with gr.Blocks(title="Pinterest圖片下載器", theme=gr.themes.Soft()) as interface:
         gr.Markdown("""
+        # 🖼️ Pinterest 圖片下載器
+        輸入Pinterest搜索頁面的URL，批量下載圖片。
+        **使用說明:**
+        1. 輸入Pinterest搜索頁面或板塊的完整URL
+        2. 設置要下載的圖片數量（建議不超過100張）
+        3. 點擊"開始下載"按鈕
+        4. 等待處理完成後下載ZIP文件
+        **注意事項:**
+        - 請確保輸入的是有效的Pinterest URL
+        - 下載速度取決於網絡狀況和圖片大小
+        - 建議單次下載不超過100張圖片
         """)
         with gr.Row():
                     label="Pinterest URL",
                     placeholder="https://www.pinterest.com/search/pins/?q=your-search-term",
                     lines=2,
+                    info="輸入Pinterest搜索頁面或板塊的完整URL"
                 )
                 num_images = gr.Slider(
                     maximum=500,
                     value=20,
                     step=1,
+                    label="圖片數量",
+                    info="要下載的圖片數量（建議不超過100張）"
                 )
+                download_btn = gr.Button("🚀 開始下載", variant="primary", size="lg")
             with gr.Column():
                 result_info = gr.Textbox(
+                    label="下載結果",
                     lines=10,
                     interactive=False,
+                    info="顯示下載進度和結果信息"
                 )
                 download_file = gr.File(
+                    label="下載文件",
                     interactive=False,
                     visible=False
                 )
         ```
         """)
+        # 綁定事件
         def handle_download(url, num):
             zip_path, info = scrape_pinterest_images(url, int(num))
             if zip_path:
     return interface
+# 啟動應用
 if __name__ == "__main__":
     interface = create_interface()
     interface.launch(