Spaces:

petertulip86
/

Pinterest_Scraping_selenium

Sleeping

App Files Files Community

petertulip86 committed on May 26, 2025

Commit

1e50eb2

verified ·

1 Parent(s): 0785ed1

Create app.py

Browse files

Files changed (1) hide show

app.py +319 -0

app.py ADDED Viewed

	@@ -0,0 +1,319 @@

+import gradio as gr
+import requests
+from concurrent.futures import ThreadPoolExecutor
+import os
+import re
+import zipfile
+import tempfile
+import shutil
+from urllib.parse import urlparse
+import time
+from selenium import webdriver
+from selenium.webdriver.chrome.options import Options
+from selenium.webdriver.chrome.service import Service
+from selenium.webdriver.common.by import By
+from selenium.common.exceptions import TimeoutException, WebDriverException
+import subprocess
+import sys
+def setup_chrome_driver():
+    """设置Chrome WebDriver，适用于Hugging Face Spaces"""
+    chrome_options = Options()
+    chrome_options.add_argument("--headless")
+    chrome_options.add_argument("--no-sandbox")
+    chrome_options.add_argument("--disable-dev-shm-usage")
+    chrome_options.add_argument("--disable-gpu")
+    chrome_options.add_argument("--window-size=1920,1080")
+    chrome_options.add_argument("--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+    try:
+        # 尝试使用系统中的Chrome
+        driver = webdriver.Chrome(options=chrome_options)
+        return driver
+    except Exception as e:
+        print(f"Chrome WebDriver setup failed: {e}")
+        return None
+def extract_images_from_pinterest(url, max_images, progress_callback=None):
+    """从Pinterest页面提取图片URL"""
+    driver = setup_chrome_driver()
+    if not driver:
+        return [], "无法启动Chrome WebDriver"
+    try:
+        driver.get(url)
+        time.sleep(3)
+        imgList = []
+        scroll = 0
+        no_new_images_count = 0
+        while len(imgList) < max_images:
+            # 滚动页面
+            scroll += 800
+            driver.execute_script(f'window.scrollTo(0, {scroll})')
+            time.sleep(1)
+            # 获取图片元素
+            imgs = driver.find_elements(By.CSS_SELECTOR, 'div[data-test-id="pin"] img')
+            new_images = 0
+            for img in imgs:
+                try:
+                    img_url = img.get_attribute('src')
+                    if img_url and img_url not in imgList and len(imgList) < max_images:
+                        imgList.append(img_url)
+                        new_images += 1
+                except:
+                    continue
+            if progress_callback:
+                progress_callback(f"已找到 {len(imgList)} 张图片")
+            # 检查是否没有新图片
+            if new_images == 0:
+                no_new_images_count += 1
+            else:
+                no_new_images_count = 0
+            if no_new_images_count >= 5:
+                break
+        return imgList, None
+    except Exception as e:
+        return [], f"提取图片时出错: {str(e)}"
+    finally:
+        driver.quit()
+def download_image(args):
+    """下载单张图片"""
+    index, url, temp_dir = args
+    try:
+        # 转换为高清图片URL
+        if '236x' in url:
+            url = url.replace('236x', 'originals')
+        elif '474x' in url:
+            url = url.replace('474x', 'originals')
+        # 生成文件名
+        filename = f"pinterest_img_{index+1:04d}"
+        # 从URL提取原始文件名
+        url_parts = url.split('/')
+        if len(url_parts) > 0:
+            original_name = url_parts[-1].split('?')[0]
+            if '.' in original_name and len(original_name) < 100:
+                clean_name = re.sub(r'[^\w\-_\.]', '_', original_name)
+                filename = f"pinterest_img_{index+1:04d}_{clean_name}"
+        # 确保文件扩展名
+        if not filename.lower().endswith(('.jpg', '.jpeg', '.png', '.gif', '.webp')):
+            filename += '.jpg'
+        filepath = os.path.join(temp_dir, filename)
+        # 下载图片
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=30)
+        response.raise_for_status()
+        with open(filepath, 'wb') as f:
+            f.write(response.content)
+        return True, filename
+    except Exception as e:
+        return False, f"下载失败: {str(e)}"
+def scrape_pinterest_images(pinterest_url, num_images, progress=gr.Progress()):
+    """主要的爬虫函数"""
+    if not pinterest_url:
+        return None, "请输入Pinterest URL"
+    if num_images <= 0:
+        return None, "图片数量必须大于0"
+    if num_images > 500:  # 限制最大数量
+        return None, "图片数量不能超过500张"
+    # 验证URL
+    try:
+        parsed_url = urlparse(pinterest_url)
+        if 'pinterest.com' not in parsed_url.netloc:
+            return None, "请输入有效的Pinterest URL"
+    except:
+        return None, "URL格式无效"
+    progress(0, desc="开始提取图片URL...")
+    # 提取图片URL
+    def update_progress(msg):
+        progress(0.3, desc=msg)
+    img_urls, error = extract_images_from_pinterest(pinterest_url, num_images, update_progress)
+    if error:
+        return None, error
+    if not img_urls:
+        return None, "未找到任何图片"
+    progress(0.5, desc=f"开始下载 {len(img_urls)} 张图片...")
+    # 创建临时目录
+    temp_dir = tempfile.mkdtemp()
+    try:
+        # 准备下载参数
+        download_args = [(i, url, temp_dir) for i, url in enumerate(img_urls)]
+        # 多线程下载
+        successful_downloads = []
+        failed_downloads = []
+        with ThreadPoolExecutor(max_workers=3) as executor:  # 降低并发数
+            results = list(executor.map(download_image, download_args))
+        for i, (success, info) in enumerate(results):
+            if success:
+                successful_downloads.append(info)
+            else:
+                failed_downloads.append(f"图片 {i+1}: {info}")
+            # 更新进度
+            progress((0.5 + 0.4 * (i + 1) / len(results)),
+                    desc=f"已下载 {len(successful_downloads)} / {len(img_urls)} 张图片")
+        if not successful_downloads:
+            return None, "所有图片下载失败"
+        progress(0.9, desc="创建ZIP文件...")
+        # 创建ZIP文件
+        zip_path = os.path.join(tempfile.gettempdir(), "pinterest_images.zip")
+        with zipfile.ZipFile(zip_path, 'w') as zipf:
+            for filename in os.listdir(temp_dir):
+                file_path = os.path.join(temp_dir, filename)
+                if os.path.isfile(file_path):
+                    zipf.write(file_path, filename)
+        progress(1.0, desc="完成!")
+        # 准备结果信息
+        result_info = f"""
+下载完成！
+成功下载: {len(successful_downloads)} 张图片
+失败: {len(failed_downloads)} 张图片
+总计: {len(img_urls)} 张图片
+请点击下方链接下载ZIP文件。
+        """
+        if failed_downloads:
+            result_info += f"\n\n失败详情:\n" + "\n".join(failed_downloads[:10])  # 只显示前10个错误
+        return zip_path, result_info
+    except Exception as e:
+        return None, f"处理过程中出错: {str(e)}"
+    finally:
+        # 清理临时目录
+        try:
+            shutil.rmtree(temp_dir)
+        except:
+            pass
+# 创建Gradio界面
+def create_interface():
+    with gr.Blocks(title="Pinterest图片下载器", theme=gr.themes.Soft()) as interface:
+        gr.Markdown("""
+        # 🖼️ Pinterest 图片下载器
+        输入Pinterest搜索页面的URL，批量下载图片。
+        **使用说明:**
+        1. 输入Pinterest搜索页面或板块的完整URL
+        2. 设置要下载的图片数量（建议不超过100张）
+        3. 点击"开始下载"按钮
+        4. 等待处理完成后下载ZIP文件
+        **注意事项:**
+        - 请确保输入的是有效的Pinterest URL
+        - 下载速度取决于网络状况和图片大小
+        - 建议单次下载不超过100张图片
+        """)
+        with gr.Row():
+            with gr.Column():
+                pinterest_url = gr.Textbox(
+                    label="Pinterest URL",
+                    placeholder="https://www.pinterest.com/search/pins/?q=your-search-term",
+                    lines=2,
+                    info="输入Pinterest搜索页面或板块的完整URL"
+                )
+                num_images = gr.Slider(
+                    minimum=1,
+                    maximum=500,
+                    value=20,
+                    step=1,
+                    label="图片数量",
+                    info="要下载的图片数量（建议不超过100张）"
+                )
+                download_btn = gr.Button("🚀 开始下载", variant="primary", size="lg")
+            with gr.Column():
+                result_info = gr.Textbox(
+                    label="下载结果",
+                    lines=10,
+                    interactive=False,
+                    info="显示下载进度和结果信息"
+                )
+                download_file = gr.File(
+                    label="下载文件",
+                    interactive=False,
+                    visible=False
+                )
+        # 示例URL
+        gr.Markdown("""
+        ### 示例URL:
+        ```
+        https://www.pinterest.com/search/pins/?q=landscape%20photography
+        https://www.pinterest.com/search/pins/?q=interior%20design
+        https://www.pinterest.com/search/pins/?q=food%20photography
+        ```
+        """)
+        # 绑定事件
+        def handle_download(url, num):
+            zip_path, info = scrape_pinterest_images(url, int(num))
+            if zip_path:
+                return info, gr.File(value=zip_path, visible=True)
+            else:
+                return info, gr.File(visible=False)
+        download_btn.click(
+            fn=handle_download,
+            inputs=[pinterest_url, num_images],
+            outputs=[result_info, download_file],
+            show_progress=True
+        )
+    return interface
+# 启动应用
+if __name__ == "__main__":
+    interface = create_interface()
+    interface.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=True
+    )