Spaces:
Sleeping
Sleeping
import os
import subprocess
import sys
import time

import gradio as gr
import requests
from playwright.sync_api import sync_playwright
# Install the Playwright Chromium browser on startup (HF Space compatible).
# The installer is run through the current interpreter with an argument list
# (no shell), so it does not depend on the `playwright` console script being
# on PATH. A failed install is reported here instead of being silently
# ignored; the app still starts so the UI can surface later launch errors.
_playwright_install = subprocess.run(
    [sys.executable, "-m", "playwright", "install", "chromium"],
    check=False,
)
if _playwright_install.returncode != 0:
    print(
        "Warning: 'playwright install chromium' exited with code "
        f"{_playwright_install.returncode}"
    )
# HF Space deployment - ready
def download_image(url, folder_path, image_name):
    """Download a single image and store it as ``folder_path/image_name``.

    Args:
        url: Address of the image to fetch.
        folder_path: Existing directory the file is written into.
        image_name: File name (including extension) to use inside the folder.

    Returns:
        The path of the written file, or ``None`` when the server does not
        answer with HTTP 200 or when the request or the file write fails.
    """
    file_path = os.path.join(folder_path, image_name)
    file_opened = False
    try:
        # The context manager releases the streamed connection even when the
        # body is not consumed (non-200) or the transfer fails midway.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Error downloading {url}: HTTP {response.status_code}")
                return None
            with open(file_path, "wb") as f:
                file_opened = True
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        if file_opened:
            # Do not leave a truncated image behind for the gallery to load.
            try:
                os.remove(file_path)
            except OSError:
                pass
        return None
    return file_path
def scrape_pinterest(keyword, count):
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    The search results page is opened in headless Chromium, pin thumbnails
    are collected while scrolling, and each thumbnail URL is rewritten from
    the ``236x`` to the ``736x`` variant before being downloaded into
    ``downloads/<sanitized keyword>/``.

    Args:
        keyword: Search phrase entered by the user.
        count: Maximum number of images to download (coerced to ``int``).

    Returns:
        A ``(paths, message)`` tuple: the list of downloaded file paths and a
        status string. On a navigation failure the list is empty and the
        message starts with ``"Error: "``.
    """
    # Local import keeps the block self-contained without touching the
    # file-level import section.
    from urllib.parse import quote

    # UI sliders may deliver the value as a float.
    count = int(count)

    # The keyword is user input: keep only characters that are safe in a
    # directory name so values such as "../x" cannot escape the base folder.
    folder_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in keyword
    )
    keyword_folder = os.path.join("downloads", folder_name or "_")
    os.makedirs(keyword_folder, exist_ok=True)

    # Percent-encode the whole query so '&', '#', '+', non-ASCII text, etc.
    # do not corrupt the search URL.
    query = quote(keyword, safe="")
    search_url = f"https://www.pinterest.com/search/pins/?q={query}"

    downloaded_paths = []
    seen_urls = set()

    with sync_playwright() as p:
        # HF compatible - use default chromium without Edge path
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            page = context.new_page()

            try:
                page.goto(search_url, timeout=60000)
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                # Boundary with the UI: any navigation/timeout problem is
                # reported as a status message instead of raising.
                return [], f"Error: {str(e)}"

            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while len(downloaded_paths) < count:
                for img in page.locator("div[data-test-id='pin'] img").all():
                    if len(downloaded_paths) >= count:
                        break
                    src = img.get_attribute("src")
                    if not src:
                        continue
                    high_res_url = src.replace("236x", "736x")
                    if high_res_url in seen_urls:
                        continue
                    seen_urls.add(high_res_url)
                    # Files are numbered by successful downloads only.
                    image_name = f"pinterest_{len(downloaded_paths) + 1}.jpg"
                    file_path = download_image(
                        high_res_url, keyword_folder, image_name
                    )
                    if file_path:
                        downloaded_paths.append(file_path)

                if len(downloaded_paths) >= count:
                    break

                # Scroll to the bottom to trigger loading of more pins.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)
                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # Give up after the page height stays unchanged for more
                    # than three consecutive scrolls.
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        break
                else:
                    no_new_content_count = 0
                    last_height = new_height
        finally:
            # Close the browser on every exit path, including exceptions
            # raised while scraping.
            browser.close()

    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"
| # Gradio Interface | |
def scrape_interface(keyword, count):
    """Gradio callback: validate the keyword and run the scraper.

    Args:
        keyword: Text from the search box; may be empty or ``None``.
        count: Requested number of images from the slider.

    Returns:
        A ``(paths, message)`` tuple for the gallery and status components.
        When the keyword is missing or only whitespace, the list is empty and
        the message asks the user to enter a keyword.
    """
    # Treat whitespace-only input like an empty box instead of launching a
    # browser for a blank search.
    cleaned = keyword.strip() if keyword else ""
    if not cleaned:
        return [], "Please enter a keyword"
    # The returned paths are displayed directly by the gallery.
    return scrape_pinterest(cleaned, count)
# UI definition: two-column layout with the search controls on the left and
# the status text plus image gallery on the right.
with gr.Blocks(title="Pinterest Image Scraper") as demo:
    gr.Markdown("# Pinterest Image Scraper")
    gr.Markdown("Search and download Pinterest images by keyword")

    with gr.Row():
        # Input controls.
        with gr.Column():
            keyword_input = gr.Textbox(
                value="aesthetic wallpaper",
                label="Search Keyword",
                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
            )
            count_slider = gr.Slider(
                label="Number of Images", minimum=1, maximum=20, step=1, value=5
            )
            scrape_btn = gr.Button("Scrape Images", variant="primary")

        # Result display.
        with gr.Column():
            status = gr.Textbox(label="Status")
            gallery = gr.Gallery(label="Downloaded Images", columns=3)

    # The callback returns (paths, message), matching [gallery, status].
    scrape_btn.click(
        scrape_interface,
        [keyword_input, count_slider],
        [gallery, status],
    )

# Start the web server only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()