"""Gradio app that searches Pinterest for a keyword and downloads the images found."""

import os
import re
import subprocess
import time
from urllib.parse import quote

import gradio as gr
import requests
from playwright.sync_api import sync_playwright

# Install the Playwright browser on startup (HF Space compatible).
# Best-effort: a failed or missing CLI must not prevent the app from starting;
# a real problem will surface later when Chromium is launched.
try:
    subprocess.run(["playwright", "install", "chromium"], check=False)
except OSError as e:
    print(f"Could not run 'playwright install chromium': {e}")

_DOWNLOAD_ROOT = "downloads"
_PIN_SELECTOR = "div[data-test-id='pin']"
_PIN_IMAGE_SELECTOR = "div[data-test-id='pin'] img"
_USER_AGENT = "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
# Number of consecutive scrolls without page growth tolerated before giving up.
_MAX_STALLED_SCROLLS = 3


def download_image(url, folder_path, image_name):
    """Download ``url`` to ``folder_path/image_name``.

    Args:
        url: Address of the image to fetch.
        folder_path: Existing directory the file is written into.
        image_name: File name to use inside ``folder_path``.

    Returns:
        The path of the written file, or ``None`` when the server did not
        answer with HTTP 200 or the request / file write failed (the error is
        printed, not raised).
    """
    try:
        # With stream=True the connection stays checked out until the response
        # is closed, so use it as a context manager.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                return None
            file_path = os.path.join(folder_path, image_name)
            with open(file_path, "wb") as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
            return file_path
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
    return None


def scrape_pinterest(keyword, count):
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    Images are saved as ``downloads/<keyword>/pinterest_<n>.jpg``, where
    ``<keyword>`` is the keyword with every character other than word
    characters and ``-`` replaced by ``_``.

    Args:
        keyword: Search text; it is URL-encoded before being sent.
        count: Maximum number of images to download (coerced to ``int``).

    Returns:
        A ``(paths, message)`` tuple: the list of downloaded file paths and a
        status string. If the search page cannot be loaded, ``paths`` is empty
        and ``message`` starts with ``"Error: "``.
    """
    # The UI slider may hand over a float; file names and comparisons want an int.
    count = int(count)

    # The keyword is user input: neutralise path separators and dots so it
    # cannot escape the download root (e.g. "../../etc").
    safe_name = re.sub(r"[^\w\-]", "_", keyword)
    keyword_folder = os.path.join(_DOWNLOAD_ROOT, safe_name)
    os.makedirs(keyword_folder, exist_ok=True)

    # Percent-encode everything, not only spaces, so "&", "#", "?" and
    # non-ASCII text stay inside the q parameter.
    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"

    downloaded_paths = []
    seen_urls = set()

    with sync_playwright() as p:
        # HF compatible - use default chromium without Edge path
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(user_agent=_USER_AGENT)
            page = context.new_page()

            try:
                page.goto(search_url, timeout=60000)
                page.wait_for_selector(_PIN_SELECTOR, timeout=15000)
            except Exception as e:
                return [], f"Error: {str(e)}"

            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while len(downloaded_paths) < count:
                # Read every src in one round trip. Per-element lookups can
                # stall on elements that the page re-renders while scrolling.
                sources = page.locator(_PIN_IMAGE_SELECTOR).evaluate_all(
                    "imgs => imgs.map(img => img.getAttribute('src'))"
                )

                for src in sources:
                    if len(downloaded_paths) >= count:
                        break
                    if not src:
                        continue

                    # Ask for the "736x" variant instead of the "236x" one
                    # (presumably the thumbnail size segment of the URL).
                    high_res_url = src.replace("236x", "736x")
                    if high_res_url in seen_urls:
                        continue
                    seen_urls.add(high_res_url)

                    image_name = f"pinterest_{len(downloaded_paths) + 1}.jpg"
                    file_path = download_image(high_res_url, keyword_folder, image_name)
                    if file_path:
                        downloaded_paths.append(file_path)

                if len(downloaded_paths) >= count:
                    break

                # Scroll to the bottom and give the page time to load more pins.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # The page did not grow; stop after several stalled scrolls.
                    no_new_content_count += 1
                    if no_new_content_count > _MAX_STALLED_SCROLLS:
                        break
                else:
                    no_new_content_count = 0
                    last_height = new_height
        finally:
            # Close the browser on every exit path, including exceptions.
            browser.close()

    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"


# Gradio Interface
def scrape_interface(keyword, count):
    """Gradio callback: validate the keyword and run the scraper.

    Returns:
        ``(paths, message)`` for the gallery and the status box. An empty or
        whitespace-only keyword yields ``([], "Please enter a keyword")``.
    """
    if not keyword or not keyword.strip():
        return [], "Please enter a keyword"

    paths, msg = scrape_pinterest(keyword.strip(), count)
    # Return images for display
    return paths, msg


with gr.Blocks(title="Pinterest Image Scraper") as demo:
    gr.Markdown("# Pinterest Image Scraper")
    gr.Markdown("Search and download Pinterest images by keyword")

    with gr.Row():
        with gr.Column():
            keyword_input = gr.Textbox(
                label="Search Keyword",
                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
                value="aesthetic wallpaper",
            )
            count_slider = gr.Slider(
                minimum=1,
                maximum=20,
                value=5,
                step=1,
                label="Number of Images",
            )
            scrape_btn = gr.Button("Scrape Images", variant="primary")

        with gr.Column():
            status = gr.Textbox(label="Status")
            gallery = gr.Gallery(label="Downloaded Images", columns=3)

    scrape_btn.click(
        fn=scrape_interface,
        inputs=[keyword_input, count_slider],
        outputs=[gallery, status],
    )

if __name__ == "__main__":
    demo.launch()