# Hugging Face Space: factorstudios/pinteresting (status: Sleeping)
# File size: 4,730 Bytes
# Revision: cfa4580

import os
import time
import requests
import gradio as gr
from playwright.sync_api import sync_playwright

# Install the Playwright Chromium browser on startup (HF Space compatible).
# os.system returns a non-zero status when the command fails; report that here
# so a broken install is visible in the logs instead of only surfacing later
# as a browser launch error.
if os.system("playwright install chromium") != 0:
    print("Warning: 'playwright install chromium' failed; browser launch may not work")

# HF Space deployment - ready

def download_image(url, folder_path, image_name):
    """Download ``url`` into ``folder_path/image_name`` and return the saved path.

    The body is streamed into a temporary ``.part`` file that is renamed to the
    final name only after the transfer completes, so a failed download never
    leaves a truncated image under the final name.

    Args:
        url: Address of the image to fetch.
        folder_path: Directory the image is written to (must already exist).
        image_name: File name to use inside ``folder_path``.

    Returns:
        The path of the saved file, or ``None`` when the server does not answer
        with HTTP 200 or any error occurs (the error is printed, not raised).
    """
    part_path = None
    try:
        file_path = os.path.join(folder_path, image_name)
        part_path = file_path + ".part"
        # The context manager releases the streamed connection on every exit
        # path; with stream=True it is otherwise held until fully consumed.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Error downloading {url}: HTTP {response.status_code}")
                return None
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
        os.replace(part_path, file_path)
        return file_path
    except Exception as e:
        # Deliberately broad: downloads are best-effort and one bad image must
        # not abort the caller's scraping loop.
        print(f"Error downloading {url}: {e}")
        if part_path is not None and os.path.exists(part_path):
            try:
                os.remove(part_path)
            except OSError:
                # Cleanup is best-effort too; the failure was already reported.
                pass
    return None

def scrape_pinterest(keyword, count):
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    Opens the Pinterest search page in headless Chromium, reads the ``src`` of
    every pin image, rewrites the ``236x`` size token to ``736x`` and saves
    each previously unseen URL with ``download_image`` as
    ``pinterest_<n>.jpg`` inside ``downloads/<keyword>``. The page is scrolled
    to the bottom between passes until enough images are saved or the page
    height stays unchanged for more than three consecutive scrolls.

    Args:
        keyword: Search phrase. It is percent-encoded for the URL; for the
            folder name every character other than letters, digits, ``-`` and
            ``_`` becomes ``_`` so the folder always stays inside ``downloads``.
        count: Maximum number of images to download. Values ``<= 0`` return
            immediately without starting a browser.

    Returns:
        A ``(paths, message)`` tuple: the list of saved file paths and a status
        string (``"Downloaded N images"``, or ``"Error: ..."`` with an empty
        list when the search page could not be loaded).
    """
    # Local import keeps this block self-contained; the file does not import
    # urllib at module level.
    from urllib.parse import quote

    if count <= 0:
        # Nothing to fetch: skip the browser launch and the network round trip.
        return [], "Downloaded 0 images"

    base_folder = "downloads"
    # The keyword comes from the UI, so never let it supply path separators
    # or "..": spaces still map to "_" exactly as before.
    safe_name = "".join(
        ch if ch.isalnum() or ch in "-_" else "_" for ch in keyword
    ) or "_"
    keyword_folder = os.path.join(base_folder, safe_name)
    os.makedirs(keyword_folder, exist_ok=True)

    downloaded_paths = []

    with sync_playwright() as p:
        # HF compatible - use default chromium without Edge path
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            page = context.new_page()

            # safe="" also encodes "/", so the whole keyword stays in ?q=.
            search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"

            try:
                page.goto(search_url, timeout=60000)
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                return [], f"Error: {str(e)}"

            downloaded_count = 0
            seen_urls = set()
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                images = page.locator("div[data-test-id='pin'] img").all()

                for img in images:
                    if downloaded_count >= count:
                        break

                    src = img.get_attribute("src")
                    if not src:
                        continue

                    high_res_url = src.replace("236x", "736x")

                    # A URL is marked as seen even if its download fails, so
                    # it is not retried on later scroll passes.
                    if high_res_url not in seen_urls:
                        seen_urls.add(high_res_url)
                        image_name = f"pinterest_{downloaded_count+1}.jpg"
                        file_path = download_image(high_res_url, keyword_folder, image_name)
                        if file_path:
                            downloaded_paths.append(file_path)
                            downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll to the bottom and give the page time to load more pins.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        break
                else:
                    no_new_content_count = 0

                last_height = new_height
        finally:
            # Close the browser on every exit path, including exceptions
            # raised while reading the page.
            browser.close()

    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"

# Gradio Interface
def scrape_interface(keyword, count):
    """Gradio click handler: validate the keyword and run the scraper.

    Args:
        keyword: Text from the search box; surrounding whitespace is ignored.
        count: Number of images requested.

    Returns:
        A ``(paths, message)`` tuple for the gallery and status outputs. When
        the keyword is missing, empty or only whitespace, the result is an
        empty list and a prompt to enter a keyword, and no scraping happens.
    """
    # Strip first so a whitespace-only entry is rejected rather than searched.
    keyword = keyword.strip() if keyword else ""
    if not keyword:
        return [], "Please enter a keyword"

    paths, msg = scrape_pinterest(keyword, count)

    # Return images for display
    return paths, msg

# Page layout: a heading, then one row holding a controls column and a
# results column.
with gr.Blocks(title="Pinterest Image Scraper") as demo:
    gr.Markdown("# Pinterest Image Scraper")
    gr.Markdown("Search and download Pinterest images by keyword")

    with gr.Row():
        # First column: search controls.
        with gr.Column():
            keyword_input = gr.Textbox(
                value="aesthetic wallpaper",
                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
                label="Search Keyword",
            )
            count_slider = gr.Slider(
                label="Number of Images", minimum=1, maximum=20, step=1, value=5
            )
            scrape_btn = gr.Button("Scrape Images", variant="primary")

        # Second column: status text and the downloaded images.
        with gr.Column():
            status = gr.Textbox(label="Status")
            gallery = gr.Gallery(label="Downloaded Images", columns=3)

    # The handler returns (paths, message), matching the [gallery, status] order.
    scrape_btn.click(
        scrape_interface,
        [keyword_input, count_slider],
        [gallery, status],
    )

# Start the app only when the file is executed as a script.
if __name__ == "__main__":
    demo.launch()