Spaces:

factorstudios
/

pinteresting

Sleeping

File size: 5,044 Bytes

cfa4580

import argparse
import os
import time
from urllib.parse import quote

import requests
from playwright.sync_api import sync_playwright

def download_image(url, folder_path, image_name):
    """Download ``url`` to ``folder_path/image_name``.

    The response body is streamed to disk in 1024-byte chunks. Failures
    (non-200 status, network errors, file-system errors) are reported on
    stdout instead of being raised, so one bad image does not abort the
    caller's loop.

    Args:
        url: Address of the image to fetch.
        folder_path: Directory that receives the file.
        image_name: File name to create inside ``folder_path``.

    Returns:
        True when the image was written completely, False otherwise.
    """
    # Set only once the destination has been opened, so cleanup never
    # deletes a file this call did not create/truncate itself.
    written_path = None
    try:
        # Using the response as a context manager releases the connection
        # even when the streamed body is not fully consumed.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return False

            file_path = os.path.join(folder_path, image_name)
            with open(file_path, 'wb') as f:
                written_path = file_path
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        if written_path is not None:
            # Best-effort removal of the truncated file; a failure here is
            # not worth reporting on top of the download error.
            try:
                os.remove(written_path)
            except OSError:
                pass
        return False

    print(f"Downloaded: {image_name}")
    return True

def _launch_browser(playwright):
    """Launch headless Chromium, using the local Edge install when it exists."""
    # Reusing an installed Edge avoids downloading Playwright's own browser
    # build (~150MB). On machines without it, fall back to the default
    # Chromium instead of failing at launch.
    edge_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    launch_options = {"headless": True}
    if os.path.isfile(edge_path):
        launch_options["executable_path"] = edge_path
    return playwright.chromium.launch(**launch_options)


def _collect_pin_image_srcs(page):
    """Return the ``src`` attribute of every pin image currently in the page."""
    # One in-page call snapshots all attributes at once, instead of one
    # round trip per element against nodes that may have been detached
    # while earlier images were being downloaded.
    return page.locator("div[data-test-id='pin'] img").evaluate_all(
        "elements => elements.map(el => el.getAttribute('src'))"
    )


def scrape_pinterest(keyword, count):
    """Download up to ``count`` Pinterest search-result images for ``keyword``.

    Images are saved as ``pinterest_<n>.jpg`` under
    ``downloads/<keyword with spaces replaced by underscores>/``. The search
    page is scrolled repeatedly until enough images were downloaded or the
    page height stops growing for several consecutive scrolls.

    Args:
        keyword: Search phrase; it is percent-encoded before being placed in
            the search URL.
        count: Maximum number of images to download.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    # Setup downloads folder
    base_folder = "downloads"
    keyword_folder = os.path.join(base_folder, keyword.replace(" ", "_"))
    os.makedirs(keyword_folder, exist_ok=True)

    print(f"Scraping {count} images for '{keyword}'...")
    print(f"Saving to {keyword_folder}/")

    with sync_playwright() as p:
        # Pinterest sometimes blocks headless browsers without a realistic
        # user agent, so one is supplied explicitly below.
        browser = _launch_browser(p)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # Percent-encode the whole keyword (spaces become %20) so that
            # characters such as '&', '#', '+' or non-ASCII text cannot
            # break or alter the query string.
            encoded_keyword = quote(keyword, safe="")
            search_url = f"https://www.pinterest.com/search/pins/?q={encoded_keyword}"
            print(f"Navigating to {search_url}")

            try:
                page.goto(search_url, timeout=60000)
                # Wait for content to load
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
                return

            downloaded_count = 0
            seen_urls = set()

            # Page height is tracked to detect when scrolling stops
            # producing new content.
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                for src in _collect_pin_image_srcs(page):
                    if downloaded_count >= count:
                        break
                    if not src:
                        continue

                    # Thumbnails are typically served from a "236x" path,
                    # e.g. https://i.pinimg.com/236x/xx/xx/xx/...jpg;
                    # request the larger "736x" rendition instead.
                    high_res_url = src.replace("236x", "736x")
                    if high_res_url in seen_urls:
                        continue

                    # Marked as seen before downloading, so a URL that
                    # fails is not retried on every later pass.
                    seen_urls.add(high_res_url)
                    image_name = f"pinterest_{downloaded_count+1}.jpg"
                    if download_image(high_res_url, keyword_folder, image_name):
                        downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll down
                print("Scrolling down for more images...")
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)  # Wait for loading

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # Give up only after several consecutive scrolls that
                    # did not grow the page.
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        print("Reached end of page or no more images loading.")
                        break
                else:
                    no_new_content_count = 0

                last_height = new_height

            print(f"Finished scraping. Downloaded {downloaded_count} images.")
        finally:
            # Close the browser on every exit path, including unexpected
            # errors raised while scraping.
            browser.close()

if __name__ == "__main__":
    # Command-line entry point: read the search keyword and the optional
    # image count, then hand both to the scraper.
    cli = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
    cli.add_argument(
        "keyword",
        type=str,
        help="The keyword to search for on Pinterest",
    )
    cli.add_argument(
        "-c",
        "--count",
        type=int,
        default=10,
        help="Number of images to scrape (default: 10)",
    )

    options = cli.parse_args()
    scrape_pinterest(options.keyword, options.count)