"""Command-line scraper that downloads Pinterest search-result images.

The script drives a headless Chromium-family browser through Playwright to
load the Pinterest search page for a keyword, scrolls to reveal more pins, and
saves each discovered image under ``downloads/<keyword>/``.
"""

import argparse
import os
import time
from urllib.parse import quote

import requests
from playwright.sync_api import sync_playwright

# Preferred local Edge binary; using an installed browser avoids downloading
# Playwright's bundled Chromium.
EDGE_EXECUTABLE_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"

# A realistic desktop user agent; Pinterest may block obvious headless clients.
USER_AGENT = (
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
    "(KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
)

PIN_SELECTOR = "div[data-test-id='pin']"
PIN_IMAGE_SELECTOR = "div[data-test-id='pin'] img"

DOWNLOAD_CHUNK_SIZE = 8192
# Number of consecutive scrolls without page growth tolerated before giving up.
MAX_STALLED_SCROLLS = 3


def download_image(url, folder_path, image_name) -> bool:
    """Download ``url`` into ``folder_path/image_name``.

    The body is streamed to a temporary ``.part`` file and renamed into place
    only after the transfer completes, so a failed download never leaves a
    truncated image behind (and never clobbers an existing good file).

    Args:
        url: Address of the image to fetch.
        folder_path: Existing directory to save into.
        image_name: File name to use inside ``folder_path``.

    Returns:
        True if the image was saved, False on a non-200 response or on any
        network / filesystem error (the error is printed, not raised).
    """
    file_path = os.path.join(folder_path, image_name)
    partial_path = file_path + ".part"
    try:
        # The context manager releases the streamed connection back to the
        # pool even when we bail out early or an exception is raised.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return False
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(DOWNLOAD_CHUNK_SIZE):
                    f.write(chunk)
        os.replace(partial_path, file_path)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        _remove_if_exists(partial_path)
        return False

    print(f"Downloaded: {image_name}")
    return True


def _remove_if_exists(path) -> None:
    """Delete ``path``, ignoring the case where it is absent or undeletable."""
    try:
        os.remove(path)
    except OSError:
        pass  # Best-effort cleanup of a temporary file.


def _launch_browser(playwright):
    """Launch headless Edge, preferring the known local install path.

    Falls back to Playwright's ``msedge`` channel lookup when the hard-coded
    path does not exist (e.g. a 64-bit install location or another OS).
    """
    if os.path.isfile(EDGE_EXECUTABLE_PATH):
        return playwright.chromium.launch(
            headless=True,
            executable_path=EDGE_EXECUTABLE_PATH,
        )
    return playwright.chromium.launch(headless=True, channel="msedge")


def _to_high_res_url(src):
    """Map a 236px-wide thumbnail URL to its 736px-wide variant.

    Typical thumbnail: https://i.pinimg.com/236x/xx/xx/xx/...jpg
    Larger variant:    https://i.pinimg.com/736x/xx/xx/xx/...jpg
    Only the size path segment is rewritten; other URLs pass through as-is.
    """
    return src.replace("/236x/", "/736x/")


def _download_from_page(page, keyword_folder, count) -> int:
    """Scroll ``page`` and download pin images until ``count`` are saved.

    Stops early when the page height stops growing for more than
    ``MAX_STALLED_SCROLLS`` consecutive scrolls.

    Returns:
        The number of images successfully downloaded.
    """
    downloaded_count = 0
    seen_urls = set()
    last_height = page.evaluate("document.body.scrollHeight")
    stalled_scrolls = 0

    while downloaded_count < count:
        for img in page.locator(PIN_IMAGE_SELECTOR).all():
            if downloaded_count >= count:
                break
            src = img.get_attribute("src")
            if not src:
                continue

            high_res_url = _to_high_res_url(src)
            if high_res_url in seen_urls:
                continue
            # Mark as seen before downloading so a failing URL is not retried
            # on every subsequent scroll.
            seen_urls.add(high_res_url)

            image_name = f"pinterest_{downloaded_count+1}.jpg"
            if download_image(high_res_url, keyword_folder, image_name):
                downloaded_count += 1

        if downloaded_count >= count:
            break

        print("Scrolling down for more images...")
        page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
        page.wait_for_timeout(2000)  # Give lazy-loaded pins time to appear.

        new_height = page.evaluate("document.body.scrollHeight")
        if new_height == last_height:
            stalled_scrolls += 1
            if stalled_scrolls > MAX_STALLED_SCROLLS:
                print("Reached end of page or no more images loading.")
                break
        else:
            stalled_scrolls = 0
            last_height = new_height

    return downloaded_count


def scrape_pinterest(keyword, count) -> None:
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    Images are written to ``downloads/<keyword with spaces as underscores>/``
    as ``pinterest_1.jpg``, ``pinterest_2.jpg``, ... Progress and errors are
    reported on stdout; page-load failures end the run without raising.

    Args:
        keyword: Search phrase to look up on Pinterest.
        count: Maximum number of images to download.
    """
    keyword_folder = os.path.join("downloads", keyword.replace(" ", "_"))
    os.makedirs(keyword_folder, exist_ok=True)

    print(f"Scraping {count} images for '{keyword}'...")
    print(f"Saving to {keyword_folder}/")

    # Percent-encode the whole keyword so characters such as '&', '#' or
    # non-ASCII text cannot corrupt the query string (spaces become %20).
    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"

    with sync_playwright() as p:
        browser = _launch_browser(p)
        try:
            context = browser.new_context(user_agent=USER_AGENT)
            page = context.new_page()

            print(f"Navigating to {search_url}")
            try:
                page.goto(search_url, timeout=60000)
                # Wait until at least one pin has rendered.
                page.wait_for_selector(PIN_SELECTOR, timeout=15000)
            except Exception as e:  # Top-level boundary: report and stop.
                print(
                    f"Error loading page: {e}. "
                    "Check if Pinterest requires login or if the IP is blocked."
                )
                return

            downloaded_count = _download_from_page(page, keyword_folder, count)
            print(f"Finished scraping. Downloaded {downloaded_count} images.")
        finally:
            # Always release the browser process, even if scraping raised.
            browser.close()


def main() -> None:
    """Parse command-line arguments and run the scraper."""
    parser = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
    parser.add_argument("keyword", type=str, help="The keyword to search for on Pinterest")
    parser.add_argument(
        "-c",
        "--count",
        type=int,
        default=10,
        help="Number of images to scrape (default: 10)",
    )
    args = parser.parse_args()
    scrape_pinterest(args.keyword, args.count)


if __name__ == "__main__":
    main()