Spaces:
Sleeping
Sleeping
| import argparse | |
| import os | |
| import time | |
| import requests | |
| from playwright.sync_api import sync_playwright | |
def download_image(url, folder_path, image_name):
    """Download *url* and save it as *image_name* inside *folder_path*.

    The response body is streamed to disk in chunks so the whole image is
    never held in memory.

    Args:
        url: Address of the image to fetch.
        folder_path: Existing directory the file is written into.
        image_name: File name (including extension) to save under.

    Returns:
        True if the server answered 200 and the file was written completely,
        False otherwise. Network and filesystem failures are reported on
        stdout rather than raised.
    """
    file_path = os.path.join(folder_path, image_name)
    # Becomes True once open() has truncated/created the target file, so the
    # cleanup below never deletes a file this call did not touch.
    file_opened = False
    try:
        # With stream=True the connection is only returned to the pool once
        # the body is consumed or the response is closed; the context manager
        # guarantees the close on the non-200 and error paths too.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return False
            with open(file_path, "wb") as f:
                file_opened = True
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
    except (requests.RequestException, OSError) as e:
        print(f"Error downloading {url}: {e}")
        if file_opened:
            # Do not leave a truncated image behind after a mid-stream failure.
            try:
                os.remove(file_path)
            except OSError:
                pass
        return False
    print(f"Downloaded: {image_name}")
    return True
def _launch_edge(playwright):
    """Launch headless Microsoft Edge through Playwright's Chromium driver."""
    # Using the locally installed Edge avoids downloading Playwright's own
    # Chromium build (~150MB).
    edge_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    if os.path.isfile(edge_path):
        return playwright.chromium.launch(headless=True, executable_path=edge_path)
    # Edge is installed somewhere else (64-bit Program Files, macOS, Linux):
    # let Playwright locate the branded browser itself.
    return playwright.chromium.launch(headless=True, channel="msedge")


def scrape_pinterest(keyword, count):
    """Search Pinterest for *keyword* and download up to *count* images.

    Images are saved as ``pinterest_<n>.jpg`` under ``downloads/<keyword>/``,
    where spaces and characters that are invalid in file names are replaced
    by underscores. The search results page is scrolled repeatedly until
    enough images were downloaded or the page stops growing.

    Args:
        keyword: Search phrase to look up on Pinterest.
        count: Maximum number of images to download.

    Returns:
        None. Progress and errors are printed to stdout.
    """
    from urllib.parse import quote

    from playwright.sync_api import Error as PlaywrightError

    # Setup downloads folder
    base_folder = "downloads"
    # Besides spaces, neutralise path separators and characters Windows
    # rejects in file names so any keyword yields a single valid directory.
    unsafe_chars = str.maketrans(dict.fromkeys(' <>:"/\\|?*', "_"))
    keyword_folder = os.path.join(base_folder, keyword.translate(unsafe_chars))
    os.makedirs(keyword_folder, exist_ok=True)

    print(f"Scraping {count} images for '{keyword}'...")
    print(f"Saving to {keyword_folder}/")

    if count <= 0:
        # Nothing to fetch: skip launching a browser altogether.
        print("Finished scraping. Downloaded 0 images.")
        return

    with sync_playwright() as p:
        # Pinterest sometimes blocks headless browsers without a realistic
        # user agent, so one is supplied explicitly below.
        browser = _launch_edge(p)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # Navigate to search page. Percent-encode the whole keyword so
            # characters such as '&', '#' or '+' cannot corrupt the query.
            search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
            print(f"Navigating to {search_url}")
            try:
                page.goto(search_url, timeout=60000)
                # Wait for content to load
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except PlaywrightError as e:
                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
                return

            downloaded_count = 0
            seen_urls = set()

            # Scroll and extract
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                # Snapshot every pin image's src in one round trip. Reading
                # them element by element can stall on nodes the page removes
                # while earlier downloads are still running.
                sources = page.locator("div[data-test-id='pin'] img").evaluate_all(
                    "imgs => imgs.map(img => img.getAttribute('src'))"
                )

                for src in sources:
                    if downloaded_count >= count:
                        break
                    if not src:
                        continue

                    # Pinterest thumbnails are often 236x. Ask for the larger version.
                    # typical url: https://i.pinimg.com/236x/xx/xx/xx/...jpg
                    # hi-res url:  https://i.pinimg.com/736x/xx/xx/xx/...jpg or originals/
                    # Only the size path segment is rewritten, never the file name.
                    high_res_url = src.replace("/236x/", "/736x/", 1)
                    if high_res_url in seen_urls:
                        continue
                    seen_urls.add(high_res_url)

                    image_name = f"pinterest_{downloaded_count+1}.jpg"
                    if download_image(high_res_url, keyword_folder, image_name):
                        downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll down
                print("Scrolling down for more images...")
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)  # Wait for loading

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # Give up only after several consecutive scrolls that did
                    # not make the page any taller.
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        print("Reached end of page or no more images loading.")
                        break
                else:
                    no_new_content_count = 0
                    last_height = new_height

            print(f"Finished scraping. Downloaded {downloaded_count} images.")
        finally:
            # Always release the browser, including when the loop raises.
            browser.close()
if __name__ == "__main__":
    # Command-line entry point: one positional search keyword plus an
    # optional -c/--count limit on how many images to fetch.
    cli = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
    cli.add_argument(
        "keyword",
        type=str,
        help="The keyword to search for on Pinterest",
    )
    cli.add_argument(
        "-c",
        "--count",
        type=int,
        default=10,
        help="Number of images to scrape (default: 10)",
    )
    options = cli.parse_args()
    scrape_pinterest(keyword=options.keyword, count=options.count)