pinteresting / app.py
factorstudios's picture
Upload 5 files
cfa4580 verified
Raw
History Blame Contribute Delete
5.04 kB
import argparse
import os
import time
from urllib.parse import quote

import requests
from playwright.sync_api import sync_playwright
def download_image(url, folder_path, image_name):
    """Download ``url`` and store it as ``image_name`` inside ``folder_path``.

    The body is streamed into a temporary ``<image_name>.part`` file that is
    renamed to its final name only after the whole response has been written.
    A failed or interrupted transfer therefore never leaves a truncated image
    behind and never clobbers a file that already exists under that name.

    Args:
        url: Address of the image to fetch.
        folder_path: Existing directory that receives the file.
        image_name: File name to use inside ``folder_path``.

    Returns:
        ``True`` when the image was saved; ``False`` on a non-200 response or
        on any error (the error is printed, never raised).
    """
    file_path = os.path.join(folder_path, image_name)
    part_path = file_path + ".part"
    try:
        # With stream=True the connection stays checked out until the response
        # is closed; the context manager releases it on every exit path.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return False
            with open(part_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
        # Atomic on the same filesystem: the final name appears only when the
        # download is complete.
        os.replace(part_path, file_path)
    except Exception as e:
        # Deliberately broad: one bad image must not abort the whole scrape.
        print(f"Error downloading {url}: {e}")
        try:
            os.remove(part_path)
        except OSError:
            # Nothing was written yet, or the partial file cannot be removed;
            # either way there is nothing more useful to do here.
            pass
        return False
    print(f"Downloaded: {image_name}")
    return True
# Local Edge build used when present, which avoids downloading Playwright's
# own Chromium on Windows machines.
_DEFAULT_EDGE_PATH = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"

# Characters that are path separators or are rejected in Windows file names.
_UNSAFE_FOLDER_CHARS = frozenset('<>:"/\\|?*')


def _safe_folder_name(keyword):
    """Turn a search keyword into a single, filesystem-safe directory name."""
    name = "".join(
        "_" if (ch == " " or ch in _UNSAFE_FOLDER_CHARS or ord(ch) < 32) else ch
        for ch in keyword
    )
    # "", "." and ".." would resolve to the base folder or its parent.
    if not name.strip("."):
        return "_"
    return name


def _to_high_res(src):
    """Rewrite a 236px-wide thumbnail URL to its 736px-wide variant."""
    # Only the size path segment is rewritten, and only once, so a "236x"
    # occurring elsewhere in the URL is left alone.
    return src.replace("/236x/", "/736x/", 1)


def scrape_pinterest(keyword, count, executable_path=None):
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    Images are written to ``downloads/<keyword>/`` (spaces and characters that
    are unsafe in file names are replaced with ``_``) as ``pinterest_<n>.jpg``.
    The results page is scrolled until enough images were saved or the page
    height stopped growing for several consecutive scrolls.

    Args:
        keyword: Search phrase.
        count: Number of images to download. Nothing is done when it is not
            positive.
        executable_path: Optional browser binary to launch. When omitted, the
            local Microsoft Edge installation is used if it exists, otherwise
            Playwright's own Chromium build.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    if count <= 0:
        # No point in starting a browser when there is nothing to fetch.
        print("Nothing to download: count must be a positive number.")
        return

    # Setup downloads folder
    base_folder = "downloads"
    keyword_folder = os.path.join(base_folder, _safe_folder_name(keyword))
    os.makedirs(keyword_folder, exist_ok=True)

    print(f"Scraping {count} images for '{keyword}'...")
    print(f"Saving to {keyword_folder}/")

    launch_options = {"headless": True}
    if executable_path is None and os.path.isfile(_DEFAULT_EDGE_PATH):
        executable_path = _DEFAULT_EDGE_PATH
    if executable_path is not None:
        launch_options["executable_path"] = executable_path

    with sync_playwright() as p:
        # Pinterest sometimes blocks headless browsers without a realistic
        # user agent, so one is supplied on the context below.
        browser = p.chromium.launch(**launch_options)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # Percent-encode the whole keyword so characters such as "&", "#"
            # or "+" cannot alter the query string.
            search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
            print(f"Navigating to {search_url}")

            try:
                page.goto(search_url, timeout=60000)
                # Wait for content to load
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
                return

            downloaded_count = 0
            seen_urls = set()

            # Scroll and extract
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                # Read every src in one round trip. Per-element attribute
                # lookups can stall until timeout when the page drops a node
                # between listing it and reading it.
                sources = page.locator("div[data-test-id='pin'] img").evaluate_all(
                    "imgs => imgs.map(img => img.getAttribute('src'))"
                )

                for src in sources:
                    if downloaded_count >= count:
                        break
                    if not src:
                        continue

                    high_res_url = _to_high_res(src)
                    if high_res_url in seen_urls:
                        continue
                    # Recorded before the attempt, so a failing URL is not
                    # retried on every scroll.
                    seen_urls.add(high_res_url)

                    image_name = f"pinterest_{downloaded_count+1}.jpg"
                    if download_image(high_res_url, keyword_folder, image_name):
                        downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll down
                print("Scrolling down for more images...")
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)  # Wait for loading

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        print("Reached end of page or no more images loading.")
                        break
                else:
                    no_new_content_count = 0
                    last_height = new_height

            print(f"Finished scraping. Downloaded {downloaded_count} images.")
        finally:
            # Always release the browser, including when scraping raised.
            browser.close()
def _parse_cli_args():
    """Parse the command line: a required keyword and an optional image count."""
    cli = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
    cli.add_argument(
        "keyword",
        type=str,
        help="The keyword to search for on Pinterest",
    )
    cli.add_argument(
        "-c",
        "--count",
        type=int,
        default=10,
        help="Number of images to scrape (default: 10)",
    )
    return cli.parse_args()


# Script entry point: scrape with the options given on the command line.
if __name__ == "__main__":
    options = _parse_cli_args()
    scrape_pinterest(options.keyword, options.count)