Spaces:

factorstudios
/

pinteresting

Sleeping

App Files Files Community

pinteresting / app.py

factorstudios

Upload 5 files

cfa4580 verified 30 days ago

Raw

History Blame Contribute Delete

5.04 kB

	import argparse
	import os
	import time
	import requests
	from playwright.sync_api import sync_playwright

	def download_image(url, folder_path, image_name):
	    """Download the image at *url* and save it as *image_name* in *folder_path*.

	    The body is streamed into a temporary ``<image_name>.part`` file that is
	    moved to its final name only after it has been written completely, so a
	    failed transfer never leaves a truncated image behind and never
	    overwrites an image that already exists under that name.

	    Args:
	        url: Address of the image to fetch.
	        folder_path: Directory to save into (must already exist).
	        image_name: File name to use inside *folder_path*.

	    Returns:
	        True if the image was saved; False on a non-200 response or on any
	        error. Errors are printed instead of raised so that a caller can
	        simply move on to the next image.
	    """
	    part_path = None
	    try:
	        file_path = os.path.join(folder_path, image_name)
	        part_path = file_path + ".part"
	        # With stream=True the connection stays checked out until the body is
	        # consumed or the response is closed; the context manager guarantees
	        # the close on every path, including the non-200 early return.
	        with requests.get(url, stream=True, timeout=10) as response:
	            if response.status_code != 200:
	                print(f"Failed to download {url}: Status code {response.status_code}")
	                return False
	            with open(part_path, 'wb') as f:
	                for chunk in response.iter_content(1024):
	                    f.write(chunk)
	        # Atomic rename: the final name only ever refers to a complete file.
	        os.replace(part_path, file_path)
	        print(f"Downloaded: {image_name}")
	        return True
	    except Exception as e:
	        # Deliberately broad: this is a best-effort, per-image boundary and
	        # the function's contract is to report failure via its return value.
	        print(f"Error downloading {url}: {e}")
	        if part_path is not None:
	            try:
	                os.remove(part_path)
	            except OSError:
	                # Nothing was written yet, or the partial file is already gone.
	                pass
	        return False

	def scrape_pinterest(keyword, count):
	    """Search Pinterest for *keyword* and download up to *count* pin images.

	    Images are stored as ``pinterest_<n>.jpg`` inside
	    ``downloads/<keyword with spaces replaced by underscores>/``. The search
	    results page is scrolled repeatedly until enough images were downloaded
	    or the page height stops growing for several consecutive scrolls.

	    Args:
	        keyword: Search phrase to look up on Pinterest.
	        count: Maximum number of images to download.

	    Returns:
	        None. Progress and errors are reported with ``print``.
	    """
	    # Function-scope import so the block does not depend on a file-level
	    # import that the rest of the file does not have.
	    from urllib.parse import quote

	    # Setup downloads folder
	    base_folder = "downloads"
	    keyword_folder = os.path.join(base_folder, keyword.replace(" ", "_"))
	    os.makedirs(keyword_folder, exist_ok=True)

	    print(f"Scraping {count} images for '{keyword}'...")
	    print(f"Saving to {keyword_folder}/")

	    # Prefer the local Edge installation (saves downloading a browser), but
	    # only when it is actually present; otherwise let Playwright launch its
	    # default Chromium instead of failing on a non-existent path.
	    edge_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
	    launch_options = {"headless": True}
	    if os.path.isfile(edge_path):
	        launch_options["executable_path"] = edge_path

	    with sync_playwright() as p:
	        browser = p.chromium.launch(**launch_options)
	        try:
	            # Pinterest sometimes blocks headless browsers without a
	            # realistic user agent, so one is supplied explicitly.
	            context = browser.new_context(
	                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
	            )
	            page = context.new_page()

	            # Navigate to search page. quote() percent-encodes every reserved
	            # character (spaces become %20, as before), so keywords that
	            # contain '&', '#', '+' or non-ASCII text stay inside the q= value.
	            search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
	            print(f"Navigating to {search_url}")

	            try:
	                page.goto(search_url, timeout=60000)
	                # Wait for content to load
	                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
	            except Exception as e:
	                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
	                return

	            downloaded_count = 0
	            seen_urls = set()

	            # Scroll and extract
	            last_height = page.evaluate("document.body.scrollHeight")
	            no_new_content_count = 0

	            while downloaded_count < count:
	                # Find all image elements within pins
	                images = page.locator("div[data-test-id='pin'] img").all()

	                for img in images:
	                    if downloaded_count >= count:
	                        break

	                    src = img.get_attribute("src")
	                    if not src:
	                        continue

	                    # Thumbnails are typically served from .../236x/...; the
	                    # same path under .../736x/... is a larger rendition.
	                    # Only the size path segment is rewritten so that a
	                    # "236x" elsewhere in the URL is left untouched.
	                    high_res_url = src.replace("/236x/", "/736x/")

	                    if high_res_url in seen_urls:
	                        continue
	                    seen_urls.add(high_res_url)

	                    # The index only advances on success, so saved files are
	                    # numbered consecutively.
	                    image_name = f"pinterest_{downloaded_count+1}.jpg"
	                    if download_image(high_res_url, keyword_folder, image_name):
	                        downloaded_count += 1

	                if downloaded_count >= count:
	                    break

	                # Scroll down
	                print("Scrolling down for more images...")
	                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
	                page.wait_for_timeout(2000)  # Wait for loading

	                new_height = page.evaluate("document.body.scrollHeight")
	                if new_height == last_height:
	                    # Give up once the page height is unchanged after more
	                    # than three consecutive scrolls.
	                    no_new_content_count += 1
	                    if no_new_content_count > 3:
	                        print("Reached end of page or no more images loading.")
	                        break
	                else:
	                    no_new_content_count = 0

	                last_height = new_height

	            print(f"Finished scraping. Downloaded {downloaded_count} images.")
	        finally:
	            # Close the browser on every exit path: normal completion, the
	            # early return after a load error, and unexpected exceptions.
	            browser.close()

	if __name__ == "__main__":
	    # Command-line entry point: one positional search keyword plus an
	    # optional -c/--count limit, both handed straight to scrape_pinterest().
	    cli = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
	    cli.add_argument(
	        "keyword",
	        type=str,
	        help="The keyword to search for on Pinterest",
	    )
	    cli.add_argument(
	        "-c",
	        "--count",
	        type=int,
	        default=10,
	        help="Number of images to scrape (default: 10)",
	    )

	    options = cli.parse_args()
	    scrape_pinterest(options.keyword, options.count)