Spaces:

factorstudios
/

pinteresting

Sleeping

App Files Files Community

pinteresting / hf_app.py

factorstudios

Upload 5 files

cfa4580 verified 30 days ago

Raw

History Blame Contribute Delete

4.73 kB

	import os
	import time
	import requests
	import gradio as gr
	from playwright.sync_api import sync_playwright

	# Install playwright browser on startup (HF Space compatible).
	# This runs as a module-level statement, i.e. every time the module is imported.
	# NOTE: the exit status returned by os.system is not checked, so a failed
	# install is not detected at this point.
	os.system("playwright install chromium")

	# HF Space deployment - ready

	def download_image(url, folder_path, image_name):
	    """Download ``url`` and store it as ``folder_path/image_name``.

	    Args:
	        url: Address of the image to fetch.
	        folder_path: Existing directory the file is written into.
	        image_name: File name to use inside ``folder_path``.

	    Returns:
	        The path of the written file on success, otherwise ``None`` (non-200
	        response, network failure, or a file-system error). Failures are
	        reported with ``print`` and never raised to the caller.
	    """
	    file_path = os.path.join(folder_path, image_name)
	    # Tracks whether this call created/truncated the target file, so cleanup
	    # never deletes a file this call did not touch.
	    file_opened = False
	    try:
	        # With stream=True the connection is only released once the body is
	        # consumed or the response is closed; the context manager guarantees
	        # the latter on every path (non-200, mid-transfer failure, success).
	        with requests.get(url, stream=True, timeout=10) as response:
	            if response.status_code != 200:
	                print(f"Error downloading {url}: HTTP {response.status_code}")
	                return None
	            with open(file_path, 'wb') as f:
	                file_opened = True
	                for chunk in response.iter_content(1024):
	                    f.write(chunk)
	        return file_path
	    except (requests.RequestException, OSError) as e:
	        print(f"Error downloading {url}: {e}")
	        if file_opened:
	            # Do not leave a truncated image behind after a failed transfer.
	            try:
	                os.remove(file_path)
	            except OSError:
	                pass
	        return None

	def scrape_pinterest(keyword, count):
	    """Search Pinterest for ``keyword`` and download up to ``count`` images.

	    The search page is opened in headless Chromium. Pin thumbnails are
	    collected, their ``236x`` size marker is swapped for ``736x`` to request a
	    larger rendition, and each new URL is passed to ``download_image``. The
	    page is scrolled to load more pins until enough images were saved or the
	    page height stopped growing for several consecutive scrolls.

	    Args:
	        keyword: Search phrase; also used (sanitised) as the sub-folder name
	            under ``downloads/``.
	        count: Maximum number of images to download. Float values such as
	            ``5.0`` are accepted and truncated to an integer.

	    Returns:
	        A ``(paths, message)`` tuple: the list of saved file paths and a
	        human-readable status. If the search page cannot be loaded the result
	        is ``([], "Error: ...")``.
	    """
	    # Function-scope import keeps this block self-contained.
	    from urllib.parse import quote

	    count = int(count)

	    base_folder = "downloads"
	    # Keep only characters that are safe in a single path component so that
	    # input such as "../x" or "a/b" cannot escape the downloads directory.
	    # Plain "word word" keywords map to "word_word" exactly as before.
	    safe_name = "".join(
	        ch if ch.isalnum() or ch in "-_" else "_" for ch in keyword.strip()
	    ) or "untitled"
	    keyword_folder = os.path.join(base_folder, safe_name)
	    os.makedirs(keyword_folder, exist_ok=True)

	    downloaded_paths = []

	    # Percent-encode the whole keyword; replacing only spaces would let
	    # characters like "&" or "#" truncate or alter the query string.
	    query = quote(keyword, safe="")
	    search_url = f"https://www.pinterest.com/search/pins/?q={query}"

	    with sync_playwright() as p:
	        # HF compatible - use default chromium without Edge path
	        browser = p.chromium.launch(headless=True)
	        try:
	            context = browser.new_context(
	                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
	            )
	            page = context.new_page()

	            try:
	                page.goto(search_url, timeout=60000)
	                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
	            except Exception as e:
	                # Navigation problems are reported to the caller as a status
	                # message instead of being raised.
	                return [], f"Error: {str(e)}"

	            downloaded_count = 0
	            seen_urls = set()
	            last_height = page.evaluate("document.body.scrollHeight")
	            no_new_content_count = 0

	            while downloaded_count < count:
	                images = page.locator("div[data-test-id='pin'] img").all()

	                for img in images:
	                    if downloaded_count >= count:
	                        break

	                    src = img.get_attribute("src")
	                    if not src:
	                        continue

	                    high_res_url = src.replace("236x", "736x")
	                    if high_res_url in seen_urls:
	                        continue
	                    # Mark the URL as seen even if the download fails, so a
	                    # broken image is not retried on every scroll.
	                    seen_urls.add(high_res_url)

	                    image_name = f"pinterest_{downloaded_count+1}.jpg"
	                    file_path = download_image(high_res_url, keyword_folder, image_name)
	                    if file_path:
	                        downloaded_paths.append(file_path)
	                        downloaded_count += 1

	                if downloaded_count >= count:
	                    break

	                # Scroll to the bottom and give the page time to load more pins.
	                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
	                page.wait_for_timeout(2000)

	                new_height = page.evaluate("document.body.scrollHeight")
	                if new_height == last_height:
	                    no_new_content_count += 1
	                    # Give up after more than three scrolls without growth.
	                    if no_new_content_count > 3:
	                        break
	                else:
	                    no_new_content_count = 0

	                last_height = new_height
	        finally:
	            # Close the browser on every exit path, including unexpected
	            # exceptions raised while scraping.
	            browser.close()

	    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"

	# Gradio Interface
	def scrape_interface(keyword, count):
	    """Validate the UI inputs and run the scraper.

	    Args:
	        keyword: Search phrase typed by the user; may be ``None`` or blank.
	        count: Requested number of images, forwarded to ``scrape_pinterest``.

	    Returns:
	        A ``(paths, message)`` tuple. For a missing or whitespace-only keyword
	        this is ``([], "Please enter a keyword")`` and no scrape is started.
	    """
	    # Treat None, "" and whitespace-only input alike; a blank keyword would
	    # otherwise start a browser session for an empty search.
	    keyword = (keyword or "").strip()
	    if not keyword:
	        return [], "Please enter a keyword"

	    paths, msg = scrape_pinterest(keyword, count)

	    # Return images for display
	    return paths, msg

	# Page layout: inputs in the left column, results in the right column.
	with gr.Blocks(title="Pinterest Image Scraper") as demo:
	    gr.Markdown("# Pinterest Image Scraper")
	    gr.Markdown("Search and download Pinterest images by keyword")

	    with gr.Row():
	        # Left column: search controls.
	        with gr.Column():
	            keyword_input = gr.Textbox(
	                value="aesthetic wallpaper",
	                label="Search Keyword",
	                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
	            )
	            count_slider = gr.Slider(
	                label="Number of Images", minimum=1, maximum=20, step=1, value=5
	            )
	            scrape_btn = gr.Button("Scrape Images", variant="primary")

	        # Right column: status text and the downloaded images.
	        with gr.Column():
	            status = gr.Textbox(label="Status")
	            gallery = gr.Gallery(label="Downloaded Images", columns=3)

	    # scrape_interface returns (paths, message), matching [gallery, status].
	    scrape_btn.click(
	        fn=scrape_interface,
	        inputs=[keyword_input, count_slider],
	        outputs=[gallery, status],
	    )

	# Start the web app only when the file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()