pinteresting / hf_app.py
factorstudios's picture
Upload 5 files
cfa4580 verified
Raw
History Blame Contribute Delete
4.73 kB
import os
import re
import time
from urllib.parse import quote

import gradio as gr
import requests
from playwright.sync_api import sync_playwright
# Install playwright browser on startup (HF Space compatible).
# os.system returns the command's exit status; report a failure here instead of
# letting it surface later as an opaque browser-launch error.
_install_status = os.system("playwright install chromium")
if _install_status != 0:
    print(f"Warning: 'playwright install chromium' exited with status {_install_status}")
# HF Space deployment - ready
def download_image(url, folder_path, image_name):
    """Download ``url`` and store it as ``image_name`` inside ``folder_path``.

    Args:
        url: Address of the image to fetch.
        folder_path: Directory that receives the file.
        image_name: File name to use inside ``folder_path``.

    Returns:
        The path of the written file, or ``None`` when the server did not
        answer with HTTP 200 or the download/write failed.
    """
    file_path = os.path.join(folder_path, image_name)
    started_writing = False
    try:
        # Using the response as a context manager guarantees the streamed
        # connection is released even when we bail out early or a chunk fails.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                return None
            started_writing = True
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(1024):
                    f.write(chunk)
        return file_path
    except Exception as e:
        # Deliberately best-effort: one bad image must not abort the scrape.
        print(f"Error downloading {url}: {e}")
        if started_writing:
            # Do not leave a truncated image behind for the caller to display.
            try:
                os.remove(file_path)
            except OSError:
                pass
        return None
def scrape_pinterest(keyword, count, *, max_scrolls=50):
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    Images are saved as ``pinterest_<n>.jpg`` inside
    ``downloads/<sanitized keyword>``.

    Args:
        keyword: Search phrase typed by the user.
        count: Number of images to download.
        max_scrolls: Upper bound on how many times the results page is
            scrolled, so the loop terminates even if the page keeps growing
            while downloads keep failing.

    Returns:
        A ``(paths, message)`` tuple: the list of downloaded file paths and a
        status string. If the results page cannot be loaded the list is empty
        and the message starts with ``"Error: "``.
    """
    base_folder = "downloads"
    # The keyword is user input: map everything except word characters and
    # hyphens to "_" so separators or ".." cannot escape the downloads folder.
    folder_name = re.sub(r"[^\w-]", "_", keyword) or "untitled"
    keyword_folder = os.path.join(base_folder, folder_name)
    os.makedirs(keyword_folder, exist_ok=True)

    # Percent-encode the whole keyword (spaces become %20, as before) so that
    # characters such as "&" or "#" cannot corrupt the query string.
    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"

    downloaded_paths = []
    with sync_playwright() as p:
        # HF compatible - use default chromium without Edge path
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            page = context.new_page()
            try:
                page.goto(search_url, timeout=60000)
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                return [], f"Error: {str(e)}"

            downloaded_count = 0
            seen_urls = set()
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0
            scrolls = 0
            while downloaded_count < count and scrolls < max_scrolls:
                for img in page.locator("div[data-test-id='pin'] img").all():
                    if downloaded_count >= count:
                        break
                    src = img.get_attribute("src")
                    if not src:
                        continue
                    # Swap the thumbnail size segment for the larger rendition.
                    high_res_url = src.replace("236x", "736x")
                    if high_res_url in seen_urls:
                        continue
                    seen_urls.add(high_res_url)
                    image_name = f"pinterest_{downloaded_count+1}.jpg"
                    file_path = download_image(high_res_url, keyword_folder, image_name)
                    if file_path:
                        downloaded_paths.append(file_path)
                        downloaded_count += 1
                if downloaded_count >= count:
                    break

                # Scroll to the bottom to trigger loading of more pins.
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)
                scrolls += 1

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # Give up after several consecutive scrolls without growth.
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        break
                else:
                    no_new_content_count = 0
                last_height = new_height
        finally:
            # Close the browser on every exit path, including exceptions
            # raised while scraping.
            browser.close()
    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"
# Gradio Interface
def scrape_interface(keyword, count):
    """Gradio click handler: validate the inputs and run the scraper.

    Args:
        keyword: Text from the search box; may be ``None`` or blank.
        count: Value of the image-count slider.

    Returns:
        A ``(paths, message)`` tuple for the gallery and the status box.
        A missing or whitespace-only keyword yields an empty list and a
        prompt to enter a keyword, without starting a scrape.
    """
    keyword = (keyword or "").strip()
    if not keyword:
        return [], "Please enter a keyword"
    # Coerce defensively so the scraper always compares against a whole number.
    paths, msg = scrape_pinterest(keyword, int(count))
    # Return images for display
    return paths, msg
# Two-column layout: search controls on the left, results on the right.
with gr.Blocks(title="Pinterest Image Scraper") as demo:
    gr.Markdown("# Pinterest Image Scraper")
    gr.Markdown("Search and download Pinterest images by keyword")

    with gr.Row():
        with gr.Column():
            keyword_input = gr.Textbox(
                value="aesthetic wallpaper",
                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
                label="Search Keyword",
            )
            count_slider = gr.Slider(
                label="Number of Images", minimum=1, maximum=20, step=1, value=5
            )
            scrape_btn = gr.Button("Scrape Images", variant="primary")

        with gr.Column():
            status = gr.Textbox(label="Status")
            gallery = gr.Gallery(label="Downloaded Images", columns=3)

    # The handler returns (paths, message), matching the outputs order below.
    scrape_btn.click(
        fn=scrape_interface,
        outputs=[gallery, status],
        inputs=[keyword_input, count_slider],
    )

# Start the web app only when the file is run as a script.
if __name__ == "__main__":
    demo.launch()