Spaces:

factorstudios
/

pinteresting

Sleeping

App Files Files Community

factorstudios committed on May 31

Commit

cfa4580

verified ·

1 Parent(s): d3bc69b

Upload 5 files

Browse files

Files changed (5) hide show

Dockerfile +25 -0
api_app.py +196 -0
app.py +120 -0
hf_app.py +136 -0
requirements.txt +6 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,25 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Install the distribution's Chromium and chromedriver packages.
+# NOTE(review): api_app.py launches Chromium without an executable_path, so
+# Playwright uses the build downloaded by `playwright install` below, not
+# this one; presumably these packages are kept for the shared libraries
+# they pull in -- confirm before removing.
+RUN apt-get update && apt-get install -y \
+    chromium \
+    chromium-driver \
+    && rm -rf /var/lib/apt/lists/*
+# Copy requirements on their own so the pip layer is only rebuilt when
+# requirements.txt changes
+COPY requirements.txt .
+RUN pip install --no-cache-dir -r requirements.txt
+# Download the Chromium build managed by Playwright
+RUN playwright install chromium
+# Copy only the FastAPI app; app.py and hf_app.py are not added to the image
+COPY api_app.py .
+# Port that api_app.py passes to uvicorn.run
+EXPOSE 7860
+# Run the app
+CMD ["python", "api_app.py"]

api_app.py ADDED Viewed

	@@ -0,0 +1,196 @@

+import os
+import asyncio
+from fastapi import FastAPI, HTTPException
+from playwright.async_api import async_playwright
+from pydantic import BaseModel
+from typing import List, Optional
+import uvicorn
+# FastAPI application object; the route handlers below register on it.
+app = FastAPI(title="Pinterest Scraper API")
+# Cache for search results (optional optimization).
+# NOTE(review): nothing in api_app.py reads or writes search_cache or
+# CACHE_DURATION -- either wire them in or drop them.
+search_cache = {}
+CACHE_DURATION = 300  # 5 minutes
+class ScrapeRequest(BaseModel):
+    """Request body for POST /scrape."""
+    # Search phrase to look up on Pinterest.
+    keyword: str
+    # Number of images wanted; the /scrape handler accepts 1 to 20.
+    count: int = 10
+    # Options: "9:16", "16:9", "1:1", "4:5", "3:4", "21:9", "any".
+    # Declared Optional so an explicit JSON null validates like an omitted
+    # field instead of being rejected as "not a string".
+    aspect_ratio: Optional[str] = None
+class ScrapeResponse(BaseModel):
+    """Response body returned by POST /scrape."""
+    # False when the scrape failed; `message` then carries the error text.
+    success: bool
+    message: str
+    images: List[dict]  # Each image has url, width, height, aspect_ratio
+    keyword: str  # The keyword taken from the request
+def check_aspect_ratio(width: int, height: int, target_ratio: str,
+                       tolerance: float = 0.15) -> bool:
+    """Check if image matches target aspect ratio within tolerance.
+
+    Args:
+        width: Image width in pixels.
+        height: Image height in pixels.
+        target_ratio: One of "9:16", "16:9", "1:1", "4:5", "3:4", "21:9".
+            A falsy value, "any", or an unrecognised name disables the
+            check.
+        tolerance: Allowed deviation relative to the target ratio
+            (0.15 means 15%).
+
+    Returns:
+        True when the check is disabled or width/height lies within
+        `tolerance` of the target; False otherwise, including when either
+        dimension is not positive.
+    """
+    if not target_ratio or target_ratio == "any":
+        return True
+    ratios = {
+        "9:16": 9 / 16,    # Vertical (Shorts/Reels)
+        "16:9": 16 / 9,    # Horizontal (Landscape)
+        "1:1": 1 / 1,      # Square
+        "4:5": 4 / 5,      # Portrait (Instagram)
+        "3:4": 3 / 4,      # Portrait (Standard)
+        "21:9": 21 / 9,    # Ultrawide
+    }
+    target = ratios.get(target_ratio)
+    if target is None:
+        # Unknown ratio names are treated as "no filter".
+        return True
+    # An image without a usable size cannot match any ratio; this also
+    # prevents a ZeroDivisionError when height is 0.
+    if width <= 0 or height <= 0:
+        return False
+    return abs(width / height - target) <= tolerance * target
+async def scrape_pinterest_api(keyword: str, count: int, aspect_ratio: Optional[str] = None):
+    """Collect Pinterest image URLs for a search keyword.
+
+    Opens the Pinterest search page in headless Chromium, reads the <img>
+    of every rendered pin, and scrolls for more until `count` images are
+    collected, the scroll budget is used up, or the page stops growing.
+
+    Args:
+        keyword: Search phrase; percent-encoded before it goes into the URL.
+        count: Maximum number of images to return.
+        aspect_ratio: Optional ratio name understood by check_aspect_ratio;
+            None or "any" disables filtering.
+
+    Returns:
+        (images, error): `images` is a list of dicts with the keys url,
+        width, height and aspect_ratio; `error` is None on success or the
+        exception text on failure (in which case `images` is empty).
+    """
+    # Imported here so the function does not depend on a module-level import.
+    from urllib.parse import quote
+
+    print(f"Starting scrape for '{keyword}', count={count}, ratio={aspect_ratio}")
+    images = []  # List of dict with url, width, height
+    # Encode every reserved character, not only spaces, so keywords that
+    # contain '&', '#', '+' or '?' stay inside the q= parameter.
+    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
+    try:
+        async with async_playwright() as p:
+            browser = await p.chromium.launch(headless=True)
+            try:
+                context = await browser.new_context(
+                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+                )
+                page = await context.new_page()
+                try:
+                    await page.goto(search_url, timeout=60000)
+                    await page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
+                except Exception as e:
+                    return [], str(e)
+                seen_urls = set()
+                last_height = await page.evaluate("document.body.scrollHeight")
+                no_new_content_count = 0
+                scroll_attempts = 0
+                max_scrolls = 8  # Limit scroll attempts
+                while len(images) < count and scroll_attempts < max_scrolls:
+                    # Wait for images to load and get dimensions properly
+                    await page.wait_for_timeout(500)  # Let lazy images load
+                    img_data = await page.evaluate("""
+                        () => {
+                            const pins = document.querySelectorAll("div[data-test-id='pin']");
+                            return Array.from(pins).map(pin => {
+                                const img = pin.querySelector('img');
+                                if (!img || !img.src) return null;
+                                // Get actual rendered dimensions from parent container
+                                const rect = pin.getBoundingClientRect();
+                                return {
+                                    src: img.src,
+                                    // Use container aspect ratio if image not loaded
+                                    width: img.naturalWidth || Math.round(rect.width),
+                                    height: img.naturalHeight || Math.round(rect.height),
+                                    container_width: Math.round(rect.width),
+                                    container_height: Math.round(rect.height)
+                                };
+                            }).filter(item => item && item.src.includes('pinimg.com'));
+                        }
+                    """)
+                    for img_info in img_data:
+                        if len(images) >= count:
+                            break
+                        src = img_info.get("src", "")
+                        if not src:
+                            continue
+                        # Convert to high-res URL
+                        high_res_url = src.replace("236x", "736x").replace("474x", "736x")
+                        if high_res_url in seen_urls:
+                            continue
+                        seen_urls.add(high_res_url)
+                        # Use natural dimensions if available, else container dimensions
+                        width = img_info.get("width", 0) or img_info.get("container_width", 0)
+                        height = img_info.get("height", 0) or img_info.get("container_height", 0)
+                        has_size = width > 0 and height > 0
+                        # Images of unknown size are kept rather than filtered out.
+                        passes_ratio = True
+                        if aspect_ratio and aspect_ratio != "any" and has_size:
+                            passes_ratio = check_aspect_ratio(width, height, aspect_ratio)
+                            print(f"Checking ratio: {width}x{height} = {width/height:.2f} for {aspect_ratio} -> {passes_ratio}")
+                        if passes_ratio:
+                            images.append({
+                                "url": high_res_url,
+                                "width": width,
+                                "height": height,
+                                # Both sides must be known for the label to mean anything.
+                                "aspect_ratio": f"{width}:{height}" if has_size else "unknown"
+                            })
+                    if len(images) >= count:
+                        break
+                    scroll_attempts += 1
+                    # Scroll down - reduced wait time
+                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                    await page.wait_for_timeout(800)  # Reduced from 2000ms
+                    new_height = await page.evaluate("document.body.scrollHeight")
+                    if new_height == last_height:
+                        # Page did not grow; give up after repeated stalls.
+                        no_new_content_count += 1
+                        if no_new_content_count > 3:
+                            break
+                    else:
+                        no_new_content_count = 0
+                    last_height = new_height
+            finally:
+                # Close the browser on every exit path, including errors
+                # raised while scraping.
+                await browser.close()
+    except Exception as e:
+        print(f"Playwright error: {e}")
+        return [], str(e)
+    return images, None
+@app.post("/scrape", response_model=ScrapeResponse)
+async def scrape(request: ScrapeRequest):
+    """Handle POST /scrape: validate the request and run the scraper.
+
+    Raises:
+        HTTPException: 400 when the keyword is empty or blank, or when
+            count is outside 1..20.
+
+    Returns:
+        A ScrapeResponse. Scraper failures are reported with success=False
+        and the error text in `message`, not as an HTTP error.
+    """
+    # A whitespace-only keyword is as useless as an empty one; reject it
+    # before a browser is launched for it.
+    if not request.keyword or not request.keyword.strip():
+        raise HTTPException(status_code=400, detail="Keyword is required")
+    if not 1 <= request.count <= 20:
+        raise HTTPException(status_code=400, detail="Count must be between 1 and 20")
+    images, error = await scrape_pinterest_api(request.keyword, request.count, request.aspect_ratio)
+    if error:
+        return ScrapeResponse(
+            success=False,
+            message=f"Error: {error}",
+            images=[],
+            keyword=request.keyword
+        )
+    return ScrapeResponse(
+        success=True,
+        message=f"Found {len(images)} images",
+        images=images,
+        keyword=request.keyword
+    )
+@app.get("/health")
+async def health():
+    """Handle GET /health with a fixed status payload."""
+    payload = dict(status="healthy", service="pinterest-scraper-api")
+    return payload
+# Script entry point: serve the API with uvicorn on all interfaces, port 7860.
+if __name__ == "__main__":
+    uvicorn.run(app, host="0.0.0.0", port=7860)

app.py ADDED Viewed

	@@ -0,0 +1,120 @@

+import argparse
+import os
+import time
+import requests
+from playwright.sync_api import sync_playwright
+def download_image(url, folder_path, image_name):
+    """Download `url` into `folder_path`/`image_name`.
+
+    Args:
+        url: Image URL to fetch.
+        folder_path: Existing directory to write into.
+        image_name: File name to create inside `folder_path`.
+
+    Returns:
+        True when the server answered 200 and the file was written; False
+        on any other status code or on any exception (which is printed).
+    """
+    try:
+        # The context manager releases the streamed connection on every
+        # path; a bare requests.get(stream=True) keeps it open.
+        with requests.get(url, stream=True, timeout=10) as response:
+            if response.status_code != 200:
+                print(f"Failed to download {url}: Status code {response.status_code}")
+                return False
+            file_path = os.path.join(folder_path, image_name)
+            with open(file_path, 'wb') as f:
+                for chunk in response.iter_content(8192):
+                    f.write(chunk)
+        print(f"Downloaded: {image_name}")
+        return True
+    except Exception as e:
+        print(f"Error downloading {url}: {e}")
+    return False
+def scrape_pinterest(keyword, count):
+    """Download up to `count` Pinterest images for `keyword`.
+
+    Images are saved as pinterest_<n>.jpg under downloads/<keyword with
+    spaces replaced by underscores>/. Progress is printed to stdout.
+
+    Args:
+        keyword: Search phrase; percent-encoded before it goes into the URL.
+        count: Number of images to download.
+
+    Returns:
+        None.
+    """
+    # Imported here so the function does not depend on a module-level import.
+    from urllib.parse import quote
+
+    # Setup downloads folder
+    base_folder = "downloads"
+    keyword_folder = os.path.join(base_folder, keyword.replace(" ", "_"))
+    os.makedirs(keyword_folder, exist_ok=True)
+    print(f"Scraping {count} images for '{keyword}'...")
+    print(f"Saving to {keyword_folder}/")
+    # Prefer the local Edge installation to save 150MB of downloads, but
+    # only when it exists; otherwise let Playwright use its own Chromium
+    # instead of failing on machines without Edge at this path.
+    edge_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
+    launch_options = {"headless": True}
+    if os.path.isfile(edge_path):
+        launch_options["executable_path"] = edge_path
+    # Encode every reserved character, not only spaces, so keywords that
+    # contain '&', '#', '+' or '?' stay inside the q= parameter.
+    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
+    with sync_playwright() as p:
+        # Pinterest sometimes blocks headless without proper user agents or
+        # stealth, so a realistic UA is supplied below.
+        browser = p.chromium.launch(**launch_options)
+        try:
+            context = browser.new_context(
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
+            )
+            page = context.new_page()
+            # Navigate to search page
+            print(f"Navigating to {search_url}")
+            try:
+                page.goto(search_url, timeout=60000)
+                # Wait for content to load
+                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
+            except Exception as e:
+                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
+                return
+            downloaded_count = 0
+            seen_urls = set()
+            # Scroll and extract
+            last_height = page.evaluate("document.body.scrollHeight")
+            no_new_content_count = 0
+            while downloaded_count < count:
+                # Find all image elements within pins
+                images = page.locator("div[data-test-id='pin'] img").all()
+                for img in images:
+                    if downloaded_count >= count:
+                        break
+                    src = img.get_attribute("src")
+                    if not src:
+                        continue
+                    # Pinterest thumbnails are often 236x. Ask for the larger version.
+                    # typical url: https://i.pinimg.com/236x/xx/xx/xx/...jpg
+                    # hi-res url: https://i.pinimg.com/736x/xx/xx/xx/...jpg or originals/
+                    high_res_url = src.replace("236x", "736x")
+                    if high_res_url in seen_urls:
+                        continue
+                    seen_urls.add(high_res_url)
+                    image_name = f"pinterest_{downloaded_count+1}.jpg"
+                    if download_image(high_res_url, keyword_folder, image_name):
+                        downloaded_count += 1
+                if downloaded_count >= count:
+                    break
+                # Scroll down
+                print("Scrolling down for more images...")
+                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                page.wait_for_timeout(2000)  # Wait for loading
+                new_height = page.evaluate("document.body.scrollHeight")
+                if new_height == last_height:
+                    # Page did not grow; give up after repeated stalls.
+                    no_new_content_count += 1
+                    if no_new_content_count > 3:
+                        print("Reached end of page or no more images loading.")
+                        break
+                else:
+                    no_new_content_count = 0
+                last_height = new_height
+            print(f"Finished scraping. Downloaded {downloaded_count} images.")
+        finally:
+            # Close the browser on every exit path, including errors
+            # raised while scraping.
+            browser.close()
+# Command-line entry point: a positional keyword plus an optional -c/--count.
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
+    parser.add_argument("keyword", type=str, help="The keyword to search for on Pinterest")
+    parser.add_argument("-c", "--count", type=int, default=10, help="Number of images to scrape (default: 10)")
+    args = parser.parse_args()
+    scrape_pinterest(args.keyword, args.count)

hf_app.py ADDED Viewed

	@@ -0,0 +1,136 @@

+import os
+import time
+import requests
+import gradio as gr
+from playwright.sync_api import sync_playwright
+# Install playwright browser on startup (HF Space compatible).
+# NOTE: this runs on every import of the module, and the command's exit
+# status is discarded, so a failed install only surfaces later at launch.
+os.system("playwright install chromium")
+# HF Space deployment - ready
+def download_image(url, folder_path, image_name):
+    """Download `url` into `folder_path`/`image_name`.
+
+    Args:
+        url: Image URL to fetch.
+        folder_path: Existing directory to write into.
+        image_name: File name to create inside `folder_path`.
+
+    Returns:
+        The path of the written file, or None when the server did not
+        answer 200 or an exception occurred (both are printed).
+    """
+    try:
+        # The context manager releases the streamed connection on every
+        # path; a bare requests.get(stream=True) keeps it open.
+        with requests.get(url, stream=True, timeout=10) as response:
+            if response.status_code != 200:
+                # Report the failure instead of dropping it silently.
+                print(f"Failed to download {url}: Status code {response.status_code}")
+                return None
+            file_path = os.path.join(folder_path, image_name)
+            with open(file_path, 'wb') as f:
+                for chunk in response.iter_content(8192):
+                    f.write(chunk)
+        return file_path
+    except Exception as e:
+        print(f"Error downloading {url}: {e}")
+    return None
+def scrape_pinterest(keyword, count):
+    """Download up to `count` Pinterest images for `keyword`.
+
+    Images are saved as pinterest_<n>.jpg under downloads/<keyword with
+    spaces replaced by underscores>/.
+
+    Args:
+        keyword: Search phrase; percent-encoded before it goes into the URL.
+        count: Number of images to download.
+
+    Returns:
+        (paths, message): the list of saved file paths and a status string.
+        When the search page cannot be loaded, `paths` is empty and the
+        message starts with "Error: ".
+    """
+    # Imported here so the function does not depend on a module-level import.
+    from urllib.parse import quote
+
+    base_folder = "downloads"
+    keyword_folder = os.path.join(base_folder, keyword.replace(" ", "_"))
+    os.makedirs(keyword_folder, exist_ok=True)
+    downloaded_paths = []
+    # Encode every reserved character, not only spaces, so keywords that
+    # contain '&', '#', '+' or '?' stay inside the q= parameter.
+    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
+    with sync_playwright() as p:
+        # HF compatible - use default chromium without Edge path
+        browser = p.chromium.launch(headless=True)
+        try:
+            context = browser.new_context(
+                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+            )
+            page = context.new_page()
+            try:
+                page.goto(search_url, timeout=60000)
+                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
+            except Exception as e:
+                return [], f"Error: {str(e)}"
+            seen_urls = set()
+            last_height = page.evaluate("document.body.scrollHeight")
+            no_new_content_count = 0
+            while len(downloaded_paths) < count:
+                images = page.locator("div[data-test-id='pin'] img").all()
+                for img in images:
+                    if len(downloaded_paths) >= count:
+                        break
+                    src = img.get_attribute("src")
+                    if not src:
+                        continue
+                    # Swap the 236x thumbnail size for the larger 736x one.
+                    high_res_url = src.replace("236x", "736x")
+                    if high_res_url in seen_urls:
+                        continue
+                    seen_urls.add(high_res_url)
+                    image_name = f"pinterest_{len(downloaded_paths)+1}.jpg"
+                    file_path = download_image(high_res_url, keyword_folder, image_name)
+                    if file_path:
+                        downloaded_paths.append(file_path)
+                if len(downloaded_paths) >= count:
+                    break
+                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
+                page.wait_for_timeout(2000)
+                new_height = page.evaluate("document.body.scrollHeight")
+                if new_height == last_height:
+                    # Page did not grow; give up after repeated stalls.
+                    no_new_content_count += 1
+                    if no_new_content_count > 3:
+                        break
+                else:
+                    no_new_content_count = 0
+                last_height = new_height
+        finally:
+            # Close the browser on every exit path, including errors
+            # raised while scraping.
+            browser.close()
+    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"
+# Gradio Interface
+def scrape_interface(keyword, count):
+    """Click handler: check the keyword, then hand off to scrape_pinterest.
+
+    Returns a (gallery items, status text) pair.
+    """
+    if keyword:
+        gallery_items, status_text = scrape_pinterest(keyword, count)
+        # Saved image paths go to the gallery, the message to the status box.
+        return gallery_items, status_text
+    return [], "Please enter a keyword"
+# Build the UI: one row with an input column followed by a results column.
+with gr.Blocks(title="Pinterest Image Scraper") as demo:
+    gr.Markdown("# Pinterest Image Scraper")
+    gr.Markdown("Search and download Pinterest images by keyword")
+    with gr.Row():
+        # Input column: keyword, image count, and the trigger button.
+        with gr.Column():
+            keyword_input = gr.Textbox(
+                label="Search Keyword",
+                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
+                value="aesthetic wallpaper"
+            )
+            count_slider = gr.Slider(
+                minimum=1,
+                maximum=20,
+                value=5,
+                step=1,
+                label="Number of Images"
+            )
+            scrape_btn = gr.Button("Scrape Images", variant="primary")
+        # Results column: status text and the downloaded images.
+        with gr.Column():
+            status = gr.Textbox(label="Status")
+            gallery = gr.Gallery(label="Downloaded Images", columns=3)
+    # scrape_interface returns (paths, message); `outputs` lists the gallery
+    # and the status box in that same order.
+    scrape_btn.click(
+        fn=scrape_interface,
+        inputs=[keyword_input, count_slider],
+        outputs=[gallery, status]
+    )
+# Script entry point: start the Gradio app with its default launch settings.
+if __name__ == "__main__":
+    demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,6 @@

+# Python packages installed by the Dockerfile (pip install -r requirements.txt).
+fastapi
+uvicorn
+playwright
+requests
+pillow
+pydantic
+# NOTE(review): hf_app.py imports gradio, which is not listed here -- add it
+# if hf_app.py is meant to be installed from this file.