factorstudios committed on
Commit
cfa4580
·
verified ·
1 Parent(s): d3bc69b

Upload 5 files

Browse files
Files changed (5) hide show
  1. Dockerfile +25 -0
  2. api_app.py +196 -0
  3. app.py +120 -0
  4. hf_app.py +136 -0
  5. requirements.txt +6 -0
Dockerfile ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
FROM python:3.10-slim

# api_app.py logs with print(); unbuffered stdout makes those lines appear
# immediately in the container logs instead of sitting in a buffer.
ENV PYTHONUNBUFFERED=1

WORKDIR /app

# Install the distro Chromium package (presumably for the shared libraries a
# headless Chromium needs in the slim image). chromium-driver is Selenium's
# chromedriver and is not used by Playwright, so it is not installed.
RUN apt-get update && apt-get install -y \
    chromium \
    && rm -rf /var/lib/apt/lists/*

# Copy requirements first so the dependency layer is reused when only code changes
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Download the Chromium build that Playwright launches
RUN playwright install chromium

# Copy app
COPY api_app.py .

# Port the uvicorn server in api_app.py listens on
EXPOSE 7860

# Run the app
CMD ["python", "api_app.py"]
api_app.py ADDED
@@ -0,0 +1,196 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import asyncio
3
+ from fastapi import FastAPI, HTTPException
4
+ from playwright.async_api import async_playwright
5
+ from pydantic import BaseModel
6
+ from typing import List, Optional
7
+ import uvicorn
8
+
9
# FastAPI application object that the route decorators in this file register on.
app = FastAPI(title="Pinterest Scraper API")

# Cache for search results (optional optimization).
# NOTE(review): nothing in the code visible here reads or writes these names yet.
search_cache = {}
CACHE_DURATION = 300  # seconds, i.e. 5 minutes
14
+
15
class ScrapeRequest(BaseModel):
    """Request body accepted by the scrape endpoint."""

    # Search phrase to look up on Pinterest.
    keyword: str
    # Number of images requested.
    count: int = 10
    # Optional filter. Options: "9:16", "16:9", "1:1", "4:5", "any".
    # Declared Optional so an explicit JSON null is accepted as "no filter".
    aspect_ratio: Optional[str] = None
19
+
20
class ScrapeResponse(BaseModel):
    """Response body returned by the scrape endpoint."""

    # Whether the scrape completed without an error.
    success: bool
    # Human-readable summary, or the error text on failure.
    message: str
    # One dict per image with url, width, height and aspect_ratio.
    images: List[dict]
    # The keyword the request asked for.
    keyword: str
25
+
26
def check_aspect_ratio(width: int, height: int, target_ratio: Optional[str]) -> bool:
    """Check if an image matches the target aspect ratio within tolerance.

    Args:
        width: Image width in pixels.
        height: Image height in pixels.
        target_ratio: One of the known ratio names ("9:16", "16:9", "1:1",
            "4:5", "3:4", "21:9"), or "any"/None/"" to accept everything.

    Returns:
        True when no filtering applies (no target, "any", or an unrecognised
        ratio name) or when width/height is within 15% of the target ratio.
        False when the ratio is out of tolerance or the dimensions are not
        positive, since such an image cannot be shown to match.
    """
    if not target_ratio or target_ratio == "any":
        return True

    ratios = {
        "9:16": 9 / 16,   # Vertical (Shorts/Reels)
        "16:9": 16 / 9,   # Horizontal (Landscape)
        "1:1": 1 / 1,     # Square
        "4:5": 4 / 5,     # Portrait (Instagram)
        "3:4": 3 / 4,     # Portrait (Standard)
        "21:9": 21 / 9,   # Ultrawide
    }

    target = ratios.get(target_ratio)
    if target is None:
        # Unknown ratio names do not filter anything.
        return True

    # Guard before dividing: a zero height would raise ZeroDivisionError.
    if width <= 0 or height <= 0:
        return False

    tolerance = 0.15  # 15% tolerance, relative to the target ratio
    return abs(width / height - target) <= tolerance * target
49
+
50
# JavaScript evaluated in the page. For every pin container it reports the
# image URL, the image's natural size (falling back to the container's
# rendered size when the natural size is 0, e.g. not loaded yet) and the
# container size. Only images served from pinimg.com are kept.
_PIN_EXTRACTION_JS = """
() => {
    const pins = document.querySelectorAll("div[data-test-id='pin']");
    return Array.from(pins).map(pin => {
        const img = pin.querySelector('img');
        if (!img || !img.src) return null;

        // Get actual rendered dimensions from parent container
        const rect = pin.getBoundingClientRect();

        return {
            src: img.src,
            // Use container aspect ratio if image not loaded
            width: img.naturalWidth || Math.round(rect.width),
            height: img.naturalHeight || Math.round(rect.height),
            container_width: Math.round(rect.width),
            container_height: Math.round(rect.height)
        };
    }).filter(item => item && item.src.includes('pinimg.com'));
}
"""


async def scrape_pinterest_api(keyword: str, count: int, aspect_ratio: Optional[str] = None):
    """Collect image URLs from a Pinterest search results page.

    Opens the search page in headless Chromium, repeatedly reads the pins
    currently in the DOM and scrolls for more, until ``count`` images are
    collected, 8 scroll attempts are used, or the page height stops changing.

    Args:
        keyword: Search phrase; it is percent-encoded into the query string.
        count: Maximum number of images to return.
        aspect_ratio: Optional ratio name passed to ``check_aspect_ratio``;
            None or "any" disables filtering.

    Returns:
        A ``(images, error)`` tuple. On success ``images`` is a list of dicts
        with ``url``, ``width``, ``height`` and ``aspect_ratio`` keys and
        ``error`` is None. On failure ``images`` is empty and ``error`` is
        the exception text.
    """
    from urllib.parse import quote

    print(f"Starting scrape for '{keyword}', count={count}, ratio={aspect_ratio}")

    images = []  # List of dict with url, width, height, aspect_ratio

    # Encode the whole keyword: characters such as '&', '#' or '+' would
    # otherwise change the meaning of the query string.
    search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"

    try:
        async with async_playwright() as p:
            browser = await p.chromium.launch(headless=True)
            try:
                context = await browser.new_context(
                    user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
                )
                page = await context.new_page()

                try:
                    await page.goto(search_url, timeout=60000)
                    await page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
                except Exception as e:
                    return [], str(e)

                seen_urls = set()
                last_height = await page.evaluate("document.body.scrollHeight")
                no_new_content_count = 0

                scroll_attempts = 0
                max_scrolls = 8  # Limit scroll attempts

                while len(images) < count and scroll_attempts < max_scrolls:
                    await page.wait_for_timeout(500)  # Let lazy images load

                    img_data = await page.evaluate(_PIN_EXTRACTION_JS)

                    for img_info in img_data:
                        if len(images) >= count:
                            break

                        src = img_info.get("src", "")
                        if not src:
                            continue

                        # Convert thumbnail URLs to the higher-resolution variant
                        high_res_url = src.replace("236x", "736x").replace("474x", "736x")
                        if high_res_url in seen_urls:
                            continue
                        seen_urls.add(high_res_url)

                        # Use natural dimensions if available, else container dimensions
                        width = img_info.get("width", 0) or img_info.get("container_width", 0)
                        height = img_info.get("height", 0) or img_info.get("container_height", 0)

                        # Images with unknown dimensions are not filtered out.
                        passes_ratio = True
                        if aspect_ratio and aspect_ratio != "any" and width > 0 and height > 0:
                            passes_ratio = check_aspect_ratio(width, height, aspect_ratio)
                            print(f"Checking ratio: {width}x{height} = {width/height:.2f} for {aspect_ratio} -> {passes_ratio}")

                        if passes_ratio:
                            images.append({
                                "url": high_res_url,
                                "width": width,
                                "height": height,
                                "aspect_ratio": f"{width}:{height}" if width > 0 and height > 0 else "unknown",
                            })

                    if len(images) >= count:
                        break

                    scroll_attempts += 1

                    # Scroll to the bottom so the page loads more pins
                    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                    await page.wait_for_timeout(800)

                    new_height = await page.evaluate("document.body.scrollHeight")
                    if new_height == last_height:
                        no_new_content_count += 1
                        if no_new_content_count > 3:
                            break
                    else:
                        no_new_content_count = 0

                    last_height = new_height
            finally:
                # Close the browser on every exit path, including errors.
                await browser.close()
    except Exception as e:
        print(f"Playwright error: {e}")
        return [], str(e)

    return images, None
165
+
166
@app.post("/scrape", response_model=ScrapeResponse)
async def scrape(request: ScrapeRequest):
    """Validate a scrape request, run the scraper and wrap its result.

    Raises:
        HTTPException: 400 when the keyword is empty or whitespace-only, or
            when count is outside 1..20.

    Returns:
        A ScrapeResponse. Scraper failures are reported with success=False
        and the error text in ``message`` rather than as an HTTP error.
    """
    # Strip so that a whitespace-only keyword is rejected instead of searched.
    keyword = request.keyword.strip()
    if not keyword:
        raise HTTPException(status_code=400, detail="Keyword is required")

    if not 1 <= request.count <= 20:
        raise HTTPException(status_code=400, detail="Count must be between 1 and 20")

    images, error = await scrape_pinterest_api(keyword, request.count, request.aspect_ratio)

    if error:
        return ScrapeResponse(
            success=False,
            message=f"Error: {error}",
            images=[],
            keyword=request.keyword,
        )

    return ScrapeResponse(
        success=True,
        message=f"Found {len(images)} images",
        images=images,
        keyword=request.keyword,
    )
190
+
191
@app.get("/health")
async def health():
    """Return a static payload indicating the service is running."""
    payload = {"status": "healthy", "service": "pinterest-scraper-api"}
    return payload
194
+
195
if __name__ == "__main__":
    # Listen on all interfaces on port 7860 when run as a script.
    bind_host = "0.0.0.0"
    bind_port = 7860
    uvicorn.run(app, host=bind_host, port=bind_port)
app.py ADDED
@@ -0,0 +1,120 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import time
4
+ import requests
5
+ from playwright.sync_api import sync_playwright
6
+
7
def download_image(url, folder_path, image_name):
    """Download ``url`` into ``folder_path/image_name``.

    Args:
        url: Image URL to fetch.
        folder_path: Existing directory to write into.
        image_name: File name to use inside ``folder_path``.

    Returns:
        True when the server answered 200 and the body was written, False on
        any other status code or on any exception (which is printed).
    """
    try:
        # The context manager releases the streamed connection on every path;
        # without it a stream=True response stays open until garbage-collected.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return False

            file_path = os.path.join(folder_path, image_name)
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)

        print(f"Downloaded: {image_name}")
        return True
    except Exception as e:
        # Best-effort download: report the problem and let the caller move on.
        print(f"Error downloading {url}: {e}")
    return False
22
+
23
def scrape_pinterest(keyword, count):
    """Download up to ``count`` images from a Pinterest search for ``keyword``.

    Images are saved as ``downloads/<keyword>/pinterest_<n>.jpg``. The page is
    scrolled for more pins until enough images are downloaded or the page
    height stops changing for several consecutive scrolls.

    Args:
        keyword: Search phrase.
        count: Number of images to download.

    Returns:
        None. Progress and errors are printed.
    """
    from urllib.parse import quote

    # Setup downloads folder. Characters other than letters, digits, '-' and
    # '_' become '_' so the keyword cannot produce an invalid or nested path.
    base_folder = "downloads"
    safe_name = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in keyword)
    keyword_folder = os.path.join(base_folder, safe_name)
    os.makedirs(keyword_folder, exist_ok=True)

    print(f"Scraping {count} images for '{keyword}'...")
    print(f"Saving to {keyword_folder}/")

    # Prefer the local Edge installation (avoids downloading a browser), but
    # fall back to Playwright's own Chromium when that path does not exist,
    # e.g. on non-Windows machines.
    edge_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    launch_options = {"headless": True}
    if os.path.exists(edge_path):
        launch_options["executable_path"] = edge_path

    with sync_playwright() as p:
        browser = p.chromium.launch(**launch_options)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # Navigate to search page; encode the whole keyword so characters
            # like '&' or '#' do not alter the query string.
            search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"
            print(f"Navigating to {search_url}")

            try:
                page.goto(search_url, timeout=60000)
                # Wait for content to load
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
                return

            downloaded_count = 0
            seen_urls = set()

            # Scroll and extract
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                # Find all image elements within pins
                images = page.locator("div[data-test-id='pin'] img").all()

                for img in images:
                    if downloaded_count >= count:
                        break

                    src = img.get_attribute("src")
                    if not src:
                        continue

                    # Thumbnails are often served as .../236x/...; request the
                    # larger .../736x/... variant instead.
                    high_res_url = src.replace("236x", "736x")
                    if high_res_url in seen_urls:
                        continue
                    seen_urls.add(high_res_url)

                    image_name = f"pinterest_{downloaded_count+1}.jpg"
                    if download_image(high_res_url, keyword_folder, image_name):
                        downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll down
                print("Scrolling down for more images...")
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)  # Wait for loading

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        print("Reached end of page or no more images loading.")
                        break
                else:
                    no_new_content_count = 0

                last_height = new_height

            print(f"Finished scraping. Downloaded {downloaded_count} images.")
        finally:
            # Close the browser on every exit path, including errors.
            browser.close()
112
+
113
if __name__ == "__main__":
    # Command-line entry point: a positional keyword plus an optional count.
    cli = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
    cli.add_argument(
        "keyword",
        type=str,
        help="The keyword to search for on Pinterest",
    )
    cli.add_argument(
        "-c",
        "--count",
        type=int,
        default=10,
        help="Number of images to scrape (default: 10)",
    )

    options = cli.parse_args()
    scrape_pinterest(options.keyword, options.count)
hf_app.py ADDED
@@ -0,0 +1,136 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time
3
+ import requests
4
+ import gradio as gr
5
+ from playwright.sync_api import sync_playwright
6
+
7
# Install playwright browser on startup (HF Space compatible).
# os.system returns the command's exit status; report a failure here rather
# than letting it surface later as an opaque browser-launch error.
_install_status = os.system("playwright install chromium")
if _install_status != 0:
    print(f"Warning: 'playwright install chromium' exited with status {_install_status}")
9
+
10
+ # HF Space deployment - ready
11
+
12
def download_image(url, folder_path, image_name):
    """Download ``url`` into ``folder_path/image_name``.

    Args:
        url: Image URL to fetch.
        folder_path: Existing directory to write into.
        image_name: File name to use inside ``folder_path``.

    Returns:
        The path of the written file when the server answered 200, otherwise
        None (non-200 status or any exception, both of which are printed).
    """
    try:
        # The context manager releases the streamed connection on every path;
        # without it a stream=True response stays open until garbage-collected.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return None

            file_path = os.path.join(folder_path, image_name)
            with open(file_path, 'wb') as f:
                for chunk in response.iter_content(8192):
                    f.write(chunk)
            return file_path
    except Exception as e:
        # Best-effort download: report the problem and let the caller move on.
        print(f"Error downloading {url}: {e}")
    return None
24
+
25
def scrape_pinterest(keyword, count):
    """Download up to ``count`` images from a Pinterest search for ``keyword``.

    Images are saved as ``downloads/<keyword>/pinterest_<n>.jpg``. The page is
    scrolled for more pins until enough images are downloaded or the page
    height stops changing for several consecutive scrolls.

    Args:
        keyword: Search phrase (comes from the web UI, so it is untrusted).
        count: Number of images to download.

    Returns:
        A ``(paths, message)`` tuple: the list of downloaded file paths and a
        status string. If the search page fails to load, the list is empty
        and the message starts with "Error: ".
    """
    from urllib.parse import quote

    # The keyword is user input: keep only letters, digits, '-' and '_' in the
    # folder name so it cannot escape the downloads directory (e.g. "../x").
    base_folder = "downloads"
    safe_name = "".join(ch if ch.isalnum() or ch in "-_" else "_" for ch in keyword)
    keyword_folder = os.path.join(base_folder, safe_name)
    os.makedirs(keyword_folder, exist_ok=True)

    downloaded_paths = []

    with sync_playwright() as p:
        # HF compatible - use default chromium without Edge path
        browser = p.chromium.launch(headless=True)
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
            )
            page = context.new_page()

            # Encode the whole keyword so characters like '&' or '#' do not
            # alter the query string.
            search_url = f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"

            try:
                page.goto(search_url, timeout=60000)
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                return [], f"Error: {str(e)}"

            downloaded_count = 0
            seen_urls = set()
            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                images = page.locator("div[data-test-id='pin'] img").all()

                for img in images:
                    if downloaded_count >= count:
                        break

                    src = img.get_attribute("src")
                    if not src:
                        continue

                    # Ask for the larger 736x variant of 236x thumbnails.
                    high_res_url = src.replace("236x", "736x")
                    if high_res_url in seen_urls:
                        continue
                    seen_urls.add(high_res_url)

                    image_name = f"pinterest_{downloaded_count+1}.jpg"
                    file_path = download_image(high_res_url, keyword_folder, image_name)
                    if file_path:
                        downloaded_paths.append(file_path)
                        downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll to the bottom so the page loads more pins
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        break
                else:
                    no_new_content_count = 0

                last_height = new_height
        finally:
            # Close the browser on every exit path, including errors.
            browser.close()

    return downloaded_paths, f"Downloaded {len(downloaded_paths)} images"
94
+
95
+ # Gradio Interface
96
def scrape_interface(keyword, count):
    """Gradio callback: validate the inputs and run the scraper.

    Args:
        keyword: Text from the keyword box; None, empty and whitespace-only
            values are rejected.
        count: Value from the slider; converted to int because a slider may
            deliver a float.

    Returns:
        A ``(paths, message)`` tuple for the gallery and status outputs.
    """
    keyword = (keyword or "").strip()
    if not keyword:
        return [], "Please enter a keyword"

    paths, msg = scrape_pinterest(keyword, int(count))

    # Return images for display
    return paths, msg
104
+
105
with gr.Blocks(title="Pinterest Image Scraper") as demo:
    # Page heading and short description.
    gr.Markdown("# Pinterest Image Scraper")
    gr.Markdown("Search and download Pinterest images by keyword")

    with gr.Row():
        # Input column: keyword, image count and the trigger button.
        with gr.Column():
            keyword_input = gr.Textbox(
                label="Search Keyword",
                placeholder="Enter keyword (e.g., aesthetic wallpaper, anime girl)",
                value="aesthetic wallpaper",
            )
            count_slider = gr.Slider(
                minimum=1, maximum=20, value=5, step=1, label="Number of Images"
            )
            scrape_btn = gr.Button("Scrape Images", variant="primary")

        # Output column: status text and the image gallery.
        with gr.Column():
            status = gr.Textbox(label="Status")
            gallery = gr.Gallery(label="Downloaded Images", columns=3)

    # Clicking the button passes (keyword, count) to scrape_interface and
    # routes its (paths, message) result to the gallery and status box.
    scrape_btn.click(
        fn=scrape_interface,
        inputs=[keyword_input, count_slider],
        outputs=[gallery, status],
    )
134
+
135
if __name__ == "__main__":
    # Start the Gradio UI only when this file is executed as a script.
    demo.launch()
requirements.txt ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ fastapi
2
+ uvicorn
3
+ playwright
4
+ requests
5
+ pillow
6
+ pydantic