File size: 5,044 Bytes
cfa4580
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import argparse
import os
import time
import requests
from playwright.sync_api import sync_playwright

def download_image(url, folder_path, image_name):
    """Download ``url`` and save it as ``image_name`` inside ``folder_path``.

    The body is streamed to disk in 1024-byte chunks so the whole image is
    never held in memory.

    Args:
        url: Address of the image to fetch.
        folder_path: Existing directory the file is written into.
        image_name: File name (not a path) to use inside ``folder_path``.

    Returns:
        True if the server answered 200 and the whole body was written,
        False otherwise. Failures are reported on stdout and never raised.
    """
    file_path = os.path.join(folder_path, image_name)
    file_started = False
    try:
        # The context manager releases the pooled connection on every path;
        # with stream=True it would otherwise stay open until garbage-collected.
        with requests.get(url, stream=True, timeout=10) as response:
            if response.status_code != 200:
                print(f"Failed to download {url}: Status code {response.status_code}")
                return False
            with open(file_path, 'wb') as f:
                file_started = True
                for chunk in response.iter_content(1024):
                    f.write(chunk)
    except Exception as e:
        print(f"Error downloading {url}: {e}")
        # Do not leave a truncated image behind, but only remove the file if
        # this call created/truncated it (never a file from an earlier run
        # when the request failed before anything was written).
        if file_started:
            try:
                os.remove(file_path)
            except OSError:
                pass
        return False

    print(f"Downloaded: {image_name}")
    return True

def _build_search_url(keyword):
    """Return the Pinterest pin-search URL for ``keyword``, percent-encoding the query."""
    from urllib.parse import quote

    # safe='' also encodes '/', '&', '#', '+' etc. so the keyword cannot
    # spill into other parts of the URL; spaces still become %20.
    return f"https://www.pinterest.com/search/pins/?q={quote(keyword, safe='')}"


def _edge_launch_options():
    """Return keyword arguments for ``chromium.launch`` that select a local Edge install."""
    edge_path = r"C:\Program Files (x86)\Microsoft\Edge\Application\msedge.exe"
    options = {"headless": True}
    if os.path.isfile(edge_path):
        options["executable_path"] = edge_path
    else:
        # Let Playwright locate Edge itself when it is not at the default
        # 32-bit Windows location (other install dir, macOS, Linux).
        options["channel"] = "msedge"
    return options


def scrape_pinterest(keyword, count):
    """Search Pinterest for ``keyword`` and download up to ``count`` images.

    Images are saved as ``pinterest_<n>.jpg`` under
    ``downloads/<keyword with spaces replaced by underscores>/``. The search
    page is scrolled repeatedly; scraping stops when ``count`` images have
    been saved or the page height stops growing for several scrolls in a row.

    Args:
        keyword: Search phrase.
        count: Maximum number of images to download. Values <= 0 return
            immediately without starting a browser.

    Returns:
        None. Progress and errors are reported on stdout.
    """
    if count <= 0:
        print(f"Nothing to do: requested image count is {count}.")
        return

    # Setup downloads folder
    base_folder = "downloads"
    keyword_folder = os.path.join(base_folder, keyword.replace(" ", "_"))
    os.makedirs(keyword_folder, exist_ok=True)

    print(f"Scraping {count} images for '{keyword}'...")
    print(f"Saving to {keyword_folder}/")

    with sync_playwright() as p:
        # Headless Edge with a realistic desktop user agent; using the
        # locally installed browser avoids downloading Playwright's bundled one.
        browser = p.chromium.launch(**_edge_launch_options())
        try:
            context = browser.new_context(
                user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
            )
            page = context.new_page()

            # Navigate to search page
            search_url = _build_search_url(keyword)
            print(f"Navigating to {search_url}")

            try:
                page.goto(search_url, timeout=60000)
                # Wait for the first pin to be rendered
                page.wait_for_selector("div[data-test-id='pin']", timeout=15000)
            except Exception as e:
                print(f"Error loading page: {e}. Check if Pinterest requires login or if the IP is blocked.")
                return

            downloaded_count = 0
            seen_urls = set()

            last_height = page.evaluate("document.body.scrollHeight")
            no_new_content_count = 0

            while downloaded_count < count:
                # Read every pin image's src in a single round trip. Querying
                # elements one by one can block until timeout if the page
                # removes a pin from the DOM between listing and reading it.
                sources = page.locator("div[data-test-id='pin'] img").evaluate_all(
                    "imgs => imgs.map(img => img.getAttribute('src'))"
                )

                for src in sources:
                    if downloaded_count >= count:
                        break
                    if not src:
                        continue

                    # Thumbnails are typically served from .../236x/...;
                    # request the larger 736x rendition instead.
                    high_res_url = src.replace("236x", "736x")

                    # A URL is marked as seen before downloading, so a failed
                    # download is not retried on later passes.
                    if high_res_url not in seen_urls:
                        seen_urls.add(high_res_url)
                        image_name = f"pinterest_{downloaded_count+1}.jpg"
                        if download_image(high_res_url, keyword_folder, image_name):
                            downloaded_count += 1

                if downloaded_count >= count:
                    break

                # Scroll down
                print("Scrolling down for more images...")
                page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
                page.wait_for_timeout(2000)  # Wait for loading

                new_height = page.evaluate("document.body.scrollHeight")
                if new_height == last_height:
                    # Give up after the height stays unchanged on more than
                    # three consecutive scrolls.
                    no_new_content_count += 1
                    if no_new_content_count > 3:
                        print("Reached end of page or no more images loading.")
                        break
                else:
                    no_new_content_count = 0

                last_height = new_height

            print(f"Finished scraping. Downloaded {downloaded_count} images.")
        finally:
            # Always shut the browser down, including when the loop raises.
            browser.close()

if __name__ == "__main__":
    # Command-line entry point: one positional search keyword plus an
    # optional -c/--count limit, both handed straight to scrape_pinterest.
    cli = argparse.ArgumentParser(description="Pinterest Keyword Scraper")
    cli.add_argument(
        "keyword",
        type=str,
        help="The keyword to search for on Pinterest",
    )
    cli.add_argument(
        "-c",
        "--count",
        type=int,
        default=10,
        help="Number of images to scrape (default: 10)",
    )

    options = cli.parse_args()
    scrape_pinterest(options.keyword, options.count)