File size: 7,399 Bytes
092e58d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
import json
import logging
import os
import time
from contextlib import contextmanager
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Module-level logger named after this module; no handlers or levels are
# configured here.
logger = logging.getLogger(__name__)

class GoogleAdsScraper:
    """Scrape Google search ads and Google display-ad iframes with headless Chrome.

    Every public ``scrape_*`` call creates its own WebDriver session and quits
    it before returning. When a Selenium hub URL is configured a
    ``webdriver.Remote`` session is used; otherwise a local Chrome driver is
    started through ``webdriver_manager``.
    """

    def __init__(self, selenium_hub_url=None):
        """Store the driver configuration without starting a browser.

        Args:
            selenium_hub_url: URL of a Selenium hub. When falsy, the
                ``SELENIUM_HUB_URL`` environment variable is used instead;
                if that is unset as well, a local Chrome driver is used.
        """
        # Set only while a `_get_driver()` context is active.
        self.driver = None
        self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL')

    def _setup_driver(self):
        """Create and return a new headless Chrome WebDriver (remote or local)."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        if self.selenium_hub_url:
            logger.info("Using Selenium Hub at %s", self.selenium_hub_url)
            return webdriver.Remote(
                command_executor=self.selenium_hub_url,
                options=options,
            )
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

    @contextmanager
    def _get_driver(self):
        """Yield a fresh driver and always quit it when the context exits.

        ``self.driver`` points at the live driver only inside the context; it
        is reset to ``None`` on entry and on exit so that a failure in
        ``_setup_driver()`` can never cause an already-closed driver from a
        previous call to be quit a second time.
        """
        self.driver = None
        try:
            self.driver = self._setup_driver()
            yield self.driver
        finally:
            # Detach first so the attribute is cleared even if quit() raises.
            driver, self.driver = self.driver, None
            if driver is not None:
                driver.quit()

    @staticmethod
    def _build_search_url(search_query):
        """Return the Google search URL for *search_query*, safely URL-encoded.

        The query is treated as raw text: characters such as ``&``, ``#``,
        ``+`` and spaces are percent/plus-encoded so they stay inside the
        ``q`` parameter instead of terminating or altering it.
        """
        return f"https://www.google.com/search?q={quote_plus(str(search_query))}"

    def scrape_search_ads(self, search_query, num_pages=3):
        """Scrape Google search ads for a given query.

        Args:
            search_query: Raw (un-encoded) search text.
            num_pages: Maximum number of result pages to visit, including the
                first one. Values of 1 or less visit only the first page.

        Returns:
            A list of ad dicts as produced by ``_extract_search_ads`` for all
            visited pages, in page order. Returns ``[]`` if a
            ``TimeoutException`` or ``WebDriverException`` escapes the scrape.
        """
        with self._get_driver() as driver:
            try:
                driver.get(self._build_search_url(search_query))
                driver.implicitly_wait(5)

                # First page.
                ads = list(self._extract_search_ads(driver))

                # Follow the "next" link for the remaining pages; stop at the
                # first page that cannot be reached and keep what we have.
                for page in range(2, num_pages + 1):
                    try:
                        driver.find_element(By.ID, "pnnext").click()
                        time.sleep(2)
                        ads.extend(self._extract_search_ads(driver))
                    except Exception as e:
                        logger.warning("Could not navigate to page %s: %s", page, e)
                        break

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error("Error during Google Ads scraping: %s", e)
                return []

    def _extract_search_ads(self, driver):
        """Extract ad data from the current search results page.

        Args:
            driver: WebDriver currently showing a search results page.

        Returns:
            A list of dicts with the keys ``title``, ``description``,
            ``display_url``, ``target_url``, ``position`` (1-based index among
            the ads successfully extracted from this page) and ``scrape_time``
            (local time, ``%Y-%m-%d %H:%M:%S``). Ads whose fields cannot be
            located are logged and skipped. Returns ``[]`` if the ad
            containers cannot be queried at all.
        """
        try:
            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd")
        except Exception as e:
            logger.error("Error extracting search ads: %s", e)
            return []

        ads = []
        for ad in ad_elements:
            try:
                # find_element raises when nothing matches, so no None checks
                # are needed on the returned elements.
                title = ad.find_element(
                    By.CSS_SELECTOR, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb"
                ).text
                description = ad.find_element(
                    By.CSS_SELECTOR, "div.MUxGbd.yDYNvb.lyLwlc"
                ).text
                url_element = ad.find_element(By.CSS_SELECTOR, "a.sVXRqc")
                display_url = url_element.text
                target_url = url_element.get_attribute("href")
            except Exception as e:
                logger.warning("Error extracting ad data: %s", e)
                continue

            ads.append({
                "title": title,
                "description": description,
                "display_url": display_url,
                "target_url": target_url,
                "position": len(ads) + 1,
                "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            })

        return ads

    def scrape_display_ads(self, target_url, scroll_count=5):
        """Scrape Google display ads from a specific page.

        Args:
            target_url: URL of the page to load.
            scroll_count: Number of scroll-to-bottom passes (2 s pause each)
                performed before looking for ad iframes.

        Returns:
            A list of dicts, one per ``google_ads_iframe*`` iframe that could
            be processed, with the keys ``iframe_id``, ``width``, ``height``,
            ``scrape_time``, ``page_url``, ``image_url`` and ``target_url``
            (the last two are ``None`` when no ``img`` / ``a`` element is
            found inside the frame). Returns ``[]`` if a ``TimeoutException``
            or ``WebDriverException`` escapes the scrape.
        """
        with self._get_driver() as driver:
            try:
                driver.get(target_url)
                driver.implicitly_wait(5)

                # Scroll to load dynamic content.
                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

                iframes = driver.find_elements(
                    By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']"
                )

                ads = []
                for iframe in iframes:
                    try:
                        ads.append(self._extract_display_ad(driver, iframe, target_url))
                    except Exception as e:
                        logger.warning("Error processing iframe: %s", e)
                        continue

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error("Error during Google Display Ads scraping: %s", e)
                return []

    def _extract_display_ad(self, driver, iframe, page_url):
        """Build the ad dict for one ad iframe, restoring the top-level context."""
        # Read the iframe's own attributes while still in the parent document:
        # the element handle belongs to that document and can no longer be
        # resolved once the driver has switched into the frame.
        ad_data = {
            "iframe_id": iframe.get_attribute("id"),
            "width": iframe.get_attribute("width"),
            "height": iframe.get_attribute("height"),
            "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
            "page_url": page_url,
        }

        try:
            driver.switch_to.frame(iframe)
            ad_data["image_url"] = self._first_attribute(driver, "img", "src")
            ad_data["target_url"] = self._first_attribute(driver, "a", "href")
        finally:
            # Always return to the main document so the next iframe lookup
            # runs in the right context.
            driver.switch_to.default_content()

        return ad_data

    @staticmethod
    def _first_attribute(driver, css_selector, attribute):
        """Return *attribute* of the first element matching *css_selector*, or None."""
        try:
            return driver.find_element(By.CSS_SELECTOR, css_selector).get_attribute(attribute)
        except WebDriverException:
            return None