import json
import logging
import os
import time
from contextlib import contextmanager
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

logger = logging.getLogger(__name__)


class GoogleAdsScraper:
    """Collect Google search ads and Google display (iframe) ads via Selenium.

    Each public ``scrape_*`` call starts its own headless Chrome session and
    quits it before returning. The session runs on a remote Selenium hub when
    a hub URL is configured, otherwise on a local Chrome whose driver binary
    is obtained through ``webdriver_manager``.
    """

    def __init__(self, selenium_hub_url=None):
        """Store the hub configuration; no browser is started here.

        Args:
            selenium_hub_url: URL of a remote Selenium hub. When falsy, the
                ``SELENIUM_HUB_URL`` environment variable is used instead; if
                that is unset too, a local Chrome is launched on demand.
        """
        self.driver = None
        self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL')

    def _setup_driver(self):
        """Create and return a new headless Chrome WebDriver (remote or local)."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        if self.selenium_hub_url:
            logger.info(f"Using Selenium Hub at {self.selenium_hub_url}")
            return webdriver.Remote(
                command_executor=self.selenium_hub_url,
                options=options
            )
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

    @contextmanager
    def _get_driver(self):
        """Yield a fresh driver and always quit it when the ``with`` block ends.

        ``self.driver`` is cleared before setup and after quitting, so a
        failed setup can never cause a previously quit session to be quit a
        second time, and no dead driver is left on the instance.
        """
        self.driver = None
        try:
            self.driver = self._setup_driver()
            yield self.driver
        finally:
            if self.driver:
                try:
                    self.driver.quit()
                finally:
                    self.driver = None

    def scrape_search_ads(self, search_query, num_pages=3):
        """Scrape Google search ads for a given query.

        Args:
            search_query: Text to search for. It is URL-encoded before being
                placed in the query string, so pass it un-encoded.
            num_pages: Maximum number of result pages to visit. Paging stops
                early as soon as the "next" link cannot be followed.

        Returns:
            A list of ad dicts as produced by ``_extract_search_ads``, or an
            empty list when a WebDriver error aborts the scrape.
        """
        with self._get_driver() as driver:
            try:
                # Encode the query so characters such as '&', '#' or '+'
                # cannot truncate or alter the search URL.
                encoded_query = quote_plus(str(search_query))
                url = f"https://www.google.com/search?q={encoded_query}"
                driver.get(url)
                driver.implicitly_wait(5)

                ads = []

                # Process first page
                ads.extend(self._extract_search_ads(driver))

                # Navigate through additional pages if requested
                for page in range(2, num_pages + 1):
                    try:
                        next_button = driver.find_element(By.ID, "pnnext")
                        next_button.click()
                        # Fixed pause to let the next results page render.
                        time.sleep(2)
                        ads.extend(self._extract_search_ads(driver))
                    except Exception as e:
                        logger.warning(f"Could not navigate to page {page}: {e}")
                        break

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during Google Ads scraping: {e}")
                return []

    def _extract_search_ads(self, driver):
        """Extract ad data from the current search results page.

        An ad is included only when its title, description and link elements
        are all found; if any lookup fails, the ad is skipped with a warning.

        Returns:
            A list of dicts with the keys ``title``, ``description``,
            ``display_url``, ``target_url``, ``position`` (1-based rank among
            the ads extracted from this page) and ``scrape_time`` (local
            time, ``%Y-%m-%d %H:%M:%S``). Returns an empty list when the ad
            containers themselves cannot be queried.
        """
        ads = []
        try:
            # Look for ad containers.
            # NOTE(review): the class-name selectors below are hard-coded and
            # presumably mirror Google's markup at the time of writing -
            # verify them against the live page.
            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd")

            for ad in ad_elements:
                try:
                    # find_element raises when nothing matches (it never
                    # returns a falsy value), so no None-checks are needed.
                    title_element = ad.find_element(
                        By.CSS_SELECTOR, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb"
                    )
                    desc_element = ad.find_element(
                        By.CSS_SELECTOR, "div.MUxGbd.yDYNvb.lyLwlc"
                    )
                    url_element = ad.find_element(By.CSS_SELECTOR, "a.sVXRqc")

                    ad_data = {
                        "title": title_element.text,
                        "description": desc_element.text,
                        "display_url": url_element.text,
                        "target_url": url_element.get_attribute("href"),
                        "position": len(ads) + 1,
                        "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                    }
                    ads.append(ad_data)
                except Exception as e:
                    logger.warning(f"Error extracting ad data: {e}")
                    continue

            return ads

        except Exception as e:
            logger.error(f"Error extracting search ads: {e}")
            return []

    @staticmethod
    def _first_attribute(driver, css_selector, attribute):
        """Return ``attribute`` of the first element matching ``css_selector``.

        Returns ``None`` when the lookup fails with a WebDriver error (for
        example, no element matches in the current frame).
        """
        try:
            element = driver.find_element(By.CSS_SELECTOR, css_selector)
            return element.get_attribute(attribute)
        except WebDriverException:
            return None

    def scrape_display_ads(self, target_url, scroll_count=5):
        """Scrape Google display ads from a specific page.

        Args:
            target_url: Page to load and inspect.
            scroll_count: Number of scroll-to-bottom passes performed (with a
                two-second pause after each) before iframes are collected.

        Returns:
            A list with one dict per iframe whose id starts with
            ``google_ads_iframe``, holding ``iframe_id``, ``width``,
            ``height``, ``scrape_time``, ``page_url``, ``image_url`` and
            ``target_url`` (the last two are ``None`` when no ``img`` / ``a``
            element is found inside the frame). Returns an empty list when a
            WebDriver error aborts the scrape.
        """
        with self._get_driver() as driver:
            try:
                driver.get(target_url)
                driver.implicitly_wait(5)

                # Scroll to load dynamic content
                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

                # Extract iframe ads
                iframes = driver.find_elements(By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']")

                ads = []
                for iframe in iframes:
                    try:
                        # Read the iframe's own attributes while still in the
                        # parent document: the element reference is not
                        # usable once the driver has switched into the frame.
                        ad_data = {
                            "iframe_id": iframe.get_attribute("id"),
                            "width": iframe.get_attribute("width"),
                            "height": iframe.get_attribute("height"),
                            "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                            "page_url": target_url
                        }

                        try:
                            # Switch to iframe context
                            driver.switch_to.frame(iframe)
                            # Try to get the ad image
                            ad_data["image_url"] = self._first_attribute(driver, "img", "src")
                            # Try to get the ad destination
                            ad_data["target_url"] = self._first_attribute(driver, "a", "href")
                        finally:
                            # Always return to the top-level document so the
                            # next iframe is resolved in the right context.
                            driver.switch_to.default_content()

                        ads.append(ad_data)
                    except Exception as e:
                        logger.warning(f"Error processing iframe: {e}")
                        continue

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during Google Display Ads scraping: {e}")
                return []