|
|
from selenium import webdriver
|
|
|
from selenium.webdriver.common.by import By
|
|
|
from selenium.webdriver.chrome.service import Service
|
|
|
from selenium.webdriver.support.ui import WebDriverWait
|
|
|
from selenium.webdriver.support import expected_conditions as EC
|
|
|
from webdriver_manager.chrome import ChromeDriverManager
|
|
|
import time
|
|
|
from selenium.common.exceptions import TimeoutException, WebDriverException
|
|
|
from contextlib import contextmanager
|
|
|
import logging
|
|
|
import json
|
|
|
import os
|
|
|
|
|
|
# Module-level logger named after this module; the scraper class below logs through it.
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class GoogleAdsScraper:
    """Scrape Google search ads and Google display ads with headless Chrome.

    Each public ``scrape_*`` method opens its own WebDriver session and quits
    it before returning. The session runs against a remote Selenium hub when
    a hub URL is configured, otherwise against a local Chrome whose driver
    binary is resolved by ``ChromeDriverManager``.
    """

    # Prefix of the results-page URL; the encoded query is appended to it.
    _SEARCH_URL_PREFIX = "https://www.google.com/search?q="

    def __init__(self, selenium_hub_url=None):
        """Store the hub URL to use for later sessions.

        Args:
            selenium_hub_url: URL of a Selenium hub. When falsy, the
                ``SELENIUM_HUB_URL`` environment variable is used instead;
                if that is unset too, a local Chrome is started.
        """
        self.driver = None
        self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL')

    def _setup_driver(self):
        """Create and return a new headless Chrome WebDriver (remote or local)."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        if self.selenium_hub_url:
            logger.info("Using Selenium Hub at %s", self.selenium_hub_url)
            return webdriver.Remote(
                command_executor=self.selenium_hub_url,
                options=options,
            )

        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

    @contextmanager
    def _get_driver(self):
        """Yield a fresh driver and always quit it when the block exits.

        ``self.driver`` refers to the live driver only for the duration of
        the ``with`` block and is ``None`` otherwise.
        """
        # Forget any driver left from an earlier session first: if
        # _setup_driver() raises, the cleanup below must not try to quit a
        # driver that has already been shut down.
        self.driver = None
        try:
            self.driver = self._setup_driver()
            yield self.driver
        finally:
            if self.driver:
                try:
                    self.driver.quit()
                finally:
                    self.driver = None

    @staticmethod
    def _build_search_url(search_query):
        """Return the Google results URL for the raw text ``search_query``.

        The query is form-encoded so that characters such as spaces, ``&``,
        ``#`` and ``+`` are sent as part of the query instead of being
        interpreted as URL syntax.
        """
        from urllib.parse import quote_plus

        return GoogleAdsScraper._SEARCH_URL_PREFIX + quote_plus(str(search_query))

    @staticmethod
    def _first_attribute(driver, css_selector, attribute):
        """Return ``attribute`` of the first match for ``css_selector``, or None.

        ``None`` is returned when the lookup or the attribute read raises a
        ``WebDriverException`` (for example, no element matches).
        """
        try:
            element = driver.find_element(By.CSS_SELECTOR, css_selector)
            return element.get_attribute(attribute)
        except WebDriverException:
            return None

    def scrape_search_ads(self, search_query, num_pages=3):
        """Scrape Google search ads for a given query.

        Args:
            search_query: Raw (not URL-encoded) search text.
            num_pages: Number of result pages to visit. The first page is
                always scraped; later pages are reached by clicking the
                "next" link.

        Returns:
            A list of ad dicts as produced by ``_extract_search_ads``. If
            moving to a further page fails, the ads gathered so far are
            returned. If a ``TimeoutException``/``WebDriverException``
            escapes the scrape itself, an empty list is returned.
        """
        with self._get_driver() as driver:
            try:
                driver.get(self._build_search_url(search_query))
                driver.implicitly_wait(5)

                ads = []
                ads.extend(self._extract_search_ads(driver))

                for page in range(2, num_pages + 1):
                    try:
                        driver.find_element(By.ID, "pnnext").click()
                        # Fixed pause to let the next results page render.
                        time.sleep(2)
                        ads.extend(self._extract_search_ads(driver))
                    except Exception as e:
                        # Best effort: keep what was collected and stop paging.
                        logger.warning("Could not navigate to page %s: %s", page, e)
                        break

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error("Error during Google Ads scraping: %s", e)
                return []

    def _extract_search_ads(self, driver):
        """Extract ad data from the current search results page.

        Returns:
            A list with one dict per ad container (``div.uEierd``) whose
            title, description and link elements were all found. Keys:
            ``title``, ``description``, ``display_url``, ``target_url``,
            ``position`` (1-based, counted among the ads extracted from this
            page) and ``scrape_time`` (local time, ``%Y-%m-%d %H:%M:%S``).
            An empty list is returned if the page-level lookup fails.
        """
        ads = []
        try:
            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd")

            for ad in ad_elements:
                try:
                    # find_element raises when nothing matches, so an ad that
                    # lacks any of these parts is skipped via the handler below.
                    title_element = ad.find_element(
                        By.CSS_SELECTOR, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb"
                    )
                    desc_element = ad.find_element(
                        By.CSS_SELECTOR, "div.MUxGbd.yDYNvb.lyLwlc"
                    )
                    url_element = ad.find_element(By.CSS_SELECTOR, "a.sVXRqc")

                    ads.append({
                        "title": title_element.text,
                        "description": desc_element.text,
                        "display_url": url_element.text,
                        "target_url": url_element.get_attribute("href"),
                        "position": len(ads) + 1,
                        "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                    })
                except Exception as e:
                    logger.warning("Error extracting ad data: %s", e)
                    continue

            return ads

        except Exception as e:
            logger.error("Error extracting search ads: %s", e)
            return []

    def scrape_display_ads(self, target_url, scroll_count=5):
        """Scrape Google display ads from a specific page.

        The page is scrolled to the bottom ``scroll_count`` times (with a
        fixed pause after each scroll) before iframes whose id starts with
        ``google_ads_iframe`` are inspected.

        Args:
            target_url: URL of the page to load.
            scroll_count: Number of scroll-to-bottom steps to perform.

        Returns:
            A list with one dict per processed iframe. Keys: ``iframe_id``,
            ``width``, ``height``, ``scrape_time``, ``page_url``,
            ``image_url`` and ``target_url`` (the last two are ``None`` when
            the frame has no ``img`` / ``a`` element). An empty list is
            returned if a ``TimeoutException``/``WebDriverException`` escapes.
        """
        with self._get_driver() as driver:
            try:
                driver.get(target_url)
                driver.implicitly_wait(5)

                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

                iframes = driver.find_elements(
                    By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']"
                )

                ads = []
                for iframe in iframes:
                    try:
                        # Read the iframe's own attributes while the driver is
                        # still in the parent document: the element handle
                        # belongs to that document and cannot be used once
                        # the driver has switched into the frame.
                        ad_data = {
                            "iframe_id": iframe.get_attribute("id"),
                            "width": iframe.get_attribute("width"),
                            "height": iframe.get_attribute("height"),
                            "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                            "page_url": target_url,
                        }

                        driver.switch_to.frame(iframe)
                        ad_data["image_url"] = self._first_attribute(driver, "img", "src")
                        ad_data["target_url"] = self._first_attribute(driver, "a", "href")

                        ads.append(ad_data)
                    except Exception as e:
                        logger.warning("Error processing iframe: %s", e)
                    finally:
                        # Always return to the top-level document so the next
                        # iframe handle is resolved in the right context.
                        driver.switch_to.default_content()

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error("Error during Google Display Ads scraping: %s", e)
                return []