# app/services/google_scraper.py
# (origin: rastof9 — "Saving local changes before rebase", commit 092e58d)
import json
import logging
import os
import time
from contextlib import contextmanager
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager
# Module-level logger, named after this module per standard logging convention.
logger = logging.getLogger(__name__)
class GoogleAdsScraper:
    """Scrape Google search ads and on-page display ads with Selenium.

    A remote Selenium hub is used when ``selenium_hub_url`` is supplied
    (or the ``SELENIUM_HUB_URL`` environment variable is set); otherwise
    a local headless Chrome is launched via webdriver-manager.
    """

    def __init__(self, selenium_hub_url=None):
        # The driver is created lazily, once per scrape, by _get_driver().
        self.driver = None
        self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL')

    def _setup_driver(self):
        """Build and return a headless Chrome WebDriver (remote or local)."""
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        options.add_argument("--no-sandbox")
        # Chrome in containers can exhaust /dev/shm; fall back to /tmp.
        options.add_argument("--disable-dev-shm-usage")
        if self.selenium_hub_url:
            logger.info(f"Using Selenium Hub at {self.selenium_hub_url}")
            return webdriver.Remote(
                command_executor=self.selenium_hub_url,
                options=options
            )
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options
        )

    @contextmanager
    def _get_driver(self):
        """Yield a freshly created driver and always quit it on exit."""
        try:
            self.driver = self._setup_driver()
            yield self.driver
        finally:
            if self.driver:
                self.driver.quit()
                # Fix: drop the stale handle so the instance never keeps a
                # reference to a driver that has already been quit.
                self.driver = None

    def scrape_search_ads(self, search_query, num_pages=3):
        """Scrape Google search ads for a given query.

        Args:
            search_query: Raw query text; URL-encoded before use.
            num_pages: Maximum number of result pages to visit.

        Returns:
            A list of ad dicts (title, description, display_url,
            target_url, position, scrape_time); empty on driver errors.
        """
        with self._get_driver() as driver:
            try:
                # Fix: URL-encode the query so spaces and special characters
                # do not produce a malformed search URL.
                url = f"https://www.google.com/search?q={quote_plus(search_query)}"
                driver.get(url)
                driver.implicitly_wait(5)
                ads = []
                # Process first page
                ads.extend(self._extract_search_ads(driver))
                # Navigate through additional pages if requested
                for page in range(2, num_pages + 1):
                    try:
                        next_button = driver.find_element(By.ID, "pnnext")
                        next_button.click()
                        time.sleep(2)  # allow the next results page to render
                        ads.extend(self._extract_search_ads(driver))
                    except Exception as e:
                        # No "next" button (last page) or click failed: stop paging.
                        logger.warning(f"Could not navigate to page {page}: {e}")
                        break
                return ads
            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during Google Ads scraping: {e}")
                return []

    def _extract_search_ads(self, driver):
        """Extract ad data from the current search results page."""
        ads = []
        try:
            # Ad containers on the results page. NOTE(review): these class
            # names are Google-generated and may change without notice.
            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd")
            for ad in ad_elements:
                try:
                    ad_data = {}
                    # find_element raises when the element is missing; that is
                    # handled by the inner except below.
                    title_element = ad.find_element(By.CSS_SELECTOR, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb")
                    ad_data["title"] = title_element.text if title_element else ""
                    desc_element = ad.find_element(By.CSS_SELECTOR, "div.MUxGbd.yDYNvb.lyLwlc")
                    ad_data["description"] = desc_element.text if desc_element else ""
                    url_element = ad.find_element(By.CSS_SELECTOR, "a.sVXRqc")
                    ad_data["display_url"] = url_element.text if url_element else ""
                    ad_data["target_url"] = url_element.get_attribute("href") if url_element else ""
                    # 1-based position among the ads collected so far.
                    ad_data["position"] = len(ads) + 1
                    ad_data["scrape_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
                    ads.append(ad_data)
                except Exception as e:
                    # Skip a single malformed ad without losing the rest.
                    logger.warning(f"Error extracting ad data: {e}")
                    continue
            return ads
        except Exception as e:
            logger.error(f"Error extracting search ads: {e}")
            return []

    def scrape_display_ads(self, target_url, scroll_count=5):
        """Scrape Google display ads from a specific page.

        Args:
            target_url: Page URL to load.
            scroll_count: Number of scroll-to-bottom passes used to
                trigger lazily loaded ad iframes.

        Returns:
            A list of per-iframe ad dicts; empty on driver errors.
        """
        with self._get_driver() as driver:
            try:
                driver.get(target_url)
                driver.implicitly_wait(5)
                # Scroll to load dynamic content
                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)
                # Google ad slots render inside iframes with this id prefix.
                iframes = driver.find_elements(By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']")
                ads = []
                for iframe in iframes:
                    try:
                        # Fix: read the iframe element's attributes BEFORE
                        # switching into it — the element handle belongs to the
                        # parent frame context and is unusable after the switch.
                        ad_data = {
                            "iframe_id": iframe.get_attribute("id"),
                            "width": iframe.get_attribute("width"),
                            "height": iframe.get_attribute("height"),
                            "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                            "page_url": target_url
                        }
                        driver.switch_to.frame(iframe)
                        # Best-effort: creative image inside the frame.
                        try:
                            img = driver.find_element(By.CSS_SELECTOR, "img")
                            ad_data["image_url"] = img.get_attribute("src")
                        except Exception:  # fix: was a bare except
                            ad_data["image_url"] = None
                        # Best-effort: click-through destination link.
                        try:
                            link = driver.find_element(By.CSS_SELECTOR, "a")
                            ad_data["target_url"] = link.get_attribute("href")
                        except Exception:  # fix: was a bare except
                            ad_data["target_url"] = None
                        ads.append(ad_data)
                        # Switch back to main content
                        driver.switch_to.default_content()
                    except Exception as e:
                        logger.warning(f"Error processing iframe: {e}")
                        driver.switch_to.default_content()
                        continue
                return ads
            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during Google Display Ads scraping: {e}")
                return []