Spaces:

rastof9
/

fb

Runtime error

File size: 7,399 Bytes

092e58d

import json
import logging
import os
import time
from contextlib import contextmanager
from urllib.parse import quote_plus

from selenium import webdriver
from selenium.common.exceptions import TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Module-level logger named after this module; this file configures no handlers or levels.
logger = logging.getLogger(__name__)

class GoogleAdsScraper:
    """Scrape Google search ads and display (iframe) ads with headless Chrome.

    Each public ``scrape_*`` call starts its own browser session and quits it
    before returning. Sessions run against a remote Selenium hub when a hub
    URL is configured, otherwise against a local ChromeDriver installed via
    ``webdriver_manager``.
    """

    def __init__(self, selenium_hub_url=None):
        """Remember which Selenium endpoint to use.

        Args:
            selenium_hub_url: Command-executor URL of a remote Selenium hub.
                When falsy, the ``SELENIUM_HUB_URL`` environment variable is
                used instead; if that is unset as well, a local Chrome
                driver is started.
        """
        self.driver = None
        self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL')

    def _setup_driver(self):
        """Create and return a new headless Chrome WebDriver.

        Returns:
            A ``webdriver.Remote`` bound to the configured hub, or a local
            ``webdriver.Chrome`` when no hub URL is set.
        """
        options = webdriver.ChromeOptions()
        options.add_argument("--headless")
        # NOTE(review): these two flags are typically needed when Chrome runs
        # inside a container; presumably that is the deployment target.
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")

        if self.selenium_hub_url:
            logger.info(f"Using Selenium Hub at {self.selenium_hub_url}")
            return webdriver.Remote(
                command_executor=self.selenium_hub_url,
                options=options
            )
        return webdriver.Chrome(
            service=Service(ChromeDriverManager().install()),
            options=options,
        )

    @contextmanager
    def _get_driver(self):
        """Yield a fresh driver and quit it when the ``with`` block exits.

        ``self.driver`` refers to the live driver only while the block is
        active and is reset to ``None`` afterwards, so a failed start-up can
        never call ``quit()`` on a driver left over from an earlier call.

        Raises:
            Whatever ``_setup_driver`` raises when the session cannot start.
        """
        self.driver = None
        driver = self._setup_driver()
        self.driver = driver
        try:
            yield driver
        finally:
            try:
                driver.quit()
            finally:
                self.driver = None

    @staticmethod
    def _build_search_url(search_query):
        """Return the Google search URL for *search_query*, URL-encoded."""
        # Encode the query so spaces, '&', '#', etc. stay inside the q= value.
        return f"https://www.google.com/search?q={quote_plus(str(search_query))}"

    @staticmethod
    def _find_first(parent, selector):
        """Return the first match of CSS *selector* under *parent*, or None."""
        # find_elements returns an empty list instead of raising when nothing
        # matches, which lets callers fall back to a default value.
        matches = parent.find_elements(By.CSS_SELECTOR, selector)
        return matches[0] if matches else None

    def scrape_search_ads(self, search_query, num_pages=3):
        """Scrape Google search ads for a given query.

        Args:
            search_query: Raw (unencoded) search text.
            num_pages: Number of result pages to visit. The first page is
                always processed; paging stops early when the "next" link
                cannot be found or clicked.

        Returns:
            A list of ad dictionaries as produced by ``_extract_search_ads``,
            or an empty list when a WebDriver error interrupts the scrape.

        Raises:
            Whatever ``_setup_driver`` raises when the browser session cannot
            be started (that happens before the error handling below).
        """
        with self._get_driver() as driver:
            try:
                driver.get(self._build_search_url(search_query))
                # Applies to the element lookups that follow, not to the page load.
                driver.implicitly_wait(5)

                # Process first page
                ads = self._extract_search_ads(driver)

                # Navigate through additional pages if requested
                for page in range(2, num_pages + 1):
                    try:
                        driver.find_element(By.ID, "pnnext").click()
                        # Fixed pause to let the next results page render.
                        time.sleep(2)
                        ads.extend(self._extract_search_ads(driver))
                    except Exception as e:
                        logger.warning(f"Could not navigate to page {page}: {e}")
                        break

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during Google Ads scraping: {e}")
                return []

    def _extract_search_ads(self, driver):
        """Extract ad data from the current search results page.

        A sub-element that is missing from an ad yields an empty string for
        the corresponding field instead of discarding the whole ad.

        Args:
            driver: WebDriver currently showing a search results page.

        Returns:
            A list of dictionaries with the keys ``title``, ``description``,
            ``display_url``, ``target_url``, ``position`` (1-based, counted
            per page) and ``scrape_time`` (local time); empty on failure.
        """
        try:
            # NOTE(review): the class names below look like Google's generated
            # CSS classes and are presumably tied to one markup version.
            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd")
        except Exception as e:
            logger.error(f"Error extracting search ads: {e}")
            return []

        ads = []
        for ad in ad_elements:
            try:
                title_element = self._find_first(ad, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb")
                desc_element = self._find_first(ad, "div.MUxGbd.yDYNvb.lyLwlc")
                url_element = self._find_first(ad, "a.sVXRqc")

                ads.append({
                    "title": title_element.text if title_element is not None else "",
                    "description": desc_element.text if desc_element is not None else "",
                    "display_url": url_element.text if url_element is not None else "",
                    "target_url": url_element.get_attribute("href") if url_element is not None else "",
                    "position": len(ads) + 1,
                    "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                })
            except Exception as e:
                # One broken ad (e.g. a stale element) must not lose the rest.
                logger.warning(f"Error extracting ad data: {e}")
                continue

        return ads

    def scrape_display_ads(self, target_url, scroll_count=5):
        """Scrape Google display ads from a specific page.

        Args:
            target_url: Page to load and inspect.
            scroll_count: How many times to scroll to the bottom of the page
                (with a two-second pause each) before collecting iframes.

        Returns:
            A list of dictionaries with the keys ``iframe_id``, ``width``,
            ``height``, ``scrape_time``, ``page_url``, ``image_url`` and
            ``target_url`` (the last two are ``None`` when the frame has no
            ``img`` / ``a`` element), or an empty list when a WebDriver error
            interrupts the scrape.

        Raises:
            Whatever ``_setup_driver`` raises when the browser session cannot
            be started (that happens before the error handling below).
        """
        with self._get_driver() as driver:
            try:
                driver.get(target_url)
                driver.implicitly_wait(5)

                # Scroll to load dynamic content
                for _ in range(scroll_count):
                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                    time.sleep(2)

                # Extract iframe ads
                iframes = driver.find_elements(By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']")

                ads = []
                for iframe in iframes:
                    try:
                        # Read the iframe's own attributes BEFORE switching:
                        # the element belongs to the parent document and can
                        # no longer be resolved from inside the frame.
                        ad_data = {
                            "iframe_id": iframe.get_attribute("id"),
                            "width": iframe.get_attribute("width"),
                            "height": iframe.get_attribute("height"),
                            "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
                            "page_url": target_url
                        }

                        try:
                            # Switch to iframe context
                            driver.switch_to.frame(iframe)

                            # Try to get the ad image
                            img = self._find_first(driver, "img")
                            ad_data["image_url"] = (
                                img.get_attribute("src") if img is not None else None
                            )

                            # Try to get the ad destination
                            link = self._find_first(driver, "a")
                            ad_data["target_url"] = (
                                link.get_attribute("href") if link is not None else None
                            )
                        finally:
                            # Always return to the main document so the next
                            # iframe element can be resolved.
                            driver.switch_to.default_content()

                        ads.append(ad_data)
                    except Exception as e:
                        logger.warning(f"Error processing iframe: {e}")
                        continue

                return ads

            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during Google Display Ads scraping: {e}")
                return []