Spaces:

rastof9
/

fb

Runtime error

App Files Files Community

fb / app /services /google_scraper.py

rastof9

Saving local changes before rebase

092e58d 10 months ago

raw

history blame contribute delete

7.4 kB

	from selenium import webdriver
	from selenium.webdriver.common.by import By
	from selenium.webdriver.chrome.service import Service
	from selenium.webdriver.support.ui import WebDriverWait
	from selenium.webdriver.support import expected_conditions as EC
	from webdriver_manager.chrome import ChromeDriverManager
	import time
	from selenium.common.exceptions import TimeoutException, WebDriverException
	from contextlib import contextmanager
	import logging
	import json
	import os

	# Module-level logger named after this module (via __name__); no handlers or
	# levels are configured in the visible code.
	logger = logging.getLogger(__name__)

	class GoogleAdsScraper:
	    """Scrape Google search ads and Google display ads with headless Chrome.

	    A fresh WebDriver session is created for every ``scrape_*`` call and is
	    always quit afterwards. The session is either a remote one (Selenium
	    Hub/Grid) or a local Chrome started through ``webdriver_manager``.

	    NOTE(review): the CSS class names used to locate search ads look like
	    generated Google class names and may stop matching without notice.
	    """

	    def __init__(self, selenium_hub_url=None):
	        """Remember where to obtain a browser session.

	        Args:
	            selenium_hub_url: URL of a remote Selenium endpoint. When falsy,
	                the ``SELENIUM_HUB_URL`` environment variable is used; when
	                that is unset too, a local Chrome is launched on demand.
	        """
	        # Driver of the session currently in progress (None between calls).
	        self.driver = None
	        self.selenium_hub_url = selenium_hub_url or os.getenv('SELENIUM_HUB_URL')

	    def _setup_driver(self):
	        """Create and return a new headless Chrome WebDriver session."""
	        options = webdriver.ChromeOptions()
	        options.add_argument("--headless")
	        options.add_argument("--no-sandbox")
	        options.add_argument("--disable-dev-shm-usage")

	        if self.selenium_hub_url:
	            logger.info("Using Selenium Hub at %s", self.selenium_hub_url)
	            return webdriver.Remote(
	                command_executor=self.selenium_hub_url,
	                options=options,
	            )
	        return webdriver.Chrome(
	            service=Service(ChromeDriverManager().install()),
	            options=options,
	        )

	    @contextmanager
	    def _get_driver(self):
	        """Yield a new driver and guarantee it is quit on exit.

	        ``self.driver`` references the live driver only while the ``with``
	        body runs and is reset to ``None`` afterwards, so a driver that has
	        already been quit is never kept around or quit a second time.
	        Errors raised by ``_setup_driver`` propagate to the caller.
	        """
	        # Create the driver before entering ``try`` so that a failed setup
	        # never triggers cleanup of a driver left over from an earlier call.
	        driver = self._setup_driver()
	        self.driver = driver
	        try:
	            yield driver
	        finally:
	            self.driver = None
	            driver.quit()

	    def scrape_search_ads(self, search_query, num_pages=3):
	        """Scrape Google search ads for a given query.

	        Args:
	            search_query: Raw (not URL-encoded) search text.
	            num_pages: Maximum number of result pages to visit. The first
	                page is always processed.

	        Returns:
	            A list of ad dicts as produced by ``_extract_search_ads``; an
	            empty list when a WebDriver error aborts the scrape.
	        """
	        from urllib.parse import quote_plus

	        with self._get_driver() as driver:
	            try:
	                # Encode the query so spaces, '&', '#', '+' and non-ASCII text
	                # cannot corrupt the query string.
	                url = "https://www.google.com/search?q=" + quote_plus(str(search_query))
	                driver.get(url)
	                driver.implicitly_wait(5)

	                # Process first page
	                ads = list(self._extract_search_ads(driver))

	                # Navigate through additional pages if requested
	                for page in range(2, num_pages + 1):
	                    try:
	                        driver.find_element(By.ID, "pnnext").click()
	                        time.sleep(2)  # give the next result page time to load
	                        ads.extend(self._extract_search_ads(driver))
	                    except Exception as e:
	                        # Best effort: keep what was collected so far.
	                        logger.warning("Could not navigate to page %s: %s", page, e)
	                        break

	                return ads

	            except (TimeoutException, WebDriverException) as e:
	                logger.error("Error during Google Ads scraping: %s", e)
	                return []

	    def _extract_search_ads(self, driver):
	        """Extract ad data from the current search results page.

	        Returns:
	            A list of dicts with the keys ``title``, ``description``,
	            ``display_url``, ``target_url``, ``position`` (1-based index among
	            the ads extracted from this page) and ``scrape_time`` (local time,
	            ``YYYY-MM-DD HH:MM:SS``). An ad container missing any of the
	            expected sub-elements is skipped with a warning. Any other
	            failure yields an empty list.
	        """
	        ads = []
	        try:
	            # Look for ad containers
	            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.uEierd")

	            for ad in ad_elements:
	                try:
	                    # find_element raises when nothing matches (it never
	                    # returns None), so no per-field fallback is needed here.
	                    title_element = ad.find_element(
	                        By.CSS_SELECTOR, "div.CCgQ5.vCa9Yd.QfkTvb.MUxGbd.v0nnCb"
	                    )
	                    desc_element = ad.find_element(
	                        By.CSS_SELECTOR, "div.MUxGbd.yDYNvb.lyLwlc"
	                    )
	                    url_element = ad.find_element(By.CSS_SELECTOR, "a.sVXRqc")

	                    ads.append({
	                        "title": title_element.text,
	                        "description": desc_element.text,
	                        "display_url": url_element.text,
	                        "target_url": url_element.get_attribute("href"),
	                        "position": len(ads) + 1,
	                        "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
	                    })
	                except Exception as e:
	                    logger.warning("Error extracting ad data: %s", e)
	                    continue

	            return ads
	        except Exception as e:
	            logger.error("Error extracting search ads: %s", e)
	            return []

	    @staticmethod
	    def _first_attribute(driver, css_selector, attribute):
	        """Return ``attribute`` of the first element matching ``css_selector``.

	        Returns ``None`` when the lookup fails with a WebDriver error (for
	        example because no element matches).
	        """
	        try:
	            return driver.find_element(By.CSS_SELECTOR, css_selector).get_attribute(attribute)
	        except WebDriverException:
	            return None

	    def scrape_display_ads(self, target_url, scroll_count=5):
	        """Scrape Google display ads from a specific page.

	        Args:
	            target_url: Page to load and inspect.
	            scroll_count: Number of scroll-to-bottom steps performed (with a
	                2 second pause each) to trigger lazily loaded content.

	        Returns:
	            One dict per ``google_ads_iframe*`` iframe with the keys
	            ``iframe_id``, ``width``, ``height``, ``scrape_time``,
	            ``page_url``, ``image_url`` and ``target_url`` (the last two are
	            ``None`` when not found); an empty list when a WebDriver error
	            aborts the scrape.
	        """
	        with self._get_driver() as driver:
	            try:
	                driver.get(target_url)
	                driver.implicitly_wait(5)

	                # Scroll to load dynamic content
	                for _ in range(scroll_count):
	                    driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
	                    time.sleep(2)

	                # Extract iframe ads
	                iframes = driver.find_elements(
	                    By.CSS_SELECTOR, "iframe[id^='google_ads_iframe']"
	                )

	                ads = []
	                for iframe in iframes:
	                    try:
	                        # Read the iframe's own attributes BEFORE switching
	                        # into it: the element belongs to the parent document
	                        # and using it from inside the frame typically raises
	                        # a stale-element error.
	                        ad_data = {
	                            "iframe_id": iframe.get_attribute("id"),
	                            "width": iframe.get_attribute("width"),
	                            "height": iframe.get_attribute("height"),
	                            "scrape_time": time.strftime("%Y-%m-%d %H:%M:%S"),
	                            "page_url": target_url,
	                        }

	                        # Switch to iframe context
	                        driver.switch_to.frame(iframe)
	                        try:
	                            ad_data["image_url"] = self._first_attribute(driver, "img", "src")
	                            ad_data["target_url"] = self._first_attribute(driver, "a", "href")
	                        finally:
	                            # Always return to the main document so the next
	                            # iframe is resolved in the right context.
	                            driver.switch_to.default_content()

	                        ads.append(ad_data)
	                    except Exception as e:
	                        logger.warning("Error processing iframe: %s", e)
	                        continue

	                return ads

	            except (TimeoutException, WebDriverException) as e:
	                logger.error("Error during Google Display Ads scraping: %s", e)
	                return []