import json
import logging
import re
import time
from contextlib import contextmanager
from datetime import datetime
from typing import Any, Dict, List, Optional
from urllib.parse import parse_qs, quote_plus, urlparse

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException, TimeoutException, WebDriverException
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

logger = logging.getLogger(__name__)


class FacebookScraper:
    """
    Enhanced Facebook Ads Library scraper with improved robustness and features.
    """

    def __init__(self, headless: bool = True, timeout: int = 10, use_proxy: bool = False, proxy: Optional[str] = None):
        """
        Initialize the Facebook scraper with configurable options.

        Args:
            headless: Whether to run the browser in headless mode
            timeout: Default timeout for waiting operations, in seconds
            use_proxy: Whether to route traffic through a proxy
            proxy: Proxy server address (e.g., "http://user:pass@ip:port")
        """
        self.driver = None
        self.headless = headless
        self.timeout = timeout
        self.use_proxy = use_proxy
        self.proxy = proxy

    def _setup_driver(self) -> webdriver.Chrome:
        """Configure and initialize the Chrome WebDriver with optimal settings."""
        options = webdriver.ChromeOptions()

        if self.headless:
            options.add_argument("--headless")

        # Stability flags, mainly for containerized and CI environments.
        options.add_argument("--no-sandbox")
        options.add_argument("--disable-dev-shm-usage")
        options.add_argument("--disable-gpu")
        options.add_argument("--window-size=1920,1080")

        # Present a desktop user agent so the full desktop page is served.
        options.add_argument(
            "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36"
        )

        if self.use_proxy and self.proxy:
            options.add_argument(f"--proxy-server={self.proxy}")

        # Remove the most obvious automation fingerprints Chrome exposes by default.
        options.add_experimental_option("excludeSwitches", ["enable-automation"])
        options.add_experimental_option("useAutomationExtension", False)

        return webdriver.Chrome(service=Service(ChromeDriverManager().install()), options=options)

    @contextmanager
    def _get_driver(self):
        """Context manager for a browser session that guarantees cleanup."""
        driver = None
        try:
            driver = self._setup_driver()
            self.driver = driver
            yield driver
        except Exception as e:
            logger.error(f"Error in WebDriver session: {e}")
            raise
        finally:
            # Quit the local reference rather than self.driver, so cleanup is
            # correct even if another session has reassigned self.driver.
            if driver:
                driver.quit()

    def _wait_for_element(self, driver, selector: str, by: str = By.CSS_SELECTOR, timeout: Optional[int] = None) -> Any:
        """
        Wait for an element to be present and return it.

        Args:
            driver: WebDriver instance
            selector: Element selector
            by: Selector strategy (By.CSS_SELECTOR, By.XPATH, etc.)
            timeout: Wait timeout in seconds (defaults to self.timeout)

        Returns:
            The found web element

        Raises:
            TimeoutException: If the element does not appear in time
        """
        if timeout is None:
            timeout = self.timeout

        wait = WebDriverWait(driver, timeout)
        return wait.until(EC.presence_of_element_located((by, selector)))

    def _wait_for_elements(self, driver, selector: str, by: str = By.CSS_SELECTOR, timeout: Optional[int] = None) -> List[Any]:
        """
        Wait for at least one matching element and return all matches.

        Args:
            driver: WebDriver instance
            selector: Elements selector
            by: Selector strategy (By.CSS_SELECTOR, By.XPATH, etc.)
            timeout: Wait timeout in seconds (defaults to self.timeout)

        Returns:
            List of found web elements
        """
        if timeout is None:
            timeout = self.timeout

        wait = WebDriverWait(driver, timeout)
        return wait.until(EC.presence_of_all_elements_located((by, selector)))

    def _scroll_to_load_more(self, driver, scroll_count: int = 5, scroll_pause: float = 2.0):
        """
        Scroll down the page to trigger lazy loading of more content.

        Args:
            driver: WebDriver instance
            scroll_count: Number of times to scroll
            scroll_pause: Pause between scrolls in seconds
        """
        for i in range(scroll_count):
            # Jump to the bottom of the page, then give the next batch of
            # results time to render before scrolling again.
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(scroll_pause)
            logger.debug(f"Completed scroll {i+1}/{scroll_count}")

    def _extract_ad_details(self, ad_element) -> Dict[str, Any]:
        """
        Extract detailed information from an ad element.

        Args:
            ad_element: WebElement containing the ad

        Returns:
            Dictionary with ad details
        """
        ad_data = {
            "scrape_time": datetime.now().isoformat(),
            "platform": "facebook",
            "raw_text": ad_element.text,
        }

        try:
            # The advertiser name is usually the first directional span in the card.
            advertiser_elem = ad_element.find_elements(By.CSS_SELECTOR, "span[dir='auto']")
            if advertiser_elem:
                ad_data["advertiser"] = advertiser_elem[0].text

            # Ad copy is spread across directional divs; join it into one block.
            content_elem = ad_element.find_elements(By.CSS_SELECTOR, "div[dir='auto']")
            if content_elem:
                ad_data["content"] = "\n".join(elem.text for elem in content_elem)

            # Creative image URLs.
            img_elems = ad_element.find_elements(By.TAG_NAME, "img")
            if img_elems:
                ad_data["images"] = [img.get_attribute("src") for img in img_elems if img.get_attribute("src")]

            # Outbound links, one of which may carry the ad's library ID.
            link_elems = ad_element.find_elements(By.TAG_NAME, "a")
            if link_elems:
                ad_data["links"] = [link.get_attribute("href") for link in link_elems if link.get_attribute("href")]

            # Pull the ad ID from the first link with an id= query parameter.
            for link in ad_data.get("links", []):
                id_match = re.search(r'id=(\d+)', link)
                if id_match:
                    ad_data["ad_id"] = id_match.group(1)
                    break

        except Exception as e:
            logger.warning(f"Error extracting ad details: {e}")

        return ad_data

    def scrape_ads(self, search_query: str, num_scrolls: int = 5, country_code: str = "ALL") -> List[Dict[str, Any]]:
        """
        Scrape ads from the Facebook Ads Library for a search query.

        Args:
            search_query: Keyword to search for
            num_scrolls: Number of times to scroll to load more ads
            country_code: Country code filter (e.g., "US", "GB", "ALL")

        Returns:
            List of dictionaries containing ad information
        """
        with self._get_driver() as driver:
            try:
                # URL-encode the query so multi-word searches survive the request.
                url = (
                    "https://www.facebook.com/ads/library/"
                    f"?active_status=all&ad_type=all&country={country_code}"
                    f"&q={quote_plus(search_query)}&search_type=keyword"
                )
                logger.info(f"Accessing Facebook Ads Library: {url}")
                driver.get(url)

                try:
                    self._wait_for_element(driver, "div[role='main']")
                except TimeoutException:
                    logger.warning("Timeout waiting for main content to load")

                self._scroll_to_load_more(driver, num_scrolls)

                # Facebook's obfuscated class names change often, so try several
                # candidate selectors until one matches.
                selectors = [
                    "div.x1yztbdb",
                    "div[role='article']",
                    "div.x1iorvi4",
                ]

                ad_elements = []
                for selector in selectors:
                    try:
                        elements = driver.find_elements(By.CSS_SELECTOR, selector)
                        if elements:
                            ad_elements = elements
                            logger.info(f"Found {len(elements)} ads using selector: {selector}")
                            break
                    except Exception as e:
                        logger.debug(f"Selector {selector} failed: {e}")

                if not ad_elements:
                    logger.warning("No ad elements found with any selector")
                    return []

                ads_data = []
                for i, ad_element in enumerate(ad_elements):
                    try:
                        ad_data = self._extract_ad_details(ad_element)
                        ad_data["position"] = i + 1
                        ad_data["search_query"] = search_query
                        ads_data.append(ad_data)
                    except Exception as e:
                        logger.error(f"Error processing ad {i+1}: {e}")

                logger.info(f"Successfully scraped {len(ads_data)} ads")
                return ads_data

            except (TimeoutException, WebDriverException) as e:
                logger.error(f"Error during scraping: {e}")
                return []

    def scrape_advertiser_details(self, advertiser_id: str) -> Dict[str, Any]:
        """
        Scrape details about a specific advertiser.

        Args:
            advertiser_id: Facebook page ID of the advertiser

        Returns:
            Dictionary with advertiser information
        """
        with self._get_driver() as driver:
            try:
                url = (
                    "https://www.facebook.com/ads/library/"
                    f"?active_status=all&ad_type=all&country=ALL&view_all_page_id={advertiser_id}"
                )
                logger.info(f"Accessing advertiser page: {url}")
                driver.get(url)

                try:
                    self._wait_for_element(driver, "div[role='main']")
                except TimeoutException:
                    logger.warning("Timeout waiting for advertiser page to load")

                advertiser_data = {
                    "id": advertiser_id,
                    "scrape_time": datetime.now().isoformat(),
                }

                # Advertiser name from the page heading, if present.
                try:
                    name_elem = self._wait_for_element(driver, "div[role='main'] h1", timeout=5)
                    advertiser_data["name"] = name_elem.text
                except (TimeoutException, NoSuchElementException):
                    logger.debug("Advertiser name element not found")

                # Total ad count, e.g. "1,234 ads"; strip thousands separators.
                try:
                    count_text = driver.find_element(By.XPATH, "//div[contains(text(), 'ads')]").text
                    count_match = re.search(r'([\d,]+)\s+ads', count_text)
                    if count_match:
                        advertiser_data["ad_count"] = int(count_match.group(1).replace(",", ""))
                except NoSuchElementException:
                    logger.debug("Ad count element not found")

                self._scroll_to_load_more(driver, 3)

                # Collect up to five sample ads from the advertiser's page.
                ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.x1yztbdb")
                sample_ads = []

                for i, ad_element in enumerate(ad_elements[:5]):
                    try:
                        ad_data = self._extract_ad_details(ad_element)
                        sample_ads.append(ad_data)
                    except Exception as e:
                        logger.error(f"Error processing sample ad {i+1}: {e}")

                advertiser_data["sample_ads"] = sample_ads
                advertiser_data["sample_ad_count"] = len(sample_ads)

                return advertiser_data

            except Exception as e:
                logger.error(f"Error scraping advertiser details: {e}")
                return {"id": advertiser_id, "error": str(e)}

    def scrape_ads_by_topic(self, topic: str, num_scrolls: int = 5, country_code: str = "ALL") -> List[Dict[str, Any]]:
        """
        Scrape ads related to a specific topic.

        The Ads Library treats topics as keyword searches, so this is a thin
        wrapper around scrape_ads.

        Args:
            topic: Topic to search for (e.g., "politics", "health", "finance")
            num_scrolls: Number of times to scroll to load more ads
            country_code: Country code filter

        Returns:
            List of dictionaries containing ad information
        """
        return self.scrape_ads(topic, num_scrolls, country_code)

    def scrape_ads_by_page(self, page_name: str, num_scrolls: int = 5) -> List[Dict[str, Any]]:
        """
        Scrape ads from a specific Facebook page.

        Args:
            page_name: Name of the Facebook page
            num_scrolls: Number of times to scroll to load more ads

        Returns:
            List of dictionaries containing ad information
        """
        with self._get_driver() as driver:
            try:
                # Search by page name first to resolve the numeric page ID.
                search_url = (
                    "https://www.facebook.com/ads/library/"
                    "?active_status=all&ad_type=all&country=ALL"
                    f"&q={quote_plus(page_name)}&search_type=page"
                )
                logger.info(f"Searching for page: {search_url}")
                driver.get(search_url)

                try:
                    self._wait_for_element(driver, "div[role='main']")
                except TimeoutException:
                    logger.warning("Timeout waiting for page search results")

                try:
                    page_links = driver.find_elements(By.CSS_SELECTOR, "a[href*='view_all_page_id=']")
                    if page_links:
                        # The first matching result links to the page's ad listing.
                        href = page_links[0].get_attribute("href")
                        page_id_match = re.search(r'view_all_page_id=(\d+)', href)

                        if page_id_match:
                            page_id = page_id_match.group(1)
                            logger.info(f"Found page ID: {page_id}")

                            page_url = (
                                "https://www.facebook.com/ads/library/"
                                f"?active_status=all&ad_type=all&country=ALL&view_all_page_id={page_id}"
                            )
                            driver.get(page_url)

                            try:
                                self._wait_for_element(driver, "div[role='main']")
                            except TimeoutException:
                                logger.warning("Timeout waiting for page ads to load")

                            self._scroll_to_load_more(driver, num_scrolls)

                            ad_elements = driver.find_elements(By.CSS_SELECTOR, "div.x1yztbdb")

                            ads_data = []
                            for i, ad_element in enumerate(ad_elements):
                                try:
                                    ad_data = self._extract_ad_details(ad_element)
                                    ad_data["position"] = i + 1
                                    ad_data["page_name"] = page_name
                                    ad_data["page_id"] = page_id
                                    ads_data.append(ad_data)
                                except Exception as e:
                                    logger.error(f"Error processing ad {i+1}: {e}")

                            logger.info(f"Successfully scraped {len(ads_data)} ads from page {page_name}")
                            return ads_data
                except Exception as e:
                    logger.error(f"Error finding page: {e}")

            except Exception as e:
                logger.error(f"Error during page scraping: {e}")
                return []

        # Fall back to a keyword search outside the "with" block, so the first
        # browser session is fully closed before scrape_ads opens a new one.
        logger.warning(f"Could not find page {page_name}, falling back to keyword search")
        return self.scrape_ads(page_name, num_scrolls)
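

if __name__ == "__main__":
    # Minimal usage sketch (illustrative, not a guaranteed workflow): run a
    # keyword search and persist the results as JSON. Assumes a local Chrome
    # install so webdriver_manager can fetch a matching chromedriver; the
    # query and output filename below are arbitrary examples.
    logging.basicConfig(level=logging.INFO)

    scraper = FacebookScraper(headless=True, timeout=15)
    ads = scraper.scrape_ads("electric vehicles", num_scrolls=3, country_code="US")

    with open("facebook_ads.json", "w", encoding="utf-8") as f:
        json.dump(ads, f, ensure_ascii=False, indent=2)

    print(f"Saved {len(ads)} ads to facebook_ads.json")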