"""Meta Ad Library scraper: Selenium-based collection with an optional Gradio UI and CLI."""
| from __future__ import annotations | |
| import sys | |
| import os | |
| import time | |
| import csv | |
| import json | |
| import dataclasses | |
| from dataclasses import dataclass, asdict | |
| from typing import List, Dict, Optional, Tuple | |
| from urllib.parse import quote_plus | |
| from bs4 import BeautifulSoup # type: ignore | |
| import pandas as pd # type: ignore | |
| from selenium import webdriver # type: ignore | |
| from selenium.webdriver.common.by import By # type: ignore | |
| from selenium.webdriver.chrome.service import Service # type: ignore | |
| from selenium.webdriver.chrome.options import Options # type: ignore | |
| from selenium.webdriver.support.ui import WebDriverWait # type: ignore | |
| from selenium.webdriver.support import expected_conditions as EC # type: ignore | |
| from selenium.common.exceptions import TimeoutException, NoSuchElementException # type: ignore | |
# -----------------------------
# Data model
# -----------------------------
@dataclass  # BUG FIX: decorator was missing — without it AdRecord(...) kwargs and asdict() raise TypeError
class AdRecord:
    """One scraped ad, normalized across sources."""

    source: str            # "facebook_ad_library" / "instagram" / "html_sample"
    advertiser: str        # Page/Account name if detectable
    ad_text: str           # Primary text we could capture
    ad_link: str           # Link to the ad details or outbound
    media_urls: List[str]  # Images/videos if easily captured
    timestamp: float       # When we scraped (epoch seconds)
# -----------------------------
# Pure HTML parsing utils (unit-testable, no Selenium)
# -----------------------------
# Two minimal "Sponsored" ad cards; used by run_unit_tests() to exercise
# extract_ads_from_html() without any network access or browser.
SAMPLE_AD_HTML = """
<div class="ad-card">
  <div class="header"><span>Sponsored</span> · <a href="https://facebook.com/SomeBrand">SomeBrand</a></div>
  <div class="body">Glow faster with our GenZ serum! #skincare #genz</div>
  <div class="footer"><a href="https://example.com/buy-now">Shop Now</a></div>
</div>
<div class="ad-card">
  <div class="header"><span>Sponsored</span> · <a href="https://facebook.com/AnotherBrand">AnotherBrand</a></div>
  <div class="body">Meet the new foam cleanser — gentle, effective, and clean.</div>
  <div class="footer"><a href="https://example.com/learn-more">Learn More</a></div>
</div>
"""
def extract_ads_from_html(html: str) -> List[AdRecord]:
    """Best-effort extraction from generic ad-like HTML (for testing).

    Looks for blocks that contain the word 'Sponsored'.
    """
    soup = BeautifulSoup(html, "html.parser")
    results: List[AdRecord] = []

    # Text nodes mentioning "Sponsored" mark candidate ad cards.
    sponsored_nodes = soup.find_all(
        string=lambda s: isinstance(s, str) and "sponsored" in s.lower()
    )

    visited_ids = set()
    for text_node in sponsored_nodes:
        container = text_node
        # Climb up to three ancestors to reach the surrounding card element.
        for _ in range(3):
            if container and container.parent:
                container = container.parent
        # Skip if we fell off the tree or already processed this card.
        if not container or id(container) in visited_ids:
            continue
        visited_ids.add(id(container))

        # Advertiser: text of the first anchor inside the card, if any.
        first_anchor = container.find("a")
        advertiser = first_anchor.text.strip() if first_anchor and first_anchor.text else ""

        # Ad copy: prefer an explicit "body" element, else the whole card text.
        body = container.find(class_="body")
        if body and body.text:
            ad_text = body.text.strip()[:5000]
        else:
            ad_text = container.get_text(" ", strip=True)[:5000]

        # Link: first href inside the card, if any.
        href_anchor = container.find("a", href=True)
        link = href_anchor["href"] if href_anchor else ""

        results.append(
            AdRecord(
                source="html_sample",
                advertiser=advertiser,
                ad_text=ad_text,
                ad_link=link,
                media_urls=[],
                timestamp=time.time(),
            )
        )
    return results
# -----------------------------
# Selenium scraping (Meta Ad Library)
# -----------------------------
def _build_chrome(headless: bool = True) -> webdriver.Chrome:
    """Start a Chrome WebDriver, preferring a system-installed driver."""
    chrome_opts = Options()
    if headless:
        # "new" headless is more stable in recent Chrome
        chrome_opts.add_argument("--headless=new")
    for flag in ("--no-sandbox", "--disable-dev-shm-usage", "--window-size=1600,1200"):
        chrome_opts.add_argument(flag)

    # 1) Try system ChromeDriver first (recommended for CI/sandboxes without ssl)
    try:
        return webdriver.Chrome(options=chrome_opts)
    except Exception as sys_err:
        # 2) Fallback to webdriver_manager (requires internet & ssl). Do inside try.
        try:
            from webdriver_manager.chrome import ChromeDriverManager  # type: ignore

            return webdriver.Chrome(
                service=Service(ChromeDriverManager().install()), options=chrome_opts
            )
        except Exception as mgr_err:
            raise RuntimeError(
                "Failed to start ChromeDriver. Ensure Chrome/Chromium + ChromeDriver are installed "
                "and on PATH, or run in an environment with internet/SSL for webdriver_manager.\n"
                f"System driver error: {sys_err}\nManager error: {mgr_err}"
            )
def scrape_meta_ad_library(
    keyword: str,
    country: str = "IN",
    max_ads: int = 20,
    headless: bool = True,
    scroll_rounds: int = 8,
    per_scroll_pause: float = 2.5,
) -> List[AdRecord]:
    """Scrape Facebook Ad Library search results for a keyword.

    NOTE: Selectors on facebook.com change frequently; this is best-effort and may
    require updates. Works best when you're logged in and have accepted cookies.

    Args:
        keyword: Search term inserted into the Ad Library query string.
        country: Two-letter country code for the Ad Library country filter.
        max_ads: Stop after collecting this many records.
        headless: Run Chrome without a visible window.
        scroll_rounds: Maximum number of scroll-to-bottom attempts.
        per_scroll_pause: Seconds to wait after each scroll for lazy loading.

    Returns:
        List of AdRecord (possibly empty). The browser is always quit on exit.
    """
    driver = _build_chrome(headless=headless)
    try:
        # Build the public Ad Library search URL directly; no login flow attempted.
        base = (
            "https://www.facebook.com/ads/library/?active_status=all&ad_type=all&country="
            f"{country}&q={quote_plus(keyword)}&search_type=keyword"
        )
        driver.get(base)
        # Give time for cookie banners or initial JS
        time.sleep(4)
        # Try to accept cookies if a button exists
        try:
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.XPATH, "//button[contains(., 'Allow')] | //button[contains(., 'Accept')]"))
            ).click()
            time.sleep(2)
        except Exception:
            # No consent dialog appeared (or it timed out) — proceed anyway.
            pass
        # Scroll to load more results; stop early once page height stops growing.
        last_height = driver.execute_script("return document.body.scrollHeight")
        for _ in range(scroll_rounds):
            driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(per_scroll_pause)
            new_height = driver.execute_script("return document.body.scrollHeight")
            if new_height == last_height:
                break
            last_height = new_height
        cards = []
        # Heuristic selectors (may require tweaks over time)
        selectors = [
            "div[role='article']",  # generic card
            "div.x1lliihq.x1n2onr6",  # fb new class soup (example)
            "div._99s5",  # legacy ad card
        ]
        seen = set()
        for sel in selectors:
            cards.extend(driver.find_elements(By.CSS_SELECTOR, sel))
        records: List[AdRecord] = []
        for el in cards:
            try:
                text = el.text.strip()
                if not text:
                    continue
                # Require some signal that it's an ad
                if "Sponsored" not in text and "Ad details" not in text and "Why am I seeing this ad" not in text:
                    continue
                advertiser = ""
                try:
                    # Advertiser page link, if the card exposes one.
                    adv = el.find_element(By.XPATH, ".//a[starts-with(@href, 'https://www.facebook.com/')]")
                    advertiser = adv.text.strip()
                except Exception:
                    pass
                ad_link = ""
                try:
                    # Prefer a link back into the Ad Library detail view.
                    link = el.find_element(By.XPATH, ".//a[@href and contains(@href, 'ads/library')]")
                    ad_link = link.get_attribute("href")
                except Exception:
                    # fallback: first link
                    try:
                        link = el.find_element(By.XPATH, ".//a[@href]")
                        ad_link = link.get_attribute("href")
                    except Exception:
                        pass
                # De-duplicate: the same card may match more than one selector.
                key = (advertiser, ad_link, hash(text))
                if key in seen:
                    continue
                seen.add(key)
                records.append(
                    AdRecord(
                        source="facebook_ad_library",
                        advertiser=advertiser,
                        ad_text=text[:5000],
                        ad_link=ad_link,
                        media_urls=[],
                        timestamp=time.time(),
                    )
                )
                if len(records) >= max_ads:
                    break
            except Exception:
                # Stale element or layout change — skip this card, keep going.
                continue
        return records
    finally:
        driver.quit()
# -----------------------------
# CSV/DF helpers
# -----------------------------
def records_to_dataframe(records: List[AdRecord]) -> pd.DataFrame:
    """Flatten AdRecord dataclasses into a pandas DataFrame (one row per ad)."""
    rows = [asdict(record) for record in records]
    return pd.DataFrame(rows)
def save_records_csv(records: List[AdRecord], path: str = "ads_results.csv") -> str:
    """Write the records to CSV at *path* and return that path."""
    records_to_dataframe(records).to_csv(path, index=False)
    return path
# -----------------------------
# Optional Gradio UI (lazy import to avoid ssl at import time)
# -----------------------------
def launch_gradio_ui():
    """Launch a minimal Gradio front-end for the scraper.

    Imports gradio lazily so environments without ssl can still use the CLI;
    on import failure it prints a warning and returns instead of raising.
    """
    try:
        import gradio as gr  # type: ignore
    except Exception as e:  # Includes ModuleNotFoundError: ssl
        print(
            "[WARN] Gradio could not be imported (likely due to missing ssl in this environment).\n"
            "       You can still use the CLI: python ad_scraper.py --keyword 'your term'\n"
            f"       Import error: {e}"
        )
        return

    def _scrape(keyword: str, country: str, max_ads: int, headless: bool):
        # Gradio callback: returns (DataFrame to display, CSV file path or None).
        if not keyword.strip():
            return pd.DataFrame(), None
        try:
            records = scrape_meta_ad_library(keyword=keyword.strip(), country=country, max_ads=max_ads, headless=headless)
        except Exception as e:
            # Show error in a friendly way
            err_df = pd.DataFrame([[str(e)]], columns=["Error"])
            return err_df, None
        if not records:
            return pd.DataFrame(columns=[f"No results for '{keyword}'"]), None
        csv_path = save_records_csv(records)
        return records_to_dataframe(records), csv_path

    # Layout: keyword row, options row, then the action button and outputs.
    with gr.Blocks() as demo:
        gr.Markdown("# 📢 Meta Ad Library Scraper (Selenium)\nEnter a keyword to fetch matching ads.")
        with gr.Row():
            kw = gr.Textbox(label="Keyword", value="GenZ skin care brand")
        with gr.Row():
            country = gr.Dropdown(["IN","US","GB","CA","AU"], value="IN", label="Country")
            max_ads = gr.Slider(1, 100, value=20, step=1, label="Max Ads")
            headless = gr.Checkbox(value=True, label="Headless browser")
        btn = gr.Button("Scrape")
        out_df = gr.Dataframe(label="Results", interactive=False)
        out_file = gr.File(label="Download CSV")
        btn.click(_scrape, inputs=[kw, country, max_ads, headless], outputs=[out_df, out_file])
    demo.launch()
# -----------------------------
# CLI Interface
# -----------------------------
def main_cli(argv: List[str]):
    """Parse command-line arguments and dispatch to UI, tests, or the scraper."""
    import argparse

    parser = argparse.ArgumentParser(description="Meta Ad Library scraper with Selenium (Gradio optional)")
    parser.add_argument("--keyword", "-k", type=str, default="GenZ skin care brand", help="Search keyword")
    parser.add_argument("--country", "-c", type=str, default="IN", help="Country code (e.g., IN, US)")
    parser.add_argument("--max-ads", type=int, default=20, help="Max ads to collect")
    parser.add_argument("--no-headless", action="store_true", help="Run browser with a window")
    parser.add_argument("--gradio", action="store_true", help="Launch Gradio UI (requires ssl)")
    parser.add_argument("--test", action="store_true", help="Run unit tests for HTML parsing")
    ns = parser.parse_args(argv)

    # Alternate modes short-circuit the scrape entirely.
    if ns.gradio:
        launch_gradio_ui()
        return
    if ns.test:
        run_unit_tests()
        return

    print(f"[INFO] Scraping Meta Ad Library for keyword='{ns.keyword}' in country='{ns.country}'...")
    try:
        results = scrape_meta_ad_library(
            keyword=ns.keyword,
            country=ns.country,
            max_ads=ns.max_ads,
            headless=not ns.no_headless,
        )
    except Exception as e:
        print("[ERROR] Scrape failed:", e)
        print("Tip: Ensure Chrome + ChromeDriver are installed and on PATH, or re-run with --gradio in an env that has ssl.")
        return

    if not results:
        print("[INFO] No ads found.")
        return
    out_path = save_records_csv(results)
    print(f"[INFO] Saved {len(results)} ads to {out_path}")
# -----------------------------
# Tests (no network/browser needed)
# -----------------------------
def run_unit_tests():
    """Smoke-test HTML extraction against the bundled sample markup."""
    print("[TEST] Running HTML parsing tests...")
    parsed = extract_ads_from_html(SAMPLE_AD_HTML)
    assert len(parsed) >= 2, "Expected at least 2 ads from sample HTML"
    assert any("GenZ serum" in ad.ad_text for ad in parsed), "Should capture sample ad body text"
    assert any("SomeBrand" in ad.advertiser for ad in parsed), "Should capture advertiser name"
    print("[TEST] OK — basic HTML extraction works.")
# Script entry point: forward CLI args (minus the program name) to main_cli.
if __name__ == "__main__":
    main_cli(sys.argv[1:])