""" scraper/base_scraper.py — Abstract base class for all scrapers. Every concrete scraper (Google Maps, JustDial, etc.) inherits from ``BaseScraper`` and implements ``scrape()``. Common retry + rate-limiting logic lives here so subclasses stay clean. """ import time import random from abc import ABC, abstractmethod from typing import List, Optional import requests import config from models import Lead from utils.logger import get_logger from utils.rate_limiter import RateLimiter from utils.helpers import clean_text logger = get_logger(__name__) class BaseScraper(ABC): """ Abstract scraper base. Subclasses must implement: scrape(keyword: str, location: str, limit: int) -> List[Lead] Optionally override: _check_robots_txt(url: str) -> bool """ SOURCE_NAME: str = "Unknown" def __init__(self): self.rate_limiter = RateLimiter() self.session = self._build_session() self._scraped_count = 0 self._error_count = 0 # ────────────────────────────────────────────────────────────────────── # Abstract # ────────────────────────────────────────────────────────────────────── @abstractmethod def scrape( self, keyword : str, location: str, limit : int = config.DEFAULT_LIMIT, ) -> List[Lead]: """ Scrape leads for *keyword* in *location*. Must return a (possibly empty) list of :class:`Lead` objects. Raises no exceptions — errors are logged and an empty list returned. """ ... # ────────────────────────────────────────────────────────────────────── # Shared helpers # ────────────────────────────────────────────────────────────────────── def _build_session(self) -> requests.Session: """Return a requests.Session with sensible defaults.""" session = requests.Session() session.headers.update( { "User-Agent": random.choice(config.USER_AGENTS), "Accept-Language": "en-US,en;q=0.9", "Accept-Encoding": "gzip, deflate", "Connection": "keep-alive", } ) return session def _get(self, url: str, **kwargs) -> Optional[requests.Response]: """ Perform a GET request with retry + rate-limiting. Returns the response or None on final failure. """ self.rate_limiter.wait() for attempt in range(1, config.MAX_RETRIES + 1): try: resp = self.session.get( url, timeout=config.REQUEST_TIMEOUT_SEC, **kwargs, ) if resp.status_code == 200: self.rate_limiter.record_success() logger.debug(f"GET {url} → {resp.status_code}") return resp elif resp.status_code in (429, 503): logger.warning( f"Rate-limited ({resp.status_code}) on {url}. " f"Backing off (attempt {attempt}/{config.MAX_RETRIES})." ) self.rate_limiter.record_error() time.sleep(config.RETRY_BACKOFF_SEC * attempt) else: logger.warning(f"Non-200 ({resp.status_code}) for {url}") return None except requests.RequestException as exc: logger.warning( f"Request error (attempt {attempt}/{config.MAX_RETRIES}): {exc}" ) self.rate_limiter.record_error() time.sleep(config.RETRY_BACKOFF_SEC * attempt) logger.error(f"All retries exhausted for {url}") self._error_count += 1 return None def _check_robots_txt(self, base_url: str, path: str = "/") -> bool: """ Lightweight robots.txt check. Returns True if crawling is allowed (or robots.txt is unavailable). """ try: from urllib.robotparser import RobotFileParser rp = RobotFileParser() rp.set_url(f"{base_url.rstrip('/')}/robots.txt") rp.read() allowed = rp.can_fetch("*", f"{base_url}{path}") if not allowed: logger.info(f"robots.txt disallows {base_url}{path}") return allowed except Exception: return True # If robots.txt unreadable, assume OK def _rotate_user_agent(self) -> None: """Rotate the User-Agent header to reduce detection risk.""" self.session.headers["User-Agent"] = random.choice(config.USER_AGENTS) @property def stats(self) -> dict: return { "source" : self.SOURCE_NAME, "scraped_count" : self._scraped_count, "error_count" : self._error_count, }