Spaces:
Running
Running
| """ | |
| scraper/base_scraper.py β Abstract base class for all scrapers. | |
| Every concrete scraper (Google Maps, JustDial, etc.) inherits from | |
| ``BaseScraper`` and implements ``scrape()``. Common retry + rate-limiting | |
| logic lives here so subclasses stay clean. | |
| """ | |
| import time | |
| import random | |
| from abc import ABC, abstractmethod | |
| from typing import List, Optional | |
| import requests | |
| import config | |
| from models import Lead | |
| from utils.logger import get_logger | |
| from utils.rate_limiter import RateLimiter | |
| from utils.helpers import clean_text | |
| logger = get_logger(__name__) | |
class BaseScraper(ABC):
    """
    Abstract scraper base.

    Subclasses must implement:
        scrape(keyword: str, location: str, limit: int) -> List[Lead]
    Optionally override:
        _check_robots_txt(url: str) -> bool

    Shared behavior provided here: a pre-configured ``requests.Session``,
    rate-limited GET with retry/backoff, a robots.txt courtesy check,
    User-Agent rotation, and simple scrape/error counters.
    """

    # Human-readable source identifier, overridden by each concrete scraper.
    SOURCE_NAME: str = "Unknown"

    def __init__(self) -> None:
        self.rate_limiter = RateLimiter()
        self.session = self._build_session()
        self._scraped_count = 0  # leads successfully scraped (for stats())
        self._error_count = 0    # requests that exhausted all retries

    # ------------------------------------------------------------------
    # Abstract
    # ------------------------------------------------------------------
    @abstractmethod  # was missing: without it, ABC never enforces scrape()
    def scrape(
        self,
        keyword: str,
        location: str,
        limit: int = config.DEFAULT_LIMIT,
    ) -> List[Lead]:
        """
        Scrape leads for *keyword* in *location*.

        Must return a (possibly empty) list of :class:`Lead` objects.
        Raises no exceptions — errors are logged and an empty list returned.
        """
        ...

    # ------------------------------------------------------------------
    # Shared helpers
    # ------------------------------------------------------------------
    def _build_session(self) -> requests.Session:
        """Return a requests.Session with sensible default headers."""
        session = requests.Session()
        session.headers.update(
            {
                # Randomized per-instance to reduce fingerprinting.
                "User-Agent": random.choice(config.USER_AGENTS),
                "Accept-Language": "en-US,en;q=0.9",
                "Accept-Encoding": "gzip, deflate",
                "Connection": "keep-alive",
            }
        )
        return session

    def _get(self, url: str, **kwargs) -> Optional[requests.Response]:
        """
        Perform a GET request with retry + rate-limiting.

        Retries up to ``config.MAX_RETRIES`` times on transient failures
        (429/503 or connection errors) with linear backoff. Non-retryable
        status codes fail immediately.

        Returns the response on HTTP 200, or None on final failure.
        """
        self.rate_limiter.wait()
        for attempt in range(1, config.MAX_RETRIES + 1):
            try:
                resp = self.session.get(
                    url,
                    timeout=config.REQUEST_TIMEOUT_SEC,
                    **kwargs,
                )
                if resp.status_code == 200:
                    self.rate_limiter.record_success()
                    logger.debug(f"GET {url} -> {resp.status_code}")
                    return resp
                elif resp.status_code in (429, 503):
                    logger.warning(
                        f"Rate-limited ({resp.status_code}) on {url}. "
                        f"Backing off (attempt {attempt}/{config.MAX_RETRIES})."
                    )
                    self.rate_limiter.record_error()
                    # Don't sleep after the last attempt — we're about to
                    # give up, so the extra delay would be wasted.
                    if attempt < config.MAX_RETRIES:
                        time.sleep(config.RETRY_BACKOFF_SEC * attempt)
                else:
                    # Non-retryable status (404, 500, etc.): count it and
                    # bail out immediately, consistent with the exhausted-
                    # retries path below.
                    logger.warning(f"Non-200 ({resp.status_code}) for {url}")
                    self._error_count += 1
                    return None
            except requests.RequestException as exc:
                logger.warning(
                    f"Request error (attempt {attempt}/{config.MAX_RETRIES}): {exc}"
                )
                self.rate_limiter.record_error()
                if attempt < config.MAX_RETRIES:
                    time.sleep(config.RETRY_BACKOFF_SEC * attempt)
        logger.error(f"All retries exhausted for {url}")
        self._error_count += 1
        return None

    def _check_robots_txt(self, base_url: str, path: str = "/") -> bool:
        """
        Lightweight robots.txt check.

        Returns True if crawling *path* under *base_url* is allowed
        (or if robots.txt is unavailable/unreadable — permissive default).
        """
        try:
            from urllib.robotparser import RobotFileParser

            rp = RobotFileParser()
            root = base_url.rstrip("/")  # normalize to avoid '//' joins
            rp.set_url(f"{root}/robots.txt")
            rp.read()
            allowed = rp.can_fetch("*", f"{root}{path}")
            if not allowed:
                logger.info(f"robots.txt disallows {root}{path}")
            return allowed
        except Exception:
            return True  # If robots.txt unreadable, assume OK

    def _rotate_user_agent(self) -> None:
        """Rotate the User-Agent header to reduce detection risk."""
        self.session.headers["User-Agent"] = random.choice(config.USER_AGENTS)

    def stats(self) -> dict:
        """Return a summary dict of this scraper's counters."""
        return {
            "source": self.SOURCE_NAME,
            "scraped_count": self._scraped_count,
            "error_count": self._error_count,
        }