"""
scraper/base_scraper.py β€” Abstract base class for all scrapers.
Every concrete scraper (Google Maps, JustDial, etc.) inherits from
``BaseScraper`` and implements ``scrape()``. Common retry + rate-limiting
logic lives here so subclasses stay clean.
"""
import time
import random
from abc import ABC, abstractmethod
from typing import List, Optional
import requests
import config
from models import Lead
from utils.logger import get_logger
from utils.rate_limiter import RateLimiter
from utils.helpers import clean_text
logger = get_logger(__name__)
class BaseScraper(ABC):
"""
Abstract scraper base.
Subclasses must implement:
scrape(keyword: str, location: str, limit: int) -> List[Lead]
Optionally override:
_check_robots_txt(url: str) -> bool
"""
SOURCE_NAME: str = "Unknown"
def __init__(self):
self.rate_limiter = RateLimiter()
self.session = self._build_session()
self._scraped_count = 0
self._error_count = 0
# ──────────────────────────────────────────────────────────────────────
# Abstract
# ──────────────────────────────────────────────────────────────────────
@abstractmethod
def scrape(
self,
        keyword: str,
        location: str,
        limit: int = config.DEFAULT_LIMIT,
) -> List[Lead]:
"""
Scrape leads for *keyword* in *location*.
Must return a (possibly empty) list of :class:`Lead` objects.
Raises no exceptions β€” errors are logged and an empty list returned.
"""
...
# ──────────────────────────────────────────────────────────────────────
# Shared helpers
# ──────────────────────────────────────────────────────────────────────
def _build_session(self) -> requests.Session:
"""Return a requests.Session with sensible defaults."""
session = requests.Session()
session.headers.update(
{
"User-Agent": random.choice(config.USER_AGENTS),
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate",
"Connection": "keep-alive",
}
)
return session
def _get(self, url: str, **kwargs) -> Optional[requests.Response]:
"""
Perform a GET request with retry + rate-limiting.
Returns the response or None on final failure.
"""
self.rate_limiter.wait()
for attempt in range(1, config.MAX_RETRIES + 1):
try:
resp = self.session.get(
url,
timeout=config.REQUEST_TIMEOUT_SEC,
**kwargs,
)
if resp.status_code == 200:
self.rate_limiter.record_success()
logger.debug(f"GET {url} β†’ {resp.status_code}")
return resp
elif resp.status_code in (429, 503):
logger.warning(
f"Rate-limited ({resp.status_code}) on {url}. "
f"Backing off (attempt {attempt}/{config.MAX_RETRIES})."
)
self.rate_limiter.record_error()
time.sleep(config.RETRY_BACKOFF_SEC * attempt)
                else:
                    logger.warning(f"Non-200 ({resp.status_code}) for {url}")
                    self._error_count += 1
                    return None
except requests.RequestException as exc:
logger.warning(
f"Request error (attempt {attempt}/{config.MAX_RETRIES}): {exc}"
)
self.rate_limiter.record_error()
time.sleep(config.RETRY_BACKOFF_SEC * attempt)
logger.error(f"All retries exhausted for {url}")
self._error_count += 1
return None
def _check_robots_txt(self, base_url: str, path: str = "/") -> bool:
"""
Lightweight robots.txt check.
Returns True if crawling is allowed (or robots.txt is unavailable).
"""
try:
from urllib.robotparser import RobotFileParser
rp = RobotFileParser()
rp.set_url(f"{base_url.rstrip('/')}/robots.txt")
rp.read()
            allowed = rp.can_fetch("*", f"{base_url.rstrip('/')}{path}")
            if not allowed:
                logger.info(f"robots.txt disallows {base_url.rstrip('/')}{path}")
return allowed
except Exception:
return True # If robots.txt unreadable, assume OK
def _rotate_user_agent(self) -> None:
"""Rotate the User-Agent header to reduce detection risk."""
self.session.headers["User-Agent"] = random.choice(config.USER_AGENTS)
    @property
    def stats(self) -> dict:
        """Return run statistics for this scraper instance."""
        return {
            "source": self.SOURCE_NAME,
            "scraped_count": self._scraped_count,
            "error_count": self._error_count,
        }
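# ──────────────────────────────────────────────────────────────────────
# Illustrative example (not part of the library). A minimal concrete
# scraper sketch showing how the BaseScraper contract fits together. The
# target URL, the "name|phone" parsing, and the Lead(...) keyword
# arguments are hypothetical placeholders; adapt them to the real site
# and to the actual fields on models.Lead.
# ──────────────────────────────────────────────────────────────────────
class _ExampleDirectoryScraper(BaseScraper):
    """Sketch only: the structure is real, the site and parsing are made up."""

    SOURCE_NAME = "ExampleDirectory"

    def scrape(
        self,
        keyword: str,
        location: str,
        limit: int = config.DEFAULT_LIMIT,
    ) -> List[Lead]:
        base_url = "https://directory.example.com"  # hypothetical site
        if not self._check_robots_txt(base_url, "/search"):
            return []  # honour robots.txt and bail out quietly

        leads: List[Lead] = []
        resp = self._get(
            f"{base_url}/search",
            params={"q": keyword, "loc": location},  # forwarded to session.get
        )
        if resp is None:  # _get has already logged and counted the failure
            return leads

        # Real scrapers parse HTML here; we pretend each result line is
        # "name|phone" text purely to keep this sketch self-contained.
        for line in resp.text.splitlines()[:limit]:
            name, _, phone = line.partition("|")
            if not name.strip():
                continue
            # Assumed Lead fields; check models.Lead for the real signature.
            leads.append(
                Lead(
                    name=clean_text(name),
                    phone=clean_text(phone),
                    source=self.SOURCE_NAME,
                )
            )
            self._scraped_count += 1

        self._rotate_user_agent()  # vary the UA between scrape calls
        return leads


if __name__ == "__main__":
    # Manual smoke test for the sketch above (needs network access).
    demo = _ExampleDirectoryScraper()
    print(demo.scrape("plumber", "Austin", limit=5))
    print(demo.stats)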