#!/usr/bin/env python3
"""
scraper.py — Lead Generation Scraper System — Main CLI entry point.

Usage examples
--------------
# Demo run (no network, instant results):
python scraper.py --demo --location="Mumbai" --limit=30

# Scrape web development leads from JustDial:
python scraper.py --service=web --location="Mumbai" --source=justdial --limit=50

# Scrape all services from Google Maps:
python scraper.py --service=all --location="Bangalore" --source=googlemaps --limit=100

# Scrape specific service, skip website analysis (faster):
python scraper.py --service=ai --location="Delhi" --no-analyse --limit=40

# Only export PDF (no CSV/JSON):
python scraper.py --demo --formats=pdf

Full option reference
---------------------
--service      web | app | ai | all        (default: all)
--location     City name                   (default: Mumbai)
--source       googlemaps | justdial | all (default: all)
--limit        Max leads per service       (default: 50)
--formats      pdf,csv,json (comma-separated) (default: pdf,csv,json)
--no-analyse   Skip website analysis (speeds up scraping)
--demo         Use built-in demo data (no network)
--output-dir   Custom output directory
--verbose      Enable DEBUG logging
"""

import argparse
import sys
import os
import time

# ── Path setup (allow running from any directory) ──────────────────────────
sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))

import config
from utils.logger import get_logger
from utils.helpers import timestamp_for_filename, now_str

logger = get_logger("scraper.main")

# Only the first N search keywords per service are actually scraped; the
# per-keyword quota below must be computed over this same subset so the
# total collected still approaches --limit.
MAX_KEYWORDS_PER_SERVICE = 3


# ──────────────────────────────────────────────────────────────────────────────
# CLI argument parser
# ──────────────────────────────────────────────────────────────────────────────
def build_parser() -> argparse.ArgumentParser:
    """Build and return the CLI argument parser for the scraper."""
    parser = argparse.ArgumentParser(
        prog="scraper.py",
        description="🚀 LeadGen Pro — Intelligent Business Lead Generation System",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "--service",
        default="all",
        choices=["web", "app", "ai", "all"],
        help="Service category to target (default: all)",
    )
    parser.add_argument(
        "--location",
        default=config.DEFAULT_LOCATION,
        help="City / location to search (default: Mumbai)",
    )
    parser.add_argument(
        "--source",
        default="all",
        choices=["googlemaps", "justdial", "all"],
        help="Data source(s) to scrape (default: all)",
    )
    parser.add_argument(
        "--limit",
        type=int,
        default=config.DEFAULT_LIMIT,
        help="Max leads per service category (default: 50)",
    )
    parser.add_argument(
        "--formats",
        default="pdf,csv,json",
        help="Comma-separated output formats: pdf,csv,json (default: all)",
    )
    parser.add_argument(
        "--no-analyse",
        action="store_true",
        help="Skip per-website analysis (faster but less intelligence)",
    )
    parser.add_argument(
        "--demo",
        action="store_true",
        help="Use built-in demo data — no network access required",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Override output directory (default: output/)",
    )
    parser.add_argument(
        "--verbose",
        action="store_true",
        help="Enable DEBUG-level logging",
    )
    return parser


# ──────────────────────────────────────────────────────────────────────────────
# Pipeline orchestration
# ──────────────────────────────────────────────────────────────────────────────
def run_pipeline(args: argparse.Namespace) -> None:
    """End-to-end lead generation pipeline: collect → process → export → summary."""
    # ── 0. Logging verbosity ──────────────────────────────────────────────
    # NOTE(review): the module-level logger above was created before this
    # assignment; this only affects loggers created afterwards — confirm
    # utils.logger reads config.LOG_LEVEL lazily.
    if args.verbose:
        config.LOG_LEVEL = "DEBUG"

    # ── 1. Resolve services to scrape ─────────────────────────────────────
    services = ["web", "app", "ai"] if args.service == "all" else [args.service]
    formats = [f.strip().lower() for f in args.formats.split(",")]

    _banner(args, services)
    start_time = time.monotonic()

    # ── 2. Collect leads ──────────────────────────────────────────────────
    all_leads = _collect_leads(args, services)
    if not all_leads:
        logger.warning("No leads collected — exiting.")
        print("\n⚠️ No leads were collected. Use --demo to run with sample data.\n")
        return

    # ── 3. Process ────────────────────────────────────────────────────────
    all_leads = _process(all_leads, args, services)

    # ── 4. Export ─────────────────────────────────────────────────────────
    output_paths = _export(all_leads, args, services, formats)

    # ── 5. Summary ────────────────────────────────────────────────────────
    elapsed = time.monotonic() - start_time
    _print_summary(all_leads, output_paths, elapsed)


# ──────────────────────────────────────────────────────────────────────────────
# Step 2: Collect
# ──────────────────────────────────────────────────────────────────────────────
def _scrape_source(scraper, source_label, print_icon, args, services, leads):
    """Run one scraper over every service's keyword set, appending to *leads*.

    Shared driver for Google Maps and JustDial: for each service category,
    take the first MAX_KEYWORDS_PER_SERVICE keywords, split --limit evenly
    across that subset, tag each returned lead with its service category.
    """
    from config import SEARCH_KEYWORDS

    for svc in services:
        # Slice FIRST, then divide the limit by the number of keywords we
        # will actually run — dividing by the full list undercollects.
        keywords = SEARCH_KEYWORDS.get(svc, ["businesses"])[:MAX_KEYWORDS_PER_SERVICE]
        per_kw = max(1, args.limit // len(keywords))
        for kw in keywords:
            logger.info(f"[{source_label}] keyword={kw!r} svc={svc}")
            print(f" {print_icon} {source_label}: {kw} in {args.location}...")
            new = scraper.scrape(kw, args.location, limit=per_kw)
            for lead in new:
                lead.service_category = svc
            leads.extend(new)


def _collect_leads(args, services):
    """Collect raw leads, either from the offline demo dataset or live scrapers.

    Returns a list of Lead objects; scraper failures are logged, not raised.
    """
    from models import Lead

    leads: "list[Lead]" = []

    if args.demo:
        logger.info("Running in DEMO mode — using built-in dataset.")
        print(" 📦 Loading demo data...")
        from scraper.demo_scraper import DemoScraper

        demo = DemoScraper()
        leads = demo.scrape(
            location=args.location,
            limit=args.limit * len(services),
            services=services,
        )
        return leads

    # ── Live scraping ─────────────────────────────────────────────────────
    if args.source in ("googlemaps", "all"):
        try:
            from scraper.google_maps import GoogleMapsScraper

            _scrape_source(GoogleMapsScraper(), "Google Maps", "🗺", args, services, leads)
        except Exception as exc:
            logger.error(f"Google Maps scraper failed: {exc}")

    if args.source in ("justdial", "all"):
        try:
            from scraper.justdial import JustDialScraper

            _scrape_source(JustDialScraper(), "JustDial", "📋", args, services, leads)
        except Exception as exc:
            logger.error(f"JustDial scraper failed: {exc}")

    return leads


# ──────────────────────────────────────────────────────────────────────────────
# Step 3: Process
# ──────────────────────────────────────────────────────────────────────────────
def _process(all_leads, args, services):
    """Deduplicate, optionally analyse websites, categorize and score leads."""
    from processors.deduplicator import Deduplicator
    from processors.categorizer import Categorizer
    from processors.scorer import LeadScorer

    print(f"\n 🔧 Processing {len(all_leads)} raw leads...")

    # Deduplication
    dedup = Deduplicator()
    all_leads = dedup.deduplicate(all_leads)

    # Website analysis (optional, skipped in demo mode — no network there)
    if not args.no_analyse and not args.demo:
        from scraper.website_analyzer import WebsiteAnalyzer

        analyser = WebsiteAnalyzer()
        total = len(all_leads)
        print(f" 🔍 Analysing {total} websites (use --no-analyse to skip)...")
        for i, lead in enumerate(all_leads, 1):
            if lead.website:
                analyser.analyse(lead)
            if i % 10 == 0:
                print(f" {i}/{total} analysed...", end="\r")
        print()

    # Categorization
    cat = Categorizer()
    all_leads = cat.categorize_all(all_leads)

    # Scoring
    scorer = LeadScorer()
    all_leads = scorer.score_all(all_leads)

    return all_leads


# ──────────────────────────────────────────────────────────────────────────────
# Step 4: Export
# ──────────────────────────────────────────────────────────────────────────────
def _export(all_leads, args, services, formats):
    """Export leads in the requested formats; returns {format: output_path}.

    Each exporter failure is logged and skipped so one bad format does not
    abort the others.
    """
    from processors.categorizer import Categorizer
    from exporters.pdf_exporter import PDFExporter
    from exporters.csv_exporter import CSVExporter
    from exporters.json_exporter import JSONExporter

    cat = Categorizer()
    leads_by_service = cat.split_by_service(all_leads)

    out_dir = args.output_dir or config.OUTPUT_DIR
    reports_dir = os.path.join(out_dir, "reports")
    data_dir = os.path.join(out_dir, "data")
    ts = timestamp_for_filename()
    loc_slug = args.location.replace(" ", "_")

    print("\n 📄 Exporting results...")
    paths = {}

    if "pdf" in formats:
        try:
            pdf = PDFExporter()
            p = pdf.export(
                leads_by_service,
                location=args.location,
                out_dir=reports_dir,
                filename=f"lead_report_{loc_slug}_{ts}.pdf",
            )
            paths["pdf"] = p
            print(f" ✅ PDF → {p}")
        except Exception as exc:
            logger.error(f"PDF export failed: {exc}")

    if "csv" in formats:
        try:
            csv_exp = CSVExporter()
            p = csv_exp.export(
                leads_by_service,
                location=args.location,
                out_dir=data_dir,
                filename=f"leads_{loc_slug}_{ts}.csv",
            )
            paths["csv"] = p
            print(f" ✅ CSV → {p}")
        except Exception as exc:
            logger.error(f"CSV export failed: {exc}")

    if "json" in formats:
        try:
            json_exp = JSONExporter()
            p = json_exp.export(
                leads_by_service,
                location=args.location,
                out_dir=data_dir,
                filename=f"leads_{loc_slug}_{ts}.json",
            )
            paths["json"] = p
            print(f" ✅ JSON → {p}")
        except Exception as exc:
            logger.error(f"JSON export failed: {exc}")

    return paths


# ──────────────────────────────────────────────────────────────────────────────
# Pretty I/O helpers
# ──────────────────────────────────────────────────────────────────────────────
def _banner(args, services):
    """Print the run-configuration banner before scraping starts."""
    print("\n" + "═" * 60)
    print(" 🚀 LeadGen Pro — Business Lead Generation System")
    print("═" * 60)
    print(f" Location : {args.location}")
    print(f" Services : {', '.join(s.upper() for s in services)}")
    print(f" Source : {'Demo Data' if args.demo else args.source.title()}")
    print(f" Limit : {args.limit} per service")
    print(f" Mode : {'DEMO (offline)' if args.demo else 'LIVE'}")
    print("─" * 60 + "\n")


def _print_summary(all_leads, output_paths, elapsed):
    """Print counts by category and score tier, elapsed time, and output paths."""
    from collections import Counter

    # NOTE(review): lead.score appears to hold tier strings
    # ('HIGH'/'MEDIUM'/'LOW'), not numbers — confirm against LeadScorer.
    scores = Counter(lead.score for lead in all_leads)
    cats = Counter(lead.service_category for lead in all_leads)

    print("\n" + "═" * 60)
    print(" 📊 RESULTS SUMMARY")
    print("═" * 60)
    print(f" Total Leads : {len(all_leads)}")
    print(f" ├─ Web Dev : {cats.get('web', 0)}")
    print(f" ├─ App Dev : {cats.get('app', 0)}")
    print(f" └─ AI Auto : {cats.get('ai', 0)}")
    print()
    print(" Priority Breakdown:")
    print(f" ├─ 🔴 HIGH : {scores.get('HIGH', 0)}")
    print(f" ├─ 🟠 MEDIUM : {scores.get('MEDIUM', 0)}")
    print(f" └─ 🟢 LOW : {scores.get('LOW', 0)}")
    print()
    print(f" ⏱ Time : {elapsed:.1f}s")
    print()
    if output_paths:
        print(" 📁 Output Files:")
        for fmt, path in output_paths.items():
            print(f" ├─ {fmt.upper():5s} → {path}")
    print("═" * 60 + "\n")


# ──────────────────────────────────────────────────────────────────────────────
# Entry point
# ──────────────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    parser = build_parser()
    args = parser.parse_args()
    try:
        run_pipeline(args)
    except KeyboardInterrupt:
        print("\n\n⚠️ Interrupted by user.\n")
        sys.exit(0)
    except Exception as exc:
        logger.exception(f"Unhandled error: {exc}")
        print(f"\n❌ Fatal error: {exc}\n")
        sys.exit(1)