import json
import time
from datetime import datetime
from typing import Dict, List
import logging
from serpapi import GoogleSearch
from pathlib import Path

from config import (
    SERP_API_KEY, SERP_MONTHLY_LIMIT, SEARCH_QUERIES,
    RAW_DIR, LOG_DIR
)
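
# Note: config.py is assumed to provide SERP_API_KEY (str), SERP_MONTHLY_LIMIT (int),
# SEARCH_QUERIES (a mapping of category name -> list of query strings), and
# RAW_DIR / LOG_DIR as existing directories (pathlib.Path objects); this module
# does not create those directories itself.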

class BloomingtonScraper:
    def __init__(self):
        self.search_count = 0
        self.results_by_category = {}
        
        # Set up logging
        log_file = LOG_DIR / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename=log_file
        )

    def _make_serp_request(self, query: str, category: str) -> List[Dict]:
        """Make a single SERP API request"""
        if self.search_count >= SERP_MONTHLY_LIMIT:
            logging.warning("Monthly SERP API limit reached")
            return []

        params = {
            "api_key": SERP_API_KEY,
            "engine": "google",
            "q": query,
            "location": "Bloomington, Indiana, United States",
            "google_domain": "google.com",
            "num": 100,  # Get maximum results per query
            "start": 0
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            self.search_count += 1
            
            # Save raw results
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            raw_file = RAW_DIR / f"raw_results_{category}_{timestamp}.json"
            with open(raw_file, 'w') as f:
                json.dump(results, f, indent=2)
            
            logging.info(f"SERP API calls used: {self.search_count}/{SERP_MONTHLY_LIMIT}")
            return results.get('organic_results', [])

        except Exception as e:
            logging.error(f"SERP API error for query '{query}': {e}")
            return []

    def scrape_all_categories(self) -> Dict[str, List[Dict]]:
        """Scrape data for all categories"""
        for category, queries in SEARCH_QUERIES.items():
            logging.info(f"Starting scraping for category: {category}")
            category_results = []
            
            for query in queries:
                if self.search_count >= SERP_MONTHLY_LIMIT:
                    logging.warning(f"Monthly limit reached during {category} scraping")
                    break
                
                results = self._make_serp_request(query, category)
                category_results.extend(results)
                time.sleep(2)  # Polite delay between requests
            
            self.results_by_category[category] = category_results
            
            # Save category results
            category_file = RAW_DIR / f"{category}_results.json"
            with open(category_file, 'w') as f:
                json.dump(category_results, f, indent=2)
            
            logging.info(f"Completed scraping for {category}: {len(category_results)} results")
        
        return self.results_by_category

    def get_search_stats(self) -> Dict:
        """Get statistics about the search results"""
        stats = {
            "total_searches": self.search_count,
            "remaining_searches": SERP_MONTHLY_LIMIT - self.search_count,
            "results_per_category": {
                category: len(results) 
                for category, results in self.results_by_category.items()
            }
        }
        
        # Save stats
        stats_file = RAW_DIR / "search_stats.json"
        with open(stats_file, 'w') as f:
            json.dump(stats, f, indent=2)
        
        return stats
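
# Illustrative entry point (a minimal sketch, not part of the original module):
# run a full scrape across all configured categories, then print usage stats.
if __name__ == "__main__":
    scraper = BloomingtonScraper()
    scraper.scrape_all_categories()
    print(json.dumps(scraper.get_search_stats(), indent=2))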