|
|
import json
|
|
|
import time
|
|
|
from datetime import datetime
|
|
|
from typing import Dict, List
|
|
|
import logging
|
|
|
from serpapi import GoogleSearch
|
|
|
from pathlib import Path
|
|
|
|
|
|
from config import (
|
|
|
SERP_API_KEY, SERP_MONTHLY_LIMIT, SEARCH_QUERIES,
|
|
|
RAW_DIR, LOG_DIR
|
|
|
)
|
|
|
|
|
|
class BloomingtonScraper:
    """Collects Google search results for Bloomington, IN via the SERP API.

    Tracks how many API calls have been made against SERP_MONTHLY_LIMIT and
    persists raw responses, per-category results, and usage statistics as
    JSON files under RAW_DIR.
    """

    def __init__(self):
        """Initialize counters and route log output to a timestamped file.

        NOTE: logging.basicConfig configures the *root* logger and is a
        silent no-op if logging was already configured elsewhere (or by a
        previous instance) — only the first configuration wins.
        """
        self.search_count = 0          # SERP API calls made so far this run
        self.results_by_category = {}  # category name -> list of organic results

        log_file = LOG_DIR / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename=log_file
        )

    @staticmethod
    def _write_json(path: Path, payload) -> None:
        """Serialize *payload* as pretty-printed JSON to *path*.

        Writes UTF-8 explicitly so output does not depend on the platform's
        default encoding (which can mangle non-ASCII result text).
        """
        with open(path, 'w', encoding='utf-8') as f:
            json.dump(payload, f, indent=2)

    def _make_serp_request(self, query: str, category: str) -> List[Dict]:
        """Make a single SERP API request.

        Saves the raw API response under RAW_DIR and returns the list of
        organic results. Returns [] when the monthly quota is exhausted or
        the API call fails.

        Args:
            query: Search string to submit to Google via SerpAPI.
            category: Label embedded in the raw-results filename.
        """
        if self.search_count >= SERP_MONTHLY_LIMIT:
            logging.warning("Monthly SERP API limit reached")
            return []

        params = {
            "api_key": SERP_API_KEY,
            "engine": "google",
            "q": query,
            "location": "Bloomington, Indiana, United States",
            "google_domain": "google.com",
            "num": 100,  # request the maximum results per page
            "start": 0
        }

        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            self.search_count += 1

            # Keep the untouched API payload for debugging / reprocessing.
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            self._write_json(
                RAW_DIR / f"raw_results_{category}_{timestamp}.json", results
            )

            logging.info(f"SERP API calls used: {self.search_count}/{SERP_MONTHLY_LIMIT}")
            return results.get('organic_results', [])

        except Exception as e:
            # Broad catch is deliberate: serpapi raises library-specific
            # errors we can't enumerate here, and one failed query must not
            # abort the whole scraping run.
            logging.error(f"SERP API error for query '{query}': {e}")
            return []

    def scrape_all_categories(self) -> Dict[str, List[Dict]]:
        """Scrape data for all categories defined in SEARCH_QUERIES.

        Stops issuing requests once the monthly API quota is reached. Each
        category's combined results are also written to
        RAW_DIR/<category>_results.json.

        Returns:
            Mapping of category name to its accumulated organic results.
        """
        for category, queries in SEARCH_QUERIES.items():
            logging.info(f"Starting scraping for category: {category}")
            category_results = []

            for query in queries:
                if self.search_count >= SERP_MONTHLY_LIMIT:
                    logging.warning(f"Monthly limit reached during {category} scraping")
                    break

                category_results.extend(self._make_serp_request(query, category))
                time.sleep(2)  # throttle: pause between API calls

            self.results_by_category[category] = category_results

            # Persist the category snapshot so partial runs aren't lost.
            self._write_json(RAW_DIR / f"{category}_results.json", category_results)

            logging.info(f"Completed scraping for {category}: {len(category_results)} results")

        return self.results_by_category

    def get_search_stats(self) -> Dict:
        """Get statistics about the search results.

        Also persists the statistics to RAW_DIR/search_stats.json.

        Returns:
            Dict with total_searches, remaining_searches, and a
            results_per_category mapping of category -> result count.
        """
        stats = {
            "total_searches": self.search_count,
            "remaining_searches": SERP_MONTHLY_LIMIT - self.search_count,
            "results_per_category": {
                category: len(results)
                for category, results in self.results_by_category.items()
            }
        }

        self._write_json(RAW_DIR / "search_stats.json", stats)

        return stats