# version1/scraper.py
# (Uploaded by krishna3103 — "Upload 9 files", commit b56d4a6, verified.)
import json
import time
from datetime import datetime
from typing import Dict, List
import logging
from serpapi import GoogleSearch
from pathlib import Path
from config import (
SERP_API_KEY, SERP_MONTHLY_LIMIT, SEARCH_QUERIES,
RAW_DIR, LOG_DIR
)
class BloomingtonScraper:
    """Scrape Google results for Bloomington, Indiana via the SerpAPI.

    Tracks API usage against a monthly quota (SERP_MONTHLY_LIMIT), writes
    every raw API response and each category's aggregated results as JSON
    files under RAW_DIR, and logs to a timestamped file under LOG_DIR.
    """

    def __init__(self, delay: float = 2.0):
        """Initialize counters and file-based logging.

        Args:
            delay: Seconds to sleep between successive API requests as a
                polite rate limit (previously hard-coded to 2 seconds).
        """
        self.search_count = 0  # number of SERP API calls made so far
        self.results_by_category: Dict[str, List[Dict]] = {}
        self.delay = delay
        # Set up logging to a timestamped file.
        # NOTE(review): logging.basicConfig is a no-op if the root logger
        # is already configured elsewhere in the process — confirm this
        # constructor is the only place logging gets configured.
        log_file = LOG_DIR / f"scraper_{datetime.now().strftime('%Y%m%d_%H%M%S')}.log"
        logging.basicConfig(
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s',
            filename=log_file
        )

    def _make_serp_request(self, query: str, category: str) -> List[Dict]:
        """Make a single SERP API request and persist the raw response.

        Args:
            query: Google search query string.
            category: Category label used to name the raw-results file.

        Returns:
            The 'organic_results' list from the response, or [] when the
            monthly quota is exhausted or the request fails.
        """
        if self.search_count >= SERP_MONTHLY_LIMIT:
            logging.warning("Monthly SERP API limit reached")
            return []
        params = {
            "api_key": SERP_API_KEY,
            "engine": "google",
            "q": query,
            "location": "Bloomington, Indiana, United States",
            "google_domain": "google.com",
            "num": 100,  # maximum results per page supported by the engine
            "start": 0
        }
        try:
            search = GoogleSearch(params)
            results = search.get_dict()
            self.search_count += 1
            # Save the raw response so results can be re-processed later
            # without spending another API call.
            timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
            raw_file = RAW_DIR / f"raw_results_{category}_{timestamp}.json"
            with open(raw_file, 'w', encoding='utf-8') as f:
                json.dump(results, f, indent=2)
            logging.info(f"SERP API calls used: {self.search_count}/{SERP_MONTHLY_LIMIT}")
            return results.get('organic_results', [])
        except Exception:
            # logging.exception records the full traceback, unlike the
            # plain logging.error call it replaces.
            logging.exception(f"SERP API error for query '{query}'")
            return []

    def scrape_all_categories(self) -> Dict[str, List[Dict]]:
        """Scrape every query in SEARCH_QUERIES, grouped by category.

        Stops early (per category) once the monthly quota is reached.
        Writes each category's aggregated results to RAW_DIR as
        '<category>_results.json'.

        Returns:
            Mapping of category name to its accumulated organic results.
        """
        for category, queries in SEARCH_QUERIES.items():
            logging.info(f"Starting scraping for category: {category}")
            category_results = []
            for query in queries:
                if self.search_count >= SERP_MONTHLY_LIMIT:
                    logging.warning(f"Monthly limit reached during {category} scraping")
                    break
                results = self._make_serp_request(query, category)
                category_results.extend(results)
                time.sleep(self.delay)  # polite delay between requests
            self.results_by_category[category] = category_results
            # Persist the aggregated per-category results.
            category_file = RAW_DIR / f"{category}_results.json"
            with open(category_file, 'w', encoding='utf-8') as f:
                json.dump(category_results, f, indent=2)
            logging.info(f"Completed scraping for {category}: {len(category_results)} results")
        return self.results_by_category

    def get_search_stats(self) -> Dict:
        """Compute, persist, and return usage statistics for this session.

        Returns:
            Dict with total searches used, searches remaining against the
            monthly quota, and a per-category result count. Also written
            to RAW_DIR / 'search_stats.json' as a side effect.
        """
        stats = {
            "total_searches": self.search_count,
            "remaining_searches": SERP_MONTHLY_LIMIT - self.search_count,
            "results_per_category": {
                category: len(results)
                for category, results in self.results_by_category.items()
            }
        }
        # Save stats alongside the raw results for later inspection.
        stats_file = RAW_DIR / "search_stats.json"
        with open(stats_file, 'w', encoding='utf-8') as f:
            json.dump(stats, f, indent=2)
        return stats