import logging
import time
from typing import Dict, Any, List, Optional, Tuple
from concurrent.futures import ThreadPoolExecutor, as_completed
from urllib.parse import urlparse

from scrapers.base_scraper import BaseScraper
from scrapers.news_scraper import NewsScraper
from scrapers.blog_scraper import BlogScraper
from utils.rate_limiter import RateLimiter

logger = logging.getLogger(__name__)
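
# Note: this module assumes utils.rate_limiter.RateLimiter exposes a small
# sliding-window interface -- can_proceed(), get_wait_time(), and
# record_request() -- matching the calls made in scrape_url() below.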


class ScraperService:
    """Service to manage scraping operations"""

    def __init__(self, max_workers: int = 5, timeout: int = 30):
        """
        Initialize the scraper service.

        Args:
            max_workers: Maximum number of concurrent scrapers
            timeout: Timeout for each scraping operation in seconds
        """
        self.max_workers = max_workers
        self.timeout = timeout
        self.rate_limiters = {}  # Domain-specific rate limiters

        # Register available scrapers
        self.scrapers = {
            "news": NewsScraper(timeout),
            "blog": BlogScraper(timeout),
        }

    def get_scraper_for_url(self, url: str) -> Tuple[BaseScraper, str]:
        """
        Determine the appropriate scraper to use for a URL.

        Args:
            url: URL to scrape

        Returns:
            Tuple of (scraper instance, scraper type)
        """
        # Simple logic to determine the scraper type based on URL patterns.
        # This could be enhanced with more sophisticated detection.
        domain = urlparse(url).netloc.lower()

        # News site patterns
        news_patterns = ["news", "cnn", "bbc", "reuters", "nytimes", "washingtonpost",
                         "guardian", "aljazeera", "foxnews", "nbcnews", "abc"]

        # Blog patterns
        blog_patterns = ["blog", "medium", "wordpress", "blogspot", "tumblr",
                         "substack", "ghost", "hashnode"]

        # Check the domain against each pattern set
        for pattern in news_patterns:
            if pattern in domain:
                return self.scrapers["news"], "news"

        for pattern in blog_patterns:
            if pattern in domain:
                return self.scrapers["blog"], "blog"

        # Default to the news scraper
        return self.scrapers["news"], "news"

    def _get_rate_limiter(self, domain: str) -> RateLimiter:
        """Get or create a rate limiter for a specific domain"""
        if domain not in self.rate_limiters:
            # Default: 5 requests per minute for each domain
            self.rate_limiters[domain] = RateLimiter(window_size=60, max_requests=5)
        return self.rate_limiters[domain]

    def scrape_url(self, url: str, scraper_type: Optional[str] = None) -> Dict[str, Any]:
        """
        Scrape a single URL.

        Args:
            url: URL to scrape
            scraper_type: Optional type of scraper to use

        Returns:
            Dictionary with scraped data
        """
        try:
            # Parse the domain for rate limiting
            domain = urlparse(url).netloc
            rate_limiter = self._get_rate_limiter(domain)

            # Wait if the domain's rate limit has been reached
            if not rate_limiter.can_proceed():
                wait_time = rate_limiter.get_wait_time()
                logger.warning(f"Rate limit reached for {domain}. Waiting {wait_time:.2f} seconds")
                time.sleep(wait_time)

            # Select the appropriate scraper
            if scraper_type and scraper_type in self.scrapers:
                scraper = self.scrapers[scraper_type]
                selected_type = scraper_type
            else:
                scraper, selected_type = self.get_scraper_for_url(url)

            logger.info(f"Scraping {url} with {selected_type} scraper")

            # Perform scraping and record the request against the rate limiter
            result = scraper.scrape(url)
            rate_limiter.record_request()

            # Add metadata about the scraping run
            result["scraper_type"] = selected_type
            result["scraped_at"] = time.time()

            return result
        except Exception as e:
            logger.error(f"Error scraping URL {url}: {str(e)}")
            return {
                "success": False,
                "url": url,
                "error": str(e),
                "scraped_at": time.time(),
            }

    def scrape_multiple_urls(self, urls: List[str]) -> List[Dict[str, Any]]:
        """
        Scrape multiple URLs in parallel.

        Args:
            urls: List of URLs to scrape

        Returns:
            List of dictionaries with scraped data
        """
        results = []

        # Use a ThreadPoolExecutor for concurrent scraping
        with ThreadPoolExecutor(max_workers=self.max_workers) as executor:
            # Submit one scraping task per URL
            future_to_url = {executor.submit(self.scrape_url, url): url for url in urls}

            # Collect results as they complete
            for future in as_completed(future_to_url):
                url = future_to_url[future]
                try:
                    result = future.result()
                    results.append(result)
                    logger.info(f"Completed scraping: {url}")
                except Exception as e:
                    logger.error(f"Exception scraping {url}: {str(e)}")
                    results.append({
                        "success": False,
                        "url": url,
                        "error": str(e),
                        "scraped_at": time.time(),
                    })

        return results
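

# --- Illustrative usage sketch (not part of the original service) ---
# A minimal example of how ScraperService might be driven, assuming the
# scrapers and utils packages imported above are available and that each
# scraper's scrape() returns a dict (scrape_url adds "scraper_type" and
# "scraped_at" on success, or "success": False plus "error" on failure).
# The URLs below are hypothetical placeholders.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    service = ScraperService(max_workers=3, timeout=15)

    # Single URL: the scraper type is inferred from the domain
    single = service.scrape_url("https://www.bbc.com/news/world")
    print(single.get("scraper_type"), single.get("error"))

    # Multiple URLs scraped concurrently via the thread pool
    batch = service.scrape_multiple_urls([
        "https://medium.com/@example/some-post",
        "https://www.reuters.com/world/",
    ])
    for item in batch:
        print(item.get("url", "<unknown>"), item.get("error"))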