update app/grabber/scraping_enhancements.py
Browse files
app/grabber/scraping_enhancements.py
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Enhanced scraping service with advanced features like proxy rotation,
|
| 3 |
+
rate limiting, and performance monitoring.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import asyncio
import logging
import time
from dataclasses import dataclass
from datetime import datetime
from typing import List, Dict, Any, Optional

from app.grabber.scraping_utils import ProxyAgent
|
| 13 |
+
|
| 14 |
+
|
| 15 |
+
@dataclass
class ScrapingEnhancementConfig:
    """Configuration for enhanced scraping operations.

    Converted to a dataclass: the generated ``__init__`` has exactly the
    same parameter names, order, and defaults as the original hand-written
    one, so all existing callers keep working, and we gain ``__repr__``
    and ``__eq__`` for free.
    """

    # Rotate through the proxy pool between requests.
    enable_proxy_rotation: bool = True
    # Upper bound on simultaneous in-flight requests.
    max_concurrent_requests: int = 10
    # Maximum number of requests issued per second.
    rate_limit_per_second: int = 5
    # Retry failed requests (up to max_retries attempts).
    enable_retry: bool = True
    max_retries: int = 3
    # Per-request timeout, in seconds.
    timeout_seconds: int = 30
|
| 33 |
+
|
| 34 |
+
|
| 35 |
+
class EnhancedScrapingService:
    """Enhanced scraping service with proxy rotation and monitoring.

    Orchestrates per-source scraping: fetches a proxy from the rotator for
    each source config, delegates to the (currently placeholder) rate-limited
    scraper, and aggregates results into a summary dict.
    """

    def __init__(self, config: ScrapingEnhancementConfig):
        self.config = config
        # NOTE(review): ProxyRotator, RequestQueue, and RateLimiter are not
        # imported or defined anywhere in this module (only ProxyAgent is
        # imported at the top), so instantiating this class will raise
        # NameError unless those names are injected elsewhere.
        # TODO: confirm where they live and import them.
        self.proxy_rotator = ProxyRotator(proxies=[])
        self.request_queue = RequestQueue(max_concurrent=config.max_concurrent_requests)
        self.rate_limiter = RateLimiter()
        self.performance_monitor = PerformanceMonitor()
        self.logger = logging.getLogger(__name__)

        # Function-scope import — presumably to dodge a circular import
        # between this module and scraping_config; verify before hoisting
        # it to the top of the file.
        from app.grabber.scraping_config import ScrapingConfigManager

        self.config_manager = ScrapingConfigManager()

    async def initialize_services(self):
        """Initialize async components of the service.

        Currently a no-op beyond the log line: it always returns True and
        performs no actual async setup.
        """
        self.logger.info("Enhanced scraping service initialized successfully")
        return True

    async def scrape_with_enhancements(
        self,
        source_configs: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Scrape multiple sources with enhanced features.

        Processes the sources sequentially: one proxy fetch plus one scrape
        per source config. Falsy per-source results are dropped from the
        aggregate.

        Returns a summary dict with "sources_processed" (input count),
        "successful_scrapes" (truthy result count), and "results".
        """
        results = []

        for source_config in source_configs:
            proxy = await self.proxy_rotator.get_next_proxy()
            # Retry config is passed as an empty dict — the placeholder
            # implementation below ignores it anyway.
            result = await self._scrape_single_with_rate_limit(source_config, proxy, {})
            if result:
                results.append(result)

        return {
            "sources_processed": len(source_configs),
            "successful_scrapes": len(results),
            "results": results,
        }

    async def _scrape_single_with_rate_limit(
        self,
        source_config: Dict[str, Any],
        proxy: ProxyAgent,
        retry_config: Dict[str, Any],
    ) -> Optional[str]:
        """Scrape source with rate limit per proxy.

        NOTE(review): placeholder implementation — no HTTP request, rate
        limiting, or retry logic is performed; source_config and
        retry_config are unused. Returns a synthetic string derived from
        the proxy URL, or "no_proxy_available" when proxy is falsy.
        """
        return f"proxy_data_for_{proxy.proxy_url}" if proxy else "no_proxy_available"
|
| 83 |
+
|
| 84 |
+
|
| 85 |
+
class PerformanceMonitor:
    """Monitor scraping performance metrics.

    Keeps running counters for request outcomes and payload sizes plus an
    incrementally maintained arithmetic mean of response times.
    """

    def __init__(self):
        # All metrics live in one dict so they can be exposed wholesale.
        self.metrics = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "total_data_bytes": 0,
            "avg_response_time": 0.0,
        }
        self.logger = logging.getLogger(__name__)

    def record_request(self, success: bool, response_time: float, data_size: int = 0):
        """Record a request metric.

        Updates the outcome counters, accumulates the payload size, and
        folds response_time into the running average.
        """
        m = self.metrics
        m["total_requests"] += 1
        m["successful_requests" if success else "failed_requests"] += 1
        m["total_data_bytes"] += data_size

        # Running mean over n samples: (old_mean * (n - 1) + x) / n.
        n = m["total_requests"]
        previous_mean = m["avg_response_time"]
        m["avg_response_time"] = (previous_mean * (n - 1) + response_time) / n

    def get_overall_stats(self) -> Dict[str, Any]:
        """Get overall performance statistics.

        Returns a snapshot dict including a percentage success rate
        (rounded to 2 places; 0 when nothing has been recorded) and the
        average response time rounded to 3 places.
        """
        m = self.metrics
        total = m["total_requests"]
        ok = m["successful_requests"]

        rate = (ok / total * 100) if total > 0 else 0

        return {
            "total_requests": total,
            "successful_requests": ok,
            "failed_requests": m["failed_requests"],
            "success_rate": round(rate, 2),
            "total_data_bytes": m["total_data_bytes"],
            "avg_response_time": round(m["avg_response_time"], 3),
        }
|