paijo77 commited on
Commit
404d1c0
·
verified ·
1 Parent(s): 8d739ff

update app/grabber/scraping_enhancements.py

Browse files
Files changed (1) hide show
  1. app/grabber/scraping_enhancements.py +131 -0
app/grabber/scraping_enhancements.py ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Enhanced scraping service with advanced features like proxy rotation,
3
+ rate limiting, and performance monitoring.
4
+ """
5
+
6
+ from typing import List, Dict, Any, Optional
7
+ from datetime import datetime
8
+ import asyncio
9
+ import time
10
+ import logging
11
+
12
+ from app.grabber.scraping_utils import ProxyAgent
13
+
14
+
15
class ScrapingEnhancementConfig:
    """Configuration for enhanced scraping operations.

    A plain attribute bag: every constructor argument is stored verbatim
    on the instance under the same name.
    """

    def __init__(
        self,
        enable_proxy_rotation: bool = True,
        max_concurrent_requests: int = 10,
        rate_limit_per_second: int = 5,
        enable_retry: bool = True,
        max_retries: int = 3,
        timeout_seconds: int = 30,
    ):
        """Store the scraping tuning knobs as instance attributes.

        Args:
            enable_proxy_rotation: Rotate through the proxy pool when True.
            max_concurrent_requests: Upper bound on in-flight requests.
            rate_limit_per_second: Allowed requests per second.
            enable_retry: Retry failed requests when True.
            max_retries: Maximum retry attempts per request.
            timeout_seconds: Per-request timeout.
        """
        self.enable_proxy_rotation = enable_proxy_rotation  # proxy pool rotation on/off
        self.max_concurrent_requests = max_concurrent_requests  # concurrency ceiling
        self.rate_limit_per_second = rate_limit_per_second  # throttle budget
        self.enable_retry = enable_retry  # retry failed requests
        self.max_retries = max_retries  # retry attempt cap
        self.timeout_seconds = timeout_seconds  # request timeout
33
+
34
+
35
class EnhancedScrapingService:
    """Enhanced scraping service with proxy rotation and monitoring.

    NOTE(review): ``ProxyRotator``, ``RequestQueue`` and ``RateLimiter`` are
    referenced in ``__init__`` but never imported in this module — unless they
    are injected into the module namespace elsewhere, constructing this class
    raises ``NameError``. Confirm their import path (presumably
    ``app.grabber.scraping_utils``, alongside ``ProxyAgent``).
    """

    def __init__(self, config: ScrapingEnhancementConfig):
        """Wire up helper components from *config*.

        Args:
            config: Tuning knobs for concurrency, rate limits, retries and
                timeouts.
        """
        self.config = config
        self.proxy_rotator = ProxyRotator(proxies=[])
        self.request_queue = RequestQueue(max_concurrent=config.max_concurrent_requests)
        self.rate_limiter = RateLimiter()
        self.performance_monitor = PerformanceMonitor()
        self.logger = logging.getLogger(__name__)

        # Imported locally, presumably to avoid a circular import — confirm.
        from app.grabber.scraping_config import ScrapingConfigManager

        self.config_manager = ScrapingConfigManager()

    async def initialize_services(self) -> bool:
        """Initialize async components of the service.

        Currently a no-op beyond logging; always returns True.
        """
        self.logger.info("Enhanced scraping service initialized successfully")
        return True

    async def scrape_with_enhancements(
        self,
        source_configs: List[Dict[str, Any]],
    ) -> Dict[str, Any]:
        """Scrape multiple sources with enhanced features.

        Sources are processed sequentially (concurrency and rate limiting are
        not yet applied here despite the configured limits); falsy per-source
        results are dropped from the aggregate.

        Args:
            source_configs: One configuration dict per source to scrape.

        Returns:
            Dict with ``sources_processed``, ``successful_scrapes`` and the
            collected ``results`` list.
        """
        results = []

        for source_config in source_configs:
            proxy = await self.proxy_rotator.get_next_proxy()
            result = await self._scrape_single_with_rate_limit(source_config, proxy, {})
            if result:
                results.append(result)

        return {
            "sources_processed": len(source_configs),
            "successful_scrapes": len(results),
            "results": results,
        }

    async def _scrape_single_with_rate_limit(
        self,
        source_config: Dict[str, Any],
        # Fix: annotated Optional — the body explicitly handles a falsy/None
        # proxy, so plain ``ProxyAgent`` was an incorrect contract.
        proxy: Optional[ProxyAgent],
        retry_config: Dict[str, Any],
    ) -> Optional[str]:
        """Scrape source with rate limit per proxy.

        Placeholder implementation: returns a marker string instead of
        performing a real request; ``source_config`` and ``retry_config``
        are currently ignored.
        """

        return f"proxy_data_for_{proxy.proxy_url}" if proxy else "no_proxy_available"
83
+
84
+
85
class PerformanceMonitor:
    """Monitor scraping performance metrics.

    Keeps running counters plus an incrementally updated mean response
    time in a single ``metrics`` dict.
    """

    def __init__(self):
        # All counters start at zero; avg_response_time is a running mean.
        self.metrics = {
            "total_requests": 0,
            "successful_requests": 0,
            "failed_requests": 0,
            "total_data_bytes": 0,
            "avg_response_time": 0.0,
        }
        self.logger = logging.getLogger(__name__)

    def record_request(self, success: bool, response_time: float, data_size: int = 0):
        """Fold a single request outcome into the running metrics.

        Args:
            success: Whether the request succeeded.
            response_time: Duration of the request (seconds).
            data_size: Bytes received, added to the running total.
        """
        m = self.metrics
        m["total_requests"] += 1
        outcome_key = "successful_requests" if success else "failed_requests"
        m[outcome_key] += 1
        m["total_data_bytes"] += data_size

        # Running mean: new_avg = (old_avg * (n - 1) + x) / n, where n already
        # counts the request recorded above.
        n = m["total_requests"]
        prev_avg = m["avg_response_time"]
        m["avg_response_time"] = (prev_avg * (n - 1) + response_time) / n

    def get_overall_stats(self) -> Dict[str, Any]:
        """Return a rounded summary snapshot of all collected metrics."""
        m = self.metrics
        total = m["total_requests"]
        succeeded = m["successful_requests"]

        # Guard against division by zero before any request is recorded.
        success_rate = (succeeded / total * 100) if total > 0 else 0

        return {
            "total_requests": total,
            "successful_requests": succeeded,
            "failed_requests": m["failed_requests"],
            "success_rate": round(success_rate, 2),
            "total_data_bytes": m["total_data_bytes"],
            "avg_response_time": round(m["avg_response_time"], 3),
        }
+ }