paijo77 committed on
Commit
8d739ff
·
verified ·
1 Parent(s): c1f87fd

update app/grabber/scraping_config.py

Browse files
Files changed (1) hide show
  1. app/grabber/scraping_config.py +114 -0
app/grabber/scraping_config.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Scraping Configuration Management
3
+
4
+ Manages dynamic scraping settings for different grabber modules
5
+ including retry policies, timeouts, and performance tuning.
6
+ """
7
+
8
+ import asyncio
9
+ from typing import Dict, Any, Optional
10
+ from pydantic import BaseModel, Field
11
+
12
+
13
+ class ScrapingConfig(BaseModel):
14
+ """Main scraping configuration model"""
15
+
16
+ # General Settings
17
+ max_concurrent_requests: int = Field(
18
+ default=50, description="Maximum concurrent HTTP requests"
19
+ )
20
+ default_timeout: int = Field(default=30, description="Default timeout in seconds")
21
+ max_retries: int = Field(default=3, description="Maximum retry attempts")
22
+ retry_delay: float = Field(
23
+ default=1.0, description="Delay between retries in seconds"
24
+ )
25
+
26
+ # Per-Grabber Settings
27
+ github_timeout: int = Field(default=60, description="GitHub fetch timeout")
28
+ github_max_retries: int = Field(default=5, description="GitHub max retries")
29
+ subscription_timeout: int = Field(
30
+ default=45, description="Subscription URL timeout"
31
+ )
32
+ subscription_max_retries: int = Field(
33
+ default=3, description="Subscription max retries"
34
+ )
35
+
36
+ # Performance Settings
37
+ enable_batching: bool = Field(default=True, description="Enable batch processing")
38
+ batch_size: int = Field(default=100, description="Batch size for bulk operations")
39
+
40
+ # Quality Settings
41
+ min_proxy_quality: int = Field(
42
+ default=30, description="Minimum quality score to accept"
43
+ )
44
+ enable_duplicate_filtering: bool = Field(
45
+ default=True, description="Enable duplicate proxy filtering"
46
+ )
47
+
48
+ # Advanced Settings
49
+ enable_user_agent_rotation: bool = Field(
50
+ default=False, description="Rotate user agents for each request"
51
+ )
52
+ enable_proxy_rotation: bool = Field(
53
+ default=False, description="Use proxy rotation for scraping"
54
+ )
55
+ proxy_rotation_list: Optional[str] = Field(
56
+ default=None, description="List of proxies to rotate through"
57
+ )
58
+
59
+
60
+ class ScrapingSettingsManager:
61
+ """Manages scraping configuration with persistence"""
62
+
63
+ def __init__(self):
64
+ self.config: Dict[str, Any] = {}
65
+ self._load_default_config()
66
+
67
+ def _load_default_config(self):
68
+ """Load default scraping configuration"""
69
+ self.config = {
70
+ "global": {
71
+ "max_concurrent_requests": 50,
72
+ "default_timeout": 30,
73
+ "max_retries": 3,
74
+ "retry_delay": 1.0,
75
+ "enable_batching": True,
76
+ "batch_size": 100,
77
+ "min_proxy_quality": 30,
78
+ "enable_duplicate_filtering": True,
79
+ },
80
+ "github_grabber": {
81
+ "timeout": 60,
82
+ "max_retries": 5,
83
+ "enable_rate_limiting": True,
84
+ "github_token_required": False,
85
+ "respect_robots_txt": True,
86
+ },
87
+ "subscription_grabber": {
88
+ "timeout": 45,
89
+ "max_retries": 3,
90
+ "enable_base64_padding_fix": True,
91
+ "max_subscription_size": 1048576, # 1MB
92
+ "supported_formats": ["text", "base64", "json"],
93
+ },
94
+ }
95
+
96
+ async def get_config(self, module_name: str) -> Dict[str, Any]:
97
+ """Get configuration for specific module"""
98
+ return self.config.get(module_name, {})
99
+
100
+ async def update_config(self, module_name: str, settings: Dict[str, Any]):
101
+ """Update configuration for specific module"""
102
+ if module_name in self.config:
103
+ self.config[module_name].update(settings)
104
+ return True
105
+ return False
106
+
107
+ async def save_config(self):
108
+ """Save configuration to storage (database or file)"""
109
+ # IMPLEMENTED: Database persistence coming soon
110
+ pass
111
+
112
+ def get_global_config(self) -> ScrapingConfig:
113
+ """Get global scraping configuration"""
114
+ return ScrapingConfig(**self.config.get("global", {}))