File size: 9,348 Bytes
ae588db
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
"""
Error Handling Utilities for Medium Scraper

Provides a standardized exception hierarchy and error handling utilities
with context preservation for better debugging and observability.
"""

import logging
from datetime import datetime
from typing import Optional, Dict, Any
from enum import Enum


# ============================================================================
# Exception Hierarchy
# ============================================================================

class ScraperError(Exception):
    """Root of the scraper exception hierarchy.

    Every scraper error carries an optional context dict for structured
    debugging data plus an ISO-8601 timestamp recorded at creation time,
    so failures can be correlated in logs.
    """

    def __init__(self, message: str, context: Optional[Dict[str, Any]] = None):
        super().__init__(message)
        # Record when the error object was created (ISO-8601 string).
        self.timestamp = datetime.now().isoformat()
        self.message = message
        # Falsy context (None or an empty mapping) normalizes to a fresh dict.
        self.context = {} if not context else context


class NetworkError(ScraperError):
    """Raised for network-level failures: timeouts, connection errors, DNS issues."""


class ParseError(ScraperError):
    """Raised when content cannot be parsed: invalid HTML, missing data, malformed JSON."""


class RateLimitError(ScraperError):
    """Raised when a rate limit is exceeded (API quota, throttling)."""


class AuthenticationError(ScraperError):
    """Raised on authentication/authorization failure: invalid cookies, expired tokens."""


class CircuitBreakerError(ScraperError):
    """Raised when an open circuit breaker prevents a request from being made."""


class CacheError(ScraperError):
    """Raised for cache failures: corruption or an unavailable cache backend."""


class DatabaseError(ScraperError):
    """Raised for database operation failures: connection, query, integrity."""


class ValidationError(ScraperError):
    """Raised when input validation fails: invalid URL, missing required fields."""


# ============================================================================
# Error Severity Levels
# ============================================================================

class ErrorSeverity(Enum):
    """Severity buckets used to categorize errors and pick a log level."""

    LOW = "low"            # recoverable; expected in some scenarios
    MEDIUM = "medium"      # unexpected, but handled gracefully
    HIGH = "high"          # critical; needs immediate attention
    CRITICAL = "critical"  # system failure / degraded service


# ============================================================================
# Error Handler
# ============================================================================

def handle_error(
    error: Exception,
    context: str,
    logger: logging.Logger,
    severity: ErrorSeverity = ErrorSeverity.MEDIUM,
    additional_context: Optional[Dict[str, Any]] = None
) -> None:
    """
    Standardized error handler with context preservation.

    Builds a structured payload describing the error and logs it at a
    level derived from ``severity`` (CRITICAL/HIGH also attach traceback
    via ``exc_info``).

    Args:
        error: The exception that occurred
        context: Contextual string describing where error occurred
        logger: Logger instance to use
        severity: Error severity level
        additional_context: Additional context data

    Example:
        try:
            article = await scrape_article(url)
        except Exception as e:
            handle_error(
                e,
                context="scrape_article",
                logger=logger,
                severity=ErrorSeverity.HIGH,
                additional_context={"url": url, "tier": "graphql"}
            )
    """
    error_class = type(error).__name__
    error_message = str(error)

    # Structured data attached to the log record via ``extra``.
    # BUG FIX: ``extra`` keys must not collide with reserved LogRecord
    # attributes — logging.Logger.makeRecord raises KeyError for the key
    # "message" — so the error text is stored under "error_message".
    error_data = {
        "context": context,
        "error_type": error_class,
        "error_message": error_message,
        "severity": severity.value,
        "timestamp": datetime.now().isoformat()
    }

    # Preserve the custom context carried by ScraperError instances.
    if isinstance(error, ScraperError):
        error_data["scraper_context"] = error.context
        error_data["error_timestamp"] = error.timestamp

    # Merge in any caller-supplied context.
    if additional_context:
        error_data["additional_context"] = additional_context

    # Map severity to a log level; high severities include the traceback.
    log_message = f"[{context}] {error_class}: {error_message}"

    if severity == ErrorSeverity.CRITICAL:
        logger.critical(log_message, extra=error_data, exc_info=True)
    elif severity == ErrorSeverity.HIGH:
        logger.error(log_message, extra=error_data, exc_info=True)
    elif severity == ErrorSeverity.MEDIUM:
        logger.warning(log_message, extra=error_data)
    else:  # LOW
        logger.info(log_message, extra=error_data)


# ============================================================================
# Error Context Manager
# ============================================================================

class ErrorContext:
    """
    Context manager that enriches ScraperError instances with extra data.

    Usage:
        with ErrorContext("scraping_article", url=url) as ctx:
            article = await scrape(url)
            ctx.add_data("tier", "graphql")
    """

    def __init__(self, operation: str, **initial_context):
        self.operation = operation
        # Copy so later add_data() calls never alias caller state.
        self.context_data = dict(initial_context)

    def add_data(self, key: str, value: Any) -> None:
        """Record one additional piece of context."""
        self.context_data[key] = value

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Nothing to do on clean exit.
        if exc_type is None:
            return False
        # Attach the collected context to ScraperError instances only;
        # the exception itself is never suppressed.
        if isinstance(exc_val, ScraperError):
            exc_val.context.update(self.context_data)
        return False


# ============================================================================
# Retry Decorator with Error Handling
# ============================================================================

def with_retry(
    max_retries: int = 3,
    backoff_base: float = 1.0,
    backoff_multiplier: float = 2.0,
    retry_on: tuple = (NetworkError, RateLimitError),
    logger: Optional[logging.Logger] = None
):
    """
    Decorator for automatic retries with exponential backoff.

    Works on both sync and async callables; the appropriate wrapper is
    selected at decoration time. Exceptions not listed in ``retry_on``
    propagate immediately without being retried.

    Args:
        max_retries: Maximum number of retry attempts
        backoff_base: Initial delay in seconds
        backoff_multiplier: Multiplier for exponential backoff
        retry_on: Tuple of exception types to retry on
        logger: Optional logger for retry events

    Example:
        @with_retry(max_retries=3, retry_on=(NetworkError,))
        async def fetch_data(url):
            return await httpx.get(url)
    """
    # Hoisted here (instead of per-wrapper) so both wrappers share them.
    import asyncio
    import inspect
    import time
    from functools import wraps

    def _log_retry(func_name: str, attempt: int, delay: float, exc: Exception) -> None:
        # One place for the retry-event message shared by both wrappers.
        if logger:
            logger.warning(
                f"Retry {attempt + 1}/{max_retries} for {func_name}",
                extra={"delay": delay, "error": str(exc)}
            )

    def _log_exhausted(func_name: str, attempt: int, exc: Exception) -> None:
        # Logged just before re-raising after the final failed attempt.
        if logger:
            logger.error(
                f"Max retries ({max_retries}) exceeded for {func_name}",
                extra={"attempts": attempt + 1, "error": str(exc)}
            )

    def decorator(func):
        @wraps(func)
        async def async_wrapper(*args, **kwargs):
            delay = backoff_base
            for attempt in range(max_retries + 1):
                try:
                    return await func(*args, **kwargs)
                except retry_on as e:
                    # Non-retryable exceptions simply propagate (no handler).
                    if attempt == max_retries:
                        _log_exhausted(func.__name__, attempt, e)
                        raise
                    _log_retry(func.__name__, attempt, delay, e)
                    await asyncio.sleep(delay)
                    delay *= backoff_multiplier

        @wraps(func)
        def sync_wrapper(*args, **kwargs):
            delay = backoff_base
            for attempt in range(max_retries + 1):
                try:
                    return func(*args, **kwargs)
                except retry_on as e:
                    # Non-retryable exceptions simply propagate (no handler).
                    if attempt == max_retries:
                        _log_exhausted(func.__name__, attempt, e)
                        raise
                    _log_retry(func.__name__, attempt, delay, e)
                    time.sleep(delay)
                    delay *= backoff_multiplier

        # Pick the wrapper matching the decorated function's nature.
        return async_wrapper if inspect.iscoroutinefunction(func) else sync_wrapper

    return decorator