File size: 7,787 Bytes
330b6e4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
"""Health check endpoints for monitoring and load balancing."""

import time
import psutil
from datetime import datetime
from flask import Blueprint, jsonify, current_app
import redis
import psycopg2
from sqlalchemy import text

from chat_agent.models.base import db
from chat_agent.utils.error_handler import get_error_handler

health_bp = Blueprint('health', __name__, url_prefix='/health')


def check_database():
    """Check database connectivity and basic operations.

    Runs a trivial ``SELECT 1`` through the shared SQLAlchemy session and
    then counts rows in ``information_schema.tables`` named
    ``schema_migrations``.

    Returns:
        dict: On success ``status`` is ``'healthy'`` with ``connection``,
        ``migrations_table`` (``'exists'`` or ``'missing'``) and a
        placeholder ``response_time_ms`` of 0 for the caller to overwrite.
        If anything raises, ``status`` is ``'unhealthy'`` with the
        exception text in ``error`` and ``connection`` set to ``'failed'``.
        This function itself never raises.
    """
    try:
        # Test basic database connection
        db.session.execute(text('SELECT 1')).fetchone()

        # Test if migrations table exists (indicates proper setup)
        result = db.session.execute(text(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'schema_migrations'"
        ))
        migrations_table_exists = result.fetchone()[0] > 0

        return {
            'status': 'healthy',
            'connection': 'ok',
            'migrations_table': 'exists' if migrations_table_exists else 'missing',
            'response_time_ms': 0  # Will be calculated by caller
        }
    except Exception as e:
        # A failed statement can leave the session's transaction in an
        # aborted state; roll it back so the session is usable by whatever
        # runs next in this context.
        try:
            db.session.rollback()
        except Exception:
            # Best effort only: if the connection itself is gone the
            # rollback fails too, and the original error is what matters.
            pass
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }


def check_redis():
    """Check Redis connectivity and basic operations.

    Reads ``REDIS_URL`` from the Flask config. When it is unset (or the
    literal string ``'None'``) Redis is reported as disabled. Otherwise a
    short-lived client is created, ``PING`` is timed, and a set/get/delete
    round trip is performed on a throwaway key.

    Returns:
        dict: ``status`` is ``'disabled'``, ``'healthy'`` or
        ``'unhealthy'``. A healthy result also carries ``connection``,
        ``response_time_ms`` (PING latency) and ``operations`` (``'ok'``
        when the round trip returned the stored value, else ``'failed'``).
        An unhealthy result carries ``error`` and ``connection``.
        This function itself never raises.
    """
    import uuid  # local import: only needed to build the per-call probe key

    redis_url = current_app.config.get('REDIS_URL')
    if not redis_url or redis_url == 'None':
        return {
            'status': 'disabled',
            'message': 'Redis is disabled in configuration'
        }

    redis_client = None
    try:
        redis_client = redis.from_url(redis_url)

        # Test basic operations
        start_time = time.perf_counter()
        redis_client.ping()
        response_time = (time.perf_counter() - start_time) * 1000

        # Test set/get operation. The key is unique per call so that
        # concurrent probes cannot delete each other's value mid-check.
        test_key = f'health_check_test:{uuid.uuid4().hex}'
        redis_client.set(test_key, 'test_value', ex=10)
        value = redis_client.get(test_key)
        redis_client.delete(test_key)

        # Accept both bytes and str: the client returns str when response
        # decoding is enabled (e.g. via the URL's decode_responses option).
        round_trip_ok = value in (b'test_value', 'test_value')

        return {
            'status': 'healthy',
            'connection': 'ok',
            'response_time_ms': round(response_time, 2),
            'operations': 'ok' if round_trip_ok else 'failed'
        }
    except Exception as e:
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
    finally:
        # Each call builds its own connection pool; release its sockets
        # instead of leaving them open until garbage collection.
        if redis_client is not None:
            try:
                redis_client.connection_pool.disconnect()
            except Exception:
                # Cleanup is best effort and must not mask the result.
                pass


def check_groq_api():
    """Report whether the Groq API is configured.

    Only the Flask config is inspected; no request is sent to the API.

    Returns:
        dict: ``status`` is ``'configured'`` (with ``api_key_present``,
        ``model`` and ``note``) when ``GROQ_API_KEY`` is set, otherwise
        ``'unhealthy'`` with an ``error`` message.
    """
    config = current_app.config
    api_key = config.get('GROQ_API_KEY')

    if api_key:
        # Configuration looks fine; connectivity is deliberately not probed.
        return {
            'status': 'configured',
            'api_key_present': bool(api_key),
            'model': config.get('GROQ_MODEL', 'not_configured'),
            'note': 'API connectivity not tested in health check to avoid quota usage'
        }

    return {
        'status': 'unhealthy',
        'error': 'GROQ_API_KEY not configured'
    }


def get_system_metrics():
    """Collect CPU, memory, disk and load figures from psutil.

    Returns:
        dict: ``cpu_percent``, ``memory_percent``, ``disk_percent`` (for
        ``/``) and ``load_average`` (first element of
        ``psutil.getloadavg()``, or ``None`` when psutil does not provide
        that function). If any lookup raises, a dict with a single
        ``error`` message is returned instead.
    """
    try:
        # NOTE(review): interval=1 presumably makes this call block for
        # about a second while CPU usage is sampled - confirm that is
        # acceptable for the endpoints that use it.
        cpu = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory().percent
        disk = psutil.disk_usage('/').percent

        if hasattr(psutil, 'getloadavg'):
            load = psutil.getloadavg()[0]
        else:
            load = None
    except Exception as e:
        return {
            'error': f'Failed to get system metrics: {str(e)}'
        }

    return {
        'cpu_percent': cpu,
        'memory_percent': memory,
        'disk_percent': disk,
        'load_average': load
    }


@health_bp.route('/')
@health_bp.route('/basic')
def basic_health():
    """Return a static "healthy" payload for load balancers.

    No dependencies are checked; the response always has HTTP status 200.
    """
    payload = {
        'status': 'healthy',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0'
    }
    return jsonify(payload), 200


@health_bp.route('/detailed')
def detailed_health():
    """Detailed health check with all dependencies.

    Runs the database, Redis and Groq configuration checks, gathers system
    metrics, and folds the results into one overall status:

    * ``unhealthy`` - the database check failed (HTTP 503).
    * ``degraded``  - the database is fine but Redis or Groq is unhealthy
      (HTTP 200, the service is still functional).
    * ``healthy``   - none of the checks reported ``unhealthy`` (HTTP 200).

    Returns:
        tuple: ``(flask.Response, int)`` with the JSON report and the HTTP
        status code. ``uptime_seconds`` is the age of the current process
        (``None`` if it cannot be determined); ``response_time_ms`` is how
        long this health check took to run.
    """
    check_started = time.perf_counter()

    # Check all components
    db_started = time.perf_counter()
    database_health = check_database()
    database_health['response_time_ms'] = round(
        (time.perf_counter() - db_started) * 1000, 2
    )

    redis_health = check_redis()
    groq_health = check_groq_api()
    system_metrics = get_system_metrics()

    # Determine overall status: only a database failure is fatal; Redis or
    # Groq problems merely degrade the service.
    if database_health['status'] == 'unhealthy':
        overall_status = 'unhealthy'
    elif 'unhealthy' in (redis_health['status'], groq_health['status']):
        overall_status = 'degraded'
    else:
        overall_status = 'healthy'

    # Real uptime is measured from process creation, not from the start of
    # this request.
    try:
        uptime_seconds = round(time.time() - psutil.Process().create_time(), 2)
    except (psutil.Error, OSError):
        uptime_seconds = None

    response = {
        'status': overall_status,
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0',
        'uptime_seconds': uptime_seconds,
        'response_time_ms': round((time.perf_counter() - check_started) * 1000, 2),
        'components': {
            'database': database_health,
            'redis': redis_health,
            'groq_api': groq_health
        },
        'system': system_metrics,
        'config': {
            'environment': current_app.config.get('FLASK_ENV', 'unknown'),
            'debug': current_app.config.get('DEBUG', False),
            'default_language': current_app.config.get('DEFAULT_LANGUAGE', 'python')
        }
    }

    # Return appropriate HTTP status code: degraded still answers 200
    # because the service remains functional.
    status_code = 503 if overall_status == 'unhealthy' else 200

    return jsonify(response), status_code


@health_bp.route('/ready')
def readiness():
    """Readiness probe for Kubernetes/container orchestration.

    Only the database check is consulted. Responds 200 with status
    ``ready`` when it reports ``healthy``; otherwise 503 with status
    ``not_ready`` and the error text from the check.
    """
    # Check critical dependencies only
    db_health = check_database()

    if db_health['status'] != 'healthy':
        failure = {
            'status': 'not_ready',
            'timestamp': datetime.utcnow().isoformat(),
            'database': 'disconnected',
            'error': db_health.get('error', 'Database check failed')
        }
        return jsonify(failure), 503

    success = {
        'status': 'ready',
        'timestamp': datetime.utcnow().isoformat(),
        'database': 'connected'
    }
    return jsonify(success), 200


@health_bp.route('/live')
def liveness():
    """Liveness probe for Kubernetes/container orchestration.

    Performs no checks: being able to answer at all is the signal.
    Always responds with HTTP 200.
    """
    payload = {
        'status': 'alive',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent'
    }
    return jsonify(payload), 200


@health_bp.route('/metrics')
def metrics():
    """Basic metrics endpoint for monitoring systems.

    Combines the system metrics with application counters. The application
    counters are placeholders that are currently always zero. Always
    responds with HTTP 200.
    """
    system_snapshot = get_system_metrics()

    # Application-specific metrics (not implemented yet)
    application_snapshot = {
        'active_sessions': 0,  # TODO: Implement session counting
        'total_messages': 0,   # TODO: Implement message counting
        'cache_hit_rate': 0.0  # TODO: Implement cache metrics
    }

    payload = {
        'timestamp': datetime.utcnow().isoformat(),
        'system': system_snapshot,
        'application': application_snapshot
    }
    return jsonify(payload), 200


# Error handler for health check blueprint
@health_bp.errorhandler(Exception)
def handle_health_error(error):
    """Turn an exception raised by a health endpoint into a JSON 500.

    The error is first passed to the handler returned by
    ``get_error_handler()`` (when that returns something truthy) with the
    context ``"health_check"``, then reported to the client with its
    message text.
    """
    reporter = get_error_handler()
    if reporter:
        reporter.handle_error(error, context="health_check")

    body = {
        'status': 'error',
        'timestamp': datetime.utcnow().isoformat(),
        'error': 'Health check failed',
        'message': str(error)
    }
    return jsonify(body), 500