Spaces:
Runtime error
Runtime error
File size: 7,787 Bytes
"""Health check endpoints for monitoring and load balancing."""
import time
import psutil
from datetime import datetime
from flask import Blueprint, jsonify, current_app
import redis
import psycopg2
from sqlalchemy import text
from chat_agent.models.base import db
from chat_agent.utils.error_handler import get_error_handler
# Blueprint that groups every health endpoint in this module under the
# '/health' URL prefix.
health_bp = Blueprint('health', __name__, url_prefix='/health')
def check_database():
    """Check database connectivity and basic operations.

    Runs a trivial ``SELECT 1`` through ``db.session`` and then counts rows in
    ``information_schema.tables`` named ``schema_migrations``.

    Returns:
        dict: On success ``status`` is ``'healthy'`` with ``connection``,
        ``migrations_table`` (``'exists'`` or ``'missing'``) and a placeholder
        ``response_time_ms`` of 0 that the caller is expected to overwrite.
        If anything raises, ``status`` is ``'unhealthy'``, ``connection`` is
        ``'failed'`` and ``error`` holds the exception text.
    """
    try:
        # Test basic database connection
        db.session.execute(text('SELECT 1')).fetchone()

        # Test if migrations table exists (indicates proper setup)
        row = db.session.execute(text(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'schema_migrations'"
        )).fetchone()
        migrations_table_exists = row[0] > 0

        return {
            'status': 'healthy',
            'connection': 'ok',
            'migrations_table': 'exists' if migrations_table_exists else 'missing',
            'response_time_ms': 0  # Will be calculated by caller
        }
    except Exception as e:
        # A failed statement leaves the session's transaction unusable until
        # it is rolled back; reset it so any later use of the session in this
        # request does not fail on the stale transaction.
        try:
            db.session.rollback()
        except Exception:
            # Best effort only: the connection itself may be gone, and the
            # original failure below is the one worth reporting.
            pass
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
def check_redis():
    """Check Redis connectivity and basic operations.

    Reads ``REDIS_URL`` from the Flask app config. When it is unset, empty or
    the literal string ``'None'``, Redis is reported as disabled. Otherwise
    the check pings the server and performs a set/get/delete round trip on a
    short-lived test key.

    Returns:
        dict: ``status`` is ``'disabled'``, ``'healthy'`` or ``'unhealthy'``.
        A healthy result carries ``connection``, the ping ``response_time_ms``
        and ``operations`` (``'ok'`` or ``'failed'``); an unhealthy result
        carries ``error`` and ``connection`` set to ``'failed'``.
    """
    redis_url = current_app.config.get('REDIS_URL')
    if not redis_url or redis_url == 'None':
        return {
            'status': 'disabled',
            'message': 'Redis is disabled in configuration'
        }

    redis_client = None
    try:
        redis_client = redis.from_url(redis_url)

        # Test basic operations
        start_time = time.time()
        redis_client.ping()
        response_time = (time.time() - start_time) * 1000

        # Test set/get operation
        test_key = 'health_check_test'
        redis_client.set(test_key, 'test_value', ex=10)
        value = redis_client.get(test_key)
        redis_client.delete(test_key)

        # GET yields bytes by default but str when the URL enables
        # decode_responses; normalise so both count as a successful round trip.
        if isinstance(value, bytes):
            value = value.decode('utf-8', 'replace')

        return {
            'status': 'healthy',
            'connection': 'ok',
            'response_time_ms': round(response_time, 2),
            'operations': 'ok' if value == 'test_value' else 'failed'
        }
    except Exception as e:
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
    finally:
        # from_url() builds a fresh connection pool on every call; release its
        # sockets so repeated health probes do not accumulate connections.
        if redis_client is not None:
            try:
                redis_client.connection_pool.disconnect()
            except Exception:
                # Cleanup is best effort and must not mask the check result.
                pass
def check_groq_api():
    """Report whether the Groq API is configured.

    Only the app configuration is inspected; no request is sent to the API.

    Returns:
        dict: ``status`` is ``'configured'`` (with ``api_key_present``,
        ``model`` and an explanatory ``note``) when ``GROQ_API_KEY`` is set,
        otherwise ``'unhealthy'`` with an ``error`` message.
    """
    config = current_app.config
    api_key = config.get('GROQ_API_KEY')

    if api_key:
        # Basic configuration check
        return {
            'status': 'configured',
            'api_key_present': bool(api_key),
            'model': config.get('GROQ_MODEL', 'not_configured'),
            'note': 'API connectivity not tested in health check to avoid quota usage'
        }

    return {
        'status': 'unhealthy',
        'error': 'GROQ_API_KEY not configured'
    }
def get_system_metrics():
    """Collect basic host metrics through psutil.

    Returns:
        dict: ``cpu_percent`` (sampled over a one-second interval),
        ``memory_percent``, ``disk_percent`` for ``'/'`` and ``load_average``
        (first element of ``psutil.getloadavg()``, or ``None`` when psutil
        does not provide that function). If any lookup raises, a dict with a
        single ``error`` message is returned instead.
    """
    try:
        cpu = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory().percent
        disk = psutil.disk_usage('/').percent
        if hasattr(psutil, 'getloadavg'):
            load = psutil.getloadavg()[0]
        else:
            load = None
    except Exception as exc:
        return {
            'error': f'Failed to get system metrics: {str(exc)}'
        }

    return {
        'cpu_percent': cpu,
        'memory_percent': memory,
        'disk_percent': disk,
        'load_average': load
    }
@health_bp.route('/')
@health_bp.route('/basic')
def basic_health():
    """Return a static 'healthy' payload for load balancers.

    No dependencies are probed; the response always has HTTP status 200.
    """
    payload = {
        'status': 'healthy',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0'
    }
    return jsonify(payload), 200
@health_bp.route('/detailed')
def detailed_health():
    """Detailed health check with all dependencies.

    Probes the database, Redis and the Groq configuration, gathers system
    metrics and combines them into one JSON report.

    Overall status:
        * ``'unhealthy'`` when the database check is unhealthy (HTTP 503).
        * ``'degraded'`` when only Redis and/or Groq is unhealthy (HTTP 200,
          the service is still considered functional).
        * ``'healthy'`` otherwise (HTTP 200).

    ``uptime_seconds`` is the age of the current process as reported by
    psutil (``None`` if psutil cannot provide it). The time spent running
    this check is reported separately as ``check_duration_ms``.
    """
    check_started = time.time()

    # Check all components
    db_start = time.time()
    database_health = check_database()
    database_health['response_time_ms'] = round((time.time() - db_start) * 1000, 2)

    redis_health = check_redis()
    groq_health = check_groq_api()
    system_metrics = get_system_metrics()

    # Determine overall status: only a database failure is critical; Redis or
    # Groq failures merely degrade the service.
    if database_health['status'] == 'unhealthy':
        overall_status = 'unhealthy'
    elif 'unhealthy' in (redis_health['status'], groq_health['status']):
        overall_status = 'degraded'
    else:
        overall_status = 'healthy'

    # Uptime is measured from the process start time, not from the start of
    # this request (which would only ever yield the check's own duration).
    try:
        uptime_seconds = round(time.time() - psutil.Process().create_time(), 2)
    except psutil.Error:
        uptime_seconds = None

    response = {
        'status': overall_status,
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0',
        'uptime_seconds': uptime_seconds,
        'check_duration_ms': round((time.time() - check_started) * 1000, 2),
        'components': {
            'database': database_health,
            'redis': redis_health,
            'groq_api': groq_health
        },
        'system': system_metrics,
        'config': {
            'environment': current_app.config.get('FLASK_ENV', 'unknown'),
            'debug': current_app.config.get('DEBUG', False),
            'default_language': current_app.config.get('DEFAULT_LANGUAGE', 'python')
        }
    }

    # Only a fully unhealthy service answers 503; degraded is still functional.
    status_code = 503 if overall_status == 'unhealthy' else 200
    return jsonify(response), status_code
@health_bp.route('/ready')
def readiness():
    """Readiness probe for Kubernetes/container orchestration.

    Only the database is treated as a critical dependency: the probe answers
    200 ``ready`` when the database check is healthy and 503 ``not_ready``
    (including the reported error) otherwise.
    """
    db_health = check_database()

    if db_health['status'] != 'healthy':
        not_ready = {
            'status': 'not_ready',
            'timestamp': datetime.utcnow().isoformat(),
            'database': 'disconnected',
            'error': db_health.get('error', 'Database check failed')
        }
        return jsonify(not_ready), 503

    ready = {
        'status': 'ready',
        'timestamp': datetime.utcnow().isoformat(),
        'database': 'connected'
    }
    return jsonify(ready), 200
@health_bp.route('/live')
def liveness():
    """Liveness probe for Kubernetes/container orchestration.

    Performs no dependency checks; answering at all shows the application is
    running. Always returns HTTP 200.
    """
    payload = {
        'status': 'alive',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent'
    }
    return jsonify(payload), 200
@health_bp.route('/metrics')
def metrics():
    """Basic metrics endpoint for monitoring systems.

    Combines the system metrics with application-level counters. The
    application counters are placeholders fixed at zero until they are
    implemented. Always returns HTTP 200.
    """
    # Application-specific metrics (not implemented yet)
    app_metrics = {
        'active_sessions': 0,  # TODO: Implement session counting
        'total_messages': 0,  # TODO: Implement message counting
        'cache_hit_rate': 0.0  # TODO: Implement cache metrics
    }

    payload = {
        'timestamp': datetime.utcnow().isoformat(),
        'system': get_system_metrics(),
        'application': app_metrics
    }
    return jsonify(payload), 200
# Error handler for health check blueprint
@health_bp.errorhandler(Exception)
def handle_health_error(error):
    """Turn any exception raised by a health endpoint into a JSON 500 reply.

    The error is first passed to the application's error handler, when one is
    available, tagged with the ``health_check`` context.

    Args:
        error: The exception raised while serving a health endpoint.

    Returns:
        tuple: A JSON response describing the failure and the status code 500.
    """
    handler = get_error_handler()
    if handler:
        handler.handle_error(error, context="health_check")

    body = {
        'status': 'error',
        'timestamp': datetime.utcnow().isoformat(),
        'error': 'Health check failed',
        'message': str(error)
    }
    return jsonify(body), 500