"""Health check endpoints for monitoring and load balancing."""

import time
import uuid
from datetime import datetime, timezone

import psutil
import psycopg2  # noqa: F401 - not referenced here; presumably kept so a missing driver fails at import (confirm before removing)
import redis
from flask import Blueprint, jsonify, current_app
from sqlalchemy import text

from chat_agent.models.base import db
from chat_agent.utils.error_handler import get_error_handler

health_bp = Blueprint('health', __name__, url_prefix='/health')

# Captured once at import so 'uptime_seconds' reflects how long this module
# (and therefore the serving process) has been loaded, not request duration.
_MODULE_LOADED_AT = time.time()

# Upper bound for each Redis socket operation so a dead Redis cannot hang a probe.
_REDIS_TIMEOUT_SECONDS = 2

# Sampling window passed to psutil.cpu_percent(); the call blocks for this long.
_CPU_SAMPLE_INTERVAL_SECONDS = 1


def _utc_timestamp():
    """Return the current UTC time as a naive ISO-8601 string.

    Produces the same format as ``datetime.utcnow().isoformat()`` (no UTC
    offset suffix) without calling the deprecated ``utcnow``.
    """
    return datetime.now(timezone.utc).replace(tzinfo=None).isoformat()


def check_database():
    """Check database connectivity and basic operations.

    Returns:
        dict: On success, ``status`` is ``'healthy'`` together with
        ``connection``, ``migrations_table`` (``'exists'``/``'missing'``) and a
        placeholder ``response_time_ms`` of 0 that the caller may overwrite.
        On any exception, ``status`` is ``'unhealthy'`` with ``error`` and
        ``connection: 'failed'``.
    """
    try:
        # Test basic database connection
        db.session.execute(text('SELECT 1')).fetchone()

        # Test if migrations table exists (indicates proper setup)
        result = db.session.execute(text(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'schema_migrations'"
        ))
        migrations_table_exists = result.fetchone()[0] > 0

        return {
            'status': 'healthy',
            'connection': 'ok',
            'migrations_table': 'exists' if migrations_table_exists else 'missing',
            'response_time_ms': 0  # Will be calculated by caller
        }
    except Exception as e:
        # A failed statement leaves the session's transaction aborted; roll it
        # back so later use of the same session is not poisoned. Best-effort:
        # if the connection itself is gone the rollback can fail as well, and
        # the original error is the one worth reporting.
        try:
            db.session.rollback()
        except Exception:
            pass
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }


def check_redis():
    """Check Redis connectivity and basic operations.

    Reads ``REDIS_URL`` from the Flask config. A missing value or the literal
    string ``'None'`` is reported as ``'disabled'``. Otherwise a short-lived
    client pings the server and performs a set/get/delete round trip.

    Returns:
        dict: ``status`` is ``'disabled'``, ``'healthy'`` or ``'unhealthy'``.
        Healthy results carry ``response_time_ms`` (ping latency) and
        ``operations`` (``'ok'``/``'failed'`` for the round trip).
    """
    redis_url = current_app.config.get('REDIS_URL')
    if not redis_url or redis_url == 'None':
        return {
            'status': 'disabled',
            'message': 'Redis is disabled in configuration'
        }

    redis_client = None
    try:
        # Timeouts keep an unreachable Redis from blocking the probe.
        redis_client = redis.from_url(
            redis_url,
            socket_connect_timeout=_REDIS_TIMEOUT_SECONDS,
            socket_timeout=_REDIS_TIMEOUT_SECONDS,
        )

        # Test basic operations
        start_time = time.time()
        redis_client.ping()
        response_time = (time.time() - start_time) * 1000

        # Test set/get operation. The key is unique per probe so concurrent
        # health checks cannot delete each other's value mid-check.
        test_key = f'health_check_test:{uuid.uuid4().hex}'
        redis_client.set(test_key, 'test_value', ex=10)
        value = redis_client.get(test_key)
        redis_client.delete(test_key)

        # The client returns bytes by default, or str when the URL enables
        # decode_responses; both mean the round trip worked.
        round_trip_ok = value in (b'test_value', 'test_value')

        return {
            'status': 'healthy',
            'connection': 'ok',
            'response_time_ms': round(response_time, 2),
            'operations': 'ok' if round_trip_ok else 'failed'
        }
    except Exception as e:
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
    finally:
        # A new pool is created on every call; release its sockets so
        # frequent probing does not accumulate open connections.
        if redis_client is not None:
            try:
                redis_client.connection_pool.disconnect()
            except Exception:
                pass


def check_groq_api():
    """Check Groq API configuration (no network call is made).

    Returns:
        dict: ``status`` is ``'unhealthy'`` when ``GROQ_API_KEY`` is not set,
        otherwise ``'configured'`` along with the configured model name.
    """
    groq_api_key = current_app.config.get('GROQ_API_KEY')

    if not groq_api_key:
        return {
            'status': 'unhealthy',
            'error': 'GROQ_API_KEY not configured'
        }

    # Basic configuration check
    return {
        'status': 'configured',
        'api_key_present': bool(groq_api_key),
        'model': current_app.config.get('GROQ_MODEL', 'not_configured'),
        'note': 'API connectivity not tested in health check to avoid quota usage'
    }


def get_system_metrics():
    """Get basic system metrics.

    Note: sampling CPU usage blocks for ``_CPU_SAMPLE_INTERVAL_SECONDS``.

    Returns:
        dict: CPU, memory and root-disk usage percentages plus the first
        load-average value (``None`` where psutil lacks ``getloadavg``), or a
        single ``error`` entry if collection fails.
    """
    try:
        return {
            'cpu_percent': psutil.cpu_percent(interval=_CPU_SAMPLE_INTERVAL_SECONDS),
            'memory_percent': psutil.virtual_memory().percent,
            'disk_percent': psutil.disk_usage('/').percent,
            'load_average': psutil.getloadavg()[0] if hasattr(psutil, 'getloadavg') else None
        }
    except Exception as e:
        return {
            'error': f'Failed to get system metrics: {str(e)}'
        }


@health_bp.route('/')
@health_bp.route('/basic')
def basic_health():
    """Basic health check endpoint for load balancers."""
    return jsonify({
        'status': 'healthy',
        'timestamp': _utc_timestamp(),
        'service': 'chat-agent',
        'version': '1.0.0'
    }), 200


@health_bp.route('/detailed')
def detailed_health():
    """Detailed health check with all dependencies.

    Returns HTTP 503 only when the database is unhealthy; Redis or Groq
    problems downgrade the overall status to ``'degraded'`` but still
    return 200.
    """
    # Check all components
    db_start = time.time()
    database_health = check_database()
    database_health['response_time_ms'] = round((time.time() - db_start) * 1000, 2)

    redis_health = check_redis()
    groq_health = check_groq_api()
    system_metrics = get_system_metrics()

    # Determine overall status
    overall_status = 'healthy'
    if database_health['status'] == 'unhealthy':
        overall_status = 'unhealthy'
    elif redis_health['status'] == 'unhealthy':
        overall_status = 'degraded'  # Redis failure is not critical
    elif groq_health['status'] == 'unhealthy':
        overall_status = 'degraded'  # Can still serve static content

    response = {
        'status': overall_status,
        'timestamp': _utc_timestamp(),
        'service': 'chat-agent',
        'version': '1.0.0',
        # Measured from module import, not from the start of this request.
        'uptime_seconds': round(time.time() - _MODULE_LOADED_AT, 2),
        'components': {
            'database': database_health,
            'redis': redis_health,
            'groq_api': groq_health
        },
        'system': system_metrics,
        'config': {
            'environment': current_app.config.get('FLASK_ENV', 'unknown'),
            'debug': current_app.config.get('DEBUG', False),
            'default_language': current_app.config.get('DEFAULT_LANGUAGE', 'python')
        }
    }

    # Return appropriate HTTP status code: 'degraded' is still functional.
    status_code = 503 if overall_status == 'unhealthy' else 200

    return jsonify(response), status_code


@health_bp.route('/ready')
def readiness():
    """Readiness probe for Kubernetes/container orchestration."""
    # Check critical dependencies only
    db_health = check_database()

    if db_health['status'] == 'healthy':
        return jsonify({
            'status': 'ready',
            'timestamp': _utc_timestamp(),
            'database': 'connected'
        }), 200

    return jsonify({
        'status': 'not_ready',
        'timestamp': _utc_timestamp(),
        'database': 'disconnected',
        'error': db_health.get('error', 'Database check failed')
    }), 503


@health_bp.route('/live')
def liveness():
    """Liveness probe for Kubernetes/container orchestration."""
    # Simple check that the application is running
    return jsonify({
        'status': 'alive',
        'timestamp': _utc_timestamp(),
        'service': 'chat-agent'
    }), 200


@health_bp.route('/metrics')
def metrics():
    """Basic metrics endpoint for monitoring systems."""
    system_metrics = get_system_metrics()

    # Add application-specific metrics
    app_metrics = {
        'active_sessions': 0,  # TODO: Implement session counting
        'total_messages': 0,   # TODO: Implement message counting
        'cache_hit_rate': 0.0  # TODO: Implement cache metrics
    }

    return jsonify({
        'timestamp': _utc_timestamp(),
        'system': system_metrics,
        'application': app_metrics
    }), 200


# Error handler for health check blueprint
@health_bp.errorhandler(Exception)
def handle_health_error(error):
    """Handle errors in health check endpoints.

    Forwards the error to the project error handler when one is available and
    always answers with a JSON body and HTTP 500.
    """
    error_handler = get_error_handler()
    if error_handler:
        error_handler.handle_error(error, context="health_check")

    return jsonify({
        'status': 'error',
        'timestamp': _utc_timestamp(),
        'error': 'Health check failed',
        'message': str(error)
    }), 500