Spaces:
Runtime error
Runtime error
| """Health check endpoints for monitoring and load balancing.""" | |
| import time | |
| import psutil | |
| from datetime import datetime | |
| from flask import Blueprint, jsonify, current_app | |
| import redis | |
| import psycopg2 | |
| from sqlalchemy import text | |
| from chat_agent.models.base import db | |
| from chat_agent.utils.error_handler import get_error_handler | |
# Blueprint that groups the health-check views under the /health URL prefix.
# NOTE(review): no route decorators are visible on the view functions in this
# view of the file — confirm each endpoint is actually registered on it.
health_bp = Blueprint(
    name='health',
    import_name=__name__,
    url_prefix='/health',
)
def check_database():
    """Check database connectivity and basic operations.

    Runs ``SELECT 1`` and then counts rows in ``information_schema.tables``
    named ``schema_migrations``, both through the shared ``db.session``.

    Returns:
        dict: On success, ``status`` is ``'healthy'`` together with
        ``connection``, ``migrations_table`` (``'exists'`` or ``'missing'``)
        and ``response_time_ms`` (time spent in this function). On any
        exception, ``status`` is ``'unhealthy'`` together with ``error``
        (the exception text) and ``connection`` set to ``'failed'``.
    """
    started = time.perf_counter()
    try:
        # Test basic database connection
        db.session.execute(text('SELECT 1')).fetchone()

        # Test if migrations table exists (indicates proper setup)
        table_count = db.session.execute(text(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'schema_migrations'"
        )).scalar()
        migrations_table_exists = (table_count or 0) > 0

        return {
            'status': 'healthy',
            'connection': 'ok',
            'migrations_table': 'exists' if migrations_table_exists else 'missing',
            # Measured here so callers that do not time the call themselves
            # still get a real value; callers may overwrite it.
            'response_time_ms': round((time.perf_counter() - started) * 1000, 2)
        }
    except Exception as e:
        # A failed statement leaves the session's transaction in a failed
        # state; roll back so later work on the same session is not poisoned.
        try:
            db.session.rollback()
        except Exception:
            # Best effort: the connection itself may be gone. The original
            # failure is what gets reported.
            pass
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
def check_redis():
    """Check Redis connectivity and basic operations.

    Reads ``REDIS_URL`` from the Flask config. When it is unset, empty, or the
    literal string ``'None'``, Redis is reported as disabled. Otherwise the
    function pings the server and performs a set/get/delete round trip on a
    short-lived probe key.

    Returns:
        dict: ``status`` is ``'disabled'``, ``'healthy'`` or ``'unhealthy'``.
        Healthy results carry ``connection``, ``response_time_ms`` (ping
        latency) and ``operations`` (``'ok'`` or ``'failed'``); unhealthy
        results carry ``error`` and ``connection`` set to ``'failed'``.
    """
    import uuid  # local import: only needed for the unique probe key below

    redis_url = current_app.config.get('REDIS_URL')
    if not redis_url or redis_url == 'None':
        return {
            'status': 'disabled',
            'message': 'Redis is disabled in configuration'
        }

    redis_client = None
    try:
        # Bound connect/read time so an unreachable server cannot hang the
        # health endpoint.
        redis_client = redis.from_url(
            redis_url,
            socket_connect_timeout=2,
            socket_timeout=2,
        )

        # Test basic operations
        start_time = time.perf_counter()
        redis_client.ping()
        response_time = (time.perf_counter() - start_time) * 1000

        # Test set/get operation. The key is unique per call so that
        # concurrent health checks cannot delete each other's probe value.
        test_key = f'health_check_test:{uuid.uuid4().hex}'
        redis_client.set(test_key, 'test_value', ex=10)
        value = redis_client.get(test_key)
        redis_client.delete(test_key)

        # Accept both bytes and str: the client returns str when
        # decode_responses is enabled.
        round_trip_ok = value in (b'test_value', 'test_value')

        return {
            'status': 'healthy',
            'connection': 'ok',
            'response_time_ms': round(response_time, 2),
            'operations': 'ok' if round_trip_ok else 'failed'
        }
    except Exception as e:
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
    finally:
        # Each call builds its own client and pool; release its sockets so
        # repeated probes do not accumulate open connections.
        if redis_client is not None:
            try:
                redis_client.connection_pool.disconnect()
            except Exception:
                # Best effort cleanup; the probe result is already decided.
                pass
def check_groq_api():
    """Report whether the Groq API is configured.

    Only the Flask config is inspected; no request is sent to the API.

    Returns:
        dict: ``status`` ``'unhealthy'`` with an ``error`` message when
        ``GROQ_API_KEY`` is missing or empty; otherwise ``status``
        ``'configured'`` with ``api_key_present``, ``model`` and ``note``.
    """
    api_key = current_app.config.get('GROQ_API_KEY')

    if api_key:
        # Configuration-only check: the key exists, so report what is set.
        summary = {
            'status': 'configured',
            'api_key_present': bool(api_key),
            'model': current_app.config.get('GROQ_MODEL', 'not_configured'),
            'note': 'API connectivity not tested in health check to avoid quota usage'
        }
        return summary

    return {
        'status': 'unhealthy',
        'error': 'GROQ_API_KEY not configured'
    }
def get_system_metrics():
    """Collect basic host metrics via psutil.

    Note that CPU usage is sampled with ``interval=1``, so this call blocks
    for about one second.

    Returns:
        dict: ``cpu_percent``, ``memory_percent``, ``disk_percent`` (for
        ``'/'``) and ``load_average`` (first ``getloadavg`` value, or ``None``
        when psutil does not provide ``getloadavg``). If any lookup raises,
        a dict with a single ``error`` message is returned instead.
    """
    try:
        cpu = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory().percent
        disk = psutil.disk_usage('/').percent
        if hasattr(psutil, 'getloadavg'):
            load = psutil.getloadavg()[0]
        else:
            load = None
    except Exception as exc:
        return {
            'error': f'Failed to get system metrics: {exc!s}'
        }

    return {
        'cpu_percent': cpu,
        'memory_percent': memory,
        'disk_percent': disk,
        'load_average': load
    }
def basic_health():
    """Return a minimal, dependency-free health payload for load balancers.

    Returns:
        tuple: A JSON response with ``status``, ``timestamp`` (naive UTC,
        ISO 8601), ``service`` and ``version``, and HTTP status 200.
    """
    payload = {
        'status': 'healthy',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0'
    }
    return jsonify(payload), 200
def detailed_health():
    """Detailed health check with all dependencies.

    Runs the database, Redis and Groq checks, gathers system metrics, and
    combines them into one report.

    Overall status:
        * ``'unhealthy'`` when the database check is unhealthy (HTTP 503).
        * ``'degraded'`` when Redis or Groq is unhealthy but the database is
          not (HTTP 200, the service is still functional).
        * ``'healthy'`` otherwise (HTTP 200).

    Returns:
        tuple: A JSON response and the HTTP status code. The payload includes
        ``uptime_seconds`` (time since this process started, or ``None`` if
        it cannot be determined) and ``check_duration_ms`` (time spent
        running this health check).
    """
    check_started = time.perf_counter()

    # Check all components
    db_started = time.perf_counter()
    database_health = check_database()
    database_health['response_time_ms'] = round(
        (time.perf_counter() - db_started) * 1000, 2
    )
    redis_health = check_redis()
    groq_health = check_groq_api()
    system_metrics = get_system_metrics()

    # Determine overall status: only a database failure is critical.
    if database_health['status'] == 'unhealthy':
        overall_status = 'unhealthy'
    elif 'unhealthy' in (redis_health['status'], groq_health['status']):
        overall_status = 'degraded'
    else:
        overall_status = 'healthy'

    # Real process uptime; previously this field held the elapsed time of the
    # health check itself, which is now reported as check_duration_ms.
    try:
        uptime_seconds = round(time.time() - psutil.Process().create_time(), 2)
    except Exception:
        uptime_seconds = None

    response = {
        'status': overall_status,
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0',
        'uptime_seconds': uptime_seconds,
        'check_duration_ms': round((time.perf_counter() - check_started) * 1000, 2),
        'components': {
            'database': database_health,
            'redis': redis_health,
            'groq_api': groq_health
        },
        'system': system_metrics,
        'config': {
            'environment': current_app.config.get('FLASK_ENV', 'unknown'),
            'debug': current_app.config.get('DEBUG', False),
            'default_language': current_app.config.get('DEFAULT_LANGUAGE', 'python')
        }
    }

    # Degraded still answers 200 because the service remains functional.
    status_code = 503 if overall_status == 'unhealthy' else 200
    return jsonify(response), status_code
def readiness():
    """Readiness probe: ready only when the database check is healthy.

    Returns:
        tuple: JSON with ``status`` ``'ready'`` and HTTP 200 when
        ``check_database`` reports ``'healthy'``; otherwise JSON with
        ``status`` ``'not_ready'``, the database error text, and HTTP 503.
    """
    # Only the critical dependency (the database) gates readiness.
    db_health = check_database()
    now = datetime.utcnow().isoformat()

    if db_health['status'] != 'healthy':
        failure = {
            'status': 'not_ready',
            'timestamp': now,
            'database': 'disconnected',
            'error': db_health.get('error', 'Database check failed')
        }
        return jsonify(failure), 503

    success = {
        'status': 'ready',
        'timestamp': now,
        'database': 'connected'
    }
    return jsonify(success), 200
def liveness():
    """Liveness probe: confirm the application process can answer requests.

    No dependencies are checked.

    Returns:
        tuple: JSON with ``status`` ``'alive'``, ``timestamp`` and
        ``service``, and HTTP status 200.
    """
    body = {
        'status': 'alive',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent'
    }
    return jsonify(body), 200
def metrics():
    """Return system and application metrics for monitoring systems.

    The application metrics are placeholders that are always zero until the
    TODOs below are implemented.

    Returns:
        tuple: JSON with ``timestamp``, ``system`` (from
        ``get_system_metrics``) and ``application``, and HTTP status 200.
    """
    host_stats = get_system_metrics()

    # Application-specific metrics (not yet implemented).
    application_stats = {
        'active_sessions': 0,  # TODO: Implement session counting
        'total_messages': 0,  # TODO: Implement message counting
        'cache_hit_rate': 0.0  # TODO: Implement cache metrics
    }

    payload = {
        'timestamp': datetime.utcnow().isoformat(),
        'system': host_stats,
        'application': application_stats
    }
    return jsonify(payload), 200
| # Error handler for health check blueprint | |
def handle_health_error(error):
    """Turn an error raised in a health endpoint into a JSON 500 response.

    The error is first passed to the application's error handler, if
    ``get_error_handler`` returns one, with context ``"health_check"``.

    NOTE(review): no errorhandler registration is visible on this function
    in this view of the file — confirm it is attached to the blueprint.

    Args:
        error: The exception (or error object) being handled.

    Returns:
        tuple: JSON with ``status`` ``'error'``, ``timestamp``, a fixed
        ``error`` label and ``message`` (``str(error)``), and HTTP 500.
    """
    handler = get_error_handler()
    if handler:
        handler.handle_error(error, context="health_check")

    body = {
        'status': 'error',
        'timestamp': datetime.utcnow().isoformat(),
        'error': 'Health check failed',
        'message': str(error)
    }
    return jsonify(body), 500