scratch_chat / chat_agent /api /health.py
WebashalarForML's picture
Upload 178 files
330b6e4 verified
"""Health check endpoints for monitoring and load balancing."""
import time
import psutil
from datetime import datetime
from flask import Blueprint, jsonify, current_app
import redis
import psycopg2
from sqlalchemy import text
from chat_agent.models.base import db
from chat_agent.utils.error_handler import get_error_handler
# Blueprint that groups the health-check routes defined in this module;
# every route registered on it is served under the /health URL prefix.
health_bp = Blueprint('health', __name__, url_prefix='/health')
def check_database():
    """Check database connectivity and basic operations.

    Runs ``SELECT 1`` through ``db.session`` and then counts rows in
    ``information_schema.tables`` named ``schema_migrations``.

    Returns:
        dict: On success ``status`` is ``'healthy'``, ``connection`` is
        ``'ok'``, ``migrations_table`` is ``'exists'`` or ``'missing'`` and
        ``response_time_ms`` is a placeholder ``0`` for the caller to
        overwrite. On any exception ``status`` is ``'unhealthy'``,
        ``connection`` is ``'failed'`` and ``error`` holds ``str(exc)``.
    """
    try:
        # Test basic database connection
        db.session.execute(text('SELECT 1')).fetchone()

        # Test if migrations table exists (indicates proper setup)
        result = db.session.execute(text(
            "SELECT COUNT(*) FROM information_schema.tables WHERE table_name = 'schema_migrations'"
        ))
        migrations_table_exists = result.fetchone()[0] > 0

        return {
            'status': 'healthy',
            'connection': 'ok',
            'migrations_table': 'exists' if migrations_table_exists else 'missing',
            'response_time_ms': 0  # Will be calculated by caller
        }
    except Exception as e:
        # A failed statement leaves the session's transaction in an aborted
        # state; without a rollback any later use of db.session in the same
        # context would fail too. The rollback is best effort: if the
        # connection itself is gone it may raise, and the original error is
        # the one worth reporting.
        try:
            db.session.rollback()
        except Exception:
            pass
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
def check_redis():
    """Check Redis connectivity and basic operations.

    Reads ``REDIS_URL`` from the Flask config. When it is unset (or the
    literal string ``'None'``) Redis is reported as disabled. Otherwise the
    function pings the server, then sets, reads back and deletes a
    short-lived probe key.

    Returns:
        dict: ``status`` is one of ``'disabled'``, ``'healthy'`` or
        ``'unhealthy'``. A healthy result also carries ``connection``,
        ``response_time_ms`` (PING round trip, rounded to 2 decimals) and
        ``operations`` (``'ok'`` if the probe value was read back,
        ``'failed'`` otherwise). An unhealthy result carries ``error`` and
        ``connection: 'failed'``.
    """
    redis_url = current_app.config.get('REDIS_URL')
    if not redis_url or redis_url == 'None':
        return {
            'status': 'disabled',
            'message': 'Redis is disabled in configuration'
        }

    import uuid  # local import: only needed to build the probe key

    # Bound every socket operation so a hung Redis cannot stall the health
    # endpoint. Options given in the URL itself still take precedence.
    timeout_seconds = 2.0

    redis_client = None
    try:
        redis_client = redis.from_url(redis_url, socket_timeout=timeout_seconds)

        # Test basic operations
        start_time = time.time()
        redis_client.ping()
        response_time = (time.time() - start_time) * 1000

        # Test set/get operation. The key is unique per call so concurrent
        # probes cannot delete each other's key and report a false failure.
        test_key = f'health_check_test:{uuid.uuid4().hex}'
        redis_client.set(test_key, 'test_value', ex=10)
        value = redis_client.get(test_key)
        redis_client.delete(test_key)

        # Accept str as well as bytes: the client returns str when the URL
        # enables decode_responses.
        round_trip_ok = value in (b'test_value', 'test_value')

        return {
            'status': 'healthy',
            'connection': 'ok',
            'response_time_ms': round(response_time, 2),
            'operations': 'ok' if round_trip_ok else 'failed'
        }
    except Exception as e:
        return {
            'status': 'unhealthy',
            'error': str(e),
            'connection': 'failed'
        }
    finally:
        # from_url() builds a fresh connection pool on every call; release
        # its sockets so repeated health checks do not leak connections.
        if redis_client is not None:
            try:
                redis_client.connection_pool.disconnect()
            except Exception:
                pass  # cleanup is best effort; the result is already decided
def check_groq_api():
    """Report whether the Groq API is configured.

    Only the Flask config is inspected; no request is sent to Groq.

    Returns:
        dict: ``status: 'unhealthy'`` with an ``error`` message when
        ``GROQ_API_KEY`` is missing or empty, otherwise
        ``status: 'configured'`` together with ``api_key_present``, the
        configured ``model`` (``'not_configured'`` if ``GROQ_MODEL`` is
        absent) and an explanatory ``note``.
    """
    config = current_app.config
    api_key = config.get('GROQ_API_KEY')

    if api_key:
        # Configuration-only check: the key is present, so describe the setup.
        return {
            'status': 'configured',
            'api_key_present': bool(api_key),
            'model': config.get('GROQ_MODEL', 'not_configured'),
            'note': 'API connectivity not tested in health check to avoid quota usage'
        }

    return {
        'status': 'unhealthy',
        'error': 'GROQ_API_KEY not configured'
    }
def get_system_metrics():
    """Collect basic host metrics via psutil.

    Note that the CPU sample uses ``interval=1``, so the call blocks for
    about one second.

    Returns:
        dict: ``cpu_percent``, ``memory_percent``, ``disk_percent`` (for the
        ``/`` mount) and ``load_average`` (first value of
        ``psutil.getloadavg()``, or ``None`` when psutil does not provide
        it). If anything raises, a dict with a single ``error`` message is
        returned instead.
    """
    try:
        cpu = psutil.cpu_percent(interval=1)
        memory = psutil.virtual_memory().percent
        disk = psutil.disk_usage('/').percent
        if hasattr(psutil, 'getloadavg'):
            load = psutil.getloadavg()[0]
        else:
            load = None
    except Exception as exc:
        return {
            'error': f'Failed to get system metrics: {str(exc)}'
        }

    return {
        'cpu_percent': cpu,
        'memory_percent': memory,
        'disk_percent': disk,
        'load_average': load
    }
@health_bp.route('/')
@health_bp.route('/basic')
def basic_health():
    """Return a static "healthy" payload for load balancers.

    No dependency is checked; the response is always HTTP 200 with the
    service name, version and the current UTC timestamp.
    """
    payload = {
        'status': 'healthy',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0'
    }
    return jsonify(payload), 200
# Captured once when this module is imported so /health/detailed can report a
# real uptime (time since import, which approximates process start) instead of
# the duration of the current request.
_HEALTH_MODULE_LOADED_AT = time.time()


@health_bp.route('/detailed')
def detailed_health():
    """Detailed health check with all dependencies.

    Runs the database, Redis and Groq checks plus the system metrics probe
    and folds them into one overall status:

    * ``'unhealthy'`` when the database check is unhealthy (HTTP 503);
    * ``'degraded'`` when only Redis or Groq is unhealthy (HTTP 200);
    * ``'healthy'`` otherwise (HTTP 200).

    Returns:
        tuple: ``(flask.Response, int)`` - a JSON body with ``status``,
        ``timestamp``, ``service``, ``version``, ``uptime_seconds``,
        ``check_duration_ms``, per-component results, system metrics and a
        small config excerpt, and the HTTP status code.
    """
    check_started = time.time()

    # Check all components; the database check is timed here because
    # check_database() only returns a placeholder response time.
    db_start = time.time()
    database_health = check_database()
    database_health['response_time_ms'] = round((time.time() - db_start) * 1000, 2)

    redis_health = check_redis()
    groq_health = check_groq_api()
    system_metrics = get_system_metrics()

    # Determine overall status. Only the database is treated as critical;
    # a Redis or Groq failure merely degrades the service.
    if database_health['status'] == 'unhealthy':
        overall_status = 'unhealthy'
    elif 'unhealthy' in (redis_health['status'], groq_health['status']):
        overall_status = 'degraded'
    else:
        overall_status = 'healthy'

    now = time.time()
    response = {
        'status': overall_status,
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent',
        'version': '1.0.0',
        # Previously this was measured from the start of the request, so it
        # reported how long the check took rather than any uptime.
        'uptime_seconds': round(now - _HEALTH_MODULE_LOADED_AT, 2),
        'check_duration_ms': round((now - check_started) * 1000, 2),
        'components': {
            'database': database_health,
            'redis': redis_health,
            'groq_api': groq_health
        },
        'system': system_metrics,
        'config': {
            'environment': current_app.config.get('FLASK_ENV', 'unknown'),
            'debug': current_app.config.get('DEBUG', False),
            'default_language': current_app.config.get('DEFAULT_LANGUAGE', 'python')
        }
    }

    # Return appropriate HTTP status code: degraded is still functional,
    # so only a fully unhealthy service answers 503.
    status_code = 503 if overall_status == 'unhealthy' else 200
    return jsonify(response), status_code
@health_bp.route('/ready')
def readiness():
    """Readiness probe for Kubernetes/container orchestration.

    Only the database is consulted. Responds 200 with ``status: 'ready'``
    when the database check is healthy, otherwise 503 with
    ``status: 'not_ready'`` and the error reported by the check.
    """
    # Check critical dependencies only
    db_health = check_database()

    if db_health['status'] != 'healthy':
        not_ready = {
            'status': 'not_ready',
            'timestamp': datetime.utcnow().isoformat(),
            'database': 'disconnected',
            'error': db_health.get('error', 'Database check failed')
        }
        return jsonify(not_ready), 503

    ready = {
        'status': 'ready',
        'timestamp': datetime.utcnow().isoformat(),
        'database': 'connected'
    }
    return jsonify(ready), 200
@health_bp.route('/live')
def liveness():
    """Liveness probe for Kubernetes/container orchestration.

    Performs no dependency checks: answering at all shows the application
    is running. Always returns HTTP 200.
    """
    payload = {
        'status': 'alive',
        'timestamp': datetime.utcnow().isoformat(),
        'service': 'chat-agent'
    }
    return jsonify(payload), 200
@health_bp.route('/metrics')
def metrics():
    """Basic metrics endpoint for monitoring systems.

    Returns the system metrics from ``get_system_metrics()`` next to a set
    of application counters. The application counters are placeholders
    that are currently always zero. Always responds with HTTP 200.
    """
    host_metrics = get_system_metrics()

    # Application-specific metrics (not implemented yet, hence the zeros)
    application_metrics = {
        'active_sessions': 0,  # TODO: Implement session counting
        'total_messages': 0,  # TODO: Implement message counting
        'cache_hit_rate': 0.0  # TODO: Implement cache metrics
    }

    payload = {
        'timestamp': datetime.utcnow().isoformat(),
        'system': host_metrics,
        'application': application_metrics
    }
    return jsonify(payload), 200
# Blueprint-level fallback for exceptions raised by the health endpoints
@health_bp.errorhandler(Exception)
def handle_health_error(error):
    """Turn an exception from a health endpoint into a JSON 500 response.

    If ``get_error_handler()`` returns a handler, the error is first passed
    to its ``handle_error`` with the ``"health_check"`` context. The
    response body carries a generic error label plus ``str(error)``.
    """
    handler = get_error_handler()
    if handler:
        handler.handle_error(error, context="health_check")

    body = {
        'status': 'error',
        'timestamp': datetime.utcnow().isoformat(),
        'error': 'Health check failed',
        'message': str(error)
    }
    return jsonify(body), 500