Spaces:
Running
Running
| """ | |
| Compilation Health Monitoring API | |
| Provides endpoints to monitor compilation job health and detect issues. | |
| """ | |
| from fastapi import APIRouter, Depends | |
| from sqlalchemy.orm import Session | |
| from sqlalchemy import text | |
| from core.database import get_db | |
| from api.deps import get_current_user | |
| from models.user import User | |
| from datetime import datetime, timedelta | |
# Router for the diagnostics endpoints; paths are prefixed with
# /api/diagnostics and tagged "diagnostics".
router = APIRouter(
    prefix="/api/diagnostics",
    tags=["diagnostics"],
)
def get_compilation_health(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Get overall compilation health status.

    Shows active jobs, stuck jobs, and recent failures, plus 24-hour
    totals. Every query joins ``compilation_jobs`` to ``agents`` and is
    restricted to agents whose ``user_id`` equals ``current_user.id``.

    Args:
        db: Database session (injected via ``get_db``).
        current_user: Authenticated user (injected via
            ``get_current_user``); only ``current_user.id`` is read.

    Returns:
        dict with keys:
            ``status``: ``"healthy"`` when there are no stuck jobs,
                otherwise ``"warning"``.
            ``active_jobs``: number of jobs with status ``in_progress``.
            ``stuck_jobs``: ``in_progress`` jobs created more than 30
                minutes ago (id, agent_name, progress, current_step,
                minutes_running).
            ``recent_failures``: up to 5 most recent ``failed`` jobs from
                the last 24 hours (agent_name, error, created_at).
            ``stats_24h``: total / completed / failed counts and the
                success rate (percent, one decimal) for the last 24 hours.
    """
    # NOTE(review): no route decorator is visible on this function in this
    # view of the file -- confirm it is registered on `router` elsewhere.

    # All four queries are scoped to the same user.
    params = {"user_id": current_user.id}

    # Active jobs.
    # Read the single COUNT(*) value with .scalar(): on SQLAlchemy 1.4+ a Row
    # has a tuple-style count() method, so `row.count` would resolve to that
    # method instead of the "count" column.
    active_count = db.execute(text("""
        SELECT COUNT(*) as count
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE cj.status = 'in_progress'
        AND a.user_id = :user_id
    """), params).scalar() or 0

    # Stuck jobs (running > 30 minutes)
    stuck_jobs = db.execute(text("""
        SELECT
            cj.id,
            a.name as agent_name,
            cj.progress,
            cj.current_step,
            EXTRACT(EPOCH FROM (NOW() - cj.created_at)) / 60 as minutes_running
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE cj.status = 'in_progress'
        AND a.user_id = :user_id
        AND cj.created_at < NOW() - INTERVAL '30 minutes'
    """), params).fetchall()

    # Recent failures (last 24 hours)
    recent_failures = db.execute(text("""
        SELECT
            a.name as agent_name,
            cj.error_message,
            cj.created_at
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE cj.status = 'failed'
        AND a.user_id = :user_id
        AND cj.created_at > NOW() - INTERVAL '24 hours'
        ORDER BY cj.created_at DESC
        LIMIT 5
    """), params).fetchall()

    # Success rate (last 24 hours).
    # `status` is qualified with `cj.` like the other queries so the column
    # reference cannot become ambiguous across the join.
    stats = db.execute(text("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN cj.status = 'completed' THEN 1 ELSE 0 END) as completed,
            SUM(CASE WHEN cj.status = 'failed' THEN 1 ELSE 0 END) as failed
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE a.user_id = :user_id
        AND cj.created_at > NOW() - INTERVAL '24 hours'
    """), params).fetchone()

    # SUM() over zero rows yields NULL; report 0 instead of None.
    total = stats.total or 0
    completed = stats.completed or 0
    failed = stats.failed or 0
    success_rate = (completed / total * 100) if total > 0 else 0

    return {
        "status": "healthy" if not stuck_jobs else "warning",
        "active_jobs": active_count,
        "stuck_jobs": [
            {
                "id": job.id,
                "agent_name": job.agent_name,
                "progress": job.progress,
                "current_step": job.current_step,
                # The driver may hand this back as Decimal; normalise to
                # float so the serialised value is a plain JSON number.
                "minutes_running": round(float(job.minutes_running), 1),
            }
            for job in stuck_jobs
        ],
        "recent_failures": [
            {
                "agent_name": f.agent_name,
                "error": f.error_message,
                "created_at": f.created_at.isoformat(),
            }
            for f in recent_failures
        ],
        "stats_24h": {
            "total_jobs": total,
            "completed": completed,
            "failed": failed,
            "success_rate": round(success_rate, 1),
        },
    }
def get_embedding_model_status():
    """Check if the embedding model is working.

    Loads the fastembed model, embeds one probe sentence and reports the
    resulting vector length. Any exception raised along the way (including
    a failed ``fastembed`` import) is returned as an ``"error"`` status
    with the exception text instead of being raised.

    Returns:
        dict: ``status``, ``model``, ``dimension`` and ``message`` on
        success; ``status`` and ``message`` on failure.
    """
    model_name = "BAAI/bge-small-en-v1.5"
    try:
        # Imported inside the try so an import failure is reported too.
        from fastembed import TextEmbedding

        embedder = TextEmbedding(model_name=model_name)
        vectors = list(embedder.embed(["Test sentence"]))
        dimension = len(vectors[0])
    except Exception as exc:
        return {"status": "error", "message": str(exc)}

    return {
        "status": "healthy",
        "model": model_name,
        "dimension": dimension,
        "message": "Embedding model is working correctly",
    }