Spaces:
Running
Running
File size: 4,333 Bytes
b0b150b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 |
"""
Compilation Health Monitoring API
Provides endpoints to monitor compilation job health and detect issues.
"""
from fastapi import APIRouter, Depends
from sqlalchemy.orm import Session
from sqlalchemy import text
from core.database import get_db
from api.deps import get_current_user
from models.user import User
from datetime import datetime, timedelta
# Router for the diagnostics endpoints: every route registered on it is
# created with the "/api/diagnostics" path prefix and the "diagnostics" tag.
router = APIRouter(prefix="/api/diagnostics", tags=["diagnostics"])
@router.get("/compilation-health")
def get_compilation_health(
    db: Session = Depends(get_db),
    current_user: User = Depends(get_current_user)
):
    """
    Get overall compilation health status for the current user's agents.

    Runs four queries against ``compilation_jobs`` joined to ``agents``,
    each restricted to agents whose ``user_id`` is ``current_user.id``:

    * the number of jobs with status ``in_progress``;
    * ``in_progress`` jobs created more than 30 minutes ago ("stuck");
    * the 5 most recent ``failed`` jobs created in the last 24 hours;
    * total / completed / failed counts for jobs created in the last 24 hours.

    Returns:
        A dict with the keys ``status`` (``"healthy"`` when there are no
        stuck jobs, otherwise ``"warning"``), ``active_jobs``,
        ``stuck_jobs``, ``recent_failures`` and ``stats_24h``.
    """
    # Every query is scoped to the same user, so bind the parameters once.
    params = {"user_id": current_user.id}

    # Active jobs.
    # scalar() is used instead of ``fetchone().count``: on SQLAlchemy 1.4+
    # a Row is a Sequence, so ``row.count`` resolves to the inherited
    # sequence method rather than the column labelled "count".
    active_count = db.execute(text("""
        SELECT COUNT(*) as count
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE cj.status = 'in_progress'
        AND a.user_id = :user_id
    """), params).scalar() or 0

    # Stuck jobs (running > 30 minutes)
    stuck_jobs = db.execute(text("""
        SELECT
            cj.id,
            a.name as agent_name,
            cj.progress,
            cj.current_step,
            EXTRACT(EPOCH FROM (NOW() - cj.created_at)) / 60 as minutes_running
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE cj.status = 'in_progress'
        AND a.user_id = :user_id
        AND cj.created_at < NOW() - INTERVAL '30 minutes'
    """), params).fetchall()

    # Recent failures (last 24 hours)
    recent_failures = db.execute(text("""
        SELECT
            a.name as agent_name,
            cj.error_message,
            cj.created_at
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE cj.status = 'failed'
        AND a.user_id = :user_id
        AND cj.created_at > NOW() - INTERVAL '24 hours'
        ORDER BY cj.created_at DESC
        LIMIT 5
    """), params).fetchall()

    # Success rate (last 24 hours).
    # ``status`` is qualified with the ``cj`` alias so the reference cannot
    # become ambiguous if the joined ``agents`` table has a column of the
    # same name.
    stats = db.execute(text("""
        SELECT
            COUNT(*) as total,
            SUM(CASE WHEN cj.status = 'completed' THEN 1 ELSE 0 END) as completed,
            SUM(CASE WHEN cj.status = 'failed' THEN 1 ELSE 0 END) as failed
        FROM compilation_jobs cj
        JOIN agents a ON cj.agent_id = a.id
        WHERE a.user_id = :user_id
        AND cj.created_at > NOW() - INTERVAL '24 hours'
    """), params).fetchone()

    # SUM() over zero rows yields NULL, so normalise the counters to 0 to
    # keep them consistent with ``total`` when there are no recent jobs.
    total = stats.total or 0
    completed = stats.completed or 0
    failed = stats.failed or 0
    success_rate = (completed / total * 100) if total > 0 else 0

    return {
        "status": "healthy" if not stuck_jobs else "warning",
        "active_jobs": active_count,
        "stuck_jobs": [
            {
                "id": job.id,
                "agent_name": job.agent_name,
                "progress": job.progress,
                "current_step": job.current_step,
                "minutes_running": round(job.minutes_running, 1),
            }
            for job in stuck_jobs
        ],
        "recent_failures": [
            {
                "agent_name": failure.agent_name,
                "error": failure.error_message,
                "created_at": failure.created_at.isoformat(),
            }
            for failure in recent_failures
        ],
        "stats_24h": {
            "total_jobs": total,
            "completed": completed,
            "failed": failed,
            "success_rate": round(success_rate, 1),
        },
    }
@router.get("/embedding-model-status")
def get_embedding_model_status():
    """Check if the embedding model is working.

    Loads the fastembed ``TextEmbedding`` model and embeds a single test
    sentence. Any exception raised along the way (including a failed
    import of ``fastembed``) is reported in the response body rather than
    propagated.

    Returns:
        On success, a dict with ``status`` ``"healthy"``, the model name,
        the length of the first embedding vector and a confirmation
        message. On failure, a dict with ``status`` ``"error"`` and the
        exception text as ``message``.
    """
    model_name = "BAAI/bge-small-en-v1.5"

    try:
        # Imported inside the try so a missing package is reported as an
        # error status too.
        from fastembed import TextEmbedding

        embedder = TextEmbedding(model_name=model_name)
        vectors = list(embedder.embed(["Test sentence"]))
        dimension = len(vectors[0])
    except Exception as exc:
        return {"status": "error", "message": str(exc)}

    return {
        "status": "healthy",
        "model": model_name,
        "dimension": dimension,
        "message": "Embedding model is working correctly",
    }
|