import { NextResponse } from 'next/server';

export interface RunPodHealth {
  jobs: {
    completed: number;
    failed: number;
    inProgress: number;
    inQueue: number;
    retried: number;
  };
  workers: {
    idle: number;
    initializing?: number; // treated as optional; the handler falls back to 0
    running: number;
    throttled: number;
  };
}
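
// Illustrative /health payload matching the shape above (hypothetical values):
// {
//   "jobs": { "completed": 120, "failed": 1, "inProgress": 0, "inQueue": 0, "retried": 2 },
//   "workers": { "idle": 1, "initializing": 0, "running": 0, "throttled": 0 }
// }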

export interface StatusResponse {
  status: 'ready' | 'cold_start' | 'initializing' | 'processing' | 'unavailable';
  message: string;
  workers: {
    idle: number;
    running: number;
    initializing: number;
  };
  queue: {
    inProgress: number;
    inQueue: number;
  };
  estimatedWait?: number; // seconds
}
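
// Example response while a worker is cold-starting with one queued request
// (values follow the branch logic in GET below; numbers are hypothetical):
// {
//   "status": "cold_start",
//   "message": "Starting worker...",
//   "workers": { "idle": 0, "running": 0, "initializing": 0 },
//   "queue": { "inProgress": 0, "inQueue": 1 },
//   "estimatedWait": 45
// }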

/**
 * Check RunPod endpoint health to provide user feedback during cold starts
 */
export async function GET(): Promise<NextResponse<StatusResponse>> {
  const baseUrl = process.env.DEMO_MODEL_URL || 'http://localhost:8000/v1';
  const apiKey = process.env.DEMO_API_KEY || '';

  // Extract RunPod endpoint URL from the vLLM base URL
  // vLLM URL format: https://api.runpod.ai/v2/{endpoint_id}/openai/v1
  // Health URL format: https://api.runpod.ai/v2/{endpoint_id}/health
  const runpodMatch = baseUrl.match(/https:\/\/api\.runpod\.ai\/v2\/([^/]+)/);
  
  if (!runpodMatch) {
    // Not a RunPod endpoint, assume it's always ready (local/other provider)
    return NextResponse.json({
      status: 'ready',
      message: 'Model server ready',
      workers: { idle: 1, running: 0, initializing: 0 },
      queue: { inProgress: 0, inQueue: 0 },
    });
  }

  const endpointId = runpodMatch[1];
  const healthUrl = `https://api.runpod.ai/v2/${endpointId}/health`;

  try {
    const response = await fetch(healthUrl, {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      // Always fetch fresh health data; Next.js may otherwise cache GET fetches
      cache: 'no-store',
      // Short timeout for health check
      signal: AbortSignal.timeout(5000),
    });

    if (!response.ok) {
      return NextResponse.json({
        status: 'unavailable',
        message: 'Unable to check model status',
        workers: { idle: 0, running: 0, initializing: 0 },
        queue: { inProgress: 0, inQueue: 0 },
      });
    }

    const health: RunPodHealth = await response.json();
    
    const totalWorkers = health.workers.idle + health.workers.running + (health.workers.initializing || 0);
    const hasActiveWorkers = totalWorkers > 0;
    const hasIdleWorkers = health.workers.idle > 0;
    const isInitializing = (health.workers.initializing || 0) > 0;
    const hasQueuedJobs = health.jobs.inQueue > 0;
    const hasRunningJobs = health.jobs.inProgress > 0;

    let status: StatusResponse['status'];
    let message: string;
    let estimatedWait: number | undefined;

    if (hasIdleWorkers) {
      status = 'ready';
      message = 'Model ready';
    } else if (isInitializing) {
      status = 'initializing';
      message = 'Model loading...';
      estimatedWait = 30; // Typical vLLM model load time
    } else if (health.workers.running > 0) {
      status = 'processing';
      message = hasQueuedJobs 
        ? `Processing (${health.jobs.inQueue} in queue)` 
        : 'Processing request...';
      estimatedWait = hasQueuedJobs ? health.jobs.inQueue * 15 : undefined; // rough ~15s per queued job
    } else if (!hasActiveWorkers && (hasQueuedJobs || hasRunningJobs)) {
      status = 'cold_start';
      message = 'Starting worker...';
      estimatedWait = 45; // Cold start + model load
    } else if (!hasActiveWorkers) {
      status = 'cold_start';
      message = 'Workers scaled to zero, will start on request';
      estimatedWait = 45;
    } else {
      // Defensive fallback: unreachable given the branches above, but kept
      // so every code path assigns a status
      status = 'ready';
      message = 'Model ready';
    }

    return NextResponse.json({
      status,
      message,
      workers: {
        idle: health.workers.idle,
        running: health.workers.running,
        initializing: health.workers.initializing || 0,
      },
      queue: {
        inProgress: health.jobs.inProgress,
        inQueue: health.jobs.inQueue,
      },
      estimatedWait,
    });
  } catch (error) {
    console.error('Health check error:', error);
    
    // Network error might indicate cold start
    return NextResponse.json({
      status: 'cold_start',
      message: 'Connecting to model server...',
      workers: { idle: 0, running: 0, initializing: 0 },
      queue: { inProgress: 0, inQueue: 0 },
      estimatedWait: 45,
    });
  }
}
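
// ---------------------------------------------------------------------------
// Usage sketch: how a client might poll this endpoint to drive cold-start
// feedback in the UI. Shown here for illustration only — in practice this
// belongs in a client-side module, and the '/api/status' path is an
// assumption that depends on where this route file sits under app/.
// ---------------------------------------------------------------------------
async function waitForModelReady(
  pollIntervalMs = 3000,
  maxAttempts = 30,
): Promise<StatusResponse> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const res = await fetch('/api/status');
    const status: StatusResponse = await res.json();
    if (status.status === 'ready') return status;
    // status.estimatedWait (seconds) could feed a progress bar between polls
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  throw new Error('Model did not become ready in time');
}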