import { NextResponse } from 'next/server';

export interface RunPodHealth {
  jobs: {
    completed: number;
    failed: number;
    inProgress: number;
    inQueue: number;
    retried: number;
  };
  workers: {
    idle: number;
    initializing?: number; // treated as optional; the handler falls back to 0
    running: number;
    throttled: number;
  };
}
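
// Illustrative /health payload matching the shape above (hypothetical values):
// {
//   "jobs": { "completed": 120, "failed": 1, "inProgress": 0, "inQueue": 0, "retried": 2 },
//   "workers": { "idle": 1, "initializing": 0, "running": 0, "throttled": 0 }
// }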

export interface StatusResponse {
  status: 'ready' | 'cold_start' | 'initializing' | 'processing' | 'unavailable';
  message: string;
  workers: {
    idle: number;
    running: number;
    initializing: number;
  };
  queue: {
    inProgress: number;
    inQueue: number;
  };
  estimatedWait?: number; // seconds
}
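
// Example response while a worker is cold-starting with one queued request
// (values follow the branch logic in GET below; numbers are hypothetical):
// {
//   "status": "cold_start",
//   "message": "Starting worker...",
//   "workers": { "idle": 0, "running": 0, "initializing": 0 },
//   "queue": { "inProgress": 0, "inQueue": 1 },
//   "estimatedWait": 45
// }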

/**
 * Check RunPod endpoint health to provide user feedback during cold starts
 */
export async function GET(): Promise<NextResponse<StatusResponse>> {
  const baseUrl = process.env.DEMO_MODEL_URL || 'http://localhost:8000/v1';
  const apiKey = process.env.DEMO_API_KEY || '';

  // Extract RunPod endpoint URL from the vLLM base URL
  // vLLM URL format: https://api.runpod.ai/v2/{endpoint_id}/openai/v1
  // Health URL format: https://api.runpod.ai/v2/{endpoint_id}/health
  const runpodMatch = baseUrl.match(/https:\/\/api\.runpod\.ai\/v2\/([^/]+)/);
  
  if (!runpodMatch) {
    // Not a RunPod endpoint, assume it's always ready (local/other provider)
    return NextResponse.json({
      status: 'ready',
      message: 'Model server ready',
      workers: { idle: 1, running: 0, initializing: 0 },
      queue: { inProgress: 0, inQueue: 0 },
    });
  }

  const endpointId = runpodMatch[1];
  const healthUrl = `https://api.runpod.ai/v2/${endpointId}/health`;

  try {
    const response = await fetch(healthUrl, {
      method: 'GET',
      headers: {
        'Authorization': `Bearer ${apiKey}`,
        'Content-Type': 'application/json',
      },
      // Always fetch fresh health data; Next.js may otherwise cache GET fetches
      cache: 'no-store',
      // Short timeout for health check
      signal: AbortSignal.timeout(5000),
    });

    if (!response.ok) {
      return NextResponse.json({
        status: 'unavailable',
        message: 'Unable to check model status',
        workers: { idle: 0, running: 0, initializing: 0 },
        queue: { inProgress: 0, inQueue: 0 },
      });
    }

    const health: RunPodHealth = await response.json();
    
    const totalWorkers = health.workers.idle + health.workers.running + (health.workers.initializing || 0);
    const hasActiveWorkers = totalWorkers > 0;
    const hasIdleWorkers = health.workers.idle > 0;
    const isInitializing = (health.workers.initializing || 0) > 0;
    const hasQueuedJobs = health.jobs.inQueue > 0;
    const hasRunningJobs = health.jobs.inProgress > 0;

    let status: StatusResponse['status'];
    let message: string;
    let estimatedWait: number | undefined;

    if (hasIdleWorkers) {
      status = 'ready';
      message = 'Model ready';
    } else if (isInitializing) {
      status = 'initializing';
      message = 'Model loading...';
      estimatedWait = 30; // Typical vLLM model load time
    } else if (health.workers.running > 0) {
      status = 'processing';
      message = hasQueuedJobs 
        ? `Processing (${health.jobs.inQueue} in queue)` 
        : 'Processing request...';
      estimatedWait = hasQueuedJobs ? health.jobs.inQueue * 15 : undefined; // rough ~15s per queued job
    } else if (!hasActiveWorkers && (hasQueuedJobs || hasRunningJobs)) {
      status = 'cold_start';
      message = 'Starting worker...';
      estimatedWait = 45; // Cold start + model load
    } else if (!hasActiveWorkers) {
      status = 'cold_start';
      message = 'Workers scaled to zero, will start on request';
      estimatedWait = 45;
    } else {
      // Defensive fallback: unreachable given the branches above, but kept
      // so every code path assigns a status
      status = 'ready';
      message = 'Model ready';
    }

    return NextResponse.json({
      status,
      message,
      workers: {
        idle: health.workers.idle,
        running: health.workers.running,
        initializing: health.workers.initializing || 0,
      },
      queue: {
        inProgress: health.jobs.inProgress,
        inQueue: health.jobs.inQueue,
      },
      estimatedWait,
    });
  } catch (error) {
    console.error('Health check error:', error);
    
    // Network error might indicate cold start
    return NextResponse.json({
      status: 'cold_start',
      message: 'Connecting to model server...',
      workers: { idle: 0, running: 0, initializing: 0 },
      queue: { inProgress: 0, inQueue: 0 },
      estimatedWait: 45,
    });
  }
}
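
// ---------------------------------------------------------------------------
// Usage sketch: how a client might poll this endpoint to drive cold-start
// feedback in the UI. Shown here for illustration only — in practice this
// belongs in a client-side module, and the '/api/status' path is an
// assumption that depends on where this route file sits under app/.
// ---------------------------------------------------------------------------
async function waitForModelReady(
  pollIntervalMs = 3000,
  maxAttempts = 30,
): Promise<StatusResponse> {
  for (let attempt = 0; attempt < maxAttempts; attempt++) {
    const res = await fetch('/api/status');
    const status: StatusResponse = await res.json();
    if (status.status === 'ready') return status;
    // status.estimatedWait (seconds) could feed a progress bar between polls
    await new Promise((resolve) => setTimeout(resolve, pollIntervalMs));
  }
  throw new Error('Model did not become ready in time');
}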