import { config } from '$lib/stores/settings.svelte';
/**
* SlotsService - Real-time processing state monitoring and token rate calculation
*
* This service provides real-time information about generation progress, token rates,
* and context usage based on timing data from ChatService streaming responses.
* It manages streaming session tracking and provides accurate processing state updates.
*
* **Architecture & Relationships:**
* - **SlotsService** (this class): Processing state monitoring
* - Receives timing data from ChatService streaming responses
* - Calculates token generation rates and context usage
* - Manages streaming session lifecycle
* - Provides real-time updates to UI components
*
* - **ChatService**: Provides timing data from `/chat/completions` streaming
* - **UI Components**: Subscribe to processing state for progress indicators
*
* **Key Features:**
* - **Real-time Monitoring**: Live processing state during generation
* - **Token Rate Calculation**: Accurate tokens/second from timing data
* - **Context Tracking**: Current context usage and remaining capacity
* - **Streaming Lifecycle**: Start/stop tracking for streaming sessions
* - **Timing Data Processing**: Converts streaming timing data to structured state
* - **Error Handling**: Graceful handling when timing data is unavailable
*
 * **Processing States:**
 * - `idle`: No active processing
 * - `preparing`: Prompt is being processed (prompt progress reported, no tokens generated yet)
 * - `generating`: Actively generating tokens
*
* **Token Rate Calculation:**
* Uses timing data from `/chat/completions` streaming response for accurate
* real-time token generation rate measurement.
*/
export class SlotsService {
	private callbacks: Set<(state: ApiProcessingState | null) => void> = new Set();
	private isStreamingActive: boolean = false;
	private lastKnownState: ApiProcessingState | null = null;

	/**
	 * Start streaming session tracking
	 */
	startStreaming(): void {
		this.isStreamingActive = true;
	}

	/**
	 * Stop streaming session tracking
	 */
	stopStreaming(): void {
		this.isStreamingActive = false;
	}

	/**
	 * Clear the current processing state and notify all subscribers with `null`.
	 * Used when switching to a conversation without timing data.
	 */
	clearState(): void {
		this.lastKnownState = null;
		this.notifySubscribers(null, 'clearState');
	}

	/**
	 * Check if currently in a streaming session
	 */
	isStreaming(): boolean {
		return this.isStreamingActive;
	}

	/**
	 * @deprecated Polling is no longer used - timing data comes from ChatService streaming response
	 * This method logs a warning if called to help identify outdated usage
	 */
	fetchAndNotify(): void {
		console.warn(
			'SlotsService.fetchAndNotify() is deprecated - use timing data from ChatService instead'
		);
	}

	/**
	 * Subscribe to processing state updates.
	 * The last known state (if any) is replayed immediately to the new subscriber.
	 *
	 * @param callback - Invoked with each state update, or `null` when the state is cleared
	 * @returns Unsubscribe function that removes the callback
	 */
	subscribe(callback: (state: ApiProcessingState | null) => void): () => void {
		this.callbacks.add(callback);

		if (this.lastKnownState) {
			// Guard the replay so a throwing subscriber cannot break subscribe(),
			// consistent with the error isolation applied to every other notification.
			try {
				callback(this.lastKnownState);
			} catch (error) {
				console.error('Error in subscribe callback:', error);
			}
		}

		return () => {
			this.callbacks.delete(callback);
		};
	}

	/**
	 * Updates processing state with timing data from ChatService streaming response
	 */
	async updateFromTimingData(timingData: {
		prompt_n: number;
		predicted_n: number;
		predicted_per_second: number;
		cache_n: number;
		prompt_progress?: ChatMessagePromptProgress;
	}): Promise<void> {
		const processingState = await this.parseCompletionTimingData(timingData);

		this.lastKnownState = processingState;
		this.notifySubscribers(processingState, 'timing');
	}

	/**
	 * Invoke every registered callback with `state`, isolating errors so one
	 * faulty subscriber cannot prevent the others from being notified.
	 *
	 * @param state - State to broadcast (`null` signals a cleared state)
	 * @param context - Label identifying the call site in error logs
	 */
	private notifySubscribers(state: ApiProcessingState | null, context: string): void {
		for (const callback of this.callbacks) {
			try {
				callback(state);
			} catch (error) {
				console.error(`Error in ${context} callback:`, error);
			}
		}
	}

	/**
	 * Resolve the total context size (n_ctx).
	 * Prefers the last known state; otherwise queries the server's `/slots`
	 * endpoint. Every failure path falls back to a default of 4096, so this
	 * always yields a usable number (the former `number | null` return type
	 * was never actually nullable).
	 */
	private async getContextTotal(): Promise<number> {
		if (this.lastKnownState && this.lastKnownState.contextTotal > 0) {
			return this.lastKnownState.contextTotal;
		}

		try {
			const currentConfig = config();
			const apiKey = currentConfig.apiKey?.toString().trim();

			const response = await fetch(`./slots`, {
				headers: {
					...(apiKey ? { Authorization: `Bearer ${apiKey}` } : {})
				}
			});

			if (response.ok) {
				const slotsData = await response.json();

				if (Array.isArray(slotsData) && slotsData.length > 0) {
					const slot = slotsData[0];

					if (slot.n_ctx && slot.n_ctx > 0) {
						return slot.n_ctx;
					}
				}
			}
		} catch (error) {
			console.warn('Failed to fetch context total from /slots:', error);
		}

		// Conservative default when the server does not report a context size.
		return 4096;
	}

	/**
	 * Convert raw timing fields from a `/chat/completions` streaming response
	 * into a structured ApiProcessingState snapshot.
	 */
	private async parseCompletionTimingData(
		timingData: Record<string, unknown>
	): Promise<ApiProcessingState> {
		const promptTokens = (timingData.prompt_n as number) || 0;
		const predictedTokens = (timingData.predicted_n as number) || 0;
		const tokensPerSecond = (timingData.predicted_per_second as number) || 0;
		const cacheTokens = (timingData.cache_n as number) || 0;
		const promptProgress = timingData.prompt_progress as
			| {
					total: number;
					cache: number;
					processed: number;
					time_ms: number;
			  }
			| undefined;

		const contextTotal = await this.getContextTotal();
		const currentConfig = config();
		// -1 means "no explicit output limit"; tokensRemaining goes negative then.
		// NOTE(review): consumers appear to treat negative as unlimited — confirm.
		const outputTokensMax = currentConfig.max_tokens || -1;

		const contextUsed = promptTokens + cacheTokens + predictedTokens;
		const outputTokensUsed = predictedTokens;

		// Guard total > 0: a zero total would otherwise produce NaN.
		const progressPercent =
			promptProgress && promptProgress.total > 0
				? Math.round((promptProgress.processed / promptProgress.total) * 100)
				: undefined;

		return {
			status: predictedTokens > 0 ? 'generating' : promptProgress ? 'preparing' : 'idle',
			tokensDecoded: predictedTokens,
			tokensRemaining: outputTokensMax - predictedTokens,
			contextUsed,
			contextTotal,
			outputTokensUsed,
			outputTokensMax,
			hasNextToken: predictedTokens > 0,
			tokensPerSecond,
			temperature: currentConfig.temperature ?? 0.8,
			topP: currentConfig.top_p ?? 0.95,
			speculative: false,
			progressPercent,
			promptTokens,
			cacheTokens
		};
	}

	/**
	 * Get current processing state.
	 * Returns the last known state from timing data; if none exists, attempts
	 * to restore it from the most recent assistant message carrying timings.
	 * Returns `null` when no timing data is available anywhere.
	 */
	async getCurrentState(): Promise<ApiProcessingState | null> {
		if (this.lastKnownState) {
			return this.lastKnownState;
		}

		try {
			// Import dynamically to avoid circular dependency
			const { chatStore } = await import('$lib/stores/chat.svelte');
			const messages = chatStore.activeMessages;

			// Walk backwards to find the newest assistant message with timings.
			for (let i = messages.length - 1; i >= 0; i--) {
				const message = messages[i];

				if (message.role === 'assistant' && message.timings) {
					const restoredState = await this.parseCompletionTimingData({
						prompt_n: message.timings.prompt_n || 0,
						predicted_n: message.timings.predicted_n || 0,
						// Derive tokens/sec from count and elapsed ms when both exist.
						predicted_per_second:
							message.timings.predicted_n && message.timings.predicted_ms
								? (message.timings.predicted_n / message.timings.predicted_ms) * 1000
								: 0,
						cache_n: message.timings.cache_n || 0
					});

					this.lastKnownState = restoredState;
					return restoredState;
				}
			}
		} catch (error) {
			console.warn('Failed to restore timing data from messages:', error);
		}

		return null;
	}
}
export const slotsService = new SlotsService();