File size: 6,093 Bytes
fc69895
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
import type { TextGenerationStreamOutput } from "@huggingface/inference";
import type OpenAI from "openai";
import type { Stream } from "openai/streaming";

/**
 * Transform a stream of OpenAI.Chat.ChatCompletion into a stream of TextGenerationStreamOutput
 */
export async function* openAIChatToTextGenerationStream(
	completionStream: Stream<OpenAI.Chat.Completions.ChatCompletionChunk>,
	getRouterMetadata?: () => { route?: string; model?: string; provider?: string }
) {
	let generatedText = "";
	let tokenId = 0;
	let toolBuffer = ""; // legacy hack kept harmless
	let metadataYielded = false;
	let thinkOpen = false; // true while we are inside an emitted <think>…</think> span

	for await (const completion of completionStream) {
		const retyped = completion as {
			"x-router-metadata"?: { route: string; model: string; provider?: string };
		};
		// Check if this chunk contains router metadata (first chunk from llm-router)
		if (!metadataYielded && retyped["x-router-metadata"]) {
			const metadata = retyped["x-router-metadata"];
			yield {
				token: {
					id: tokenId++,
					text: "",
					logprob: 0,
					special: true,
				},
				generated_text: null,
				details: null,
				routerMetadata: {
					route: metadata.route,
					model: metadata.model,
					provider: metadata.provider,
				},
			} as TextGenerationStreamOutput & {
				routerMetadata: { route: string; model: string; provider?: string };
			};
			metadataYielded = true;
			// Skip processing this chunk as content since it's just metadata
			if (
				!completion.choices ||
				completion.choices.length === 0 ||
				!completion.choices[0].delta?.content
			) {
				continue;
			}
		}
		const { choices } = completion;
		// Providers disagree on where reasoning lives: some send `reasoning`,
		// others `reasoning_content` — accept either shape on the delta.
		const delta: OpenAI.Chat.Completions.ChatCompletionChunk.Choice.Delta & {
			reasoning?: string;
			reasoning_content?: string;
		} = choices?.[0]?.delta ?? {};
		const content: string = delta.content ?? "";
		const reasoning: string =
			typeof delta?.reasoning === "string"
				? (delta.reasoning as string)
				: typeof delta?.reasoning_content === "string"
					? (delta.reasoning_content as string)
					: "";
		const last = choices?.[0]?.finish_reason === "stop" || choices?.[0]?.finish_reason === "length";

		// if the last token is a stop and the tool buffer is not empty, yield it as a generated_text
		if (choices?.[0]?.finish_reason === "stop" && toolBuffer.length > 0) {
			yield {
				token: {
					id: tokenId++,
					special: true,
					logprob: 0,
					text: "",
				},
				generated_text: toolBuffer,
				details: null,
			} as TextGenerationStreamOutput;
			break;
		}

		// weird bug where the parameters are streamed in like this
		if (choices?.[0]?.delta?.tool_calls) {
			const calls = Array.isArray(choices[0].delta.tool_calls)
				? choices[0].delta.tool_calls
				: [choices[0].delta.tool_calls];

			if (
				calls.length === 1 &&
				calls[0].index === 0 &&
				calls[0].id === "" &&
				calls[0].type === "function" &&
				!!calls[0].function &&
				calls[0].function.name === null
			) {
				// Guard against absent arguments: `+= undefined` would append
				// the literal string "undefined" to the buffer.
				toolBuffer += calls[0].function.arguments ?? "";
				continue;
			}
		}

		let combined = "";
		if (reasoning && reasoning.length > 0) {
			if (!thinkOpen) {
				combined += "<think>" + reasoning;
				thinkOpen = true;
			} else {
				combined += reasoning;
			}
		}

		if (content && content.length > 0) {
			const trimmed = content.trim();
			// Allow <think> tags in content to pass through (for models like DeepSeek R1)
			if (thinkOpen && trimmed === "</think>") {
				// close once without duplicating the tag
				combined += "</think>";
				thinkOpen = false;
			} else if (thinkOpen) {
				combined += "</think>" + content;
				thinkOpen = false;
			} else {
				combined += content;
			}
		}

		// Close a dangling <think> on the final chunk: a model may emit only
		// reasoning and then stop, which would otherwise leave the tag unclosed
		// in the accumulated generated_text.
		if (last && thinkOpen) {
			combined += "</think>";
			thinkOpen = false;
		}

		// Accumulate the combined token into the full text
		generatedText += combined;
		const output: TextGenerationStreamOutput = {
			token: {
				id: tokenId++,
				text: combined,
				logprob: 0,
				special: last,
			},
			generated_text: last ? generatedText : null,
			details: null,
		};
		yield output;

		// Tools removed: ignore tool_calls deltas
	}

	// If metadata wasn't yielded from chunks (e.g., from headers), yield it at the end
	if (!metadataYielded && getRouterMetadata) {
		const routerMetadata = getRouterMetadata();
		// Yield if we have either complete router metadata OR just provider info
		if (
			(routerMetadata && routerMetadata.route && routerMetadata.model) ||
			routerMetadata?.provider
		) {
			yield {
				token: {
					id: tokenId++,
					text: "",
					logprob: 0,
					special: true,
				},
				generated_text: null,
				details: null,
				routerMetadata,
			} as TextGenerationStreamOutput & {
				routerMetadata: { route?: string; model?: string; provider?: string };
			};
		}
	}
}

/**
 * Transform a non-streaming OpenAI chat completion into a stream of TextGenerationStreamOutput
 */
export async function* openAIChatToTextGenerationSingle(
	completion: OpenAI.Chat.Completions.ChatCompletion,
	getRouterMetadata?: () => { route?: string; model?: string; provider?: string }
) {
	const msg: NonNullable<OpenAI.Chat.Completions.ChatCompletion.Choice>["message"] & {
		reasoning?: string;
		reasoning_content?: string;
	} = completion.choices?.[0]?.message ?? {};

	// Provider-dependent reasoning shapes (non-streaming): some providers use
	// `reasoning`, others `reasoning_content`.
	let reasoningText = "";
	if (typeof msg?.reasoning === "string") {
		reasoningText = msg.reasoning;
	} else if (typeof msg?.reasoning_content === "string") {
		reasoningText = msg.reasoning_content;
	}

	let text = msg?.content || "";
	if (reasoningText.length > 0) {
		text = `<think>${reasoningText}</think>` + text;
	}

	// Attach router metadata only when it is complete (route + model) or
	// carries at least provider info.
	let extras: { routerMetadata?: { route?: string; model?: string; provider?: string } } = {};
	if (getRouterMetadata) {
		const metadata = getRouterMetadata();
		if ((metadata && metadata.route && metadata.model) || metadata?.provider) {
			extras = { routerMetadata: metadata };
		}
	}

	// Yield the content as a single token
	yield {
		token: {
			id: 0,
			text,
			logprob: 0,
			special: false,
		},
		generated_text: text,
		details: null,
		...extras,
	} as TextGenerationStreamOutput & {
		routerMetadata?: { route?: string; model?: string; provider?: string };
	};
}