xt8 committed on
Commit
eb765b9
·
verified ·
1 Parent(s): 41409c8

Update main.ts

Browse files
Files changed (1) hide show
  1. main.ts +165 -221
main.ts CHANGED
@@ -1,21 +1,13 @@
1
  import { serve } from "https://deno.land/std@0.208.0/http/server.ts";
 
 
2
 
3
  // --- 常量定义 ---
4
  const MAX_DOCUMENT_SIZE_MB = 20; // 设置最大文档大小限制(单位:MB)
5
  const MAX_DOCUMENT_SIZE_BYTES = MAX_DOCUMENT_SIZE_MB * 1024 * 1024;
6
  const MODELS_CACHE_DURATION = 60000; // 1分钟模型缓存
7
 
8
- // Gemini 支持的声音列表
9
- const GEMINI_VOICES = [
10
- { name: "Puck", language: "en-US", gender: "neutral" },
11
- { name: "Charon", language: "en-US", gender: "neutral" },
12
- { name: "Kore", language: "en-US", gender: "neutral" },
13
- { name: "Fenrir", language: "en-US", gender: "neutral" },
14
- { name: "Aoede", language: "en-US", gender: "neutral" },
15
- ] as const;
16
-
17
- type VoiceName = typeof GEMINI_VOICES[number]["name"];
18
-
19
  interface OpenAIMessage {
20
  role: "system" | "user" | "assistant";
21
  content: string | Array<{
@@ -34,17 +26,11 @@ interface OpenAIRequest {
34
  stream?: boolean;
35
  }
36
 
37
- // TTS 相关接口
38
- interface TTSRequest {
39
  model: string;
40
  input: string;
41
- voice?: VoiceName;
42
- response_format?: "mp3" | "opus" | "aac" | "flac";
43
- speed?: number;
44
- }
45
-
46
- interface TTSResponse {
47
- audio: string; // base64 编码的音频数据
48
  }
49
 
50
  class GoogleAIService {
@@ -102,7 +88,6 @@ class GoogleAIService {
102
  console.log(`Fetched ${this.cachedModels.length} models from Google AI`);
103
  return this.cachedModels;
104
  }
105
-
106
  return this.getFallbackModels();
107
  } catch (error) {
108
  console.warn("Error fetching models from Google AI:", error.message, ". Using fallback models.");
@@ -114,14 +99,79 @@ class GoogleAIService {
114
  return [
115
  { name: "models/gemini-1.5-pro", displayName: "Gemini 1.5 Pro", description: "Mid-size multimodal model that supports up to 1 million tokens, images, and documents (PDF, TXT, MD)", supportedGenerationMethods: ["generateContent"], maxTokens: 1000000, supportsDocuments: true },
116
  { name: "models/gemini-1.5-flash", displayName: "Gemini 1.5 Flash", description: "Fast and versatile multimodal model for diverse tasks, supports images and documents (PDF, TXT, MD)", supportedGenerationMethods: ["generateContent"], maxTokens: 1000000, supportsDocuments: true },
117
- { name: "models/gemini-2.0-flash-preview-image-generation", displayName: "Gemini 2.0 Flash Image Generation", description: "Advanced model for generating and editing high-quality images with text and image outputs", supportedGenerationMethods: ["generateContent"], maxTokens: 100000, capabilities: ["text", "image_generation", "image_editing"] }
 
 
118
  ];
119
  }
120
 
 
121
  public isVisionModel = (modelName: string): boolean => modelName.toLowerCase().includes('vision') || modelName.toLowerCase().includes('pro');
122
  public isImageGenerationModel = (modelName: string): boolean => modelName.includes('image-generation') || modelName === 'gemini-2.0-flash-preview-image-generation';
123
  public isImageEditingModel = (modelName: string): boolean => modelName.includes('image-generation') || modelName === 'gemini-2.0-flash-preview-image-generation';
124
  public isDocumentModel = (modelName: string): boolean => modelName.toLowerCase().includes('gemini-1.5') || modelName.toLowerCase().includes('pro') || modelName.toLowerCase().includes('flash');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
125
 
126
  private getDocumentType(url: string): string {
127
  const lowerUrl = url.toLowerCase();
@@ -133,17 +183,13 @@ class GoogleAIService {
133
  return 'unknown';
134
  }
135
 
136
- /**
137
- * [关键改进] 提取并验证文档数据,增加大小检查和更稳健的解析
138
- */
139
  private extractDocumentData(documentUrl: string): { mimeType: string; data: string; text?: string; docType: string } {
140
  const docType = this.getDocumentType(documentUrl);
 
141
  if (!documentUrl.startsWith("data:")) {
142
  if (documentUrl.startsWith("http")) {
143
  throw new Error("Document URL downloads are not supported. Please provide base64 encoded data URLs.");
144
  }
145
- // 如果不是data url或http url,则假定为纯base64数据,但这是一种不推荐的格式
146
- // 为了健壮性,我们强制要求使用标准的 data URL
147
  throw new Error("Document must be provided as a standard base64 data URL (e.g., 'data:application/pdf;base64,...').");
148
  }
149
 
@@ -151,16 +197,15 @@ class GoogleAIService {
151
  if (parts.length !== 2) {
152
  throw new Error("Invalid data URL format for document. Expected 'data:[mime];base64,[data]'.");
153
  }
154
-
155
  const [mimeInfo, base64Data] = parts;
156
- // **改进1: 检查文件大小**
157
- // Base64 字符串的长度约是原始数据的 4/3。
158
  const approxSizeInBytes = base64Data.length * 0.75;
159
  if (approxSizeInBytes > MAX_DOCUMENT_SIZE_BYTES) {
160
  throw new Error(`Document size (${(approxSizeInBytes / 1024 / 1024).toFixed(2)}MB) exceeds the ${MAX_DOCUMENT_SIZE_MB}MB limit.`);
161
  }
162
 
163
  const mimeType = mimeInfo.split(":")[1]?.split(";")[0] || 'application/octet-stream';
 
164
  if (docType === 'txt' || docType === 'md') {
165
  try {
166
  const textContent = atob(base64Data);
@@ -170,12 +215,11 @@ class GoogleAIService {
170
  throw new Error(`Invalid base64 encoding for ${docType} document.`);
171
  }
172
  }
173
-
174
- // 自动识别PDF的MIME类型
175
  const finalMimeType = docType === 'pdf' ? 'application/pdf' : mimeType;
176
  return { mimeType: finalMimeType, data: base64Data, docType };
177
  }
178
-
179
  private extractImageData(imageUrl: string): { mimeType: string; data: string } {
180
  if (imageUrl.startsWith("data:image/")) {
181
  const [mimeInfo, base64Data] = imageUrl.split(",");
@@ -192,6 +236,7 @@ class GoogleAIService {
192
  const apiKey = this.getNextApiKey();
193
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;
194
  const documentModel = this.isDocumentModel(fullModelName) ? fullModelName : 'models/gemini-1.5-pro-latest';
 
195
  console.log(`Processing document with model: ${documentModel}`);
196
 
197
  let contents;
@@ -203,28 +248,27 @@ class GoogleAIService {
203
 
204
  const messageParts = msg.content.map(part => {
205
  if (part.type === "text") return { text: part.text };
 
206
  if (part.type === "image_url" && part.image_url) {
207
  const { mimeType, data } = this.extractImageData(part.image_url.url);
208
  return { inlineData: { mimeType, data } };
209
  }
 
210
  if (part.type === "document" && part.document) {
211
  const docData = this.extractDocumentData(part.document.url);
212
  console.log(`Processing document: ${docData.docType}, mime: ${docData.mimeType}, size: ${(docData.data.length * 0.75 / 1024).toFixed(2)} KB`);
213
 
214
  if (docData.docType === 'txt' || docData.docType === 'md') {
215
- const prefix = docData.docType === 'md' ? 'Markdown document content:\\n' : 'Text document content:\\n';
216
  return { text: `${prefix}${docData.text}` };
217
  }
218
-
219
  if (docData.docType === 'pdf') {
220
  return { inlineData: { mimeType: docData.mimeType, data: docData.data } };
221
  }
222
-
223
  return { text: `[Document type '${docData.docType}' is not supported for direct processing. Please convert to PDF, TXT, or MD.]` };
224
  }
225
  return { text: "" };
226
  });
227
-
228
  return { role: msg.role === "assistant" ? "model" : "user", parts: messageParts.filter(p => p.text || p.inlineData) };
229
  });
230
  } catch (error) {
@@ -268,70 +312,13 @@ class GoogleAIService {
268
  if (candidate.finishReason === "SAFETY") {
269
  throw new Error("Response blocked due to safety filters. Check content for sensitive topics.");
270
  }
271
-
272
  if (candidate.finishReason === "RECITATION") {
273
  throw new Error("Response blocked due to recitation policy. The model's output was too similar to a copyrighted source.");
274
  }
275
 
276
  return candidate.content?.parts[0]?.text || "Document processed, but no text response was generated.";
277
  }
278
-
279
- // TTS 功能
280
- async generateSpeech(text: string, voice: VoiceName = "Puck", model: string): Promise<string> {
281
- const apiKey = this.getNextApiKey();
282
-
283
- const requestBody = {
284
- "contents": [{
285
- "parts":[{
286
- "text": text
287
- }]
288
- }],
289
- "generationConfig": {
290
- "responseModalities": ["AUDIO"],
291
- "speechConfig": {
292
- "voiceConfig": {
293
- "prebuiltVoiceConfig": {
294
- "voiceName": voice
295
- }
296
- }
297
- }
298
- },
299
- "model": model,
300
- };
301
-
302
- try {
303
- const response = await fetch(
304
- `https://generativelanguage.googleapis.com/v1beta/models/${model}:generateContent?key=${apiKey}`,
305
- {
306
- method: "POST",
307
- headers: { "Content-Type": "application/json" },
308
- body: JSON.stringify(requestBody)
309
- }
310
- );
311
-
312
- if (!response.ok) {
313
- const errorText = await response.text();
314
- throw new Error(`Gemini TTS API error: ${response.status} - ${errorText}`);
315
- }
316
-
317
- const data = await response.json();
318
-
319
- if (!data.audioContent) {
320
- throw new Error("No audio content returned from Gemini TTS");
321
- }
322
-
323
- return data.audioContent; // 返回 base64 编码的音频数据
324
- } catch (error) {
325
- console.error("Error generating speech:", error);
326
- throw error;
327
- }
328
- }
329
-
330
- // 获取可用的声音列表
331
- getAvailableVoices(): typeof GEMINI_VOICES {
332
- return GEMINI_VOICES;
333
- }
334
-
335
  async generateContent(messages: OpenAIMessage[], modelName: string, enableSearch: boolean = false): Promise<string> {
336
  const hasDocument = messages.some(msg => Array.isArray(msg.content) && msg.content.some(part => part.type === "document"));
337
  if (hasDocument) {
@@ -340,6 +327,7 @@ class GoogleAIService {
340
 
341
  const apiKey = this.getNextApiKey();
342
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;
 
343
  const contents = messages.map(msg => {
344
  if (typeof msg.content === "string") {
345
  return { role: msg.role === "assistant" ? "model" : "user", parts: [{ text: msg.content }] };
@@ -366,7 +354,6 @@ class GoogleAIService {
366
  contents,
367
  generationConfig: { temperature: 0.7, maxOutputTokens: 4096 }
368
  };
369
-
370
  if (enableSearch) {
371
  requestBody.tools = [{ googleSearchRetrieval: {} }];
372
  }
@@ -380,17 +367,14 @@ class GoogleAIService {
380
  const errorText = await response.text();
381
  throw new Error(`Google AI API error: ${response.status} - ${errorText}`);
382
  }
383
-
384
  const data = await response.json();
385
  if (!data.candidates || data.candidates.length === 0) {
386
  throw new Error("No response generated from Google AI");
387
  }
388
-
389
  const candidate = data.candidates[0];
390
  if (candidate.finishReason === "SAFETY") {
391
  throw new Error("Response blocked due to safety filters");
392
  }
393
-
394
  return candidate.content?.parts[0]?.text || "No response generated";
395
  }
396
 
@@ -420,7 +404,6 @@ class GoogleAIService {
420
  const errorText = await response.text();
421
  throw new Error(`Image ${inputImage ? 'editing' : 'generation'} failed: ${response.status} - ${errorText}`);
422
  }
423
-
424
  const data = await response.json();
425
  if (!data.candidates || data.candidates.length === 0) {
426
  throw new Error(`No ${inputImage ? 'edited' : 'generated'} image returned`);
@@ -447,10 +430,9 @@ class GoogleAIService {
447
  result.imageBase64 = imageBase64;
448
  result.imageUrl = `data:image/png;base64,${imageBase64}`;
449
  }
450
-
451
  return result;
452
  }
453
-
454
  async generateContentWithGrounding(messages: OpenAIMessage[], modelName: string): Promise<string> {
455
  const apiKey = this.getNextApiKey();
456
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;
@@ -481,7 +463,6 @@ class GoogleAIService {
481
  if (candidate.finishReason === "SAFETY") {
482
  throw new Error("Response blocked due to safety filters");
483
  }
484
-
485
  return candidate.content?.parts[0]?.text || "No response generated";
486
  }
487
 
@@ -529,102 +510,62 @@ class OpenAICompatibleServer {
529
  return authHeader ? authHeader.replace("Bearer ", "") === this.authKey : false;
530
  }
531
 
532
- private isDocumentContent(url?: string): boolean {
533
- if (!url) return false;
534
- const lowerUrl = url.toLowerCase();
535
- return lowerUrl.includes('.pdf') || lowerUrl.startsWith('data:application/pdf') ||
536
- lowerUrl.includes('.txt') || lowerUrl.startsWith('data:text/plain') ||
537
- lowerUrl.includes('.md') || lowerUrl.startsWith('data:text/markdown');
538
- }
539
-
540
- private async handleTTS(request: Request): Promise<Response> {
541
  try {
542
- const body: TTSRequest = await request.json();
543
- const { input, voice = "Puck", model } = body;
544
-
545
- // 验证输入
546
- if (!input || input.trim().length === 0) {
547
- return new Response(
548
- JSON.stringify({
549
- error: {
550
- message: "Input text is required",
551
- type: "invalid_request_error",
552
- code: null
553
- }
554
- }),
555
- { status: 400, headers: { "Content-Type": "application/json" } }
556
- );
557
- }
558
 
559
- // 验证声音
560
- const availableVoices = this.googleAI.getAvailableVoices();
561
- const isValidVoice = availableVoices.some(v => v.name === voice);
562
- if (!isValidVoice) {
563
- return new Response(
564
- JSON.stringify({
565
- error: {
566
- message: `Invalid voice "${voice}". Available voices: ${availableVoices.map(v => v.name).join(", ")}`,
567
- type: "invalid_request_error",
568
- code: null
569
- }
570
- }),
571
- { status: 400, headers: { "Content-Type": "application/json" } }
572
- );
573
- }
574
 
575
- console.log(`TTS request: voice=${voice}, text length=${input.length}`);
 
 
 
 
 
576
 
577
- // 生成语音
578
- const audioBase64 = await this.googleAI.generateSpeech(input, voice, model);
 
579
 
580
- // base64 转换为二进制数据
581
- const audioData = Uint8Array.from(atob(audioBase64), c => c.charCodeAt(0));
582
 
583
- return new Response(audioData, {
584
- headers: {
585
- "Content-Type": "audio/mpeg",
586
- "Content-Length": audioData.length.toString()
587
- }
588
  });
589
 
590
  } catch (error) {
591
- console.error("Error in TTS:", error.message);
 
592
  return new Response(
593
  JSON.stringify({
594
  error: {
595
  message: error.message,
596
- type: "api_error",
597
- code: null
598
  }
599
  }),
600
- { status: 500, headers: { "Content-Type": "application/json" } }
601
  );
602
  }
603
  }
604
 
605
- private async handleVoices(): Promise<Response> {
606
- try {
607
- const voices = this.googleAI.getAvailableVoices();
608
- const voicesResponse = {
609
- object: "list",
610
- data: voices.map(voice => ({
611
- id: voice.name,
612
- name: voice.name,
613
- language: voice.language,
614
- gender: voice.gender
615
- }))
616
- };
617
-
618
- return new Response(JSON.stringify(voicesResponse), {
619
- headers: { "Content-Type": "application/json" }
620
- });
621
- } catch (error) {
622
- console.error("Error fetching voices:", error);
623
- return new Response(
624
- JSON.stringify({ error: { message: "Failed to fetch voices." } }),
625
- { status: 500, headers: { "Content-Type": "application/json" } }
626
- );
627
- }
628
  }
629
 
630
  private async handleChatCompletions(request: Request): Promise<Response> {
@@ -645,7 +586,7 @@ class OpenAICompatibleServer {
645
  );
646
 
647
  const hasImages = body.messages.some(msg => Array.isArray(msg.content) && msg.content.some(part => part.type === "image_url"));
648
-
649
  let inputImages: any[] = [];
650
  if (hasImages) {
651
  body.messages.forEach(msg => {
@@ -656,8 +597,9 @@ class OpenAICompatibleServer {
656
  }
657
  });
658
  }
659
-
660
  let responseText: string;
 
661
  // Routing logic based on keywords and content types
662
  if (hasDocument) {
663
  responseText = await this.googleAI.generateContentWithDocument(body.messages, requestedModel);
@@ -701,7 +643,7 @@ class OpenAICompatibleServer {
701
  );
702
  }
703
  }
704
-
705
  private async streamStringAsOpenAIResponse(content: string, modelName: string): Promise<ReadableStream<Uint8Array>> {
706
  const encoder = new TextEncoder();
707
  const streamId = `chatcmpl-${Date.now()}`;
@@ -711,56 +653,62 @@ class OpenAICompatibleServer {
711
  return new ReadableStream({
712
  start(controller) {
713
  const initialChunk = { id: streamId, object: 'chat.completion.chunk', created: creationTime, model: modelName, choices: [{ index: 0, delta: { role: 'assistant', content: '' }, finish_reason: null }] };
714
- controller.enqueue(encoder.encode(`data: ${JSON.stringify(initialChunk)}\\n\\n`));
715
  },
716
  pull(controller) {
717
  if (contentQueue.length === 0) {
718
  const finalChunk = { id: streamId, object: 'chat.completion.chunk', created: creationTime, model: modelName, choices: [{ index: 0, delta: {}, finish_reason: 'stop' }] };
719
- controller.enqueue(encoder.encode(`data: ${JSON.stringify(finalChunk)}\\n\\n`));
720
- controller.enqueue(encoder.encode('data: [DONE]\\n\\n'));
721
  controller.close();
722
  return;
723
  }
724
  const char = contentQueue.shift();
725
  const chunk = { id: streamId, object: 'chat.completion.chunk', created: creationTime, model: modelName, choices: [{ index: 0, delta: { content: char }, finish_reason: null }] };
726
- controller.enqueue(encoder.encode(`data: ${JSON.stringify(chunk)}\\n\\n`));
727
  }
728
  });
729
  }
730
-
731
  private async handleModels(): Promise<Response> {
732
  try {
733
  const googleModels = await this.googleAI.fetchOfficialModels();
734
- const models = {
735
- object: "list",
736
- data: googleModels.map(model => {
737
- const modelId = model.name.replace('models/', '');
738
- return {
739
- id: modelId, object: "model", created: Math.floor(Date.now() / 1000), owned_by: "google",
740
- description: model.description || model.displayName, maxTokens: model.inputTokenLimit || model.maxTokens
741
- };
742
- })
743
- };
 
 
 
 
 
 
 
 
 
 
 
 
 
744
  return new Response(JSON.stringify(models), { headers: { "Content-Type": "application/json" } });
745
  } catch (error) {
746
  console.error("Error fetching models:", error);
747
  return new Response(JSON.stringify({ error: { message: "Failed to fetch models." } }), { status: 500 });
748
  }
749
  }
750
-
751
  private async handleStatus(): Promise<Response> {
752
  const status = {
753
- status: "healthy", timestamp: new Date().toISOString(), version: "2.6.0",
754
  api_keys_loaded: this.googleAI.apiKeys.length,
755
  models_in_cache: this.googleAI.cachedModels.length,
756
- models_last_fetched: this.googleAI.modelsLastFetch > 0 ? new Date(this.googleAI.modelsLastFetch).toISOString() : "never",
757
- features: {
758
- chat_completions: true,
759
- image_generation: true,
760
- document_processing: true,
761
- text_to_speech: true,
762
- voice_list: true
763
- }
764
  };
765
  return new Response(JSON.stringify(status), { headers: { "Content-Type": "application/json" } });
766
  }
@@ -779,19 +727,18 @@ class OpenAICompatibleServer {
779
  const url = new URL(request.url);
780
  let response: Response;
781
 
782
- // Handle routes
783
  if (url.pathname === "/health" || url.pathname === "/status") {
784
  response = await this.handleStatus();
785
  } else if (!this.authenticate(request)) {
786
  response = new Response(JSON.stringify({ error: { message: "Unauthorized" } }), { status: 401 });
 
 
 
787
  } else if (url.pathname === "/v1/chat/completions" && request.method === "POST") {
788
  response = await this.handleChatCompletions(request);
789
  } else if (url.pathname === "/v1/models" && request.method === "GET") {
790
  response = await this.handleModels();
791
- } else if (url.pathname === "/v1/audio/speech" && request.method === "POST") {
792
- response = await this.handleTTS(request);
793
- } else if (url.pathname === "/v1/voices" && request.method === "GET") {
794
- response = await this.handleVoices();
795
  } else {
796
  response = new Response("Not Found", { status: 404 });
797
  }
@@ -808,7 +755,8 @@ class OpenAICompatibleServer {
808
 
809
  // --- 服务器启动 ---
810
  const server = new OpenAICompatibleServer();
811
- console.log("🚀 OpenAI Compatible Server with Google AI and TTS starting on port 7860...");
 
812
  console.log(`✅ Loaded ${server.googleAI.apiKeys.length} API key(s).`);
813
  console.log(`📄 Max document size set to ${MAX_DOCUMENT_SIZE_MB}MB.`);
814
 
@@ -819,17 +767,13 @@ server.googleAI.fetchOfficialModels().then(models => {
819
  console.warn(`⚠️ Could not pre-fetch models: ${error.message}. Will use fallbacks or fetch on first request.`);
820
  });
821
 
822
- console.log("\\n🔗 Endpoints:");
823
  console.log(" POST /v1/chat/completions");
 
824
  console.log(" GET /v1/models");
825
- console.log(" POST /v1/audio/speech (TTS)");
826
- console.log(" GET /v1/voices");
827
  console.log(" GET /status");
828
 
829
- const voices = server.googleAI.getAvailableVoices();
830
- console.log(`\\n🎤 Available TTS voices: ${voices.map(v => v.name).join(", ")}`);
831
-
832
  await serve(
833
  (request: Request) => server.handleRequest(request),
834
  { port: 7860 }
835
- );
 
1
  import { serve } from "https://deno.land/std@0.208.0/http/server.ts";
2
+ // [新增] 导入 base64 解码库,用于处理音频数据
3
+ import { decode } from "https://deno.land/std@0.208.0/encoding/base64.ts";
4
 
5
  // --- 常量定义 ---
6
  const MAX_DOCUMENT_SIZE_MB = 20; // 设置最大文档大小限制(单位:MB)
7
  const MAX_DOCUMENT_SIZE_BYTES = MAX_DOCUMENT_SIZE_MB * 1024 * 1024;
8
  const MODELS_CACHE_DURATION = 60000; // 1分钟模型缓存
9
 
10
+ // --- 接口定义 ---
 
 
 
 
 
 
 
 
 
 
11
  interface OpenAIMessage {
12
  role: "system" | "user" | "assistant";
13
  content: string | Array<{
 
26
  stream?: boolean;
27
  }
28
 
29
/**
 * Request body parsed by the OpenAI-compatible text-to-speech handler.
 */
interface OpenAITTSRequest {
  /** Model identifier supplied by the client. */
  model: string;
  /** Text to convert to speech. */
  input: string;
  /**
   * Voice name: one of the listed OpenAI voice names or any other string.
   * `string & {}` (rather than plain `string`) keeps the literal members
   * from being absorbed, so the union stays visible to tooling.
   */
  voice: 'alloy' | 'echo' | 'fable' | 'onyx' | 'shimmer' | 'nova' | (string & {});
}
35
 
36
  class GoogleAIService {
 
88
  console.log(`Fetched ${this.cachedModels.length} models from Google AI`);
89
  return this.cachedModels;
90
  }
 
91
  return this.getFallbackModels();
92
  } catch (error) {
93
  console.warn("Error fetching models from Google AI:", error.message, ". Using fallback models.");
 
99
  return [
100
  { name: "models/gemini-1.5-pro", displayName: "Gemini 1.5 Pro", description: "Mid-size multimodal model that supports up to 1 million tokens, images, and documents (PDF, TXT, MD)", supportedGenerationMethods: ["generateContent"], maxTokens: 1000000, supportsDocuments: true },
101
  { name: "models/gemini-1.5-flash", displayName: "Gemini 1.5 Flash", description: "Fast and versatile multimodal model for diverse tasks, supports images and documents (PDF, TXT, MD)", supportedGenerationMethods: ["generateContent"], maxTokens: 1000000, supportsDocuments: true },
102
+ { name: "models/gemini-2.0-flash-preview-image-generation", displayName: "Gemini 2.0 Flash Image Generation", description: "Advanced model for generating and editing high-quality images with text and image outputs", supportedGenerationMethods: ["generateContent"], maxTokens: 100000, capabilities: ["text", "image_generation", "image_editing"] },
103
+ // [新增] 添加TTS模型到回退列表,确保/v1/models能看到它
104
+ { name: "models/gemini-2.5-flash-preview-tts", displayName: "Gemini 2.5 Flash TTS", description: "Advanced model for generating high-quality speech from text.", supportedGenerationMethods: ["generateContent"] },
105
  ];
106
  }
107
 
108
+ // --- 模型能力判断辅助函数 ---
109
  public isVisionModel = (modelName: string): boolean => modelName.toLowerCase().includes('vision') || modelName.toLowerCase().includes('pro');
110
  public isImageGenerationModel = (modelName: string): boolean => modelName.includes('image-generation') || modelName === 'gemini-2.0-flash-preview-image-generation';
111
  public isImageEditingModel = (modelName: string): boolean => modelName.includes('image-generation') || modelName === 'gemini-2.0-flash-preview-image-generation';
112
  public isDocumentModel = (modelName: string): boolean => modelName.toLowerCase().includes('gemini-1.5') || modelName.toLowerCase().includes('pro') || modelName.toLowerCase().includes('flash');
113
// Reports whether the model name contains "tts", ignoring letter case.
public isTTSModel = (modelName: string): boolean => {
  const normalizedName = modelName.toLowerCase();
  return normalizedName.indexOf('tts') !== -1;
};
115
+
116
+
117
/**
 * Calls the Gemini `generateContent` endpoint with audio output enabled and
 * returns the base64 audio payload from the response.
 *
 * @param text Text to convert to speech.
 * @param modelName TTS model name, with or without the `models/` prefix.
 * @param voiceName Prebuilt voice name forwarded as `prebuiltVoiceConfig.voiceName`.
 * @returns The base64 string found in the first `inlineData.data` part of the
 *   first candidate. The request asks for MP3, but the actual encoding is
 *   whatever the API returns — NOTE(review): confirm against the API docs.
 * @throws Error when the API responds with a non-2xx status or the response
 *   contains no inline audio data.
 */
async generateSpeech(text: string, modelName: string, voiceName: string): Promise<string> {
  const apiKey = this.getNextApiKey();
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;

  console.log(`Generating speech with model: ${fullModelName}, voice: ${voiceName}`);

  const requestBody = {
    contents: [{
      parts: [{ "text": text }]
    }],
    generationConfig: {
      responseModalities: ["AUDIO"],
      speechConfig: {
        // NOTE(review): `outputAudioEncoding` is kept from the original request;
        // verify that the Gemini speechConfig schema accepts this field and
        // that the returned audio really is MP3.
        outputAudioEncoding: "MP3",
        voiceConfig: {
          prebuiltVoiceConfig: {
            voiceName: voiceName
          }
        }
      }
    },
    model: fullModelName,
  };

  const response = await fetch(
    `https://generativelanguage.googleapis.com/v1beta/${fullModelName}:generateContent?key=${apiKey}`,
    {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify(requestBody),
    }
  );

  if (!response.ok) {
    // A response body can be consumed only once: read it as text, then try to
    // interpret it as JSON. (Calling text() after a failed json() would reject.)
    const rawError = await response.text();
    let errorMessage = rawError;
    try {
      const parsed: unknown = JSON.parse(rawError);
      const apiMessage = (parsed as { error?: { message?: unknown } } | null)?.error?.message;
      errorMessage = typeof apiMessage === "string" && apiMessage.length > 0
        ? apiMessage
        : JSON.stringify(parsed);
    } catch {
      // Body was not JSON; keep the raw text as the message.
    }
    console.error(`Google TTS API Error: ${response.status} - ${errorMessage}`);
    throw new Error(`Google TTS API request failed with status ${response.status}: ${errorMessage}`);
  }

  const data = await response.json();
  const candidateParts = data.candidates?.[0]?.content?.parts;
  const parts: Array<{ inlineData?: { data?: string } }> = Array.isArray(candidateParts) ? candidateParts : [];
  // Take the first part that actually carries inline data instead of assuming
  // it is always at index 0.
  const audioData = parts.find(part => part?.inlineData?.data)?.inlineData?.data;

  if (!audioData) {
    console.error("Invalid TTS response from Google AI:", JSON.stringify(data));
    throw new Error("No audio data received from Google AI TTS service.");
  }

  return audioData;
}
175
 
176
  private getDocumentType(url: string): string {
177
  const lowerUrl = url.toLowerCase();
 
183
  return 'unknown';
184
  }
185
 
 
 
 
186
  private extractDocumentData(documentUrl: string): { mimeType: string; data: string; text?: string; docType: string } {
187
  const docType = this.getDocumentType(documentUrl);
188
+
189
  if (!documentUrl.startsWith("data:")) {
190
  if (documentUrl.startsWith("http")) {
191
  throw new Error("Document URL downloads are not supported. Please provide base64 encoded data URLs.");
192
  }
 
 
193
  throw new Error("Document must be provided as a standard base64 data URL (e.g., 'data:application/pdf;base64,...').");
194
  }
195
 
 
197
  if (parts.length !== 2) {
198
  throw new Error("Invalid data URL format for document. Expected 'data:[mime];base64,[data]'.");
199
  }
 
200
  const [mimeInfo, base64Data] = parts;
201
+
 
202
  const approxSizeInBytes = base64Data.length * 0.75;
203
  if (approxSizeInBytes > MAX_DOCUMENT_SIZE_BYTES) {
204
  throw new Error(`Document size (${(approxSizeInBytes / 1024 / 1024).toFixed(2)}MB) exceeds the ${MAX_DOCUMENT_SIZE_MB}MB limit.`);
205
  }
206
 
207
  const mimeType = mimeInfo.split(":")[1]?.split(";")[0] || 'application/octet-stream';
208
+
209
  if (docType === 'txt' || docType === 'md') {
210
  try {
211
  const textContent = atob(base64Data);
 
215
  throw new Error(`Invalid base64 encoding for ${docType} document.`);
216
  }
217
  }
218
+
 
219
  const finalMimeType = docType === 'pdf' ? 'application/pdf' : mimeType;
220
  return { mimeType: finalMimeType, data: base64Data, docType };
221
  }
222
+
223
  private extractImageData(imageUrl: string): { mimeType: string; data: string } {
224
  if (imageUrl.startsWith("data:image/")) {
225
  const [mimeInfo, base64Data] = imageUrl.split(",");
 
236
  const apiKey = this.getNextApiKey();
237
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;
238
  const documentModel = this.isDocumentModel(fullModelName) ? fullModelName : 'models/gemini-1.5-pro-latest';
239
+
240
  console.log(`Processing document with model: ${documentModel}`);
241
 
242
  let contents;
 
248
 
249
  const messageParts = msg.content.map(part => {
250
  if (part.type === "text") return { text: part.text };
251
+
252
  if (part.type === "image_url" && part.image_url) {
253
  const { mimeType, data } = this.extractImageData(part.image_url.url);
254
  return { inlineData: { mimeType, data } };
255
  }
256
+
257
  if (part.type === "document" && part.document) {
258
  const docData = this.extractDocumentData(part.document.url);
259
  console.log(`Processing document: ${docData.docType}, mime: ${docData.mimeType}, size: ${(docData.data.length * 0.75 / 1024).toFixed(2)} KB`);
260
 
261
  if (docData.docType === 'txt' || docData.docType === 'md') {
262
+ const prefix = docData.docType === 'md' ? 'Markdown document content:\n' : 'Text document content:\n';
263
  return { text: `${prefix}${docData.text}` };
264
  }
 
265
  if (docData.docType === 'pdf') {
266
  return { inlineData: { mimeType: docData.mimeType, data: docData.data } };
267
  }
 
268
  return { text: `[Document type '${docData.docType}' is not supported for direct processing. Please convert to PDF, TXT, or MD.]` };
269
  }
270
  return { text: "" };
271
  });
 
272
  return { role: msg.role === "assistant" ? "model" : "user", parts: messageParts.filter(p => p.text || p.inlineData) };
273
  });
274
  } catch (error) {
 
312
  if (candidate.finishReason === "SAFETY") {
313
  throw new Error("Response blocked due to safety filters. Check content for sensitive topics.");
314
  }
 
315
  if (candidate.finishReason === "RECITATION") {
316
  throw new Error("Response blocked due to recitation policy. The model's output was too similar to a copyrighted source.");
317
  }
318
 
319
  return candidate.content?.parts[0]?.text || "Document processed, but no text response was generated.";
320
  }
321
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
322
  async generateContent(messages: OpenAIMessage[], modelName: string, enableSearch: boolean = false): Promise<string> {
323
  const hasDocument = messages.some(msg => Array.isArray(msg.content) && msg.content.some(part => part.type === "document"));
324
  if (hasDocument) {
 
327
 
328
  const apiKey = this.getNextApiKey();
329
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;
330
+
331
  const contents = messages.map(msg => {
332
  if (typeof msg.content === "string") {
333
  return { role: msg.role === "assistant" ? "model" : "user", parts: [{ text: msg.content }] };
 
354
  contents,
355
  generationConfig: { temperature: 0.7, maxOutputTokens: 4096 }
356
  };
 
357
  if (enableSearch) {
358
  requestBody.tools = [{ googleSearchRetrieval: {} }];
359
  }
 
367
  const errorText = await response.text();
368
  throw new Error(`Google AI API error: ${response.status} - ${errorText}`);
369
  }
 
370
  const data = await response.json();
371
  if (!data.candidates || data.candidates.length === 0) {
372
  throw new Error("No response generated from Google AI");
373
  }
 
374
  const candidate = data.candidates[0];
375
  if (candidate.finishReason === "SAFETY") {
376
  throw new Error("Response blocked due to safety filters");
377
  }
 
378
  return candidate.content?.parts[0]?.text || "No response generated";
379
  }
380
 
 
404
  const errorText = await response.text();
405
  throw new Error(`Image ${inputImage ? 'editing' : 'generation'} failed: ${response.status} - ${errorText}`);
406
  }
 
407
  const data = await response.json();
408
  if (!data.candidates || data.candidates.length === 0) {
409
  throw new Error(`No ${inputImage ? 'edited' : 'generated'} image returned`);
 
430
  result.imageBase64 = imageBase64;
431
  result.imageUrl = `data:image/png;base64,${imageBase64}`;
432
  }
 
433
  return result;
434
  }
435
+
436
  async generateContentWithGrounding(messages: OpenAIMessage[], modelName: string): Promise<string> {
437
  const apiKey = this.getNextApiKey();
438
  const fullModelName = modelName.startsWith('models/') ? modelName : `models/${modelName}`;
 
463
  if (candidate.finishReason === "SAFETY") {
464
  throw new Error("Response blocked due to safety filters");
465
  }
 
466
  return candidate.content?.parts[0]?.text || "No response generated";
467
  }
468
 
 
510
  return authHeader ? authHeader.replace("Bearer ", "") === this.authKey : false;
511
  }
512
 
513
+ /**
514
+ * [新增] 处理 TTS 请求的句柄
515
+ * @param request - HTTP 请求对象
516
+ * @returns 返回包含 MP3 音频数据的响应
517
+ */
518
+ private async handleAudioSpeech(request: Request): Promise<Response> {
 
 
 
519
  try {
520
+ const body: OpenAITTSRequest = await request.json();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
521
 
522
+ // 模型映射: 将 OpenAI 标准模型名映射到 Gemini 模型名
523
+ const modelMap: { [key: string]: string } = {
524
+ 'tts-1': 'gemini-2.5-flash-preview-tts',
525
+ 'tts-1-hd': 'gemini-2.5-flash-preview-tts', // HD 也暂时映射到同一个
526
+ };
527
+ const geminiModel = modelMap[body.model] || (this.googleAI.isTTSModel(body.model) ? body.model : 'gemini-2.5-flash-preview-tts');
 
 
 
 
 
 
 
 
 
528
 
529
+ // 语音映射: OpenAI 标准语音名映射到 Gemini 语音名
530
+ const voiceMap: { [key: string]: string } = {
531
+ 'alloy': 'Krew', 'echo': 'Kore', 'fable': 'Chiron',
532
+ 'onyx': 'Calypso', 'nova': 'Cria', 'shimmer': 'Estrella',
533
+ };
534
+ const geminiVoice = voiceMap[body.voice] || 'Kore'; // 默认使用 Kore
535
 
536
+ if (!body.input) {
537
+ throw new Error("The 'input' field is required for TTS requests.");
538
+ }
539
 
540
+ const audioBase64 = await this.googleAI.generateSpeech(body.input, geminiModel, geminiVoice);
541
+ const audioBytes = decode(audioBase64);
542
 
543
+ return new Response(audioBytes, {
544
+ headers: { "Content-Type": "audio/mpeg" }
 
 
 
545
  });
546
 
547
  } catch (error) {
548
+ console.error("Error in audio speech generation:", error.message);
549
+ const status = error.message.includes("required") ? 400 : 500;
550
  return new Response(
551
  JSON.stringify({
552
  error: {
553
  message: error.message,
554
+ type: status === 400 ? "invalid_request_error" : "api_error",
555
+ code: "tts_failed"
556
  }
557
  }),
558
+ { status, headers: { "Content-Type": "application/json" } }
559
  );
560
  }
561
  }
562
 
563
+ private isDocumentContent(url?: string): boolean {
564
+ if (!url) return false;
565
+ const lowerUrl = url.toLowerCase();
566
+ return lowerUrl.includes('.pdf') || lowerUrl.startsWith('data:application/pdf') ||
567
+ lowerUrl.includes('.txt') || lowerUrl.startsWith('data:text/plain') ||
568
+ lowerUrl.includes('.md') || lowerUrl.startsWith('data:text/markdown');
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
569
  }
570
 
571
  private async handleChatCompletions(request: Request): Promise<Response> {
 
586
  );
587
 
588
  const hasImages = body.messages.some(msg => Array.isArray(msg.content) && msg.content.some(part => part.type === "image_url"));
589
+
590
  let inputImages: any[] = [];
591
  if (hasImages) {
592
  body.messages.forEach(msg => {
 
597
  }
598
  });
599
  }
600
+
601
  let responseText: string;
602
+
603
  // Routing logic based on keywords and content types
604
  if (hasDocument) {
605
  responseText = await this.googleAI.generateContentWithDocument(body.messages, requestedModel);
 
643
  );
644
  }
645
  }
646
+
647
  private async streamStringAsOpenAIResponse(content: string, modelName: string): Promise<ReadableStream<Uint8Array>> {
648
  const encoder = new TextEncoder();
649
  const streamId = `chatcmpl-${Date.now()}`;
 
653
  return new ReadableStream({
654
  start(controller) {
655
  const initialChunk = { id: streamId, object: 'chat.completion.chunk', created: creationTime, model: modelName, choices: [{ index: 0, delta: { role: 'assistant', content: '' }, finish_reason: null }] };
656
+ controller.enqueue(encoder.encode(`data: ${JSON.stringify(initialChunk)}\n\n`));
657
  },
658
  pull(controller) {
659
  if (contentQueue.length === 0) {
660
  const finalChunk = { id: streamId, object: 'chat.completion.chunk', created: creationTime, model: modelName, choices: [{ index: 0, delta: {}, finish_reason: 'stop' }] };
661
+ controller.enqueue(encoder.encode(`data: ${JSON.stringify(finalChunk)}\n\n`));
662
+ controller.enqueue(encoder.encode('data: [DONE]\n\n'));
663
  controller.close();
664
  return;
665
  }
666
  const char = contentQueue.shift();
667
  const chunk = { id: streamId, object: 'chat.completion.chunk', created: creationTime, model: modelName, choices: [{ index: 0, delta: { content: char }, finish_reason: null }] };
668
+ controller.enqueue(encoder.encode(`data: ${JSON.stringify(chunk)}\n\n`));
669
  }
670
  });
671
  }
672
+
673
  private async handleModels(): Promise<Response> {
674
  try {
675
  const googleModels = await this.googleAI.fetchOfficialModels();
676
+ const openAIFormattedModels = googleModels.map(model => {
677
+ const modelId = model.name.replace('models/', '');
678
+ return {
679
+ id: modelId,
680
+ object: "model",
681
+ created: Math.floor(Date.now() / 1000),
682
+ owned_by: "google",
683
+ description: model.description || model.displayName,
684
+ maxTokens: model.inputTokenLimit || model.maxTokens
685
+ };
686
+ });
687
+
688
+ // 确保TTS模型以OpenAI兼容的名称存在
689
+ if (openAIFormattedModels.some(m => this.googleAI.isTTSModel(m.id))) {
690
+ if (!openAIFormattedModels.some(m => m.id === 'tts-1')) {
691
+ openAIFormattedModels.push({
692
+ id: 'tts-1', object: "model", created: Math.floor(Date.now() / 1000), owned_by: "google",
693
+ description: "Text-to-speech model, mapped to gemini-2.5-flash-preview-tts", maxTokens: 4096
694
+ });
695
+ }
696
+ }
697
+
698
+ const models = { object: "list", data: openAIFormattedModels };
699
  return new Response(JSON.stringify(models), { headers: { "Content-Type": "application/json" } });
700
  } catch (error) {
701
  console.error("Error fetching models:", error);
702
  return new Response(JSON.stringify({ error: { message: "Failed to fetch models." } }), { status: 500 });
703
  }
704
  }
705
+
706
  private async handleStatus(): Promise<Response> {
707
  const status = {
708
+ status: "healthy", timestamp: new Date().toISOString(), version: "2.5.0",
709
  api_keys_loaded: this.googleAI.apiKeys.length,
710
  models_in_cache: this.googleAI.cachedModels.length,
711
+ models_last_fetched: this.googleAI.modelsLastFetch > 0 ? new Date(this.googleAI.modelsLastFetch).toISOString() : "never"
 
 
 
 
 
 
 
712
  };
713
  return new Response(JSON.stringify(status), { headers: { "Content-Type": "application/json" } });
714
  }
 
727
  const url = new URL(request.url);
728
  let response: Response;
729
 
730
+ // --- [更新] 路由处理 ---
731
  if (url.pathname === "/health" || url.pathname === "/status") {
732
  response = await this.handleStatus();
733
  } else if (!this.authenticate(request)) {
734
  response = new Response(JSON.stringify({ error: { message: "Unauthorized" } }), { status: 401 });
735
+ } else if (url.pathname === "/v1/audio/speech" && request.method === "POST") {
736
+ // [新增] 路由到 TTS 处理器
737
+ response = await this.handleAudioSpeech(request);
738
  } else if (url.pathname === "/v1/chat/completions" && request.method === "POST") {
739
  response = await this.handleChatCompletions(request);
740
  } else if (url.pathname === "/v1/models" && request.method === "GET") {
741
  response = await this.handleModels();
 
 
 
 
742
  } else {
743
  response = new Response("Not Found", { status: 404 });
744
  }
 
755
 
756
  // --- 服务器启动 ---
757
  const server = new OpenAICompatibleServer();
758
+
759
+ console.log("🚀 OpenAI Compatible Server with Google AI starting on port 8000...");
760
  console.log(`✅ Loaded ${server.googleAI.apiKeys.length} API key(s).`);
761
  console.log(`📄 Max document size set to ${MAX_DOCUMENT_SIZE_MB}MB.`);
762
 
 
767
  console.warn(`⚠️ Could not pre-fetch models: ${error.message}. Will use fallbacks or fetch on first request.`);
768
  });
769
 
770
+ console.log("\n🔗 Endpoints:");
771
  console.log(" POST /v1/chat/completions");
772
+ console.log(" POST /v1/audio/speech"); // [新增]
773
  console.log(" GET /v1/models");
 
 
774
  console.log(" GET /status");
775
 
 
 
 
776
  await serve(
777
  (request: Request) => server.handleRequest(request),
778
  { port: 7860 }
779
+ );