File size: 3,596 Bytes
c0ddd13
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
/**
 * Resolves the base URL of the voice service.
 *
 * Sources are tried in order and the first truthy value wins:
 *   1. `window.__APP_CONFIG__.VOICE_API_URL`
 *   2. `import.meta.env.VITE_API_BASE_VOICE_URL`
 *      (presumably injected by the bundler at build time — confirm against the build setup)
 *   3. the hard-coded fallback `"http://localhost:7861"`
 *
 * Both lookups are guarded so the function does not throw when `window` is
 * not defined or when `import.meta.env` is absent.
 *
 * @returns The first non-empty base URL found, without any normalisation.
 */
function getVoiceBaseUrl(): string {
  // `typeof` is the only way to probe an undeclared global without a ReferenceError.
  const runtimeConfig =
    typeof window !== "undefined"
      ? (window as unknown as { __APP_CONFIG__?: { VOICE_API_URL?: string } }).__APP_CONFIG__
      : undefined;
  // `env` may be missing entirely when the module is not processed by the bundler.
  const buildEnv = import.meta as unknown as { env?: Record<string, string | undefined> };
  return (
    runtimeConfig?.VOICE_API_URL ||
    buildEnv.env?.VITE_API_BASE_VOICE_URL ||
    "http://localhost:7861"
  );
}
// Voice service base URL, resolved a single time when this module is first evaluated.
const VOICE_BASE_URL = getVoiceBaseUrl();

/**
 * Writes `str` into `view` starting at `offset`, one byte per UTF-16 code unit.
 *
 * Each code unit is stored with `setUint8`, so only its low 8 bits are kept.
 */
function writeString(view: DataView, offset: number, str: string): void {
  // split("") yields UTF-16 code units, matching index-based charCodeAt access.
  str.split("").forEach((unit, position) => {
    view.setUint8(offset + position, unit.charCodeAt(0));
  });
}

/**
 * Wraps raw PCM chunks in a 44-byte RIFF/WAVE header and returns the result as a Blob.
 *
 * The header is hard-coded for PCM (format tag 1), one channel and 16 bits per
 * sample. Chunks are copied back-to-back after the header in array order.
 *
 * @param chunks     Raw audio buffers to concatenate as the `data` chunk payload.
 * @param sampleRate Sample rate written into the `fmt ` chunk.
 * @returns A Blob of type `audio/wav` containing header + payload.
 */
export function createWavBlob(chunks: ArrayBuffer[], sampleRate: number): Blob {
  const HEADER_BYTES = 44;
  const FORMAT_PCM = 1;
  const CHANNELS = 1;
  const BITS_PER_SAMPLE = 16;
  const BLOCK_ALIGN = 2; // bytes per sample frame (mono, 16-bit)

  let dataBytes = 0;
  for (const piece of chunks) {
    dataBytes += piece.byteLength;
  }

  const wav = new ArrayBuffer(HEADER_BYTES + dataBytes);
  const header = new DataView(wav);

  // RIFF container: the size field counts everything after the first 8 bytes.
  writeString(header, 0, "RIFF");
  header.setUint32(4, HEADER_BYTES - 8 + dataBytes, true);
  writeString(header, 8, "WAVE");

  // "fmt " sub-chunk (16-byte body).
  writeString(header, 12, "fmt ");
  header.setUint32(16, 16, true);
  header.setUint16(20, FORMAT_PCM, true);
  header.setUint16(22, CHANNELS, true);
  header.setUint32(24, sampleRate, true);
  header.setUint32(28, sampleRate * BLOCK_ALIGN, true); // byte rate
  header.setUint16(32, BLOCK_ALIGN, true);
  header.setUint16(34, BITS_PER_SAMPLE, true);

  // "data" sub-chunk header, followed by the concatenated payload.
  writeString(header, 36, "data");
  header.setUint32(40, dataBytes, true);

  const bytes = new Uint8Array(wav);
  let cursor = HEADER_BYTES;
  for (const piece of chunks) {
    bytes.set(new Uint8Array(piece), cursor);
    cursor += piece.byteLength;
  }

  return new Blob([wav], { type: "audio/wav" });
}

/**
 * Uploads a WAV recording to the voice service's `/stt` endpoint and returns
 * the parsed JSON response.
 *
 * The audio is sent as multipart form data (`audio` file part named
 * `recording.wav`, plus a `provider` field). The response JSON is returned
 * as-is; its shape is not validated here.
 *
 * @param wavBlob  Audio to transcribe.
 * @param provider Value of the `provider` form field (defaults to `"chirp3"`).
 * @returns The decoded JSON body of the response.
 * @throws Error when the HTTP status is not OK, or when the response
 *         content type is not JSON (message includes the first 200 characters
 *         of the body).
 */
export async function speechToText(
  wavBlob: Blob,
  provider = "chirp3"
): Promise<{ text: string; language: string; duration: number | null }> {
  const payload = new FormData();
  payload.append("audio", wavBlob, "recording.wav");
  payload.append("provider", provider);

  const response = await fetch(`${VOICE_BASE_URL}/stt`, {
    method: "POST",
    body: payload,
  });

  if (!response.ok) {
    throw new Error(`STT error: ${response.status}`);
  }

  const mimeType = response.headers.get("content-type") ?? "";
  const looksLikeJson = mimeType.includes("application/json");
  if (looksLikeJson) {
    const parsed = await response.json();
    return parsed;
  }

  // Surface a short excerpt of the unexpected body to aid debugging.
  const rawBody = await response.text();
  throw new Error(`STT returned non-JSON (${response.status}): ${rawBody.slice(0, 200)}`);
}

/**
 * Requests synthesized speech from the voice service's `/tts` endpoint and
 * returns the response body as a stream, without buffering it.
 *
 * The 120-second abort timer only guards the wait for the response headers.
 * It is cleared as soon as `fetch` settles, so reading the returned stream is
 * not subject to this timeout.
 *
 * @param text     Text to synthesize.
 * @param provider Value of the `provider` field in the JSON request body
 *                 (defaults to `"gemini"`).
 * @returns The sample rate from the `X-Sample-Rate` response header (24000 if
 *          the header is missing or not a positive integer) and the raw
 *          response body stream.
 * @throws Error when the HTTP status is not OK or the response has no body;
 *         rejects with the fetch abort error if no response arrives in time.
 */
export async function textToSpeechStreaming(
  text: string,
  provider = "gemini"
): Promise<{ sampleRate: number; stream: ReadableStream<Uint8Array> }> {
  const DEFAULT_SAMPLE_RATE = 24000;
  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), 120_000);

  let res: Response;
  try {
    res = await fetch(`${VOICE_BASE_URL}/tts`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text, provider }),
      signal: abort.signal,
    });
  } finally {
    // Stop the timer once headers arrive (or the request fails) so it cannot
    // abort a stream the caller is still consuming.
    clearTimeout(timer);
  }

  if (!res.ok) throw new Error(`TTS error: ${res.status}`);
  if (!res.body) throw new Error("TTS response has no body");

  // parseInt yields NaN for an empty or malformed header; never hand NaN to callers.
  const parsedRate = Number.parseInt(res.headers.get("X-Sample-Rate") ?? "", 10);
  const sampleRate =
    Number.isFinite(parsedRate) && parsedRate > 0 ? parsedRate : DEFAULT_SAMPLE_RATE;

  return { sampleRate, stream: res.body };
}

/**
 * Requests synthesized speech from the voice service's `/tts` endpoint and
 * returns the complete audio payload as a single buffer.
 *
 * A 90-second abort timer covers the whole exchange, including the download
 * of the response body, so a stalled transfer is aborted instead of hanging.
 *
 * @param text     Text to synthesize.
 * @param provider Value of the `provider` field in the JSON request body
 *                 (defaults to `"gemini"`).
 * @returns The response body bytes and the sample rate from the
 *          `X-Sample-Rate` response header (24000 if the header is missing or
 *          not a positive integer).
 * @throws Error when the HTTP status is not OK; rejects with the fetch abort
 *         error if the request or body download exceeds the timeout.
 */
export async function textToSpeech(
  text: string,
  provider = "gemini"
): Promise<{ pcm: ArrayBuffer; sampleRate: number }> {
  const DEFAULT_SAMPLE_RATE = 24000;
  const abort = new AbortController();
  const timer = setTimeout(() => abort.abort(), 90_000);

  try {
    const response = await fetch(`${VOICE_BASE_URL}/tts`, {
      method: "POST",
      headers: { "Content-Type": "application/json" },
      body: JSON.stringify({ text, provider }),
      signal: abort.signal,
    });
    if (!response.ok) throw new Error(`TTS error: ${response.status}`);

    // parseInt yields NaN for an empty or malformed header; never hand NaN to callers.
    const parsedRate = Number.parseInt(response.headers.get("X-Sample-Rate") ?? "", 10);
    const sampleRate =
      Number.isFinite(parsedRate) && parsedRate > 0 ? parsedRate : DEFAULT_SAMPLE_RATE;

    // The body is read while the abort timer is still armed.
    const pcm = await response.arrayBuffer();
    return { pcm, sampleRate };
  } finally {
    // Clear only after the body is fully read (or any step failed).
    clearTimeout(timer);
  }
}