File size: 3,146 Bytes
993bb20
 
 
1f1d266
 
0603011
 
 
1f1d266
993bb20
1f1d266
 
 
 
 
993bb20
 
1f1d266
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
993bb20
 
1f1d266
 
 
 
 
 
 
 
 
 
 
993bb20
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
import numpy as np
import io
import wave
import os
import requests

class AudioHandler:
    """Bridge between the app and external speech services.

    Text-to-speech is delegated to an HTTP backend whose URL is read from the
    ``KANITTS_URL`` environment variable. Speech-to-text is a placeholder
    that always returns ``None``.
    """

    def __init__(self):
        """Read the TTS endpoint from the environment and log the result."""
        # Sample rate will depend on upstream TTS; default is safe for playback.
        # NOTE: the methods in this class do not read this attribute; the rate
        # returned by text_to_speech comes from the WAV header of the response.
        self.sample_rate = 22050
        # Optional: URL of your external TTS Space or vLLM-style OpenAI server.
        # For your case, set KANITTS_URL in the my-voice-agent Space settings to point at:
        #   https://jblast94-KaniTTS.hf.space (or the Space's /proxy or API endpoint)
        self.kanitts_url = os.getenv("KANITTS_URL")
        print(f"AudioHandler initialized. KaniTTS URL: {self.kanitts_url}")

    @staticmethod
    def _decode_wav(audio_bytes):
        """Decode in-memory WAV bytes to ``(sample_rate, mono float32 samples)``.

        Supports 8-bit (unsigned), 16-bit, 24-bit and 32-bit integer PCM.
        Samples are scaled to the range [-1.0, 1.0) and multi-channel audio is
        averaged down to a single channel.

        Raises:
            ValueError: if the sample width is not 1, 2, 3 or 4 bytes.
            wave.Error / EOFError: if the bytes are not a readable WAV file.
        """
        import sys  # local import: only needed for the 24-bit byte-order check

        with wave.open(io.BytesIO(audio_bytes), "rb") as wf:
            sr = wf.getframerate()
            n_channels = wf.getnchannels()
            sample_width = wf.getsampwidth()
            frames = wf.readframes(wf.getnframes())

        # A truncated response can end mid-frame; drop the incomplete tail so
        # the buffer divides evenly into samples and channels.
        frame_size = sample_width * n_channels
        remainder = len(frames) % frame_size
        if remainder:
            frames = frames[: len(frames) - remainder]

        # wave.readframes() returns multi-byte samples in native byte order,
        # so native NumPy integer dtypes are the right view for 16/32-bit.
        if sample_width == 1:
            # 8-bit WAV PCM is unsigned with its zero level at 128.
            samples = np.frombuffer(frames, dtype=np.uint8).astype(np.float32)
            samples = (samples - 128.0) / 128.0
        elif sample_width == 2:
            samples = np.frombuffer(frames, dtype=np.int16).astype(np.float32) / 32768.0
        elif sample_width == 3:
            # NumPy has no 24-bit integer type: assemble each sample from its
            # three bytes, then sign-extend into int32.
            triplets = np.frombuffer(frames, dtype=np.uint8).reshape(-1, 3).astype(np.int32)
            if sys.byteorder == "big":
                triplets = triplets[:, ::-1]
            ints = triplets[:, 0] | (triplets[:, 1] << 8) | (triplets[:, 2] << 16)
            ints = np.where(ints >= 0x800000, ints - 0x1000000, ints)
            samples = ints.astype(np.float32) / 8388608.0
        elif sample_width == 4:
            samples = np.frombuffer(frames, dtype=np.int32).astype(np.float32) / 2147483648.0
        else:
            raise ValueError(f"Unsupported WAV sample width: {sample_width} bytes")

        if n_channels > 1:
            # Average in float so the downmix does not truncate to integers.
            samples = samples.reshape(-1, n_channels).mean(axis=1)

        return (sr, samples.astype(np.float32, copy=False))

    def text_to_speech(self, text: str):
        """
        Convert text to speech using an external TTS backend.

        Recommended setups:
        - Use your KaniTTS Space:
          - Expose an HTTP endpoint there (e.g. /tts) that returns raw WAV bytes.
          - Set KANITTS_URL in this Space to that endpoint URL.
        - OR use a vLLM/OpenAI-compatible TTS server:
          - Point KANITTS_URL to its TTS endpoint.

        The request is a POST with JSON body ``{"text": text}``; the response
        body must be a PCM WAV file (other containers such as OGG are not
        decoded here).

        Args:
            text: The text to synthesize. Empty or whitespace-only text is
                skipped.

        Returns:
            A tuple (sample_rate, np.ndarray) suitable for Gradio's Audio component
            (mono float32 samples in [-1.0, 1.0)), or None if TTS is not
            configured or fails. Failures are logged with print, never raised.
        """
        if not text or not text.strip():
            return None

        if not self.kanitts_url:
            print("KANITTS_URL is not set; skipping TTS.")
            return None

        # Best-effort boundary: any failure is logged and turned into None so
        # the rest of the app keeps working without audio.
        try:
            # Example: POST JSON; adjust if your KaniTTS/vLLM API differs.
            resp = requests.post(
                self.kanitts_url,
                json={"text": text},
                timeout=30,
            )
            resp.raise_for_status()
            audio_bytes = resp.content
        except Exception as e:
            print(f"Error during TTS request to {self.kanitts_url}: {e}")
            return None

        if not audio_bytes:
            print("KaniTTS/vLLM TTS returned empty audio.")
            return None

        # Decoding is reported separately so a bad payload is not mistaken
        # for a network failure.
        try:
            return self._decode_wav(audio_bytes)
        except Exception as e:
            print(f"Error decoding TTS audio from {self.kanitts_url}: {e}")
            return None

    def speech_to_text(self, audio_filepath):
        """
        Placeholder STT.

        Options:
        - Connect to OpenAI Whisper / local STT / another Space and call it here.
        - For now, returns None so the rest of the app still works.

        Args:
            audio_filepath: Path of the recorded audio; only logged.

        Returns:
            Always None (no transcription backend is configured).
        """
        if not audio_filepath:
            return None

        print(f"Speech-to-text not configured. Received audio file: '{audio_filepath}'")
        return None