audio-stream / app.py
Xenobd's picture
Create app.py
12a75f1 verified
import struct

import numpy as np
import torch
from flask import Flask, Response, request, stream_with_context
from pocket_tts import TTSModel
# Flask application object; the route handlers in this module register on it.
app = Flask(__name__)

# The TTS model is loaded once, at import time, and kept as a module global.
print("Loading TTS Model...")
model = TTSModel.load_model()

# Audio prompt from which the module-wide voice state is derived.
_VOICE_PROMPT = "hf://kyutai/tts-voices/alba-mackenna/casual.wav"
voice_state = model.get_state_for_audio_prompt(_VOICE_PROMPT)
def generate_wav_header(sample_rate, *, num_channels=1, bits_per_sample=16,
                        data_size=0):
    """Build the 44-byte RIFF/WAVE header for uncompressed PCM audio.

    With the default keyword arguments the result describes 16-bit mono PCM
    with a zero-length payload, which is the placeholder this module sends
    ahead of a stream whose total length is not known in advance.

    Args:
        sample_rate: Samples per second, per channel.
        num_channels: Number of interleaved channels (default 1, mono).
        bits_per_sample: Bit depth of one sample; must be a positive
            multiple of 8 (default 16).
        data_size: Size in bytes of the PCM payload that follows the header.
            Pass the real size when it is known; 0xFFFFFFFF is another
            widely used "unknown length" marker.

    Returns:
        The header as ``bytes`` (always 44 bytes long).

    Raises:
        ValueError: If ``num_channels`` is below 1, ``bits_per_sample`` is
            not a positive multiple of 8, or ``data_size`` does not fit in
            an unsigned 32-bit field.
    """
    if num_channels < 1:
        raise ValueError(f"num_channels must be >= 1, got {num_channels}")
    if bits_per_sample < 8 or bits_per_sample % 8:
        raise ValueError(
            f"bits_per_sample must be a positive multiple of 8, "
            f"got {bits_per_sample}"
        )
    if not 0 <= data_size <= 0xFFFFFFFF:
        raise ValueError(f"data_size must fit in 32 bits, got {data_size}")

    # Bytes occupied by one sample frame (one sample for every channel).
    block_align = num_channels * bits_per_sample // 8
    byte_rate = sample_rate * block_align
    # The RIFF chunk size counts the 36 header bytes after the size field plus
    # the payload; clamp it so an "unknown" 0xFFFFFFFF payload cannot overflow.
    riff_size = min(data_size + 36, 0xFFFFFFFF)

    return struct.pack(
        "<4sI4s4sIHHIIHH4sI",
        b"RIFF",
        riff_size,
        b"WAVE",
        b"fmt ",
        16,  # size of the fmt sub-chunk for plain PCM
        1,  # audio format tag: 1 = PCM
        num_channels,
        sample_rate,
        byte_rate,
        block_align,
        bits_per_sample,
        b"data",
        data_size,
    )
@app.route('/stream')
def stream_audio():
    """Stream synthesized speech for the ``text`` query parameter as WAV.

    The response body is produced lazily: a WAV header first, then one block
    of 16-bit PCM bytes for each chunk the model yields. When the ``text``
    parameter is absent a built-in sample sentence is spoken instead.

    Returns:
        A streaming ``Response`` with mimetype ``audio/wav``.
    """
    text = request.args.get('text', 'Streaming real-time audio with Pocket TTS.')

    def generate():
        """Yield the WAV header followed by PCM chunks from the model."""
        yield generate_wav_header(model.sample_rate)
        try:
            for chunk in model.generate_audio_stream(voice_state, text):
                # Move to CPU regardless of the model's device, then limit the
                # samples to [-1, 1] so the int16 scaling below cannot wrap.
                # NOTE(review): assumes chunk is a float torch tensor - confirm.
                audio_data = chunk.cpu().clamp(-1, 1).numpy()
                yield (audio_data * 32767).astype(np.int16).tobytes()
        except Exception:
            # The header has already been yielded, so the failure cannot be
            # reported to the client as an error status; record the full
            # traceback and let the stream end early.
            app.logger.exception("Error during streaming")

    return Response(
        stream_with_context(generate()),
        mimetype="audio/wav",
        headers={"Content-Disposition": "inline; filename=output.wav"},
    )
if __name__ == "__main__":
    # Development entry point: Flask's built-in server with one thread per
    # request, listening on every interface. For production, run the app
    # under a dedicated WSGI server such as Gunicorn instead.
    app.run(threaded=True, port=7860, host="0.0.0.0")