# Texttospeech / app.py
# Adedoyinjames's picture
# Update app.py
# 771533b verified
import io
import scipy.io.wavfile as wavf
from fastapi import FastAPI, Response
from pydantic import BaseModel
import gradio as gr
from transformers import pipeline, AutoTokenizer, VitsModel # VitsModel kept for compatibility fallback
import torch
import uvicorn
import numpy as np
import copy # For potential TTS voice embeddings if you add custom inference
# ================================================
# IMPORTANT SETUP NOTES FOR VibeVoice 0.5B (Realtime TTS)
# ================================================
# 1. The VibeVoice-Realtime-0.5B is NOT a standard VITS model.
# It uses a custom Qwen2.5-0.5B + diffusion head + streaming processor.
# 2. To use it properly:
# git clone https://github.com/microsoft/VibeVoice.git
# cd VibeVoice
# pip install -e .[streamingtts]
# 3. The full inference code (including streaming & voice presets) is in:
# demo/realtime_model_inference_from_file.py
# demo/vibevoice_realtime_demo.py
# 4. For now, this script keeps the old VITS-style TTS as a fallback.
# Replace the synthesize_speech() function with the custom logic from the repo
# once installed (see the snippets in the comments below).
# 5. VibeVoice-ASR works out-of-the-box with the standard Transformers pipeline.
# --- Speech recognition (VibeVoice-ASR) ---------------------------------
# Loaded once at import time through the generic Transformers pipeline
# factory; the resulting callable is shared by transcribe_audio().
print("Loading VibeVoice-ASR (for recording classes β†’ instant text)...")
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="microsoft/VibeVoice-ASR",
    trust_remote_code=True,
)
print("βœ… VibeVoice-ASR loaded successfully!")

# --- Text to speech (VibeVoice-Realtime-0.5B), placeholder loading -------
# After installing the VibeVoice package, uncomment and adapt the snippet
# below; it replaces the VITS-style fallback that follows it.
#
# from transformers import AutoModelForCausalLM
# from VibeVoice.tokenizer import VibeVoiceTextTokenizerFast  # custom, from the repo
# from VibeVoice.model import VibeVoiceStreamingProcessor, VibeVoiceStreamingForConditionalGenerationInference
#
# MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
# processor = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)
# model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
#     MODEL_PATH, trust_remote_code=True
# )
# # Example voice preset (Carter, Emma, etc.)
# # voice_embeddings = torch.load("demo/voices/streaming_model/carter.pt", map_location="cpu")

# Checkpoint id used by the fallback loader (the requested 0.5B model).
model_name = "microsoft/VibeVoice-Realtime-0.5B"
print(f"Loading TTS model: {model_name} (note: custom inference required for full realtime streaming)")

# NOTE: per the setup notes, this checkpoint is not a standard VITS model.
# The tokenizer load may fail until the custom tokenizer is installed, and
# the VitsModel load will fail for the 0.5B checkpoint until the custom
# classes above are swapped in.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = VitsModel.from_pretrained(model_name)
print("βœ… TTS model loaded (fallback mode)")
# ================================================
# Core TTS Synthesis Function (0.5B model)
# ================================================
def synthesize_speech(text: str):
    """Turn ``text`` into speech using the module-level fallback TTS model.

    Returns a 3-tuple ``(wav_bytes, sample_rate, audio_data)``:
    the encoded WAV file as bytes, the rate taken from
    ``model.config.sampling_rate``, and the waveform as a NumPy array.

    TODO: Replace this entire function with the official VibeVoice realtime
    inference once the package is installed (see the setup notes at the top
    of the file). Example skeleton from the repo (adapt as needed):

        inputs = processor(text, return_tensors="pt")
        # Add voice preset embeddings here if desired
        with torch.no_grad():
            output = model.generate(
                **inputs,
                tokenizer=processor.tokenizer,
                cfg_scale=1.5,
                ddpm_steps=10,  # lower = faster, higher = better quality
                verbose=False
            )
        # output will contain waveform or latent -> decode to 24kHz audio
    """
    # Current fallback path: tokenize, run a gradient-free forward pass and
    # read the generated waveform off the model output.
    encoded = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**encoded).waveform

    sample_rate = model.config.sampling_rate
    audio_data = waveform.squeeze().numpy()

    # Encode into an in-memory WAV container so callers can return raw bytes.
    buffer = io.BytesIO()
    wavf.write(buffer, sample_rate, audio_data)
    return buffer.getvalue(), sample_rate, audio_data
# ================================================
# Core ASR Function (Record classes β†’ instant text)
# ================================================
def transcribe_audio(audio):
    """Transcribe a recording with the module-level ASR pipeline.

    Args:
        audio: ``None`` when nothing was recorded, otherwise a
            ``(sample_rate, samples)`` pair as delivered by the ``gr.Audio``
            component configured with ``type="numpy"``.

    Returns:
        The recognised text, or ``"No audio recorded."`` when there is no
        audio (``None`` or a zero-length recording).
    """
    if audio is None:
        return "No audio recorded."

    sr, data = audio
    samples = np.asarray(data)
    if samples.size == 0:
        # Nothing to decode; avoid handing an empty array to the model.
        return "No audio recorded."

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM must be rescaled to [-1.0, 1.0]; a bare astype() would
        # keep the raw integer range (e.g. +/-32768 for int16).
        info = np.iinfo(samples.dtype)
        half_range = (int(info.max) - int(info.min) + 1) / 2.0
        # Unsigned PCM is centred on the middle of its range, signed on zero.
        offset = half_range if info.min == 0 else 0.0
        samples = (samples.astype(np.float32) - offset) / half_range
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Down-mix to mono by averaging channels.
        # NOTE(review): assumes a (n_samples, n_channels) layout — confirm
        # against the Gradio version in use.
        samples = samples.mean(axis=1)

    result = asr_pipeline({"sampling_rate": sr, "raw": samples})
    return result["text"]
# ================================================
# FastAPI Setup + API Endpoint (kept exactly as requested)
# ================================================
app = FastAPI(title="VibeVoice 0.5B Realtime TTS + ASR API")


class TTSRequest(BaseModel):
    """JSON request body accepted by ``POST /api/tts``."""

    # Text to be synthesized.
    text: str


@app.post("/api/tts", summary="Generate Speech from Text (0.5B model)")
async def api_tts(request: TTSRequest):
    """
    POST {"text": "your text here"}
    Returns the synthesized speech as a WAV response (``audio/wav``),
    maintaining the original API.
    """
    # Local import keeps the file's top-level import block untouched.
    from fastapi.concurrency import run_in_threadpool

    # Synthesis is blocking, CPU-bound work. Calling it directly inside an
    # ``async def`` endpoint would stall the event loop (and with it every
    # other request, including the mounted UI), so run it in a worker thread.
    wav_bytes, _, _ = await run_in_threadpool(synthesize_speech, request.text)
    return Response(content=wav_bytes, media_type="audio/wav")
# ================================================
# Gradio Interface (now with BOTH features in tabs)
# ================================================
# Two-tab Gradio UI: one tab records/uploads audio and transcribes it, the
# other turns typed text into speech. Event handlers reuse the module-level
# transcribe_audio() and synthesize_speech() functions.
with gr.Blocks(title="VibeVoice 0.5B β€’ Record Classes + TTS", theme=gr.themes.Soft()) as ui:
    gr.Markdown("# ⚑ VibeVoice 0.5B Realtime TTS + ASR\nRecord class lectures β†’ get text instantly. Text β†’ speech in one click.")
    with gr.Tabs():
        # ====================== ASR TAB ======================
        with gr.Tab("🎀 Record Class β†’ Instant Text"):
            gr.Markdown("**Record your class/lecture audio β†’ copy the transcribed text instantly**")
            audio_input = gr.Audio(
                # The label advertises uploading, so enable it alongside the
                # microphone (microphone stays first, i.e. the default).
                sources=["microphone", "upload"],
                type="numpy",
                label="Record Audio (or upload)",
                waveform_options=gr.WaveformOptions(waveform_color="#4F46E5"),
            )
            # Button sizes are spelled "sm"/"md"/"lg"; "large" is not accepted.
            transcribe_btn = gr.Button("πŸ“ Transcribe Now", variant="primary", size="lg")
            text_output = gr.Textbox(label="Transcribed Text", lines=8, show_copy_button=True)
            transcribe_btn.click(
                fn=transcribe_audio,
                inputs=audio_input,
                outputs=text_output,
            )
        # ====================== TTS TAB ======================
        with gr.Tab("πŸ”Š Text β†’ Speech (0.5B Realtime)"):
            gr.Markdown("**Type text β†’ generate natural speech** (uses VibeVoice-Realtime-0.5B)")
            text_input = gr.Textbox(
                label="Enter Text",
                lines=4,
                placeholder="Type or paste your script here...",
                value="Hello, this is a test of Microsoft's new VibeVoice 0.5B real-time TTS.",
            )
            tts_btn = gr.Button("πŸ”Š Generate Speech", variant="primary", size="lg")
            audio_output = gr.Audio(label="Generated Speech", type="numpy")
            tts_btn.click(
                # synthesize_speech returns (wav_bytes, rate, audio); the
                # audio component wants only the (rate, audio) pair.
                fn=lambda t: synthesize_speech(t)[1:3],
                inputs=text_input,
                outputs=audio_output,
            )
    # The endpoint parses a JSON body, so the example must send the JSON
    # Content-Type header; curl's -d alone would send form-encoded data.
    gr.Markdown(
        "**How to use the API**: `curl -X POST http://localhost:7860/api/tts -H 'Content-Type: application/json' -d '{\"text\":\"Hello world\"}' --output speech.wav`\n\n"
        "Full realtime streaming & voice presets available in the official VibeVoice repo."
    )
# Attach the Gradio UI to the FastAPI application at the root path, so the
# web interface and the /api/tts route are served by one ASGI app
# (keeps original behavior).
app = gr.mount_gradio_app(app, ui, path="/")

# Launch the server only when this file is executed as a script.
if __name__ == "__main__":
    uvicorn.run(app=app, host="0.0.0.0", port=7860)