import copy  # For potential TTS voice embeddings if you add custom inference
import io

import gradio as gr
import numpy as np
import scipy.io.wavfile as wavf
import torch
import uvicorn
from fastapi import FastAPI, HTTPException, Response
from pydantic import BaseModel
from transformers import pipeline, AutoTokenizer, VitsModel  # VitsModel kept for compatibility fallback

# ================================================
# IMPORTANT SETUP NOTES FOR VibeVoice 0.5B (Realtime TTS)
# ================================================
# 1. The VibeVoice-Realtime-0.5B is NOT a standard VITS model.
#    It uses a custom Qwen2.5-0.5B + diffusion head + streaming processor.
# 2. To use it properly:
#       git clone https://github.com/microsoft/VibeVoice.git
#       cd VibeVoice
#       pip install -e .[streamingtts]
# 3. The full inference code (including streaming & voice presets) is in:
#       demo/realtime_model_inference_from_file.py
#       demo/vibevoice_realtime_demo.py
# 4. For now, this script keeps the old VITS-style TTS as a fallback.
#    Replace the synthesize_speech() function with the custom logic from the repo
#    once installed (see the snippets in the comments below).
# 5. VibeVoice-ASR works out-of-the-box with the standard Transformers pipeline.

print("Loading VibeVoice-ASR (for recording classes → instant text)...")
asr_pipeline = pipeline(
    "automatic-speech-recognition",
    model="microsoft/VibeVoice-ASR",
    trust_remote_code=True,
)
print("✅ VibeVoice-ASR loaded successfully!")

# TTS Model (0.5B Realtime) - placeholder loading
# Uncomment and adapt the lines below AFTER installing the VibeVoice package:
# from transformers import AutoModelForCausalLM
# from VibeVoice.tokenizer import VibeVoiceTextTokenizerFast  # custom from repo
# from VibeVoice.model import VibeVoiceStreamingProcessor, VibeVoiceStreamingForConditionalGenerationInference
#
# MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
# processor = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)
# model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
#     MODEL_PATH, trust_remote_code=True
# )
# # Example voice preset (Carter, Emma, etc.)
# # voice_embeddings = torch.load("demo/voices/streaming_model/carter.pt", map_location="cpu")

model_name = "microsoft/VibeVoice-Realtime-0.5B"  # ← Changed to your requested 0.5B model
print(f"Loading TTS model: {model_name} (note: custom inference required for full realtime streaming)")
tokenizer = AutoTokenizer.from_pretrained(model_name)  # May fail until custom tokenizer is installed
model = VitsModel.from_pretrained(model_name)  # Will fail for 0.5B until you swap to custom classes
print("✅ TTS model loaded (fallback mode)")


# ================================================
# Core TTS Synthesis Function (0.5B model)
# ================================================
def synthesize_speech(text: str):
    """Synthesize ``text`` to speech with the module-level tokenizer and model.

    TODO: Replace this entire function with the official VibeVoice realtime
    inference once you have installed the package (see notes above).

    Example skeleton from the repo (adapt as needed):

        inputs = processor(text, return_tensors="pt")
        # Add voice preset embeddings here if desired
        with torch.no_grad():
            output = model.generate(
                **inputs,
                tokenizer=processor.tokenizer,
                cfg_scale=1.5,
                ddpm_steps=10,  # lower = faster, higher = better quality
                verbose=False
            )
        # output will contain waveform or latent → decode to 24kHz audio

    Args:
        text: The text to speak. Must contain at least one non-whitespace
            character.

    Returns:
        A ``(wav_bytes, sample_rate, audio_data)`` tuple: the encoded WAV file
        contents, the sample rate read from ``model.config.sampling_rate``,
        and the waveform as a NumPy array.

    Raises:
        ValueError: If ``text`` is empty or only whitespace.
    """
    # Reject blank input up front instead of letting the model fail on an
    # empty token sequence with an opaque error.
    if not text or not text.strip():
        raise ValueError("Text for speech synthesis must not be empty.")

    # Current fallback (will work only until you replace with custom code)
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**inputs).waveform

    # .cpu() is a no-op on CPU and keeps .numpy() valid if the model is ever
    # moved to an accelerator.
    audio_data = waveform.squeeze().cpu().numpy()
    sample_rate = model.config.sampling_rate

    wav_buffer = io.BytesIO()
    wavf.write(wav_buffer, sample_rate, audio_data)
    return wav_buffer.getvalue(), sample_rate, audio_data


def _synthesize_for_ui(text):
    """Gradio adapter: return ``(sample_rate, audio)`` and show input errors in the UI."""
    try:
        _, sample_rate, audio_data = synthesize_speech(text)
    except ValueError as exc:
        raise gr.Error(str(exc)) from exc
    return sample_rate, audio_data


# ================================================
# Core ASR Function (Record classes → instant text)
# ================================================
def transcribe_audio(audio):
    """Transcribe a Gradio recording (classes/lectures) to text.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``.

    Returns:
        The ``"text"`` field of the ASR pipeline result, or
        ``"No audio recorded."`` when there is nothing to transcribe.
    """
    if audio is None:
        return "No audio recorded."

    sample_rate, samples = audio
    samples = np.asarray(samples)
    if samples.size == 0:
        return "No audio recorded."

    if np.issubdtype(samples.dtype, np.integer):
        # Integer PCM must be scaled into [-1, 1]; a bare float cast would
        # leave values in the thousands.
        # NOTE(review): assumes signed PCM (e.g. int16) — confirm if unsigned
        # 8-bit input is ever expected.
        full_scale = float(np.iinfo(samples.dtype).max)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    if samples.ndim > 1:
        # Multi-channel recordings arrive as (samples, channels); average the
        # channels because the ASR pipeline expects single-channel input.
        samples = samples.mean(axis=1)

    result = asr_pipeline({"sampling_rate": sample_rate, "raw": samples})
    return result["text"]


# ================================================
# FastAPI Setup + API Endpoint
# ================================================
app = FastAPI(title="VibeVoice 0.5B Realtime TTS + ASR API")


class TTSRequest(BaseModel):
    """Request body for ``POST /api/tts``."""

    text: str


# Declared with plain ``def`` (not ``async def``) so FastAPI runs the blocking
# model inference in its threadpool instead of stalling the event loop.
@app.post("/api/tts", summary="Generate Speech from Text (0.5B model)")
def api_tts(request: TTSRequest):
    """
    POST {"text": "your text here"}
    Returns downloadable WAV (maintains original API).
    Responds with 422 when the text is empty.
    """
    try:
        wav_bytes, _, _ = synthesize_speech(request.text)
    except ValueError as exc:
        raise HTTPException(status_code=422, detail=str(exc)) from exc
    return Response(content=wav_bytes, media_type="audio/wav")


# ================================================
# Gradio Interface (now with BOTH features in tabs)
# ================================================
with gr.Blocks(title="VibeVoice 0.5B • Record Classes + TTS", theme=gr.themes.Soft()) as ui:
    gr.Markdown("# ⚡ VibeVoice 0.5B Realtime TTS + ASR\nRecord class lectures → get text instantly. Text → speech in one click.")

    with gr.Tabs():
        # ====================== ASR TAB ======================
        with gr.Tab("🎤 Record Class → Instant Text"):
            gr.Markdown("**Record your class/lecture audio → copy the transcribed text instantly**")
            audio_input = gr.Audio(
                # "upload" is enabled so the component matches its label.
                sources=["microphone", "upload"],
                type="numpy",
                label="Record Audio (or upload)",
                waveform_options=gr.WaveformOptions(waveform_color="#4F46E5"),
            )
            transcribe_btn = gr.Button("📝 Transcribe Now", variant="primary", size="large")
            text_output = gr.Textbox(label="Transcribed Text", lines=8, show_copy_button=True)
            transcribe_btn.click(
                fn=transcribe_audio,
                inputs=audio_input,
                outputs=text_output,
            )

        # ====================== TTS TAB ======================
        with gr.Tab("🔊 Text → Speech (0.5B Realtime)"):
            gr.Markdown("**Type text → generate natural speech** (uses VibeVoice-Realtime-0.5B)")
            text_input = gr.Textbox(
                label="Enter Text",
                lines=4,
                placeholder="Type or paste your script here...",
                value="Hello, this is a test of Microsoft's new VibeVoice 0.5B real-time TTS.",
            )
            tts_btn = gr.Button("🔊 Generate Speech", variant="primary", size="large")
            audio_output = gr.Audio(label="Generated Speech", type="numpy")
            tts_btn.click(
                fn=_synthesize_for_ui,  # returns (rate, audio) for Gradio
                inputs=text_input,
                outputs=audio_output,
            )

    # The Content-Type header is required: curl -d defaults to form encoding,
    # which the JSON-body endpoint rejects.
    gr.Markdown(
        "**How to use the API**: `curl -X POST http://localhost:7860/api/tts -H 'Content-Type: application/json' -d '{\"text\":\"Hello world\"}' --output speech.wav`\n\n"
        "Full realtime streaming & voice presets available in the official VibeVoice repo."
    )

# Mount Gradio onto FastAPI (keeps original behavior)
app = gr.mount_gradio_app(app, ui, path="/")

# Run the server
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=7860)