Spaces:
Runtime error
Runtime error
| import io | |
| import scipy.io.wavfile as wavf | |
| from fastapi import FastAPI, Response | |
| from pydantic import BaseModel | |
| import gradio as gr | |
| from transformers import pipeline, AutoTokenizer, VitsModel # VitsModel kept for compatibility fallback | |
| import torch | |
| import uvicorn | |
| import numpy as np | |
| import copy # For potential TTS voice embeddings if you add custom inference | |
| # ================================================ | |
| # IMPORTANT SETUP NOTES FOR VibeVoice 0.5B (Realtime TTS) | |
| # ================================================ | |
| # 1. The VibeVoice-Realtime-0.5B is NOT a standard VITS model. | |
| # It uses a custom Qwen2.5-0.5B + diffusion head + streaming processor. | |
| # 2. To use it properly: | |
| # git clone https://github.com/microsoft/VibeVoice.git | |
| # cd VibeVoice | |
| # pip install -e .[streamingtts] | |
| # 3. The full inference code (including streaming & voice presets) is in: | |
| # demo/realtime_model_inference_from_file.py | |
| # demo/vibevoice_realtime_demo.py | |
| # 4. For now, this script keeps the old VITS-style TTS as a fallback. | |
| # Replace the synthesize_speech() function with the custom logic from the repo | |
| # once installed (see the snippets in the comments below). | |
| # 5. VibeVoice-ASR works out-of-the-box with the standard Transformers pipeline. | |
# Speech-to-text: build the VibeVoice-ASR pipeline once at import time so the
# transcription handler can reuse the same instance for every request.
print("Loading VibeVoice-ASR (for recording classes β instant text)...")
asr_pipeline = pipeline(
    task="automatic-speech-recognition",
    model="microsoft/VibeVoice-ASR",
    trust_remote_code=True,  # permits running custom code from the model repo
)
print("β VibeVoice-ASR loaded successfully!")
# TTS model (VibeVoice-Realtime-0.5B) -- placeholder loading.
# Uncomment and adapt the lines below AFTER installing the VibeVoice package:
# from transformers import AutoModelForCausalLM
# from VibeVoice.tokenizer import VibeVoiceTextTokenizerFast  # custom from repo
# from VibeVoice.model import VibeVoiceStreamingProcessor, VibeVoiceStreamingForConditionalGenerationInference
#
# MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"
# processor = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)
# model = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
#     MODEL_PATH, trust_remote_code=True
# )
# # Example voice preset (Carter, Emma, etc.)
# # voice_embeddings = torch.load("demo/voices/streaming_model/carter.pt", map_location="cpu")
model_name = "microsoft/VibeVoice-Realtime-0.5B"
print(f"Loading TTS model: {model_name} (note: custom inference required for full realtime streaming)")

# The checkpoint is not a standard VITS model, so the fallback classes below
# are expected to fail until the custom VibeVoice classes are swapped in.
# Guard the load so that a TTS failure does not abort the import and prevent
# the rest of the application from starting.
try:
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = VitsModel.from_pretrained(model_name)
except Exception as exc:  # startup boundary: report and continue without TTS
    # ``exc`` is unbound once the except block ends, so keep a reference for
    # the stand-in below.
    _tts_load_error = exc
    print(f"WARNING: could not load {model_name} with the VITS fallback classes: {exc!r}")

    def _tts_unavailable(*_args, **_kwargs):
        """Stand-in for the tokenizer/model that reports why TTS is disabled."""
        raise RuntimeError(
            f"TTS is unavailable: {model_name} failed to load ({_tts_load_error!r}). "
            "Install the VibeVoice package and switch to its custom inference classes."
        )

    # Any attempt to tokenize or run the model now raises a descriptive error
    # at call time instead of crashing the server at import time.
    tokenizer = model = _tts_unavailable
else:
    print("β TTS model loaded (fallback mode)")
| # ================================================ | |
| # Core TTS Synthesis Function (0.5B model) | |
| # ================================================ | |
def synthesize_speech(text: str) -> tuple[bytes, int, np.ndarray]:
    """Synthesize *text* to speech with the module-level tokenizer and model.

    TODO: Replace this entire function with the official VibeVoice realtime
    inference once you have installed the package (see the setup notes at the
    top of the file). Example skeleton from the repo (adapt as needed):

        inputs = processor(text, return_tensors="pt")
        # Add voice preset embeddings here if desired
        with torch.no_grad():
            output = model.generate(
                **inputs,
                tokenizer=processor.tokenizer,
                cfg_scale=1.5,
                ddpm_steps=10,  # lower = faster, higher = better quality
                verbose=False
            )
        # output will contain waveform or latent -> decode to 24kHz audio

    Args:
        text: Text to speak. Must contain at least one non-whitespace
            character.

    Returns:
        A ``(wav_bytes, sample_rate, audio_data)`` tuple: the encoded WAV
        file contents, the model's sampling rate, and the waveform as a
        NumPy array.

    Raises:
        ValueError: If *text* is empty or contains only whitespace.
    """
    # Reject blank input up front: there is nothing to speak, and a clear
    # error is more useful than a failure from inside the model.
    if not text or not text.strip():
        raise ValueError("Cannot synthesize speech from empty text.")

    # Current fallback path (VITS-style forward pass).
    inputs = tokenizer(text, return_tensors="pt")
    with torch.no_grad():
        waveform = model(**inputs).waveform
    audio_data = waveform.squeeze().numpy()
    sample_rate = model.config.sampling_rate

    # Encode to WAV in memory so the bytes can be returned over HTTP.
    wav_io = io.BytesIO()
    wavf.write(wav_io, sample_rate, audio_data)
    return wav_io.getvalue(), sample_rate, audio_data
| # ================================================ | |
| # Core ASR Function (Record classes β instant text) | |
| # ================================================ | |
def transcribe_audio(audio):
    """Transcribe a recorded audio clip to text with the ASR pipeline.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by a
            ``gr.Audio`` component with ``type="numpy"``, or ``None`` when
            nothing was recorded.

    Returns:
        The transcribed text, or ``"No audio recorded."`` when *audio* is
        ``None`` or contains no samples.
    """
    if audio is None:
        return "No audio recorded."

    sr, data = audio
    samples = np.asarray(data)
    if samples.size == 0:
        # Nothing to transcribe; avoid sending an empty waveform to the model.
        return "No audio recorded."

    source_dtype = samples.dtype
    samples = samples.astype(np.float32)

    # Integer PCM (e.g. int16) must be scaled into [-1, 1]; a bare cast to
    # float32 would keep the raw integer amplitude range.
    if np.issubdtype(source_dtype, np.integer):
        info = np.iinfo(source_dtype)
        samples /= float(max(abs(info.min), info.max))

    # Multi-channel audio arrives as (samples, channels); average the
    # channels because the ASR pipeline expects a 1-D waveform.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    result = asr_pipeline({"sampling_rate": sr, "raw": samples})
    return result["text"]
| # ================================================ | |
| # FastAPI Setup + API Endpoint (kept exactly as requested) | |
| # ================================================ | |
app = FastAPI(title="VibeVoice 0.5B Realtime TTS + ASR API")


class TTSRequest(BaseModel):
    """JSON request body for the TTS endpoint."""

    # Text to convert to speech.
    text: str


# Register the handler as a route; without the decorator the function is
# defined but never reachable over HTTP.
@app.post("/api/tts")
async def api_tts(request: TTSRequest):
    """Synthesize the posted text and return it as WAV audio.

    POST {"text": "your text here"} as JSON.

    Args:
        request: Parsed request body carrying the text to speak.

    Returns:
        A ``Response`` whose body is the WAV file and whose media type is
        ``audio/wav``.
    """
    wav_bytes, _, _ = synthesize_speech(request.text)
    return Response(content=wav_bytes, media_type="audio/wav")
| # ================================================ | |
| # Gradio Interface (now with BOTH features in tabs) | |
| # ================================================ | |
# Two-tab Gradio UI: speech-to-text on the first tab, text-to-speech on the
# second, followed by a short note on calling the HTTP API directly.
with gr.Blocks(title="VibeVoice 0.5B β’ Record Classes + TTS", theme=gr.themes.Soft()) as ui:
    gr.Markdown("# β‘ VibeVoice 0.5B Realtime TTS + ASR\nRecord class lectures β get text instantly. Text β speech in one click.")
    with gr.Tabs():
        # ---------------------- ASR tab ----------------------
        with gr.Tab("π€ Record Class β Instant Text"):
            gr.Markdown("**Record your class/lecture audio β copy the transcribed text instantly**")
            audio_input = gr.Audio(
                # The label offers uploading as well as recording, so both
                # sources are enabled.
                sources=["microphone", "upload"],
                type="numpy",
                label="Record Audio (or upload)",
                waveform_options=gr.WaveformOptions(waveform_color="#4F46E5"),
            )
            transcribe_btn = gr.Button("π Transcribe Now", variant="primary", size="large")
            text_output = gr.Textbox(label="Transcribed Text", lines=8, show_copy_button=True)
            transcribe_btn.click(
                fn=transcribe_audio,
                inputs=audio_input,
                outputs=text_output,
            )
        # ---------------------- TTS tab ----------------------
        with gr.Tab("π Text β Speech (0.5B Realtime)"):
            gr.Markdown("**Type text β generate natural speech** (uses VibeVoice-Realtime-0.5B)")
            text_input = gr.Textbox(
                label="Enter Text",
                lines=4,
                placeholder="Type or paste your script here...",
                value="Hello, this is a test of Microsoft's new VibeVoice 0.5B real-time TTS.",
            )
            tts_btn = gr.Button("π Generate Speech", variant="primary", size="large")
            audio_output = gr.Audio(label="Generated Speech", type="numpy")
            tts_btn.click(
                # synthesize_speech returns (wav_bytes, rate, audio); the
                # audio component takes the (rate, audio) pair.
                fn=lambda t: synthesize_speech(t)[1:3],
                inputs=text_input,
                outputs=audio_output,
            )
    # The endpoint takes a JSON body, so the example sets the JSON content
    # type explicitly; curl's -d otherwise sends form-encoded data.
    gr.Markdown(
        "**How to use the API**: `curl -X POST http://localhost:7860/api/tts -H 'Content-Type: application/json' -d '{\"text\":\"Hello world\"}' --output speech.wav`\n\n"
        "Full realtime streaming & voice presets available in the official VibeVoice repo."
    )
# Attach the Gradio UI at the root path of the FastAPI application so the web
# interface and the HTTP API are served by the same server.
app = gr.mount_gradio_app(app=app, blocks=ui, path="/")

# Start uvicorn only when this file is executed as a script.
if __name__ == "__main__":
    uvicorn.run(app=app, host="0.0.0.0", port=7860)