|
|
import gradio as gr |
|
|
import torch |
|
|
import os |
|
|
import time |
|
|
import copy |
|
|
from pathlib import Path |
|
|
from typing import Optional, Tuple |
|
|
import spaces |
|
|
|
|
|
from vibevoice.modular.modeling_vibevoice_streaming_inference import ( |
|
|
VibeVoiceStreamingForConditionalGenerationInference, |
|
|
) |
|
|
from vibevoice.processor.vibevoice_streaming_processor import ( |
|
|
VibeVoiceStreamingProcessor, |
|
|
) |
|
|
|
|
|
|
|
|
class VoiceMapper:
    """Maps speaker names to voice preset (``.pt``) file paths.

    Two dictionaries are maintained:

    * ``available_voices`` -- preset name (file name without extension) to
      file path, exactly as found on disk.
    * ``voice_presets`` -- the same entries plus short aliases derived from
      the file names, used for lookups in :meth:`get_voice_path`.
    """

    def __init__(self, voices_dir: Optional[str] = None):
        """Scan the voices directory and register short aliases.

        Args:
            voices_dir: Directory containing ``.pt`` voice presets. When
                ``None`` the ``demo/voices/streaming_model`` directory next
                to this file is used.
        """
        self.setup_voice_presets(voices_dir)

        # Derive a short alias from every preset name: keep the text before
        # the first "_" and then the text after the last "-"
        # (e.g. "en-Carter_man" -> "Carter").
        aliases = {}
        for name, path in self.voice_presets.items():
            alias = name
            if "_" in alias:
                alias = alias.split("_")[0]
            if "-" in alias:
                alias = alias.split("-")[-1]
            # An empty alias would match every speaker in the substring
            # search of get_voice_path, so it is never registered.
            if alias:
                aliases[alias] = path

        # setdefault (not update) so an alias can never hijack the exact name
        # of another preset file.
        for alias, path in aliases.items():
            self.voice_presets.setdefault(alias, path)

    def setup_voice_presets(self, voices_dir: Optional[str] = None) -> None:
        """Populate ``voice_presets`` / ``available_voices`` from disk.

        Args:
            voices_dir: Directory to scan for ``.pt`` files. Defaults to
                ``demo/voices/streaming_model`` next to this file.
        """
        if voices_dir is None:
            voices_dir = os.path.join(
                os.path.dirname(__file__), "demo/voices/streaming_model"
            )

        self.voice_presets = {}
        self.available_voices = {}

        # isdir (rather than exists) so a stray file at that path is reported
        # as "not found" instead of crashing os.listdir below.
        if not os.path.isdir(voices_dir):
            print(f"Warning: Voices directory not found at {voices_dir}")
            return

        found = {}
        for entry in os.listdir(voices_dir):
            full_path = os.path.join(voices_dir, entry)
            if entry.lower().endswith(".pt") and os.path.isfile(full_path):
                found[os.path.splitext(entry)[0]] = full_path

        # Sort by preset name so the UI order and the default voice are stable.
        self.voice_presets = dict(sorted(found.items()))

        # Every entry was just verified to be an existing file, so the
        # user-facing list starts as a plain copy (aliases are added later to
        # voice_presets only).
        self.available_voices = dict(self.voice_presets)

        print(f"Found {len(self.available_voices)} voice files in {voices_dir}")
        print(f"Available voices: {', '.join(self.available_voices.keys())}")

    def get_voice_path(self, speaker_name: str) -> str:
        """Return the voice file path for ``speaker_name``.

        Resolution order: exact key match, then case-insensitive substring
        match in either direction, then the first registered preset (with a
        printed warning).

        Args:
            speaker_name: Requested speaker; ``None`` or an empty string
                falls through to the default voice.

        Returns:
            Path of the selected voice preset file.

        Raises:
            IndexError: If no voice presets are registered at all.
        """
        if not self.voice_presets:
            # Same exception type the original list-indexing produced, but
            # with a message that explains the actual problem.
            raise IndexError(
                f"No voice presets available; cannot resolve speaker '{speaker_name}'"
            )

        if speaker_name in self.voice_presets:
            return self.voice_presets[speaker_name]

        speaker_lower = speaker_name.lower() if speaker_name else ""
        # Skip substring matching for an empty name: "" is contained in every
        # string and would silently select an arbitrary preset.
        if speaker_lower:
            for preset_name, path in self.voice_presets.items():
                preset_lower = preset_name.lower()
                if preset_lower in speaker_lower or speaker_lower in preset_lower:
                    return path

        default_voice = next(iter(self.voice_presets.values()))
        print(
            f"Warning: No voice preset found for '{speaker_name}', using default voice: {default_voice}"
        )
        return default_voice
|
|
|
|
|
|
|
|
|
|
|
# ---------------------------------------------------------------------------
# One-time initialisation, executed when the module is imported.
# The model is created on the CPU in float16; generate_speech() moves it to
# CUDA for the duration of a request.
# ---------------------------------------------------------------------------
print("Loading VibeVoice-Realtime model...")

MODEL_PATH = "microsoft/VibeVoice-Realtime-0.5B"

# Processor for the same checkpoint as the model.
PROCESSOR = VibeVoiceStreamingProcessor.from_pretrained(MODEL_PATH)

# Keyword arguments forwarded unchanged to from_pretrained().
_MODEL_LOAD_OPTIONS = {
    "torch_dtype": torch.float16,
    "device_map": "cpu",
    "attn_implementation": "sdpa",
}
MODEL = VibeVoiceStreamingForConditionalGenerationInference.from_pretrained(
    MODEL_PATH, **_MODEL_LOAD_OPTIONS
)
MODEL.eval()
MODEL.set_ddpm_inference_steps(num_steps=5)

# Scans the voice preset directory once at start-up.
VOICE_MAPPER = VoiceMapper()

print("Model loaded successfully!")
|
|
|
|
|
|
|
|
# Typographic quotes mapped to their ASCII equivalents. Written with escapes
# so the mapping cannot be flattened into a no-op by an editor or extraction
# tool that normalises quote characters.
_TYPOGRAPHIC_QUOTES = str.maketrans(
    {
        "\u2018": "'",  # left single quotation mark
        "\u2019": "'",  # right single quotation mark / apostrophe
        "\u201c": '"',  # left double quotation mark
        "\u201d": '"',  # right double quotation mark
    }
)


def _normalize_script(text: str) -> str:
    """Strip surrounding whitespace and replace curly quotes with ASCII ones."""
    return text.strip().translate(_TYPOGRAPHIC_QUOTES)


def _release_gpu() -> None:
    """Best-effort: move the model back to the CPU and free cached CUDA memory."""
    try:
        MODEL.to("cpu")
        torch.cuda.empty_cache()
    except Exception as cleanup_error:
        # Cleanup must never mask the real result or error of the request.
        print(f"Warning: failed to release GPU resources: {cleanup_error}")


@spaces.GPU(duration=60)
def generate_speech(
    text: str,
    speaker_name: str,
    cfg_scale: float = 1.5,
    progress=gr.Progress(),
) -> Tuple[Optional[str], str]:
    """
    Generate speech from text using VibeVoice-Realtime with ZeroGPU

    The model is moved to CUDA for the duration of the call and always moved
    back to the CPU afterwards, whether generation succeeds or fails.

    Args:
        text: Input text to convert to speech
        speaker_name: Name of the speaker voice to use
        cfg_scale: Classifier-Free Guidance scale (higher = more faithful to text)
        progress: Gradio progress tracker

    Returns:
        Tuple of (audio_path, status_message). ``audio_path`` is ``None``
        when the input is empty, no audio was produced, or an error occurred;
        errors are reported through ``status_message`` rather than raised.
    """
    if not text or not text.strip():
        return None, "❌ Error: Please enter some text to convert to speech."

    try:
        progress(0, desc="Loading voice preset...")

        full_script = _normalize_script(text)
        voice_sample = VOICE_MAPPER.get_voice_path(speaker_name)

        # NOTE(security): weights_only=False unpickles arbitrary objects.
        # Acceptable only while the preset files are trusted local assets;
        # never point this at user-supplied files.
        all_prefilled_outputs = torch.load(
            voice_sample, map_location="cuda", weights_only=False
        )

        progress(0.2, desc="Preparing inputs...")

        inputs = PROCESSOR.process_input_with_cached_prompt(
            text=full_script,
            cached_prompt=all_prefilled_outputs,
            padding=True,
            return_tensors="pt",
            return_attention_mask=True,
        )

        MODEL.to("cuda")
        for k, v in inputs.items():
            if torch.is_tensor(v):
                inputs[k] = v.to("cuda")

        progress(0.4, desc="Generating speech on GPU...")

        # perf_counter is monotonic, so the measured duration cannot be
        # distorted by wall-clock adjustments.
        start_time = time.perf_counter()
        # torch.autocast("cuda") replaces the deprecated torch.cuda.amp.autocast().
        with torch.autocast(device_type="cuda"):
            outputs = MODEL.generate(
                **inputs,
                max_new_tokens=None,
                cfg_scale=cfg_scale,
                tokenizer=PROCESSOR.tokenizer,
                generation_config={"do_sample": False},
                verbose=False,
                # Pass a copy so the freshly loaded preset is not the object
                # handed to generate().
                all_prefilled_outputs=copy.deepcopy(all_prefilled_outputs)
                if all_prefilled_outputs is not None
                else None,
            )
        generation_time = time.perf_counter() - start_time

        progress(0.8, desc="Saving audio...")

        if not (outputs.speech_outputs and outputs.speech_outputs[0] is not None):
            return None, "❌ Error: No audio output generated."

        speech = outputs.speech_outputs[0]
        sample_rate = 24000  # sample rate assumed for the duration metric
        # A 0-d tensor has no last dimension (and len() would raise on it).
        audio_samples = speech.shape[-1] if len(speech.shape) > 0 else speech.numel()
        audio_duration = audio_samples / sample_rate
        rtf = generation_time / audio_duration if audio_duration > 0 else float("inf")

        output_dir = "./outputs"
        os.makedirs(output_dir, exist_ok=True)
        # Nanosecond timestamp: two requests finishing within the same second
        # must not overwrite each other's file.
        output_path = os.path.join(output_dir, f"generated_{time.time_ns()}.wav")

        PROCESSOR.save_audio(
            speech.cpu(),
            output_path=output_path,
        )

        progress(1.0, desc="Complete!")

        status = (
            "✅ **Generation Complete!**\n"
            "\n"
            "📊 **Metrics:**\n"
            f"- Audio Duration: {audio_duration:.2f}s\n"
            f"- Generation Time: {generation_time:.2f}s\n"
            f"- Real-Time Factor: {rtf:.2f}x\n"
            f"- Speaker: {speaker_name}\n"
            f"- CFG Scale: {cfg_scale}\n"
            "- Device: ZeroGPU (CUDA)\n"
        )

        return output_path, status

    except Exception as e:
        # UI boundary: report the failure in the status panel instead of
        # letting the exception propagate out of the request handler.
        import traceback

        error_msg = f"❌ Error during generation:\n{str(e)}\n\n{traceback.format_exc()}"
        print(error_msg)
        return None, error_msg

    finally:
        # Single cleanup point for the success, no-audio and error paths.
        _release_gpu()
|
|
|
|
|
|
|
|
|
|
|
# Voice names offered in the UI, computed once instead of at every use site.
_voice_names = list(VOICE_MAPPER.available_voices.keys())
# Dropdown pre-selection: first available voice, or nothing when none exist.
_initial_voice = _voice_names[0] if _voice_names else None
# Voice used by the example rows; falls back to "Wayne" when none exist.
_example_voice = _voice_names[0] if _voice_names else "Wayne"

# Example sentences shown below the form; each is paired with the example
# voice and a CFG scale of 1.5.
_example_texts = (
    "VibeVoice is a novel framework designed for generating expressive, long-form, multi-speaker conversational audio.",
    "The quick brown fox jumps over the lazy dog. This is a test of the text-to-speech system.",
)

with gr.Blocks(fill_height=True) as demo:
    # Page header.
    gr.Markdown(
        """
        # 🎙️ VibeVoice-Realtime Text-to-Speech

        Convert text to natural-sounding speech using Microsoft's VibeVoice-Realtime model.

        **🚀 Powered by ZeroGPU** - Efficient GPU allocation for fast inference!

        <div style="text-align: center; margin-top: 10px;">
            <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank" style="text-decoration: none; color: #4F46E5; font-weight: 600;">
                Built with anycoder ✨
            </a>
        </div>
        """
    )

    with gr.Row():
        # Left column: text, voice and guidance inputs plus the trigger button.
        with gr.Column(scale=2):

            text_input = gr.Textbox(
                label="Text to Convert",
                placeholder="Enter the text you want to convert to speech...",
                lines=8,
                max_lines=20,
            )

            with gr.Row():
                speaker_dropdown = gr.Dropdown(
                    choices=_voice_names,
                    value=_initial_voice,
                    label="Speaker Voice",
                    info="Select the voice to use for speech generation",
                )

                cfg_slider = gr.Slider(
                    minimum=1.0,
                    maximum=3.0,
                    value=1.5,
                    step=0.1,
                    label="CFG Scale",
                    info="Higher values = more faithful to text (1.0-3.0)",
                )

            generate_btn = gr.Button("🎵 Generate Speech", variant="primary", size="lg")

        # Right column: generated audio and the status / metrics panel.
        with gr.Column(scale=1):

            audio_output = gr.Audio(
                label="Generated Speech",
                type="filepath",
                interactive=False,
            )

            status_output = gr.Markdown(
                """
                **Status:** Ready to generate speech

                Enter text and click "Generate Speech" to start.

                ⚡ Using ZeroGPU for efficient processing
                """
            )

    # Clickable example rows that fill the three inputs.
    gr.Examples(
        examples=[[sample, _example_voice, 1.5] for sample in _example_texts],
        inputs=[text_input, speaker_dropdown, cfg_slider],
        label="Example Inputs",
    )

    # Wire the button to the generation function.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, speaker_dropdown, cfg_slider],
        outputs=[audio_output, status_output],
        api_name="generate",
    )

    # Footer notes.
    gr.Markdown(
        """
        ---

        ### 📝 Notes:
        - **Model**: Microsoft VibeVoice-Realtime-0.5B
        - **Sample Rate**: 24kHz
        - **Context Length**: 8K tokens
        - **Generation Length**: ~10 minutes
        - **Infrastructure**: ZeroGPU (Hugging Face Spaces)

        ### ⚠️ Important:
        - The model is designed for English text only
        - Very short inputs (< 3 words) may produce unstable results
        - Code, formulas, and special symbols are not supported
        - Please use responsibly and disclose AI-generated content
        - GPU is allocated dynamically - generation may take a few seconds to start
        """
    )
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    # Start the app only when the file is run as a script, not on import.
    soft_theme = gr.themes.Soft(
        primary_hue="blue",
        secondary_hue="indigo",
        neutral_hue="slate",
    )
    anycoder_link = {
        "label": "Built with anycoder",
        "url": "https://huggingface.co/spaces/akhaliq/anycoder",
    }
    demo.launch(theme=soft_theme, footer_links=[anycoder_link])