Spaces:

12labs
/

ind

Runtime error

App Files Files Community

12labs committed on Jan 29

Commit

026659d

verified ·

1 Parent(s): 70ff92e

Upload 3 files

Browse files

Files changed (3) hide show

README.md +121 -0
app.py +110 -0
requirements.txt +11 -0

README.md ADDED Viewed

	@@ -0,0 +1,121 @@

+---
+title: Hindi Voice Cloning (VibeVoice)
+emoji: 🎙️
+colorFrom: red
+colorTo: purple
+sdk: gradio
+sdk_version: "4.44.0"
+app_file: app.py
+pinned: false
+---
+# 🇮🇳 Hindi Voice Cloning with Emotion
+This Hugging Face Space provides **high-quality Hindi Text-to-Speech with voice cloning and expressive emotion**.
+Users can upload a short reference voice sample and generate Hindi speech in the **same voice, tone, and emotional style**.
+The system is powered by **VibeVoice-7B** with **Hindi LoRA fine-tuning**, optimized for natural prosody and long-form speech.
+
+---
+## ✨ Features
+- 🎙️ Voice cloning from uploaded reference audio
+- 🎭 Emotion & speaking style transfer
+- 🗣️ Natural-sounding Hindi TTS
+- 📄 Long-form narration support
+- 🚀 GPU-accelerated inference
+- 🎚️ Expression strength control (CFG scale)
+---
+## 🧪 How to Use
+1. Enter Hindi text in the text box
+2. Upload a **reference voice (WAV format)**
+3. Adjust **Expression Strength (CFG Scale)**
+4. Click **🚀 Generate Voice**
+5. Listen to or download the generated audio
+---
+## 🎧 Reference Voice Guidelines (Very Important)
+For best quality voice cloning:
+- WAV format only
+- 10–30 seconds duration recommended
+- Single speaker
+- Clear audio, minimal background noise
+- Natural emotion (happy, calm, sad, etc.)
+> ⚠️ Emotion is copied from the **reference voice**, not from the text.
+---
+## 🎭 Expression Control (CFG Scale)
+| CFG Scale | Effect |
+|---------|------|
+| 0.8 – 1.0 | Calm / neutral |
+| 1.2 – 1.4 | Natural & expressive (recommended) |
+| 1.5 – 2.0 | Strong emotion (may distort if too high) |
+---
+## ⚠️ System Requirements
+- ✅ GPU required
+  - Recommended: A10 / A100 / H100
+- ❌ CPU-only Spaces will not work
+- ⏳ First run may take time due to model loading
+---
+## 🔐 Privacy & Data Handling
+- Uploaded voice files are used **only for generation**
+- Voice files are overwritten per request
+- No permanent storage or reuse of user voices
+---
+## 🚫 Responsible Use Policy
+This Space is intended for **research and demonstration purposes only**.
+❌ Do NOT clone voices of real individuals without **explicit consent**
+❌ Do NOT use for impersonation, fraud, or misinformation
+❌ Do NOT present generated audio as real recordings
+✔ Always disclose AI-generated audio when sharing publicly
+
+---
+## 🧠 Model Information
+- **Base Model:** VibeVoice-7B
+- **Hindi Fine-Tuning:** Hindi LoRA adapters
+- **Architecture:** LLM + acoustic & semantic tokenizers + diffusion head
+- **Technique:** LoRA (parameter-efficient fine-tuning)
+---
+## 📜 License
+MIT License
+(Same as the base VibeVoice model and adapters)
+
+---
+## 🙏 Acknowledgements
+- Microsoft Research – VibeVoice
+- VibeVoice Community
+- Hugging Face Open-Source Ecosystem
+---
+### ⚡ Note
+This is a **research/demo Space**, not recommended for production or real-time applications.

app.py ADDED Viewed

	@@ -0,0 +1,110 @@

+import gradio as gr
+import subprocess
+import uuid
+import os
+import shutil
+BASE_MODEL = "vibevoice/VibeVoice-7B"
+CHECKPOINT = "tarun7r/vibevoice-hindi-lora"
+VOICES_DIR = "demo/voices"
+OUTPUT_DIR = "outputs"
+os.makedirs(VOICES_DIR, exist_ok=True)
+os.makedirs(OUTPUT_DIR, exist_ok=True)
+def generate_voice(text, voice_file, cfg_scale, seed):
+    if not text.strip():
+        raise gr.Error("❌ Hindi text empty hai")
+    if voice_file is None:
+        raise gr.Error("❌ Reference voice upload karo (WAV)")
+    speaker_name = "user_voice"
+    speaker_path = os.path.join(VOICES_DIR, f"{speaker_name}.wav")
+    # Replace previous voice
+    shutil.copy(voice_file, speaker_path)
+    out_file = os.path.join(
+        OUTPUT_DIR, f"out_{uuid.uuid4().hex}.wav"
+    )
+    cmd = [
+        "python", "demo/inference_from_file.py",
+        "--model_path", BASE_MODEL,
+        "--checkpoint_path", CHECKPOINT,
+        "--speaker_names", speaker_name,
+        "--txt", text,
+        "--cfg_scale", str(cfg_scale),
+        "--seed", str(seed),
+        "--output_path", out_file
+    ]
+    try:
+        subprocess.run(cmd, check=True)
+    except subprocess.CalledProcessError:
+        raise gr.Error("❌ Generation failed (check GPU / logs)")
+    return out_file
+with gr.Blocks(theme=gr.themes.Soft()) as demo:
+    gr.Markdown(
+        """
+        # 🇮🇳 Hindi Voice Cloning (VibeVoice)
+        **High-quality Hindi TTS with emotion & voice cloning**
+        Upload a reference voice (10–30 sec) and generate expressive speech.
+        """
+    )
+    with gr.Row():
+        with gr.Column(scale=1):
+            text = gr.Textbox(
+                label="📝 Hindi Text",
+                placeholder="नमस्ते, आज हम आर्टिफिशियल इंटेलिजेंस के बारे में बात करेंगे...",
+                lines=6
+            )
+            voice = gr.Audio(
+                label="🎙️ Reference Voice (WAV only)",
+                type="filepath"
+            )
+            cfg = gr.Slider(
+                0.8, 2.0, value=1.3, step=0.1,
+                label="🎭 Expression Strength (CFG Scale)"
+            )
+            seed = gr.Number(
+                value=42,
+                precision=0,
+                label="🎲 Seed (same seed = same style)"
+            )
+            btn = gr.Button("🚀 Generate Voice")
+        with gr.Column(scale=1):
+            output = gr.Audio(
+                label="🔊 Generated Audio",
+                type="filepath"
+            )
+    btn.click(
+        generate_voice,
+        inputs=[text, voice, cfg, seed],
+        outputs=output
+    )
+    gr.Markdown(
+        """
+        ### ℹ️ Tips for Best Quality
+        - Use **clean WAV** (10–30 sec)
+        - Emotion reference voice se aata hai
+        - Higher CFG = more expressive (but too high = distortion)
+        - GPU required (A10 / A100 / H100 recommended)
+        """
+    )
+demo.launch()

requirements.txt ADDED Viewed

	@@ -0,0 +1,11 @@

+torch
+torchaudio
+transformers
+gradio
+peft
+diffusers
+accelerate
+sentencepiece
+soundfile
+uv
+git+https://github.com/vibevoice-community/VibeVoice.git