# Source: Hugging Face Space "D3vShoaib/anycoder-81f107bc" (commit 8c16911).
# Space status when captured: Runtime error. File size: 17,116 bytes.

import gradio as gr
import torch
import os
from datetime import datetime

# Try to import the pocket-tts model, with graceful fallback.
# MODEL_AVAILABLE records whether the import succeeded; the rest of the module
# checks it before touching any pocket_tts name.
try:
    from pocket_tts import TextToSpeech, Voice, VoiceProfile, Speaker
except ImportError as import_error:
    MODEL_AVAILABLE = False
    print("pocket-tts not installed. Run: pip install pocket-tts")
    # ImportError is also raised when the package is present but one of the
    # requested names is missing, so surface the underlying reason as well.
    print(f"Import error details: {import_error}")
else:
    MODEL_AVAILABLE = True

# Voice identifier -> human-readable label.
# Every identifier starts with one of the locale codes in LANGUAGE_OPTIONS.
VOICE_OPTIONS = dict(
    en_US_male_1="American Male (Deep)",
    en_US_female_1="American Female (Clear)",
    en_US_female_2="American Female (Warm)",
    en_UK_male_1="British Male (Formal)",
    en_UK_female_1="British Female (Elegant)",
    en_AU_male_1="Australian Male (Casual)",
    en_AU_female_1="Australian Female (Friendly)",
)

# (label, locale code) pairs used as the choices of the language dropdown
LANGUAGE_OPTIONS = [
    ("English (US)", "en_US"),
    ("English (UK)", "en_UK"),
    ("English (Australia)", "en_AU"),
]

# Speed multipliers (0.5x to 2.0x)
SPEED_OPTIONS = [0.5, 0.75, 1.0, 1.25, 1.5, 2.0]

# Pitch offsets in semitones (-12 to +12, in steps of 6)
PITCH_OPTIONS = list(range(-12, 13, 6))


def load_model(device="cpu"):
    """
    Instantiate the Pocket-TTS engine on the requested device.

    Args:
        device: Device string forwarded to ``TextToSpeech`` ('cpu' or 'cuda')

    Returns:
        A ``TextToSpeech`` instance, or None when the package could not be
        imported or the constructor raised.
    """
    if MODEL_AVAILABLE:
        try:
            return TextToSpeech(device=device)
        except Exception as load_error:
            # Construction failures are reported on stdout instead of raised,
            # so the caller only ever sees a model or None.
            print(f"Error loading model: {load_error}")
    return None


def generate_speech(
    text: str,
    voice: str,
    language: str,
    speed: float,
    pitch: int,
    sample_rate: int,
    model_instance
) -> tuple:
    """
    Synthesize ``text`` with the given Pocket-TTS model.

    Args:
        text: Input text to synthesize (surrounding whitespace is stripped)
        voice: Voice identifier used to look up both the profile and speaker
        language: Language code (accepted but not used by this function)
        speed: Speech speed multiplier, forwarded as ``speed``
        pitch: Pitch adjustment, forwarded as ``pitch_shift``
        sample_rate: Output audio sample rate, forwarded and echoed back
        model_instance: Loaded TTS model, or None if loading failed

    Returns:
        A 2-tuple. On success ``((sample_rate, audio), None)``; on any
        validation or synthesis failure ``(None, message)``.
    """
    # Input checks come first so they apply even when no model is available.
    cleaned = text.strip() if text else ""
    if not cleaned:
        return None, "Please enter some text to synthesize."
    if len(cleaned) < 2:
        return None, "Text is too short. Please enter at least 2 characters."

    # Availability checks: missing package first, then failed load.
    if not MODEL_AVAILABLE:
        return None, "Model not available. Please install pocket-tts: pip install pocket-tts"
    if model_instance is None:
        return None, "Model failed to load. Please check your installation."

    try:
        # The same identifier selects both the voice profile and the speaker.
        profile = VoiceProfile.from_id(voice)
        speaker = Speaker.from_id(voice)
        voice_config = Voice(profile=profile, speaker=speaker)

        waveform = model_instance.tts(
            text=cleaned,
            voice=voice_config,
            speed=speed,
            pitch_shift=pitch,
            sample_rate=sample_rate
        )
        return (sample_rate, waveform), None
    except Exception as synth_error:
        # Any failure is turned into a message rather than propagated.
        return None, f"Error generating speech: {synth_error!s}"


def clear_all():
    """
    Produce the default value for every resettable input.

    Returns:
        Tuple of (text, language, voice, speed, pitch, sample_rate, audio)
        defaults, where the voice is the first key of VOICE_OPTIONS.
    """
    default_voice = list(VOICE_OPTIONS)[0]
    defaults = ("", "en_US", default_voice, 1.0, 0, 24000, None)
    return defaults


def get_voice_list(language: str):
    """
    Get available voices for the selected language.

    Voice identifiers are matched on the full locale prefix (for example
    "en_UK" selects only "en_UK_*" voices). Matching on just the part before
    the first underscore would reduce every locale to "en" and never filter
    anything.

    Args:
        language: Locale code such as "en_US"; may be empty or None

    Returns:
        Tuple of (voice_ids, voice_labels) as two parallel lists. When the
        language is empty or matches no voice, all voices are returned.
    """
    matching = {}
    if language:
        # The trailing underscore keeps "en_U" from matching "en_US_*" / "en_UK_*".
        prefix = f"{language}_"
        matching = {
            voice_id: label
            for voice_id, label in VOICE_OPTIONS.items()
            if voice_id.startswith(prefix)
        }

    # If no language-specific voices, fall back to every voice
    if not matching:
        matching = VOICE_OPTIONS

    return list(matching), list(matching.values())


# Custom CSS for the app, passed to gr.Blocks(css=...) in create_app().
# The class names are used by the raw HTML emitted in this file:
#   header-title / header-subtitle  -> page header
#   section-title                   -> headings above each group of inputs
#   tips-box                        -> contents of the "Tips" accordion
#   built-with                      -> footer
#   info-box / success-box          -> status messages built by update_status
# NOTE(review): nothing visible in this file uses .audio-player.
CUSTOM_CSS = """
:root {
    --primary-color: #6366f1;
    --secondary-color: #8b5cf6;
}

.gradio-container {
    max-width: 1200px !important;
}

.header-title {
    text-align: center;
    background: linear-gradient(135deg, #6366f1, #8b5cf6, #a855f7);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    background-clip: text;
    font-size: 2.5rem;
    font-weight: 700;
    margin-bottom: 0.5rem;
}

.header-subtitle {
    text-align: center;
    color: #6b7280;
    font-size: 1.1rem;
    margin-bottom: 1.5rem;
}

.built-with {
    text-align: center;
    margin-top: 1rem;
    padding: 0.75rem;
    background: linear-gradient(135deg, #f0f9ff, #e0f2fe);
    border-radius: 0.5rem;
    border: 1px solid #bae6fd;
}

.built-with a {
    color: #0369a1;
    text-decoration: none;
    font-weight: 500;
}

.built-with a:hover {
    text-decoration: underline;
}

.audio-player {
    border-radius: 0.75rem;
    overflow: hidden;
}

.section-title {
    font-size: 1.1rem;
    font-weight: 600;
    color: #374151;
    margin-bottom: 0.5rem;
    padding-bottom: 0.25rem;
    border-bottom: 2px solid #e5e7eb;
}

.info-box {
    background: #fef3c7;
    border: 1px solid #fcd34d;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin: 0.5rem 0;
    font-size: 0.9rem;
    color: #92400e;
}

.success-box {
    background: #d1fae5;
    border: 1px solid #6ee7b7;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin: 0.5rem 0;
    font-size: 0.9rem;
    color: #065f46;
}

.tips-box {
    background: #f3f4f6;
    border: 1px solid #d1d5db;
    border-radius: 0.5rem;
    padding: 0.75rem;
    margin: 0.5rem 0;
    font-size: 0.85rem;
    color: #4b5563;
}

.tips-box ul {
    margin: 0.5rem 0 0 0;
    padding-left: 1.25rem;
}

.tips-box li {
    margin: 0.25rem 0;
}
"""


# Theme shared by the whole app: start from the Soft preset with the
# indigo/violet/slate palette, large text and spacing, and the Inter font...
custom_theme = gr.themes.Soft(
    primary_hue="indigo",
    secondary_hue="violet",
    neutral_hue="slate",
    text_size="lg",
    spacing_size="lg",
    radius_size="md",
    font=gr.themes.GoogleFont("Inter")
)

# ...then override the individual button, block-title and input variables.
custom_theme = custom_theme.set(
    button_primary_background_fill="*primary_600",
    button_primary_background_fill_hover="*primary_700",
    button_secondary_background_fill="*secondary_200",
    button_secondary_background_fill_hover="*secondary_300",
    block_title_text_weight="600",
    block_title_text_color="*primary_700",
    input_background_fill="*neutral_100",
)


def create_app():
    """
    Create the Gradio application.

    Loads the TTS model once on the CPU, builds the two-column layout
    (inputs on the left, audio output and help on the right) and wires the
    event handlers.

    Returns:
        The configured ``gr.Blocks`` instance (not yet launched).
    """

    # Load model on startup; None when pocket-tts is missing or fails to load.
    # Handlers below reach it through the closure rather than a gr.State,
    # because gr.State deep-copies its initial value.
    model = load_model("cpu")

    # Show the friendly label in the dropdown while keeping the voice id as
    # the value that is sent to the handlers.
    voice_choices = [(label, voice_id) for voice_id, label in VOICE_OPTIONS.items()]

    with gr.Blocks(
        theme=custom_theme,
        css=CUSTOM_CSS,
        title="Pocket-TTS - Text to Speech Converter",
        fill_height=True
    ) as demo:

        # Header with branding
        gr.HTML("""
            <div class="header-section">
                <h1 class="header-title">🎙️ Pocket-TTS</h1>
                <p class="header-subtitle">High-Quality Text-to-Speech Synthesis with Natural Voices</p>
            </div>
        """)

        # Main content
        with gr.Row(equal_height=True):
            with gr.Column(scale=2):
                # Text input section
                gr.HTML('<p class="section-title">📝 Text Input</p>')

                text_input = gr.Textbox(
                    label="Text to Synthesize",
                    placeholder="Enter your text here... e.g., Hello! This is a text-to-speech demonstration.",
                    lines=5,
                    max_lines=10,
                    info="Enter the text you want to convert to speech",
                    interactive=True,
                    elem_id="text-input"
                )

                # Quick text buttons: each one overwrites the text box
                with gr.Row():
                    gr.Button("👋 Hello World", size="sm").click(
                        lambda: "Hello World! Welcome to the Pocket-TTS demo.",
                        outputs=text_input
                    )
                    gr.Button("📖 Sample Text", size="sm").click(
                        lambda: "The quick brown fox jumps over the lazy dog. This is a sample sentence to test the text-to-speech system.",
                        outputs=text_input
                    )
                    gr.Button("🧪 Long Text", size="sm").click(
                        lambda: "Artificial intelligence has revolutionized the way we interact with technology. From virtual assistants to autonomous vehicles, AI is everywhere. Text-to-speech systems have improved dramatically, offering more natural and expressive voices than ever before.",
                        outputs=text_input
                    )

                # Voice settings section
                gr.HTML('<p class="section-title">🎵 Voice Settings</p>')

                with gr.Row():
                    with gr.Column(scale=1):
                        language_dropdown = gr.Dropdown(
                            choices=LANGUAGE_OPTIONS,
                            value="en_US",
                            label="Language",
                            info="Select the language of your text",
                            elem_id="language"
                        )

                    with gr.Column(scale=2):
                        voice_dropdown = gr.Dropdown(
                            choices=voice_choices,
                            value=list(VOICE_OPTIONS.keys())[0],
                            label="Voice",
                            info="Choose a voice for synthesis",
                            elem_id="voice"
                        )

                # Advanced settings accordion
                with gr.Accordion("⚙️ Advanced Settings", open=False):
                    with gr.Row():
                        with gr.Column():
                            speed_slider = gr.Slider(
                                minimum=0.5,
                                maximum=2.0,
                                value=1.0,
                                step=0.25,
                                label="Speed",
                                info="Speech speed (0.5x - 2.0x)",
                                elem_id="speed"
                            )

                        with gr.Column():
                            pitch_slider = gr.Slider(
                                minimum=-12,
                                maximum=12,
                                value=0,
                                step=1,
                                label="Pitch",
                                info="Pitch shift (-12 to +12 semitones)",
                                elem_id="pitch"
                            )

                    with gr.Row():
                        with gr.Column():
                            # Default type ("value") so the handler receives
                            # the rate in Hz; type="index" would hand it the
                            # position in the list (0-4) instead.
                            sample_rate_dropdown = gr.Dropdown(
                                choices=[(str(sr), sr) for sr in [16000, 22050, 24000, 44100, 48000]],
                                value=24000,
                                label="Sample Rate",
                                info="Output audio sample rate (Hz)",
                                elem_id="sample-rate"
                            )

                # Generate button
                generate_btn = gr.Button(
                    "🎙️ Generate Speech",
                    variant="primary",
                    size="lg",
                    elem_id="generate-btn"
                )

            with gr.Column(scale=1):
                # Output section
                gr.HTML('<p class="section-title">🔊 Audio Output</p>')

                audio_output = gr.Audio(
                    label="Generated Audio",
                    type="numpy",
                    interactive=False,
                    elem_id="audio-output"
                )

                # Download button (appears when audio is generated)
                # NOTE(review): the handler below only toggles visibility; no
                # file is ever assigned to this button in this function.
                download_btn = gr.DownloadButton(
                    "📥 Download Audio",
                    variant="secondary",
                    size="sm",
                    visible=False,
                    elem_id="download-btn"
                )

                # Status message; hidden until update_status shows it
                status_output = gr.Markdown(
                    value="",
                    visible=False,
                    elem_id="status"
                )

                # Model info
                with gr.Accordion("ℹ️ Model Information", open=False):
                    gr.Markdown("""
                    **Pocket-TTS** by Kyutai Labs
                    
                    - Lightweight text-to-speech model
                    - Optimized for CPU inference
                    - Multiple voice options
                    - Real-time synthesis support
                    
                    **Requirements:**
                    - Python 3.8+
                    - PyTorch
                    - 2GB+ RAM
                    """)

                # Tips section
                with gr.Accordion("💡 Tips", open=False):
                    gr.HTML("""
                    <div class="tips-box">
                        <strong>Tips for best results:</strong>
                        <ul>
                            <li>Use proper punctuation for natural pauses</li>
                            <li>Try different voices for different contexts</li>
                            <li>Adjust speed for clarity (slower = clearer)</li>
                            <li>Pitch works best within ±6 semitones</li>
                        </ul>
                    </div>
                    """)

        # Footer with "Built with anycoder"
        gr.HTML("""
            <div class="built-with">
                <p>🚀 Powered by Pocket-TTS from Kyutai Labs</p>
                <p>🔧 Built with <a href="https://huggingface.co/spaces/akhaliq/anycoder" target="_blank">anycoder</a> — Deploy ML models in minutes</p>
            </div>
        """)

        # Event handlers
        def update_voices(language):
            """Update voice dropdown choices and selection for the language."""
            voices, labels = get_voice_list(language)
            return gr.Dropdown(
                choices=list(zip(labels, voices)),
                value=voices[0] if voices else None
            )

        def update_status(message, success=False):
            """Build the status Markdown update (hidden when message is empty)."""
            if success:
                return gr.Markdown(
                    value=f'<div class="success-box">✅ {message}</div>',
                    visible=True
                )
            elif message:
                return gr.Markdown(
                    value=f'<div class="info-box">ℹ️ {message}</div>',
                    visible=True
                )
            return gr.Markdown(value="", visible=False)

        def run_generation(text, voice, language, speed, pitch, sample_rate):
            """Run synthesis and turn its (audio, message) result into UI updates."""
            audio, error = generate_speech(
                text, voice, language, speed, pitch, sample_rate, model
            )
            if error:
                # Route the message through update_status so the hidden
                # status component actually becomes visible.
                return None, update_status(error)
            return audio, update_status("Speech generated successfully.", success=True)

        def reset_form():
            """Restore input defaults and hide any stale status message."""
            return (*clear_all(), update_status(""))

        # Connect events
        language_dropdown.change(
            update_voices,
            inputs=language_dropdown,
            outputs=voice_dropdown
        )

        generate_btn.click(
            run_generation,
            inputs=[
                text_input,
                voice_dropdown,
                language_dropdown,
                speed_slider,
                pitch_slider,
                sample_rate_dropdown
            ],
            outputs=[audio_output, status_output],
            show_progress="full"
        )

        # Enable download button when audio is generated
        audio_output.change(
            lambda x: (gr.DownloadButton(visible=True) if x is not None else gr.DownloadButton(visible=False)),
            inputs=audio_output,
            outputs=download_btn
        )

        # Clear button functionality
        clear_btn = gr.Button("🗑️ Clear", size="sm", variant="stop")
        clear_btn.click(
            reset_form,
            outputs=[
                text_input,
                language_dropdown,
                voice_dropdown,
                speed_slider,
                pitch_slider,
                sample_rate_dropdown,
                audio_output,
                status_output
            ]
        )

    return demo


def main():
    """
    Main entry point for the application.

    Builds the app and launches it. Only keyword arguments that the
    installed Gradio's ``launch()`` actually declares are passed, so an
    option that exists in one Gradio release but not another cannot crash
    startup with a ``TypeError``.
    """
    import inspect

    demo = create_app()

    # `title`, `description` and `article` are gr.Interface options, not
    # launch() keywords, so they are not passed here. The page title is
    # already set on gr.Blocks in create_app().
    launch_options = {
        "theme": custom_theme,
        "footer_links": [
            {"label": "Kyutai Labs", "url": "https://kyutai.org"},
            {"label": "GitHub", "url": "https://github.com/kyutai-labs/pocket-tts"},
            {"label": "Built with anycoder", "url": "https://huggingface.co/spaces/akhaliq/anycoder"}
        ],
        "show_error": True,
        "quiet": False,
    }

    # Keep only the options this Gradio version's launch() names explicitly.
    accepted = inspect.signature(demo.launch).parameters
    supported_options = {
        name: value for name, value in launch_options.items() if name in accepted
    }

    # Launch the application
    demo.launch(**supported_options)


if __name__ == "__main__":
    main()