# File size: 10,457 Bytes

# 6ef63ba

"""
🗽 Jerome Voice Generator
Type anything → hear Jerome say it with his thick New York accent.
Uses Edge TTS for base speech + RVC for voice conversion.
"""

import os
import sys
import subprocess
import asyncio
import tempfile
import shutil
import logging
import gradio as gr
import edge_tts
from huggingface_hub import hf_hub_download

# Root logging goes to INFO; every function below reports through `logger`.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ─── Configuration ───────────────────────────────────────────
# Hugging Face repository holding the RVC artifacts, and the two files
# fetched from it.
MODEL_REPO = "khobster/jerome"
MODEL_FILE = "jerome_100e_1000s.pth"
INDEX_FILE = "jerome.index"

# Absolute working locations: downloaded model files, scratch audio, and the
# Applio checkout whose core.py is run for inference.
MODEL_DIR = "/app/models"
TEMP_DIR = "/app/temp"
APPLIO_DIR = "/app/applio"

# Edge TTS voices (male voices that work well as RVC input).
# Maps the label shown in the UI dropdown to the Edge TTS voice identifier;
# insertion order is the order the dropdown lists them in.
TTS_VOICES = dict(
    [
        ("Guy (US)", "en-US-GuyNeural"),
        ("Andrew (US)", "en-US-AndrewNeural"),
        ("Eric (US)", "en-US-EricNeural"),
        ("Christopher (US)", "en-US-ChristopherNeural"),
        ("Roger (US)", "en-US-RogerNeural"),
        ("Ryan (UK)", "en-GB-RyanNeural"),
    ]
)

# Voice identifier used when a requested label is not in TTS_VOICES.
DEFAULT_VOICE = TTS_VOICES["Guy (US)"]

# ─── Setup ───────────────────────────────────────────────────

def setup():
    """Prepare the working directories, fetch the RVC files, and check Applio.

    Creates MODEL_DIR and TEMP_DIR if they are missing, downloads MODEL_FILE
    and INDEX_FILE from MODEL_REPO into MODEL_DIR, and then confirms that
    ``core.py`` exists under APPLIO_DIR.

    Returns:
        A ``(model_path, index_path)`` tuple with the paths reported by
        ``hf_hub_download`` for the two files.

    Raises:
        RuntimeError: if ``core.py`` is not present in APPLIO_DIR. The check
            runs after both downloads have completed.
    """
    for directory in (MODEL_DIR, TEMP_DIR):
        os.makedirs(directory, exist_ok=True)

    def fetch(filename):
        # Both artifacts come from the same repo and land in the same folder.
        return hf_hub_download(
            repo_id=MODEL_REPO,
            filename=filename,
            local_dir=MODEL_DIR,
        )

    # Download RVC model from HuggingFace
    logger.info("Downloading Jerome's RVC model...")
    model_path = fetch(MODEL_FILE)
    logger.info(f"Model downloaded: {model_path}")

    index_path = fetch(INDEX_FILE)
    logger.info(f"Index downloaded: {index_path}")

    # Verify Applio is available
    applio_entry_point = os.path.join(APPLIO_DIR, "core.py")
    if not os.path.exists(applio_entry_point):
        raise RuntimeError("Applio not found! Check Dockerfile.")

    return model_path, index_path

# ─── TTS Engine ──────────────────────────────────────────────

async def generate_base_tts(text: str, voice: str, output_path: str):
    """Synthesize *text* with Edge TTS and write the audio to *output_path*.

    Args:
        text: The sentence(s) to speak.
        voice: An Edge TTS voice identifier (the values of TTS_VOICES).
        output_path: Destination file for the synthesized audio.

    NOTE(review): edge-tts appears to emit MP3-encoded audio by default, while
    the caller in this file passes a ``.wav`` path -- confirm that downstream
    consumers detect the format from content rather than the extension.
    """
    await edge_tts.Communicate(text, voice).save(output_path)
    logger.info(f"Base TTS generated: {output_path}")

# ─── RVC Conversion ─────────────────────────────────────────

def convert_voice(input_path: str, output_path: str, model_path: str,
                  index_path: str, f0_shift: int = 0, index_rate: float = 0.75):
    """Convert the audio at *input_path* to Jerome's voice via Applio's CLI.

    Runs ``core.py infer`` from APPLIO_DIR in a child Python process (same
    interpreter as this one) and waits up to 120 seconds for it to finish.

    Args:
        input_path: Audio file to convert.
        output_path: Where the converted WAV is expected to appear.
        model_path: Path to the RVC ``.pth`` model.
        index_path: Path to the RVC ``.index`` file.
        f0_shift: Pitch shift passed as ``--pitch``; rounded to a whole number
            before being placed on the command line.
        index_rate: Value passed as ``--index_rate``.

    Raises:
        RuntimeError: if the child process times out, exits with a non-zero
            status, or exits cleanly without creating *output_path*.
    """
    timeout_seconds = 120

    cmd = [
        sys.executable, os.path.join(APPLIO_DIR, "core.py"), "infer",
        "--input_path", input_path,
        "--output_path", output_path,
        "--pth_path", model_path,
        "--index_path", index_path,
        "--f0_method", "rmvpe",
        # The value is stringified onto a command line, so a float such as
        # 2.0 would otherwise be sent as "2.0" rather than "2".
        "--pitch", str(int(round(f0_shift))),
        "--index_rate", str(index_rate),
        "--filter_radius", "3",
        "--volume_envelope", "0.25",
        "--protect", "0.33",
        "--hop_length", "128",
        "--split_audio", "False",
        "--f0_autotune", "False",
        "--clean_audio", "True",
        "--clean_strength", "0.5",
        "--export_format", "WAV",
        "--embedder_model", "contentvec",
    ]

    # Put Applio's directories first on the child's module search path, but
    # keep whatever PYTHONPATH this process inherited instead of discarding it.
    search_path = [APPLIO_DIR, f"{APPLIO_DIR}/rvc/train"]
    inherited_path = os.environ.get("PYTHONPATH")
    if inherited_path:
        search_path.append(inherited_path)
    child_env = {**os.environ, "PYTHONPATH": os.pathsep.join(search_path)}

    logger.info("Running RVC inference...")
    try:
        result = subprocess.run(
            cmd,
            capture_output=True,
            text=True,
            timeout=timeout_seconds,
            cwd=APPLIO_DIR,
            env=child_env,
        )
    except subprocess.TimeoutExpired as exc:
        # Report a timeout the same way as every other inference failure.
        raise RuntimeError(
            f"RVC inference timed out after {timeout_seconds} seconds"
        ) from exc

    if result.returncode != 0:
        logger.error(f"RVC STDOUT: {result.stdout}")
        logger.error(f"RVC STDERR: {result.stderr}")
        raise RuntimeError(f"RVC inference failed: {result.stderr[-500:]}")

    if not os.path.exists(output_path):
        # A zero exit status without an output file leaves only the captured
        # streams to diagnose from, so record them before failing.
        logger.warning(f"RVC exited with status 0 but wrote nothing to {output_path}")
        logger.warning(f"RVC STDOUT: {result.stdout}")
        logger.warning(f"RVC STDERR: {result.stderr}")
        raise RuntimeError("RVC did not produce output file")

    logger.info(f"Voice conversion complete: {output_path}")

# ─── Main Pipeline ───────────────────────────────────────────

# Name prefix of the per-request working directories created under TEMP_DIR.
_REQUEST_DIR_PREFIX = "jerome_"


def _prune_stale_request_dirs(max_age_seconds: float = 3600.0) -> None:
    """Best-effort removal of per-request directories older than the cutoff."""
    import time  # local import: the module's import block does not include it

    cutoff = time.time() - max_age_seconds
    try:
        with os.scandir(TEMP_DIR) as entries:
            candidates = [
                entry for entry in entries
                if entry.name.startswith(_REQUEST_DIR_PREFIX)
            ]
    except OSError:
        return

    for entry in candidates:
        try:
            is_stale_dir = (
                entry.is_dir(follow_symlinks=False)
                and entry.stat(follow_symlinks=False).st_mtime < cutoff
            )
        except OSError:
            # The entry vanished or is unreadable; leave it for a later pass.
            continue
        if is_stale_dir:
            shutil.rmtree(entry.path, ignore_errors=True)


def text_to_jerome(text: str, voice_name: str = "Guy (US)",
                   pitch_shift: int = 0, index_rate: float = 0.75):
    """Full pipeline: Text → Base TTS → RVC → Jerome's voice.

    Each call works in its own freshly created directory under TEMP_DIR, so
    simultaneous requests cannot delete or overwrite one another's audio.

    Args:
        text: What Jerome should say. ``None``, empty, or whitespace-only
            input produces no audio.
        voice_name: Key into TTS_VOICES; unknown keys use DEFAULT_VOICE.
        pitch_shift: Pitch shift forwarded to ``convert_voice`` as ``f0_shift``.
        index_rate: Index rate forwarded to ``convert_voice``.

    Returns:
        Path to the converted audio; or the path to the unconverted base TTS
        audio if conversion fails; or ``None`` when there is no text or no
        audio could be produced at all.
    """
    if not text or not text.strip():
        return None

    voice = TTS_VOICES.get(voice_name, DEFAULT_VOICE)

    os.makedirs(TEMP_DIR, exist_ok=True)
    # Unique directories are never overwritten by later requests, so old ones
    # are cleared here to keep disk usage bounded.
    # NOTE(review): presumably Gradio copies a returned file into its own
    # cache when serving it, which would make the one-hour default a generous
    # margin -- confirm against the deployed Gradio version.
    _prune_stale_request_dirs()

    work_dir = tempfile.mkdtemp(prefix=_REQUEST_DIR_PREFIX, dir=TEMP_DIR)
    base_path = os.path.join(work_dir, "base_tts.wav")
    output_path = os.path.join(work_dir, "jerome_output.wav")

    try:
        # Step 1: Generate base TTS
        asyncio.run(generate_base_tts(text, voice, base_path))

        if not os.path.exists(base_path):
            shutil.rmtree(work_dir, ignore_errors=True)
            return None

        # Step 2: Convert to Jerome's voice
        convert_voice(
            input_path=base_path,
            output_path=output_path,
            model_path=os.path.join(MODEL_DIR, MODEL_FILE),
            index_path=os.path.join(MODEL_DIR, INDEX_FILE),
            f0_shift=pitch_shift,
            index_rate=index_rate,
        )

        if os.path.exists(output_path):
            return output_path
        return base_path  # Fallback to base TTS

    except Exception as e:
        # Deliberate best-effort boundary: the user still gets the base TTS
        # audio when conversion fails. Log with traceback so the cause is
        # not lost.
        logger.exception(f"Pipeline error: {e}")
        if os.path.exists(base_path):
            return base_path
        shutil.rmtree(work_dir, ignore_errors=True)
        return None

# ─── Gradio UI ───────────────────────────────────────────────

def build_ui():
    """Assemble the Gradio Blocks app and return it without launching it.

    Layout: a title banner; a row with the text box and generate button next
    to the audio player; a collapsed "Advanced Settings" accordion holding the
    base-voice dropdown and the pitch / index-rate sliders; and a list of
    example phrases. Both the button click and pressing Enter in the text box
    run ``text_to_jerome`` with the four input components and send the result
    to the audio player.
    """
    palette = gr.themes.colors
    theme = gr.themes.Base(
        primary_hue=palette.orange,
        secondary_hue=palette.amber,
        neutral_hue=palette.gray,
        font=["Inter", "system-ui", "sans-serif"],
    )

    # Gradient title, muted subtitle, and a hidden page footer.
    page_css = """
        .main-title { 
            text-align: center; 
            font-size: 2.5em; 
            font-weight: 800;
            margin-bottom: 0;
            background: linear-gradient(135deg, #ff6b35, #f7c948);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
        }
        .subtitle {
            text-align: center;
            color: #666;
            font-size: 1.1em;
            margin-top: 0;
        }
        footer { display: none !important; }
        """

    banner_html = """
            <h1 class="main-title">🗽 Jerome Voice Generator</h1>
            <p class="subtitle">Type anything and hear Jerome say it — straight outta New York</p>
        """

    # One-click sample sentences; each row fills the text box only.
    sample_phrases = [
        ["Yo what's good everybody, welcome back to the show!"],
        ["Let me tell you somethin', this team ain't got what it takes to win a championship."],
        ["I'm walkin' here! You believe this guy? Unbelievable."],
        ["Listen, the pizza in this city? Fuggedaboutit. Best in the world, no question."],
        ["Alright folks, that's gonna wrap it up for tonight. Thanks for tuning in!"],
    ]

    with gr.Blocks(title="Jerome Voice Generator", theme=theme, css=page_css) as demo:

        gr.HTML(banner_html)

        with gr.Row():
            with gr.Column(scale=3):
                text_input = gr.Textbox(
                    label="What should Jerome say?",
                    placeholder="Yo, let me tell you somethin' about this game right here...",
                    lines=3,
                    max_lines=10,
                )

                generate_btn = gr.Button(
                    "🎤 Make Jerome Say It",
                    variant="primary",
                    size="lg",
                )

            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="Jerome's Voice",
                    type="filepath",
                )

        with gr.Accordion("⚙️ Advanced Settings", open=False):
            with gr.Row():
                voice_select = gr.Dropdown(
                    choices=list(TTS_VOICES.keys()),
                    value="Guy (US)",
                    label="Base Voice (input to RVC)",
                    info="The base TTS voice that gets converted to Jerome's voice"
                )
                pitch_slider = gr.Slider(
                    minimum=-12, maximum=12, value=0, step=1,
                    label="Pitch Shift (semitones)",
                    info="Adjust if the output pitch sounds off"
                )
                rate_slider = gr.Slider(
                    minimum=0, maximum=1, value=0.75, step=0.05,
                    label="Index Rate",
                    info="How much to use the voice index (higher = more like training data)"
                )

        # Example phrases
        gr.Examples(
            examples=sample_phrases,
            inputs=text_input,
        )

        # The button click and Enter in the text box trigger the same
        # pipeline with the same inputs and output; register click first.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=text_to_jerome,
                inputs=[text_input, voice_select, pitch_slider, rate_slider],
                outputs=audio_output,
            )

    return demo

# ─── Launch ──────────────────────────────────────────────────

if __name__ == "__main__":
    # Script entry point: fetch the model files, then serve the UI.
    logger.info("🗽 Starting Jerome Voice Generator...")

    # setup() raises if Applio is missing, so the UI is never started in a
    # state where conversion cannot run.
    model_path, index_path = setup()
    logger.info(f"Model ready: {model_path}")
    logger.info(f"Index ready: {index_path}")

    # Listen on all interfaces at port 7860, without a public share link.
    launch_options = {
        "server_name": "0.0.0.0",
        "server_port": 7860,
        "share": False,
    }
    demo = build_ui()
    demo.launch(**launch_options)