""" 🗽 Jerome Voice Generator Type anything → hear Jerome say it with his thick New York accent. Uses Edge TTS for base speech + RVC for voice conversion. """ import os import sys import subprocess import asyncio import tempfile import shutil import logging import gradio as gr import edge_tts from huggingface_hub import hf_hub_download logging.basicConfig(level=logging.INFO) logger = logging.getLogger(__name__) # ─── Configuration ─────────────────────────────────────────── MODEL_REPO = "khobster/jerome" MODEL_FILE = "jerome_100e_1000s.pth" INDEX_FILE = "jerome.index" APPLIO_DIR = "/app/applio" MODEL_DIR = "/app/models" TEMP_DIR = "/app/temp" # Edge TTS voices (male voices that work well as RVC input) TTS_VOICES = { "Guy (US)": "en-US-GuyNeural", "Andrew (US)": "en-US-AndrewNeural", "Eric (US)": "en-US-EricNeural", "Christopher (US)": "en-US-ChristopherNeural", "Roger (US)": "en-US-RogerNeural", "Ryan (UK)": "en-GB-RyanNeural", } DEFAULT_VOICE = "en-US-GuyNeural" # ─── Setup ─────────────────────────────────────────────────── def setup(): """Download model files and verify Applio installation.""" os.makedirs(MODEL_DIR, exist_ok=True) os.makedirs(TEMP_DIR, exist_ok=True) # Download RVC model from HuggingFace logger.info("Downloading Jerome's RVC model...") model_path = hf_hub_download( repo_id=MODEL_REPO, filename=MODEL_FILE, local_dir=MODEL_DIR, ) logger.info(f"Model downloaded: {model_path}") index_path = hf_hub_download( repo_id=MODEL_REPO, filename=INDEX_FILE, local_dir=MODEL_DIR, ) logger.info(f"Index downloaded: {index_path}") # Verify Applio is available if not os.path.exists(os.path.join(APPLIO_DIR, "core.py")): raise RuntimeError("Applio not found! Check Dockerfile.") return model_path, index_path # ─── TTS Engine ────────────────────────────────────────────── async def generate_base_tts(text: str, voice: str, output_path: str): """Generate base speech using Edge TTS.""" communicate = edge_tts.Communicate(text, voice) await communicate.save(output_path) logger.info(f"Base TTS generated: {output_path}") # ─── RVC Conversion ───────────────────────────────────────── def convert_voice(input_path: str, output_path: str, model_path: str, index_path: str, f0_shift: int = 0, index_rate: float = 0.75): """Convert voice using Applio's RVC inference.""" cmd = [ sys.executable, os.path.join(APPLIO_DIR, "core.py"), "infer", "--input_path", input_path, "--output_path", output_path, "--pth_path", model_path, "--index_path", index_path, "--f0_method", "rmvpe", "--pitch", str(f0_shift), "--index_rate", str(index_rate), "--filter_radius", "3", "--volume_envelope", "0.25", "--protect", "0.33", "--hop_length", "128", "--split_audio", "False", "--f0_autotune", "False", "--clean_audio", "True", "--clean_strength", "0.5", "--export_format", "WAV", "--embedder_model", "contentvec", ] logger.info(f"Running RVC inference...") result = subprocess.run( cmd, capture_output=True, text=True, timeout=120, cwd=APPLIO_DIR, env={**os.environ, "PYTHONPATH": f"{APPLIO_DIR}:{APPLIO_DIR}/rvc/train"} ) if result.returncode != 0: logger.error(f"RVC STDOUT: {result.stdout}") logger.error(f"RVC STDERR: {result.stderr}") raise RuntimeError(f"RVC inference failed: {result.stderr[-500:]}") if not os.path.exists(output_path): # Check if output was saved elsewhere logger.warning(f"Output not at expected path, searching...") raise RuntimeError("RVC did not produce output file") logger.info(f"Voice conversion complete: {output_path}") # ─── Main Pipeline ─────────────────────────────────────────── def text_to_jerome(text: str, voice_name: str = "Guy (US)", pitch_shift: int = 0, index_rate: float = 0.75): """Full pipeline: Text → Base TTS → RVC → Jerome's voice""" if not text.strip(): return None voice = TTS_VOICES.get(voice_name, DEFAULT_VOICE) # Create temp files base_path = os.path.join(TEMP_DIR, "base_tts.wav") output_path = os.path.join(TEMP_DIR, "jerome_output.wav") # Clean up old files for p in [base_path, output_path]: if os.path.exists(p): os.remove(p) try: # Step 1: Generate base TTS asyncio.run(generate_base_tts(text, voice, base_path)) if not os.path.exists(base_path): return None # Step 2: Convert to Jerome's voice convert_voice( input_path=base_path, output_path=output_path, model_path=os.path.join(MODEL_DIR, MODEL_FILE), index_path=os.path.join(MODEL_DIR, INDEX_FILE), f0_shift=pitch_shift, index_rate=index_rate, ) if os.path.exists(output_path): return output_path else: return base_path # Fallback to base TTS except Exception as e: logger.error(f"Pipeline error: {e}") # Return base TTS as fallback if os.path.exists(base_path): return base_path return None # ─── Gradio UI ─────────────────────────────────────────────── def build_ui(): """Build the Gradio interface.""" with gr.Blocks( title="Jerome Voice Generator", theme=gr.themes.Base( primary_hue=gr.themes.colors.orange, secondary_hue=gr.themes.colors.amber, neutral_hue=gr.themes.colors.gray, font=["Inter", "system-ui", "sans-serif"], ), css=""" .main-title { text-align: center; font-size: 2.5em; font-weight: 800; margin-bottom: 0; background: linear-gradient(135deg, #ff6b35, #f7c948); -webkit-background-clip: text; -webkit-text-fill-color: transparent; } .subtitle { text-align: center; color: #666; font-size: 1.1em; margin-top: 0; } footer { display: none !important; } """ ) as demo: gr.HTML("""
Type anything and hear Jerome say it — straight outta New York
""") with gr.Row(): with gr.Column(scale=3): text_input = gr.Textbox( label="What should Jerome say?", placeholder="Yo, let me tell you somethin' about this game right here...", lines=3, max_lines=10, ) generate_btn = gr.Button( "🎤 Make Jerome Say It", variant="primary", size="lg", ) with gr.Column(scale=2): audio_output = gr.Audio( label="Jerome's Voice", type="filepath", ) with gr.Accordion("⚙️ Advanced Settings", open=False): with gr.Row(): voice_select = gr.Dropdown( choices=list(TTS_VOICES.keys()), value="Guy (US)", label="Base Voice (input to RVC)", info="The base TTS voice that gets converted to Jerome's voice" ) pitch_shift = gr.Slider( minimum=-12, maximum=12, value=0, step=1, label="Pitch Shift (semitones)", info="Adjust if the output pitch sounds off" ) index_rate = gr.Slider( minimum=0, maximum=1, value=0.75, step=0.05, label="Index Rate", info="How much to use the voice index (higher = more like training data)" ) # Example phrases gr.Examples( examples=[ ["Yo what's good everybody, welcome back to the show!"], ["Let me tell you somethin', this team ain't got what it takes to win a championship."], ["I'm walkin' here! You believe this guy? Unbelievable."], ["Listen, the pizza in this city? Fuggedaboutit. Best in the world, no question."], ["Alright folks, that's gonna wrap it up for tonight. Thanks for tuning in!"], ], inputs=text_input, ) generate_btn.click( fn=text_to_jerome, inputs=[text_input, voice_select, pitch_shift, index_rate], outputs=audio_output, ) # Also generate on Enter text_input.submit( fn=text_to_jerome, inputs=[text_input, voice_select, pitch_shift, index_rate], outputs=audio_output, ) return demo # ─── Launch ────────────────────────────────────────────────── if __name__ == "__main__": logger.info("🗽 Starting Jerome Voice Generator...") # Setup: download model model_path, index_path = setup() logger.info(f"Model ready: {model_path}") logger.info(f"Index ready: {index_path}") # Build and launch UI demo = build_ui() demo.launch( server_name="0.0.0.0", server_port=7860, share=False, )