| """ |
| 🗽 Jerome Voice Generator |
| Type anything — hear Jerome say it with his thick New York accent. |
| Uses Edge TTS for base speech + RVC for voice conversion. |
| """ |
|
|
| import os |
| import sys |
| import subprocess |
| import asyncio |
| import tempfile |
| import shutil |
| import logging |
| import gradio as gr |
| import edge_tts |
| from huggingface_hub import hf_hub_download |
|
|
# Root logging at INFO, plus a logger named after this module that every
# function below writes to.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Hugging Face repository and the two artifact file names fetched from it.
MODEL_REPO = "khobster/jerome"
MODEL_FILE = "jerome_100e_1000s.pth"
INDEX_FILE = "jerome.index"

# Absolute filesystem locations: the Applio checkout, the download target
# for the model artifacts, and the scratch directory for generated audio.
APPLIO_DIR = "/app/applio"
MODEL_DIR = "/app/models"
TEMP_DIR = "/app/temp"
|
|
| |
# Maps the label shown to the user onto the Edge TTS voice identifier that
# is handed to edge_tts.Communicate.
TTS_VOICES = {
    "Guy (US)": "en-US-GuyNeural",
    "Andrew (US)": "en-US-AndrewNeural",
    "Eric (US)": "en-US-EricNeural",
    "Christopher (US)": "en-US-ChristopherNeural",
    "Roger (US)": "en-US-RogerNeural",
    "Ryan (UK)": "en-GB-RyanNeural",
}

# Voice identifier used when a requested label is not in TTS_VOICES
# (resolves to "en-US-GuyNeural").
DEFAULT_VOICE = TTS_VOICES["Guy (US)"]
|
|
| |
|
|
def setup():
    """Create working directories, fetch the RVC artifacts, and check Applio.

    The model and index files are downloaded from ``MODEL_REPO`` into
    ``MODEL_DIR`` first; only afterwards is the Applio checkout verified.

    Returns:
        A ``(model_path, index_path)`` tuple holding the values returned by
        ``hf_hub_download`` for ``MODEL_FILE`` and ``INDEX_FILE``.

    Raises:
        RuntimeError: If ``core.py`` does not exist under ``APPLIO_DIR``.
    """

    def _fetch(filename):
        # Both artifacts come from the same repo and land in the same folder.
        return hf_hub_download(
            repo_id=MODEL_REPO,
            filename=filename,
            local_dir=MODEL_DIR,
        )

    for directory in (MODEL_DIR, TEMP_DIR):
        os.makedirs(directory, exist_ok=True)

    logger.info("Downloading Jerome's RVC model...")
    model_path = _fetch(MODEL_FILE)
    logger.info(f"Model downloaded: {model_path}")

    index_path = _fetch(INDEX_FILE)
    logger.info(f"Index downloaded: {index_path}")

    applio_entry = os.path.join(APPLIO_DIR, "core.py")
    if not os.path.exists(applio_entry):
        raise RuntimeError("Applio not found! Check Dockerfile.")

    return model_path, index_path
|
|
| |
|
|
async def generate_base_tts(text: str, voice: str, output_path: str):
    """Synthesize ``text`` with Edge TTS and write the audio to ``output_path``.

    Args:
        text: The sentence(s) to speak.
        voice: Edge TTS voice identifier passed to ``edge_tts.Communicate``.
        output_path: Destination file for the synthesized audio.
    """
    await edge_tts.Communicate(text, voice).save(output_path)
    logger.info(f"Base TTS generated: {output_path}")
|
|
| |
|
|
def convert_voice(input_path: str, output_path: str, model_path: str,
                  index_path: str, f0_shift: int = 0, index_rate: float = 0.75):
    """Convert voice using Applio's RVC inference.

    Runs ``core.py infer`` from ``APPLIO_DIR`` in a child Python process
    (same interpreter as this one) and waits up to 120 seconds for it.

    Args:
        input_path: Audio file to convert.
        output_path: Where the converted audio is expected to be written.
        model_path: Path to the RVC ``.pth`` model, passed as ``--pth_path``.
        index_path: Path to the index file, passed as ``--index_path``.
        f0_shift: Value forwarded as ``--pitch``.
        index_rate: Value forwarded as ``--index_rate``.

    Raises:
        RuntimeError: If the child exits with a non-zero status, or exits
            cleanly but ``output_path`` does not exist afterwards.
        subprocess.TimeoutExpired: If the child runs longer than 120 seconds.
    """
    # Only pitch and index rate are caller-controlled; every other inference
    # option is pinned to a fixed value.
    cmd = [
        sys.executable, os.path.join(APPLIO_DIR, "core.py"), "infer",
        "--input_path", input_path,
        "--output_path", output_path,
        "--pth_path", model_path,
        "--index_path", index_path,
        "--f0_method", "rmvpe",
        "--pitch", str(f0_shift),
        "--index_rate", str(index_rate),
        "--filter_radius", "3",
        "--volume_envelope", "0.25",
        "--protect", "0.33",
        "--hop_length", "128",
        "--split_audio", "False",
        "--f0_autotune", "False",
        "--clean_audio", "True",
        "--clean_strength", "0.5",
        "--export_format", "WAV",
        "--embedder_model", "contentvec",
    ]

    # Applio's directories go first on the child's import path, but any
    # PYTHONPATH inherited from the parent is kept rather than discarded.
    search_path = [APPLIO_DIR, f"{APPLIO_DIR}/rvc/train"]
    inherited = os.environ.get("PYTHONPATH")
    if inherited:
        search_path.append(inherited)
    child_env = {**os.environ, "PYTHONPATH": os.pathsep.join(search_path)}

    logger.info("Running RVC inference...")
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=120,
        cwd=APPLIO_DIR,
        env=child_env,
    )

    if result.returncode != 0:
        logger.error(f"RVC STDOUT: {result.stdout}")
        logger.error(f"RVC STDERR: {result.stderr}")
        # Only the tail of stderr goes into the exception message; the full
        # streams are in the log lines above.
        raise RuntimeError(f"RVC inference failed: {result.stderr[-500:]}")

    if not os.path.exists(output_path):
        # A zero exit status alone is not treated as success; the output
        # file must actually be present.
        logger.error(f"RVC exited cleanly but wrote nothing to {output_path}")
        raise RuntimeError("RVC did not produce output file")

    logger.info(f"Voice conversion complete: {output_path}")
|
|
| |
|
|
def text_to_jerome(text: str, voice_name: str = "Guy (US)",
                   pitch_shift: int = 0, index_rate: float = 0.75):
    """Full pipeline: Text → Base TTS → RVC → Jerome's voice.

    Args:
        text: What should be spoken.
        voice_name: Key into ``TTS_VOICES``; unknown keys fall back to
            ``DEFAULT_VOICE``.
        pitch_shift: Forwarded to ``convert_voice`` as ``f0_shift``.
        index_rate: Forwarded to ``convert_voice`` unchanged.

    Returns:
        The path of the converted audio on success. If anything fails after
        the base TTS file was written, that unconverted file's path is
        returned instead (best-effort fallback). ``None`` when ``text`` is
        ``None``, empty or whitespace-only, or when no base TTS file exists.
    """
    # None is treated like empty text instead of raising AttributeError
    # on .strip().
    if not text or not text.strip():
        return None

    voice = TTS_VOICES.get(voice_name, DEFAULT_VOICE)

    # NOTE(review): these file names are fixed, so overlapping requests would
    # read and delete each other's audio - confirm requests are serialized.
    base_path = os.path.join(TEMP_DIR, "base_tts.wav")
    output_path = os.path.join(TEMP_DIR, "jerome_output.wav")

    # Drop leftovers from the previous run so the existence checks below can
    # only see files produced by this call.
    for stale in (base_path, output_path):
        try:
            os.remove(stale)
        except FileNotFoundError:
            pass

    try:
        asyncio.run(generate_base_tts(text, voice, base_path))

        if not os.path.exists(base_path):
            return None

        convert_voice(
            input_path=base_path,
            output_path=output_path,
            model_path=os.path.join(MODEL_DIR, MODEL_FILE),
            index_path=os.path.join(MODEL_DIR, INDEX_FILE),
            f0_shift=pitch_shift,
            index_rate=index_rate,
        )
    except Exception as e:
        # Deliberate best-effort boundary: keep the traceback in the log and
        # fall back to the unconverted speech when it is available.
        logger.exception(f"Pipeline error: {e}")
        return base_path if os.path.exists(base_path) else None

    return output_path if os.path.exists(output_path) else base_path
|
|
| |
|
|
def build_ui():
    """Build the Gradio interface.

    Lays out a text box and generate button next to an audio player, an
    "Advanced Settings" accordion (base voice, pitch shift, index rate) and
    a set of example sentences, then wires both the button click and the
    text box submit event to ``text_to_jerome``.

    Returns:
        The assembled ``gr.Blocks`` app; it is not launched here.
    """
    theme = gr.themes.Base(
        primary_hue=gr.themes.colors.orange,
        secondary_hue=gr.themes.colors.amber,
        neutral_hue=gr.themes.colors.gray,
        font=["Inter", "system-ui", "sans-serif"],
    )
    css = """
        .main-title {
            text-align: center;
            font-size: 2.5em;
            font-weight: 800;
            margin-bottom: 0;
            background: linear-gradient(135deg, #ff6b35, #f7c948);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
        }
        .subtitle {
            text-align: center;
            color: #666;
            font-size: 1.1em;
            margin-top: 0;
        }
        footer { display: none !important; }
    """

    with gr.Blocks(title="Jerome Voice Generator", theme=theme, css=css) as demo:

        gr.HTML("""
            <h1 class="main-title">🗽 Jerome Voice Generator</h1>
            <p class="subtitle">Type anything and hear Jerome say it — straight outta New York</p>
        """)

        with gr.Row():
            with gr.Column(scale=3):
                text_input = gr.Textbox(
                    label="What should Jerome say?",
                    placeholder="Yo, let me tell you somethin' about this game right here...",
                    lines=3,
                    max_lines=10,
                )

                generate_btn = gr.Button(
                    "🎤 Make Jerome Say It",
                    variant="primary",
                    size="lg",
                )

            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="Jerome's Voice",
                    type="filepath",
                )

        with gr.Accordion("⚙️ Advanced Settings", open=False):
            with gr.Row():
                voice_select = gr.Dropdown(
                    choices=list(TTS_VOICES.keys()),
                    value="Guy (US)",
                    label="Base Voice (input to RVC)",
                    info="The base TTS voice that gets converted to Jerome's voice",
                )
                pitch_shift = gr.Slider(
                    minimum=-12, maximum=12, value=0, step=1,
                    label="Pitch Shift (semitones)",
                    info="Adjust if the output pitch sounds off",
                )
                index_rate = gr.Slider(
                    minimum=0, maximum=1, value=0.75, step=0.05,
                    label="Index Rate",
                    info="How much to use the voice index (higher = more like training data)",
                )

        # Clicking an example only fills the text box; no fn is attached.
        gr.Examples(
            examples=[
                ["Yo what's good everybody, welcome back to the show!"],
                ["Let me tell you somethin', this team ain't got what it takes to win a championship."],
                ["I'm walkin' here! You believe this guy? Unbelievable."],
                ["Listen, the pizza in this city? Fuggedaboutit. Best in the world, no question."],
                ["Alright folks, that's gonna wrap it up for tonight. Thanks for tuning in!"],
            ],
            inputs=text_input,
        )

        # The button click and the text box submit event run the same
        # pipeline with the same inputs and output, so they are wired once.
        pipeline_inputs = [text_input, voice_select, pitch_shift, index_rate]
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=text_to_jerome,
                inputs=pipeline_inputs,
                outputs=audio_output,
            )

    return demo
|
|
| |
|
|
if __name__ == "__main__":
    logger.info("🗽 Starting Jerome Voice Generator...")

    # Download the model artifacts and verify Applio before the UI exists,
    # so a broken install fails at startup rather than on the first request.
    model_path, index_path = setup()
    logger.info(f"Model ready: {model_path}")
    logger.info(f"Index ready: {index_path}")

    # Bind to all interfaces on port 7860 without creating a share link.
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )
|
|