jerome / app.py
khobster's picture
Upload 3 files
6ef63ba verified
"""
🗽 Jerome Voice Generator
Type anything → hear Jerome say it with his thick New York accent.
Uses Edge TTS for base speech + RVC for voice conversion.
"""
import os
import sys
import subprocess
import asyncio
import tempfile
import shutil
import logging
import gradio as gr
import edge_tts
from huggingface_hub import hf_hub_download
# Module-wide logging: INFO and above, one logger named after this module.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# ─── Configuration ───────────────────────────────────────────
# Hugging Face repository and the two artifacts fetched from it.
MODEL_REPO = "khobster/jerome"
MODEL_FILE = "jerome_100e_1000s.pth"
INDEX_FILE = "jerome.index"

# Filesystem locations used by the app.
APPLIO_DIR = "/app/applio"
MODEL_DIR = "/app/models"
TEMP_DIR = "/app/temp"

# Edge TTS voices offered in the UI, keyed by display label
# (male voices that work well as RVC input).
TTS_VOICES = {
    "Guy (US)": "en-US-GuyNeural",
    "Andrew (US)": "en-US-AndrewNeural",
    "Eric (US)": "en-US-EricNeural",
    "Christopher (US)": "en-US-ChristopherNeural",
    "Roger (US)": "en-US-RogerNeural",
    "Ryan (UK)": "en-GB-RyanNeural",
}

# Voice used when a requested label is not in TTS_VOICES.
DEFAULT_VOICE = TTS_VOICES["Guy (US)"]
# ─── Setup ───────────────────────────────────────────────────
def setup():
    """Prepare the runtime: verify Applio, then fetch Jerome's model files.

    Creates ``MODEL_DIR`` and ``TEMP_DIR`` if they are missing, confirms that
    Applio's ``core.py`` exists under ``APPLIO_DIR``, and downloads
    ``MODEL_FILE`` and ``INDEX_FILE`` from ``MODEL_REPO`` into ``MODEL_DIR``.

    Returns:
        A ``(model_path, index_path)`` tuple holding the paths returned by
        ``hf_hub_download`` for the model and the index file.

    Raises:
        RuntimeError: If ``core.py`` is not present under ``APPLIO_DIR``.
    """
    os.makedirs(MODEL_DIR, exist_ok=True)
    os.makedirs(TEMP_DIR, exist_ok=True)

    # Check the local Applio install first: the test is cheap, and a broken
    # image should fail before any download is attempted.
    if not os.path.exists(os.path.join(APPLIO_DIR, "core.py")):
        raise RuntimeError("Applio not found! Check Dockerfile.")

    # Download the RVC model and its index from Hugging Face.
    logger.info("Downloading Jerome's RVC model...")
    model_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=MODEL_FILE,
        local_dir=MODEL_DIR,
    )
    logger.info("Model downloaded: %s", model_path)

    index_path = hf_hub_download(
        repo_id=MODEL_REPO,
        filename=INDEX_FILE,
        local_dir=MODEL_DIR,
    )
    logger.info("Index downloaded: %s", index_path)

    return model_path, index_path
# ─── TTS Engine ──────────────────────────────────────────────
async def generate_base_tts(text: str, voice: str, output_path: str):
    """Synthesize *text* with Edge TTS and save the audio to *output_path*.

    Args:
        text: The sentence(s) to speak.
        voice: Edge TTS voice identifier, e.g. ``"en-US-GuyNeural"``.
        output_path: File the synthesized audio is saved to.
    """
    await edge_tts.Communicate(text, voice).save(output_path)
    logger.info(f"Base TTS generated: {output_path}")
# ─── RVC Conversion ─────────────────────────────────────────
def convert_voice(input_path: str, output_path: str, model_path: str,
                  index_path: str, f0_shift: int = 0, index_rate: float = 0.75):
    """Convert a recording to Jerome's voice using Applio's RVC inference CLI.

    Runs ``core.py infer`` from ``APPLIO_DIR`` in a child Python process and
    waits up to 120 seconds for it to finish.

    Args:
        input_path: Audio file to convert (passed as ``--input_path``).
        output_path: Where the converted audio is expected to appear
            (passed as ``--output_path``).
        model_path: Model weights file (passed as ``--pth_path``).
        index_path: Voice index file (passed as ``--index_path``).
        f0_shift: Value passed as ``--pitch``.
        index_rate: Value passed as ``--index_rate``.

    Raises:
        RuntimeError: If the child process exits with a non-zero status, or
            exits successfully without leaving a file at ``output_path``.
        subprocess.TimeoutExpired: If inference exceeds the 120 s timeout.
    """
    cmd = [
        sys.executable, os.path.join(APPLIO_DIR, "core.py"), "infer",
        "--input_path", input_path,
        "--output_path", output_path,
        "--pth_path", model_path,
        "--index_path", index_path,
        "--f0_method", "rmvpe",
        "--pitch", str(f0_shift),
        "--index_rate", str(index_rate),
        "--filter_radius", "3",
        "--volume_envelope", "0.25",
        "--protect", "0.33",
        "--hop_length", "128",
        "--split_audio", "False",
        "--f0_autotune", "False",
        "--clean_audio", "True",
        "--clean_strength", "0.5",
        "--export_format", "WAV",
        "--embedder_model", "contentvec",
    ]

    logger.info("Running RVC inference...")
    result = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        timeout=120,
        cwd=APPLIO_DIR,
        # PYTHONPATH is replaced (not extended) with the Applio directories.
        env={**os.environ, "PYTHONPATH": f"{APPLIO_DIR}:{APPLIO_DIR}/rvc/train"},
    )

    if result.returncode != 0:
        logger.error("RVC STDOUT: %s", result.stdout)
        logger.error("RVC STDERR: %s", result.stderr)
        # Only the tail of stderr goes into the exception to keep it short.
        raise RuntimeError(f"RVC inference failed: {result.stderr[-500:]}")

    if not os.path.exists(output_path):
        # The process reported success but wrote nothing where we asked; no
        # other location is searched, so surface its output for diagnosis.
        logger.error("RVC exited 0 but wrote no file at %s", output_path)
        logger.error("RVC STDOUT: %s", result.stdout)
        logger.error("RVC STDERR: %s", result.stderr)
        raise RuntimeError("RVC did not produce output file")

    logger.info("Voice conversion complete: %s", output_path)
# ─── Main Pipeline ───────────────────────────────────────────
# Per-request working directories live under TEMP_DIR with this prefix and
# are pruned once they are older than the age below.
_REQUEST_DIR_PREFIX = "req_"
_REQUEST_DIR_MAX_AGE_SECONDS = 60 * 60


def _prune_stale_request_dirs(root: str, max_age_seconds: float) -> None:
    """Best-effort removal of request directories older than *max_age_seconds*."""
    import time  # local import: not part of the module's import block

    cutoff = time.time() - max_age_seconds
    try:
        with os.scandir(root) as scan:
            entries = list(scan)
    except OSError:
        return  # nothing to prune if the root cannot be listed

    for entry in entries:
        if not entry.name.startswith(_REQUEST_DIR_PREFIX):
            continue
        try:
            expired = (entry.is_dir(follow_symlinks=False)
                       and entry.stat(follow_symlinks=False).st_mtime < cutoff)
        except OSError:
            continue  # entry vanished, e.g. pruned by another request
        if expired:
            shutil.rmtree(entry.path, ignore_errors=True)


def text_to_jerome(text: str, voice_name: str = "Guy (US)",
                   pitch_shift: int = 0, index_rate: float = 0.75):
    """Full pipeline: text -> base TTS -> RVC -> Jerome's voice.

    Each call works in its own directory under ``TEMP_DIR`` so that calls
    which overlap in time cannot overwrite or delete each other's audio.

    Args:
        text: What Jerome should say. Empty, whitespace-only or ``None``
            input produces no audio.
        voice_name: Key of ``TTS_VOICES``; unknown keys fall back to
            ``DEFAULT_VOICE``.
        pitch_shift: Forwarded to ``convert_voice`` as ``f0_shift``.
        index_rate: Forwarded to ``convert_voice`` as ``index_rate``.

    Returns:
        Path of the converted audio file; the path of the unconverted base
        TTS file if conversion failed but TTS succeeded; ``None`` if there
        is nothing to play.
    """
    if not text or not text.strip():
        return None

    voice = TTS_VOICES.get(voice_name, DEFAULT_VOICE)

    try:
        os.makedirs(TEMP_DIR, exist_ok=True)
        _prune_stale_request_dirs(TEMP_DIR, _REQUEST_DIR_MAX_AGE_SECONDS)
        request_dir = tempfile.mkdtemp(prefix=_REQUEST_DIR_PREFIX, dir=TEMP_DIR)
    except OSError:
        logger.exception("Could not create a working directory under %s", TEMP_DIR)
        return None

    base_path = os.path.join(request_dir, "base_tts.wav")
    output_path = os.path.join(request_dir, "jerome_output.wav")

    try:
        # Step 1: generate the base TTS audio.
        asyncio.run(generate_base_tts(text, voice, base_path))
        if not os.path.exists(base_path):
            return None

        # Step 2: convert it to Jerome's voice.
        convert_voice(
            input_path=base_path,
            output_path=output_path,
            model_path=os.path.join(MODEL_DIR, MODEL_FILE),
            index_path=os.path.join(MODEL_DIR, INDEX_FILE),
            f0_shift=pitch_shift,
            index_rate=index_rate,
        )
    except Exception as exc:
        # Deliberate best effort: keep the traceback in the log, and still
        # give the caller the base TTS audio when it exists.
        logger.exception("Pipeline error: %s", exc)
        return base_path if os.path.exists(base_path) else None

    # Fall back to the base TTS if the converted file is unexpectedly absent.
    return output_path if os.path.exists(output_path) else base_path
# ─── Gradio UI ───────────────────────────────────────────────
def build_ui():
    """Build the Gradio interface and return the ``gr.Blocks`` app.

    The page has a text box and a generate button, an audio player for the
    result, an "Advanced Settings" accordion (base voice, pitch shift, index
    rate) and a list of example phrases. Clicking the button and submitting
    the text box both call ``text_to_jerome`` with the same inputs.

    Returns:
        The constructed ``gr.Blocks`` instance (not yet launched).
    """
    with gr.Blocks(
        title="Jerome Voice Generator",
        theme=gr.themes.Base(
            primary_hue=gr.themes.colors.orange,
            secondary_hue=gr.themes.colors.amber,
            neutral_hue=gr.themes.colors.gray,
            font=["Inter", "system-ui", "sans-serif"],
        ),
        css="""
        .main-title {
            text-align: center;
            font-size: 2.5em;
            font-weight: 800;
            margin-bottom: 0;
            background: linear-gradient(135deg, #ff6b35, #f7c948);
            -webkit-background-clip: text;
            -webkit-text-fill-color: transparent;
        }
        .subtitle {
            text-align: center;
            color: #666;
            font-size: 1.1em;
            margin-top: 0;
        }
        footer { display: none !important; }
        """
    ) as demo:
        gr.HTML("""
        <h1 class="main-title">🗽 Jerome Voice Generator</h1>
        <p class="subtitle">Type anything and hear Jerome say it — straight outta New York</p>
        """)

        with gr.Row():
            with gr.Column(scale=3):
                text_input = gr.Textbox(
                    label="What should Jerome say?",
                    placeholder="Yo, let me tell you somethin' about this game right here...",
                    lines=3,
                    max_lines=10,
                )
                generate_btn = gr.Button(
                    "🎤 Make Jerome Say It",
                    variant="primary",
                    size="lg",
                )
            with gr.Column(scale=2):
                audio_output = gr.Audio(
                    label="Jerome's Voice",
                    type="filepath",
                )

        with gr.Accordion("⚙️ Advanced Settings", open=False):
            with gr.Row():
                voice_select = gr.Dropdown(
                    choices=list(TTS_VOICES.keys()),
                    value="Guy (US)",
                    label="Base Voice (input to RVC)",
                    info="The base TTS voice that gets converted to Jerome's voice"
                )
                pitch_shift = gr.Slider(
                    minimum=-12, maximum=12, value=0, step=1,
                    label="Pitch Shift (semitones)",
                    info="Adjust if the output pitch sounds off"
                )
                index_rate = gr.Slider(
                    minimum=0, maximum=1, value=0.75, step=0.05,
                    label="Index Rate",
                    info="How much to use the voice index (higher = more like training data)"
                )

        # Example phrases that fill the text box when selected.
        gr.Examples(
            examples=[
                ["Yo what's good everybody, welcome back to the show!"],
                ["Let me tell you somethin', this team ain't got what it takes to win a championship."],
                ["I'm walkin' here! You believe this guy? Unbelievable."],
                ["Listen, the pizza in this city? Fuggedaboutit. Best in the world, no question."],
                ["Alright folks, that's gonna wrap it up for tonight. Thanks for tuning in!"],
            ],
            inputs=text_input,
        )

        # The button click and pressing Enter in the text box run the same
        # pipeline, so both listeners are registered from one definition.
        for register in (generate_btn.click, text_input.submit):
            register(
                fn=text_to_jerome,
                inputs=[text_input, voice_select, pitch_shift, index_rate],
                outputs=audio_output,
            )

    return demo
# ─── Launch ──────────────────────────────────────────────────
if __name__ == "__main__":
    logger.info("🗽 Starting Jerome Voice Generator...")

    # Setup: verify Applio and download the model before serving requests.
    model_path, index_path = setup()
    logger.info("Model ready: %s", model_path)
    logger.info("Index ready: %s", index_path)

    # Build and launch the UI on all interfaces at port 7860, with no public
    # share link.
    # NOTE(review): presumably runs inside a container (setup() refers to a
    # Dockerfile); confirm the port matches the deployment's expectations.
    demo = build_ui()
    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
    )