jerome / app.py

Upload 3 files

6ef63ba verified about 1 month ago

10.5 kB

	"""
	🗽 Jerome Voice Generator
	Type anything → hear Jerome say it with his thick New York accent.
	Uses Edge TTS for base speech + RVC for voice conversion.
	"""

	import os
	import sys
	import subprocess
	import asyncio
	import tempfile
	import shutil
	import logging
	import gradio as gr
	import edge_tts
	from huggingface_hub import hf_hub_download

	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# ─── Configuration ───────────────────────────────────────────
	# Hugging Face repo and the two files setup() downloads from it.
	MODEL_REPO = "khobster/jerome"
	MODEL_FILE = "jerome_100e_1000s.pth"
	INDEX_FILE = "jerome.index"

	# Absolute runtime locations: the Applio checkout (must contain core.py),
	# the model download target, and the scratch area for generated audio.
	APPLIO_DIR = "/app/applio"
	MODEL_DIR = "/app/models"
	TEMP_DIR = "/app/temp"

	# Edge TTS voices (male voices that work well as RVC input).
	# Keys are the labels shown in the UI dropdown; values are Edge TTS voice ids.
	TTS_VOICES = {
	    "Guy (US)": "en-US-GuyNeural",
	    "Andrew (US)": "en-US-AndrewNeural",
	    "Eric (US)": "en-US-EricNeural",
	    "Christopher (US)": "en-US-ChristopherNeural",
	    "Roger (US)": "en-US-RogerNeural",
	    "Ryan (UK)": "en-GB-RyanNeural",
	}

	# Voice id used when a requested label is not a key of TTS_VOICES.
	DEFAULT_VOICE = TTS_VOICES["Guy (US)"]

	# ─── Setup ───────────────────────────────────────────────────

	def setup():
	    """Fetch Jerome's RVC assets and confirm the Applio checkout is present.

	    Creates MODEL_DIR and TEMP_DIR if they do not exist, downloads
	    MODEL_FILE and INDEX_FILE from MODEL_REPO into MODEL_DIR, then checks
	    that ``core.py`` exists under APPLIO_DIR.

	    Returns:
	        Tuple ``(model_path, index_path)`` with the paths reported by
	        ``hf_hub_download``.

	    Raises:
	        RuntimeError: if ``core.py`` is missing from APPLIO_DIR.
	    """
	    for directory in (MODEL_DIR, TEMP_DIR):
	        os.makedirs(directory, exist_ok=True)

	    def fetch(filename):
	        # Both assets come from the same repo and land in the same directory.
	        return hf_hub_download(
	            repo_id=MODEL_REPO,
	            filename=filename,
	            local_dir=MODEL_DIR,
	        )

	    logger.info("Downloading Jerome's RVC model...")
	    weights_file = fetch(MODEL_FILE)
	    logger.info(f"Model downloaded: {weights_file}")

	    index_file = fetch(INDEX_FILE)
	    logger.info(f"Index downloaded: {index_file}")

	    # The inference step shells out to this script, so it must exist.
	    core_script = os.path.join(APPLIO_DIR, "core.py")
	    if not os.path.exists(core_script):
	        raise RuntimeError("Applio not found! Check Dockerfile.")

	    return weights_file, index_file

	# ─── TTS Engine ──────────────────────────────────────────────

	async def generate_base_tts(text: str, voice: str, output_path: str):
	    """Synthesize ``text`` with Edge TTS and save the audio to ``output_path``.

	    Args:
	        text: The words to speak.
	        voice: Edge TTS voice id passed to ``edge_tts.Communicate``.
	        output_path: File the synthesized audio is saved to.
	    """
	    await edge_tts.Communicate(text, voice).save(output_path)
	    logger.info(f"Base TTS generated: {output_path}")

	# ─── RVC Conversion ─────────────────────────────────────────

	def convert_voice(input_path: str, output_path: str, model_path: str,
	                  index_path: str, f0_shift: int = 0, index_rate: float = 0.75):
	    """Convert an audio file to Jerome's voice by running Applio's ``core.py infer``.

	    The script is run as a subprocess with the current interpreter, with
	    APPLIO_DIR as its working directory.

	    Args:
	        input_path: Audio file to convert.
	        output_path: Where the converted file is expected to appear.
	        model_path: Path passed as ``--pth_path``.
	        index_path: Path passed as ``--index_path``.
	        f0_shift: Value passed as ``--pitch``.
	        index_rate: Value passed as ``--index_rate``.

	    Raises:
	        RuntimeError: if the subprocess exits non-zero, or exits zero but
	            ``output_path`` does not exist afterwards.
	        subprocess.TimeoutExpired: if inference takes longer than 120 seconds.
	    """
	    cmd = [
	        sys.executable, os.path.join(APPLIO_DIR, "core.py"), "infer",
	        "--input_path", input_path,
	        "--output_path", output_path,
	        "--pth_path", model_path,
	        "--index_path", index_path,
	        "--f0_method", "rmvpe",
	        "--pitch", str(f0_shift),
	        "--index_rate", str(index_rate),
	        "--filter_radius", "3",
	        "--volume_envelope", "0.25",
	        "--protect", "0.33",
	        "--hop_length", "128",
	        "--split_audio", "False",
	        "--f0_autotune", "False",
	        "--clean_audio", "True",
	        "--clean_strength", "0.5",
	        "--export_format", "WAV",
	        "--embedder_model", "contentvec",
	    ]

	    # Put Applio's directories first on the child's import path, but keep
	    # whatever PYTHONPATH the parent already had instead of discarding it.
	    search_path = [APPLIO_DIR, f"{APPLIO_DIR}/rvc/train"]
	    inherited = os.environ.get("PYTHONPATH")
	    if inherited:
	        search_path.append(inherited)
	    child_env = {**os.environ, "PYTHONPATH": os.pathsep.join(search_path)}

	    logger.info("Running RVC inference...")
	    result = subprocess.run(
	        cmd,
	        capture_output=True,
	        text=True,
	        timeout=120,
	        cwd=APPLIO_DIR,
	        env=child_env,
	    )

	    if result.returncode != 0:
	        logger.error(f"RVC STDOUT: {result.stdout}")
	        logger.error(f"RVC STDERR: {result.stderr}")
	        # Only the tail of stderr goes into the exception to keep it short.
	        raise RuntimeError(f"RVC inference failed: {result.stderr[-500:]}")

	    if not os.path.exists(output_path):
	        # A zero exit code without an output file is still a failure; no
	        # alternative location is searched.
	        logger.error(f"RVC exited cleanly but wrote nothing to {output_path}")
	        raise RuntimeError("RVC did not produce output file")

	    logger.info(f"Voice conversion complete: {output_path}")

	# ─── Main Pipeline ───────────────────────────────────────────

	# Prefix of the per-request scratch directories created under TEMP_DIR.
	_REQUEST_DIR_PREFIX = "req_"


	def _purge_stale_requests(root: str, max_age_seconds: float = 3600.0):
	    """Best-effort removal of per-request directories older than ``max_age_seconds``."""
	    import time  # local import: the file does not import ``time`` at module level

	    cutoff = time.time() - max_age_seconds
	    try:
	        with os.scandir(root) as listing:
	            entries = list(listing)
	    except OSError:
	        return

	    for entry in entries:
	        if not entry.name.startswith(_REQUEST_DIR_PREFIX):
	            continue
	        try:
	            expired = (
	                entry.is_dir(follow_symlinks=False)
	                and entry.stat().st_mtime < cutoff
	            )
	        except OSError:
	            # Entry vanished (e.g. another request purged it); nothing to do.
	            continue
	        if expired:
	            shutil.rmtree(entry.path, ignore_errors=True)


	def text_to_jerome(text: str, voice_name: str = "Guy (US)",
	                   pitch_shift: int = 0, index_rate: float = 0.75):
	    """Full pipeline: Text → Base TTS → RVC → Jerome's voice.

	    Every call works inside its own freshly created directory under
	    TEMP_DIR, so overlapping requests cannot delete or overwrite each
	    other's audio files.

	    Args:
	        text: What Jerome should say. Empty, whitespace-only or None yields None.
	        voice_name: Key of TTS_VOICES; unknown names fall back to DEFAULT_VOICE.
	        pitch_shift: Forwarded to ``convert_voice`` as ``f0_shift``.
	        index_rate: Forwarded to ``convert_voice`` unchanged.

	    Returns:
	        Path of the converted audio on success; the path of the base TTS
	        audio if conversion failed after TTS succeeded; None if there was
	        no text or no audio could be produced. Errors are logged, not raised.
	    """
	    if not text or not text.strip():
	        return None

	    voice = TTS_VOICES.get(voice_name, DEFAULT_VOICE)

	    work_dir = None
	    base_path = None
	    result = None
	    try:
	        os.makedirs(TEMP_DIR, exist_ok=True)
	        # Results are never deleted right after being returned (the UI still
	        # has to read them), so old request directories are pruned here.
	        # NOTE(review): assumes nothing needs a returned file for more than
	        # an hour — confirm against how Gradio serves/caches output files.
	        _purge_stale_requests(TEMP_DIR)
	        work_dir = tempfile.mkdtemp(prefix=_REQUEST_DIR_PREFIX, dir=TEMP_DIR)

	        # NOTE(review): Edge TTS is believed to emit MP3 data, so the .wav
	        # suffix may be a mislabel — confirm before renaming.
	        base_path = os.path.join(work_dir, "base_tts.wav")
	        output_path = os.path.join(work_dir, "jerome_output.wav")

	        # Step 1: Generate base TTS
	        asyncio.run(generate_base_tts(text, voice, base_path))

	        if os.path.exists(base_path):
	            # Step 2: Convert to Jerome's voice
	            convert_voice(
	                input_path=base_path,
	                output_path=output_path,
	                model_path=os.path.join(MODEL_DIR, MODEL_FILE),
	                index_path=os.path.join(MODEL_DIR, INDEX_FILE),
	                f0_shift=pitch_shift,
	                index_rate=index_rate,
	            )
	            # Fall back to the base TTS if no converted file appeared.
	            result = output_path if os.path.exists(output_path) else base_path
	    except Exception as e:
	        # Boundary handler for the UI callback: log with traceback and
	        # degrade to the unconverted speech when it exists.
	        logger.exception("Pipeline error: %s", e)
	        if base_path is not None and os.path.exists(base_path):
	            result = base_path
	    finally:
	        # Nothing to hand back, so do not leave an empty scratch dir behind.
	        if result is None and work_dir is not None:
	            shutil.rmtree(work_dir, ignore_errors=True)

	    return result

	# ─── Gradio UI ───────────────────────────────────────────────

	def build_ui():
	    """Assemble the Gradio Blocks app and return it without launching it.

	    The page has a text box and a generate button next to an audio player,
	    a collapsed "Advanced Settings" accordion (base voice, pitch shift,
	    index rate), and a list of example phrases. Clicking the button or
	    submitting the text box both call ``text_to_jerome``.

	    Returns:
	        The ``gr.Blocks`` instance.
	    """
	    # NOTE(review): the widget nesting below was reconstructed from a
	    # listing that had lost its indentation — confirm against the deployed
	    # layout. The two multi-line strings keep their lines at the file's
	    # left margin so their contents are unchanged.
	    page_css = """
	.main-title {
	text-align: center;
	font-size: 2.5em;
	font-weight: 800;
	margin-bottom: 0;
	background: linear-gradient(135deg, #ff6b35, #f7c948);
	-webkit-background-clip: text;
	-webkit-text-fill-color: transparent;
	}
	.subtitle {
	text-align: center;
	color: #666;
	font-size: 1.1em;
	margin-top: 0;
	}
	footer { display: none !important; }
	"""
	    header_html = """
	<h1 class="main-title">🗽 Jerome Voice Generator</h1>
	<p class="subtitle">Type anything and hear Jerome say it — straight outta New York</p>
	"""
	    sample_phrases = [
	        ["Yo what's good everybody, welcome back to the show!"],
	        ["Let me tell you somethin', this team ain't got what it takes to win a championship."],
	        ["I'm walkin' here! You believe this guy? Unbelievable."],
	        ["Listen, the pizza in this city? Fuggedaboutit. Best in the world, no question."],
	        ["Alright folks, that's gonna wrap it up for tonight. Thanks for tuning in!"],
	    ]
	    page_theme = gr.themes.Base(
	        primary_hue=gr.themes.colors.orange,
	        secondary_hue=gr.themes.colors.amber,
	        neutral_hue=gr.themes.colors.gray,
	        font=["Inter", "system-ui", "sans-serif"],
	    )

	    with gr.Blocks(title="Jerome Voice Generator", theme=page_theme, css=page_css) as demo:
	        gr.HTML(header_html)

	        with gr.Row():
	            with gr.Column(scale=3):
	                text_input = gr.Textbox(
	                    label="What should Jerome say?",
	                    placeholder="Yo, let me tell you somethin' about this game right here...",
	                    lines=3,
	                    max_lines=10,
	                )
	                generate_btn = gr.Button(
	                    "🎤 Make Jerome Say It",
	                    variant="primary",
	                    size="lg",
	                )

	            with gr.Column(scale=2):
	                audio_output = gr.Audio(label="Jerome's Voice", type="filepath")

	        with gr.Accordion("⚙️ Advanced Settings", open=False):
	            with gr.Row():
	                voice_select = gr.Dropdown(
	                    choices=list(TTS_VOICES.keys()),
	                    value="Guy (US)",
	                    label="Base Voice (input to RVC)",
	                    info="The base TTS voice that gets converted to Jerome's voice",
	                )
	                pitch_shift = gr.Slider(
	                    minimum=-12, maximum=12, value=0, step=1,
	                    label="Pitch Shift (semitones)",
	                    info="Adjust if the output pitch sounds off",
	                )
	                index_rate = gr.Slider(
	                    minimum=0, maximum=1, value=0.75, step=0.05,
	                    label="Index Rate",
	                    info="How much to use the voice index (higher = more like training data)",
	                )

	        # Example phrases
	        gr.Examples(examples=sample_phrases, inputs=text_input)

	        # The button click and Enter in the text box run the same pipeline
	        # with the same inputs and output, registered in that order.
	        for register in (generate_btn.click, text_input.submit):
	            register(
	                fn=text_to_jerome,
	                inputs=[text_input, voice_select, pitch_shift, index_rate],
	                outputs=audio_output,
	            )

	    return demo

	# ─── Launch ──────────────────────────────────────────────────

	if __name__ == "__main__":
	    logger.info("🗽 Starting Jerome Voice Generator...")

	    # Fetch the model assets first; setup() raises if Applio is missing,
	    # so the UI is never built in that case.
	    model_path, index_path = setup()
	    logger.info(f"Model ready: {model_path}")
	    logger.info(f"Index ready: {index_path}")

	    # Serve on all interfaces at port 7860, without a Gradio share link.
	    launch_options = {
	        "server_name": "0.0.0.0",
	        "server_port": 7860,
	        "share": False,
	    }
	    demo = build_ui()
	    demo.launch(**launch_options)