RVC

Running

RVC / app.py

ozipoetra

refactor: modularize codebase into separate modules

61cc64e about 2 months ago

15.8 kB

	"""RVC Voice Conversion – HuggingFace Space

	Simple, fast, GPU/CPU auto-detected.
	"""
	from __future__ import annotations

	import os

	import gradio as gr

	from lib.config import (
	BUILTIN_MODELS,
	CSS,
	DEVICE_LABEL,
	MAX_INPUT_DURATION,
	logger,
	)
	from lib.jobs import (
	get_jobs_table,
	get_queue_info,
	poll_job,
	submit_job,
	)
	from lib.models import list_models, startup_downloads
	from lib.ui import refresh_models, toggle_autotune, upload_model

	# ── Startup ───────────────────────────────────────────────────────────────────
	# Fetch startup assets once at import time. A failure is not fatal: it is
	# logged and surfaced in the status banner, and the UI still comes up.
	startup_status = ""
	default_model = ""
	try:
	    default_model = startup_downloads()
	except Exception as exc:
	    startup_status = f"⚠️ Some assets unavailable: {exc}  ·  {DEVICE_LABEL}"
	    logger.warning("Startup download issue: %s", exc)
	else:
	    startup_status = f"✅ Ready  ·  {DEVICE_LABEL}"

	# Preselect the model returned by startup_downloads() when it is actually
	# listed; otherwise fall back to the first listed model, or to no selection.
	initial_models = list_models()
	if default_model in initial_models:
	    initial_value = default_model
	elif initial_models:
	    initial_value = initial_models[0]
	else:
	    initial_value = None

	# ── Gradio UI ─────────────────────────────────────────────────────────────────
	# Root Blocks container; `demo` is the object queued and launched at the bottom of the file.
	# delete_cache=(3600, 3600): per the Gradio docs this is (frequency, age) in seconds
	# for purging files cached by this Blocks instance.
	with gr.Blocks(title="RVC Voice Conversion", delete_cache=(3600, 3600)) as demo:

	# Static page header plus the startup banner text computed during startup.
	gr.HTML(f"""
	<div id="header">
	<h1>🎙️ RVC Voice Conversion</h1>
	<p>Retrieval-Based Voice Conversion · record or upload · custom models · GPU/CPU auto</p>
	</div>
	<p id="status">{startup_status}</p>
	""")

	with gr.Tabs():

	# ── TAB 1: Convert ────────────────────────────────────────────────────
	with gr.Tab("🎤 Convert"):
	with gr.Row():

	with gr.Column(scale=1):
	gr.Markdown("### 🔊 Input Audio")
	# Two alternative audio sources. Both use type="filepath" and both are passed
	# to the convert handler; which one takes precedence is decided by submit_job
	# (not visible in this file).
	with gr.Tabs():
	with gr.Tab("🎙️ Microphone"):
	inp_mic = gr.Audio(
	sources=["microphone"],
	type="filepath",
	label="Record",
	)
	with gr.Tab("📁 Upload File"):
	inp_file = gr.Audio(
	sources=["upload"],
	type="filepath",
	label="Upload audio (wav / mp3 / flac / ogg …)",
	)

	gr.Markdown("### 🤖 Model")
	# Choices and preselection come from the startup section (initial_models / initial_value).
	# The Models tab handlers also list this dropdown among their outputs.
	model_dd = gr.Dropdown(
	choices=initial_models,
	value=initial_value,
	label="Active Voice Model",
	interactive=True,
	)

	gr.Markdown("### 🎚️ Basic Settings")
	# Integer semitone steps in the range -24..24 (two octaves either way).
	pitch_sl = gr.Slider(
	minimum=-24, maximum=24, value=0, step=1,
	label="Pitch Shift (semitones)",
	info="0 = unchanged · positive = higher · negative = lower",
	)
	f0_radio = gr.Radio(
	choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
	value="rmvpe",
	label="Pitch Extraction Method",
	info="rmvpe = fastest & accurate · crepe = highest quality (slower)",
	)

	with gr.Column(scale=1):
	gr.Markdown("### ⚙️ Advanced Settings")
	# Collapsed by default (open=False).
	with gr.Accordion("Expand advanced options", open=False):
	index_rate_sl = gr.Slider(
	0.0, 1.0, value=0.75, step=0.05,
	label="Index Rate",
	info="How strongly the FAISS index influences timbre (0 = off)",
	)
	# Default sits at the slider maximum (0.5).
	protect_sl = gr.Slider(
	0.0, 0.5, value=0.5, step=0.01,
	label="Protect Consonants",
	info="Protects unvoiced consonants — 0.5 = max protection",
	)
	filter_radius_sl = gr.Slider(
	0, 7, value=3, step=1,
	label="Respiration Filter Radius",
	info="Median filter on pitch — higher = smoother, reduces breath noise",
	)
	vol_env_sl = gr.Slider(
	0.0, 1.0, value=0.25, step=0.05,
	label="Volume Envelope Mix",
	info="0.25 = natural blend · 1 = preserve input loudness · 0 = model output",
	)
	with gr.Row():
	clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
	clean_sl = gr.Slider(
	0.0, 1.0, value=0.5, step=0.05,
	label="Reduction Strength",
	)
	with gr.Row():
	split_cb = gr.Checkbox(value=False, label="Split Long Audio")
	autotune_cb = gr.Checkbox(value=False, label="Autotune")
	# Strength slider starts hidden (visible=False).
	autotune_sl = gr.Slider(
	0.0, 1.0, value=1.0, step=0.05,
	label="Autotune Strength",
	visible=False,
	)
	# toggle_autotune (lib.ui) receives the checkbox value and returns an update for
	# the slider — presumably its visibility; confirm against lib.ui.
	autotune_cb.change(
	fn=toggle_autotune,
	inputs=autotune_cb,
	outputs=autotune_sl,
	)

	gr.Markdown("🎛️ Reverb")
	reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
	# Reverb sliders are grouped so they can be shown/hidden together; hidden initially.
	with gr.Group(visible=False) as reverb_group:
	reverb_room_sl = gr.Slider(
	0.0, 1.0, value=0.15, step=0.05,
	label="Room Size",
	info="Larger = bigger sounding space",
	)
	reverb_damp_sl = gr.Slider(
	0.0, 1.0, value=0.7, step=0.05,
	label="Damping",
	info="Higher = more absorption, less echo tail",
	)
	reverb_wet_sl = gr.Slider(
	0.0, 1.0, value=0.15, step=0.05,
	label="Wet Level",
	info="How much reverb is mixed in (0.15 = subtle)",
	)
	# The checkbox value is fed straight into the group's `visible` flag.
	reverb_cb.change(
	fn=lambda v: gr.update(visible=v),
	inputs=reverb_cb,
	outputs=reverb_group,
	)

	fmt_radio = gr.Radio(
	choices=["WAV", "MP3", "FLAC", "OPUS"],
	value="WAV",
	label="Output Format",
	info="OPUS = small file (~64 kbps, Telegram/Discord quality)",
	)
	convert_btn = gr.Button(
	"🚀 Convert Voice",
	variant="primary",
	)

	gr.Markdown("### 🎧 Output")
	# Filled by the Convert click handler (wired after the tabs).
	out_status = gr.Markdown(value="")
	out_audio = gr.Audio(label="Result (if still on page)", type="filepath", interactive=False)

	gr.Markdown("#### 🔍 Check Job Status")
	# Manual lookup by job ID. The Convert click handler also writes into job_id_box,
	# so the box is pre-filled after a submission.
	with gr.Row():
	job_id_box = gr.Textbox(
	label="Job ID",
	placeholder="e.g. a3f2b1c9",
	scale=3,
	)
	poll_btn = gr.Button("🔄 Check", scale=1)
	poll_status = gr.Markdown(value="")
	poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)

	# ── TAB 2: Models ─────────────────────────────────────────────────────
	# Models tab: upload a custom model archive and list the models currently available.
	with gr.Tab("📦 Models"):
	gr.Markdown("""
	### Upload a Custom RVC Model
	Provide a `.zip` containing:
	- `model.pth` — weights (required)
	- `model.index` — FAISS index (optional, improves voice matching)

	Built-in models (pre-downloaded on startup):
	Vestia Zeta v1 · Vestia Zeta v2 · Ayunda Risu · Gawr Gura
	""")
	with gr.Row():
	with gr.Column(scale=1):
	# The file picker is restricted to .zip archives.
	up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
	up_name = gr.Textbox(
	label="Model Name",
	placeholder="Leave blank to use zip filename",
	)
	up_btn = gr.Button("📤 Load Model", variant="primary")
	up_status = gr.Textbox(label="Status", interactive=False, lines=2)
	with gr.Column(scale=1):
	gr.Markdown("### Loaded Models")
	# Single-column, read-only table seeded with the models found at startup.
	models_table = gr.Dataframe(
	col_count=(1, "fixed"),
	value=[[m] for m in initial_models],
	interactive=False,
	label="",
	)
	refresh_btn = gr.Button("🔄 Refresh")

	# upload_model (lib.ui) receives the archive and the optional name; its return
	# values go to the status box, the model dropdown and the table, in that order.
	up_btn.click(
	fn=upload_model,
	inputs=[up_zip, up_name],
	outputs=[up_status, model_dd, models_table],
	)
	# refresh_models takes no inputs. Note its output order is table first, then
	# dropdown — the reverse of the last two outputs of upload_model above.
	refresh_btn.click(
	fn=refresh_models,
	outputs=[models_table, model_dd],
	)

	# ── TAB 3: Jobs ───────────────────────────────────────────────────────
	with gr.Tab("📋 Jobs"):
	    gr.Markdown("All submitted jobs, newest first. Click Refresh to update.")

	    # Both widgets take a callable as their value together with every=10, so
	    # Gradio re-evaluates them on a 10-second timer in addition to the manual
	    # refresh button below.
	    queue_status = gr.Markdown(every=10, value=get_queue_info)
	    jobs_table = gr.Dataframe(
	        value=get_jobs_table,
	        every=10,
	        headers=["Job ID", "Model", "Status", "Time", "Download"],
	        datatype=["str", "str", "str", "str", "markdown"],
	        col_count=(5, "fixed"),
	        wrap=True,
	        interactive=False,
	    )
	    refresh_jobs_btn = gr.Button("🔄 Refresh")

	    def _refresh_jobs():
	        """Return the current queue summary and jobs table for an on-demand refresh."""
	        queue_summary = get_queue_info()
	        table_value = get_jobs_table()
	        return queue_summary, table_value

	    refresh_jobs_btn.click(_refresh_jobs, outputs=[queue_status, jobs_table])

	# ── TAB 4: Help ───────────────────────────────────────────────────────
	with gr.Tab("ℹ️ Help"):
	    # Static reference page rendered as Markdown. It is an f-string only for the
	    # two runtime values near the end: DEVICE_LABEL and MAX_INPUT_DURATION // 60
	    # (MAX_INPUT_DURATION is presumably in seconds — confirm in lib.config).
	    #
	    # Table rows must use bare `|` delimiters. A backslash-escaped pipe is a
	    # literal character in Markdown, so no table is formed, and backslash-pipe
	    # is also an invalid escape sequence in a non-raw Python string literal.
	    gr.Markdown(f"""
	## How it works
	RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound
	like a target speaker using only that speaker's model file.

	---

	## Quick Guide
	1. Open the Convert tab
	2. Record via microphone or upload an audio file (wav, mp3, flac, ogg …)
	3. Choose a model from the dropdown — 4 models are pre-loaded on startup
	4. Set Pitch Shift if needed (e.g. male → female: try +12 semitones)
	5. Click 🚀 Convert Voice and wait for the result

	---

	## Built-in Models
	| Model | Description |
	|---|---|
	| Vestia Zeta v1 | Hololive ID VTuber, v1 model |
	| Vestia Zeta v2 | Hololive ID VTuber, v2 model (recommended) |
	| Ayunda Risu | Hololive ID VTuber |
	| Gawr Gura | Hololive EN VTuber |

	---

	## Pitch Extraction Methods
	| Method | Speed | Quality | Best for |
	|---|---|---|---|
	| rmvpe | ⚡⚡⚡ | ★★★★ | General use (default) |
	| fcpe | ⚡⚡ | ★★★★ | Singing |
	| crepe | ⚡ | ★★★★★ | Highest quality, slow |
	| crepe-tiny | ⚡⚡ | ★★★ | Low resource |

	---

	## Advanced Settings
	| Setting | Description |
	|---|---|
	| Index Rate | Influence of FAISS index on output timbre (0.75 recommended) |
	| Protect Consonants | Prevents artefacts on consonants (0.5 = max) |
	| Respiration Filter Radius | Smooths pitch curve — higher reduces breath noise (0–7, default 3) |
	| Volume Envelope Mix | 0.25 = natural blend · 1 = preserve input loudness |
	| Noise Reduction | Removes background noise before conversion |
	| Split Long Audio | Chunks audio for recordings > 60 s |
	| Autotune | Snaps pitch to nearest musical note |

	---

	## Output Formats
	| Format | Size | Quality |
	|---|---|---|
	| WAV | Large | Lossless |
	| FLAC | Medium | Lossless compressed |
	| MP3 | Small | Lossy |
	| OPUS | Tiny (~64 kbps) | Telegram/Discord quality |

	---

	Device: `{DEVICE_LABEL}`
	Max input duration: {MAX_INPUT_DURATION // 60} minutes

	---

	## Credits
	Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
	""")

	# Wire convert button after all tabs
	def _submit_and_extract_id(*args):
	    """Submit a conversion job and build the values for all five bound outputs.

	    Every positional argument is forwarded unchanged to ``submit_job``, which
	    returns a ``(status, audio)`` pair. The job ID is recovered from the status
	    text so the "Check Job Status" box can be pre-filled.

	    Returns:
	        ``(status, audio, job_id, queue_info, jobs_table)``; ``job_id`` is ``""``
	        when the status text contains no ID-like token.
	    """
	    import re

	    status, audio = submit_job(*args)

	    # NOTE(review): the ID is scraped as the FIRST run of 8 characters from
	    # [a-f0-9] in the status text, so any earlier hex-looking run would be
	    # picked up instead. `status or ""` guards against a None/empty status.
	    found = re.search(r"[a-f0-9]{8}", status or "")
	    if found:
	        job_id = found[0]
	    else:
	        job_id = ""

	    return status, audio, job_id, get_queue_info(), get_jobs_table()

	# The handler forwards its inputs positionally to submit_job, so the order of
	# this list is part of the contract with lib.jobs — do not re-sort it.
	convert_btn.click(
	    _submit_and_extract_id,
	    inputs=[
	        # audio sources and model
	        inp_mic,
	        inp_file,
	        model_dd,
	        # basic settings
	        pitch_sl,
	        f0_radio,
	        # advanced settings
	        index_rate_sl,
	        protect_sl,
	        vol_env_sl,
	        clean_cb,
	        clean_sl,
	        split_cb,
	        autotune_cb,
	        autotune_sl,
	        filter_radius_sl,
	        # output format
	        fmt_radio,
	        # reverb
	        reverb_cb,
	        reverb_room_sl,
	        reverb_damp_sl,
	        reverb_wet_sl,
	    ],
	    # status + audio for the Convert tab, the pre-filled job ID, and a refresh
	    # of both Jobs-tab widgets.
	    outputs=[
	        out_status,
	        out_audio,
	        job_id_box,
	        queue_status,
	        jobs_table,
	    ],
	)

	def _poll_and_refresh(job_id):
	    """Look up one job and also return fresh queue info and the jobs table.

	    Args:
	        job_id: value of the "Job ID" textbox, passed straight to ``poll_job``.

	    Returns:
	        ``(status, file, queue_info, jobs_table)`` where the first two items are
	        whatever ``poll_job`` returned.
	    """
	    job_status, result_file = poll_job(job_id)
	    queue_summary = get_queue_info()
	    table_value = get_jobs_table()
	    return job_status, result_file, queue_summary, table_value

	# Manual status check: shows the job's status/result and refreshes the Jobs tab
	# widgets in the same round trip.
	poll_btn.click(
	    _poll_and_refresh,
	    inputs=[job_id_box],
	    outputs=[
	        poll_status,
	        poll_audio,
	        queue_status,
	        jobs_table,
	    ],
	)


	# ── Launch ────────────────────────────────────────────────────────────────────
	if __name__ == "__main__":
	    # Enable the event queue before serving.
	    demo.queue(default_concurrency_limit=5)

	    # Listen on all interfaces; the port comes from $PORT when set, else 7860.
	    # NOTE(review): passing css= to launch() requires a Gradio release whose
	    # launch() accepts it — confirm against the pinned Gradio version.
	    demo.launch(
	        css=CSS,
	        ssr_mode=False,
	        max_threads=10,
	        server_port=int(os.environ.get("PORT", 7860)),
	        server_name="0.0.0.0",
	    )