# RVC-CH / app.py
# Author: LosCaquitos — "Update app.py" (commit 5d082c2, verified)
"""
RVC Voice Conversion – HuggingFace Space
Simple, fast, GPU/CPU auto-detected.
"""
from __future__ import annotations
import os
import subprocess
import tempfile
import shutil
from pathlib import Path
import gradio as gr
import numpy as np
from lib.config import (
BUILTIN_MODELS,
CSS,
DEVICE_LABEL,
MAX_INPUT_DURATION,
logger,
)
from lib.jobs import (
get_jobs_table,
get_queue_info,
poll_job,
submit_job,
)
from lib.models import list_models, startup_downloads
from lib.ui import refresh_models, toggle_autotune, upload_model
# ── Startup ───────────────────────────────────────────────────────────────────
startup_status = ""
default_model = ""
try:
    # Pre-fetch the built-in models; returns the name of the default one.
    default_model = startup_downloads()
    startup_status = f"βœ… Ready  Β·  {DEVICE_LABEL}"
except Exception as e:
    # Keep the app usable even when some assets failed to download.
    startup_status = f"⚠️ Some assets unavailable: {e}  Β·  {DEVICE_LABEL}"
    logger.warning("Startup download issue: %s", e)

initial_models = list_models()
# Prefer the model chosen at startup; otherwise fall back to the first
# available model, or None when nothing is installed yet.
if default_model in initial_models:
    initial_value = default_model
else:
    initial_value = initial_models[0] if initial_models else None
# ── FunΓ§Γ£o para processar vΓ­deo ───────────────────────────────────────────────
def process_video(video_file, model, pitch, f0_method, index_rate, protect, vol_env,
                  clean, clean_strength, split, autotune, autotune_strength,
                  filter_radius, output_format, reverb, reverb_room, reverb_damp, reverb_wet):
    """Process a video: extract its audio, run RVC voice conversion, remix,
    and bundle 5 audio files into a ZIP.

    Args:
        video_file: Path to the uploaded video, or None.
        model, pitch, f0_method, ...: RVC parameters forwarded to submit_job.

    Returns:
        tuple: (path to the output ZIP or None, status message).
    """
    if video_file is None:
        return None, "No video file provided"

    import soundfile as sf  # local import: optional heavy dependency
    import zipfile

    output_dir = Path(tempfile.mkdtemp())

    # Extract the audio track from the video with ffmpeg.
    audio_path = tempfile.NamedTemporaryFile(suffix=".wav", delete=False).name
    cmd = ["ffmpeg", "-i", video_file, "-q:a", "0", "-map", "a", "-y", audio_path]
    try:
        subprocess.run(cmd, check=True, capture_output=True)
    except subprocess.CalledProcessError as e:
        # Fix: surface ffmpeg's stderr as a status instead of crashing the UI callback.
        detail = e.stderr.decode(errors="replace")[-300:] if e.stderr else str(e)
        return None, f"Audio extraction failed: {detail}"

    try:
        # entrada.wav — raw extracted audio.
        entrada_wav = output_dir / "entrada.wav"
        shutil.copy2(audio_path, entrada_wav)
        # Simplified "stem separation": acapella = full audio, instrumental = silence.
        entrada_acapella = output_dir / "entrada_acapella.wav"
        entrada_instrumental = output_dir / "entrada_instrumental.wav"
        shutil.copy2(audio_path, entrada_acapella)
        data, sr = sf.read(audio_path)
        sf.write(entrada_instrumental, np.zeros_like(data), sr)
    finally:
        # Fix: the delete=False temp file was previously never removed.
        os.unlink(audio_path)

    # Convert the acapella with RVC via the job queue.
    status, converted = submit_job(
        None, str(entrada_acapella), model, pitch, f0_method,
        index_rate, protect, vol_env, clean, clean_strength,
        split, autotune, autotune_strength, filter_radius,
        output_format, reverb, reverb_room, reverb_damp, reverb_wet
    )
    if not converted:
        return None, f"Conversion failed: {status}"

    saida_acapella = output_dir / "saida_acapella.wav"
    shutil.copy2(converted, saida_acapella)

    # Mix the converted vocals with the (silent) instrumental track.
    rvc_audio, rvc_sr = sf.read(saida_acapella)
    inst_audio, inst_sr = sf.read(entrada_instrumental)
    if rvc_sr != inst_sr:
        from scipy import signal
        inst_audio = signal.resample(inst_audio, int(len(inst_audio) * rvc_sr / inst_sr))
    min_len = min(len(rvc_audio), len(inst_audio))
    mixed = rvc_audio[:min_len] + inst_audio[:min_len]
    # Peak-limit to 0.95 to avoid clipping. Fix: np.max handles multichannel
    # (2-D) audio, where the builtin max() would raise ValueError when
    # comparing whole rows; the peak is also computed only once now.
    peak = np.max(np.abs(mixed)) if mixed.size else 0.0
    if peak > 0.95:
        mixed = mixed * (0.95 / peak)
    saida_wav = output_dir / "saida.wav"
    sf.write(saida_wav, mixed, rvc_sr)

    # Package all 5 files in a single downloadable ZIP.
    zip_path = output_dir / "outputs.zip"
    with zipfile.ZipFile(zip_path, 'w') as z:
        z.write(saida_acapella, "saida_acapella.wav")
        z.write(saida_wav, "saida.wav")
        z.write(entrada_acapella, "entrada_acapella.wav")
        z.write(entrada_wav, "entrada.wav")
        z.write(entrada_instrumental, "entrada_instrumental.wav")
    return str(zip_path), "βœ… Conversion complete! ZIP with 5 files ready."
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# ── Gradio UI ─────────────────────────────────────────────────────────────────
# Five tabs: Convert (audio), Video Convert, Models (upload/list), Jobs, Help.
# NOTE(review): indentation reconstructed from a whitespace-stripped paste —
# the component nesting below is the most plausible layout; confirm rendering.
with gr.Blocks(title="RVC Voice Conversion", delete_cache=(3600, 3600)) as demo:
    # Header banner plus the startup status computed at import time.
    gr.HTML(f"""
<div id="header">
<h1>πŸŽ™οΈ RVC Voice Conversion</h1>
<p>Retrieval-Based Voice Conversion Β· record or upload Β· custom models Β· GPU/CPU auto</p>
</div>
<p id="status">{startup_status}</p>
""")
    with gr.Tabs():
        # ── TAB 1: Convert ────────────────────────────────────────────────────
        with gr.Tab("🎤 Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### πŸ”Š Input Audio")
                    # Two input sources; both yield a filepath for submit_job.
                    with gr.Tabs():
                        with gr.Tab("πŸŽ™οΈ Microphone"):
                            inp_mic = gr.Audio(
                                sources=["microphone"],
                                type="filepath",
                                label="Record",
                            )
                        with gr.Tab("πŸ“ Upload File"):
                            inp_file = gr.Audio(
                                sources=["upload"],
                                type="filepath",
                                label="Upload audio (wav / mp3 / flac / ogg …)",
                            )
                    gr.Markdown("### πŸ€– Model")
                    model_dd = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    pitch_sl = gr.Slider(
                        minimum=-24, maximum=24, value=0, step=1,
                        label="Pitch Shift (semitones)",
                        info="0 = unchanged Β· positive = higher Β· negative = lower",
                    )
                    f0_radio = gr.Radio(
                        choices=["rmvpe", "fcpe", "crepe", "crepe-tiny"],
                        value="rmvpe",
                        label="Pitch Extraction Method",
                        info="rmvpe = fastest & accurate Β· crepe = highest quality (slower)",
                    )
                with gr.Column(scale=1):
                    gr.Markdown("### βš™οΈ Advanced Settings")
                    with gr.Accordion("Expand advanced options", open=False):
                        index_rate_sl = gr.Slider(
                            0.0, 1.0, value=0.75, step=0.05,
                            label="Index Rate",
                            info="How strongly the FAISS index influences timbre (0 = off)",
                        )
                        protect_sl = gr.Slider(
                            0.0, 0.5, value=0.5, step=0.01,
                            label="Protect Consonants",
                            info="Protects unvoiced consonants β€” 0.5 = max protection",
                        )
                        filter_radius_sl = gr.Slider(
                            0, 7, value=3, step=1,
                            label="Respiration Filter Radius",
                            info="Median filter on pitch β€” higher = smoother, reduces breath noise",
                        )
                        vol_env_sl = gr.Slider(
                            0.0, 1.0, value=0.25, step=0.05,
                            label="Volume Envelope Mix",
                            info="0.25 = natural blend Β· 1 = preserve input loudness Β· 0 = model output",
                        )
                        with gr.Row():
                            clean_cb = gr.Checkbox(value=False, label="Noise Reduction")
                            clean_sl = gr.Slider(
                                0.0, 1.0, value=0.5, step=0.05,
                                label="Reduction Strength",
                            )
                        with gr.Row():
                            split_cb = gr.Checkbox(value=False, label="Split Long Audio")
                            autotune_cb = gr.Checkbox(value=False, label="Autotune")
                        # Hidden until the Autotune checkbox is ticked.
                        autotune_sl = gr.Slider(
                            0.0, 1.0, value=1.0, step=0.05,
                            label="Autotune Strength",
                            visible=False,
                        )
                        autotune_cb.change(
                            fn=toggle_autotune,
                            inputs=autotune_cb,
                            outputs=autotune_sl,
                        )
                        gr.Markdown("**πŸŽ›οΈ Reverb**")
                        reverb_cb = gr.Checkbox(value=False, label="Enable Reverb")
                        # Reverb sliders grouped so they can be shown/hidden together.
                        with gr.Group(visible=False) as reverb_group:
                            reverb_room_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Room Size",
                                info="Larger = bigger sounding space",
                            )
                            reverb_damp_sl = gr.Slider(
                                0.0, 1.0, value=0.7, step=0.05,
                                label="Damping",
                                info="Higher = more absorption, less echo tail",
                            )
                            reverb_wet_sl = gr.Slider(
                                0.0, 1.0, value=0.15, step=0.05,
                                label="Wet Level",
                                info="How much reverb is mixed in (0.15 = subtle)",
                            )
                        reverb_cb.change(
                            fn=lambda v: gr.update(visible=v),
                            inputs=reverb_cb,
                            outputs=reverb_group,
                        )
                    fmt_radio = gr.Radio(
                        choices=["WAV", "MP3", "FLAC", "OPUS"],
                        value="WAV",
                        label="Output Format",
                        info="OPUS = small file (~64 kbps, Telegram/Discord quality)",
                    )
                    convert_btn = gr.Button(
                        "πŸš€ Convert Voice",
                        variant="primary",
                    )
                    gr.Markdown("### 🎧 Output")
                    out_status = gr.Markdown(value="")
                    out_audio = gr.Audio(label="Result (if still on page)", type="filepath", interactive=False)
                    # Jobs survive page reloads; users can poll by Job ID.
                    gr.Markdown("#### πŸ” Check Job Status")
                    with gr.Row():
                        job_id_box = gr.Textbox(
                            label="Job ID",
                            placeholder="e.g. a3f2b1c9",
                            scale=3,
                        )
                        poll_btn = gr.Button("πŸ”„ Check", scale=1)
                    poll_status = gr.Markdown(value="")
                    poll_audio = gr.Audio(label="Result", type="filepath", interactive=False)
        # ── TAB 2: Video Convert ────────────────────────────────────────────────
        with gr.Tab("🎬 Video Convert"):
            with gr.Row():
                with gr.Column(scale=1):
                    gr.Markdown("### πŸŽ₯ Input Video")
                    video_file = gr.Video(label="Upload MP4 Video", sources=["upload"], format="mp4")
                    gr.Markdown("### πŸ€– Model")
                    video_model = gr.Dropdown(
                        choices=initial_models,
                        value=initial_value,
                        label="Active Voice Model",
                        interactive=True,
                    )
                    gr.Markdown("### 🎚️ Basic Settings")
                    video_pitch = gr.Slider(-24, 24, value=0, step=1, label="Pitch Shift")
                    video_f0 = gr.Radio(["rmvpe", "fcpe", "crepe", "crepe-tiny"], value="rmvpe", label="Pitch Method")
                with gr.Column(scale=1):
                    gr.Markdown("### βš™οΈ Advanced Settings")
                    with gr.Accordion("Expand", open=False):
                        video_index = gr.Slider(0.0, 1.0, value=0.75, step=0.05, label="Index Rate")
                        video_protect = gr.Slider(0.0, 0.5, value=0.5, step=0.01, label="Protect Consonants")
                        video_filter = gr.Slider(0, 7, value=3, step=1, label="Filter Radius")
                        video_vol = gr.Slider(0.0, 1.0, value=0.25, step=0.05, label="Volume Envelope")
                        video_clean = gr.Checkbox(value=False, label="Noise Reduction")
                        video_clean_strength = gr.Slider(0.0, 1.0, value=0.5, step=0.05, label="Clean Strength")
                        video_split = gr.Checkbox(value=False, label="Split Long Audio")
                        video_autotune = gr.Checkbox(value=False, label="Autotune")
                        video_autotune_strength = gr.Slider(0.0, 1.0, value=1.0, step=0.05, label="Autotune Strength")
                        gr.Markdown("**πŸŽ›οΈ Reverb**")
                        video_reverb = gr.Checkbox(value=False, label="Enable Reverb")
                        video_reverb_room = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Room Size")
                        video_reverb_damp = gr.Slider(0.0, 1.0, value=0.7, step=0.05, label="Damping")
                        video_reverb_wet = gr.Slider(0.0, 1.0, value=0.15, step=0.05, label="Wet Level")
                    video_format = gr.Radio(["WAV", "MP3", "FLAC", "OPUS"], value="WAV", label="Output Format")
                    video_convert_btn = gr.Button("🎬 Convert Video", variant="primary")
                    gr.Markdown("### πŸ“¦ Output")
                    video_status = gr.Markdown(value="")
                    video_output = gr.File(label="Download ZIP (5 audio files)", type="filepath")
        # ── TAB 3: Models ─────────────────────────────────────────────────────
        with gr.Tab("πŸ“¦ Models"):
            gr.Markdown("""
### Upload a Custom RVC Model
Provide a **`.zip`** containing:
- **`model.pth`** β€” weights (required)
- **`model.index`** β€” FAISS index (optional, improves voice matching)
**Built-in models** (pre-downloaded on startup):
Vestia Zeta v1 Β· Vestia Zeta v2 Β· Ayunda Risu Β· Gawr Gura
""")
            with gr.Row():
                with gr.Column(scale=1):
                    up_zip = gr.File(label="Model ZIP", file_types=[".zip"])
                    up_name = gr.Textbox(
                        label="Model Name",
                        placeholder="Leave blank to use zip filename",
                    )
                    up_btn = gr.Button("πŸ“€ Load Model", variant="primary")
                    up_status = gr.Textbox(label="Status", interactive=False, lines=2)
                with gr.Column(scale=1):
                    gr.Markdown("### Loaded Models")
                    models_table = gr.Dataframe(
                        col_count=(1, "fixed"),
                        value=[[m] for m in initial_models],
                        interactive=False,
                        label="",
                    )
                    refresh_btn = gr.Button("πŸ”„ Refresh")
            # Uploading also refreshes the model dropdown and the table.
            up_btn.click(
                fn=upload_model,
                inputs=[up_zip, up_name],
                outputs=[up_status, model_dd, models_table],
            )
            refresh_btn.click(
                fn=refresh_models,
                outputs=[models_table, model_dd],
            )
        # ── TAB 4: Jobs ───────────────────────────────────────────────────────
        with gr.Tab("πŸ“‹ Jobs"):
            gr.Markdown("All submitted jobs, newest first. Click **Refresh** to update.")
            # Callables + every=10 make these auto-refresh every 10 seconds.
            queue_status = gr.Markdown(value=get_queue_info, every=10)
            jobs_table = gr.Dataframe(
                headers=["Job ID", "Model", "Status", "Time", "Download"],
                col_count=(5, "fixed"),
                value=get_jobs_table,
                interactive=False,
                wrap=True,
                datatype=["str", "str", "str", "str", "markdown"],
                every=10,
            )
            refresh_jobs_btn = gr.Button("πŸ”„ Refresh")

            def _refresh_jobs():
                """Manual refresh: return updated queue summary and jobs table."""
                return get_queue_info(), get_jobs_table()

            refresh_jobs_btn.click(fn=_refresh_jobs, outputs=[queue_status, jobs_table])
        # ── TAB 5: Help ───────────────────────────────────────────────────────
        with gr.Tab("ℹ️ Help"):
            gr.Markdown(f"""
## How it works
RVC (Retrieval-Based Voice Conversion) transforms a voice recording to sound
like a target speaker using only that speaker's model file.
---
## Quick Guide
1. Open the **Convert** tab
2. **Record** via microphone or **upload** an audio file (wav, mp3, flac, ogg …)
3. Choose a **model** from the dropdown β€” 4 models are pre-loaded on startup
4. Set **Pitch Shift** if needed (e.g. male β†’ female: try +12 semitones)
5. Click **πŸš€ Convert Voice** and wait for the result
---
## Built-in Models
| Model | Description |
|---|---|
| **Vestia Zeta v1** | Hololive ID VTuber, v1 model |
| **Vestia Zeta v2** | Hololive ID VTuber, v2 model (recommended) |
| **Ayunda Risu** | Hololive ID VTuber |
| **Gawr Gura** | Hololive EN VTuber |
---
## Pitch Extraction Methods
| Method | Speed | Quality | Best for |
|---|---|---|---|
| **rmvpe** | ⚑⚑⚑ | β˜…β˜…β˜…β˜… | General use (default) |
| **fcpe** | ⚑⚑ | β˜…β˜…β˜…β˜… | Singing |
| **crepe** | ⚑ | β˜…β˜…β˜…β˜…β˜… | Highest quality, slow |
| **crepe-tiny** | ⚑⚑ | β˜…β˜…β˜… | Low resource |
---
## Advanced Settings
| Setting | Description |
|---|---|
| **Index Rate** | Influence of FAISS index on output timbre (0.75 recommended) |
| **Protect Consonants** | Prevents artefacts on consonants (0.5 = max) |
| **Respiration Filter Radius** | Smooths pitch curve β€” higher reduces breath noise (0–7, default 3) |
| **Volume Envelope Mix** | 0.25 = natural blend Β· 1 = preserve input loudness |
| **Noise Reduction** | Removes background noise before conversion |
| **Split Long Audio** | Chunks audio for recordings > 60 s |
| **Autotune** | Snaps pitch to nearest musical note |
---
## Output Formats
| Format | Size | Quality |
|---|---|---|
| **WAV** | Large | Lossless |
| **FLAC** | Medium | Lossless compressed |
| **MP3** | Small | Lossy |
| **OPUS** | Tiny (~64 kbps) | Telegram/Discord quality |
---
**Device:** `{DEVICE_LABEL}`
**Max input duration:** {MAX_INPUT_DURATION // 60} minutes
---
## Credits
Engine: [Ultimate RVC](https://github.com/JackismyShephard/ultimate-rvc)
""")

    # Wire convert button after all tabs
    def _submit_and_extract_id(*args):
        """Submit a job, then pull the 8-hex-char job id out of the status text
        so the Job ID box is pre-filled for later polling."""
        import re
        status, audio = submit_job(*args)
        match = re.search(r"[a-f0-9]{8}", status or "")
        job_id = match.group(0) if match else ""
        return status, audio, job_id, get_queue_info(), get_jobs_table()

    convert_btn.click(
        fn=_submit_and_extract_id,
        inputs=[
            inp_mic, inp_file, model_dd,
            pitch_sl, f0_radio,
            index_rate_sl, protect_sl, vol_env_sl,
            clean_cb, clean_sl,
            split_cb, autotune_cb, autotune_sl,
            filter_radius_sl,
            fmt_radio,
            reverb_cb, reverb_room_sl, reverb_damp_sl, reverb_wet_sl,
        ],
        outputs=[out_status, out_audio, job_id_box, queue_status, jobs_table],
    )

    def _poll_and_refresh(job_id):
        """Poll a job by id and refresh the queue summary + jobs table."""
        status, file = poll_job(job_id)
        return status, file, get_queue_info(), get_jobs_table()

    poll_btn.click(
        fn=_poll_and_refresh,
        inputs=[job_id_box],
        outputs=[poll_status, poll_audio, queue_status, jobs_table],
    )
    # Video convert
    video_convert_btn.click(
        fn=process_video,
        inputs=[
            video_file, video_model, video_pitch, video_f0,
            video_index, video_protect, video_vol,
            video_clean, video_clean_strength,
            video_split, video_autotune, video_autotune_strength,
            video_filter, video_format,
            video_reverb, video_reverb_room, video_reverb_damp, video_reverb_wet
        ],
        outputs=[video_output, video_status]
    )
# ── Launch ────────────────────────────────────────────────────────────────────
# ── Launch ────────────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Allow up to 5 concurrent jobs in the Gradio queue.
    demo.queue(default_concurrency_limit=5)
    demo.launch(
        server_name="0.0.0.0",  # listen on all interfaces (required on Spaces)
        server_port=int(os.getenv("PORT", 7860)),  # Spaces injects PORT
        max_threads=10,
        ssr_mode=False,
        # NOTE(review): css is conventionally a gr.Blocks(...) constructor
        # argument, not a launch() one — confirm against the pinned gradio version.
        css=CSS,
    )