# quevedo / app.py
# lagosproject's picture
# Upload folder using huggingface_hub
# 674e662 verified
import os
import json
from pathlib import Path
import gradio as gr
# Setup paths (relative to the process working directory)
MODEL_PATH = Path("G_777.pth")
CONFIG_PATH = Path("config.json")
BANNER_PATH = Path("assets/banner.png")

# Speaker list used when the config is missing, unreadable, or names no speakers.
DEFAULT_SPEAKERS = ["quevedo"]


def speakers_from_config(config_data, default):
    """Return the speaker names found under the ``"spk"`` key of *config_data*.

    Args:
        config_data: The decoded JSON config (any JSON value is tolerated).
        default: Speaker names to fall back to.

    Returns:
        A new list with the keys of the ``"spk"`` mapping, in file order.
        A copy of *default* is returned when *config_data* is not a mapping,
        has no ``"spk"`` mapping, or that mapping is empty, so callers can
        always rely on a non-empty list as long as *default* is non-empty.
    """
    spk = config_data.get("spk") if isinstance(config_data, dict) else None
    if not isinstance(spk, dict) or not spk:
        return list(default)
    return list(spk)


# Dynamic speaker loader
speakers = list(DEFAULT_SPEAKERS)
try:
    with open(CONFIG_PATH, "r", encoding="utf-8") as f:
        speakers = speakers_from_config(json.load(f), DEFAULT_SPEAKERS)
except FileNotFoundError:
    # No config next to the app: silently keep the default speaker.
    pass
except (OSError, ValueError) as e:
    # ValueError covers both malformed JSON and undecodable bytes.
    print(f"Error loading speakers from config: {e}")
# Inference function
def convert_voice(input_audio, speaker, transpose, auto_predict_f0, f0_method, noise_scale):
    """Run voice conversion on one audio file and report the outcome.

    Args:
        input_audio: Path of the audio file to convert, or ``None`` when the
            user supplied nothing.
        speaker: Speaker name forwarded to the inference call.
        transpose: Pitch shift; coerced to ``int`` before inference.
        auto_predict_f0: Coerced to ``bool`` before inference.
        f0_method: F0 method name forwarded to the inference call.
        noise_scale: Coerced to ``float`` before inference.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is the
        converted file's path as a string on success and ``None`` on every
        failure; failures are reported through ``status_message`` rather
        than raised.
    """
    if input_audio is None:
        return None, "Please upload an audio file or use the microphone."

    input_path = Path(input_audio)
    # The result is written next to the input, named after it.
    output_path = input_path.parent / f"{input_path.stem}_quevedo.wav"

    # Lazy import to avoid startup errors if so-vits-svc-fork is not yet installed
    try:
        from so_vits_svc_fork.inference.main import infer
    except ImportError:
        return None, (
            "Error: 'so-vits-svc-fork' is not installed in this environment.\n"
            "Please run: pip install so-vits-svc-fork"
        )

    if not MODEL_PATH.exists():
        return None, f"Error: Model file {MODEL_PATH} not found."
    if not CONFIG_PATH.exists():
        return None, f"Error: Config file {CONFIG_PATH} not found."

    try:
        # The output name depends only on the input path, so a file left by
        # an earlier run could make a run that wrote nothing look successful.
        # Drop it first so the existence check below reflects this run only.
        output_path.unlink(missing_ok=True)

        # Perform inference using the fork's main infer function
        infer(
            input_path=input_path,
            output_path=output_path,
            model_path=MODEL_PATH,
            config_path=CONFIG_PATH,
            recursive=False,
            speaker=speaker,
            transpose=int(transpose),
            auto_predict_f0=bool(auto_predict_f0),
            noise_scale=float(noise_scale),
            f0_method=f0_method,
        )

        if output_path.exists():
            return str(output_path), "Conversion completed successfully!"
        return None, "Error: Output file was not generated."
    except Exception as e:
        # UI boundary: surface any failure as a status message instead of
        # letting it propagate into the Gradio event handler.
        return None, f"Error during inference: {e}"
# Custom CSS for premium styling matching the blue-purple theme.
# Vendor-prefixed and standard forms are declared together for
# `background-clip: text` and `backdrop-filter`, so the gradient title and
# the blurred panel render the same across browser engines.
custom_css = """
body {
    background-color: #0b0c10;
}
.gradio-container {
    background-color: #0b0c10 !important;
    font-family: 'Outfit', 'Inter', sans-serif !important;
    max-width: 900px !important;
    margin: 0 auto !important;
    border-radius: 12px;
}
.header-area {
    text-align: center;
    padding: 20px 0;
}
.header-title {
    color: #4f46e5;
    background: linear-gradient(90deg, #818cf8 0%, #c084fc 100%);
    -webkit-background-clip: text;
    background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 800 !important;
    font-size: 2.5rem !important;
    margin-bottom: 0.5rem;
}
.header-desc {
    color: #9ca3af;
    font-size: 1.1rem;
    margin-bottom: 20px;
}
.main-box {
    background: rgba(17, 24, 39, 0.7);
    border: 1px solid rgba(255, 255, 255, 0.1);
    -webkit-backdrop-filter: blur(10px);
    backdrop-filter: blur(10px);
    border-radius: 16px;
    padding: 20px;
    margin-bottom: 20px;
}
.convert-btn {
    background: linear-gradient(135deg, #6366f1 0%, #a855f7 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: bold !important;
    transition: all 0.3s ease !important;
}
.convert-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 20px rgba(139, 92, 246, 0.4);
}
"""
# Build Gradio UI: banner, header, a two-column conversion panel, and footer.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Banner/Header
    with gr.Row():
        # Fallback to CDN URL if local banner is missing or is just a small Git LFS pointer file
        if BANNER_PATH.exists() and BANNER_PATH.stat().st_size > 5000:
            gr.Image(str(BANNER_PATH), show_label=False, container=False, interactive=False)
        else:
            gr.Image(
                "https://huggingface.co/lagosproject/quevedo/resolve/main/assets/banner.png",
                show_label=False,
                container=False,
                interactive=False,
            )

    with gr.Row(elem_classes=["header-area"]):
        gr.HTML(
            "<h1 class='header-title'>🗣️ Quevedo Voice Model (so-vits-svc-fork)</h1>"
            "<p class='header-desc'>Convert any voice or singing file into the voice of the Spanish singer Quevedo.</p>"
        )

    # Main conversion section
    with gr.Row(elem_classes=["main-box"]):
        # Left column: input audio and conversion parameters.
        with gr.Column(scale=1):
            gr.Markdown("### 📥 1. Audio Input")
            input_audio = gr.Audio(
                label="Audio to Convert (Clean Vocals / Acapella)",
                type="filepath",
                sources=["upload", "microphone"],
            )

            gr.Markdown("### ⚙️ 2. Conversion Parameters")
            speaker = gr.Dropdown(
                choices=speakers,
                # Guard the lookup so an empty speaker list cannot abort UI construction.
                value=speakers[0] if speakers else None,
                label="Speaker Name",
            )
            transpose = gr.Slider(
                minimum=-12,
                maximum=12,
                value=0,
                step=1,
                label="Pitch Shift (Semitones)",
                # Negative values lower the pitch, which is what a female
                # source needs for a male target; the hint now says so.
                info="Decrease for female-to-male voices (e.g. -5 to -12), or increase for male-to-female.",
            )

            with gr.Accordion("Advanced Options", open=False):
                auto_predict_f0 = gr.Checkbox(
                    value=False,
                    label="Auto Predict F0",
                    info="Recommended for speech/narration. UNCHECK for singing to preserve notes.",
                )
                f0_method = gr.Dropdown(
                    choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                    value="crepe",
                    label="F0 Predictor Algorithm",
                    info="crepe offers the best quality but is slower; dio is the fastest.",
                )
                noise_scale = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.05,
                    label="Noise Scale",
                    info="Controls pitch variance and expressiveness (0.4 is standard).",
                )

        # Right column: converted audio, status line, and the trigger button.
        # NOTE(review): nesting was reconstructed from statement order; confirm
        # the button belongs in this column.
        with gr.Column(scale=1):
            gr.Markdown("### 📤 3. Output Audio")
            output_audio = gr.Audio(
                label="Converted Audio",
                type="filepath",
            )
            status_output = gr.Textbox(
                label="Status",
                value="Ready",
                interactive=False,
            )
            submit_btn = gr.Button(
                "Convert Voice 🚀",
                variant="primary",
                elem_classes=["convert-btn"],
            )

    # Input order must match the positional parameters of convert_voice.
    submit_btn.click(
        fn=convert_voice,
        inputs=[input_audio, speaker, transpose, auto_predict_f0, f0_method, noise_scale],
        outputs=[output_audio, status_output],
    )

    # Footer
    gr.HTML(
        "<div style='text-align: center; color: #4b5563; font-size: 0.85rem; padding: 20px 0;'>"
        "This model is for artistic demonstration and research purposes only. "
        "Uses so-vits-svc-fork for inference.<br>"
        "Developed with 💜 for the open voice community.</div>"
    )
if __name__ == "__main__":
    # Start the app only when run as a script: listen on every network
    # interface, on port 7860.
    launch_options = {"server_name": "0.0.0.0", "server_port": 7860}
    demo.launch(**launch_options)