"""Gradio web UI that converts an audio file with a so-vits-svc-fork model.

The app reads the model checkpoint and config from fixed paths next to the
script, offers the speakers listed in the config, and runs inference through
``so_vits_svc_fork.inference.main.infer`` when the user presses the button.
"""

import os
import json
from pathlib import Path

import gradio as gr

# Setup paths
MODEL_PATH = Path("G_777.pth")
CONFIG_PATH = Path("config.json")
BANNER_PATH = Path("assets/banner.png")

# Remote banner used when the local file is missing or is only a Git LFS pointer.
BANNER_URL = "https://huggingface.co/lagosproject/quevedo/resolve/main/assets/banner.png"

# Speaker list used when the config is missing, unreadable, or lists no speakers.
DEFAULT_SPEAKERS = ["quevedo"]


def _load_speakers(config_path, fallback):
    """Return the speaker names found under ``"spk"`` in the config file.

    Args:
        config_path: ``Path`` of the JSON config to read.
        fallback: Names to return when the config yields no usable speakers.

    Returns:
        A new list of speaker names; never empty as long as *fallback* is not,
        so the dropdown's default ``speakers[0]`` is always valid.
    """
    if not config_path.exists():
        return list(fallback)

    try:
        with open(config_path, "r", encoding="utf-8") as f:
            config_data = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        print(f"Error loading speakers from config: {e}")
        return list(fallback)

    spk = config_data.get("spk") if isinstance(config_data, dict) else None
    # An empty or non-mapping "spk" entry would leave the dropdown without a
    # default value, so fall back instead of returning an empty list.
    if isinstance(spk, dict) and spk:
        return list(spk)
    return list(fallback)


# Dynamic speaker loader
speakers = _load_speakers(CONFIG_PATH, DEFAULT_SPEAKERS)


# Inference function
def convert_voice(input_audio, speaker, transpose, auto_predict_f0, f0_method, noise_scale):
    """Run voice conversion on one audio file and report the outcome.

    Args:
        input_audio: Path of the audio file to convert, or ``None`` when the
            user supplied nothing.
        speaker: Speaker name forwarded to ``infer``.
        transpose: Pitch shift; converted with ``int()`` before the call.
        auto_predict_f0: Converted with ``bool()`` before the call.
        f0_method: F0 predictor name forwarded to ``infer``.
        noise_scale: Converted with ``float()`` before the call.

    Returns:
        A ``(output_path, status_message)`` tuple. ``output_path`` is a string
        path on success and ``None`` on any failure; failures are reported
        through the message rather than raised.
    """
    if input_audio is None:
        return None, "Please upload an audio file or use the microphone."

    input_path = Path(input_audio)
    output_path = input_path.parent / f"{input_path.stem}_quevedo.wav"

    # Lazy import to avoid startup errors if so-vits-svc-fork is not yet installed
    try:
        from so_vits_svc_fork.inference.main import infer
    except ImportError:
        return None, (
            "Error: 'so-vits-svc-fork' is not installed in this environment.\n"
            "Please run: pip install so-vits-svc-fork"
        )

    if not MODEL_PATH.exists():
        return None, f"Error: Model file {MODEL_PATH} not found."
    if not CONFIG_PATH.exists():
        return None, f"Error: Config file {CONFIG_PATH} not found."

    try:
        # The output name depends only on the input name, so a file left by an
        # earlier run would make the existence check below pass even if this
        # run wrote nothing. Remove it first.
        output_path.unlink(missing_ok=True)

        # Perform inference using the fork's main infer function
        infer(
            input_path=input_path,
            output_path=output_path,
            model_path=MODEL_PATH,
            config_path=CONFIG_PATH,
            recursive=False,
            speaker=speaker,
            transpose=int(transpose),
            auto_predict_f0=bool(auto_predict_f0),
            noise_scale=float(noise_scale),
            f0_method=f0_method,
        )
    except Exception as e:
        # UI boundary: surface any inference failure in the status box.
        return None, f"Error during inference: {e}"

    if output_path.exists():
        return str(output_path), "Conversion completed successfully!"
    return None, "Error: Output file was not generated."


# Custom CSS for premium styling matching the blue-purple theme
custom_css = """
body {
    background-color: #0b0c10;
}
.gradio-container {
    background-color: #0b0c10 !important;
    font-family: 'Outfit', 'Inter', sans-serif !important;
    max-width: 900px !important;
    margin: 0 auto !important;
    border-radius: 12px;
}
.header-area {
    text-align: center;
    padding: 20px 0;
}
.header-title {
    color: #4f46e5;
    background: linear-gradient(90deg, #818cf8 0%, #c084fc 100%);
    -webkit-background-clip: text;
    -webkit-text-fill-color: transparent;
    font-weight: 800 !important;
    font-size: 2.5rem !important;
    margin-bottom: 0.5rem;
}
.header-desc {
    color: #9ca3af;
    font-size: 1.1rem;
    margin-bottom: 20px;
}
.main-box {
    background: rgba(17, 24, 39, 0.7);
    border: 1px solid rgba(255, 255, 255, 0.1);
    backdrop-filter: blur(10px);
    border-radius: 16px;
    padding: 20px;
    margin-bottom: 20px;
}
.convert-btn {
    background: linear-gradient(135deg, #6366f1 0%, #a855f7 100%) !important;
    border: none !important;
    color: white !important;
    font-weight: bold !important;
    transition: all 0.3s ease !important;
}
.convert-btn:hover {
    transform: translateY(-2px);
    box-shadow: 0 4px 20px rgba(139, 92, 246, 0.4);
}
"""

# Build Gradio UI
# NOTE(review): the component nesting below was reconstructed from a source
# whose indentation was lost - confirm the layout (in particular which column
# holds the convert button) against the deployed app.
with gr.Blocks(css=custom_css, theme=gr.themes.Soft()) as demo:
    # Banner/Header
    with gr.Row():
        # Fallback to CDN URL if local banner is missing or is just a small Git LFS pointer file
        has_local_banner = BANNER_PATH.exists() and BANNER_PATH.stat().st_size > 5000
        banner_source = str(BANNER_PATH) if has_local_banner else BANNER_URL
        gr.Image(banner_source, show_label=False, container=False, interactive=False)

    with gr.Row(elem_classes=["header-area"]):
        # Apply the .header-title / .header-desc rules defined in custom_css;
        # without these classes the two rules are never used.
        gr.HTML(
            "<h1 class='header-title'>🗣️ Quevedo Voice Model (so-vits-svc-fork)</h1>"
            "<p class='header-desc'>Convert any voice or singing file into the voice "
            "of the Spanish singer Quevedo.</p>"
        )

    # Main conversion section
    with gr.Row(elem_classes=["main-box"]):
        with gr.Column(scale=1):
            gr.Markdown("### 📥 1. Audio Input")
            input_audio = gr.Audio(
                label="Audio to Convert (Clean Vocals / Acapella)",
                type="filepath",
                sources=["upload", "microphone"],
            )

            gr.Markdown("### ⚙️ 2. Conversion Parameters")
            speaker = gr.Dropdown(
                choices=speakers,
                value=speakers[0],
                label="Speaker Name",
            )
            transpose = gr.Slider(
                minimum=-12,
                maximum=12,
                value=0,
                step=1,
                label="Pitch Shift (Semitones)",
                # Negative values lower the pitch, so the -5 to -12 example
                # belongs with "decrease".
                info="Decrease for female-to-male voices (e.g. -5 to -12), or increase for male-to-female.",
            )

            with gr.Accordion("Advanced Options", open=False):
                auto_predict_f0 = gr.Checkbox(
                    value=False,
                    label="Auto Predict F0",
                    info="Recommended for speech/narration. UNCHECK for singing to preserve notes.",
                )
                f0_method = gr.Dropdown(
                    choices=["crepe", "crepe-tiny", "parselmouth", "dio", "harvest"],
                    value="crepe",
                    label="F0 Predictor Algorithm",
                    info="crepe offers the best quality but is slower; dio is the fastest.",
                )
                noise_scale = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.4,
                    step=0.05,
                    label="Noise Scale",
                    info="Controls pitch variance and expressiveness (0.4 is standard).",
                )

        with gr.Column(scale=1):
            gr.Markdown("### 📤 3. Output Audio")
            output_audio = gr.Audio(
                label="Converted Audio",
                type="filepath",
            )
            status_output = gr.Textbox(
                label="Status",
                value="Ready",
                interactive=False,
            )
            submit_btn = gr.Button(
                "Convert Voice 🚀",
                variant="primary",
                elem_classes=["convert-btn"],
            )

    submit_btn.click(
        fn=convert_voice,
        inputs=[input_audio, speaker, transpose, auto_predict_f0, f0_method, noise_scale],
        outputs=[output_audio, status_output],
    )

    # Footer
    # NOTE(review): this text carries no markup, so the newlines collapse to
    # spaces when rendered - confirm whether wrapper/line-break tags are wanted.
    gr.HTML(
        "\n"
        "This model is for artistic demonstration and research purposes only. "
        "Uses so-vits-svc-fork for inference.\n"
        "Developed with 💜 for the open voice community.\n"
    )

if __name__ == "__main__":
    demo.launch(server_name="0.0.0.0", server_port=7860)