# AIConvers / app.py
# Uploaded by sugakrit6 ("Update app.py", commit 1b61020, verified)
import gradio as gr
import torch
import torchaudio
import numpy as np
from pathlib import Path
import os
# This is a template - you'll need to add actual RVC processing
# For now, this creates the UI structure
def process_audio(
    audio_input,
    model_name,
    pitch_conversion,
    semitones,
    reverb,
    algorithm,
    main_vocals_vol,
    backup_vocals_vol,
    instrumentals_vol,
    protection,
    index_rate,
    filter_radius,
    rms_envelope,
    use_onnx,
    cpu_threads
):
    """Run RVC voice conversion on an uploaded audio file (demo stub).

    This is a template handler: it validates the input and echoes it back.
    A real implementation would load ``model_name`` and apply the RVC
    pipeline using the settings below.

    Args:
        audio_input: Filepath of the uploaded audio (``gr.Audio`` with
            ``type="filepath"``), or None if nothing was provided.
        model_name: Selected voice-model identifier (unused in the stub).
        pitch_conversion: Coarse pitch direction (-1, 0, or 1).
        semitones: Fine pitch shift in semitones (-12..12).
        reverb: Reverb amount in percent (0..100).
        algorithm: Pitch-extraction algorithm name (e.g. "rmvpe").
        main_vocals_vol: Main-vocal gain in dB.
        backup_vocals_vol: Backup-vocal gain in dB.
        instrumentals_vol: Instrumental gain in dB.
        protection: Consonant/breath protection (0..0.5; 0.33-0.5 recommended).
        index_rate: Feature-index blend (0..1; 0.75-0.85 for max similarity).
        filter_radius: Median-filter radius for pitch (>=3 reduces breathiness).
        rms_envelope: Volume-envelope mix (0..1; 0.25 recommended).
        use_onnx: Whether to use an ONNX model for CPU inference.
        cpu_threads: Number of CPU threads to use.

    Returns:
        Tuple of (audio, status_message). In this demo the input audio is
        returned unchanged; audio is None when validation fails.
    """
    # Guard clause: the Audio component yields None when no file was given.
    if audio_input is None:
        return None, "Please provide an audio file"
    # TODO: plug in the actual RVC processing here. Key parameters for
    # high similarity (per the UI hints):
    #   - index_rate: 0.75-0.85 for maximum similarity
    #   - protection: 0.33-0.5 to prevent artifacts
    #   - filter_radius: >=3 to reduce breathiness
    #   - rms_envelope: 0.25 for natural volume envelope
    # Fix: dropped the stray f-prefix — the literal has no placeholders (ruff F541).
    return audio_input, "Processing complete! Settings optimized for maximum voice similarity (Demo mode)"
def process_youtube(url, model_name, *args):
    """Download a YouTube video's audio and run RVC conversion (demo stub).

    Args:
        url: YouTube video URL; empty/None values are rejected.
        model_name: Selected voice-model identifier (unused in the stub).
        *args: Remaining processing settings forwarded by the UI (unused).

    Returns:
        Tuple of (audio, status_message); audio is always None in this demo.
    """
    # Guard clause: reject a missing or empty URL before doing any work.
    if not url:
        return None, "Please provide a YouTube URL"
    # TODO: download the audio (e.g. via yt-dlp) and feed it to the RVC pipeline.
    # Fix: dropped the stray f-prefix — the literal has no placeholders (ruff F541).
    return None, "YouTube processing not yet implemented in this demo"
# Create Gradio interface
# ---------------------------------------------------------------------------
# Gradio UI definition. Builds the whole interface (model selection, audio
# input, processing/quality/CPU settings) and wires the Generate button to
# process_audio. Note this runs at import time (module-level side effect),
# which is the usual pattern for a Hugging Face Spaces app.py.
# ---------------------------------------------------------------------------
with gr.Blocks(theme=gr.themes.Soft()) as app:
    # Page header.
    gr.Markdown("""
# 🎤 AI Cover Generator
### Transform any song with AI voice models - CPU Optimized
""")
    with gr.Row():
        # Left column: voice-model selection (dropdown, direct upload, or URL).
        with gr.Column(scale=1):
            gr.Markdown("## 🎵 Model Selection")
            model_dropdown = gr.Dropdown(
                # Placeholder model names; a real deployment would list
                # discovered checkpoints here.
                choices=["Model 1", "Model 2", "Model 3"],
                label="Select Voice Model",
                value="Model 1"
            )
            # Alternative input: upload a checkpoint file directly.
            # NOTE(review): not connected to any handler in this demo.
            model_upload = gr.File(
                label="Upload Model File (.pth, .pt)",
                file_types=[".pth", ".pt", ".ckpt"]
            )
            # Alternative input: fetch a model from a URL.
            # NOTE(review): also not connected to any handler in this demo.
            model_url = gr.Textbox(
                label="Or enter model URL",
                placeholder="https://huggingface.co/..."
            )
        # Right column: audio source — local file or YouTube link.
        with gr.Column(scale=1):
            gr.Markdown("## 🎧 Audio Input")
            input_type = gr.Radio(
                choices=["File Upload", "YouTube URL"],
                label="Input Type",
                value="File Upload",
                type="value"
            )
            audio_input = gr.Audio(
                label="Upload Audio File",
                # "filepath" means handlers receive a path string, not raw samples.
                type="filepath"
            )
            # Hidden until the user picks "YouTube URL" in the radio above.
            youtube_url = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                visible=False
            )

            def toggle_input(choice):
                # Show exactly one of the two input widgets depending on the
                # radio selection. Returns a dict keyed by component so each
                # output listed in .change() gets its matching update.
                return {
                    audio_input: gr.update(visible=choice == "File Upload"),
                    youtube_url: gr.update(visible=choice == "YouTube URL")
                }
            input_type.change(
                toggle_input,
                inputs=[input_type],
                outputs=[audio_input, youtube_url]
            )
    # Collapsed accordion: pitch / reverb / per-stem mixing settings that are
    # forwarded to process_audio.
    with gr.Accordion("⚙️ Audio Processing Settings", open=False):
        with gr.Row():
            pitch_conversion = gr.Radio(
                # Coarse pitch direction; fine control is the Semitones slider.
                choices=[-1, 0, 1],
                label="Pitch Conversion",
                value=0,
                info="Use +12 semitones for male→female, -12 for female→male"
            )
            semitones = gr.Slider(
                minimum=-12,
                maximum=12,
                value=0,
                step=1,
                label="Semitones"
            )
            reverb = gr.Slider(
                minimum=0,
                maximum=100,
                value=0,
                label="Reverb (%)"
            )
        with gr.Row():
            algorithm = gr.Dropdown(
                choices=["rmvpe", "mangio-crepe", "crepe", "fcpe"],
                label="Pitch Extraction Algorithm",
                value="rmvpe",
                info="RMVPE recommended: fast & accurate"
            )
        # Per-stem gain offsets (dB) used when mixing the final cover.
        with gr.Row():
            main_vocals_vol = gr.Slider(
                minimum=-20,
                maximum=20,
                value=0,
                label="Main Vocals (dB)"
            )
            backup_vocals_vol = gr.Slider(
                minimum=-20,
                maximum=20,
                value=0,
                label="Backup Vocals (dB)"
            )
            instrumentals_vol = gr.Slider(
                minimum=-20,
                maximum=20,
                value=0,
                label="Instrumentals (dB)"
            )
    # Open accordion: RVC quality/similarity knobs — the defaults match the
    # "recommended" values in the info strings.
    with gr.Accordion("🎯 Voice Quality & Similarity Settings", open=True):
        gr.Markdown("""
### Optimize these settings for maximum voice similarity
These parameters control how closely the output matches the target voice
""")
        with gr.Row():
            index_rate = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.75,
                step=0.01,
                label="Index Rate",
                info="Higher = more similar to target voice (0.75-0.85 recommended)"
            )
            protection = gr.Slider(
                minimum=0,
                maximum=0.5,
                value=0.33,
                step=0.01,
                label="Voice Protection",
                info="Prevents artifacts in consonants (0.33-0.5 recommended)"
            )
        with gr.Row():
            filter_radius = gr.Slider(
                minimum=0,
                maximum=7,
                value=3,
                step=1,
                label="Filter Radius",
                info="Median filtering for smoother pitch (≥3 reduces breathiness)"
            )
            rms_envelope = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.25,
                step=0.01,
                label="Volume Envelope Mix",
                info="Controls volume envelope blend (0.25 recommended)"
            )
    # Collapsed accordion: CPU inference options.
    with gr.Accordion("🚀 CPU Optimization Settings", open=False):
        with gr.Row():
            use_onnx = gr.Checkbox(
                label="Use ONNX (CPU Optimized)",
                value=True
            )
            cpu_threads = gr.Slider(
                minimum=1,
                maximum=16,
                value=4,
                step=1,
                label="CPU Threads"
            )
        gr.Markdown("""
### Performance Tips:
- **ONNX format** is much faster on CPU
- **RMVPE algorithm** is 2-3x faster than Crepe
- More CPU threads = faster (if available)
- Expect ~30-60 seconds for a 3-5 minute song
### For Maximum Voice Similarity:
- **Index Rate 0.75-0.85**: Controls how much the model uses the training data index
- **Protection 0.33-0.5**: Protects voiceless consonants without losing quality
- **Filter Radius ≥3**: Smooths pitch transitions and reduces breathiness
- **Train with 5-10 minutes** of clear, noise-free target voice audio
- **Use 200+ epochs** for training to maximize similarity
""")
    generate_btn = gr.Button("🎵 Generate AI Cover", variant="primary", size="lg")
    with gr.Row():
        output_audio = gr.Audio(label="Generated Cover")
        output_message = gr.Textbox(label="Status")
    # Connect the button. The order of this inputs list must match
    # process_audio's positional parameters exactly.
    # NOTE(review): only model_dropdown is passed — model_upload/model_url and
    # the YouTube path (process_youtube) are not wired to any event.
    generate_btn.click(
        fn=process_audio,
        inputs=[
            audio_input,
            model_dropdown,
            pitch_conversion,
            semitones,
            reverb,
            algorithm,
            main_vocals_vol,
            backup_vocals_vol,
            instrumentals_vol,
            protection,
            index_rate,
            filter_radius,
            rms_envelope,
            use_onnx,
            cpu_threads
        ],
        outputs=[output_audio, output_message]
    )
    # Footer: implementation checklist for turning the template into a real app.
    gr.Markdown("""
---
### 📝 Note
This is a template interface. To make it fully functional, you need to:
1. Integrate actual RVC (Retrieval-based Voice Conversion) backend
2. Add model loading and caching logic
3. Implement YouTube download functionality
4. Add vocal separation (UVR5) if needed
See the deployment guide for more details!
""")
# Launch only when executed directly. 0.0.0.0:7860 is the standard binding
# for a Hugging Face Spaces container.
if __name__ == "__main__":
    app.launch(server_name="0.0.0.0", server_port=7860)