| import gradio as gr |
| import torch |
| import torchaudio |
| import numpy as np |
| from pathlib import Path |
| import os |
|
|
| |
| |
|
|
def process_audio(
    audio_input,
    model_name,
    pitch_conversion,
    semitones,
    reverb,
    algorithm,
    main_vocals_vol,
    backup_vocals_vol,
    instrumentals_vol,
    protection,
    index_rate,
    filter_radius,
    rms_envelope,
    use_onnx,
    cpu_threads
):
    """
    Process audio with an RVC voice model (demo stub).

    In this template the function performs no actual conversion: it validates
    the input and echoes the audio path back with a status message. A real
    implementation would run RVC inference here using all of the settings.

    Parameters
    ----------
    audio_input : str | None
        Path to the uploaded audio file (Gradio ``Audio(type="filepath")``),
        or ``None`` when nothing was provided.
    model_name : str
        Selected voice-model name (unused in demo mode).
    pitch_conversion, semitones, reverb, algorithm, main_vocals_vol,
    backup_vocals_vol, instrumentals_vol, protection, index_rate,
    filter_radius, rms_envelope, use_onnx, cpu_threads
        Processing settings forwarded from the UI (unused in demo mode).

    Returns
    -------
    tuple
        ``(audio, status_message)`` — the unchanged input path and a success
        message in demo mode, or ``(None, error_message)`` on missing input.
    """
    # Gradio passes None for the filepath when no file was supplied.
    if audio_input is None:
        return None, "Please provide an audio file"

    # Demo mode: no inference is performed; echo the input back unchanged.
    # (Fix: the message was an f-string with no placeholders — ruff F541.)
    return audio_input, "Processing complete! Settings optimized for maximum voice similarity (Demo mode)"
|
|
def process_youtube(url, model_name, *args):
    """
    Download and process YouTube audio (demo stub).

    Parameters
    ----------
    url : str
        YouTube video URL; falsy values (empty string / None) are rejected.
    model_name : str
        Selected voice-model name (unused in demo mode).
    *args
        Remaining processing settings forwarded from the UI (unused).

    Returns
    -------
    tuple
        ``(None, status_message)`` — downloading is not implemented yet.
    """
    # Reject empty/None URLs before attempting anything.
    if not url:
        return None, "Please provide a YouTube URL"

    # Placeholder: a real implementation would download the audio
    # (e.g. with yt-dlp) and hand it to process_audio().
    # (Fix: the message was an f-string with no placeholders — ruff F541.)
    return None, "YouTube processing not yet implemented in this demo"
|
|
| |
# Build the Gradio UI. Component creation order inside each Row/Column
# determines on-screen layout, so statements here are order-sensitive.
with gr.Blocks(theme=gr.themes.Soft()) as app:
    # App header.
    gr.Markdown("""
    # 🎤 AI Cover Generator
    ### Transform any song with AI voice models - CPU Optimized
    """)

    with gr.Row():
        # Left column: voice-model selection (dropdown, file upload, or URL).
        with gr.Column(scale=1):
            gr.Markdown("## 🎵 Model Selection")

            # Placeholder model names; a real backend would populate these
            # from the available model files.
            model_dropdown = gr.Dropdown(
                choices=["Model 1", "Model 2", "Model 3"],
                label="Select Voice Model",
                value="Model 1"
            )

            # NOTE(review): model_upload and model_url are not wired to any
            # event handler below — confirm whether they should feed the
            # processing pipeline.
            model_upload = gr.File(
                label="Upload Model File (.pth, .pt)",
                file_types=[".pth", ".pt", ".ckpt"]
            )

            model_url = gr.Textbox(
                label="Or enter model URL",
                placeholder="https://huggingface.co/..."
            )

        # Right column: audio input, switchable between file upload and
        # YouTube URL.
        with gr.Column(scale=1):
            gr.Markdown("## 🎧 Audio Input")

            input_type = gr.Radio(
                choices=["File Upload", "YouTube URL"],
                label="Input Type",
                value="File Upload",
                type="value"
            )

            # type="filepath" means handlers receive a path string, not audio
            # data — process_audio relies on this.
            audio_input = gr.Audio(
                label="Upload Audio File",
                type="filepath"
            )

            # Hidden until "YouTube URL" is selected (see toggle_input).
            youtube_url = gr.Textbox(
                label="YouTube URL",
                placeholder="https://www.youtube.com/watch?v=...",
                visible=False
            )

            def toggle_input(choice):
                """Show exactly one input widget matching the selected type.

                Returns a dict keyed by component; Gradio matches the keys
                against the listener's ``outputs`` list.
                """
                return {
                    audio_input: gr.update(visible=choice == "File Upload"),
                    youtube_url: gr.update(visible=choice == "YouTube URL")
                }

            # Re-run visibility toggle whenever the radio selection changes.
            input_type.change(
                toggle_input,
                inputs=[input_type],
                outputs=[audio_input, youtube_url]
            )

    # Collapsed-by-default section: pitch / reverb / mixing controls.
    with gr.Accordion("⚙️ Audio Processing Settings", open=False):
        with gr.Row():
            # NOTE(review): choices are -1/0/1 but the info text talks about
            # ±12 semitones — looks like octave shift vs. the separate
            # "Semitones" slider; confirm intended semantics.
            pitch_conversion = gr.Radio(
                choices=[-1, 0, 1],
                label="Pitch Conversion",
                value=0,
                info="Use +12 semitones for male→female, -12 for female→male"
            )
            semitones = gr.Slider(
                minimum=-12,
                maximum=12,
                value=0,
                step=1,
                label="Semitones"
            )
            reverb = gr.Slider(
                minimum=0,
                maximum=100,
                value=0,
                label="Reverb (%)"
            )

        with gr.Row():
            algorithm = gr.Dropdown(
                choices=["rmvpe", "mangio-crepe", "crepe", "fcpe"],
                label="Pitch Extraction Algorithm",
                value="rmvpe",
                info="RMVPE recommended: fast & accurate"
            )

        # Per-stem gain controls in dB (0 = unchanged).
        with gr.Row():
            main_vocals_vol = gr.Slider(
                minimum=-20,
                maximum=20,
                value=0,
                label="Main Vocals (dB)"
            )
            backup_vocals_vol = gr.Slider(
                minimum=-20,
                maximum=20,
                value=0,
                label="Backup Vocals (dB)"
            )
            instrumentals_vol = gr.Slider(
                minimum=-20,
                maximum=20,
                value=0,
                label="Instrumentals (dB)"
            )

    # Open-by-default section: the RVC similarity/quality knobs.
    with gr.Accordion("🎯 Voice Quality & Similarity Settings", open=True):
        gr.Markdown("""
        ### Optimize these settings for maximum voice similarity
        These parameters control how closely the output matches the target voice
        """)

        with gr.Row():
            index_rate = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.75,
                step=0.01,
                label="Index Rate",
                info="Higher = more similar to target voice (0.75-0.85 recommended)"
            )
            protection = gr.Slider(
                minimum=0,
                maximum=0.5,
                value=0.33,
                step=0.01,
                label="Voice Protection",
                info="Prevents artifacts in consonants (0.33-0.5 recommended)"
            )

        with gr.Row():
            filter_radius = gr.Slider(
                minimum=0,
                maximum=7,
                value=3,
                step=1,
                label="Filter Radius",
                info="Median filtering for smoother pitch (≥3 reduces breathiness)"
            )
            rms_envelope = gr.Slider(
                minimum=0,
                maximum=1,
                value=0.25,
                step=0.01,
                label="Volume Envelope Mix",
                info="Controls volume envelope blend (0.25 recommended)"
            )

    # Collapsed-by-default section: CPU/ONNX runtime options.
    with gr.Accordion("🚀 CPU Optimization Settings", open=False):
        with gr.Row():
            use_onnx = gr.Checkbox(
                label="Use ONNX (CPU Optimized)",
                value=True
            )
            cpu_threads = gr.Slider(
                minimum=1,
                maximum=16,
                value=4,
                step=1,
                label="CPU Threads"
            )

        gr.Markdown("""
        ### Performance Tips:
        - **ONNX format** is much faster on CPU
        - **RMVPE algorithm** is 2-3x faster than Crepe
        - More CPU threads = faster (if available)
        - Expect ~30-60 seconds for a 3-5 minute song

        ### For Maximum Voice Similarity:
        - **Index Rate 0.75-0.85**: Controls how much the model uses the training data index
        - **Protection 0.33-0.5**: Protects voiceless consonants without losing quality
        - **Filter Radius ≥3**: Smooths pitch transitions and reduces breathiness
        - **Train with 5-10 minutes** of clear, noise-free target voice audio
        - **Use 200+ epochs** for training to maximize similarity
        """)

    generate_btn = gr.Button("🎵 Generate AI Cover", variant="primary", size="lg")

    # Output components: processed audio plus a human-readable status string.
    with gr.Row():
        output_audio = gr.Audio(label="Generated Cover")
        output_message = gr.Textbox(label="Status")

    # Wire the button to the (demo-stub) processor. The inputs list must
    # stay in the same order as process_audio's parameters.
    # NOTE(review): only audio_input is used here — youtube_url and
    # process_youtube are never invoked; confirm whether a second handler
    # (or input_type branching) is intended.
    generate_btn.click(
        fn=process_audio,
        inputs=[
            audio_input,
            model_dropdown,
            pitch_conversion,
            semitones,
            reverb,
            algorithm,
            main_vocals_vol,
            backup_vocals_vol,
            instrumentals_vol,
            protection,
            index_rate,
            filter_radius,
            rms_envelope,
            use_onnx,
            cpu_threads
        ],
        outputs=[output_audio, output_message]
    )

    # Footer: integration checklist for turning the template into a real app.
    gr.Markdown("""
    ---
    ### 📝 Note
    This is a template interface. To make it fully functional, you need to:
    1. Integrate actual RVC (Retrieval-based Voice Conversion) backend
    2. Add model loading and caching logic
    3. Implement YouTube download functionality
    4. Add vocal separation (UVR5) if needed

    See the deployment guide for more details!
    """)
|
|
if __name__ == "__main__":
    # Bind to all interfaces on port 7860 (the conventional Gradio /
    # Hugging Face Spaces port) so the app is reachable from outside a
    # container.
    app.launch(server_name="0.0.0.0", server_port=7860)