import spaces
import gradio as gr
from audiosr import super_resolution, build_model
import torch
import gc
import soundfile as sf
import math
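# Dynamic ZeroGPU duration: estimate, per request, how many GPU seconds to
# reserve. The constants below come from the private H200 benchmarking Space
# mentioned in the interface description.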
def get_duration(audio_file, model_name, guidance_scale, ddim_steps, seed):
    # The signature must mirror inference(): ZeroGPU calls this estimator
    # with the same arguments as the decorated function.
    if not audio_file:
        return 0

    try:
        info = sf.info(audio_file)
        audio_duration = info.duration

        # Benchmarked cost model: a fixed startup overhead plus terms linear
        # in the input length and in the number of DDIM steps.
        base_overhead = 24
        time_per_audio_second = 1.0
        time_per_ddim_step = 0.06

        estimated_time = base_overhead + (audio_duration * time_per_audio_second) + (ddim_steps * time_per_ddim_step)

        # Pad the estimate, then clamp it to a sensible range.
        safety_buffer = 10
        calculated_duration = estimated_time + safety_buffer

        min_duration = 50
        max_duration = 180

        final_duration = max(min_duration, min(max_duration, calculated_duration))
        print("FINAL DURATION", final_duration)
        return math.ceil(final_duration)

    except Exception as e:
        print(f"Error in get_duration, using fallback (60): {e}")
        return 60
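# @spaces.GPU accepts a callable for `duration`; it is invoked with the same
# arguments as the decorated function, and its return value is the number of
# GPU seconds reserved before the worker is attached.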
@spaces.GPU(duration=get_duration)
def inference(audio_file, model_name, guidance_scale, ddim_steps, seed):
    if not audio_file:
        print("No audio file provided, skipping inference.")
        raise gr.Error("Please upload an audio file.")

    # Build the model for the selected variant on each request.
    audiosr = build_model(model_name=model_name)

    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # A seed of 0 means "pick a random seed".
    if seed == 0:
        import random
        seed = random.randint(1, 2**32 - 1)

    waveform = super_resolution(
        audiosr,
        audio_file,
        seed,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps,
    )

    # Release GPU memory before the worker returns to the pool.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return (48000, waveform)
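# gr.Audio(type="numpy") expects a (sample_rate, numpy_array) tuple from the
# function; AudioSR produces 48 kHz audio, hence the fixed rate returned above.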
iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        gr.Dropdown(["basic", "speech"], value="basic", label="Model"),
        gr.Slider(1, 10, value=3.5, step=0.1, label="Guidance Scale", info="Guidance scale (large => better quality and relevance; small => better diversity)"),
        gr.Slider(1, 100, value=50, step=1, label="DDIM Steps", info="Number of DDIM sampling steps"),
        gr.Number(value=42, precision=0, label="Seed", info="Changing this value (any integer) gives a different result; set 0 for a random seed.")
    ],
    outputs=gr.Audio(type="numpy", label="Output Audio"),
    title="AudioSR",
    description="Audio Super Resolution with AudioSR. <br> To save quota, the GPU duration is estimated dynamically, using timings from a private benchmarking HuggingFace ZeroGPU (H200) Space (16 November 2025)."
)

iface.launch(share=False)