Spaces: Running on Zero
| import spaces | |
| import gradio as gr | |
| from audiosr import super_resolution, build_model | |
| import torch | |
| import gc # free up memory | |
| import soundfile as sf # read audio | |
| import math # For dynamic gpu duration calculation | |
| # Estimate a dynamic gpu duration done by a private Benchmarking HuggingFace ZeroGPU (H200) Space on the 16th November 2025 for saving quota | |
def get_duration(audio_file, model_name, guidance_scale, ddim_steps, seed):
    """Estimate the ZeroGPU reservation (in whole seconds) for one inference call.

    Constants come from a private benchmarking ZeroGPU (H200) Space run on
    16 November 2025.  The unused parameters (model_name, guidance_scale,
    seed) are kept so the signature mirrors `inference`, as a dynamic
    duration callback must accept the same arguments.

    Returns 0 when no file is given, and a safe fallback of 60 when the
    audio file cannot be inspected.
    """
    if not audio_file:
        return 0
    try:
        clip_seconds = sf.info(audio_file).duration

        # Benchmark-derived linear model: ~24 s model-load overhead (the
        # higher, 'speech'-model figure), ~1.0 s per second of audio and
        # ~0.06 s per DDIM step (fit from ~11 s for 8 s audio @ 50 steps).
        estimate = 24 + clip_seconds * 1.0 + ddim_steps * 0.06

        # Add a 10 s safety buffer, then clamp to [50, 180]: at least
        # enough for model load + buffer, at most the ZeroGPU maximum.
        clamped = max(50, min(180, estimate + 10))
        print("FINAL DURATION", clamped)
        return math.ceil(clamped)
    except Exception as e:
        # Reading the audio failed — reserve a safe default instead.
        print(f"Error in get_duration, using fallback (60): {e}")
        return 60
# The @spaces.GPU decorator is what actually requests a ZeroGPU slot; wiring
# get_duration here is what makes the benchmark-based dynamic duration (and
# the `import spaces` at the top of the file) take effect.
@spaces.GPU(duration=get_duration)
def inference(audio_file, model_name, guidance_scale, ddim_steps, seed):
    """Run AudioSR super-resolution on `audio_file`.

    Args:
        audio_file: Path to the input audio (Gradio `filepath` input).
        model_name: AudioSR checkpoint to build ("basic" or "speech").
        guidance_scale: Classifier-free guidance scale.
        ddim_steps: Number of DDIM sampling steps.
        seed: RNG seed; 0 means "pick a random seed".

    Returns:
        (48000, waveform): sample rate and the upscaled audio array,
        matching Gradio's `numpy` audio output format.

    Raises:
        gr.Error: if no audio file was provided.
    """
    if not audio_file:
        print("No audio file provided, skipping inference.")
        raise gr.Error(
            "Please upload an audio file."
        )

    # A fresh model is built per call (ZeroGPU workers are ephemeral).
    audiosr = build_model(model_name=model_name)

    # Free as much memory as possible before sampling. gc.collect() is
    # useful on CPU too, so it is no longer gated behind the CUDA check.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Seed value 0 is the UI convention for "randomize".
    if seed == 0:
        import random
        seed = random.randint(1, 2**32 - 1)

    waveform = super_resolution(
        audiosr,
        audio_file,
        seed,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps
    )

    # Drop the model reference before cleanup so its tensors can actually
    # be reclaimed by empty_cache()/collect() instead of surviving them.
    del audiosr
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return (48000, waveform)
# Build the Gradio UI: one audio input plus sampling controls in, an
# upscaled 48 kHz waveform out.
iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        # "basic" is the general-purpose checkpoint; "speech" is tuned for voice.
        gr.Dropdown(["basic", "speech"], value="basic", label="Model"),
        # Fixed typo in the user-facing info text: "relavancy" -> "relevancy".
        gr.Slider(1, 10, value=3.5, step=0.1, label="Guidance Scale", info="Guidance scale (Large => better quality and relevancy to text; Small => better diversity)"),
        gr.Slider(1, 100, value=50, step=1, label="DDIM Steps", info="The sampling step for DDIM"),
        gr.Number(value=42, precision=0, label="Seed", info="Changing this value (any integer number) will lead to a different generation result, put 0 for a random one."),
    ],
    outputs=gr.Audio(type="numpy", label="Output Audio"),
    title="AudioSR",
    description="Audio Super Resolution with AudioSR. <br> It estimates a dynamic gpu duration done by a private Benchmarking HuggingFace ZeroGPU (H200) Space on the 16th November 2025 for saving quota.",
)

iface.launch(share=False)