Spaces: Running on Zero
| import spaces | |
| import gradio as gr | |
| from audiosr import super_resolution, build_model | |
| import torch | |
| import gc # free up memory | |
| import soundfile as sf # read audio | |
| import math # For dynamic gpu duration calculation | |
| # Estimate a dynamic gpu duration done by a private Benchmarking HuggingFace ZeroGPU (H200) Space on the 16th November 2025 for saving quota | |
def get_duration(audio_file, model_name, guidance_scale, ddim_steps, seed):
    """Estimate the ZeroGPU reservation (in whole seconds) for one inference call.

    Constants come from a private benchmarking ZeroGPU (H200) Space run on
    16 November 2025.  The unused parameters (model_name, guidance_scale,
    seed) are kept so the signature mirrors `inference`, as a dynamic
    duration callback must accept the same arguments.

    Returns 0 when no file is given, and a safe fallback of 60 when the
    audio file cannot be inspected.
    """
    if not audio_file:
        return 0
    try:
        clip_seconds = sf.info(audio_file).duration

        # Benchmark-derived linear model: ~24 s model-load overhead (the
        # higher, 'speech'-model figure), ~1.0 s per second of audio and
        # ~0.06 s per DDIM step (fit from ~11 s for 8 s audio @ 50 steps).
        estimate = 24 + clip_seconds * 1.0 + ddim_steps * 0.06

        # Add a 10 s safety buffer, then clamp to [50, 180]: at least
        # enough for model load + buffer, at most the ZeroGPU maximum.
        clamped = max(50, min(180, estimate + 10))
        print("FINAL DURATION", clamped)
        return math.ceil(clamped)
    except Exception as e:
        # Reading the audio failed — reserve a safe default instead.
        print(f"Error in get_duration, using fallback (60): {e}")
        return 60
# The @spaces.GPU decorator is what actually requests a ZeroGPU slot; wiring
# get_duration here is what makes the benchmark-based dynamic duration (and
# the `import spaces` at the top of the file) take effect.
@spaces.GPU(duration=get_duration)
def inference(audio_file, model_name, guidance_scale, ddim_steps, seed):
    """Run AudioSR super-resolution on `audio_file`.

    Args:
        audio_file: Path to the input audio (Gradio `filepath` input).
        model_name: AudioSR checkpoint to build ("basic" or "speech").
        guidance_scale: Classifier-free guidance scale.
        ddim_steps: Number of DDIM sampling steps.
        seed: RNG seed; 0 means "pick a random seed".

    Returns:
        (48000, waveform): sample rate and the upscaled audio array,
        matching Gradio's `numpy` audio output format.

    Raises:
        gr.Error: if no audio file was provided.
    """
    if not audio_file:
        print("No audio file provided, skipping inference.")
        raise gr.Error(
            "Please upload an audio file."
        )

    # A fresh model is built per call (ZeroGPU workers are ephemeral).
    audiosr = build_model(model_name=model_name)

    # Free as much memory as possible before sampling. gc.collect() is
    # useful on CPU too, so it is no longer gated behind the CUDA check.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    # Seed value 0 is the UI convention for "randomize".
    if seed == 0:
        import random
        seed = random.randint(1, 2**32 - 1)

    waveform = super_resolution(
        audiosr,
        audio_file,
        seed,
        guidance_scale=guidance_scale,
        ddim_steps=ddim_steps
    )

    # Drop the model reference before cleanup so its tensors can actually
    # be reclaimed by empty_cache()/collect() instead of surviving them.
    del audiosr
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    gc.collect()

    return (48000, waveform)
# Build the Gradio UI: one audio input plus sampling controls in, an
# upscaled 48 kHz waveform out.
iface = gr.Interface(
    fn=inference,
    inputs=[
        gr.Audio(type="filepath", label="Input Audio"),
        # "basic" is the general-purpose checkpoint; "speech" is tuned for voice.
        gr.Dropdown(["basic", "speech"], value="basic", label="Model"),
        # Fixed typo in the user-facing info text: "relavancy" -> "relevancy".
        gr.Slider(1, 10, value=3.5, step=0.1, label="Guidance Scale", info="Guidance scale (Large => better quality and relevancy to text; Small => better diversity)"),
        gr.Slider(1, 100, value=50, step=1, label="DDIM Steps", info="The sampling step for DDIM"),
        gr.Number(value=42, precision=0, label="Seed", info="Changing this value (any integer number) will lead to a different generation result, put 0 for a random one."),
    ],
    outputs=gr.Audio(type="numpy", label="Output Audio"),
    title="AudioSR",
    description="Audio Super Resolution with AudioSR. <br> It estimates a dynamic gpu duration done by a private Benchmarking HuggingFace ZeroGPU (H200) Space on the 16th November 2025 for saving quota.",
)

iface.launch(share=False)