# VoiceoverStudio / app.py
# (Hugging Face Space by lifesee — commit 89f7924)
import gradio as gr
import io, os, uuid, zipfile, tempfile, subprocess
from pydub import AudioSegment
from pydub.silence import split_on_silence
# ---------- helpers ----------
def _load(file_or_bytes):
    """Decode audio from raw bytes, an open file-like object, or a path.

    Fix: the original had two identical branches — the ``hasattr(..., "read")``
    case and the fallback both called ``AudioSegment.from_file`` directly —
    collapsed into one, since pydub accepts file-like objects and paths alike.
    """
    if isinstance(file_or_bytes, (bytes, bytearray)):
        # Raw bytes must be wrapped in a buffer before pydub can decode them.
        return AudioSegment.from_file(io.BytesIO(file_or_bytes))
    return AudioSegment.from_file(file_or_bytes)
def _export(seg: AudioSegment, fmt="mp3") -> io.BytesIO:
    """Encode *seg* into an in-memory buffer in the given format and rewind it."""
    encoded = io.BytesIO()
    seg.export(encoded, format=fmt)
    # Rewind so callers can read from position 0 immediately.
    encoded.seek(0)
    return encoded
def remove_silence(seg: AudioSegment, keep_ms=250, min_silence_ms=120, thresh_db=-45):
    """Collapse long pauses in *seg* down to a fixed length.

    keep_ms: how much silence to keep at each cut (your final pause length)
    min_silence_ms: only treat silence >= this length as a pause
    thresh_db: what counts as "silence" (in dBFS), e.g., -45 for voiceovers
    """
    pieces = split_on_silence(
        seg,
        min_silence_len=int(min_silence_ms),
        silence_thresh=float(thresh_db),
        keep_silence=int(keep_ms),
    )
    # No detected speech chunks → return the input untouched.
    if not pieces:
        return seg
    joined = AudioSegment.silent(duration=0)
    for piece in pieces:
        joined += piece
    return joined
def trim_to_seconds(seg: AudioSegment, target_s: float):
    """Cut *seg* to exactly target_s seconds, padding with silence if shorter."""
    target_ms = max(0, int(float(target_s) * 1000))
    shortfall = target_ms - len(seg)
    if shortfall <= 0:
        # Long enough: hard cut at the target.
        return seg[:target_ms]
    # Too short: append silence to reach the target duration.
    return seg + AudioSegment.silent(duration=shortfall)
def _atempo_chain(factor: float) -> str:
# Split large/small adjustments into steps within [0.5, 2.0] for quality
steps = []
f = max(0.1, min(10.0, float(factor)))
while f < 0.5:
steps.append(0.5); f /= 0.5
while f > 2.0:
steps.append(2.0); f /= 2.0
steps.append(f)
return ",".join([f"atempo={s:.5f}" for s in steps])
def fit_to_seconds(seg: AudioSegment, target_s: float, fmt_out="mp3") -> io.BytesIO:
    """Pitch-preserving time stretch via FFmpeg atempo."""
    with tempfile.TemporaryDirectory() as workdir:
        src = os.path.join(workdir, "in.wav")
        dst = os.path.join(workdir, f"out.{fmt_out}")
        # Round-trip through WAV so FFmpeg gets a lossless intermediate.
        seg.export(src, format="wav")
        duration_s = max(0.01, len(seg) / 1000)  # guard against divide-by-zero
        filters = _atempo_chain(float(target_s) / duration_s)
        cmd = ["ffmpeg", "-y", "-i", src, "-vn", "-af", filters]
        if fmt_out == "mp3":
            # Explicit encoder/bitrate for mp3; other formats use FFmpeg defaults.
            cmd += ["-c:a", "libmp3lame", "-b:a", "128k"]
        cmd.append(dst)
        subprocess.run(cmd, check=True)
        with open(dst, "rb") as fh:
            return io.BytesIO(fh.read())
def normalize_lufs(seg: AudioSegment, target_lufs=-14.0):
    """Approximate loudness normalization via an RMS-based gain (not true LUFS).

    Fix: derive full scale from the segment's own sample width via pydub's
    ``max_possible_amplitude`` instead of the hard-coded ``1 << 15``, which
    silently assumed 16-bit audio and miscomputed gain for 8/24/32-bit input.
    """
    import math  # kept function-local, as in the original

    rms = seg.rms or 1  # guard log10(0) for digitally silent audio
    current_db = 20 * math.log10(rms / seg.max_possible_amplitude)
    gain_db = float(target_lufs) - current_db
    return seg.apply_gain(gain_db)
# ---------- processors ----------
def process_single(file, mode, target_seconds, keep_silence_s,
                   min_silence_ms, silence_thresh_db, do_normalize, fmt):
    """Clean pauses, optionally normalize loudness, and trim/fit one clip.

    file: raw bytes, an open binary file-like object, or a filesystem path.
    mode: "trim" (hard cut/pad), "fit" (FFmpeg time-stretch), or anything
          else for no timing change.
    Returns (BytesIO of encoded audio, human-readable before/after report).

    Fix: the original called ``file.read()`` for any non-bytes input, which
    raises AttributeError when a plain path string is passed; paths are now
    forwarded to the loader as-is.
    """
    if isinstance(file, (bytes, bytearray)):
        raw = file
    elif hasattr(file, "read"):
        raw = file.read()
    else:
        raw = file  # filesystem path — _load/pydub opens it directly
    original = _load(raw)
    # 1) pause cleanup
    cleaned = remove_silence(
        original,
        keep_ms=int(float(keep_silence_s) * 1000),
        min_silence_ms=int(min_silence_ms),
        thresh_db=float(silence_thresh_db),
    )
    # 2) loudness normalize
    if do_normalize:
        cleaned = normalize_lufs(cleaned, -14.0)
    # 3) timing
    if mode == "trim" and target_seconds:
        out = _export(trim_to_seconds(cleaned, target_seconds), fmt)
    elif mode == "fit" and target_seconds:
        out = fit_to_seconds(cleaned, target_seconds, fmt_out=fmt)
    else:
        out = _export(cleaned, fmt)
    before = len(original) / 1000
    # Re-decode the encoded output to report its true post-encode duration.
    after = len(_load(out.getvalue())) / 1000
    report = f"Before: {before:.2f}s | After: {after:.2f}s"
    return out, report
def process_batch(files, **kwargs) -> io.BytesIO:
    """Process several uploads and bundle the results into one in-memory ZIP."""
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for upload in files:
            blob, _ = process_single(upload, **kwargs)
            # Prefer the upload's own name; fall back to a random stem.
            base = getattr(upload, "name", f"audio_{uuid.uuid4().hex}")
            stem = os.path.splitext(base)[0]
            zf.writestr(f"{stem}_processed.{kwargs['fmt']}", blob.getvalue())
    archive.seek(0)
    return archive
def write_temp_for_preview(blob: io.BytesIO, fmt: str) -> str:
    """Persist *blob* to a named temp file and return its path.

    Gradio's audio/file widgets prefer a filesystem path, so the buffer is
    written to a non-deleting temp file whose suffix matches the format.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{fmt}") as handle:
        handle.write(blob.getvalue())
        # Context exit flushes and closes before the path is handed out.
        return handle.name
# ---------- UI (force two-column, compact) ----------
# Custom stylesheet injected into gr.Blocks: widens the canvas, forces a
# two-column grid (#twocol), and compacts the controls for a denser layout.
CSS = """
/* wider canvas */
.gradio-container { max-width: 1200px !important; margin: 0 auto !important; padding: 8px 10px !important; }
/* force two columns with sane minimums */
#twocol {
display: grid;
grid-template-columns: minmax(320px, 1fr) minmax(320px, 1fr);
gap: 12px;
align-items: start;
}
/* tighten component spacing */
#twocol .block, #twocol .form, #twocol .gap { gap: 8px !important; }
#twocol .gr-button { height: 40px; }
#twocol .gr-number input { height: 36px; }
#twocol .gr-textbox textarea { min-height: 40px; }
/* compact audio bar */
#preview-audio audio { width: 100%; height: 36px; }
/* Only stack on very small screens */
@media (max-width: 600px) {
#twocol { grid-template-columns: 1fr; }
}
"""
# ---- UI layout: two-column Blocks app (controls left, outputs right) ----
with gr.Blocks(title="AI Voice Studio – Simple", css=CSS) as demo:
    gr.Markdown("### AI Voice Studio — Set pause length; optionally **Trim** or **Fit** to exact time. Export MP3/WAV/M4A/OGG.")
    with gr.Row(elem_id="twocol"):
        # Left column: controls
        with gr.Column():
            # type="filepath" → the callback receives a list of path strings
            files = gr.Files(label="Upload audio", file_types=["audio"], type="filepath")
            mode = gr.Radio(["none", "trim", "fit"], value="none", label="Timing mode")
            target = gr.Number(value=30, label="Target seconds (used for trim/fit)")
            keep = gr.Number(value=0.25, label="Set pause length (seconds)")
            with gr.Accordion("Advanced options", open=False):
                min_sil = gr.Slider(50, 1000, 120, step=10, label="Pause if silence ≥ (ms)")
                thresh = gr.Slider(-80, -10, -45, step=1, label="Silence threshold (dBFS)")
                do_norm = gr.Checkbox(True, label="Normalize loudness (~-14 LUFS)")
                fmt = gr.Dropdown(["mp3","wav","m4a","ogg"], value="mp3", label="Output format")
            go = gr.Button("Process", variant="primary")
        # Right column: outputs
        with gr.Column():
            preview = gr.Audio(label="Preview (first file)", type="filepath", interactive=False, elem_id="preview-audio")
            direct = gr.File(label="Download processed file (single)")
            zip_out = gr.File(label="Download ZIP (if multiple)")
            rep = gr.Textbox(label="Report", lines=1)
def run(files, mode, target, keep, min_sil, thresh, do_norm, fmt):
files = files or []
if not files:
return None, None, None, "Please upload at least one audio file."
# process first file (preview + single download)
single_blob, report = process_single(
open(files[0], "rb"),
mode=mode, target_seconds=target, keep_silence_s=keep,
min_silence_ms=min_sil, silence_thresh_db=thresh,
do_normalize=do_norm, fmt=fmt
)
preview_path = write_temp_for_preview(single_blob, fmt)
if len(files) == 1:
return preview_path, single_blob, None, report
else:
opened = [open(p, "rb") for p in files]
zipped = process_batch(
opened, mode=mode, target_seconds=target, keep_silence_s=keep,
min_silence_ms=min_sil, silence_thresh_db=thresh,
do_normalize=do_norm, fmt=fmt
)
return preview_path, None, zipped, report
    # wire UI: input list mirrors run()'s signature; outputs map to the
    # preview player, single download, ZIP download, and report textbox.
    go.click(
        run,
        [files, mode, target, keep, min_sil, thresh, do_norm, fmt],
        [preview, direct, zip_out, rep]
    )

if __name__ == "__main__":
    # queue() serializes requests so long FFmpeg jobs don't block the UI.
    demo.queue().launch()