"""AI Voice Studio.

Pause cleanup, approximate loudness normalization, and exact-length timing
(hard trim or pitch-preserving "fit" via FFmpeg atempo) for voice recordings,
wrapped in a compact two-column Gradio UI.  Exports MP3 / WAV / M4A / OGG.
"""

import io
import os
import subprocess
import tempfile
import uuid
import zipfile

import gradio as gr
from pydub import AudioSegment
from pydub.silence import split_on_silence

# ffmpeg (and therefore pydub's export) has no "m4a" muxer — the correct
# format name for .m4a containers is "ipod".  Map UI extensions to muxers.
_FORMAT_MAP = {"m4a": "ipod"}


# ---------- helpers ----------

def _load(file_or_bytes) -> AudioSegment:
    """Decode audio from raw bytes, an open file object, or a filesystem path."""
    if isinstance(file_or_bytes, (bytes, bytearray)):
        return AudioSegment.from_file(io.BytesIO(file_or_bytes))
    # AudioSegment.from_file accepts both file-like objects and path strings.
    return AudioSegment.from_file(file_or_bytes)


def _export(seg: AudioSegment, fmt: str = "mp3") -> io.BytesIO:
    """Encode *seg* into an in-memory buffer, rewound to position 0."""
    buf = io.BytesIO()
    seg.export(buf, format=_FORMAT_MAP.get(fmt, fmt))
    buf.seek(0)
    return buf


def _write_temp(blob: io.BytesIO, fmt: str) -> str:
    """Persist *blob* to a named temp file and return its path.

    Gradio's Audio/File components want a filesystem path, not a BytesIO.
    """
    tf = tempfile.NamedTemporaryFile(delete=False, suffix=f".{fmt}")
    try:
        tf.write(blob.getvalue())
    finally:
        tf.close()
    return tf.name


def remove_silence(seg: AudioSegment, keep_ms=250, min_silence_ms=120,
                   thresh_db=-45) -> AudioSegment:
    """Shorten long pauses to a fixed length.

    keep_ms:         how much silence to keep at each cut (final pause length)
    min_silence_ms:  only treat silence >= this length as a pause
    thresh_db:       what counts as "silence" (dBFS), e.g. -45 for voiceovers
    """
    chunks = split_on_silence(
        seg,
        min_silence_len=int(min_silence_ms),
        silence_thresh=float(thresh_db),
        keep_silence=int(keep_ms),
    )
    # No detected speech chunks (e.g. all silence): return the input unchanged.
    return sum(chunks, AudioSegment.silent(duration=0)) if chunks else seg


def trim_to_seconds(seg: AudioSegment, target_s: float) -> AudioSegment:
    """Hard-cut *seg* to target_s seconds; pad with silence if shorter."""
    t_ms = max(0, int(float(target_s) * 1000))
    if len(seg) >= t_ms:
        return seg[:t_ms]
    return seg + AudioSegment.silent(duration=t_ms - len(seg))


def _atempo_chain(factor: float) -> str:
    """Build an FFmpeg -af chain of atempo filters for *factor*.

    Each atempo stage only accepts [0.5, 2.0]; larger/smaller overall factors
    are split into multiple stages for quality.
    """
    steps = []
    f = max(0.1, min(10.0, float(factor)))
    while f < 0.5:
        steps.append(0.5)
        f /= 0.5
    while f > 2.0:
        steps.append(2.0)
        f /= 2.0
    steps.append(f)
    return ",".join(f"atempo={s:.5f}" for s in steps)


def fit_to_seconds(seg: AudioSegment, target_s: float,
                   fmt_out="mp3") -> io.BytesIO:
    """Pitch-preserving time stretch to exactly target_s seconds (FFmpeg atempo)."""
    with tempfile.TemporaryDirectory() as d:
        inp = os.path.join(d, "in.wav")
        outp = os.path.join(d, f"out.{fmt_out}")
        seg.export(inp, format="wav")
        orig = max(0.01, len(seg) / 1000)  # guard against zero-length input
        factor = float(target_s) / orig
        af = _atempo_chain(factor)
        codec = ["-c:a", "libmp3lame", "-b:a", "128k"] if fmt_out == "mp3" else []
        cmd = ["ffmpeg", "-y", "-i", inp, "-vn", "-af", af, *codec, outp]
        subprocess.run(cmd, check=True)
        with open(outp, "rb") as f:
            return io.BytesIO(f.read())


def normalize_lufs(seg: AudioSegment, target_lufs=-14.0) -> AudioSegment:
    """Approximate loudness normalization to *target_lufs*.

    Uses pydub's RMS-based dBFS as a lightweight stand-in for true LUFS.
    Unlike a hand-rolled 20*log10(rms / 2**15), seg.dBFS is correct for any
    sample width, not just 16-bit audio.
    """
    if seg.dBFS == float("-inf"):  # pure silence — nothing to normalize
        return seg
    return seg.apply_gain(float(target_lufs) - seg.dBFS)


# ---------- processors ----------

def process_single(file, mode, target_seconds, keep_silence_s, min_silence_ms,
                   silence_thresh_db, do_normalize, fmt):
    """Run the full pipeline on one input (bytes, file object, or path).

    Returns (BytesIO of the encoded result, human-readable report string).
    """
    original = _load(file)

    # 1) pause cleanup
    cleaned = remove_silence(
        original,
        keep_ms=int(float(keep_silence_s) * 1000),
        min_silence_ms=int(min_silence_ms),
        thresh_db=float(silence_thresh_db),
    )

    # 2) loudness normalize
    if do_normalize:
        cleaned = normalize_lufs(cleaned, -14.0)

    # 3) timing
    if mode == "trim" and target_seconds:
        out = _export(trim_to_seconds(cleaned, target_seconds), fmt)
    elif mode == "fit" and target_seconds:
        out = fit_to_seconds(cleaned, target_seconds, fmt_out=fmt)
    else:
        out = _export(cleaned, fmt)

    before = len(original) / 1000
    after = len(_load(out.getvalue())) / 1000
    return out, f"Before: {before:.2f}s | After: {after:.2f}s"


def process_batch(files, **kwargs) -> io.BytesIO:
    """Process every input and bundle the results into an in-memory ZIP."""
    zbuf = io.BytesIO()
    with zipfile.ZipFile(zbuf, "w", zipfile.ZIP_DEFLATED) as z:
        for f in files:
            single, _ = process_single(f, **kwargs)
            # For path inputs the name is the path itself; fall back to a
            # generated name for nameless file objects / raw bytes.
            name = f if isinstance(f, str) else getattr(
                f, "name", f"audio_{uuid.uuid4().hex}")
            stem = os.path.splitext(os.path.basename(name))[0]
            z.writestr(f"{stem}_processed.{kwargs['fmt']}", single.getvalue())
    zbuf.seek(0)
    return zbuf


def write_temp_for_preview(blob: io.BytesIO, fmt: str) -> str:
    """Write *blob* to a temp file for the Gradio audio preview widget."""
    return _write_temp(blob, fmt)


# ---------- UI (force two-column, compact) ----------

CSS = """
/* wider canvas */
.gradio-container { max-width: 1200px !important; margin: 0 auto !important; padding: 8px 10px !important; }
/* force two columns with sane minimums */
#twocol { display: grid; grid-template-columns: minmax(320px, 1fr) minmax(320px, 1fr); gap: 12px; align-items: start; }
/* tighten component spacing */
#twocol .block, #twocol .form, #twocol .gap { gap: 8px !important; }
#twocol .gr-button { height: 40px; }
#twocol .gr-number input { height: 36px; }
#twocol .gr-textbox textarea { min-height: 40px; }
/* compact audio bar */
#preview-audio audio { width: 100%; height: 36px; }
/* Only stack on very small screens */
@media (max-width: 600px) { #twocol { grid-template-columns: 1fr; } }
"""

with gr.Blocks(title="AI Voice Studio – Simple", css=CSS) as demo:
    gr.Markdown("### AI Voice Studio — Set pause length; optionally **Trim** or **Fit** to exact time. Export MP3/WAV/M4A/OGG.")
    with gr.Row(elem_id="twocol"):
        # Left column: controls
        with gr.Column():
            files = gr.Files(label="Upload audio", file_types=["audio"], type="filepath")
            mode = gr.Radio(["none", "trim", "fit"], value="none", label="Timing mode")
            target = gr.Number(value=30, label="Target seconds (used for trim/fit)")
            keep = gr.Number(value=0.25, label="Set pause length (seconds)")
            with gr.Accordion("Advanced options", open=False):
                min_sil = gr.Slider(50, 1000, 120, step=10, label="Pause if silence ≥ (ms)")
                thresh = gr.Slider(-80, -10, -45, step=1, label="Silence threshold (dBFS)")
                do_norm = gr.Checkbox(True, label="Normalize loudness (~-14 LUFS)")
                fmt = gr.Dropdown(["mp3", "wav", "m4a", "ogg"], value="mp3", label="Output format")
            go = gr.Button("Process", variant="primary")
        # Right column: outputs
        with gr.Column():
            preview = gr.Audio(label="Preview (first file)", type="filepath",
                               interactive=False, elem_id="preview-audio")
            direct = gr.File(label="Download processed file (single)")
            zip_out = gr.File(label="Download ZIP (if multiple)")
            rep = gr.Textbox(label="Report", lines=1)

    def run(files, mode, target, keep, min_sil, thresh, do_norm, fmt):
        files = files or []
        if not files:
            return None, None, None, "Please upload at least one audio file."

        # Process first file for preview + single download.  Paths are passed
        # straight through — AudioSegment.from_file opens/closes them itself,
        # so no file handles leak here.
        single_blob, report = process_single(
            files[0],
            mode=mode, target_seconds=target, keep_silence_s=keep,
            min_silence_ms=min_sil, silence_thresh_db=thresh,
            do_normalize=do_norm, fmt=fmt,
        )
        preview_path = write_temp_for_preview(single_blob, fmt)
        # gr.File needs a filesystem path, not a BytesIO.
        single_path = _write_temp(single_blob, fmt)

        if len(files) == 1:
            return preview_path, single_path, None, report

        zipped = process_batch(
            files,
            mode=mode, target_seconds=target, keep_silence_s=keep,
            min_silence_ms=min_sil, silence_thresh_db=thresh,
            do_normalize=do_norm, fmt=fmt,
        )
        zip_path = _write_temp(zipped, "zip")
        return preview_path, single_path, zip_path, report

    # wire UI
    go.click(
        run,
        [files, mode, target, keep, min_sil, thresh, do_norm, fmt],
        [preview, direct, zip_out, rep],
    )


if __name__ == "__main__":
    demo.queue().launch()