# AI Voice Studio — Gradio app (Hugging Face Space)
| import gradio as gr | |
| import io, os, uuid, zipfile, tempfile, subprocess | |
| from pydub import AudioSegment | |
| from pydub.silence import split_on_silence | |
| # ---------- helpers ---------- | |
def _load(file_or_bytes):
    """Decode raw bytes, a file-like object, or a path into an AudioSegment."""
    if isinstance(file_or_bytes, (bytes, bytearray)):
        return AudioSegment.from_file(io.BytesIO(file_or_bytes))
    # pydub's from_file accepts file-like objects and path strings alike.
    return AudioSegment.from_file(file_or_bytes)
def _export(seg: AudioSegment, fmt="mp3") -> io.BytesIO:
    """Encode *seg* in format *fmt* into an in-memory buffer, rewound to 0."""
    out = io.BytesIO()
    seg.export(out, format=fmt)
    out.seek(0)  # callers read from the start
    return out
def remove_silence(seg: AudioSegment, keep_ms=250, min_silence_ms=120, thresh_db=-45):
    """Collapse long pauses in *seg* down to a fixed length.

    keep_ms: silence retained at each cut (the final pause length)
    min_silence_ms: only silence at least this long counts as a pause
    thresh_db: dBFS level below which audio is considered silent
      (e.g. -45 works well for voiceovers)
    """
    pieces = split_on_silence(
        seg,
        min_silence_len=int(min_silence_ms),
        silence_thresh=float(thresh_db),
        keep_silence=int(keep_ms),
    )
    if not pieces:
        # Nothing detected (e.g. all silence or all speech) — return unchanged.
        return seg
    joined = AudioSegment.silent(duration=0)
    for piece in pieces:
        joined += piece
    return joined
def trim_to_seconds(seg: AudioSegment, target_s: float):
    """Force *seg* to exactly target_s seconds: cut if longer, pad if shorter."""
    target_ms = max(0, int(float(target_s) * 1000))
    if len(seg) < target_ms:
        # Too short — append silence to reach the target length.
        return seg + AudioSegment.silent(duration=target_ms - len(seg))
    return seg[:target_ms]
| def _atempo_chain(factor: float) -> str: | |
| # Split large/small adjustments into steps within [0.5, 2.0] for quality | |
| steps = [] | |
| f = max(0.1, min(10.0, float(factor))) | |
| while f < 0.5: | |
| steps.append(0.5); f /= 0.5 | |
| while f > 2.0: | |
| steps.append(2.0); f /= 2.0 | |
| steps.append(f) | |
| return ",".join([f"atempo={s:.5f}" for s in steps]) | |
def fit_to_seconds(seg: AudioSegment, target_s: float, fmt_out="mp3") -> io.BytesIO:
    """Stretch/compress *seg* to target_s seconds without changing pitch.

    Round-trips through FFmpeg's atempo filter: export WAV, run ffmpeg,
    read the re-encoded result back into memory.
    """
    with tempfile.TemporaryDirectory() as workdir:
        src = os.path.join(workdir, "in.wav")
        dst = os.path.join(workdir, f"out.{fmt_out}")
        seg.export(src, format="wav")
        duration_s = max(0.01, len(seg) / 1000)  # avoid divide-by-zero on empty input
        filters = _atempo_chain(float(target_s) / duration_s)
        encoder = ["-c:a", "libmp3lame", "-b:a", "128k"] if fmt_out == "mp3" else []
        subprocess.run(
            ["ffmpeg", "-y", "-i", src, "-vn", "-af", filters, *encoder, dst],
            check=True,
        )
        with open(dst, "rb") as fh:
            return io.BytesIO(fh.read())
def normalize_lufs(seg: AudioSegment, target_lufs=-14.0):
    """Apply a constant gain so the RMS level approximates *target_lufs*.

    Lightweight RMS-based stand-in for true LUFS (minimal deps). Uses
    pydub's seg.dBFS, which scales by the segment's actual sample width —
    the previous hand-rolled formula hard-coded 16-bit full scale
    (1 << 15) and computed the wrong gain for 8/24/32-bit audio.
    """
    current_db = seg.dBFS
    # Digital silence has rms == 0 -> dBFS == -inf; no finite gain helps.
    if current_db == float("-inf"):
        return seg
    return seg.apply_gain(float(target_lufs) - current_db)
| # ---------- processors ---------- | |
def process_single(file, mode, target_seconds, keep_silence_s,
                   min_silence_ms, silence_thresh_db, do_normalize, fmt):
    """Run the full pipeline on one upload.

    Steps: pause cleanup -> optional loudness normalize -> timing
    (trim / fit / none) -> export. Returns (BytesIO, report string).
    """
    payload = file if isinstance(file, (bytes, bytearray)) else file.read()
    original = _load(payload)

    # 1) pause cleanup
    cleaned = remove_silence(
        original,
        keep_ms=int(float(keep_silence_s) * 1000),
        min_silence_ms=int(min_silence_ms),
        thresh_db=float(silence_thresh_db),
    )

    # 2) loudness normalize
    if do_normalize:
        cleaned = normalize_lufs(cleaned, -14.0)

    # 3) timing: hard cut, pitch-preserving stretch, or leave as-is
    if mode == "trim" and target_seconds:
        out = _export(trim_to_seconds(cleaned, target_seconds), fmt)
    elif mode == "fit" and target_seconds:
        out = fit_to_seconds(cleaned, target_seconds, fmt_out=fmt)
    else:
        out = _export(cleaned, fmt)

    seconds_before = len(original) / 1000
    # Decode the exported blob so the report reflects the real output length.
    seconds_after = len(_load(out.getvalue())) / 1000
    report = f"Before: {seconds_before:.2f}s | After: {seconds_after:.2f}s"
    return out, report
def process_batch(files, **kwargs) -> io.BytesIO:
    """Process every upload and pack the results into an in-memory ZIP."""
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for upload in files:
            blob, _ = process_single(upload, **kwargs)
            # Open file objects carry .name; fall back to a random stem.
            original_name = getattr(upload, "name", f"audio_{uuid.uuid4().hex}")
            stem, _ext = os.path.splitext(original_name)
            zf.writestr(f"{stem}_processed.{kwargs['fmt']}", blob.getvalue())
    archive.seek(0)
    return archive
def write_temp_for_preview(blob: io.BytesIO, fmt: str) -> str:
    """Persist *blob* to a temp file and return its path.

    Gradio's audio preview widget wants a file path, not an in-memory
    buffer; delete=False keeps the file alive for the widget to serve.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{fmt}") as tmp:
        tmp.write(blob.getvalue())
    return tmp.name
# ---------- UI (force two-column, compact) ----------
# Custom stylesheet passed verbatim to gr.Blocks(css=...): widens the
# canvas, forces a real two-column grid on #twocol, tightens component
# spacing, and stacks to a single column only below 600px wide.
CSS = """
/* wider canvas */
.gradio-container { max-width: 1200px !important; margin: 0 auto !important; padding: 8px 10px !important; }
/* force two columns with sane minimums */
#twocol {
  display: grid;
  grid-template-columns: minmax(320px, 1fr) minmax(320px, 1fr);
  gap: 12px;
  align-items: start;
}
/* tighten component spacing */
#twocol .block, #twocol .form, #twocol .gap { gap: 8px !important; }
#twocol .gr-button { height: 40px; }
#twocol .gr-number input { height: 36px; }
#twocol .gr-textbox textarea { min-height: 40px; }
/* compact audio bar */
#preview-audio audio { width: 100%; height: 36px; }
/* Only stack on very small screens */
@media (max-width: 600px) {
  #twocol { grid-template-columns: 1fr; }
}
"""
with gr.Blocks(title="AI Voice Studio – Simple", css=CSS) as demo:
    gr.Markdown("### AI Voice Studio — Set pause length; optionally **Trim** or **Fit** to exact time. Export MP3/WAV/M4A/OGG.")

    with gr.Row(elem_id="twocol"):
        # Left column: controls
        with gr.Column():
            files = gr.Files(label="Upload audio", file_types=["audio"], type="filepath")
            mode = gr.Radio(["none", "trim", "fit"], value="none", label="Timing mode")
            target = gr.Number(value=30, label="Target seconds (used for trim/fit)")
            keep = gr.Number(value=0.25, label="Set pause length (seconds)")
            with gr.Accordion("Advanced options", open=False):
                min_sil = gr.Slider(50, 1000, 120, step=10, label="Pause if silence ≥ (ms)")
                thresh = gr.Slider(-80, -10, -45, step=1, label="Silence threshold (dBFS)")
                do_norm = gr.Checkbox(True, label="Normalize loudness (~-14 LUFS)")
                fmt = gr.Dropdown(["mp3","wav","m4a","ogg"], value="mp3", label="Output format")
            go = gr.Button("Process", variant="primary")

        # Right column: outputs
        with gr.Column():
            preview = gr.Audio(label="Preview (first file)", type="filepath", interactive=False, elem_id="preview-audio")
            direct = gr.File(label="Download processed file (single)")
            zip_out = gr.File(label="Download ZIP (if multiple)")
            rep = gr.Textbox(label="Report", lines=1)

    def run(files, mode, target, keep, min_sil, thresh, do_norm, fmt):
        """Gradio handler: returns (preview path, single path, zip path, report)."""
        files = files or []
        if not files:
            return None, None, None, "Please upload at least one audio file."

        # Process the first file for the preview + single download.
        # The with-statement closes the handle (previously leaked).
        with open(files[0], "rb") as fh:
            single_blob, report = process_single(
                fh,
                mode=mode, target_seconds=target, keep_silence_s=keep,
                min_silence_ms=min_sil, silence_thresh_db=thresh,
                do_normalize=do_norm, fmt=fmt
            )
        # gr.Audio and gr.File both need paths on disk, not BytesIO
        # objects (returning a BytesIO to gr.File breaks the download) —
        # serve the same temp file to both components.
        preview_path = write_temp_for_preview(single_blob, fmt)
        if len(files) == 1:
            return preview_path, preview_path, None, report

        # Multiple files: also build a ZIP of every processed upload.
        opened = [open(p, "rb") for p in files]
        try:
            zipped = process_batch(
                opened, mode=mode, target_seconds=target, keep_silence_s=keep,
                min_silence_ms=min_sil, silence_thresh_db=thresh,
                do_normalize=do_norm, fmt=fmt
            )
        finally:
            for fh in opened:  # close every handle even if processing fails
                fh.close()
        zip_path = write_temp_for_preview(zipped, "zip")
        return preview_path, None, zip_path, report

    # wire UI
    go.click(
        run,
        [files, mode, target, keep, min_sil, thresh, do_norm, fmt],
        [preview, direct, zip_out, rep]
    )
if __name__ == "__main__":
    # queue() enables request queuing so concurrent users don't collide.
    demo.queue().launch()