# VoiceoverStudio / app.py
# (Hugging Face Space by lifesee — commit 89f7924)
import gradio as gr
import io, os, uuid, zipfile, tempfile, subprocess
from pydub import AudioSegment
from pydub.silence import split_on_silence
# ---------- helpers ----------
def _load(file_or_bytes):
    """Decode audio from raw bytes, an open file-like object, or a path.

    Fix: the original had two identical branches — the ``hasattr(..., "read")``
    case and the fallback both called ``AudioSegment.from_file`` directly —
    collapsed into one, since pydub accepts file-like objects and paths alike.
    """
    if isinstance(file_or_bytes, (bytes, bytearray)):
        # Raw bytes must be wrapped in a buffer before pydub can decode them.
        return AudioSegment.from_file(io.BytesIO(file_or_bytes))
    return AudioSegment.from_file(file_or_bytes)
def _export(seg: AudioSegment, fmt="mp3") -> io.BytesIO:
    """Encode *seg* into an in-memory buffer in the given format and rewind it."""
    encoded = io.BytesIO()
    seg.export(encoded, format=fmt)
    # Rewind so callers can read from position 0 immediately.
    encoded.seek(0)
    return encoded
def remove_silence(seg: AudioSegment, keep_ms=250, min_silence_ms=120, thresh_db=-45):
    """Collapse long pauses in *seg* down to a fixed length.

    keep_ms: how much silence to keep at each cut (your final pause length)
    min_silence_ms: only treat silence >= this length as a pause
    thresh_db: what counts as "silence" (in dBFS), e.g., -45 for voiceovers
    """
    pieces = split_on_silence(
        seg,
        min_silence_len=int(min_silence_ms),
        silence_thresh=float(thresh_db),
        keep_silence=int(keep_ms),
    )
    # No detected speech chunks → return the input untouched.
    if not pieces:
        return seg
    joined = AudioSegment.silent(duration=0)
    for piece in pieces:
        joined += piece
    return joined
def trim_to_seconds(seg: AudioSegment, target_s: float):
    """Cut *seg* to exactly target_s seconds, padding with silence if shorter."""
    target_ms = max(0, int(float(target_s) * 1000))
    shortfall = target_ms - len(seg)
    if shortfall <= 0:
        # Long enough: hard cut at the target.
        return seg[:target_ms]
    # Too short: append silence to reach the target duration.
    return seg + AudioSegment.silent(duration=shortfall)
def _atempo_chain(factor: float) -> str:
# Split large/small adjustments into steps within [0.5, 2.0] for quality
steps = []
f = max(0.1, min(10.0, float(factor)))
while f < 0.5:
steps.append(0.5); f /= 0.5
while f > 2.0:
steps.append(2.0); f /= 2.0
steps.append(f)
return ",".join([f"atempo={s:.5f}" for s in steps])
def fit_to_seconds(seg: AudioSegment, target_s: float, fmt_out="mp3") -> io.BytesIO:
    """Pitch-preserving time stretch via FFmpeg atempo."""
    with tempfile.TemporaryDirectory() as workdir:
        src = os.path.join(workdir, "in.wav")
        dst = os.path.join(workdir, f"out.{fmt_out}")
        # Round-trip through WAV so FFmpeg gets a lossless intermediate.
        seg.export(src, format="wav")
        duration_s = max(0.01, len(seg) / 1000)  # guard against divide-by-zero
        filters = _atempo_chain(float(target_s) / duration_s)
        cmd = ["ffmpeg", "-y", "-i", src, "-vn", "-af", filters]
        if fmt_out == "mp3":
            # Explicit encoder/bitrate for mp3; other formats use FFmpeg defaults.
            cmd += ["-c:a", "libmp3lame", "-b:a", "128k"]
        cmd.append(dst)
        subprocess.run(cmd, check=True)
        with open(dst, "rb") as fh:
            return io.BytesIO(fh.read())
def normalize_lufs(seg: AudioSegment, target_lufs=-14.0):
    """Approximate loudness normalization via an RMS-based gain (not true LUFS).

    Fix: derive full scale from the segment's own sample width via pydub's
    ``max_possible_amplitude`` instead of the hard-coded ``1 << 15``, which
    silently assumed 16-bit audio and miscomputed gain for 8/24/32-bit input.
    """
    import math  # kept function-local, as in the original

    rms = seg.rms or 1  # guard log10(0) for digitally silent audio
    current_db = 20 * math.log10(rms / seg.max_possible_amplitude)
    gain_db = float(target_lufs) - current_db
    return seg.apply_gain(gain_db)
# ---------- processors ----------
def process_single(file, mode, target_seconds, keep_silence_s,
                   min_silence_ms, silence_thresh_db, do_normalize, fmt):
    """Clean pauses, optionally normalize loudness, and trim/fit one clip.

    file: raw bytes, an open binary file-like object, or a filesystem path.
    mode: "trim" (hard cut/pad), "fit" (FFmpeg time-stretch), or anything
          else for no timing change.
    Returns (BytesIO of encoded audio, human-readable before/after report).

    Fix: the original called ``file.read()`` for any non-bytes input, which
    raises AttributeError when a plain path string is passed; paths are now
    forwarded to the loader as-is.
    """
    if isinstance(file, (bytes, bytearray)):
        raw = file
    elif hasattr(file, "read"):
        raw = file.read()
    else:
        raw = file  # filesystem path — _load/pydub opens it directly
    original = _load(raw)
    # 1) pause cleanup
    cleaned = remove_silence(
        original,
        keep_ms=int(float(keep_silence_s) * 1000),
        min_silence_ms=int(min_silence_ms),
        thresh_db=float(silence_thresh_db),
    )
    # 2) loudness normalize
    if do_normalize:
        cleaned = normalize_lufs(cleaned, -14.0)
    # 3) timing
    if mode == "trim" and target_seconds:
        out = _export(trim_to_seconds(cleaned, target_seconds), fmt)
    elif mode == "fit" and target_seconds:
        out = fit_to_seconds(cleaned, target_seconds, fmt_out=fmt)
    else:
        out = _export(cleaned, fmt)
    before = len(original) / 1000
    # Re-decode the encoded output to report its true post-encode duration.
    after = len(_load(out.getvalue())) / 1000
    report = f"Before: {before:.2f}s | After: {after:.2f}s"
    return out, report
def process_batch(files, **kwargs) -> io.BytesIO:
    """Process several uploads and bundle the results into one in-memory ZIP."""
    archive = io.BytesIO()
    with zipfile.ZipFile(archive, "w", zipfile.ZIP_DEFLATED) as zf:
        for upload in files:
            blob, _ = process_single(upload, **kwargs)
            # Prefer the upload's own name; fall back to a random stem.
            base = getattr(upload, "name", f"audio_{uuid.uuid4().hex}")
            stem = os.path.splitext(base)[0]
            zf.writestr(f"{stem}_processed.{kwargs['fmt']}", blob.getvalue())
    archive.seek(0)
    return archive
def write_temp_for_preview(blob: io.BytesIO, fmt: str) -> str:
    """Persist *blob* to a named temp file and return its path.

    Gradio's audio/file widgets prefer a filesystem path, so the buffer is
    written to a non-deleting temp file whose suffix matches the format.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=f".{fmt}") as handle:
        handle.write(blob.getvalue())
        # Context exit flushes and closes before the path is handed out.
        return handle.name
# ---------- UI (force two-column, compact) ----------
# Custom stylesheet injected into gr.Blocks: widens the canvas, forces a
# two-column grid (#twocol), and compacts the controls for a denser layout.
CSS = """
/* wider canvas */
.gradio-container { max-width: 1200px !important; margin: 0 auto !important; padding: 8px 10px !important; }
/* force two columns with sane minimums */
#twocol {
display: grid;
grid-template-columns: minmax(320px, 1fr) minmax(320px, 1fr);
gap: 12px;
align-items: start;
}
/* tighten component spacing */
#twocol .block, #twocol .form, #twocol .gap { gap: 8px !important; }
#twocol .gr-button { height: 40px; }
#twocol .gr-number input { height: 36px; }
#twocol .gr-textbox textarea { min-height: 40px; }
/* compact audio bar */
#preview-audio audio { width: 100%; height: 36px; }
/* Only stack on very small screens */
@media (max-width: 600px) {
#twocol { grid-template-columns: 1fr; }
}
"""
# ---- UI layout: two-column Blocks app (controls left, outputs right) ----
with gr.Blocks(title="AI Voice Studio – Simple", css=CSS) as demo:
    gr.Markdown("### AI Voice Studio — Set pause length; optionally **Trim** or **Fit** to exact time. Export MP3/WAV/M4A/OGG.")
    with gr.Row(elem_id="twocol"):
        # Left column: controls
        with gr.Column():
            # type="filepath" → the callback receives a list of path strings
            files = gr.Files(label="Upload audio", file_types=["audio"], type="filepath")
            mode = gr.Radio(["none", "trim", "fit"], value="none", label="Timing mode")
            target = gr.Number(value=30, label="Target seconds (used for trim/fit)")
            keep = gr.Number(value=0.25, label="Set pause length (seconds)")
            with gr.Accordion("Advanced options", open=False):
                min_sil = gr.Slider(50, 1000, 120, step=10, label="Pause if silence ≥ (ms)")
                thresh = gr.Slider(-80, -10, -45, step=1, label="Silence threshold (dBFS)")
                do_norm = gr.Checkbox(True, label="Normalize loudness (~-14 LUFS)")
                fmt = gr.Dropdown(["mp3","wav","m4a","ogg"], value="mp3", label="Output format")
            go = gr.Button("Process", variant="primary")
        # Right column: outputs
        with gr.Column():
            preview = gr.Audio(label="Preview (first file)", type="filepath", interactive=False, elem_id="preview-audio")
            direct = gr.File(label="Download processed file (single)")
            zip_out = gr.File(label="Download ZIP (if multiple)")
            rep = gr.Textbox(label="Report", lines=1)
def run(files, mode, target, keep, min_sil, thresh, do_norm, fmt):
files = files or []
if not files:
return None, None, None, "Please upload at least one audio file."
# process first file (preview + single download)
single_blob, report = process_single(
open(files[0], "rb"),
mode=mode, target_seconds=target, keep_silence_s=keep,
min_silence_ms=min_sil, silence_thresh_db=thresh,
do_normalize=do_norm, fmt=fmt
)
preview_path = write_temp_for_preview(single_blob, fmt)
if len(files) == 1:
return preview_path, single_blob, None, report
else:
opened = [open(p, "rb") for p in files]
zipped = process_batch(
opened, mode=mode, target_seconds=target, keep_silence_s=keep,
min_silence_ms=min_sil, silence_thresh_db=thresh,
do_normalize=do_norm, fmt=fmt
)
return preview_path, None, zipped, report
    # wire UI: input list mirrors run()'s signature; outputs map to the
    # preview player, single download, ZIP download, and report textbox.
    go.click(
        run,
        [files, mode, target, keep, min_sil, thresh, do_norm, fmt],
        [preview, direct, zip_out, rep]
    )

if __name__ == "__main__":
    # queue() serializes requests so long FFmpeg jobs don't block the UI.
    demo.queue().launch()