Spaces:

VeuReu
/

tts

Sleeping

App Files Files Community

VeuReu committed on Dec 22, 2025

Commit

dccfe73

verified ·

1 Parent(s): cf4287a

Delete tts_ad_from_srt.py

Browse files

Files changed (1) hide show

tts_ad_from_srt.py +0 -351

tts_ad_from_srt.py DELETED Viewed

@@ -1,351 +0,0 @@
-import os
-import re
-import math
-import tempfile
-import subprocess
-from dataclasses import dataclass
-from typing import List, Optional, Tuple
-import numpy as np
-import soundfile as sf
-# TTS plugin Matxa (ONNX/OVOS)
-from ovos_tts_plugin_matxa_multispeaker_cat import MatxaCatalanTTSPlugin
-# MP3 (vía ffmpeg) con pydub
-from pydub import AudioSegment
@dataclass
class Segment:
    """One audio-description (AD) cue extracted from an SRT file."""

    # Cue number, taken from the first line of the SRT block.
    idx: int
    # Cue start time, in seconds.
    start_s: float
    # Cue end time, in seconds.
    end_s: float
    # Text to synthesise, with the "(AD)" / "[AD]" marker already stripped.
    text: str
# Matches an SRT cue timing line such as "00:01:02,345 --> 00:01:05,000".
# The named groups h1/m1/s1/ms1 and h2/m2/s2/ms2 hold the start and end fields.
# Besides the canonical comma, a dot is accepted as the millisecond separator
# (common in files converted from WebVTT), and the hour field may have more
# than two digits.
SRT_TS = re.compile(
    r"(?P<h1>\d{2,}):(?P<m1>\d{2}):(?P<s1>\d{2})[,.](?P<ms1>\d{3})\s*-->\s*"
    r"(?P<h2>\d{2,}):(?P<m2>\d{2}):(?P<s2>\d{2})[,.](?P<ms2>\d{3})"
)
-def _ts_to_seconds(h: str, m: str, s: str, ms: str) -> float:
-    return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0
-def _is_empty_ad_text(t: str) -> bool:
-    """
-    Devuelve True si el texto de AD está vacío de contenido (solo espacios/puntuación).
-    """
-    # quita espacios y signos; si no queda nada, es "vacío"
-    cleaned = re.sub(r"[^\wÀ-ÿ]", "", t, flags=re.UNICODE)  # conserva letras y dígitos (incluye acentos)
-    return len(cleaned.strip()) == 0
def parse_srt_ad_only(path: str) -> List[Segment]:
    """Parse an SRT file and return only its audio-description (AD) cues.

    A cue is kept when at least one of its text lines starts with "(AD):",
    "[AD]:", "(AD)" or "[AD]" and still has real content once the marker is
    removed. All AD lines of a cue are joined with a single space.

    Blocks without a numeric index line or a valid timing line are skipped.

    Args:
        path: Path to the SRT file. It is read as UTF-8; a leading byte-order
            mark is tolerated.

    Returns:
        The AD segments sorted by start time, then by cue index.
    """
    # Colon variants come first so that "(AD):" is not consumed as "(AD)".
    ad_prefixes = ("(AD):", "[AD]:", "(AD)", "[AD]")

    # "utf-8-sig" drops a BOM if present; otherwise the first cue index would
    # read "\ufeff1", fail int() and the first cue would be silently lost.
    with open(path, "r", encoding="utf-8-sig") as f:
        content = f.read()
    content = content.replace("\r\n", "\n").replace("\r", "\n")

    segs: List[Segment] = []
    for raw_block in re.split(r"\n\s*\n", content):
        block = raw_block.strip()
        if not block:
            continue
        lines = block.split("\n")
        if len(lines) < 2:
            continue

        # Only the index conversion can fail here, so catch just that.
        try:
            idx = int(lines[0].strip())
        except ValueError:
            continue

        m = SRT_TS.match(lines[1].strip())
        if not m:
            continue
        start_s = _ts_to_seconds(m["h1"], m["m1"], m["s1"], m["ms1"])
        end_s = _ts_to_seconds(m["h2"], m["m2"], m["s2"], m["ms2"])

        ad_texts = []
        for raw_line in lines[2:]:
            line = raw_line.strip()
            prefix = next((p for p in ad_prefixes if line.startswith(p)), None)
            if prefix is None:
                continue  # not an AD line
            body = line[len(prefix):].lstrip()
            if body and not _is_empty_ad_text(body):
                ad_texts.append(body)

        if not ad_texts:
            continue  # AD cue with no usable content

        segs.append(
            Segment(idx=idx, start_s=start_s, end_s=end_s, text=" ".join(ad_texts))
        )

    segs.sort(key=lambda s: (s.start_s, s.idx))
    return segs
def tts_to_wav(
    text: str,
    out_path: str,
    voice: str = "central/grau",
    tts: Optional[MatxaCatalanTTSPlugin] = None
) -> Tuple[int, np.ndarray]:
    """Synthesise *text* into a WAV file and return the audio as a mono array.

    Args:
        text: Text to speak.
        out_path: Path of the WAV file the TTS engine writes.
        voice: Voice name forwarded to the engine.
        tts: Engine to reuse. When None, a new MatxaCatalanTTSPlugin is
            created for this call only.

    Returns:
        ``(sample_rate, samples)`` read back from *out_path* as float32.
        Two-dimensional audio is averaged over axis 1, presumably the channel
        axis (confirm against soundfile's layout).
    """
    # A caller-supplied engine is reused; otherwise build a throwaway one,
    # which is released when this function returns.
    engine = MatxaCatalanTTSPlugin() if tts is None else tts
    engine.get_tts(text, out_path, voice=voice)

    samples, sample_rate = sf.read(out_path, dtype="float32", always_2d=False)
    mono = samples.mean(axis=1) if samples.ndim == 2 else samples
    return sample_rate, mono
def trim_or_pad_to_duration(data: np.ndarray, sr: int, target_sec: float) -> np.ndarray:
    """Force *data* to last exactly *target_sec* seconds at sample rate *sr*.

    Longer audio is cut at the end; shorter audio is padded with trailing
    zeros of the same dtype. A negative target duration is treated as zero.

    Args:
        data: One-dimensional sample array.
        sr: Sample rate of *data*, in Hz.
        target_sec: Desired duration, in seconds.

    Returns:
        An array of ``round(target_sec * sr)`` samples. The input object
        itself is returned when it already has the right length.
    """
    # Clamp at zero: a negative length would make the slice below count from
    # the end and return a wrongly sized clip instead of an empty one.
    target_len = max(0, int(round(target_sec * sr)))
    cur_len = len(data)
    if cur_len > target_len:
        return data[:target_len]
    if cur_len < target_len:
        pad = np.zeros(target_len - cur_len, dtype=data.dtype)
        return np.concatenate([data, pad])
    return data
-def _resample_np(x: np.ndarray, sr_from: int, sr_to: int) -> np.ndarray:
-    if sr_from == sr_to:
-        return x
-    ratio = sr_to / sr_from
-    new_len = int(round(len(x) * ratio))
-    xp = np.linspace(0, 1, num=len(x), endpoint=False)
-    fp = x
-    xq = np.linspace(0, 1, num=new_len, endpoint=False)
-    yq = np.interp(xq, xp, fp).astype(np.float32)
-    return yq
def mix_segments_on_timeline(
    segments: List[Segment],
    voice: str,
    out_final: str,
    target_sr: Optional[int] = None
) -> str:
    """Render every AD segment and place it at its SRT timestamp in one track.

    Each segment is synthesised, trimmed or zero-padded to the cue duration,
    and added into a master buffer that runs up to the latest cue end time.
    The master is scaled down only if its peak would otherwise clip.

    Args:
        segments: AD segments to render; must not be empty.
        voice: Voice name passed to the TTS engine.
        out_final: Output path. A ".mp3" extension (any case) produces MP3
            via pydub; anything else produces a 16-bit PCM WAV, replacing
            the extension with ".wav" if needed.
        target_sr: Sample rate of the master. Defaults to the rate of the
            first synthesised clip.

    Returns:
        The path of the file actually written.

    Raises:
        ValueError: If *segments* is empty.
    """
    if not segments:
        raise ValueError("No hay segmentos (AD) con contenido en el SRT.")
    total_dur = max(s.end_s for s in segments)

    # Built without arguments on purpose: per the original author's note,
    # passing arguments caused a TypeError involving 'lang'.
    tts = MatxaCatalanTTSPlugin()

    clips: List[Tuple[int, np.ndarray, float]] = []
    # The per-segment WAVs are only needed until they are read back, so the
    # scratch directory is removed as soon as synthesis ends, even on error.
    with tempfile.TemporaryDirectory(prefix="matxa_ad_") as tmpdir:
        for seg in segments:
            seg_wav = os.path.join(tmpdir, f"ad_{seg.idx}.wav")
            sr, data = tts_to_wav(seg.text, seg_wav, voice=voice, tts=tts)
            data = trim_or_pad_to_duration(data, sr, seg.end_s - seg.start_s)
            clips.append((sr, data, seg.start_s))

    master_sr = target_sr or clips[0][0]
    master = np.zeros(int(round(total_dur * master_sr)), dtype=np.float32)
    for sr, data, start_s in clips:
        if len(data) == 0:
            continue  # zero-length cue: nothing to place
        d = _resample_np(data, sr, master_sr)
        start_i = int(round(start_s * master_sr))
        room = len(master) - start_i
        if room <= 0:
            continue  # clip would start at or past the end of the master
        d = d[:room]
        master[start_i:start_i + len(d)] += d  # += so overlapping cues sum

    # Overlapping cues can exceed full scale; rescale only in that case.
    peak = np.max(np.abs(master)) if master.size else 0.0
    if peak > 0.999:
        master = (master / peak * 0.98).astype(np.float32)

    base, ext = os.path.splitext(out_final)
    ext = ext.lower()
    if ext == ".mp3":
        # pydub encodes from a file, so go through a temporary WAV and make
        # sure it is deleted even when the export fails.
        tmp_wav = base + ".__tmp_master__.wav"
        try:
            sf.write(tmp_wav, master, master_sr, subtype="PCM_16")
            AudioSegment.from_wav(tmp_wav).export(out_final, format="mp3")
        finally:
            if os.path.exists(tmp_wav):
                os.remove(tmp_wav)
        return out_final

    out_wav = out_final if ext == ".wav" else base + ".wav"
    sf.write(out_wav, master, master_sr, subtype="PCM_16")
    return out_wav
# ---------- (2) extract the MP4 audio and mix it with the AD track (simultaneous) ----------
def ffmpeg_extract_audio_mp4_to_mp3(mp4_path: str, out_mp3_path: str, bitrate: str = "192k") -> str:
    """Extract the audio of an MP4 into an MP3 file using the ffmpeg binary.

    Args:
        mp4_path: Input video file.
        out_mp3_path: Destination MP3 path; overwritten if it exists (-y).
        bitrate: Audio bitrate passed to ffmpeg as ``-b:a``.

    Returns:
        *out_mp3_path*.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    ffmpeg_args = ["ffmpeg", "-y", "-i", mp4_path, "-vn"]  # -vn: drop video
    ffmpeg_args += ["-acodec", "libmp3lame", "-b:a", bitrate]
    ffmpeg_args.append(out_mp3_path)
    # Output is captured so ffmpeg's log does not reach the console.
    subprocess.run(ffmpeg_args, check=True, capture_output=True)
    return out_mp3_path
def mix_two_audios_simultaneous(mp3_a_path: str, mp3_b_path: str, out_mp3_path: str, normalise: bool = True) -> str:
    """Overlay two audio files (e.g. original audio + AD) and export an MP3.

    The shorter input is padded with trailing silence so both last as long
    as the longer one, then the two are overlaid from the start.

    Args:
        mp3_a_path: First input file.
        mp3_b_path: Second input file.
        out_mp3_path: Destination MP3 path.
        normalise: When True, apply one gain so the mix peaks at -1 dBFS.
            This can raise the level of a quiet mix as well as lower a loud
            one. A completely silent mix is left untouched.

    Returns:
        *out_mp3_path*.
    """
    a = AudioSegment.from_file(mp3_a_path)
    b = AudioSegment.from_file(mp3_b_path)

    # Pad the shorter track so the overlay covers the longer one entirely.
    max_len = max(len(a), len(b))
    if len(a) < max_len:
        a = a.append(AudioSegment.silent(duration=max_len - len(a)), crossfade=0)
    if len(b) < max_len:
        b = b.append(AudioSegment.silent(duration=max_len - len(b)), crossfade=0)

    mixed = a.overlay(b)

    if normalise:
        peak = mixed.max_dBFS
        headroom = -1.0  # target peak level, in dBFS
        # A silent mix reports a peak of -inf, which would turn the gain
        # into +inf; there is nothing to normalise in that case.
        if math.isfinite(peak):
            mixed = mixed.apply_gain(headroom - peak)

    mixed.export(out_mp3_path, format="mp3")
    return out_mp3_path
# ---------- (3) build the final MP4: video stream + mixed audio track ----------
def ffmpeg_mux_video_with_audio(video_mp4: str, audio_mp3: str, out_mp4: str) -> str:
    """Combine the video stream of one file with the audio of another.

    The video stream is copied without re-encoding (``-c:v copy``) and the
    output stops at the end of the shorter stream (``-shortest``).

    Args:
        video_mp4: File providing the video stream (its audio is ignored).
        audio_mp3: File providing the audio stream.
        out_mp4: Destination MP4 path; overwritten if it exists (-y).

    Returns:
        *out_mp4*.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
    """
    inputs = ["-i", video_mp4, "-i", audio_mp3]
    # First video stream of input 0, first audio stream of input 1.
    stream_selection = ["-map", "0:v:0", "-map", "1:a:0"]
    encoding = ["-c:v", "copy", "-shortest"]
    ffmpeg_args = ["ffmpeg", "-y", *inputs, *stream_selection, *encoding, out_mp4]
    subprocess.run(ffmpeg_args, check=True, capture_output=True)
    return out_mp4
# --------------------------- convenience pipelines ----------------------------
def build_ad_track_from_srt(srt_path: str, output_path: str = "ad_master.mp3", voice: str = "central/grau") -> str:
    """Build the audio-description track for an SRT file.

    When the SRT has no usable AD cues, a one-second silent track is written
    instead of failing, so downstream mixing steps still have a file.

    The output format follows the same rule in both cases: a ".mp3"
    extension (any case) yields MP3, anything else yields WAV with the
    extension replaced by ".wav" if needed.

    Args:
        srt_path: Path to the SRT file.
        output_path: Requested output path.
        voice: Voice name passed to the TTS engine.

    Returns:
        The path of the file actually written.
    """
    segs = parse_srt_ad_only(srt_path)
    if segs:
        return mix_segments_on_timeline(segs, voice=voice, out_final=output_path)

    print("⚠️  No se encontraron bloques (AD) con contenido en el SRT. Creando pista silenciosa.")
    # Same extension handling as mix_segments_on_timeline, so the fallback
    # returns the same path the normal route would for this output_path.
    base, ext = os.path.splitext(output_path)
    ext = ext.lower()
    if ext == ".mp3":
        out_path, fmt = output_path, "mp3"
    else:
        out_path = output_path if ext == ".wav" else base + ".wav"
        fmt = "wav"
    AudioSegment.silent(duration=1000).export(out_path, format=fmt)
    return out_path
def make_final_assets_from_video_and_srt(
    video_mp4: str,
    srt_path: str,
    out_ad_mp3: str = "ad_master.mp3",
    out_mix_mp3: str = "mix_original_plus_ad.mp3",
    out_final_mp4: str = "video_con_ad.mp4",
    voice: str = "upc_ona-medium"
) -> Tuple[str, str, str]:
    """Run the whole pipeline for one video and its SRT file.

    Steps:
      1. Build the AD track from the SRT.
      2. Extract the video's own audio to "<out_ad_mp3 stem>_original.mp3".
      3. Overlay the original audio and the AD track into one MP3.
      4. Mux the original video stream with the mixed audio into an MP4.

    The extracted original audio is left on disk.

    Returns:
        ``(ad_track_path, mixed_audio_path, final_mp4_path)``.
    """
    # NOTE(review): this default voice differs from the "central/grau" default
    # used by the other entry points in this module - confirm it is intended.
    ad_track = build_ad_track_from_srt(srt_path, output_path=out_ad_mp3, voice=voice)

    ad_stem, _ = os.path.splitext(out_ad_mp3)
    original_audio = ffmpeg_extract_audio_mp4_to_mp3(
        video_mp4, out_mp3_path=ad_stem + "_original.mp3"
    )

    mixed_audio = mix_two_audios_simultaneous(original_audio, ad_track, out_mix_mp3)
    muxed_video = ffmpeg_mux_video_with_audio(video_mp4, mixed_audio, out_final_mp4)
    return ad_track, mixed_audio, muxed_video
# -------------------------------- CLI ---------------------------------------
if __name__ == "__main__":
    import argparse

    # Two modes: AD track only (default), or the full pipeline that also
    # mixes with the video's audio and writes the final MP4 (--do-pipeline).
    parser = argparse.ArgumentParser(
        description="Genera AD desde SRT y compone con video/audio usando Matxa + ffmpeg."
    )
    parser.add_argument("--srt", help="Ruta al archivo .srt")
    parser.add_argument("--video", help="Ruta al archivo .mp4 (para mezclar con AD y remux final)")
    parser.add_argument("-o", "--output", default="ad_master.mp3", help="Salida de la pista AD (mp3 o wav).")
    parser.add_argument("--voice", default="central/grau", help="Voz Matxa (ej: central/grau, upc/pau-medium)")
    parser.add_argument(
        "--do-pipeline",
        action="store_true",
        help="Ejecuta pipeline completo: genera AD, extrae audio del video, mezcla ambos y crea MP4 final.",
    )
    parser.add_argument("--mix-output", default="mix_original_plus_ad.mp3", help="Salida de audio mezclado (original+AD)")
    parser.add_argument("--final-mp4", default="video_con_ad.mp4", help="Salida del MP4 final con AD")
    opts = parser.parse_args()

    if opts.do_pipeline:
        # The full pipeline needs both inputs.
        if not (opts.srt and opts.video):
            raise SystemExit("Para --do-pipeline necesitas --srt y --video.")
        ad_path, mix_path, mp4_path = make_final_assets_from_video_and_srt(
            opts.video,
            opts.srt,
            out_ad_mp3=opts.output,
            out_mix_mp3=opts.mix_output,
            out_final_mp4=opts.final_mp4,
            voice=opts.voice,
        )
        print(f"✔ AD:   {ad_path}")
        print(f"✔ MIX:  {mix_path}")
        print(f"✔ MP4:  {mp4_path}")
    else:
        if not opts.srt:
            raise SystemExit("Especifica --srt o usa --do-pipeline con --video.")
        written = build_ad_track_from_srt(opts.srt, output_path=opts.output, voice=opts.voice)
        print(f"✔ Audio AD escrito en: {written}")