VeuReu committed on
Commit
b90479c
·
verified ·
1 Parent(s): 9f91e92

Upload 2 files

Browse files
Files changed (2) hide show
  1. app.py +27 -2
  2. generate_tts.py +293 -0
app.py CHANGED
@@ -17,13 +17,13 @@ from ovos_tts_plugin_matxa_multispeaker_cat import MatxaCatalanTTSPlugin
17
 
18
  # Utilidades SRT/AD (tu script original)
19
  # - parsea SRT y construye pistas AD y mezcla, usando ffmpeg/pydub
20
- from tts_ad_from_srt import (
21
  parse_srt_ad_only,
22
  mix_segments_on_timeline,
23
  build_ad_track_from_srt,
24
  ffmpeg_extract_audio_mp4_to_mp3,
25
  mix_two_audios_simultaneous,
26
- ffmpeg_mux_video_with_audio
27
  )
28
 
29
  APP_STARTED_AT = time.strftime("%Y-%m-%d %H:%M:%S")
@@ -219,6 +219,31 @@ def tts_from_long_text(
219
 
220
  # ---------- SRT → AD (+ mezcla y opcional MP4) ----------
221
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
222
  @app.post("/tts/srt")
223
  def tts_from_srt(
224
  srt: UploadFile = File(..., description="Archivo .srt con líneas (AD): ..."),
 
17
 
18
  # Utilidades SRT/AD (tu script original)
19
  # - parsea SRT y construye pistas AD y mezcla, usando ffmpeg/pydub
20
+ from generate_tts import (
21
  parse_srt_ad_only,
22
  mix_segments_on_timeline,
23
  build_ad_track_from_srt,
24
  ffmpeg_extract_audio_mp4_to_mp3,
25
  mix_two_audios_simultaneous,
26
+ ffmpeg_mux_video_with_audio,
27
  )
28
 
29
  APP_STARTED_AT = time.strftime("%Y-%m-%d %H:%M:%S")
 
219
 
220
  # ---------- SRT → AD (+ mezcla y opcional MP4) ----------
221
 
222
@app.post("/tts/srt_ad_audio")
def tts_ad_audio_from_srt(
    srt: UploadFile = File(..., description="Archivo .srt con líneas (AD): ..."),
    voice: str = Form("central/grau"),
    ad_format: str = Form("mp3"),
):
    """Generate only the audio-description track (une_ad.mp3/une_ad.wav) from an SRT.

    Parameters:
        srt: uploaded .srt file whose AD lines ("(AD): ...") will be voiced.
        voice: Matxa voice identifier passed through to the TTS backend.
        ad_format: "mp3" or "wav" (validated/normalised by _ensure_fmt).

    Returns:
        A Response with the rendered audio and a Content-Disposition header.
    """
    ad_format = _ensure_fmt(ad_format)

    # All intermediate files live in a throwaway directory that is removed
    # automatically when the request finishes.
    with tempfile.TemporaryDirectory(prefix="matxa_srt_ad_only_") as td:
        td = Path(td)
        srt_path = td / "input.srt"
        srt_bytes = srt.file.read()
        srt_path.write_bytes(srt_bytes)

        ad_out = td / f"une_ad.{ad_format}"
        _ = build_ad_track_from_srt(str(srt_path), output_path=str(ad_out), voice=voice)

        data = ad_out.read_bytes()
        content_type = "audio/wav" if ad_format == "wav" else "audio/mpeg"
        filename = f"une_ad.{ad_format}"
        # Fix: the header previously carried a literal placeholder instead of
        # interpolating the computed filename (which was left unused).
        headers = {"Content-Disposition": f'inline; filename="{filename}"'}
        return Response(content=data, media_type=content_type, headers=headers)
245
+
246
+
247
  @app.post("/tts/srt")
248
  def tts_from_srt(
249
  srt: UploadFile = File(..., description="Archivo .srt con líneas (AD): ..."),
generate_tts.py ADDED
@@ -0,0 +1,293 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import re
3
+ import math
4
+ import tempfile
5
+ import subprocess
6
+ from dataclasses import dataclass
7
+ from typing import List, Optional, Tuple
8
+
9
+ import numpy as np
10
+ import soundfile as sf
11
+
12
+ # TTS plugin Matxa (ONNX/OVOS)
13
+ from ovos_tts_plugin_matxa_multispeaker_cat import MatxaCatalanTTSPlugin
14
+
15
+ # MP3 (vía ffmpeg) con pydub
16
+ from pydub import AudioSegment
17
+
18
+
19
@dataclass
class Segment:
    """One audio-description cue parsed from an SRT block."""
    idx: int        # SRT block index (the first line of the block)
    start_s: float  # cue start time, in seconds
    end_s: float    # cue end time, in seconds
    text: str       # AD text, already without the "(AD): " prefix


# Matches an SRT timestamp line: "HH:MM:SS,mmm --> HH:MM:SS,mmm".
# Named groups (h1/m1/s1/ms1 and h2/m2/s2/ms2) feed _ts_to_seconds.
SRT_TS = re.compile(
    r"(?P<h1>\d{2}):(?P<m1>\d{2}):(?P<s1>\d{2}),(?P<ms1>\d{3})\s*-->\s*"
    r"(?P<h2>\d{2}):(?P<m2>\d{2}):(?P<s2>\d{2}),(?P<ms2>\d{3})"
)
31
+
32
+
33
+ def _ts_to_seconds(h: str, m: str, s: str, ms: str) -> float:
34
+ return int(h) * 3600 + int(m) * 60 + int(s) + int(ms) / 1000.0
35
+
36
+
37
+ def _is_empty_ad_text(t: str) -> bool:
38
+ """Devuelve True si el texto de AD está vacío de contenido (solo espacios/puntuación)."""
39
+ cleaned = re.sub(r"[^\wÀ-ÿ]", "", t, flags=re.UNICODE)
40
+ return len(cleaned.strip()) == 0
41
+
42
+
43
def parse_srt_ad_only(path: str) -> List[Segment]:
    """Parse an SRT file and return only segments containing AD lines.

    A line counts as audio description when it starts with "(AD):", "[AD]:",
    "(AD)" or "[AD]"; the prefix is stripped and punctuation-only texts are
    discarded. Blocks with no usable AD line are skipped entirely.

    Parameters:
        path: path to a UTF-8 encoded .srt file.

    Returns:
        Segments sorted by (start time, block index).
    """
    with open(path, "r", encoding="utf-8") as f:
        content = f.read()

    # Normalise line endings so block splitting works for any platform.
    content = content.replace("\r\n", "\n").replace("\r", "\n")
    blocks = [b.strip() for b in re.split(r"\n\s*\n", content) if b.strip()]

    # Longest prefixes first so "(AD):" is stripped before plain "(AD)".
    ad_prefixes = ("(AD):", "[AD]:", "(AD)", "[AD]")

    segs: List[Segment] = []
    for block in blocks:
        lines = block.split("\n")
        if len(lines) < 2:
            continue
        try:
            idx = int(lines[0].strip())
            ts_line = lines[1].strip()
            m = SRT_TS.match(ts_line)
            if not m:
                continue
            start_s = _ts_to_seconds(m["h1"], m["m1"], m["s1"], m["ms1"])
            end_s = _ts_to_seconds(m["h2"], m["m2"], m["s2"], m["ms2"])

            ad_texts = []
            for t in lines[2:]:
                t = t.strip()
                # Fix: the original test was a redundant boolean chain
                # ("(AD)" and not "(AD):" etc.) that reduces to "starts with
                # any AD prefix" — expressed here as a simple prefix table.
                prefix = next((p for p in ad_prefixes if t.startswith(p)), None)
                if prefix is None:
                    continue
                t = t[len(prefix):].lstrip()
                if t and not _is_empty_ad_text(t):
                    ad_texts.append(t)

            if not ad_texts:
                continue

            text = " ".join(ad_texts)
            segs.append(Segment(idx=idx, start_s=start_s, end_s=end_s, text=text))
        except Exception:
            # Malformed blocks are skipped rather than aborting the parse.
            continue

    segs.sort(key=lambda s: (s.start_s, s.idx))
    return segs
94
+
95
+
96
def tts_to_wav(
    text: str,
    out_path: str,
    voice: str = "central/grau",
    tts: Optional[MatxaCatalanTTSPlugin] = None,
) -> Tuple[int, np.ndarray]:
    """Synthesise *text* to a WAV at *out_path*; return (sample_rate, mono data).

    When *tts* is not supplied, a throwaway plugin instance is created for
    this call only and released before returning.
    """
    own_plugin = tts is None
    if own_plugin:
        tts = MatxaCatalanTTSPlugin()

    tts.get_tts(text, out_path, voice=voice)

    audio, sample_rate = sf.read(out_path, dtype="float32", always_2d=False)
    if own_plugin:
        # Drop the private plugin instance so its resources can be reclaimed.
        del tts
    if audio.ndim == 2:
        # Downmix stereo to mono by averaging the channels.
        audio = audio.mean(axis=1)
    return sample_rate, audio
114
+
115
+
116
def trim_or_pad_to_duration(data: np.ndarray, sr: int, target_sec: float) -> np.ndarray:
    """Force *data* to exactly *target_sec* seconds at rate *sr*.

    Longer signals are truncated; shorter ones are zero-padded at the tail.
    """
    want = int(round(target_sec * sr))
    have = len(data)
    if have == want:
        return data
    if have > want:
        return data[:want]
    tail = np.zeros(want - have, dtype=data.dtype)
    return np.concatenate([data, tail])
125
+
126
+
127
+ def _resample_np(x: np.ndarray, sr_from: int, sr_to: int) -> np.ndarray:
128
+ if sr_from == sr_to:
129
+ return x
130
+ ratio = sr_to / sr_from
131
+ new_len = int(round(len(x) * ratio))
132
+ xp = np.linspace(0, 1, num=len(x), endpoint=False)
133
+ fp = x
134
+ xq = np.linspace(0, 1, num=new_len, endpoint=False)
135
+ yq = np.interp(xq, xp, fp).astype(np.float32)
136
+ return yq
137
+
138
+
139
def mix_segments_on_timeline(
    segments: List[Segment],
    voice: str,
    out_final: str,
    target_sr: Optional[int] = None,
) -> str:
    """Render every AD segment with TTS and place it at its SRT timestamp
    on one master track.

    Parameters:
        segments: parsed AD cues (must be non-empty).
        voice: Matxa voice identifier for the TTS plugin.
        out_final: desired output path; ".mp3" triggers an MP3 export,
            anything else is written as WAV (extension forced to ".wav").
        target_sr: master sample rate; defaults to the first clip's rate.

    Returns:
        The path actually written (may differ from out_final for WAV).

    Raises:
        ValueError: when *segments* is empty.
    """
    if not segments:
        raise ValueError("No hay segmentos (AD) con contenido en el SRT.")

    # Master length is determined by the latest cue end time.
    total_dur = max(s.end_s for s in segments)

    # One shared plugin instance for all segments (avoids re-initialising).
    tts = MatxaCatalanTTSPlugin()

    # NOTE(review): this temp dir is never removed — consider cleanup.
    tmpdir = tempfile.mkdtemp(prefix="matxa_ad_")
    tmp_clips: List[Tuple[int, np.ndarray, float, float]] = []

    # Pass 1: synthesise each cue and clamp it to its SRT slot duration.
    for seg in segments:
        seg_wav = os.path.join(tmpdir, f"ad_{seg.idx}.wav")
        sr, data = tts_to_wav(seg.text, seg_wav, voice=voice, tts=tts)
        seg_dur = seg.end_s - seg.start_s
        data = trim_or_pad_to_duration(data, sr, seg_dur)
        tmp_clips.append((sr, data, seg.start_s, seg.end_s))

    master_sr = target_sr or tmp_clips[0][0]
    master_len = int(round(total_dur * master_sr))
    master = np.zeros(master_len, dtype=np.float32)

    # Pass 2: resample each clip to the master rate and add it in place.
    for sr, data, start_s, _ in tmp_clips:
        d = _resample_np(data, sr, master_sr)
        start_i = int(round(start_s * master_sr))
        end_i = start_i + len(d)
        if end_i > len(master):
            # Clip anything that would run past the end of the master.
            end_i = len(master)
            d = d[: end_i - start_i]
        master[start_i:end_i] += d

    # Soft-limit: if overlapping clips pushed the peak near clipping,
    # scale the whole master down to 0.98 full scale.
    peak = np.max(np.abs(master)) if master.size else 0.0
    if peak > 0.999:
        master = (master / peak * 0.98).astype(np.float32)

    base, ext = os.path.splitext(out_final)
    if ext.lower() == ".mp3":
        # MP3 export goes through a temporary PCM WAV + pydub/ffmpeg.
        tmp_wav = base + ".__tmp_master__.wav"
        sf.write(tmp_wav, master, master_sr, subtype="PCM_16")
        au = AudioSegment.from_wav(tmp_wav)
        au.export(out_final, format="mp3")
        os.remove(tmp_wav)
        return out_final
    else:
        # Any non-MP3 request is written as 16-bit PCM WAV.
        out_wav = base + ".wav" if ext.lower() != ".wav" else out_final
        sf.write(out_wav, master, master_sr, subtype="PCM_16")
        return out_wav
192
+
193
+
194
def ffmpeg_extract_audio_mp4_to_mp3(mp4_path: str, out_mp3_path: str, bitrate: str = "192k") -> str:
    """Extract the audio track of an MP4 into an MP3 file (requires ffmpeg).

    Returns *out_mp3_path* for convenience.
    """
    args = [
        "ffmpeg", "-y",
        "-i", mp4_path,
        "-vn",                     # drop the video stream
        "-acodec", "libmp3lame",
        "-b:a", bitrate,
        out_mp3_path,
    ]
    # check=True raises CalledProcessError on a non-zero ffmpeg exit status.
    subprocess.run(args, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return out_mp3_path
210
+
211
+
212
def mix_two_audios_simultaneous(
    mp3_a_path: str,
    mp3_b_path: str,
    out_mp3_path: str,
    normalise: bool = True,
) -> str:
    """Overlay two audio files (e.g. original audio + AD track) into one MP3.

    The shorter input is padded with trailing silence so both tracks span
    the same duration before overlaying.

    Parameters:
        mp3_a_path, mp3_b_path: input audio files (any pydub-readable format).
        out_mp3_path: destination MP3 path.
        normalise: when True, scale the mix so its peak sits at -1 dBFS.

    Returns:
        out_mp3_path.
    """
    a = AudioSegment.from_file(mp3_a_path)
    b = AudioSegment.from_file(mp3_b_path)

    # Pad the shorter track so overlay() covers the full span of both.
    max_len = max(len(a), len(b))
    if len(a) < max_len:
        a = a.append(AudioSegment.silent(duration=max_len - len(a)), crossfade=0)
    if len(b) < max_len:
        b = b.append(AudioSegment.silent(duration=max_len - len(b)), crossfade=0)

    mixed = a.overlay(b)

    if normalise:
        peak = mixed.max_dBFS
        # Fix: max_dBFS is -inf for pure silence; applying an infinite gain
        # would corrupt the export, so only normalise when the peak is finite.
        if math.isfinite(peak):
            headroom = -1.0
            gain = headroom - peak
            mixed = mixed.apply_gain(gain)

    mixed.export(out_mp3_path, format="mp3")
    return out_mp3_path
238
+
239
+
240
def ffmpeg_mux_video_with_audio(video_mp4: str, audio_mp3: str, out_mp4: str) -> str:
    """Create an MP4 with the original's video stream and the given audio track.

    The video stream is copied (no re-encode); output stops at the shorter
    of the two inputs. Requires ffmpeg on PATH. Returns *out_mp4*.
    """
    args = [
        "ffmpeg", "-y",
        "-i", video_mp4,
        "-i", audio_mp3,
        "-map", "0:v:0",   # video from the first input only
        "-map", "1:a:0",   # audio from the second input only
        "-c:v", "copy",    # keep the video stream as-is
        "-shortest",
        out_mp4,
    ]
    subprocess.run(args, check=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    return out_mp4
260
+
261
+
262
def build_ad_track_from_srt(srt_path: str, output_path: str = "ad_master.mp3", voice: str = "central/grau") -> str:
    """Build the audio-description track for *srt_path* and write it to *output_path*.

    When the SRT contains no usable AD lines, a one-second silent file is
    written instead so callers always receive a playable output.

    Returns the path actually written (mix_segments_on_timeline may adjust
    the extension for non-MP3 outputs).
    """
    segs = parse_srt_ad_only(srt_path)
    if not segs:
        # Fix: use the module-level AudioSegment instead of a redundant
        # local re-import of pydub.
        silence = AudioSegment.silent(duration=1000)
        silence.export(output_path, format="mp3" if output_path.endswith(".mp3") else "wav")
        return output_path

    result = mix_segments_on_timeline(segs, voice=voice, out_final=output_path)
    return result
273
+
274
+
275
def make_final_assets_from_video_and_srt(
    video_mp4: str,
    srt_path: str,
    out_ad_mp3: str = "ad_master.mp3",
    out_mix_mp3: str = "mix_original_plus_ad.mp3",
    out_final_mp4: str = "video_con_ad.mp4",
    voice: str = "upc_ona-medium",
) -> Tuple[str, str, str]:
    """Full pipeline: AD track, original-audio extraction, mix, and final MP4.

    Returns (ad_track_path, mixed_audio_path, final_video_path).

    NOTE(review): the default voice here ("upc_ona-medium") differs from the
    "central/grau" default used by the other helpers — confirm intentional.
    """
    # Step 1: synthesise the AD track from the SRT cues.
    ad_track = build_ad_track_from_srt(srt_path, output_path=out_ad_mp3, voice=voice)

    # Step 2: pull the original audio out of the video as MP3.
    original_audio = ffmpeg_extract_audio_mp4_to_mp3(
        video_mp4,
        out_mp3_path=os.path.splitext(out_ad_mp3)[0] + "_original.mp3",
    )

    # Step 3: overlay original audio and AD narration.
    mixed_audio = mix_two_audios_simultaneous(original_audio, ad_track, out_mix_mp3)

    # Step 4: remux the mixed audio back under the original video stream.
    final_video = ffmpeg_mux_video_with_audio(video_mp4, mixed_audio, out_final_mp4)
    return ad_track, mixed_audio, final_video