ulduldp committed on
Commit
62e66d3
·
verified ·
1 Parent(s): 9ca7553

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +173 -110
app.py CHANGED
@@ -2,7 +2,6 @@ from flask import Flask, render_template_string, request, jsonify
2
  import os
3
  import uuid
4
  import subprocess
5
- import textwrap
6
  import tempfile
7
  from werkzeug.utils import secure_filename
8
  from faster_whisper import WhisperModel
@@ -12,11 +11,9 @@ app = Flask(__name__)
12
 
13
  UPLOAD_FOLDER = "uploads"
14
  OUTPUT_FOLDER = "static/videos"
15
- SUBTITLE_FOLDER = "subtitles"
16
 
17
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
18
  os.makedirs(OUTPUT_FOLDER, exist_ok=True)
19
- os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
20
 
21
  # Load Whisper once
22
  model = WhisperModel(
@@ -105,7 +102,7 @@ video{
105
  margin-top:20px;
106
  border-radius:15px;
107
  display:none;
108
- aspect-ratio: 9 / 16;
109
  background:#000;
110
  object-fit:cover;
111
  }
@@ -133,22 +130,30 @@ video{
133
  <body>
134
  <div class="container">
135
  <h1>Photo + Audio → Video</h1>
 
136
  <form id="form">
137
  <div class="upload-box">
138
  <label>Select Photo</label>
139
  <input type="file" id="image" name="image" accept="image/*" required>
 
140
  <img id="preview" class="preview">
 
141
  <label>Select Audio (mp3/wav)</label>
142
  <input type="file" name="audio" accept="audio/*" required>
143
  </div>
 
144
  <button type="submit">Generate Video</button>
145
  </form>
 
146
  <div id="loading">Generating Video...</div>
 
147
  <video id="video" controls></video>
 
148
  <div class="download-btn" id="downloadDiv">
149
  <a id="downloadBtn" download>Download Video</a>
150
  </div>
151
  </div>
 
152
  <script>
153
  const form = document.getElementById("form");
154
  const loading = document.getElementById("loading");
@@ -156,34 +161,45 @@ const video = document.getElementById("video");
156
  const downloadBtn = document.getElementById("downloadBtn");
157
  const downloadDiv = document.getElementById("downloadDiv");
158
  const preview = document.getElementById("preview");
 
159
  document.getElementById("image").addEventListener("change", function(e){
160
  const file = e.target.files[0];
 
161
  if(file){
162
  preview.src = URL.createObjectURL(file);
163
  preview.style.display = "block";
164
  }
165
  });
 
166
  form.addEventListener("submit", async (e)=>{
167
  e.preventDefault();
 
168
  loading.style.display = "block";
169
  video.style.display = "none";
170
  downloadDiv.style.display = "none";
 
171
  const formData = new FormData(form);
 
172
  try{
173
  const response = await fetch("/generate", {
174
  method: "POST",
175
  body: formData
176
  });
 
177
  const data = await response.json();
 
178
  loading.style.display = "none";
 
179
  if(data.video_url){
180
  video.src = data.video_url + "?t=" + new Date().getTime();
181
  video.style.display = "block";
 
182
  downloadBtn.href = data.video_url;
183
  downloadDiv.style.display = "block";
184
  }else{
185
  alert(data.error || "Failed");
186
  }
 
187
  }catch(err){
188
  loading.style.display = "none";
189
  alert("Server Error");
@@ -194,100 +210,97 @@ form.addEventListener("submit", async (e)=>{
194
  </html>
195
  """
196
 
197
# Output canvas size in pixels (portrait: 1080 wide, 1920 tall).
VIDEO_W, VIDEO_H = 1080, 1920

# Caption box geometry and font size used when rendering caption images.
FONT_SIZE = 54
BOX_RADIUS = 28
BOX_PADDING_X = 42
BOX_PADDING_Y = 24
BOX_MAX_MARGIN_X = 80
BOX_MARGIN_BOTTOM = 180
208
 
209
 
210
def find_font(candidates=None) -> str:
    """Return the path of the first candidate font that exists on disk.

    Args:
        candidates: Optional iterable of font file paths to probe, in
            priority order. When omitted, a built-in list of DejaVu Sans
            (Linux), Arial Bold (macOS) and Arial (Windows) locations is
            used.

    Returns:
        The first path for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If none of the candidate paths exists. The
            message lists every path that was probed.
    """
    if candidates is None:
        candidates = (
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "/System/Library/Fonts/Supplemental/Arial Bold.ttf",
            "C:\\Windows\\Fonts\\arialbd.ttf",
            "C:\\Windows\\Fonts\\arial.ttf",
        )

    # Materialise once so generators work and the error can list the paths.
    searched = [str(path) for path in candidates]

    for path in searched:
        if os.path.exists(path):
            return path

    raise FileNotFoundError(
        "No usable font found. Searched: " + ", ".join(searched)
    )
222
 
 
223
 
224
def wrap_text_by_width(text: str, font: "ImageFont.FreeTypeFont", max_width: int) -> list[str]:
    """Greedily wrap *text* into lines that are at most *max_width* wide.

    Words are packed onto a line while the measured width still fits. A
    single word that is wider than *max_width* on its own is broken into
    character chunks that fit, so no returned line overflows (except when
    one character alone is already wider than *max_width*).

    Args:
        text: Text to wrap; any run of whitespace separates words.
        font: Font object providing ``getbbox(str)``; the width of a string
            is taken as ``right - left`` of that bounding box.
        max_width: Maximum allowed line width, in the units ``getbbox``
            reports.

    Returns:
        The wrapped lines in order, or ``[""]`` when *text* has no words.
    """

    def text_width(fragment: str) -> int:
        left, _top, right, _bottom = font.getbbox(fragment)
        return right - left

    def split_long_word(word: str) -> list[str]:
        # Break one overlong word into consecutive chunks that each fit.
        chunks = []
        chunk = ""
        for char in word:
            # `chunk and` guarantees progress even if one glyph is too wide.
            if chunk and text_width(chunk + char) > max_width:
                chunks.append(chunk)
                chunk = char
            else:
                chunk += char
        chunks.append(chunk)
        return chunks

    words = text.split()
    if not words:
        return [""]

    lines = []
    current = ""

    for word in words:
        trial = f"{current} {word}" if current else word
        if text_width(trial) <= max_width:
            current = trial
            continue

        if current:
            lines.append(current)

        if text_width(word) <= max_width:
            current = word
        else:
            # Keep the last chunk open so following words can join it.
            *complete, current = split_long_word(word)
            lines.extend(complete)

    lines.append(current)
    return lines
247
 
 
248
 
249
def draw_rounded_rect(draw: "ImageDraw.ImageDraw", xy, radius, fill):
    """Fill a rounded rectangle on *draw*, with a manual fallback.

    Args:
        draw: Drawing object. Its ``rounded_rectangle`` method is used when
            present; otherwise the shape is assembled from ``rectangle``
            and ``pieslice`` calls.
        xy: ``(x1, y1, x2, y2)`` bounds of the rectangle.
        radius: Corner radius.
        fill: Fill colour forwarded to every drawing call.
    """
    if not hasattr(draw, "rounded_rectangle"):
        left, top, right, bottom = xy
        diameter = 2 * radius

        # Two overlapping rectangles cover everything except the corners.
        draw.rectangle([left + radius, top, right - radius, bottom], fill=fill)
        draw.rectangle([left, top + radius, right, bottom - radius], fill=fill)

        # One quarter-circle per corner: (bounding box, start angle, end angle).
        corners = (
            ([left, top, left + diameter, top + diameter], 180, 270),
            ([right - diameter, top, right, top + diameter], 270, 360),
            ([left, bottom - diameter, left + diameter, bottom], 90, 180),
            ([right - diameter, bottom - diameter, right, bottom], 0, 90),
        )
        for box, start_angle, end_angle in corners:
            draw.pieslice(box, start_angle, end_angle, fill=fill)
        return

    draw.rounded_rectangle(xy, radius=radius, fill=fill)
263
 
264
 
265
- def make_caption_png(text: str, out_path: str):
266
  font = ImageFont.truetype(find_font(), FONT_SIZE)
267
 
268
- max_text_width = VIDEO_W - (2 * BOX_MAX_MARGIN_X) - (2 * BOX_PADDING_X)
269
- lines = wrap_text_by_width(text, font, max_text_width)
 
270
 
271
  measure_img = Image.new("RGBA", (10, 10))
272
  measure_draw = ImageDraw.Draw(measure_img)
273
 
274
- line_metrics = []
 
275
  for line in lines:
276
  bbox = measure_draw.textbbox((0, 0), line, font=font)
 
277
  line_w = bbox[2] - bbox[0]
278
  line_h = bbox[3] - bbox[1]
279
- line_metrics.append((line_w, line_h))
280
 
281
- text_w = max((w for w, h in line_metrics), default=0)
282
- text_h = sum(h for w, h in line_metrics)
283
- line_gap = max(10, FONT_SIZE // 5)
284
- if len(lines) > 1:
285
- text_h += line_gap * (len(lines) - 1)
286
 
287
- box_w = min(VIDEO_W - 2 * BOX_MAX_MARGIN_X, text_w + 2 * BOX_PADDING_X)
288
- box_h = text_h + 2 * BOX_PADDING_Y
289
 
290
- img = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0, 0, 0, 0))
 
 
 
 
 
 
 
291
  draw = ImageDraw.Draw(img)
292
 
293
  x1 = (VIDEO_W - box_w) // 2
@@ -295,44 +308,64 @@ def make_caption_png(text: str, out_path: str):
295
  y1 = y2 - box_h
296
  x2 = x1 + box_w
297
 
298
- # Black rounded background
299
- draw_rounded_rect(draw, (x1, y1, x2, y2), BOX_RADIUS, (0, 0, 0, 215))
 
 
 
 
 
300
 
301
  # Glow layer
302
- glow = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0, 0, 0, 0))
303
  glow_draw = ImageDraw.Draw(glow)
304
 
305
  current_y = y1 + BOX_PADDING_Y
306
- for i, line in enumerate(lines):
307
- bbox = measure_draw.textbbox((0, 0), line, font=font)
308
- line_w = bbox[2] - bbox[0]
309
- line_h = bbox[3] - bbox[1]
310
  tx = (VIDEO_W - line_w) // 2
311
 
312
- # soft glow
313
- for dx, dy in [(-2, 0), (2, 0), (0, -2), (0, 2), (-2, -2), (-2, 2), (2, -2), (2, 2)]:
314
- glow_draw.text((tx + dx, current_y + dy), line, font=font, fill=(255, 255, 255, 90))
 
 
 
 
 
 
 
 
315
 
316
  current_y += line_h + line_gap
317
 
318
  glow = glow.filter(ImageFilter.GaussianBlur(4))
 
319
  img = Image.alpha_composite(img, glow)
320
 
321
- # Crisp white text over glow
 
 
322
  current_y = y1 + BOX_PADDING_Y
323
- for i, line in enumerate(lines):
324
- bbox = measure_draw.textbbox((0, 0), line, font=font)
325
- line_w = bbox[2] - bbox[0]
326
- line_h = bbox[3] - bbox[1]
327
  tx = (VIDEO_W - line_w) // 2
328
- draw.text((tx, current_y), line, font=font, fill=(255, 255, 255, 255))
 
 
 
 
 
 
 
 
 
329
  current_y += line_h + line_gap
330
 
331
  img.save(out_path)
332
 
333
 
334
- def build_filter_complex(num_caption_inputs: int, transcript):
335
- # Base vertical reel pipeline unchanged
336
  base = (
337
  "[0:v]"
338
  "scale=1080:1920:force_original_aspect_ratio=increase,"
@@ -345,20 +378,21 @@ def build_filter_complex(num_caption_inputs: int, transcript):
345
  "[base]"
346
  )
347
 
348
- if num_caption_inputs == 0:
349
- return base, "[base]"
350
-
351
  parts = [base]
 
352
  last = "[base]"
353
 
354
  for idx, seg in enumerate(transcript, start=2):
355
  start = f"{seg['start']:.2f}"
356
  end = f"{seg['end']:.2f}"
357
- out_label = f"[v{idx}]"
 
 
358
  parts.append(
359
- f"{last}[{idx}:v]overlay=0:0:enable='between(t,{start},{end})'{out_label}"
360
  )
361
- last = out_label
 
362
 
363
  return ";".join(parts), last
364
 
@@ -370,6 +404,7 @@ def home():
370
 
371
  @app.route("/generate", methods=["POST"])
372
  def generate():
 
373
  if "image" not in request.files or "audio" not in request.files:
374
  return jsonify({"error": "Missing files"})
375
 
@@ -384,16 +419,28 @@ def generate():
384
  image_name = secure_filename(image.filename)
385
  audio_name = secure_filename(audio.filename)
386
 
387
- image_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{image_name}")
388
- audio_path = os.path.join(UPLOAD_FOLDER, f"{uid}_{audio_name}")
 
 
 
 
 
 
 
 
389
  output_filename = f"{uid}.mp4"
390
- output_path = os.path.join(OUTPUT_FOLDER, output_filename)
 
 
 
 
391
 
392
  image.save(image_path)
393
  audio.save(audio_path)
394
 
395
  try:
396
- # Transcribe audio
397
  segments_iter, info = model.transcribe(
398
  audio_path,
399
  beam_size=5,
@@ -405,6 +452,7 @@ def generate():
405
 
406
  for segment in segments_iter:
407
  text = segment.text.strip()
 
408
  if not text:
409
  continue
410
 
@@ -413,57 +461,69 @@ def generate():
413
  "end": round(segment.end, 2),
414
  "text": text
415
  })
 
416
  full_text_parts.append(text)
417
 
418
  with tempfile.TemporaryDirectory() as tmpdir:
 
419
  caption_paths = []
420
 
421
  for i, seg in enumerate(transcript, start=1):
422
- cap_path = os.path.join(tmpdir, f"caption_{i:04d}.png")
423
- make_caption_png(seg["text"], cap_path)
424
- caption_paths.append(cap_path)
 
 
 
 
 
 
 
 
 
425
 
426
  cmd = [
427
  "ffmpeg",
428
  "-y",
 
429
  "-loop", "1",
430
  "-i", image_path,
431
- "-i", audio_path,
 
432
  ]
433
 
434
  for p in caption_paths:
435
- cmd += ["-loop", "1", "-i", p]
 
 
 
436
 
437
- filter_complex, last_video_label = build_filter_complex(len(caption_paths), transcript)
438
 
439
- if len(caption_paths) > 0:
440
- filter_file = os.path.join(tmpdir, "filter_complex.txt")
441
- with open(filter_file, "w", encoding="utf-8") as f:
442
- f.write(filter_complex)
443
 
444
- cmd += [
445
- "-filter_complex_script", filter_file,
446
- "-map", last_video_label,
447
- "-map", "1:a?",
448
- "-c:v", "libx264",
449
- "-pix_fmt", "yuv420p",
450
- "-c:a", "aac",
451
- "-b:a", "192k",
452
- "-shortest",
453
- output_path
454
- ]
455
- else:
456
- # No transcript found: still create the video without captions
457
- cmd += [
458
- "-vf",
459
- "scale=1080:1920:force_original_aspect_ratio=increase,crop=1080:1920,zoompan=z='min(zoom+0.0008,1.10)':x='iw/2-(iw/zoom/2)':y='ih/2-(ih/zoom/2)':d=999999:s=1080x1920:fps=30",
460
- "-c:v", "libx264",
461
- "-pix_fmt", "yuv420p",
462
- "-c:a", "aac",
463
- "-b:a", "192k",
464
- "-shortest",
465
- output_path
466
- ]
467
 
468
  subprocess.run(
469
  cmd,
@@ -480,11 +540,14 @@ def generate():
480
  })
481
 
482
  except subprocess.CalledProcessError as e:
 
483
  return jsonify({
484
  "error": "FFmpeg failed",
485
  "details": e.stderr.decode("utf-8", errors="ignore")
486
  })
 
487
  except Exception as e:
 
488
  return jsonify({
489
  "error": "Processing failed",
490
  "details": str(e)
 
2
  import os
3
  import uuid
4
  import subprocess
 
5
  import tempfile
6
  from werkzeug.utils import secure_filename
7
  from faster_whisper import WhisperModel
 
11
 
12
  UPLOAD_FOLDER = "uploads"
13
  OUTPUT_FOLDER = "static/videos"
 
14
 
15
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
16
  os.makedirs(OUTPUT_FOLDER, exist_ok=True)
 
17
 
18
  # Load Whisper once
19
  model = WhisperModel(
 
102
  margin-top:20px;
103
  border-radius:15px;
104
  display:none;
105
+ aspect-ratio:9/16;
106
  background:#000;
107
  object-fit:cover;
108
  }
 
130
  <body>
131
  <div class="container">
132
  <h1>Photo + Audio → Video</h1>
133
+
134
  <form id="form">
135
  <div class="upload-box">
136
  <label>Select Photo</label>
137
  <input type="file" id="image" name="image" accept="image/*" required>
138
+
139
  <img id="preview" class="preview">
140
+
141
  <label>Select Audio (mp3/wav)</label>
142
  <input type="file" name="audio" accept="audio/*" required>
143
  </div>
144
+
145
  <button type="submit">Generate Video</button>
146
  </form>
147
+
148
  <div id="loading">Generating Video...</div>
149
+
150
  <video id="video" controls></video>
151
+
152
  <div class="download-btn" id="downloadDiv">
153
  <a id="downloadBtn" download>Download Video</a>
154
  </div>
155
  </div>
156
+
157
  <script>
158
  const form = document.getElementById("form");
159
  const loading = document.getElementById("loading");
 
161
  const downloadBtn = document.getElementById("downloadBtn");
162
  const downloadDiv = document.getElementById("downloadDiv");
163
  const preview = document.getElementById("preview");
164
+
165
  document.getElementById("image").addEventListener("change", function(e){
166
  const file = e.target.files[0];
167
+
168
  if(file){
169
  preview.src = URL.createObjectURL(file);
170
  preview.style.display = "block";
171
  }
172
  });
173
+
174
  form.addEventListener("submit", async (e)=>{
175
  e.preventDefault();
176
+
177
  loading.style.display = "block";
178
  video.style.display = "none";
179
  downloadDiv.style.display = "none";
180
+
181
  const formData = new FormData(form);
182
+
183
  try{
184
  const response = await fetch("/generate", {
185
  method: "POST",
186
  body: formData
187
  });
188
+
189
  const data = await response.json();
190
+
191
  loading.style.display = "none";
192
+
193
  if(data.video_url){
194
  video.src = data.video_url + "?t=" + new Date().getTime();
195
  video.style.display = "block";
196
+
197
  downloadBtn.href = data.video_url;
198
  downloadDiv.style.display = "block";
199
  }else{
200
  alert(data.error || "Failed");
201
  }
202
+
203
  }catch(err){
204
  loading.style.display = "none";
205
  alert("Server Error");
 
210
  </html>
211
  """
212
 
213
# Output frame size in pixels (portrait: 1080 wide, 1920 tall).
VIDEO_W, VIDEO_H = 1080, 1920

# Caption styling: box geometry first, then the font size.
BOX_MARGIN_X = 80
BOX_MARGIN_BOTTOM = 190
BOX_PADDING_X = 45
BOX_PADDING_Y = 28
BOX_RADIUS = 32
FONT_SIZE = 58
224
 
225
 
226
def find_font(candidates=None):
    """Return the path of the first candidate font that exists on disk.

    Args:
        candidates: Optional iterable of font file paths to probe, in
            priority order. When omitted, a built-in list of DejaVu Sans
            (Linux) and Arial (Windows) locations is used.

    Returns:
        The first path for which ``os.path.exists`` is true.

    Raises:
        FileNotFoundError: If none of the candidate paths exists. The
            message lists every path that was probed.
    """
    if candidates is None:
        candidates = (
            "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
            "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
            "C:\\Windows\\Fonts\\arialbd.ttf",
            "C:\\Windows\\Fonts\\arial.ttf",
        )

    # Materialise once so generators work and the error can list the paths.
    searched = [str(path) for path in candidates]

    for path in searched:
        if os.path.exists(path):
            return path

    raise FileNotFoundError(
        "No font found (searched: " + ", ".join(searched) + ")"
    )
239
 
 
 
 
 
240
 
241
def wrap_text(text, font, max_width):
    """Greedily wrap *text* into lines that are at most *max_width* wide.

    Words are packed onto a line while the measured width still fits. A
    single word that is wider than *max_width* on its own is broken into
    character chunks that fit, so no returned line overflows (except when
    one character alone is already wider than *max_width*).

    Args:
        text: Text to wrap; any run of whitespace separates words.
        font: Font object providing ``getbbox(str)``; the width of a string
            is taken as ``right - left`` of that bounding box.
        max_width: Maximum allowed line width, in the units ``getbbox``
            reports.

    Returns:
        The wrapped lines in order, or ``[""]`` when *text* has no words.
    """

    def text_width(fragment):
        left, _top, right, _bottom = font.getbbox(fragment)
        return right - left

    def split_long_word(word):
        # Break one overlong word into consecutive chunks that each fit.
        chunks = []
        chunk = ""
        for char in word:
            # `chunk and` guarantees progress even if one glyph is too wide.
            if chunk and text_width(chunk + char) > max_width:
                chunks.append(chunk)
                chunk = char
            else:
                chunk += char
        chunks.append(chunk)
        return chunks

    words = text.split()
    if not words:
        return [""]

    lines = []
    current = ""

    for word in words:
        test = f"{current} {word}" if current else word
        if text_width(test) <= max_width:
            current = test
            continue

        if current:
            lines.append(current)

        if text_width(word) <= max_width:
            current = word
        else:
            # Keep the last chunk open so following words can join it.
            *complete, current = split_long_word(word)
            lines.extend(complete)

    lines.append(current)
    return lines
268
 
 
 
 
 
 
 
269
 
270
def draw_rounded_rect(draw, xy, radius, fill):
    """Fill a rounded rectangle by delegating to ``draw.rounded_rectangle``.

    Args:
        draw: Drawing object that provides ``rounded_rectangle``.
        xy: Bounds of the rectangle, forwarded unchanged.
        radius: Corner radius, forwarded as the ``radius`` keyword.
        fill: Fill colour, forwarded as the ``fill`` keyword.
    """
    style = {"radius": radius, "fill": fill}
    draw.rounded_rectangle(xy, **style)
 
 
 
 
 
272
 
273
 
274
+ def make_caption_png(text, out_path):
275
  font = ImageFont.truetype(find_font(), FONT_SIZE)
276
 
277
+ max_text_width = VIDEO_W - (2 * BOX_MARGIN_X) - (2 * BOX_PADDING_X)
278
+
279
+ lines = wrap_text(text, font, max_text_width)
280
 
281
  measure_img = Image.new("RGBA", (10, 10))
282
  measure_draw = ImageDraw.Draw(measure_img)
283
 
284
+ line_data = []
285
+
286
  for line in lines:
287
  bbox = measure_draw.textbbox((0, 0), line, font=font)
288
+
289
  line_w = bbox[2] - bbox[0]
290
  line_h = bbox[3] - bbox[1]
 
291
 
292
+ line_data.append((line, line_w, line_h))
 
 
 
 
293
 
294
+ text_w = max([x[1] for x in line_data]) if line_data else 0
 
295
 
296
+ line_gap = 12
297
+
298
+ text_h = sum([x[2] for x in line_data]) + line_gap * (len(line_data)-1)
299
+
300
+ box_w = text_w + (BOX_PADDING_X * 2)
301
+ box_h = text_h + (BOX_PADDING_Y * 2)
302
+
303
+ img = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0,0,0,0))
304
  draw = ImageDraw.Draw(img)
305
 
306
  x1 = (VIDEO_W - box_w) // 2
 
308
  y1 = y2 - box_h
309
  x2 = x1 + box_w
310
 
311
+ # Solid black rounded background
312
+ draw_rounded_rect(
313
+ draw,
314
+ (x1, y1, x2, y2),
315
+ BOX_RADIUS,
316
+ (0,0,0,240)
317
+ )
318
 
319
  # Glow layer
320
+ glow = Image.new("RGBA", (VIDEO_W, VIDEO_H), (0,0,0,0))
321
  glow_draw = ImageDraw.Draw(glow)
322
 
323
  current_y = y1 + BOX_PADDING_Y
324
+
325
+ for line, line_w, line_h in line_data:
 
 
326
  tx = (VIDEO_W - line_w) // 2
327
 
328
+ # glow
329
+ for dx, dy in [
330
+ (-3,0),(3,0),(0,-3),(0,3),
331
+ (-2,-2),(-2,2),(2,-2),(2,2)
332
+ ]:
333
+ glow_draw.text(
334
+ (tx+dx, current_y+dy),
335
+ line,
336
+ font=font,
337
+ fill=(255,255,255,90)
338
+ )
339
 
340
  current_y += line_h + line_gap
341
 
342
  glow = glow.filter(ImageFilter.GaussianBlur(4))
343
+
344
  img = Image.alpha_composite(img, glow)
345
 
346
+ # Main crisp white text
347
+ draw = ImageDraw.Draw(img)
348
+
349
  current_y = y1 + BOX_PADDING_Y
350
+
351
+ for line, line_w, line_h in line_data:
 
 
352
  tx = (VIDEO_W - line_w) // 2
353
+
354
+ draw.text(
355
+ (tx, current_y),
356
+ line,
357
+ font=font,
358
+ fill=(255,255,255,255),
359
+ stroke_width=4,
360
+ stroke_fill=(0,0,0,255)
361
+ )
362
+
363
  current_y += line_h + line_gap
364
 
365
  img.save(out_path)
366
 
367
 
368
+ def build_filter_complex(transcript):
 
369
  base = (
370
  "[0:v]"
371
  "scale=1080:1920:force_original_aspect_ratio=increase,"
 
378
  "[base]"
379
  )
380
 
 
 
 
381
  parts = [base]
382
+
383
  last = "[base]"
384
 
385
  for idx, seg in enumerate(transcript, start=2):
386
  start = f"{seg['start']:.2f}"
387
  end = f"{seg['end']:.2f}"
388
+
389
+ out = f"[v{idx}]"
390
+
391
  parts.append(
392
+ f"{last}[{idx}:v]overlay=0:0:enable='between(t,{start},{end})'{out}"
393
  )
394
+
395
+ last = out
396
 
397
  return ";".join(parts), last
398
 
 
404
 
405
  @app.route("/generate", methods=["POST"])
406
  def generate():
407
+
408
  if "image" not in request.files or "audio" not in request.files:
409
  return jsonify({"error": "Missing files"})
410
 
 
419
  image_name = secure_filename(image.filename)
420
  audio_name = secure_filename(audio.filename)
421
 
422
+ image_path = os.path.join(
423
+ UPLOAD_FOLDER,
424
+ f"{uid}_{image_name}"
425
+ )
426
+
427
+ audio_path = os.path.join(
428
+ UPLOAD_FOLDER,
429
+ f"{uid}_{audio_name}"
430
+ )
431
+
432
  output_filename = f"{uid}.mp4"
433
+
434
+ output_path = os.path.join(
435
+ OUTPUT_FOLDER,
436
+ output_filename
437
+ )
438
 
439
  image.save(image_path)
440
  audio.save(audio_path)
441
 
442
  try:
443
+ # Transcribe
444
  segments_iter, info = model.transcribe(
445
  audio_path,
446
  beam_size=5,
 
452
 
453
  for segment in segments_iter:
454
  text = segment.text.strip()
455
+
456
  if not text:
457
  continue
458
 
 
461
  "end": round(segment.end, 2),
462
  "text": text
463
  })
464
+
465
  full_text_parts.append(text)
466
 
467
  with tempfile.TemporaryDirectory() as tmpdir:
468
+
469
  caption_paths = []
470
 
471
  for i, seg in enumerate(transcript, start=1):
472
+
473
+ caption_path = os.path.join(
474
+ tmpdir,
475
+ f"caption_{i:04d}.png"
476
+ )
477
+
478
+ make_caption_png(
479
+ seg["text"],
480
+ caption_path
481
+ )
482
+
483
+ caption_paths.append(caption_path)
484
 
485
  cmd = [
486
  "ffmpeg",
487
  "-y",
488
+
489
  "-loop", "1",
490
  "-i", image_path,
491
+
492
+ "-i", audio_path
493
  ]
494
 
495
  for p in caption_paths:
496
+ cmd += [
497
+ "-loop", "1",
498
+ "-i", p
499
+ ]
500
 
501
+ filter_complex, last_video = build_filter_complex(transcript)
502
 
503
+ filter_script = os.path.join(
504
+ tmpdir,
505
+ "filter.txt"
506
+ )
507
 
508
+ with open(filter_script, "w", encoding="utf-8") as f:
509
+ f.write(filter_complex)
510
+
511
+ cmd += [
512
+ "-filter_complex_script", filter_script,
513
+
514
+ "-map", last_video,
515
+ "-map", "1:a?",
516
+
517
+ "-c:v", "libx264",
518
+ "-pix_fmt", "yuv420p",
519
+
520
+ "-c:a", "aac",
521
+ "-b:a", "192k",
522
+
523
+ "-shortest",
524
+
525
+ output_path
526
+ ]
 
 
 
 
527
 
528
  subprocess.run(
529
  cmd,
 
540
  })
541
 
542
  except subprocess.CalledProcessError as e:
543
+
544
  return jsonify({
545
  "error": "FFmpeg failed",
546
  "details": e.stderr.decode("utf-8", errors="ignore")
547
  })
548
+
549
  except Exception as e:
550
+
551
  return jsonify({
552
  "error": "Processing failed",
553
  "details": str(e)