ulduldp committed on
Commit
3241474
·
verified ·
1 Parent(s): 4882d31

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +103 -350
app.py CHANGED
@@ -1,302 +1,89 @@
1
  from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
2
- import os
3
- import uuid
4
- import subprocess
5
  from werkzeug.utils import secure_filename
6
  from faster_whisper import WhisperModel
7
  from PIL import ImageFont
8
 
9
  app = Flask(__name__)
10
-
11
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
12
-
13
  UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")
14
  OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")
15
  SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")
16
-
17
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
18
  os.makedirs(OUTPUT_FOLDER, exist_ok=True)
19
  os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
20
 
21
- # Fast CPU model
22
- model = WhisperModel(
23
- "tiny",
24
- device="cpu",
25
- compute_type="int8"
26
- )
27
-
28
- HTML = """
29
- <!DOCTYPE html>
30
- <html lang="en">
31
- <head>
32
- <meta charset="UTF-8">
33
- <meta name="viewport" content="width=device-width, initial-scale=1.0">
34
- <title>Photo + Audio To Video</title>
35
 
 
 
36
  <style>
37
- *{
38
- margin:0;
39
- padding:0;
40
- box-sizing:border-box;
41
- font-family:Arial;
42
- }
43
-
44
- body{
45
- background:#0f0f0f;
46
- color:white;
47
- min-height:100vh;
48
- display:flex;
49
- justify-content:center;
50
- align-items:center;
51
- padding:20px;
52
- }
53
-
54
- .container{
55
- width:100%;
56
- max-width:500px;
57
- background:#1b1b1b;
58
- border-radius:20px;
59
- padding:25px;
60
- box-shadow:0 0 20px rgba(0,0,0,0.4);
61
- }
62
-
63
- h1{
64
- text-align:center;
65
- margin-bottom:25px;
66
- font-size:28px;
67
- }
68
-
69
- .upload-box{
70
- border:2px dashed #444;
71
- padding:20px;
72
- border-radius:15px;
73
- margin-bottom:20px;
74
- }
75
-
76
- label{
77
- display:block;
78
- margin-bottom:8px;
79
- color:#ccc;
80
- }
81
-
82
- input{
83
- width:100%;
84
- padding:12px;
85
- background:#2a2a2a;
86
- border:none;
87
- border-radius:10px;
88
- color:white;
89
- margin-bottom:15px;
90
- }
91
-
92
- button{
93
- width:100%;
94
- padding:15px;
95
- border:none;
96
- border-radius:12px;
97
- background:#00aaff;
98
- color:white;
99
- font-size:18px;
100
- cursor:pointer;
101
- transition:0.3s;
102
- }
103
-
104
- button:hover{
105
- opacity:0.9;
106
- }
107
-
108
- #loading{
109
- display:none;
110
- text-align:center;
111
- margin-top:20px;
112
- }
113
-
114
- video{
115
- width:100%;
116
- margin-top:20px;
117
- border-radius:15px;
118
- display:none;
119
- aspect-ratio:9/16;
120
- background:#000;
121
- object-fit:cover;
122
- }
123
-
124
- .download-btn{
125
- display:none;
126
- margin-top:15px;
127
- text-align:center;
128
- }
129
-
130
- .download-btn a{
131
- display:inline-block;
132
- background:#22c55e;
133
- color:white;
134
- text-decoration:none;
135
- padding:12px 20px;
136
- border-radius:10px;
137
- }
138
-
139
- .preview{
140
- margin-top:15px;
141
- width:100%;
142
- border-radius:15px;
143
- display:none;
144
- }
145
  </style>
146
- </head>
147
-
148
- <body>
149
  <div class="container">
150
  <h1>Photo + Audio → Video</h1>
151
-
152
  <form id="form">
153
  <div class="upload-box">
154
  <label>Select Photo</label>
155
  <input type="file" id="image" name="image" accept="image/*" required>
156
-
157
  <img id="preview" class="preview">
158
-
159
  <label>Select Audio (mp3/wav)</label>
160
  <input type="file" name="audio" accept="audio/*" required>
161
  </div>
162
-
163
  <button type="submit">Generate Video</button>
164
  </form>
165
-
166
  <div id="loading">Generating Video...</div>
167
-
168
  <video id="video" controls playsinline></video>
169
-
170
  <div class="download-btn" id="downloadDiv">
171
  <a id="downloadBtn" download>Download Video</a>
172
  </div>
173
  </div>
174
-
175
  <script>
176
- const form = document.getElementById("form");
177
- const loading = document.getElementById("loading");
178
- const video = document.getElementById("video");
179
- const downloadBtn = document.getElementById("downloadBtn");
180
- const downloadDiv = document.getElementById("downloadDiv");
181
- const preview = document.getElementById("preview");
182
-
183
- document.getElementById("image").addEventListener("change", function(e){
184
- const file = e.target.files[0];
185
- if(file){
186
- preview.src = URL.createObjectURL(file);
187
- preview.style.display = "block";
188
- }
189
- });
190
-
191
- form.addEventListener("submit", async (e)=>{
192
- e.preventDefault();
193
-
194
- loading.style.display = "block";
195
- video.style.display = "none";
196
- downloadDiv.style.display = "none";
197
-
198
- const formData = new FormData(form);
199
-
200
- try{
201
- const response = await fetch("/generate", {
202
- method:"POST",
203
- body:formData
204
- });
205
-
206
- const data = await response.json();
207
- loading.style.display = "none";
208
-
209
- if(data.video_url){
210
- video.src = data.video_url + "?t=" + new Date().getTime();
211
- video.style.display = "block";
212
-
213
- downloadBtn.href = data.video_url;
214
- downloadDiv.style.display = "block";
215
- }else{
216
- alert(data.error || "Failed");
217
- console.log(data.details || "");
218
- }
219
- }catch(err){
220
- loading.style.display = "none";
221
- alert("Server Error");
222
- console.error(err);
223
- }
224
- });
225
  </script>
226
- </body>
227
- </html>
228
  """
229
 
230
def ass_time(seconds: float) -> str:
    """Format a time offset in seconds as an ASS timestamp (``H:MM:SS.CC``).

    Negative inputs are clamped to zero.

    The value is rounded to whole centiseconds *before* it is split into
    hour/minute/second fields. Formatting the seconds field on its own lets a
    value such as ``59.999`` round up to ``60.00`` and produce the invalid
    timestamp ``0:00:60.00``; rounding first carries into the minutes instead.
    """
    total_cs = max(0, round(seconds * 100))
    total_seconds, cs = divmod(total_cs, 100)
    total_minutes, s = divmod(total_seconds, 60)
    h, m = divmod(total_minutes, 60)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
237
 
238
def ass_escape(text: str) -> str:
    """Escape text for use in an ASS Dialogue line.

    Backslashes are doubled, curly braces are backslash-escaped, and every
    newline is replaced by a single space. All substitutions are applied in
    one pass, so backslashes introduced by the brace escapes are not doubled
    again.
    """
    substitutions = str.maketrans({
        "\\": "\\\\",
        "{": "\\{",
        "}": "\\}",
        "\n": " ",
    })
    return text.translate(substitutions)
244
-
245
def escape_ffmpeg_path(path: str) -> str:
    r"""Escape *path* for embedding as ``ass='<result>'`` in an FFmpeg filtergraph.

    Backslashes are doubled and colons are backslash-escaped so that the
    filter's option parser reads them literally.

    A single quote needs special care: inside an FFmpeg single-quoted string
    nothing can be escaped, so ``\'`` would simply terminate the quoted
    section. Each quote is therefore emitted as "close the quoted section,
    add an escaped backslash and an escaped quote, reopen the quoted
    section". After filtergraph-level unquoting this leaves ``\'``, which the
    option parser then reads as a literal quote.
    """
    return (
        path
        .replace("\\", "\\\\")
        .replace(":", "\\:")
        .replace("'", "'\\\\\\''")
    )
252
-
253
def find_font_path():
    """Return the first existing font file from a fixed list of system paths.

    The candidates are checked in order of preference (bold before regular
    within each family). Returns ``None`` when none of them exists.
    """
    known_fonts = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf",
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Bold.ttf",
        "/usr/share/fonts/truetype/liberation2/LiberationSans-Regular.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSansBold.ttf",
        "/usr/share/fonts/truetype/freefont/FreeSans.ttf",
    )
    return next((font for font in known_fonts if os.path.exists(font)), None)
266
-
267
- FONT_PATH = find_font_path()
268
 
269
def measure_text_width(font, text: str) -> int:
    """Return the horizontal extent of *text* as reported by ``font.getbbox``.

    The width is the difference between the third and first bounding-box
    values (right edge minus left edge).
    """
    left, _, right, *_ = font.getbbox(text)
    return right - left
272
-
273
- def pixel_wrap_text(text: str, font_path: str, font_size: int, max_width_px: int, max_lines: int = 5) -> str:
274
  """
275
- Wrap text based on actual pixel width, not character count.
276
- Also splits long words if they exceed max_width_px.
 
277
  """
278
  text = " ".join(text.strip().split())
279
  if not text:
280
  return ""
281
 
 
282
  if font_path:
283
  font = ImageFont.truetype(font_path, font_size)
284
  else:
285
  font = ImageFont.load_default()
286
 
287
- words = text.split(" ")
288
- lines = []
289
- current = ""
290
-
291
- def split_long_word(word: str):
292
- if measure_text_width(font, word) <= max_width_px:
293
  return [word]
294
-
295
  parts = []
296
  chunk = ""
297
  for ch in word:
298
  trial = chunk + ch
299
- if measure_text_width(font, trial) <= max_width_px:
 
300
  chunk = trial
301
  else:
302
  if chunk:
@@ -306,26 +93,31 @@ def pixel_wrap_text(text: str, font_path: str, font_size: int, max_width_px: int
306
  parts.append(chunk)
307
  return parts
308
 
 
309
  tokens = []
310
- for word in words:
311
- tokens.extend(split_long_word(word))
312
 
 
 
 
313
  for token in tokens:
314
- trial = token if not current else f"{current} {token}"
315
- if measure_text_width(font, trial) <= max_width_px:
316
- current = trial
 
317
  else:
318
  if current:
319
  lines.append(current)
320
  current = token
321
-
322
  if current:
323
  lines.append(current)
324
 
 
325
  if len(lines) > max_lines:
326
- # last line gets the rest so text doesn't disappear
327
- kept = lines[:max_lines - 1]
328
- rest = " ".join(lines[max_lines - 1:])
329
  kept.append(rest)
330
  lines = kept
331
 
@@ -333,56 +125,54 @@ def pixel_wrap_text(text: str, font_path: str, font_size: int, max_width_px: int
333
 
334
def make_ass_subtitles(segments, ass_path):
    """Write *segments* to *ass_path* as an ASS subtitle script.

    Each segment is a mapping with ``start`` and ``end`` (seconds) and
    ``text``. Segments whose text is empty after stripping are skipped. Text
    is wrapped by pixel width with ``pixel_wrap_text`` and rendered as white
    text on an opaque black box (``BorderStyle=3`` in the style line).

    The script uses ``WrapStyle: 2``, so the only line breaks the renderer
    applies are the explicit ``\\N`` breaks written here.
    """
    header = """[Script Info]
ScriptType: v4.00+
PlayResX: 1080
PlayResY: 1920
ScaledBorderAndShadow: yes
WrapStyle: 2

[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding

Style: Default,Arial,38,&H00FFFFFF,&H000000FF,&H00000000,&H00000000,1,0,0,0,100,100,0,0,3,0,0,2,120,120,220,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    # Available width inside the 1080 px frame once margins are accounted for.
    max_width_px = 820
    # Per-line override tags: no border/shadow/blur, white text, black box.
    override_tags = (
        r"{\bord0\shad0\blur0\be0\1c&HFFFFFF&\3c&H000000&\4c&H000000&\3a&H00&\4a&H00}"
    )

    lines = [header]
    for seg in segments:
        text = seg["text"].strip()
        if not text:
            continue

        start = ass_time(seg["start"])
        end = ass_time(seg["end"])

        wrapped = pixel_wrap_text(
            text=text,
            font_path=FONT_PATH,
            font_size=38,
            max_width_px=max_width_px,
            max_lines=5,
        )

        # Escape each wrapped line on its own. ass_escape() turns "\n" into a
        # space, so escaping the joined text first would erase the wrap points
        # before they could be converted to ASS "\N" breaks.
        # NOTE(review): assumes pixel_wrap_text joins its lines with "\n";
        # its return statement is not visible in this view.
        ass_text = r"\N".join(ass_escape(part) for part in wrapped.split("\n"))

        lines.append(
            f"Dialogue: 0,{start},{end},Default,,0,0,0,,{override_tags}{ass_text}\n"
        )

    with open(ass_path, "w", encoding="utf-8") as f:
        f.writelines(lines)
388
 
@@ -392,129 +182,92 @@ def home():
392
 
393
@app.route("/video/<path:filename>")
def serve_video(filename):
    """Serve a rendered video from OUTPUT_FOLDER with caching disabled.

    Aborts with 404 when the joined path does not exist. The response is
    produced by ``send_from_directory`` (inline, conditional requests
    enabled) and tagged ``Cache-Control: no-store``.
    """
    if not os.path.exists(os.path.join(OUTPUT_FOLDER, filename)):
        abort(404)

    video_response = send_from_directory(
        OUTPUT_FOLDER,
        filename,
        as_attachment=False,
        conditional=True,
    )
    video_response.headers["Cache-Control"] = "no-store"
    return video_response
407
 
408
@app.route("/generate", methods=["POST"])
def generate():
    """Turn an uploaded photo and audio clip into a subtitled vertical video.

    Expects multipart form fields ``image`` and ``audio``. The audio is
    transcribed with the module-level Whisper ``model``, the transcript is
    written as an ASS script, and FFmpeg renders a 1080x1920 H.264/AAC MP4
    with the subtitles burned in.

    Returns:
        On success, JSON with ``video_url``, ``transcript``, ``full_text`` and
        ``language``. On failure, JSON with ``error`` (and ``details`` for
        processing errors) plus HTTP 400 for a bad request or 500 for a
        processing failure.
    """
    if "image" not in request.files or "audio" not in request.files:
        return jsonify({"error": "Missing files"}), 400

    image = request.files["image"]
    audio = request.files["audio"]

    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    uid = str(uuid.uuid4())

    # The role prefix keeps the two uploads on distinct paths even when both
    # filenames sanitize to the same (possibly empty) string.
    image_path = os.path.join(
        UPLOAD_FOLDER, f"{uid}_image_{secure_filename(image.filename)}"
    )
    audio_path = os.path.join(
        UPLOAD_FOLDER, f"{uid}_audio_{secure_filename(audio.filename)}"
    )
    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    ass_path = os.path.join(SUBTITLE_FOLDER, f"{uid}.ass")

    try:
        # Saving inside the try block turns a failed write into a JSON error.
        image.save(image_path)
        audio.save(audio_path)

        segments_iter, info = model.transcribe(
            audio_path,
            beam_size=1,
            vad_filter=True,
        )

        transcript = []
        full_text_parts = []
        for segment in segments_iter:
            text = segment.text.strip()
            if not text:
                continue
            transcript.append({
                "start": round(segment.start, 2),
                "end": round(segment.end, 2),
                "text": text,
            })
            full_text_parts.append(text)

        make_ass_subtitles(transcript, ass_path)
        safe_ass_path = escape_ffmpeg_path(os.path.abspath(ass_path))

        # IMPORTANT: crop first, then burn subtitles, so the captions are laid
        # out against the final 1080x1920 frame.
        vf = (
            "scale=1080:1920:force_original_aspect_ratio=increase,"
            "crop=1080:1920,"
            f"ass='{safe_ass_path}'"
        )

        cmd = [
            "ffmpeg",
            "-y",
            "-loop", "1",
            "-framerate", "1",
            "-i", image_path,
            "-i", audio_path,
            "-vf", vf,
            "-map", "0:v:0",
            "-map", "1:a:0",
            "-c:v", "libx264",
            "-preset", "ultrafast",
            "-crf", "20",
            "-pix_fmt", "yuv420p",
            "-r", "24",
            "-c:a", "aac",
            "-b:a", "128k",
            "-movflags", "+faststart",
            "-shortest",
            output_path,
        ]

        subprocess.run(
            cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            check=True,
        )

        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return jsonify({
                "error": "Video file not created",
                "details": "FFmpeg ran but output file is missing or empty.",
            }), 500

        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(full_text_parts).strip(),
            "language": getattr(info, "language", None),
        })

    except subprocess.CalledProcessError as e:
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="ignore"),
        }), 500

    except Exception as e:
        # Request boundary: report any other failure as JSON.
        return jsonify({
            "error": "Processing failed",
            "details": str(e),
        }), 500

    finally:
        # The inputs and the subtitle script are only needed while rendering;
        # remove them so they do not accumulate. The output video is kept
        # because it is served afterwards.
        for temp_path in (image_path, audio_path, ass_path):
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: the file may never have been created.
                pass
518
 
519
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets a browser client execute
    # arbitrary Python, so it must not be on by default for a server bound to
    # every interface. Opt in for local debugging with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1", "true", "yes", "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)
 
1
  from flask import Flask, render_template_string, request, jsonify, send_from_directory, abort
2
+ import os, uuid, subprocess, textwrap
 
 
3
  from werkzeug.utils import secure_filename
4
  from faster_whisper import WhisperModel
5
  from PIL import ImageFont
6
 
7
  app = Flask(__name__)
 
8
  BASE_DIR = os.path.dirname(os.path.abspath(__file__))
 
9
  UPLOAD_FOLDER = os.path.join(BASE_DIR, "uploads")
10
  OUTPUT_FOLDER = os.path.join(BASE_DIR, "static", "videos")
11
  SUBTITLE_FOLDER = os.path.join(BASE_DIR, "subtitles")
 
12
  os.makedirs(UPLOAD_FOLDER, exist_ok=True)
13
  os.makedirs(OUTPUT_FOLDER, exist_ok=True)
14
  os.makedirs(SUBTITLE_FOLDER, exist_ok=True)
15
 
16
+ # Load Whisper model (CPU, fast)
17
+ model = WhisperModel("tiny", device="cpu", compute_type="int8")
 
 
 
 
 
 
 
 
 
 
 
 
18
 
19
+ HTML = """<!DOCTYPE html>
20
+ <html lang="en"><head><meta charset="UTF-8"><meta name="viewport" content="width=device-width, initial-scale=1.0"><title>Photo+Audio→Video</title>
21
  <style>
22
+ /* (CSS same as before) */
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  </style>
24
+ </head><body>
 
 
25
  <div class="container">
26
  <h1>Photo + Audio → Video</h1>
 
27
  <form id="form">
28
  <div class="upload-box">
29
  <label>Select Photo</label>
30
  <input type="file" id="image" name="image" accept="image/*" required>
 
31
  <img id="preview" class="preview">
 
32
  <label>Select Audio (mp3/wav)</label>
33
  <input type="file" name="audio" accept="audio/*" required>
34
  </div>
 
35
  <button type="submit">Generate Video</button>
36
  </form>
 
37
  <div id="loading">Generating Video...</div>
 
38
  <video id="video" controls playsinline></video>
 
39
  <div class="download-btn" id="downloadDiv">
40
  <a id="downloadBtn" download>Download Video</a>
41
  </div>
42
  </div>
 
43
  <script>
44
+ // (JS same as before)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  </script>
46
+ </body></html>
 
47
  """
48
 
49
def ass_time(sec: float) -> str:
    """Convert seconds to an ASS timestamp (``H:MM:SS.CC``).

    Negative inputs are clamped to zero.

    The value is rounded to whole centiseconds first and only then split into
    fields. Formatting the seconds field separately lets an input such as
    ``59.999`` round up to ``60.00`` and yield the invalid ``0:00:60.00``;
    rounding up front carries the overflow into minutes and hours.
    """
    centis = max(0, round(sec * 100))
    whole_seconds, cs = divmod(centis, 100)
    whole_minutes, s = divmod(whole_seconds, 60)
    h, m = divmod(whole_minutes, 60)
    return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
54
 
55
def ass_escape(text: str) -> str:
    r"""Escape backslashes and curly braces for ASS subtitle text.

    ``\`` becomes ``\\``, ``{`` becomes ``\{`` and ``}`` becomes ``\}``.
    Newlines are left untouched. All substitutions happen in one pass, so the
    backslashes added in front of braces are not doubled again.

    The docstring is a raw string because it contains backslash sequences
    that are not valid Python escapes.
    """
    return text.translate(str.maketrans({"\\": "\\\\", "{": "\\{", "}": "\\}"}))
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
58
 
59
+ def wrap_caption_pixel(text: str, font_path: str, font_size: int, max_width_px: int, max_lines: int = 5) -> str:
 
 
 
 
60
  """
61
+ Wrap text to fit within max_width_px pixels using the specified TrueType font.
62
+ Splits long words if needed and limits to max_lines.
63
+ Returns text with '\\n' as line breaks (converted later to '\\N').
64
  """
65
  text = " ".join(text.strip().split())
66
  if not text:
67
  return ""
68
 
69
+ # Load font
70
  if font_path:
71
  font = ImageFont.truetype(font_path, font_size)
72
  else:
73
  font = ImageFont.load_default()
74
 
75
+ # Function to split a single long word
76
+ def split_long(word):
77
+ bbox = font.getbbox(word)
78
+ word_width = bbox[2] - bbox[0]
79
+ if word_width <= max_width_px:
 
80
  return [word]
 
81
  parts = []
82
  chunk = ""
83
  for ch in word:
84
  trial = chunk + ch
85
+ trial_width = font.getbbox(trial)[2] - font.getbbox(trial)[0]
86
+ if trial_width <= max_width_px:
87
  chunk = trial
88
  else:
89
  if chunk:
 
93
  parts.append(chunk)
94
  return parts
95
 
96
+ # Break text into tokens (splitting long words)
97
  tokens = []
98
+ for word in text.split(" "):
99
+ tokens.extend(split_long(word))
100
 
101
+ # Build lines
102
+ lines = []
103
+ current = ""
104
  for token in tokens:
105
+ trial_line = token if not current else f"{current} {token}"
106
+ trial_width = font.getbbox(trial_line)[2] - font.getbbox(trial_line)[0]
107
+ if trial_width <= max_width_px:
108
+ current = trial_line
109
  else:
110
  if current:
111
  lines.append(current)
112
  current = token
 
113
  if current:
114
  lines.append(current)
115
 
116
+ # Limit number of lines
117
  if len(lines) > max_lines:
118
+ # Merge overflow into the last line
119
+ kept = lines[:max_lines-1]
120
+ rest = " ".join(lines[max_lines-1:])
121
  kept.append(rest)
122
  lines = kept
123
 
 
125
 
126
def make_ass_subtitles(segments, ass_path):
    """Write an ASS subtitle file: white text on a solid black background.

    Each segment is a mapping with ``start`` and ``end`` (seconds) and
    ``text``; segments whose stripped text is empty are skipped. Captions are
    wrapped by pixel width with ``wrap_caption_pixel`` (at most 4 lines),
    escaped, and written as ``Dialogue`` events using the single ``Default``
    style declared in the header.
    """
    header = """[Script Info]
ScriptType: v4.00+
PlayResX: 1080
PlayResY: 1920
WrapStyle: 2
[V4+ Styles]
Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
Style: Default,Arial,38,&H00FFFFFF,&H00000000,&H00000000,&H00000000,0,0,0,0,100,100,0,0,3,0,0,2,120,120,220,1

[Events]
Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
"""
    # First available TTF used for measuring text; None falls through to the
    # wrapper's default-font branch.
    font_candidates = (
        "/usr/share/fonts/truetype/dejavu/DejaVuSans.ttf",
        "/usr/share/fonts/truetype/liberation/LiberationSans-Regular.ttf",
    )
    font_path = next((p for p in font_candidates if os.path.exists(p)), None)

    # Frame width minus the 120 px left and right margins of the style.
    max_width_px = 1080 - 240

    # Inline overrides: primary colour white, outline and background black.
    override_tags = (
        r"{\bord0\shad0\blur0\be0\1c&HFFFFFF&\3c&H000000&\4c&H000000&\3a&H00&\4a&H00}"
    )

    events = [header]
    for seg in segments:
        text = seg["text"].strip()
        if text:
            start = ass_time(seg["start"])
            end = ass_time(seg["end"])
            wrapped = wrap_caption_pixel(
                text, font_path, font_size=38, max_width_px=max_width_px, max_lines=4
            )
            # Escape first, then turn wrap points into ASS line breaks.
            ass_text = ass_escape(wrapped).replace("\n", r"\N")
            events.append(
                f"Dialogue: 0,{start},{end},Default,,0,0,0,,"
                + override_tags
                + f"{ass_text}\n"
            )

    with open(ass_path, "w", encoding="utf-8") as f:
        f.write("".join(events))
178
 
 
182
 
183
@app.route("/video/<path:filename>")
def serve_video(filename):
    """Serve a video file from OUTPUT_FOLDER with no caching.

    Responds with 404 when the joined path does not exist; otherwise returns
    the ``send_from_directory`` response (inline, conditional requests
    enabled) with ``Cache-Control: no-store`` set.
    """
    candidate = os.path.join(OUTPUT_FOLDER, filename)
    if os.path.exists(candidate):
        response = send_from_directory(
            OUTPUT_FOLDER, filename, as_attachment=False, conditional=True
        )
        response.headers["Cache-Control"] = "no-store"
        return response
    abort(404)
192
 
193
@app.route("/generate", methods=["POST"])
def generate():
    """Handle upload, transcription, subtitle generation, and video rendering.

    Expects multipart form fields ``image`` and ``audio``. The audio is
    transcribed with the module-level Whisper ``model``, the transcript is
    written as an ASS script, and FFmpeg renders a 1080x1920 H.264/AAC MP4
    with the subtitles burned in.

    Returns:
        On success, JSON with ``video_url``, ``transcript``, ``full_text`` and
        ``language``. On failure, JSON with ``error`` (and ``details`` for
        processing errors) plus HTTP 400 for a bad request or 500 for a
        processing failure.
    """
    if "image" not in request.files or "audio" not in request.files:
        return jsonify({"error": "Missing files"}), 400

    image = request.files["image"]
    audio = request.files["audio"]
    if not image.filename or not audio.filename:
        return jsonify({"error": "Please upload both image and audio"}), 400

    uid = str(uuid.uuid4())
    # The role prefix keeps the two uploads on distinct paths even when both
    # filenames sanitize to the same (possibly empty) string; without it the
    # audio upload would overwrite the image.
    image_path = os.path.join(
        UPLOAD_FOLDER, f"{uid}_image_{secure_filename(image.filename)}"
    )
    audio_path = os.path.join(
        UPLOAD_FOLDER, f"{uid}_audio_{secure_filename(audio.filename)}"
    )
    output_filename = f"{uid}.mp4"
    output_path = os.path.join(OUTPUT_FOLDER, output_filename)
    ass_path = os.path.join(SUBTITLE_FOLDER, f"{uid}.ass")

    try:
        # Saving inside the try block turns a failed write into a JSON error.
        image.save(image_path)
        audio.save(audio_path)

        # Transcribe audio with Whisper (beam size 1, VAD filter on).
        segments_iter, info = model.transcribe(
            audio_path, beam_size=1, vad_filter=True
        )
        transcript = []
        full_text = []
        for seg in segments_iter:
            text = seg.text.strip()
            if not text:
                continue
            transcript.append({
                "start": round(seg.start, 2),
                "end": round(seg.end, 2),
                "text": text,
            })
            full_text.append(text)

        make_ass_subtitles(transcript, ass_path)

        # Escape the path for use inside ass='...'. Nothing can be escaped
        # inside an FFmpeg single-quoted string, so a quote is written as:
        # close the quoted section, escaped backslash + escaped quote, reopen.
        safe_ass_path = (
            ass_path
            .replace("\\", "\\\\")
            .replace(":", "\\:")
            .replace("'", "'\\\\\\''")
        )

        # Scale to cover 1080x1920, crop to exactly 1080x1920, then burn the
        # subtitles onto the final frame.
        vf_filter = (
            "scale=1080:1920:force_original_aspect_ratio=increase,"
            "crop=1080:1920,"
            f"ass='{safe_ass_path}'"
        )
        cmd = [
            "ffmpeg", "-y",
            "-loop", "1", "-framerate", "1", "-i", image_path,
            "-i", audio_path,
            "-vf", vf_filter,
            "-map", "0:v:0", "-map", "1:a:0",  # select streams
            "-c:v", "libx264", "-preset", "ultrafast", "-crf", "20",
            "-pix_fmt", "yuv420p", "-r", "24",
            "-c:a", "aac", "-b:a", "128k",
            "-movflags", "+faststart",  # for streaming
            "-shortest", output_path,
        ]
        result = subprocess.run(
            cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=True
        )

        if not os.path.exists(output_path) or os.path.getsize(output_path) == 0:
            return jsonify({
                "error": "Video file missing",
                # errors="replace": FFmpeg output is not guaranteed to be
                # valid UTF-8, and decoding must not raise here.
                "details": result.stderr.decode("utf-8", errors="replace"),
            }), 500

        return jsonify({
            "video_url": f"/video/{output_filename}",
            "transcript": transcript,
            "full_text": " ".join(full_text).strip(),
            "language": getattr(info, "language", None),
        })

    except subprocess.CalledProcessError as e:
        # FFmpeg exited non-zero. Decode leniently so a stray byte in its
        # output cannot raise from inside this handler.
        return jsonify({
            "error": "FFmpeg failed",
            "details": e.stderr.decode("utf-8", errors="replace"),
        }), 500

    except Exception as e:
        # Request boundary: report any other failure as JSON.
        return jsonify({"error": "Processing failed", "details": str(e)}), 500

    finally:
        # The inputs and the subtitle script are only needed while rendering;
        # remove them so they do not accumulate. The output video is kept
        # because it is served afterwards.
        for temp_path in (image_path, audio_path, ass_path):
            try:
                os.remove(temp_path)
            except OSError:
                # Best-effort cleanup: the file may never have been created.
                pass
 
 
271
 
272
if __name__ == "__main__":
    # The Werkzeug interactive debugger lets a browser client execute
    # arbitrary Python, so it must not be on by default for a server bound to
    # every interface. Opt in for local debugging with FLASK_DEBUG=1.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {
        "1", "true", "yes", "on",
    }
    app.run(host="0.0.0.0", port=7860, debug=debug_enabled)