Gagandeep12 committed on
Commit
3cbd91e
·
verified ·
1 Parent(s): ad81ed4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -45
app.py CHANGED
@@ -42,9 +42,11 @@ def upload_video():
42
  highlight_color = request.form.get("highlight_color", "#FFFFFF") # default white
43
  language = request.form.get("language", "auto") # chosen language
44
 
45
- # Map Hinglish -> English transcription
46
- if language.lower() == "hinglish":
47
- language = "en"
 
 
48
 
49
  video_id = str(uuid.uuid4())
50
  input_path = os.path.join(UPLOAD_FOLDER, f"{video_id}.mp4")
@@ -89,22 +91,57 @@ def download(filename):
89
 
90
  # ---------------- Helper functions ----------------
91
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
92
  def format_ass_time(seconds):
 
93
  h = int(seconds // 3600)
94
  m = int((seconds % 3600) // 60)
95
  s = int(seconds % 60)
96
- cs = int((seconds - int(seconds)) * 100)
97
  return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
98
 
99
 
100
- def hex_to_ass_color(hex_color):
101
- hex_color = hex_color.lstrip("#")
102
- if len(hex_color) != 6:
103
- return "&H00FFFF00" # fallback yellow
104
- r, g, b = hex_color[0:2], hex_color[2:4], hex_color[4:6]
105
- return f"&H00{b}{g}{r}" # ASS uses BBGGRR
106
-
107
-
108
  def generate_karaoke_ass(
109
  segments,
110
  position,
@@ -128,30 +165,36 @@ PlayResY: 720
128
 
129
  [V4+ Styles]
130
  Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
131
- Style: CustomStyle,Arial,{text_size},{ass_color},&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,2,0,{alignment},0,0,{margin_v},1
132
 
133
 
134
  [Events]
135
  Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
136
  """
137
  dialogues = ""
138
- line_words = []
139
- line_start = None
140
  display_lines = []
141
 
142
  for seg in segments:
143
- for w in seg.get("words", []):
144
- if "start" not in w or "end" not in w:
 
 
 
 
 
 
 
145
  continue
146
 
147
  if line_start is None:
148
  line_start = w["start"]
149
 
150
- duration_cs = int((w["end"] - w["start"]) * 100)
151
- line_words.append(f"{{\\k{duration_cs}}}{w['word']} ")
 
152
 
153
- if len(line_words) >= words_per_line:
154
- text = "".join(line_words)
155
  display_lines.append((line_start, w["end"], text))
156
  line_words = []
157
  line_start = None
@@ -163,9 +206,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
163
  dialogues += f"Dialogue: 0,{format_ass_time(block_start)},{format_ass_time(block_end)},CustomStyle,,0,0,0,,{block_text}\n"
164
  display_lines = []
165
 
166
- if line_words:
167
- text = "".join(line_words)
168
- display_lines.append((line_start, seg["end"], text))
 
 
 
 
 
 
 
 
169
 
170
  if display_lines:
171
  block_start = display_lines[0][0]
@@ -179,18 +230,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
179
  # ---------------- Gentle alignment integration ----------------
180
 
181
  def format_time(seconds):
182
- """Convert seconds to ASS timestamp format (hh:mm:ss.cc)"""
183
  td = timedelta(seconds=seconds)
184
  total = str(td)
185
  if "." in total:
186
  total = total[: total.index(".") + 3] # keep 2 decimal places
187
  if len(total.split(":")[0]) == 1:
188
- total = "0:" + total # force hh:mm:ss
189
  return total
190
 
191
 
192
  def generate_ass(words, words_per_line=5):
193
- """Generate .ass file with karaoke \\k tags from Gentle alignment words"""
194
  header = """[Script Info]
195
  Title: Karaoke Lyrics
196
  ScriptType: v4.00+
@@ -243,15 +292,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
243
 
244
 
245
  def gentle_align(audio_file, transcript_file, output_ass="gentle_output.ass"):
246
- """Call Gentle local server to align lyrics with audio"""
247
  print("Aligning with Gentle...")
248
  with open(transcript_file, "r", encoding="utf-8") as f:
249
  lyrics = f.read()
250
  with open(audio_file, "rb") as audio:
251
- files = {
252
- "audio": audio,
253
- "transcript": (None, lyrics),
254
- }
255
  response = requests.post(
256
  "http://localhost:8765/transcriptions?async=false", files=files
257
  )
@@ -282,19 +327,30 @@ def process_queue():
282
 
283
  start_time = time.time()
284
 
285
- # Transcribe with Whisper
286
- result = model.transcribe(
287
- job["input"],
288
- language=None if job["language"] == "auto" else job["language"],
289
- word_timestamps=True
290
- )
 
 
 
 
291
  detected_lang = result.get("language", "unknown")
292
  print(f"🌐 Detected language for {video_id}: {detected_lang}")
293
- jobs_status[video_id]["model_used"] = model.name if hasattr(model, "name") else "Whisper"
 
 
 
 
 
 
 
 
294
 
295
- # Create .ass file from Whisper timestamps
296
  ass_content = generate_karaoke_ass(
297
- result["segments"],
298
  job["position"],
299
  job["size"],
300
  job["words_per_line"],
@@ -304,12 +360,10 @@ def process_queue():
304
  with open(job["ass"], "w", encoding="utf-8") as f:
305
  f.write(ass_content)
306
 
307
- # Optionally: also run Gentle alignment if transcript exists
308
  transcript_file = os.path.join(UPLOAD_FOLDER, f"{video_id}.txt")
309
  if os.path.exists(transcript_file):
310
  gentle_align(job["input"], transcript_file, output_ass=job["ass"])
311
 
312
- # Burn subtitles into video
313
  ffmpeg.input(job["input"]).output(
314
  job["output"],
315
  vf=f"ass={job['ass'].replace(os.sep, '/')}"
@@ -335,6 +389,5 @@ def process_queue():
335
  threading.Thread(target=process_queue, daemon=True).start()
336
 
337
  if __name__ == '__main__':
338
- port = int(os.environ.get("PORT", 7860)) # Hugging Face / Docker will set PORT
339
  app.run(host="0.0.0.0", port=port, debug=False)
340
-
 
42
  highlight_color = request.form.get("highlight_color", "#FFFFFF") # default white
43
  language = request.form.get("language", "auto") # chosen language
44
 
45
+ # Map Hinglish properly
46
+ if language.lower() in ("hinglish", "hi-roman", "romanized"):
47
+ # Whisper doesn't produce Hinglish romanization directly;
48
+ # best option is Hindi model output (Devanagari) — can transliterate later if needed.
49
+ language = "hi"
50
 
51
  video_id = str(uuid.uuid4())
52
  input_path = os.path.join(UPLOAD_FOLDER, f"{video_id}.mp4")
 
91
 
92
  # ---------------- Helper functions ----------------
93
 
94
def hex_to_ass_color(hex_color):
    """Convert a ``#RRGGBB`` colour string to ASS ``&H00BBGGRR`` notation.

    Args:
        hex_color: Colour such as ``"#FFAA00"``. The leading ``#`` and any
            surrounding whitespace are optional.

    Returns:
        The ASS colour string with a ``00`` alpha byte and the channel pairs
        reordered to blue, green, red. The letter case of the digits is
        preserved. Any input that is not a string of exactly six hexadecimal
        digits yields the fallback ``"&H00FFFF00"``.
    """
    # The fallback value is kept unchanged for compatibility. Read in ASS
    # BBGGRR order it is B=FF, G=FF, R=00 (i.e. cyan rather than yellow).
    fallback = "&H00FFFF00"
    hex_digits = "0123456789abcdefABCDEF"

    if not isinstance(hex_color, str):
        return fallback

    digits = hex_color.strip().lstrip("#")
    # Check every character explicitly: int(x, 16) would also accept signs,
    # underscores and "0x" prefixes, which must not leak into the ASS header.
    if len(digits) != 6 or any(ch not in hex_digits for ch in digits):
        return fallback

    red, green, blue = digits[0:2], digits[2:4], digits[4:6]
    return f"&H00{blue}{green}{red}"
104
+
105
+
106
def escape_ass_text(text: str) -> str:
    """Sanitise a piece of text so it can be placed in an ASS Dialogue line.

    Args:
        text: Raw text (typically one transcribed word). ``None`` is treated
            as empty; other non-string values are converted with ``str()``.

    Returns:
        The text with surrounding whitespace removed, line breaks turned into
        the ASS hard line break ``\\N``, and curly braces removed so the text
        cannot open or close an override-tag block.
    """
    if text is None:
        return ""
    text = str(text)
    # Collapse CRLF first so a Windows line ending becomes exactly one break
    # instead of a stray space followed by a break.
    text = text.replace("\r\n", "\n")
    # A lone carriage return is not a line break here; keep it as a space.
    text = text.replace("\r", " ").strip()
    text = text.replace("\n", "\\N")
    return text.replace("{", "").replace("}", "")
114
+
115
+
116
def create_word_fallback_from_segment(seg):
    """Synthesise evenly timed word entries for a segment lacking word timing.

    The segment text is split on whitespace and the segment's time span is
    divided equally between the resulting words.

    Args:
        seg: Mapping that may contain ``"text"``, ``"start"`` and ``"end"``.
            Missing or ``None`` values are tolerated: text defaults to empty,
            start to ``0.0`` and end to ``start + 0.001``.

    Returns:
        A list of ``{"word", "start", "end"}`` dicts, one per word, in order.
        Empty when the segment has no text.
    """
    tokens = (seg.get("text") or "").split()
    if not tokens:
        return []

    seg_start = seg.get("start")
    if seg_start is None:
        seg_start = 0.0
    seg_end = seg.get("end")
    if seg_end is None:
        seg_end = seg_start + 0.001

    # Clamp to a tiny positive span so zero-length or inverted segments still
    # give every word a non-zero duration.
    per_word = max(seg_end - seg_start, 0.001) / len(tokens)

    # Compute both boundaries from the segment start so that each word's end
    # is exactly the next word's start (no drift from repeated addition).
    return [
        {
            "word": token,
            "start": seg_start + index * per_word,
            "end": seg_start + (index + 1) * per_word,
        }
        for index, token in enumerate(tokens)
    ]
134
+
135
+
136
  def format_ass_time(seconds):
137
+ """ASS time format H:MM:SS.cc"""
138
  h = int(seconds // 3600)
139
  m = int((seconds % 3600) // 60)
140
  s = int(seconds % 60)
141
+ cs = int(round((seconds - int(seconds)) * 100))
142
  return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
143
 
144
 
 
 
 
 
 
 
 
 
145
  def generate_karaoke_ass(
146
  segments,
147
  position,
 
165
 
166
  [V4+ Styles]
167
  Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
168
+ Style: CustomStyle,Arial,{int(text_size)},{ass_color},&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,2,0,{alignment},0,0,{margin_v},1
169
 
170
 
171
  [Events]
172
  Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
173
  """
174
  dialogues = ""
 
 
175
  display_lines = []
176
 
177
  for seg in segments:
178
+ if isinstance(seg.get("words"), list) and seg.get("words"):
179
+ words = seg["words"]
180
+ else:
181
+ words = create_word_fallback_from_segment(seg)
182
+
183
+ line_words = []
184
+ line_start = None
185
+ for w in words:
186
+ if "start" not in w or "end" not in w or not w.get("word"):
187
  continue
188
 
189
  if line_start is None:
190
  line_start = w["start"]
191
 
192
+ duration_cs = int(round((w["end"] - w["start"]) * 100))
193
+ word_text = escape_ass_text(w["word"])
194
+ line_words.append(f"{{\\k{duration_cs}}}{word_text} ")
195
 
196
+ if len(line_words) >= words_per_line or word_text.endswith((".", "!", "?", ",")):
197
+ text = "".join(line_words).strip()
198
  display_lines.append((line_start, w["end"], text))
199
  line_words = []
200
  line_start = None
 
206
  dialogues += f"Dialogue: 0,{format_ass_time(block_start)},{format_ass_time(block_end)},CustomStyle,,0,0,0,,{block_text}\n"
207
  display_lines = []
208
 
209
+ if line_words:
210
+ text = "".join(line_words).strip()
211
+ last_end = words[-1]["end"] if words else seg.get("end", seg.get("start", 0))
212
+ display_lines.append((line_start or seg.get("start", 0), last_end, text))
213
+
214
+ if len(display_lines) >= lines_per_display:
215
+ block_start = display_lines[0][0]
216
+ block_end = display_lines[-1][1]
217
+ block_text = "\\N".join([dl[2] for dl in display_lines])
218
+ dialogues += f"Dialogue: 0,{format_ass_time(block_start)},{format_ass_time(block_end)},CustomStyle,,0,0,0,,{block_text}\n"
219
+ display_lines = []
220
 
221
  if display_lines:
222
  block_start = display_lines[0][0]
 
230
  # ---------------- Gentle alignment integration ----------------
231
 
232
  def format_time(seconds):
 
233
  td = timedelta(seconds=seconds)
234
  total = str(td)
235
  if "." in total:
236
  total = total[: total.index(".") + 3] # keep 2 decimal places
237
  if len(total.split(":")[0]) == 1:
238
+ total = "0:" + total
239
  return total
240
 
241
 
242
  def generate_ass(words, words_per_line=5):
 
243
  header = """[Script Info]
244
  Title: Karaoke Lyrics
245
  ScriptType: v4.00+
 
292
 
293
 
294
  def gentle_align(audio_file, transcript_file, output_ass="gentle_output.ass"):
 
295
  print("Aligning with Gentle...")
296
  with open(transcript_file, "r", encoding="utf-8") as f:
297
  lyrics = f.read()
298
  with open(audio_file, "rb") as audio:
299
+ files = {"audio": audio, "transcript": (None, lyrics)}
 
 
 
300
  response = requests.post(
301
  "http://localhost:8765/transcriptions?async=false", files=files
302
  )
 
327
 
328
  start_time = time.time()
329
 
330
+ whisper_lang = None if job["language"] == "auto" else job["language"]
331
+ try:
332
+ result = model.transcribe(
333
+ job["input"],
334
+ language=whisper_lang,
335
+ word_timestamps=True
336
+ )
337
+ except TypeError:
338
+ result = model.transcribe(job["input"], language=whisper_lang)
339
+
340
  detected_lang = result.get("language", "unknown")
341
  print(f"🌐 Detected language for {video_id}: {detected_lang}")
342
+ jobs_status[video_id]["model_used"] = getattr(model, "name", "Whisper")
343
+
344
+ segments = result.get("segments", [])
345
+ if not segments and "words" in result:
346
+ segments = [{
347
+ "start": 0.0,
348
+ "end": result.get("duration", 0.0),
349
+ "words": result["words"]
350
+ }]
351
 
 
352
  ass_content = generate_karaoke_ass(
353
+ segments,
354
  job["position"],
355
  job["size"],
356
  job["words_per_line"],
 
360
  with open(job["ass"], "w", encoding="utf-8") as f:
361
  f.write(ass_content)
362
 
 
363
  transcript_file = os.path.join(UPLOAD_FOLDER, f"{video_id}.txt")
364
  if os.path.exists(transcript_file):
365
  gentle_align(job["input"], transcript_file, output_ass=job["ass"])
366
 
 
367
  ffmpeg.input(job["input"]).output(
368
  job["output"],
369
  vf=f"ass={job['ass'].replace(os.sep, '/')}"
 
389
  threading.Thread(target=process_queue, daemon=True).start()
390
 
391
  if __name__ == '__main__':
392
+ port = int(os.environ.get("PORT", 7860))
393
  app.run(host="0.0.0.0", port=port, debug=False)