Spaces:

Gagandeep12
/

subtitle-medium

Sleeping

App Files Files Community

Gagandeep12 committed on Sep 27, 2025

Commit

3cbd91e

verified ·

1 Parent(s): ad81ed4

Update app.py

Browse files

Files changed (1) hide show

app.py +98 -45

app.py CHANGED Viewed

@@ -42,9 +42,11 @@ def upload_video():
     highlight_color = request.form.get("highlight_color", "#FFFFFF")  # default white
     language = request.form.get("language", "auto")  # chosen language
-    # Map Hinglish -> English transcription
-    if language.lower() == "hinglish":
-        language = "en"
     video_id = str(uuid.uuid4())
     input_path = os.path.join(UPLOAD_FOLDER, f"{video_id}.mp4")
@@ -89,22 +91,57 @@ def download(filename):
 # ---------------- Helper functions ----------------
 def format_ass_time(seconds):
     h = int(seconds // 3600)
     m = int((seconds % 3600) // 60)
     s = int(seconds % 60)
-    cs = int((seconds - int(seconds)) * 100)
     return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
-def hex_to_ass_color(hex_color):
-    hex_color = hex_color.lstrip("#")
-    if len(hex_color) != 6:
-        return "&H00FFFF00"  # fallback yellow
-    r, g, b = hex_color[0:2], hex_color[2:4], hex_color[4:6]
-    return f"&H00{b}{g}{r}"  # ASS uses BBGGRR
 def generate_karaoke_ass(
     segments,
     position,
@@ -128,30 +165,36 @@ PlayResY: 720
 [V4+ Styles]
 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
-Style: CustomStyle,Arial,{text_size},{ass_color},&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,2,0,{alignment},0,0,{margin_v},1
 [Events]
 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 """
     dialogues = ""
-    line_words = []
-    line_start = None
     display_lines = []
     for seg in segments:
-        for w in seg.get("words", []):
-            if "start" not in w or "end" not in w:
                 continue
             if line_start is None:
                 line_start = w["start"]
-            duration_cs = int((w["end"] - w["start"]) * 100)
-            line_words.append(f"{{\\k{duration_cs}}}{w['word']} ")
-            if len(line_words) >= words_per_line:
-                text = "".join(line_words)
                 display_lines.append((line_start, w["end"], text))
                 line_words = []
                 line_start = None
@@ -163,9 +206,17 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
                     dialogues += f"Dialogue: 0,{format_ass_time(block_start)},{format_ass_time(block_end)},CustomStyle,,0,0,0,,{block_text}\n"
                     display_lines = []
-    if line_words:
-        text = "".join(line_words)
-        display_lines.append((line_start, seg["end"], text))
     if display_lines:
         block_start = display_lines[0][0]
@@ -179,18 +230,16 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 # ---------------- Gentle alignment integration ----------------
 def format_time(seconds):
-    """Convert seconds to ASS timestamp format (hh:mm:ss.cc)"""
     td = timedelta(seconds=seconds)
     total = str(td)
     if "." in total:
         total = total[: total.index(".") + 3]  # keep 2 decimal places
     if len(total.split(":")[0]) == 1:
-        total = "0:" + total  # force hh:mm:ss
     return total
 def generate_ass(words, words_per_line=5):
-    """Generate .ass file with karaoke \\k tags from Gentle alignment words"""
     header = """[Script Info]
 Title: Karaoke Lyrics
 ScriptType: v4.00+
@@ -243,15 +292,11 @@ Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 def gentle_align(audio_file, transcript_file, output_ass="gentle_output.ass"):
-    """Call Gentle local server to align lyrics with audio"""
     print("Aligning with Gentle...")
     with open(transcript_file, "r", encoding="utf-8") as f:
         lyrics = f.read()
     with open(audio_file, "rb") as audio:
-        files = {
-            "audio": audio,
-            "transcript": (None, lyrics),
-        }
         response = requests.post(
             "http://localhost:8765/transcriptions?async=false", files=files
         )
@@ -282,19 +327,30 @@ def process_queue():
             start_time = time.time()
-            # Transcribe with Whisper
-            result = model.transcribe(
-                job["input"],
-                language=None if job["language"] == "auto" else job["language"],
-                word_timestamps=True
-            )
             detected_lang = result.get("language", "unknown")
             print(f"🌐 Detected language for {video_id}: {detected_lang}")
-            jobs_status[video_id]["model_used"] = model.name if hasattr(model, "name") else "Whisper"
-            # Create .ass file from Whisper timestamps
             ass_content = generate_karaoke_ass(
-                result["segments"],
                 job["position"],
                 job["size"],
                 job["words_per_line"],
@@ -304,12 +360,10 @@ def process_queue():
             with open(job["ass"], "w", encoding="utf-8") as f:
                 f.write(ass_content)
-            # Optionally: also run Gentle alignment if transcript exists
             transcript_file = os.path.join(UPLOAD_FOLDER, f"{video_id}.txt")
             if os.path.exists(transcript_file):
                 gentle_align(job["input"], transcript_file, output_ass=job["ass"])
-            # Burn subtitles into video
             ffmpeg.input(job["input"]).output(
                 job["output"],
                 vf=f"ass={job['ass'].replace(os.sep, '/')}"
@@ -335,6 +389,5 @@ def process_queue():
 threading.Thread(target=process_queue, daemon=True).start()
 if __name__ == '__main__':
-    port = int(os.environ.get("PORT", 7860))  # Hugging Face / Docker will set PORT
     app.run(host="0.0.0.0", port=port, debug=False)

     highlight_color = request.form.get("highlight_color", "#FFFFFF")  # default white
     language = request.form.get("language", "auto")  # chosen language
+    # Map Hinglish properly
+    if language.lower() in ("hinglish", "hi-roman", "romanized"):
+        # Whisper doesn't produce Hinglish romanization directly;
+        # best option is Hindi model output (Devanagari) — can transliterate later if needed.
+        language = "hi"
     video_id = str(uuid.uuid4())
     input_path = os.path.join(UPLOAD_FOLDER, f"{video_id}.mp4")
 # ---------------- Helper functions ----------------
def hex_to_ass_color(hex_color):
    """Convert a ``#RRGGBB`` colour string to the ASS ``&H00BBGGRR`` form.

    The value is validated before use because it is interpolated into the
    comma-separated ASS ``Style:`` line; a stray comma or non-hex character
    would corrupt that line.

    Args:
        hex_color: Colour such as ``"#FF8000"`` or ``"FF8000"``. Surrounding
            whitespace and leading ``#`` characters are ignored.

    Returns:
        The colour with its channels reordered to blue-green-red and a zero
        alpha byte, or the fallback colour when the input is not a string
        of exactly six hexadecimal digits.
    """
    # NOTE(review): in BBGGRR order this fallback is cyan, not the "yellow"
    # the original comment claimed; the value is kept unchanged so existing
    # output does not shift.
    fallback = "&H00FFFF00"
    hex_digits = "0123456789abcdefABCDEF"

    if not isinstance(hex_color, str):
        return fallback

    digits = hex_color.strip().lstrip("#")
    if len(digits) != 6 or any(ch not in hex_digits for ch in digits):
        return fallback

    red, green, blue = digits[0:2], digits[2:4], digits[4:6]
    return f"&H00{blue}{green}{red}"
def escape_ass_text(text: str) -> str:
    """Sanitise a piece of text so it can be placed in an ASS dialogue line.

    ``None`` becomes an empty string. Carriage returns are turned into
    spaces and the result is trimmed; remaining newlines become the ASS
    line-break marker ``\\N``; curly braces are removed so the text cannot
    open or close an override block.
    """
    if text is None:
        return ""

    # Trim before converting newlines so leading/trailing ones are dropped
    # rather than turned into line-break markers.
    trimmed = text.replace("\r", " ").strip()
    with_breaks = trimmed.replace("\n", "\\N")

    drop_braces = {ord("{"): None, ord("}"): None}
    return with_breaks.translate(drop_braces)
def create_word_fallback_from_segment(seg):
    """Build evenly timed pseudo-words for a segment lacking word timestamps.

    The segment's ``"text"`` is split on whitespace and the span between
    its ``"start"`` (default ``0.0``) and ``"end"`` (default one millisecond
    after the start) is divided equally among the words. The span is never
    treated as shorter than one millisecond.

    Returns:
        A list of ``{"word", "start", "end"}`` dicts, one per word, or an
        empty list when the segment has no text.
    """
    tokens = seg.get("text", "").strip().split()
    if not tokens:
        return []

    begin = seg.get("start", 0.0)
    finish = seg.get("end", begin + 0.001)
    # Guard against zero or negative spans so every word gets a positive slot.
    step = max(finish - begin, 0.001) / len(tokens)

    starts = (begin + index * step for index in range(len(tokens)))
    return [
        {"word": token, "start": at, "end": at + step}
        for token, at in zip(tokens, starts)
    ]
 def format_ass_time(seconds):
     """Format a duration in seconds as an ASS timestamp ``H:MM:SS.cc``.

     The value is rounded to whole centiseconds first and then split into
     fields, so a fraction that rounds up carries into the seconds (and
     minutes/hours) instead of producing an out-of-range ``.100`` field.

     Args:
         seconds: Non-negative duration in seconds; negative values are
             clamped to zero.

     Returns:
         The timestamp string, e.g. ``"0:01:05.25"``.
     """
     total_cs = int(round(max(seconds, 0) * 100))
     total_seconds, cs = divmod(total_cs, 100)
     total_minutes, s = divmod(total_seconds, 60)
     h, m = divmod(total_minutes, 60)
     return f"{h}:{m:02d}:{s:02d}.{cs:02d}"
 def generate_karaoke_ass(
     segments,
     position,
 [V4+ Styles]
 Format: Name, Fontname, Fontsize, PrimaryColour, SecondaryColour, OutlineColour, BackColour, Bold, Italic, Underline, StrikeOut, ScaleX, ScaleY, Spacing, Angle, BorderStyle, Outline, Shadow, Alignment, MarginL, MarginR, MarginV, Encoding
+Style: CustomStyle,Arial,{int(text_size)},{ass_color},&H00FFFFFF,&H00000000,&H64000000,-1,0,0,0,100,100,0,0,1,2,0,{alignment},0,0,{margin_v},1
 [Events]
 Format: Layer, Start, End, Style, Name, MarginL, MarginR, MarginV, Effect, Text
 """
     dialogues = ""
     display_lines = []
     for seg in segments:
+        if isinstance(seg.get("words"), list) and seg.get("words"):
+            words = seg["words"]
+        else:
+            words = create_word_fallback_from_segment(seg)
+        line_words = []
+        line_start = None
+        for w in words:
+            if "start" not in w or "end" not in w or not w.get("word"):
                 continue
             if line_start is None:
                 line_start = w["start"]
+            duration_cs = int(round((w["end"] - w["start"]) * 100))
+            word_text = escape_ass_text(w["word"])
+            line_words.append(f"{{\\k{duration_cs}}}{word_text} ")
+            if len(line_words) >= words_per_line or word_text.endswith((".", "!", "?", ",")):
+                text = "".join(line_words).strip()
                 display_lines.append((line_start, w["end"], text))
                 line_words = []
                 line_start = None
                     dialogues += f"Dialogue: 0,{format_ass_time(block_start)},{format_ass_time(block_end)},CustomStyle,,0,0,0,,{block_text}\n"
                     display_lines = []
+        if line_words:
+            text = "".join(line_words).strip()
+            last_end = words[-1]["end"] if words else seg.get("end", seg.get("start", 0))
+            display_lines.append((line_start or seg.get("start", 0), last_end, text))
+        if len(display_lines) >= lines_per_display:
+            block_start = display_lines[0][0]
+            block_end = display_lines[-1][1]
+            block_text = "\\N".join([dl[2] for dl in display_lines])
+            dialogues += f"Dialogue: 0,{format_ass_time(block_start)},{format_ass_time(block_end)},CustomStyle,,0,0,0,,{block_text}\n"
+            display_lines = []
     if display_lines:
         block_start = display_lines[0][0]
 # ---------------- Gentle alignment integration ----------------
 def format_time(seconds):
     """Convert seconds to an ASS timestamp ``H:MM:SS.cc``.

     The fields are built from the ``timedelta`` components rather than from
     ``str(timedelta)``. That string already begins with the hour field, so
     prefixing ``"0:"`` to it yielded four fields; it also omits the
     fraction for whole seconds and switches to ``"N day, ..."`` past 24
     hours.

     Args:
         seconds: Non-negative duration in seconds; negative values are
             clamped to zero.

     Returns:
         The timestamp with the fraction truncated to two decimals, e.g.
         ``"0:00:05.12"``.
     """
     td = timedelta(seconds=max(seconds, 0))
     whole_seconds = td.days * 86400 + td.seconds
     centiseconds = td.microseconds // 10000  # truncate, as the slice did
     hours, remainder = divmod(whole_seconds, 3600)
     minutes, secs = divmod(remainder, 60)
     return f"{hours}:{minutes:02d}:{secs:02d}.{centiseconds:02d}"
 def generate_ass(words, words_per_line=5):
     header = """[Script Info]
 Title: Karaoke Lyrics
 ScriptType: v4.00+
 def gentle_align(audio_file, transcript_file, output_ass="gentle_output.ass"):
     print("Aligning with Gentle...")
     with open(transcript_file, "r", encoding="utf-8") as f:
         lyrics = f.read()
     with open(audio_file, "rb") as audio:
+        files = {"audio": audio, "transcript": (None, lyrics)}
         response = requests.post(
             "http://localhost:8765/transcriptions?async=false", files=files
         )
             start_time = time.time()
+            whisper_lang = None if job["language"] == "auto" else job["language"]
+            try:
+                result = model.transcribe(
+                    job["input"],
+                    language=whisper_lang,
+                    word_timestamps=True
+                )
+            except TypeError:
+                result = model.transcribe(job["input"], language=whisper_lang)
             detected_lang = result.get("language", "unknown")
             print(f"🌐 Detected language for {video_id}: {detected_lang}")
+            jobs_status[video_id]["model_used"] = getattr(model, "name", "Whisper")
+            segments = result.get("segments", [])
+            if not segments and "words" in result:
+                segments = [{
+                    "start": 0.0,
+                    "end": result.get("duration", 0.0),
+                    "words": result["words"]
+                }]
             ass_content = generate_karaoke_ass(
+                segments,
                 job["position"],
                 job["size"],
                 job["words_per_line"],
             with open(job["ass"], "w", encoding="utf-8") as f:
                 f.write(ass_content)
             transcript_file = os.path.join(UPLOAD_FOLDER, f"{video_id}.txt")
             if os.path.exists(transcript_file):
                 gentle_align(job["input"], transcript_file, output_ass=job["ass"])
             ffmpeg.input(job["input"]).output(
                 job["output"],
                 vf=f"ass={job['ass'].replace(os.sep, '/')}"
 threading.Thread(target=process_queue, daemon=True).start()
 if __name__ == '__main__':
+    port = int(os.environ.get("PORT", 7860))
     app.run(host="0.0.0.0", port=port, debug=False)