Spaces:

RayPac006
/

get-lyrics

Running

App Files Community

RayPac006 committed 25 days ago

Commit

2deee0c

verified ·

1 Parent(s): e2e70f5

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -25

app.py CHANGED Viewed

@@ -20,6 +20,18 @@ torch.load = patched_load
 device = "cuda" if torch.cuda.is_available() else "cpu"
 batch_size = 16
 compute_type = "float16" if device == "cuda" else "int8" # int8 is faster on CPU
 # 2. Global Model Load (Load once on startup)
 print(f"Loading WhisperX model on {device}...")
@@ -32,33 +44,54 @@ def generate_lyrics(audio_file_path):
     try:
         # 1. Transcribe
         audio = whisperx.load_audio(audio_file_path)
-        result = model.transcribe(audio, batch_size=batch_size)
-        # 2. Align (Load alignment model dynamically based on detected language)
-        model_a, metadata = whisperx.load_align_model(
-            language_code=result["language"],
-            device=device
-        )
-        result = whisperx.align(
-            result["segments"],
-            model_a,
-            metadata,
-            audio,
-            device,
-            return_char_alignments=False
         )
-        # 3. Format to your TypeScript Interface
-        formatted_lyrics = []
-        for segment in result["segments"]:
-            formatted_lyrics.append({
-                "time": round(segment["start"], 3),
-                "text": segment["text"].strip(),
-                "chords": []
-            })
-        # Memory Cleanup (Crucial for HF Free Tier)
-        del model_a
         gc.collect()
         if device == "cuda":
             torch.cuda.empty_cache()

 device = "cuda" if torch.cuda.is_available() else "cpu"
 batch_size = 16
 compute_type = "float16" if device == "cuda" else "int8" # int8 is faster on CPU
+ALIGN_MODEL_MAP = {
+    # default WhisperX-supported languages (use built-in)
+    "en": None,
+    "tl": None,  # Tagalog works with WhisperX default aligner
+    # languages that NEED explicit wav2vec2 models
+    "th": "airesearch/wav2vec2-large-xlsr-53-th",
+    # you can extend this later:
+    # "ja": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
+    # "ko": "kresnik/wav2vec2-large-xlsr-korean",
+}
 # 2. Global Model Load (Load once on startup)
 print(f"Loading WhisperX model on {device}...")
     try:
         # 1. Transcribe
         audio = whisperx.load_audio(audio_file_path)
+        result = model.transcribe(
+            audio,
+            batch_size=batch_size,
+            temperature=0.0
         )
+        lang = result["language"]
+        print(f"Detected language: {lang}")
+        align_model_name = ALIGN_MODEL_MAP.get(lang)
+        # 2. Align (best-effort)
+        try:
+            if align_model_name is None:
+                model_a, metadata = whisperx.load_align_model(
+                    language_code=lang,
+                    device=device
+                )
+            else:
+                model_a, metadata = whisperx.load_align_model(
+                    language_code=lang,
+                    device=device,
+                    model_name=align_model_name
+                )
+            result = whisperx.align(
+                result["segments"],
+                model_a,
+                metadata,
+                audio,
+                device,
+                return_char_alignments=False
+            )
+            del model_a
+        except Exception as align_err:
+            print(f"[WARN] Alignment skipped: {align_err}")
+        # 3. Format output
+        formatted_lyrics = [
+            {
+                "time": round(seg["start"], 3),
+                "text": seg["text"].strip(),
+                "chords": []
+            }
+            for seg in result["segments"]
+        ]
         gc.collect()
         if device == "cuda":
             torch.cuda.empty_cache()