RayPac006 committed on
Commit
2deee0c
·
verified ·
1 Parent(s): e2e70f5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +58 -25
app.py CHANGED
@@ -20,6 +20,18 @@ torch.load = patched_load
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
21
  batch_size = 16
22
  compute_type = "float16" if device == "cuda" else "int8" # int8 is faster on CPU
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
  # 2. Global Model Load (Load once on startup)
25
  print(f"Loading WhisperX model on {device}...")
@@ -32,33 +44,54 @@ def generate_lyrics(audio_file_path):
32
  try:
33
  # 1. Transcribe
34
  audio = whisperx.load_audio(audio_file_path)
35
- result = model.transcribe(audio, batch_size=batch_size)
36
-
37
- # 2. Align (Load alignment model dynamically based on detected language)
38
- model_a, metadata = whisperx.load_align_model(
39
- language_code=result["language"],
40
- device=device
41
- )
42
- result = whisperx.align(
43
- result["segments"],
44
- model_a,
45
- metadata,
46
- audio,
47
- device,
48
- return_char_alignments=False
49
  )
50
 
51
- # 3. Format to your TypeScript Interface
52
- formatted_lyrics = []
53
- for segment in result["segments"]:
54
- formatted_lyrics.append({
55
- "time": round(segment["start"], 3),
56
- "text": segment["text"].strip(),
57
- "chords": []
58
- })
59
-
60
- # Memory Cleanup (Crucial for HF Free Tier)
61
- del model_a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  gc.collect()
63
  if device == "cuda":
64
  torch.cuda.empty_cache()
 
20
  device = "cuda" if torch.cuda.is_available() else "cpu"
21
  batch_size = 16
22
  compute_type = "float16" if device == "cuda" else "int8" # int8 is faster on CPU
23
+ ALIGN_MODEL_MAP = {
24
+ # default WhisperX-supported languages (use built-in)
25
+ "en": None,
26
+ "tl": None, # Tagalog works with WhisperX default aligner
27
+
28
+ # languages that NEED explicit wav2vec2 models
29
+ "th": "airesearch/wav2vec2-large-xlsr-53-th",
30
+
31
+ # you can extend this later:
32
+ # "ja": "jonatasgrosman/wav2vec2-large-xlsr-53-japanese",
33
+ # "ko": "kresnik/wav2vec2-large-xlsr-korean",
34
+ }
35
 
36
  # 2. Global Model Load (Load once on startup)
37
  print(f"Loading WhisperX model on {device}...")
 
44
  try:
45
  # 1. Transcribe
46
  audio = whisperx.load_audio(audio_file_path)
47
+ result = model.transcribe(
48
+ audio,
49
+ batch_size=batch_size,
50
+ temperature=0.0
 
 
 
 
 
 
 
 
 
 
51
  )
52
 
53
+ lang = result["language"]
54
+ print(f"Detected language: {lang}")
55
+
56
+ align_model_name = ALIGN_MODEL_MAP.get(lang)
57
+
58
+ # 2. Align (best-effort)
59
+ try:
60
+ if align_model_name is None:
61
+ model_a, metadata = whisperx.load_align_model(
62
+ language_code=lang,
63
+ device=device
64
+ )
65
+ else:
66
+ model_a, metadata = whisperx.load_align_model(
67
+ language_code=lang,
68
+ device=device,
69
+ model_name=align_model_name
70
+ )
71
+
72
+ result = whisperx.align(
73
+ result["segments"],
74
+ model_a,
75
+ metadata,
76
+ audio,
77
+ device,
78
+ return_char_alignments=False
79
+ )
80
+
81
+ del model_a
82
+ except Exception as align_err:
83
+ print(f"[WARN] Alignment skipped: {align_err}")
84
+
85
+ # 3. Format output
86
+ formatted_lyrics = [
87
+ {
88
+ "time": round(seg["start"], 3),
89
+ "text": seg["text"].strip(),
90
+ "chords": []
91
+ }
92
+ for seg in result["segments"]
93
+ ]
94
+
95
  gc.collect()
96
  if device == "cuda":
97
  torch.cuda.empty_cache()