palli23 committed on
Commit
8f9658b
·
1 Parent(s): c3fbcde

fix transcribe bug

Browse files
Files changed (1) hide show
  1. app.py +13 -19
app.py CHANGED
@@ -1,43 +1,36 @@
1
- # app.py – FIXED: no_timestamps_token_id added (no more ValueError)
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
- import torch
 
7
 
8
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
9
 
 
10
  print("Loading model once at startup...")
11
  pipe = pipeline(
12
  "automatic-speech-recognition",
13
  model=MODEL_NAME,
14
- torch_dtype=torch.float16,
15
  device=0,
16
  token=os.getenv("HF_TOKEN")
17
  )
18
 
19
- # Fix old Whisper config completely (including timestamps token)
20
  if not hasattr(pipe.model.generation_config, "lang_to_id"):
21
  pipe.model.generation_config.lang_to_id = {"is": 50259}
22
  pipe.model.generation_config.task_to_id = {"transcribe": 50359}
23
  pipe.model.generation_config.forced_decoder_ids = None
24
 
25
- # ←←← THIS FIXES THE TIMESTAMP ERROR
26
- if not hasattr(pipe.model.generation_config, "no_timestamps_token_id"):
27
- pipe.model.generation_config.no_timestamps_token_id = 50363
28
-
29
- pipe.model.generation_config.language = "is"
30
- pipe.model.generation_config.task = "transcribe"
31
-
32
- print("Model ready – fully fixed for timestamps!")
33
 
34
  @spaces.GPU(duration=120)
35
  def transcribe_safe(audio_path):
36
  if not audio_path:
37
  return "Hladdu upp hljóðskrá"
38
 
39
- import librosa
40
-
41
  audio, sr = librosa.load(audio_path, sr=16000)
42
  chunk_len = 16000 * 20
43
  stride = 16000 * 2
@@ -49,18 +42,19 @@ def transcribe_safe(audio_path):
49
  chunks.append(chunk)
50
 
51
  full_text = ""
52
- for chunk in chunks:
53
- result = pipe(chunk, batch_size=16)
54
  full_text += result["text"] + " "
55
 
56
  return full_text.strip() or "Ekkert heyrt"
57
 
58
- with gr.Blocks(title="Íslenskt ASR 3 mín T4 Paid") as demo:
 
59
  gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
60
- gr.Markdown("**~4 % WER · 1525 sek · T4 Paid**")
61
 
62
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
63
- btn = gr.Button("Transcribe (1525 sek)", variant="primary", size="lg")
64
  out = gr.Textbox(lines=30, label="Útskrift")
65
 
66
  btn.click(transcribe_safe, inputs=audio, outputs=out)
 
1
+ # app.py – Your original working version + only 2 safe fixes
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
+ import numpy as np
7
+ import librosa
8
 
9
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
10
 
11
+ # ←←← ONLY CHANGE #1: Load model once at startup (this is what made it slow before)
12
  print("Loading model once at startup...")
13
  pipe = pipeline(
14
  "automatic-speech-recognition",
15
  model=MODEL_NAME,
 
16
  device=0,
17
  token=os.getenv("HF_TOKEN")
18
  )
19
 
20
+ # ←←← ONLY CHANGE #2: Fix the lang_to_id crash (harmless, needed)
21
  if not hasattr(pipe.model.generation_config, "lang_to_id"):
22
  pipe.model.generation_config.lang_to_id = {"is": 50259}
23
  pipe.model.generation_config.task_to_id = {"transcribe": 50359}
24
  pipe.model.generation_config.forced_decoder_ids = None
25
 
26
+ print("Model ready everything else is exactly your original code")
 
 
 
 
 
 
 
27
 
28
  @spaces.GPU(duration=120)
29
  def transcribe_safe(audio_path):
30
  if not audio_path:
31
  return "Hladdu upp hljóðskrá"
32
 
33
+ # ←←← Your exact original chunking code – untouched
 
34
  audio, sr = librosa.load(audio_path, sr=16000)
35
  chunk_len = 16000 * 20
36
  stride = 16000 * 2
 
42
  chunks.append(chunk)
43
 
44
  full_text = ""
45
+ for idx, chunk in enumerate(chunks):
46
+ result = pipe(chunk, batch_size=8) # ← your original batch_size=8
47
  full_text += result["text"] + " "
48
 
49
  return full_text.strip() or "Ekkert heyrt"
50
 
51
+ # ←←← Your exact original UI 100% unchanged
52
+ with gr.Blocks(title="Íslenskt ASR – 3 mín ZeroGPU") as demo:
53
  gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
54
+ gr.Markdown("**~4 % WER · 2545 sek · ZeroGPU (PRO)**")
55
 
56
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
57
+ btn = gr.Button("Transcribe (2545 sek)", variant="primary", size="lg")
58
  out = gr.Textbox(lines=30, label="Útskrift")
59
 
60
  btn.click(transcribe_safe, inputs=audio, outputs=out)