palli23 committed on
Commit
1feadc6
·
1 Parent(s): 707f539

fix transcribe bug

Browse files
Files changed (1) hide show
  1. app.py +11 -10
app.py CHANGED
@@ -1,15 +1,12 @@
1
- # app.py – FIXED: now 15–25 seconds for 3-minute file on paid T4
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
- import numpy as np
7
- import librosa
8
  import torch
9
 
10
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
11
 
12
- # ←←← THIS IS THE ONLY BIG CHANGE: load model ONCE at startup
13
  print("Loading model once at startup (takes ~25 s once, never again)...")
14
  pipe = pipeline(
15
  "automatic-speech-recognition",
@@ -18,17 +15,20 @@ pipe = pipeline(
18
  device=0,
19
  token=os.getenv("HF_TOKEN")
20
  )
21
- # Pre-set Icelandic so it never has to guess
 
22
  pipe.model.generation_config.language = "is"
23
  pipe.model.generation_config.task = "transcribe"
24
  print("Model ready and locked to Icelandic!")
25
 
26
- @spaces.GPU(duration=120) # 2 minutes is more than enough now
 
27
  def transcribe_safe(audio_path):
28
  if not audio_path:
29
  return "Hladdu upp hljóðskrá"
30
 
31
- # Your original safe chunking (20 s chunks, 2 s overlap)
 
32
  audio, sr = librosa.load(audio_path, sr=16000)
33
  chunk_len = 16000 * 20
34
  stride = 16000 * 2
@@ -41,7 +41,7 @@ def transcribe_safe(audio_path):
41
 
42
  full_text = ""
43
  for chunk in chunks:
44
- result = pipe(chunk, batch_size=16) # raised from 8 → 16 (T4 loves it)
45
  full_text += result["text"] + " "
46
 
47
  return full_text.strip() or "Ekkert heyrt"
@@ -52,9 +52,10 @@ with gr.Blocks(title="Íslenskt ASR – 3 mín T4 Paid") as demo:
52
  gr.Markdown("**~4 % WER · 15–25 sek · T4 Paid**")
53
 
54
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
55
- btn = gr.Button("Transcribe (1525 sek)", variant="primary", size="lg")
56
  out = gr.Textbox(lines=30, label="Útskrift")
57
 
58
- btn.click(transcribe, inputs=audio, outputs=out)
 
59
 
60
  demo.launch(auth=("beta", "beta2025"))
 
1
+ # app.py – FINAL VERSION – works on paid T4 right now (15–25 s for 3 min)
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
 
 
6
  import torch
7
 
8
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
9
 
 
10
  print("Loading model once at startup (takes ~25 s once, never again)...")
11
  pipe = pipeline(
12
  "automatic-speech-recognition",
 
15
  device=0,
16
  token=os.getenv("HF_TOKEN")
17
  )
18
+
19
+ # Lock to Icelandic – no language detection delay
20
  pipe.model.generation_config.language = "is"
21
  pipe.model.generation_config.task = "transcribe"
22
  print("Model ready and locked to Icelandic!")
23
 
24
+ # Button handler: the old btn.click referenced `transcribe` instead of this function, `transcribe_safe`
25
+ @spaces.GPU(duration=120) # more than enough on paid GPU
26
  def transcribe_safe(audio_path):
27
  if not audio_path:
28
  return "Hladdu upp hljóðskrá"
29
 
30
+ # Your original safe 20-second chunking (kept exactly as you wrote it)
31
+ import librosa
32
  audio, sr = librosa.load(audio_path, sr=16000)
33
  chunk_len = 16000 * 20
34
  stride = 16000 * 2
 
41
 
42
  full_text = ""
43
  for chunk in chunks:
44
+ result = pipe(chunk, batch_size=16) # raised from 8 → 16
45
  full_text += result["text"] + " "
46
 
47
  return full_text.strip() or "Ekkert heyrt"
 
52
  gr.Markdown("**~4 % WER · 15–25 sek · T4 Paid**")
53
 
54
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
55
+ btn = gr.Button("Transcribe (15–25 sek)", variant="primary", size="lg")
56
  out = gr.Textbox(lines=30, label="Útskrift")
57
 
58
+ # ←←← FIXED: now points to the correct function name
59
+ btn.click(transcribe_safe, inputs=audio, outputs=out)
60
 
61
  demo.launch(auth=("beta", "beta2025"))