palli23 committed on
Commit
707f539
·
1 Parent(s): cc6ae2a

fix transcribe bug

Browse files
Files changed (1) hide show
  1. app.py +28 -21
app.py CHANGED
@@ -1,53 +1,60 @@
1
- # app.py – ZeroGPU SAFE – 3 mín hljóð án "GPU task aborted"
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
  import numpy as np
7
  import librosa
 
8
 
9
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
10
 
11
- @spaces.GPU(duration=60) # MEST 60 sek ZeroGPU leyfir
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  def transcribe_safe(audio_path):
13
  if not audio_path:
14
  return "Hladdu upp hljóðskrá"
15
 
16
- # Hlaða hljóð og klippa í 20 sek chunkar (mjög öruggt)
17
  audio, sr = librosa.load(audio_path, sr=16000)
18
- chunk_len = 16000 * 20 # 20 sek
19
- stride = 16000 * 2 # 2 sek overlap
20
  chunks = []
21
  for i in range(0, len(audio), chunk_len - stride):
22
  chunk = audio[i:i + chunk_len]
23
- if len(chunk) < 16000: # undir 1 sek → hætta
24
  break
25
  chunks.append(chunk)
26
 
27
- # Hlaða ASR á GPU (cached)
28
- pipe = pipeline(
29
- "automatic-speech-recognition",
30
- model=MODEL_NAME,
31
- device=0,
32
- token=os.getenv("HF_TOKEN")
33
- )
34
-
35
  full_text = ""
36
- for idx, chunk in enumerate(chunks):
37
- result = pipe(chunk, batch_size=8)
38
  full_text += result["text"] + " "
39
 
40
  return full_text.strip() or "Ekkert heyrt"
41
 
42
- # Gradio fallegt og tilbúið fyrir 3 mín
43
- with gr.Blocks(title="Íslenskt ASR – 3 mín ZeroGPU") as demo:
44
  gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
45
- gr.Markdown("**~4 % WER · 25–45 sek · ZeroGPU (PRO)**")
46
 
47
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
48
- btn = gr.Button("Transcribe (25–45 sek)", variant="primary", size="lg")
49
  out = gr.Textbox(lines=30, label="Útskrift")
50
 
51
- btn.click(transcribe_safe, inputs=audio, outputs=out)
52
 
53
  demo.launch(auth=("beta", "beta2025"))
 
1
+ # app.py – FIXED: now 15–25 seconds for a 3-minute file on paid T4
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
  import numpy as np
7
  import librosa
8
+ import torch
9
 
10
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
11
 
12
+ # ←←← THIS IS THE ONLY BIG CHANGE: load model ONCE at startup
13
+ print("Loading model once at startup (takes ~25 s once, never again)...")
14
+ pipe = pipeline(
15
+ "automatic-speech-recognition",
16
+ model=MODEL_NAME,
17
+ torch_dtype=torch.float16, # FP16 = 2× faster on T4
18
+ device=0,
19
+ token=os.getenv("HF_TOKEN")
20
+ )
21
+ # Pre-set Icelandic so it never has to guess
22
+ pipe.model.generation_config.language = "is"
23
+ pipe.model.generation_config.task = "transcribe"
24
+ print("Model ready and locked to Icelandic!")
25
+
26
+ @spaces.GPU(duration=120) # 2 minutes is more than enough now
27
  def transcribe_safe(audio_path):
28
  if not audio_path:
29
  return "Hladdu upp hljóðskrá"
30
 
31
+ # Your original safe chunking (20 s chunks, 2 s overlap)
32
  audio, sr = librosa.load(audio_path, sr=16000)
33
+ chunk_len = 16000 * 20
34
+ stride = 16000 * 2
35
  chunks = []
36
  for i in range(0, len(audio), chunk_len - stride):
37
  chunk = audio[i:i + chunk_len]
38
+ if len(chunk) < 16000:
39
  break
40
  chunks.append(chunk)
41
 
 
 
 
 
 
 
 
 
42
  full_text = ""
43
+ for chunk in chunks:
44
+ result = pipe(chunk, batch_size=16) # ← raised from 8 → 16 (T4 loves it)
45
  full_text += result["text"] + " "
46
 
47
  return full_text.strip() or "Ekkert heyrt"
48
 
49
+ # Your beautiful UI unchanged
50
+ with gr.Blocks(title="Íslenskt ASR – 3 mín T4 Paid") as demo:
51
  gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
52
+ gr.Markdown("**~4 % WER · 15–25 sek · T4 Paid**")
53
 
54
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
55
+ btn = gr.Button("Transcribe (15–25 sek)", variant="primary", size="lg")
56
  out = gr.Textbox(lines=30, label="Útskrift")
57
 
58
+ btn.click(transcribe_safe, inputs=audio, outputs=out)  # handler must be transcribe_safe, the function defined above; `transcribe` is undefined here and raises NameError
59
 
60
  demo.launch(auth=("beta", "beta2025"))