palli23 committed on
Commit
ea1ab79
·
1 Parent(s): 8f9658b

fix transcribe bug

Browse files
Files changed (2) hide show
  1. app.py +23 -43
  2. requirements.txt +1 -3
app.py CHANGED
@@ -1,62 +1,42 @@
1
- # app.py – Your original working version + only 2 safe fixes
2
  import os
3
  import gradio as gr
4
  import spaces
5
  from transformers import pipeline
6
- import numpy as np
7
- import librosa
8
 
9
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
10
 
11
- # ←←← ONLY CHANGE #1: Load model once at startup (this is what made it slow before)
12
- print("Loading model once at startup...")
13
- pipe = pipeline(
14
- "automatic-speech-recognition",
15
- model=MODEL_NAME,
16
- device=0,
17
- token=os.getenv("HF_TOKEN")
18
- )
19
-
20
- # ←←← ONLY CHANGE #2: Fix the lang_to_id crash (harmless, needed)
21
- if not hasattr(pipe.model.generation_config, "lang_to_id"):
22
- pipe.model.generation_config.lang_to_id = {"is": 50259}
23
- pipe.model.generation_config.task_to_id = {"transcribe": 50359}
24
- pipe.model.generation_config.forced_decoder_ids = None
25
-
26
- print("Model ready – everything else is exactly your original code")
27
-
28
- @spaces.GPU(duration=120)
29
- def transcribe_safe(audio_path):
30
  if not audio_path:
31
  return "Hladdu upp hljóðskrá"
32
 
33
- # ←←← Your exact original chunking code untouched
34
- audio, sr = librosa.load(audio_path, sr=16000)
35
- chunk_len = 16000 * 20
36
- stride = 16000 * 2
37
- chunks = []
38
- for i in range(0, len(audio), chunk_len - stride):
39
- chunk = audio[i:i + chunk_len]
40
- if len(chunk) < 16000:
41
- break
42
- chunks.append(chunk)
43
 
44
- full_text = ""
45
- for idx, chunk in enumerate(chunks):
46
- result = pipe(chunk, batch_size=8) # your original batch_size=8
47
- full_text += result["text"] + " "
 
 
 
48
 
49
- return full_text.strip() or "Ekkert heyrt"
50
 
51
- # ←←← Your exact original UI – 100% unchanged
52
- with gr.Blocks(title="Íslenskt ASR – 3 mín ZeroGPU") as demo:
53
- gr.Markdown("# Íslenskt ASR – 3 mín hljóð")
54
- gr.Markdown("**~4 % WER · 25–45 sek · ZeroGPU (PRO)**")
55
 
56
  audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
57
- btn = gr.Button("Transcribe (25–45 sek)", variant="primary", size="lg")
58
  out = gr.Textbox(lines=30, label="Útskrift")
59
 
60
- btn.click(transcribe_safe, inputs=audio, outputs=out)
61
 
62
  demo.launch(auth=("beta", "beta2025"))
 
 
1
  import os
2
  import gradio as gr
3
  import spaces
4
  from transformers import pipeline
 
 
5
 
6
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
7
 
8
@spaces.GPU(duration=60)  # enough GPU time for roughly 3 minutes of audio
def transcribe_3min(audio_path):
    """Transcribe an uploaded audio file with the fine-tuned Whisper model.

    Args:
        audio_path: Path of the uploaded audio file (the Gradio audio
            component is configured with ``type="filepath"``). Falsy when
            nothing has been uploaded.

    Returns:
        The transcript with surrounding whitespace removed, or a short
        Icelandic status message when no file was supplied or the model
        produced no text.
    """
    if not audio_path:
        return "Hladdu upp hljóðskrá"

    # Whisper pipeline with built-in chunking; it is created inside the
    # GPU-decorated call rather than at import time.
    pipe = pipeline(
        "automatic-speech-recognition",
        model=MODEL_NAME,
        device=0,
        token=os.getenv("HF_TOKEN"),
    )

    result = pipe(
        audio_path,
        chunk_length_s=30,  # 30-second chunks
        stride_length_s=(6, 0),  # 6 s of stride on the left of each chunk, none on the right
        return_timestamps=False,
        batch_size=8,
    )

    # Trim stray whitespace and never hand an empty string to the textbox:
    # fall back to the same "nothing heard" message the previous version used.
    text = result["text"].strip()
    return text or "Ekkert heyrt"
30
 
31
# Interface: one audio upload, one button, one transcript box.
with gr.Blocks(title="Íslenskt ASR – 3 mín") as demo:
    gr.Markdown("# Íslenskt ASR – 3 mínútur")
    gr.Markdown("**Whisper · Very low WER · 0.5-5minute audio transcribe á ZeroGPU**")

    # type="filepath" means the click handler receives a path on disk.
    audio = gr.Audio(type="filepath", label="Hladdu upp .mp3 / .wav (allt að 3 mín)")
    btn = gr.Button("Transcribe", variant="primary", size="lg")
    out = gr.Textbox(lines=30, label="Útskrift")

    btn.click(transcribe_3min, inputs=audio, outputs=out)

# Basic-auth credentials can be overridden through the environment so the
# real password does not have to live in the repository. The defaults keep
# the previous behaviour when the variables are unset.
# NOTE(review): drop the hard-coded defaults once the secrets are configured.
demo.launch(
    auth=(
        os.getenv("APP_AUTH_USER", "beta"),
        os.getenv("APP_AUTH_PASSWORD", "beta2025"),
    )
)
requirements.txt CHANGED
@@ -1,6 +1,4 @@
1
  gradio
2
  transformers
3
  torch
4
- spaces
5
- librosa
6
- soundfile
 
1
  gradio
2
  transformers
3
  torch
4
+ spaces # For @spaces.GPU