palli23 committed on
Commit
e83e018
·
verified ·
1 Parent(s): 1f8d8c7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -25
app.py CHANGED
@@ -1,8 +1,8 @@
1
- # app.py — Íslenskt ASR – ZeroGPU Fully Stateless Fix (Dec 2025)
2
  import os
3
  os.environ["OMP_NUM_THREADS"] = "1"
4
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"
5
- # Force CPU-only at import to prevent any lazy CUDA init
6
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
7
 
8
  import gradio as gr
@@ -10,35 +10,62 @@ import spaces
10
  from transformers import pipeline
11
  import torch
12
  import gc
 
 
 
13
 
14
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
15
 
16
  @spaces.GPU(duration=180, max_batch_size=4)
17
- def transcribe_3min_gpu(audio_path):
18
  """
19
- FULLY SELF-CONTAINED GPU FUNCTION — no globals, no prior CUDA touches.
20
- Loads model fresh on CPU first, then moves to GPU INSIDE worker.
21
  """
22
- if not audio_path:
23
  return "Hlaðið upp hljóðskrá (mp3/wav, max 5 mín)"
24
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
25
  try:
26
- print("Loading Whisper model on CPU first (safe init)...")
27
- # Load on CPU explicitly to avoid any CUDA during model download/init
28
  pipe = pipeline(
29
  "automatic-speech-recognition",
30
  model=MODEL_NAME,
31
  torch_dtype=torch.float16,
32
- device="cpu", # ← KEY FIX: CPU first, no CUDA yet
33
  token=os.getenv("HF_TOKEN"),
34
  )
35
 
36
- # Now move to GPU — this happens INSIDE the @spaces.GPU worker, so it is safe.
37
- print("Moving model to GPU...")
38
- pipe.model = pipe.model.to("cuda")
39
- pipe.device = "cuda"
40
- if hasattr(pipe, 'model_decoder'):
41
- pipe.model_decoder = pipe.model_decoder.to("cuda")
42
 
43
  # Run inference
44
  print("Running transcription...")
@@ -46,18 +73,18 @@ def transcribe_3min_gpu(audio_path):
46
  audio_path,
47
  chunk_length_s=30,
48
  stride_length_s=(6, 0),
49
- batch_size=8,
50
  return_timestamps=False,
51
  generate_kwargs={"language": "is", "task": "transcribe"},
52
  )
53
 
54
  text = result["text"].strip()
55
 
56
- # Cleanup chunks
57
  if "chunks" in result:
58
  del result["chunks"]
59
 
60
- # Aggressive cleanup BEFORE returning
61
  del pipe
62
  gc.collect()
63
  if torch.cuda.is_available():
@@ -79,24 +106,24 @@ with gr.Blocks(title="Íslenskt ASR") as demo:
79
  gr.Markdown("# Íslenskt ASR – 3–5 mín hljóð")
80
  gr.Markdown("**Whisper-small fínstillt á íslensku spjalli · mjög lágur WER**")
81
  gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
82
- gr.Markdown("> Keyrt á **ZeroGPU** – hver umritun hleðst nýtt (15–30 sek), en örugg og stöðug")
83
 
84
  audio_in = gr.Audio(
85
- type="filepath",
86
  label="Hlaðið upp .mp3 / .wav (allt að 5 mín)",
87
  sources=["upload", "microphone"]
88
  )
89
  btn = gr.Button("Umrita", variant="primary", size="lg")
90
- output = gr.Textbox(lines=25, label="Texti")
91
 
92
- # Use the GPU-decorated function directly
93
  btn.click(fn=transcribe_3min_gpu, inputs=audio_in, outputs=output)
94
 
95
  gr.Markdown("""
96
  ### Leiðbeiningar
97
- - Hver umritun hleðst módelinu nýtt á GPU (ZeroGPU regla)
98
- - Tími: 15–30 sek (lengur en á venjulegu GPU, en lifir endalaust)
99
- - Ef villa kemur → bíddu 10 sek og prófaðu aftur
100
  """)
101
 
102
  # ————————————————————— Launch —————————————————————
 
1
+ # app.py — Íslenskt ASR – ZeroGPU Fully Fixed (Dec 2025 – handles str audio + CUDA safe)
2
  import os
3
  os.environ["OMP_NUM_THREADS"] = "1"
4
  os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "garbage_collection_threshold:0.6,max_split_size_mb:128"
5
+ # Disable CUDA visibility at startup to prevent main process init
6
  os.environ["CUDA_VISIBLE_DEVICES"] = ""
7
 
8
  import gradio as gr
 
10
  from transformers import pipeline
11
  import torch
12
  import gc
13
+ import librosa # For loading audio bytes if needed
14
+ import io
15
+ import soundfile as sf # For writing temp files from bytes
16
 
17
  MODEL_NAME = "palli23/whisper-small-sam_spjall"
18
 
19
  @spaces.GPU(duration=180, max_batch_size=4)
20
+ def transcribe_3min_gpu(audio_input):
21
  """
22
+ Handles both filepath (str) and uploaded bytes/temp files.
23
+ Loads model on CPU first, moves to GPU inside worker.
24
  """
25
+ if not audio_input:
26
  return "Hlaðið upp hljóðskrá (mp3/wav, max 5 mín)"
27
 
28
+ # Handle str (filepath) vs bytes/tuple from Gradio upload
29
+ if isinstance(audio_input, str):
30
+ audio_path = audio_input
31
+ else:
32
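+ # audio_input is tuple (filepath, tuple(bytes, sample_rate)) or just bytes — NOTE(review): Gradio's type="numpy" yields (sample_rate, ndarray), so element 0 may not be a path; confirm.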
33
+ if isinstance(audio_input, tuple) and len(audio_input) > 0:
34
+ audio_path = audio_input[0] # Temp filepath from upload
35
+ else:
36
+ # Fallback: write bytes to temp file if no path
37
+ if isinstance(audio_input, bytes):
38
+ audio_bytes = audio_input
39
+ else:
40
+ return "Ógild hljóðskrá – reyndu aftur"
41
+
42
+ # Assume 16kHz sample rate for Whisper (common fallback)
43
+ sample_rate = 16000
44
+ # Load with librosa if needed, but for simplicity write to temp
45
+ with io.BytesIO(audio_bytes) as audio_io:
46
+ # Use soundfile to write temp wav
47
+ with sf.SoundFile(audio_io, 'r') as f:
48
+ data, sr = f.read(), f.samplerate
49
+ if sr != 16000:
50
+ data = librosa.resample(data, orig_sr=sr, target_sr=16000)
51
+ # Write to temp file
52
+ audio_path = "/tmp/temp_audio.wav"
53
+ sf.write(audio_path, data, 16000)
54
+
55
  try:
56
+ print("Loading Whisper model on CPU first (safe for ZeroGPU)...")
57
+ # Load the pipeline on CPU — no CUDA touch yet
58
  pipe = pipeline(
59
  "automatic-speech-recognition",
60
  model=MODEL_NAME,
61
  torch_dtype=torch.float16,
62
+ device="cpu", # Critical: CPU init only
63
  token=os.getenv("HF_TOKEN"),
64
  )
65
 
66
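+ # Now inside the GPU worker: move the entire pipeline to CUDA. NOTE(review): confirm that the transformers Pipeline object exposes .to(); the previous revision moved pipe.model instead.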
67
+ print("Moving pipeline to GPU (ZeroGPU safe)...")
68
+ pipe = pipe.to("cuda")
 
 
 
69
 
70
  # Run inference
71
  print("Running transcription...")
 
73
  audio_path,
74
  chunk_length_s=30,
75
  stride_length_s=(6, 0),
76
+ batch_size=4, # Smaller batch for ZeroGPU stability
77
  return_timestamps=False,
78
  generate_kwargs={"language": "is", "task": "transcribe"},
79
  )
80
 
81
  text = result["text"].strip()
82
 
83
+ # Cleanup
84
  if "chunks" in result:
85
  del result["chunks"]
86
 
87
+ # Delete pipe immediately to free memory
88
  del pipe
89
  gc.collect()
90
  if torch.cuda.is_available():
 
106
  gr.Markdown("# Íslenskt ASR – 3–5 mín hljóð")
107
  gr.Markdown("**Whisper-small fínstillt á íslensku spjalli · mjög lágur WER**")
108
  gr.Markdown("**Hafa samband:** pallinr1@protonmail.com")
109
+ gr.Markdown("> Keyrt á **ZeroGPU** – hver umritun hleðst nýtt (15–30 sek), stöðug og örugg")
110
 
111
  audio_in = gr.Audio(
112
+ type="filepath", # Ensures str output for pipeline
113
  label="Hlaðið upp .mp3 / .wav (allt að 5 mín)",
114
  sources=["upload", "microphone"]
115
  )
116
  btn = gr.Button("Umrita", variant="primary", size="lg")
117
+ output = gr.Textbox(lines=25, label="Texti", placeholder="Hljóðtextinn birtist hér...")
118
 
119
+ # Click event uses GPU-decorated fn
120
  btn.click(fn=transcribe_3min_gpu, inputs=audio_in, outputs=output)
121
 
122
  gr.Markdown("""
123
  ### Leiðbeiningar
124
+ - Hver umritun: 15–30 sek (módel hleðst á GPU)
125
+ - Styður upload og microphone sjálfkrafa umbreytir í rétt format
126
+ - Ef villa: bíddu og prófaðu aftur (ZeroGPU endurræsir)
127
  """)
128
 
129
  # ————————————————————— Launch —————————————————————