palli23 committed on
Commit
e3207ee
·
verified ·
1 Parent(s): 6642f61

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +49 -112
app.py CHANGED
@@ -1,5 +1,4 @@
1
- # app.py — Icelandic ASR (wav2vec2 primary, Whisper repair)
2
- # Dependencies: gradio, transformers, torch, librosa, soundfile
3
 
4
  import os
5
  os.environ["OMP_NUM_THREADS"] = "1"
@@ -11,116 +10,65 @@ import torch
11
  import gc
12
  import re
13
  import librosa
14
- import numpy as np
15
 
16
  from transformers import (
17
  Wav2Vec2Processor,
18
- Wav2Vec2ForCTC,
19
- pipeline
20
  )
21
 
22
- # ============================================================
23
- # MODELS
24
- # ============================================================
25
- W2V_MODEL = "palli23/wav2vec2-icelandic-multi-aug-v2-5e-6"
26
- WHISPER_MODEL = "palli23/whisper-small-sam_spjall"
27
 
28
- # ============================================================
29
- # TEXT CLEANING (SAFE ONLY)
30
- # ============================================================
31
def clean_ctc(text: str) -> str:
    """Normalize raw CTC decoder output without changing its wording.

    The text is lower-cased, any character repeated four or more times in a
    row is shortened to two occurrences, every whitespace run becomes a
    single space, and the result is trimmed at both ends.
    """
    lowered = text.lower()
    # Runs of 4+ identical characters are decoder loops; keep two of them.
    deduped = re.sub(r"(.)\1{3,}", r"\1\1", lowered)
    return re.sub(r"\s+", " ", deduped).strip()
36
-
37
- # ============================================================
38
- # STRUCTURAL VALIDITY CHECK (CRITICAL)
39
- # ============================================================
40
def structurally_invalid(
    text: str,
    *,
    min_letter_run: int = 18,
    min_consonant_run: int = 4,
) -> bool:
    """Heuristically decide whether a transcript chunk looks broken.

    A chunk is reported as invalid when any of the following holds:

    * it is empty or shorter than three characters;
    * a pair of consecutive words is immediately repeated ("a b a b");
    * it contains ``min_letter_run`` or more lower-case letters with no
      separator (collapsed word boundaries);
    * it contains ``min_consonant_run`` or more consecutive consonants.

    Args:
        text: Transcript text to inspect (expected to be lower-cased already,
            since the letter classes only list lower-case characters).
        min_letter_run: Smallest unbroken letter run treated as collapsed
            word boundaries.
        min_consonant_run: Smallest consonant run treated as garbage.

    Returns:
        True if the text trips any check, False otherwise.

    Raises:
        ValueError: If either threshold is smaller than 1.

    Note:
        With the default ``min_consonant_run=4`` ordinary words such as
        "íslenskt" (cluster "nskt") are flagged; raise the threshold when
        that false positive matters.
    """
    if min_letter_run < 1 or min_consonant_run < 1:
        raise ValueError("run thresholds must be at least 1")

    if not text or len(text) < 3:
        return True

    words = text.split()

    # Word-loop detection: the bigram (a, b) is immediately followed by itself.
    quads = zip(words, words[1:], words[2:], words[3:])
    if any(a == c and b == d for a, b, c, d in quads):
        return True

    # Collapsed word boundaries: an implausibly long unbroken letter run.
    if re.search(rf"[a-záðéíóúýþæö]{{{int(min_letter_run)},}}", text):
        return True

    # Garbage consonant runs.
    if re.search(rf"[bcdfghjklmnpqrstvwxz]{{{int(min_consonant_run)},}}", text):
        return True

    return False
60
 
61
- # ============================================================
62
- # OVERLAP MERGING (ANTI-DUPLICATION)
63
- # ============================================================
64
def merge_chunks(
    prev: str,
    curr: str,
    max_overlap: int = 6,
    min_overlap: int = 3,
) -> str:
    """Join two chunk transcripts, dropping words duplicated at the seam.

    The longest run of ``min_overlap``..``max_overlap`` words that ends
    ``prev`` and also starts ``curr`` is kept only once. When no such run
    exists the two texts are joined with a single space.

    Args:
        prev: Transcript accumulated so far.
        curr: Transcript of the next chunk.
        max_overlap: Largest number of seam words to try to match.
        min_overlap: Smallest number of seam words accepted as a real
            overlap (values below 1 are treated as 1).

    Returns:
        The merged transcript. If either side contains no words the other
        side is returned unchanged, so no stray space is introduced.
    """
    p = prev.split()
    c = curr.split()

    # Nothing to merge on one side: avoid producing " curr" or "prev ".
    if not p:
        return curr
    if not c:
        return prev

    longest = min(max_overlap, len(p), len(c))
    shortest = max(1, min_overlap)

    # Prefer the longest match so repeated phrases are not cut short.
    for k in range(longest, shortest - 1, -1):
        if p[-k:] == c[:k]:
            return " ".join(p + c[k:])

    return prev + " " + curr
73
-
74
- # ============================================================
75
- # SENTENCE FINALIZATION (LAST STEP ONLY)
76
- # ============================================================
77
def finalize_text(text: str) -> str:
    """Apply final punctuation and line-break formatting to a transcript.

    Whitespace directly before ",", ".", "?" and "!" is removed. After that,
    every lower-case letter followed by one space and an upper-case letter
    is rewritten as "<lower>.", a newline, and the upper-case letter. The
    result is stripped of leading and trailing whitespace.

    Note that the split rule fires on any capitalised word that follows a
    lower-case letter, not only on true sentence starts.
    """
    attach_rules = (
        (r"\s+,", ","),
        (r"\s+\.", "."),
        (r"\s+\?", "?"),
        (r"\s+!", "!"),
    )
    for pattern, mark in attach_rules:
        text = re.sub(pattern, mark, text)

    # Light sentence segmentation on a lower-case -> upper-case transition.
    segmented = re.sub(
        r"([a-záðéíóúýþæö]) ([A-ZÁÉÍÓÚÝÞÆÖ])", r"\1.\n\2", text
    )
    return segmented.strip()
87
 
88
- # ============================================================
89
- # AUDIO CHUNKING (STABLE)
90
- # ============================================================
91
def chunk_audio(audio, sr, chunk_s=20, overlap_s=3):
    """Yield overlapping fixed-length windows over an audio sequence.

    Args:
        audio: Sliceable sequence of samples that supports ``len()``.
        sr: Sample rate, in samples per second.
        chunk_s: Window length in seconds.
        overlap_s: Seconds shared by two consecutive windows.

    Yields:
        Slices of ``audio`` of at most ``chunk_s * sr`` samples. Iteration
        stops at the first window shorter than one second, and right after
        the window that reaches the end of ``audio`` so that no window lying
        entirely inside the previous one is produced.

    Raises:
        ValueError: On first iteration, if the windows would not advance by
            at least one sample (``overlap_s >= chunk_s`` or ``sr`` too small).
    """
    size = int(chunk_s * sr)
    step = int((chunk_s - overlap_s) * sr)
    if step <= 0:
        raise ValueError(
            "chunk_s must exceed overlap_s by at least one sample "
            f"(chunk_s={chunk_s}, overlap_s={overlap_s}, sr={sr})"
        )

    total = len(audio)
    start = 0
    while start < total:
        chunk = audio[start:start + size]
        if len(chunk) < sr:
            # Less than one second left: too short to be worth decoding.
            break
        yield chunk
        if start + size >= total:
            # This window already covered the tail; a further window would
            # only repeat audio that has been yielded.
            break
        start += step
100
 
101
- # ============================================================
102
- # ZeroGPU WORKER
103
- # ============================================================
104
  @spaces.GPU(duration=180)
105
  def transcribe_3min(audio_path):
106
  if not audio_path:
107
  return "Hlaðið upp hljóðskrá"
108
 
109
- # Load models
110
- processor = Wav2Vec2Processor.from_pretrained(W2V_MODEL)
111
- model = Wav2Vec2ForCTC.from_pretrained(W2V_MODEL).eval().to("cuda")
112
-
113
- whisper = pipeline(
114
- "automatic-speech-recognition",
115
- model=WHISPER_MODEL,
116
- device=0,
117
- torch_dtype=torch.float16,
118
- )
119
 
 
120
  audio, sr = librosa.load(audio_path, sr=16000, mono=True)
121
  audio = audio.astype("float32")
122
 
123
- outputs = []
124
 
125
  for chunk in chunk_audio(audio, sr):
126
  inputs = processor(
@@ -131,51 +79,40 @@ def transcribe_3min(audio_path):
131
  )
132
 
133
  with torch.no_grad():
134
- logits = model(inputs.input_values.to("cuda")).logits
 
 
135
 
136
  pred_ids = torch.argmax(logits, dim=-1)
137
- w2v_text = clean_ctc(processor.batch_decode(pred_ids)[0])
 
138
 
139
- # -------- STRUCTURAL GATING --------
140
- if structurally_invalid(w2v_text):
141
- whisper_out = whisper(chunk, chunk_length_s=20)
142
- text = whisper_out["text"].strip()
143
- else:
144
- text = w2v_text
145
 
146
- # -------- MERGE WITH DEDUP --------
147
- if outputs:
148
- outputs[-1] = merge_chunks(outputs[-1], text)
149
- else:
150
- outputs.append(text)
151
-
152
- final = finalize_text(" ".join(outputs))
153
-
154
- # Cleanup
155
  del model
156
  del processor
157
- del whisper
158
  gc.collect()
159
  torch.cuda.empty_cache()
160
 
161
- return final
162
 
163
- # ============================================================
164
  # UI
165
- # ============================================================
166
# Gradio front end: three header lines, an audio upload, a trigger button and
# a text box that receives the transcript returned by transcribe_3min.
with gr.Blocks() as demo:
    for banner in (
        "# 🇮🇸 Íslenskt ASR – Stable Hybrid (Fixed)",
        "**wav2vec2 multi-aug · structural gating · Whisper repair**",
        "**Hafa samband:** pallinr1@protonmail.com",
    ):
        gr.Markdown(banner)

    audio = gr.Audio(type="filepath", label="Hlaðið upp .wav / .mp3")
    btn = gr.Button("Transcribe", variant="primary", size="lg")
    out = gr.Textbox(lines=26, label="Útskrift")

    # Button press: uploaded file path in, transcript text out.
    btn.click(fn=transcribe_3min, inputs=audio, outputs=out)

demo.launch(share=True, server_name="0.0.0.0", server_port=7860)
 
1
+ # app.py — wav2vec2 multi-aug (stable + high quality)
 
2
 
3
  import os
4
  os.environ["OMP_NUM_THREADS"] = "1"
 
10
  import gc
11
  import re
12
  import librosa
 
13
 
14
  from transformers import (
15
  Wav2Vec2Processor,
16
+ Wav2Vec2ForCTC
 
17
  )
18
 
19
+ #MODEL_ID = "palli23/wav2vec2-icelandic-multi-aug-v2-5e-6"
20
+ MODEL_ID = "palli23/wav2vec2-icelandic-clean"
21
+ # MODEL_ID = "palli23/wav2vec2-xlsr-300m-icelandic"
 
 
22
 
23
+ # ——————————————————————————————
24
+ # Strong Icelandic cleanup
25
+ # ——————————————————————————————
26
def clean_text(text: str) -> str:
    """Tidy a decoded transcript for display.

    Steps, in order: lower-case the text; shorten any character repeated
    four or more times in a row to two; collapse every whitespace run to a
    single space; remove the space before ",", ".", "?" and "!"; trim both
    ends.
    """
    cleaned = text.lower()

    # CTC artifacts: 4+ identical characters in a row become two.
    cleaned = re.sub(r"(.)\1{3,}", r"\1\1", cleaned)

    # Whitespace is normalized first so the fixed " ," style matches below
    # see exactly one space.
    cleaned = re.sub(r"\s+", " ", cleaned)

    for spaced, tight in ((" ,", ","), (" .", "."), (" ?", "?"), (" !", "!")):
        cleaned = cleaned.replace(spaced, tight)

    return cleaned.strip()
40
 
41
+ # ——————————————————————————————
42
+ # Chunking helper (overlap improves WER)
43
+ # ——————————————————————————————
44
def chunk_audio(audio, sr, chunk_s=20, overlap_s=3):
    """Split audio into overlapping windows for chunk-wise transcription.

    Args:
        audio: Sliceable sequence of samples that supports ``len()``.
        sr: Sample rate, in samples per second.
        chunk_s: Length of each window in seconds.
        overlap_s: Seconds of audio repeated between consecutive windows.

    Yields:
        Consecutive slices of ``audio``, each at most ``chunk_s * sr``
        samples long and starting ``(chunk_s - overlap_s) * sr`` samples
        after the previous one. A window shorter than one second ends the
        iteration without being yielded, and iteration also ends right after
        the window that reaches the end of ``audio``; otherwise the next
        window would contain only audio already covered and its text would
        be duplicated in the joined transcript.

    Raises:
        ValueError: On first iteration, if the step between windows is not
            at least one sample (for example ``overlap_s >= chunk_s``).
    """
    chunk_len = int(chunk_s * sr)
    step_len = int((chunk_s - overlap_s) * sr)
    if step_len <= 0:
        raise ValueError(
            "chunk_s must exceed overlap_s by at least one sample "
            f"(chunk_s={chunk_s}, overlap_s={overlap_s}, sr={sr})"
        )

    total = len(audio)
    for start in range(0, total, step_len):
        chunk = audio[start:start + chunk_len]
        if len(chunk) < sr:  # under one second: too short
            break
        yield chunk
        if start + chunk_len >= total:
            # The tail has been covered; stop before emitting a window that
            # lies entirely inside this one.
            break
54
 
55
+ # ——————————————————————————————
56
+ # ZeroGPU worker
57
+ # ——————————————————————————————
58
  @spaces.GPU(duration=180)
59
  def transcribe_3min(audio_path):
60
  if not audio_path:
61
  return "Hlaðið upp hljóðskrá"
62
 
63
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
64
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
65
+ model.eval().to("cuda")
 
 
 
 
 
 
 
66
 
67
+ # Load audio (float32 enforced)
68
  audio, sr = librosa.load(audio_path, sr=16000, mono=True)
69
  audio = audio.astype("float32")
70
 
71
+ texts = []
72
 
73
  for chunk in chunk_audio(audio, sr):
74
  inputs = processor(
 
79
  )
80
 
81
  with torch.no_grad():
82
+ logits = model(
83
+ inputs.input_values.to("cuda")
84
+ ).logits
85
 
86
  pred_ids = torch.argmax(logits, dim=-1)
87
+ text = processor.batch_decode(pred_ids)[0]
88
+ texts.append(text)
89
 
90
+ final_text = clean_text(" ".join(texts))
 
 
 
 
 
91
 
92
+ # Cleanup (critical)
 
 
 
 
 
 
 
 
93
  del model
94
  del processor
 
95
  gc.collect()
96
  torch.cuda.empty_cache()
97
 
98
+ return final_text
99
 
100
+ # ——————————————————————————————
101
  # UI
102
+ # ——————————————————————————————
103
# Gradio front end: three header lines, an audio upload, a trigger button and
# a text box that receives the transcript returned by transcribe_3min.
with gr.Blocks() as demo:
    for banner in (
        "# Íslenskt ASR – wav2vec2 (multi-aug)",
        "**stöðugt · chunked · post-processed**",
        "**Hafa samband:** pallinr1@protonmail.com",
    ):
        gr.Markdown(banner)

    audio_in = gr.Audio(type="filepath", label="Hlaðið upp .mp3 / .wav")
    btn = gr.Button("Transcribe", variant="primary", size="lg")
    output = gr.Textbox(lines=20, label="Útskrift")

    # Button press: uploaded file path in, transcript text out.
    btn.click(transcribe_3min, audio_in, output)

demo.launch(share=True, server_name="0.0.0.0", server_port=7860)