palli23 committed on
Commit
f00cb9e
·
verified ·
1 Parent(s): 8e34a29

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +77 -32
app.py CHANGED
@@ -1,30 +1,56 @@
1
- # app.py — wav2vec2 CTC ASR with 10-best decoding + cleanup
2
 
3
  import os
4
  os.environ["OMP_NUM_THREADS"] = "1"
5
- os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:128"
6
 
7
  import gradio as gr
8
  import spaces
9
  import torch
10
  import gc
11
  import re
 
12
 
13
  from transformers import (
14
  Wav2Vec2Processor,
15
  Wav2Vec2ForCTC
16
  )
17
- import librosa
 
 
18
 
19
  # ——————————————————————————————
20
- # Simple Icelandic post-processing
21
  # ——————————————————————————————
22
  def clean_text(text: str) -> str:
23
  text = text.lower()
 
 
 
 
 
24
  text = re.sub(r"\s+", " ", text)
 
 
25
  text = text.replace(" ,", ",").replace(" .", ".")
 
 
26
  return text.strip()
27
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  # ——————————————————————————————
29
  # ZeroGPU worker
30
  # ——————————————————————————————
@@ -33,40 +59,59 @@ def transcribe_3min(audio_path):
33
  if not audio_path:
34
  return "Hlaðið upp hljóðskrá"
35
 
36
- processor = Wav2Vec2Processor.from_pretrained(
37
- #"palli23/wav2vec2-xlsr-300m-icelandic"
38
- "palli23/wav2vec2-icelandic-multi-aug-v2-5e-6"
39
- )
40
 
41
- model = Wav2Vec2ForCTC.from_pretrained(
42
- #"palli23/wav2vec2-xlsr-300m-icelandic"
43
- "palli23/wav2vec2-icelandic-multi-aug-v2-5e-6"
44
- ).to("cuda")
45
 
46
- # Load audio
47
- audio, sr = librosa.load(audio_path, sr=16000)
48
 
49
- inputs = processor(
50
- audio,
51
- sampling_rate=16000,
52
- return_tensors="pt",
53
- padding=True
54
- )
 
55
 
56
- with torch.no_grad():
57
- logits = model(inputs.input_values.to("cuda")).logits
 
 
58
 
59
- # ——— CTC beam search (10 hypotheses) ———
60
- beams = processor.decode(
61
- logits[0].cpu().numpy(),
62
- num_beams=10,
63
- output_word_offsets=False
64
- )
65
 
66
- # Pick best + clean
67
- best = clean_text(beams[0]["text"])
68
 
69
- # Cleanup memory
70
  del model
71
  del processor
72
- del logits
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py — wav2vec2 multi-aug (stable + high quality)
2
 
3
  import os
4
  os.environ["OMP_NUM_THREADS"] = "1"
5
+ os.environ["PYTORCH_ALLOC_CONF"] = "max_split_size_mb:128"
6
 
7
  import gradio as gr
8
  import spaces
9
  import torch
10
  import gc
11
  import re
12
+ import librosa
13
 
14
  from transformers import (
15
  Wav2Vec2Processor,
16
  Wav2Vec2ForCTC
17
  )
18
+
19
+ MODEL_ID = "palli23/wav2vec2-icelandic-multi-aug-v2-5e-6"
20
+ # MODEL_ID = "palli23/wav2vec2-xlsr-300m-icelandic"
21
 
22
  # ——————————————————————————————
23
+ # Strong Icelandic cleanup
24
  # ——————————————————————————————
25
  def clean_text(text: str) -> str:
26
  text = text.lower()
27
+
28
+ # collapse repeats (ctc artifacts)
29
+ text = re.sub(r"(.)\1{3,}", r"\1\1", text)
30
+
31
+ # spacing
32
  text = re.sub(r"\s+", " ", text)
33
+
34
+ # punctuation spacing
35
  text = text.replace(" ,", ",").replace(" .", ".")
36
+ text = text.replace(" ?", "?").replace(" !", "!")
37
+
38
  return text.strip()
39
 
40
+ # ——————————————————————————————
41
+ # Chunking helper (overlap improves WER)
42
+ # ——————————————————————————————
43
def chunk_audio(audio, sr, chunk_s=20, overlap_s=3):
    """Yield overlapping fixed-length windows over a sample sequence.

    Args:
        audio: Sliceable sequence of samples that supports ``len()``.
        sr: Sample rate, in samples per second.
        chunk_s: Window length in seconds.
        overlap_s: Seconds shared by two consecutive windows.

    Yields:
        Slices of ``audio`` holding at most ``chunk_s * sr`` samples. Window
        starts advance by ``(chunk_s - overlap_s) * sr`` samples.

    Raises:
        ValueError: If the hop between windows is not positive, i.e.
            ``overlap_s`` is not smaller than ``chunk_s``. Raised on first
            iteration, since this is a generator.
    """
    chunk_len = int(chunk_s * sr)
    step_len = int((chunk_s - overlap_s) * sr)
    if step_len <= 0:
        raise ValueError("overlap_s must be smaller than chunk_s")

    total = len(audio)
    for start in range(0, total, step_len):
        chunk = audio[start:start + chunk_len]
        if len(chunk) < sr:  # too short (under one second of samples)
            break
        yield chunk
        # This window already reached the end of the audio; any further
        # window would lie entirely inside it and only duplicate the tail.
        if start + chunk_len >= total:
            break
53
+
54
  # ——————————————————————————————
55
  # ZeroGPU worker
56
  # ——————————————————————————————
 
59
  if not audio_path:
60
  return "Hlaðið upp hljóðskrá"
61
 
62
+ processor = Wav2Vec2Processor.from_pretrained(MODEL_ID)
63
+ model = Wav2Vec2ForCTC.from_pretrained(MODEL_ID)
64
+ model.eval().to("cuda")
 
65
 
66
+ # Load audio (float32 enforced)
67
+ audio, sr = librosa.load(audio_path, sr=16000, mono=True)
68
+ audio = audio.astype("float32")
 
69
 
70
+ texts = []
 
71
 
72
+ for chunk in chunk_audio(audio, sr):
73
+ inputs = processor(
74
+ chunk,
75
+ sampling_rate=16000,
76
+ return_tensors="pt",
77
+ padding=True
78
+ )
79
 
80
+ with torch.no_grad():
81
+ logits = model(
82
+ inputs.input_values.to("cuda")
83
+ ).logits
84
 
85
+ pred_ids = torch.argmax(logits, dim=-1)
86
+ text = processor.batch_decode(pred_ids)[0]
87
+ texts.append(text)
 
 
 
88
 
89
+ final_text = clean_text(" ".join(texts))
 
90
 
91
+ # Cleanup (critical)
92
  del model
93
  del processor
94
+ gc.collect()
95
+ torch.cuda.empty_cache()
96
+
97
+ return final_text
98
+
99
+ # ——————————————————————————————
100
+ # UI
101
+ # ——————————————————————————————
102
# UI: three Markdown banners, an audio upload, a trigger button and a
# text box; the button sends the uploaded file path to transcribe_3min.
with gr.Blocks() as demo:
    for banner in (
        "# Íslenskt ASR – wav2vec2 (multi-aug)",
        "**stöðugt · chunked · post-processed**",
        "**Hafa samband:** pallinr1@protonmail.com",
    ):
        gr.Markdown(banner)

    audio_in = gr.Audio(type="filepath", label="Hlaðið upp .mp3 / .wav")
    btn = gr.Button("Transcribe", variant="primary", size="lg")
    output = gr.Textbox(lines=20, label="Útskrift")

    btn.click(fn=transcribe_3min, inputs=audio_in, outputs=output)

# Launch settings are collected in one place before starting the server.
launch_options = {
    "share": True,
    "server_name": "0.0.0.0",
    "server_port": 7860,
}
demo.launch(**launch_options)