Badnyal committed on
Commit
3bccac2
·
verified ·
1 Parent(s): 456fd4d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +69 -55
app.py CHANGED
@@ -2,34 +2,56 @@ import gradio as gr
2
  from transformers import WhisperForConditionalGeneration, WhisperProcessor
3
  import torch
4
  import librosa
5
- import os
6
 
7
- # Load model and processor
8
- model_name = "MWirelabs/garo-asr"
9
- token = os.getenv("HF_TOKEN")
10
- processor = WhisperProcessor.from_pretrained(model_name, use_auth_token=token)
11
- model = WhisperForConditionalGeneration.from_pretrained(model_name, use_auth_token=token)
 
 
 
 
12
 
13
- # Move to GPU if available
14
  device = "cuda" if torch.cuda.is_available() else "cpu"
15
  model.to(device)
16
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
17
  def transcribe_audio(audio_path):
18
- """Transcribe Garo audio and clean output artifacts"""
 
 
19
  try:
20
- # With type="filepath", audio_path will be a string path to a temporary file
21
- if audio_path is None:
22
- return "Please upload or record audio first."
23
-
24
- # librosa.load is robust: it handles various formats and
25
- # automatically resamples to 16000Hz as required by Whisper.
26
  audio, sr = librosa.load(audio_path, sr=16000)
27
-
28
- # Process audio
29
- inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
 
 
 
 
30
  input_features = inputs.input_features.to(device)
31
-
32
- # Generate transcription
33
  with torch.no_grad():
34
  generated_ids = model.generate(
35
  input_features,
@@ -38,48 +60,40 @@ def transcribe_audio(audio_path):
38
  repetition_penalty=1.5,
39
  no_repeat_ngram_size=3
40
  )
41
-
42
- transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
43
-
44
- # 1. Clean up annotation artifacts (brackets and dashes)
45
- transcription = transcription.replace('{', '').replace('}', '').replace('--', '').strip()
46
-
47
- # 2. Remove immediate repeated words (e.g., "bano bano" -> "bano")
48
- words = transcription.split()
49
- cleaned_words = []
50
- for i, word in enumerate(words):
51
- if i == 0 or word != words[i-1]:
52
- cleaned_words.append(word)
53
- transcription = ' '.join(cleaned_words)
54
-
55
- return transcription
56
-
57
  except Exception as e:
58
- import traceback
59
- return f"Error: {str(e)}\n\nFull trace:\n{traceback.format_exc()}"
60
 
61
- # Create Gradio interface
 
 
62
  demo = gr.Interface(
63
  fn=transcribe_audio,
64
- # Setting type="filepath" is the key fix for mobile microphone issues
65
- inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload or Record Garo Audio"),
66
- outputs=gr.Textbox(label="Transcription", placeholder="Garo text will appear here..."),
67
- title="Garo ASR - Automatic Speech Recognition",
 
 
 
 
 
 
68
  description="""
69
- ## First-ever ASR model for Garo language
70
-
71
- This model is fine-tuned from Whisper-small on the Vaani dataset, achieving **9.74% WER** on Garo speech recognition.
72
-
73
- **Instructions for Mobile:**
74
- - If the microphone fails to start, try opening the **Direct URL** of the Space (found under "Embed this Space").
75
- - Use Chrome or Safari and ensure you have granted microphone permissions.
76
  """,
77
- article="""
78
- ### About
79
- Garo is a Tibeto-Burman language spoken in Meghalaya, India. Built by [MWire Labs](https://huggingface.co/MWirelabs).
80
- """
81
  )
82
 
83
  if __name__ == "__main__":
84
- # Launching on 0.0.0.0 is necessary for Docker/HF Spaces
85
- demo.launch(server_name="0.0.0.0", server_port=7860)
 
2
  from transformers import WhisperForConditionalGeneration, WhisperProcessor
3
  import torch
4
  import librosa
5
+ import re
6
 
7
+ # =========================
8
+ # CONFIG
9
+ # =========================
10
+ MODEL_NAME = "Badnyal/wancho-asr"
11
+ LANG_LABEL = "Wancho"
12
+
13
+ # Load processor & model (NO TOKEN)
14
+ processor = WhisperProcessor.from_pretrained(MODEL_NAME)
15
+ model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
16
 
 
17
  device = "cuda" if torch.cuda.is_available() else "cpu"
18
  model.to(device)
19
 
20
+ # =========================
21
+ # CLEANING
22
+ # =========================
23
def clean_transcription(text: str) -> str:
    """Strip annotation artifacts from a decoded transcript.

    Deletes curly, square and angle bracket characters (their contents are
    kept), turns every run of two or more hyphens into a word break,
    collapses whitespace, and drops any word that is identical to the word
    immediately before it.

    Args:
        text: Raw transcript text.

    Returns:
        The cleaned transcript, with words separated by single spaces and
        no leading or trailing whitespace.
    """
    # Bracket characters are removed outright.
    text = re.sub(r"[{}\[\]<>]", "", text)
    # Match the whole hyphen run: replacing only the literal "--" would turn
    # "---" into " -" and leave a stray "-" token in the output. Single
    # hyphens (e.g. inside hyphenated words) are left untouched.
    text = re.sub(r"-{2,}", " ", text)

    words = text.split()
    # Keep a word only when it differs from its predecessor. The comparison
    # is exact (case- and punctuation-sensitive); the None sentinel makes the
    # first word always survive.
    cleaned = [w for prev, w in zip([None, *words], words) if w != prev]

    # split() + join() already normalizes whitespace, so no strip is needed.
    return " ".join(cleaned)
36
+
37
+ # =========================
38
+ # ASR
39
+ # =========================
40
  def transcribe_audio(audio_path):
41
+ if audio_path is None:
42
+ return "Please upload or record audio."
43
+
44
  try:
 
 
 
 
 
 
45
  audio, sr = librosa.load(audio_path, sr=16000)
46
+
47
+ inputs = processor(
48
+ audio,
49
+ sampling_rate=16000,
50
+ return_tensors="pt"
51
+ )
52
+
53
  input_features = inputs.input_features.to(device)
54
+
 
55
  with torch.no_grad():
56
  generated_ids = model.generate(
57
  input_features,
 
60
  repetition_penalty=1.5,
61
  no_repeat_ngram_size=3
62
  )
63
+
64
+ text = processor.batch_decode(
65
+ generated_ids,
66
+ skip_special_tokens=True
67
+ )[0]
68
+
69
+ return clean_transcription(text)
70
+
 
 
 
 
 
 
 
 
71
  except Exception as e:
72
+ return f"Error: {str(e)}"
 
73
 
74
+ # =========================
75
+ # UI
76
+ # =========================
77
  demo = gr.Interface(
78
  fn=transcribe_audio,
79
+ inputs=gr.Audio(
80
+ sources=["upload"],
81
+ type="filepath",
82
+ label=f"Upload or Record {LANG_LABEL} Audio"
83
+ ),
84
+ outputs=gr.Textbox(
85
+ label="Transcription",
86
+ placeholder=f"{LANG_LABEL} text will appear here..."
87
+ ),
88
+ title=f"{LANG_LABEL} ASR – Speech to Text",
89
  description="""
90
+ Open Whisper-based ASR model.
91
+ • No auth token required
92
+ Cleaned transcripts
93
+ • GPU auto-detect
 
 
 
94
  """,
95
+ article="Built by MWire Labs"
 
 
 
96
  )
97
 
98
  if __name__ == "__main__":
99
+ demo.launch(server_name="0.0.0.0", server_port=7860)