Spaces:

Badnyal
/

asr

Sleeping

App Files Community

Badnyal committed on Jan 25

Commit

3bccac2

verified ·

1 Parent(s): 456fd4d

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -55

app.py CHANGED Viewed

@@ -2,34 +2,56 @@ import gradio as gr
 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import torch
 import librosa
-import os
-# Load model and processor
-model_name = "MWirelabs/garo-asr"
-token = os.getenv("HF_TOKEN")
-processor = WhisperProcessor.from_pretrained(model_name, use_auth_token=token)
-model = WhisperForConditionalGeneration.from_pretrained(model_name, use_auth_token=token)
-# Move to GPU if available
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
 def transcribe_audio(audio_path):
-    """Transcribe Garo audio and clean output artifacts"""
     try:
-        # With type="filepath", audio_path will be a string path to a temporary file
-        if audio_path is None:
-            return "Please upload or record audio first."
-        # librosa.load is robust: it handles various formats and
-        # automatically resamples to 16000Hz as required by Whisper.
         audio, sr = librosa.load(audio_path, sr=16000)
-        # Process audio
-        inputs = processor(audio, sampling_rate=16000, return_tensors="pt")
         input_features = inputs.input_features.to(device)
-        # Generate transcription
         with torch.no_grad():
             generated_ids = model.generate(
                 input_features,
@@ -38,48 +60,40 @@ def transcribe_audio(audio_path):
                 repetition_penalty=1.5,
                 no_repeat_ngram_size=3
             )
-        transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
-        # 1. Clean up annotation artifacts (brackets and dashes)
-        transcription = transcription.replace('{', '').replace('}', '').replace('--', '').strip()
-        # 2. Remove immediate repeated words (e.g., "bano bano" -> "bano")
-        words = transcription.split()
-        cleaned_words = []
-        for i, word in enumerate(words):
-            if i == 0 or word != words[i-1]:
-                cleaned_words.append(word)
-        transcription = ' '.join(cleaned_words)
-        return transcription
     except Exception as e:
-        import traceback
-        return f"Error: {str(e)}\n\nFull trace:\n{traceback.format_exc()}"
-# Create Gradio interface
 demo = gr.Interface(
     fn=transcribe_audio,
-    # Setting type="filepath" is the key fix for mobile microphone issues
-    inputs=gr.Audio(sources=["upload"], type="filepath", label="Upload or Record Garo Audio"),
-    outputs=gr.Textbox(label="Transcription", placeholder="Garo text will appear here..."),
-    title="Garo ASR - Automatic Speech Recognition",
     description="""
-    ## First-ever ASR model for Garo language
-    This model is fine-tuned from Whisper-small on the Vaani dataset, achieving **9.74% WER** on Garo speech recognition.
-    **Instructions for Mobile:**
-    - If the microphone fails to start, try opening the **Direct URL** of the Space (found under "Embed this Space").
-    - Use Chrome or Safari and ensure you have granted microphone permissions.
     """,
-    article="""
-    ### About
-    Garo is a Tibeto-Burman language spoken in Meghalaya, India. Built by [MWire Labs](https://huggingface.co/MWirelabs).
-    """
 )
 if __name__ == "__main__":
-    # Launching on 0.0.0.0 is necessary for Docker/HF Spaces
-    demo.launch(server_name="0.0.0.0", server_port=7860)

 from transformers import WhisperForConditionalGeneration, WhisperProcessor
 import torch
 import librosa
+import re
+# =========================
+# CONFIG
+# =========================
+MODEL_NAME = "Badnyal/wancho-asr"
+LANG_LABEL = "Wancho"
+# Load processor & model (NO TOKEN)
+processor = WhisperProcessor.from_pretrained(MODEL_NAME)
+model = WhisperForConditionalGeneration.from_pretrained(MODEL_NAME)
 device = "cuda" if torch.cuda.is_available() else "cpu"
 model.to(device)
+# =========================
+# CLEANING
+# =========================
+def clean_transcription(text: str) -> str:
+    # remove {}, <>, [], dashes
+    text = re.sub(r"[{}\[\]<>]", "", text)
+    text = text.replace("--", " ")
+    # remove immediate repetitions
+    words = text.split()
+    cleaned = []
+    for i, w in enumerate(words):
+        if i == 0 or w != words[i - 1]:
+            cleaned.append(w)
+    return " ".join(cleaned).strip()
+# =========================
+# ASR
+# =========================
 def transcribe_audio(audio_path):
+    if audio_path is None:
+        return "Please upload or record audio."
     try:
         audio, sr = librosa.load(audio_path, sr=16000)
+        inputs = processor(
+            audio,
+            sampling_rate=16000,
+            return_tensors="pt"
+        )
         input_features = inputs.input_features.to(device)
         with torch.no_grad():
             generated_ids = model.generate(
                 input_features,
                 repetition_penalty=1.5,
                 no_repeat_ngram_size=3
             )
+        text = processor.batch_decode(
+            generated_ids,
+            skip_special_tokens=True
+        )[0]
+        return clean_transcription(text)
     except Exception as e:
+        return f"Error: {str(e)}"
+# =========================
+# UI
+# =========================
 demo = gr.Interface(
     fn=transcribe_audio,
+    inputs=gr.Audio(
+        sources=["upload"],
+        type="filepath",
+        label=f"Upload or Record {LANG_LABEL} Audio"
+    ),
+    outputs=gr.Textbox(
+        label="Transcription",
+        placeholder=f"{LANG_LABEL} text will appear here..."
+    ),
+    title=f"{LANG_LABEL} ASR – Speech to Text",
     description="""
+    Open Whisper-based ASR model.
+    • No auth token required
+    • Cleaned transcripts
+    • GPU auto-detect
     """,
+    article="Built by MWire Labs"
 )
 if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0", server_port=7860)