Spaces:
Running
Running
Abid Ali Awan committed on
Commit ·
087adaa
1
Parent(s): 182bd23
Refactor app.py to optimize CPU performance, update model loading to use fp32 and quantization, and enhance the transcription function with improved audio processing and error handling.
Browse files
app.py
CHANGED
|
@@ -1,67 +1,88 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import spaces
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
-
from transformers import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
logging.set_verbosity_error()
|
| 8 |
|
| 9 |
-
# Model
|
| 10 |
-
device = "
|
| 11 |
-
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 12 |
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
|
| 13 |
|
| 14 |
-
#
|
| 15 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 16 |
-
model_id,
|
| 17 |
-
torch_dtype=
|
| 18 |
-
use_safetensors=True
|
| 19 |
-
)
|
| 20 |
-
model.
|
|
|
|
| 21 |
|
| 22 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 23 |
|
| 24 |
-
#
|
| 25 |
transcriber = pipeline(
|
| 26 |
-
"automatic-speech-recognition",
|
| 27 |
model=model,
|
| 28 |
tokenizer=processor.tokenizer,
|
| 29 |
feature_extractor=processor.feature_extractor,
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
)
|
| 33 |
|
| 34 |
-
|
| 35 |
def transcribe(audio):
|
| 36 |
if audio is None:
|
| 37 |
return "No audio provided. Please record or upload an audio file."
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
# Transcribe using the pipeline
|
| 54 |
result = transcriber({"sampling_rate": sr, "raw": y})
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
demo = gr.Interface(
|
| 66 |
fn=transcribe,
|
| 67 |
inputs=gr.Audio(
|
|
|
|
| 1 |
+
import os
|
| 2 |
import gradio as gr
|
|
|
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
+
from transformers import (
|
| 6 |
+
AutoModelForSpeechSeq2Seq,
|
| 7 |
+
AutoProcessor,
|
| 8 |
+
pipeline,
|
| 9 |
+
logging,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# ββ CPU performance tweaks ββ
|
| 13 |
+
os.environ["OMP_NUM_THREADS"] = "4"
|
| 14 |
+
os.environ["MKL_NUM_THREADS"] = "4"
|
| 15 |
+
torch.set_num_threads(4)
|
| 16 |
|
| 17 |
logging.set_verbosity_error()
|
| 18 |
|
| 19 |
+
# ββ Model & device setup ββ
|
| 20 |
+
device = "cpu"
|
|
|
|
| 21 |
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
|
| 22 |
|
| 23 |
+
# Load in fp32 and quantize to int8
|
| 24 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 25 |
+
model_id,
|
| 26 |
+
torch_dtype=torch.float32,
|
| 27 |
+
use_safetensors=True,
|
| 28 |
+
)
|
| 29 |
+
model.eval()
|
| 30 |
+
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
|
| 31 |
|
| 32 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 33 |
|
| 34 |
+
# Build a CPU-based pipeline with chunking
|
| 35 |
transcriber = pipeline(
|
| 36 |
+
task="automatic-speech-recognition",
|
| 37 |
model=model,
|
| 38 |
tokenizer=processor.tokenizer,
|
| 39 |
feature_extractor=processor.feature_extractor,
|
| 40 |
+
device=-1, # CPU
|
| 41 |
+
chunk_length_s=30,
|
| 42 |
+
stride_length_s=(5, 5),
|
| 43 |
)
|
| 44 |
|
| 45 |
+
|
| 46 |
def transcribe(audio):
|
| 47 |
if audio is None:
|
| 48 |
return "No audio provided. Please record or upload an audio file."
|
| 49 |
+
|
| 50 |
+
sr, y = audio
|
| 51 |
+
# mono & normalize
|
| 52 |
+
if y.ndim > 1:
|
| 53 |
+
y = y.mean(axis=1)
|
| 54 |
+
y = y.astype(np.float32)
|
| 55 |
+
peak = np.max(np.abs(y))
|
| 56 |
+
if peak > 0:
|
| 57 |
+
y /= peak
|
| 58 |
+
else:
|
| 59 |
+
return "Audio appears to be silent. Please try again."
|
| 60 |
+
|
| 61 |
+
# Inference under no_grad
|
| 62 |
+
with torch.no_grad():
|
|
|
|
|
|
|
| 63 |
result = transcriber({"sampling_rate": sr, "raw": y})
|
| 64 |
+
return result.get("text", "")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Gradio UI ββ
|
| 68 |
+
description = """
|
| 69 |
+
<p style='text-align: center'>
|
| 70 |
+
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
|
| 71 |
+
</p>
|
| 72 |
+
"""
|
| 73 |
+
examples = [
|
| 74 |
+
["samples/audio1.mp3"],
|
| 75 |
+
["samples/audio2.mp3"],
|
| 76 |
+
["samples/audio3.mp3"],
|
| 77 |
+
]
|
| 78 |
+
article = """
|
| 79 |
+
<p style='text-align: center; color: #34C759;'>
|
| 80 |
+
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
|
| 81 |
+
πΏ Explore the project on GitHub π
|
| 82 |
+
</a>
|
| 83 |
+
</p>
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
demo = gr.Interface(
|
| 87 |
fn=transcribe,
|
| 88 |
inputs=gr.Audio(
|