Spaces:
Running
Running
Abid Ali Awan committed on
Commit ·
087adaa
1
Parent(s): 182bd23
Refactor app.py to optimize CPU performance, update model loading to use fp32 and quantization, and enhance the transcription function with improved audio processing and error handling.
Browse files
app.py
CHANGED
|
@@ -1,67 +1,88 @@
|
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
-
import spaces
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
-
from transformers import
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 6 |
|
| 7 |
logging.set_verbosity_error()
|
| 8 |
|
| 9 |
-
# Model
|
| 10 |
-
device = "
|
| 11 |
-
torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
|
| 12 |
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
|
| 13 |
|
| 14 |
-
#
|
| 15 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 16 |
-
model_id,
|
| 17 |
-
torch_dtype=
|
| 18 |
-
use_safetensors=True
|
| 19 |
-
)
|
| 20 |
-
model.
|
|
|
|
| 21 |
|
| 22 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 23 |
|
| 24 |
-
#
|
| 25 |
transcriber = pipeline(
|
| 26 |
-
"automatic-speech-recognition",
|
| 27 |
model=model,
|
| 28 |
tokenizer=processor.tokenizer,
|
| 29 |
feature_extractor=processor.feature_extractor,
|
| 30 |
-
|
| 31 |
-
|
|
|
|
| 32 |
)
|
| 33 |
|
| 34 |
-
|
| 35 |
def transcribe(audio):
|
| 36 |
if audio is None:
|
| 37 |
return "No audio provided. Please record or upload an audio file."
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
# Transcribe using the pipeline
|
| 54 |
result = transcriber({"sampling_rate": sr, "raw": y})
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
| 63 |
-
|
| 64 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
demo = gr.Interface(
|
| 66 |
fn=transcribe,
|
| 67 |
inputs=gr.Audio(
|
|
|
|
| 1 |
+
import os
|
| 2 |
import gradio as gr
|
|
|
|
| 3 |
import torch
|
| 4 |
import numpy as np
|
| 5 |
+
from transformers import (
|
| 6 |
+
AutoModelForSpeechSeq2Seq,
|
| 7 |
+
AutoProcessor,
|
| 8 |
+
pipeline,
|
| 9 |
+
logging,
|
| 10 |
+
)
|
| 11 |
+
|
| 12 |
+
# ββ CPU performance tweaks ββ
|
| 13 |
+
os.environ["OMP_NUM_THREADS"] = "4"
|
| 14 |
+
os.environ["MKL_NUM_THREADS"] = "4"
|
| 15 |
+
torch.set_num_threads(4)
|
| 16 |
|
| 17 |
logging.set_verbosity_error()
|
| 18 |
|
| 19 |
+
# ββ Model & device setup ββ
|
| 20 |
+
device = "cpu"
|
|
|
|
| 21 |
model_id = "kingabzpro/whisper-large-v3-turbo-urdu"
|
| 22 |
|
| 23 |
+
# Load in fp32 and quantize to int8
|
| 24 |
model = AutoModelForSpeechSeq2Seq.from_pretrained(
|
| 25 |
+
model_id,
|
| 26 |
+
torch_dtype=torch.float32,
|
| 27 |
+
use_safetensors=True,
|
| 28 |
+
)
|
| 29 |
+
model.eval()
|
| 30 |
+
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
|
| 31 |
|
| 32 |
processor = AutoProcessor.from_pretrained(model_id)
|
| 33 |
|
| 34 |
+
# Build a CPU-based pipeline with chunking
|
| 35 |
transcriber = pipeline(
|
| 36 |
+
task="automatic-speech-recognition",
|
| 37 |
model=model,
|
| 38 |
tokenizer=processor.tokenizer,
|
| 39 |
feature_extractor=processor.feature_extractor,
|
| 40 |
+
device=-1, # CPU
|
| 41 |
+
chunk_length_s=30,
|
| 42 |
+
stride_length_s=(5, 5),
|
| 43 |
)
|
| 44 |
|
| 45 |
+
|
| 46 |
def transcribe(audio):
|
| 47 |
if audio is None:
|
| 48 |
return "No audio provided. Please record or upload an audio file."
|
| 49 |
+
|
| 50 |
+
sr, y = audio
|
| 51 |
+
# mono & normalize
|
| 52 |
+
if y.ndim > 1:
|
| 53 |
+
y = y.mean(axis=1)
|
| 54 |
+
y = y.astype(np.float32)
|
| 55 |
+
peak = np.max(np.abs(y))
|
| 56 |
+
if peak > 0:
|
| 57 |
+
y /= peak
|
| 58 |
+
else:
|
| 59 |
+
return "Audio appears to be silent. Please try again."
|
| 60 |
+
|
| 61 |
+
# Inference under no_grad
|
| 62 |
+
with torch.no_grad():
|
|
|
|
|
|
|
| 63 |
result = transcriber({"sampling_rate": sr, "raw": y})
|
| 64 |
+
return result.get("text", "")
|
| 65 |
+
|
| 66 |
+
|
| 67 |
+
# ββ Gradio UI ββ
|
| 68 |
+
description = """
|
| 69 |
+
<p style='text-align: center'>
|
| 70 |
+
Record or upload audio in Urdu and get the transcribed text using the Whisper Large V3 Turbo Urdu model.
|
| 71 |
+
</p>
|
| 72 |
+
"""
|
| 73 |
+
examples = [
|
| 74 |
+
["samples/audio1.mp3"],
|
| 75 |
+
["samples/audio2.mp3"],
|
| 76 |
+
["samples/audio3.mp3"],
|
| 77 |
+
]
|
| 78 |
+
article = """
|
| 79 |
+
<p style='text-align: center; color: #34C759;'>
|
| 80 |
+
<a href='https://github.com/kingabzpro/simple-mlops-with-urdu-asr' target='_blank' style='text-decoration: none; color: #34C759;'>
|
| 81 |
+
πΏ Explore the project on GitHub π
|
| 82 |
+
</a>
|
| 83 |
+
</p>
|
| 84 |
+
"""
|
| 85 |
+
|
| 86 |
demo = gr.Interface(
|
| 87 |
fn=transcribe,
|
| 88 |
inputs=gr.Audio(
|