Spaces:
Sleeping
Sleeping
File size: 2,590 Bytes
087adaa 9f507de 68d0f03 9f507de 087adaa 9f507de ca9beed 087adaa 68d0f03 182bd23 68d0f03 e8491b9 2098477 68d0f03 e8491b9 68d0f03 087adaa 4098191 68d0f03 087adaa 68d0f03 087adaa 68d0f03 087adaa 68d0f03 087adaa 68d0f03 087adaa 68d0f03 549eccc 087adaa 4d2aad4 087adaa 4d2aad4 087adaa 68d0f03 f3509ea 68d0f03 f3509ea 68d0f03 82a816c f3509ea a313388 68d0f03 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 |
import os
import warnings
import gradio as gr
import numpy as np
import torch
from transformers import (
AutoModelForSpeechSeq2Seq,
AutoProcessor,
logging,
pipeline,
)
# Suppress FutureWarning noise (e.g. from transformers/torch deprecations).
warnings.simplefilter("ignore", FutureWarning)

# ── CPU performance tweaks ──
# Cap BLAS/OpenMP and torch intra-op threads so the shared Space CPU
# is not oversubscribed; 4 threads matches the typical Space allocation.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)

# Only surface transformers errors (hide info/progress logs).
logging.set_verbosity_error()
# ── Model setup ──
model_id = "kingabzpro/whisper-base-urdu-full"

# Load the Whisper seq2seq model and quantize it to int8: dynamic
# quantization of the Linear layers shrinks the model and speeds up
# CPU-only inference.
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
)
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)
processor = AutoProcessor.from_pretrained(model_id)

# Build a CPU-based pipeline with chunking: long recordings are processed
# in 30 s windows with a 5 s overlap on each side so words are not cut
# at chunk boundaries.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # -1 = run on CPU
    chunk_length_s=30,
    stride_length_s=(5, 5),
)
def transcribe(audio):
    """Transcribe a Gradio audio input to Urdu text.

    Args:
        audio: ``(sample_rate, samples)`` tuple as produced by
            ``gr.Audio(type="numpy")``, or ``None`` when nothing was
            recorded/uploaded.

    Returns:
        The transcribed Urdu text terminated with the Urdu full stop
        "۔", or a user-facing error message for missing/silent audio.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sr, y = audio

    # Downmix to mono and peak-normalize to [-1, 1]; Whisper expects float32.
    if y.ndim > 1:
        y = y.mean(axis=1)
    y = y.astype(np.float32)
    # Guard against an empty buffer: np.max on a zero-length array raises.
    peak = np.max(np.abs(y)) if y.size else 0.0
    if peak > 0:
        y /= peak
    else:
        return "Audio appears to be silent. Please try again."

    # Run inference without autograd bookkeeping (saves memory/time on CPU).
    with torch.no_grad():
        result = transcriber({"sampling_rate": sr, "raw": y})
    text = result.get("text", "")

    # Normalize terminal punctuation: end with the Urdu full stop "۔"
    # (replace a trailing Western "." if present; was mojibake "Ϋ" before).
    if text:
        text = text.rstrip()
        if text.endswith("."):
            text = text[:-1] + "۔"
        elif not text.endswith("۔"):
            text = text + "۔"
    return text
# ── Gradio UI ──
# HTML blurb rendered beneath the app title.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Base Urdu model.
</p>
"""

# Bundled sample clips offered as one-click examples in the interface.
examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]
# Wire the transcriber into a simple record/upload → text interface.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",  # delivers (sample_rate, np.ndarray) to transcribe()
        label="Record or Upload Audio (Urdu)",
    ),
    outputs=gr.Textbox(
        label="Transcribed Text (Urdu)",
        placeholder="Transcribed Urdu text will appear here...",
    ),
    # Restored mojibake "β‘" → "⚡" in the title.
    title="⚡Fast Urdu Speech Recognition",
    description=description,
    examples=examples,
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()
|