File size: 2,590 Bytes
087adaa
9f507de
 
68d0f03
 
9f507de
087adaa
 
 
 
9f507de
 
 
ca9beed
087adaa
 
 
 
 
68d0f03
182bd23
68d0f03
e8491b9
2098477
68d0f03
e8491b9
68d0f03
087adaa
 
 
4098191
 
 
68d0f03
087adaa
68d0f03
087adaa
68d0f03
 
 
087adaa
 
 
68d0f03
 
087adaa
68d0f03
 
 
087adaa
 
 
 
 
 
 
 
 
 
 
 
 
 
68d0f03
549eccc
 
 
 
 
 
 
 
 
087adaa
 
 
 
 
4d2aad4
087adaa
 
 
 
 
 
 
4d2aad4
087adaa
68d0f03
 
 
 
 
f3509ea
68d0f03
 
 
f3509ea
68d0f03
82a816c
f3509ea
 
 
a313388
68d0f03
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
import os
import warnings

import gradio as gr
import numpy as np
import torch
from transformers import (
    AutoModelForSpeechSeq2Seq,
    AutoProcessor,
    logging,
    pipeline,
)

# Silence noisy FutureWarning output emitted by transformers/torch.
warnings.simplefilter("ignore", FutureWarning)

# —— CPU performance tweaks ——
# Cap intra-op threading to 4 to avoid oversubscription on small CPU hosts.
# NOTE(review): the OMP/MKL env vars are set *after* `import torch`; OpenMP
# typically reads them at library initialization, so they may have no effect
# here — consider exporting them before the torch import. The
# torch.set_num_threads(4) call below does apply regardless.
os.environ["OMP_NUM_THREADS"] = "4"
os.environ["MKL_NUM_THREADS"] = "4"
torch.set_num_threads(4)

# Only surface errors from transformers (hides progress bars / info logs).
logging.set_verbosity_error()

# —— Model setup ——
# Fine-tuned Whisper-base checkpoint for Urdu ASR, pulled from the HF Hub.
model_id = "kingabzpro/whisper-base-urdu-full"

# Load and quantize to int8
model = AutoModelForSpeechSeq2Seq.from_pretrained(
    model_id,
    use_safetensors=True,
)
# Dynamic int8 quantization of every Linear layer: smaller and faster on CPU
# at a small accuracy cost. NOTE(review): torch.quantization is the legacy
# namespace; newer torch releases expose this as torch.ao.quantization —
# confirm against the pinned torch version.
model = torch.quantization.quantize_dynamic(model, {torch.nn.Linear}, dtype=torch.qint8)

processor = AutoProcessor.from_pretrained(model_id)

# Build a CPU-based pipeline with chunking: audio longer than Whisper's 30 s
# window is transcribed in overlapping chunks (5 s stride on each side) and
# the pieces are stitched back together by the pipeline.
transcriber = pipeline(
    task="automatic-speech-recognition",
    model=model,
    tokenizer=processor.tokenizer,
    feature_extractor=processor.feature_extractor,
    device=-1,  # CPU
    chunk_length_s=30,
    stride_length_s=(5, 5),
)


def transcribe(audio):
    """Transcribe an Urdu audio clip to text.

    Parameters
    ----------
    audio : tuple[int, np.ndarray] | None
        ``(sample_rate, samples)`` as produced by a numpy-typed Gradio
        Audio component, or ``None`` when nothing was recorded/uploaded.

    Returns
    -------
    str
        The transcription, terminated with an Urdu full stop ("Ϋ”"), or a
        user-facing message for missing or silent audio.
    """
    if audio is None:
        return "No audio provided. Please record or upload an audio file."

    sample_rate, samples = audio

    # Downmix multi-channel audio to mono by averaging the channels.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Peak-normalize into [-1, 1]; bail out on all-zero (silent) input.
    samples = samples.astype(np.float32)
    peak = np.max(np.abs(samples))
    if not peak > 0:
        return "Audio appears to be silent. Please try again."
    samples /= peak

    # Run the ASR pipeline without building autograd graphs.
    with torch.no_grad():
        output = transcriber({"sampling_rate": sample_rate, "raw": samples})

    text = output.get("text", "")
    if not text:
        return text

    # Ensure the transcript ends with the Urdu full stop "Ϋ”", replacing a
    # trailing Latin period when one is present.
    text = text.rstrip()
    if text.endswith("."):
        return text[:-1] + "Ϋ”"
    if text.endswith("Ϋ”"):
        return text
    return text + "Ϋ”"


# β€”β€” Gradio UI β€”β€”
# HTML blurb rendered under the app title.
description = """
<p style='text-align: center'>
Record or upload audio in Urdu and get the transcribed text using the Whisper Base Urdu model.
</p>
"""
# Clickable example clips; paths are relative to the app's working directory.
examples = [
    ["samples/audio1.mp3"],
    ["samples/audio2.mp3"],
    ["samples/audio3.mp3"],
]


# Single-function interface: microphone/upload audio in, Urdu text out.
demo = gr.Interface(
    fn=transcribe,
    inputs=gr.Audio(
        sources=["microphone", "upload"],
        type="numpy",  # delivers (sample_rate, np.ndarray) to `transcribe`
        label="Record or Upload Audio (Urdu)",
    ),
    outputs=gr.Textbox(
        label="Transcribed Text (Urdu)",
        placeholder="Transcribed Urdu text will appear here...",
    ),
    title="⚑Fast Urdu Speech Recognition",
    description=description,
    examples=examples,
    # NOTE(review): `allow_flagging` is deprecated in Gradio 5 in favor of
    # `flagging_mode` β€” confirm against the pinned gradio version.
    allow_flagging="never",
    theme=gr.themes.Soft(),
)

if __name__ == "__main__":
    demo.launch()