File size: 3,968 Bytes
bbbf3e8
 
 
 
 
6da03f5
bbbf3e8
 
6da03f5
bbbf3e8
30b7049
6da03f5
 
bbbf3e8
 
6da03f5
bbbf3e8
30b7049
bbbf3e8
30b7049
 
bbbf3e8
 
30b7049
 
 
bbbf3e8
30b7049
bbbf3e8
 
6da03f5
bbbf3e8
30b7049
bbbf3e8
6da03f5
 
 
 
 
 
 
 
 
 
 
bbbf3e8
6da03f5
 
bbbf3e8
6da03f5
bbbf3e8
 
 
6da03f5
 
bbbf3e8
6da03f5
bbbf3e8
6da03f5
bbbf3e8
 
 
 
6da03f5
bbbf3e8
 
 
 
30b7049
 
 
bbbf3e8
 
 
30b7049
bbbf3e8
 
 
6da03f5
bbbf3e8
 
6da03f5
bbbf3e8
30b7049
bbbf3e8
30b7049
6da03f5
bbbf3e8
 
30b7049
 
 
bbbf3e8
 
30b7049
 
 
 
 
 
 
 
bbbf3e8
 
30b7049
6da03f5
30b7049
bbbf3e8
 
30b7049
 
6da03f5
30b7049
 
bbbf3e8
 
30b7049
 
bbbf3e8
 
 
 
6da03f5
bbbf3e8
30b7049
 
bbbf3e8
 
30b7049
 
 
bbbf3e8
 
 
30b7049
 
 
 
 
bbbf3e8
30b7049
bbbf3e8
30b7049
 
 
 
 
bbbf3e8
 
30b7049
bbbf3e8
 
 
 
30b7049
 
 
 
 
 
 
 
 
bbbf3e8
 
 
 
30b7049
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
import gradio as gr
import torch
import numpy as np
import librosa

from transformers import WhisperProcessor, WhisperForConditionalGeneration

# =========================
# SETTINGS
# =========================
# Model identifier handed to both `from_pretrained` calls below.
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
# Device the model weights and the input features are moved to.
DEVICE = "cpu"
# Sample rate (Hz) audio is resampled to before feature extraction.
TARGET_SR = 16000

# =========================
# PROCESSOR + MODEL (loaded once, at import time)
# =========================
processor = WhisperProcessor.from_pretrained(MODEL_ID, language="pashto", task="transcribe")

model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(DEVICE)

# Inference only: put the module in evaluation mode.
model.eval()

# =========================
# SAFE AUDIO HANDLER
# =========================
def transcribe_audio(audio):
    """Transcribe one Gradio audio payload to text.

    Args:
        audio: ``None``, a ``(sample_rate, waveform)`` tuple, or a dict
            carrying ``"data"`` and ``"sampling_rate"`` keys.

    Returns:
        The stripped transcription, or a short human-readable message when
        the input is missing, invalid, empty, or decodes to no text.
    """
    if audio is None:
        return "No audio provided."

    # Accept both payload layouts: a dict or a (sample_rate, data) tuple.
    if isinstance(audio, dict):
        waveform = audio.get("data")
        sample_rate = audio.get("sampling_rate")
    else:
        sample_rate, waveform = audio

    if waveform is None or sample_rate is None:
        return "Invalid audio input."

    # np.asarray lets plain sequences through and guarantees that
    # .dtype / .ndim / .size exist for the steps below.
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        return "Empty audio."

    # Integer PCM (e.g. int16) is rescaled to [-1, 1) so the feature
    # extractor sees float samples at the amplitude range it expects;
    # a bare float cast would leave values at +/-32768 scale.
    if np.issubdtype(waveform.dtype, np.integer):
        full_scale = float(np.iinfo(waveform.dtype).max) + 1.0
        waveform = waveform.astype(np.float32) / full_scale
    else:
        waveform = waveform.astype(np.float32)

    # Down-mix multi-channel audio to mono by averaging over axis 1
    # (assumes a (samples, channels) layout — TODO confirm for dict payloads).
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Bring the audio to the sample rate passed to the processor.
    if sample_rate != TARGET_SR:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SR,
        )

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    # Strip first so a whitespace-only decode also falls back to the message.
    text = transcription.strip() if transcription else ""
    return text or "No speech detected."

# =========================
# GLASSMORPHISM CSS
# =========================
# Stylesheet string passed to `gr.Blocks(css=...)`. It sets a gradient page
# background, caps the container width, defines the translucent blurred
# `.glass` card class (applied to the main column via `elem_classes`), and
# restyles headings, paragraphs, buttons and textareas.
CUSTOM_CSS = """
body {
    background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
    font-family: Inter, sans-serif;
}

.gradio-container {
    max-width: 1100px !important;
    margin: auto;
}

.glass {
    background: rgba(255, 255, 255, 0.12);
    backdrop-filter: blur(18px);
    -webkit-backdrop-filter: blur(18px);
    border-radius: 18px;
    border: 1px solid rgba(255, 255, 255, 0.25);
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
    padding: 24px;
}

h1, h3, p {
    color: white !important;
    text-align: center;
}

button {
    background: linear-gradient(135deg, #ff8008, #ffc837) !important;
    color: black !important;
    font-weight: 600 !important;
    border-radius: 10px !important;
}

textarea {
    font-size: 16px !important;
}
"""

# =========================
# UI
# =========================
# Builds the Gradio Blocks app: a single column styled with the `.glass`
# class containing a title, an audio input, a button, an output textbox and
# a footer. Components are created in the order they are listed here.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="glass"):
        # Title and short description.
        gr.Markdown(
            """
            # 🎙️ Pashto Speech-to-Text  
            ### Powered by Whisper ASR  
            Upload or record Pashto audio and get accurate transcription.
            """
        )

        # Audio from file upload or microphone; `type="numpy"` selects the
        # numpy payload format consumed by `transcribe_audio`.
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="numpy",
            label="Upload or Record Pashto Audio"
        )

        transcribe_btn = gr.Button("Transcribe")

        # Textbox that receives the string returned by `transcribe_audio`.
        output_text = gr.Textbox(
            label="Transcription Output",
            lines=6,
            placeholder="Pashto transcription will appear here..."
        )

        # Clicking the button runs `transcribe_audio` on the audio input and
        # writes its return value into the output textbox.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=output_text
        )

        # Footer note rendered from raw HTML inside Markdown.
        gr.Markdown(
            """
            <hr>
            <p>
            Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
            Runs entirely on Hugging Face free infrastructure.
            </p>
            """
        )

# =========================
# LAUNCH
# =========================
# Start the Gradio server only when this file is executed as a script;
# importing the module builds `demo` without launching it.
if __name__ == "__main__":
    demo.launch()