Spaces:
Sleeping
Sleeping
File size: 3,968 Bytes
bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 30b7049 6da03f5 bbbf3e8 6da03f5 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 6da03f5 bbbf3e8 30b7049 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 6da03f5 bbbf3e8 6da03f5 bbbf3e8 30b7049 bbbf3e8 30b7049 6da03f5 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 6da03f5 30b7049 bbbf3e8 30b7049 6da03f5 30b7049 bbbf3e8 30b7049 bbbf3e8 6da03f5 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 bbbf3e8 30b7049 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 | import gradio as gr
import torch
import numpy as np
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# =========================
# CONFIG
# =========================
# Hub identifier handed to `from_pretrained` for both the processor and model.
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
# Sampling rate (Hz) audio is resampled to before it reaches the processor.
TARGET_SR = 16_000
# Torch device the model and its input features are moved to.
DEVICE = "cpu"
# =========================
# LOAD MODEL
# =========================
# Processor is created with the Pashto / transcribe settings up front.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID, language="pashto", task="transcribe"
)

# Load the checkpoint, move it to the configured device, then switch to
# evaluation mode for inference.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
model.eval()
# =========================
# SAFE AUDIO HANDLER
# =========================
def _prepare_waveform(waveform, sample_rate):
    """Return *waveform* as mono float32 audio sampled at ``TARGET_SR``."""
    samples = np.asarray(waveform)

    if np.issubdtype(samples.dtype, np.signedinteger):
        # Gradio documents its numpy audio as integer PCM. Whisper's feature
        # extractor expects float amplitudes in [-1, 1], so divide by the
        # dtype's full-scale value instead of only casting.
        full_scale = float(-np.iinfo(samples.dtype).min)
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    # Stereo -> mono. Assumes a (samples, channels) layout, as the original
    # axis=1 averaging did.
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    if sample_rate != TARGET_SR:
        samples = librosa.resample(
            samples,
            orig_sr=sample_rate,
            target_sr=TARGET_SR,
        )
    return samples


def transcribe_audio(audio):
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        audio: ``None``, a ``(sample_rate, waveform)`` pair, or a dict with
            ``"data"`` and ``"sampling_rate"`` keys.

    Returns:
        The stripped transcription, or a short human-readable message when
        the input is missing, invalid, empty, or yields no text.
    """
    if audio is None:
        return "No audio provided."

    # Accept both the dict form and the (sample_rate, waveform) tuple form.
    if isinstance(audio, dict):
        waveform = audio.get("data")
        sample_rate = audio.get("sampling_rate")
    else:
        sample_rate, waveform = audio

    if waveform is None or sample_rate is None:
        return "Invalid audio input."
    if len(waveform) == 0:
        return "Empty audio."

    samples = _prepare_waveform(waveform, sample_rate)

    inputs = processor(
        samples,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    # Strip before the emptiness check so whitespace-only output is reported
    # as "no speech" rather than returned as an empty string.
    text = (transcription or "").strip()
    return text or "No speech detected."
# =========================
# GLASSMORPHISM CSS
# =========================
# Stylesheet handed to `gr.Blocks(css=...)` below. It sets a gradient page
# background, caps the container width, defines the translucent `.glass`
# card class (applied to the main column through `elem_classes`), and
# restyles headings, buttons and textareas.
CUSTOM_CSS = """
body {
background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
font-family: Inter, sans-serif;
}
.gradio-container {
max-width: 1100px !important;
margin: auto;
}
.glass {
background: rgba(255, 255, 255, 0.12);
backdrop-filter: blur(18px);
-webkit-backdrop-filter: blur(18px);
border-radius: 18px;
border: 1px solid rgba(255, 255, 255, 0.25);
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
padding: 24px;
}
h1, h3, p {
color: white !important;
text-align: center;
}
button {
background: linear-gradient(135deg, #ff8008, #ffc837) !important;
color: black !important;
font-weight: 600 !important;
border-radius: 10px !important;
}
textarea {
font-size: 16px !important;
}
"""
# =========================
# UI
# =========================
# Markdown shown above the controls.
_HEADER_MD = """
# 🎙️ Pashto Speech-to-Text
### Powered by Whisper ASR
Upload or record Pashto audio and get accurate transcription.
"""

# HTML/Markdown footer shown below the output box.
_FOOTER_MD = """
<hr>
<p>
Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
Runs entirely on Hugging Face free infrastructure.
</p>
"""

# NOTE(review): the original indentation was lost; every component is assumed
# to live inside the single "glass" column — confirm against the deployed app.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="glass"):
        gr.Markdown(_HEADER_MD)

        audio_input = gr.Audio(
            label="Upload or Record Pashto Audio",
            type="numpy",
            sources=["upload", "microphone"],
        )
        transcribe_btn = gr.Button("Transcribe")
        output_text = gr.Textbox(
            placeholder="Pashto transcription will appear here...",
            lines=6,
            label="Transcription Output",
        )

        # Clicking the button runs the transcription and fills the textbox.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=output_text,
        )

        gr.Markdown(_FOOTER_MD)
# =========================
# LAUNCH
# =========================
def main():
    """Start the Gradio server for the `demo` app."""
    demo.launch()


if __name__ == "__main__":
    main()
|