# app.py — Pashto speech-to-text demo (Hugging Face Space by afaqalinagra,
# commit 6da03f5 verified).
import gradio as gr
import torch
import numpy as np
import librosa
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# =========================
# CONFIG
# =========================
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"  # fine-tuned Whisper checkpoint on the HF Hub
DEVICE = "cpu"  # CPU-only inference (free Hugging Face Space hardware)
TARGET_SR = 16000  # Whisper models expect 16 kHz input audio
# =========================
# LOAD MODEL
# =========================
# The processor bundles the feature extractor and tokenizer; pinning
# language/task fixes the decoder prompt to Pashto transcription.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID,
    language="pashto",
    task="transcribe"
)
model = WhisperForConditionalGeneration.from_pretrained(
    MODEL_ID
).to(DEVICE)
model.eval()  # inference mode: disables dropout etc.
def transcribe_audio(audio):
    """Transcribe a Pashto audio clip to text with the Whisper model.

    Args:
        audio: Either a ``(sample_rate, waveform)`` tuple (Gradio's
            ``type="numpy"`` format) or a dict carrying ``"data"`` and
            ``"sampling_rate"`` keys (older Gradio payload format).

    Returns:
        str: The transcription, or a short English error message for
        missing, empty, or malformed input.
    """
    if audio is None:
        return "No audio provided."

    # ---- HANDLE BOTH GRADIO FORMATS ----
    if isinstance(audio, dict):
        waveform = audio.get("data", None)
        sample_rate = audio.get("sampling_rate", None)
    else:
        sample_rate, waveform = audio

    if waveform is None or sample_rate is None:
        return "Invalid audio input."

    waveform = np.asarray(waveform)
    if waveform.size == 0:
        return "Empty audio."

    # Gradio delivers raw PCM (typically int16); the Whisper feature
    # extractor expects float samples in [-1.0, 1.0], so normalize
    # integer dtypes BEFORE any further processing. (Feeding raw int16
    # values produced garbage features.)
    if np.issubdtype(waveform.dtype, np.integer):
        waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
    else:
        waveform = waveform.astype(np.float32)

    # Convert stereo -> mono (Gradio arrays are (samples, channels)).
    if waveform.ndim > 1:
        waveform = np.mean(waveform, axis=1)

    # Resample to the 16 kHz rate Whisper was trained on.
    if sample_rate != TARGET_SR:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SR,
        )

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,  # cap decode length to Whisper's context size
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    return transcription.strip() if transcription else "No speech detected."
# =========================
# GLASSMORPHISM CSS
# =========================
# Injected into the Gradio page: frosted-glass ("glassmorphism") card
# styling over a dark gradient background. The string is passed verbatim
# to gr.Blocks(css=...), so its content must stay exact CSS.
CUSTOM_CSS = """
body {
background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
font-family: Inter, sans-serif;
}
.gradio-container {
max-width: 1100px !important;
margin: auto;
}
.glass {
background: rgba(255, 255, 255, 0.12);
backdrop-filter: blur(18px);
-webkit-backdrop-filter: blur(18px);
border-radius: 18px;
border: 1px solid rgba(255, 255, 255, 0.25);
box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
padding: 24px;
}
h1, h3, p {
color: white !important;
text-align: center;
}
button {
background: linear-gradient(135deg, #ff8008, #ffc837) !important;
color: black !important;
font-weight: 600 !important;
border-radius: 10px !important;
}
textarea {
font-size: 16px !important;
}
"""
# =========================
# UI
# =========================
# Build the UI: one frosted-glass column holding the header, the audio
# input, the transcribe button, and the output textbox.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="glass"):
        gr.Markdown(
            """
# 🎙️ Pashto Speech-to-Text
### Powered by Whisper ASR
Upload or record Pashto audio and get accurate transcription.
"""
        )
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="numpy",  # callback receives (sample_rate, np.ndarray)
            label="Upload or Record Pashto Audio"
        )
        transcribe_btn = gr.Button("Transcribe")
        output_text = gr.Textbox(
            label="Transcription Output",
            lines=6,
            placeholder="Pashto transcription will appear here..."
        )
        # Wire the button to the ASR callback defined above.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=output_text
        )
        gr.Markdown(
            """
<hr>
<p>
Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
Runs entirely on Hugging Face free infrastructure.
</p>
"""
        )
# =========================
# LAUNCH
# =========================
if __name__ == "__main__":
    # Start the Gradio server (blocks until the process is stopped).
    demo.launch()