import gradio as gr
import torch
import librosa
import base64
import os
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# ─────────────────────────────────────────────
# LOGO HELPERS
# ─────────────────────────────────────────────
def _img_to_b64(path, mime):
    """Return the file at *path* as a ``data:`` URI, or ``""`` if unreadable.

    Args:
        path: Filesystem path of the image to embed.
        mime: MIME type placed in the URI header (e.g. ``"image/png"``).

    Returns:
        ``"data:<mime>;base64,<payload>"`` when the file could be read,
        otherwise an empty string.
    """
    # Attempt the read directly instead of checking os.path.exists() first:
    # an existence check also succeeds for directories and unreadable files,
    # for which open() would raise.
    try:
        with open(path, "rb") as f:
            payload = base64.b64encode(f.read()).decode("ascii")
    except OSError:
        return ""
    return f"data:{mime};base64,{payload}"
# Encode the logo images once at import time. Each constant holds a data URI,
# or "" when the corresponding file is missing.
MCS_SRC, NUST_SRC, MIC_SRC = (
    _img_to_b64(filename, mime_type)
    for filename, mime_type in (
        ("mcs.jpg", "image/jpeg"),
        ("nust.png", "image/png"),
        ("microphone.png", "image/png"),
    )
)
# ─────────────────────────────────────────────
# LOAD ASR MODEL (runs once at startup)
# ─────────────────────────────────────────────
# Choose the inference device up front: CUDA when available, otherwise CPU.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

ASR_MODEL_ID = "Zarnabh/whisper-large-ps"

print("Loading ASR model...")
processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID)
# eval() returns the module itself, so it can be chained onto the load call.
asr_model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID).eval()
print("✅ ASR model loaded")

# Move the weights to the selected device.
asr_model = asr_model.to(DEVICE)
# ─────────────────────────────────────────────
# CORE FUNCTIONS
# ─────────────────────────────────────────────
def transcribe(audio_path):
    """Transcribe the audio file at *audio_path* with the loaded Whisper model.

    The file is loaded as 16 kHz mono audio, turned into model inputs by
    ``processor``, and decoded by ``asr_model.generate`` with
    ``language="ps"`` and ``task="transcribe"``.

    Args:
        audio_path: Path of the audio file, as accepted by ``librosa.load``.

    Returns:
        The first decoded transcript with surrounding whitespace removed.
    """
    target_rate = 16000
    waveform, _ = librosa.load(audio_path, sr=target_rate, mono=True)

    features = processor(
        waveform,
        sampling_rate=target_rate,
        return_tensors="pt",
    ).to(DEVICE)

    # Generation does not need gradients.
    with torch.no_grad():
        token_ids = asr_model.generate(
            features["input_features"],
            language="ps",
            task="transcribe",
        )

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
def run_pipeline(audio_path):
    """Transcribe *audio_path* and return ``(html, transcript)``.

    Args:
        audio_path: Path of the audio to transcribe; a falsy value means no
            audio was supplied.

    Returns:
        A 2-tuple. With audio: the rendered result HTML and the transcript
        text. Without audio: the empty-state HTML and a warning message.
    """
    # Guard clause: nothing to transcribe.
    if not audio_path:
        return build_html_empty(), "⚠️ Please provide audio to transcribe."

    transcript = transcribe(audio_path)
    return build_html(transcript, "ASR Transcription"), transcript
def build_html(transcript, source_label):
    """Render the transcript result as an HTML string.

    Args:
        transcript: Recognised text. It is HTML-escaped before being embedded,
            both as a whole and word by word.
        source_label: Heading text shown after the microphone emoji; inserted
            verbatim.

    Returns:
        A string containing the label, the word count, the transcript and one
        entry per whitespace-separated word.
    """
    # Imported locally so this block carries its own dependency.
    from html import escape

    # Split once and reuse the result for both the count and the word list.
    words = transcript.split()
    word_count = len(words)

    # The transcript is free text that ends up inside markup; escaping makes
    # characters such as "<" or "&" display literally instead of being parsed.
    word_pills = "".join(f"\n{escape(word)}\n" for word in words)

    plural = "" if word_count == 1 else "s"
    return f"""
🎙️ {source_label}
{word_count} word{plural}
{escape(transcript)}
🔤 Tokenized Words
{word_pills}
"""
def build_html_empty():
    """Return the placeholder HTML shown when no audio was provided.

    The hint mentions only recording and uploading, the two sources the
    audio input widget is configured with.
    """
    return """
🎙️
No input provided
Record audio or upload a file to transcribe.
"""
# ─────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=...).
# It loads the Inter and Noto Naskh Arabic fonts from Google Fonts, applies a
# dark colour scheme to the page, text inputs and labels, renders
# #transcript-box right-to-left in the Arabic-script font, styles the
# .run-btn button and the audio widget, and hides the page footer.
css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=Noto+Naskh+Arabic:wght@400;600;700&display=swap');
body, .gradio-container {
background: #0a0e1a !important;
min-height: 100vh;
font-family: 'Inter', sans-serif !important;
}
.gr-panel, .gr-box, .gradio-container .gap {
background: transparent !important;
}
textarea, input[type="text"] {
color: #e8f0ff !important;
background-color: #0d1626 !important;
border: 1px solid #1e3050 !important;
border-radius: 10px !important;
font-size: 15px !important;
caret-color: #4fc3f7 !important;
font-family: 'Inter', sans-serif !important;
}
.dark textarea,
.dark input[type="text"],
.dark .gr-textbox textarea,
.dark [data-testid="textbox"] textarea,
.dark [data-testid="textbox"] input {
color: #e8f0ff !important;
background-color: #0d1626 !important;
border: 1px solid #1e3050 !important;
font-size: 15px !important;
caret-color: #4fc3f7 !important;
}
textarea:focus, input[type="text"]:focus,
.dark textarea:focus, .dark input[type="text"]:focus {
border-color: #4fc3f7 !important;
box-shadow: 0 0 0 3px rgba(79,195,247,0.12) !important;
outline: none !important;
}
textarea::placeholder, input[type="text"]::placeholder,
.dark textarea::placeholder, .dark input[type="text"]::placeholder {
color: #3d5a7a !important;
opacity: 1 !important;
}
label span,
.dark label span,
.gr-textbox label span,
.dark .gr-textbox label span,
.dark [data-testid="textbox"] label span {
color: #7aa8d8 !important;
font-weight: 700 !important;
font-size: 12px !important;
letter-spacing: 0.09em;
text-transform: uppercase;
font-family: 'Inter', sans-serif !important;
}
#transcript-box textarea,
.dark #transcript-box textarea {
color: #ddeeff !important;
background-color: #0a1220 !important;
border: 1px solid #1a4a7a !important;
font-size: 16px !important;
font-weight: 600;
direction: rtl;
text-align: right;
font-family: 'Noto Naskh Arabic', serif !important;
}
.run-btn {
background: linear-gradient(135deg, #1a237e, #1565c0) !important;
color: #ffffff !important;
font-size: 15px !important;
font-weight: 700 !important;
letter-spacing: 0.08em;
border: none !important;
border-radius: 10px !important;
padding: 14px 0 !important;
box-shadow: 0 4px 20px rgba(21,101,192,0.4) !important;
transition: all 0.2s ease !important;
font-family: 'Inter', sans-serif !important;
text-transform: uppercase;
}
.run-btn:hover {
background: linear-gradient(135deg, #283593, #1976d2) !important;
box-shadow: 0 6px 28px rgba(21,101,192,0.6) !important;
transform: translateY(-2px) !important;
}
.gr-audio {
background: #0d1626 !important;
border: 1px solid #1e3050 !important;
border-radius: 12px !important;
}
::-webkit-scrollbar { width: 5px; background: #0a0e1a; }
::-webkit-scrollbar-thumb { background: #1e3a5a; border-radius: 3px; }
footer { display: none !important; }
"""
# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Build the Blocks app: a header, a two-column row (audio input and info panel
# on the left, transcript outputs on the right) and a footer.
with gr.Blocks(css=css, title="Pashto ASR") as demo:
# ── HEADER ──────────────────────────────────────────────────────
gr.HTML(f"""
Military College of Signals
Rawalpindi
Pashto ASR
Whisper · Transformers
WER
24.19%
National University of
Sciences & Technology
""")
# ── MAIN LAYOUT ──────────────────────────────────────────────────
with gr.Row(equal_height=True):
# ── LEFT COLUMN ────────────────────────────────────────────
with gr.Column(scale=1, min_width=300):
# NOTE(review): the single-quoted f-string below spans two lines, which is
# not valid Python; presumably it held an image tag built from MIC_SRC —
# confirm the intended markup.
_mic_img = f'
' if MIC_SRC else "🎙️ "
gr.HTML(f"""
""")
# type="filepath": the audio is passed to run_pipeline as a file path, which
# transcribe() forwards to librosa.load.
audio_input = gr.Audio(
sources=["microphone", "upload"],
type="filepath",
label="Record or Upload Pashto Audio",
)
run_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg", elem_classes="run-btn")
# Info panel
gr.HTML("""
🧠 How It Works
🎙️Record — use your microphone directly
📂Upload — submit a .wav / .mp3 file
⚙️ Model
Zarnabh/whisper-large-ps
Fine-tuned Whisper for Pashto speech recognition
""")
# ── RIGHT COLUMN ───────────────────────────────────────────
with gr.Column(scale=2):
# elem_id ties this textbox to the #transcript-box rules in the stylesheet.
transcript_out = gr.Textbox(
label="📝 ASR Transcript",
interactive=False,
rtl=True,
text_align="right",
lines=2,
elem_id="transcript-box",
)
gr.HTML("""
""")
html_out = gr.HTML()
# run_pipeline returns (html, transcript), matching the order of `outputs`.
run_btn.click(
fn=run_pipeline,
inputs=[audio_input],
outputs=[html_out, transcript_out],
)
# ── FOOTER ──────────────────────────────────────────────────────
gr.HTML("""
ASR Model: Zarnabh/whisper-large-ps
MCS & NUST · Pashto NLP Research
""")
# Launch the app only when this file is executed as a script.
if __name__ == "__main__":
demo.launch()