import base64
import os

import gradio as gr
import librosa
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration

# ─────────────────────────────────────────────
# LOGO HELPERS
# ─────────────────────────────────────────────

def _img_to_b64(path, mime):
    """Return a ``data:`` URI for the image at *path*, or "" if it cannot be read.

    Args:
        path: Filesystem path of the image file.
        mime: MIME type placed in the URI header (e.g. ``"image/png"``).

    Returns:
        ``"data:<mime>;base64,<payload>"`` on success, otherwise an empty
        string.
    """
    # Try the read directly instead of checking os.path.exists() first: a
    # logo that is missing, unreadable, or a directory then degrades to ""
    # rather than raising while the module is being imported.
    try:
        with open(path, "rb") as f:
            encoded = base64.b64encode(f.read()).decode()
    except OSError:
        return ""
    return f"data:{mime};base64,{encoded}"


MCS_SRC = _img_to_b64("mcs.jpg", "image/jpeg")
NUST_SRC = _img_to_b64("nust.png", "image/png")
MIC_SRC = _img_to_b64("microphone.png","image/png")

# ─────────────────────────────────────────────
# LOAD ASR MODEL  (runs once at startup)
# ─────────────────────────────────────────────
print("Loading ASR model...")
ASR_MODEL_ID = "Zarnabh/whisper-large-ps"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID)
asr_model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID)
asr_model = asr_model.to(DEVICE)
asr_model.eval()
# Report success only after the model has been moved to its target device.
print("✅ ASR model loaded")

# ─────────────────────────────────────────────
# CORE FUNCTIONS
# ─────────────────────────────────────────────

def transcribe(audio_path):
    """Run Pashto Whisper ASR on an audio file.

    Args:
        audio_path: Path of an audio file that ``librosa.load`` can read.

    Returns:
        The decoded transcript with leading/trailing whitespace removed.
    """
    sample_rate = 16000
    # Load as mono at the same rate that is passed to the processor below;
    # the rate returned by librosa is not needed.
    audio, _ = librosa.load(audio_path, sr=sample_rate, mono=True)
    inputs = processor(audio, sampling_rate=sample_rate, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        predicted_ids = asr_model.generate(
            inputs["input_features"],
            language="ps",
            task="transcribe",
        )
    transcript = processor.batch_decode(predicted_ids, skip_special_tokens=True)[0]
    return transcript.strip()


def run_pipeline(audio_path):
    """Main pipeline: transcribe audio, return transcript HTML.

    Args:
        audio_path: Path of the audio to transcribe; a falsy value means no
            audio was provided.

    Returns:
        A ``(html, text)`` tuple: the rendered result and the plain
        transcript, or the empty-state HTML and a warning message when no
        audio was provided.
    """
    if not audio_path:
        return (
            build_html_empty(),
            "⚠️ Please provide audio to transcribe.",
        )

    transcript = transcribe(audio_path)
    source_label = "ASR Transcription"
    html = build_html(transcript, source_label)
    return html, transcript

def build_html(transcript, source_label):
    """Render the transcript result as a styled HTML string.

    Args:
        transcript: Recognised text. It is HTML-escaped before being
            interpolated, because the result is rendered as markup.
        source_label: Heading shown above the transcript; inserted as given.

    Returns:
        An HTML fragment with the label, the word count, the transcript and
        one entry per whitespace-separated word.
    """
    # Imported locally so the module's import block does not need to change.
    from html import escape

    words = transcript.split()
    word_count = len(words)

    # Build word pills (escaped per word, joined once instead of repeated +=)
    word_pills = "".join(f""" {escape(word)} """ for word in words)
    safe_transcript = escape(transcript)

    return f"""
🎙️ {source_label}
{word_count} word{'s' if word_count != 1 else ''}
{safe_transcript}
🔤 Tokenized Words
{word_pills}
"""


def build_html_empty():
    """Return the placeholder HTML shown when no audio was provided."""
    # The UI only offers microphone recording and file upload, so the hint
    # does not mention typing text.
    return """
🎙️
No input provided
Record audio or upload a file to transcribe.
"""


# ─────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────
css = """ @import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=Noto+Naskh+Arabic:wght@400;600;700&display=swap'); body, .gradio-container { background: #0a0e1a !important; min-height: 100vh; font-family: 'Inter', sans-serif !important; } .gr-panel, .gr-box, .gradio-container .gap { background: transparent !important; } textarea, input[type="text"] { color: #e8f0ff !important; background-color: #0d1626 !important; border: 1px solid #1e3050 !important; border-radius: 10px !important; font-size: 15px !important; caret-color: #4fc3f7 !important; font-family: 'Inter', sans-serif !important; } .dark textarea, .dark input[type="text"], .dark .gr-textbox textarea, .dark [data-testid="textbox"] textarea, .dark [data-testid="textbox"] input { color: #e8f0ff !important; background-color: #0d1626 !important; border: 1px solid #1e3050 !important; font-size: 15px !important; caret-color: #4fc3f7 !important; } textarea:focus, input[type="text"]:focus, .dark textarea:focus, .dark input[type="text"]:focus { border-color: #4fc3f7 !important; box-shadow: 0 0 0 3px rgba(79,195,247,0.12) !important; outline: none !important; } textarea::placeholder, input[type="text"]::placeholder, .dark textarea::placeholder, .dark input[type="text"]::placeholder { color: #3d5a7a !important; opacity: 1 !important; } label span, .dark label span, .gr-textbox label span, .dark .gr-textbox label span, .dark [data-testid="textbox"] label span { color: #7aa8d8 !important; font-weight: 700 !important; font-size: 12px !important; letter-spacing: 0.09em; text-transform: uppercase; font-family: 'Inter', sans-serif !important; } #transcript-box textarea, .dark #transcript-box textarea { color: #ddeeff !important; background-color: #0a1220 !important; border: 1px solid #1a4a7a !important; font-size: 16px !important; font-weight: 600; direction: rtl; text-align: right; 
font-family: 'Noto Naskh Arabic', serif !important; } .run-btn { background: linear-gradient(135deg, #1a237e, #1565c0) !important; color: #ffffff !important; font-size: 15px !important; font-weight: 700 !important; letter-spacing: 0.08em; border: none !important; border-radius: 10px !important; padding: 14px 0 !important; box-shadow: 0 4px 20px rgba(21,101,192,0.4) !important; transition: all 0.2s ease !important; font-family: 'Inter', sans-serif !important; text-transform: uppercase; } .run-btn:hover { background: linear-gradient(135deg, #283593, #1976d2) !important; box-shadow: 0 6px 28px rgba(21,101,192,0.6) !important; transform: translateY(-2px) !important; } .gr-audio { background: #0d1626 !important; border: 1px solid #1e3050 !important; border-radius: 12px !important; } ::-webkit-scrollbar { width: 5px; background: #0a0e1a; } ::-webkit-scrollbar-thumb { background: #1e3a5a; border-radius: 3px; } footer { display: none !important; } """

# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
with gr.Blocks(css=css, title="Pashto ASR") as demo:

    # ── HEADER ──────────────────────────────────────────────────────
    gr.HTML(f"""
MCS Logo
Military College of Signals
Rawalpindi
Pashto ASR
Whisper  ·  Transformers
WER 24.19%
National University of
Sciences & Technology
NUST Logo
""")

    # ── MAIN LAYOUT ──────────────────────────────────────────────────
    with gr.Row(equal_height=True):

        # ── LEFT COLUMN ────────────────────────────────────────────
        with gr.Column(scale=1, min_width=300):
            # Falls back to the emoji when no microphone image was loaded.
            _mic_img = f'' if MIC_SRC else "🎙️ "
            gr.HTML(f"""
{_mic_img} Audio Input
""")
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Pashto Audio",
            )
            run_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg", elem_classes="run-btn")

            # Info panel
            gr.HTML("""
🧠 How It Works
🎙️Record — use your microphone directly 📂Upload — submit a .wav / .mp3 file
⚙️ Model
Zarnabh/whisper-large-ps
Fine-tuned Whisper for Pashto speech recognition
""")

        # ── RIGHT COLUMN ───────────────────────────────────────────
        with gr.Column(scale=2):
            transcript_out = gr.Textbox(
                label="📝 ASR Transcript",
                interactive=False,
                rtl=True,
                text_align="right",
                lines=2,
                elem_id="transcript-box",
            )
            gr.HTML("""
📊
Transcription Result
""")
            html_out = gr.HTML()

    # run_pipeline returns (html, transcript), matching the outputs order.
    run_btn.click(
        fn=run_pipeline,
        inputs=[audio_input],
        outputs=[html_out, transcript_out],
    )

    # ── FOOTER ──────────────────────────────────────────────────────
    gr.HTML("""
ASR Model: Zarnabh/whisper-large-ps
MCS & NUST  ·  Pashto NLP Research
""")

if __name__ == "__main__":
    demo.launch()