Spaces:
Sleeping
Sleeping
| import gradio as gr | |
| import torch | |
| import librosa | |
| import base64 | |
| import os | |
| from transformers import WhisperProcessor, WhisperForConditionalGeneration | |
# ─────────────────────────────────────────────
# LOGO HELPERS
# ─────────────────────────────────────────────
| def _img_to_b64(path, mime): | |
| """Return a data-URI string if the file exists, else empty string.""" | |
| if os.path.exists(path): | |
| with open(path, "rb") as f: | |
| data = base64.b64encode(f.read()).decode() | |
| return f"data:{mime};base64,{data}" | |
| return "" | |
# Data-URI strings for the three images embedded in the page markup.
# Each one is produced by _img_to_b64 from a file path and its MIME type.
MCS_SRC, NUST_SRC, MIC_SRC = (
    _img_to_b64(filename, mime_type)
    for filename, mime_type in (
        ("mcs.jpg", "image/jpeg"),
        ("nust.png", "image/png"),
        ("microphone.png", "image/png"),
    )
)
# ─────────────────────────────────────────────
# LOAD ASR MODEL (runs once at startup)
# ─────────────────────────────────────────────
print("Loading ASR model...")
# Identifier handed to from_pretrained for both the processor and the weights.
ASR_MODEL_ID = "Zarnabh/whisper-large-ps"
processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID)
# eval() returns the module itself, so it can be chained onto the load.
asr_model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID).eval()
print("β ASR model loaded")

# Use CUDA when torch reports it as available, otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
asr_model = asr_model.to(DEVICE)
# ─────────────────────────────────────────────
# CORE FUNCTIONS
# ─────────────────────────────────────────────
def transcribe(audio_path):
    """Transcribe one audio file with the module-level Whisper model.

    The file at ``audio_path`` is loaded through librosa as 16 kHz mono,
    converted to model inputs by ``processor``, and decoded by
    ``asr_model.generate`` with ``language="ps"`` and ``task="transcribe"``.

    Returns:
        The first decoded sequence with surrounding whitespace stripped.
    """
    target_rate = 16000
    # librosa also returns the sampling rate; it is already known here.
    waveform, _ = librosa.load(audio_path, sr=target_rate, mono=True)
    features = processor(
        waveform, sampling_rate=target_rate, return_tensors="pt"
    ).to(DEVICE)

    # Pure inference: gradient tracking is switched off for generation.
    with torch.no_grad():
        token_ids = asr_model.generate(
            features["input_features"],
            language="ps",
            task="transcribe",
        )

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
def run_pipeline(audio_path):
    """Transcribe ``audio_path`` and return ``(result_html, transcript_text)``.

    When ``audio_path`` is falsy, no transcription is attempted; the
    empty-state panel and a warning message are returned instead.
    """
    # Guard clause: nothing to transcribe.
    if not audio_path:
        return (
            build_html_empty(),
            "β οΈ Please provide audio to transcribe.",
        )

    text = transcribe(audio_path)
    return build_html(text, "ASR Transcription"), text
def build_html(transcript, source_label):
    """Render a transcript as a styled HTML fragment.

    The fragment contains a badge with ``source_label`` and the word count,
    the full transcript in a right-to-left panel, and one "pill" per
    whitespace-separated word.

    Args:
        transcript: Transcribed text to display.
        source_label: Short caption shown in the badge.

    Returns:
        The HTML markup as a string.
    """
    # Function-scope import so the block needs no change to the file header.
    from html import escape

    # Split once; the result feeds both the counter and the pills.
    words = transcript.split()
    word_count = len(words)
    plural = "" if word_count == 1 else "s"

    # The transcript is model output and the label is caller-supplied text:
    # escape both so "<", ">" and "&" are displayed rather than parsed as markup.
    safe_transcript = escape(transcript)
    safe_label = escape(source_label)

    pill_template = """
<span style="
display:inline-block;
background:#0d1a2e;
border:1px solid #1e3a5a;
color:#b0cce8;
border-radius:8px;
padding:4px 10px;
margin:3px 4px;
font-size:14px;
font-family:'Noto Naskh Arabic',serif;
direction:rtl;">
{word}
</span>"""
    word_pills = "".join(
        pill_template.format(word=escape(word)) for word in words
    )

    return f"""
<div style="font-family:'Inter',sans-serif;">
<!-- Source badge -->
<div style="display:flex;align-items:center;gap:10px;margin-bottom:14px;">
<div style="background:linear-gradient(135deg,#1a3a6e,#1565c0);
color:#e3f2fd;font-size:12px;font-weight:700;
letter-spacing:0.08em;text-transform:uppercase;
padding:6px 14px;border-radius:20px;">
ποΈ {safe_label}
</div>
<div style="color:#546e7a;font-size:11px;">
{word_count} word{plural}
</div>
</div>
<!-- Transcript display -->
<div style="background:#0a1220;border:1px solid #1a4a7a;border-radius:12px;
padding:18px 20px;margin-bottom:16px;
direction:rtl;text-align:right;
font-family:'Noto Naskh Arabic',serif;
font-size:20px;font-weight:600;
color:#ddeeff;line-height:1.8;">
{safe_transcript}
</div>
<!-- Word tokens -->
<div style="background:#0d1626;border:1px solid #1e3050;border-radius:12px;padding:14px 16px;">
<div style="color:#5c8abf;font-size:11px;font-weight:700;
letter-spacing:0.1em;text-transform:uppercase;margin-bottom:10px;">
π€ Tokenized Words
</div>
<div style="display:flex;flex-wrap:wrap;gap:2px;direction:rtl;">
{word_pills}
</div>
</div>
</div>
"""
def build_html_empty():
    """Return the placeholder HTML panel shown when no audio was supplied.

    The hint text mentions only recording and uploading, the two input
    sources the audio widget offers; there is no text input in this UI.
    """
    return """
<div style="background:#0d1626;border:1px solid #1e3050;border-radius:12px;
padding:32px;text-align:center;color:#3d5a7a;font-family:'Inter',sans-serif;">
<div style="font-size:36px;margin-bottom:12px;">ποΈ</div>
<div style="font-size:14px;font-weight:600;">No input provided</div>
<div style="font-size:12px;margin-top:6px;">Record audio or upload a file.</div>
</div>
"""
# ─────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────
# Custom stylesheet for the app, handed to gr.Blocks(css=...).
# It imports the Inter and Noto Naskh Arabic web fonts, applies a dark palette
# to the container, text inputs and labels, styles the right-to-left
# #transcript-box and the .run-btn button, and hides the <footer> element.
css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=Noto+Naskh+Arabic:wght@400;600;700&display=swap');
body, .gradio-container {
background: #0a0e1a !important;
min-height: 100vh;
font-family: 'Inter', sans-serif !important;
}
.gr-panel, .gr-box, .gradio-container .gap {
background: transparent !important;
}
textarea, input[type="text"] {
color: #e8f0ff !important;
background-color: #0d1626 !important;
border: 1px solid #1e3050 !important;
border-radius: 10px !important;
font-size: 15px !important;
caret-color: #4fc3f7 !important;
font-family: 'Inter', sans-serif !important;
}
.dark textarea,
.dark input[type="text"],
.dark .gr-textbox textarea,
.dark [data-testid="textbox"] textarea,
.dark [data-testid="textbox"] input {
color: #e8f0ff !important;
background-color: #0d1626 !important;
border: 1px solid #1e3050 !important;
font-size: 15px !important;
caret-color: #4fc3f7 !important;
}
textarea:focus, input[type="text"]:focus,
.dark textarea:focus, .dark input[type="text"]:focus {
border-color: #4fc3f7 !important;
box-shadow: 0 0 0 3px rgba(79,195,247,0.12) !important;
outline: none !important;
}
textarea::placeholder, input[type="text"]::placeholder,
.dark textarea::placeholder, .dark input[type="text"]::placeholder {
color: #3d5a7a !important;
opacity: 1 !important;
}
label span,
.dark label span,
.gr-textbox label span,
.dark .gr-textbox label span,
.dark [data-testid="textbox"] label span {
color: #7aa8d8 !important;
font-weight: 700 !important;
font-size: 12px !important;
letter-spacing: 0.09em;
text-transform: uppercase;
font-family: 'Inter', sans-serif !important;
}
#transcript-box textarea,
.dark #transcript-box textarea {
color: #ddeeff !important;
background-color: #0a1220 !important;
border: 1px solid #1a4a7a !important;
font-size: 16px !important;
font-weight: 600;
direction: rtl;
text-align: right;
font-family: 'Noto Naskh Arabic', serif !important;
}
.run-btn {
background: linear-gradient(135deg, #1a237e, #1565c0) !important;
color: #ffffff !important;
font-size: 15px !important;
font-weight: 700 !important;
letter-spacing: 0.08em;
border: none !important;
border-radius: 10px !important;
padding: 14px 0 !important;
box-shadow: 0 4px 20px rgba(21,101,192,0.4) !important;
transition: all 0.2s ease !important;
font-family: 'Inter', sans-serif !important;
text-transform: uppercase;
}
.run-btn:hover {
background: linear-gradient(135deg, #283593, #1976d2) !important;
box-shadow: 0 6px 28px rgba(21,101,192,0.6) !important;
transform: translateY(-2px) !important;
}
.gr-audio {
background: #0d1626 !important;
border: 1px solid #1e3050 !important;
border-radius: 12px !important;
}
::-webkit-scrollbar { width: 5px; background: #0a0e1a; }
::-webkit-scrollbar-thumb { background: #1e3a5a; border-radius: 3px; }
footer { display: none !important; }
"""
# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Declares the whole page: a header banner, a row with an input column (audio
# widget, button, info panel) and an output column (transcript textbox and
# result HTML), the button wiring, and a footer. Components appear in the
# order they are created inside the `with` blocks.
with gr.Blocks(css=css, title="Pashto ASR") as demo:
    # -- HEADER --
    # NOTE(review): MCS_SRC / NUST_SRC are interpolated into <img src="...">
    # unconditionally; if either is an empty string the tag has an empty src.
    # Only the microphone icon further down has a fallback.
    gr.HTML(f"""
<div style="
background:linear-gradient(135deg,#060a14 0%,#0d1a30 55%,#060d1c 100%);
border-bottom:1px solid #152440;
padding:22px 28px 18px 28px;
margin-bottom:2px;">
<div style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:14px;">
<!-- MCS Logo + label -->
<div style="display:flex;align-items:center;gap:12px;flex:0 0 auto;">
<img src="{MCS_SRC}"
style="width:64px;height:64px;border-radius:50%;border:2.5px solid #b71c1c;
box-shadow:0 0 14px rgba(183,28,28,0.45);" alt="MCS Logo"/>
<div>
<div style="color:#ef9a9a;font-size:11px;font-weight:800;letter-spacing:0.12em;
text-transform:uppercase;font-family:'Inter',sans-serif;">Military College of Signals</div>
<div style="color:#b0bec5;font-size:10px;letter-spacing:0.05em;font-family:'Inter',sans-serif;">Rawalpindi</div>
</div>
</div>
<!-- Center title -->
<div style="text-align:center;flex:1;min-width:240px;">
<div style="font-family:'Inter',sans-serif;font-size:clamp(18px,2.8vw,26px);font-weight:800;
color:#e3f2fd;letter-spacing:0.03em;line-height:1.2;
text-shadow:0 2px 10px rgba(79,195,247,0.25);">
Pashto ASR
</div>
<div style="font-family:'Inter',sans-serif;font-size:clamp(11px,1.4vw,13px);font-weight:600;
color:#546e7a;letter-spacing:0.06em;margin-top:5px;">
Whisper Β· Transformers
</div>
<div style="margin-top:10px;display:inline-block;
background:linear-gradient(135deg,#0d2a0d,#1b5e20);
border:2px solid #00e676;
border-radius:10px;padding:5px 18px;
box-shadow:0 0 18px rgba(0,230,118,0.45),0 0 6px rgba(0,230,118,0.25);">
<span style="font-family:'Inter',sans-serif;font-size:11px;font-weight:700;
color:#69f0ae;letter-spacing:0.12em;text-transform:uppercase;">
WER
</span>
<span style="font-family:'Inter',sans-serif;font-size:16px;font-weight:900;
color:#00e676;letter-spacing:0.05em;margin-left:8px;
text-shadow:0 0 10px rgba(0,230,118,0.7);">
24.19%
</span>
</div>
</div>
<!-- NUST Logo + label -->
<div style="display:flex;align-items:center;gap:12px;flex:0 0 auto;">
<div style="text-align:right;">
<div style="color:#90caf9;font-size:11px;font-weight:800;letter-spacing:0.12em;
text-transform:uppercase;font-family:'Inter',sans-serif;">National University of</div>
<div style="color:#b0bec5;font-size:10px;letter-spacing:0.05em;font-family:'Inter',sans-serif;">Sciences & Technology</div>
</div>
<img src="{NUST_SRC}"
style="width:64px;height:64px;border-radius:50%;border:2.5px solid #1565c0;
box-shadow:0 0 14px rgba(21,101,192,0.45);" alt="NUST Logo"/>
</div>
</div>
</div>
""")
    # -- MAIN LAYOUT --
    with gr.Row(equal_height=True):
        # -- LEFT COLUMN: audio input, run button, info panel --
        with gr.Column(scale=1, min_width=300):
            # Use the embedded microphone image when it was loaded; otherwise
            # fall back to a plain text glyph.
            _mic_img = f'<img src="{MIC_SRC}" style="width:20px;height:20px;vertical-align:middle;margin-right:8px;filter:brightness(1.2);">' if MIC_SRC else "ποΈ "
            gr.HTML(f"""
<div style="background:#0d1626;border:1px solid #1e3050;border-radius:12px 12px 0 0;
padding:10px 16px 8px 16px;margin-bottom:-2px;">
<div style="display:flex;align-items:center;gap:4px;">
{_mic_img}
<span style="color:#7aa8d8;font-size:12px;font-weight:700;
letter-spacing:0.09em;text-transform:uppercase;
font-family:'Inter',sans-serif;">Audio Input</span>
</div>
</div>
""")
            # type="filepath": the handler receives a path string, which is
            # what run_pipeline forwards to transcribe().
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Pashto Audio",
            )
            # elem_classes ties this button to the .run-btn rules in `css`.
            run_btn = gr.Button("ποΈ Transcribe Speech", variant="primary", size="lg", elem_classes="run-btn")
            # Info panel
            gr.HTML("""
<div style="margin-top:12px;background:#0d1626;
border:1px solid #1e3050;border-radius:12px;
padding:14px 16px;font-size:12px;line-height:1.9;
font-family:'Inter',sans-serif;">
<div style="color:#5c8abf;font-weight:700;font-size:11px;
letter-spacing:0.1em;text-transform:uppercase;margin-bottom:8px;">
π§ How It Works
</div>
<div style="display:grid;grid-template-columns:22px 1fr;gap:3px 8px;align-items:start;color:#546e7a;">
<span>ποΈ</span><span><b style="color:#cdd8e8">Record</b> β use your microphone directly</span>
<span>π</span><span><b style="color:#cdd8e8">Upload</b> β submit a .wav / .mp3 file</span>
</div>
<div style="height:1px;background:rgba(79,195,247,0.1);margin:10px 0;"></div>
<div style="color:#5c8abf;font-weight:700;font-size:11px;
letter-spacing:0.1em;text-transform:uppercase;margin-bottom:6px;">
βοΈ Model
</div>
<div style="color:#546e7a;font-size:11px;line-height:1.7;">
<b style="color:#90caf9">Zarnabh/whisper-large-ps</b><br/>
Fine-tuned Whisper for Pashto speech recognition
</div>
</div>
""")
        # -- RIGHT COLUMN: plain-text transcript and styled result --
        with gr.Column(scale=2):
            # elem_id ties this textbox to the #transcript-box rules in `css`.
            transcript_out = gr.Textbox(
                label="π ASR Transcript",
                interactive=False,
                rtl=True,
                text_align="right",
                lines=2,
                elem_id="transcript-box",
            )
            gr.HTML("""
<div style="display:flex;align-items:center;gap:8px;margin-top:14px;margin-bottom:6px;">
<div style="width:28px;height:28px;border-radius:7px;
background:linear-gradient(135deg,#1a3a6e,#1565c0);
display:flex;align-items:center;justify-content:center;font-size:14px;">π</div>
<span style="color:#90caf9;font-size:12px;font-weight:700;
letter-spacing:0.09em;text-transform:uppercase;
font-family:'Inter',sans-serif;">Transcription Result</span>
</div>
""")
            # Filled by run_pipeline with the markup from build_html / build_html_empty.
            html_out = gr.HTML()
    # run_pipeline returns (html, transcript); the outputs list follows that order.
    run_btn.click(
        fn=run_pipeline,
        inputs=[audio_input],
        outputs=[html_out, transcript_out],
    )
    # -- FOOTER --
    gr.HTML("""
<div style="margin-top:24px;padding:12px 24px;
background:linear-gradient(90deg,#060a14,#0d1a2e,#060a14);
border-top:1px solid #152440;
display:flex;flex-wrap:wrap;justify-content:space-between;
align-items:center;gap:8px;font-family:'Inter',sans-serif;">
<div style="color:#2a4a6a;font-size:11px;letter-spacing:0.04em;">
<b style="color:#4a7aab">ASR Model:</b> Zarnabh/whisper-large-ps
</div>
<div style="color:#2a4a6a;font-size:11px;letter-spacing:0.04em;">
MCS & NUST Β· Pashto NLP Research
</div>
</div>
""")
# Launch the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    demo.launch()