# Pashto-ASR / app.py
# Author avatar: Zarnabh's picture
# Commit message: Rename app(1).py to app.py
# Commit: 13de9e1 (verified)
import gradio as gr
import torch
import librosa
import base64
import os
from transformers import WhisperProcessor, WhisperForConditionalGeneration
# ─────────────────────────────────────────────
# LOGO HELPERS
# ─────────────────────────────────────────────
def _img_to_b64(path, mime):
"""Return a data-URI string if the file exists, else empty string."""
if os.path.exists(path):
with open(path, "rb") as f:
data = base64.b64encode(f.read()).decode()
return f"data:{mime};base64,{data}"
return ""
# Logo and icon assets inlined as data URIs, resolved once at import time.
MCS_SRC, NUST_SRC, MIC_SRC = (
    _img_to_b64(asset_path, asset_mime)
    for asset_path, asset_mime in (
        ("mcs.jpg", "image/jpeg"),
        ("nust.png", "image/png"),
        ("microphone.png", "image/png"),
    )
)
# ─────────────────────────────────────────────
# LOAD ASR MODEL (runs once at startup)
# ─────────────────────────────────────────────
# Pick the device first so the model can be placed right after loading.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
ASR_MODEL_ID = "Zarnabh/whisper-large-ps"

print("Loading ASR model...")
processor = WhisperProcessor.from_pretrained(ASR_MODEL_ID)
asr_model = WhisperForConditionalGeneration.from_pretrained(ASR_MODEL_ID)
asr_model = asr_model.to(DEVICE)
asr_model.eval()  # evaluation mode: the model is only used for inference
# Announce success only once the model is actually on its target device.
print("✅ ASR model loaded")
# ─────────────────────────────────────────────
# CORE FUNCTIONS
# ─────────────────────────────────────────────
def transcribe(audio_path):
    """Transcribe a Pashto audio file with the fine-tuned Whisper model.

    The file at ``audio_path`` is loaded as 16 kHz mono audio, converted to
    model features by ``processor`` and decoded by ``asr_model`` on
    ``DEVICE``. Returns the first decoded hypothesis with surrounding
    whitespace removed.
    """
    target_rate = 16000
    waveform, _ = librosa.load(audio_path, sr=target_rate, mono=True)
    features = processor(
        waveform, sampling_rate=target_rate, return_tensors="pt"
    ).to(DEVICE)

    # Gradients are never needed for inference.
    with torch.no_grad():
        token_ids = asr_model.generate(
            features["input_features"],
            language="ps",
            task="transcribe",
        )

    decoded = processor.batch_decode(token_ids, skip_special_tokens=True)
    return decoded[0].strip()
def run_pipeline(audio_path):
    """Transcribe ``audio_path`` and build both UI outputs.

    Returns a ``(html, text)`` pair: the rendered result card and the plain
    transcript. When no audio is supplied, the pair is the empty-state card
    and a warning message instead.
    """
    # Guard clause: nothing to transcribe.
    if not audio_path:
        return (
            build_html_empty(),
            "⚠️ Please provide audio to transcribe.",
        )

    text = transcribe(audio_path)
    return build_html(text, "ASR Transcription"), text
def build_html(transcript, source_label):
    """Render the transcript result as a styled HTML string.

    Args:
        transcript: Recognised text; split on whitespace to derive the word
            count and the per-word pills.
        source_label: Short caption shown in the badge above the transcript.

    Returns:
        An HTML fragment containing the source badge with the word count,
        the right-to-left transcript panel, and one pill per word.
    """
    # Local import keeps this function self-contained.
    from html import escape

    # Split once and reuse the result for both the count and the pills.
    words = transcript.split()
    word_count = len(words)
    plural = "s" if word_count != 1 else ""

    # The text is interpolated into markup, so escape it to stop stray
    # "<", ">" or "&" characters from being parsed as HTML.
    safe_transcript = escape(transcript)
    safe_label = escape(source_label)

    # Build word pills
    word_pills = "".join(
        f"""
<span style="
display:inline-block;
background:#0d1a2e;
border:1px solid #1e3a5a;
color:#b0cce8;
border-radius:8px;
padding:4px 10px;
margin:3px 4px;
font-size:14px;
font-family:'Noto Naskh Arabic',serif;
direction:rtl;">
{escape(word)}
</span>"""
        for word in words
    )
    return f"""
<div style="font-family:'Inter',sans-serif;">
<!-- Source badge -->
<div style="display:flex;align-items:center;gap:10px;margin-bottom:14px;">
<div style="background:linear-gradient(135deg,#1a3a6e,#1565c0);
color:#e3f2fd;font-size:12px;font-weight:700;
letter-spacing:0.08em;text-transform:uppercase;
padding:6px 14px;border-radius:20px;">
🎙️ {safe_label}
</div>
<div style="color:#546e7a;font-size:11px;">
{word_count} word{plural}
</div>
</div>
<!-- Transcript display -->
<div style="background:#0a1220;border:1px solid #1a4a7a;border-radius:12px;
padding:18px 20px;margin-bottom:16px;
direction:rtl;text-align:right;
font-family:'Noto Naskh Arabic',serif;
font-size:20px;font-weight:600;
color:#ddeeff;line-height:1.8;">
{safe_transcript}
</div>
<!-- Word tokens -->
<div style="background:#0d1626;border:1px solid #1e3050;border-radius:12px;padding:14px 16px;">
<div style="color:#5c8abf;font-size:11px;font-weight:700;
letter-spacing:0.1em;text-transform:uppercase;margin-bottom:10px;">
🔤 Tokenized Words
</div>
<div style="display:flex;flex-wrap:wrap;gap:2px;direction:rtl;">
{word_pills}
</div>
</div>
</div>
"""
def build_html_empty():
    """Return the placeholder card shown when no audio was supplied."""
    # The hint names only the inputs the interface offers (microphone
    # recording and file upload); there is no text field to type into.
    return """
<div style="background:#0d1626;border:1px solid #1e3050;border-radius:12px;
padding:32px;text-align:center;color:#3d5a7a;font-family:'Inter',sans-serif;">
<div style="font-size:36px;margin-bottom:12px;">🎙️</div>
<div style="font-size:14px;font-weight:600;">No input provided</div>
<div style="font-size:12px;margin-top:6px;">Record audio or upload a file to transcribe.</div>
</div>
"""
# ─────────────────────────────────────────────
# CSS
# ─────────────────────────────────────────────
# Custom stylesheet handed to gr.Blocks(css=...). It imports the Inter and
# Noto Naskh Arabic web fonts, applies a dark palette to the page, text
# inputs and labels, styles the right-to-left transcript box
# (#transcript-box) and the transcribe button (.run-btn), and hides the
# page footer.
css = """
@import url('https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800&family=Noto+Naskh+Arabic:wght@400;600;700&display=swap');
body, .gradio-container {
background: #0a0e1a !important;
min-height: 100vh;
font-family: 'Inter', sans-serif !important;
}
.gr-panel, .gr-box, .gradio-container .gap {
background: transparent !important;
}
textarea, input[type="text"] {
color: #e8f0ff !important;
background-color: #0d1626 !important;
border: 1px solid #1e3050 !important;
border-radius: 10px !important;
font-size: 15px !important;
caret-color: #4fc3f7 !important;
font-family: 'Inter', sans-serif !important;
}
.dark textarea,
.dark input[type="text"],
.dark .gr-textbox textarea,
.dark [data-testid="textbox"] textarea,
.dark [data-testid="textbox"] input {
color: #e8f0ff !important;
background-color: #0d1626 !important;
border: 1px solid #1e3050 !important;
font-size: 15px !important;
caret-color: #4fc3f7 !important;
}
textarea:focus, input[type="text"]:focus,
.dark textarea:focus, .dark input[type="text"]:focus {
border-color: #4fc3f7 !important;
box-shadow: 0 0 0 3px rgba(79,195,247,0.12) !important;
outline: none !important;
}
textarea::placeholder, input[type="text"]::placeholder,
.dark textarea::placeholder, .dark input[type="text"]::placeholder {
color: #3d5a7a !important;
opacity: 1 !important;
}
label span,
.dark label span,
.gr-textbox label span,
.dark .gr-textbox label span,
.dark [data-testid="textbox"] label span {
color: #7aa8d8 !important;
font-weight: 700 !important;
font-size: 12px !important;
letter-spacing: 0.09em;
text-transform: uppercase;
font-family: 'Inter', sans-serif !important;
}
#transcript-box textarea,
.dark #transcript-box textarea {
color: #ddeeff !important;
background-color: #0a1220 !important;
border: 1px solid #1a4a7a !important;
font-size: 16px !important;
font-weight: 600;
direction: rtl;
text-align: right;
font-family: 'Noto Naskh Arabic', serif !important;
}
.run-btn {
background: linear-gradient(135deg, #1a237e, #1565c0) !important;
color: #ffffff !important;
font-size: 15px !important;
font-weight: 700 !important;
letter-spacing: 0.08em;
border: none !important;
border-radius: 10px !important;
padding: 14px 0 !important;
box-shadow: 0 4px 20px rgba(21,101,192,0.4) !important;
transition: all 0.2s ease !important;
font-family: 'Inter', sans-serif !important;
text-transform: uppercase;
}
.run-btn:hover {
background: linear-gradient(135deg, #283593, #1976d2) !important;
box-shadow: 0 6px 28px rgba(21,101,192,0.6) !important;
transform: translateY(-2px) !important;
}
.gr-audio {
background: #0d1626 !important;
border: 1px solid #1e3050 !important;
border-radius: 12px !important;
}
::-webkit-scrollbar { width: 5px; background: #0a0e1a; }
::-webkit-scrollbar-thumb { background: #1e3a5a; border-radius: 3px; }
footer { display: none !important; }
"""
# ─────────────────────────────────────────────
# GRADIO UI
# ─────────────────────────────────────────────
# Assemble the Gradio interface: a branded header, a left column with the
# audio input, transcribe button and info panel, a right column with the
# plain-text transcript and the rendered result card, and a footer.
with gr.Blocks(css=css, title="Pashto ASR") as demo:
    # Emit each logo <img> only when its data URI is non-empty, mirroring the
    # microphone-icon fallback below; an empty src would otherwise render as
    # a broken image when the logo file is missing.
    _mcs_img = (
        f'<img src="{MCS_SRC}" '
        'style="width:64px;height:64px;border-radius:50%;border:2.5px solid #b71c1c;'
        'box-shadow:0 0 14px rgba(183,28,28,0.45);" alt="MCS Logo"/>'
    ) if MCS_SRC else ""
    _nust_img = (
        f'<img src="{NUST_SRC}" '
        'style="width:64px;height:64px;border-radius:50%;border:2.5px solid #1565c0;'
        'box-shadow:0 0 14px rgba(21,101,192,0.45);" alt="NUST Logo"/>'
    ) if NUST_SRC else ""

    # -- HEADER ---------------------------------------------------------
    gr.HTML(f"""
<div style="
background:linear-gradient(135deg,#060a14 0%,#0d1a30 55%,#060d1c 100%);
border-bottom:1px solid #152440;
padding:22px 28px 18px 28px;
margin-bottom:2px;">
<div style="display:flex;align-items:center;justify-content:space-between;flex-wrap:wrap;gap:14px;">
<!-- MCS Logo + label -->
<div style="display:flex;align-items:center;gap:12px;flex:0 0 auto;">
{_mcs_img}
<div>
<div style="color:#ef9a9a;font-size:11px;font-weight:800;letter-spacing:0.12em;
text-transform:uppercase;font-family:'Inter',sans-serif;">Military College of Signals</div>
<div style="color:#b0bec5;font-size:10px;letter-spacing:0.05em;font-family:'Inter',sans-serif;">Rawalpindi</div>
</div>
</div>
<!-- Center title -->
<div style="text-align:center;flex:1;min-width:240px;">
<div style="font-family:'Inter',sans-serif;font-size:clamp(18px,2.8vw,26px);font-weight:800;
color:#e3f2fd;letter-spacing:0.03em;line-height:1.2;
text-shadow:0 2px 10px rgba(79,195,247,0.25);">
Pashto ASR
</div>
<div style="font-family:'Inter',sans-serif;font-size:clamp(11px,1.4vw,13px);font-weight:600;
color:#546e7a;letter-spacing:0.06em;margin-top:5px;">
Whisper &nbsp;·&nbsp; Transformers
</div>
<div style="margin-top:10px;display:inline-block;
background:linear-gradient(135deg,#0d2a0d,#1b5e20);
border:2px solid #00e676;
border-radius:10px;padding:5px 18px;
box-shadow:0 0 18px rgba(0,230,118,0.45),0 0 6px rgba(0,230,118,0.25);">
<span style="font-family:'Inter',sans-serif;font-size:11px;font-weight:700;
color:#69f0ae;letter-spacing:0.12em;text-transform:uppercase;">
WER
</span>
<span style="font-family:'Inter',sans-serif;font-size:16px;font-weight:900;
color:#00e676;letter-spacing:0.05em;margin-left:8px;
text-shadow:0 0 10px rgba(0,230,118,0.7);">
24.19%
</span>
</div>
</div>
<!-- NUST Logo + label -->
<div style="display:flex;align-items:center;gap:12px;flex:0 0 auto;">
<div style="text-align:right;">
<div style="color:#90caf9;font-size:11px;font-weight:800;letter-spacing:0.12em;
text-transform:uppercase;font-family:'Inter',sans-serif;">National University of</div>
<div style="color:#b0bec5;font-size:10px;letter-spacing:0.05em;font-family:'Inter',sans-serif;">Sciences &amp; Technology</div>
</div>
{_nust_img}
</div>
</div>
</div>
""")

    # -- MAIN LAYOUT ----------------------------------------------------
    with gr.Row(equal_height=True):
        # -- LEFT COLUMN: audio input, action button, info panel --------
        with gr.Column(scale=1, min_width=300):
            # Use the microphone image when available, else an emoji.
            _mic_img = f'<img src="{MIC_SRC}" style="width:20px;height:20px;vertical-align:middle;margin-right:8px;filter:brightness(1.2);">' if MIC_SRC else "🎙️&nbsp;"
            gr.HTML(f"""
<div style="background:#0d1626;border:1px solid #1e3050;border-radius:12px 12px 0 0;
padding:10px 16px 8px 16px;margin-bottom:-2px;">
<div style="display:flex;align-items:center;gap:4px;">
{_mic_img}
<span style="color:#7aa8d8;font-size:12px;font-weight:700;
letter-spacing:0.09em;text-transform:uppercase;
font-family:'Inter',sans-serif;">Audio Input</span>
</div>
</div>
""")
            # type="filepath" hands run_pipeline a path on disk, which is
            # what transcribe() passes to librosa.load().
            audio_input = gr.Audio(
                sources=["microphone", "upload"],
                type="filepath",
                label="Record or Upload Pashto Audio",
            )
            run_btn = gr.Button("🎙️ Transcribe Speech", variant="primary", size="lg", elem_classes="run-btn")
            # Info panel
            gr.HTML("""
<div style="margin-top:12px;background:#0d1626;
border:1px solid #1e3050;border-radius:12px;
padding:14px 16px;font-size:12px;line-height:1.9;
font-family:'Inter',sans-serif;">
<div style="color:#5c8abf;font-weight:700;font-size:11px;
letter-spacing:0.1em;text-transform:uppercase;margin-bottom:8px;">
🧠 How It Works
</div>
<div style="display:grid;grid-template-columns:22px 1fr;gap:3px 8px;align-items:start;color:#546e7a;">
<span>🎙️</span><span><b style="color:#cdd8e8">Record</b> — use your microphone directly</span>
<span>📂</span><span><b style="color:#cdd8e8">Upload</b> — submit a .wav / .mp3 file</span>
</div>
<div style="height:1px;background:rgba(79,195,247,0.1);margin:10px 0;"></div>
<div style="color:#5c8abf;font-weight:700;font-size:11px;
letter-spacing:0.1em;text-transform:uppercase;margin-bottom:6px;">
⚙️ Model
</div>
<div style="color:#546e7a;font-size:11px;line-height:1.7;">
<b style="color:#90caf9">Zarnabh/whisper-large-ps</b><br/>
Fine-tuned Whisper for Pashto speech recognition
</div>
</div>
""")
        # -- RIGHT COLUMN: plain transcript and rendered result card ----
        with gr.Column(scale=2):
            transcript_out = gr.Textbox(
                label="📝 ASR Transcript",
                interactive=False,
                rtl=True,
                text_align="right",
                lines=2,
                elem_id="transcript-box",
            )
            gr.HTML("""
<div style="display:flex;align-items:center;gap:8px;margin-top:14px;margin-bottom:6px;">
<div style="width:28px;height:28px;border-radius:7px;
background:linear-gradient(135deg,#1a3a6e,#1565c0);
display:flex;align-items:center;justify-content:center;font-size:14px;">📊</div>
<span style="color:#90caf9;font-size:12px;font-weight:700;
letter-spacing:0.09em;text-transform:uppercase;
font-family:'Inter',sans-serif;">Transcription Result</span>
</div>
""")
            html_out = gr.HTML()

    # Output order matches run_pipeline's (html, transcript) return value.
    run_btn.click(
        fn=run_pipeline,
        inputs=[audio_input],
        outputs=[html_out, transcript_out],
    )

    # -- FOOTER ---------------------------------------------------------
    gr.HTML("""
<div style="margin-top:24px;padding:12px 24px;
background:linear-gradient(90deg,#060a14,#0d1a2e,#060a14);
border-top:1px solid #152440;
display:flex;flex-wrap:wrap;justify-content:space-between;
align-items:center;gap:8px;font-family:'Inter',sans-serif;">
<div style="color:#2a4a6a;font-size:11px;letter-spacing:0.04em;">
<b style="color:#4a7aab">ASR Model:</b> Zarnabh/whisper-large-ps
</div>
<div style="color:#2a4a6a;font-size:11px;letter-spacing:0.04em;">
MCS &amp; NUST &nbsp;·&nbsp; Pashto NLP Research
</div>
</div>
""")

# Start the web server only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()