"""Whisper-based Pashto speech recognition: model setup and transcription handler."""

import gradio as gr
import librosa
import numpy as np
import torch
from transformers import WhisperForConditionalGeneration, WhisperProcessor

# =========================
# CONFIG
# =========================
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
DEVICE = "cpu"
TARGET_SR = 16000  # sampling rate (Hz) passed to the processor

# =========================
# LOAD MODEL
# =========================
processor = WhisperProcessor.from_pretrained(
    MODEL_ID,
    language="pashto",
    task="transcribe",
)

model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID).to(DEVICE)
model.eval()


# =========================
# SAFE AUDIO HANDLER
# =========================
def _to_mono_float32(waveform: np.ndarray) -> np.ndarray:
    """Return *waveform* as a 1-D float32 array, scaling integer PCM to [-1, 1]."""
    if np.issubdtype(waveform.dtype, np.integer):
        info = np.iinfo(waveform.dtype)
        samples = waveform.astype(np.float32)
        if info.min < 0:
            # Signed PCM (e.g. int16): divide by the magnitude of the most
            # negative value so the result lies in [-1, 1).
            samples /= float(-info.min)
        else:
            # Unsigned PCM is centred on the midpoint of its range.
            midpoint = (info.max + 1) / 2.0
            samples = (samples - midpoint) / midpoint
    else:
        samples = waveform.astype(np.float32)

    # Scaling happens before the down-mix because averaging an integer array
    # yields floats and would hide the original sample width.
    if samples.ndim > 1:
        # Axis 1 is treated as the channel axis, i.e. (samples, channels).
        samples = samples.mean(axis=1)
    return samples


def transcribe_audio(audio) -> str:
    """Transcribe one audio clip with the module-level Whisper model.

    Args:
        audio: Either a ``(sample_rate, waveform)`` tuple or a dict with
            ``"sampling_rate"`` and ``"data"`` keys. ``None`` means that no
            audio was supplied.

    Returns:
        The stripped transcription, or a short human-readable message when
        the input is missing, invalid, empty, or yields no text.
    """
    if audio is None:
        return "No audio provided."

    # ---- HANDLE BOTH GRADIO FORMATS ----
    if isinstance(audio, dict):
        waveform = audio.get("data")
        sample_rate = audio.get("sampling_rate")
    else:
        sample_rate, waveform = audio

    if waveform is None or sample_rate is None:
        return "Invalid audio input."

    # The dict format may carry a plain sequence; normalise to an ndarray so
    # the dtype / ndim handling below always works.
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        return "Empty audio."

    samples = _to_mono_float32(waveform)

    # Resample to 16kHz
    if sample_rate != TARGET_SR:
        samples = librosa.resample(
            samples,
            orig_sr=sample_rate,
            target_sr=TARGET_SR,
        )

    inputs = processor(
        samples,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0].strip()

    # Strip first so whitespace-only output also falls back to the message.
    return transcription or "No speech detected."
# ========================= # GLASSMORPHISM CSS # ========================= CUSTOM_CSS = """ body { background: linear-gradient(135deg, #0f2027, #203a43, #2c5364); font-family: Inter, sans-serif; } .gradio-container { max-width: 1100px !important; margin: auto; } .glass { background: rgba(255, 255, 255, 0.12); backdrop-filter: blur(18px); -webkit-backdrop-filter: blur(18px); border-radius: 18px; border: 1px solid rgba(255, 255, 255, 0.25); box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35); padding: 24px; } h1, h3, p { color: white !important; text-align: center; } button { background: linear-gradient(135deg, #ff8008, #ffc837) !important; color: black !important; font-weight: 600 !important; border-radius: 10px !important; } textarea { font-size: 16px !important; } """ # ========================= # UI # ========================= with gr.Blocks(css=CUSTOM_CSS) as demo: with gr.Column(elem_classes="glass"): gr.Markdown( """ # 🎙️ Pashto Speech-to-Text ### Powered by Whisper ASR Upload or record Pashto audio and get accurate transcription. """ ) audio_input = gr.Audio( sources=["upload", "microphone"], type="numpy", label="Upload or Record Pashto Audio" ) transcribe_btn = gr.Button("Transcribe") output_text = gr.Textbox( label="Transcription Output", lines=6, placeholder="Pashto transcription will appear here..." ) transcribe_btn.click( fn=transcribe_audio, inputs=audio_input, outputs=output_text ) gr.Markdown( """
Developed for low-resource Pashto ASR using Whisper fine-tuning.
Runs entirely on Hugging Face free infrastructure.