# Hugging Face Space: afaqalinagra/PASHTO-ASR-MODEL
# Gradio app for Pashto speech-to-text with a fine-tuned Whisper model.
# (Web-page residue from the Spaces file viewer was removed here;
# revisions referenced in that residue: bbbf3e8, 6da03f5, 30b7049.)

import gradio as gr
import torch
import numpy as np
import librosa

from transformers import WhisperProcessor, WhisperForConditionalGeneration

# =========================
# CONFIG
# =========================
# Repository id handed to ``from_pretrained`` for both the processor and the model.
MODEL_ID: str = "afaqalinagra/PASHTO-ASR-MODEL"
# Device the model is moved to and that input features are sent to.
DEVICE: str = "cpu"
# Sample rate (Hz) audio is resampled to before it reaches the processor.
TARGET_SR: int = 16_000

# =========================
# LOAD MODEL
# =========================
# Processor for MODEL_ID; the language/task keyword arguments are forwarded
# to the loader together with the repository id.
processor = WhisperProcessor.from_pretrained(MODEL_ID, language="pashto", task="transcribe")

# Load the model, place it on DEVICE, then switch it to evaluation mode.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
model.eval()

# =========================
# SAFE AUDIO HANDLER
# =========================
def _prepare_waveform(waveform, sample_rate):
    """Return *waveform* as mono float32 audio sampled at ``TARGET_SR``.

    Integer PCM samples are scaled to the [-1.0, 1.0] range using the full
    scale of their dtype; floating-point samples are only cast to float32.
    Multi-dimensional input is averaged over axis 1 (assumes a
    (samples, channels) layout, as the original code did).
    """
    samples = np.asarray(waveform)

    if np.issubdtype(samples.dtype, np.integer):
        # Gradio's numpy audio is integer PCM (int16 per its docs); the Whisper
        # feature extractor expects floats in [-1, 1], so raw integer
        # amplitudes would badly skew the log-mel features.
        limits = np.iinfo(samples.dtype)
        full_scale = float(max(abs(limits.min), limits.max))
        samples = samples.astype(np.float32) / full_scale
    else:
        samples = samples.astype(np.float32)

    # Convert stereo → mono
    if samples.ndim > 1:
        samples = samples.mean(axis=1)

    # Resample to the rate the processor is told about below.
    if sample_rate != TARGET_SR:
        samples = librosa.resample(
            samples,
            orig_sr=sample_rate,
            target_sr=TARGET_SR,
        )

    return samples


def transcribe_audio(audio):
    """Transcribe a Gradio audio value to text.

    Args:
        audio: ``None``, a ``(sample_rate, waveform)`` tuple, or a dict with
            ``"data"`` and ``"sampling_rate"`` keys.

    Returns:
        The stripped transcription, or a short human-readable message when
        the input is missing, invalid, empty, or yields no text.
    """
    if audio is None:
        return "No audio provided."

    # ---- HANDLE BOTH GRADIO FORMATS ----
    if isinstance(audio, dict):
        waveform = audio.get("data")
        sample_rate = audio.get("sampling_rate")
    else:
        sample_rate, waveform = audio

    if waveform is None or sample_rate is None:
        return "Invalid audio input."

    # Coerce first so list-valued "data" works with the array operations below.
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        return "Empty audio."

    waveform = _prepare_waveform(waveform, sample_rate)

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SR,
        return_tensors="pt",
    )

    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    # Strip before the emptiness check so whitespace-only output is also
    # reported as "no speech" instead of returning an empty string.
    text = transcription.strip() if transcription else ""
    return text or "No speech detected."

# =========================
# GLASSMORPHISM CSS
# =========================
# Stylesheet passed to gr.Blocks(css=...) below. The ".glass" rule styles the
# column created with elem_classes="glass"; the remaining rules target the
# page body, the Gradio container, headings/paragraphs, buttons and textareas.
CUSTOM_CSS = """
body {
    background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
    font-family: Inter, sans-serif;
}

.gradio-container {
    max-width: 1100px !important;
    margin: auto;
}

.glass {
    background: rgba(255, 255, 255, 0.12);
    backdrop-filter: blur(18px);
    -webkit-backdrop-filter: blur(18px);
    border-radius: 18px;
    border: 1px solid rgba(255, 255, 255, 0.25);
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
    padding: 24px;
}

h1, h3, p {
    color: white !important;
    text-align: center;
}

button {
    background: linear-gradient(135deg, #ff8008, #ffc837) !important;
    color: black !important;
    font-weight: 600 !important;
    border-radius: 10px !important;
}

textarea {
    font-size: 16px !important;
}
"""

# =========================
# UI
# =========================
# Build the app: one column carrying the "glass" CSS class that holds, in
# order, a header, the audio input, the trigger button, the output box and a
# footer. Components are created inside the context managers, so their order
# here is significant.
with gr.Blocks(css=CUSTOM_CSS) as demo:
    with gr.Column(elem_classes="glass"):
        # Header text.
        gr.Markdown(
            """
            # 🎙️ Pashto Speech-to-Text  
            ### Powered by Whisper ASR  
            Upload or record Pashto audio and get accurate transcription.
            """
        )

        # Audio source: file upload or microphone. type="numpy" is what
        # transcribe_audio's (sample_rate, waveform) unpacking relies on.
        audio_input = gr.Audio(
            sources=["upload", "microphone"],
            type="numpy",
            label="Upload or Record Pashto Audio"
        )

        transcribe_btn = gr.Button("Transcribe")

        # Destination for the string returned by transcribe_audio.
        output_text = gr.Textbox(
            label="Transcription Output",
            lines=6,
            placeholder="Pashto transcription will appear here..."
        )

        # Clicking the button runs transcribe_audio on the audio input and
        # writes its return value into the textbox.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=output_text
        )

        # Footer text (raw HTML inside Markdown).
        gr.Markdown(
            """
            <hr>
            <p>
            Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
            Runs entirely on Hugging Face free infrastructure.
            </p>
            """
        )

# =========================
# LAUNCH
# =========================
# Start the Gradio server only when this file is executed as a script.
if __name__ == "__main__":
    demo.launch()