Spaces:

afaqalinagra
/

PASHTO-ASR-MODEL

Sleeping

App Files Files Community

afaqalinagra committed on Jan 23

Commit

30b7049

verified ·

1 Parent(s): bbbf3e8

Update app.py

Browse files

Files changed (1) hide show

app.py +81 -81

app.py CHANGED Viewed

@@ -2,161 +2,161 @@ import gradio as gr
 import torch
 import numpy as np
 import librosa
-from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
 # =========================
-# MODEL CONFIGURATION
 # =========================
-MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
-DEVICE = "cpu"
-DTYPE = torch.float32
 # =========================
 # LOAD MODEL & PROCESSOR
 # =========================
-processor = AutoProcessor.from_pretrained(MODEL_ID)
-model = AutoModelForSpeechSeq2Seq.from_pretrained(
     MODEL_ID,
-    torch_dtype=DTYPE,
-    low_cpu_mem_usage=True
 )
-model.to(DEVICE)
-model.eval()
 # =========================
-# ASR FUNCTION
 # =========================
def transcribe(audio):
    """Transcribe one Gradio audio clip with the loaded ASR model.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when the user supplied
            nothing.

    Returns:
        The decoded text with surrounding whitespace stripped, or the
        message ``"No audio provided."`` when there is nothing to process.
    """
    if audio is None:
        return "No audio provided."

    sample_rate, waveform = audio
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        # An empty clip (e.g. a recording stopped immediately) has nothing
        # to feed the feature extractor; treat it like missing input.
        return "No audio provided."

    # Gradio hands over integer PCM (typically int16). A bare float cast
    # would leave amplitudes around +/-32768, far outside the [-1, 1] range
    # speech models are trained on, so rescale by the dtype's full scale.
    if np.issubdtype(waveform.dtype, np.integer):
        waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
    else:
        waveform = waveform.astype(np.float32)

    # Convert stereo to mono (samples are laid out as (frames, channels)).
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Resample to 16kHz (mandatory for ASR)
    if sample_rate != 16000:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=16000
        )

    inputs = processor(
        waveform,
        sampling_rate=16000,
        return_tensors="pt"
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        generated_ids = model.generate(
            inputs.input_features.to(DEVICE)
        )

    transcription = processor.batch_decode(
        generated_ids,
        skip_special_tokens=True
    )[0]
    return transcription.strip()
 # =========================
-# CUSTOM GLASS-MORPHISM CSS
 # =========================
-custom_css = """
 body {
-    background: linear-gradient(135deg, #1e1e2f, #2b5876);
-    font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont;
 }
-.glass-card {
-    background: rgba(255, 255, 255, 0.15);
-    backdrop-filter: blur(16px);
-    -webkit-backdrop-filter: blur(16px);
-    border-radius: 22px;
-    padding: 28px;
-    border: 1px solid rgba(255, 255, 255, 0.25);
-    box-shadow: 0 10px 40px rgba(0, 0, 0, 0.35);
 }
-h1, h2, h3, label {
-    color: white !important;
 }
-.gr-button {
-    background: linear-gradient(135deg, #ff7a18, #ffb347);
-    border-radius: 14px;
-    font-weight: 600;
-    color: black;
-    height: 48px;
 }
-.gr-textbox textarea {
-    background: rgba(255, 255, 255, 0.25);
-    color: white;
-    border-radius: 12px;
 }
-.gr-audio {
-    background: rgba(255, 255, 255, 0.18);
-    border-radius: 14px;
 }
 """
 # =========================
 # GRADIO UI
 # =========================
-with gr.Blocks(css=custom_css) as demo:
-    with gr.Column(elem_classes=["glass-card"]):
         gr.Markdown(
             """
-            <h1 style="text-align:center;">Pashto Speech-to-Text</h1>
-            <h3 style="text-align:center;">Powered by Custom ASR Model</h3>
-            <p style="text-align:center; color:white;">
-            Upload or record Pashto audio and receive accurate transcription.
-            </p>
             """
         )
-        with gr.Row():
-            with gr.Column(scale=1):
-                audio_input = gr.Audio(
-                    sources=["upload", "microphone"],
-                    type="numpy",
-                    label="Upload or Record Pashto Audio"
-                )
-                transcribe_btn = gr.Button("Transcribe")
-            with gr.Column(scale=1):
-                output_text = gr.Textbox(
-                    label="Transcription Output",
-                    lines=8,
-                    placeholder="Transcribed text will appear here..."
-                )
         transcribe_btn.click(
-            fn=transcribe,
             inputs=audio_input,
             outputs=output_text
         )
 # =========================
 # LAUNCH
 # =========================
-demo.launch()

 import torch
 import numpy as np
 import librosa
+from transformers import (
+    WhisperProcessor,
+    WhisperForConditionalGeneration
+)
 # =========================
+# CONFIGURATION
 # =========================
+MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
+DEVICE = "cpu"   # HF free tier = CPU only
+TARGET_SAMPLE_RATE = 16000
 # =========================
 # LOAD MODEL & PROCESSOR
 # =========================
+processor = WhisperProcessor.from_pretrained(
     MODEL_ID,
+    language="pashto",
+    task="transcribe"
 )
+model = WhisperForConditionalGeneration.from_pretrained(
+    MODEL_ID
+).to(DEVICE)
+model.eval()
 # =========================
+# TRANSCRIPTION FUNCTION
 # =========================
def transcribe_audio(audio):
    """Transcribe one Gradio audio clip with the loaded Whisper model.

    Args:
        audio: ``(sample_rate, samples)`` tuple as delivered by
            ``gr.Audio(type="numpy")``, or ``None`` when the user supplied
            nothing.

    Returns:
        The decoded text with surrounding whitespace stripped, or ``""``
        when there is no audio to process.
    """
    if audio is None:
        return ""

    # audio = (sample_rate, numpy_array)
    sample_rate, waveform = audio
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        # An empty clip (e.g. a recording stopped immediately) has nothing
        # to feed the feature extractor.
        return ""

    # Gradio hands over integer PCM (typically int16). librosa.resample only
    # accepts floating-point audio, and the model expects amplitudes in
    # [-1, 1], so rescale integer samples by the dtype's full-scale value.
    if np.issubdtype(waveform.dtype, np.integer):
        waveform = waveform.astype(np.float32) / np.iinfo(waveform.dtype).max
    else:
        waveform = waveform.astype(np.float32)

    # Convert stereo to mono if needed (samples are (frames, channels)).
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Resample to 16kHz if needed
    if sample_rate != TARGET_SAMPLE_RATE:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SAMPLE_RATE
        )

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt"
    )

    # Inference only: skip autograd bookkeeping.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True
    )[0]
    return transcription.strip()
 # =========================
+# CUSTOM GLASSMORPHISM CSS
 # =========================
+CUSTOM_CSS = """
 body {
+    background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
+    font-family: 'Inter', sans-serif;
 }
+.gradio-container {
+    max-width: 1100px !important;
+    margin: auto;
 }
+.glass {
+    background: rgba(255, 255, 255, 0.12);
+    backdrop-filter: blur(18px);
+    -webkit-backdrop-filter: blur(18px);
+    border-radius: 18px;
+    border: 1px solid rgba(255, 255, 255, 0.25);
+    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
+    padding: 24px;
 }
+h1, h3, p {
+    color: #ffffff !important;
+    text-align: center;
 }
+button {
+    background: linear-gradient(135deg, #ff8008, #ffc837) !important;
+    color: #000000 !important;
+    font-weight: 600 !important;
+    border-radius: 10px !important;
 }
+textarea {
+    font-size: 16px !important;
 }
 """
 # =========================
 # GRADIO UI
 # =========================
+with gr.Blocks(css=CUSTOM_CSS) as demo:
+    with gr.Column(elem_classes="glass"):
         gr.Markdown(
             """
+            # 🎙️ Pashto Speech-to-Text
+            ### Powered by Whisper ASR
+            Upload or record Pashto audio and get accurate transcription.
             """
         )
+        audio_input = gr.Audio(
+            sources=["upload", "microphone"],
+            type="numpy",
+            label="Upload or Record Pashto Audio"
+        )
+        transcribe_btn = gr.Button("Transcribe")
+        output_text = gr.Textbox(
+            label="Transcription Output",
+            lines=6,
+            placeholder="Pashto transcription will appear here..."
+        )
         transcribe_btn.click(
+            fn=transcribe_audio,
             inputs=audio_input,
             outputs=output_text
         )
+        gr.Markdown(
+            """
+            <hr>
+            <p>
+            Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
+            Runs entirely on Hugging Face free infrastructure.
+            </p>
+            """
+        )
 # =========================
 # LAUNCH
 # =========================
+if __name__ == "__main__":
+    demo.launch()