afaqalinagra committed on
Commit
30b7049
·
verified ·
1 Parent(s): bbbf3e8

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +81 -81
app.py CHANGED
@@ -2,161 +2,161 @@ import gradio as gr
2
  import torch
3
  import numpy as np
4
  import librosa
5
- from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq
6
 
 
 
 
 
7
 
8
  # =========================
9
- # MODEL CONFIGURATION
10
  # =========================
11
- MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
12
-
13
- DEVICE = "cpu"
14
- DTYPE = torch.float32
15
-
16
 
17
  # =========================
18
  # LOAD MODEL & PROCESSOR
19
  # =========================
20
- processor = AutoProcessor.from_pretrained(MODEL_ID)
21
-
22
- model = AutoModelForSpeechSeq2Seq.from_pretrained(
23
  MODEL_ID,
24
- torch_dtype=DTYPE,
25
- low_cpu_mem_usage=True
26
  )
27
 
28
- model.to(DEVICE)
29
- model.eval()
 
30
 
 
31
 
32
  # =========================
33
- # ASR FUNCTION
34
  # =========================
35
- def transcribe(audio):
36
  if audio is None:
37
- return "No audio provided."
38
 
 
39
  sample_rate, waveform = audio
40
 
41
- # Convert stereo to mono
42
  if waveform.ndim > 1:
43
  waveform = np.mean(waveform, axis=1)
44
 
45
- # Ensure float32
46
- waveform = waveform.astype(np.float32)
47
-
48
- # Resample to 16kHz (mandatory for ASR)
49
- if sample_rate != 16000:
50
  waveform = librosa.resample(
51
  waveform,
52
  orig_sr=sample_rate,
53
- target_sr=16000
54
  )
55
 
56
  inputs = processor(
57
  waveform,
58
- sampling_rate=16000,
59
  return_tensors="pt"
60
  )
61
 
62
  with torch.no_grad():
63
- generated_ids = model.generate(
64
- inputs.input_features.to(DEVICE)
 
65
  )
66
 
67
  transcription = processor.batch_decode(
68
- generated_ids,
69
  skip_special_tokens=True
70
  )[0]
71
 
72
  return transcription.strip()
73
 
74
-
75
  # =========================
76
- # CUSTOM GLASS-MORPHISM CSS
77
  # =========================
78
- custom_css = """
79
  body {
80
- background: linear-gradient(135deg, #1e1e2f, #2b5876);
81
- font-family: Inter, system-ui, -apple-system, BlinkMacSystemFont;
82
  }
83
 
84
- .glass-card {
85
- background: rgba(255, 255, 255, 0.15);
86
- backdrop-filter: blur(16px);
87
- -webkit-backdrop-filter: blur(16px);
88
- border-radius: 22px;
89
- padding: 28px;
90
- border: 1px solid rgba(255, 255, 255, 0.25);
91
- box-shadow: 0 10px 40px rgba(0, 0, 0, 0.35);
92
  }
93
 
94
- h1, h2, h3, label {
95
- color: white !important;
 
 
 
 
 
 
96
  }
97
 
98
- .gr-button {
99
- background: linear-gradient(135deg, #ff7a18, #ffb347);
100
- border-radius: 14px;
101
- font-weight: 600;
102
- color: black;
103
- height: 48px;
104
  }
105
 
106
- .gr-textbox textarea {
107
- background: rgba(255, 255, 255, 0.25);
108
- color: white;
109
- border-radius: 12px;
 
110
  }
111
 
112
- .gr-audio {
113
- background: rgba(255, 255, 255, 0.18);
114
- border-radius: 14px;
115
  }
116
  """
117
 
118
-
119
  # =========================
120
  # GRADIO UI
121
  # =========================
122
- with gr.Blocks(css=custom_css) as demo:
123
-
124
- with gr.Column(elem_classes=["glass-card"]):
125
  gr.Markdown(
126
  """
127
- <h1 style="text-align:center;">Pashto Speech-to-Text</h1>
128
- <h3 style="text-align:center;">Powered by Custom ASR Model</h3>
129
- <p style="text-align:center; color:white;">
130
- Upload or record Pashto audio and receive accurate transcription.
131
- </p>
132
  """
133
  )
134
 
135
- with gr.Row():
136
- with gr.Column(scale=1):
137
- audio_input = gr.Audio(
138
- sources=["upload", "microphone"],
139
- type="numpy",
140
- label="Upload or Record Pashto Audio"
141
- )
142
 
143
- transcribe_btn = gr.Button("Transcribe")
144
 
145
- with gr.Column(scale=1):
146
- output_text = gr.Textbox(
147
- label="Transcription Output",
148
- lines=8,
149
- placeholder="Transcribed text will appear here..."
150
- )
151
 
152
  transcribe_btn.click(
153
- fn=transcribe,
154
  inputs=audio_input,
155
  outputs=output_text
156
  )
157
 
 
 
 
 
 
 
 
 
 
158
 
159
  # =========================
160
  # LAUNCH
161
  # =========================
162
- demo.launch()
 
 
2
  import torch
3
  import numpy as np
4
  import librosa
 
5
 
6
+ from transformers import (
7
+ WhisperProcessor,
8
+ WhisperForConditionalGeneration
9
+ )
10
 
11
  # =========================
12
+ # CONFIGURATION
13
  # =========================
14
MODEL_ID = "afaqalinagra/PASHTO-ASR-MODEL"
# HF free tier = CPU only
DEVICE = "cpu"
TARGET_SAMPLE_RATE = 16_000

# =========================
# LOAD MODEL & PROCESSOR
# =========================
# The processor is loaded with the Pashto "transcribe" settings up front so
# the rest of the module never has to pass language/task again.
processor = WhisperProcessor.from_pretrained(
    MODEL_ID, language="pashto", task="transcribe"
)

# Load the weights, move them to the target device, then switch to
# inference mode before any request is served.
model = WhisperForConditionalGeneration.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
model.eval()
32
 
33
  # =========================
34
+ # TRANSCRIPTION FUNCTION
35
  # =========================
36
def _pcm_to_float32(samples):
    """Return *samples* as float32, scaling integer PCM into [-1.0, 1.0)."""
    if np.issubdtype(samples.dtype, np.integer):
        info = np.iinfo(samples.dtype)
        if info.min == 0:
            # Unsigned PCM is offset-binary: re-centre around zero, then scale.
            half = (info.max + 1) / 2.0
            return ((samples.astype(np.float32) - half) / half).astype(np.float32)
        # Signed PCM: divide by the magnitude of the most negative value
        # (32768 for int16) so full scale maps to [-1.0, 1.0).
        return samples.astype(np.float32) / np.float32(-float(info.min))
    return samples.astype(np.float32)


def transcribe_audio(audio):
    """Transcribe a Gradio audio payload with the loaded Whisper model.

    Args:
        audio: ``None`` or a ``(sample_rate, samples)`` tuple, the shape
            produced by ``gr.Audio(type="numpy")``. ``samples`` may be 1-D,
            or 2-D, in which case axis 1 is averaged away (assumed to be the
            channel axis). Integer PCM is rescaled to float32 in [-1, 1).

    Returns:
        The decoded text with surrounding whitespace stripped, or ``""`` when
        no audio was supplied or the clip contains zero samples.
    """
    if audio is None:
        return ""

    # audio = (sample_rate, numpy_array)
    sample_rate, waveform = audio
    waveform = np.asarray(waveform)
    if waveform.size == 0:
        # An empty recording has nothing to transcribe; skip the model call.
        return ""

    # librosa.resample only accepts floating-point audio, and the model's
    # feature extractor expects amplitudes in [-1, 1], so normalise first.
    waveform = _pcm_to_float32(waveform)

    # Convert stereo to mono if needed
    if waveform.ndim > 1:
        waveform = waveform.mean(axis=1)

    # Resample to the model's rate if needed
    if sample_rate != TARGET_SAMPLE_RATE:
        waveform = librosa.resample(
            waveform,
            orig_sr=sample_rate,
            target_sr=TARGET_SAMPLE_RATE,
        )

    inputs = processor(
        waveform,
        sampling_rate=TARGET_SAMPLE_RATE,
        return_tensors="pt",
    )

    # Inference only: no gradients needed.
    with torch.no_grad():
        predicted_ids = model.generate(
            inputs.input_features.to(DEVICE),
            max_length=448,
        )

    transcription = processor.batch_decode(
        predicted_ids,
        skip_special_tokens=True,
    )[0]

    return transcription.strip()
73
 
 
74
  # =========================
75
+ # CUSTOM GLASSMORPHISM CSS
76
  # =========================
77
# Stylesheet handed to gr.Blocks(css=...). Rules, in order: page background,
# centred container, frosted ".glass" panel, heading/paragraph colour,
# button gradient, textarea font size.
# NOTE(review): whitespace inside this literal was reconstructed (the scraped
# diff dropped indentation); CSS itself is whitespace-insensitive.
CUSTOM_CSS = """
body {
    background: linear-gradient(135deg, #0f2027, #203a43, #2c5364);
    font-family: 'Inter', sans-serif;
}

.gradio-container {
    max-width: 1100px !important;
    margin: auto;
}

.glass {
    background: rgba(255, 255, 255, 0.12);
    backdrop-filter: blur(18px);
    -webkit-backdrop-filter: blur(18px);
    border-radius: 18px;
    border: 1px solid rgba(255, 255, 255, 0.25);
    box-shadow: 0 8px 32px rgba(0, 0, 0, 0.35);
    padding: 24px;
}

h1, h3, p {
    color: #ffffff !important;
    text-align: center;
}

button {
    background: linear-gradient(135deg, #ff8008, #ffc837) !important;
    color: #000000 !important;
    font-weight: 600 !important;
    border-radius: 10px !important;
}

textarea {
    font-size: 16px !important;
}
"""
114
 
 
115
  # =========================
116
  # GRADIO UI
117
  # =========================
118
with gr.Blocks(css=CUSTOM_CSS) as demo:
    # NOTE(review): the scraped diff lost indentation, so the nesting here is
    # reconstructed. Every component is assumed to live inside the single
    # "glass" column -- confirm against the real app.py.
    with gr.Column(elem_classes="glass"):
        # Page heading
        gr.Markdown(
            """
            # 🎙️ Pashto Speech-to-Text
            ### Powered by Whisper ASR
            Upload or record Pashto audio and get accurate transcription.
            """
        )

        # Input: file upload or microphone, delivered as (sample_rate, ndarray)
        audio_input = gr.Audio(
            label="Upload or Record Pashto Audio",
            sources=["upload", "microphone"],
            type="numpy",
        )

        transcribe_btn = gr.Button("Transcribe")

        # Output: transcription text
        output_text = gr.Textbox(
            label="Transcription Output",
            placeholder="Pashto transcription will appear here...",
            lines=6,
        )

        # Clicking the button runs the transcription on the current audio.
        transcribe_btn.click(
            fn=transcribe_audio,
            inputs=audio_input,
            outputs=output_text,
        )

        # Footer
        gr.Markdown(
            """
            <hr>
            <p>
            Developed for low-resource Pashto ASR using Whisper fine-tuning.<br>
            Runs entirely on Hugging Face free infrastructure.
            </p>
            """
        )

# =========================
# LAUNCH
# =========================
# Only start the server when executed as a script, not on import.
if __name__ == "__main__":
    demo.launch()