harsh2ai committed on
Commit
c70257c
·
1 Parent(s): 7d46a3c

updated ui for new model

Browse files
Files changed (1) hide show
  1. app.py +141 -123
app.py CHANGED
@@ -26,126 +26,114 @@ LOGO_URL = os.environ.get("STT_LOGO_URL", DEFAULT_LOGO_URL).strip()
26
  # Custom CSS for Ringg branding
27
  custom_css = """
28
  .gradio-container {
29
- font-family: 'Inter', sans-serif;
 
 
30
  }
 
31
  .main-header {
32
- display: flex;
33
- align-items: center;
34
- justify-content: center;
35
- gap: 20px;
36
- flex-wrap: nowrap;
37
- padding: 20px;
38
- background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
39
- color: white;
40
- border-radius: 10px;
41
- margin-bottom: 20px;
42
- max-width: 900px;
43
- margin-left: auto;
44
- margin-right: auto;
45
  }
 
46
  .main-header .main-logo {
47
- height: 60px;
48
- width: 60px;
49
- flex-shrink: 0;
50
- display: flex;
51
- align-items: center;
52
- justify-content: center;
53
  }
 
54
  .main-header .main-logo img {
55
- max-height: 100%;
56
- max-width: 100%;
57
- object-fit: contain;
58
  }
 
59
  .main-header .main-logo.main-logo--placeholder {
60
- background-color: rgba(255, 255, 255, 0.2);
61
- border-radius: 12px;
62
  }
 
63
  .main-header .main-text {
64
- text-align: left;
65
- display: flex;
66
- flex-direction: column;
67
- justify-content: center;
68
- min-width: 0;
69
  }
 
70
  .main-header .main-text h1 {
71
- margin: 0 0 6px;
72
  }
 
73
  .main-header .main-text p {
74
- margin: 0;
75
  }
 
76
  @media (max-width: 640px) {
77
- .main-header {
78
- flex-wrap: wrap;
79
- }
80
- .main-header .main-text {
81
- text-align: center;
82
- width: 100%;
83
- }
 
84
  }
 
85
  .status-dot {
86
- display: inline-block;
87
- width: 8px;
88
- height: 8px;
89
- border-radius: 50%;
90
- margin-left: 8px;
91
  }
 
92
  .status-dot.healthy {
93
- background-color: #22c55e;
94
- animation: pulse-green 2s ease-in-out infinite;
95
  }
 
96
  .status-dot.error {
97
- background-color: #ef4444;
98
- animation: pulse-red 2s ease-in-out infinite;
99
  }
 
100
  @keyframes pulse-green {
101
- 0% {
102
- box-shadow: 0 0 0 0 rgba(34, 197, 94, 0.7);
103
- }
104
- 70% {
105
- box-shadow: 0 0 0 6px rgba(34, 197, 94, 0);
106
- }
107
- 100% {
108
- box-shadow: 0 0 0 0 rgba(34, 197, 94, 0);
109
- }
110
  }
 
111
  @keyframes pulse-red {
112
- 0% {
113
- box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.7);
114
- }
115
- 70% {
116
- box-shadow: 0 0 0 6px rgba(239, 68, 68, 0);
117
- }
118
- 100% {
119
- box-shadow: 0 0 0 0 rgba(239, 68, 68, 0);
120
- }
121
- }
122
- div[data-testid="audio"] {
123
- min-height: 60px !important;
124
- max-height: 80px !important;
125
- }
126
- div[data-testid="audio"] > div {
127
- height: auto !important;
128
- min-height: auto !important;
129
- }
130
- .wrap.wrap.wrap.svelte-1w6y6zl {
131
- height: auto !important;
132
- min-height: auto !important;
133
- }
134
- .gradio-row {
135
- min-height: auto !important;
136
- }
137
- footer {
138
- visibility: hidden !important;
139
- height: 50px !important;
140
- }
141
- footer:after {
142
- content: "Made with ❤️ by RinggAI Team" !important;
143
- visibility: visible !important;
144
- display: block !important;
145
- text-align: center !important;
146
- margin-top: 15px !important;
147
- color: #666 !important;
148
- font-size: 14px !important;
149
  }
150
  """
151
 
@@ -225,9 +213,16 @@ def create_interface():
225
  def transcribe_audio(audio_file):
226
  """Transcribe uploaded audio"""
227
  if audio_file is None:
228
- return "Please upload an audio file!"
 
 
 
 
 
 
229
 
230
- return stt_client.transcribe_audio(audio_file)
 
231
 
232
  def check_api_status():
233
  """Check API health status"""
@@ -259,29 +254,32 @@ def create_interface():
259
  """)
260
 
261
  gr.Markdown(
262
- """ # 🎯 Performance Benchmarks \n #### **Ringg STT V0** Ranks **2nd** Among Top Models, Outperforming OpenAI Whisper Large-v3 and Other leading Solutions."""
 
 
 
263
  )
264
 
265
  with gr.Row():
266
  gr.DataFrame(
267
  value=[
268
- ["IndicWav2Vec (Winner)", "18.55%", "63.31%"],
269
- ["Ringg STT V0", "21.03%", "66.27%"],
270
- ["VakyanSh Wav2Vec2", "24.06%", "66.34%"],
271
- ["Whisper Large-v3", "29.17%", "63.31%"],
272
- ["Whisper Large-v2", "37.50%", "66.27%"],
273
  ],
274
- headers=["Model", "Indic Norm WER ↓", "Whisper Norm WER ↓"],
275
  datatype=["str", "str", "str"],
276
- row_count=5,
277
  col_count=(3, "fixed"),
 
278
  )
279
 
280
- gr.Markdown("""
281
- -----------------
282
- # 📁 Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
283
-
284
- """)
 
285
 
286
  with gr.Row():
287
  audio_input = gr.Audio(
@@ -297,26 +295,46 @@ def create_interface():
297
 
298
  file_output = gr.Textbox(
299
  label="Transcription Result",
300
- lines=3,
301
  interactive=True,
302
  placeholder="Upload a file and click Transcribe...",
303
  )
304
 
305
- transcribe_btn.click(transcribe_audio, inputs=audio_input, outputs=file_output)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
306
 
307
- # gr.Markdown("""
308
- # ### ✨ Features
309
- # - 🌐 **Hindi Support**: Accurate transcription for Hindi audio
310
- # - 🎯 **High Accuracy**: Competitive with leading ASR models
311
- # - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
312
- # - **Fast Processing**: Optimized for quick transcription
313
- # """)
 
 
 
 
314
 
315
- gr.Markdown("""
316
- # 🙏 Acknowledgements
317
- - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
318
- - Built with [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) models
319
- """)
 
 
320
 
321
  return demo
322
 
 
26
  # Custom CSS for Ringg branding
27
  custom_css = """
28
  .gradio-container {
29
+ font-family: 'Inter', sans-serif;
30
+ max-width: 950px;
31
+ margin: 0 auto;
32
  }
33
+
34
  .main-header {
35
+ display: flex;
36
+ align-items: center;
37
+ justify-content: center;
38
+ gap: 20px;
39
+ flex-wrap: nowrap;
40
+ padding: 20px;
41
+ background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
42
+ color: white;
43
+ border-radius: 10px;
44
+ margin-bottom: 20px;
45
+ max-width: 900px;
46
+ margin-left: auto;
47
+ margin-right: auto;
48
  }
49
+
50
  .main-header .main-logo {
51
+ height: 60px;
52
+ width: 60px;
53
+ flex-shrink: 0;
54
+ display: flex;
55
+ align-items: center;
56
+ justify-content: center;
57
  }
58
+
59
  .main-header .main-logo img {
60
+ max-height: 100%;
61
+ max-width: 100%;
62
+ object-fit: contain;
63
  }
64
+
65
  .main-header .main-logo.main-logo--placeholder {
66
+ background-color: rgba(255, 255, 255, 0.2);
67
+ border-radius: 12px;
68
  }
69
+
70
  .main-header .main-text {
71
+ text-align: left;
72
+ display: flex;
73
+ flex-direction: column;
74
+ justify-content: center;
75
+ min-width: 0;
76
  }
77
+
78
  .main-header .main-text h1 {
79
+ margin: 0 0 6px;
80
  }
81
+
82
  .main-header .main-text p {
83
+ margin: 0;
84
  }
85
+
86
  @media (max-width: 640px) {
87
+ .main-header {
88
+ flex-wrap: wrap;
89
+ }
90
+
91
+ .main-header .main-text {
92
+ text-align: center;
93
+ width: 100%;
94
+ }
95
  }
96
+
97
  .status-dot {
98
+ display: inline-block;
99
+ width: 8px;
100
+ height: 8px;
101
+ border-radius: 50%;
102
+ margin-left: 8px;
103
  }
104
+
105
  .status-dot.healthy {
106
+ background-color: #22c55e;
107
+ animation: pulse-green 2s ease-in-out infinite;
108
  }
109
+
110
  .status-dot.error {
111
+ background-color: #ef4444;
112
+ animation: pulse-red 2s ease-in-out infinite;
113
  }
114
+
115
  @keyframes pulse-green {
116
+ 0% {
117
+ box-shadow: 0 0 0 0 rgba(34, 197, 94, 0.7);
118
+ }
119
+ 70% {
120
+ box-shadow: 0 0 0 6px rgba(34, 197, 94, 0);
121
+ }
122
+ 100% {
123
+ box-shadow: 0 0 0 0 rgba(34, 197, 94, 0);
124
+ }
125
  }
126
+
127
  @keyframes pulse-red {
128
+ 0% {
129
+ box-shadow: 0 0 0 0 rgba(239, 68, 68, 0.7);
130
+ }
131
+ 70% {
132
+ box-shadow: 0 0 0 6px rgba(239, 68, 68, 0);
133
+ }
134
+ 100% {
135
+ box-shadow: 0 0 0 0 rgba(239, 68, 68, 0);
136
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
  }
138
  """
139
 
 
213
  def transcribe_audio(audio_file):
214
  """Transcribe uploaded audio"""
215
  if audio_file is None:
216
+ return "⚠️ Please upload an audio file to transcribe."
217
+
218
+ transcription = stt_client.transcribe_audio(audio_file)
219
+ text = (transcription or "").strip()
220
+
221
+ if not text or text.startswith("❌") or text.startswith("⏱"):
222
+ return text or "⚠️ No speech detected—try a clearer recording."
223
 
224
+ footer = "(Served via API • Remote backend)"
225
+ return f"{text}\n\n{footer}"
226
 
227
  def check_api_status():
228
  """Check API health status"""
 
254
  """)
255
 
256
  gr.Markdown(
257
+ """
258
+ # 🎯 Performance Benchmarks
259
+ #### **Ringg STT V0** Ranks **1st** Among Top Models, Outperforming OpenAI Whisper Large-v3 and Other Leading Solutions.
260
+ """
261
  )
262
 
263
  with gr.Row():
264
  gr.DataFrame(
265
  value=[
266
+ ["Elaichi STT (Ringg AI)", "15.00%", "15.92%"],
267
+ ["IndicWav2Vec ", "19.35%", "20.91%"],
268
+ ["VakyanSh Wav2Vec2", "22.73%", "24.78%"],
 
 
269
  ],
270
+ headers=["Model", "Median WER ↓", "Mean WER ↓"],
271
  datatype=["str", "str", "str"],
272
+ row_count=3,
273
  col_count=(3, "fixed"),
274
+ interactive=False,
275
  )
276
 
277
+ gr.Markdown(
278
+ """
279
+ -----------------
280
+ # 📁 Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
281
+ """
282
+ )
283
 
284
  with gr.Row():
285
  audio_input = gr.Audio(
 
295
 
296
  file_output = gr.Textbox(
297
  label="Transcription Result",
298
+ lines=6,
299
  interactive=True,
300
  placeholder="Upload a file and click Transcribe...",
301
  )
302
 
303
+ transcribe_btn.click(
304
+ transcribe_audio,
305
+ inputs=audio_input,
306
+ outputs=file_output,
307
+ )
308
+
309
+ gr.Markdown(
310
+ """
311
+ ### ✨ Features
312
+ - 🌐 **Hindi Support**: Accurate transcription for Hindi audio
313
+ - 🎯 **High Accuracy**: Competitive with leading ASR models
314
+ - 📁 **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
315
+ - ⚡ **Fast Processing**: Optimized for quick transcription
316
+ """
317
+ )
318
 
319
+ gr.Markdown(
320
+ """
321
+ ### ⚠️ Benchmark Disclaimer
322
+ - Evaluated on a modified FLEURS subset to ensure consistent Hindi coverage
323
+ - Dataset issues include inaudible segments and repeated sentences caused by interruptions
324
+ - Background noise is prominent across many clips, impacting recognition quality
325
+ - Mixed Hindi-English speech often provides Hindi-only transcripts
326
+ - Currency, time, and year normalization is inconsistent with spoken forms
327
+ - Original transcripts lack punctuation, increasing WER for models that predict it
328
+ """
329
+ )
330
 
331
+ gr.Markdown(
332
+ """
333
+ # 🙏 Acknowledgements
334
+ - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
335
+ - Built with [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) models
336
+ """
337
+ )
338
 
339
  return demo
340