harsh2ai committed on
Commit
8f86c2d
·
1 Parent(s): d2a497b

Update app UI layout

Browse files
Files changed (1) hide show
  1. app.py +248 -86
app.py CHANGED
@@ -25,19 +25,6 @@ custom_css = """
25
  border-radius: 10px;
26
  margin-bottom: 20px;
27
  }
28
- footer {
29
- visibility: hidden !important;
30
- height: 50px !important;
31
- }
32
- footer:after {
33
- content: "Made with ❤️ by RinggAI Team" !important;
34
- visibility: visible !important;
35
- display: block !important;
36
- text-align: center !important;
37
- margin-top: 15px !important;
38
- color: #666 !important;
39
- font-size: 14px !important;
40
- }
41
  """
42
 
43
  # Backend API endpoint (ngrok URL)
@@ -104,6 +91,33 @@ class RinggSTTClient:
104
  except Exception as e:
105
  return f"❌ Error: {str(e)}"
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Initialize API client
108
  print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
109
  stt_client = RinggSTTClient(API_ENDPOINT)
@@ -123,6 +137,43 @@ def create_interface():
123
 
124
  return stt_client.transcribe_audio(audio_file)
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def check_api_status():
127
  """Check API health status"""
128
  health = stt_client.check_health()
@@ -133,17 +184,193 @@ def create_interface():
133
  gr.Markdown("""
134
  <div class="main-header">
135
  <h1>🎙️ Ringg STT V0</h1>
136
- <p>State-of-the-Art Bilingual Speech-to-Text (English & Hindi)</p>
137
  </div>
138
  """)
139
-
140
- # Performance Comparison Table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  gr.Markdown("""
142
  ## Performance Benchmarks
143
-
144
- Our model achieves **state-of-the-art performance** on English-Hindi bilingual speech recognition:
145
  """)
146
-
147
  with gr.Row():
148
  gr.DataFrame(
149
  value=[
@@ -159,75 +386,10 @@ def create_interface():
159
  col_count=(3, "fixed"),
160
  label="Word Error Rate Comparison (Lower is Better)"
161
  )
162
-
163
- gr.Markdown("""
164
- **Ringg STT V0** ranks **2nd** among top models, outperforming OpenAI Whisper Large-v3 and other leading solutions.
165
-
166
- Lower WER (Word Error Rate) indicates better accuracy. Our model achieves competitive performance while supporting bilingual transcription.
167
- """)
168
-
169
- gr.Markdown("""
170
- ### ✨ Features
171
- - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
172
- - 🎯 **High Accuracy**: Competitive with leading ASR models
173
- - πŸ“ **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
174
- - ⚡ **Fast Processing**: Optimized for quick transcription
175
- - 🔒 **Private Infrastructure**: Secure and controlled deployment
176
- """)
177
-
178
  gr.Markdown("""
179
- ### 🔗 Links
180
- - **Organization**: [RinggAI on Hugging Face](https://huggingface.co/RinggAI)
181
- - **TTS Space**: [Ringg TTS V0](https://huggingface.co/spaces/RinggAI/Ringg-TTS-v0.0)
182
-
183
- ### πŸ™ Acknowledgements
184
- - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
185
  """)
186
-
187
- # API Status indicator
188
- with gr.Row():
189
- with gr.Column(scale=4):
190
- api_status = gr.Textbox(
191
- label="🔌 API Status",
192
- value=health_status["message"],
193
- interactive=False
194
- )
195
- with gr.Column(scale=1):
196
- check_btn = gr.Button("🔄 Check Status", size="sm")
197
- check_btn.click(check_api_status, outputs=api_status)
198
-
199
- with gr.Tab("πŸ“ File Upload"):
200
- gr.Markdown("### Upload Audio File")
201
- gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
202
-
203
- audio_input = gr.Audio(
204
- label="πŸ“ Upload Audio File",
205
- type="filepath",
206
- sources=["upload"]
207
- )
208
-
209
- transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
210
-
211
- file_output = gr.Textbox(
212
- label="Transcription Result",
213
- lines=8,
214
- interactive=False,
215
- placeholder="Upload a file and click Transcribe..."
216
- )
217
-
218
- transcribe_btn.click(
219
- transcribe_audio,
220
- inputs=audio_input,
221
- outputs=file_output
222
- )
223
-
224
- gr.Markdown("""
225
- ### 💡 Tips for Best Results
226
- - Use clear audio with minimal background noise
227
- - Speak naturally at a moderate pace
228
- - For file upload, ensure audio quality is good (16kHz or higher recommended)
229
- - Model handles code-switching between English and Hindi
230
- """)
231
 
232
  return demo
233
 
 
25
  border-radius: 10px;
26
  margin-bottom: 20px;
27
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  """
29
 
30
  # Backend API endpoint (ngrok URL)
 
91
  except Exception as e:
92
  return f"❌ Error: {str(e)}"
93
 
94
+ def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
95
+ """Send audio chunk for streaming transcription"""
96
+ try:
97
+ # Convert numpy array to base64
98
+ audio_bytes = audio_chunk.tobytes()
99
+ audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
100
+
101
+ response = self.session.post(
102
+ f"{self.api_endpoint}/transcribe_stream",
103
+ json={
104
+ "audio_chunk": audio_base64,
105
+ "dtype": str(audio_chunk.dtype),
106
+ "shape": list(audio_chunk.shape)
107
+ },
108
+ timeout=10
109
+ )
110
+
111
+ if response.status_code == 200:
112
+ result = response.json()
113
+ return result.get("transcription")
114
+ return None
115
+
116
+ except Exception as e:
117
+ print(f"Streaming error: {e}")
118
+ return None
119
+
120
+
121
  # Initialize API client
122
  print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
123
  stt_client = RinggSTTClient(API_ENDPOINT)
 
137
 
138
  return stt_client.transcribe_audio(audio_file)
139
 
140
+ def stream_audio(audio, state):
141
+ """Handle streaming audio"""
142
+ if audio is None:
143
+ return "No audio input", state
144
+
145
+ try:
146
+ if state is None:
147
+ state = {"transcripts": []}
148
+
149
+ if isinstance(audio, tuple):
150
+ sample_rate, audio_array = audio
151
+ else:
152
+ audio_array = audio
153
+ sample_rate = 16000
154
+
155
+ if audio_array is not None and len(audio_array) > 0:
156
+ if len(audio_array.shape) > 1:
157
+ audio_array = np.mean(audio_array, axis=1)
158
+
159
+ audio_array = audio_array.astype(np.float32)
160
+ max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
161
+ if max_abs > 1e-6:
162
+ audio_array = audio_array / max_abs
163
+
164
+ # Send to API
165
+ transcript = stt_client.transcribe_streaming(audio_array)
166
+
167
+ if transcript and transcript.strip():
168
+ if not state["transcripts"] or transcript != state["transcripts"][-1]:
169
+ state["transcripts"].append(transcript)
170
+
171
+ combined = " ".join(state["transcripts"]) if state["transcripts"] else "🎤 Listening..."
172
+ return combined, state
173
+
174
+ except Exception as e:
175
+ return f"❌ Error: {str(e)}", state
176
+
177
  def check_api_status():
178
  """Check API health status"""
179
  health = stt_client.check_health()
 
184
  gr.Markdown("""
185
  <div class="main-header">
186
  <h1>🎙️ Ringg STT V0</h1>
187
+ <p>State-of-the-Art Speech-to-Text for Hindi and code-switching</p>
188
  </div>
189
  """)
190
+
191
+ # API Status indicator
192
+ with gr.Row():
193
+ with gr.Column(scale=4):
194
+ api_status = gr.Textbox(
195
+ label="🔌 API Status",
196
+ value=health_status["message"],
197
+ interactive=False
198
+ )
199
+ with gr.Column(scale=1):
200
+ check_btn = gr.Button("🔄 Check Status", size="sm")
201
+ check_btn.click(check_api_status, outputs=api_status)
202
+
203
+ gr.Markdown("""
204
+ ### πŸ“ File Upload
205
+ Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
206
+ """)
207
+
208
+ audio_input = gr.Audio(
209
+ label="πŸ“ Upload Audio File",
210
+ type="filepath",
211
+ sources=["upload"]
212
+ )
213
+
214
+ transcribe_btn = gr.Button("🔄 Transcribe", variant="primary", size="lg")
215
+
216
+ file_output = gr.Textbox(
217
+ label="Transcription Result",
218
+ lines=8,
219
+ interactive=False,
220
+ placeholder="Upload a file and click Transcribe..."
221
+ )
222
+
223
+ transcribe_btn.click(
224
+ transcribe_audio,
225
+ inputs=audio_input,
226
+ outputs=file_output
227
+ )
228
+
229
+ gr.Markdown("""
230
+ ### 💡 Tips for Best Results
231
+ - Use clear audio with minimal background noise
232
+ - Speak naturally at a moderate pace
233
+ - For file upload, ensure audio quality is good (16kHz or higher recommended)
234
+ - Model handles Hindi code-switching scenarios
235
+ """)
236
+
237
+ gr.Markdown("""
238
+ ### ✨ Features
239
+ - 🌐 **Bilingual Support**: Handles Hindi and common code-switching patterns
240
+ - ⚡ **Real-time Processing**: Instant transcription as you speak
241
+ - 🎯 **High Accuracy**: Powered by Parakeet TDT CTC 110M model
242
+ - πŸ“ **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
243
+ """)
244
+
245
+ gr.Markdown("""
246
+ ### 🎤 Real-time Streaming
247
+ Speak into your microphone for real-time transcription tuned for Hindi and code-switching.
248
+ """)
249
+
250
+ gr.Markdown("""
251
+ ⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
252
+ Make sure your backend service is running and accessible.
253
+ """)
254
+
255
+ # Buffer Configuration Controls
256
+ with gr.Row():
257
+ with gr.Column():
258
+ gr.Markdown("### 🔧 Streaming Configuration")
259
+ gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
260
+
261
+ buffer_duration = gr.Slider(
262
+ minimum=2.0, maximum=6.0, step=0.5, value=3.0,
263
+ label="Buffer Duration (seconds)",
264
+ info="Size of audio chunks sent to API"
265
+ )
266
+ process_every_n = gr.Slider(
267
+ minimum=2, maximum=8, step=1, value=3,
268
+ label="Process Every N Chunks",
269
+ info="How often to send audio (higher = less frequent)"
270
+ )
271
+ min_interval = gr.Slider(
272
+ minimum=1.0, maximum=4.0, step=0.5, value=2.0,
273
+ label="Min Processing Interval (seconds)",
274
+ info="Minimum time between API calls"
275
+ )
276
+
277
+ config_info = gr.Markdown("""
278
+ **💡 Tuning Tips:**
279
+ - **Lower latency**: Decrease buffer duration and interval
280
+ - **Better accuracy**: Increase buffer duration
281
+ - **Reduce API calls**: Increase process frequency
282
+ - **Slow connection**: Increase all values
283
+ """)
284
+
285
+ mic_input = gr.Audio(
286
+ sources=["microphone"],
287
+ type="numpy",
288
+ streaming=True,
289
+ label="🎤 Microphone Input"
290
+ )
291
+
292
+ live_output = gr.Textbox(
293
+ label="Live Transcription",
294
+ lines=8,
295
+ interactive=False,
296
+ placeholder="Your transcription will appear here..."
297
+ )
298
+
299
+ session_state = gr.State(lambda: None)
300
+
301
+ mic_input.stream(
302
+ fn=stream_audio,
303
+ inputs=[mic_input, session_state],
304
+ outputs=[live_output, session_state],
305
+ stream_every=0.5
306
+ )
307
+
308
+ gr.Markdown("""
309
+ ### ⚙️ Configuration
310
+ **Current API Endpoint**: `{API_ENDPOINT}`
311
+
312
+ The transcription service runs on a private backend accessed via a secure API endpoint.
313
+ To update the API endpoint, set the `STT_API_ENDPOINT` environment variable in Space Settings.
314
+ """)
315
+
316
+ gr.Markdown("""
317
+ ## About Ringg STT V0
318
+
319
+ Ringg STT V0 is powered by NVIDIA NeMo's Parakeet TDT CTC 110M model, optimized for Hindi transcription and code-switching scenarios.
320
+
321
+ ### 🎯 Model Details
322
+ - **Model**: Parakeet TDT CTC 110M
323
+ - **Architecture**: FastConformer encoder with CTC decoder
324
+ - **Parameters**: 110 Million
325
+ - **Languages**: Hindi + code-switching contexts
326
+ - **Sample Rate**: 16kHz
327
+ - **Framework**: PyTorch + NVIDIA NeMo
328
+
329
+ ### 🏗️ Architecture
330
+
331
+ This Space uses a **frontend-backend architecture**:
332
+
333
+ ```
334
+ User → HF Space (Frontend) → API Endpoint → Private Server (Model) → Response
335
+ ```
336
+
337
+ - **Frontend**: This Hugging Face Space (Gradio UI)
338
+ - **Backend**: Private inference server with the actual model
339
+ - **Connection**: Secure API calls via tunnel
340
+
341
+ ### 🚀 Key Features
342
+ - **Hindi-focused Recognition** with code-switching support
343
+ - **Real-time Streaming** with low latency
344
+ - **Flexible Input** supporting microphone and file upload
345
+
346
+ ### 📊 Use Cases
347
+ - Meeting transcription and call analytics
348
+ - Media subtitling
349
+ - Accessibility applications
350
+ - Voice search and automation workflows
351
+
352
+ ### 🔧 Technical Specifications
353
+ - **Decoder**: CTC (Connectionist Temporal Classification)
354
+ - **Audio Processing**: 16kHz mono, PCM16
355
+ - **Latency**: ~2-3 seconds for streaming
356
+ - **API Protocol**: REST API with base64-encoded audio
357
+
358
+ ### πŸ“ Limitations
359
+ - Requires an active backend API endpoint
360
+ - Performs best with clear audio and minimal background noise
361
+ - Accuracy may vary with accents and challenging acoustic conditions
362
+
363
+ ---
364
+
365
+ Made with ❤️ by RinggAI Team
366
+ """)
367
+
368
  gr.Markdown("""
369
  ## Performance Benchmarks
370
+
371
+ Our model achieves **state-of-the-art performance** on bilingual speech recognition benchmarks:
372
  """)
373
+
374
  with gr.Row():
375
  gr.DataFrame(
376
  value=[
 
386
  col_count=(3, "fixed"),
387
  label="Word Error Rate Comparison (Lower is Better)"
388
  )
389
+
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
  gr.Markdown("""
391
+ **Ringg STT V0** ranks **2nd** among top models, offering competitive performance while keeping the backend infrastructure private.
 
 
 
 
 
392
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
393
 
394
  return demo
395