harsh2ai committed on
Commit
508b24f
·
1 Parent(s): 428a84e

Revert to original app layout

Browse files
Files changed (1) hide show
  1. app.py +86 -247
app.py CHANGED
@@ -25,6 +25,19 @@ custom_css = """
25
  border-radius: 10px;
26
  margin-bottom: 20px;
27
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
28
  """
29
 
30
  # Backend API endpoint (ngrok URL)
@@ -91,33 +104,6 @@ class RinggSTTClient:
91
  except Exception as e:
92
  return f"❌ Error: {str(e)}"
93
 
94
- def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
95
- """Send audio chunk for streaming transcription"""
96
- try:
97
- # Convert numpy array to base64
98
- audio_bytes = audio_chunk.tobytes()
99
- audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
100
-
101
- response = self.session.post(
102
- f"{self.api_endpoint}/transcribe_stream",
103
- json={
104
- "audio_chunk": audio_base64,
105
- "dtype": str(audio_chunk.dtype),
106
- "shape": list(audio_chunk.shape)
107
- },
108
- timeout=10
109
- )
110
-
111
- if response.status_code == 200:
112
- result = response.json()
113
- return result.get("transcription")
114
- return None
115
-
116
- except Exception as e:
117
- print(f"Streaming error: {e}")
118
- return None
119
-
120
-
121
  # Initialize API client
122
  print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
123
  stt_client = RinggSTTClient(API_ENDPOINT)
@@ -137,43 +123,6 @@ def create_interface():
137
 
138
  return stt_client.transcribe_audio(audio_file)
139
 
140
- def stream_audio(audio, state):
141
- """Handle streaming audio"""
142
- if audio is None:
143
- return "No audio input", state
144
-
145
- try:
146
- if state is None:
147
- state = {"transcripts": []}
148
-
149
- if isinstance(audio, tuple):
150
- sample_rate, audio_array = audio
151
- else:
152
- audio_array = audio
153
- sample_rate = 16000
154
-
155
- if audio_array is not None and len(audio_array) > 0:
156
- if len(audio_array.shape) > 1:
157
- audio_array = np.mean(audio_array, axis=1)
158
-
159
- audio_array = audio_array.astype(np.float32)
160
- max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
161
- if max_abs > 1e-6:
162
- audio_array = audio_array / max_abs
163
-
164
- # Send to API
165
- transcript = stt_client.transcribe_streaming(audio_array)
166
-
167
- if transcript and transcript.strip():
168
- if not state["transcripts"] or transcript != state["transcripts"][-1]:
169
- state["transcripts"].append(transcript)
170
-
171
- combined = " ".join(state["transcripts"]) if state["transcripts"] else "🎤 Listening..."
172
- return combined, state
173
-
174
- except Exception as e:
175
- return f"❌ Error: {str(e)}", state
176
-
177
  def check_api_status():
178
  """Check API health status"""
179
  health = stt_client.check_health()
@@ -184,192 +133,17 @@ def create_interface():
184
  gr.Markdown("""
185
  <div class="main-header">
186
  <h1>🎙️ Ringg STT V0</h1>
187
- <p>State-of-the-Art Speech-to-Text for Hindi and code-switching</p>
188
  </div>
189
  """)
190
-
191
- # API Status indicator
192
- with gr.Row():
193
- with gr.Column(scale=4):
194
- api_status = gr.Textbox(
195
- label="πŸ”Œ API Status",
196
- value=health_status["message"],
197
- interactive=False
198
- )
199
- with gr.Column(scale=1):
200
- check_btn = gr.Button("πŸ”„ Check Status", size="sm")
201
- check_btn.click(check_api_status, outputs=api_status)
202
-
203
- gr.Markdown("""
204
- ### πŸ“ File Upload
205
- Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
206
- """)
207
-
208
- audio_input = gr.Audio(
209
- label="πŸ“ Upload Audio File",
210
- type="filepath",
211
- sources=["upload"]
212
- )
213
-
214
- transcribe_btn = gr.Button("πŸ”„ Transcribe", variant="primary", size="lg")
215
-
216
- file_output = gr.Textbox(
217
- label="Transcription Result",
218
- lines=8,
219
- interactive=False,
220
- placeholder="Upload a file and click Transcribe..."
221
- )
222
-
223
- transcribe_btn.click(
224
- transcribe_audio,
225
- inputs=audio_input,
226
- outputs=file_output
227
- )
228
-
229
- gr.Markdown("""
230
- ### πŸ’‘ Tips for Best Results
231
- - Use clear audio with minimal background noise
232
- - Speak naturally at a moderate pace
233
- - For file upload, ensure audio quality is good (16kHz or higher recommended)
234
- - Model handles Hindi code-switching scenarios
235
- """)
236
-
237
- gr.Markdown("""
238
- ### ✨ Features
239
- - 🌐 **Bilingual Support**: Handles Hindi and common code-switching patterns
240
- - ⚑ **Real-time Processing**: Instant transcription as you speak
241
- - 🎯 **High Accuracy**: Powered by Parakeet TDT CTC 110M model
242
- - πŸ“ **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
243
- """)
244
-
245
- gr.Markdown("""
246
- ### 🎀 Real-time Streaming
247
- Speak into your microphone for real-time transcription tuned for Hindi and code-switching.
248
- """)
249
-
250
- gr.Markdown("""
251
- ⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
252
- Make sure your backend service is running and accessible.
253
- """)
254
-
255
- # Buffer Configuration Controls
256
- with gr.Row():
257
- with gr.Column():
258
- gr.Markdown("### πŸ”§ Streaming Configuration")
259
- gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
260
-
261
- buffer_duration = gr.Slider(
262
- minimum=2.0, maximum=6.0, step=0.5, value=3.0,
263
- label="Buffer Duration (seconds)",
264
- info="Size of audio chunks sent to API"
265
- )
266
- process_every_n = gr.Slider(
267
- minimum=2, maximum=8, step=1, value=3,
268
- label="Process Every N Chunks",
269
- info="How often to send audio (higher = less frequent)"
270
- )
271
- min_interval = gr.Slider(
272
- minimum=1.0, maximum=4.0, step=0.5, value=2.0,
273
- label="Min Processing Interval (seconds)",
274
- info="Minimum time between API calls"
275
- )
276
-
277
- config_info = gr.Markdown("""
278
- **πŸ’‘ Tuning Tips:**
279
- - **Lower latency**: Decrease buffer duration and interval
280
- - **Better accuracy**: Increase buffer duration
281
- - **Reduce API calls**: Increase process frequency
282
- - **Slow connection**: Increase all values
283
- """)
284
-
285
- mic_input = gr.Audio(
286
- sources=["microphone"],
287
- type="numpy",
288
- streaming=True,
289
- label="🎀 Microphone Input"
290
- )
291
-
292
- live_output = gr.Textbox(
293
- label="Live Transcription",
294
- lines=8,
295
- interactive=False,
296
- placeholder="Your transcription will appear here..."
297
- )
298
-
299
- session_state = gr.State(lambda: None)
300
-
301
- mic_input.stream(
302
- fn=stream_audio,
303
- inputs=[mic_input, session_state],
304
- outputs=[live_output, session_state]
305
- )
306
-
307
- gr.Markdown("""
308
- ### βš™οΈ Configuration
309
- **Current API Endpoint**: `{API_ENDPOINT}`
310
-
311
- The transcription service runs on a private backend accessed via a secure API endpoint.
312
- To update the API endpoint, set the `STT_API_ENDPOINT` environment variable in Space Settings.
313
- """)
314
-
315
- gr.Markdown("""
316
- ## About Ringg STT V0
317
-
318
- Ringg STT V0 is powered by NVIDIA NeMo's Parakeet TDT CTC 110M model, optimized for Hindi transcription and code-switching scenarios.
319
-
320
- ### 🎯 Model Details
321
- - **Model**: Parakeet TDT CTC 110M
322
- - **Architecture**: FastConformer encoder with CTC decoder
323
- - **Parameters**: 110 Million
324
- - **Languages**: Hindi + code-switching contexts
325
- - **Sample Rate**: 16kHz
326
- - **Framework**: PyTorch + NVIDIA NeMo
327
-
328
- ### πŸ—οΈ Architecture
329
-
330
- This Space uses a **frontend-backend architecture**:
331
-
332
- ```
333
- User → HF Space (Frontend) → API Endpoint → Private Server (Model) → Response
334
- ```
335
-
336
- - **Frontend**: This Hugging Face Space (Gradio UI)
337
- - **Backend**: Private inference server with the actual model
338
- - **Connection**: Secure API calls via tunnel
339
-
340
- ### πŸš€ Key Features
341
- - **Hindi-focused Recognition** with code-switching support
342
- - **Real-time Streaming** with low latency
343
- - **Flexible Input** supporting microphone and file upload
344
-
345
- ### πŸ“Š Use Cases
346
- - Meeting transcription and call analytics
347
- - Media subtitling
348
- - Accessibility applications
349
- - Voice search and automation workflows
350
-
351
- ### πŸ”§ Technical Specifications
352
- - **Decoder**: CTC (Connectionist Temporal Classification)
353
- - **Audio Processing**: 16kHz mono, PCM16
354
- - **Latency**: ~2-3 seconds for streaming
355
- - **API Protocol**: REST API with base64-encoded audio
356
-
357
- ### πŸ“ Limitations
358
- - Requires an active backend API endpoint
359
- - Performs best with clear audio and minimal background noise
360
- - Accuracy may vary with accents and challenging acoustic conditions
361
-
362
- ---
363
-
364
- Made with ❤️ by RinggAI Team
365
- """)
366
-
367
  gr.Markdown("""
368
  ## Performance Benchmarks
369
-
370
- Our model achieves **state-of-the-art performance** on bilingual speech recognition benchmarks:
371
  """)
372
-
373
  with gr.Row():
374
  gr.DataFrame(
375
  value=[
@@ -385,10 +159,75 @@ def create_interface():
385
  col_count=(3, "fixed"),
386
  label="Word Error Rate Comparison (Lower is Better)"
387
  )
388
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
389
  gr.Markdown("""
390
- **Ringg STT V0** ranks **2nd** among top models, offering competitive performance while keeping the backend infrastructure private.
 
 
 
 
 
391
  """)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
392
 
393
  return demo
394
 
 
25
  border-radius: 10px;
26
  margin-bottom: 20px;
27
  }
28
+ footer {
29
+ visibility: hidden !important;
30
+ height: 50px !important;
31
+ }
32
+ footer:after {
33
+ content: "Made with ❤️ by RinggAI Team" !important;
34
+ visibility: visible !important;
35
+ display: block !important;
36
+ text-align: center !important;
37
+ margin-top: 15px !important;
38
+ color: #666 !important;
39
+ font-size: 14px !important;
40
+ }
41
  """
42
 
43
  # Backend API endpoint (ngrok URL)
 
104
  except Exception as e:
105
  return f"❌ Error: {str(e)}"
106
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
  # Initialize API client
108
  print(f"🔗 Connecting to STT API: {API_ENDPOINT}")
109
  stt_client = RinggSTTClient(API_ENDPOINT)
 
123
 
124
  return stt_client.transcribe_audio(audio_file)
125
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
126
  def check_api_status():
127
  """Check API health status"""
128
  health = stt_client.check_health()
 
133
  gr.Markdown("""
134
  <div class="main-header">
135
  <h1>🎙️ Ringg STT V0</h1>
136
+ <p>State-of-the-Art Bilingual Speech-to-Text (English & Hindi)</p>
137
  </div>
138
  """)
139
+
140
+ # Performance Comparison Table
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
141
  gr.Markdown("""
142
  ## Performance Benchmarks
143
+
144
+ Our model achieves **state-of-the-art performance** on English-Hindi bilingual speech recognition:
145
  """)
146
+
147
  with gr.Row():
148
  gr.DataFrame(
149
  value=[
 
159
  col_count=(3, "fixed"),
160
  label="Word Error Rate Comparison (Lower is Better)"
161
  )
162
+
163
+ gr.Markdown("""
164
+ **Ringg STT V0** ranks **2nd** among top models, outperforming OpenAI Whisper Large-v3 and other leading solutions.
165
+
166
+ Lower WER (Word Error Rate) indicates better accuracy. Our model achieves competitive performance while supporting bilingual transcription.
167
+ """)
168
+
169
+ gr.Markdown("""
170
+ ### ✨ Features
171
+ - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
172
+ - 🎯 **High Accuracy**: Competitive with leading ASR models
173
+ - πŸ“ **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
174
+ - ⚑ **Fast Processing**: Optimized for quick transcription
175
+ - πŸ”’ **Private Infrastructure**: Secure and controlled deployment
176
+ """)
177
+
178
  gr.Markdown("""
179
+ ### πŸ”— Links
180
+ - **Organization**: [RinggAI on Hugging Face](https://huggingface.co/RinggAI)
181
+ - **TTS Space**: [Ringg TTS V0](https://huggingface.co/spaces/RinggAI/Ringg-TTS-v0.0)
182
+
183
+ ### πŸ™ Acknowledgements
184
+ - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
185
  """)
186
+
187
+ # API Status indicator
188
+ with gr.Row():
189
+ with gr.Column(scale=4):
190
+ api_status = gr.Textbox(
191
+ label="πŸ”Œ API Status",
192
+ value=health_status["message"],
193
+ interactive=False
194
+ )
195
+ with gr.Column(scale=1):
196
+ check_btn = gr.Button("πŸ”„ Check Status", size="sm")
197
+ check_btn.click(check_api_status, outputs=api_status)
198
+
199
+ with gr.Tab("πŸ“ File Upload"):
200
+ gr.Markdown("### Upload Audio File")
201
+ gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
202
+
203
+ audio_input = gr.Audio(
204
+ label="πŸ“ Upload Audio File",
205
+ type="filepath",
206
+ sources=["upload"]
207
+ )
208
+
209
+ transcribe_btn = gr.Button("πŸ”„ Transcribe", variant="primary", size="lg")
210
+
211
+ file_output = gr.Textbox(
212
+ label="Transcription Result",
213
+ lines=8,
214
+ interactive=False,
215
+ placeholder="Upload a file and click Transcribe..."
216
+ )
217
+
218
+ transcribe_btn.click(
219
+ transcribe_audio,
220
+ inputs=audio_input,
221
+ outputs=file_output
222
+ )
223
+
224
+ gr.Markdown("""
225
+ ### πŸ’‘ Tips for Best Results
226
+ - Use clear audio with minimal background noise
227
+ - Speak naturally at a moderate pace
228
+ - For file upload, ensure audio quality is good (16kHz or higher recommended)
229
+ - Model handles code-switching between English and Hindi
230
+ """)
231
 
232
  return demo
233