harsh2ai commited on
Commit
2fc062a
·
1 Parent(s): 2cbdadf

Remove real-time streaming: Keep only file upload transcription

Browse files
Files changed (2) hide show
  1. app.py +5 -148
  2. requirements.txt +4 -0
app.py CHANGED
@@ -91,33 +91,6 @@ class RinggSTTClient:
91
  except Exception as e:
92
  return f"❌ Error: {str(e)}"
93
 
94
- def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
95
- """Send audio chunk for streaming transcription"""
96
- try:
97
- # Convert numpy array to base64
98
- audio_bytes = audio_chunk.tobytes()
99
- audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
100
-
101
- response = self.session.post(
102
- f"{self.api_endpoint}/transcribe_stream",
103
- json={
104
- "audio_chunk": audio_base64,
105
- "dtype": str(audio_chunk.dtype),
106
- "shape": list(audio_chunk.shape)
107
- },
108
- timeout=10
109
- )
110
-
111
- if response.status_code == 200:
112
- result = response.json()
113
- return result.get("transcription")
114
- return None
115
-
116
- except Exception as e:
117
- print(f"Streaming error: {e}")
118
- return None
119
-
120
-
121
  # Initialize API client
122
  print(f"πŸ”— Connecting to STT API: {API_ENDPOINT}")
123
  stt_client = RinggSTTClient(API_ENDPOINT)
@@ -137,43 +110,6 @@ def create_interface():
137
 
138
  return stt_client.transcribe_audio(audio_file)
139
 
140
- def stream_audio(audio, state):
141
- """Handle streaming audio"""
142
- if audio is None:
143
- return "No audio input", state
144
-
145
- try:
146
- if state is None:
147
- state = {"transcripts": []}
148
-
149
- if isinstance(audio, tuple):
150
- sample_rate, audio_array = audio
151
- else:
152
- audio_array = audio
153
- sample_rate = 16000
154
-
155
- if audio_array is not None and len(audio_array) > 0:
156
- if len(audio_array.shape) > 1:
157
- audio_array = np.mean(audio_array, axis=1)
158
-
159
- audio_array = audio_array.astype(np.float32)
160
- max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
161
- if max_abs > 1e-6:
162
- audio_array = audio_array / max_abs
163
-
164
- # Send to API
165
- transcript = stt_client.transcribe_streaming(audio_array)
166
-
167
- if transcript and transcript.strip():
168
- if not state["transcripts"] or transcript != state["transcripts"][-1]:
169
- state["transcripts"].append(transcript)
170
-
171
- combined = " ".join(state["transcripts"]) if state["transcripts"] else "🎀 Listening..."
172
- return combined, state
173
-
174
- except Exception as e:
175
- return f"❌ Error: {str(e)}", state
176
-
177
  def check_api_status():
178
  """Check API health status"""
179
  health = stt_client.check_health()
@@ -220,9 +156,9 @@ def create_interface():
220
  gr.Markdown("""
221
  ### ✨ Features
222
  - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
223
- - ⚑ **Real-time Processing**: Instant transcription as you speak
224
- - 🎯 **High Accuracy**: Competitive with leading ASR models
225
  - πŸ“ **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
 
226
  - πŸ”’ **Private Infrastructure**: Secure and controlled deployment
227
  """)
228
 
@@ -238,67 +174,6 @@ def create_interface():
238
  check_btn = gr.Button("πŸ”„ Check Status", size="sm")
239
  check_btn.click(check_api_status, outputs=api_status)
240
 
241
- with gr.Tab("🎀 Real-time Streaming"):
242
- gr.Markdown("### Live Microphone Transcription")
243
- gr.Markdown("Speak into your microphone for real-time transcription in English or Hindi.")
244
-
245
- gr.Markdown("""
246
- ⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
247
- Make sure your backend service is running and accessible.
248
- """)
249
-
250
- # Buffer Configuration Controls
251
- with gr.Row():
252
- with gr.Column():
253
- gr.Markdown("### πŸ”§ Streaming Configuration")
254
- gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
255
-
256
- buffer_duration = gr.Slider(
257
- minimum=2.0, maximum=6.0, step=0.5, value=3.0,
258
- label="Buffer Duration (seconds)",
259
- info="Size of audio chunks sent to API"
260
- )
261
- process_every_n = gr.Slider(
262
- minimum=2, maximum=8, step=1, value=3,
263
- label="Process Every N Chunks",
264
- info="How often to send audio (higher = less frequent)"
265
- )
266
- min_interval = gr.Slider(
267
- minimum=1.0, maximum=4.0, step=0.5, value=2.0,
268
- label="Min Processing Interval (seconds)",
269
- info="Minimum time between API calls"
270
- )
271
-
272
- gr.Markdown("""
273
- **πŸ’‘ Tuning Tips:**
274
- - **Lower latency**: Decrease buffer duration and interval
275
- - **Better accuracy**: Increase buffer duration
276
- - **Reduce API calls**: Increase process frequency
277
- - **Slow connection**: Increase all values
278
- """)
279
-
280
- mic_input = gr.Audio(
281
- sources=["microphone"],
282
- type="numpy",
283
- streaming=True,
284
- label="🎀 Microphone Input"
285
- )
286
-
287
- live_output = gr.Textbox(
288
- label="Live Transcription",
289
- lines=8,
290
- interactive=False,
291
- placeholder="Your transcription will appear here..."
292
- )
293
-
294
- session_state = gr.State(lambda: None)
295
-
296
- mic_input.stream(
297
- fn=stream_audio,
298
- inputs=[mic_input, session_state],
299
- outputs=[live_output, session_state]
300
- )
301
-
302
  with gr.Tab("πŸ“ File Upload"):
303
  gr.Markdown("### Upload Audio File")
304
  gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
@@ -330,28 +205,10 @@ def create_interface():
330
  - Speak naturally at a moderate pace
331
  - For file upload, ensure audio quality is good (16kHz or higher recommended)
332
  - Model handles code-switching between English and Hindi
333
- """)
334
-
335
- with gr.Tab("βš™οΈ Configuration"):
336
- gr.Markdown("### API Endpoint Configuration")
337
- gr.Markdown(f"""
338
- **Current API Endpoint**: `{API_ENDPOINT}`
339
-
340
- The transcription service runs on a private infrastructure and is accessed via a secure API endpoint.
341
-
342
- #### How it Works:
343
- 1. 🎀 You interact with this Hugging Face Space (frontend)
344
- 2. πŸ“‘ Audio is sent to the private API endpoint
345
- 3. πŸ€– The model processes the audio on secure infrastructure
346
- 4. πŸ“ Transcription is returned and displayed
347
-
348
- #### Benefits:
349
- - πŸ”’ **Privacy**: Model and data stay on private infrastructure
350
- - ⚑ **Performance**: Dedicated compute resources
351
- - 🎯 **Control**: Full control over the model and processing
352
- - πŸ’° **Cost-effective**: Use your own compute resources
353
 
354
- To update the API endpoint, set the `STT_API_ENDPOINT` environment variable in Space Settings.
 
 
355
  """)
356
 
357
  with gr.Tab("ℹ️ About"):
 
91
  except Exception as e:
92
  return f"❌ Error: {str(e)}"
93
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
94
  # Initialize API client
95
  print(f"πŸ”— Connecting to STT API: {API_ENDPOINT}")
96
  stt_client = RinggSTTClient(API_ENDPOINT)
 
110
 
111
  return stt_client.transcribe_audio(audio_file)
112
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  def check_api_status():
114
  """Check API health status"""
115
  health = stt_client.check_health()
 
156
  gr.Markdown("""
157
  ### ✨ Features
158
  - 🌐 **Bilingual Support**: Transcribe English and Hindi speech
159
+ - 🎯 **High Accuracy**: Competitive with leading ASR models
 
160
  - πŸ“ **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
161
+ - ⚑ **Fast Processing**: Optimized for quick transcription
162
  - πŸ”’ **Private Infrastructure**: Secure and controlled deployment
163
  """)
164
 
 
174
  check_btn = gr.Button("πŸ”„ Check Status", size="sm")
175
  check_btn.click(check_api_status, outputs=api_status)
176
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
  with gr.Tab("πŸ“ File Upload"):
178
  gr.Markdown("### Upload Audio File")
179
  gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
 
205
  - Speak naturally at a moderate pace
206
  - For file upload, ensure audio quality is good (16kHz or higher recommended)
207
  - Model handles code-switching between English and Hindi
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
+ ### πŸ”§ Backend Configuration
210
+ This Space connects to a private API endpoint for transcription.
211
+ The API endpoint can be configured via the `STT_API_ENDPOINT` secret in Space Settings.
212
  """)
213
 
214
  with gr.Tab("ℹ️ About"):
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ gradio==4.44.0
2
+ numpy
3
+ requests
4
+