harsh2ai committed on
Commit
f4a0156
·
1 Parent(s): dea7e3e

updated the logic

Browse files
Files changed (1) hide show
  1. app.py +284 -127
app.py CHANGED
@@ -1,37 +1,34 @@
1
  #!/usr/bin/env python3
2
- # updated
3
  """
4
  Ringg Parrot STT V1 🦜 - Hugging Face Space (Frontend)
5
- Makes API calls to private inference endpoint via ngrok
6
  """
7
 
8
  import os
9
- import base64
10
  from pathlib import Path
11
 
12
- # os.environ.setdefault("GRADIO_API_INFO_ENABLED", "false")
13
-
14
  import gradio as gr
15
  import requests
 
 
16
  from dotenv import load_dotenv
17
 
18
- load_dotenv() # reads variables from a .env file and sets them in os.environ
19
-
 
 
 
 
20
 
21
- LOGO_BASE64 = ""
22
- logo_path = Path(__file__).parent / "logo.png"
23
- if logo_path.exists():
24
- try:
25
- LOGO_BASE64 = base64.b64encode(logo_path.read_bytes()).decode("utf-8")
26
- except Exception as e:
27
- print(f"⚠️ Unable to load logo.png: {e}")
28
 
29
- DEFAULT_LOGO_URL = "https://storage.googleapis.com/desivocal-prod/desi-vocal/logo.png"
30
- LOGO_URL = os.environ.get("STT_LOGO_URL", DEFAULT_LOGO_URL).strip()
 
31
 
32
- # Backend API endpoint (ngrok URL)
33
- # You can update this via Hugging Face Space Secrets
34
- API_ENDPOINT = os.environ.get("STT_API_ENDPOINT", "")
35
 
36
 
37
  class RinggSTTClient:
@@ -43,49 +40,79 @@ class RinggSTTClient:
43
  self.session.headers.update({"User-Agent": "RinggSTT-HF-Space/1.0"})
44
 
45
  def check_health(self) -> dict:
46
- """Check if the API is available"""
47
  try:
48
  response = self.session.get(f"{self.api_endpoint}/health", timeout=5)
49
  if response.status_code == 200:
50
  return {"status": "healthy", "message": "βœ… API is online"}
51
- else:
52
- return {
53
- "status": "error",
54
- "message": f"❌ API returned status {response.status_code}",
55
- }
56
- except requests.exceptions.Timeout:
57
- return {"status": "error", "message": "⏱️ API request timed out"}
58
- except requests.exceptions.ConnectionError:
59
- return {"status": "error", "message": "❌ Cannot connect to API"}
60
  except Exception as e:
61
  return {"status": "error", "message": f"❌ Error: {str(e)}"}
62
 
63
- def transcribe_audio(self, audio_file_path: str) -> str:
64
- """Transcribe audio file via API"""
65
  try:
66
- # Read audio file and encode as base64
67
- with open(audio_file_path, "rb") as f:
68
- audio_data = f.read()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
 
70
- audio_base64 = base64.b64encode(audio_data).decode("utf-8")
 
 
 
 
 
 
 
 
 
 
 
 
71
 
72
- # Make API request
73
- response = self.session.post(
74
- f"{self.api_endpoint}/transcribe",
75
- json={"audio_data": audio_base64, "sample_rate": 16000},
76
- timeout=30,
77
- )
 
 
 
 
 
 
78
 
79
  if response.status_code == 200:
80
  result = response.json()
 
 
 
 
 
 
 
81
  return result.get("transcription", "No transcription received")
82
  else:
83
- return f"❌ API Error: {response.status_code} - {response.text}"
84
 
85
- except requests.exceptions.Timeout:
86
- return "⏱️ Request timed out. The audio file might be too long."
87
- except requests.exceptions.ConnectionError:
88
- return "❌ Cannot connect to the transcription service. Please try again later."
89
  except Exception as e:
90
  return f"❌ Error: {str(e)}"
91
 
@@ -93,35 +120,140 @@ class RinggSTTClient:
93
  # Initialize API client
94
  print(f"πŸ”— Connecting to STT API: {API_ENDPOINT}")
95
  stt_client = RinggSTTClient(API_ENDPOINT)
96
-
97
- # Check health on startup
98
  health_status = stt_client.check_health()
99
  print(f"API Health: {health_status}")
100
 
101
 
102
- def create_interface():
103
- """Create Gradio interface"""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
104
 
105
- def transcribe_audio(audio_file):
106
- """Transcribe uploaded audio"""
107
- if audio_file is None:
108
- return "⚠️ Please upload an audio file to transcribe."
109
 
110
- transcription = stt_client.transcribe_audio(audio_file)
111
- text = (transcription or "").strip()
112
 
113
- if not text or text.startswith("❌") or text.startswith("⏱"):
114
- return text or "⚠️ No speech detectedβ€”try a clearer recording."
 
 
 
 
 
 
115
 
116
- # footer = "(Served via API β€’ Remote backend)"
117
- return text
118
 
119
- def check_api_status():
120
- """Check API health status"""
121
- health = stt_client.check_health()
122
- return health["message"]
 
123
 
124
- # Create interface
125
  with gr.Blocks(
126
  theme=gr.themes.Base(
127
  font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
@@ -129,48 +261,104 @@ def create_interface():
129
  css=".gradio-container {max-width: none !important;}",
130
  ) as demo:
131
  gr.HTML("""
132
- <div style="display: flex; align-items: center; gap: 10px;">
133
- <img style="width: 50px; height: 50px; background-color: white; border-radius: 10%;" src="https://storage.googleapis.com/desivocal-prod/desi-vocal/ringg.svg" alt="Logo">
134
- <h1 style="margin: 0;">Ringg Parrot STT V1.0 🦜</h1>
135
- </div>
136
- """)
137
-
138
- gr.Markdown(
139
- """
140
- ## πŸ“ Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)
141
- """
142
- )
143
- with gr.Column():
144
- with gr.Row():
 
 
 
 
 
 
 
 
 
 
 
 
 
 
145
  audio_input = gr.Audio(
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  type="filepath",
147
  sources=["upload"],
148
- show_label=False,
149
  )
150
-
 
 
151
  file_output = gr.Textbox(
152
- info="<p style=' text-align: center;'>Transcription Result</p>",
153
- container=False,
154
- lines=10,
155
  interactive=False,
156
  )
157
- transcribe_btn = gr.Button("Transcribe", variant="primary", size="lg")
158
 
159
  transcribe_btn.click(
160
- transcribe_audio,
161
- inputs=audio_input,
162
  outputs=file_output,
163
- concurrency_limit=1,
164
  )
165
 
166
- gr.Markdown(
167
- """
168
  <br>
169
 
170
  ## 🎯 Performance Benchmarks
171
- **Ringg Parrot STT V1** Ranks **1st** Among Top Models, Outperforming OpenAI Whisper Large-v3 and Other Leading Solutions.
172
- """
173
- )
174
 
175
  with gr.Row():
176
  gr.DataFrame(
@@ -186,48 +374,17 @@ def create_interface():
186
  interactive=False,
187
  )
188
 
189
- # gr.Markdown(
190
- # """
191
- # <br>
192
-
193
- # ## ✨ Features
194
- # - **Hindi Support**: Accurate transcription for Hindi audio
195
- # - **High Accuracy**: Competitive with leading ASR models
196
- # - **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
197
- # - **Fast Processing**: Optimized for quick transcription
198
- # """
199
- # )
200
-
201
- gr.Markdown(
202
- """
203
- <br>
204
-
205
- ## ⚠️ Benchmark Disclaimer
206
- - Evaluated on a modified FLEURS subset to ensure consistent Hindi coverage
207
- - Dataset issues include inaudible segments and repeated sentences caused by interruptions
208
- - Background noise is prominent across many clips, impacting recognition quality
209
- - Mixed Hindi-English speech often provides Hindi-only transcripts
210
- - Currency, time, and year normalization is inconsistent with spoken forms
211
- - Original transcripts lack punctuation, increasing WER for models that predict it
212
- """
213
- )
214
-
215
- gr.Markdown(
216
- """
217
- <br>
218
-
219
- ## πŸ™ Acknowledgements
220
- - Special thanks to [@jeremylee12](https://huggingface.co/jeremylee12) for their contributions
221
  - Built with [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) models
222
- """
223
- )
224
 
225
  return demo
226
 
227
 
228
- # Launch the app
229
  if __name__ == "__main__":
230
  print("🌐 Launching Ringg Parrot STT V1 Gradio Interface...")
 
231
  demo = create_interface()
232
  demo.queue(default_concurrency_limit=2, max_size=20)
233
  demo.launch(
 
1
  #!/usr/bin/env python3
 
2
  """
3
  Ringg Parrot STT V1 🦜 - Hugging Face Space (Frontend)
4
+ Real-time streaming transcription using Gradio's audio streaming.
5
  """
6
 
7
  import os
8
+ import tempfile
9
  from pathlib import Path
10
 
 
 
11
  import gradio as gr
12
  import requests
13
+ import numpy as np
14
+ import soundfile as sf
15
  from dotenv import load_dotenv
16
 
17
+ try:
18
+ import librosa
19
+ HAS_LIBROSA = True
20
+ except ImportError:
21
+ HAS_LIBROSA = False
22
+ print("⚠️ librosa not installed. Install with: pip install librosa")
23
 
24
+ load_dotenv()
 
 
 
 
 
 
25
 
26
+ # Backend API endpoint
27
+ API_ENDPOINT = os.environ.get("STT_API_ENDPOINT", "http://localhost:7864")
28
+ TARGET_SAMPLE_RATE = 16000
29
 
30
+ # How often to transcribe (in seconds of audio)
31
+ MIN_AUDIO_LENGTH = 0.4 # Transcribe when we have at least 400ms of new audio
 
32
 
33
 
34
  class RinggSTTClient:
 
40
  self.session.headers.update({"User-Agent": "RinggSTT-HF-Space/1.0"})
41
 
42
  def check_health(self) -> dict:
 
43
  try:
44
  response = self.session.get(f"{self.api_endpoint}/health", timeout=5)
45
  if response.status_code == 200:
46
  return {"status": "healthy", "message": "βœ… API is online"}
47
+ return {"status": "error", "message": f"❌ API returned status {response.status_code}"}
 
 
 
 
 
 
 
 
48
  except Exception as e:
49
  return {"status": "error", "message": f"❌ Error: {str(e)}"}
50
 
51
def transcribe_audio_data(self, audio_data: np.ndarray, sample_rate: int, language: str = "hi") -> str:
    """Transcribe in-memory audio samples via the multipart upload API.

    The samples are written to a temporary WAV file, which is then posted to
    ``{api_endpoint}/v1/audio/transcriptions`` together with the language
    code and ``punctuate=false``.

    Args:
        audio_data: Audio samples handed to ``sf.write``.
        sample_rate: Sample rate passed to ``sf.write`` for the WAV file.
        language: Language code forwarded to the backend.

    Returns:
        The ``transcription_channel_0`` field of the JSON reply when that key
        is present, otherwise the ``transcription`` field. An empty string is
        returned for a non-200 status, a missing or null field, or any error
        (errors are printed rather than raised).
    """
    try:
        # delete=False: the file is reopened by path below, so it must
        # outlive this handle; it is removed explicitly in the finally block.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
            temp_path = f.name

        try:
            # Encode inside the cleanup scope so that a failed write does not
            # leave the temporary file behind.
            sf.write(temp_path, audio_data, sample_rate)

            with open(temp_path, "rb") as f:
                files = {"file": ("audio.wav", f, "audio/wav")}
                data = {"language": language, "punctuate": "false"}
                response = self.session.post(
                    f"{self.api_endpoint}/v1/audio/transcriptions",
                    files=files,
                    data=data,
                    timeout=30,
                )

            # Debug: log the response for troubleshooting
            print(
                f"[transcribe_audio_data] status={response.status_code} "
                f"body={response.text[:500]}"
            )

            if response.status_code != 200:
                return ""

            result = response.json()
            # ``or ""`` keeps the declared ``str`` return type when the
            # backend sends an explicit null for the field.
            if "transcription_channel_0" in result:
                return result.get("transcription_channel_0") or ""
            return result.get("transcription") or ""
        finally:
            try:
                os.unlink(temp_path)
            except OSError as cleanup_error:
                # Best-effort cleanup: a leftover temp file must not discard
                # a transcription that was already obtained.
                print(f"Temp file cleanup error: {cleanup_error}")

    except Exception as e:
        print(f"Transcription error: {e}")
        return ""
89
 
90
def transcribe_file(self, audio_file_path: str, language: str = "hi") -> str:
    """Upload an audio file to the transcription endpoint and return its text.

    Args:
        audio_file_path: Path of the audio file to upload.
        language: Language code forwarded to the backend.

    Returns:
        The transcript text. When the reply carries per-channel fields, the
        non-empty channel transcripts are concatenated (the second one is
        prefixed with ``[Channel 2]``). A non-200 status or any exception
        yields an error string instead of raising.
    """
    try:
        with open(audio_file_path, "rb") as audio_handle:
            reply = self.session.post(
                f"{self.api_endpoint}/v1/audio/transcriptions",
                files={"file": (Path(audio_file_path).name, audio_handle)},
                data={"language": language, "punctuate": "false"},
                timeout=120,
            )

        if reply.status_code != 200:
            return f"❌ API Error: {reply.status_code}"

        payload = reply.json()
        if "transcription_channel_0" not in payload:
            # Single-transcript reply shape.
            return payload.get("transcription", "No transcription received")

        # Per-channel reply shape: keep only the channels that have text.
        pieces = []
        first_channel = payload.get("transcription_channel_0")
        if first_channel:
            pieces.append(first_channel)
        second_channel = payload.get("transcription_channel_1")
        if second_channel:
            pieces.append(f"\n[Channel 2]: {second_channel}")

        if not pieces:
            return "No speech detected"
        return "".join(pieces)

    except Exception as exc:
        return f"❌ Error: {str(exc)}"
118
 
 
120
  # Initialize API client
121
  print(f"πŸ”— Connecting to STT API: {API_ENDPOINT}")
122
  stt_client = RinggSTTClient(API_ENDPOINT)
 
 
123
  health_status = stt_client.check_health()
124
  print(f"API Health: {health_status}")
125
 
126
 
127
def resample_audio(audio: np.ndarray, orig_sr: int, target_sr: int) -> np.ndarray:
    """Resample audio to the target sample rate.

    Args:
        audio: Audio samples to resample.
        orig_sr: Sample rate of ``audio``.
        target_sr: Desired sample rate.

    Returns:
        ``audio`` itself when the two rates already match; otherwise a
        float64 array produced by ``librosa.resample`` when librosa is
        available, or by linear interpolation when it is not. Empty input
        yields an empty float64 array.
    """
    if orig_sr == target_sr:
        return audio

    if len(audio) == 0:
        # Nothing to resample. Without this guard the interpolation fallback
        # calls np.interp with an empty sample-point array, which raises.
        return audio.astype(np.float64)

    if HAS_LIBROSA:
        return librosa.resample(audio.astype(np.float64), orig_sr=orig_sr, target_sr=target_sr)

    # Simple linear interpolation fallback
    duration = len(audio) / orig_sr
    new_length = int(duration * target_sr)
    indices = np.linspace(0, len(audio) - 1, new_length)
    return np.interp(indices, np.arange(len(audio)), audio.astype(np.float64))
140
+
141
+
142
def _peak_normalize(samples: np.ndarray, peak: float = 0.95) -> np.ndarray:
    """Scale *samples* so that the largest absolute value equals *peak*."""
    # Convert before taking np.abs: on fixed-width integer PCM the most
    # negative value has no positive counterpart and wraps (int16 -32768
    # stays -32768), which would corrupt the peak estimate.
    samples = samples.astype(np.float64)
    loudest = np.max(np.abs(samples))
    if loudest > 0:
        samples = samples / loudest * peak
    return samples


def transcribe_stream(audio, language, audio_buffer, last_transcription, samples_processed):
    """
    Process one streaming audio chunk from the microphone.

    Simplified approach:
    - Accumulate ALL audio chunks
    - When we have enough new audio, transcribe the ENTIRE recording
    - Display the complete transcription (backend handles everything)

    Args:
        audio: Value delivered by the streaming audio component; only a
            ``(sample_rate, samples)`` tuple holding a non-empty numpy array
            is processed, anything else just re-displays the current text.
        language: ``"Hindi"`` selects the ``"hi"`` language code; any other
            value selects ``"en"``.
        audio_buffer: List of chunks received so far (``None`` on first use).
            The new chunk is appended to this list in place.
        last_transcription: Most recent non-empty transcription (or ``None``).
        samples_processed: Number of buffered samples already covered by the
            last transcription request (or ``None``).

    Returns:
        ``(display_text, audio_buffer, last_transcription, samples_processed)``.
    """
    # Initialize states
    if audio_buffer is None:
        audio_buffer = []
    if last_transcription is None:
        last_transcription = ""
    if samples_processed is None:
        samples_processed = 0

    def _reply(placeholder):
        # Show the latest transcription when there is one, else the given
        # placeholder; state values are read at call time.
        return (
            last_transcription or placeholder,
            audio_buffer,
            last_transcription,
            samples_processed,
        )

    # Handle invalid audio input
    if audio is None or isinstance(audio, int):
        return _reply("🎀 Click microphone to start...")

    # Gradio streaming returns (sample_rate, audio_data)
    if not isinstance(audio, tuple) or len(audio) != 2:
        return _reply("🎀 Listening...")

    sample_rate, audio_data = audio

    if not isinstance(audio_data, np.ndarray) or len(audio_data) == 0:
        return _reply("🎀 Listening...")

    # Convert stereo to mono if needed
    if audio_data.ndim > 1:
        audio_data = np.mean(audio_data, axis=1)

    # Append this chunk to buffer
    audio_buffer.append(audio_data.copy())

    # Calculate total samples we have now
    total_samples = sum(len(chunk) for chunk in audio_buffer)
    total_duration = total_samples / sample_rate

    # Calculate new audio since last transcription
    new_duration = (total_samples - samples_processed) / sample_rate

    # Only transcribe if we have enough NEW audio (to avoid too frequent API calls)
    if new_duration < MIN_AUDIO_LENGTH:
        return _reply(f"🎀 Recording... ({total_duration:.1f}s)")

    try:
        # Concatenate ALL buffered audio
        full_audio = np.concatenate(audio_buffer)

        # Resample to 16kHz if needed
        if sample_rate != TARGET_SAMPLE_RATE:
            full_audio = resample_audio(full_audio, sample_rate, TARGET_SAMPLE_RATE)

        # Normalize audio (in floating point, see _peak_normalize)
        full_audio = _peak_normalize(full_audio)

        # Get language code
        lang_code = "hi" if language == "Hindi" else "en"

        # Transcribe the ENTIRE audio
        transcription = stt_client.transcribe_audio_data(
            full_audio.astype(np.float32),
            TARGET_SAMPLE_RATE,
            lang_code,
        )

        # Update state; keep the previous text when the backend returned
        # nothing usable so the display does not flicker back to empty.
        if transcription and transcription.strip():
            last_transcription = transcription

        # Mark all current samples as processed
        samples_processed = total_samples

        return _reply(f"🎀 Recording... ({total_duration:.1f}s)")

    except Exception as e:
        print(f"Processing error: {e}")
        return _reply("🎀 Listening...")
232
+
233
 
234
def clear_transcription():
    """Return the reset values for the display text and the streaming state.

    Returns:
        ``(display_text, audio_buffer, last_transcription, samples_processed)``
        with the placeholder prompt, no buffer, an empty transcription and a
        zero sample count.
    """
    placeholder_text = "🎀 Click microphone to start..."
    empty_buffer, blank_transcription, zero_samples = None, "", 0
    return placeholder_text, empty_buffer, blank_transcription, zero_samples
 
237
 
 
 
238
 
239
def transcribe_file(audio_file, language):
    """Transcribe an uploaded audio file and return the text to display.

    Args:
        audio_file: Path of the uploaded file, or ``None`` when nothing was
            uploaded.
        language: ``"Hindi"`` selects the ``"hi"`` language code; any other
            value selects ``"en"``.

    Returns:
        The stripped transcription (or the client's error string), or a
        warning message when no file was given or no text came back.
    """
    if audio_file is None:
        return "⚠️ Please upload an audio file to transcribe."

    backend_language = "en" if language != "Hindi" else "hi"
    raw_result = stt_client.transcribe_file(audio_file, backend_language)
    cleaned = (raw_result or "").strip()

    # Normal transcript: non-empty and not one of the client's error strings.
    if cleaned and not cleaned.startswith(("❌", "⏱")):
        return cleaned

    # Error strings pass through unchanged; empty text gets the fallback hint.
    return cleaned or "⚠️ No speech detectedβ€”try a clearer recording."
252
+
253
+
254
+ def create_interface():
255
+ """Create Gradio interface"""
256
 
 
257
  with gr.Blocks(
258
  theme=gr.themes.Base(
259
  font=[gr.themes.GoogleFont("Source Sans Pro"), "Arial", "sans-serif"]
 
261
  css=".gradio-container {max-width: none !important;}",
262
  ) as demo:
263
  gr.HTML("""
264
+ <div style="display: flex; align-items: center; gap: 10px;">
265
+ <img style="width: 50px; height: 50px; background-color: white; border-radius: 10%;"
266
+ src="https://storage.googleapis.com/desivocal-prod/desi-vocal/ringg.svg" alt="Logo">
267
+ <h1 style="margin: 0;">Ringg Parrot STT V1.0 🦜</h1>
268
+ </div>
269
+ """)
270
+
271
+ # Real-time streaming section
272
+ gr.Markdown("""
273
+ ## 🎀 Real-time Transcription
274
+ Click the microphone to start recording. Transcription updates as you speak.
275
+
276
+ *The entire recording is transcribed each time, so text may refine as more context is added.*
277
+ """)
278
+
279
+ # States for streaming
280
+ audio_buffer = gr.State(None)
281
+ last_transcription = gr.State("")
282
+ samples_processed = gr.State(0)
283
+
284
+ with gr.Row():
285
+ with gr.Column(scale=1):
286
+ stream_language = gr.Dropdown(
287
+ choices=["Hindi", "English"],
288
+ value="Hindi",
289
+ label="Language",
290
+ )
291
  audio_input = gr.Audio(
292
+ sources=["microphone"],
293
+ type="numpy",
294
+ streaming=True,
295
+ label="🎀 Click to start recording",
296
+ )
297
+ clear_btn = gr.Button("πŸ—‘οΈ Clear & Reset", variant="secondary")
298
+
299
+ with gr.Column(scale=2):
300
+ text_output = gr.Textbox(
301
+ label="Transcription",
302
+ value="🎀 Click microphone to start...",
303
+ lines=10,
304
+ interactive=False,
305
+ )
306
+
307
+ # Wire up streaming
308
+ audio_input.stream(
309
+ fn=transcribe_stream,
310
+ inputs=[audio_input, stream_language, audio_buffer, last_transcription, samples_processed],
311
+ outputs=[text_output, audio_buffer, last_transcription, samples_processed],
312
+ )
313
+
314
+ # Clear button
315
+ clear_btn.click(
316
+ fn=clear_transcription,
317
+ inputs=[],
318
+ outputs=[text_output, audio_buffer, last_transcription, samples_processed],
319
+ )
320
+
321
+ gr.Markdown("<br>")
322
+
323
+ # File upload section
324
+ gr.Markdown("""
325
+ ## πŸ“ Upload an audio file for transcription
326
+ Supports WAV, MP3, FLAC, M4A, and more.
327
+ """)
328
+
329
+ with gr.Row():
330
+ with gr.Column(scale=1):
331
+ file_language = gr.Dropdown(
332
+ choices=["Hindi", "English"],
333
+ value="Hindi",
334
+ label="Language",
335
+ )
336
+ file_input = gr.Audio(
337
  type="filepath",
338
  sources=["upload"],
339
+ label="Upload Audio",
340
  )
341
+ transcribe_btn = gr.Button("Transcribe File", variant="primary", size="lg")
342
+
343
+ with gr.Column(scale=2):
344
  file_output = gr.Textbox(
345
+ label="Transcription",
346
+ lines=8,
 
347
  interactive=False,
348
  )
 
349
 
350
  transcribe_btn.click(
351
+ fn=transcribe_file,
352
+ inputs=[file_input, file_language],
353
  outputs=file_output,
 
354
  )
355
 
356
+ gr.Markdown("""
 
357
  <br>
358
 
359
  ## 🎯 Performance Benchmarks
360
+ **Ringg Parrot STT V1** Ranks **1st** Among Top Models.
361
+ """)
 
362
 
363
  with gr.Row():
364
  gr.DataFrame(
 
374
  interactive=False,
375
  )
376
 
377
+ gr.Markdown("""
378
+ ## πŸ™ Acknowledgements
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
379
  - Built with [NVIDIA NeMo](https://github.com/NVIDIA/NeMo) models
380
+ """)
 
381
 
382
  return demo
383
 
384
 
 
385
  if __name__ == "__main__":
386
  print("🌐 Launching Ringg Parrot STT V1 Gradio Interface...")
387
+ print(f"Backend API: {API_ENDPOINT}")
388
  demo = create_interface()
389
  demo.queue(default_concurrency_limit=2, max_size=20)
390
  demo.launch(