harsh2ai
committed on
Commit
·
2fc062a
1
Parent(s):
2cbdadf
Remove real-time streaming: Keep only file upload transcription
Browse files- app.py +5 -148
- requirements.txt +4 -0
app.py
CHANGED
|
@@ -91,33 +91,6 @@ class RinggSTTClient:
|
|
| 91 |
except Exception as e:
|
| 92 |
return f"❌ Error: {str(e)}"
|
| 93 |
|
| 94 |
-
def transcribe_streaming(self, audio_chunk: np.ndarray) -> Optional[str]:
|
| 95 |
-
"""Send audio chunk for streaming transcription"""
|
| 96 |
-
try:
|
| 97 |
-
# Convert numpy array to base64
|
| 98 |
-
audio_bytes = audio_chunk.tobytes()
|
| 99 |
-
audio_base64 = base64.b64encode(audio_bytes).decode('utf-8')
|
| 100 |
-
|
| 101 |
-
response = self.session.post(
|
| 102 |
-
f"{self.api_endpoint}/transcribe_stream",
|
| 103 |
-
json={
|
| 104 |
-
"audio_chunk": audio_base64,
|
| 105 |
-
"dtype": str(audio_chunk.dtype),
|
| 106 |
-
"shape": list(audio_chunk.shape)
|
| 107 |
-
},
|
| 108 |
-
timeout=10
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
if response.status_code == 200:
|
| 112 |
-
result = response.json()
|
| 113 |
-
return result.get("transcription")
|
| 114 |
-
return None
|
| 115 |
-
|
| 116 |
-
except Exception as e:
|
| 117 |
-
print(f"Streaming error: {e}")
|
| 118 |
-
return None
|
| 119 |
-
|
| 120 |
-
|
| 121 |
# Initialize API client
|
| 122 |
print(f"π Connecting to STT API: {API_ENDPOINT}")
|
| 123 |
stt_client = RinggSTTClient(API_ENDPOINT)
|
|
@@ -137,43 +110,6 @@ def create_interface():
|
|
| 137 |
|
| 138 |
return stt_client.transcribe_audio(audio_file)
|
| 139 |
|
| 140 |
-
def stream_audio(audio, state):
|
| 141 |
-
"""Handle streaming audio"""
|
| 142 |
-
if audio is None:
|
| 143 |
-
return "No audio input", state
|
| 144 |
-
|
| 145 |
-
try:
|
| 146 |
-
if state is None:
|
| 147 |
-
state = {"transcripts": []}
|
| 148 |
-
|
| 149 |
-
if isinstance(audio, tuple):
|
| 150 |
-
sample_rate, audio_array = audio
|
| 151 |
-
else:
|
| 152 |
-
audio_array = audio
|
| 153 |
-
sample_rate = 16000
|
| 154 |
-
|
| 155 |
-
if audio_array is not None and len(audio_array) > 0:
|
| 156 |
-
if len(audio_array.shape) > 1:
|
| 157 |
-
audio_array = np.mean(audio_array, axis=1)
|
| 158 |
-
|
| 159 |
-
audio_array = audio_array.astype(np.float32)
|
| 160 |
-
max_abs = np.max(np.abs(audio_array)) if audio_array.size else 0.0
|
| 161 |
-
if max_abs > 1e-6:
|
| 162 |
-
audio_array = audio_array / max_abs
|
| 163 |
-
|
| 164 |
-
# Send to API
|
| 165 |
-
transcript = stt_client.transcribe_streaming(audio_array)
|
| 166 |
-
|
| 167 |
-
if transcript and transcript.strip():
|
| 168 |
-
if not state["transcripts"] or transcript != state["transcripts"][-1]:
|
| 169 |
-
state["transcripts"].append(transcript)
|
| 170 |
-
|
| 171 |
-
combined = " ".join(state["transcripts"]) if state["transcripts"] else "π€ Listening..."
|
| 172 |
-
return combined, state
|
| 173 |
-
|
| 174 |
-
except Exception as e:
|
| 175 |
-
return f"β Error: {str(e)}", state
|
| 176 |
-
|
| 177 |
def check_api_status():
|
| 178 |
"""Check API health status"""
|
| 179 |
health = stt_client.check_health()
|
|
@@ -220,9 +156,9 @@ def create_interface():
|
|
| 220 |
gr.Markdown("""
|
| 221 |
### ✨ Features
|
| 222 |
- π **Bilingual Support**: Transcribe English and Hindi speech
|
| 223 |
-
-
|
| 224 |
-
- 🎯 **High Accuracy**: Competitive with leading ASR models
|
| 225 |
- π **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
|
|
|
|
| 226 |
- π **Private Infrastructure**: Secure and controlled deployment
|
| 227 |
""")
|
| 228 |
|
|
@@ -238,67 +174,6 @@ def create_interface():
|
|
| 238 |
check_btn = gr.Button("π Check Status", size="sm")
|
| 239 |
check_btn.click(check_api_status, outputs=api_status)
|
| 240 |
|
| 241 |
-
with gr.Tab("π€ Real-time Streaming"):
|
| 242 |
-
gr.Markdown("### Live Microphone Transcription")
|
| 243 |
-
gr.Markdown("Speak into your microphone for real-time transcription in English or Hindi.")
|
| 244 |
-
|
| 245 |
-
gr.Markdown("""
|
| 246 |
-
⚠️ **Note**: Real-time streaming sends audio chunks to the API endpoint.
|
| 247 |
-
Make sure your backend service is running and accessible.
|
| 248 |
-
""")
|
| 249 |
-
|
| 250 |
-
# Buffer Configuration Controls
|
| 251 |
-
with gr.Row():
|
| 252 |
-
with gr.Column():
|
| 253 |
-
gr.Markdown("### π§ Streaming Configuration")
|
| 254 |
-
gr.Markdown("Adjust these settings to optimize streaming performance based on your connection.")
|
| 255 |
-
|
| 256 |
-
buffer_duration = gr.Slider(
|
| 257 |
-
minimum=2.0, maximum=6.0, step=0.5, value=3.0,
|
| 258 |
-
label="Buffer Duration (seconds)",
|
| 259 |
-
info="Size of audio chunks sent to API"
|
| 260 |
-
)
|
| 261 |
-
process_every_n = gr.Slider(
|
| 262 |
-
minimum=2, maximum=8, step=1, value=3,
|
| 263 |
-
label="Process Every N Chunks",
|
| 264 |
-
info="How often to send audio (higher = less frequent)"
|
| 265 |
-
)
|
| 266 |
-
min_interval = gr.Slider(
|
| 267 |
-
minimum=1.0, maximum=4.0, step=0.5, value=2.0,
|
| 268 |
-
label="Min Processing Interval (seconds)",
|
| 269 |
-
info="Minimum time between API calls"
|
| 270 |
-
)
|
| 271 |
-
|
| 272 |
-
gr.Markdown("""
|
| 273 |
-
**π‘ Tuning Tips:**
|
| 274 |
-
- **Lower latency**: Decrease buffer duration and interval
|
| 275 |
-
- **Better accuracy**: Increase buffer duration
|
| 276 |
-
- **Reduce API calls**: Increase process frequency
|
| 277 |
-
- **Slow connection**: Increase all values
|
| 278 |
-
""")
|
| 279 |
-
|
| 280 |
-
mic_input = gr.Audio(
|
| 281 |
-
sources=["microphone"],
|
| 282 |
-
type="numpy",
|
| 283 |
-
streaming=True,
|
| 284 |
-
label="π€ Microphone Input"
|
| 285 |
-
)
|
| 286 |
-
|
| 287 |
-
live_output = gr.Textbox(
|
| 288 |
-
label="Live Transcription",
|
| 289 |
-
lines=8,
|
| 290 |
-
interactive=False,
|
| 291 |
-
placeholder="Your transcription will appear here..."
|
| 292 |
-
)
|
| 293 |
-
|
| 294 |
-
session_state = gr.State(lambda: None)
|
| 295 |
-
|
| 296 |
-
mic_input.stream(
|
| 297 |
-
fn=stream_audio,
|
| 298 |
-
inputs=[mic_input, session_state],
|
| 299 |
-
outputs=[live_output, session_state]
|
| 300 |
-
)
|
| 301 |
-
|
| 302 |
with gr.Tab("π File Upload"):
|
| 303 |
gr.Markdown("### Upload Audio File")
|
| 304 |
gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
|
|
@@ -330,28 +205,10 @@ def create_interface():
|
|
| 330 |
- Speak naturally at a moderate pace
|
| 331 |
- For file upload, ensure audio quality is good (16kHz or higher recommended)
|
| 332 |
- Model handles code-switching between English and Hindi
|
| 333 |
-
""")
|
| 334 |
-
|
| 335 |
-
with gr.Tab("⚙️ Configuration"):
|
| 336 |
-
gr.Markdown("### API Endpoint Configuration")
|
| 337 |
-
gr.Markdown(f"""
|
| 338 |
-
**Current API Endpoint**: `{API_ENDPOINT}`
|
| 339 |
-
|
| 340 |
-
The transcription service runs on a private infrastructure and is accessed via a secure API endpoint.
|
| 341 |
-
|
| 342 |
-
#### How it Works:
|
| 343 |
-
1. π€ You interact with this Hugging Face Space (frontend)
|
| 344 |
-
2. π‘ Audio is sent to the private API endpoint
|
| 345 |
-
3. π€ The model processes the audio on secure infrastructure
|
| 346 |
-
4. π Transcription is returned and displayed
|
| 347 |
-
|
| 348 |
-
#### Benefits:
|
| 349 |
-
- π **Privacy**: Model and data stay on private infrastructure
|
| 350 |
-
- β‘ **Performance**: Dedicated compute resources
|
| 351 |
-
- π― **Control**: Full control over the model and processing
|
| 352 |
-
- π° **Cost-effective**: Use your own compute resources
|
| 353 |
|
| 354 |
-
|
|
|
|
|
|
|
| 355 |
""")
|
| 356 |
|
| 357 |
with gr.Tab("ℹ️ About"):
|
|
|
|
| 91 |
except Exception as e:
|
| 92 |
return f"❌ Error: {str(e)}"
|
| 93 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
# Initialize API client
|
| 95 |
print(f"π Connecting to STT API: {API_ENDPOINT}")
|
| 96 |
stt_client = RinggSTTClient(API_ENDPOINT)
|
|
|
|
| 110 |
|
| 111 |
return stt_client.transcribe_audio(audio_file)
|
| 112 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 113 |
def check_api_status():
|
| 114 |
"""Check API health status"""
|
| 115 |
health = stt_client.check_health()
|
|
|
|
| 156 |
gr.Markdown("""
|
| 157 |
### ✨ Features
|
| 158 |
- π **Bilingual Support**: Transcribe English and Hindi speech
|
| 159 |
+
- 🎯 **High Accuracy**: Competitive with leading ASR models
|
|
|
|
| 160 |
- π **File Upload**: Support for various audio formats (WAV, MP3, FLAC, etc.)
|
| 161 |
+
- ⚡ **Fast Processing**: Optimized for quick transcription
|
| 162 |
- π **Private Infrastructure**: Secure and controlled deployment
|
| 163 |
""")
|
| 164 |
|
|
|
|
| 174 |
check_btn = gr.Button("π Check Status", size="sm")
|
| 175 |
check_btn.click(check_api_status, outputs=api_status)
|
| 176 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 177 |
with gr.Tab("π File Upload"):
|
| 178 |
gr.Markdown("### Upload Audio File")
|
| 179 |
gr.Markdown("Upload an audio file for transcription (supports WAV, MP3, FLAC, M4A, etc.)")
|
|
|
|
| 205 |
- Speak naturally at a moderate pace
|
| 206 |
- For file upload, ensure audio quality is good (16kHz or higher recommended)
|
| 207 |
- Model handles code-switching between English and Hindi
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 208 |
|
| 209 |
+
### π§ Backend Configuration
|
| 210 |
+
This Space connects to a private API endpoint for transcription.
|
| 211 |
+
The API endpoint can be configured via the `STT_API_ENDPOINT` secret in Space Settings.
|
| 212 |
""")
|
| 213 |
|
| 214 |
with gr.Tab("ℹ️ About"):
|
requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
gradio==4.44.0
|
| 2 |
+
numpy
|
| 3 |
+
requests
|
| 4 |
+
|