Spaces:
Running
Running
Commit
·
7925ca5
1
Parent(s):
944e4f0
fix: add support for additional model in available models and improve audio processing logic
Browse files
app.py
CHANGED
|
@@ -13,7 +13,7 @@ model = None
|
|
| 13 |
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
|
| 14 |
|
| 15 |
# Available models
|
| 16 |
-
available_models = ["nvidia/parakeet-tdt-0.6b-v2"]
|
| 17 |
|
| 18 |
def load_model(model_name=None):
|
| 19 |
# This function will be called in the GPU worker process
|
|
@@ -25,9 +25,9 @@ def load_model(model_name=None):
|
|
| 25 |
# Check if we need to load a new model
|
| 26 |
if model is None or model_name != current_model_name:
|
| 27 |
print(f"Loading model {model_name} in worker process")
|
| 28 |
-
print(f"CUDA available: {torch.cuda.is_available()}")
|
| 29 |
-
if torch.cuda.is_available():
|
| 30 |
-
print(f"CUDA device: {torch.cuda.get_device_name(0)}")
|
| 31 |
|
| 32 |
# Update the current model name
|
| 33 |
current_model_name = model_name
|
|
@@ -89,9 +89,9 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
|
|
| 89 |
full_audio = full_audio.astype(float)
|
| 90 |
|
| 91 |
# Normalize audio (helps with consistent volume levels)
|
| 92 |
-
if np.abs(full_audio).max() > 0:
|
| 93 |
-
full_audio = full_audio / np.abs(full_audio).max() * 0.9
|
| 94 |
-
print("Audio normalized to improve transcription")
|
| 95 |
|
| 96 |
# Process chunks
|
| 97 |
new_state = state
|
|
@@ -102,7 +102,7 @@ def transcribe(audio, model_name="nvidia/parakeet-tdt-0.6b-v2", state="", audio_
|
|
| 102 |
start_sample = int(current_time * sample_rate)
|
| 103 |
end_sample = int((current_time + chunk_duration) * sample_rate)
|
| 104 |
if end_sample > total_samples_16k:
|
| 105 |
-
|
| 106 |
|
| 107 |
chunk = full_audio[start_sample:end_sample]
|
| 108 |
print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
|
|
@@ -206,7 +206,8 @@ with gr.Blocks(title="Real-time Speech-to-Text with NeMo") as demo:
|
|
| 206 |
sources=["microphone"],
|
| 207 |
type="numpy",
|
| 208 |
streaming=True,
|
| 209 |
-
label="Speak into your microphone"
|
|
|
|
| 210 |
)
|
| 211 |
|
| 212 |
clear_btn = gr.Button("Clear Transcript", variant="secondary")
|
|
|
|
| 13 |
current_model_name = "nvidia/parakeet-tdt-0.6b-v2"
|
| 14 |
|
| 15 |
# Available models
|
| 16 |
+
available_models = ["nvidia/parakeet-tdt-0.6b-v2","nvidia/parakeet-tdt-1.1b"]
|
| 17 |
|
| 18 |
def load_model(model_name=None):
|
| 19 |
# This function will be called in the GPU worker process
|
|
|
|
| 25 |
# Check if we need to load a new model
|
| 26 |
if model is None or model_name != current_model_name:
|
| 27 |
print(f"Loading model {model_name} in worker process")
|
| 28 |
+
# print(f"CUDA available: {torch.cuda.is_available()}")
|
| 29 |
+
# if torch.cuda.is_available():
|
| 30 |
+
# print(f"CUDA device: {torch.cuda.get_device_name(0)}")
|
| 31 |
|
| 32 |
# Update the current model name
|
| 33 |
current_model_name = model_name
|
|
|
|
| 89 |
full_audio = full_audio.astype(float)
|
| 90 |
|
| 91 |
# Normalize audio (helps with consistent volume levels)
|
| 92 |
+
# if np.abs(full_audio).max() > 0:
|
| 93 |
+
# full_audio = full_audio / np.abs(full_audio).max() * 0.9
|
| 94 |
+
# print("Audio normalized to improve transcription")
|
| 95 |
|
| 96 |
# Process chunks
|
| 97 |
new_state = state
|
|
|
|
| 102 |
start_sample = int(current_time * sample_rate)
|
| 103 |
end_sample = int((current_time + chunk_duration) * sample_rate)
|
| 104 |
if end_sample > total_samples_16k:
|
| 105 |
+
end_sample = total_samples_16k
|
| 106 |
|
| 107 |
chunk = full_audio[start_sample:end_sample]
|
| 108 |
print(f"Processing chunk from {current_time:.2f}s to {current_time + chunk_duration:.2f}s")
|
|
|
|
| 206 |
sources=["microphone"],
|
| 207 |
type="numpy",
|
| 208 |
streaming=True,
|
| 209 |
+
label="Speak into your microphone",
|
| 210 |
+
samplerate=16000
|
| 211 |
)
|
| 212 |
|
| 213 |
clear_btn = gr.Button("Clear Transcript", variant="secondary")
|