Spaces:
Running on Zero
Running on Zero
FireRed Team committed on
Upload app.py
Browse files
app.py
CHANGED
|
@@ -9,6 +9,7 @@ from fireredasr2s import FireRedAsr2System, FireRedAsr2SystemConfig
|
|
| 9 |
from fireredasr2s.fireredasr2.asr import FireRedAsr2, FireRedAsr2Config
|
| 10 |
from fireredasr2s.fireredvad.vad import FireRedVad, FireRedVadConfig
|
| 11 |
from fireredasr2s.fireredvad.aed import FireRedAed, FireRedAedConfig
|
|
|
|
| 12 |
|
| 13 |
|
| 14 |
asr_system = None
|
|
@@ -16,6 +17,7 @@ asr_model_aed = None
|
|
| 16 |
asr_model_llm = None
|
| 17 |
vad_model = None
|
| 18 |
aed_model = None
|
|
|
|
| 19 |
|
| 20 |
|
| 21 |
def init_model(model_dir_aed, model_dir_llm):
|
|
@@ -24,6 +26,7 @@ def init_model(model_dir_aed, model_dir_llm):
|
|
| 24 |
global asr_model_llm
|
| 25 |
global vad_model
|
| 26 |
global aed_model
|
|
|
|
| 27 |
if asr_system is None:
|
| 28 |
asr_system_config = FireRedAsr2SystemConfig() # Use default config
|
| 29 |
asr_system = FireRedAsr2System(asr_system_config)
|
|
@@ -61,7 +64,8 @@ def init_model(model_dir_aed, model_dir_llm):
|
|
| 61 |
extend_speech_frame=0,
|
| 62 |
chunk_max_frame=30000)
|
| 63 |
vad_model = FireRedVad.from_pretrained("pretrained_models/FireRedVAD/VAD", vad_config)
|
| 64 |
-
|
|
|
|
| 65 |
use_gpu=False,
|
| 66 |
smooth_window_size=5,
|
| 67 |
speech_threshold=0.4,
|
|
@@ -74,7 +78,17 @@ def init_model(model_dir_aed, model_dir_llm):
|
|
| 74 |
extend_speech_frame=0,
|
| 75 |
chunk_max_frame=30000)
|
| 76 |
aed_model = FireRedAed.from_pretrained("pretrained_models/FireRedVAD/AED", aed_config)
|
| 77 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
|
| 79 |
|
| 80 |
@spaces.GPU(duration=20)
|
|
@@ -118,11 +132,13 @@ def asr_inference_llm(audio_file):
|
|
| 118 |
def vad_inference(audio_file):
|
| 119 |
if not audio_file:
|
| 120 |
return "Please upload a wav file"
|
| 121 |
-
result, probs =
|
| 122 |
s = f'Duration: {result["dur"]}s'
|
| 123 |
-
s += f'\
|
| 124 |
-
result, probs =
|
| 125 |
-
s += f'\
|
|
|
|
|
|
|
| 126 |
return s
|
| 127 |
|
| 128 |
|
|
|
|
| 9 |
from fireredasr2s.fireredasr2.asr import FireRedAsr2, FireRedAsr2Config
|
| 10 |
from fireredasr2s.fireredvad.vad import FireRedVad, FireRedVadConfig
|
| 11 |
from fireredasr2s.fireredvad.aed import FireRedAed, FireRedAedConfig
|
| 12 |
+
from fireredasr2s.fireredvad.stream_vad import FireRedStreamVad, FireRedStreamVadConfig
|
| 13 |
|
| 14 |
|
| 15 |
asr_system = None
|
|
|
|
| 17 |
asr_model_llm = None
|
| 18 |
vad_model = None
|
| 19 |
aed_model = None
|
| 20 |
+
stream_vad_model = None
|
| 21 |
|
| 22 |
|
| 23 |
def init_model(model_dir_aed, model_dir_llm):
|
|
|
|
| 26 |
global asr_model_llm
|
| 27 |
global vad_model
|
| 28 |
global aed_model
|
| 29 |
+
global stream_vad_model
|
| 30 |
if asr_system is None:
|
| 31 |
asr_system_config = FireRedAsr2SystemConfig() # Use default config
|
| 32 |
asr_system = FireRedAsr2System(asr_system_config)
|
|
|
|
| 64 |
extend_speech_frame=0,
|
| 65 |
chunk_max_frame=30000)
|
| 66 |
vad_model = FireRedVad.from_pretrained("pretrained_models/FireRedVAD/VAD", vad_config)
|
| 67 |
+
if aed_model is None:
|
| 68 |
+
aed_config = FireRedAedConfig(
|
| 69 |
use_gpu=False,
|
| 70 |
smooth_window_size=5,
|
| 71 |
speech_threshold=0.4,
|
|
|
|
| 78 |
extend_speech_frame=0,
|
| 79 |
chunk_max_frame=30000)
|
| 80 |
aed_model = FireRedAed.from_pretrained("pretrained_models/FireRedVAD/AED", aed_config)
|
| 81 |
+
if stream_vad_model is None:
|
| 82 |
+
vad_config = FireRedStreamVadConfig(
|
| 83 |
+
use_gpu=False,
|
| 84 |
+
smooth_window_size=5,
|
| 85 |
+
speech_threshold=0.4,
|
| 86 |
+
pad_start_frame=5,
|
| 87 |
+
min_speech_frame=8,
|
| 88 |
+
max_speech_frame=2000,
|
| 89 |
+
min_silence_frame=20,
|
| 90 |
+
chunk_max_frame=30000)
|
| 91 |
+
stream_vad_model = FireRedStreamVad.from_pretrained("pretrained_models/FireRedVAD/Stream-VAD", vad_config)
|
| 92 |
|
| 93 |
|
| 94 |
@spaces.GPU(duration=20)
|
|
|
|
| 132 |
def vad_inference(audio_file):
    """Run offline VAD, streaming VAD, and audio-event detection on one file.

    Args:
        audio_file: Path to the uploaded wav file; falsy when no file was
            uploaded through the UI.

    Returns:
        A multi-line human-readable report string, or a prompt asking the
        user to upload a file when ``audio_file`` is falsy.
    """
    if not audio_file:
        return "Please upload a wav file"
    # Offline (full-utterance) VAD over the whole file.
    result, probs = vad_model.detect(audio_file)
    s = f'Duration: {result["dur"]}s'
    s += f'\nVAD: {result["timestamps"]}'
    # Streaming VAD on the same audio, for side-by-side comparison.
    result, probs = stream_vad_model.detect_full(audio_file)
    s += f'\nStream VAD: {result["timestamps"]}'
    # Acoustic event detection (AED).
    result, probs = aed_model.detect(audio_file)
    # Bug fix: original read `results["event2ratio"]` — `results` is an
    # undefined name (the detect() call binds `result`), so every real
    # inference raised NameError before returning.
    s += f'\nAudio Event: {result["event2ratio"]}\n {result["event2timestamps"]}'
    return s
|
| 143 |
|
| 144 |
|