FireRed Team committed on
Commit
b71b08b
·
verified ·
1 Parent(s): cd83b92

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +52 -1
app.py CHANGED
@@ -7,17 +7,22 @@ from huggingface_hub import snapshot_download
7
  sys.path.append("./fireredasr2s")
8
  from fireredasr2s import FireRedAsr2System, FireRedAsr2SystemConfig
9
  from fireredasr2s.fireredasr2.asr import FireRedAsr2, FireRedAsr2Config
 
10
 
11
 
12
  asr_system = None
13
  asr_model_aed = None
14
  asr_model_llm = None
 
 
15
 
16
 
17
  def init_model(model_dir_aed, model_dir_llm):
18
  global asr_system
19
  global asr_model_aed
20
  global asr_model_llm
 
 
21
  if asr_system is None:
22
  asr_system_config = FireRedAsr2SystemConfig() # Use default config
23
  asr_system = FireRedAsr2System(asr_system_config)
@@ -43,6 +48,32 @@ def init_model(model_dir_aed, model_dir_llm):
43
  temperature=1.0
44
  )
45
  asr_model_llm = FireRedAsr2.from_pretrained("llm", model_dir_llm, asr_config_llm)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
 
47
 
48
  @spaces.GPU(duration=20)
@@ -50,7 +81,7 @@ def asr_sys_inference(audio_file):
50
  if not audio_file:
51
  return "Please upload a wav file"
52
  results = asr_system.process(audio_file)
53
- s = f'ASR: {results["text"]}\nSentences: {results["sentences"]}\nVAD(ms): {results["vad_segments_ms"]}'
54
  return s
55
 
56
 
@@ -82,6 +113,18 @@ def asr_inference_llm(audio_file):
82
  return text_output
83
 
84
 
 
 
 
 
 
 
 
 
 
 
 
 
85
  with gr.Blocks(title="FireRedASR2S") as demo:
86
  gr.HTML(
87
  "<h1 style='text-align: center'>FireRedASR2S Demo</h1>"
@@ -92,6 +135,8 @@ with gr.Blocks(title="FireRedASR2S") as demo:
92
  with gr.Column():
93
  #audio_file = gr.Audio(label="Upload Audio", sources=["upload", "microphone"], type="filepath")
94
  audio_file = gr.Audio(label="Upload wav file", sources=["upload"], type="filepath")
 
 
95
 
96
  with gr.Column():
97
  asr_sys_button = gr.Button("Start Recognition (FireRedASR2S)", variant="primary")
@@ -101,6 +146,12 @@ with gr.Blocks(title="FireRedASR2S") as demo:
101
  asr_button_llm = gr.Button("Start Recognition (FireRedASR2-LLM-L)", variant="primary")
102
  text_output_llm = gr.Textbox(label="Model Result (FireRedASR2-LLM-L)", interactive=False, lines=3, max_lines=12)
103
 
 
 
 
 
 
 
104
  asr_sys_button.click(
105
  fn=asr_sys_inference,
106
  inputs=[audio_file],
 
7
sys.path.append("./fireredasr2s")
from fireredasr2s import FireRedAsr2System, FireRedAsr2SystemConfig
from fireredasr2s.fireredasr2.asr import FireRedAsr2, FireRedAsr2Config
from fireredasr2s.fireredvad.vad import FireRedVad, FireRedVadConfig
# NOTE(review): init_model references FireRedAed / FireRedAedConfig but they were
# never imported (NameError at first AED init). Module path below is inferred by
# analogy with the VAD import — confirm against the fireredasr2s package layout.
from fireredasr2s.fireredvad.aed import FireRedAed, FireRedAedConfig
11
 
12
 
13
  asr_system = None
14
  asr_model_aed = None
15
  asr_model_llm = None
16
+ vad_model = None
17
+ aed_model = None
18
 
19
 
20
  def init_model(model_dir_aed, model_dir_llm):
21
  global asr_system
22
  global asr_model_aed
23
  global asr_model_llm
24
+ global vad_model
25
+ global aed_model
26
  if asr_system is None:
27
  asr_system_config = FireRedAsr2SystemConfig() # Use default config
28
  asr_system = FireRedAsr2System(asr_system_config)
 
48
  temperature=1.0
49
  )
50
  asr_model_llm = FireRedAsr2.from_pretrained("llm", model_dir_llm, asr_config_llm)
51
+ if vad_model is None:
52
+ vad_config = FireRedVadConfig(
53
+ use_gpu=False,
54
+ smooth_window_size=5,
55
+ speech_threshold=0.4,
56
+ min_speech_frame=20,
57
+ max_speech_frame=2000,
58
+ min_silence_frame=20,
59
+ merge_silence_frame=0,
60
+ extend_speech_frame=0,
61
+ chunk_max_frame=30000)
62
+ vad_model = FireRedVad.from_pretrained("pretrained_models/FireRedVAD/VAD", vad_config)
63
+ aed_config=FireRedAedConfig(
64
+ use_gpu=False,
65
+ smooth_window_size=5,
66
+ speech_threshold=0.4,
67
+ singing_threshold=0.5,
68
+ music_threshold=0.5,
69
+ min_event_frame=20,
70
+ max_event_frame=2000,
71
+ min_silence_frame=20,
72
+ merge_silence_frame=0,
73
+ extend_speech_frame=0,
74
+ chunk_max_frame=30000)
75
+ aed_model = FireRedAed.from_pretrained("pretrained_models/FireRedVAD/AED", aed_config)
76
+
77
 
78
 
79
  @spaces.GPU(duration=20)
 
81
  if not audio_file:
82
  return "Please upload a wav file"
83
  results = asr_system.process(audio_file)
84
+ s = f'ASR: {results["text"]}\nSentences: {results["sentences"]}\nVAD(ms): {results["vad_segments_ms"]}\nDuration: {results["dur_s"]}s'
85
  return s
86
 
87
 
 
113
  return text_output
114
 
115
 
116
@spaces.GPU(duration=20)
def vad_inference(audio_file):
    """Run VAD and AED on an uploaded wav file and return a formatted summary.

    Args:
        audio_file: Filepath from the gr.Audio component; falsy when nothing
            was uploaded.

    Returns:
        A human-readable string with duration, voice timestamps, and
        audio-event ratios/timestamps, or a prompt string if no file given.
    """
    if not audio_file:
        return "Please upload a wav file"
    # Fix: the module-level globals are vad_model / aed_model (set by
    # init_model); the original called undefined names `vad` and `aed`.
    result, probs = vad_model.detect(audio_file)
    s = f'Duration: {result["dur"]}s'
    s += f'\nVoice: {result["timestamps"]}'
    result, probs = aed_model.detect(audio_file)
    # Fix: `results` was undefined here (NameError); the AED dict is `result`.
    s += f'\nEvent: {result["event2ratio"]}\n {result["event2timestamps"]}'
    return s
126
+
127
+
128
  with gr.Blocks(title="FireRedASR2S") as demo:
129
  gr.HTML(
130
  "<h1 style='text-align: center'>FireRedASR2S Demo</h1>"
 
135
  with gr.Column():
136
  #audio_file = gr.Audio(label="Upload Audio", sources=["upload", "microphone"], type="filepath")
137
  audio_file = gr.Audio(label="Upload wav file", sources=["upload"], type="filepath")
138
+ vad_button = gr.Button("Start Recognition (FireRedVAD)", variant="primary")
139
+ vad_output = gr.Textbox(label="Model Result (FireRedVAD)", interactive=False, lines=3, max_lines=12)
140
 
141
  with gr.Column():
142
  asr_sys_button = gr.Button("Start Recognition (FireRedASR2S)", variant="primary")
 
146
  asr_button_llm = gr.Button("Start Recognition (FireRedASR2-LLM-L)", variant="primary")
147
  text_output_llm = gr.Textbox(label="Model Result (FireRedASR2-LLM-L)", interactive=False, lines=3, max_lines=12)
148
 
149
+ vad_button.click(
150
+ fn=vad_inference,
151
+ inputs=[audio_file],
152
+ outputs=[vad_output]
153
+ )
154
+
155
  asr_sys_button.click(
156
  fn=asr_sys_inference,
157
  inputs=[audio_file],