sarahwei committed on
Commit
40e4072
·
1 Parent(s): 6627d73

Upload files

Browse files
Files changed (3) hide show
  1. app.py +23 -7
  2. enum_.py +1 -1
  3. requirements.txt +3 -1
app.py CHANGED
@@ -18,14 +18,16 @@ import librosa
18
  import numpy as np
19
  import torch
20
  import os
21
- from pydub import AudioSegment
22
- import io
23
 
24
  ##translation
25
  translation_model_name = "facebook/nllb-200-distilled-600M"
26
  tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
27
  translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
28
 
 
 
 
29
 
30
  @lru_cache(maxsize=10)
31
  def translate_sentence(sentence, src_lang, tgt_lang):
@@ -98,7 +100,7 @@ def tts(sentence, language):
98
  return None
99
  try:
100
  language_code = tts_languages[language]
101
- if language_code in ["en", "ko", "ja"]:
102
  tts_model = load_tts()
103
  base_dir = os.path.dirname(os.path.abspath(__file__))
104
  wav_path = os.path.join(base_dir, "example.mp3")
@@ -146,14 +148,25 @@ def transcribe(audio, language=None):
146
  segments, info = model.transcribe(y, language=whisper_languages[language])
147
  else:
148
  segments, info = model.transcribe(y)
149
- print(info.language)
150
  transcription = ""
151
  for segment in segments:
152
- print(segment.text)
153
  transcription += f"{segment.text}\n"
154
  return f"{transcription}"
155
 
156
 
 
 
 
 
 
 
 
 
 
 
 
157
  with gr.Blocks() as demo:
158
  gr.Markdown(
159
  """
@@ -191,10 +204,10 @@ with gr.Blocks() as demo:
191
 
192
  with gr.Column(scale=1, min_width=300):
193
  mic = gr.Audio(
194
- sources=["microphone"], type="filepath", label="Record yourself"
195
  )
196
  transcription = gr.Textbox(label="Your transcription")
197
- feedback = gr.Textbox(label="Feedback")
198
 
199
  translate_btn.click(
200
  fn=lambda txt, s_lang, t_lang: translate_sentence(txt, s_lang, t_lang),
@@ -205,6 +218,9 @@ with gr.Blocks() as demo:
205
  translation.change(fn=tts, inputs=[translation, tgt], outputs=speech)
206
 
207
  mic.change(fn=transcribe, inputs=[mic, tgt], outputs=[transcription])
 
 
 
208
  # You could add more callbacks: e.g. after generating sentence, allow translation etc.
209
 
210
  demo.launch(share=True)
 
18
  import numpy as np
19
  import torch
20
  import os
21
+ from evaluate import load
 
22
 
23
  ##translation
24
  translation_model_name = "facebook/nllb-200-distilled-600M"
25
  tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
26
  translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
27
 
28
+ wer_metric = load("wer")
29
+ cer_metric = load("cer")
30
+
31
 
32
  @lru_cache(maxsize=10)
33
  def translate_sentence(sentence, src_lang, tgt_lang):
 
100
  return None
101
  try:
102
  language_code = tts_languages[language]
103
+ if language_code in ["en", "ko", "ja", "zh-cn"]:
104
  tts_model = load_tts()
105
  base_dir = os.path.dirname(os.path.abspath(__file__))
106
  wav_path = os.path.join(base_dir, "example.mp3")
 
148
  segments, info = model.transcribe(y, language=whisper_languages[language])
149
  else:
150
  segments, info = model.transcribe(y)
151
+ logging.info(f"Detected language: {info.language}")
152
  transcription = ""
153
  for segment in segments:
154
+ logging.info(segment.text)
155
  transcription += f"{segment.text}\n"
156
  return f"{transcription}"
157
 
158
 
159
def evaluate(language, reference, prediction):
    """Score the user's spoken transcription against the reference translation.

    Uses word error rate (WER) for the listed target languages and character
    error rate (CER) for all others, and returns the accuracy
    ``(1 - error_rate) * 100`` as a percentage string, e.g. ``"87.5%"``.

    Args:
        language: Target-language display name (same keys as ``tts_languages``).
        reference: Expected text (the translated sentence).
        prediction: Transcribed speech to evaluate.

    Returns:
        Accuracy percentage as a string ending in ``"%"``.
    """
    # NOTE(review): using WER for Traditional Chinese (unsegmented text) is
    # unusual — CER is the conventional metric there. Kept as authored;
    # confirm the intended WER/CER split with the author.
    ### wer
    # Fixed "Vitetnamese" typo: the language key used elsewhere in this
    # project (enum_.py) is spelled "Vietnamese", so the branch never matched.
    if language in ["Traditional Chinese", "Vietnamese"]:
        # evaluate's EvaluationModule.compute() takes keyword-only arguments;
        # the original positional call raised TypeError. Inputs are wrapped in
        # lists as the metrics expect sequences of predictions/references.
        wer = wer_metric.compute(predictions=[prediction], references=[reference])
        return str((1 - wer) * 100) + "%"
    ### cer
    else:
        cer = cer_metric.compute(predictions=[prediction], references=[reference])
        return str((1 - cer) * 100) + "%"
168
+
169
+
170
  with gr.Blocks() as demo:
171
  gr.Markdown(
172
  """
 
204
 
205
  with gr.Column(scale=1, min_width=300):
206
  mic = gr.Audio(
207
+ sources=["microphone"], type="numpy", label="Record yourself"
208
  )
209
  transcription = gr.Textbox(label="Your transcription")
210
+ accuracy = gr.Textbox(label="Accuracy")
211
 
212
  translate_btn.click(
213
  fn=lambda txt, s_lang, t_lang: translate_sentence(txt, s_lang, t_lang),
 
218
  translation.change(fn=tts, inputs=[translation, tgt], outputs=speech)
219
 
220
  mic.change(fn=transcribe, inputs=[mic, tgt], outputs=[transcription])
221
+ transcription.change(
222
+ fn=evaluate, inputs=[tgt, translation, transcription], outputs=[accuracy]
223
+ )
224
  # You could add more callbacks: e.g. after generating sentence, allow translation etc.
225
 
226
  demo.launch(share=True)
enum_.py CHANGED
@@ -8,7 +8,7 @@ trans_languages = {
8
  }
9
 
10
  tts_languages = {
11
- "Traditional Chinese": "zh-tw",
12
  "English": "en",
13
  "Korean": "ko",
14
  "Vietnamese": "vie",
 
8
  }
9
 
10
  tts_languages = {
11
+ "Traditional Chinese": "zh-cn",
12
  "English": "en",
13
  "Korean": "ko",
14
  "Vietnamese": "vie",
requirements.txt CHANGED
@@ -14,4 +14,6 @@ librosa==0.10.0
14
  cutlet==0.5.0
15
  fugashi==1.5.2
16
  pydub==0.25.1
17
- TTS==0.22.0
 
 
 
14
  cutlet==0.5.0
15
  fugashi==1.5.2
16
  pydub==0.25.1
17
+ TTS==0.22.0
18
+ evaluate==0.4.6
19
+ jiwer==4.0.0