Spaces:
Sleeping
Sleeping
Upload files
Browse files- app.py +23 -7
- enum_.py +1 -1
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -18,14 +18,16 @@ import librosa
|
|
| 18 |
import numpy as np
|
| 19 |
import torch
|
| 20 |
import os
|
| 21 |
-
from
|
| 22 |
-
import io
|
| 23 |
|
| 24 |
##translation
|
| 25 |
translation_model_name = "facebook/nllb-200-distilled-600M"
|
| 26 |
tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
|
| 27 |
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
@lru_cache(maxsize=10)
|
| 31 |
def translate_sentence(sentence, src_lang, tgt_lang):
|
|
@@ -98,7 +100,7 @@ def tts(sentence, language):
|
|
| 98 |
return None
|
| 99 |
try:
|
| 100 |
language_code = tts_languages[language]
|
| 101 |
-
if language_code in ["en", "ko", "ja"]:
|
| 102 |
tts_model = load_tts()
|
| 103 |
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 104 |
wav_path = os.path.join(base_dir, "example.mp3")
|
|
@@ -146,14 +148,25 @@ def transcribe(audio, language=None):
|
|
| 146 |
segments, info = model.transcribe(y, language=whisper_languages[language])
|
| 147 |
else:
|
| 148 |
segments, info = model.transcribe(y)
|
| 149 |
-
|
| 150 |
transcription = ""
|
| 151 |
for segment in segments:
|
| 152 |
-
|
| 153 |
transcription += f"{segment.text}\n"
|
| 154 |
return f"{transcription}"
|
| 155 |
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
with gr.Blocks() as demo:
|
| 158 |
gr.Markdown(
|
| 159 |
"""
|
|
@@ -191,10 +204,10 @@ with gr.Blocks() as demo:
|
|
| 191 |
|
| 192 |
with gr.Column(scale=1, min_width=300):
|
| 193 |
mic = gr.Audio(
|
| 194 |
-
sources=["microphone"], type="
|
| 195 |
)
|
| 196 |
transcription = gr.Textbox(label="Your transcription")
|
| 197 |
-
|
| 198 |
|
| 199 |
translate_btn.click(
|
| 200 |
fn=lambda txt, s_lang, t_lang: translate_sentence(txt, s_lang, t_lang),
|
|
@@ -205,6 +218,9 @@ with gr.Blocks() as demo:
|
|
| 205 |
translation.change(fn=tts, inputs=[translation, tgt], outputs=speech)
|
| 206 |
|
| 207 |
mic.change(fn=transcribe, inputs=[mic, tgt], outputs=[transcription])
|
|
|
|
|
|
|
|
|
|
| 208 |
# You could add more callbacks: e.g. after generating sentence, allow translation etc.
|
| 209 |
|
| 210 |
demo.launch(share=True)
|
|
|
|
| 18 |
import numpy as np
|
| 19 |
import torch
|
| 20 |
import os
|
| 21 |
+
from evaluate import load
|
|
|
|
| 22 |
|
| 23 |
##translation
|
| 24 |
translation_model_name = "facebook/nllb-200-distilled-600M"
|
| 25 |
tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
|
| 26 |
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
|
| 27 |
|
| 28 |
+
wer_metric = load("wer")
|
| 29 |
+
cer_metric = load("cer")
|
| 30 |
+
|
| 31 |
|
| 32 |
@lru_cache(maxsize=10)
|
| 33 |
def translate_sentence(sentence, src_lang, tgt_lang):
|
|
|
|
| 100 |
return None
|
| 101 |
try:
|
| 102 |
language_code = tts_languages[language]
|
| 103 |
+
if language_code in ["en", "ko", "ja", "zh-cn"]:
|
| 104 |
tts_model = load_tts()
|
| 105 |
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 106 |
wav_path = os.path.join(base_dir, "example.mp3")
|
|
|
|
| 148 |
segments, info = model.transcribe(y, language=whisper_languages[language])
|
| 149 |
else:
|
| 150 |
segments, info = model.transcribe(y)
|
| 151 |
+
logging.info(f"Detected language: {info.language}")
|
| 152 |
transcription = ""
|
| 153 |
for segment in segments:
|
| 154 |
+
logging.info(segment.text)
|
| 155 |
transcription += f"{segment.text}\n"
|
| 156 |
return f"{transcription}"
|
| 157 |
|
| 158 |
|
| 159 |
+
def evaluate(language, reference, prediction):
|
| 160 |
+
### wer
|
| 161 |
+
if language in ["Traditional Chinese", "Vietnamese"]:
|
| 162 |
+
wer = wer_metric.compute(predictions=[prediction], references=[reference])
|
| 163 |
+
return str((1 - wer) * 100) + "%"
|
| 164 |
+
### cer
|
| 165 |
+
else:
|
| 166 |
+
cer = cer_metric.compute(predictions=[prediction], references=[reference])
|
| 167 |
+
return str((1 - cer) * 100) + "%"
|
| 168 |
+
|
| 169 |
+
|
| 170 |
with gr.Blocks() as demo:
|
| 171 |
gr.Markdown(
|
| 172 |
"""
|
|
|
|
| 204 |
|
| 205 |
with gr.Column(scale=1, min_width=300):
|
| 206 |
mic = gr.Audio(
|
| 207 |
+
sources=["microphone"], type="numpy", label="Record yourself"
|
| 208 |
)
|
| 209 |
transcription = gr.Textbox(label="Your transcription")
|
| 210 |
+
accuracy = gr.Textbox(label="Accuracy")
|
| 211 |
|
| 212 |
translate_btn.click(
|
| 213 |
fn=lambda txt, s_lang, t_lang: translate_sentence(txt, s_lang, t_lang),
|
|
|
|
| 218 |
translation.change(fn=tts, inputs=[translation, tgt], outputs=speech)
|
| 219 |
|
| 220 |
mic.change(fn=transcribe, inputs=[mic, tgt], outputs=[transcription])
|
| 221 |
+
transcription.change(
|
| 222 |
+
fn=evaluate, inputs=[tgt, translation, transcription], outputs=[accuracy]
|
| 223 |
+
)
|
| 224 |
# You could add more callbacks: e.g. after generating sentence, allow translation etc.
|
| 225 |
|
| 226 |
demo.launch(share=True)
|
enum_.py
CHANGED
|
@@ -8,7 +8,7 @@ trans_languages = {
|
|
| 8 |
}
|
| 9 |
|
| 10 |
tts_languages = {
|
| 11 |
-
"Traditional Chinese": "zh-
|
| 12 |
"English": "en",
|
| 13 |
"Korean": "ko",
|
| 14 |
"Vietnamese": "vie",
|
|
|
|
| 8 |
}
|
| 9 |
|
| 10 |
tts_languages = {
|
| 11 |
+
"Traditional Chinese": "zh-cn",
|
| 12 |
"English": "en",
|
| 13 |
"Korean": "ko",
|
| 14 |
"Vietnamese": "vie",
|
requirements.txt
CHANGED
|
@@ -14,4 +14,6 @@ librosa==0.10.0
|
|
| 14 |
cutlet==0.5.0
|
| 15 |
fugashi==1.5.2
|
| 16 |
pydub==0.25.1
|
| 17 |
-
TTS==0.22.0
|
|
|
|
|
|
|
|
|
| 14 |
cutlet==0.5.0
|
| 15 |
fugashi==1.5.2
|
| 16 |
pydub==0.25.1
|
| 17 |
+
TTS==0.22.0
|
| 18 |
+
evaluate==0.4.6
|
| 19 |
+
jiwer==4.0.0
|