Spaces:
Sleeping
Sleeping
Upload files
Browse files- app.py +23 -7
- enum_.py +1 -1
- requirements.txt +3 -1
app.py
CHANGED
|
@@ -18,14 +18,16 @@ import librosa
|
|
| 18 |
import numpy as np
|
| 19 |
import torch
|
| 20 |
import os
|
| 21 |
-
from
|
| 22 |
-
import io
|
| 23 |
|
| 24 |
##translation
|
| 25 |
translation_model_name = "facebook/nllb-200-distilled-600M"
|
| 26 |
tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
|
| 27 |
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
|
| 30 |
@lru_cache(maxsize=10)
|
| 31 |
def translate_sentence(sentence, src_lang, tgt_lang):
|
|
@@ -98,7 +100,7 @@ def tts(sentence, language):
|
|
| 98 |
return None
|
| 99 |
try:
|
| 100 |
language_code = tts_languages[language]
|
| 101 |
-
if language_code in ["en", "ko", "ja"]:
|
| 102 |
tts_model = load_tts()
|
| 103 |
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 104 |
wav_path = os.path.join(base_dir, "example.mp3")
|
|
@@ -146,14 +148,25 @@ def transcribe(audio, language=None):
|
|
| 146 |
segments, info = model.transcribe(y, language=whisper_languages[language])
|
| 147 |
else:
|
| 148 |
segments, info = model.transcribe(y)
|
| 149 |
-
|
| 150 |
transcription = ""
|
| 151 |
for segment in segments:
|
| 152 |
-
|
| 153 |
transcription += f"{segment.text}\n"
|
| 154 |
return f"{transcription}"
|
| 155 |
|
| 156 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 157 |
with gr.Blocks() as demo:
|
| 158 |
gr.Markdown(
|
| 159 |
"""
|
|
@@ -191,10 +204,10 @@ with gr.Blocks() as demo:
|
|
| 191 |
|
| 192 |
with gr.Column(scale=1, min_width=300):
|
| 193 |
mic = gr.Audio(
|
| 194 |
-
sources=["microphone"], type="
|
| 195 |
)
|
| 196 |
transcription = gr.Textbox(label="Your transcription")
|
| 197 |
-
|
| 198 |
|
| 199 |
translate_btn.click(
|
| 200 |
fn=lambda txt, s_lang, t_lang: translate_sentence(txt, s_lang, t_lang),
|
|
@@ -205,6 +218,9 @@ with gr.Blocks() as demo:
|
|
| 205 |
translation.change(fn=tts, inputs=[translation, tgt], outputs=speech)
|
| 206 |
|
| 207 |
mic.change(fn=transcribe, inputs=[mic, tgt], outputs=[transcription])
|
|
|
|
|
|
|
|
|
|
| 208 |
# You could add more callbacks: e.g. after generating sentence, allow translation etc.
|
| 209 |
|
| 210 |
demo.launch(share=True)
|
|
|
|
| 18 |
import numpy as np
|
| 19 |
import torch
|
| 20 |
import os
|
| 21 |
+
from evaluate import load
|
|
|
|
| 22 |
|
| 23 |
##translation
|
| 24 |
translation_model_name = "facebook/nllb-200-distilled-600M"
|
| 25 |
tokenizer = AutoTokenizer.from_pretrained(translation_model_name)
|
| 26 |
translation_model = AutoModelForSeq2SeqLM.from_pretrained(translation_model_name)
|
| 27 |
|
| 28 |
+
wer_metric = load("wer")
|
| 29 |
+
cer_metric = load("cer")
|
| 30 |
+
|
| 31 |
|
| 32 |
@lru_cache(maxsize=10)
|
| 33 |
def translate_sentence(sentence, src_lang, tgt_lang):
|
|
|
|
| 100 |
return None
|
| 101 |
try:
|
| 102 |
language_code = tts_languages[language]
|
| 103 |
+
if language_code in ["en", "ko", "ja", "zh-cn"]:
|
| 104 |
tts_model = load_tts()
|
| 105 |
base_dir = os.path.dirname(os.path.abspath(__file__))
|
| 106 |
wav_path = os.path.join(base_dir, "example.mp3")
|
|
|
|
| 148 |
segments, info = model.transcribe(y, language=whisper_languages[language])
|
| 149 |
else:
|
| 150 |
segments, info = model.transcribe(y)
|
| 151 |
+
logging.info(f"Detected language: {info.language}")
|
| 152 |
transcription = ""
|
| 153 |
for segment in segments:
|
| 154 |
+
logging.info(segment.text)
|
| 155 |
transcription += f"{segment.text}\n"
|
| 156 |
return f"{transcription}"
|
| 157 |
|
| 158 |
|
| 159 |
+
def evaluate(language, reference, prediction):
|
| 160 |
+
### wer
|
| 161 |
+
if language in ["Traditional Chinese", "Vietnamese"]:
|
| 162 |
+
wer = wer_metric.compute(predictions=[prediction], references=[reference])
|
| 163 |
+
return str((1 - wer) * 100) + "%"
|
| 164 |
+
### cer
|
| 165 |
+
else:
|
| 166 |
+
cer = cer_metric.compute(predictions=[prediction], references=[reference])
|
| 167 |
+
return str((1 - cer) * 100) + "%"
|
| 168 |
+
|
| 169 |
+
|
| 170 |
with gr.Blocks() as demo:
|
| 171 |
gr.Markdown(
|
| 172 |
"""
|
|
|
|
| 204 |
|
| 205 |
with gr.Column(scale=1, min_width=300):
|
| 206 |
mic = gr.Audio(
|
| 207 |
+
sources=["microphone"], type="numpy", label="Record yourself"
|
| 208 |
)
|
| 209 |
transcription = gr.Textbox(label="Your transcription")
|
| 210 |
+
accuracy = gr.Textbox(label="Accuracy")
|
| 211 |
|
| 212 |
translate_btn.click(
|
| 213 |
fn=lambda txt, s_lang, t_lang: translate_sentence(txt, s_lang, t_lang),
|
|
|
|
| 218 |
translation.change(fn=tts, inputs=[translation, tgt], outputs=speech)
|
| 219 |
|
| 220 |
mic.change(fn=transcribe, inputs=[mic, tgt], outputs=[transcription])
|
| 221 |
+
transcription.change(
|
| 222 |
+
fn=evaluate, inputs=[tgt, translation, transcription], outputs=[accuracy]
|
| 223 |
+
)
|
| 224 |
# You could add more callbacks: e.g. after generating sentence, allow translation etc.
|
| 225 |
|
| 226 |
demo.launch(share=True)
|
enum_.py
CHANGED
|
@@ -8,7 +8,7 @@ trans_languages = {
|
|
| 8 |
}
|
| 9 |
|
| 10 |
tts_languages = {
|
| 11 |
-
"Traditional Chinese": "zh-
|
| 12 |
"English": "en",
|
| 13 |
"Korean": "ko",
|
| 14 |
"Vietnamese": "vie",
|
|
|
|
| 8 |
}
|
| 9 |
|
| 10 |
tts_languages = {
|
| 11 |
+
"Traditional Chinese": "zh-cn",
|
| 12 |
"English": "en",
|
| 13 |
"Korean": "ko",
|
| 14 |
"Vietnamese": "vie",
|
requirements.txt
CHANGED
|
@@ -14,4 +14,6 @@ librosa==0.10.0
|
|
| 14 |
cutlet==0.5.0
|
| 15 |
fugashi==1.5.2
|
| 16 |
pydub==0.25.1
|
| 17 |
-
TTS==0.22.0
|
|
|
|
|
|
|
|
|
| 14 |
cutlet==0.5.0
|
| 15 |
fugashi==1.5.2
|
| 16 |
pydub==0.25.1
|
| 17 |
+
TTS==0.22.0
|
| 18 |
+
evaluate==0.4.6
|
| 19 |
+
jiwer==4.0.0
|