Spaces:

thanhtvt
/

uetasr

Build error

App Files Community

thanhtvt committed on Apr 27, 2023

Commit

4ac7ffc

1 Parent(s): bcd8e2f

remove alsd

Browse files

Files changed (4) hide show

app.py +26 -72
decode.py +1 -8
model.py +0 -2
requirements.txt +1 -1

app.py CHANGED Viewed

@@ -19,7 +19,7 @@ def get_duration(filename: str) -> float:
     return librosa.get_duration(path=filename)
-def convert_to_wav1(in_filename: str) -> str:
     out_filename = os.path.splitext(in_filename)[0] + ".wav"
     logging.info(f"Converting {in_filename} to {out_filename}")
     y, sr = librosa.load(in_filename, sr=16000)
@@ -27,22 +27,6 @@ def convert_to_wav1(in_filename: str) -> str:
     return out_filename
-def convert_to_wav(in_filename: str) -> str:
-    """Convert the input audio file to a wave file"""
-    out_filename = in_filename + ".wav"
-    logging.info(f"Converting '{in_filename}' to '{out_filename}'")
-    sp_args = ["ffmpeg", "-hide_banner", "-i", in_filename, "-ar", "16000", out_filename]
-    sp_args.insert(2, "-y") if os.path.exists(out_filename) else None
-    # Create a subprocess to run the ffmpeg command.
-    _ = subprocess.Popen(
-        sp_args,
-        stdin=subprocess.PIPE,
-    )
-    return out_filename
 def build_html_output(s: str, style: str = "result_item_success"):
     return f"""
     <div class='result'>
@@ -58,7 +42,6 @@ def process_url(
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
-    max_out_seq_len_ratio: float,
 ):
     logging.info(f"Processing URL: {url}")
     with tempfile.NamedTemporaryFile() as f:
@@ -67,8 +50,7 @@ def process_url(
             return process(in_filename=f.name,
                            decoding_method=decoding_method,
                            beam_size=beam_size,
-                           max_symbols_per_step=max_symbols_per_step,
-                           max_out_seq_len_ratio=max_out_seq_len_ratio)
         except Exception as e:
             logging.info(str(e))
             return "", build_html_output(str(e), "result_item_error")
@@ -79,7 +61,6 @@ def process_uploaded_file(
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
-    max_out_seq_len_ratio: float,
 ):
     if in_filename is None or in_filename == "":
         return "", build_html_output(
@@ -93,8 +74,7 @@ def process_uploaded_file(
         return process(in_filename=in_filename,
                        decoding_method=decoding_method,
                        beam_size=beam_size,
-                       max_symbols_per_step=max_symbols_per_step,
-                       max_out_seq_len_ratio=max_out_seq_len_ratio)
     except Exception as e:
         logging.info(str(e))
         return "", build_html_output(str(e), "result_item_error")
@@ -105,7 +85,6 @@ def process_microphone(
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
-    max_out_seq_len_ratio: float,
 ):
     if in_filename is None or in_filename == "":
         return "", build_html_output(
@@ -119,8 +98,7 @@ def process_microphone(
         return process(in_filename=in_filename,
                        decoding_method=decoding_method,
                        beam_size=beam_size,
-                       max_symbols_per_step=max_symbols_per_step,
-                       max_out_seq_len_ratio=max_out_seq_len_ratio)
     except Exception as e:
         logging.info(str(e))
         return "", build_html_output(str(e), "result_item_error")
@@ -131,7 +109,6 @@ def process(
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
-    max_out_seq_len_ratio: float,
 ):
     logging.info(f"in_filename: {in_filename}")
@@ -148,8 +125,7 @@ def process(
     recognizer = UETASRModel(repo_id,
                              decoding_method,
                              beam_size,
-                             max_symbols_per_step,
-                             max_out_seq_len_ratio)
     text = recognizer.predict(filename)
     date_time = now.strftime("%d/%m/%Y, %H:%M:%S.%f")
@@ -167,7 +143,7 @@ def process(
     """
     if rtf > 1:
         info += (
-            "<br/>We are loading the model for the first run. "
             "Please run again to measure the real RTF.<br/>"
         )
@@ -202,59 +178,40 @@ with demo:
     decode_method_radio = gr.Radio(
         label="Decoding method",
-        choices=["greedy_search", "beam_search", "alsd_search"],
         value="greedy_search",
         interactive=True,
     )
-    with gr.Column(visible=False) as beam_col:
-        beam_size = gr.Slider(
-            label="Beam size",
-            minimum=1,
-            maximum=10,
-            step=1,
-            value=5,
-            interactive=True,
-        )
-    def enable_beam_col(decoding_method):
-        if decoding_method != "greedy_search":
-            return gr.update(visible=True)
         else:
-            return gr.update(visible=False)
-    decode_method_radio.change(enable_beam_col, decode_method_radio, beam_col)
     max_symbols_per_step_slider = gr.Slider(
         label="Maximum symbols per step",
         minimum=1,
-        maximum=15,
         step=1,
         value=5,
         interactive=True,
         visible=True,
     )
-    max_out_seq_len_slider = gr.Slider(
-        label="Maximum output sequence length ratio",
-        minimum=0,
-        maximum=1,
-        step=0.01,
-        value=0.6,
-        interactive=True,
-        visible=False,
-    )
-    def switch_slider(decoding_method):
-        if decoding_method == "alsd_search":
-            return gr.update(visible=False), gr.update(visible=True)
-        else:
-            return gr.update(visible=True), gr.update(visible=False)
-    decode_method_radio.change(switch_slider,
-                               decode_method_radio,
-                               [max_symbols_per_step_slider, max_out_seq_len_slider])
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):
             uploaded_file = gr.Audio(
@@ -308,9 +265,8 @@ with demo:
             inputs=[
                 uploaded_file,
                 decode_method_radio,
-                beam_size,
                 max_symbols_per_step_slider,
-                max_out_seq_len_slider,
             ],
             outputs=[uploaded_output, uploaded_html_info],
         )
@@ -320,9 +276,8 @@ with demo:
             inputs=[
                 microphone,
                 decode_method_radio,
-                beam_size,
                 max_symbols_per_step_slider,
-                max_out_seq_len_slider,
             ],
             outputs=[recorded_output, recorded_html_info],
         )
@@ -332,9 +287,8 @@ with demo:
             inputs=[
                 url_textbox,
                 decode_method_radio,
-                beam_size,
                 max_symbols_per_step_slider,
-                max_out_seq_len_slider,
             ],
             outputs=[url_output, url_html_info],
         )

     return librosa.get_duration(path=filename)
+def convert_to_wav(in_filename: str) -> str:
     out_filename = os.path.splitext(in_filename)[0] + ".wav"
     logging.info(f"Converting {in_filename} to {out_filename}")
     y, sr = librosa.load(in_filename, sr=16000)
     return out_filename
 def build_html_output(s: str, style: str = "result_item_success"):
     return f"""
     <div class='result'>
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
 ):
     logging.info(f"Processing URL: {url}")
     with tempfile.NamedTemporaryFile() as f:
             return process(in_filename=f.name,
                            decoding_method=decoding_method,
                            beam_size=beam_size,
+                           max_symbols_per_step=max_symbols_per_step)
         except Exception as e:
             logging.info(str(e))
             return "", build_html_output(str(e), "result_item_error")
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
 ):
     if in_filename is None or in_filename == "":
         return "", build_html_output(
         return process(in_filename=in_filename,
                        decoding_method=decoding_method,
                        beam_size=beam_size,
+                       max_symbols_per_step=max_symbols_per_step)
     except Exception as e:
         logging.info(str(e))
         return "", build_html_output(str(e), "result_item_error")
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
 ):
     if in_filename is None or in_filename == "":
         return "", build_html_output(
         return process(in_filename=in_filename,
                        decoding_method=decoding_method,
                        beam_size=beam_size,
+                       max_symbols_per_step=max_symbols_per_step)
     except Exception as e:
         logging.info(str(e))
         return "", build_html_output(str(e), "result_item_error")
     decoding_method: str,
     beam_size: int,
     max_symbols_per_step: int,
 ):
     logging.info(f"in_filename: {in_filename}")
     recognizer = UETASRModel(repo_id,
                              decoding_method,
                              beam_size,
+                             max_symbols_per_step)
     text = recognizer.predict(filename)
     date_time = now.strftime("%d/%m/%Y, %H:%M:%S.%f")
     """
     if rtf > 1:
         info += (
+            "<br/>We are loading required resources for the first run. "
             "Please run again to measure the real RTF.<br/>"
         )
     decode_method_radio = gr.Radio(
         label="Decoding method",
+        choices=["greedy_search", "beam_search"],
         value="greedy_search",
         interactive=True,
     )
+    beam_size_slider = gr.Slider(
+        label="Beam size",
+        minimum=1,
+        maximum=20,
+        step=1,
+        value=1,
+        interactive=False,
+    )
+    def interact_beam_slider(decoding_method):
+        if decoding_method == "greedy_search":
+            return gr.update(value=1, interactive=False)
         else:
+            return gr.update(interactive=True)
+    decode_method_radio.change(interact_beam_slider,
+                               decode_method_radio,
+                               beam_size_slider)
     max_symbols_per_step_slider = gr.Slider(
         label="Maximum symbols per step",
         minimum=1,
+        maximum=20,
         step=1,
         value=5,
         interactive=True,
         visible=True,
     )
     with gr.Tabs():
         with gr.TabItem("Upload from disk"):
             uploaded_file = gr.Audio(
             inputs=[
                 uploaded_file,
                 decode_method_radio,
+                beam_size_slider,
                 max_symbols_per_step_slider,
             ],
             outputs=[uploaded_output, uploaded_html_info],
         )
             inputs=[
                 microphone,
                 decode_method_radio,
+                beam_size_slider,
                 max_symbols_per_step_slider,
             ],
             outputs=[recorded_output, recorded_html_info],
         )
             inputs=[
                 url_textbox,
                 decode_method_radio,
+                beam_size_slider,
                 max_symbols_per_step_slider,
             ],
             outputs=[url_output, url_html_info],
         )

decode.py CHANGED Viewed

@@ -1,7 +1,7 @@
 import logging
 import tensorflow as tf
 from functools import lru_cache
-from uetasr.searchers import GreedyRNNT, BeamRNNT, ALSDBeamRNNT
 @lru_cache(maxsize=5)
@@ -12,7 +12,6 @@ def get_searcher(
     text_decoder: tf.keras.layers.experimental.preprocessing.PreprocessingLayer,
     beam_size: int,
     max_symbols_per_step: int,
-    max_output_seq_length_ratio: float,
 ):
     common_kwargs = {
         "decoder": decoder,
@@ -32,12 +31,6 @@ def get_searcher(
             alpha=0.0,
             **common_kwargs,
         )
-    elif searcher_type == "alsd_search":
-        searcher = ALSDBeamRNNT(
-            fraction=max_output_seq_length_ratio,
-            beam_size=beam_size,
-            **common_kwargs,
-        )
     else:
         logging.info(f"Unknown searcher type: {searcher_type}")

 import logging
 import tensorflow as tf
 from functools import lru_cache
+from uetasr.searchers import GreedyRNNT, BeamRNNT
 @lru_cache(maxsize=5)
     text_decoder: tf.keras.layers.experimental.preprocessing.PreprocessingLayer,
     beam_size: int,
     max_symbols_per_step: int,
 ):
     common_kwargs = {
         "decoder": decoder,
             alpha=0.0,
             **common_kwargs,
         )
     else:
         logging.info(f"Unknown searcher type: {searcher_type}")

model.py CHANGED Viewed

@@ -101,7 +101,6 @@ class UETASRModel:
         decoding_method: str,
         beam_size: int,
         max_symbols_per_step: int,
-        max_output_seq_length_ratio: float,
     ):
         self.featurizer, self.encoder_model, jointer, decoder, text_encoder, self.model = _get_conformer_pre_trained_model(repo_id)
         self.searcher = get_searcher(
@@ -111,7 +110,6 @@ class UETASRModel:
             text_encoder,
             beam_size,
             max_symbols_per_step,
-            max_output_seq_length_ratio,
         )
     def predict(self, in_filename: str):

         decoding_method: str,
         beam_size: int,
         max_symbols_per_step: int,
     ):
         self.featurizer, self.encoder_model, jointer, decoder, text_encoder, self.model = _get_conformer_pre_trained_model(repo_id)
         self.searcher = get_searcher(
             text_encoder,
             beam_size,
             max_symbols_per_step,
         )
     def predict(self, in_filename: str):

requirements.txt CHANGED Viewed

	@@ -1,2 +1,2 @@
1	- uetasr @ git+https://github.com/thanhtvt/uetasr
2	requests==2.28.2


1	+ uetasr @ git+https://github.com/thanhtvt/uetasr@v0.2.1
2	requests==2.28.2