Spaces:

Yehor
/

hubert-uk

Sleeping

App Files Files Community

Yehor Smoliakov committed on Jul 26, 2024

Commit

bd540a9

1 Parent(s): 9cece8a

Refactor the app

Browse files

Files changed (15) hide show

README.md +1 -1
app.py +117 -31
example_1.wav +0 -0
example_2.wav +0 -0
example_3.wav +0 -0
example_4.wav +0 -0
example_5.wav +0 -0
example_6.wav +0 -0
requirements.txt +3 -0
sample_1.wav +0 -3
sample_2.wav +0 -3
sample_3.wav +0 -3
sample_4.wav +0 -3
sample_5.wav +0 -3
sample_6.wav +0 -3

README.md CHANGED Viewed

@@ -11,7 +11,7 @@ pinned: true
 ## Install
 ```shell
-uv venv --python 3.12
 source .venv/bin/activate

 ## Install
 ```shell
+uv venv --python 3.11
 source .venv/bin/activate

app.py CHANGED Viewed

@@ -1,59 +1,129 @@
 import time
 import torch
 import librosa
 import gradio as gr
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
 model_name = "Yehor/w2v-bert-2.0-uk-v2"
-device = "cpu"
-max_duration = 30
-asr_model = AutoModelForCTC.from_pretrained(model_name).to(device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
-audio_samples = [
-    "sample_1.wav",
-    "sample_2.wav",
-    "sample_3.wav",
-    "sample_4.wav",
-    "sample_5.wav",
-    "sample_6.wav",
 ]
-description_head = """
 # Speech-to-Text for Ukrainian v2
 ## Overview
-This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model that solves
-a Speech-to-Text task for the Ukrainian language.
 """.strip()
-description_foot = """
 ## Community
-- Join our Discord server - https://discord.gg/yVAjkBgmt4 - where we're talking about Data Science,
-Machine Learning, Deep Learning, and Artificial Intelligence.
-- Join our Speech Recognition Group in Telegram: https://t.me/speech_recognition_uk
-## Authors
-Yehor Smoliakov: https://github.com/egorsmkv on GitHub, and egorsmkv@gmail.com for private discussions.
 """.strip()
 def inference(audio_path, progress=gr.Progress()):
-    gr.Info("Starting process", duration=2)
-    progress(0, desc="Starting")
     duration = librosa.get_duration(path=audio_path)
     if duration > max_duration:
-        raise gr.Error("The duration of the file exceeds 10 seconds.")
     paths = [
         audio_path,
@@ -70,12 +140,18 @@ def inference(audio_path, progress=gr.Progress()):
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
         with torch.inference_mode():
             logits = asr_model(features).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         predictions = processor.batch_decode(predicted_ids)
         elapsed_time = round(time.time() - t0, 2)
         rtf = round(elapsed_time / audio_duration, 4)
         audio_duration = round(audio_duration, 2)
@@ -89,7 +165,7 @@ def inference(audio_path, progress=gr.Progress()):
             }
         )
-    gr.Info("Finished...", duration=2)
     result_texts = []
@@ -113,24 +189,34 @@ demo = gr.Blocks(
 with demo:
     gr.Markdown(description_head)
-    gr.Markdown(f"## Demo (max. duration: **{max_duration}** seconds)")
     with gr.Row():
         audio_file = gr.Audio(label="Audio file", type="filepath")
         transcription = gr.Markdown(
             label="Transcription",
-            value="Recognized text will appear here. Use **an example file** below the Recognize button,"
-            "upload **your audio file**, or use **the microphone** to record something...",
         )
-    gr.Button("Recognize").click(inference, inputs=audio_file, outputs=transcription)
     with gr.Row():
-        gr.Examples(
-            label="Choose an example audio", inputs=audio_file, examples=audio_samples
-        )
     gr.Markdown(description_foot)
 if __name__ == "__main__":
     demo.launch()

+import sys
 import time
 import torch
+import torchaudio
 import librosa
 import gradio as gr
 from transformers import AutoModelForCTC, Wav2Vec2BertProcessor
+# Config
 model_name = "Yehor/w2v-bert-2.0-uk-v2"
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+torch_dtype = torch.float16
+min_duration = 0.5
+max_duration = 60
+concurrency_limit = 1
+use_torch_compile = False
+# Load the model
+asr_model = AutoModelForCTC.from_pretrained(model_name, torch_dtype=torch_dtype).to(device)
 processor = Wav2Vec2BertProcessor.from_pretrained(model_name)
+if use_torch_compile:
+    asr_model = torch.compile(asr_model)
+# Elements
+examples = [
+    "example_1.wav",
+    "example_2.wav",
+    "example_3.wav",
+    "example_4.wav",
+    "example_5.wav",
+    "example_6.wav",
 ]
+examples_table = '''
+| File  | Text |
+| ------------- | ------------- |
+| `example_1.wav`  | тема про яку не люблять говорити офіційні джерела у генштабі і міноборони це хімічна зброя окупанти вже тривалий час використовують хімічну зброю заборонену |
+| `example_2.wav`  | всіма конвенціями якщо спочатку це були гранати з дронів то тепер фіксують випадки застосування  |
+| `example_3.wav`  | хімічних снарядів причому склад отруйної речовони різний а отже й наслідки для наших військових теж різні  |
+| `example_4.wav`  | використовує на фронті все що має і хімічна зброя не нийняток тож з чим маємо справу розбиралася марія моганисян |
+| `example_5.wav`  | двох тисяч випадків застосування росіянами боєприпасів споряджених небезпечними хімічними речовинами |
+| `example_6.wav`  | на всі писані норми марія моганисян олександр моторний спецкор марафон єдині новини |
+'''.strip()
+# https://www.tablesgenerator.com/markdown_tables
+authors_table = '''
+## Authors
+Follow them in social networks and **contact** if you need any help or have any questions:
+| <img src="https://avatars.githubusercontent.com/u/7875085?v=4" width="100"> **Yehor Smoliakov** |
+|-------------------------------------------------------------------------------------------------|
+| https://t.me/smlkw in Telegram                                                                  |
+| https://x.com/yehor_smoliakov at X                                                              |
+| https://github.com/egorsmkv at GitHub                                                           |
+| https://huggingface.co/Yehor at Hugging Face                                                    |
+| or use egorsmkv@gmail.com                                                                       |
+'''.strip()
+description_head = f"""
 # Speech-to-Text for Ukrainian v2
 ## Overview
+This space uses https://huggingface.co/Yehor/w2v-bert-2.0-uk-v2 model to recognize audio files.
+> For demo, audio duration **must not** exceed **{max_duration}** seconds.
 """.strip()
+description_foot = f"""
 ## Community
+- Join our Discord server where we talk about AI/ML/DL: https://discord.gg/yVAjkBgmt4
+- Join our Speech Recognition group in Telegram: https://t.me/speech_recognition_uk
+## More
+Check out other ASR models: https://github.com/egorsmkv/speech-recognition-uk
+{authors_table}
+""".strip()
+transcription_value = """
+Recognized text will appear here.
+Choose **an example file** below the Recognize button, upload **your audio file**, or use **the microphone** to record something.
+""".strip()
+tech_env = f"""
+#### Environment
+- Python: {sys.version}
+- Torch device: {device}
+- Torch dtype: {torch_dtype}
+- Use torch.compile: {use_torch_compile}
+""".strip()
+tech_libraries = f"""
+#### Libraries
+- PyTorch: {torch.__version__}
+- Transformers: {torch.__version__}
+- Librosa: {librosa.version.version}
+- Gradio: {gr.__version__}
 """.strip()
 def inference(audio_path, progress=gr.Progress()):
+    if not audio_path:
+        raise gr.Error("Please upload an audio file.")
+    gr.Info("Starting recognition", duration=2)
+    progress(0, desc="Recognizing")
     duration = librosa.get_duration(path=audio_path)
+    if duration < min_duration:
+        raise gr.Error(f"The duration of the file is less than {min_duration} seconds, it is {round(duration, 2)} seconds.")
     if duration > max_duration:
+        raise gr.Error(f"The duration of the file exceeds {max_duration} seconds.")
     paths = [
         audio_path,
         features = processor([audio_input], sampling_rate=16_000).input_features
         features = torch.tensor(features).to(device)
+        if torch_dtype == torch.float16:
+            features = features.half()
         with torch.inference_mode():
             logits = asr_model(features).logits
         predicted_ids = torch.argmax(logits, dim=-1)
         predictions = processor.batch_decode(predicted_ids)
+        if not predictions:
+            predictions = '-'
         elapsed_time = round(time.time() - t0, 2)
         rtf = round(elapsed_time / audio_duration, 4)
         audio_duration = round(audio_duration, 2)
             }
         )
+    gr.Info("Finished!", duration=2)
     result_texts = []
 with demo:
     gr.Markdown(description_head)
+    gr.Markdown("## Demo")
     with gr.Row():
         audio_file = gr.Audio(label="Audio file", type="filepath")
         transcription = gr.Markdown(
             label="Transcription",
+            value=transcription_value,
         )
+    gr.Button("Recognize").click(
+        inference,
+        concurrency_limit=concurrency_limit,
+        inputs=audio_file,
+        outputs=transcription,
+    )
     with gr.Row():
+        gr.Examples(label="Choose an example", inputs=audio_file, examples=examples)
+    gr.Markdown(examples_table)
     gr.Markdown(description_foot)
+    gr.Markdown('### Gradio app uses the following technologies:')
+    with gr.Row():
+        gr.Markdown(tech_env)
+        gr.Markdown(tech_libraries)
 if __name__ == "__main__":
+    demo.queue()
     demo.launch()

example_1.wav ADDED Viewed

Binary file (273 kB). View file

example_2.wav ADDED Viewed

Binary file (200 kB). View file

example_3.wav ADDED Viewed

Binary file (193 kB). View file

example_4.wav ADDED Viewed

Binary file (241 kB). View file

example_5.wav ADDED Viewed

Binary file (193 kB). View file

example_6.wav ADDED Viewed

Binary file (186 kB). View file

requirements.txt CHANGED Viewed

@@ -3,6 +3,9 @@ gradio
 torch
 torchaudio
 transformers
 librosa

 torch
 torchaudio
+triton
+setuptools
 transformers
 librosa

sample_1.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:172ade978b299f4a0c47e3b76666d1a06161e6001fbb5591b82038a1bbc4b5ad
-size 272568

sample_2.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:98fe42f22f8ea632714081a958dc035f3d507523fd340b320a1223ac2f55ccac
-size 199942

sample_3.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:83c0b7375beada8cee74b5de226da494368fcc6a3ce692913b3302dcda0bd9a2
-size 192842

sample_4.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:19e466ee9c0c129c1eecf93eb6791a44c2ee8d68dce2c3e8fd3734b87f28324a
-size 241442

sample_5.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5af19120c92859846a08496e0a617c21877cae2db5807d211f0a431d95163a3e
-size 193388

sample_6.wav DELETED Viewed

@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ac877968d5749438930339497f7548046003390a848496136f6cbe8a74c51629
-size 186290