Spaces:

andrei-saceleanu
/

SSL_demo

Paused

App Files Files Community

Andrei-Iulian SĂCELEANU committed on Jun 10, 2023

Commit

1f3a9b6

1 Parent(s): 7d0a00c

added audio tab

Browse files

Files changed (8) hide show

app.py +116 -31
checkpoints/audio_fixmatch.data-00000-of-00001 +0 -0
checkpoints/audio_fixmatch.index +0 -0
checkpoints/audio_freematch.data-00000-of-00001 +0 -0
checkpoints/audio_freematch.index +0 -0
checkpoints/audio_mixmatch.data-00000-of-00001 +0 -0
checkpoints/audio_mixmatch.index +0 -0
models.py +72 -2

app.py CHANGED Viewed

@@ -1,11 +1,14 @@
 import re
 import gradio as gr
-from transformers import AutoTokenizer
 from unidecode import unidecode
 from models import *
 tok = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
 def preprocess(x):
     """Preprocess input string x"""
@@ -21,6 +24,7 @@ def preprocess(x):
     return s
 label_names = ["ABUSE", "INSULT", "OTHER", "PROFANITY"]
 def ssl_predict(in_text, model_type):
     """main predict function"""
@@ -39,12 +43,12 @@ def ssl_predict(in_text, model_type):
         model = FixMatchTune(encoder_name="readerbench/RoBERT-base")
         model.load_weights("./checkpoints/fixmatch_tune")
         preds, _ = model([toks["input_ids"],toks["attention_mask"]], training=False)
     elif model_type == "freematch":
         model = FixMatchTune(encoder_name="andrei-saceleanu/ro-offense-freematch")
         model.cls_head.load_weights("./checkpoints/freematch_tune")
         preds, _ = model([toks["input_ids"],toks["attention_mask"]], training=False)
     elif model_type == "mixmatch":
         model = MixMatch(bert_model="andrei-saceleanu/ro-offense-mixmatch")
         model.cls_head.load_weights("./checkpoints/mixmatch")
@@ -68,37 +72,118 @@ def ssl_predict(in_text, model_type):
     return d
-with gr.Blocks() as ssl_interface:
-    with gr.Row():
-        with gr.Column():
-            in_text = gr.Textbox(label="Input text")
-            model_list = gr.Dropdown(
-                choices=["fixmatch", "freematch", "mixmatch", "contrastive_reg", "label_propagation"],
-                max_choices=1,
-                label="Training method",
-                allow_custom_value=False,
-                info="Select trained model according to different SSL techniques from paper",
-            )
-            with gr.Row():
-                clear_btn = gr.Button(value="Clear")
-                submit_btn = gr.Button(value="Submit")
-        with gr.Column():
-            out_field = gr.Label(num_top_classes=4, label="Prediction")
-    submit_btn.click(
-        fn=ssl_predict,
-        inputs=[in_text, model_list],
-        outputs=[out_field]
-    )
-    clear_btn.click(
-        fn=lambda: [None for _ in range(2)],
-        inputs=None,
-        outputs=[in_text, out_field]
     )
 ssl_interface.launch(server_name="0.0.0.0", server_port=7860)

 import re
 import gradio as gr
+import librosa
+import numpy as np
+from transformers import AutoTokenizer,ViTImageProcessor
 from unidecode import unidecode
 from models import *
 tok = AutoTokenizer.from_pretrained("readerbench/RoBERT-base")
+processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
 def preprocess(x):
     """Preprocess input string x"""
     return s
 label_names = ["ABUSE", "INSULT", "OTHER", "PROFANITY"]
+audio_label_names = ["Laughter", "Sigh", "Cough", "Throat clearing", "Sneeze", "Sniff"]
 def ssl_predict(in_text, model_type):
     """main predict function"""
         model = FixMatchTune(encoder_name="readerbench/RoBERT-base")
         model.load_weights("./checkpoints/fixmatch_tune")
         preds, _ = model([toks["input_ids"],toks["attention_mask"]], training=False)
     elif model_type == "freematch":
         model = FixMatchTune(encoder_name="andrei-saceleanu/ro-offense-freematch")
         model.cls_head.load_weights("./checkpoints/freematch_tune")
         preds, _ = model([toks["input_ids"],toks["attention_mask"]], training=False)
     elif model_type == "mixmatch":
         model = MixMatch(bert_model="andrei-saceleanu/ro-offense-mixmatch")
         model.cls_head.load_weights("./checkpoints/mixmatch")
     return d
+def ssl_predict2(audio_file, model_type):
+    """main predict function"""
+    signal, sr = librosa.load(audio_file.name, sr=16000)
+    length = 5 * 16000
+    if len(signal) < length:
+        signal = np.pad(signal,(0,length-len(signal)),'constant')
+    else:
+        signal = signal[:length]
+    spectrogram = librosa.feature.melspectrogram(y=signal, sr=sr, n_mels=128)
+    spectrogram = librosa.power_to_db(S=spectrogram, ref=np.max)
+    spectrogram_min, spectrogram_max = spectrogram.min(), spectrogram.max()
+    spectrogram = (spectrogram - spectrogram_min) / (spectrogram_max - spectrogram_min)
+    spectrogram = spectrogram.astype("float32")
+    inputs = processor.preprocess(
+        np.repeat(spectrogram[:,:,:,np.newaxis],3,-1),
+        image_mean=(-3.05,-3.05,-3.05),
+        image_std=(2.33,2.33,2.33),
+        return_tensors="tf"
     )
+    preds = None
+    if model_type == "fixmatch":
+        model = AudioFixMatch(encoder_name="andrei-saceleanu/vit-base-fixmatch")
+        model.cls_head.load_weights("./checkpoints/audio_fixmatch")
+        preds, _ = model(inputs["pixel_values"], training=False)
+    elif model_type == "freematch":
+        model = AudioFixMatch(encoder_name="andrei-saceleanu/vit-base-freematch")
+        model.cls_head.load_weights("./checkpoints/audio_freematch")
+        preds, _ = model(inputs["pixel_values"], training=False)
+    elif model_type == "mixmatch":
+        model = AudioMixMatch(bert_model="andrei-saceleanu/vit-base-mixmatch")
+        model.cls_head.load_weights("./checkpoints/audio_mixmatch")
+        preds = model(inputs["pixel_values"], training=False)
+    probs = list(preds[0].numpy())
+    d = {}
+    for k, v in zip(audio_label_names, probs):
+        d[k] = float(v)
+    return d
+with gr.Blocks() as ssl_interface:  # demo UI with one tab per modality
+    with gr.Tab("Text (RO-Offense)"):  # text tab, served by ssl_predict
+        with gr.Row():
+            with gr.Column():  # left column: inputs and buttons
+                in_text = gr.Textbox(label="Input text")
+                model_list = gr.Dropdown(
+                    choices=["fixmatch", "freematch", "mixmatch", "contrastive_reg", "label_propagation"],
+                    max_choices=1,
+                    label="Training method",
+                    allow_custom_value=False,
+                    info="Select trained model according to different SSL techniques from paper",
+                )
+                with gr.Row():
+                    clear_btn = gr.Button(value="Clear")
+                    submit_btn = gr.Button(value="Submit")
+            with gr.Column():  # right column: prediction output
+                out_field = gr.Label(num_top_classes=4, label="Prediction")
+        submit_btn.click(  # Submit: call ssl_predict with (text, method)
+            fn=ssl_predict,
+            inputs=[in_text, model_list],
+            outputs=[out_field]
+        )
+        clear_btn.click(  # Clear: reset the textbox and the prediction
+            fn=lambda: [None for _ in range(2)],  # one None per output component
+            inputs=None,
+            outputs=[in_text, out_field]
+        )
+    with gr.Tab("Audio (VocalSound)"):  # audio tab, served by ssl_predict2
+        with gr.Row():
+            with gr.Column():  # left column: inputs and buttons
+                audio_file = gr.File(  # ssl_predict2 reads the upload via its .name
+                    label="Input audio",
+                    file_count="single",
+                    file_types=["audio"]
+                )
+                model_list2 = gr.Dropdown(  # only the three methods ssl_predict2 handles
+                    choices=["fixmatch", "freematch", "mixmatch"],
+                    max_choices=1,
+                    label="Training method",
+                    allow_custom_value=False,
+                    info="Select trained model according to different SSL techniques from paper",
+                )
+                with gr.Row():
+                    clear_btn2 = gr.Button(value="Clear")
+                    submit_btn2 = gr.Button(value="Submit")
+            with gr.Column():  # right column: prediction output
+                out_field2 = gr.Label(num_top_classes=6, label="Prediction")  # six entries, as in audio_label_names
+        submit_btn2.click(  # Submit: call ssl_predict2 with (file, method)
+            fn=ssl_predict2,
+            inputs=[audio_file, model_list2],
+            outputs=[out_field2]
+        )
+        clear_btn2.click(  # Clear: reset the upload and the prediction
+            fn=lambda: [None for _ in range(2)],  # one None per output component
+            inputs=None,
+            outputs=[audio_file, out_field2]
+        )
 ssl_interface.launch(server_name="0.0.0.0", server_port=7860)  # start the app on 0.0.0.0:7860

checkpoints/audio_fixmatch.data-00000-of-00001 ADDED Viewed

Binary file (856 kB). View file

checkpoints/audio_fixmatch.index ADDED Viewed

Binary file (518 Bytes). View file

checkpoints/audio_freematch.data-00000-of-00001 ADDED Viewed

Binary file (856 kB). View file

checkpoints/audio_freematch.index ADDED Viewed

Binary file (518 Bytes). View file

checkpoints/audio_mixmatch.data-00000-of-00001 ADDED Viewed

Binary file (856 kB). View file

checkpoints/audio_mixmatch.index ADDED Viewed

Binary file (518 Bytes). View file

models.py CHANGED Viewed

@@ -1,6 +1,7 @@
 """Model definitions"""
 import tensorflow as tf
-from transformers import TFAutoModel
 class FixMatchTune(tf.keras.Model):
@@ -82,4 +83,73 @@ class LPModel(tf.keras.Model):
         embeds = self.bert(input_ids=ids, attention_mask=mask,training=training).pooler_output
-        return self.cls_head(embeds, training=training)

 """Model definitions"""
 import tensorflow as tf
+from transformers import TFAutoModel, TFViTModel
+from kapre.augmentation import SpecAugment
 class FixMatchTune(tf.keras.Model):
         embeds = self.bert(input_ids=ids, attention_mask=mask,training=training).pooler_output
+        return self.cls_head(embeds, training=training)
+class AudioFixMatch(tf.keras.Model):
+    def __init__(self, encoder_name='google/vit-base-patch16-224', num_classes=6, **kwargs):
+        super(AudioFixMatch, self).__init__(**kwargs)
+        self.vit = TFViTModel.from_pretrained(encoder_name)
+        self.num_classes = num_classes
+        self.cls_head = tf.keras.Sequential([
+            tf.keras.layers.Dense(256,activation="relu"),
+            tf.keras.layers.Dropout(0.2),
+            tf.keras.layers.Dense(64,activation="relu"),
+            tf.keras.layers.Dense(self.num_classes, activation="softmax")
+        ])
+        self.strong_augment = SpecAugment(
+            freq_mask_param=8,
+            time_mask_param=8,
+            n_freq_masks=2,
+            n_time_masks=2,
+            mask_value=0.0,
+            data_format="channels_first"
+        )
+        self.weak_augment = SpecAugment(
+            freq_mask_param=2,
+            time_mask_param=2,
+            n_freq_masks=2,
+            n_time_masks=2,
+            mask_value=0.0,
+            data_format="channels_first"
+        )
+    def call(self, inputs, training):
+        strong = self.strong_augment(inputs[:,0,:,:][:,tf.newaxis,:,:],training=training)
+        weak = self.weak_augment(inputs[:,0,:,:][:,tf.newaxis,:,:],training=training)
+        embeds_strong = self.vit(pixel_values=tf.repeat(strong,3,axis=1),training=training).pooler_output
+        embeds_weak = self.vit(pixel_values=tf.repeat(weak,3,axis=1),training=training).pooler_output
+        return self.cls_head(embeds_weak), self.cls_head(embeds_strong)
+class AudioMixMatch(tf.keras.Model):
+    def __init__(self, encoder_name='google/vit-base-patch16-224', num_classes=6, **kwargs):
+        super(AudioMixMatch, self).__init__(**kwargs)
+        self.vit = TFViTModel.from_pretrained(encoder_name)
+        self.num_classes = num_classes
+        self.cls_head = tf.keras.Sequential([
+            tf.keras.layers.Dense(256,activation="relu"),
+            tf.keras.layers.Dropout(0.2),
+            tf.keras.layers.Dense(64,activation="relu"),
+            tf.keras.layers.Dense(self.num_classes, activation="softmax")
+        ])
+        self.augment = SpecAugment(
+            freq_mask_param=3,
+            time_mask_param=3,
+            n_freq_masks=2,
+            n_time_masks=2,
+            mask_value=0.0,
+            data_format="channels_first"
+        )
+    def aug_features(self, inputs, training):
+        aug = self.augment(inputs[:,0,:,:][:,tf.newaxis,:,:],training=training)
+        embeds = self.vit(pixel_values=tf.repeat(aug,3,axis=1),training=training).pooler_output
+        return embeds
+    def call(self, inputs, training):
+        aug = self.augment(inputs[:,0,:,:][:,tf.newaxis,:,:],training=training)
+        embeds = self.vit(pixel_values=tf.repeat(aug,3,axis=1),training=training).pooler_output
+        return self.cls_head(embeds)