diff --git a/.gitattributes b/.gitattributes index a6344aac8c09253b3b630fb776ae94478aa0275b..4639070f4b00920dd2ef0537084c559876a1d508 100644 --- a/.gitattributes +++ b/.gitattributes @@ -33,3 +33,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text *.zip filter=lfs diff=lfs merge=lfs -text *.zst filter=lfs diff=lfs merge=lfs -text *tfevents* filter=lfs diff=lfs merge=lfs -text +example.wav filter=lfs diff=lfs merge=lfs -text +example2.wav filter=lfs diff=lfs merge=lfs -text diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..551f6b3ce6a183853a3e829af99b5683482eec07 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2025 Jiarui Hai + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. 
diff --git a/README.md b/README.md index 97ead9082dc3ca868cd74239b48fa42253ad5b96..826ec94780a2ee6f29acbe06b668b303ac415c4d 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,60 @@ ---- -title: FlexSED -emoji: ๐Ÿ† -colorFrom: purple -colorTo: purple -sdk: gradio -sdk_version: 5.49.1 -app_file: app.py -pinned: false -license: mit -short_description: 'FlexSED: An Open-Vocabulary Sound Event Detection System' ---- - -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# FlexSED: Towards Open-Vocabulary Sound Event Detection + +[![arXiv](https://img.shields.io/badge/arXiv-2409.10819-brightgreen.svg?style=flat-square)](https://arxiv.org/abs/2509.18606) +[![Hugging Face Models](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-Models-blue)](https://huggingface.co/Higobeatz/FlexSED/tree/main) + + +## News +- Oct 2025: ๐Ÿ“ฆ Released code and pretrained checkpoint +- Sep 2025: ๐ŸŽ‰ FlexSED Spotlighted at WASPAA 2025 + + +## Installation + +Clone the repository: +``` +git clone git@github.com:JHU-LCAP/FlexSED.git +``` +Install the dependencies: +``` +cd FlexSED +pip install -r requirements.txt +``` + +## Usage +```python +from api import FlexSED +import torch +import soundfile as sf + +# load model +flexsed = FlexSED(device='cuda') + +# run inference +events = ["Dog"] +preds = flexsed.run_inference("example.wav", events) + +# visualize prediction +flexsed.to_multi_plot(preds, events, fname="example2") + +# (Optional) visualize prediction by video +# flexsed.to_multi_video(preds, events, audio_path="example2.wav", fname="example2") +``` + +## Training + +WIP + + +## Reference + +If you find the code useful for your research, please consider citing: + +```bibtex +@article{hai2025flexsed, + title={FlexSED: Towards Open-Vocabulary Sound Event Detection}, + author={Hai, Jiarui and Wang, Helin and Guo, Weizhe and Elhilali, Mounya}, + journal={arXiv preprint arXiv:2509.18606}, + year={2025} +} +``` 
diff --git a/api.py b/api.py new file mode 100644 index 0000000000000000000000000000000000000000..a5bff2b2ee3df00b78138425c4e111e7b580c08d --- /dev/null +++ b/api.py @@ -0,0 +1,185 @@ +import torch +import librosa +import os +import numpy as np +import matplotlib.pyplot as plt +from transformers import AutoTokenizer, ClapTextModelWithProjection +from src.models.transformer import Dasheng_Encoder +from src.models.sed_decoder import Decoder, TSED_Wrapper +from src.utils import load_yaml_with_includes + + +class FlexSED: + def __init__( + self, + config_path='src/configs/model.yml', + ckpt_path='ckpts/flexsed_as.pt', + ckpt_url='https://huggingface.co/Higobeatz/FlexSED/resolve/main/ckpts/flexsed_as.pt', + device='cuda' + ): + """ + Initialize FlexSED with model, CLAP, and tokenizer loaded once. + If the checkpoint is not available locally, it will be downloaded automatically. + """ + self.device = device + params = load_yaml_with_includes(config_path) + + # Ensure checkpoint exists + if not os.path.exists(ckpt_path): + print(f"[FlexSED] Downloading checkpoint from {ckpt_url} ...") + state_dict = torch.hub.load_state_dict_from_url(ckpt_url, map_location="cpu") + else: + state_dict = torch.load(ckpt_path, map_location="cpu") + + # Encoder + Decoder + encoder = Dasheng_Encoder(**params['encoder']).to(self.device) + decoder = Decoder(**params['decoder']).to(self.device) + self.model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder']) + self.model.load_state_dict(state_dict['model']) + self.model.eval() + + # CLAP text model + self.clap = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + self.clap.eval() + self.tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + def run_inference(self, audio_path, events, norm_audio=True): + """ + Run inference on audio for given events. 
+ """ + audio, sr = librosa.load(audio_path, sr=16000) + audio = torch.tensor([audio]).to(self.device) + + if norm_audio: + eps = 1e-9 + max_val = torch.max(torch.abs(audio)) + audio = audio / (max_val + eps) + + clap_embeds = [] + with torch.no_grad(): + for event in events: + text = f"The sound of {event.replace('_',' ')}" + inputs = self.tokenizer([text], padding=True, return_tensors="pt") + outputs = self.clap(**inputs) + text_embeds = outputs.text_embeds.unsqueeze(1) + clap_embeds.append(text_embeds) + + query = torch.cat(clap_embeds, dim=1).to(self.device) + mel = self.model.forward_to_spec(audio) + preds = self.model(mel, query) + preds = torch.sigmoid(preds).cpu() + + return preds # shape: [num_events, 1, T] + + # ---------- Multi-event plotting ---------- + @staticmethod + def plot_and_save_multi(preds, events, sr=25, out_dir="./plots", fname="all_events"): + os.makedirs(out_dir, exist_ok=True) + preds_np = preds.squeeze(1).numpy() # [num_events, T] + T = preds_np.shape[1] + + plt.figure(figsize=(12, len(events) * 0.6 + 2)) + plt.imshow( + preds_np, + aspect="auto", + cmap="Blues", + extent=[0, T/sr, 0, len(events)], + vmin=0, vmax=1, origin="lower" + + ) + plt.colorbar(label="Probability") + plt.yticks(np.arange(len(events)) + 0.5, events) + plt.xlabel("Time (s)") + plt.ylabel("Events") + plt.title("Event Predictions") + + save_path = os.path.join(out_dir, f"{fname}.png") + plt.savefig(save_path, dpi=200, bbox_inches="tight") + plt.close() + return save_path + + def to_multi_plot(self, preds, events, out_dir="./plots", fname="all_events"): + return self.plot_and_save_multi(preds, events, out_dir=out_dir, fname=fname) + + # ---------- Multi-event video ---------- + @staticmethod + def make_multi_event_video(preds, events, sr=25, out_dir="./videos", + audio_path=None, fps=25, highlight=True, fname="all_events"): + from moviepy.editor import ImageSequenceClip, AudioFileClip + from tqdm import tqdm + + os.makedirs(out_dir, exist_ok=True) + preds_np = 
preds.squeeze(1).numpy() # [num_events, T] + T = preds_np.shape[1] + duration = T / sr + + frames = [] + n_frames = int(duration * fps) + + for i in tqdm(range(n_frames)): + t = int(i * T / n_frames) + plt.figure(figsize=(12, len(events) * 0.6 + 2)) + + if highlight: + mask = np.zeros_like(preds_np) + mask[:, :t+1] = preds_np[:, :t+1] + plt.imshow( + mask, + aspect="auto", + cmap="Blues", + extent=[0, T/sr, 0, len(events)], + vmin=0, vmax=1, origin="lower" + ) + else: + plt.imshow( + preds_np[:, :t+1], + aspect="auto", + cmap="Blues", + extent=[0, (t+1)/sr, 0, len(events)], + vmin=0, vmax=1, origin="lower" + ) + + plt.colorbar(label="Probability") + plt.yticks(np.arange(len(events)) + 0.5, events) + plt.xlabel("Time (s)") + plt.ylabel("Events") + plt.title("Event Predictions") + + frame_path = f"/tmp/frame_{i:04d}.png" + plt.savefig(frame_path, dpi=150, bbox_inches="tight") + plt.close() + frames.append(frame_path) + + clip = ImageSequenceClip(frames, fps=fps) + if audio_path is not None: + audio = AudioFileClip(audio_path).subclip(0, duration) + clip = clip.set_audio(audio) + + save_path = os.path.join(out_dir, f"{fname}.mp4") + clip.write_videofile( + save_path, + fps=fps, + codec="mpeg4", + audio_codec="aac" + ) + + for f in frames: + os.remove(f) + + return save_path + + def to_multi_video(self, preds, events, audio_path, out_dir="./videos", fname="all_events"): + return self.make_multi_event_video( + preds, events, audio_path=audio_path, out_dir=out_dir, fname=fname + ) + + +if __name__ == "__main__": + flexsed = FlexSED(device='cuda') + + events = ["Door", "Laughter", "Dog"] + preds = flexsed.run_inference("example2.wav", events) + + # Combined plot & video + flexsed.to_multi_plot(preds, events, fname="example2") + # flexsed.to_multi_video(preds, events, audio_path="example2.wav", fname="example2") diff --git a/app.py b/app.py index 04cc31aa8d0e06aeaac3b59bb361ed71d831e43f..158c7b1ee1a1f1b6b5687ed19a1ac9324a3ba7f4 100644 --- a/app.py +++ b/app.py @@ -1,7 
+1,74 @@ import gradio as gr +import torch +from api import FlexSED +import tempfile +import os -def greet(name): - return "Hello " + name + "!!" +# Load model once on startup +flexsed = FlexSED(device="cuda" if torch.cuda.is_available() else "cpu") -demo = gr.Interface(fn=greet, inputs="text", outputs="text") -demo.launch() +def run_flexsed(audio_file, event_list): + """ + Run inference using FlexSED and return prediction plot. + """ + if not audio_file: + return None + + # Split events by semicolon (the code splits on ";" only) + events = [e.strip() for e in event_list.split(";") if e.strip()] + if not events: + return None + + # Run inference + preds = flexsed.run_inference(audio_file, events) + + # Generate visualization + output_fname = os.path.join(tempfile.gettempdir(), "flexsed_output") + flexsed.to_multi_plot(preds, events, fname=output_fname) + plot_path = f"{output_fname}.png" + + return plot_path + + +# App layout +with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue")) as app: + # Header + gr.Markdown(""" + ## ๐ŸŽง FlexSED: A Flexible Open-Vocabulary Sound Event Detection System + + ๐Ÿ‘‹ Welcome to the **FlexSED live demo** โ€” explore **prompt-guided sound event detection** in real audio clips. 
+ + ๐Ÿ”— Learn more on the [FlexSED GitHub Repository](https://github.com/JHU-LCAP/FlexSED) + """) + + gr.Markdown("### ๐Ÿ” Upload or choose an example below to detect sound events:") + + with gr.Row(): + # Left column: Inputs + with gr.Column(scale=1): + audio_input = gr.Audio(type="filepath", label="๐ŸŽต Upload Audio (.wav)") + text_input = gr.Textbox(label="Event list (semicolon-separated)", value="Male speech; Door; Dog; Laughter") + + with gr.Row(): + detect_btn = gr.Button("๐ŸŽฏ Detect", variant="primary") + clear_btn = gr.Button("๐Ÿงน Clear") + + # Right column: Output + with gr.Column(scale=1): + image_output = gr.Image(label="Prediction Plot", show_label=True, elem_id="output-image") + gr.Examples( + examples=[ + ["example.wav", "Male speech; Door; Dog; Laughter"], + ["example2.wav", "Male speech; Bee; Gunshot, gunfire"], + ], + inputs=[audio_input, text_input], + label="Example Audios" + ) + + # Function bindings + detect_btn.click(run_flexsed, inputs=[audio_input, text_input], outputs=image_output) + clear_btn.click(lambda: (None, "Male speech; Door; Dog; Laughter"), outputs=[audio_input, text_input]) + + +if __name__ == "__main__": + app.launch(share=True) diff --git a/example.wav b/example.wav new file mode 100644 index 0000000000000000000000000000000000000000..20fc8d75550807ee2fc3322ec4b37da156aab6d0 --- /dev/null +++ b/example.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:371ee4358cd3b12330f406d7d576fecb2329057132696360278b602043009562 +size 480044 diff --git a/example2.wav b/example2.wav new file mode 100644 index 0000000000000000000000000000000000000000..32485e3ed43b401a74b03a713475b663ff351682 --- /dev/null +++ b/example2.wav @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ceca83fd7bd5e1ab16dd61a445c3f3fb11b87c67d8a56b277d4ee293c56b23ed +size 480044 diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 
0000000000000000000000000000000000000000..ee6c2ccd91d4c306273eb75ad0bac2b1792aa2f0 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,6 @@ +gradio +torch +soundfile +matplotlib +numpy +librosa \ No newline at end of file diff --git a/src/.ipynb_checkpoints/prepare_clap-checkpoint.py b/src/.ipynb_checkpoints/prepare_clap-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..d44c6daffa259eb74c37a469b490cfab5b69699b --- /dev/null +++ b/src/.ipynb_checkpoints/prepare_clap-checkpoint.py @@ -0,0 +1,39 @@ +import os +import pandas as pd +import torch +from transformers import AutoTokenizer, ClapTextModelWithProjection + +if __name__ == '__main__': + # Load the CLAP model and tokenizer + model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + model.eval() + tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + # Path to the input CSV file + input_csv_path = '/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv' + output_path = 'clap_embedding/' # Replace with your desired output folder path + + # Create the output folder if it doesn't exist + os.makedirs(output_path, exist_ok=True) + + # Read the CSV file + df = pd.read_csv(input_csv_path) + + # Get unique event labels + events = df['label'].unique() + + with torch.no_grad(): # Disable gradient computation + # Process each event + for event in events: + text = event.replace('_', ' ') # Replace underscores with spaces + text = f'The sound of {text}' + print(text) + inputs = tokenizer([text], padding=True, return_tensors="pt") + outputs = model(**inputs) + text_embeds = outputs.text_embeds + + # Save the embeddings to a .pt file + output_file = os.path.join(output_path, f"{event}.pt") + torch.save(text_embeds, output_file) + + print("Embedding extraction and saving complete!") diff --git a/src/.ipynb_checkpoints/test-checkpoint.py b/src/.ipynb_checkpoints/test-checkpoint.py new file mode 100644 index 
0000000000000000000000000000000000000000..1f6c313f46c9bec6fc0e8b39d417b2fb7bd49805 --- /dev/null +++ b/src/.ipynb_checkpoints/test-checkpoint.py @@ -0,0 +1,140 @@ +import random +import argparse +import os +import time +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from accelerate import Accelerator + +from models.transformer import Dasheng_Encoder +from models.sed_decoder import Decoder, TSED_Wrapper +from dataset.tsed import TSED_AS +from dataset.tsed_val import TSED_Val +from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler +from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation +from val import val_psds + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Config settings + parser.add_argument('--config-name', type=str, default='configs/model.yml') + parser.add_argument('--ckpt', type=str, default='20000.pt') + + # Training settings + parser.add_argument("--amp", type=str, default='fp16') + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--num-workers', type=int, default=8) + parser.add_argument('--num-threads', type=int, default=1) + parser.add_argument('--eval-every-step', type=int, default=5000) + parser.add_argument('--save-every-step', type=int, default=5000) + # parser.add_argument('--dataloader', type=str, default='EACaps') + parser.add_argument("--logit-normal-indices", type=bool, default=False) + + # Log and random seed + parser.add_argument('--random-seed', type=int, default=2024) + parser.add_argument('--log-step', type=int, default=100) + parser.add_argument('--log-dir', type=str, default='../logs/') + parser.add_argument('--save-dir', type=str, default='../ckpts/') + return parser.parse_args() + + +def setup_directories(args, params): + args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/' + 
args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/' + + os.makedirs(args.log_dir, exist_ok=True) + os.makedirs(args.save_dir, exist_ok=True) + + +def set_device(args): + torch.set_num_threads(args.num_threads) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed_all(args.random_seed) + torch.backends.cuda.matmul.allow_tf32 = True + if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + args.device = 'cpu' + + +if __name__ == '__main__': + args = parse_args() + params = load_yaml_with_includes(args.config_name) + set_device(args) + setup_directories(args, params) + + random.seed(args.random_seed) + torch.manual_seed(args.random_seed) + + # use accelerator for multi-gpu training + accelerator = Accelerator(mixed_precision=args.amp, + gradient_accumulation_steps=params['opt']['accumulation_steps'], + step_scheduler_with_optimizer=False) + + train_set = TSED_AS(**params['data']['train_data']) + train_loader = DataLoader(train_set, batch_size=params['opt']['batch_size'], num_workers=args.num_workers) + + # val_set = TSED_Val(**params['data']['val_data']) + # val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False) + + test_set = TSED_Val(**params['data']['test_data']) + test_loader = DataLoader(test_set, num_workers=0, batch_size=1, shuffle=False) + + encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device) + pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1' + dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + model_parmeters = dump['model'] + # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1' + # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + # model_parmeters = dump + encoder.load_state_dict(model_parmeters) + + decoder = 
Decoder(**params['decoder']).to(accelerator.device) + + model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder']) + print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M") + + model.load_state_dict(torch.load(args.ckpt, map_location='cpu')['model']) + + if params['frozen_encoder']: + optimizer = torch.optim.AdamW( + model.parameters(), + lr=params['opt']['learning_rate'], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + else: + optimizer = torch.optim.AdamW( + [ + {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']}, + {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']} + ], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + + lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler']) + + strong_loss_func = nn.BCEWithLogitsLoss() + + model, optimizer, lr_scheduler, train_loader, test_loader = accelerator.prepare( + model, optimizer, lr_scheduler, train_loader, test_loader) + + global_step = 0.0 + losses = 0.0 + + if accelerator.is_main_process: + model_module = model.module if hasattr(model, 'module') else model + val_psds(model_module, test_loader, params, epoch='test_full', split='test', + save_path=args.log_dir + 'output/', device=accelerator.device) \ No newline at end of file diff --git a/src/.ipynb_checkpoints/train-checkpoint.py b/src/.ipynb_checkpoints/train-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..19d225165d77c205136b85e7ad0c3dec32b93ac2 --- /dev/null +++ b/src/.ipynb_checkpoints/train-checkpoint.py @@ -0,0 +1,208 @@ +import random +import argparse +import os +import time +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + +import torch +import 
torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from accelerate import Accelerator + +from models.transformer import Dasheng_Encoder +from models.sed_decoder import Decoder, TSED_Wrapper +from dataset.tsed import TSED_AS +from dataset.tsed_val import TSED_Val +from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler +from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation +from val import val_psds + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Config settings + parser.add_argument('--config-name', type=str, default='configs/model.yml') + + # Training settings + parser.add_argument("--amp", type=str, default='fp16') + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--num-workers', type=int, default=8) + parser.add_argument('--num-threads', type=int, default=1) + parser.add_argument('--eval-every-step', type=int, default=5000) + parser.add_argument('--save-every-step', type=int, default=5000) + # parser.add_argument('--dataloader', type=str, default='EACaps') + parser.add_argument("--logit-normal-indices", type=bool, default=False) + + # Log and random seed + parser.add_argument('--random-seed', type=int, default=2024) + parser.add_argument('--log-step', type=int, default=100) + parser.add_argument('--log-dir', type=str, default='../logs/') + parser.add_argument('--save-dir', type=str, default='../ckpts/') + return parser.parse_args() + + +def setup_directories(args, params): + args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/' + args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/' + + os.makedirs(args.log_dir, exist_ok=True) + os.makedirs(args.save_dir, exist_ok=True) + + +def set_device(args): + torch.set_num_threads(args.num_threads) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed_all(args.random_seed) + torch.backends.cuda.matmul.allow_tf32 = True + 
if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + args.device = 'cpu' + + +if __name__ == '__main__': + args = parse_args() + params = load_yaml_with_includes(args.config_name) + set_device(args) + setup_directories(args, params) + + random.seed(args.random_seed) + torch.manual_seed(args.random_seed) + + # use accelerator for multi-gpu training + accelerator = Accelerator(mixed_precision=args.amp, + gradient_accumulation_steps=params['opt']['accumulation_steps'], + step_scheduler_with_optimizer=False) + + train_set = TSED_AS(**params['data']['train_data']) + train_loader = DataLoader(train_set, shuffle=True, + batch_size=params['opt']['batch_size'], + num_workers=args.num_workers) + + val_set = TSED_Val(**params['data']['val_data']) + val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False) + + # test_set = TSED_Val(**params['data']['test_data']) + # test_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False) + + encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device) + pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1' + dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + model_parmeters = dump['model'] + # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1' + # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + # model_parmeters = dump + encoder.load_state_dict(model_parmeters) + + decoder = Decoder(**params['decoder']).to(accelerator.device) + + model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder']) + print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M") + + # model.load_state_dict(torch.load('../ckpts/TSED_AS_filter/20000.0.pt', map_location='cpu')['model']) + + if params['frozen_encoder']: + optimizer = 
torch.optim.AdamW( + model.parameters(), + lr=params['opt']['learning_rate'], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + else: + optimizer = torch.optim.AdamW( + [ + {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']}, + {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']} + ], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + + lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler']) + + strong_loss_func = nn.BCEWithLogitsLoss() + + model, optimizer, lr_scheduler, train_loader, val_loader = accelerator.prepare( + model, optimizer, lr_scheduler, train_loader, val_loader) + + global_step = 0.0 + losses = 0.0 + + if accelerator.is_main_process: + model_module = model.module if hasattr(model, 'module') else model + val_psds(model_module, val_loader, params, epoch='debug', split='val', + save_path=args.log_dir + 'output/', device=accelerator.device) + + for epoch in range(args.epochs): + model.train() + for step, batch in enumerate(tqdm(train_loader)): + with accelerator.accumulate(model): + audio, cls, label, _ = batch + mel = model.forward_to_spec(audio) + + # data aug + mel, label = frame_shift(mel, label, params['net_pooling']) + mel, label = time_mask(mel, label, params["net_pooling"], + mask_ratios=params['data_aug']["time_mask_ratios"]) + mel, _ = feature_transformation(mel, **params['data_aug']["transform"]) + + strong_pred = model(mel, cls) + + B, N, L = label.shape + label = label.reshape(B * N, L) + label = label.unsqueeze(1) + + loss = strong_loss_func(strong_pred, label) + + accelerator.backward(loss) + + # clip grad up + if accelerator.sync_gradients: + if 'grad_clip' in params['opt'] and params['opt']['grad_clip'] > 0: + accelerator.clip_grad_norm_(model.parameters(), + 
max_norm=params['opt']['grad_clip']) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + global_step += 1/params['opt']['accumulation_steps'] + losses += loss.item()/params['opt']['accumulation_steps'] + + if accelerator.is_main_process: + if global_step % args.log_step == 0: + current_time = time.asctime(time.localtime(time.time())) + epoch_info = f'Epoch: [{epoch + 1}][{args.epochs}]' + batch_info = f'Global Step: {global_step}' + loss_info = f'Loss: {losses / args.log_step:.6f}' + + # Extract the learning rate from the optimizer + lr = optimizer.param_groups[0]['lr'] + lr_info = f'Learning Rate: {lr:.6f}' + + log_message = f'{current_time}\n{epoch_info} {batch_info} {loss_info} {lr_info}\n' + + with open(args.log_dir + 'log.txt', mode='a') as n: + n.write(log_message) + + losses = 0.0 + + # check performance + if (global_step + 1) % args.eval_every_step == 0: + if accelerator.is_main_process: + model_module = model.module if hasattr(model, 'module') else model + val_psds(model_module, val_loader, params, epoch=global_step+1, split='val', + save_path=args.log_dir + 'output/', device=accelerator.device) + # save model + unwrapped_model = accelerator.unwrap_model(model) + accelerator.save({ + "model": model.state_dict(), + }, args.save_dir + str(global_step+1) + '.pt') + accelerator.wait_for_everyone() + model.train() diff --git a/src/.ipynb_checkpoints/val-checkpoint.py b/src/.ipynb_checkpoints/val-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..ea0dca20b51ab35666006c2824dce95af2a9ef0c --- /dev/null +++ b/src/.ipynb_checkpoints/val-checkpoint.py @@ -0,0 +1,141 @@ +import torch +import os +import pandas as pd +from tqdm import tqdm +import sed_scores_eval +from desed_task.evaluation.evaluation_measures import (compute_per_intersection_macro_f1, + compute_psds_from_operating_points, + compute_psds_from_scores) +from local.utils import (batched_decode_preds,) +from utils.sed import Encoder +import numpy as np + + 
@torch.no_grad()
def val_psds(model, val_loader, params, epoch, split, save_path, device):
    """Evaluate `model` on one split and report PSDS scores.

    Runs inference over `val_loader`, decodes frame-level event
    predictions for every class listed in the split's label file, and
    computes PSDS (dtc=gtc=0.7, alpha_ct=alpha_st=0) twice:

    * open-vocabulary: raw predictions for all classes;
    * TSED: predictions are masked so that only events actually present
      in each clip's ground truth survive (sound existence assumed known).

    Per-class scores for both settings are written to
    ``{save_path}/psds_cls/{epoch}.csv``.

    Args:
        model: SED model exposing ``forward_to_spec(audio)`` and
            ``model(mel, cls_emb)``.
        val_loader: yields ``(audio, filenames)`` batches.
        params: config dict; ``params['data'][split]`` supplies the
            'label', 'csv' and 'dur' paths, and
            ``params['data']['train_data']['clap_dir']`` the directory of
            per-event CLAP embeddings (``<event>.pt``).
        epoch: tag used to name the per-class CSV.
        split: key into ``params['data']`` selecting the split.
        save_path: output directory root.
        device: torch device for inference.

    Returns:
        tuple: ``(psds1, psds1_tsed)`` — overall PSDS without and with
        the known-events (TSED) assumption.
    """
    label_df = pd.read_csv(params['data'][split]['label'])
    EVENTS = label_df['label'].tolist()

    # Stack one CLAP text embedding per event -> (1, N, D).
    clap_emb = []
    for event in EVENTS:
        emb = torch.load(params['data']['train_data']['clap_dir'] + event + '.pt').to(device)
        clap_emb.append(emb.unsqueeze(1))
    cls = torch.cat(clap_emb, dim=1)

    encoder = Encoder(EVENTS, audio_len=10, frame_len=160, frame_hop=160, net_pooling=4, sr=16000)

    model.eval()
    test_csv = params['data'][split]["csv"]
    test_dur = params['data'][split]["dur"]

    gt = pd.read_csv(test_csv, sep='\t')

    test_scores_postprocessed_buffer = {}
    test_scores_postprocessed_buffer_tsed = {}
    test_thresholds = [0.5]
    test_psds_buffer = {k: pd.DataFrame() for k in test_thresholds}
    test_psds_buffer_tsed = {k: pd.DataFrame() for k in test_thresholds}

    for batch in tqdm(val_loader):
        audio, filenames = batch
        B = audio.shape[0]
        N = cls.shape[1]
        # BUG FIX: do not overwrite `cls` with its expanded view. The
        # original `cls = cls.expand(B, -1, -1)` crashes on any later
        # batch whose size differs (e.g. a smaller final batch), because
        # expand() cannot change an already non-1 dimension.
        cls_batch = cls.expand(B, -1, -1)

        audio = audio.to(device)
        mel = model.forward_to_spec(audio)

        preds = torch.sigmoid(model(mel, cls_batch)).reshape(B, N, -1)

        # TSED assumes sound existence is known: zero out scores for
        # events absent from the clip's ground-truth weak labels.
        preds_tsed = preds.clone()
        for idx, filename in enumerate(filenames):
            weak_label = set(gt[gt['filename'] == filename]['event_label'].unique())
            for j, event in enumerate(EVENTS):
                if event not in weak_label:
                    preds_tsed[idx][j] = 0.0

        (_, scores_postprocessed_strong, _,) = \
            batched_decode_preds(
                preds,
                filenames,
                encoder,
                median_filter=9,
                thresholds=list(test_psds_buffer.keys()), )
        test_scores_postprocessed_buffer.update(scores_postprocessed_strong)

        (_, scores_postprocessed_strong_tsed, _,) = \
            batched_decode_preds(
                preds_tsed,
                filenames,
                encoder,
                median_filter=9,
                thresholds=list(test_psds_buffer_tsed.keys()), )
        test_scores_postprocessed_buffer_tsed.update(scores_postprocessed_strong_tsed)

    ground_truth = sed_scores_eval.io.read_ground_truth_events(test_csv)
    audio_durations = sed_scores_eval.io.read_audio_durations(test_dur)

    # Restrict ground truth / durations to the clips actually scored.
    ground_truth = {
        audio_id: ground_truth[audio_id]
        for audio_id in test_scores_postprocessed_buffer
    }
    audio_durations = {
        audio_id: audio_durations[audio_id]
        for audio_id in test_scores_postprocessed_buffer
    }

    def _psds_row(scores, name):
        """Compute PSDS for `scores`; return (overall, per-class row dict)."""
        overall, per_cls = compute_psds_from_scores(
            scores,
            ground_truth,
            audio_durations,
            dtc_threshold=0.7,
            gtc_threshold=0.7,
            cttc_threshold=None,
            alpha_ct=0.0,
            alpha_st=0.0,
        )
        # BUG FIX: macro-average over the per-class scores only. The
        # original inserted 'overall' into the dict first, so the mean
        # was contaminated by the overall PSDS value.
        macro = np.array([v for k, v in per_cls.items()]).mean()
        per_cls['overall'] = overall
        per_cls['macro_averaged'] = macro
        per_cls['name'] = name
        return overall, per_cls

    psds1, psds1_cls = _psds_row(test_scores_postprocessed_buffer, 'psds1')
    psds1_tsed, psds1_cls_tsed = _psds_row(test_scores_postprocessed_buffer_tsed, 'psds1_tsed')

    psds_cls = pd.DataFrame([psds1_cls, psds1_cls_tsed])
    os.makedirs(f'{save_path}/psds_cls/', exist_ok=True)
    psds_cls.to_csv(f'{save_path}/psds_cls/{epoch}.csv', index=False)

    return psds1, psds1_tsed
100644 index 0000000000000000000000000000000000000000..a2b9c95c23b61632afaaa58f9ea0a28724644a92 --- /dev/null +++ b/src/clap_embedding/Aircraft engine.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac2b736bed945841a2d066cb4ad5218b55903c88083d34870b6d27eccc9b1d55 +size 3268 diff --git a/src/clap_embedding/Aircraft.pt b/src/clap_embedding/Aircraft.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c0b3f299088535d926f48ac8f585bbc6d4cde7f --- /dev/null +++ b/src/clap_embedding/Aircraft.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:902502af6d7e3ff22b6650282c1c8e3f98d6c1687b1f3078465bf942c30620cf +size 3233 diff --git a/src/clap_embedding/Alarm clock.pt b/src/clap_embedding/Alarm clock.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2aa92f991f623ecbfa2ca40762445a0880afc89 --- /dev/null +++ b/src/clap_embedding/Alarm clock.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e0d17a345893c6f0fe6a9d6fb11f060da277b8d82475b95c2249138919beb5b +size 3248 diff --git a/src/clap_embedding/Alarm.pt b/src/clap_embedding/Alarm.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8acb52b269a5ddc478b80cc1142c82355b36d49 --- /dev/null +++ b/src/clap_embedding/Alarm.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43fce07d002c946daad296e9b637943a9941cc703bb3c1755fb497f72afcccc1 +size 3154 diff --git a/src/clap_embedding/Alert.pt b/src/clap_embedding/Alert.pt new file mode 100644 index 0000000000000000000000000000000000000000..711a88433f1aa02d11ccf737b1451ac8fbc5fa14 --- /dev/null +++ b/src/clap_embedding/Alert.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751b03e62094f66ddd4965d55583ffc8db0f37a621a614b8aec953ab284d5e23 +size 3154 diff --git a/src/clap_embedding/Ambulance (siren).pt b/src/clap_embedding/Ambulance (siren).pt new file mode 100644 index 
0000000000000000000000000000000000000000..daef9b9d074742d048f4c2c9f528f941e47b8947 --- /dev/null +++ b/src/clap_embedding/Ambulance (siren).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b8d7ab047ba136bc0c8979ce97b917cbba1181e60257c046a748612dae58660 +size 3278 diff --git a/src/clap_embedding/Animal.pt b/src/clap_embedding/Animal.pt new file mode 100644 index 0000000000000000000000000000000000000000..33d5d10919af298bbeab18b26d640366bf6416cd --- /dev/null +++ b/src/clap_embedding/Animal.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:060e4de7c1aa9e784f8d4245b8ed7d17e001a68615005bdde858aeb044f61aac +size 3159 diff --git a/src/clap_embedding/Applause.pt b/src/clap_embedding/Applause.pt new file mode 100644 index 0000000000000000000000000000000000000000..8595d640e2be3460fac82331e110d5e350c9ee6e --- /dev/null +++ b/src/clap_embedding/Applause.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:053458a5422c7a2dc316dfe803963118a11093ef324b07c07d79f98e11001bbb +size 3233 diff --git a/src/clap_embedding/Arrow.pt b/src/clap_embedding/Arrow.pt new file mode 100644 index 0000000000000000000000000000000000000000..3726d294ca961db3d9109db46a30bc0577ee4b7e --- /dev/null +++ b/src/clap_embedding/Arrow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ad03adea0e391dce55cf4e1ef13d4d299ea741d56e06a379915305b4ae56d03 +size 3154 diff --git a/src/clap_embedding/Artillery fire.pt b/src/clap_embedding/Artillery fire.pt new file mode 100644 index 0000000000000000000000000000000000000000..02718f9f683c227d08a3ebc716b9c86531975122 --- /dev/null +++ b/src/clap_embedding/Artillery fire.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a8aafd9fe1fc64424eb28aa0206bf3ad5ad505cb5eeb471164453f6d3a61313 +size 3263 diff --git a/src/clap_embedding/Audio logo.pt b/src/clap_embedding/Audio logo.pt new file mode 100644 index 
0000000000000000000000000000000000000000..5bcb2814efe0972d3ebc9f2029842b7cfb4e6aaf --- /dev/null +++ b/src/clap_embedding/Audio logo.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6e3a7f4f827f4e9d2de401956568f7ff771a7e7c11cb547306454cf0ea0c4ab +size 3243 diff --git a/src/clap_embedding/Babbling.pt b/src/clap_embedding/Babbling.pt new file mode 100644 index 0000000000000000000000000000000000000000..5d365326fb01d990734e5fa9cef7fc01bd8083a0 --- /dev/null +++ b/src/clap_embedding/Babbling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c7ccbb7c692f6d2a5a1039039ea127d06189788e2cab5d25302d8d0bd4ddef5 +size 3233 diff --git a/src/clap_embedding/Baby cry, infant cry.pt b/src/clap_embedding/Baby cry, infant cry.pt new file mode 100644 index 0000000000000000000000000000000000000000..f580920e70db90407bda723f88c6c1bd23a60d1a --- /dev/null +++ b/src/clap_embedding/Baby cry, infant cry.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebad3b3fd6e97f3a4681f2cbf3bc7b9dad2eed05715b2ab28d095ee156d204f7 +size 3293 diff --git a/src/clap_embedding/Baby laughter.pt b/src/clap_embedding/Baby laughter.pt new file mode 100644 index 0000000000000000000000000000000000000000..06f3a2cda7cbdec2604eadf77653ed658229d08a --- /dev/null +++ b/src/clap_embedding/Baby laughter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13abd466060f1997a8f0251fd7a32456824713cfec0ca8754a01a2f245ae03af +size 3258 diff --git a/src/clap_embedding/Background noise.pt b/src/clap_embedding/Background noise.pt new file mode 100644 index 0000000000000000000000000000000000000000..4217e7cf05c63c002608c64a6e25eb64de64d3ad --- /dev/null +++ b/src/clap_embedding/Background noise.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e64c97573b4d05dfcb0e50362afff050b9e287e9c26195b10e6da1182a8b104f +size 3273 diff --git a/src/clap_embedding/Bang.pt b/src/clap_embedding/Bang.pt new file mode 100644 index 
0000000000000000000000000000000000000000..4c6eaa55c2a3273992af91903c5c7465d990a767 --- /dev/null +++ b/src/clap_embedding/Bang.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e28d04d1f0b6e407dda531d7df5c1883ac907c218feaffabb4a213445d874e5 +size 3149 diff --git a/src/clap_embedding/Bark.pt b/src/clap_embedding/Bark.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a64d772b9403f8156e5ae1a1f53d2d41ea04848 --- /dev/null +++ b/src/clap_embedding/Bark.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f866f230ad42ff9be34383a120ae6563606d149b1bd00039cd360c61cbbb371a +size 3149 diff --git a/src/clap_embedding/Basketball bounce.pt b/src/clap_embedding/Basketball bounce.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa97aece1beb9be8edf7c3ff8c36fc5c1b0064a8 --- /dev/null +++ b/src/clap_embedding/Basketball bounce.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cac0a6b3d44c42d8d1eeb93ea1fb59abcc095127ae88ea9f7f684ff0ba5f52d +size 3278 diff --git a/src/clap_embedding/Bathroom sounds.pt b/src/clap_embedding/Bathroom sounds.pt new file mode 100644 index 0000000000000000000000000000000000000000..0da8da8cd11d6ac37817b8089ab8b72ebac3cb76 --- /dev/null +++ b/src/clap_embedding/Bathroom sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e781b8e4ea847ad5acfa5cdc7cbecf294019e96d156f5e8e9d71fd384421f3c4 +size 3268 diff --git a/src/clap_embedding/Bathtub (filling or washing).pt b/src/clap_embedding/Bathtub (filling or washing).pt new file mode 100644 index 0000000000000000000000000000000000000000..69a14e9b680ea1efe912c1b1b58d251ebd288d2e --- /dev/null +++ b/src/clap_embedding/Bathtub (filling or washing).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7af853e52974d5cd77f2d79b207303df6299c23f508ed481cf343c1b7034bed8 +size 3397 diff --git a/src/clap_embedding/Battle cry.pt b/src/clap_embedding/Battle cry.pt 
new file mode 100644 index 0000000000000000000000000000000000000000..4a4d87cb4207d9307fd2ebebfdf9b1e6f9799d77 --- /dev/null +++ b/src/clap_embedding/Battle cry.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f06286c4fc69426a7fe8685fa6c74a337b97f552ab503009b11d11b70498a45 +size 3243 diff --git a/src/clap_embedding/Bee, wasp, etc..pt b/src/clap_embedding/Bee, wasp, etc..pt new file mode 100644 index 0000000000000000000000000000000000000000..586421d947fdbaf173f539e93d4489e470eb8d15 --- /dev/null +++ b/src/clap_embedding/Bee, wasp, etc..pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eaabbd88980a77e0e1a4293d912c9f21fefe9dc8cc288c75fd4c07020e86ef58 +size 3268 diff --git a/src/clap_embedding/Beep, bleep.pt b/src/clap_embedding/Beep, bleep.pt new file mode 100644 index 0000000000000000000000000000000000000000..972af18c2297a679d1e29b648405b4f00c26ca6a --- /dev/null +++ b/src/clap_embedding/Beep, bleep.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6d6a2021aa05b8f325efdd8e8aa163df1829cc9350b650da63a46284087d142 +size 3248 diff --git a/src/clap_embedding/Bell.pt b/src/clap_embedding/Bell.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0425ba4585daa3f30b7acf6586823f3a88d92c2 --- /dev/null +++ b/src/clap_embedding/Bell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3252fcd0b5f0e000410ac308b5f012167770d0ac0e27f4897945b922e02485c2 +size 3149 diff --git a/src/clap_embedding/Bellow.pt b/src/clap_embedding/Bellow.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd321d37f784dd0b3725c10be3954982a0155108 --- /dev/null +++ b/src/clap_embedding/Bellow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b43dd2c5a59384af15520a845ee7801138265eb28d0050fd96156f096ec822a1 +size 3159 diff --git a/src/clap_embedding/Belly laugh.pt b/src/clap_embedding/Belly laugh.pt new file mode 100644 index 
0000000000000000000000000000000000000000..afcc121b3ddfcc201b70d2714c5329f53cdd62ad --- /dev/null +++ b/src/clap_embedding/Belly laugh.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61b60d96453309b2791d7e17a36e63e4ea98d5e55f687b5083349ac67a1e9cc7 +size 3248 diff --git a/src/clap_embedding/Bicycle bell.pt b/src/clap_embedding/Bicycle bell.pt new file mode 100644 index 0000000000000000000000000000000000000000..9daf23198e41c4f0d7522b5ee80807a56b21c06c --- /dev/null +++ b/src/clap_embedding/Bicycle bell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0a0a337a1b9c19819393b6eab07aace307ad402fec3f9e84f72d6ffd5501e76 +size 3253 diff --git a/src/clap_embedding/Bicycle, tricycle.pt b/src/clap_embedding/Bicycle, tricycle.pt new file mode 100644 index 0000000000000000000000000000000000000000..d819c43a4984a56f081a860e3a138c24fa09007a --- /dev/null +++ b/src/clap_embedding/Bicycle, tricycle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f950ffa51035922839dcebe144e6c395c463288870f719a286064c02adf1f4f +size 3278 diff --git a/src/clap_embedding/Bird flight, flapping wings.pt b/src/clap_embedding/Bird flight, flapping wings.pt new file mode 100644 index 0000000000000000000000000000000000000000..127b57850d70dcd2f9bc5e43f06193062590f694 --- /dev/null +++ b/src/clap_embedding/Bird flight, flapping wings.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41e0c3d59dcfb50e5169cbee4ca8642f9b78ca047c0f3e44d0fa7f2b46bfb320 +size 3392 diff --git a/src/clap_embedding/Bird vocalization, bird call, bird song.pt b/src/clap_embedding/Bird vocalization, bird call, bird song.pt new file mode 100644 index 0000000000000000000000000000000000000000..0c1a61ea9af71ac587b7850de602465f871b7bb3 --- /dev/null +++ b/src/clap_embedding/Bird vocalization, bird call, bird song.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:1445a61059d9164ced63c01abf60de91637b79d26fed015004124158824c04df +size 3452 diff --git a/src/clap_embedding/Bird.pt b/src/clap_embedding/Bird.pt new file mode 100644 index 0000000000000000000000000000000000000000..d15fae876b57cb8f04c61498677556c507082a24 --- /dev/null +++ b/src/clap_embedding/Bird.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c94d39ae43d5c16d35c77a6ddbe500dc7aae6a044a3a10f43d33dcf14da48e91 +size 3149 diff --git a/src/clap_embedding/Biting.pt b/src/clap_embedding/Biting.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae13f8443025dfd5fb38ca01122df2f8bea0e4e3 --- /dev/null +++ b/src/clap_embedding/Biting.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5c42b9c0b3fbfa6c0b43428a0bbbb199706c3d1babd7942d2f45e8a8874106d +size 3159 diff --git a/src/clap_embedding/Bleat.pt b/src/clap_embedding/Bleat.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e051d8dd586b0bbc438587e931998ce7e8a4010 --- /dev/null +++ b/src/clap_embedding/Bleat.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2b9513225200a2f5ddeea1ee54d167c85c9b180af1bbe3766f5ff0fb044ee2c +size 3154 diff --git a/src/clap_embedding/Blender, food processor.pt b/src/clap_embedding/Blender, food processor.pt new file mode 100644 index 0000000000000000000000000000000000000000..ed0cf50ccdd536546c0ee6f9763289b6fd22fb24 --- /dev/null +++ b/src/clap_embedding/Blender, food processor.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b82c822d42ea9ada0d83695e565af6b90a016942f7f02b0bbb27b9f5d755f5c +size 3372 diff --git a/src/clap_embedding/Boat, Water vehicle.pt b/src/clap_embedding/Boat, Water vehicle.pt new file mode 100644 index 0000000000000000000000000000000000000000..ffe709df9f027ae86b47bfde719219ce83feb6cf --- /dev/null +++ b/src/clap_embedding/Boat, Water vehicle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:93194b228ea1887d0a77cc5dc8c23e0351c86820b3846139d9d3f9defa6669b4 +size 3288 diff --git a/src/clap_embedding/Boiling.pt b/src/clap_embedding/Boiling.pt new file mode 100644 index 0000000000000000000000000000000000000000..760e2c6ec54fe0da186f8899c17081982a82243e --- /dev/null +++ b/src/clap_embedding/Boiling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f06e84807fba78dc21e5769267fda2bf42ea72854437fb9c3013c57d3f0ac40 +size 3228 diff --git a/src/clap_embedding/Boing.pt b/src/clap_embedding/Boing.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c17e4832199cef881166304f9e8ec6b8fdfb1e7 --- /dev/null +++ b/src/clap_embedding/Boing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60f5176af0476cbf1b7b043a5d8ae99754c0800bd6fff5be0529c42aec0fbd3d +size 3154 diff --git a/src/clap_embedding/Booing.pt b/src/clap_embedding/Booing.pt new file mode 100644 index 0000000000000000000000000000000000000000..308bc71360e186b7c11afd559599fc11ac63c88b --- /dev/null +++ b/src/clap_embedding/Booing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a31a445f20f03069b105cb05320034f9e900bc4609295fbd1aae968e548a993c +size 3159 diff --git a/src/clap_embedding/Boom.pt b/src/clap_embedding/Boom.pt new file mode 100644 index 0000000000000000000000000000000000000000..818311d2c646a51d6b06fd24e06af4bfe0fac2ad --- /dev/null +++ b/src/clap_embedding/Boom.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:535527eb5fac42bd126a1bdb550f49aefedf6d654a5f31199261ab6b3c16d114 +size 3149 diff --git a/src/clap_embedding/Bouncing.pt b/src/clap_embedding/Bouncing.pt new file mode 100644 index 0000000000000000000000000000000000000000..971c2a1c71214f1482ebdbdbad88ed33f6c46db8 --- /dev/null +++ b/src/clap_embedding/Bouncing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db76f737034a9e63108abedc19814eccafbc9c656f6962a216fd79f111474740 +size 3233 diff --git 
a/src/clap_embedding/Bow-wow.pt b/src/clap_embedding/Bow-wow.pt new file mode 100644 index 0000000000000000000000000000000000000000..6e189d70aa2f8e2f95102f4f563cd6c23a408671 --- /dev/null +++ b/src/clap_embedding/Bow-wow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c25f1f94d2d210ad48f063b9b7abf974784ea02848ce9f28605b700d0aa33c1e +size 3228 diff --git a/src/clap_embedding/Breaking.pt b/src/clap_embedding/Breaking.pt new file mode 100644 index 0000000000000000000000000000000000000000..803daee1ecec6b21b76c2f3642814702da8c7828 --- /dev/null +++ b/src/clap_embedding/Breaking.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0675454a2ea3bc0ee2664d68863825cdefb57fcc9acc218c6b570802292b72ea +size 3233 diff --git a/src/clap_embedding/Breathing.pt b/src/clap_embedding/Breathing.pt new file mode 100644 index 0000000000000000000000000000000000000000..36328bbacfa5c63f8b76359361195d3d66991de7 --- /dev/null +++ b/src/clap_embedding/Breathing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41a7d56c7773c48fb93f849a86897083f82a4f075711943ac98686a9311ea1a7 +size 3238 diff --git a/src/clap_embedding/Brief tone.pt b/src/clap_embedding/Brief tone.pt new file mode 100644 index 0000000000000000000000000000000000000000..372774511b20e61b4acc63a342808241997d4f26 --- /dev/null +++ b/src/clap_embedding/Brief tone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3fb05440b92dae952b5a268900ec7cefbbb4fc11e55c52ac452fb654e1121f3 +size 3243 diff --git a/src/clap_embedding/Burping, eructation.pt b/src/clap_embedding/Burping, eructation.pt new file mode 100644 index 0000000000000000000000000000000000000000..f1d92b6ed57ab8d24e21c73b9242e5b2c5f9d700 --- /dev/null +++ b/src/clap_embedding/Burping, eructation.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ebf11a6352783c2ad00648882773b86d8df69b03f30857646cad7c62e6bb715 +size 3288 diff --git a/src/clap_embedding/Burst, 
pop.pt b/src/clap_embedding/Burst, pop.pt new file mode 100644 index 0000000000000000000000000000000000000000..7910198847d49c98a9767f62666590571ba9e372 --- /dev/null +++ b/src/clap_embedding/Burst, pop.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97105cef7bbe84857ebbf5630159c4d0dfab8f8f6d69ceb25e22bfd3fa9cbd05 +size 3243 diff --git a/src/clap_embedding/Bus.pt b/src/clap_embedding/Bus.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b3ae793df438290b55e2c0bb00d1ab5833cdc51 --- /dev/null +++ b/src/clap_embedding/Bus.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cf24d0cf1e98d7f6b28a9618ee9f22c76ebee68761cd1a4f74c7e66ed28e4f5 +size 3144 diff --git a/src/clap_embedding/Busy signal.pt b/src/clap_embedding/Busy signal.pt new file mode 100644 index 0000000000000000000000000000000000000000..8961cc946389c4675d404d240f305a244883abc6 --- /dev/null +++ b/src/clap_embedding/Busy signal.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2953e128d2814f7d4380193caabf887ac3e6acbdfe56d7cc9744203ff4cd59b0 +size 3248 diff --git a/src/clap_embedding/Buzz.pt b/src/clap_embedding/Buzz.pt new file mode 100644 index 0000000000000000000000000000000000000000..aaa6b39e9e463ff2f69c48811c0857d67bd734c7 --- /dev/null +++ b/src/clap_embedding/Buzz.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8b274b4e394f221b826f5b7aa1273a5c96bfc146ebffe4ad908a343e32f8898 +size 3149 diff --git a/src/clap_embedding/Buzzer.pt b/src/clap_embedding/Buzzer.pt new file mode 100644 index 0000000000000000000000000000000000000000..f9957d8baed453afe5dddb85ac5de839b04c852f --- /dev/null +++ b/src/clap_embedding/Buzzer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0260ad6dbc4fe9b513b15f687227542a7c56417cb0cb9f149f137acdce6d8ba9 +size 3159 diff --git a/src/clap_embedding/Cacophony.pt b/src/clap_embedding/Cacophony.pt new file mode 100644 index 
0000000000000000000000000000000000000000..a1072ac3548d0b672ee400fd8b2a899fabf70c81 --- /dev/null +++ b/src/clap_embedding/Cacophony.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81cd0f66b1f6f68efd894b46db0e0cc095ce654a5e6a169505f0581d03bc0614 +size 3238 diff --git a/src/clap_embedding/Camera.pt b/src/clap_embedding/Camera.pt new file mode 100644 index 0000000000000000000000000000000000000000..341f357c3f539fae71c56bdbcc8a050b9c02b5b9 --- /dev/null +++ b/src/clap_embedding/Camera.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9a4772b66db8fc8bda74f9aaa4144d9511d6325e5a5c753034527a1dd7db71f +size 3159 diff --git a/src/clap_embedding/Canidae, wild dogs, wolves.pt b/src/clap_embedding/Canidae, wild dogs, wolves.pt new file mode 100644 index 0000000000000000000000000000000000000000..9e249ef6c8de98c06968bc2c6615b69c508ec5b9 --- /dev/null +++ b/src/clap_embedding/Canidae, wild dogs, wolves.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a76e382a41fe59d84eb709ec5316b1fa19ab6a7d8413fec005d44facdd2b575d +size 3387 diff --git a/src/clap_embedding/Cap gun.pt b/src/clap_embedding/Cap gun.pt new file mode 100644 index 0000000000000000000000000000000000000000..381077d8428a987e1a3ae8a5d6420949f86d8f45 --- /dev/null +++ b/src/clap_embedding/Cap gun.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2186da90d82b8822ffa43940491a2fc77a9b31ea95d4951d51bdf6cff2ef37e +size 3228 diff --git a/src/clap_embedding/Car alarm.pt b/src/clap_embedding/Car alarm.pt new file mode 100644 index 0000000000000000000000000000000000000000..638ef858aec6e4b58a3bff742c53b03d45f55158 --- /dev/null +++ b/src/clap_embedding/Car alarm.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2db1bb75b31da191496715fd9108c627118e6270dc2694a1252e5c3bbe6c84f +size 3238 diff --git a/src/clap_embedding/Car passing by.pt b/src/clap_embedding/Car passing by.pt new file mode 100644 index 
0000000000000000000000000000000000000000..85876d18eb4c2ab386b5b93cd48f0120c0447ca2 --- /dev/null +++ b/src/clap_embedding/Car passing by.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e1d75d16a68f20d32ff9679ea593c7f66937f167ffed37802ba6ba2cb51327 +size 3263 diff --git a/src/clap_embedding/Car.pt b/src/clap_embedding/Car.pt new file mode 100644 index 0000000000000000000000000000000000000000..d234f4a5ad40f0424dd257738ccb54a8ead66ad2 --- /dev/null +++ b/src/clap_embedding/Car.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d564e63db3a73146ef9ecbec171726c134c24bd921862813d2bd13d7be2252e2 +size 3144 diff --git a/src/clap_embedding/Carbon monoxide detector, CO detector.pt b/src/clap_embedding/Carbon monoxide detector, CO detector.pt new file mode 100644 index 0000000000000000000000000000000000000000..c7e692ce3b65feb888ca97c92c2a6bf4c1c8c2a4 --- /dev/null +++ b/src/clap_embedding/Carbon monoxide detector, CO detector.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5aaad4cc1f8b622302096d9336e05dd95e8192c4382e15202dd2bd95b70a85d +size 3442 diff --git a/src/clap_embedding/Cart.pt b/src/clap_embedding/Cart.pt new file mode 100644 index 0000000000000000000000000000000000000000..398441f21bb4c18e5b6a6f8db8cead171ba3b95c --- /dev/null +++ b/src/clap_embedding/Cart.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16966a1209b58296c44e8c50fd6a6c272d78753c184cd0782f515888dfdc2696 +size 3149 diff --git a/src/clap_embedding/Cash register.pt b/src/clap_embedding/Cash register.pt new file mode 100644 index 0000000000000000000000000000000000000000..4ad6b0440f9a9624b5a28bcf2b9d80080b2196d2 --- /dev/null +++ b/src/clap_embedding/Cash register.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3434a96dc99ce05dabd215bc9d794d608da8694a0c2d362483160f1b8fedd38 +size 3258 diff --git a/src/clap_embedding/Cat.pt b/src/clap_embedding/Cat.pt new file mode 100644 
index 0000000000000000000000000000000000000000..980c3040ccd466c889e61ce9e5c1720c1a11f32b --- /dev/null +++ b/src/clap_embedding/Cat.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4004045e724c6041773174b25504cd19b8014be20e53af60930791bb8f2fee4 +size 3144 diff --git a/src/clap_embedding/Caterwaul.pt b/src/clap_embedding/Caterwaul.pt new file mode 100644 index 0000000000000000000000000000000000000000..38f74b4322961b831269780f86040436ef424f42 --- /dev/null +++ b/src/clap_embedding/Caterwaul.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f71a5270a8e7be89cd487a38bbb0961501df97b9602750240d868815d73dd67 +size 3238 diff --git a/src/clap_embedding/Cattle, bovinae.pt b/src/clap_embedding/Cattle, bovinae.pt new file mode 100644 index 0000000000000000000000000000000000000000..7d2c853a0a04ae372b32b2beac2744a31ff0f1ea --- /dev/null +++ b/src/clap_embedding/Cattle, bovinae.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2e197f3983cbc9711b42cd242f5cff38d308d85557bee117728e8c5028fb776 +size 3268 diff --git a/src/clap_embedding/Caw.pt b/src/clap_embedding/Caw.pt new file mode 100644 index 0000000000000000000000000000000000000000..eaa2cae89d84e0ee583fd2774ac54ec34c8579fc --- /dev/null +++ b/src/clap_embedding/Caw.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9456d5477769582ee341af4cd0830a22b92f916a8aebf490e4913afaf6ab7bb7 +size 3144 diff --git a/src/clap_embedding/Cellphone buzz, vibrating alert.pt b/src/clap_embedding/Cellphone buzz, vibrating alert.pt new file mode 100644 index 0000000000000000000000000000000000000000..81b417a4794aba99ec5fa73be087af34b90f34e5 --- /dev/null +++ b/src/clap_embedding/Cellphone buzz, vibrating alert.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:882daf495397bf9ae954546489ca761bc36f097d8f09055a5bee529906e40bdd +size 3412 diff --git a/src/clap_embedding/Chain.pt b/src/clap_embedding/Chain.pt new file mode 100644 
index 0000000000000000000000000000000000000000..c467330e7f26abd5698c6901ceea569b543a9240 --- /dev/null +++ b/src/clap_embedding/Chain.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f7e7f0023b367ed8370cafb7cbc4032d92f23e9583858d1bd75cb9c3f423551 +size 3154 diff --git a/src/clap_embedding/Chainsaw.pt b/src/clap_embedding/Chainsaw.pt new file mode 100644 index 0000000000000000000000000000000000000000..e81dfe4a63427edba2bf44f69291a5d99060363c --- /dev/null +++ b/src/clap_embedding/Chainsaw.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c5ec951bc3cfab1f7eff9dbeb7d3b28b4e1f55f690599c445a8b5e299769d9e +size 3233 diff --git a/src/clap_embedding/Change ringing (campanology).pt b/src/clap_embedding/Change ringing (campanology).pt new file mode 100644 index 0000000000000000000000000000000000000000..cab9bb361dcc85951938d43317329f1df780a896 --- /dev/null +++ b/src/clap_embedding/Change ringing (campanology).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66653e443b52f18a0a8e8c7619c75ff4b031ab3e0b8f756a18d925571e8f0419 +size 3397 diff --git a/src/clap_embedding/Channel, environment and background.pt b/src/clap_embedding/Channel, environment and background.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8578c40a3cd157bba3620fc85ab240e8f936759 --- /dev/null +++ b/src/clap_embedding/Channel, environment and background.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b87a45a1a807c212b62cf7d0bc4330a9da7bf10bcf73fd846114f0d646d8ec90 +size 3432 diff --git a/src/clap_embedding/Chant.pt b/src/clap_embedding/Chant.pt new file mode 100644 index 0000000000000000000000000000000000000000..d7556930354c5ad101ee3d71c1b3cb9a20b33bbf --- /dev/null +++ b/src/clap_embedding/Chant.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71a46da22e297174dfcf54e0cf808cd5c8036392f249d6a2c8c01882956ba748 +size 3154 diff --git 
a/src/clap_embedding/Cheering.pt b/src/clap_embedding/Cheering.pt new file mode 100644 index 0000000000000000000000000000000000000000..44374169ef5b0fb5ffb44c79c63caa5982888889 --- /dev/null +++ b/src/clap_embedding/Cheering.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4af7fe9c11c364eb07eab39f0ac815c6b346f2cbaac7cd4fc1c4295c364ebae7 +size 3233 diff --git a/src/clap_embedding/Chewing, mastication.pt b/src/clap_embedding/Chewing, mastication.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6b7a3ce3181291981381a6931c157e7e4c96c18 --- /dev/null +++ b/src/clap_embedding/Chewing, mastication.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:410311ae9a09290f0105ed35286dcd21a5efcd714517770cb21a256084385acc +size 3293 diff --git a/src/clap_embedding/Chicken, rooster.pt b/src/clap_embedding/Chicken, rooster.pt new file mode 100644 index 0000000000000000000000000000000000000000..8e383342f4edfd523aab4b150ef7e4321d1cf9a6 --- /dev/null +++ b/src/clap_embedding/Chicken, rooster.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc9edc23ef9017e3e4e1b7a6c7e17d168a288b8d1e6d51eba6a8fdd4bd9aa743 +size 3273 diff --git a/src/clap_embedding/Child singing.pt b/src/clap_embedding/Child singing.pt new file mode 100644 index 0000000000000000000000000000000000000000..b3922caaa40a590c892f15c2f81be51d3f021ae7 --- /dev/null +++ b/src/clap_embedding/Child singing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4eba20123ecb6088f738ce51662c2b0a15f0ed12d5c3d60f6685914f8a7a6a2 +size 3258 diff --git a/src/clap_embedding/Child speech, kid speaking.pt b/src/clap_embedding/Child speech, kid speaking.pt new file mode 100644 index 0000000000000000000000000000000000000000..6c13fc148add8284c3661a18f087505b87e0b912 --- /dev/null +++ b/src/clap_embedding/Child speech, kid speaking.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:028c57d19d3e560143b9fc85ac2624ef35cff37883f323f5b9e92984baa83a4c +size 3387 diff --git a/src/clap_embedding/Children playing.pt b/src/clap_embedding/Children playing.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8dc58156c9dcfdc5932fdc99659bfbe669403dc --- /dev/null +++ b/src/clap_embedding/Children playing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb587309b4dfb010384865d81be89d6ecc7687f7f2446452709dab48e8654f4f +size 3273 diff --git a/src/clap_embedding/Children shouting.pt b/src/clap_embedding/Children shouting.pt new file mode 100644 index 0000000000000000000000000000000000000000..7e85cc68e54d2ef18cbc2ac2402ef53a456fd91f --- /dev/null +++ b/src/clap_embedding/Children shouting.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cf5e6e8b75f38dfea2f6b86a3892fa259614db4cbc864c8074c80d2d82c81b0 +size 3278 diff --git a/src/clap_embedding/Chime.pt b/src/clap_embedding/Chime.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b1a01d7c39d396019fac0a5b27e7be7dcfaae6f --- /dev/null +++ b/src/clap_embedding/Chime.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13549bebdf6a4ee2fe3805240e010f3d311b60d3ef784abe43dfc81cfb6244cf +size 3154 diff --git a/src/clap_embedding/Chipmunk.pt b/src/clap_embedding/Chipmunk.pt new file mode 100644 index 0000000000000000000000000000000000000000..3c691ab7afa33fe0cbf32a5e20421a8f92eee915 --- /dev/null +++ b/src/clap_embedding/Chipmunk.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0f32162236619d57f53f0019dc9af552b20e91d351200e2d666c708c512e8f7 +size 3233 diff --git a/src/clap_embedding/Chirp tone.pt b/src/clap_embedding/Chirp tone.pt new file mode 100644 index 0000000000000000000000000000000000000000..74f94889bf042fd28548d62d40db45e8cfa7c277 --- /dev/null +++ b/src/clap_embedding/Chirp tone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:59faac41c0cd23d339866a054aafac8ef09aa44568af23353e7cb2b0a40af3b6 +size 3243 diff --git a/src/clap_embedding/Chirp, tweet.pt b/src/clap_embedding/Chirp, tweet.pt new file mode 100644 index 0000000000000000000000000000000000000000..ea21422cdc3e084986efa1a5601094ace630cb28 --- /dev/null +++ b/src/clap_embedding/Chirp, tweet.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f0aa4982ae1f43605780e0e7bc8f259de4bb7112e726e12d864fe8abeac7d8 +size 3253 diff --git a/src/clap_embedding/Choir.pt b/src/clap_embedding/Choir.pt new file mode 100644 index 0000000000000000000000000000000000000000..fad576c5626fe4fa116b4af9af6d4cf6a8a14e78 --- /dev/null +++ b/src/clap_embedding/Choir.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49d653add2957fa585fcb8b2c0ab759bf0ed2f799bcf2832ae40aafdc4a9426c +size 3154 diff --git a/src/clap_embedding/Chop.pt b/src/clap_embedding/Chop.pt new file mode 100644 index 0000000000000000000000000000000000000000..ff7c9a3def5c41ceec89c9b84dfc1abc8ea5854c --- /dev/null +++ b/src/clap_embedding/Chop.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a3938979ccc54bc43cfb4720bfbcf90af6dab2eae82a21ff67679e4b69bf83d +size 3149 diff --git a/src/clap_embedding/Chopping (food).pt b/src/clap_embedding/Chopping (food).pt new file mode 100644 index 0000000000000000000000000000000000000000..ce266f756599d061f6c079d531380584db632f19 --- /dev/null +++ b/src/clap_embedding/Chopping (food).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c90f158f10db77e2fffc11cdcede0663b7461527bcda9fd05618c4272b586569 +size 3268 diff --git a/src/clap_embedding/Chorus effect.pt b/src/clap_embedding/Chorus effect.pt new file mode 100644 index 0000000000000000000000000000000000000000..814feb2432a096d08b69e73df0922878046e583a --- /dev/null +++ b/src/clap_embedding/Chorus effect.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:644f9be1c5b6240088f699514cd9dfd4d372e5df881faf6d644d4edf73b53f9b +size 3258 diff --git a/src/clap_embedding/Chuckle, chortle.pt b/src/clap_embedding/Chuckle, chortle.pt new file mode 100644 index 0000000000000000000000000000000000000000..534a5cb0e6100d7eaeb1d69d551b35f20d7f5248 --- /dev/null +++ b/src/clap_embedding/Chuckle, chortle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5099fb5e47d3ca2fe22c7ff306c8f5847ea439c26640a6cf729955a095a32d71 +size 3273 diff --git a/src/clap_embedding/Church bell.pt b/src/clap_embedding/Church bell.pt new file mode 100644 index 0000000000000000000000000000000000000000..8f212df54046f7a3a04910f431684195176dfe93 --- /dev/null +++ b/src/clap_embedding/Church bell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:037fa80197f2087b5400d2a1bfdfc6b206ebec7603bc167721c7300d214573c7 +size 3248 diff --git a/src/clap_embedding/Civil defense siren.pt b/src/clap_embedding/Civil defense siren.pt new file mode 100644 index 0000000000000000000000000000000000000000..2ad0a7cdf6874076cfa826d6d8e6caae8ad9a044 --- /dev/null +++ b/src/clap_embedding/Civil defense siren.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41b6b9685fd12967bffec01ded5c340fac280d7d344aa7e21ee372714c19701d +size 3288 diff --git a/src/clap_embedding/Clang.pt b/src/clap_embedding/Clang.pt new file mode 100644 index 0000000000000000000000000000000000000000..bde29632f5e766114d54a5b47696f3bc80940c66 --- /dev/null +++ b/src/clap_embedding/Clang.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2e6dc1c30f844c22cac08f40320976fb16d051e08aca96cc47ce8c471a504fd +size 3154 diff --git a/src/clap_embedding/Clapping.pt b/src/clap_embedding/Clapping.pt new file mode 100644 index 0000000000000000000000000000000000000000..17029b7621a54fd4cf2e12e74b47b7243276688b --- /dev/null +++ b/src/clap_embedding/Clapping.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:9ec7091b75958ed29949b4b86fee4b38c1301cb55f16efe86b347227c883ae05 +size 3233 diff --git a/src/clap_embedding/Clatter.pt b/src/clap_embedding/Clatter.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe12198020e132a1905d3bc31aca2b39558233ea --- /dev/null +++ b/src/clap_embedding/Clatter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2225da4a7a2037466bb01aea0a066b8dfd96921f636d800b5e3e7baedb2b7993 +size 3228 diff --git a/src/clap_embedding/Clickety-clack.pt b/src/clap_embedding/Clickety-clack.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb43c86da11f47b2b7f89a0653b0a91644427718 --- /dev/null +++ b/src/clap_embedding/Clickety-clack.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebd895905003dc6b960f6229ea164ccab51d2b973550be0b06a9b7748498d92c +size 3263 diff --git a/src/clap_embedding/Clicking.pt b/src/clap_embedding/Clicking.pt new file mode 100644 index 0000000000000000000000000000000000000000..5eb2da0e214c5bdded36b582153b00e18ee85b38 --- /dev/null +++ b/src/clap_embedding/Clicking.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba6faee2c17550aab23afdca590e71347241992ceb4cf414428337a206b9e258 +size 3233 diff --git a/src/clap_embedding/Clip-clop.pt b/src/clap_embedding/Clip-clop.pt new file mode 100644 index 0000000000000000000000000000000000000000..f4399488ce7da84bce4c1c9381e556a31821d237 --- /dev/null +++ b/src/clap_embedding/Clip-clop.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c95dc151b8b94e5d189504882f09ddaa79068866a00c56b24b2889b7bf966b83 +size 3238 diff --git a/src/clap_embedding/Clock.pt b/src/clap_embedding/Clock.pt new file mode 100644 index 0000000000000000000000000000000000000000..3ead5585756bd4615589ee87355d1b7237c12965 --- /dev/null +++ b/src/clap_embedding/Clock.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:3b36602a200f5dd94e476dc52a0700167a9a6f9984761ebc566b8aef0b2170a7 +size 3154 diff --git a/src/clap_embedding/Cluck.pt b/src/clap_embedding/Cluck.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a86785c6abc36f7f998a425e7ecf7b381478f7e --- /dev/null +++ b/src/clap_embedding/Cluck.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe5c44a7a8ae34b839495a3c50aee8a258ce0f11a1306245cdfb158476dfe517 +size 3154 diff --git a/src/clap_embedding/Clunk.pt b/src/clap_embedding/Clunk.pt new file mode 100644 index 0000000000000000000000000000000000000000..b69099a8267b09cc8591bc287e06109953957227 --- /dev/null +++ b/src/clap_embedding/Clunk.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:245843b4b71d02785fa03b590e5e00a869b1588f6740e6ac1c79ad4469f55301 +size 3154 diff --git a/src/clap_embedding/Coin (dropping).pt b/src/clap_embedding/Coin (dropping).pt new file mode 100644 index 0000000000000000000000000000000000000000..76b4f9760a0c25e508c3daf06a95409de267ba98 --- /dev/null +++ b/src/clap_embedding/Coin (dropping).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4b4e901a590dcac872c7ebf9cd7aee1342df5f185024f3c9b94c2ee41923328 +size 3268 diff --git a/src/clap_embedding/Computer keyboard.pt b/src/clap_embedding/Computer keyboard.pt new file mode 100644 index 0000000000000000000000000000000000000000..b39a7cdd9cb4b853780fe8ee2dbfbccb51cfefb8 --- /dev/null +++ b/src/clap_embedding/Computer keyboard.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a28ff845436a09f4578f1b0d8e3ee988d1307858da891a973440ae248d4fd644 +size 3278 diff --git a/src/clap_embedding/Conversation.pt b/src/clap_embedding/Conversation.pt new file mode 100644 index 0000000000000000000000000000000000000000..d58531f86679919980a683a665c85d5d44b26378 --- /dev/null +++ b/src/clap_embedding/Conversation.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0b426ccf93113930cf2f7860e5540411c214f5607a88955e9419b024e59692ba +size 3253 diff --git a/src/clap_embedding/Coo.pt b/src/clap_embedding/Coo.pt new file mode 100644 index 0000000000000000000000000000000000000000..77b14e59d4c225ea916b2acd4c80437b059e05c0 --- /dev/null +++ b/src/clap_embedding/Coo.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57f52758744597ace6fea8aa90642f29b68522b9fc56dc22576c67da4f97a517 +size 3144 diff --git a/src/clap_embedding/Cough.pt b/src/clap_embedding/Cough.pt new file mode 100644 index 0000000000000000000000000000000000000000..5b49dc73a9b5c6b80479a2906049b31541d5d94a --- /dev/null +++ b/src/clap_embedding/Cough.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c1931a79d1ed1397fe5dec9d8bfe0789f875701195cd8d25d6fa536c77b041fd +size 3154 diff --git a/src/clap_embedding/Cowbell.pt b/src/clap_embedding/Cowbell.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc68c467cfd6b4ab7feade335514bbeec1c7996f --- /dev/null +++ b/src/clap_embedding/Cowbell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51916cf4083f9decd98fa84f8598d79d35b7e30f4378e72ad80b625443fe56d6 +size 3228 diff --git a/src/clap_embedding/Crack.pt b/src/clap_embedding/Crack.pt new file mode 100644 index 0000000000000000000000000000000000000000..4da577e73b0b7551539a57131637efdc3bd5b3c2 --- /dev/null +++ b/src/clap_embedding/Crack.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d32d28956560bc8b0c9841545e52fe102edd65e869717c4f16695ed8680ecaf4 +size 3154 diff --git a/src/clap_embedding/Crackle.pt b/src/clap_embedding/Crackle.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c8dfa7e86220cd07fba5386af88c76dc88d9056 --- /dev/null +++ b/src/clap_embedding/Crackle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b685c817aa2420465671b673a0fcdbea122911e0d43dda1041506b3b5d98a8fb +size 3228 diff --git 
a/src/clap_embedding/Creak.pt b/src/clap_embedding/Creak.pt new file mode 100644 index 0000000000000000000000000000000000000000..270b1b23768ec3549bbd379be3bde059b4f8e45a --- /dev/null +++ b/src/clap_embedding/Creak.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:809c351f7aec873ccf067b1dd4dc0714f69ea190204052a55b5d71d8f3650bc0 +size 3154 diff --git a/src/clap_embedding/Cricket.pt b/src/clap_embedding/Cricket.pt new file mode 100644 index 0000000000000000000000000000000000000000..8df92856e82cd5a51b621d077a1e3d4a71cb0b5b --- /dev/null +++ b/src/clap_embedding/Cricket.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15a1d23dd970134167297b0b7147ec545ab2d580a793c96c0812232acec46314 +size 3228 diff --git a/src/clap_embedding/Croak.pt b/src/clap_embedding/Croak.pt new file mode 100644 index 0000000000000000000000000000000000000000..498558d318dc117b198288f0d9b2ba098eb095cd --- /dev/null +++ b/src/clap_embedding/Croak.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b4867ee83342bf7118a01bca1d2a0fedada0eddf7b9560bf9de1350257b49b6 +size 3154 diff --git a/src/clap_embedding/Crockery breaking and smashing.pt b/src/clap_embedding/Crockery breaking and smashing.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d4ef1f7a48f8c6b86e9116508943d5447695e00 --- /dev/null +++ b/src/clap_embedding/Crockery breaking and smashing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb10aa028d6f971980accbb5ab74b1ec1ced3f8ecb18e045664bbf28600d540f +size 3407 diff --git a/src/clap_embedding/Crow.pt b/src/clap_embedding/Crow.pt new file mode 100644 index 0000000000000000000000000000000000000000..2db0839bf0cec85f26adbeb1b36f2a37491d44b5 --- /dev/null +++ b/src/clap_embedding/Crow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08d0e467119a1b93b503acdd82216e7c2d4685ac515ddb1025af619f0e67b767 +size 3149 diff --git a/src/clap_embedding/Crowd.pt 
b/src/clap_embedding/Crowd.pt new file mode 100644 index 0000000000000000000000000000000000000000..e03bfc92aaffc0257f08bd84e5881b3a2a64a2da --- /dev/null +++ b/src/clap_embedding/Crowd.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d05b494d5b2105d8e7bec7d20bb2bd35343c30ba6ac599a65fe6c08ca8e90a0 +size 3154 diff --git a/src/clap_embedding/Crowing, cock-a-doodle-doo.pt b/src/clap_embedding/Crowing, cock-a-doodle-doo.pt new file mode 100644 index 0000000000000000000000000000000000000000..d564ba3053bb381f6152505e7ec58a0978445f01 --- /dev/null +++ b/src/clap_embedding/Crowing, cock-a-doodle-doo.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e60878e89903fc44b4c592ec6734e456438c16216517f4a8f9246718922d60b2 +size 3387 diff --git a/src/clap_embedding/Crumpling, crinkling.pt b/src/clap_embedding/Crumpling, crinkling.pt new file mode 100644 index 0000000000000000000000000000000000000000..2c110a8b48ff12a6d381cd67797b8a0fe4ada412 --- /dev/null +++ b/src/clap_embedding/Crumpling, crinkling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:558b2e44192e7048c686735a828eec1eda791a39ce7e9d636f5a0032bc4c36cb +size 3293 diff --git a/src/clap_embedding/Crunch.pt b/src/clap_embedding/Crunch.pt new file mode 100644 index 0000000000000000000000000000000000000000..f31318e92bb8354e55ee3f11383125aa8285cecc --- /dev/null +++ b/src/clap_embedding/Crunch.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e74beaaa71f8c10fa8212e7c4aa2dbcffba2487c3ae821c8201f771b733edca6 +size 3159 diff --git a/src/clap_embedding/Crushing.pt b/src/clap_embedding/Crushing.pt new file mode 100644 index 0000000000000000000000000000000000000000..516ce9dd112d68aa6204a242f217944dbc17c6f8 --- /dev/null +++ b/src/clap_embedding/Crushing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd4c5a8d6e7ad09603929dd724fde2cb313438936d461a5759d25652d74c6931 +size 3233 diff --git 
a/src/clap_embedding/Crying, sobbing.pt b/src/clap_embedding/Crying, sobbing.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cfca1334185936a2d20703e36f19bbfa21fee91 --- /dev/null +++ b/src/clap_embedding/Crying, sobbing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3df029a15951521d2273904f9b585eab73ed5ef3ca9517786129113765b5cb41 +size 3268 diff --git a/src/clap_embedding/Cupboard open or close.pt b/src/clap_embedding/Cupboard open or close.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb82db0aba2fd52972412a86aced2baf05ed8f57 --- /dev/null +++ b/src/clap_embedding/Cupboard open or close.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d20a7c25470c3066d5b06cf849bfe5d56c55c1fd06d8982a25ec37b269783af +size 3367 diff --git a/src/clap_embedding/Cutlery, silverware.pt b/src/clap_embedding/Cutlery, silverware.pt new file mode 100644 index 0000000000000000000000000000000000000000..95131ee44d5c04a8ffa3bfd9ea622a66ddbd17ea --- /dev/null +++ b/src/clap_embedding/Cutlery, silverware.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49cbf43dd3f0c5ff7e7afa9dd7a52162d1108ed188c287ba3449424723fcd683 +size 3288 diff --git a/src/clap_embedding/Deformable shell.pt b/src/clap_embedding/Deformable shell.pt new file mode 100644 index 0000000000000000000000000000000000000000..e384c3e7db152829ee0a1d27ec582c07fd0c38b1 --- /dev/null +++ b/src/clap_embedding/Deformable shell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcc1bf7e90f248ed0a87f5ad20136cfc146dec3b96d36c958b41346fdfdcbd6 +size 3273 diff --git a/src/clap_embedding/Dental drill, dentist's drill.pt b/src/clap_embedding/Dental drill, dentist's drill.pt new file mode 100644 index 0000000000000000000000000000000000000000..33c0058ec75d62122d704e4f4a816c0cad255f26 --- /dev/null +++ b/src/clap_embedding/Dental drill, dentist's drill.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:31ab45a47d0399ff888b5fae2a44a2f75042e63e27cfcb9594fb6cad548e1a4c +size 3402 diff --git a/src/clap_embedding/Dial tone.pt b/src/clap_embedding/Dial tone.pt new file mode 100644 index 0000000000000000000000000000000000000000..e191d7379be508865193a8252ccd52bd18e04aac --- /dev/null +++ b/src/clap_embedding/Dial tone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3baa411052a19eb16ee649e2cff407ab06c0582be4be8922c9f8c783efd1bfd9 +size 3238 diff --git a/src/clap_embedding/Digestive.pt b/src/clap_embedding/Digestive.pt new file mode 100644 index 0000000000000000000000000000000000000000..7392b7904839185cc99d01c7e9177d1c4c92e4b0 --- /dev/null +++ b/src/clap_embedding/Digestive.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4feaaa02ad5a40271670f5bde5e1980da3a1277a9f070afbd0f6e720a4ce7718 +size 3238 diff --git a/src/clap_embedding/Ding-dong.pt b/src/clap_embedding/Ding-dong.pt new file mode 100644 index 0000000000000000000000000000000000000000..2085b09478eb5c6d6475551b6c414dd9b4c61603 --- /dev/null +++ b/src/clap_embedding/Ding-dong.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61eae4912c684256d380505bc7e3f483cdb3dd987519517dbcaf1ef8a773fca1 +size 3238 diff --git a/src/clap_embedding/Ding.pt b/src/clap_embedding/Ding.pt new file mode 100644 index 0000000000000000000000000000000000000000..b8bd4b658f4c66c2e2bece29b65e48b92d0974a2 --- /dev/null +++ b/src/clap_embedding/Ding.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bf55ad66963c4480b65d3e580ba4fc1e4c9aff084c8b5279f3e306b828b876a +size 3149 diff --git a/src/clap_embedding/Dishes, pots, and pans.pt b/src/clap_embedding/Dishes, pots, and pans.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e4c56a922cd50da3eee654ca45c8b6afdb82fc7 --- /dev/null +++ b/src/clap_embedding/Dishes, pots, and pans.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3b2f99c832eed8d528b6f6f41bda03c78ddbeb9d5b31aa5a5a30cf5ed036e9fa +size 3367 diff --git a/src/clap_embedding/Distortion.pt b/src/clap_embedding/Distortion.pt new file mode 100644 index 0000000000000000000000000000000000000000..c305062848c949834f5ebacb8bac7efd5c4f3239 --- /dev/null +++ b/src/clap_embedding/Distortion.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b0daf6e1093492e482349fcc132ff17c384eec042a8268d3ffe5d8769874558 +size 3243 diff --git a/src/clap_embedding/Dog.pt b/src/clap_embedding/Dog.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6910cfd918585ba790f3c4128d27f259b9ceba8 --- /dev/null +++ b/src/clap_embedding/Dog.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44645771ca5050f4d3727994d3fa963a931b5764a5205e2518512f6d0d8722f4 +size 3144 diff --git a/src/clap_embedding/Domestic animals, pets.pt b/src/clap_embedding/Domestic animals, pets.pt new file mode 100644 index 0000000000000000000000000000000000000000..fa4d58f9ff04308cac8cb83b910bcd88305e01fb --- /dev/null +++ b/src/clap_embedding/Domestic animals, pets.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e2765201f22e0e97bcd8f2d0abc4273f9c98464bf7da9cf5b794b4288195953 +size 3367 diff --git a/src/clap_embedding/Dong, bong.pt b/src/clap_embedding/Dong, bong.pt new file mode 100644 index 0000000000000000000000000000000000000000..5488666884911c31a0f57aa88be7cbf1399f9708 --- /dev/null +++ b/src/clap_embedding/Dong, bong.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b68f5e0e6fe73de5664f9caa00c2a17d3bd9ba3dc36854ff8f7d89cf79f7330 +size 3243 diff --git a/src/clap_embedding/Donkey, ass.pt b/src/clap_embedding/Donkey, ass.pt new file mode 100644 index 0000000000000000000000000000000000000000..9151755866adaddb32d66fb54e4ea7fee2668cfa --- /dev/null +++ b/src/clap_embedding/Donkey, ass.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:86bed3af42efe5a3e7fa5d9c938892ea70af7cae298cfef1afa3e8bccc58d009 +size 3248 diff --git a/src/clap_embedding/Door.pt b/src/clap_embedding/Door.pt new file mode 100644 index 0000000000000000000000000000000000000000..e37fcc27fadd033738bddc6afd38a57de705962f --- /dev/null +++ b/src/clap_embedding/Door.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78518e738acab3d7b31639fe458c94de72cd18d94da5c6615cf085b507d5c1a3 +size 3149 diff --git a/src/clap_embedding/Doorbell.pt b/src/clap_embedding/Doorbell.pt new file mode 100644 index 0000000000000000000000000000000000000000..821869673d1e9b4967d4ccefe18c40da0e45947b --- /dev/null +++ b/src/clap_embedding/Doorbell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15c4c779e0fbee3246dd11f7237013ef3c30778a64898e272c1e44809dbae65c +size 3233 diff --git a/src/clap_embedding/Drawer open or close.pt b/src/clap_embedding/Drawer open or close.pt new file mode 100644 index 0000000000000000000000000000000000000000..09dd1489027ac837e6e986651b1716b7f6d1e6c5 --- /dev/null +++ b/src/clap_embedding/Drawer open or close.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:deeb35b24ecaf5b5247cdec8d05cac19b204cd1ccb4e1f4f0ba4264d8d1b7006 +size 3293 diff --git a/src/clap_embedding/Drill.pt b/src/clap_embedding/Drill.pt new file mode 100644 index 0000000000000000000000000000000000000000..1322f43546703ab8012eeab4f85655a291f3c48d --- /dev/null +++ b/src/clap_embedding/Drill.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e624f79a41f31ee5288a0e8cf119107d158c90ae793a35e117c7f23596eccb18 +size 3154 diff --git a/src/clap_embedding/Drip.pt b/src/clap_embedding/Drip.pt new file mode 100644 index 0000000000000000000000000000000000000000..e95822fc79276c9c014ac2ccf9daf5b68674378d --- /dev/null +++ b/src/clap_embedding/Drip.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:56d0ac6beef807ff8b972532e00a94905e804229137fe24828b46b816668f074 +size 3149 diff --git a/src/clap_embedding/Duck call (hunting tool).pt b/src/clap_embedding/Duck call (hunting tool).pt new file mode 100644 index 0000000000000000000000000000000000000000..41a84bb23ca24251bc6eeaaeabb27c0807984626 --- /dev/null +++ b/src/clap_embedding/Duck call (hunting tool).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e043d0f9d9481492c2e6a47c0b46f8cf326d6a96c33d0d2712b3f5810b4afee +size 3377 diff --git a/src/clap_embedding/Ducks, geese, waterfowl.pt b/src/clap_embedding/Ducks, geese, waterfowl.pt new file mode 100644 index 0000000000000000000000000000000000000000..7152ebd1f279d81014582246c0298975e15067db --- /dev/null +++ b/src/clap_embedding/Ducks, geese, waterfowl.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:363e24846fd3ac8a0e0e1fe464e84a5a44b5bdc553eae2debc73faae23632b1a +size 3372 diff --git a/src/clap_embedding/Echo.pt b/src/clap_embedding/Echo.pt new file mode 100644 index 0000000000000000000000000000000000000000..21946bd731440dd38c499a399a71b1a346ee9c7a --- /dev/null +++ b/src/clap_embedding/Echo.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9e57880091134e4f157da6eeda8ebf8aab55e444a2030a54d8414faedc85629 +size 3149 diff --git a/src/clap_embedding/Effects unit.pt b/src/clap_embedding/Effects unit.pt new file mode 100644 index 0000000000000000000000000000000000000000..593b3c87c892d9fa2d56052b98aa8a56ea9bc0e7 --- /dev/null +++ b/src/clap_embedding/Effects unit.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:424581a945971f55354cabbf3b5c119e613ea422f278a295f2fd146bd722423e +size 3253 diff --git a/src/clap_embedding/Electric rotor drone, quadcopter.pt b/src/clap_embedding/Electric rotor drone, quadcopter.pt new file mode 100644 index 0000000000000000000000000000000000000000..d966630d0aab3bcd9eface8533f512a6c58cc9cb --- /dev/null +++ 
b/src/clap_embedding/Electric rotor drone, quadcopter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2fd3b1412fdd5e3782bc5fc441adaaf69aff8d67ec9e6d4c938fb6195dabafa +size 3417 diff --git a/src/clap_embedding/Electric shaver, electric razor.pt b/src/clap_embedding/Electric shaver, electric razor.pt new file mode 100644 index 0000000000000000000000000000000000000000..8a5185cf14b862f1ec1b733466b06469d3bd885f --- /dev/null +++ b/src/clap_embedding/Electric shaver, electric razor.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f844023f21dc1c0974054c354b997aaa97afcddb5f57822dabb5232429d8082f +size 3412 diff --git a/src/clap_embedding/Electric toothbrush.pt b/src/clap_embedding/Electric toothbrush.pt new file mode 100644 index 0000000000000000000000000000000000000000..83e355ff856ab64d409f9cb48a17a9e328d9b75a --- /dev/null +++ b/src/clap_embedding/Electric toothbrush.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91143b975e7f10133b6325327f5615895b7f824e0f81049b3a9a8856ed32a3c6 +size 3288 diff --git a/src/clap_embedding/Electronic tuner.pt b/src/clap_embedding/Electronic tuner.pt new file mode 100644 index 0000000000000000000000000000000000000000..7782f6417a4476196e862cc6f56b39bbb0d8922c --- /dev/null +++ b/src/clap_embedding/Electronic tuner.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2d376cda861152f00b8352485d4d6e02a4e9c0ded5165b1f2df71f670e1c35f +size 3273 diff --git a/src/clap_embedding/Emergency vehicle.pt b/src/clap_embedding/Emergency vehicle.pt new file mode 100644 index 0000000000000000000000000000000000000000..5f9b979e08a4c72159de8a0e7e7ddbb9a73737f5 --- /dev/null +++ b/src/clap_embedding/Emergency vehicle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3fb7b027875c7d319c3d19eb7534ee1d1f42488b39890f02ae98df3e0c04928 +size 3278 diff --git a/src/clap_embedding/Engine knocking.pt b/src/clap_embedding/Engine knocking.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..af5a317fe85a9de6b3d3b426ccb7ec3cd110c9b2 --- /dev/null +++ b/src/clap_embedding/Engine knocking.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1341a03e80375ffde8d054ad17d928c0b682370a22f24f5d7e7e18320194ca9 +size 3268 diff --git a/src/clap_embedding/Engine starting.pt b/src/clap_embedding/Engine starting.pt new file mode 100644 index 0000000000000000000000000000000000000000..576384bd86d8832f0291c6a44c8bcc3d9b3ddced --- /dev/null +++ b/src/clap_embedding/Engine starting.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a5b02803de8aa39d737d42996aa9c5f253e79114383531b33036525ac36732a +size 3268 diff --git a/src/clap_embedding/Engine.pt b/src/clap_embedding/Engine.pt new file mode 100644 index 0000000000000000000000000000000000000000..48c5b5c7c2660d929e80212c2a6751447428da51 --- /dev/null +++ b/src/clap_embedding/Engine.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d272173f19bc0fc43de6bf3cb94e061eb143ea4b0fbfe3c2326c72e36097a43f +size 3159 diff --git a/src/clap_embedding/Environmental noise.pt b/src/clap_embedding/Environmental noise.pt new file mode 100644 index 0000000000000000000000000000000000000000..49ecfbf2f437b36c519ec5b968a23f57ce8def44 --- /dev/null +++ b/src/clap_embedding/Environmental noise.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bb05f1aec9af1c25da2280f4755bef1f0a06ff6431a8a9862a91aceb83697a8 +size 3288 diff --git a/src/clap_embedding/Error signal.pt b/src/clap_embedding/Error signal.pt new file mode 100644 index 0000000000000000000000000000000000000000..304972f11cc6d1290838e88302168a4004edbccf --- /dev/null +++ b/src/clap_embedding/Error signal.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96fed657f5b5e5eff6580a806bc7cc2cf3d6b078185ed6335a4c1ff26b1e0eec +size 3253 diff --git a/src/clap_embedding/Eruption.pt b/src/clap_embedding/Eruption.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..cb117c9d64a1cf5e8a7d6aa0873c979a5edaf909 --- /dev/null +++ b/src/clap_embedding/Eruption.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e51f19349aff6b99a6d9f65fa0242f1901e30fdb8793fa7174b4af9db52c2524 +size 3233 diff --git a/src/clap_embedding/Explosion.pt b/src/clap_embedding/Explosion.pt new file mode 100644 index 0000000000000000000000000000000000000000..6553651dcf27bfc08c36c30a84b6cc738c7e4677 --- /dev/null +++ b/src/clap_embedding/Explosion.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a933311e9ebaad832bfb09ed8cb1774181d2772decff293b568d5eddd790e1e5 +size 3238 diff --git a/src/clap_embedding/Fart.pt b/src/clap_embedding/Fart.pt new file mode 100644 index 0000000000000000000000000000000000000000..f48c60ff6e54077914e17b15e62c494fa7e7b8bf --- /dev/null +++ b/src/clap_embedding/Fart.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0bc11a918983a338f9c5ae9e71edddc13bd195ed205ddb71db3753ac8c11a21 +size 3149 diff --git a/src/clap_embedding/Female singing.pt b/src/clap_embedding/Female singing.pt new file mode 100644 index 0000000000000000000000000000000000000000..19787f53508d21e820bc77485869486c5aabbd56 --- /dev/null +++ b/src/clap_embedding/Female singing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7da8151509735233c72c5342f1860603cd548f1262f693bee794191d2c7be1cd +size 3263 diff --git a/src/clap_embedding/Female speech, woman speaking.pt b/src/clap_embedding/Female speech, woman speaking.pt new file mode 100644 index 0000000000000000000000000000000000000000..112e0cae780a8c81c82afc15b50a6ec7581d0ee0 --- /dev/null +++ b/src/clap_embedding/Female speech, woman speaking.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3607714797f1b44d4974ed4833a2d3316c686d10ba74f6259110be3b5669de76 +size 3402 diff --git a/src/clap_embedding/Filing (rasp).pt b/src/clap_embedding/Filing 
(rasp).pt new file mode 100644 index 0000000000000000000000000000000000000000..66ce5998b7103f22d17947873d654c6afe5f6f08 --- /dev/null +++ b/src/clap_embedding/Filing (rasp).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23cbf7ba6e3af0e6ff88677937b622dc02d781695f1b633d50de6971fc01ef60 +size 3258 diff --git a/src/clap_embedding/Fill (with liquid).pt b/src/clap_embedding/Fill (with liquid).pt new file mode 100644 index 0000000000000000000000000000000000000000..d539330a4a17ccdc678b335f03f270e37519f55f --- /dev/null +++ b/src/clap_embedding/Fill (with liquid).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4bd9c97cb02d0e5a5b99a82ac3fac6e2a5de61b067d5f68488d7f94b4ef312a +size 3283 diff --git a/src/clap_embedding/Finger snapping.pt b/src/clap_embedding/Finger snapping.pt new file mode 100644 index 0000000000000000000000000000000000000000..3edea43fdfda6c9118cbc850c485c24aa859d478 --- /dev/null +++ b/src/clap_embedding/Finger snapping.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11364348e2a6855e6a6b96a09a288a0719e46dbf55718cfc3672e66f0e664d15 +size 3268 diff --git a/src/clap_embedding/Fire alarm.pt b/src/clap_embedding/Fire alarm.pt new file mode 100644 index 0000000000000000000000000000000000000000..7b982fa0541aabf945d6e8d66b4d9ead14a3c43f --- /dev/null +++ b/src/clap_embedding/Fire alarm.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a872b27c27bb40199f215b353d2b398ad5eeeb86323c3692afc09d687bb632b +size 3243 diff --git a/src/clap_embedding/Fire engine, fire truck (siren).pt b/src/clap_embedding/Fire engine, fire truck (siren).pt new file mode 100644 index 0000000000000000000000000000000000000000..553dcd09f1738b4b0741e9e8669b79371a178fd4 --- /dev/null +++ b/src/clap_embedding/Fire engine, fire truck (siren).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:158155bb074942ec953b10d204ffb473398b418a7db9a3f9669d644d1b309209 +size 3412 diff 
--git a/src/clap_embedding/Fire.pt b/src/clap_embedding/Fire.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef607fa025ce4123d939e916662957ef5b6a9269 --- /dev/null +++ b/src/clap_embedding/Fire.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a1b263ec0fe27b364080ca99f2e5462948fd1dcb6e4f51bec8bb7c43e1bbde8 +size 3149 diff --git a/src/clap_embedding/Firecracker.pt b/src/clap_embedding/Firecracker.pt new file mode 100644 index 0000000000000000000000000000000000000000..c8a7e790486ac52297b50d26edcd1c2683b43c10 --- /dev/null +++ b/src/clap_embedding/Firecracker.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b471ed31c7a42b9ebcff7f7bd489dfc28eddce2d34c6e09f975bda452186775a +size 3248 diff --git a/src/clap_embedding/Fireworks.pt b/src/clap_embedding/Fireworks.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d32bcad996daa99895fd41d6410295148e00684 --- /dev/null +++ b/src/clap_embedding/Fireworks.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:141174b1331c2ffbbe977d86884948bc05b62f4e5f15348e1e15e8148cf59ef8 +size 3238 diff --git a/src/clap_embedding/Fixed-wing aircraft, airplane.pt b/src/clap_embedding/Fixed-wing aircraft, airplane.pt new file mode 100644 index 0000000000000000000000000000000000000000..f5fc90af4a47db53638106e7f04d9681a9253085 --- /dev/null +++ b/src/clap_embedding/Fixed-wing aircraft, airplane.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b2832bbc6bafca25662a0953612cc9bc1ff3fdf89bb3f512fbf659b1d3ade64 +size 3402 diff --git a/src/clap_embedding/Fizz.pt b/src/clap_embedding/Fizz.pt new file mode 100644 index 0000000000000000000000000000000000000000..d1533fc948279e9cc49c248161cb4fbe51fe2b4e --- /dev/null +++ b/src/clap_embedding/Fizz.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93706cdf43c4a726bd62652d274fd2fcb314f0b0ff3cd3159d90dcf4a59c3b63 +size 3149 diff --git 
a/src/clap_embedding/Flap.pt b/src/clap_embedding/Flap.pt new file mode 100644 index 0000000000000000000000000000000000000000..8fc957ab21ed20ef17756ad5934d617a640c6111 --- /dev/null +++ b/src/clap_embedding/Flap.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38fc77e0de9201dbdbcd005f85fe63c0c1f1ab5558dbb76ebd0442a512447a26 +size 3149 diff --git a/src/clap_embedding/Fly, housefly.pt b/src/clap_embedding/Fly, housefly.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2cdf24da6112ea4a37657132c15eaa79847b624 --- /dev/null +++ b/src/clap_embedding/Fly, housefly.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f490766c6cd2d73b2da3d02fd81b8fb813c6d7c43736df3de8c13e014c1a637 +size 3258 diff --git a/src/clap_embedding/Foghorn.pt b/src/clap_embedding/Foghorn.pt new file mode 100644 index 0000000000000000000000000000000000000000..a46ce5e2058c21e8eaf79f423a945d5eca466ef6 --- /dev/null +++ b/src/clap_embedding/Foghorn.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de9fb5563c7d646124427116c1fac7b859ed720556167b3012c2eb2817c5b0cc +size 3228 diff --git a/src/clap_embedding/Fowl.pt b/src/clap_embedding/Fowl.pt new file mode 100644 index 0000000000000000000000000000000000000000..34ec6011df1f31085182e259ad9117edeb1a1609 --- /dev/null +++ b/src/clap_embedding/Fowl.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04a8c497405869f9bd82d4f5334090c9209d758398b49315fd200afdc22c7563 +size 3149 diff --git a/src/clap_embedding/Frog.pt b/src/clap_embedding/Frog.pt new file mode 100644 index 0000000000000000000000000000000000000000..a3a3c0d84842da07a412138d73a85a737b2ac4e4 --- /dev/null +++ b/src/clap_embedding/Frog.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92476e86c231ceaee51a16ccf44e91b2a4b69d4c1cc2fe8f6ff14b360469f570 +size 3149 diff --git a/src/clap_embedding/Frying (food).pt b/src/clap_embedding/Frying (food).pt new file mode 
100644 index 0000000000000000000000000000000000000000..e07f50f408f0a56e72e1d780a43d28cb42dce138 --- /dev/null +++ b/src/clap_embedding/Frying (food).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3fa0bdf156a693b15c29f9ba35337b3592938479e1d4953c1dd55816f3b24aa +size 3258 diff --git a/src/clap_embedding/Fusillade.pt b/src/clap_embedding/Fusillade.pt new file mode 100644 index 0000000000000000000000000000000000000000..39d8d19fc2b28f80ab854d1f8074c185b95ec7de --- /dev/null +++ b/src/clap_embedding/Fusillade.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c49381628834c219b7241f8f3d419a439e835ab2204a8ad1c869433b18a8a844 +size 3238 diff --git a/src/clap_embedding/Gargling.pt b/src/clap_embedding/Gargling.pt new file mode 100644 index 0000000000000000000000000000000000000000..a839ffac0034340e5673c27ae34fd7640afcde6b --- /dev/null +++ b/src/clap_embedding/Gargling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a51e9a04c1835744eb66c542f19cd9138f5b281a977ad900d5c49d494128d41 +size 3233 diff --git a/src/clap_embedding/Gasp.pt b/src/clap_embedding/Gasp.pt new file mode 100644 index 0000000000000000000000000000000000000000..1687e4b82a24fa4bd8a5b720272d65549c9f81cd --- /dev/null +++ b/src/clap_embedding/Gasp.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd87421b230ef9415052606fdb2718071d2b0d6816510fc13b3a2af119c91518 +size 3149 diff --git a/src/clap_embedding/Gears.pt b/src/clap_embedding/Gears.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab2108b734bbf2f49f2fbeab32e0382b7158f3dd --- /dev/null +++ b/src/clap_embedding/Gears.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be893a4c2b0fa8c4e53322a8e57b225ad01242748f88660741cc831298ed0ef +size 3154 diff --git a/src/clap_embedding/Generic impact sounds.pt b/src/clap_embedding/Generic impact sounds.pt new file mode 100644 index 
0000000000000000000000000000000000000000..cf17ff113a04f7e6be56151072770a28b3f29d57 --- /dev/null +++ b/src/clap_embedding/Generic impact sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d8380cf2d15df36a48c16f8a117d17aac924b7b81c1d466e79ee4b1d49fc9c +size 3298 diff --git a/src/clap_embedding/Giggle.pt b/src/clap_embedding/Giggle.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8bae9e3affd15c07371a306f66aaccddc73e1cc --- /dev/null +++ b/src/clap_embedding/Giggle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1102b80121f48016ff78a13830cc0db4e21a52d3782c7863e63b76bcbf724a +size 3159 diff --git a/src/clap_embedding/Glass chink, clink.pt b/src/clap_embedding/Glass chink, clink.pt new file mode 100644 index 0000000000000000000000000000000000000000..1c5766295439ffac08c7f94f85503992f873460c --- /dev/null +++ b/src/clap_embedding/Glass chink, clink.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53545b821904f4cb5d3abb3bd1ba4dd0b63785546ccda9e6e83f88c4d0acd0d1 +size 3283 diff --git a/src/clap_embedding/Glass shatter.pt b/src/clap_embedding/Glass shatter.pt new file mode 100644 index 0000000000000000000000000000000000000000..6fe3eedcc78fe8e371053acdb42a0a061c32d2f7 --- /dev/null +++ b/src/clap_embedding/Glass shatter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:977edb04ae17c61e62a5ad11be8ca613a5e4d4a8ccf26d8b2afb6d54d479e5c5 +size 3258 diff --git a/src/clap_embedding/Glass.pt b/src/clap_embedding/Glass.pt new file mode 100644 index 0000000000000000000000000000000000000000..d8698509b348f42245d66eb463b23f8c80a00ff5 --- /dev/null +++ b/src/clap_embedding/Glass.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7872ccb61d2ba173e74695a6f43bc556287a3e0d1fee5d66c0e7bdbcf9e6671e +size 3154 diff --git a/src/clap_embedding/Goat.pt b/src/clap_embedding/Goat.pt new file mode 100644 index 
0000000000000000000000000000000000000000..d87e1aa635222f77f070fcdf1de6d57cf7be23fa --- /dev/null +++ b/src/clap_embedding/Goat.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5578611f1c9b9ec6bdb04ae529ccf1c241485e8b3ed5de935b7415b229320f3 +size 3149 diff --git a/src/clap_embedding/Gobble.pt b/src/clap_embedding/Gobble.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0603f248b12d8ff5276ae2086c8e084c26edaa9 --- /dev/null +++ b/src/clap_embedding/Gobble.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:746f50f2a7b85379908ae0f7c80f3a1257655726dd52de42c723c67b66a664a5 +size 3159 diff --git a/src/clap_embedding/Grind.pt b/src/clap_embedding/Grind.pt new file mode 100644 index 0000000000000000000000000000000000000000..3d6b9bafe0d316e04e0e7ee3da3643568bf697f5 --- /dev/null +++ b/src/clap_embedding/Grind.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4447f9819e8195a35563d71883d0bc02e66d275ad8bb3dc841c44b190384c54e +size 3154 diff --git a/src/clap_embedding/Groan.pt b/src/clap_embedding/Groan.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4626bf55d4917cd21d45cd8b9fade5d32a7bf63 --- /dev/null +++ b/src/clap_embedding/Groan.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97b64b1ff9e7e34f92e30ca995ea1feb3b54f64faedcde371637ff024eb8746e +size 3154 diff --git a/src/clap_embedding/Growling.pt b/src/clap_embedding/Growling.pt new file mode 100644 index 0000000000000000000000000000000000000000..8dc53c666f4063a6ccd3d47889c24c8f77c3d315 --- /dev/null +++ b/src/clap_embedding/Growling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1902b3bd1f766501c67b6526fb777a9a01fffbe4ebc72823e80ac8b51cb9d872 +size 3233 diff --git a/src/clap_embedding/Grunt.pt b/src/clap_embedding/Grunt.pt new file mode 100644 index 0000000000000000000000000000000000000000..de78c46506c1c72ed598c6d757c2ff02878d61be --- /dev/null +++ 
b/src/clap_embedding/Grunt.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519c1be6ab6cf16d353d6004d56d6828daf38a9dc73c1d504e4962af60ab6ec2 +size 3154 diff --git a/src/clap_embedding/Gull, seagull.pt b/src/clap_embedding/Gull, seagull.pt new file mode 100644 index 0000000000000000000000000000000000000000..db054e1bf112f1e552f7fb3cc91b57245a09962d --- /dev/null +++ b/src/clap_embedding/Gull, seagull.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8e28e423042e6ae178dfe86045930c48edea328ba09071956d4fcb234e77513 +size 3258 diff --git a/src/clap_embedding/Gunshot, gunfire.pt b/src/clap_embedding/Gunshot, gunfire.pt new file mode 100644 index 0000000000000000000000000000000000000000..945cbe66c279592f698e032944f90d9add885cc1 --- /dev/null +++ b/src/clap_embedding/Gunshot, gunfire.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c1fe2fc294c1b810c6bc84bcf996247d803d60c6434e06aa1d3d49953efd86a +size 3273 diff --git a/src/clap_embedding/Gurgling, bubbling.pt b/src/clap_embedding/Gurgling, bubbling.pt new file mode 100644 index 0000000000000000000000000000000000000000..f17fea65cdf29e85c6cdf122986abbac29f57097 --- /dev/null +++ b/src/clap_embedding/Gurgling, bubbling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f83bf162e3e2be0769aea29f5c573f8ccce3b919a8fa3049e17a51995648638 +size 3283 diff --git a/src/clap_embedding/Gush.pt b/src/clap_embedding/Gush.pt new file mode 100644 index 0000000000000000000000000000000000000000..8700703c6b01eb049a617d3249212aebd39e8de4 --- /dev/null +++ b/src/clap_embedding/Gush.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57e1f10f76a5fe1f7bec48ad5c98726dd53d60bb0033fa54c8c6cb1a50342873 +size 3149 diff --git a/src/clap_embedding/Hair dryer.pt b/src/clap_embedding/Hair dryer.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2caeb56167e3d25a0291c2fd6dd21b46316096e --- /dev/null +++ 
b/src/clap_embedding/Hair dryer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5edbec0fa877f6f665bffb875cda2bde4b23349c98e777536a3632ed103b748 +size 3243 diff --git a/src/clap_embedding/Hammer.pt b/src/clap_embedding/Hammer.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cf5cb6719caf90b5131404fdcfe3cd745380740 --- /dev/null +++ b/src/clap_embedding/Hammer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5eb455d8af3faf2882d5bd884d1d013b07302a712b4865c1eb3f43b42c785434 +size 3159 diff --git a/src/clap_embedding/Hands.pt b/src/clap_embedding/Hands.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae17bf2ef5ab583f234010a34766f849cd39e8ce --- /dev/null +++ b/src/clap_embedding/Hands.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40876db8624e959ea43fff4de4a4c38d14725bfb56abb0fd095a437dce18291e +size 3154 diff --git a/src/clap_embedding/Heart murmur.pt b/src/clap_embedding/Heart murmur.pt new file mode 100644 index 0000000000000000000000000000000000000000..52cf5cdb1daeef005df68d07ca5b37ab5cac1f11 --- /dev/null +++ b/src/clap_embedding/Heart murmur.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:888a879eb5aab0d50614f27d88f843ce90e2fe93fb04f77990554fa16fa7152b +size 3253 diff --git a/src/clap_embedding/Heart sounds, heartbeat.pt b/src/clap_embedding/Heart sounds, heartbeat.pt new file mode 100644 index 0000000000000000000000000000000000000000..82003f368747a492e191b6b586b8906a1a5cbf69 --- /dev/null +++ b/src/clap_embedding/Heart sounds, heartbeat.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b85ec671796d56a432ee7317677a5e11bc52071b8855f25acb191b43a5265a +size 3372 diff --git a/src/clap_embedding/Heavy engine (low frequency).pt b/src/clap_embedding/Heavy engine (low frequency).pt new file mode 100644 index 0000000000000000000000000000000000000000..4bf975d6f4ddf9c0956ef32dac9b8aa13691ec83 --- 
/dev/null +++ b/src/clap_embedding/Heavy engine (low frequency).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cecdb2f1080e4f029069833e252b93ce621c55e9c2d8a4810008e397640ddf1a +size 3397 diff --git a/src/clap_embedding/Helicopter.pt b/src/clap_embedding/Helicopter.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2d49bae3605fa06669c99a367289a193ab1ecd4 --- /dev/null +++ b/src/clap_embedding/Helicopter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60426efdac777bf8beefbdeb7060316269a5a3d5409470297ec5790a3a2c01b9 +size 3243 diff --git a/src/clap_embedding/Hiccup.pt b/src/clap_embedding/Hiccup.pt new file mode 100644 index 0000000000000000000000000000000000000000..d2abb8917a10ee435fd0797c29b5d96f974270ef --- /dev/null +++ b/src/clap_embedding/Hiccup.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f26e9b7f784963c3e8600b023cf9c5a39fc6501e2bf5b5803c3af491e3b286a +size 3159 diff --git a/src/clap_embedding/Hiss.pt b/src/clap_embedding/Hiss.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8189f8bf15ecf8eef8a5521a8137b307dc18e08 --- /dev/null +++ b/src/clap_embedding/Hiss.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e15ecedef6452980971bd320f14821a2cc02babbeb601060793f4afa6c82938 +size 3149 diff --git a/src/clap_embedding/Honk.pt b/src/clap_embedding/Honk.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0ab644de5dbc07fc4be825f586df359a9ce4493 --- /dev/null +++ b/src/clap_embedding/Honk.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd25d338407829a672960a1f2d2a1f0fa7dcfa3d352f74dbaaacbba3a5fcbb34 +size 3149 diff --git a/src/clap_embedding/Hoot.pt b/src/clap_embedding/Hoot.pt new file mode 100644 index 0000000000000000000000000000000000000000..56dcf50c77a4689d33000796617e7dc21579d766 --- /dev/null +++ b/src/clap_embedding/Hoot.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:f8a7768e7b4a7d83922685cd2c6287dbf299f533444ded1546448f53395703c5 +size 3149 diff --git a/src/clap_embedding/Horse.pt b/src/clap_embedding/Horse.pt new file mode 100644 index 0000000000000000000000000000000000000000..72560e2435110019c0123d15dbf5e65092a950e1 --- /dev/null +++ b/src/clap_embedding/Horse.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b44e8087f74522cee69aaef76b1e7edf9c51a948093a6488af59ca842b189f5b +size 3154 diff --git a/src/clap_embedding/Howl (wind).pt b/src/clap_embedding/Howl (wind).pt new file mode 100644 index 0000000000000000000000000000000000000000..6799b875f346a4d6ca9349def9036829b3f5c8b4 --- /dev/null +++ b/src/clap_embedding/Howl (wind).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81de9487a62ba7d02eeff4bcf68ab1786fd7a5c81e33e019e9888ad009820b92 +size 3248 diff --git a/src/clap_embedding/Howl.pt b/src/clap_embedding/Howl.pt new file mode 100644 index 0000000000000000000000000000000000000000..a082212910bd9bb8da705bcd612b1a591292cfad --- /dev/null +++ b/src/clap_embedding/Howl.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9bde5bddce81691384dc152f4d13292d3dfa32409a6f5ed1f15029c44ae19b6 +size 3149 diff --git a/src/clap_embedding/Hubbub, speech noise, speech babble.pt b/src/clap_embedding/Hubbub, speech noise, speech babble.pt new file mode 100644 index 0000000000000000000000000000000000000000..c53b4b5a31880b6b15f02590a813e7c1440f0851 --- /dev/null +++ b/src/clap_embedding/Hubbub, speech noise, speech babble.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39e861c7bc3dee553e781e33ca03bbf0e2f900d33ab5860cf1e0a35b7f5f28b5 +size 3432 diff --git a/src/clap_embedding/Hum.pt b/src/clap_embedding/Hum.pt new file mode 100644 index 0000000000000000000000000000000000000000..dc9a14629e9c5763700d81e35455d6a2adc0d8a3 --- /dev/null +++ b/src/clap_embedding/Hum.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:92a1050903b82be44646fd869f79b061bb3d30def8bc871f9a2b040f985178a6 +size 3144 diff --git a/src/clap_embedding/Human group actions.pt b/src/clap_embedding/Human group actions.pt new file mode 100644 index 0000000000000000000000000000000000000000..4717e2568fcaacf1d10c4a05e6b29eedbcc98a3f --- /dev/null +++ b/src/clap_embedding/Human group actions.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dba0f52ccbec66fcda643e2b905a3e9d62eebc99ac9ca734d7fb0774b7b8a734 +size 3288 diff --git a/src/clap_embedding/Human locomotion.pt b/src/clap_embedding/Human locomotion.pt new file mode 100644 index 0000000000000000000000000000000000000000..5971591b95d39f593c5a644b404bd7091d263d49 --- /dev/null +++ b/src/clap_embedding/Human locomotion.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c35152288abcbd2368864182b5f2eb716ab2545ae9f37a97b66f14b071e5d97f +size 3273 diff --git a/src/clap_embedding/Human sounds.pt b/src/clap_embedding/Human sounds.pt new file mode 100644 index 0000000000000000000000000000000000000000..82f5c285ca325ad5633de0a25fca0651692eeb61 --- /dev/null +++ b/src/clap_embedding/Human sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6dbbde305b1c8503df96d0d8243fccb5bd8a7eabfaf1074e199427fbd75ac71 +size 3253 diff --git a/src/clap_embedding/Human voice.pt b/src/clap_embedding/Human voice.pt new file mode 100644 index 0000000000000000000000000000000000000000..8d940556f54b4a2f559d4152d44c56053a761a2f --- /dev/null +++ b/src/clap_embedding/Human voice.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b961f68bb7336e27177baa5dfa90776739397767a145a609bc29e5a48b1bf0a5 +size 3248 diff --git a/src/clap_embedding/Humming.pt b/src/clap_embedding/Humming.pt new file mode 100644 index 0000000000000000000000000000000000000000..9259db154910eb8ff91292e91c86a9c2f72d6a5c --- /dev/null +++ b/src/clap_embedding/Humming.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:499fa7b39df00d300b9082eecd7b1267f69a00d289c0d3135f4e87b707a08ecb +size 3228 diff --git a/src/clap_embedding/Ice cream truck, ice cream van.pt b/src/clap_embedding/Ice cream truck, ice cream van.pt new file mode 100644 index 0000000000000000000000000000000000000000..e141f68f60466b11ec18cf2d7ac565e4176b05f4 --- /dev/null +++ b/src/clap_embedding/Ice cream truck, ice cream van.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5a0b91315b96faa240057014781a833bd6f14eba1440ad7faca6116bc66bab0 +size 3407 diff --git a/src/clap_embedding/Idling.pt b/src/clap_embedding/Idling.pt new file mode 100644 index 0000000000000000000000000000000000000000..3fde2968107b4e969730749a1706c68d2d51c822 --- /dev/null +++ b/src/clap_embedding/Idling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6c402e00261e4672dcc1f4be79df9b3bb1351dac202b34105f20abfb18d7a3eb +size 3159 diff --git a/src/clap_embedding/Insect.pt b/src/clap_embedding/Insect.pt new file mode 100644 index 0000000000000000000000000000000000000000..571f63f878698c84331551ee0578e87c7e6494d9 --- /dev/null +++ b/src/clap_embedding/Insect.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89328c1bbbd08707d9f8a1752a1c421280c58fcb0ebd1b80c7ea9de3d6f0df3f +size 3159 diff --git a/src/clap_embedding/Inside, large room or hall.pt b/src/clap_embedding/Inside, large room or hall.pt new file mode 100644 index 0000000000000000000000000000000000000000..81cdec4942b82e59b8929e9051ae0b8610ef8045 --- /dev/null +++ b/src/clap_embedding/Inside, large room or hall.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fdd5b00678f1e21974fcc0b26e24b53f02ada99f2a189c2cb763c4308a15295e +size 3387 diff --git a/src/clap_embedding/Inside, public space.pt b/src/clap_embedding/Inside, public space.pt new file mode 100644 index 0000000000000000000000000000000000000000..05571711041e3a8ebeab3f8728ad5a7ef99cb076 --- 
/dev/null +++ b/src/clap_embedding/Inside, public space.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75999f1d417561b83b1d0a4e961ac6b02248991510e71cebd14e1995d926e4ca +size 3293 diff --git a/src/clap_embedding/Inside, small room.pt b/src/clap_embedding/Inside, small room.pt new file mode 100644 index 0000000000000000000000000000000000000000..713e349cc7bc07b1238a508eb5215c49a37ec01e --- /dev/null +++ b/src/clap_embedding/Inside, small room.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ce7ee323c543d138c9ff20932befb480cff629eebadfa8956d2a28379fc2ef2 +size 3283 diff --git a/src/clap_embedding/Jackhammer.pt b/src/clap_embedding/Jackhammer.pt new file mode 100644 index 0000000000000000000000000000000000000000..dcabcbd4291a1ef7a9c000f8898ae9f343c1ef96 --- /dev/null +++ b/src/clap_embedding/Jackhammer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3dbe3b70dfc05164c0fefcd08209f4a47a95b580e498e33cd7960a4c9ed5cb0 +size 3243 diff --git a/src/clap_embedding/Jet engine.pt b/src/clap_embedding/Jet engine.pt new file mode 100644 index 0000000000000000000000000000000000000000..2f8254e50424f5b80cd195601861196640e62765 --- /dev/null +++ b/src/clap_embedding/Jet engine.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23258fda89b41aa839355d5b7defe00681a2f0d69bece87b24232369df67b36e +size 3243 diff --git a/src/clap_embedding/Jingle bell.pt b/src/clap_embedding/Jingle bell.pt new file mode 100644 index 0000000000000000000000000000000000000000..9262840b87c926968cae61a4c69f5d9e22a10777 --- /dev/null +++ b/src/clap_embedding/Jingle bell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e25eb455d25ed14731d6e8e446d066894e74f3a2665387caefc9d0e101b9524 +size 3248 diff --git a/src/clap_embedding/Jingle, tinkle.pt b/src/clap_embedding/Jingle, tinkle.pt new file mode 100644 index 
0000000000000000000000000000000000000000..11528440b77d57529c0e31d91eeec4fbe424946b --- /dev/null +++ b/src/clap_embedding/Jingle, tinkle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26edafa73916c67a7f3ab8d4a8e0d7837f40cd22d95185a6147e399aeaa97f4f +size 3263 diff --git a/src/clap_embedding/Kettle whistle.pt b/src/clap_embedding/Kettle whistle.pt new file mode 100644 index 0000000000000000000000000000000000000000..12c136649c8b9a6ea7d50211324525b55f2d4d90 --- /dev/null +++ b/src/clap_embedding/Kettle whistle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:617fada93e01a8589219e217809e5a90ef22bbeb2d5720affa62570f9de4f3cb +size 3263 diff --git a/src/clap_embedding/Keypress tone.pt b/src/clap_embedding/Keypress tone.pt new file mode 100644 index 0000000000000000000000000000000000000000..06f27b5bf8480244d40b0df9f3378325fb04cfd4 --- /dev/null +++ b/src/clap_embedding/Keypress tone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c6f61967d37252ad90b5bbda3cdc993eca5b7e5d6dc68de01627a0a83b8f789 +size 3258 diff --git a/src/clap_embedding/Keys jangling.pt b/src/clap_embedding/Keys jangling.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab2675203949ff8e56b10ef630c9b7333bf7d973 --- /dev/null +++ b/src/clap_embedding/Keys jangling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9837c7b2d3b3bf1fef005ffe455276ad7b7f5e2643154ed1f29d54fcc8bd9dca +size 3258 diff --git a/src/clap_embedding/Kitchen and dining room sounds.pt b/src/clap_embedding/Kitchen and dining room sounds.pt new file mode 100644 index 0000000000000000000000000000000000000000..fc9ad60c5e1754d26e4d4e3a66ade77d1e631bfc --- /dev/null +++ b/src/clap_embedding/Kitchen and dining room sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2a231b7be7c39b616825557f83c806c013d33fd85f87adad3026298192c95b2 +size 3407 diff --git a/src/clap_embedding/Knife.pt 
b/src/clap_embedding/Knife.pt new file mode 100644 index 0000000000000000000000000000000000000000..c66d9530183a329de5af156c582821a168e1c9fb --- /dev/null +++ b/src/clap_embedding/Knife.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35d0c765798bff38568ac93f130d9e4f396892c7416dacf62e64f4913670773f +size 3154 diff --git a/src/clap_embedding/Knock.pt b/src/clap_embedding/Knock.pt new file mode 100644 index 0000000000000000000000000000000000000000..e2f58f26e5bad28574dd5804eee0f3bf6a127b4a --- /dev/null +++ b/src/clap_embedding/Knock.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03ea0e3b8dec7b7c6e09fdf3e026b8342fd857559e26c2708437006d375bdba6 +size 3154 diff --git a/src/clap_embedding/Laughter.pt b/src/clap_embedding/Laughter.pt new file mode 100644 index 0000000000000000000000000000000000000000..3e5efac1f21508b845b6c3da9eeb6e28233ba5aa --- /dev/null +++ b/src/clap_embedding/Laughter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f539633c430fcac75a4b19dec7ed1613cfef9c36977c14bbcfe2de155f640a +size 3233 diff --git a/src/clap_embedding/Lawn mower.pt b/src/clap_embedding/Lawn mower.pt new file mode 100644 index 0000000000000000000000000000000000000000..8ff72a6b45e505fb3b96b6233ddefc21f18354bb --- /dev/null +++ b/src/clap_embedding/Lawn mower.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4387f7c465a1a928483b264a4b53a3cc3f6ea43d666e3f84e08488fec9f3b90c +size 3243 diff --git a/src/clap_embedding/Light engine (high frequency).pt b/src/clap_embedding/Light engine (high frequency).pt new file mode 100644 index 0000000000000000000000000000000000000000..03ac3dc3630e61b12221c7d59cda40a89108e7ef --- /dev/null +++ b/src/clap_embedding/Light engine (high frequency).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69c24c7f6719fc68d41478f5801422b0fec3fd8ab4902e5152a45406829af31a +size 3402 diff --git a/src/clap_embedding/Liquid.pt 
b/src/clap_embedding/Liquid.pt new file mode 100644 index 0000000000000000000000000000000000000000..213c223203305fa26b74dc2df3c235b1c77b79e9 --- /dev/null +++ b/src/clap_embedding/Liquid.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4a8dafcbaa4088694ba23ea479802f277ea4923c2ae4035c8431867af4e712 +size 3159 diff --git a/src/clap_embedding/Livestock, farm animals, working animals.pt b/src/clap_embedding/Livestock, farm animals, working animals.pt new file mode 100644 index 0000000000000000000000000000000000000000..8df040faa34a367bea85eee6eccfaf853ddc1c22 --- /dev/null +++ b/src/clap_embedding/Livestock, farm animals, working animals.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bf96ffbbb6ac82affe0d27b040e2d920b014e516d7ae4f539693b1d173ee4d8 +size 3521 diff --git a/src/clap_embedding/Lock.pt b/src/clap_embedding/Lock.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d3ba2548c6fc8a0a1eb1d7f0d0373d46025b08e --- /dev/null +++ b/src/clap_embedding/Lock.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bef7de6c31bbe9dfd74abd9e37a32c8b2e0e44cd368a13780121bf028a502c3 +size 3149 diff --git a/src/clap_embedding/Machine gun.pt b/src/clap_embedding/Machine gun.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a5c76d7b0b9b555272272e26ed5be8cf61eddc4 --- /dev/null +++ b/src/clap_embedding/Machine gun.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab55c7ccbac30a006d1afcd845f8882068786f6137015b7db4b32716b98cc5ce +size 3248 diff --git a/src/clap_embedding/Mains hum.pt b/src/clap_embedding/Mains hum.pt new file mode 100644 index 0000000000000000000000000000000000000000..312d540e104d6a552feefc1e1240d09b6040c701 --- /dev/null +++ b/src/clap_embedding/Mains hum.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0441d68bc94da79035fc3a411ca37abe19dcd1d438e2c2a8bb6c60fba4338b9 +size 3238 diff --git 
a/src/clap_embedding/Male singing.pt b/src/clap_embedding/Male singing.pt new file mode 100644 index 0000000000000000000000000000000000000000..ad8ba93dead1efb109e8bf53b5df8b2192279e17 --- /dev/null +++ b/src/clap_embedding/Male singing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a300ad7341223d561ddb333824eb72096b6f7cf72e2399778b997f02d5688c9 +size 3253 diff --git a/src/clap_embedding/Male speech, man speaking.pt b/src/clap_embedding/Male speech, man speaking.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f039170486923c979f2c8550e6866151ce7e3d7 --- /dev/null +++ b/src/clap_embedding/Male speech, man speaking.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1072aa13fa871cb560ea779c361702ca2ae05a8064491563fb65b1990a427863 +size 3382 diff --git a/src/clap_embedding/Mantra.pt b/src/clap_embedding/Mantra.pt new file mode 100644 index 0000000000000000000000000000000000000000..82f6302eb1a8ae4dc30c1dc10b1a3068cf32f1ba --- /dev/null +++ b/src/clap_embedding/Mantra.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c34e4cad9c9cb4c62dd0c91c0bc50527a670decca52e10ee025236920a769011 +size 3159 diff --git a/src/clap_embedding/Mechanical bell.pt b/src/clap_embedding/Mechanical bell.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9bf34507b4c081d14157a956fa7ca3e56c2e37e --- /dev/null +++ b/src/clap_embedding/Mechanical bell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:590ae9f93dc5e3a7c0dc21d53744c5328385a03577df151f3d503920612a0808 +size 3268 diff --git a/src/clap_embedding/Mechanical fan.pt b/src/clap_embedding/Mechanical fan.pt new file mode 100644 index 0000000000000000000000000000000000000000..949157e582f1cea721b0b2df9b66f364069324c1 --- /dev/null +++ b/src/clap_embedding/Mechanical fan.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:5434c33a207dc21606096a106f7d6bae600578ee9acecce83e4f1ef5ba593647 +size 3263 diff --git a/src/clap_embedding/Mechanisms.pt b/src/clap_embedding/Mechanisms.pt new file mode 100644 index 0000000000000000000000000000000000000000..27d8d0651446a4cc5d7d698fdf88fbc950251690 --- /dev/null +++ b/src/clap_embedding/Mechanisms.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38d76f15c6b11af9ff88d21e78c9bb9f1d75f84499ede8004b071bc6a397a65f +size 3243 diff --git a/src/clap_embedding/Medium engine (mid frequency).pt b/src/clap_embedding/Medium engine (mid frequency).pt new file mode 100644 index 0000000000000000000000000000000000000000..6febb73ca6d4d625ddba672af24997e80d3effe1 --- /dev/null +++ b/src/clap_embedding/Medium engine (mid frequency).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be778945e3bca0130f540cc4118fb872900e7d68e27fb64e142354195fa05ac +size 3402 diff --git a/src/clap_embedding/Meow.pt b/src/clap_embedding/Meow.pt new file mode 100644 index 0000000000000000000000000000000000000000..e0fc642c7b1b18cb1cdf5ef9de8d1eb16e2596ba --- /dev/null +++ b/src/clap_embedding/Meow.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba85a22792a6bf5bf821a9dbbb3d4f7acbde087c9d79d00007b45e0a921e03ee +size 3149 diff --git a/src/clap_embedding/Microphone.pt b/src/clap_embedding/Microphone.pt new file mode 100644 index 0000000000000000000000000000000000000000..9376da7c9e5d5920c1d56258ff0a74c2c051904e --- /dev/null +++ b/src/clap_embedding/Microphone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4630004004a007827c91ecfab716c26550aea5906f5da8b65e4dc07aae6e1d +size 3243 diff --git a/src/clap_embedding/Microwave oven.pt b/src/clap_embedding/Microwave oven.pt new file mode 100644 index 0000000000000000000000000000000000000000..fd9a7089cb925d794bbc2520d5429fefeff6517c --- /dev/null +++ b/src/clap_embedding/Microwave oven.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:60a79e9c698cbcddaacf95d03840731600b6a714f172eaa9d78d598e2a74b8a0 +size 3263 diff --git a/src/clap_embedding/Moo.pt b/src/clap_embedding/Moo.pt new file mode 100644 index 0000000000000000000000000000000000000000..50ad2d48c8267e358d522c477bd2e5e963e7b206 --- /dev/null +++ b/src/clap_embedding/Moo.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:200ba8bf89d7a1461d12372e368499d375b65201d9c5374a0e7dd9ded579eb21 +size 3144 diff --git a/src/clap_embedding/Mosquito.pt b/src/clap_embedding/Mosquito.pt new file mode 100644 index 0000000000000000000000000000000000000000..79746ab4f4de759aa8f6e413aa1397a466e7bf40 --- /dev/null +++ b/src/clap_embedding/Mosquito.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87a36255c80ac16d1e2d40b38575a6bbfa84c6ceddb0799d579450f8e714e7e1 +size 3233 diff --git a/src/clap_embedding/Motor vehicle (road).pt b/src/clap_embedding/Motor vehicle (road).pt new file mode 100644 index 0000000000000000000000000000000000000000..6803db21da8e8c17382f8c60942f99cb9f2e205b --- /dev/null +++ b/src/clap_embedding/Motor vehicle (road).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9df5db7238c53fa53eb6981b0071948fa50d84c509df1f89468545d302521924 +size 3293 diff --git a/src/clap_embedding/Motorboat, speedboat.pt b/src/clap_embedding/Motorboat, speedboat.pt new file mode 100644 index 0000000000000000000000000000000000000000..98520d4268f9410e49398f1b80fb739c19c8b2de --- /dev/null +++ b/src/clap_embedding/Motorboat, speedboat.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ffd7f481b071ac1a56f031b5ad247df8b8aa2d05189e43692983208064e82fc +size 3293 diff --git a/src/clap_embedding/Motorcycle.pt b/src/clap_embedding/Motorcycle.pt new file mode 100644 index 0000000000000000000000000000000000000000..325068fa1d30f5f4657f584726d0104c45fc0740 --- /dev/null +++ b/src/clap_embedding/Motorcycle.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:d2509f0c6824ed5c0fcfaef8a1d919e5c078de505dea21c3b3bd1fed80c4f84d +size 3243 diff --git a/src/clap_embedding/Mouse.pt b/src/clap_embedding/Mouse.pt new file mode 100644 index 0000000000000000000000000000000000000000..16c2acfb75a83f3eac143b13999feb4a6fa5ac0e --- /dev/null +++ b/src/clap_embedding/Mouse.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:374a3d52d9003af1f427b31d2195071a414dadb744156874e882c4335917161f +size 3154 diff --git a/src/clap_embedding/Music.pt b/src/clap_embedding/Music.pt new file mode 100644 index 0000000000000000000000000000000000000000..20490321f6eea2778561a7df12cfdacbdbd004fb --- /dev/null +++ b/src/clap_embedding/Music.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d8b93abf6714cc3387f7d4e2e654ebcb93738ea8e804e3fe8e2d537c350ea12 +size 3154 diff --git a/src/clap_embedding/Narration, monologue.pt b/src/clap_embedding/Narration, monologue.pt new file mode 100644 index 0000000000000000000000000000000000000000..848e711ae5390c4e5e9905c9fe1743bba3e570d1 --- /dev/null +++ b/src/clap_embedding/Narration, monologue.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7db7c70dea0eb6521d8665abdcad0660559ce3960228625e53b343fcf857a1b3 +size 3293 diff --git a/src/clap_embedding/Neigh, whinny.pt b/src/clap_embedding/Neigh, whinny.pt new file mode 100644 index 0000000000000000000000000000000000000000..d65cfd45a2b817d0ef8875a5b73cea642a666d47 --- /dev/null +++ b/src/clap_embedding/Neigh, whinny.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d04bd33fa37808681ecd3cbfa5b41cc2ed734dc81df021a3eead961db6d1c23e +size 3258 diff --git a/src/clap_embedding/Noise.pt b/src/clap_embedding/Noise.pt new file mode 100644 index 0000000000000000000000000000000000000000..b71066638f28eea9bd0b3597c4270097b84ebd15 --- /dev/null +++ b/src/clap_embedding/Noise.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:d937540c5a130fa335128f11d811e14f4632ce416016949363fd362205a7ff05 +size 3154 diff --git a/src/clap_embedding/Non-motorized land vehicle.pt b/src/clap_embedding/Non-motorized land vehicle.pt new file mode 100644 index 0000000000000000000000000000000000000000..2b21aec32e81ffc39a386f2713802c2303f7e0bb --- /dev/null +++ b/src/clap_embedding/Non-motorized land vehicle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f306ae611999eac002df497f79d4f58b39fabc413462c31edd004fe571d9260 +size 3387 diff --git a/src/clap_embedding/Ocean.pt b/src/clap_embedding/Ocean.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb143c8a987360c46260e129ef39e1387b73483e --- /dev/null +++ b/src/clap_embedding/Ocean.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:527215090e7227b6be0c53126ed0eb1d6349a1ccce83c574eb6417475a3117e6 +size 3154 diff --git a/src/clap_embedding/Oink.pt b/src/clap_embedding/Oink.pt new file mode 100644 index 0000000000000000000000000000000000000000..e7eb0e5b99b00dd2a45529957c18287a284c4336 --- /dev/null +++ b/src/clap_embedding/Oink.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43410c9d161919bbe31c40d8616803d423808b77882ee7e647c6b9861958dab8 +size 3149 diff --git a/src/clap_embedding/Other sourceless.pt b/src/clap_embedding/Other sourceless.pt new file mode 100644 index 0000000000000000000000000000000000000000..f04f1fb58e3b1c79f462b3750d08ef316026bfee --- /dev/null +++ b/src/clap_embedding/Other sourceless.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b9505f7563ccdacb35ff39463f9f5160148eaf606db03ce6769f8b05fa5599 +size 3273 diff --git a/src/clap_embedding/Outside, rural or natural.pt b/src/clap_embedding/Outside, rural or natural.pt new file mode 100644 index 0000000000000000000000000000000000000000..4a55f2c6c6326d8edc5a0fb5d873427ca1c691de --- /dev/null +++ b/src/clap_embedding/Outside, rural or natural.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:7aca2bfdb6353bc36e599fdf4b611c426ef7326d8ccb5b3224bab345a903e8e1 +size 3382 diff --git a/src/clap_embedding/Outside, urban or manmade.pt b/src/clap_embedding/Outside, urban or manmade.pt new file mode 100644 index 0000000000000000000000000000000000000000..cffccf480f7f55786684fa125a7f65c2c81adf8e --- /dev/null +++ b/src/clap_embedding/Outside, urban or manmade.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63aaabc5a17d74b1810451c6e48d2989c195e38b1ce401acbd7f7a3d2758fec9 +size 3382 diff --git a/src/clap_embedding/Owl.pt b/src/clap_embedding/Owl.pt new file mode 100644 index 0000000000000000000000000000000000000000..fb59a494baa991344a61bf2be84bfba29c34dd93 --- /dev/null +++ b/src/clap_embedding/Owl.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:078a464524a02a7013b4c683e7c0e30180f9054889f4f63553c22c7966fb4560 +size 3144 diff --git a/src/clap_embedding/Packing tape, duct tape.pt b/src/clap_embedding/Packing tape, duct tape.pt new file mode 100644 index 0000000000000000000000000000000000000000..c2a8ae40de3b803b2eec1296c55ba31f0fa6b440 --- /dev/null +++ b/src/clap_embedding/Packing tape, duct tape.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bdd361a3f7b6fb8a12b7502ed6eec1e83e808ba651277d72f6af1b3fe4b30dc +size 3372 diff --git a/src/clap_embedding/Pant (dog).pt b/src/clap_embedding/Pant (dog).pt new file mode 100644 index 0000000000000000000000000000000000000000..6d532691ade4a71e929832cf821adc92cbb21f08 --- /dev/null +++ b/src/clap_embedding/Pant (dog).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00314640e6baa273008b7d68a3628ce4116e11dc2d70ba6bcec06db5e164429c +size 3243 diff --git a/src/clap_embedding/Pant.pt b/src/clap_embedding/Pant.pt new file mode 100644 index 0000000000000000000000000000000000000000..b14187a9a6abca54b5c6adae8d55e675af99ffc9 --- /dev/null +++ b/src/clap_embedding/Pant.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:093a84164b66070dfd824815695cb55a8cc262a2ca31506f6695ea494e736aa7 +size 3149 diff --git a/src/clap_embedding/Paper rustling.pt b/src/clap_embedding/Paper rustling.pt new file mode 100644 index 0000000000000000000000000000000000000000..df56201d3b7c0a751170571b646aa25f224792f2 --- /dev/null +++ b/src/clap_embedding/Paper rustling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99fbdf0b1dfad3d9950873324cb3411658aaea5b9b591a94c3b2582e7c942daf +size 3263 diff --git a/src/clap_embedding/Patter.pt b/src/clap_embedding/Patter.pt new file mode 100644 index 0000000000000000000000000000000000000000..008faa361a127e8bf982884a2bd81699f408f1a6 --- /dev/null +++ b/src/clap_embedding/Patter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a62c3d2508dc0210ee4005911ebb9e2c045179f452ae6fad2ec1ced2abd832a0 +size 3159 diff --git a/src/clap_embedding/Pig.pt b/src/clap_embedding/Pig.pt new file mode 100644 index 0000000000000000000000000000000000000000..e21a05c189b354b1517d7ccabfcfa097be8bdeaa --- /dev/null +++ b/src/clap_embedding/Pig.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29851a382de03ad5523532adeb9c00424920ffd89ea3f4ab8453b5a58c2cfe70 +size 3144 diff --git a/src/clap_embedding/Pigeon, dove.pt b/src/clap_embedding/Pigeon, dove.pt new file mode 100644 index 0000000000000000000000000000000000000000..5708d00cc249a48a65d248f0f03772cc7d758e94 --- /dev/null +++ b/src/clap_embedding/Pigeon, dove.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dc603873174e6696baf3e22dcd8ceb2b34b6cff0f0c4c70dd1345d49426df60 +size 3253 diff --git a/src/clap_embedding/Ping.pt b/src/clap_embedding/Ping.pt new file mode 100644 index 0000000000000000000000000000000000000000..f470901893f224ae61d0bbfec8d76e487c6189f4 --- /dev/null +++ b/src/clap_embedding/Ping.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:f5b4901d6c90f051ac1dd155678dc77e640a207719be1634377207eab337f3f1 +size 3149 diff --git a/src/clap_embedding/Plop.pt b/src/clap_embedding/Plop.pt new file mode 100644 index 0000000000000000000000000000000000000000..529f57ab02fb62bb26f60a0a39b42afc85a577f6 --- /dev/null +++ b/src/clap_embedding/Plop.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd1b59b9673f7a90d3ab59fefd8be5a6f4d84a188a737e756fe8db64f5281d70 +size 3149 diff --git a/src/clap_embedding/Police car (siren).pt b/src/clap_embedding/Police car (siren).pt new file mode 100644 index 0000000000000000000000000000000000000000..b547c2a5de1bdc72bb87ee4a92b32015b05a63d4 --- /dev/null +++ b/src/clap_embedding/Police car (siren).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f0f4daaaae00a015eb2dccab2f570f947dff755b66284857dc93206854a3820 +size 3283 diff --git a/src/clap_embedding/Pour.pt b/src/clap_embedding/Pour.pt new file mode 100644 index 0000000000000000000000000000000000000000..d0606ac5412e54637e6fa45feeda8e5e2127a25d --- /dev/null +++ b/src/clap_embedding/Pour.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0c1eeb13502f8f7058f2bb2a9499f4c9321eff26fb0964dc8f80dc346454c70 +size 3149 diff --git a/src/clap_embedding/Power saw, circular saw, table saw.pt b/src/clap_embedding/Power saw, circular saw, table saw.pt new file mode 100644 index 0000000000000000000000000000000000000000..c104d6dee505203bc2b4b18467f8ad4f9d27a4e6 --- /dev/null +++ b/src/clap_embedding/Power saw, circular saw, table saw.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4598f59a228c8f3052f5d73fb1f0d785d2a3b32937b876f696db377b5796d7e3 +size 3427 diff --git a/src/clap_embedding/Power tool.pt b/src/clap_embedding/Power tool.pt new file mode 100644 index 0000000000000000000000000000000000000000..46a018e2d604942f82d58e11fa5174bd269b0a28 --- /dev/null +++ b/src/clap_embedding/Power tool.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:fc2d37442746ea6e51913edfe21735dfb52d94ea65c77a137a3912f8e8bac7e1 +size 3243 diff --git a/src/clap_embedding/Power windows, electric windows.pt b/src/clap_embedding/Power windows, electric windows.pt new file mode 100644 index 0000000000000000000000000000000000000000..76d14b4ccba672cbccafe6347b1c6a7a85dbc738 --- /dev/null +++ b/src/clap_embedding/Power windows, electric windows.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f48bc3ba909a80b0fa0a1901e1c841befd31b069700e6b04252863b8002a255 +size 3412 diff --git a/src/clap_embedding/Printer.pt b/src/clap_embedding/Printer.pt new file mode 100644 index 0000000000000000000000000000000000000000..ecba1a4c1f94a9610e2f8f3db5bc31662e5bd1b8 --- /dev/null +++ b/src/clap_embedding/Printer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c6c9a2c9dc3b69f40f41c9baf47ab593c52d1def055e17c2f811007821e0f52 +size 3228 diff --git a/src/clap_embedding/Propeller, airscrew.pt b/src/clap_embedding/Propeller, airscrew.pt new file mode 100644 index 0000000000000000000000000000000000000000..246d2d27f412bb5d63e6841e0c7d8fd686eb31b1 --- /dev/null +++ b/src/clap_embedding/Propeller, airscrew.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d294919a814e00440522df68fd8297353321130caf3a8c4eaed02b006db15d94 +size 3288 diff --git a/src/clap_embedding/Puff.pt b/src/clap_embedding/Puff.pt new file mode 100644 index 0000000000000000000000000000000000000000..353e116ca6d1efaffbd15527e4ae0b246874dacc --- /dev/null +++ b/src/clap_embedding/Puff.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61cc064d00929041af75eee55fd8edd1b538f69178bef41d6a6041c813db644a +size 3149 diff --git a/src/clap_embedding/Pulleys.pt b/src/clap_embedding/Pulleys.pt new file mode 100644 index 0000000000000000000000000000000000000000..622339be01f2f698b6292e30a4232435f73c4379 --- /dev/null +++ b/src/clap_embedding/Pulleys.pt @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd8be6337e060e451c9fd2be108d60029c5d5446d9c0a9d58032374af566632b +size 3228 diff --git a/src/clap_embedding/Pulse.pt b/src/clap_embedding/Pulse.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8e72cf513794200ee257bc259529744050676b1 --- /dev/null +++ b/src/clap_embedding/Pulse.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92357c6f14f49a37238e191d0f8c02d2bf0f3d7772b144b8a2c9db2357e2215e +size 3154 diff --git a/src/clap_embedding/Pump (liquid).pt b/src/clap_embedding/Pump (liquid).pt new file mode 100644 index 0000000000000000000000000000000000000000..c7ad453e8781d3bcbd7d3a322f9d9ab4fef98fdc --- /dev/null +++ b/src/clap_embedding/Pump (liquid).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0806c8fe24ad7162ee20c7d42d42df594aa75f1c6bcd951497de28ef9edd0716 +size 3258 diff --git a/src/clap_embedding/Purr.pt b/src/clap_embedding/Purr.pt new file mode 100644 index 0000000000000000000000000000000000000000..2897f9d46238301320f07329afa2e5cdd623f593 --- /dev/null +++ b/src/clap_embedding/Purr.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b79e499123e740ae116c51cec36af21e0b14c0ab15a6404fe87a48196859b454 +size 3149 diff --git a/src/clap_embedding/Quack.pt b/src/clap_embedding/Quack.pt new file mode 100644 index 0000000000000000000000000000000000000000..8abda9c190e0cd9e3c6a41f5a22baee2a0e4e6ee --- /dev/null +++ b/src/clap_embedding/Quack.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff50d9a3999a3ef67e19435acc4f9215e4ded3601660a2075ee5f54150a66913 +size 3154 diff --git a/src/clap_embedding/Race car, auto racing.pt b/src/clap_embedding/Race car, auto racing.pt new file mode 100644 index 0000000000000000000000000000000000000000..a06dcc5de0874a05ec7fbe55ce70bc23714dc763 --- /dev/null +++ b/src/clap_embedding/Race car, auto racing.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:cac9947a2eb6225a2aad79c59f3d16884aec0476713780cd7d34cd1e74ea022e +size 3298 diff --git a/src/clap_embedding/Radio.pt b/src/clap_embedding/Radio.pt new file mode 100644 index 0000000000000000000000000000000000000000..e75075c3156968d9d3f097d53a36298145f055b5 --- /dev/null +++ b/src/clap_embedding/Radio.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:090ca4aab5f946879276eec1d972fa3a32afbcd7844ee2a4d0abb948db213838 +size 3154 diff --git a/src/clap_embedding/Rail transport.pt b/src/clap_embedding/Rail transport.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e54f8ffaac5fb92c83402dc45c24ce6c6426a30 --- /dev/null +++ b/src/clap_embedding/Rail transport.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:156f49060ead53ab1fdfc68d20143959a7dfc16937287b1c4e8b7b67a0bab857 +size 3263 diff --git a/src/clap_embedding/Railroad car, train wagon.pt b/src/clap_embedding/Railroad car, train wagon.pt new file mode 100644 index 0000000000000000000000000000000000000000..4be9ebc916dea5994b12fa9614456a2544acb1a5 --- /dev/null +++ b/src/clap_embedding/Railroad car, train wagon.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec6c6d83bd5a0fa50509ff5606af6f487fede40199c95e3c626b32e0fcd36d97 +size 3382 diff --git a/src/clap_embedding/Rain on surface.pt b/src/clap_embedding/Rain on surface.pt new file mode 100644 index 0000000000000000000000000000000000000000..2dee26c0f46df011f756d88c8a9477159cb9ffe5 --- /dev/null +++ b/src/clap_embedding/Rain on surface.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:223489b7bc0ba19dacc4ef498590f5eda20934068686315945576cdca66b2258 +size 3268 diff --git a/src/clap_embedding/Rain.pt b/src/clap_embedding/Rain.pt new file mode 100644 index 0000000000000000000000000000000000000000..f3ce4818670c440be553cf895407f37046651df3 --- /dev/null +++ b/src/clap_embedding/Rain.pt @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:f110c376fb19eab5ffe6ed5c0cf35099ec7153883d09e26329b1212d06fd14c5 +size 3149 diff --git a/src/clap_embedding/Raindrop.pt b/src/clap_embedding/Raindrop.pt new file mode 100644 index 0000000000000000000000000000000000000000..ee2036155c7317aa28d9b8777d5d9e1db5dd9b35 --- /dev/null +++ b/src/clap_embedding/Raindrop.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f806b148f807e422e040cfb004e7188b781e72abc647ea2cbdc5d0280d20322 +size 3233 diff --git a/src/clap_embedding/Rapping.pt b/src/clap_embedding/Rapping.pt new file mode 100644 index 0000000000000000000000000000000000000000..c6a8128cd89af3fb9c6070a3377cfe26b222c18a --- /dev/null +++ b/src/clap_embedding/Rapping.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1569919abdcafd2e082644a35c683a9bb4c99ff177c8d8b2344b9e9dd2dbee88 +size 3228 diff --git a/src/clap_embedding/Ratchet, pawl.pt b/src/clap_embedding/Ratchet, pawl.pt new file mode 100644 index 0000000000000000000000000000000000000000..24c3fdb34f456e25720d4c54939636fad3f45605 --- /dev/null +++ b/src/clap_embedding/Ratchet, pawl.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:373cda9c954969f1b55fc3e58fb41520fccd0ffb5f14bee30d55b62f48b3da98 +size 3258 diff --git a/src/clap_embedding/Rattle.pt b/src/clap_embedding/Rattle.pt new file mode 100644 index 0000000000000000000000000000000000000000..0bc53ef4ee8e1be67b2312f3bbd6257c27854af5 --- /dev/null +++ b/src/clap_embedding/Rattle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcde596cc83d1d4308185675e6d5152ff187bc98dcdc9b73db2cc3ebfd6fb49 +size 3159 diff --git a/src/clap_embedding/Refrigerator.pt b/src/clap_embedding/Refrigerator.pt new file mode 100644 index 0000000000000000000000000000000000000000..f8926e07785a4c6829a829ee3b224c46e0cfdc68 --- /dev/null +++ b/src/clap_embedding/Refrigerator.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:4d08e8c2d34e7bd772e8d34bd212daa1df025a1931699f76ef45b78115559152 +size 3253 diff --git a/src/clap_embedding/Respiratory sounds.pt b/src/clap_embedding/Respiratory sounds.pt new file mode 100644 index 0000000000000000000000000000000000000000..b0e535cb97c3098a068622f19b7bc409b7452645 --- /dev/null +++ b/src/clap_embedding/Respiratory sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2589270c75a99330a3e848635b32284daa10614fdf91ee4d18e378f38c6719b +size 3283 diff --git a/src/clap_embedding/Reverberation.pt b/src/clap_embedding/Reverberation.pt new file mode 100644 index 0000000000000000000000000000000000000000..da65f8e7c50a9054b48792e9b139e5401ff93a79 --- /dev/null +++ b/src/clap_embedding/Reverberation.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b7cd93a57ce0e564f9f69d33145716ec0108769713af0aa975b3bd7d2f2bbe2 +size 3258 diff --git a/src/clap_embedding/Reversing beeps.pt b/src/clap_embedding/Reversing beeps.pt new file mode 100644 index 0000000000000000000000000000000000000000..954bc7a0340e382979a79249203253551082c02f --- /dev/null +++ b/src/clap_embedding/Reversing beeps.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5a6a9efe3d6e311093ee1e9d0d5121cade7c41f511f02ea83d955205583beff +size 3268 diff --git a/src/clap_embedding/Ringing tone, ringback tone.pt b/src/clap_embedding/Ringing tone, ringback tone.pt new file mode 100644 index 0000000000000000000000000000000000000000..613881476b7310af27844a11c70a7d024216dcdc --- /dev/null +++ b/src/clap_embedding/Ringing tone, ringback tone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9001521f3409d8ad0aa492d97d602c825d32c3edd355c7129ebd0143e0292d5e +size 3392 diff --git a/src/clap_embedding/Ringtone.pt b/src/clap_embedding/Ringtone.pt new file mode 100644 index 0000000000000000000000000000000000000000..71135910ef6aee7e6765f32d76781720b4c58492 --- /dev/null +++ b/src/clap_embedding/Ringtone.pt @@ -0,0 
+1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c1aec25ad3898a445b2c65e55e35a9e16f2907d71d844fddd0ee5153ae82c7 +size 3233 diff --git a/src/clap_embedding/Roar.pt b/src/clap_embedding/Roar.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6c4395a829594975f5f1b311dbbc5d335cc7f0a --- /dev/null +++ b/src/clap_embedding/Roar.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d5b2741012d43df6a50473654db1029cdfa1268b70b656eb5a732615110a7990 +size 3149 diff --git a/src/clap_embedding/Roaring cats (lions, tigers).pt b/src/clap_embedding/Roaring cats (lions, tigers).pt new file mode 100644 index 0000000000000000000000000000000000000000..70cc177234cdc95ba44835d593af0041365f7946 --- /dev/null +++ b/src/clap_embedding/Roaring cats (lions, tigers).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:495b5d29e59dce3ac4f3f904dcaa707d85e2465b33c26f87ee0b4e2ab95c1399 +size 3397 diff --git a/src/clap_embedding/Rodents, rats, mice.pt b/src/clap_embedding/Rodents, rats, mice.pt new file mode 100644 index 0000000000000000000000000000000000000000..9b5534a6648f6196830e816b4bfa4caf2016a172 --- /dev/null +++ b/src/clap_embedding/Rodents, rats, mice.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93eedb9d72ca4c21d87b66a584028c54363634aaf5b4acbc1ddb1be63a709ad4 +size 3288 diff --git a/src/clap_embedding/Roll.pt b/src/clap_embedding/Roll.pt new file mode 100644 index 0000000000000000000000000000000000000000..50ad6fa0142bf794407f43f016ccfd97485375ee --- /dev/null +++ b/src/clap_embedding/Roll.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7aed13db3fe708058a46e5b447851494d431e3ad1e6a92fbceae2b0f5aaf44a2 +size 3149 diff --git a/src/clap_embedding/Rowboat, canoe, kayak.pt b/src/clap_embedding/Rowboat, canoe, kayak.pt new file mode 100644 index 0000000000000000000000000000000000000000..75a870f0b2887f9b63ba7cbc3ea527cc02eac012 --- /dev/null +++ 
b/src/clap_embedding/Rowboat, canoe, kayak.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d18a6c432974ca4bacd1f73e8d38f6c207a32ebed94cfbf6b40c655a85acc726 +size 3298 diff --git a/src/clap_embedding/Rub.pt b/src/clap_embedding/Rub.pt new file mode 100644 index 0000000000000000000000000000000000000000..f6fd34abff4e44606e63d384d506c25af5f456b8 --- /dev/null +++ b/src/clap_embedding/Rub.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:470bde9ce335f84dced9c9cf160e10620fbaba1451d9fbbb2b4524f31f68783b +size 3144 diff --git a/src/clap_embedding/Rumble.pt b/src/clap_embedding/Rumble.pt new file mode 100644 index 0000000000000000000000000000000000000000..90429e488773a77bcc9be40d50f68100f389369f --- /dev/null +++ b/src/clap_embedding/Rumble.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5841287b1e460ce58d7d2af46a7b993c99ad54750363b20f0741eb19f2af0b0a +size 3159 diff --git a/src/clap_embedding/Run.pt b/src/clap_embedding/Run.pt new file mode 100644 index 0000000000000000000000000000000000000000..a9cafdcf79aac334e9b6d5999dbb4be5990655f2 --- /dev/null +++ b/src/clap_embedding/Run.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b67d2438073c59ee507572f7b99d5a2d4e9eed7a56c1b8050843fe506a0b9a +size 3144 diff --git a/src/clap_embedding/Rustle.pt b/src/clap_embedding/Rustle.pt new file mode 100644 index 0000000000000000000000000000000000000000..977be534b1e7f72f504b67547a74ba11752e30f8 --- /dev/null +++ b/src/clap_embedding/Rustle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d91c2a60dc6ae4caf4e0b0383b67fa95c236ad1fdf98dedab15a87fb424518a4 +size 3159 diff --git a/src/clap_embedding/Sailboat, sailing ship.pt b/src/clap_embedding/Sailboat, sailing ship.pt new file mode 100644 index 0000000000000000000000000000000000000000..0a3100e6542813f7bca17862f1f9416a09d4333e --- /dev/null +++ b/src/clap_embedding/Sailboat, sailing ship.pt @@ -0,0 +1,3 @@ 
+version https://git-lfs.github.com/spec/v1 +oid sha256:b84653ffb1a8b677e595c6159b24207fe0c9f0ee91eb71ca52f2b23ef81146c5 +size 3367 diff --git a/src/clap_embedding/Sanding.pt b/src/clap_embedding/Sanding.pt new file mode 100644 index 0000000000000000000000000000000000000000..473bb111ea1decd8766c9794fde5956f12857ca8 --- /dev/null +++ b/src/clap_embedding/Sanding.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f890864e8a22aa47d20c0c2af311e276d05dcda92bfab1b68f6841430242986a +size 3228 diff --git a/src/clap_embedding/Sawing.pt b/src/clap_embedding/Sawing.pt new file mode 100644 index 0000000000000000000000000000000000000000..3faf9cd66dff1caba36461dadc93ced88ecc0e7b --- /dev/null +++ b/src/clap_embedding/Sawing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bacbfd0c732cbe21b8f644cf6d6fc771e85826a26c173551f08382a4b02f4ba4 +size 3159 diff --git a/src/clap_embedding/Scissors.pt b/src/clap_embedding/Scissors.pt new file mode 100644 index 0000000000000000000000000000000000000000..cc6a83f968a3029516391b83c7b674fe8826fcef --- /dev/null +++ b/src/clap_embedding/Scissors.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4dbcd38deb86195fb990edfa081e88c31c4e3f4941b4ed5f608392537e9b8a +size 3233 diff --git a/src/clap_embedding/Scrape.pt b/src/clap_embedding/Scrape.pt new file mode 100644 index 0000000000000000000000000000000000000000..e12fb24288c967675410fdb6a6be9f5f91ad9709 --- /dev/null +++ b/src/clap_embedding/Scrape.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ffb19bb391fee97d25947a2da06bb12f9ead7e96c7c2e3fe9797a40e6e1671d +size 3159 diff --git a/src/clap_embedding/Scratch.pt b/src/clap_embedding/Scratch.pt new file mode 100644 index 0000000000000000000000000000000000000000..dad5357db15ee2f145454292b62d491172d26af3 --- /dev/null +++ b/src/clap_embedding/Scratch.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:61dc95b0ae1d967501541962717700883c85438e7ea5b2e1bfb949cc03556832 +size 3228 diff --git a/src/clap_embedding/Screaming.pt b/src/clap_embedding/Screaming.pt new file mode 100644 index 0000000000000000000000000000000000000000..e02b2c807d75ad51baf81164c3799933c8f8734f --- /dev/null +++ b/src/clap_embedding/Screaming.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe99964b7ff8bcc92ff8ec8ac9a5df585442b176a85752813b4af432569b80e2 +size 3238 diff --git a/src/clap_embedding/Screech.pt b/src/clap_embedding/Screech.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b36b40ff8f09c78135bac6ab321710960b9ffc0 --- /dev/null +++ b/src/clap_embedding/Screech.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:362fe58a9415185a158e709b7834fc2543c33592be6a848554bdf6db388cdf40 +size 3228 diff --git a/src/clap_embedding/Sewing machine.pt b/src/clap_embedding/Sewing machine.pt new file mode 100644 index 0000000000000000000000000000000000000000..d6ad496b62f833ebc9204d91dd552820421f9109 --- /dev/null +++ b/src/clap_embedding/Sewing machine.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85e399a586e89d72e3c4c1954b562ae1b18297cc9e4d9ad4f926371ee47886a6 +size 3263 diff --git a/src/clap_embedding/Sheep.pt b/src/clap_embedding/Sheep.pt new file mode 100644 index 0000000000000000000000000000000000000000..64ed813373c7988c3daaa71fbeab502b84bed748 --- /dev/null +++ b/src/clap_embedding/Sheep.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2db2b1452cfab53307c85d9f0a4a4441d19f2b13dbddbd6601ea9067d3e287e3 +size 3154 diff --git a/src/clap_embedding/Ship.pt b/src/clap_embedding/Ship.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a5202ff68ca23b07842392c599e37a0d25db353 --- /dev/null +++ b/src/clap_embedding/Ship.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee5473dc224637f309ac5b02a17f2a17b0307a7d2b6d4e67c4c36bd22e1359d9 
+size 3149 diff --git a/src/clap_embedding/Shout.pt b/src/clap_embedding/Shout.pt new file mode 100644 index 0000000000000000000000000000000000000000..eb744da1e0ddfcf45cc04be5582f6d8f260fd450 --- /dev/null +++ b/src/clap_embedding/Shout.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dee5b1ef3d6a7864620dc55fabfa2c4cbe3a7e42d2617c931de1760c08fa6bf3 +size 3154 diff --git a/src/clap_embedding/Shower.pt b/src/clap_embedding/Shower.pt new file mode 100644 index 0000000000000000000000000000000000000000..99f3cdc00b5b7f5b95149944d85661c9d2fbef20 --- /dev/null +++ b/src/clap_embedding/Shower.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75bf08366958fa93cadfe49cf3d7d83976dc713adbdc378bbcb34fdc94a3618e +size 3159 diff --git a/src/clap_embedding/Shuffle.pt b/src/clap_embedding/Shuffle.pt new file mode 100644 index 0000000000000000000000000000000000000000..32e81fd33031636330274d43343a7182df793d1c --- /dev/null +++ b/src/clap_embedding/Shuffle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d2ec314c82930f21876dea07600c19ff8043b12912c65e29c802e82a947d06 +size 3228 diff --git a/src/clap_embedding/Shuffling cards.pt b/src/clap_embedding/Shuffling cards.pt new file mode 100644 index 0000000000000000000000000000000000000000..dd5390ada35da03fe20d1a7feb63f64053141068 --- /dev/null +++ b/src/clap_embedding/Shuffling cards.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a048aded620e54c0dd6ed2597c911bc60ed2654141e054a5c7b54e262abbfe8b +size 3268 diff --git a/src/clap_embedding/Sigh.pt b/src/clap_embedding/Sigh.pt new file mode 100644 index 0000000000000000000000000000000000000000..28205b0ee0c0ab980aad590b9a7c323bbaf7ffde --- /dev/null +++ b/src/clap_embedding/Sigh.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5750186cb9ba3843db7d42e0bb481c9bd7341229a7342d46c841e53b8495a4f3 +size 3149 diff --git a/src/clap_embedding/Silence.pt 
b/src/clap_embedding/Silence.pt new file mode 100644 index 0000000000000000000000000000000000000000..ba48c4a33aafc5f73c5a1dfdbe8b223899d9516b --- /dev/null +++ b/src/clap_embedding/Silence.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20f483b0a15a9112783a2af734059887da99e7ef9f0ac1202aa19123b6922021 +size 3228 diff --git a/src/clap_embedding/Sine wave.pt b/src/clap_embedding/Sine wave.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9c39b2b78bd69fcaf7852d2617710daecb123cf --- /dev/null +++ b/src/clap_embedding/Sine wave.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3516433ff2a36a6a85c8f45302a76c592858606b7e817b9b9b1981375015397b +size 3238 diff --git a/src/clap_embedding/Singing.pt b/src/clap_embedding/Singing.pt new file mode 100644 index 0000000000000000000000000000000000000000..60893345e65e62cab1bba32465aa44093dd5eb49 --- /dev/null +++ b/src/clap_embedding/Singing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc3778e5e307baaccc377faa269b199d10c629b9b7e161d354cfd450752e5597 +size 3228 diff --git a/src/clap_embedding/Single-lens reflex camera.pt b/src/clap_embedding/Single-lens reflex camera.pt new file mode 100644 index 0000000000000000000000000000000000000000..a8b51679a0eb2f90651b46a724a2e24581bfbff9 --- /dev/null +++ b/src/clap_embedding/Single-lens reflex camera.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bea76434967280b7f3b8b9538dc1c911f15fd4c406e506475f991b17cde2100 +size 3382 diff --git a/src/clap_embedding/Sink (filling or washing).pt b/src/clap_embedding/Sink (filling or washing).pt new file mode 100644 index 0000000000000000000000000000000000000000..dfd93422fca51d032fd1795f352e8ac7186ec609 --- /dev/null +++ b/src/clap_embedding/Sink (filling or washing).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b87bc0f4e699df0a8eb0ea91ac49aaabe6e0e79b44f78fa87c7842f5fd3587bb +size 3382 diff --git 
a/src/clap_embedding/Siren.pt b/src/clap_embedding/Siren.pt new file mode 100644 index 0000000000000000000000000000000000000000..bb12a2c775e1490ef2f3eb2e925d8913ff869e2a --- /dev/null +++ b/src/clap_embedding/Siren.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd444b58f40b5ce3c156e412d18dd81f0b4e931dba5e300698fe9fcc5274cfcb +size 3154 diff --git a/src/clap_embedding/Sizzle.pt b/src/clap_embedding/Sizzle.pt new file mode 100644 index 0000000000000000000000000000000000000000..db531c1f7e5da27ec066ea294e918aa437c66dbc --- /dev/null +++ b/src/clap_embedding/Sizzle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b37887b954624f2a742fa21d590da2263092d7f35a7b90de553c01986a33ac94 +size 3159 diff --git a/src/clap_embedding/Skateboard.pt b/src/clap_embedding/Skateboard.pt new file mode 100644 index 0000000000000000000000000000000000000000..119790d5f16c252432004c54c023c03deae0b3c1 --- /dev/null +++ b/src/clap_embedding/Skateboard.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e48fd7d0b929e69809961ad53d1e9ff47726e02e1897c845b7039280610fcb7d +size 3243 diff --git a/src/clap_embedding/Slam.pt b/src/clap_embedding/Slam.pt new file mode 100644 index 0000000000000000000000000000000000000000..c99561d2de4c110140a80eb35f8101c438c03aff --- /dev/null +++ b/src/clap_embedding/Slam.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3227c1faa8df76a72b5f22ddea288d8b5526585072611d6e12b154a8e94402 +size 3149 diff --git a/src/clap_embedding/Slap, smack.pt b/src/clap_embedding/Slap, smack.pt new file mode 100644 index 0000000000000000000000000000000000000000..542a356eb32ea789e1f3d9ff0293bf0e2a30c766 --- /dev/null +++ b/src/clap_embedding/Slap, smack.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c0282879265cde410b006f3d6f2e22c496cccb6168335607daba2b60770a0ce9 +size 3248 diff --git a/src/clap_embedding/Sliding door.pt b/src/clap_embedding/Sliding door.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..d710cc3abd0f2f884b77dc8fbc2aa979ea1bc3f6 --- /dev/null +++ b/src/clap_embedding/Sliding door.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fa5e12a6c434dcbb4995f72db1ff2276f40e299cf63ea558b1fadaa0232acb6 +size 3253 diff --git a/src/clap_embedding/Slosh.pt b/src/clap_embedding/Slosh.pt new file mode 100644 index 0000000000000000000000000000000000000000..de1376ac6a2045f09fef00db4b2d4ef50c45af8d --- /dev/null +++ b/src/clap_embedding/Slosh.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7589829816ec49c3b5ff4e46f4668813c7e1fa9cdd9ae580ed8a258b3fb49b8 +size 3154 diff --git a/src/clap_embedding/Slurp, drinking straw.pt b/src/clap_embedding/Slurp, drinking straw.pt new file mode 100644 index 0000000000000000000000000000000000000000..b7fef74715d573eef15b7ebf6cf4486f789a047a --- /dev/null +++ b/src/clap_embedding/Slurp, drinking straw.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb443d49a3500576e8c654fb88b1ef6bd1680f71fd8bb416a2fbf6b96b80f9ae +size 3298 diff --git a/src/clap_embedding/Smash, crash.pt b/src/clap_embedding/Smash, crash.pt new file mode 100644 index 0000000000000000000000000000000000000000..252fa1b509faa89b4dce204a8a5178077311382a --- /dev/null +++ b/src/clap_embedding/Smash, crash.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6393449e60fb62e5f040bb5ea539a1259a24fca18b188c43c0e3766565a02961 +size 3253 diff --git a/src/clap_embedding/Smoke detector, smoke alarm.pt b/src/clap_embedding/Smoke detector, smoke alarm.pt new file mode 100644 index 0000000000000000000000000000000000000000..4b363e1a5e33b0305fd31cf0631b05df70760ea5 --- /dev/null +++ b/src/clap_embedding/Smoke detector, smoke alarm.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a18cc23764c26a4c1cae7f1e573c1661afa18f17d58db4d091daad6b6202a58a +size 3392 diff --git a/src/clap_embedding/Snake.pt 
b/src/clap_embedding/Snake.pt new file mode 100644 index 0000000000000000000000000000000000000000..b562c6e15c4e3331fb688542c2ca1ede667de48f --- /dev/null +++ b/src/clap_embedding/Snake.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a02d57972ae90981b548324f353bb8d1f99136857ef98d39763c1c67bf5f8797 +size 3154 diff --git a/src/clap_embedding/Snap.pt b/src/clap_embedding/Snap.pt new file mode 100644 index 0000000000000000000000000000000000000000..3b3ee56a158d482000e12b2f07402cf3f560dd40 --- /dev/null +++ b/src/clap_embedding/Snap.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b38213d58fe348f6120012863f3971172ec9a542c44a32b105c04475857d41a0 +size 3149 diff --git a/src/clap_embedding/Sneeze.pt b/src/clap_embedding/Sneeze.pt new file mode 100644 index 0000000000000000000000000000000000000000..37194b9318f85daedeb4744225db657f78f1653b --- /dev/null +++ b/src/clap_embedding/Sneeze.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e0aadeda795d87aa13c6ee800b9af4b93aeb99826b5a239d1cff1bc49e39c75 +size 3159 diff --git a/src/clap_embedding/Snicker.pt b/src/clap_embedding/Snicker.pt new file mode 100644 index 0000000000000000000000000000000000000000..d9912c7046b2de804b6e30dba782d92af7bde7b7 --- /dev/null +++ b/src/clap_embedding/Snicker.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff4b7c0957599c29a4f355620422bb15beee4fad8f09da9a09db2b1ae70a9d37 +size 3228 diff --git a/src/clap_embedding/Sniff.pt b/src/clap_embedding/Sniff.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cc7523bda7335e8138660eafc21ba41cb43e3a9 --- /dev/null +++ b/src/clap_embedding/Sniff.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20aeff137ec0f35d238ad4a3f3f46489a3e1722ebed74b3173059886f0ef2c5c +size 3154 diff --git a/src/clap_embedding/Snoring.pt b/src/clap_embedding/Snoring.pt new file mode 100644 index 
0000000000000000000000000000000000000000..05eb01bceddc756b2e217b2bac9360989960d2df --- /dev/null +++ b/src/clap_embedding/Snoring.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc8505a445c21c3151b0d2400ade3b0a7758d4283b51786c9d1d0d00f20a9aed +size 3228 diff --git a/src/clap_embedding/Snort (horse).pt b/src/clap_embedding/Snort (horse).pt new file mode 100644 index 0000000000000000000000000000000000000000..0f552f934147cbe49359e052d0c19aef7f65a237 --- /dev/null +++ b/src/clap_embedding/Snort (horse).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fef8b21e5574923cc6b5d47f2cfddc40daf2937524ac09328d876f74f7f6d06 +size 3258 diff --git a/src/clap_embedding/Snort.pt b/src/clap_embedding/Snort.pt new file mode 100644 index 0000000000000000000000000000000000000000..52686261989cbdd97b215508b40e3f60eb171958 --- /dev/null +++ b/src/clap_embedding/Snort.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1fcc1a09a55113732db440134c4a31f52f5ee9b8940bb228448de6be6e191f +size 3154 diff --git a/src/clap_embedding/Sonar.pt b/src/clap_embedding/Sonar.pt new file mode 100644 index 0000000000000000000000000000000000000000..4c3a85f6d2d632d0f0fc6b8719b9de889784fe73 --- /dev/null +++ b/src/clap_embedding/Sonar.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50ee2bc6ccff3dfadb0c5c36a477769aaf228e61e853620cc21c13be98d89b80 +size 3154 diff --git a/src/clap_embedding/Sonic boom.pt b/src/clap_embedding/Sonic boom.pt new file mode 100644 index 0000000000000000000000000000000000000000..7a3460afdd2141fbb0762073fac093a2e5837413 --- /dev/null +++ b/src/clap_embedding/Sonic boom.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43a732f75f78f03ed245351838fa6e6edfe3e613497371722a5383b02407bd00 +size 3243 diff --git a/src/clap_embedding/Sound effect.pt b/src/clap_embedding/Sound effect.pt new file mode 100644 index 
0000000000000000000000000000000000000000..40ce86bfa1ba121b2bccb138166c8c2abad2f1aa --- /dev/null +++ b/src/clap_embedding/Sound effect.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b3fd36f0be787e280615f31261f914bcdaada818bae3e0343ccb43965523f1 +size 3253 diff --git a/src/clap_embedding/Sound equipment.pt b/src/clap_embedding/Sound equipment.pt new file mode 100644 index 0000000000000000000000000000000000000000..c4b09e689dc879052249bd273afe50e7fd4aee5e --- /dev/null +++ b/src/clap_embedding/Sound equipment.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:daef5e030b11c5422b444a229e9959b922c158e7c6ee67814833b6612be5cd94 +size 3268 diff --git a/src/clap_embedding/Sound reproduction.pt b/src/clap_embedding/Sound reproduction.pt new file mode 100644 index 0000000000000000000000000000000000000000..1e03d80134e940ff464b0539843fc88b9e02a338 --- /dev/null +++ b/src/clap_embedding/Sound reproduction.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23d9b106ccb9e99eae6eee5390ddb19daab99db4da089c49af399d9f0400a42a +size 3283 diff --git a/src/clap_embedding/Source-ambiguous sounds.pt b/src/clap_embedding/Source-ambiguous sounds.pt new file mode 100644 index 0000000000000000000000000000000000000000..128ca97e6e67239a1295e7e7ab279956024a8596 --- /dev/null +++ b/src/clap_embedding/Source-ambiguous sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:189133b954b46f27fa16ddeff6c2594a909d9108f5e93d8f641cb729492c0e50 +size 3372 diff --git a/src/clap_embedding/Specific impact sounds.pt b/src/clap_embedding/Specific impact sounds.pt new file mode 100644 index 0000000000000000000000000000000000000000..9bcb9f789e4068dbd7569e11bdb3b4cf28cab658 --- /dev/null +++ b/src/clap_embedding/Specific impact sounds.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6331f5b36a82e1642d7b83a29cd7b410e2b449994a5aa438be768127e53a908d +size 3367 diff --git 
a/src/clap_embedding/Speech synthesizer.pt b/src/clap_embedding/Speech synthesizer.pt new file mode 100644 index 0000000000000000000000000000000000000000..0ba4322742933f93d74d664d5f2f62ec4d88f488 --- /dev/null +++ b/src/clap_embedding/Speech synthesizer.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4de3649edb117b62a9b3aa1ed891257ab9d91265c0b30d80d2804838493df1 +size 3283 diff --git a/src/clap_embedding/Speech.pt b/src/clap_embedding/Speech.pt new file mode 100644 index 0000000000000000000000000000000000000000..79fadc3e7ff441ddcb11a1abaf237a5c4112faa2 --- /dev/null +++ b/src/clap_embedding/Speech.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37c95cd8024212bd58eca350f5b1d01c51e34b46266c5566dd9a7a14cfc562a8 +size 3159 diff --git a/src/clap_embedding/Splash, splatter.pt b/src/clap_embedding/Splash, splatter.pt new file mode 100644 index 0000000000000000000000000000000000000000..e6b262cd80ab61e02caa9b179d568c4e7d4900ea --- /dev/null +++ b/src/clap_embedding/Splash, splatter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:136cf4ca3c30f40dfa712bee51306167da24109d1be0cd38da4679b6816ba5eb +size 3273 diff --git a/src/clap_embedding/Splinter.pt b/src/clap_embedding/Splinter.pt new file mode 100644 index 0000000000000000000000000000000000000000..2d80883e53f6af5db93d1370b662cda7a4bda833 --- /dev/null +++ b/src/clap_embedding/Splinter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bede63c74f3b8f49e5b53ebcfbec5f06667866621f1bf778e61a50db7ab21cea +size 3233 diff --git a/src/clap_embedding/Spray.pt b/src/clap_embedding/Spray.pt new file mode 100644 index 0000000000000000000000000000000000000000..62f0608d084a53e3ad3bd751cd555f18f86775b5 --- /dev/null +++ b/src/clap_embedding/Spray.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1b69bc4d7769aa658b8df2720e7a71474f8753ca6dabe9afeed378082fe32d6 +size 3154 diff --git a/src/clap_embedding/Squawk.pt 
b/src/clap_embedding/Squawk.pt new file mode 100644 index 0000000000000000000000000000000000000000..1965584e05c69cc82f13d0125cbd676cfc4c139a --- /dev/null +++ b/src/clap_embedding/Squawk.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8590596dc4a5515ae8a0a696de2394d0943db5f8f5cdc829bdc002182c65fa1d +size 3159 diff --git a/src/clap_embedding/Squeak.pt b/src/clap_embedding/Squeak.pt new file mode 100644 index 0000000000000000000000000000000000000000..3f17231d225f368b71f7e0a586563b6cc77cef1a --- /dev/null +++ b/src/clap_embedding/Squeak.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b24b9cf7c0df85ed9aa1ba520e529f5885ba2ae7dcd6d21ad679715622d0f99 +size 3159 diff --git a/src/clap_embedding/Squeal.pt b/src/clap_embedding/Squeal.pt new file mode 100644 index 0000000000000000000000000000000000000000..224bac5622639679be93601f534f3fbc62f342e7 --- /dev/null +++ b/src/clap_embedding/Squeal.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:62454ae2bc7334b7891df524e322f5274b5cee082cfbc061db7ae63653d924c7 +size 3159 diff --git a/src/clap_embedding/Squish.pt b/src/clap_embedding/Squish.pt new file mode 100644 index 0000000000000000000000000000000000000000..f931369b8878a1feb635f4ab2a243bf61dcb38d5 --- /dev/null +++ b/src/clap_embedding/Squish.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f34a0aaf0b84e1222c862740e00ab34d039e204b1ac22f782ffef1fc5f1a5bd6 +size 3159 diff --git a/src/clap_embedding/Stairs.pt b/src/clap_embedding/Stairs.pt new file mode 100644 index 0000000000000000000000000000000000000000..ef755b645b568f29ffe0b92a68b06c15a94481f3 --- /dev/null +++ b/src/clap_embedding/Stairs.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1f5d2fc8e05f1e0d9536248513b724bc32591e1d1d605e6129b40de058d01ac +size 3159 diff --git a/src/clap_embedding/Static.pt b/src/clap_embedding/Static.pt new file mode 100644 index 
0000000000000000000000000000000000000000..9475a35fc6cd8bac36ee4e369d8f217e3780053a --- /dev/null +++ b/src/clap_embedding/Static.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29ca0234ffa7ef20a6ec37c602108584760db276422572c923f7d6da5c87f85c +size 3159 diff --git a/src/clap_embedding/Steam whistle.pt b/src/clap_embedding/Steam whistle.pt new file mode 100644 index 0000000000000000000000000000000000000000..6056e21e93212c6b9d623abb57ec690a372dccf2 --- /dev/null +++ b/src/clap_embedding/Steam whistle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3551006678be39a3b40bc50204392b827797946645b801b2d2012dd5d1f3e84f +size 3258 diff --git a/src/clap_embedding/Steam.pt b/src/clap_embedding/Steam.pt new file mode 100644 index 0000000000000000000000000000000000000000..b450392e82f9ba8fd9e0b95f95450cb54828eaea --- /dev/null +++ b/src/clap_embedding/Steam.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a716c4b22c6c431c499fee4a26246b8ecde1ee7540bed55db96b9334341d7783 +size 3154 diff --git a/src/clap_embedding/Stir.pt b/src/clap_embedding/Stir.pt new file mode 100644 index 0000000000000000000000000000000000000000..39ea2ea1fe886e26ac74afaab2fbe90198e79f4f --- /dev/null +++ b/src/clap_embedding/Stir.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60ab6ddb8436d0913ca994f928257c5cd01aa0391803c2b660a8dcab77a9822a +size 3149 diff --git a/src/clap_embedding/Stomach rumble.pt b/src/clap_embedding/Stomach rumble.pt new file mode 100644 index 0000000000000000000000000000000000000000..025fe446b7fa88cd44de4475b60c73d51094a652 --- /dev/null +++ b/src/clap_embedding/Stomach rumble.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e09aa98387104c473915e421b9d2176745edc3e44d7bc73bb69658e749be50c4 +size 3263 diff --git a/src/clap_embedding/Stomp, stamp.pt b/src/clap_embedding/Stomp, stamp.pt new file mode 100644 index 
0000000000000000000000000000000000000000..b5c4d572d71e43d39e8bd74295ce107cdea8267c --- /dev/null +++ b/src/clap_embedding/Stomp, stamp.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:834d6068d20e73358d01dc98e83b5f12c40240912923ec7d8257bb89d66d7b68 +size 3253 diff --git a/src/clap_embedding/Stream, river.pt b/src/clap_embedding/Stream, river.pt new file mode 100644 index 0000000000000000000000000000000000000000..74461dbb078d2e3b3141bcee4a77e00adf7df48b --- /dev/null +++ b/src/clap_embedding/Stream, river.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a055d188d57d9ce6333a2f2877f8fc275bca4ae5fadb4f62cafde8bc3ab6b6bb +size 3258 diff --git a/src/clap_embedding/Studio recording.pt b/src/clap_embedding/Studio recording.pt new file mode 100644 index 0000000000000000000000000000000000000000..0df4fa6fd752fb19d8bbedd02938bcad33964049 --- /dev/null +++ b/src/clap_embedding/Studio recording.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42dfc387875375cf1d6d3c8a4c752dd7254728bda15d829227702c6ea2219fa1 +size 3273 diff --git a/src/clap_embedding/Subway, metro, underground.pt b/src/clap_embedding/Subway, metro, underground.pt new file mode 100644 index 0000000000000000000000000000000000000000..3bc6e2865d351bc22e82bb205b8fed9bdb910ec3 --- /dev/null +++ b/src/clap_embedding/Subway, metro, underground.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b9d896418e3a03c3c0586b5de49cb03f55dd359498cfc50f2e07ba28566c63d +size 3387 diff --git a/src/clap_embedding/Surface contact.pt b/src/clap_embedding/Surface contact.pt new file mode 100644 index 0000000000000000000000000000000000000000..c272bf88111c93f658edaff00b907282596103d7 --- /dev/null +++ b/src/clap_embedding/Surface contact.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60b049ee09c3fde0ae48298ea88bdf1681186c2f2b74ccbc198a01359726f9a9 +size 3268 diff --git a/src/clap_embedding/Sweeping.pt 
b/src/clap_embedding/Sweeping.pt new file mode 100644 index 0000000000000000000000000000000000000000..528ca0f86cf57c2724f4cce0efae9785e4d57333 --- /dev/null +++ b/src/clap_embedding/Sweeping.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53c893ec4cb3053244811c01dbefbe3048bc4016ac0f7d1c3d70cb1f9e290dbe +size 3233 diff --git a/src/clap_embedding/Synthetic singing.pt b/src/clap_embedding/Synthetic singing.pt new file mode 100644 index 0000000000000000000000000000000000000000..45400f23721155c332fbfc9a947a169fac58c2a1 --- /dev/null +++ b/src/clap_embedding/Synthetic singing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27b6846bd22b40b8ea8cf8295077f119bc0115097de1e39fafeb1402c8db56f4 +size 3278 diff --git a/src/clap_embedding/Tap dance.pt b/src/clap_embedding/Tap dance.pt new file mode 100644 index 0000000000000000000000000000000000000000..409f8db475b669dce39f0fd2b925287110744dd2 --- /dev/null +++ b/src/clap_embedding/Tap dance.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b6472355f0dd3add8d9029b315ae94725935d510c7f7a57dc665d0669c7fbd3 +size 3238 diff --git a/src/clap_embedding/Tap.pt b/src/clap_embedding/Tap.pt new file mode 100644 index 0000000000000000000000000000000000000000..777927f7ec8c29283d1f0c9d625d2be19db003aa --- /dev/null +++ b/src/clap_embedding/Tap.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55ba2a7c8ed5fe393d174697822ddc5e4db6b72004fda1bab5f9e9b46d0b6cc7 +size 3144 diff --git a/src/clap_embedding/Tape hiss.pt b/src/clap_embedding/Tape hiss.pt new file mode 100644 index 0000000000000000000000000000000000000000..f2cc2d1ddaa98822f9f01fd05fcd95eccb1545a1 --- /dev/null +++ b/src/clap_embedding/Tape hiss.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86578e2c9c4c72f8affb909feef104551d373dfadf621b70fdcf707991c6d1c8 +size 3238 diff --git a/src/clap_embedding/Tearing.pt b/src/clap_embedding/Tearing.pt new file mode 100644 
index 0000000000000000000000000000000000000000..e45a34dc1acfcd02eaa5a22a8a5b5c0c2f95480a --- /dev/null +++ b/src/clap_embedding/Tearing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6141893cc7773eef4425721d40e25951c265123ade273d77c061adae8b724f13 +size 3228 diff --git a/src/clap_embedding/Telephone bell ringing.pt b/src/clap_embedding/Telephone bell ringing.pt new file mode 100644 index 0000000000000000000000000000000000000000..1d3f93ca0a389774745254bb772712a3e3ab046e --- /dev/null +++ b/src/clap_embedding/Telephone bell ringing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5ac738aee7e572f3e5eb9f1dc4d06664a9700e5ebf1aeb910f34f5eb789d945 +size 3367 diff --git a/src/clap_embedding/Telephone dialing, DTMF.pt b/src/clap_embedding/Telephone dialing, DTMF.pt new file mode 100644 index 0000000000000000000000000000000000000000..76361e485da93e10345a89eff5efdfda50a049ed --- /dev/null +++ b/src/clap_embedding/Telephone dialing, DTMF.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a48b27ffe98c8a68bb5e52bbec24372e29824e5c4acb8a6536bfe755bfd5ffc +size 3372 diff --git a/src/clap_embedding/Telephone.pt b/src/clap_embedding/Telephone.pt new file mode 100644 index 0000000000000000000000000000000000000000..118bae8498e359565e2b0d8aa692a277e8879beb --- /dev/null +++ b/src/clap_embedding/Telephone.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8ee36aab919ce4c72f774ab280e1d0706b2c082a930e717c8c8efa6d304e596 +size 3238 diff --git a/src/clap_embedding/Television.pt b/src/clap_embedding/Television.pt new file mode 100644 index 0000000000000000000000000000000000000000..89fd866fc43831650e8bfeaa1e8b6652341446ca --- /dev/null +++ b/src/clap_embedding/Television.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce6f07f37aeacd8b4e17db4d2740fd0c31f6485f14e1c62a1fda9bb8bbf07d94 +size 3243 diff --git a/src/clap_embedding/Throat clearing.pt 
b/src/clap_embedding/Throat clearing.pt new file mode 100644 index 0000000000000000000000000000000000000000..adc553ce1fbfa4d87579dc7a608f5e0cf3cd8a4e --- /dev/null +++ b/src/clap_embedding/Throat clearing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de1e793ae8b9289db47edf95e92182798f5072a4053f7bb4fb1e35b8f8931666 +size 3268 diff --git a/src/clap_embedding/Throbbing.pt b/src/clap_embedding/Throbbing.pt new file mode 100644 index 0000000000000000000000000000000000000000..30cacaff6db2a5f8c007056950e7d8a412def99b --- /dev/null +++ b/src/clap_embedding/Throbbing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50c8088a5a5e2d868adbb1972ff0052ee0fc5b15c17ee2a47968ef4181653b9d +size 3238 diff --git a/src/clap_embedding/Thump, thud.pt b/src/clap_embedding/Thump, thud.pt new file mode 100644 index 0000000000000000000000000000000000000000..6104242aecb20faa7379f3e7b6c3a4fa8691f1c4 --- /dev/null +++ b/src/clap_embedding/Thump, thud.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecb7028b41e9a3cc51d6b1c32c5d2dd3c7171dcc13a60b6f8feb50ee80c7f1e7 +size 3248 diff --git a/src/clap_embedding/Thunder.pt b/src/clap_embedding/Thunder.pt new file mode 100644 index 0000000000000000000000000000000000000000..895e6a9ac042124ce826cc639f97d8c60971e4ed --- /dev/null +++ b/src/clap_embedding/Thunder.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebd69fa38e63db2b17d70efac13ead7dc3ae60ad578663323bb05d4044e41038 +size 3228 diff --git a/src/clap_embedding/Thunderstorm.pt b/src/clap_embedding/Thunderstorm.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1cd0891bddbc509ca0cc149a5c84f3d23ac188c --- /dev/null +++ b/src/clap_embedding/Thunderstorm.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:299c83adacb1e89efc87be09251aa725f6d7d5ef7ed28cbfc440e7fdcfb93999 +size 3253 diff --git a/src/clap_embedding/Thunk.pt b/src/clap_embedding/Thunk.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..82f8a20c91766cc067bf35318124b5ecdbd9fa4f --- /dev/null +++ b/src/clap_embedding/Thunk.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:37e6c3eba7f4b5ce85bdec76f4cb3e342e0e7e48e7d183cfe337401a2fdfe0c9 +size 3154 diff --git a/src/clap_embedding/Tick-tock.pt b/src/clap_embedding/Tick-tock.pt new file mode 100644 index 0000000000000000000000000000000000000000..c013b691fa27d94d9ddceaf69c2a6804309a3755 --- /dev/null +++ b/src/clap_embedding/Tick-tock.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:024382c2954b5c468d92ee56e5c366dc8b7db02a00c0c4a1859f02724c72d8a9 +size 3238 diff --git a/src/clap_embedding/Tick.pt b/src/clap_embedding/Tick.pt new file mode 100644 index 0000000000000000000000000000000000000000..e70c76c9ecd60e26c5ea376190b8a52e1745cfc7 --- /dev/null +++ b/src/clap_embedding/Tick.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aa78085fff34531044b9e883b138c46064def8e702324eda6ba526818ad0fad +size 3149 diff --git a/src/clap_embedding/Tire squeal, skidding.pt b/src/clap_embedding/Tire squeal, skidding.pt new file mode 100644 index 0000000000000000000000000000000000000000..593f6e73cffbe28d720c3f13121824d43cee3e36 --- /dev/null +++ b/src/clap_embedding/Tire squeal, skidding.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35956886225fd4579a1fbdbe29fd80958c2a48c9e2484093ec68a62e7c58ca97 +size 3298 diff --git a/src/clap_embedding/Toilet flush.pt b/src/clap_embedding/Toilet flush.pt new file mode 100644 index 0000000000000000000000000000000000000000..342b2b0ff44169477b3186fbc313badabd97b555 --- /dev/null +++ b/src/clap_embedding/Toilet flush.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cc535c7ca2d3f3f846894e4c324a664e63cb5d0da0bce3d92d2dd884ae75439 +size 3253 diff --git a/src/clap_embedding/Tools.pt b/src/clap_embedding/Tools.pt new file mode 100644 index 
0000000000000000000000000000000000000000..af6f9a12f04ffe2a556ca8118879057b779bdeaf --- /dev/null +++ b/src/clap_embedding/Tools.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4190669641863998f998eca3a4ca9e9000f926d4c2f128d2336aedbf50a9345e +size 3154 diff --git a/src/clap_embedding/Toothbrush.pt b/src/clap_embedding/Toothbrush.pt new file mode 100644 index 0000000000000000000000000000000000000000..cb5215e704243402ba57bfe24adad900004b1142 --- /dev/null +++ b/src/clap_embedding/Toothbrush.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a625cbd0c916320309bd4bd3089a370e724e76d0a5ab89c168975a17b5f6fdc9 +size 3243 diff --git a/src/clap_embedding/Traffic noise, roadway noise.pt b/src/clap_embedding/Traffic noise, roadway noise.pt new file mode 100644 index 0000000000000000000000000000000000000000..6b8f715885404087418a37a182ff68aba825de62 --- /dev/null +++ b/src/clap_embedding/Traffic noise, roadway noise.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df05e0ff3df49bb6e6ddc7b2856295b295b61dd8575b2efe3e6ad322b2f9ee9d +size 3397 diff --git a/src/clap_embedding/Train horn.pt b/src/clap_embedding/Train horn.pt new file mode 100644 index 0000000000000000000000000000000000000000..09422a3343fd7eab0cd2ed2af3ed34e8adf4f59b --- /dev/null +++ b/src/clap_embedding/Train horn.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0387677b1843d09c80ee420da1fedd2d6e241c6b8dec8054556ffbf79bf5ae4b +size 3243 diff --git a/src/clap_embedding/Train wheels squealing.pt b/src/clap_embedding/Train wheels squealing.pt new file mode 100644 index 0000000000000000000000000000000000000000..e92b5806f20ce985b2f669b2488bf307b24c3658 --- /dev/null +++ b/src/clap_embedding/Train wheels squealing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c70e89054e81418379f11f7a9211323917ac890dc841a8a9b5acc6cdbc94f8eb +size 3367 diff --git a/src/clap_embedding/Train whistle.pt 
b/src/clap_embedding/Train whistle.pt new file mode 100644 index 0000000000000000000000000000000000000000..bccde0c54f18c8aec0d47b0cb738148ed8619b69 --- /dev/null +++ b/src/clap_embedding/Train whistle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06fa7b6f787ac7926676b0aaf32d50062b4e08a17d1713519320154ee6cdc4d3 +size 3258 diff --git a/src/clap_embedding/Train.pt b/src/clap_embedding/Train.pt new file mode 100644 index 0000000000000000000000000000000000000000..224826e57c1d09b6a886ab174104b509464fa05b --- /dev/null +++ b/src/clap_embedding/Train.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e2edb3948fe6586697230a6206a51767a04e3d55fcabc1ad68a6488f754e8e9 +size 3154 diff --git a/src/clap_embedding/Trickle, dribble.pt b/src/clap_embedding/Trickle, dribble.pt new file mode 100644 index 0000000000000000000000000000000000000000..435e56074bf43f9614ea58e526bb74da77003c34 --- /dev/null +++ b/src/clap_embedding/Trickle, dribble.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f6b0ae646842cdbfc2532c3c14f4ec221c9ce63f26d9b16358a4a09cc969137d +size 3273 diff --git a/src/clap_embedding/Truck.pt b/src/clap_embedding/Truck.pt new file mode 100644 index 0000000000000000000000000000000000000000..4bf5e4d3ebe58271f6092d851ae1fc81d6abf2eb --- /dev/null +++ b/src/clap_embedding/Truck.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:da5e6ecae24fde2d71bfdd87a31952dc3409a275bc2a1d791c9904ebd872c8d6 +size 3154 diff --git a/src/clap_embedding/Tuning fork.pt b/src/clap_embedding/Tuning fork.pt new file mode 100644 index 0000000000000000000000000000000000000000..46ab308826895a2b4872df12a25f06029349001c --- /dev/null +++ b/src/clap_embedding/Tuning fork.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a050c4f34fe4f820d0e1a780ca7dd7b213fd7d50b0958499b10657ffe626e166 +size 3248 diff --git a/src/clap_embedding/Turkey.pt b/src/clap_embedding/Turkey.pt new file mode 
100644 index 0000000000000000000000000000000000000000..63a1f70cdc5393b0b54ea3c222ae8360294255dc --- /dev/null +++ b/src/clap_embedding/Turkey.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40ed0f3eafea5a1f0b0a29540ec3e698cb3f93a34081bb8f68b85cfc787a0728 +size 3159 diff --git a/src/clap_embedding/Typewriter.pt b/src/clap_embedding/Typewriter.pt new file mode 100644 index 0000000000000000000000000000000000000000..8329e9318550bfe08a18414fbc6df4a4bfb7d9c8 --- /dev/null +++ b/src/clap_embedding/Typewriter.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af8005a7c0ae558eacf5ce78886bc7b895db63070dde79c4b6e8df1aa3aa1464 +size 3243 diff --git a/src/clap_embedding/Typing.pt b/src/clap_embedding/Typing.pt new file mode 100644 index 0000000000000000000000000000000000000000..79a8957fb61786714741f2c05a46e82e33b76a89 --- /dev/null +++ b/src/clap_embedding/Typing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e383e9164f08e8b1f8f064200dfee2642f61aa2bca135987126839a34340bf9 +size 3159 diff --git a/src/clap_embedding/Unknown sound.pt b/src/clap_embedding/Unknown sound.pt new file mode 100644 index 0000000000000000000000000000000000000000..e1219be7d270479ce21409f55d187ffdd7b34288 --- /dev/null +++ b/src/clap_embedding/Unknown sound.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7514d40d908be50d8bf31b00f8a79f41a48f31b6991d4f1f064da7b5a86a4ad +size 3258 diff --git a/src/clap_embedding/Unmodified field recording.pt b/src/clap_embedding/Unmodified field recording.pt new file mode 100644 index 0000000000000000000000000000000000000000..e00fe8ba0dac434b621c6325aea4dd9a6f49b750 --- /dev/null +++ b/src/clap_embedding/Unmodified field recording.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:981207dd299748b7bb38d1f3f6b96741a90f464e890c153a7558d1ebb9e0a89f +size 3387 diff --git a/src/clap_embedding/Vacuum cleaner.pt b/src/clap_embedding/Vacuum cleaner.pt new 
file mode 100644 index 0000000000000000000000000000000000000000..91a60f225eac41581b69d501c68c1969546b03da --- /dev/null +++ b/src/clap_embedding/Vacuum cleaner.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:041627d53e6a7f6327411834e64780c2ccbae00c94f8dd050e4ae60f7a2877a6 +size 3263 diff --git a/src/clap_embedding/Vehicle horn, car horn, honking, toot.pt b/src/clap_embedding/Vehicle horn, car horn, honking, toot.pt new file mode 100644 index 0000000000000000000000000000000000000000..9af7e79a455a715e802f8645120549e2f1750b9c --- /dev/null +++ b/src/clap_embedding/Vehicle horn, car horn, honking, toot.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8670dff21862b2acb7bba9a7b2d4ef489334e4e7b55f11571d83758bcbc3f772 +size 3442 diff --git a/src/clap_embedding/Vehicle.pt b/src/clap_embedding/Vehicle.pt new file mode 100644 index 0000000000000000000000000000000000000000..8b53ac667e40a3ea2bb29e9232e7d4cb016ff54f --- /dev/null +++ b/src/clap_embedding/Vehicle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76937d51e7669c98279aac973d53c2e54cb3356d70ebd7ffc1582b0894537148 +size 3228 diff --git a/src/clap_embedding/Velcro, hook and loop fastener.pt b/src/clap_embedding/Velcro, hook and loop fastener.pt new file mode 100644 index 0000000000000000000000000000000000000000..ce679d62a252749113785f720297f59050ca6c51 --- /dev/null +++ b/src/clap_embedding/Velcro, hook and loop fastener.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7caae04b5a41a738963a3d39a43a0b0f3fb12ad42eb0f38e323dff6e8d3248f +size 3407 diff --git a/src/clap_embedding/Vibration.pt b/src/clap_embedding/Vibration.pt new file mode 100644 index 0000000000000000000000000000000000000000..711a364825232393fa70dab3994d4c18451c8eb3 --- /dev/null +++ b/src/clap_embedding/Vibration.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:becf578a0e990c742425206d9ecea39bc08924340eef1d6aed76ee731a0f4906 
+size 3238 diff --git a/src/clap_embedding/Video game sound.pt b/src/clap_embedding/Video game sound.pt new file mode 100644 index 0000000000000000000000000000000000000000..6798a585ff78a8dd01341aaf23cfe29a230e708b --- /dev/null +++ b/src/clap_embedding/Video game sound.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2191b48c22656061db5280fbd62b6ba5ab797002a4409b750f64ce2d3c14f59d +size 3273 diff --git a/src/clap_embedding/Wail, moan.pt b/src/clap_embedding/Wail, moan.pt new file mode 100644 index 0000000000000000000000000000000000000000..028de47e189da24c13783dd09f6ddda0c0544cad --- /dev/null +++ b/src/clap_embedding/Wail, moan.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:086f052114f44199c589f2cfad2aa7c1dfa66636700d4a44e52e0ad5126c4211 +size 3243 diff --git a/src/clap_embedding/Walk, footsteps.pt b/src/clap_embedding/Walk, footsteps.pt new file mode 100644 index 0000000000000000000000000000000000000000..54cd7a405d4416445e874a3bc19dcb180350fa24 --- /dev/null +++ b/src/clap_embedding/Walk, footsteps.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c00586fd61144e163281e4b2b63d8f89051f7a683c46bc32afad04ca84c12df +size 3268 diff --git a/src/clap_embedding/Washing machine.pt b/src/clap_embedding/Washing machine.pt new file mode 100644 index 0000000000000000000000000000000000000000..d37d8c781f24c2b7fb25d6b1ec959c1c7b0706f0 --- /dev/null +++ b/src/clap_embedding/Washing machine.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2ba273fddd72a814c0b917800fff572cea36a5c498e52901da2907a96bc6daf +size 3268 diff --git a/src/clap_embedding/Water tap, faucet.pt b/src/clap_embedding/Water tap, faucet.pt new file mode 100644 index 0000000000000000000000000000000000000000..9201ebafa8a23cf8b9d79b2d2a00a4a6975d09cf --- /dev/null +++ b/src/clap_embedding/Water tap, faucet.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:0bec9ca097411c3f8c3492fc861f6867c5e9746e86310f68395bba2d7e3c6107 +size 3278 diff --git a/src/clap_embedding/Water.pt b/src/clap_embedding/Water.pt new file mode 100644 index 0000000000000000000000000000000000000000..044afb9cbabd385bcd8120e69d727ceef1be537b --- /dev/null +++ b/src/clap_embedding/Water.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20101244ba541dc3ff587066babd57503e1bdbd2b0448dae54a2577441dd7237 +size 3154 diff --git a/src/clap_embedding/Waterfall.pt b/src/clap_embedding/Waterfall.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ad76041febaf713f3fa67f500be9d03ceb195fb --- /dev/null +++ b/src/clap_embedding/Waterfall.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b99d2c004d241e416e4212ec850a102b62aa39127b81a1ccce0a08d8136e1586 +size 3238 diff --git a/src/clap_embedding/Waves, surf.pt b/src/clap_embedding/Waves, surf.pt new file mode 100644 index 0000000000000000000000000000000000000000..0cd2d73ae7732255edc508effe3559f204903b78 --- /dev/null +++ b/src/clap_embedding/Waves, surf.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:813c5b1b2d4983dd86d2a3ea1db4381299ce79bb6fd6833043f090d0cd9d8124 +size 3248 diff --git a/src/clap_embedding/Whack, thwack.pt b/src/clap_embedding/Whack, thwack.pt new file mode 100644 index 0000000000000000000000000000000000000000..c37eef90bd33d7c2efc4c2c7eb422273e9bfc5c1 --- /dev/null +++ b/src/clap_embedding/Whack, thwack.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bfb2c60e53fdfa1c37b70f2f080aa0d6cb78f629743afa4afedca5ac8d0e250 +size 3258 diff --git a/src/clap_embedding/Whale vocalization.pt b/src/clap_embedding/Whale vocalization.pt new file mode 100644 index 0000000000000000000000000000000000000000..7f73d3755a384fdde16877037124c868279df183 --- /dev/null +++ b/src/clap_embedding/Whale vocalization.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:902de63ed26f0db87bd9f61ba834b7be5c74f8ef2f4dc68fc6dc7a26f295ef0d +size 3283 diff --git a/src/clap_embedding/Wheeze.pt b/src/clap_embedding/Wheeze.pt new file mode 100644 index 0000000000000000000000000000000000000000..7ffff3e642e36d4810e7a7112943e5c97455ac77 --- /dev/null +++ b/src/clap_embedding/Wheeze.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:998dd427f0043c1568678b24de176ba4e914d8c0e09b207e4af356288941f36e +size 3159 diff --git a/src/clap_embedding/Whimper (dog).pt b/src/clap_embedding/Whimper (dog).pt new file mode 100644 index 0000000000000000000000000000000000000000..ce3c16bd08c9cc2f25822e9228e6e522d8b4882e --- /dev/null +++ b/src/clap_embedding/Whimper (dog).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efad386da63f157486fb6b0c181a49dc26ca086994386afc1415326a8e6be388 +size 3258 diff --git a/src/clap_embedding/Whimper.pt b/src/clap_embedding/Whimper.pt new file mode 100644 index 0000000000000000000000000000000000000000..a4527ca84e084fb934d84d89232fb87889bac21f --- /dev/null +++ b/src/clap_embedding/Whimper.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c8cb31c52dd575f1fc0c31cc2f4de03c3367e61c4b2e777e126778461cfa349 +size 3228 diff --git a/src/clap_embedding/Whip.pt b/src/clap_embedding/Whip.pt new file mode 100644 index 0000000000000000000000000000000000000000..ab54b68c91e0d836909ff90f50f675c5879c589d --- /dev/null +++ b/src/clap_embedding/Whip.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:65fbfbd1c601436d5ec2e39f374fa7cf4b3443ebb9c4b0b7a48c76ec5a4b4fe4 +size 3149 diff --git a/src/clap_embedding/Whir.pt b/src/clap_embedding/Whir.pt new file mode 100644 index 0000000000000000000000000000000000000000..e988c350b86f5563a5b9f8b2f08db29940fc7ca8 --- /dev/null +++ b/src/clap_embedding/Whir.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bec9bc40322bf71b45066ca9c8bd620d8ecb44a810d27df3b47d3bc292912496 +size 3149 diff 
--git a/src/clap_embedding/Whispering.pt b/src/clap_embedding/Whispering.pt new file mode 100644 index 0000000000000000000000000000000000000000..54c0e4d288a27553412fccddec7ee746a2571b06 --- /dev/null +++ b/src/clap_embedding/Whispering.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97830acb0750847195370ce05db640efc3465d3c9fe05cc921c8a20a87bdfb93 +size 3243 diff --git a/src/clap_embedding/Whistle.pt b/src/clap_embedding/Whistle.pt new file mode 100644 index 0000000000000000000000000000000000000000..b2ffdc13f0f15cf881561753444d11a8cf3cf601 --- /dev/null +++ b/src/clap_embedding/Whistle.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b89dbf299ea4ec7668f0c006f258e55c6b48d4db93508a6e7db60daf1a6b26c3 +size 3228 diff --git a/src/clap_embedding/Whistling.pt b/src/clap_embedding/Whistling.pt new file mode 100644 index 0000000000000000000000000000000000000000..fbad512b46ec07cff71cc518e2156eeae595237d --- /dev/null +++ b/src/clap_embedding/Whistling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:180c25e7ad471796d998769d9e9516ce43afab3efd503b66111281b3529f46d4 +size 3238 diff --git a/src/clap_embedding/White noise, pink noise.pt b/src/clap_embedding/White noise, pink noise.pt new file mode 100644 index 0000000000000000000000000000000000000000..7cd8292eea3a80c4e9b8a04b774377ff9511adb4 --- /dev/null +++ b/src/clap_embedding/White noise, pink noise.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:608568ef6c7fa44c7ba6756b09a42c0ce24b12c0126bca33a4d311a435471eb5 +size 3372 diff --git a/src/clap_embedding/Whoop.pt b/src/clap_embedding/Whoop.pt new file mode 100644 index 0000000000000000000000000000000000000000..f84f085b9023f2471bac826b6bf319b7c47e34b5 --- /dev/null +++ b/src/clap_embedding/Whoop.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0630bd9135b89fb37c1efc211ca8db5660bbd321c7e02d4c2a6042582f221e90 +size 3154 diff --git 
a/src/clap_embedding/Whoosh, swoosh, swish.pt b/src/clap_embedding/Whoosh, swoosh, swish.pt new file mode 100644 index 0000000000000000000000000000000000000000..aea4578f6b13b9bf574e5c7323c85ed45efec3a5 --- /dev/null +++ b/src/clap_embedding/Whoosh, swoosh, swish.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:655161f88d4cb875dfb0cb67c19dc5459738504ec2d7cc167f2e56d8eae10fad +size 3298 diff --git a/src/clap_embedding/Wild animals.pt b/src/clap_embedding/Wild animals.pt new file mode 100644 index 0000000000000000000000000000000000000000..67aa7aa2fecafddf3eaf5f8fc310fc2af7b147a0 --- /dev/null +++ b/src/clap_embedding/Wild animals.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73231d8ef5880c3447d92719175077f1a07d099fa1a8925c3a747a56b4ea3d70 +size 3253 diff --git a/src/clap_embedding/Wildfire.pt b/src/clap_embedding/Wildfire.pt new file mode 100644 index 0000000000000000000000000000000000000000..3141bddd942d7cde2b31db14e92ed34c89ee6b9c --- /dev/null +++ b/src/clap_embedding/Wildfire.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92ae8330fd3b4418f815d2c1b9806e4584a8f8cb5b4050e517edf53c1552e206 +size 3233 diff --git a/src/clap_embedding/Wind chime.pt b/src/clap_embedding/Wind chime.pt new file mode 100644 index 0000000000000000000000000000000000000000..fe0614516ec76847cd986f50254a0e7a897c744e --- /dev/null +++ b/src/clap_embedding/Wind chime.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fcb9d8c41dfc644f52911f090fc8dfbf9dedc7c32e7ddf6cb4ed44ac7c8a542 +size 3243 diff --git a/src/clap_embedding/Wind noise (microphone).pt b/src/clap_embedding/Wind noise (microphone).pt new file mode 100644 index 0000000000000000000000000000000000000000..984aec2f2a2e24babb430c79cdaf677fbecab50d --- /dev/null +++ b/src/clap_embedding/Wind noise (microphone).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:787eafb22625b870eaf5a72779a4a787e23e98864cc3e5cae181fffe5f628718 +size 3372 diff --git a/src/clap_embedding/Wind.pt b/src/clap_embedding/Wind.pt new file mode 100644 index 0000000000000000000000000000000000000000..899ceb350bd3e6d8388a864edae703faf08651ed --- /dev/null +++ b/src/clap_embedding/Wind.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6726785b453e9ff4b189e162bebb9b530e05956b22c27893ca5dcd93a75f7427 +size 3149 diff --git a/src/clap_embedding/Windscreen wiper, windshield wiper.pt b/src/clap_embedding/Windscreen wiper, windshield wiper.pt new file mode 100644 index 0000000000000000000000000000000000000000..5c1ce269e9050228bd4721b5f40b7fd158555305 --- /dev/null +++ b/src/clap_embedding/Windscreen wiper, windshield wiper.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cc50ad8fb6f26c7b5b198171729183adf414d1ed3574c59a4750636a2634515f +size 3427 diff --git a/src/clap_embedding/Wobble.pt b/src/clap_embedding/Wobble.pt new file mode 100644 index 0000000000000000000000000000000000000000..7577f726e57e88df0d78010eb710137f235b3a0a --- /dev/null +++ b/src/clap_embedding/Wobble.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38905dfb3838dce8882a2bcfea45435a4b974fcae0102d72736f6a4f64c70766 +size 3159 diff --git a/src/clap_embedding/Wolf-whistling.pt b/src/clap_embedding/Wolf-whistling.pt new file mode 100644 index 0000000000000000000000000000000000000000..ae8e72b4ae59f72ae65068d95b99c46aa343b895 --- /dev/null +++ b/src/clap_embedding/Wolf-whistling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6c8e4e5d987883489dfe7a0b76d505055360f04630829feb6c53c955adb2679 +size 3263 diff --git a/src/clap_embedding/Wood.pt b/src/clap_embedding/Wood.pt new file mode 100644 index 0000000000000000000000000000000000000000..9370ae57f2ea93ff51374e59a42e56b0c264ed8a --- /dev/null +++ b/src/clap_embedding/Wood.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:7d66bac6f731383860206bda2f617ab133da333866d10743287126bdd4a84493 +size 3149 diff --git a/src/clap_embedding/Writing.pt b/src/clap_embedding/Writing.pt new file mode 100644 index 0000000000000000000000000000000000000000..64d5b76be14875b136d5d2e3ba92238934880e94 --- /dev/null +++ b/src/clap_embedding/Writing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc2b1c04d5ccd39e0066805ec073ac957e5efd6942e76296ad9dc741c449864e +size 3228 diff --git a/src/clap_embedding/Yak.pt b/src/clap_embedding/Yak.pt new file mode 100644 index 0000000000000000000000000000000000000000..122d56a22965c37ffa3e43117b036c1bce1d1858 --- /dev/null +++ b/src/clap_embedding/Yak.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa12b0d5dba3137ea4287649ea70f0bef11f105d12104bfa279e56f325b24f4f +size 3144 diff --git a/src/clap_embedding/Yawn.pt b/src/clap_embedding/Yawn.pt new file mode 100644 index 0000000000000000000000000000000000000000..1621c2bd56ad0a082b46e90ca35c07b184b55272 --- /dev/null +++ b/src/clap_embedding/Yawn.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed77f715107c73b2e60dc3694829e4f5e847f295be1b378df54c61ed5ce434c +size 3149 diff --git a/src/clap_embedding/Yell.pt b/src/clap_embedding/Yell.pt new file mode 100644 index 0000000000000000000000000000000000000000..1cfd40425eb8aee705cc79bb7661a3ced12331c4 --- /dev/null +++ b/src/clap_embedding/Yell.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b26b9d816dff78943857c195ea37de1c672442ba41446088c5d32a45cc86bc3b +size 3149 diff --git a/src/clap_embedding/Yip.pt b/src/clap_embedding/Yip.pt new file mode 100644 index 0000000000000000000000000000000000000000..b6bfe25e67749cdd12d9139fe99b7efb019f2973 --- /dev/null +++ b/src/clap_embedding/Yip.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f05b6be2c6f1e49a84c65a720f62d1778a2720b5afc7e00e55ae5d43dc66cd43 +size 3144 diff --git a/src/clap_embedding/Yodeling.pt 
b/src/clap_embedding/Yodeling.pt new file mode 100644 index 0000000000000000000000000000000000000000..7623bbcf196eee063372b12750435a97ceb429c2 --- /dev/null +++ b/src/clap_embedding/Yodeling.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2971d97ccb4d25c8ecefd52a4212ba361d4d5ac875d7906d2e3a74e64ac421fc +size 3233 diff --git a/src/clap_embedding/Zing.pt b/src/clap_embedding/Zing.pt new file mode 100644 index 0000000000000000000000000000000000000000..91876584bb0c9ab9a690f4f28bea218776a15797 --- /dev/null +++ b/src/clap_embedding/Zing.pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39632ac040fa313042ba6b04bc39e9fc80cdd6b0b2d2752903be2defb4e7de83 +size 3149 diff --git a/src/clap_embedding/Zipper (clothing).pt b/src/clap_embedding/Zipper (clothing).pt new file mode 100644 index 0000000000000000000000000000000000000000..22430ffb343e35a38db1cf6657cd4958d36bf992 --- /dev/null +++ b/src/clap_embedding/Zipper (clothing).pt @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de18e2890b37173871520ddf209529b565e58aae170bdcd38fefb1bb85769557 +size 3278 diff --git a/src/configs/.ipynb_checkpoints/data_config-checkpoint.yml b/src/configs/.ipynb_checkpoints/data_config-checkpoint.yml new file mode 100644 index 0000000000000000000000000000000000000000..a044faf6141b7ab7e443e1889b2959783cd80a8f --- /dev/null +++ b/src/configs/.ipynb_checkpoints/data_config-checkpoint.yml @@ -0,0 +1,41 @@ +# dataset_config.yml +train_data: + data_dir: '/home/x/SSD/Dataset/Audioset_SL/16k/train/' + clap_dir: 'clap_embedding/' + meta_dir: '/home/x/SSD/Dataset/Audioset_SL/rule_all/train_df_duration.csv' + label_dir: '/home/x/SSD/Dataset/Audioset_SL/audiosetsl_train.csv' + class_list: '/home/x/SSD/Dataset/Audioset_SL/rule_all/label_to_id.csv' + label_per_audio: [10, 10] + seg_length: 10 + sr: 16000 + label_sr: 25 + label_type: 'strong' + norm: true + mono: true + sample_method: 'balance' + +val_data: + file_list: 
'/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/eval_list_dur_sampled.csv' + data_dir: '/home/x/SSD/Dataset/Audioset_SL/16k/eval/' + seg_length: 10 + sr: 16000 + norm: true + mono: true + +val_meta: + label: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/label_sampled.csv' + csv: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/audiosetsl_eval_shared_sampled.csv' + dur: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/eval_list_dur_sampled.csv' + +test_data: + file_list: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/eval_list_dur_sampled.csv' + data_dir: '/home/x/SSD/Dataset/Audioset_SL/16k/eval/' + seg_length: 10 + sr: 16000 + norm: true + mono: true + +test_meta: + label: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/label_sampled.csv' + csv: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/audiosetsl_eval_shared_sampled.csv' + dur: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/eval_list_dur_sampled.csv' \ No newline at end of file diff --git a/src/configs/.ipynb_checkpoints/model-checkpoint.yml b/src/configs/.ipynb_checkpoints/model-checkpoint.yml new file mode 100644 index 0000000000000000000000000000000000000000..0489d5b2b816ffc35ca9b0a32375ea19fdbca52b --- /dev/null +++ b/src/configs/.ipynb_checkpoints/model-checkpoint.yml @@ -0,0 +1,38 @@ +model_name: TSED_AS_filter + +encoder: + target_length: 1008 + patch_size: [64, 4] + patch_stride: [64, 4] + group_masking: True + embed_dim: 768 + depth: 12 + num_heads: 12 + +decoder: + embed_dim: 768 + depth: 6 + num_heads: 12 + cls_dim: 512 + fusion: adaln + +ft_blocks: [6, 7, 8, 9, 10, 11] +frozen_encoder: false + +net_pooling: 4 +sr: 16000 + +data_aug: + time_mask_ratios: [5, 20] + transform: # hyperparameters for data augmentations that do not alter the label information. + n_transform: 2 # 0: no augmentation below is applied. 1: same augmentation below is applied on student/teacher model input. 2: different augmentations below is applied on student/teacher model input. 
+ choice: [ 1, 0, 0 ] # apply the chosen data augmentations: [ FilterAugment, freq_mask, add_noise ] + filter_db_range: [ -4.5, 6 ] # db range of FilterAugment to be applied on each band + filter_bands: [ 2, 5 ] # range of frequency band number in FilterAugment + filter_minimum_bandwidth: 4 + filter_type: step + freq_mask_ratio: 16 # maximum ratio of freuqnecy masking range. max 1/16 of total frequnecy number will be masked + noise_snrs: [ 35, 40 ] # snr of original signal wrpt the noise added. + +opt: !include opt_config.yml +data: !include data_config.yml \ No newline at end of file diff --git a/src/configs/.ipynb_checkpoints/opt_config-checkpoint.yml b/src/configs/.ipynb_checkpoints/opt_config-checkpoint.yml new file mode 100644 index 0000000000000000000000000000000000000000..8f1fbd084b59e7080b62451ee2a16cc250315b17 --- /dev/null +++ b/src/configs/.ipynb_checkpoints/opt_config-checkpoint.yml @@ -0,0 +1,21 @@ +# set 5.0e-05 for 32 batch +# set 1.0e-04 for 64 batch +# set 2.0e-0.4 for 128 or larger batch +learning_rate: 1.0e-04 +beta1: 0.9 +beta2: 0.999 +weight_decay: 0.01 +adam_epsilon: 1.0e-08 +grad_clip: 1.0 +batch_size: 16 +accumulation_steps: 1 + +w_weak: 0.0 + +lr_scheduler: + warmup_steps: 500 + decay_steps: 100000 + end_factor: 1.0e-02 + +# snr_gamma: 5.0 is used in tango, not stable, might use it in later experiment +# real batch = n_gpu * batch * accumulation \ No newline at end of file diff --git a/src/configs/data_config.yml b/src/configs/data_config.yml new file mode 100644 index 0000000000000000000000000000000000000000..a044faf6141b7ab7e443e1889b2959783cd80a8f --- /dev/null +++ b/src/configs/data_config.yml @@ -0,0 +1,41 @@ +# dataset_config.yml +train_data: + data_dir: '/home/x/SSD/Dataset/Audioset_SL/16k/train/' + clap_dir: 'clap_embedding/' + meta_dir: '/home/x/SSD/Dataset/Audioset_SL/rule_all/train_df_duration.csv' + label_dir: '/home/x/SSD/Dataset/Audioset_SL/audiosetsl_train.csv' + class_list: 
'/home/x/SSD/Dataset/Audioset_SL/rule_all/label_to_id.csv' + label_per_audio: [10, 10] + seg_length: 10 + sr: 16000 + label_sr: 25 + label_type: 'strong' + norm: true + mono: true + sample_method: 'balance' + +val_data: + file_list: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/eval_list_dur_sampled.csv' + data_dir: '/home/x/SSD/Dataset/Audioset_SL/16k/eval/' + seg_length: 10 + sr: 16000 + norm: true + mono: true + +val_meta: + label: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/label_sampled.csv' + csv: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/audiosetsl_eval_shared_sampled.csv' + dur: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/val/eval_list_dur_sampled.csv' + +test_data: + file_list: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/eval_list_dur_sampled.csv' + data_dir: '/home/x/SSD/Dataset/Audioset_SL/16k/eval/' + seg_length: 10 + sr: 16000 + norm: true + mono: true + +test_meta: + label: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/label_sampled.csv' + csv: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/audiosetsl_eval_shared_sampled.csv' + dur: '/home/x/SSD/Dataset/Audioset_SL/no_rule_shared/full/eval_list_dur_sampled.csv' \ No newline at end of file diff --git a/src/configs/model.yml b/src/configs/model.yml new file mode 100644 index 0000000000000000000000000000000000000000..0489d5b2b816ffc35ca9b0a32375ea19fdbca52b --- /dev/null +++ b/src/configs/model.yml @@ -0,0 +1,38 @@ +model_name: TSED_AS_filter + +encoder: + target_length: 1008 + patch_size: [64, 4] + patch_stride: [64, 4] + group_masking: True + embed_dim: 768 + depth: 12 + num_heads: 12 + +decoder: + embed_dim: 768 + depth: 6 + num_heads: 12 + cls_dim: 512 + fusion: adaln + +ft_blocks: [6, 7, 8, 9, 10, 11] +frozen_encoder: false + +net_pooling: 4 +sr: 16000 + +data_aug: + time_mask_ratios: [5, 20] + transform: # hyperparameters for data augmentations that do not alter the label information. + n_transform: 2 # 0: no augmentation below is applied. 
1: same augmentation below is applied on student/teacher model input. 2: different augmentations below is applied on student/teacher model input. + choice: [ 1, 0, 0 ] # apply the chosen data augmentations: [ FilterAugment, freq_mask, add_noise ] + filter_db_range: [ -4.5, 6 ] # db range of FilterAugment to be applied on each band + filter_bands: [ 2, 5 ] # range of frequency band number in FilterAugment + filter_minimum_bandwidth: 4 + filter_type: step + freq_mask_ratio: 16 # maximum ratio of freuqnecy masking range. max 1/16 of total frequnecy number will be masked + noise_snrs: [ 35, 40 ] # snr of original signal wrpt the noise added. + +opt: !include opt_config.yml +data: !include data_config.yml \ No newline at end of file diff --git a/src/configs/opt_config.yml b/src/configs/opt_config.yml new file mode 100644 index 0000000000000000000000000000000000000000..8f1fbd084b59e7080b62451ee2a16cc250315b17 --- /dev/null +++ b/src/configs/opt_config.yml @@ -0,0 +1,21 @@ +# set 5.0e-05 for 32 batch +# set 1.0e-04 for 64 batch +# set 2.0e-0.4 for 128 or larger batch +learning_rate: 1.0e-04 +beta1: 0.9 +beta2: 0.999 +weight_decay: 0.01 +adam_epsilon: 1.0e-08 +grad_clip: 1.0 +batch_size: 16 +accumulation_steps: 1 + +w_weak: 0.0 + +lr_scheduler: + warmup_steps: 500 + decay_steps: 100000 + end_factor: 1.0e-02 + +# snr_gamma: 5.0 is used in tango, not stable, might use it in later experiment +# real batch = n_gpu * batch * accumulation \ No newline at end of file diff --git a/src/dataset/.ipynb_checkpoints/tsed-checkpoint.py b/src/dataset/.ipynb_checkpoints/tsed-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..2335d1eaa3474723e42b7e79849fde66d2bb354d --- /dev/null +++ b/src/dataset/.ipynb_checkpoints/tsed-checkpoint.py @@ -0,0 +1,142 @@ +import torch +import torchaudio +import torch.nn as nn +import pandas as pd +import random +from torch.utils.data import Dataset +import ast + + +class TSED_AS(Dataset): + def __init__(self, data_dir, 
clap_dir, meta_dir, label_dir, class_list, + seg_length=10, sr=16000, label_sr=25, label_per_audio=[10, 10], + norm=True, mono=True, label_type='strong', debug=False, sample_method='random', + neg_removed_weight=0.25, + **kwargs): + + self.data_dir = data_dir + self.clap_dir = clap_dir + meta = pd.read_csv(meta_dir) + meta = meta[meta['duration'] != 0] + self.meta = meta + if label_type == 'strong': + label = pd.read_csv(label_dir) + self.label = label + else: + self.label = None + + self.label_per_audio = label_per_audio + + self.class_list = pd.read_csv(class_list) + self.class_dict = dict(self.class_list.set_index('id')['label']) # Convert to dict + # self.event_id = dict(self.class_list.set_index('label')['id']) + self.cls_ids = sorted(self.class_list['id'].unique().tolist()) + self.sample_method = sample_method + + self.seg_len = seg_length + self.sr = sr + self.label_sr = label_sr + self.label_type = label_type + + self.norm = norm + self.mono = mono + + self.neg_removed_weight = neg_removed_weight + + def load_audio(self, audio_path): + y, sr = torchaudio.load(audio_path) + assert sr == self.sr + + # Handle stereo or mono based on self.mono + if self.mono: + # Convert to mono by averaging all channels + y = torch.mean(y, dim=0, keepdim=True) + else: + if y.shape[0] == 1: + pass + elif y.shape[0] == 2: + # Randomly pick one of the two stereo channels or take the mean + if random.choice([True, False]): + y = torch.mean(y, dim=0, keepdim=True) + else: + channel = random.choice([0, 1]) + y = y[channel, :].unsqueeze(0) + else: + raise ValueError("Unsupported number of channels: {}".format(y.shape[0])) + + total_length = y.shape[-1] + + start = 0 + end = min(start + self.seg_len * self.sr, total_length) + + audio_clip = torch.zeros(self.seg_len * self.sr) + audio_clip[:end - start] = y[0, start:end] + + if self.norm: + eps = 1e-9 + max_val = torch.max(torch.abs(audio_clip)) + audio_clip = audio_clip / (max_val + eps) + # audio_clip = self.augmenter(audio_clip) + 
return audio_clip + + def load_label(self, filelabel, event_label): + target = torch.zeros(self.seg_len * self.label_sr) + if self.label_type == 'strong': + label = filelabel[filelabel['label'] == event_label] + for i in range(len(label)): + row = label.iloc[i] + onset = row['onset'] + offset = row['offset'] + target[round(onset*self.label_sr):round(offset*self.label_sr)] = 1 + else: + pass + return target.unsqueeze(0) + + def __getitem__(self, index): + row = self.meta.iloc[index] + audio = self.load_audio(self.data_dir + row['file_name']) + + # TBD balance positive and negative + if self.sample_method == 'fix': + cls_list = row['ids'] + if self.sample_method == 'random': + cls_queue = self.cls_ids + cls_list = random.sample(cls_queue, self.label_per_audio) + elif self.sample_method == 'balance': + pos_ids = ast.literal_eval(row['pos_ids']) + neg_ids = ast.literal_eval(row['neg_ids']) + removed_ids = ast.literal_eval(row['removed_ids']) + N_p, N_n = self.label_per_audio + if len(pos_ids) < N_p: + N_n += N_p - len(pos_ids) + assert len(neg_ids) + len(removed_ids) >= N_n + # elif len(neg_ids) < N_n: + # N_p += N_n - len(neg_ids) + sampled_pos = random.sample(pos_ids, min(N_p, len(pos_ids))) + + # Combine neg_ids and removed_ids with different sampling weights + candidates = neg_ids + removed_ids + weights = [1.0] * len(neg_ids) + [self.neg_removed_weight] * len(removed_ids) + sampled_neg = random.choices(candidates, weights=weights, k=min(N_n, len(candidates))) + + cls_list = sampled_pos + sampled_neg + + cls_tokens = [] + labels = [] + + filelabel = self.label[self.label['filename'] == row['file_name']] + + for cls_id in cls_list: + event_label = self.class_dict[cls_id] + cls = torch.load(self.clap_dir + event_label + '.pt') + cls_tokens.append(cls) + label = self.load_label(filelabel, event_label) + labels.append(label) + + cls_tokens = torch.cat(cls_tokens, dim=0) + labels = torch.cat(labels, dim=0) + + return audio, cls_tokens, labels, row['file_name'] + + def 
__len__(self): + return len(self.meta) \ No newline at end of file diff --git a/src/dataset/tsed.py b/src/dataset/tsed.py new file mode 100644 index 0000000000000000000000000000000000000000..2335d1eaa3474723e42b7e79849fde66d2bb354d --- /dev/null +++ b/src/dataset/tsed.py @@ -0,0 +1,142 @@ +import torch +import torchaudio +import torch.nn as nn +import pandas as pd +import random +from torch.utils.data import Dataset +import ast + + +class TSED_AS(Dataset): + def __init__(self, data_dir, clap_dir, meta_dir, label_dir, class_list, + seg_length=10, sr=16000, label_sr=25, label_per_audio=[10, 10], + norm=True, mono=True, label_type='strong', debug=False, sample_method='random', + neg_removed_weight=0.25, + **kwargs): + + self.data_dir = data_dir + self.clap_dir = clap_dir + meta = pd.read_csv(meta_dir) + meta = meta[meta['duration'] != 0] + self.meta = meta + if label_type == 'strong': + label = pd.read_csv(label_dir) + self.label = label + else: + self.label = None + + self.label_per_audio = label_per_audio + + self.class_list = pd.read_csv(class_list) + self.class_dict = dict(self.class_list.set_index('id')['label']) # Convert to dict + # self.event_id = dict(self.class_list.set_index('label')['id']) + self.cls_ids = sorted(self.class_list['id'].unique().tolist()) + self.sample_method = sample_method + + self.seg_len = seg_length + self.sr = sr + self.label_sr = label_sr + self.label_type = label_type + + self.norm = norm + self.mono = mono + + self.neg_removed_weight = neg_removed_weight + + def load_audio(self, audio_path): + y, sr = torchaudio.load(audio_path) + assert sr == self.sr + + # Handle stereo or mono based on self.mono + if self.mono: + # Convert to mono by averaging all channels + y = torch.mean(y, dim=0, keepdim=True) + else: + if y.shape[0] == 1: + pass + elif y.shape[0] == 2: + # Randomly pick one of the two stereo channels or take the mean + if random.choice([True, False]): + y = torch.mean(y, dim=0, keepdim=True) + else: + channel = 
random.choice([0, 1]) + y = y[channel, :].unsqueeze(0) + else: + raise ValueError("Unsupported number of channels: {}".format(y.shape[0])) + + total_length = y.shape[-1] + + start = 0 + end = min(start + self.seg_len * self.sr, total_length) + + audio_clip = torch.zeros(self.seg_len * self.sr) + audio_clip[:end - start] = y[0, start:end] + + if self.norm: + eps = 1e-9 + max_val = torch.max(torch.abs(audio_clip)) + audio_clip = audio_clip / (max_val + eps) + # audio_clip = self.augmenter(audio_clip) + return audio_clip + + def load_label(self, filelabel, event_label): + target = torch.zeros(self.seg_len * self.label_sr) + if self.label_type == 'strong': + label = filelabel[filelabel['label'] == event_label] + for i in range(len(label)): + row = label.iloc[i] + onset = row['onset'] + offset = row['offset'] + target[round(onset*self.label_sr):round(offset*self.label_sr)] = 1 + else: + pass + return target.unsqueeze(0) + + def __getitem__(self, index): + row = self.meta.iloc[index] + audio = self.load_audio(self.data_dir + row['file_name']) + + # TBD balance positive and negative + if self.sample_method == 'fix': + cls_list = row['ids'] + if self.sample_method == 'random': + cls_queue = self.cls_ids + cls_list = random.sample(cls_queue, self.label_per_audio) + elif self.sample_method == 'balance': + pos_ids = ast.literal_eval(row['pos_ids']) + neg_ids = ast.literal_eval(row['neg_ids']) + removed_ids = ast.literal_eval(row['removed_ids']) + N_p, N_n = self.label_per_audio + if len(pos_ids) < N_p: + N_n += N_p - len(pos_ids) + assert len(neg_ids) + len(removed_ids) >= N_n + # elif len(neg_ids) < N_n: + # N_p += N_n - len(neg_ids) + sampled_pos = random.sample(pos_ids, min(N_p, len(pos_ids))) + + # Combine neg_ids and removed_ids with different sampling weights + candidates = neg_ids + removed_ids + weights = [1.0] * len(neg_ids) + [self.neg_removed_weight] * len(removed_ids) + sampled_neg = random.choices(candidates, weights=weights, k=min(N_n, len(candidates))) + + 
cls_list = sampled_pos + sampled_neg + + cls_tokens = [] + labels = [] + + filelabel = self.label[self.label['filename'] == row['file_name']] + + for cls_id in cls_list: + event_label = self.class_dict[cls_id] + cls = torch.load(self.clap_dir + event_label + '.pt') + cls_tokens.append(cls) + label = self.load_label(filelabel, event_label) + labels.append(label) + + cls_tokens = torch.cat(cls_tokens, dim=0) + labels = torch.cat(labels, dim=0) + + return audio, cls_tokens, labels, row['file_name'] + + def __len__(self): + return len(self.meta) \ No newline at end of file diff --git a/src/dataset/tsed_val.py b/src/dataset/tsed_val.py new file mode 100644 index 0000000000000000000000000000000000000000..4c08904d60b28caef09034b2d18f2404fe401df7 --- /dev/null +++ b/src/dataset/tsed_val.py @@ -0,0 +1,68 @@ +import torch +import torchaudio +import torch.nn as nn +import pandas as pd +import random +from torch.utils.data import Dataset + + +class TSED_Val(Dataset): + def __init__(self, file_list, data_dir, + seg_length=10, sr=16000, + norm=True, mono=True, + **kwargs): + + self.data_dir = data_dir + meta = pd.read_csv(file_list, sep='\t') + meta = meta[meta['duration'] != 0] + self.meta = meta + + self.seg_len = seg_length + self.sr = sr + + self.norm = norm + self.mono = mono + + def load_audio(self, audio_path): + y, sr = torchaudio.load(audio_path) + assert sr == self.sr + + # Handle stereo or mono based on self.mono + if self.mono: + # Convert to mono by averaging all channels + y = torch.mean(y, dim=0, keepdim=True) + else: + if y.shape[0] == 1: + pass + elif y.shape[0] == 2: + # Randomly pick one of the two stereo channels or take the mean + if random.choice([True, False]): + y = torch.mean(y, dim=0, keepdim=True) + else: + channel = random.choice([0, 1]) + y = y[channel, :].unsqueeze(0) + else: + raise ValueError("Unsupported number of channels: {}".format(y.shape[0])) + + total_length = y.shape[-1] + + start = 0 + end = min(start + self.seg_len * self.sr, 
total_length) + + audio_clip = torch.zeros(self.seg_len * self.sr) + audio_clip[:end - start] = y[0, start:end] + + if self.norm: + eps = 1e-9 + max_val = torch.max(torch.abs(audio_clip)) + audio_clip = audio_clip / (max_val + eps) + # audio_clip = self.augmenter(audio_clip) + return audio_clip + + def __getitem__(self, index): + row = self.meta.iloc[index] + audio = self.load_audio(self.data_dir + row['filename']) + return audio, row['filename'] + + def __len__(self): + return len(self.meta) \ No newline at end of file diff --git a/src/desed_task/README.md b/src/desed_task/README.md new file mode 100644 index 0000000000000000000000000000000000000000..1d1a4aaf445d4a5136fadeade613d7f1c2ed2338 --- /dev/null +++ b/src/desed_task/README.md @@ -0,0 +1,3 @@ +# desed_task package + +Description of the package to be added here. diff --git a/src/desed_task/data_augm.py b/src/desed_task/data_augm.py new file mode 100644 index 0000000000000000000000000000000000000000..0cf022d504fcd7627801190d7374fec181ddcdd5 --- /dev/null +++ b/src/desed_task/data_augm.py @@ -0,0 +1,76 @@ +import numpy as np +import torch +import random + + +def frame_shift(mels, labels, net_pooling=4): + bsz, n_bands, frames = mels.shape + shifted = [] + new_labels = [] + for bindx in range(bsz): + shift = int(random.gauss(0, 90)) + shifted.append(torch.roll(mels[bindx], shift, dims=-1)) + shift = -abs(shift) // net_pooling if shift < 0 else shift // net_pooling + new_labels.append(torch.roll(labels[bindx], shift, dims=-1)) + return torch.stack(shifted), torch.stack(new_labels) + + +def mixup(data, target=None, alpha=0.2, beta=0.2, mixup_label_type="soft"): + """Mixup data augmentation by permuting the data + + Args: + data: input tensor, must be a batch so data can be permuted and mixed. + target: tensor of the target to be mixed, if None, do not return targets. 
def add_noise(mels, snrs=(6, 30), dims=(1, 2)):
    """Corrupt a batch of mel spectrograms with white noise at a random SNR.

    Args:
        mels: torch.Tensor, batch of mel spectrograms (batch, n_mels, frames).
        snrs: (low, high) range of SNRs in dB, one drawn uniformly per example,
            or a single number applied to the whole batch.
        dims: dimensions over which the per-example signal std is computed
            (defaults assume a batch of mel spectrograms).

    Returns:
        torch.Tensor of mels with additive Gaussian noise applied.
    """
    if isinstance(snrs, (list, tuple)):
        # One SNR per example, uniform over [snrs[1], snrs[0]].
        u = torch.rand((mels.shape[0],), device=mels.device).reshape(-1, 1, 1)
        snr_db = snrs[1] + (snrs[0] - snrs[1]) * u
    else:
        snr_db = snrs

    linear_snr = 10 ** (snr_db / 20)  # dB -> linear domain
    noise_scale = torch.std(mels, dim=dims, keepdim=True) / linear_snr
    return mels + torch.randn(mels.shape, device=mels.device) * noise_scale
def to_mono(mixture, random_ch=False):
    """Collapse a multi-channel waveform to mono.

    Args:
        mixture: torch.Tensor, (channels, samples) or already 1-D mono.
        random_ch: bool, if True pick one channel at random instead of averaging.

    Returns:
        1-D mono tensor.
    """
    if mixture.ndim > 1:  # multi channel
        if not random_ch:
            mixture = torch.mean(mixture, 0)
        else:
            # randomly select one channel; np.random.randint's upper bound is
            # exclusive, so use shape[0] (shape[0] - 1 could never pick the
            # last channel and raised on single-channel input)
            indx = np.random.randint(0, mixture.shape[0])
            mixture = mixture[indx]
    return mixture


def pad_audio(audio, target_len, fs):
    """Pad or randomly crop `audio` to exactly `target_len` samples.

    Args:
        audio: 1-D waveform tensor.
        target_len: int, desired length in samples.
        fs: int, sample rate, used to express the crop window in seconds.

    Returns:
        (audio, onset_s, offset_s, padded_indx): the fixed-length audio, the
        position of the returned window in the original file (seconds), and a
        one-element length-ratio list.
    """
    if audio.shape[-1] < target_len:
        audio = torch.nn.functional.pad(
            audio, (0, target_len - audio.shape[-1]), mode="constant"
        )
        # NOTE(review): computed after padding, so this ratio is always 1.0;
        # looks like it was meant to use the original length — preserved as-is.
        padded_indx = [target_len / len(audio)]
        onset_s = 0.000

    elif len(audio) > target_len:
        # take a random window of target_len samples
        rand_onset = random.randint(0, len(audio) - target_len)
        audio = audio[rand_onset:rand_onset + target_len]
        onset_s = round(rand_onset / fs, 3)
        padded_indx = [target_len / len(audio)]

    else:
        onset_s = 0.000
        padded_indx = [1.0]

    offset_s = round(onset_s + (target_len / fs), 3)
    return audio, onset_s, offset_s, padded_indx


def process_labels(df, onset, offset):
    """Shift event annotations by the crop onset and clip them to [0, 10] s.

    Events that become empty (onset >= offset) are discarded, and exact
    duplicates are dropped.
    """
    df["onset"] = df["onset"] - onset
    df["offset"] = df["offset"] - onset

    df["onset"] = df.apply(lambda x: max(0, x["onset"]), axis=1)
    df["offset"] = df.apply(lambda x: min(10, x["offset"]), axis=1)

    df_new = df[(df.onset < df.offset)]

    return df_new.drop_duplicates()


def read_audio(file, multisrc, random_channel, pad_to):
    """Load a file, optionally down-mix to mono and pad/crop to `pad_to` samples.

    Args:
        file: path to the audio file.
        multisrc: bool, keep all channels if True, otherwise down-mix to mono.
        random_channel: bool, forwarded to `to_mono`.
        pad_to: int or None, target length in samples (None keeps full length).

    Returns:
        (mixture, onset_s, offset_s, padded_indx) as produced by `pad_audio`
        (onsets/offsets are None when pad_to is None).
    """
    mixture, fs = torchaudio.load(file)

    if not multisrc:
        mixture = to_mono(mixture, random_channel)

    if pad_to is not None:
        mixture, onset_s, offset_s, padded_indx = pad_audio(mixture, pad_to, fs)
    else:
        padded_indx = [1.0]
        onset_s = None
        offset_s = None

    mixture = mixture.float()
    return mixture, onset_s, offset_s, padded_indx
class StronglyAnnotatedSet(Dataset):
    """Dataset of audio clips with strong (onset/offset) event annotations."""

    def __init__(
        self,
        audio_folder,
        tsv_entries,
        encoder,
        pad_to=10,
        fs=16000,
        return_filename=False,
        random_channel=False,
        multisrc=False,
        feats_pipeline=None,
        embeddings_hdf5_file=None,
        embedding_type=None
    ):
        self.encoder = encoder
        self.fs = fs
        self.pad_to = pad_to * fs  # seconds -> samples
        self.return_filename = return_filename
        self.random_channel = random_channel
        self.multisrc = multisrc
        self.feats_pipeline = feats_pipeline
        self.embeddings_hdf5_file = embeddings_hdf5_file
        self.embedding_type = embedding_type
        assert embedding_type in ["global", "frame", None], \
            "embedding type are either frame or global or None, got {}".format(embedding_type)

        tsv_entries = tsv_entries.dropna()

        # Group tsv rows by clip: one entry per file holding its event list.
        examples = {}
        for _, r in tsv_entries.iterrows():
            fname = r["filename"]
            if fname not in examples:
                examples[fname] = {
                    "mixture": os.path.join(audio_folder, fname),
                    "events": [],
                }
            if not np.isnan(r["onset"]):
                examples[fname]["events"].append(
                    {
                        "event_label": r["event_label"],
                        "onset": r["onset"],
                        "offset": r["offset"],
                    }
                )

        self.examples = examples
        self.examples_list = list(examples.keys())

        if self.embeddings_hdf5_file is not None:
            assert self.embedding_type is not None, \
                "If you use embeddings you need to specify also the type (global or frame)"
            # map clip name -> row index inside the hdf5 embedding arrays
            self.ex2emb_idx = {}
            f = h5py.File(self.embeddings_hdf5_file, "r")
            for i, fname in enumerate(f["filenames"]):
                self.ex2emb_idx[fname.decode('UTF-8')] = i
        self._opened_hdf5 = None

    def __len__(self):
        return len(self.examples_list)

    @property
    def hdf5_file(self):
        # opened lazily so the dataset can be forked into dataloader workers
        if self._opened_hdf5 is None:
            self._opened_hdf5 = h5py.File(self.embeddings_hdf5_file, "r")
        return self._opened_hdf5

    def __getitem__(self, item):
        c_ex = self.examples[self.examples_list[item]]
        mixture, onset_s, offset_s, padded_indx = read_audio(
            c_ex["mixture"], self.multisrc, self.random_channel, self.pad_to
        )

        # encode the (crop-shifted) event list into a frame-level target matrix
        labels_df = process_labels(pd.DataFrame(c_ex["events"]), onset_s, offset_s)
        if len(labels_df):
            strong = torch.from_numpy(self.encoder.encode_strong_df(labels_df)).float()
        else:
            strong = torch.zeros(self.encoder.n_frames, len(self.encoder.labels)).float()

        out_args = [mixture, strong.transpose(0, 1), padded_indx]

        if self.feats_pipeline is not None:
            # extract features in the dataloader and apply possibly some data augm
            out_args.append(self.feats_pipeline(mixture))
        if self.return_filename:
            out_args.append(c_ex["mixture"])

        if self.embeddings_hdf5_file is not None:
            index = self.ex2emb_idx[Path(c_ex["mixture"]).stem]
            if self.embedding_type == "global":
                embeddings = torch.from_numpy(self.hdf5_file["global_embeddings"][index]).float()
            elif self.embedding_type == "frame":
                embeddings = torch.from_numpy(np.stack(self.hdf5_file["frame_embeddings"][index])).float()
            else:
                raise NotImplementedError
            out_args.append(embeddings)

        return out_args
None], "embedding type are either frame or global or None, got {}".format( + embedding_type) + + examples = {} + for i, r in tsv_entries.iterrows(): + + if r["filename"] not in examples.keys(): + examples[r["filename"]] = { + "mixture": os.path.join(audio_folder, r["filename"]), + "events": r["event_labels"].split(","), + } + + self.examples = examples + self.examples_list = list(examples.keys()) + + if self.embeddings_hdf5_file is not None: + assert self.embedding_type is not None, "If you use embeddings you need to specify also the type (global or frame)" + # fetch dict of positions for each example + self.ex2emb_idx = {} + f = h5py.File(self.embeddings_hdf5_file, "r") + for i, fname in enumerate(f["filenames"]): + self.ex2emb_idx[fname.decode('UTF-8')] = i + self._opened_hdf5 = None + + def __len__(self): + return len(self.examples_list) + + @property + def hdf5_file(self): + if self._opened_hdf5 is None: + self._opened_hdf5 = h5py.File(self.embeddings_hdf5_file, "r") + return self._opened_hdf5 + + def __getitem__(self, item): + file = self.examples_list[item] + c_ex = self.examples[file] + + mixture, _, _, padded_indx = read_audio( + c_ex["mixture"], self.multisrc, self.random_channel, self.pad_to + ) + + # labels + labels = c_ex["events"] + # check if labels exists: + max_len_targets = self.encoder.n_frames + weak = torch.zeros(max_len_targets, len(self.encoder.labels)) + if len(labels): + weak_labels = self.encoder.encode_weak(labels) + weak[0, :] = torch.from_numpy(weak_labels).float() + + out_args = [mixture, weak.transpose(0, 1), padded_indx] + + if self.feats_pipeline is not None: + feats = self.feats_pipeline(mixture) + out_args.append(feats) + + if self.return_filename: + out_args.append(c_ex["mixture"]) + + if self.embeddings_hdf5_file is not None: + name = Path(c_ex["mixture"]).stem + index = self.ex2emb_idx[name] + + if self.embedding_type == "global": + embeddings = torch.from_numpy(self.hdf5_file["global_embeddings"][index]).float() + elif 
class UnlabeledSet(Dataset):
    """Dataset over a folder of unlabeled wav files; targets are all zeros."""

    def __init__(
        self,
        unlabeled_folder,
        encoder,
        pad_to=10,
        fs=16000,
        return_filename=False,
        random_channel=False,
        multisrc=False,
        feats_pipeline=None,
        embeddings_hdf5_file=None,
        embedding_type=None,
    ):
        self.encoder = encoder
        self.fs = fs
        # pad_to may be None to keep full-length audio
        self.pad_to = pad_to * fs if pad_to is not None else None
        self.examples = glob.glob(os.path.join(unlabeled_folder, "*.wav"))
        self.return_filename = return_filename
        self.random_channel = random_channel
        self.multisrc = multisrc
        self.feats_pipeline = feats_pipeline
        self.embeddings_hdf5_file = embeddings_hdf5_file
        self.embedding_type = embedding_type
        assert embedding_type in ["global", "frame", None], \
            "embedding type are either frame or global or None, got {}".format(embedding_type)

        if self.embeddings_hdf5_file is not None:
            assert self.embedding_type is not None, \
                "If you use embeddings you need to specify also the type (global or frame)"
            # map clip name -> row index inside the hdf5 embedding arrays
            self.ex2emb_idx = {}
            f = h5py.File(self.embeddings_hdf5_file, "r")
            for i, fname in enumerate(f["filenames"]):
                self.ex2emb_idx[fname.decode('UTF-8')] = i
        self._opened_hdf5 = None

    def __len__(self):
        return len(self.examples)

    @property
    def hdf5_file(self):
        # opened lazily so each dataloader worker gets its own handle
        if self._opened_hdf5 is None:
            self._opened_hdf5 = h5py.File(self.embeddings_hdf5_file, "r")
        return self._opened_hdf5

    def __getitem__(self, item):
        c_ex = self.examples[item]

        mixture, _, _, padded_indx = read_audio(
            c_ex, self.multisrc, self.random_channel, self.pad_to
        )

        # no annotations: emit an all-zero strong target of the right shape
        strong = torch.zeros(self.encoder.n_frames, len(self.encoder.labels)).float()
        out_args = [mixture, strong.transpose(0, 1), padded_indx]

        if self.feats_pipeline is not None:
            out_args.append(self.feats_pipeline(mixture))
        if self.return_filename:
            out_args.append(c_ex)

        if self.embeddings_hdf5_file is not None:
            index = self.ex2emb_idx[Path(c_ex).stem]
            if self.embedding_type == "global":
                embeddings = torch.from_numpy(self.hdf5_file["global_embeddings"][index]).float()
            elif self.embedding_type == "frame":
                embeddings = torch.from_numpy(np.stack(self.hdf5_file["frame_embeddings"][index])).float()
            else:
                raise NotImplementedError
            out_args.append(embeddings)

        return out_args
+ """ + + def __init__(self, samplers, batch_sizes: (tuple, list), epoch=0) -> None: + + if not isinstance(samplers, (list, tuple)): + raise ValueError( + "samplers should be a list or tuple of Pytorch Samplers, " + "but got samplers={}".format(batch_sizes) + ) + + if not isinstance(batch_sizes, (list, tuple)): + raise ValueError( + "batch_sizes should be a list or tuple of integers, " + "but got batch_sizes={}".format(batch_sizes) + ) + + if not len(batch_sizes) == len(samplers): + raise ValueError("batch_sizes and samplers should be have same length") + + self.batch_sizes = batch_sizes + self.samplers = samplers + self.offsets = [0] + np.cumsum([len(x) for x in self.samplers]).tolist()[:-1] + + self.epoch = epoch + self.set_epoch(self.epoch) + + def _iter_one_dataset(self, c_batch_size, c_sampler, c_offset): + batch = [] + for idx in c_sampler: + batch.append(c_offset + idx) + if len(batch) == c_batch_size: + yield batch + + def set_epoch(self, epoch): + if hasattr(self.samplers[0], "epoch"): + for s in self.samplers: + s.set_epoch(epoch) + + def __iter__(self): + + iterators = [iter(i) for i in self.samplers] + tot_batch = [] + + for b_num in range(len(self)): + for samp_idx in range(len(self.samplers)): + c_batch = [] + while len(c_batch) < self.batch_sizes[samp_idx]: + c_batch.append(self.offsets[samp_idx] + next(iterators[samp_idx])) + tot_batch.extend(c_batch) + yield tot_batch + tot_batch = [] + + def __len__(self): + + min_len = float("inf") + for idx, sampler in enumerate(self.samplers): + c_len = (len(sampler)) // self.batch_sizes[idx] + + min_len = min(c_len, min_len) + return min_len diff --git a/src/desed_task/evaluation/.ipynb_checkpoints/evaluation_measures-checkpoint.py b/src/desed_task/evaluation/.ipynb_checkpoints/evaluation_measures-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..38c5283a59a96674cf0cc1e582d3ce905bb1d8f2 --- /dev/null +++ 
def get_event_list_current_file(df, fname):
    """Return the list of event dicts belonging to one audio file.

    Args:
        df: pd.DataFrame, the dataframe to search on.
        fname: the filename whose events should be extracted.

    Returns:
        list of event dictionaries for the given filename; a file whose single
        row has a NaN event_label has no events, so the list then contains
        only ``{"filename": fname}``.
    """
    rows = df[df["filename"] == fname]
    if len(rows) == 1 and pd.isna(rows["event_label"].iloc[0]):
        return [{"filename": fname}]
    return rows.to_dict("records")
def event_based_evaluation_df(
    reference, estimated, t_collar=0.200, percentage_of_length=0.2
):
    """Score `estimated` against `reference` with sed_eval's event-based metric.

    Args:
        reference: pd.DataFrame with "filename" "onset" "offset" "event_label"
            columns describing the reference events.
        estimated: pd.DataFrame with the same columns describing the estimated
            events to be compared with the reference.
        t_collar: float, in seconds, tolerance allowed on onsets and offsets.
        percentage_of_length: float in [0, 1], extra offset tolerance relative
            to the event length.

    Returns:
        sed_eval.sound_event.EventBasedMetrics with the accumulated scores.
    """
    # the class list is the union of the labels seen in either dataframe
    classes = list(
        set(reference.event_label.dropna().unique())
        | set(estimated.event_label.dropna().unique())
    )

    metric = sed_eval.sound_event.EventBasedMetrics(
        event_label_list=classes,
        t_collar=t_collar,
        percentage_of_length=percentage_of_length,
        empty_system_output_handling="zero_score",
    )

    # accumulate scores file by file over every reference filename
    for fname in reference["filename"].unique():
        metric.evaluate(
            reference_event_list=get_event_list_current_file(reference, fname),
            estimated_event_list=get_event_list_current_file(estimated, fname),
        )

    return metric
def compute_per_intersection_macro_f1(
    prediction_dfs,
    ground_truth_file,
    durations_file,
    dtc_threshold=0.5,
    gtc_threshold=0.5,
    cttc_threshold=0.3,
):
    """Macro-averaged intersection-based F1 over a set of operating points.

    Args:
        prediction_dfs: dict mapping threshold -> predictions dataframe.
        ground_truth_file: path to the ground-truth tsv file.
        durations_file: path to the audio-durations tsv file.
        dtc_threshold: float, PSDSEval tolerance for ground-truth intersection
            with predictions.
        gtc_threshold: float, PSDSEval tolerance for prediction intersection
            with ground truth.
        cttc_threshold: float, PSDSEval tolerance used to count a FP as a
            cross-trigger.

    Returns:
        float, the macro F1 averaged over all thresholds.
    """
    evaluator = PSDSEval(
        ground_truth=pd.read_csv(ground_truth_file, sep="\t"),
        metadata=pd.read_csv(durations_file, sep="\t"),
        dtc_threshold=dtc_threshold,
        gtc_threshold=gtc_threshold,
        cttc_threshold=cttc_threshold,
    )

    scores = []
    for threshold, det in prediction_dfs.items():
        if det.empty:
            f1 = 0
        else:
            f1, _ = evaluator.compute_macro_f_score(det)
        if np.isnan(f1):
            f1 = 0.0  # classes with no matches yield NaN; count them as zero
        scores.append(f1)
    return np.mean(scores)
def compute_psds_from_operating_points(
    prediction_dfs,
    ground_truth_file,
    durations_file,
    dtc_threshold=0.5,
    gtc_threshold=0.5,
    cttc_threshold=0.3,
    alpha_ct=0,
    alpha_st=0,
    max_efpr=100,
    save_dir=None,
):
    """PSDS from a dict of per-threshold prediction dataframes (psds_eval backend).

    Args:
        prediction_dfs: dict mapping threshold -> predictions dataframe.
        ground_truth_file: path to the ground-truth tsv file.
        durations_file: path to the audio-durations tsv file.
        dtc_threshold / gtc_threshold / cttc_threshold: PSDSEval tolerances.
        alpha_ct / alpha_st: cross-trigger and instability penalty weights.
        max_efpr: maximum effective FP rate for the ROC integration.
        save_dir: optional directory where per-threshold predictions and the
            PSD-ROC plot are written.

    Returns:
        float, the PSDS value.
    """
    evaluator = PSDSEval(
        ground_truth=pd.read_csv(ground_truth_file, sep="\t"),
        metadata=pd.read_csv(durations_file, sep="\t"),
        dtc_threshold=dtc_threshold,
        gtc_threshold=gtc_threshold,
        cttc_threshold=cttc_threshold,
    )

    for i, (th, det) in enumerate(prediction_dfs.items()):
        # see issue https://github.com/audioanalytic/psds_eval/issues/3
        det["index"] = range(1, len(det) + 1)
        det = det.set_index("index")
        evaluator.add_operating_point(
            det, info={"name": f"Op {i + 1:02d}", "threshold": th}
        )

    psds_score = evaluator.psds(alpha_ct=alpha_ct, alpha_st=alpha_st, max_efpr=max_efpr)

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

        pred_dir = os.path.join(
            save_dir,
            f"predictions_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}",
        )
        os.makedirs(pred_dir, exist_ok=True)
        for th in prediction_dfs:
            prediction_dfs[th].to_csv(
                os.path.join(pred_dir, f"predictions_th_{th:.2f}.tsv"),
                sep="\t",
                index=False,
            )

        plot_psd_roc(
            psds_score,
            filename=os.path.join(
                save_dir,
                f"PSDS_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}"
                f"_ct{alpha_ct}_st{alpha_st}_max{max_efpr}_psds_eval.png",
            ),
        )

    return psds_score.value
def compute_psds_from_scores(
    scores,
    ground_truth_file,
    durations_file,
    dtc_threshold=0.5,
    gtc_threshold=0.5,
    cttc_threshold=0.3,
    alpha_ct=0,
    alpha_st=0,
    max_efpr=100,
    num_jobs=4,
    save_dir=None,
):
    """PSDS computed directly from frame-level scores (sed_scores_eval backend).

    Args:
        scores: dict of per-clip score dataframes as used by sed_scores_eval.
        ground_truth_file: path to the ground-truth tsv file.
        durations_file: path to the audio-durations tsv file.
        dtc_threshold / gtc_threshold / cttc_threshold: intersection tolerances.
        alpha_ct / alpha_st: cross-trigger and instability penalty weights.
        max_efpr: maximum effective FP rate for the ROC integration.
        num_jobs: number of parallel workers used by sed_scores_eval.
        save_dir: optional directory where the scores and the PSD-ROC plot
            are written.

    Returns:
        (psds, psd_roc) as returned by sed_scores_eval.
    """
    psds, psd_roc, single_class_rocs, *_ = sed_scores_eval.intersection_based.psds(
        scores=scores,
        ground_truth=ground_truth_file,
        audio_durations=durations_file,
        dtc_threshold=dtc_threshold,
        gtc_threshold=gtc_threshold,
        cttc_threshold=cttc_threshold,
        alpha_ct=alpha_ct,
        alpha_st=alpha_st,
        max_efpr=max_efpr,
        num_jobs=num_jobs,
    )

    if save_dir is not None:
        sed_scores_eval.io.write_sed_scores(scores, os.path.join(save_dir, "scores"))
        roc_name = (
            f"PSDS_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}"
            f"_ct{alpha_ct}_st{alpha_st}_max{max_efpr}_sed_scores_eval.png"
        )
        sed_scores_eval.utils.visualization.plot_psd_roc(
            psd_roc,
            filename=os.path.join(save_dir, roc_name),
            dtc_threshold=dtc_threshold,
            gtc_threshold=gtc_threshold,
            cttc_threshold=cttc_threshold,
            alpha_ct=alpha_ct,
            alpha_st=alpha_st,
            unit_of_time='hour',
            max_efpr=max_efpr,
            psds=psds,
        )

    return psds, psd_roc
b/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..6839f33f0752d4049a68c60d018b2e19f22b2e47 Binary files /dev/null and b/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-310.pyc differ diff --git a/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-311.pyc b/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..728d19f2f9f1c39a487e7783232784e0ed5f722d Binary files /dev/null and b/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-311.pyc differ diff --git a/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-312.pyc b/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-312.pyc new file mode 100644 index 0000000000000000000000000000000000000000..10b7f2d5f5b833b41c8f0b953aa82442a0d0f382 Binary files /dev/null and b/src/desed_task/evaluation/__pycache__/evaluation_measures.cpython-312.pyc differ diff --git a/src/desed_task/evaluation/evaluation_measures.py b/src/desed_task/evaluation/evaluation_measures.py new file mode 100644 index 0000000000000000000000000000000000000000..38c5283a59a96674cf0cc1e582d3ce905bb1d8f2 --- /dev/null +++ b/src/desed_task/evaluation/evaluation_measures.py @@ -0,0 +1,294 @@ +import os + +import numpy as np +import pandas as pd +import psds_eval +import sed_eval +from psds_eval import PSDSEval, plot_psd_roc +import sed_scores_eval + + +def get_event_list_current_file(df, fname): + """ + Get list of events for a given filename + Args: + df: pd.DataFrame, the dataframe to search on + fname: the filename to extract the value from the dataframe + Returns: + list of events (dictionaries) for the given filename + """ + event_file = df[df["filename"] == fname] + if len(event_file) == 1: + if pd.isna(event_file["event_label"].iloc[0]): + event_list_for_current_file = [{"filename": fname}] + else: + 
def psds_results(psds_obj):
    """Print PSD-Scores for three standard (alpha_ct, alpha_st) settings.

    Args:
        psds_obj: psds_eval.PSDSEval object with operating points added.
    """
    try:
        # (alpha_ct, alpha_st) pairs evaluated at max_efpr=100
        for alpha_ct, alpha_st in [(0, 0), (1, 0), (0, 1)]:
            score = psds_obj.psds(alpha_ct=alpha_ct, alpha_st=alpha_st, max_efpr=100)
            print(f"\nPSD-Score ({alpha_ct}, {alpha_st}, 100): {score.value:.5f}")
    except psds_eval.psds.PSDSEvalError:
        print("psds did not work ....")
        raise EnvironmentError
def segment_based_evaluation_df(reference, estimated, time_resolution=1.0):
    """Score `estimated` against `reference` with sed_eval's segment-based metric.

    Args:
        reference: pd.DataFrame with "filename" "onset" "offset" "event_label"
            columns describing the reference events.
        estimated: pd.DataFrame with the same columns describing the estimated
            events to be compared with the reference.
        time_resolution: float, the time resolution of the segment-based metric.

    Returns:
        sed_eval.sound_event.SegmentBasedMetrics with the accumulated scores.
    """
    # the class list is the union of the labels seen in either dataframe
    classes = list(
        set(reference.event_label.dropna().unique())
        | set(estimated.event_label.dropna().unique())
    )

    metric = sed_eval.sound_event.SegmentBasedMetrics(
        event_label_list=classes, time_resolution=time_resolution
    )

    # accumulate scores file by file over every reference filename
    for fname in reference["filename"].unique():
        metric.evaluate(
            reference_event_list=get_event_list_current_file(reference, fname),
            estimated_event_list=get_event_list_current_file(estimated, fname),
        )

    return metric
+ Args: + predictions: pd.DataFrame, predictions dataframe + groundtruth: pd.DataFrame, groundtruth dataframe + Returns: + tuple, (sed_eval.sound_event.EventBasedMetrics, sed_eval.sound_event.SegmentBasedMetrics) + """ + metric_event = event_based_evaluation_df( + groundtruth, predictions, t_collar=0.200, percentage_of_length=0.2 + ) + metric_segment = segment_based_evaluation_df( + groundtruth, predictions, time_resolution=1.0 + ) + + return metric_event, metric_segment + + +def compute_per_intersection_macro_f1( + prediction_dfs, + ground_truth_file, + durations_file, + dtc_threshold=0.5, + gtc_threshold=0.5, + cttc_threshold=0.3, +): + """ Compute F1-score per intersection, using the defautl + Args: + prediction_dfs: dict, a dictionary with thresholds keys and predictions dataframe + ground_truth_file: pd.DataFrame, the groundtruth dataframe + durations_file: pd.DataFrame, the duration dataframe + dtc_threshold: float, the parameter used in PSDSEval, percentage of tolerance for groundtruth intersection + with predictions + gtc_threshold: float, the parameter used in PSDSEval percentage of tolerance for predictions intersection + with groundtruth + gtc_threshold: float, the parameter used in PSDSEval to know the percentage needed to count FP as cross-trigger + + Returns: + + """ + gt = pd.read_csv(ground_truth_file, sep="\t") + durations = pd.read_csv(durations_file, sep="\t") + + psds = PSDSEval( + ground_truth=gt, + metadata=durations, + dtc_threshold=dtc_threshold, + gtc_threshold=gtc_threshold, + cttc_threshold=cttc_threshold, + ) + psds_macro_f1 = [] + for threshold in prediction_dfs.keys(): + if not prediction_dfs[threshold].empty: + threshold_f1, _ = psds.compute_macro_f_score(prediction_dfs[threshold]) + else: + threshold_f1 = 0 + if np.isnan(threshold_f1): + threshold_f1 = 0.0 + psds_macro_f1.append(threshold_f1) + psds_macro_f1 = np.mean(psds_macro_f1) + return psds_macro_f1 + + +def compute_psds_from_operating_points( + prediction_dfs, + 
ground_truth_file, + durations_file, + dtc_threshold=0.5, + gtc_threshold=0.5, + cttc_threshold=0.3, + alpha_ct=0, + alpha_st=0, + max_efpr=100, + save_dir=None, +): + + gt = pd.read_csv(ground_truth_file, sep="\t") + durations = pd.read_csv(durations_file, sep="\t") + psds_eval = PSDSEval( + ground_truth=gt, + metadata=durations, + dtc_threshold=dtc_threshold, + gtc_threshold=gtc_threshold, + cttc_threshold=cttc_threshold, + ) + + for i, k in enumerate(prediction_dfs.keys()): + det = prediction_dfs[k] + # see issue https://github.com/audioanalytic/psds_eval/issues/3 + det["index"] = range(1, len(det) + 1) + det = det.set_index("index") + psds_eval.add_operating_point( + det, info={"name": f"Op {i + 1:02d}", "threshold": k} + ) + + psds_score = psds_eval.psds(alpha_ct=alpha_ct, alpha_st=alpha_st, max_efpr=max_efpr) + + if save_dir is not None: + os.makedirs(save_dir, exist_ok=True) + + pred_dir = os.path.join( + save_dir, + f"predictions_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}", + ) + os.makedirs(pred_dir, exist_ok=True) + for k in prediction_dfs.keys(): + prediction_dfs[k].to_csv( + os.path.join(pred_dir, f"predictions_th_{k:.2f}.tsv"), + sep="\t", + index=False, + ) + + filename = ( + f"PSDS_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}" + f"_ct{alpha_ct}_st{alpha_st}_max{max_efpr}_psds_eval.png" + ) + plot_psd_roc( + psds_score, + filename=os.path.join(save_dir, filename), + ) + + return psds_score.value + + +def compute_psds_from_scores( + scores, + ground_truth_file, + durations_file, + dtc_threshold=0.5, + gtc_threshold=0.5, + cttc_threshold=0.3, + alpha_ct=0, + alpha_st=0, + max_efpr=100, + num_jobs=4, + save_dir=None, +): + psds, psd_roc, single_class_rocs, *_ = sed_scores_eval.intersection_based.psds( + scores=scores, ground_truth=ground_truth_file, + audio_durations=durations_file, + dtc_threshold=dtc_threshold, gtc_threshold=gtc_threshold, + cttc_threshold=cttc_threshold, alpha_ct=alpha_ct, alpha_st=alpha_st, + 
max_efpr=max_efpr, num_jobs=num_jobs, + ) + if save_dir is not None: + scores_dir = os.path.join(save_dir, "scores") + sed_scores_eval.io.write_sed_scores(scores, scores_dir) + filename = ( + f"PSDS_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}" + f"_ct{alpha_ct}_st{alpha_st}_max{max_efpr}_sed_scores_eval.png" + ) + sed_scores_eval.utils.visualization.plot_psd_roc( + psd_roc, + filename=os.path.join(save_dir, filename), + dtc_threshold=dtc_threshold, gtc_threshold=gtc_threshold, + cttc_threshold=cttc_threshold, alpha_ct=alpha_ct, + alpha_st=alpha_st, unit_of_time='hour', max_efpr=max_efpr, + psds=psds, + ) + return psds, psd_roc diff --git a/src/desed_task/nnet/CNN.py b/src/desed_task/nnet/CNN.py new file mode 100644 index 0000000000000000000000000000000000000000..87a5ba504e184ccf91956297a0122327223a954e --- /dev/null +++ b/src/desed_task/nnet/CNN.py @@ -0,0 +1,114 @@ +import torch.nn as nn +import torch + + +class GLU(nn.Module): + def __init__(self, input_num): + super(GLU, self).__init__() + self.sigmoid = nn.Sigmoid() + self.linear = nn.Linear(input_num, input_num) + + def forward(self, x): + lin = self.linear(x.permute(0, 2, 3, 1)) + lin = lin.permute(0, 3, 1, 2) + sig = self.sigmoid(x) + res = lin * sig + return res + + +class ContextGating(nn.Module): + def __init__(self, input_num): + super(ContextGating, self).__init__() + self.sigmoid = nn.Sigmoid() + self.linear = nn.Linear(input_num, input_num) + + def forward(self, x): + lin = self.linear(x.permute(0, 2, 3, 1)) + lin = lin.permute(0, 3, 1, 2) + sig = self.sigmoid(lin) + res = x * sig + return res + + +class CNN(nn.Module): + def __init__( + self, + n_in_channel, + activation="Relu", + conv_dropout=0, + kernel_size=[3, 3, 3], + padding=[1, 1, 1], + stride=[1, 1, 1], + nb_filters=[64, 64, 64], + pooling=[(1, 4), (1, 4), (1, 4)], + normalization="batch", + **transformer_kwargs + ): + """ + Initialization of CNN network s + + Args: + n_in_channel: int, number of input channel + activation: 
str, activation function + conv_dropout: float, dropout + kernel_size: kernel size + padding: padding + stride: list, stride + nb_filters: number of filters + pooling: list of tuples, time and frequency pooling + normalization: choose between "batch" for BatchNormalization and "layer" for LayerNormalization. + """ + super(CNN, self).__init__() + + self.nb_filters = nb_filters + cnn = nn.Sequential() + + def conv(i, normalization="batch", dropout=None, activ="relu"): + nIn = n_in_channel if i == 0 else nb_filters[i - 1] + nOut = nb_filters[i] + cnn.add_module( + "conv{0}".format(i), + nn.Conv2d(nIn, nOut, kernel_size[i], stride[i], padding[i]), + ) + if normalization == "batch": + cnn.add_module( + "batchnorm{0}".format(i), + nn.BatchNorm2d(nOut, eps=0.001, momentum=0.99), + ) + elif normalization == "layer": + cnn.add_module("layernorm{0}".format(i), nn.GroupNorm(1, nOut)) + + if activ.lower() == "leakyrelu": + cnn.add_module("relu{0}".format(i), nn.LeakyReLU(0.2)) + elif activ.lower() == "relu": + cnn.add_module("relu{0}".format(i), nn.ReLU()) + elif activ.lower() == "glu": + cnn.add_module("glu{0}".format(i), GLU(nOut)) + elif activ.lower() == "cg": + cnn.add_module("cg{0}".format(i), ContextGating(nOut)) + + if dropout is not None: + cnn.add_module("dropout{0}".format(i), nn.Dropout(dropout)) + + # 128x862x64 + for i in range(len(nb_filters)): + conv(i, normalization=normalization, dropout=conv_dropout, activ=activation) + cnn.add_module( + "pooling{0}".format(i), nn.AvgPool2d(pooling[i]) + ) # bs x tframe x mels + + self.cnn = cnn + + def forward(self, x): + """ + Forward step of the CNN module + + Args: + x (Tensor): input batch of size (batch_size, n_channels, n_frames, n_freq) + + Returns: + Tensor: batch embedded + """ + # conv features + x = self.cnn(x) + return x diff --git a/src/desed_task/nnet/CRNN.py b/src/desed_task/nnet/CRNN.py new file mode 100644 index 0000000000000000000000000000000000000000..a5e0dbd8c3f4cdac79731542474fbecbdadbf482 --- /dev/null 
class CRNN(nn.Module):
    def __init__(
        self,
        n_in_channel=1,
        nclass=10,
        attention=True,
        activation="glu",
        dropout=0.5,
        train_cnn=True,
        rnn_type="BGRU",
        n_RNN_cell=128,
        n_layers_RNN=2,
        dropout_recurrent=0,
        cnn_integration=False,
        freeze_bn=False,
        use_embeddings=False,
        embedding_size=527,
        embedding_type="global",
        frame_emb_enc_dim=512,
        aggregation_type="global",
        **kwargs,
    ):
        """
        Initialization of CRNN model

        Args:
            n_in_channel: int, number of input channels
            nclass: int, number of classes
            attention: bool, adding attention layer or not
            activation: str, activation function
            dropout: float, dropout
            train_cnn: bool, training cnn layers
            rnn_type: str, rnn type (only "BGRU" is supported)
            n_RNN_cell: int, RNN nodes
            n_layers_RNN: int, number of RNN layers
            dropout_recurrent: float, recurrent layers dropout
            cnn_integration: bool, integration of cnn
            freeze_bn: bool, keep BatchNorm2d layers frozen during training
            use_embeddings: bool, fuse pretrained embeddings into the features
            embedding_size: int, dimension of the pretrained embeddings
            embedding_type: str, kept for configuration compatibility
            frame_emb_enc_dim: int, hidden size of the frame-embedding encoder
            aggregation_type: str, how embeddings are combined with CNN features
                ("global", "frame", "interpolate" or "pool1d")
            **kwargs: keyword arguments forwarded to CNN.

        Raises:
            NotImplementedError: if rnn_type is not "BGRU".
        """
        super(CRNN, self).__init__()

        self.n_in_channel = n_in_channel
        self.attention = attention
        self.cnn_integration = cnn_integration
        self.freeze_bn = freeze_bn
        self.use_embeddings = use_embeddings
        self.embedding_type = embedding_type
        self.aggregation_type = aggregation_type

        n_in_cnn = n_in_channel
        if cnn_integration:
            n_in_cnn = 1

        self.cnn = CNN(
            n_in_channel=n_in_cnn, activation=activation, conv_dropout=dropout, **kwargs
        )

        self.train_cnn = train_cnn
        if not train_cnn:
            for param in self.cnn.parameters():
                param.requires_grad = False

        if rnn_type == "BGRU":
            nb_in = self.cnn.nb_filters[-1]
            if self.cnn_integration:
                # self.fc = nn.Linear(nb_in * n_in_channel, nb_in)
                nb_in = nb_in * n_in_channel
            self.rnn = BidirectionalGRU(
                n_in=nb_in,
                n_hidden=n_RNN_cell,
                dropout=dropout_recurrent,
                num_layers=n_layers_RNN,
            )
        else:
            # BUGFIX: the exception was instantiated but never raised, silently
            # leaving the model without an RNN for unsupported rnn_type values.
            raise NotImplementedError("Only BGRU supported for CRNN for now")

        self.dropout = nn.Dropout(dropout)
        self.dense = nn.Linear(n_RNN_cell * 2, nclass)
        self.sigmoid = nn.Sigmoid()

        if self.attention:
            self.dense_softmax = nn.Linear(n_RNN_cell * 2, nclass)
            self.softmax = nn.Softmax(dim=-1)

        if self.use_embeddings:
            if self.aggregation_type == "frame":
                self.frame_embs_encoder = nn.GRU(batch_first=True, input_size=embedding_size,
                                                 hidden_size=512,
                                                 bidirectional=True)
                self.shrink_emb = torch.nn.Sequential(torch.nn.Linear(2 * frame_emb_enc_dim, nb_in),
                                                      torch.nn.LayerNorm(nb_in))
                self.cat_tf = torch.nn.Linear(2 * nb_in, nb_in)
            elif self.aggregation_type == "global":
                self.shrink_emb = torch.nn.Sequential(torch.nn.Linear(embedding_size, nb_in),
                                                      torch.nn.LayerNorm(nb_in))
                self.cat_tf = torch.nn.Linear(2 * nb_in, nb_in)
            elif self.aggregation_type == "interpolate":
                self.cat_tf = torch.nn.Linear(nb_in + embedding_size, nb_in)
            elif self.aggregation_type == "pool1d":
                self.cat_tf = torch.nn.Linear(nb_in + embedding_size, nb_in)
            else:
                self.cat_tf = torch.nn.Linear(2 * nb_in, nb_in)

    def forward(self, x, pad_mask=None, embeddings=None):
        """Compute strong (frame-level) and weak (clip-level) predictions.

        Args:
            x: input features (batch, n_freq, n_frames) — transposed internally.
            pad_mask: optional boolean mask; masked positions are excluded from
                the attention softmax.
            embeddings: optional pretrained embeddings, fused according to
                `aggregation_type` when `use_embeddings` is set.

        Returns:
            tuple: (strong predictions [bs, nclass, frames],
                    weak predictions [bs, nclass])
        """
        x = x.transpose(1, 2).unsqueeze(1)

        # input size : (batch_size, n_channels, n_frames, n_freq)
        if self.cnn_integration:
            bs_in, nc_in = x.size(0), x.size(1)
            x = x.view(bs_in * nc_in, 1, *x.shape[2:])

        # conv features
        x = self.cnn(x)
        bs, chan, frames, freq = x.size()
        if self.cnn_integration:
            x = x.reshape(bs_in, chan * nc_in, frames, freq)

        if freq != 1:
            warnings.warn(
                f"Output shape is: {(bs, frames, chan * freq)}, from {freq} staying freq"
            )
            x = x.permute(0, 2, 1, 3)
            x = x.contiguous().view(bs, frames, chan * freq)
        else:
            x = x.squeeze(-1)
            x = x.permute(0, 2, 1)  # [bs, frames, chan]

        # optionally fuse pretrained embeddings into the CNN features
        if self.use_embeddings:
            if self.aggregation_type == "global":
                x = self.cat_tf(torch.cat((x, self.shrink_emb(embeddings).unsqueeze(1).repeat(1, x.shape[1], 1)), -1))
            elif self.aggregation_type == "frame":
                # there can be some mismatch between seq length of cnn of crnn and the pretrained embeddings,
                # we use an rnn as an encoder and we use the last state
                last, _ = self.frame_embs_encoder(embeddings.transpose(1, 2))
                embeddings = last[:, -1]
                x = self.cat_tf(torch.cat((x, self.shrink_emb(embeddings).unsqueeze(1).repeat(1, x.shape[1], 1)), -1))
            elif self.aggregation_type == "interpolate":
                output_shape = (embeddings.shape[1], x.shape[1])
                reshape_emb = torch.nn.functional.interpolate(embeddings.unsqueeze(1), size=output_shape, mode='nearest-exact').squeeze(1).transpose(1, 2)
                x = self.cat_tf(torch.cat((x, reshape_emb), -1))
            elif self.aggregation_type == "pool1d":
                reshape_emb = torch.nn.functional.adaptive_avg_pool1d(embeddings, x.shape[1]).transpose(1, 2)
                x = self.cat_tf(torch.cat((x, reshape_emb), -1))
            else:
                pass

        x = self.rnn(x)
        x = self.dropout(x)
        strong = self.dense(x)  # [bs, frames, nclass]
        strong = self.sigmoid(strong)
        if self.attention:
            sof = self.dense_softmax(x)  # [bs, frames, nclass]
            if pad_mask is not None:
                sof = sof.masked_fill(pad_mask.transpose(1, 2), -1e30)  # mask attention
            sof = self.softmax(sof)
            sof = torch.clamp(sof, min=1e-7, max=1)
            weak = (strong * sof).sum(1) / sof.sum(1)  # [bs, nclass]
        else:
            weak = strong.mean(1)
        return strong.transpose(1, 2), weak

    def train(self, mode=True):
        """
        Override the default train() to freeze the BN parameters
        """
        super(CRNN, self).train(mode)
        if self.freeze_bn:
            # Both messages were printed under the same flag before; kept,
            # but the redundant nested `if self.freeze_bn` checks are removed.
            print("Freezing Mean/Var of BatchNorm2D.")
            print("Freezing Weight/Bias of BatchNorm2D.")
            for m in self.modules():
                if isinstance(m, nn.BatchNorm2d):
                    m.eval()
                    m.weight.requires_grad = False
                    m.bias.requires_grad = False
self.rnn(input_feat) + b, T, h = recurrent.size() + t_rec = recurrent.contiguous().view(b * T, h) + + output = self.embedding(t_rec) # [T * b, nOut] + output = output.view(b, T, -1) + return output diff --git a/src/desed_task/nnet/__init__.py b/src/desed_task/nnet/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/src/desed_task/nnet/__pycache__/CNN.cpython-310.pyc b/src/desed_task/nnet/__pycache__/CNN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..45c34d773fd4972b10e2b6807ec59e3c300f2f10 Binary files /dev/null and b/src/desed_task/nnet/__pycache__/CNN.cpython-310.pyc differ diff --git a/src/desed_task/nnet/__pycache__/CRNN.cpython-310.pyc b/src/desed_task/nnet/__pycache__/CRNN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..29b1933c4acfb51e35d695e9e558a87e2a8269ce Binary files /dev/null and b/src/desed_task/nnet/__pycache__/CRNN.cpython-310.pyc differ diff --git a/src/desed_task/nnet/__pycache__/RNN.cpython-310.pyc b/src/desed_task/nnet/__pycache__/RNN.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..663b01d6f9c13908a225275aca5eae31d2d4f304 Binary files /dev/null and b/src/desed_task/nnet/__pycache__/RNN.cpython-310.pyc differ diff --git a/src/desed_task/nnet/__pycache__/__init__.cpython-310.pyc b/src/desed_task/nnet/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4a221eb1381992a682c55d7430f183fbeeb9bd5d Binary files /dev/null and b/src/desed_task/nnet/__pycache__/__init__.cpython-310.pyc differ diff --git a/src/desed_task/utils/__init__.py b/src/desed_task/utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..1d87ad000b03cd4e6b2792a82bb959a2f90a5a4a --- /dev/null +++ b/src/desed_task/utils/__init__.py @@ -0,0 +1,2 @@ +from .encoder import ManyHotEncoder +from .schedulers import 
def download_from_url(url, destination):
    """Download `url` to `destination` with a progress bar.

    The download is skipped when the destination file already exists. Parent
    directories are created as needed.

    Args:
        url: str, the URL to fetch.
        destination: str or Path, the local file path to write.

    Raises:
        requests.HTTPError: if the server answers with an error status
            (previously an HTML error page could be silently written to disk).
        IOError: if the downloaded size does not match the announced
            Content-Length (previously only a message was printed).
    """
    if os.path.exists(destination):
        print("Skipping download as file in {} exists already".format(destination))
        return
    response = requests.get(url, stream=True, timeout=30)
    response.raise_for_status()
    total_size_in_bytes = int(response.headers.get('content-length', 0))
    block_size = 1024  # 1 Kibibyte
    os.makedirs(Path(destination).parent, exist_ok=True)
    progress_bar = tqdm(total=total_size_in_bytes, unit='iB', unit_scale=True)
    try:
        with open(destination, 'wb') as file:
            for data in response.iter_content(block_size):
                progress_bar.update(len(data))
                file.write(data)
    finally:
        # Close the bar even if the connection drops mid-transfer.
        progress_bar.close()
    if total_size_in_bytes != 0 and progress_bar.n != total_size_in_bytes:
        raise IOError(
            "Incomplete download of {}: got {} of {} bytes".format(
                url, progress_bar.n, total_size_in_bytes
            )
        )
class ManyHotEncoder:
    """Encode event labels into numpy arrays (1 = class present, 0 = absent).

    Adapted after DecisionEncoder.find_contiguous_regions in
    https://github.com/DCASE-REPO/dcase_util/blob/master/dcase_util/data/decisions.py

    Multiple 1s can appear on the same line (multi-label problem).

    Args:
        labels: list, the classes which will be encoded
        audio_len: float, length of the audio clips in seconds
        frame_len: int, STFT frame length in samples (kept for state round-trips)
        frame_hop: int, STFT hop size in samples
        net_pooling: int, temporal pooling factor applied by the network
        fs: int, sampling frequency in Hz
    """

    def __init__(
        self, labels, audio_len, frame_len, frame_hop, net_pooling=1, fs=16000
    ):
        # BUGFIX: the original membership test `type(labels) in [np.ndarray, np.array]`
        # compared against np.array, which is a function, not a type.
        if isinstance(labels, np.ndarray):
            labels = labels.tolist()
        self.labels = labels
        self.audio_len = audio_len
        self.frame_len = frame_len
        self.frame_hop = frame_hop
        self.fs = fs
        self.net_pooling = net_pooling
        n_samples = self.audio_len * self.fs
        # Number of output frames after hopping and network pooling.
        self.n_frames = int(int(n_samples / self.frame_hop) / self.net_pooling)

    def encode_weak(self, labels):
        """ Encode a list of weak labels into a numpy array

        Args:
            labels: list, str or pd.DataFrame, labels to encode. The string
                "empty" yields a vector of -1 (sentinel for unlabeled data);
                any other string is split on commas.

        Returns:
            numpy.array
               A vector containing 1 for each label, and 0 everywhere else
        """
        if isinstance(labels, str):
            if labels == "empty":
                return np.zeros(len(self.labels)) - 1
            labels = labels.split(",")
        if isinstance(labels, pd.DataFrame):
            if labels.empty:
                labels = []
            elif "event_label" in labels.columns:
                labels = labels["event_label"]
        y = np.zeros(len(self.labels))
        for label in labels:
            if not pd.isna(label):
                y[self.labels.index(label)] = 1
        return y

    def _time_to_frame(self, time):
        """Convert seconds to a (float) frame index, clipped to [0, n_frames]."""
        frame = (time * self.fs) / self.frame_hop
        return np.clip(frame / self.net_pooling, a_min=0, a_max=self.n_frames)

    def _frame_to_time(self, frame):
        """Convert a frame index back to seconds, clipped to [0, audio_len]."""
        time = frame * self.net_pooling / (self.fs / self.frame_hop)
        return np.clip(time, a_min=0, a_max=self.audio_len)

    def encode_strong_df(self, label_df):
        """Encode strong labels (with onsets/offsets) for one file.

        Args:
            label_df: pd.DataFrame or pd.Series with onset, offset and
                event_label; or a list of labels / [label, onset, offset]
                triples; or the string "empty" (yields a -1 matrix).

        Returns:
            numpy.array of shape (n_frames, n_labels), 1 where a label is
            active (offset frame excluded).
        """
        assert any(
            [x is not None for x in [self.audio_len, self.frame_len, self.frame_hop]]
        )

        samples_len = self.n_frames
        if isinstance(label_df, str):
            if label_df == "empty":
                return np.zeros((samples_len, len(self.labels))) - 1
        y = np.zeros((samples_len, len(self.labels)))
        if isinstance(label_df, pd.DataFrame):
            if {"onset", "offset", "event_label"}.issubset(label_df.columns):
                for _, row in label_df.iterrows():
                    if not pd.isna(row["event_label"]):
                        i = self.labels.index(row["event_label"])
                        onset = int(self._time_to_frame(row["onset"]))
                        offset = int(np.ceil(self._time_to_frame(row["offset"])))
                        # offset not included (hypothesis of overlapping frames, so ok)
                        y[onset:offset, i] = 1

        elif isinstance(label_df, (pd.Series, list, np.ndarray)):
            # list of lists or list of strings
            if isinstance(label_df, pd.Series):
                if {"onset", "offset", "event_label"}.issubset(
                    label_df.index
                ):  # means only one value
                    if not pd.isna(label_df["event_label"]):
                        i = self.labels.index(label_df["event_label"])
                        onset = int(self._time_to_frame(label_df["onset"]))
                        offset = int(np.ceil(self._time_to_frame(label_df["offset"])))
                        y[onset:offset, i] = 1
                    return y

            for event_label in label_df:
                # string -> weak label encoded as active on all frames
                if isinstance(event_label, str):
                    if event_label != "":
                        y[:, self.labels.index(event_label)] = 1

                # [label, onset, offset] triple
                elif len(event_label) == 3:
                    if event_label[0] != "":
                        i = self.labels.index(event_label[0])
                        onset = int(self._time_to_frame(event_label[1]))
                        offset = int(np.ceil(self._time_to_frame(event_label[2])))
                        y[onset:offset, i] = 1

                else:
                    raise NotImplementedError(
                        "cannot encode strong, type mismatch: {}".format(
                            type(event_label)
                        )
                    )

        else:
            raise NotImplementedError(
                "To encode_strong, type is pandas.Dataframe with onset, offset and event_label"
                "columns, or it is a list or pandas Series of event labels, "
                "type given: {}".format(type(label_df))
            )
        return y

    def decode_weak(self, labels):
        """ Decode the encoded weak labels
        Args:
            labels: numpy.array, the encoded labels to be decoded

        Returns:
            list
                Decoded labels, list of string
        """
        return [self.labels[i] for i, value in enumerate(labels) if value == 1]

    def decode_strong(self, labels):
        """ Decode the encoded strong labels
        Args:
            labels: numpy.array, the encoded labels to be decoded
        Returns:
            list
                Decoded labels, list of list: [[label, onset, offset], ...]
        """
        result_labels = []
        for i, label_column in enumerate(labels.T):
            change_indices = DecisionEncoder().find_contiguous_regions(label_column)
            # append [label, onset, offset] in the result list
            for row in change_indices:
                result_labels.append(
                    [
                        self.labels[i],
                        self._frame_to_time(row[0]),
                        self._frame_to_time(row[1]),
                    ]
                )
        return result_labels

    def state_dict(self):
        """Return the constructor arguments needed to rebuild this encoder."""
        return {
            "labels": self.labels,
            "audio_len": self.audio_len,
            "frame_len": self.frame_len,
            "frame_hop": self.frame_hop,
            "net_pooling": self.net_pooling,
            "fs": self.fs,
        }

    @classmethod
    def load_state_dict(cls, state_dict):
        """Rebuild a ManyHotEncoder from a state_dict() snapshot."""
        return cls(
            state_dict["labels"],
            state_dict["audio_len"],
            state_dict["frame_len"],
            state_dict["frame_hop"],
            state_dict["net_pooling"],
            state_dict["fs"],
        )
class TorchScaler(torch.nn.Module):
    """
    This torch module implements scaling for input tensors, both instance based
    and dataset-wide statistic based.

    Args:
        statistic: str, (default='dataset'), represent how to compute the statistic for normalisation.
            Choice in {'dataset', 'instance'}.
            'dataset' needs to be 'fit()' with a dataloader of the dataset.
            'instance' apply the normalisation at an instance-level, so compute the statistics on the instance
            specified, it can be a clip or a batch.
        normtype: str, (default='standard') the type of normalisation to use.
            Choice in {'standard', 'mean', 'minmax'}. 'standard' applies a classic normalisation with mean and standard
            deviation. 'mean' substract the mean to the data. 'minmax' substract the minimum of the data and divide by
            the difference between max and min.
        dims: tuple, the dimensions over which statistics are computed.
        eps: float, numerical-stability constant added to denominators.
    """

    def __init__(self, statistic="dataset", normtype="standard", dims=(1, 2), eps=1e-8):
        super(TorchScaler, self).__init__()
        assert statistic in ["dataset", "instance", None]
        assert normtype in ["standard", "mean", "minmax", None]
        if statistic == "dataset" and normtype == "minmax":
            raise NotImplementedError(
                "statistic==dataset and normtype==minmax is not currently implemented."
            )
        self.statistic = statistic
        self.normtype = normtype
        self.dims = dims
        self.eps = eps

    def load_state_dict(self, state_dict, strict=True):
        # Only dataset-statistic scalers carry buffers worth restoring.
        if self.statistic == "dataset":
            super(TorchScaler, self).load_state_dict(state_dict, strict)

    def _load_from_state_dict(
        self,
        state_dict,
        prefix,
        local_metadata,
        strict,
        missing_keys,
        unexpected_keys,
        error_msgs,
    ):
        if self.statistic == "dataset":
            super(TorchScaler, self)._load_from_state_dict(
                state_dict,
                prefix,
                local_metadata,
                strict,
                missing_keys,
                unexpected_keys,
                error_msgs,
            )

    def fit(self, dataloader, transform_func=lambda x: x[0]):
        """
        Scaler fitting

        Args:
            dataloader (DataLoader): training data DataLoader
            transform_func (lambda function, optional): Transforms applied to the data.
                Defaults to lambda x: x[0].

        Raises:
            ValueError: if the dataloader yields no batches (previously this
                crashed with UnboundLocalError).
        """
        indx = 0
        for batch in tqdm.tqdm(dataloader):

            feats = transform_func(batch)
            if indx == 0:
                mean = torch.mean(feats, self.dims, keepdim=True).mean(0).unsqueeze(0)
                mean_squared = (
                    torch.mean(feats ** 2, self.dims, keepdim=True).mean(0).unsqueeze(0)
                )
            else:
                mean += torch.mean(feats, self.dims, keepdim=True).mean(0).unsqueeze(0)
                mean_squared += (
                    torch.mean(feats ** 2, self.dims, keepdim=True).mean(0).unsqueeze(0)
                )
            indx += 1

        if indx == 0:
            raise ValueError("Cannot fit TorchScaler on an empty dataloader.")

        mean /= indx
        mean_squared /= indx

        self.register_buffer("mean", mean)
        self.register_buffer("mean_squared", mean_squared)

    def forward(self, tensor):
        """Apply the configured normalisation to `tensor` (identity if disabled)."""
        if self.statistic is None or self.normtype is None:
            return tensor

        if self.statistic == "dataset":
            assert hasattr(self, "mean") and hasattr(
                self, "mean_squared"
            ), "TorchScaler should be fit before used if statistics=dataset"
            # BUGFIX: the assert message was truncated mid-sentence.
            assert tensor.ndim == self.mean.ndim, (
                "Pre-computed statistics do not match the input tensor's rank"
            )
            if self.normtype == "mean":
                return tensor - self.mean
            elif self.normtype == "standard":
                std = torch.sqrt(self.mean_squared - self.mean ** 2)
                return (tensor - self.mean) / (std + self.eps)
            else:
                raise NotImplementedError

        else:
            if self.normtype == "mean":
                return tensor - torch.mean(tensor, self.dims, keepdim=True)
            elif self.normtype == "standard":
                return (tensor - torch.mean(tensor, self.dims, keepdim=True)) / (
                    torch.std(tensor, self.dims, keepdim=True) + self.eps
                )
            elif self.normtype == "minmax":
                return (tensor - torch.amin(tensor, dim=self.dims, keepdim=True)) / (
                    torch.amax(tensor, dim=self.dims, keepdim=True)
                    - torch.amin(tensor, dim=self.dims, keepdim=True)
                    + self.eps
                )
# Copied from https://github.com/asteroid-team/asteroid/blob/master/asteroid/engine/schedulers.py
# Copied since it is the last function we still use from asteroid (and avoid other dependencies)
class BaseScheduler(object):
    '''Base class for the step-wise scheduler logic.

    Args:
        optimizer (Optimize): Optimizer instance to apply lr schedule on.

    Subclass this and overwrite ``_get_lr`` to write your own step-wise scheduler.
    '''

    def __init__(self, optimizer):
        self.optimizer = optimizer
        self.step_num = 0

    def zero_grad(self):
        # Proxy straight to the wrapped optimizer.
        self.optimizer.zero_grad()

    def _get_lr(self):
        raise NotImplementedError

    def _set_lr(self, lr):
        for param_group in self.optimizer.param_groups:
            param_group["lr"] = lr

    def step(self, metrics=None, epoch=None):
        '''Update step-wise learning rate before optimizer.step.'''
        self.step_num += 1
        self._set_lr(self._get_lr())

    def load_state_dict(self, state_dict):
        self.__dict__.update(state_dict)

    def state_dict(self):
        # The optimizer itself is checkpointed separately.
        return {key: value for key, value in self.__dict__.items() if key != "optimizer"}

    def as_tensor(self, start=0, stop=100_000):
        '''Returns the scheduler values from start to stop.'''
        lr_list = []
        for _ in range(start, stop):
            self.step_num += 1
            lr_list.append(self._get_lr())
        self.step_num = 0
        return torch.tensor(lr_list)

    def plot(self, start=0, stop=100_000):  # noqa
        '''Plot the scheduler values from start to stop.'''
        import matplotlib.pyplot as plt

        all_lr = self.as_tensor(start=start, stop=stop)
        plt.plot(all_lr.numpy())
        plt.show()


class ExponentialWarmup(BaseScheduler):
    """ Scheduler to apply ramp-up during training to the learning rate.

    Args:
        optimizer: torch.optimizer.Optimizer, the optimizer from which to rampup the value from
        max_lr: float, the maximum learning to use at the end of ramp-up.
        rampup_length: int, the length of the rampup (number of steps).
        exponent: float, the exponent to be used.
    """

    def __init__(self, optimizer, max_lr, rampup_length, exponent=-5.0):
        super().__init__(optimizer)
        self.rampup_len = rampup_length
        self.max_lr = max_lr
        self.step_num = 1
        self.exponent = exponent

    def _get_scaling_factor(self):
        # Factor in (0, 1] that reaches 1.0 at the end of the ramp-up.
        if self.rampup_len == 0:
            return 1.0
        current = np.clip(self.step_num, 0.0, self.rampup_len)
        phase = 1.0 - current / self.rampup_len
        return float(np.exp(self.exponent * phase * phase))

    def _get_lr(self):
        return self.max_lr * self._get_scaling_factor()
import Optional + +logger = logging.getLogger(__name__) + + +class BEATsConfig: + def __init__(self, cfg=None): + self.input_patch_size: int = -1 # path size of patch embedding + self.embed_dim: int = 512 # patch embedding dimension + self.conv_bias: bool = False # include bias in conv encoder + + self.encoder_layers: int = 12 # num encoder layers in the transformer + self.encoder_embed_dim: int = 768 # encoder embedding dimension + self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN + self.encoder_attention_heads: int = 12 # num encoder attention heads + self.activation_fn: str = "gelu" # activation function to use + + self.layer_wise_gradient_decay_ratio: float = 1.0 # ratio for layer-wise gradient decay + self.layer_norm_first: bool = False # apply layernorm first in the transformer + self.deep_norm: bool = False # apply deep_norm first in the transformer + + # dropouts + self.dropout: float = 0.1 # dropout probability for the transformer + self.attention_dropout: float = 0.1 # dropout probability for attention weights + self.activation_dropout: float = 0.0 # dropout probability after activation in FFN + self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer + self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) + + # positional embeddings + self.conv_pos: int = 128 # number of filters for convolutional positional embeddings + self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding + + # relative position embedding + self.relative_position_embedding: bool = False # apply relative position embedding + self.num_buckets: int = 320 # number of buckets for relative position embedding + self.max_distance: int = 1280 # maximum distance for relative position embedding + self.gru_rel_pos: bool = False # apply gated relative position embedding + + # label predictor + self.finetuned_model: bool = False # whether the model is a fine-tuned model. 
+ self.predictor_dropout: float = 0.1 # dropout probability for the predictor + self.predictor_class: int = 527 # target class number for the predictor + + if cfg is not None: + self.update(cfg) + + def update(self, cfg: dict): + self.__dict__.update(cfg) + + +class BEATs(nn.Module): + def __init__( + self, + cfg: BEATsConfig, + ) -> None: + super().__init__() + logger.info(f"BEATs Config: {cfg.__dict__}") + + self.cfg = cfg + + self.embed = cfg.embed_dim + self.post_extract_proj = ( + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim + else None + ) + + self.input_patch_size = cfg.input_patch_size + self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, + bias=cfg.conv_bias) + + self.dropout_input = nn.Dropout(cfg.dropout_input) + + assert not cfg.deep_norm or not cfg.layer_norm_first + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.embed) + + if cfg.finetuned_model: + self.predictor_dropout = nn.Dropout(cfg.predictor_dropout) + self.predictor = nn.Linear(cfg.encoder_embed_dim, cfg.predictor_class) + else: + self.predictor = None + + def forward_padding_mask( + self, + features: torch.Tensor, + padding_mask: torch.Tensor, + ) -> torch.Tensor: + extra = padding_mask.size(1) % features.size(1) + if extra > 0: + padding_mask = padding_mask[:, :-extra] + padding_mask = padding_mask.view( + padding_mask.size(0), features.size(1), -1 + ) + padding_mask = padding_mask.all(-1) + return padding_mask + + def preprocess( + self, + source: torch.Tensor, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, + ) -> torch.Tensor: + fbanks = [] + for waveform in source: + waveform = waveform.unsqueeze(0) * 2 ** 15 + fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10) + fbanks.append(fbank) + fbank = torch.stack(fbanks, dim=0) + fbank = (fbank - fbank_mean) / (2 * fbank_std) + return fbank + + def 
extract_features( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, + ): + fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std) + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(fbank, padding_mask) + + fbank = fbank.unsqueeze(1) + features = self.patch_embedding(fbank) + features = features.reshape(features.shape[0], features.shape[1], -1) + features = features.transpose(1, 2) + features = self.layer_norm(features) + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(features, padding_mask) + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + x = self.dropout_input(features) + + x, layer_results = self.encoder( + x, + padding_mask=padding_mask, + ) + + if self.predictor is not None: + x = self.predictor_dropout(x) + logits = self.predictor(x) + + if padding_mask is not None and padding_mask.any(): + logits[padding_mask] = 0 + logits = logits.sum(dim=1) + logits = logits / (~padding_mask).sum(dim=1).unsqueeze(-1).expand_as(logits) + else: + logits = logits.mean(dim=1) + + lprobs = torch.sigmoid(logits) + + return lprobs, padding_mask + else: + return x, padding_mask + + + +class BEATsModel(nn.Module): + def __init__(self, cfg_path): + super().__init__() + # load the pre-trained checkpoint + checkpoint = torch.load(cfg_path) + cfg = BEATsConfig(checkpoint['cfg']) + BEATs_model = BEATs(cfg) + BEATs_model.load_state_dict(checkpoint['model']) + self.model = BEATs_model + self.ckpt = checkpoint + + def forward(self, x): + features = self.model.extract_features(x)[0] + global_features = features.mean(dim=1) + return {"global": global_features.float(), "frame": features.transpose(1, 2).float()} \ No newline at end of file diff --git a/src/local/beats/Tokenizers.py b/src/local/beats/Tokenizers.py new file mode 100644 index 
0000000000000000000000000000000000000000..50f6c379e1c03929178fcaff66aebd40f507dd7d --- /dev/null +++ b/src/local/beats/Tokenizers.py @@ -0,0 +1,173 @@ +# -------------------------------------------------------- +# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058) +# Github source: https://github.com/microsoft/unilm/tree/master/beats +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on fairseq code bases +# https://github.com/pytorch/fairseq +# -------------------------------------------------------- + + +import torch +import torch.nn as nn +from torch.nn import LayerNorm +import torchaudio.compliance.kaldi as ta_kaldi + +from .backbone import ( + TransformerEncoder, +) +from .quantizer import ( + NormEMAVectorQuantizer, +) + +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + + +class TokenizersConfig: + def __init__(self, cfg=None): + self.input_patch_size: int = -1 # path size of patch embedding + self.embed_dim: int = 512 # patch embedding dimension + self.conv_bias: bool = False # include bias in conv encoder + + self.encoder_layers: int = 12 # num encoder layers in the transformer + self.encoder_embed_dim: int = 768 # encoder embedding dimension + self.encoder_ffn_embed_dim: int = 3072 # encoder embedding dimension for FFN + self.encoder_attention_heads: int = 12 # num encoder attention heads + self.activation_fn: str = "gelu" # activation function to use + + self.layer_norm_first: bool = False # apply layernorm first in the transformer + self.deep_norm: bool = False # apply deep_norm first in the transformer + + # dropouts + self.dropout: float = 0.1 # dropout probability for the transformer + self.attention_dropout: float = 0.1 # dropout probability for attention weights + self.activation_dropout: float = 0.0 # dropout probability after activation in FFN + self.encoder_layerdrop: float = 0.0 # probability of dropping a tarnsformer layer + 
self.dropout_input: float = 0.0 # dropout to apply to the input (after feat extr) + + # positional embeddings + self.conv_pos: int = 128 # number of filters for convolutional positional embeddings + self.conv_pos_groups: int = 16 # number of groups for convolutional positional embedding + + # relative position embedding + self.relative_position_embedding: bool = False # apply relative position embedding + self.num_buckets: int = 320 # number of buckets for relative position embedding + self.max_distance: int = 1280 # maximum distance for relative position embedding + self.gru_rel_pos: bool = False # apply gated relative position embedding + + # quantizer + self.quant_n: int = 1024 # codebook number in quantizer + self.quant_dim: int = 256 # codebook dimension in quantizer + + if cfg is not None: + self.update(cfg) + + def update(self, cfg: dict): + self.__dict__.update(cfg) + + +class Tokenizers(nn.Module): + def __init__( + self, + cfg: TokenizersConfig, + ) -> None: + super().__init__() + logger.info(f"Tokenizers Config: {cfg.__dict__}") + + self.cfg = cfg + + self.embed = cfg.embed_dim + self.post_extract_proj = ( + nn.Linear(self.embed, cfg.encoder_embed_dim) + if self.embed != cfg.encoder_embed_dim + else None + ) + + self.input_patch_size = cfg.input_patch_size + self.patch_embedding = nn.Conv2d(1, self.embed, kernel_size=self.input_patch_size, stride=self.input_patch_size, + bias=cfg.conv_bias) + + self.dropout_input = nn.Dropout(cfg.dropout_input) + + assert not cfg.deep_norm or not cfg.layer_norm_first + self.encoder = TransformerEncoder(cfg) + self.layer_norm = LayerNorm(self.embed) + + self.quantize = NormEMAVectorQuantizer( + n_embed=cfg.quant_n, embedding_dim=cfg.quant_dim, beta=1.0, kmeans_init=True, decay=0.99, + ) + self.quant_n = cfg.quant_n + self.quantize_layer = nn.Sequential( + nn.Linear(cfg.encoder_embed_dim, cfg.encoder_embed_dim), + nn.Tanh(), + nn.Linear(cfg.encoder_embed_dim, cfg.quant_dim) # for quantize + ) + + def forward_padding_mask( 
+ self, + features: torch.Tensor, + padding_mask: torch.Tensor, + ) -> torch.Tensor: + extra = padding_mask.size(1) % features.size(1) + if extra > 0: + padding_mask = padding_mask[:, :-extra] + padding_mask = padding_mask.view( + padding_mask.size(0), features.size(1), -1 + ) + padding_mask = padding_mask.all(-1) + return padding_mask + + def preprocess( + self, + source: torch.Tensor, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, + ) -> torch.Tensor: + fbanks = [] + for waveform in source: + waveform = waveform.unsqueeze(0) * 2 ** 15 + fbank = ta_kaldi.fbank(waveform, num_mel_bins=128, sample_frequency=16000, frame_length=25, frame_shift=10) + fbanks.append(fbank) + fbank = torch.stack(fbanks, dim=0) + fbank = (fbank - fbank_mean) / (2 * fbank_std) + return fbank + + def extract_labels( + self, + source: torch.Tensor, + padding_mask: Optional[torch.Tensor] = None, + fbank_mean: float = 15.41663, + fbank_std: float = 6.55582, + ): + fbank = self.preprocess(source, fbank_mean=fbank_mean, fbank_std=fbank_std) + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(fbank, padding_mask) + + fbank = fbank.unsqueeze(1) + features = self.patch_embedding(fbank) + features = features.reshape(features.shape[0], features.shape[1], -1) + features = features.transpose(1, 2) + features = self.layer_norm(features) + + if padding_mask is not None: + padding_mask = self.forward_padding_mask(features, padding_mask) + + if self.post_extract_proj is not None: + features = self.post_extract_proj(features) + + x = self.dropout_input(features) + + x, layer_results = self.encoder( + x, + padding_mask=padding_mask, + ) + + quantize_input = self.quantize_layer(x) + quantize_feature, embed_loss, embed_ind = self.quantize(quantize_input) + + return embed_ind + diff --git a/src/local/beats/__init__.py b/src/local/beats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git 
a/src/local/beats/backbone.py b/src/local/beats/backbone.py new file mode 100644 index 0000000000000000000000000000000000000000..53ee744a24241f6e158dfad4104a5761a254d8a3 --- /dev/null +++ b/src/local/beats/backbone.py @@ -0,0 +1,783 @@ +# -------------------------------------------------------- +# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058) +# Github source: https://github.com/microsoft/unilm/tree/master/beats +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on fairseq code bases +# https://github.com/pytorch/fairseq +# -------------------------------------------------------- + +import math +import numpy as np +from typing import Dict, Optional, Tuple +import torch +from torch import Tensor, nn +import torch.nn.functional as F +from torch.nn import LayerNorm, Parameter +from .modules import ( + GradMultiply, + SamePad, + get_activation_fn, + GLU_Linear, + quant_noise, +) + + +class TransformerEncoder(nn.Module): + def __init__(self, args): + super().__init__() + + self.dropout = args.dropout + self.embedding_dim = args.encoder_embed_dim + + self.pos_conv = nn.Conv1d( + self.embedding_dim, + self.embedding_dim, + kernel_size=args.conv_pos, + padding=args.conv_pos // 2, + groups=args.conv_pos_groups, + ) + dropout = 0 + std = math.sqrt((4 * (1.0 - dropout)) / (args.conv_pos * self.embedding_dim)) + nn.init.normal_(self.pos_conv.weight, mean=0, std=std) + nn.init.constant_(self.pos_conv.bias, 0) + + self.pos_conv = nn.utils.weight_norm(self.pos_conv, name="weight", dim=2) + self.pos_conv = nn.Sequential(self.pos_conv, SamePad(args.conv_pos), nn.GELU()) + + if hasattr(args, "relative_position_embedding"): + self.relative_position_embedding = args.relative_position_embedding + self.num_buckets = args.num_buckets + self.max_distance = args.max_distance + else: + self.relative_position_embedding = False + self.num_buckets = 0 + self.max_distance = 0 + + self.layers = 
nn.ModuleList( + [ + TransformerSentenceEncoderLayer( + embedding_dim=self.embedding_dim, + ffn_embedding_dim=args.encoder_ffn_embed_dim, + num_attention_heads=args.encoder_attention_heads, + dropout=self.dropout, + attention_dropout=args.attention_dropout, + activation_dropout=args.activation_dropout, + activation_fn=args.activation_fn, + layer_norm_first=args.layer_norm_first, + deep_norm=args.deep_norm, + has_relative_attention_bias=self.relative_position_embedding, + num_buckets=self.num_buckets, + max_distance=self.max_distance, + gru_rel_pos=args.gru_rel_pos, + encoder_layers=args.encoder_layers, + ) + for i in range(args.encoder_layers) + ] + ) + if self.relative_position_embedding: + for i in range(1, args.encoder_layers): + del self.layers[i].self_attn.relative_attention_bias + self.layers[i].self_attn.relative_attention_bias = self.layers[0].self_attn.relative_attention_bias + + self.layer_norm_first = args.layer_norm_first + self.layer_norm = LayerNorm(self.embedding_dim) + self.layerdrop = args.encoder_layerdrop + + self.apply(init_bert_params) + + if args.deep_norm: + deep_norm_beta = math.pow(8 * args.encoder_layers, -1 / 4) + for i in range(args.encoder_layers): + nn.init.xavier_normal_(self.layers[i].self_attn.k_proj.weight, gain=1) + nn.init.xavier_normal_(self.layers[i].self_attn.v_proj.weight, gain=deep_norm_beta) + nn.init.xavier_normal_(self.layers[i].self_attn.q_proj.weight, gain=1) + nn.init.xavier_normal_(self.layers[i].self_attn.out_proj.weight, gain=deep_norm_beta) + nn.init.xavier_normal_(self.layers[i].fc1.weight, gain=deep_norm_beta) + nn.init.xavier_normal_(self.layers[i].fc2.weight, gain=deep_norm_beta) + + self.layer_wise_gradient_decay_ratio = getattr(args, "layer_wise_gradient_decay_ratio", 1) + + def forward(self, x, padding_mask=None, layer=None): + x, layer_results = self.extract_features(x, padding_mask, layer) + + if self.layer_norm_first and layer is None: + x = self.layer_norm(x) + + return x, layer_results + + def 
extract_features(self, x, padding_mask=None, tgt_layer=None): + + if padding_mask is not None: + x[padding_mask] = 0 + + x_conv = self.pos_conv(x.transpose(1, 2)) + x_conv = x_conv.transpose(1, 2) + x += x_conv + + if not self.layer_norm_first: + x = self.layer_norm(x) + + x = F.dropout(x, p=self.dropout, training=self.training) + + # B x T x C -> T x B x C + x = x.transpose(0, 1) + + layer_results = [] + z = None + if tgt_layer is not None: + layer_results.append((x, z)) + r = None + pos_bias = None + for i, layer in enumerate(self.layers): + if self.layer_wise_gradient_decay_ratio != 1.0: + x = GradMultiply.apply(x, self.layer_wise_gradient_decay_ratio) + dropout_probability = np.random.random() + if not self.training or (dropout_probability > self.layerdrop): + x, z, pos_bias = layer(x, self_attn_padding_mask=padding_mask, need_weights=False, pos_bias=pos_bias) + if tgt_layer is not None: + layer_results.append((x, z)) + if i == tgt_layer: + r = x + break + + if r is not None: + x = r + + # T x B x C -> B x T x C + x = x.transpose(0, 1) + + return x, layer_results + + +class TransformerSentenceEncoderLayer(nn.Module): + def __init__( + self, + embedding_dim: float = 768, + ffn_embedding_dim: float = 3072, + num_attention_heads: float = 8, + dropout: float = 0.1, + attention_dropout: float = 0.1, + activation_dropout: float = 0.1, + activation_fn: str = "relu", + layer_norm_first: bool = False, + deep_norm: bool = False, + has_relative_attention_bias: bool = False, + num_buckets: int = 0, + max_distance: int = 0, + rescale_init: bool = False, + gru_rel_pos: bool = False, + encoder_layers: int = 0, + ) -> None: + + super().__init__() + self.embedding_dim = embedding_dim + self.dropout = dropout + self.activation_dropout = activation_dropout + + self.activation_name = activation_fn + self.activation_fn = get_activation_fn(activation_fn) + self.self_attn = MultiheadAttention( + self.embedding_dim, + num_attention_heads, + dropout=attention_dropout, + 
self_attention=True, + has_relative_attention_bias=has_relative_attention_bias, + num_buckets=num_buckets, + max_distance=max_distance, + rescale_init=rescale_init, + gru_rel_pos=gru_rel_pos, + ) + + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(self.activation_dropout) + self.dropout3 = nn.Dropout(dropout) + + self.layer_norm_first = layer_norm_first + + self.self_attn_layer_norm = LayerNorm(self.embedding_dim) + + if self.activation_name == "glu": + self.fc1 = GLU_Linear(self.embedding_dim, ffn_embedding_dim, "swish") + else: + self.fc1 = nn.Linear(self.embedding_dim, ffn_embedding_dim) + self.fc2 = nn.Linear(ffn_embedding_dim, self.embedding_dim) + + self.final_layer_norm = LayerNorm(self.embedding_dim) + + self.deep_norm = deep_norm + if self.deep_norm: + self.deep_norm_alpha = math.pow(2 * encoder_layers, 1 / 4) + else: + self.deep_norm_alpha = 1 + + def forward( + self, + x: torch.Tensor, + self_attn_mask: torch.Tensor = None, + self_attn_padding_mask: torch.Tensor = None, + need_weights: bool = False, + pos_bias=None + ): + residual = x + + if self.layer_norm_first: + x = self.self_attn_layer_norm(x) + x, attn, pos_bias = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + need_weights=False, + attn_mask=self_attn_mask, + position_bias=pos_bias + ) + x = self.dropout1(x) + x = residual + x + + residual = x + x = self.final_layer_norm(x) + if self.activation_name == "glu": + x = self.fc1(x) + else: + x = self.activation_fn(self.fc1(x)) + x = self.dropout2(x) + x = self.fc2(x) + x = self.dropout3(x) + x = residual + x + else: + x, attn, pos_bias = self.self_attn( + query=x, + key=x, + value=x, + key_padding_mask=self_attn_padding_mask, + need_weights=need_weights, + attn_mask=self_attn_mask, + position_bias=pos_bias + ) + + x = self.dropout1(x) + x = residual * self.deep_norm_alpha + x + + x = self.self_attn_layer_norm(x) + + residual = x + if self.activation_name == "glu": + x = self.fc1(x) + else: + 
x = self.activation_fn(self.fc1(x)) + x = self.dropout2(x) + x = self.fc2(x) + x = self.dropout3(x) + x = residual * self.deep_norm_alpha + x + x = self.final_layer_norm(x) + + return x, attn, pos_bias + + +class MultiheadAttention(nn.Module): + """Multi-headed attention. + + See "Attention Is All You Need" for more details. + """ + + def __init__( + self, + embed_dim, + num_heads, + kdim=None, + vdim=None, + dropout=0.0, + bias=True, + add_bias_kv=False, + add_zero_attn=False, + self_attention=False, + encoder_decoder_attention=False, + q_noise=0.0, + qn_block_size=8, + has_relative_attention_bias=False, + num_buckets=32, + max_distance=128, + gru_rel_pos=False, + rescale_init=False, + ): + super().__init__() + self.embed_dim = embed_dim + self.kdim = kdim if kdim is not None else embed_dim + self.vdim = vdim if vdim is not None else embed_dim + self.qkv_same_dim = self.kdim == embed_dim and self.vdim == embed_dim + + self.num_heads = num_heads + self.dropout_module = nn.Dropout(dropout) + + self.has_relative_attention_bias = has_relative_attention_bias + self.num_buckets = num_buckets + self.max_distance = max_distance + if self.has_relative_attention_bias: + self.relative_attention_bias = nn.Embedding(num_buckets, num_heads) + + self.head_dim = embed_dim // num_heads + self.q_head_dim = self.head_dim + self.k_head_dim = self.head_dim + assert ( + self.head_dim * num_heads == self.embed_dim + ), "embed_dim must be divisible by num_heads" + self.scaling = self.head_dim ** -0.5 + + self.self_attention = self_attention + self.encoder_decoder_attention = encoder_decoder_attention + + assert not self.self_attention or self.qkv_same_dim, ( + "Self-attention requires query, key and " "value to be of the same size" + ) + + k_bias = True + if rescale_init: + k_bias = False + + k_embed_dim = embed_dim + q_embed_dim = embed_dim + + self.k_proj = quant_noise( + nn.Linear(self.kdim, k_embed_dim, bias=k_bias), q_noise, qn_block_size + ) + self.v_proj = quant_noise( + 
nn.Linear(self.vdim, embed_dim, bias=bias), q_noise, qn_block_size + ) + self.q_proj = quant_noise( + nn.Linear(embed_dim, q_embed_dim, bias=bias), q_noise, qn_block_size + ) + + self.out_proj = quant_noise( + nn.Linear(embed_dim, embed_dim, bias=bias), q_noise, qn_block_size + ) + + if add_bias_kv: + self.bias_k = Parameter(torch.Tensor(1, 1, embed_dim)) + self.bias_v = Parameter(torch.Tensor(1, 1, embed_dim)) + else: + self.bias_k = self.bias_v = None + + self.add_zero_attn = add_zero_attn + + self.gru_rel_pos = gru_rel_pos + if self.gru_rel_pos: + self.grep_linear = nn.Linear(self.q_head_dim, 8) + self.grep_a = nn.Parameter(torch.ones(1, num_heads, 1, 1)) + + self.reset_parameters() + + def reset_parameters(self): + if self.qkv_same_dim: + # Empirically observed the convergence to be much better with + # the scaled initialization + nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2)) + nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2)) + else: + nn.init.xavier_uniform_(self.k_proj.weight) + nn.init.xavier_uniform_(self.v_proj.weight) + nn.init.xavier_uniform_(self.q_proj.weight) + + nn.init.xavier_uniform_(self.out_proj.weight) + if self.out_proj.bias is not None: + nn.init.constant_(self.out_proj.bias, 0.0) + if self.bias_k is not None: + nn.init.xavier_normal_(self.bias_k) + if self.bias_v is not None: + nn.init.xavier_normal_(self.bias_v) + if self.has_relative_attention_bias: + nn.init.xavier_normal_(self.relative_attention_bias.weight) + + def _relative_positions_bucket(self, relative_positions, bidirectional=True): + num_buckets = self.num_buckets + max_distance = self.max_distance + relative_buckets = 0 + + if bidirectional: + num_buckets = num_buckets // 2 + relative_buckets += (relative_positions > 0).to(torch.long) * num_buckets + relative_positions = torch.abs(relative_positions) + else: + relative_positions = -torch.min(relative_positions, 
torch.zeros_like(relative_positions)) + + max_exact = num_buckets // 2 + is_small = relative_positions < max_exact + + relative_postion_if_large = max_exact + ( + torch.log(relative_positions.float() / max_exact) + / math.log(max_distance / max_exact) + * (num_buckets - max_exact) + ).to(torch.long) + relative_postion_if_large = torch.min( + relative_postion_if_large, torch.full_like(relative_postion_if_large, num_buckets - 1) + ) + + relative_buckets += torch.where(is_small, relative_positions, relative_postion_if_large) + return relative_buckets + + def compute_bias(self, query_length, key_length): + context_position = torch.arange(query_length, dtype=torch.long)[:, None] + memory_position = torch.arange(key_length, dtype=torch.long)[None, :] + relative_position = memory_position - context_position + relative_position_bucket = self._relative_positions_bucket( + relative_position, + bidirectional=True + ) + relative_position_bucket = relative_position_bucket.to(self.relative_attention_bias.weight.device) + values = self.relative_attention_bias(relative_position_bucket) + values = values.permute([2, 0, 1]) + return values + + def forward( + self, + query, + key: Optional[Tensor], + value: Optional[Tensor], + key_padding_mask: Optional[Tensor] = None, + incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] = None, + need_weights: bool = True, + static_kv: bool = False, + attn_mask: Optional[Tensor] = None, + before_softmax: bool = False, + need_head_weights: bool = False, + position_bias: Optional[Tensor] = None + ) -> Tuple[Tensor, Optional[Tensor], Optional[Tensor]]: + """Input shape: Time x Batch x Channel + + Args: + key_padding_mask (ByteTensor, optional): mask to exclude + keys that are pads, of shape `(batch, src_len)`, where + padding elements are indicated by 1s. + need_weights (bool, optional): return the attention weights, + averaged over heads (default: False). 
+ attn_mask (ByteTensor, optional): typically used to + implement causal attention, where the mask prevents the + attention from looking forward in time (default: None). + before_softmax (bool, optional): return the raw attention + weights and values before the attention softmax. + need_head_weights (bool, optional): return the attention + weights for each head. Implies *need_weights*. Default: + return the average attention weights over all heads. + """ + if need_head_weights: + need_weights = True + + is_tpu = query.device.type == "xla" + + tgt_len, bsz, embed_dim = query.size() + src_len = tgt_len + assert embed_dim == self.embed_dim + assert list(query.size()) == [tgt_len, bsz, embed_dim] + if key is not None: + src_len, key_bsz, _ = key.size() + if not torch.jit.is_scripting(): + assert key_bsz == bsz + assert value is not None + assert src_len, bsz == value.shape[:2] + + if self.has_relative_attention_bias and position_bias is None: + position_bias = self.compute_bias(tgt_len, src_len) + position_bias = position_bias.unsqueeze(0).repeat(bsz, 1, 1, 1).view(bsz * self.num_heads, tgt_len, src_len) + + if incremental_state is not None: + saved_state = self._get_input_buffer(incremental_state) + if saved_state is not None and "prev_key" in saved_state: + # previous time steps are cached - no need to recompute + # key and value if they are static + if static_kv: + assert self.encoder_decoder_attention and not self.self_attention + key = value = None + else: + saved_state = None + + if self.self_attention: + q = self.q_proj(query) + k = self.k_proj(query) + v = self.v_proj(query) + elif self.encoder_decoder_attention: + # encoder-decoder attention + q = self.q_proj(query) + if key is None: + assert value is None + k = v = None + else: + k = self.k_proj(key) + v = self.v_proj(key) + + else: + assert key is not None and value is not None + q = self.q_proj(query) + k = self.k_proj(key) + v = self.v_proj(value) + q *= self.scaling + alpha = 32 + q *= 1 / alpha + + if 
self.bias_k is not None: + assert self.bias_v is not None + k = torch.cat([k, self.bias_k.repeat(1, bsz, 1)]) + v = torch.cat([v, self.bias_v.repeat(1, bsz, 1)]) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + key_padding_mask.new_zeros(key_padding_mask.size(0), 1), + ], + dim=1, + ) + + q = ( + q.contiguous() + .view(tgt_len, bsz * self.num_heads, self.q_head_dim) + .transpose(0, 1) + ) + if k is not None: + k = ( + k.contiguous() + .view(-1, bsz * self.num_heads, self.k_head_dim) + .transpose(0, 1) + ) + if v is not None: + v = ( + v.contiguous() + .view(-1, bsz * self.num_heads, self.head_dim) + .transpose(0, 1) + ) + + if saved_state is not None: + # saved states are stored with shape (bsz, num_heads, seq_len, head_dim) + if "prev_key" in saved_state: + _prev_key = saved_state["prev_key"] + assert _prev_key is not None + prev_key = _prev_key.view(bsz * self.num_heads, -1, self.head_dim) + if static_kv: + k = prev_key + else: + assert k is not None + k = torch.cat([prev_key, k], dim=1) + src_len = k.size(1) + if "prev_value" in saved_state: + _prev_value = saved_state["prev_value"] + assert _prev_value is not None + prev_value = _prev_value.view(bsz * self.num_heads, -1, self.head_dim) + if static_kv: + v = prev_value + else: + assert v is not None + v = torch.cat([prev_value, v], dim=1) + prev_key_padding_mask: Optional[Tensor] = None + if "prev_key_padding_mask" in saved_state: + prev_key_padding_mask = saved_state["prev_key_padding_mask"] + assert k is not None and v is not None + key_padding_mask = MultiheadAttention._append_prev_key_padding_mask( + key_padding_mask=key_padding_mask, + prev_key_padding_mask=prev_key_padding_mask, + batch_size=bsz, + src_len=k.size(1), + static_kv=static_kv, + ) + + saved_state["prev_key"] = k.view(bsz, self.num_heads, -1, self.head_dim) + 
saved_state["prev_value"] = v.view(bsz, self.num_heads, -1, self.head_dim) + saved_state["prev_key_padding_mask"] = key_padding_mask + # In this branch incremental_state is never None + assert incremental_state is not None + incremental_state = self._set_input_buffer(incremental_state, saved_state) + assert k is not None + assert k.size(1) == src_len + + # This is part of a workaround to get around fork/join parallelism + # not supporting Optional types. + if key_padding_mask is not None and key_padding_mask.dim() == 0: + key_padding_mask = None + + if key_padding_mask is not None: + assert key_padding_mask.size(0) == bsz + assert key_padding_mask.size(1) == src_len + + if self.add_zero_attn: + assert v is not None + src_len += 1 + k = torch.cat([k, k.new_zeros((k.size(0), 1) + k.size()[2:])], dim=1) + v = torch.cat([v, v.new_zeros((v.size(0), 1) + v.size()[2:])], dim=1) + if attn_mask is not None: + attn_mask = torch.cat( + [attn_mask, attn_mask.new_zeros(attn_mask.size(0), 1)], dim=1 + ) + if key_padding_mask is not None: + key_padding_mask = torch.cat( + [ + key_padding_mask, + torch.zeros(key_padding_mask.size(0), 1).type_as( + key_padding_mask + ), + ], + dim=1, + ) + + attn_weights = torch.bmm(q, k.transpose(1, 2)) + attn_weights = (attn_weights - attn_weights.max(dim=-1, keepdim=True)[0]) * alpha + attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz) + + assert list(attn_weights.size()) == [bsz * self.num_heads, tgt_len, src_len] + + if attn_mask is not None: + attn_mask = attn_mask.unsqueeze(0) + attn_weights += attn_mask + + if key_padding_mask is not None: + # don't attend to padding symbols + attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + if not is_tpu: + attn_weights = attn_weights.masked_fill( + key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), + float("-inf"), + ) + else: + attn_weights = attn_weights.transpose(0, 2) + attn_weights = attn_weights.masked_fill(key_padding_mask, float("-inf")) + 
attn_weights = attn_weights.transpose(0, 2) + attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len) + + if before_softmax: + return attn_weights, v, position_bias + + if position_bias is not None: + attn_mask_rel_pos = position_bias + if self.gru_rel_pos == 1: + query_layer = q.view(bsz, self.num_heads, tgt_len, self.q_head_dim) * alpha / self.scaling + _B, _H, _L, __ = query_layer.size() + gate_a, gate_b = torch.sigmoid(self.grep_linear(query_layer).view( + _B, _H, _L, 2, 4).sum(-1, keepdim=False)).chunk(2, dim=-1) + gate_a_1 = gate_a * (gate_b * self.grep_a - 1.0) + 2.0 + attn_mask_rel_pos = gate_a_1.view(bsz * self.num_heads, tgt_len, 1) * position_bias + + attn_mask_rel_pos = attn_mask_rel_pos.view(attn_weights.size()) + + attn_weights = attn_weights + attn_mask_rel_pos + + attn_weights_float = F.softmax( + attn_weights, dim=-1 + ) + attn_weights = attn_weights_float.type_as(attn_weights) + attn_probs = self.dropout_module(attn_weights) + + assert v is not None + attn = torch.bmm(attn_probs, v) + assert list(attn.size()) == [bsz * self.num_heads, tgt_len, self.head_dim] + attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embed_dim) + attn = self.out_proj(attn) + attn_weights: Optional[Tensor] = None + if need_weights: + attn_weights = attn_weights_float.view( + bsz, self.num_heads, tgt_len, src_len + ).transpose(1, 0) + if not need_head_weights: + # average attention weights over heads + attn_weights = attn_weights.mean(dim=0) + + return attn, attn_weights, position_bias + + @staticmethod + def _append_prev_key_padding_mask( + key_padding_mask: Optional[Tensor], + prev_key_padding_mask: Optional[Tensor], + batch_size: int, + src_len: int, + static_kv: bool, + ) -> Optional[Tensor]: + # saved key padding masks have shape (bsz, seq_len) + if prev_key_padding_mask is not None and static_kv: + new_key_padding_mask = prev_key_padding_mask + elif prev_key_padding_mask is not None and key_padding_mask is not None: + new_key_padding_mask = 
torch.cat( + [prev_key_padding_mask.float(), key_padding_mask.float()], dim=1 + ) + # During incremental decoding, as the padding token enters and + # leaves the frame, there will be a time when prev or current + # is None + elif prev_key_padding_mask is not None: + if src_len > prev_key_padding_mask.size(1): + filler = torch.zeros( + (batch_size, src_len - prev_key_padding_mask.size(1)), + device=prev_key_padding_mask.device, + ) + new_key_padding_mask = torch.cat( + [prev_key_padding_mask.float(), filler.float()], dim=1 + ) + else: + new_key_padding_mask = prev_key_padding_mask.float() + elif key_padding_mask is not None: + if src_len > key_padding_mask.size(1): + filler = torch.zeros( + (batch_size, src_len - key_padding_mask.size(1)), + device=key_padding_mask.device, + ) + new_key_padding_mask = torch.cat( + [filler.float(), key_padding_mask.float()], dim=1 + ) + else: + new_key_padding_mask = key_padding_mask.float() + else: + new_key_padding_mask = prev_key_padding_mask + return new_key_padding_mask + + def _get_input_buffer( + self, incremental_state: Optional[Dict[str, Dict[str, Optional[Tensor]]]] + ) -> Dict[str, Optional[Tensor]]: + result = self.get_incremental_state(incremental_state, "attn_state") + if result is not None: + return result + else: + empty_result: Dict[str, Optional[Tensor]] = {} + return empty_result + + def _set_input_buffer( + self, + incremental_state: Dict[str, Dict[str, Optional[Tensor]]], + buffer: Dict[str, Optional[Tensor]], + ): + return self.set_incremental_state(incremental_state, "attn_state", buffer) + + def apply_sparse_mask(self, attn_weights, tgt_len: int, src_len: int, bsz: int): + return attn_weights + + +def init_bert_params(module): + """ + Initialize the weights specific to the BERT Model. + This overrides the default initializations depending on the specified arguments. + 1. 
If normal_init_linear_weights is set then weights of linear + layer will be initialized using the normal distribution and + bais will be set to the specified value. + 2. If normal_init_embed_weights is set then weights of embedding + layer will be initialized using the normal distribution. + 3. If normal_init_proj_weights is set then weights of + in_project_weight for MultiHeadAttention initialized using + the normal distribution (to be validated). + """ + + def normal_(data): + # with FSDP, module params will be on CUDA, so we cast them back to CPU + # so that the RNG is consistent with and without FSDP + data.copy_( + data.cpu().normal_(mean=0.0, std=0.02).to(data.device) + ) + + if isinstance(module, nn.Linear): + normal_(module.weight.data) + if module.bias is not None: + module.bias.data.zero_() + if isinstance(module, nn.Embedding): + normal_(module.weight.data) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + if isinstance(module, MultiheadAttention): + normal_(module.q_proj.weight.data) + normal_(module.k_proj.weight.data) + normal_(module.v_proj.weight.data) diff --git a/src/local/beats/modules.py b/src/local/beats/modules.py new file mode 100644 index 0000000000000000000000000000000000000000..7772b2d7448edca5ec2aa5fcd6278429b98e35a4 --- /dev/null +++ b/src/local/beats/modules.py @@ -0,0 +1,219 @@ +# -------------------------------------------------------- +# BEATs: Audio Pre-Training with Acoustic Tokenizers (https://arxiv.org/abs/2212.09058) +# Github source: https://github.com/microsoft/unilm/tree/master/beats +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on fairseq code bases +# https://github.com/pytorch/fairseq +# -------------------------------------------------------- + +import math +import warnings +import torch +from torch import Tensor, nn +import torch.nn.functional as F + + +class GradMultiply(torch.autograd.Function): + @staticmethod + def 
forward(ctx, x, scale): + ctx.scale = scale + res = x.new(x) + return res + + @staticmethod + def backward(ctx, grad): + return grad * ctx.scale, None + + +class SamePad(nn.Module): + def __init__(self, kernel_size, causal=False): + super().__init__() + if causal: + self.remove = kernel_size - 1 + else: + self.remove = 1 if kernel_size % 2 == 0 else 0 + + def forward(self, x): + if self.remove > 0: + x = x[:, :, : -self.remove] + return x + + +class Swish(nn.Module): + def __init__(self): + super(Swish, self).__init__() + self.act = torch.nn.Sigmoid() + + def forward(self, x): + return x * self.act(x) + + +class GLU_Linear(nn.Module): + def __init__(self, input_dim, output_dim, glu_type="sigmoid", bias_in_glu=True): + super(GLU_Linear, self).__init__() + + self.glu_type = glu_type + self.output_dim = output_dim + + if glu_type == "sigmoid": + self.glu_act = torch.nn.Sigmoid() + elif glu_type == "swish": + self.glu_act = Swish() + elif glu_type == "relu": + self.glu_act = torch.nn.ReLU() + elif glu_type == "gelu": + self.glu_act = torch.nn.GELU() + + if bias_in_glu: + self.linear = nn.Linear(input_dim, output_dim * 2, True) + else: + self.linear = nn.Linear(input_dim, output_dim * 2, False) + + def forward(self, x): + # to be consistent with GLU_Linear, we assume the input always has the #channel (#dim) in the last dimension of the tensor, so need to switch the dimension first for 1D-Conv case + x = self.linear(x) + + if self.glu_type == "bilinear": + x = (x[:, :, 0:self.output_dim] * x[:, :, self.output_dim:self.output_dim * 2]) + else: + x = (x[:, :, 0:self.output_dim] * self.glu_act(x[:, :, self.output_dim:self.output_dim * 2])) + + return x + + +def gelu_accurate(x): + if not hasattr(gelu_accurate, "_a"): + gelu_accurate._a = math.sqrt(2 / math.pi) + return ( + 0.5 * x * (1 + torch.tanh(gelu_accurate._a * (x + 0.044715 * torch.pow(x, 3)))) + ) + + +def gelu(x: torch.Tensor) -> torch.Tensor: + return torch.nn.functional.gelu(x.float()).type_as(x) + + +def 
get_activation_fn(activation: str): + """Returns the activation function corresponding to `activation`""" + + if activation == "relu": + return F.relu + elif activation == "gelu": + return gelu + elif activation == "gelu_fast": + warnings.warn( + "--activation-fn=gelu_fast has been renamed to gelu_accurate" + ) + return gelu_accurate + elif activation == "gelu_accurate": + return gelu_accurate + elif activation == "tanh": + return torch.tanh + elif activation == "linear": + return lambda x: x + elif activation == "glu": + return lambda x: x + else: + raise RuntimeError("--activation-fn {} not supported".format(activation)) + + +def quant_noise(module, p, block_size): + """ + Wraps modules and applies quantization noise to the weights for + subsequent quantization with Iterative Product Quantization as + described in "Training with Quantization Noise for Extreme Model Compression" + + Args: + - module: nn.Module + - p: amount of Quantization Noise + - block_size: size of the blocks for subsequent quantization with iPQ + + Remarks: + - Module weights must have the right sizes wrt the block size + - Only Linear, Embedding and Conv2d modules are supported for the moment + - For more detail on how to quantize by blocks with convolutional weights, + see "And the Bit Goes Down: Revisiting the Quantization of Neural Networks" + - We implement the simplest form of noise here as stated in the paper + which consists in randomly dropping blocks + """ + + # if no quantization noise, don't register hook + if p <= 0: + return module + + # supported modules + assert isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)) + + # test whether module.weight has the right sizes wrt block_size + is_conv = module.weight.ndim == 4 + + # 2D matrix + if not is_conv: + assert ( + module.weight.size(1) % block_size == 0 + ), "Input features must be a multiple of block sizes" + + # 4D matrix + else: + # 1x1 convolutions + if module.kernel_size == (1, 1): + assert ( + module.in_channels % 
block_size == 0 + ), "Input channels must be a multiple of block sizes" + # regular convolutions + else: + k = module.kernel_size[0] * module.kernel_size[1] + assert k % block_size == 0, "Kernel size must be a multiple of block size" + + def _forward_pre_hook(mod, input): + # no noise for evaluation + if mod.training: + if not is_conv: + # gather weight and sizes + weight = mod.weight + in_features = weight.size(1) + out_features = weight.size(0) + + # split weight matrix into blocks and randomly drop selected blocks + mask = torch.zeros( + in_features // block_size * out_features, device=weight.device + ) + mask.bernoulli_(p) + mask = mask.repeat_interleave(block_size, -1).view(-1, in_features) + + else: + # gather weight and sizes + weight = mod.weight + in_channels = mod.in_channels + out_channels = mod.out_channels + + # split weight matrix into blocks and randomly drop selected blocks + if mod.kernel_size == (1, 1): + mask = torch.zeros( + int(in_channels // block_size * out_channels), + device=weight.device, + ) + mask.bernoulli_(p) + mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels) + else: + mask = torch.zeros( + weight.size(0), weight.size(1), device=weight.device + ) + mask.bernoulli_(p) + mask = ( + mask.unsqueeze(2) + .unsqueeze(3) + .repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1]) + ) + + # scale weights and apply mask + mask = mask.to( + torch.bool + ) # x.bool() is not currently supported in TorchScript + s = 1 / (1 - p) + mod.weight.data = s * weight.masked_fill(mask, 0) + + module.register_forward_pre_hook(_forward_pre_hook) + return module + diff --git a/src/local/beats/quantizer.py b/src/local/beats/quantizer.py new file mode 100644 index 0000000000000000000000000000000000000000..5370d02e7f8f10723128b9bbc34afd3342cfcd86 --- /dev/null +++ b/src/local/beats/quantizer.py @@ -0,0 +1,215 @@ +# -------------------------------------------------------- +# BEATs: Audio Pre-Training with Acoustic Tokenizers 
(https://arxiv.org/abs/2212.09058) +# Github source: https://github.com/microsoft/unilm/tree/master/beats +# Copyright (c) 2022 Microsoft +# Licensed under The MIT License [see LICENSE for details] +# Based on VQGAN code bases +# https://github.com/CompVis/taming-transformers +# --------------------------------------------------------' + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.distributed as distributed + +try: + from einops import rearrange, repeat +except ImportError: + pass + + +def l2norm(t): + return F.normalize(t, p=2, dim=-1) + + +def ema_inplace(moving_avg, new, decay): + moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay)) + + +def sample_vectors(samples, num): + num_samples, device = samples.shape[0], samples.device + + if num_samples >= num: + indices = torch.randperm(num_samples, device=device)[:num] + else: + indices = torch.randint(0, num_samples, (num,), device=device) + + return samples[indices] + + +def kmeans(samples, num_clusters, num_iters=10, use_cosine_sim=False): + dim, dtype, device = samples.shape[-1], samples.dtype, samples.device + + means = sample_vectors(samples, num_clusters) + + for _ in range(num_iters): + if use_cosine_sim: + dists = samples @ means.t() + else: + diffs = rearrange(samples, 'n d -> n () d') \ + - rearrange(means, 'c d -> () c d') + dists = -(diffs ** 2).sum(dim=-1) + + buckets = dists.max(dim=-1).indices + bins = torch.bincount(buckets, minlength=num_clusters) + zero_mask = bins == 0 + bins_min_clamped = bins.masked_fill(zero_mask, 1) + + new_means = buckets.new_zeros(num_clusters, dim, dtype=dtype) + new_means.scatter_add_(0, repeat(buckets, 'n -> n d', d=dim), samples) + new_means = new_means / bins_min_clamped[..., None] + + if use_cosine_sim: + new_means = l2norm(new_means) + + means = torch.where(zero_mask[..., None], means, new_means) + + return means, bins + + +class EmbeddingEMA(nn.Module): + def __init__(self, num_tokens, codebook_dim, decay=0.99, eps=1e-5, 
kmeans_init=True, codebook_init_path=''): + super().__init__() + self.num_tokens = num_tokens + self.codebook_dim = codebook_dim + self.decay = decay + self.eps = eps + if codebook_init_path == '': + if not kmeans_init: + weight = torch.randn(num_tokens, codebook_dim) + weight = l2norm(weight) + else: + weight = torch.zeros(num_tokens, codebook_dim) + self.register_buffer('initted', torch.Tensor([not kmeans_init])) + else: + print(f"load init codebook weight from {codebook_init_path}") + codebook_ckpt_weight = torch.load(codebook_init_path, map_location='cpu') + weight = codebook_ckpt_weight.clone() + self.register_buffer('initted', torch.Tensor([True])) + + self.weight = nn.Parameter(weight, requires_grad=False) + self.cluster_size = nn.Parameter(torch.zeros(num_tokens), requires_grad=False) + self.embed_avg = nn.Parameter(weight.clone(), requires_grad=False) + # self.register_buffer('initted', torch.Tensor([not kmeans_init])) + self.update = True + + @torch.jit.ignore + def init_embed_(self, data): + if self.initted: + return + print("Performing Kemans init for codebook") + embed, cluster_size = kmeans(data, self.num_tokens, 10, use_cosine_sim=True) + self.weight.data.copy_(embed) + self.cluster_size.data.copy_(cluster_size) + self.initted.data.copy_(torch.Tensor([True])) + + def forward(self, embed_id): + return F.embedding(embed_id, self.weight) + + def cluster_size_ema_update(self, new_cluster_size): + self.cluster_size.data.mul_(self.decay).add_(new_cluster_size, alpha=1 - self.decay) + + def embed_avg_ema_update(self, new_embed_avg): + self.embed_avg.data.mul_(self.decay).add_(new_embed_avg, alpha=1 - self.decay) + + def weight_update(self, num_tokens): + n = self.cluster_size.sum() + smoothed_cluster_size = ( + (self.cluster_size + self.eps) / (n + num_tokens * self.eps) * n + ) + # normalize embedding average with smoothed cluster size + embed_normalized = self.embed_avg / smoothed_cluster_size.unsqueeze(1) + # embed_normalized = l2norm(self.embed_avg / 
smoothed_cluster_size.unsqueeze(1)) + self.weight.data.copy_(embed_normalized) + + +def norm_ema_inplace(moving_avg, new, decay): + moving_avg.data.mul_(decay).add_(new, alpha=(1 - decay)) + moving_avg.data.copy_(l2norm(moving_avg.data)) + + +class NormEMAVectorQuantizer(nn.Module): + def __init__(self, n_embed, embedding_dim, beta, decay=0.99, eps=1e-5, + statistic_code_usage=True, kmeans_init=False, codebook_init_path=''): + super().__init__() + self.codebook_dim = embedding_dim + self.num_tokens = n_embed + self.beta = beta + self.decay = decay + + # learnable = True if orthogonal_reg_weight > 0 else False + self.embedding = EmbeddingEMA(self.num_tokens, self.codebook_dim, decay, eps, kmeans_init, codebook_init_path) + + self.statistic_code_usage = statistic_code_usage + if statistic_code_usage: + self.register_buffer('cluster_size', torch.zeros(n_embed)) + if distributed.is_available() and distributed.is_initialized(): + print("ddp is enable, so use ddp_reduce to sync the statistic_code_usage for each gpu!") + self.all_reduce_fn = distributed.all_reduce + else: + self.all_reduce_fn = nn.Identity() + + def reset_cluster_size(self, device): + if self.statistic_code_usage: + self.register_buffer('cluster_size', torch.zeros(self.num_tokens)) + self.cluster_size = self.cluster_size.to(device) + + def forward(self, z): + # reshape z -> (batch, height, width, channel) and flatten + # z, 'b c h w -> b h w c' + # z = rearrange(z, 'b c h w -> b h w c') + # z = z.transpose(1, 2) + z = l2norm(z) + z_flattened = z.reshape(-1, self.codebook_dim) + + self.embedding.init_embed_(z_flattened) + + d = z_flattened.pow(2).sum(dim=1, keepdim=True) + \ + self.embedding.weight.pow(2).sum(dim=1) - 2 * \ + torch.einsum('bd,nd->bn', z_flattened, self.embedding.weight) # 'n d -> d n' + + encoding_indices = torch.argmin(d, dim=1) + + z_q = self.embedding(encoding_indices).view(z.shape) + + encodings = F.one_hot(encoding_indices, self.num_tokens).type(z.dtype) + + if not self.training: + 
with torch.no_grad(): + cluster_size = encodings.sum(0) + self.all_reduce_fn(cluster_size) + ema_inplace(self.cluster_size, cluster_size, self.decay) + + if self.training and self.embedding.update: + # EMA cluster size + + bins = encodings.sum(0) + self.all_reduce_fn(bins) + + # self.embedding.cluster_size_ema_update(bins) + ema_inplace(self.cluster_size, bins, self.decay) + + zero_mask = (bins == 0) + bins = bins.masked_fill(zero_mask, 1.) + + embed_sum = z_flattened.t() @ encodings + self.all_reduce_fn(embed_sum) + + embed_normalized = (embed_sum / bins.unsqueeze(0)).t() + embed_normalized = l2norm(embed_normalized) + + embed_normalized = torch.where(zero_mask[..., None], self.embedding.weight, + embed_normalized) + norm_ema_inplace(self.embedding.weight, embed_normalized, self.decay) + + # compute loss for embedding + loss = self.beta * F.mse_loss(z_q.detach(), z) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # reshape back to match original input shape + # z_q, 'b h w c -> b c h w' + # z_q = rearrange(z_q, 'b h w c -> b c h w') + # z_q = z_q.transpose(1, 2) + return z_q, loss, encoding_indices diff --git a/src/local/classes_dict.py b/src/local/classes_dict.py new file mode 100644 index 0000000000000000000000000000000000000000..cd19b74fca2b096602810eed3ca7835372cefc6f --- /dev/null +++ b/src/local/classes_dict.py @@ -0,0 +1,21 @@ +""" +we store here a dict where we define the encodings for all classes in DESED task. 
+""" + +from collections import OrderedDict + + +classes_labels = OrderedDict( + { + "Alarm_bell_ringing": 0, + "Blender": 1, + "Cat": 2, + "Dishes": 3, + "Dog": 4, + "Electric_shaver_toothbrush": 5, + "Frying": 6, + "Running_water": 7, + "Speech": 8, + "Vacuum_cleaner": 9, + } +) diff --git a/src/local/resample_folder.py b/src/local/resample_folder.py new file mode 100644 index 0000000000000000000000000000000000000000..a8eeb688cee099a17410474ea84385222f440158 --- /dev/null +++ b/src/local/resample_folder.py @@ -0,0 +1,87 @@ +import argparse +import glob +import os +from pathlib import Path + +import librosa +import torch +import torchaudio +import multiprocessing as mp +import tqdm +from tqdm.contrib.concurrent import process_map # or thread_map + + +def resample(audio, orig_fs, target_fs=16000): + """ + Resamples the audio given as input at the target_fs sample rate, if the target sample rate and the + original sample rate are different. + + Args: + audio (Tensor): audio to resample + orig_fs (int): original sample rate + target_fs (int): target sample rate + + Returns: + Tensor: audio resampled + """ + out = [] + for c in range(audio.shape[0]): + tmp = audio[c].detach().cpu().numpy() + if target_fs != orig_fs: + tmp = librosa.resample(tmp, orig_sr=orig_fs, target_sr=target_fs) + out.append(torch.from_numpy(tmp)) + out = torch.stack(out) + return out + + +def resample_folder(in_dir, out_dir, target_fs=16000, regex="*.wav"): + """ + Resamples the audio files contained in the in_dir folder and saves them in out_dir folder + + Args: + in_dir (str): path to audio directory (audio to be resampled) + out_dir (str): path to audio resampled directory + target_fs (int, optional): target sample rate. Defaults to 16000. + regex (str, optional): regular expression for extension of file. Defaults to "*.wav". 
+ """ + compute = True + files = glob.glob(os.path.join(in_dir, regex)) + if os.path.exists(out_dir): + out_files = glob.glob(os.path.join(out_dir, regex)) + if len(files) == len(out_files): + compute = False + + if compute: + # Packing resample_file arguments to the multiprocessing pool + workers_args = [(f, in_dir, out_dir, target_fs) for f in files] + n_workers = min(10, mp.cpu_count()) + process_map(_worker_func, workers_args, max_workers=n_workers, chunksize=1) + return compute + +def _worker_func(input_args): + """ + Used internally by the pool of multiprocessing workers to resample a given audio file + """ + f, in_dir, out_dir, target_fs = input_args + audio, orig_fs = torchaudio.load(f) + audio = resample(audio, orig_fs, target_fs) + os.makedirs( + Path(os.path.join(out_dir, Path(f).relative_to(Path(in_dir)))).parent, + exist_ok=True, + ) + torchaudio.save( + os.path.join(out_dir, Path(f).relative_to(Path(in_dir))), + audio, + target_fs, + ) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser("Resample a folder recursively") + parser.add_argument("--in_dir", type=str) + parser.add_argument("--out_dir", type=str) + parser.add_argument("--target_fs", default=16000) + parser.add_argument("--regex", type=str, default="*.wav") + + args = parser.parse_args() + resample_folder(args.in_dir, args.out_dir, int(args.target_fs), args.regex) diff --git a/src/local/utils.py b/src/local/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..311bd9be49c9289cf75e92d46d21d094282d6c5c --- /dev/null +++ b/src/local/utils.py @@ -0,0 +1,230 @@ +import os +from pathlib import Path + +import numpy as np +import pandas as pd +import scipy +import torch + +from desed_task.evaluation.evaluation_measures import compute_sed_eval_metrics +import json + +import soundfile +import glob +from thop import profile, clever_format + +from sed_scores_eval.base_modules.scores import create_score_dataframe + + +def batched_decode_preds( + strong_preds, 
filenames, encoder, thresholds=[0.5], median_filter=7, pad_indx=None, +): + """ Decode a batch of predictions to dataframes. Each threshold gives a different dataframe and stored in a + dictionary + + Args: + strong_preds: torch.Tensor, batch of strong predictions. + filenames: list, the list of filenames of the current batch. + encoder: ManyHotEncoder object, object used to decode predictions. + thresholds: list, the list of thresholds to be used for predictions. + median_filter: int, the number of frames for which to apply median window (smoothing). + pad_indx: list, the list of indexes which have been used for padding. + + Returns: + dict of predictions, each keys is a threshold and the value is the DataFrame of predictions. + """ + # Init a dataframe per threshold + scores_raw = {} + scores_postprocessed = {} + prediction_dfs = {} + for threshold in thresholds: + prediction_dfs[threshold] = pd.DataFrame() + + for j in range(strong_preds.shape[0]): # over batches + audio_id = Path(filenames[j]).stem + filename = audio_id + ".wav" + c_scores = strong_preds[j] + if pad_indx is not None: + true_len = int(c_scores.shape[-1] * pad_indx[j].item()) + c_scores = c_scores[:true_len] + c_scores = c_scores.transpose(0, 1).detach().cpu().numpy() + scores_raw[audio_id] = create_score_dataframe( + scores=c_scores, + timestamps=encoder._frame_to_time(np.arange(len(c_scores)+1)), + event_classes=encoder.labels, + ) + c_scores = scipy.ndimage.filters.median_filter(c_scores, (median_filter, 1)) + scores_postprocessed[audio_id] = create_score_dataframe( + scores=c_scores, + timestamps=encoder._frame_to_time(np.arange(len(c_scores)+1)), + event_classes=encoder.labels, + ) + for c_th in thresholds: + pred = c_scores > c_th + pred = encoder.decode_strong(pred) + pred = pd.DataFrame(pred, columns=["event_label", "onset", "offset"]) + pred["filename"] = filename + prediction_dfs[c_th] = pd.concat([prediction_dfs[c_th], pred], ignore_index=True) + + return scores_raw, 
scores_postprocessed, prediction_dfs + + +def convert_to_event_based(weak_dataframe): + """ Convert a weakly labeled DataFrame ('filename', 'event_labels') to a DataFrame strongly labeled + ('filename', 'onset', 'offset', 'event_label'). + + Args: + weak_dataframe: pd.DataFrame, the dataframe to be converted. + + Returns: + pd.DataFrame, the dataframe strongly labeled. + """ + + new = [] + for i, r in weak_dataframe.iterrows(): + + events = r["event_labels"].split(",") + for e in events: + new.append( + {"filename": r["filename"], "event_label": e, "onset": 0, "offset": 1} + ) + return pd.DataFrame(new) + + +def log_sedeval_metrics(predictions, ground_truth, save_dir=None): + """ Return the set of metrics from sed_eval + Args: + predictions: pd.DataFrame, the dataframe of predictions. + ground_truth: pd.DataFrame, the dataframe of groundtruth. + save_dir: str, path to the folder where to save the event and segment based metrics outputs. + + Returns: + tuple, event-based macro-F1 and micro-F1, segment-based macro-F1 and micro-F1 + """ + if predictions.empty: + return 0.0, 0.0, 0.0, 0.0 + + gt = pd.read_csv(ground_truth, sep="\t") + + event_res, segment_res = compute_sed_eval_metrics(predictions, gt) + + if save_dir is not None: + os.makedirs(save_dir, exist_ok=True) + with open(os.path.join(save_dir, "event_f1.txt"), "w") as f: + f.write(str(event_res)) + + with open(os.path.join(save_dir, "segment_f1.txt"), "w") as f: + f.write(str(segment_res)) + + return ( + event_res.results()["class_wise_average"]["f_measure"]["f_measure"], + event_res.results()["overall"]["f_measure"]["f_measure"], + segment_res.results()["class_wise_average"]["f_measure"]["f_measure"], + segment_res.results()["overall"]["f_measure"]["f_measure"], + ) # return also segment measures + + +def parse_jams(jams_list, encoder, out_json): + + if len(jams_list) == 0: + raise IndexError("jams list is empty ! 
Wrong path ?") + + backgrounds = [] + sources = [] + for jamfile in jams_list: + + with open(jamfile, "r") as f: + jdata = json.load(f) + + # check if we have annotations for each source in scaper + assert len(jdata["annotations"][0]["data"]) == len( + jdata["annotations"][-1]["sandbox"]["scaper"]["isolated_events_audio_path"] + ) + + for indx, sound in enumerate(jdata["annotations"][0]["data"]): + source_name = Path( + jdata["annotations"][-1]["sandbox"]["scaper"][ + "isolated_events_audio_path" + ][indx] + ).stem + source_file = os.path.join( + Path(jamfile).parent, + Path(jamfile).stem + "_events", + source_name + ".wav", + ) + + if sound["value"]["role"] == "background": + backgrounds.append(source_file) + else: # it is an event + if ( + sound["value"]["label"] not in encoder.labels + ): # correct different labels + if sound["value"]["label"].startswith("Frying"): + sound["value"]["label"] = "Frying" + elif sound["value"]["label"].startswith("Vacuum_cleaner"): + sound["value"]["label"] = "Vacuum_cleaner" + else: + raise NotImplementedError + + sources.append( + { + "filename": source_file, + "onset": sound["value"]["event_time"], + "offset": sound["value"]["event_time"] + + sound["value"]["event_duration"], + "event_label": sound["value"]["label"], + } + ) + + os.makedirs(Path(out_json).parent, exist_ok=True) + with open(out_json, "w") as f: + json.dump({"backgrounds": backgrounds, "sources": sources}, f, indent=4) + + +def generate_tsv_wav_durations(audio_dir, out_tsv): + """ + Generate a dataframe with filename and duration of the file + + Args: + audio_dir: str, the path of the folder where audio files are (used by glob.glob) + out_tsv: str, the path of the output tsv file + + Returns: + pd.DataFrame: the dataframe containing filenames and durations + """ + meta_list = [] + for file in glob.glob(os.path.join(audio_dir, "*.wav")): + d = soundfile.info(file).duration + meta_list.append([os.path.basename(file), d]) + meta_df = pd.DataFrame(meta_list, 
columns=["filename", "duration"]) + if out_tsv is not None: + meta_df.to_csv(out_tsv, sep="\t", index=False, float_format="%.1f") + + return meta_df + + +def calculate_macs(model, config, dataset=None): + """ + The function calculate the multiplyโ€“accumulate operation (MACs) of the model given as input. + + Args: + model: deep learning model to calculate the macs for + config: config used to train the model + dataset: dataset used to train the model + + Returns: + + """ + n_frames = int(((config["feats"]["sample_rate"] * config["data"]["audio_max_len"]) / config["feats"]["hop_length"])+1) + input_size = [sum(config["training"]["batch_size"]), config["feats"]["n_mels"], n_frames] + input = torch.randn(input_size) + + if "use_embeddings" in config["net"] and config["net"]["use_embeddings"]: + audio, label, padded_indxs, path, embeddings = dataset[0] + embeddings = embeddings.repeat((sum(config["training"]["batch_size"])), 1, 1) + macs, params = profile(model, inputs=(input, None, embeddings)) + else: + macs, params = profile(model, inputs=(input,)) + + macs, params = clever_format([macs, params], "%.3f") + return macs, params diff --git a/src/models/.ipynb_checkpoints/dasheng-checkpoint.py b/src/models/.ipynb_checkpoints/dasheng-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..bfe29b9603239567824630e3b824bf3f8b716f1f --- /dev/null +++ b/src/models/.ipynb_checkpoints/dasheng-checkpoint.py @@ -0,0 +1,671 @@ +import torch +import torch.nn as nn +from einops import rearrange +import torch +from torch.cuda.amp import autocast +from functools import partial +from typing import Optional, Tuple, Union +import torchaudio.transforms as audio_transforms +from einops import rearrange +from einops.layers.torch import Rearrange +from itertools import repeat +import collections + +import torch.nn.functional as F +import einops + + +if hasattr(nn.functional, 'scaled_dot_product_attention'): + ATTENTION_MODE = 'flash' +else: + ATTENTION_MODE = 
'math' +print(f'attention mode is {ATTENTION_MODE}') + + +def _ntuple(n): + + def parse(x) -> Tuple: + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return tuple(x) + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +class MAELoss(torch.nn.Module): + + def __init__(self, norm_pix_loss: bool = True): + super().__init__() + self.norm_pix_loss = norm_pix_loss + + @autocast(enabled=False) + def forward(self, pred: torch.Tensor, target: torch.Tensor, + mask: torch.Tensor) -> torch.Tensor: + if self.norm_pix_loss is True: + mean = target.mean(dim=-1, keepdim=True) + var = target.var(dim=-1, keepdim=True) + target = (target - mean) / (var + 1.e-6)**.5 + elif self.norm_pix_loss == 'global': + mean = target.mean() + var = target.var() + target = (target - mean) / (var + 1.e-6)**.5 + loss = (pred - target)**2 + loss = loss.mean(dim=-1) # [N, L], mean loss per patch + loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches + return loss + + +class AudioPatchEmbed(nn.Module): + + def __init__(self, + input_size: Union[int, Tuple[int, int]] = (64, 100), + patch_size: Tuple[int, int] = (64, 4), + patch_stride: Tuple[int, int] = (64, 4), + in_chans=1, + embed_dim=768, + norm_layer=None, + flatten=False): + super().__init__() + patch_size = to_2tuple(patch_size) + patch_stride = to_2tuple(patch_stride) + self.input_size: Tuple[int, int] = to_2tuple(input_size) + self.patch_size: Tuple[int, int] = to_2tuple(patch_size) + self.patch_stride: Tuple[int, int] = to_2tuple(patch_stride) + self.grid_size = (self.input_size[0] // self.patch_stride[0], + self.input_size[1] // self.patch_stride[1]) + self.num_patches = self.grid_size[0] * self.grid_size[1] + self.flatten = flatten + + self.proj = nn.Conv2d(in_chans, + embed_dim, + kernel_size=patch_size, + stride=patch_stride) + self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity() + + def forward(self, x): + x = self.proj(x) + if self.flatten: + x = 
rearrange(x, 'b c f t -> b (f t) c') + x = self.norm(x) + return x + + +class LayerScale(nn.Module): + + def __init__(self, dim: int, init_values=1e-5, inplace=False): + super().__init__() + self.inplace = inplace + self.gamma = nn.Parameter(init_values * torch.ones(dim)) + + def forward(self, x): + return x.mul_(self.gamma) if self.inplace else x * self.gamma + + +class Attention(nn.Module): + + def __init__(self, + dim, + num_heads=8, + qkv_bias=False, + attn_drop=0., + proj_drop=0.): + super().__init__() + assert dim % num_heads == 0, 'dim should be divisible by num_heads' + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.attn_drop_p = attn_drop + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + + def forward(self, x): + B, N, C = x.shape + qkv = self.qkv(x).reshape(B, N, 3, self.num_heads, + C // self.num_heads).permute(2, 0, 3, 1, 4) + q, k, v = qkv.unbind( + 0) # make torchscript happy (cannot use tensor as tuple) + + if ATTENTION_MODE == 'flash': + x = F.scaled_dot_product_attention(q, k, v, + dropout_p=self.attn_drop_p, + scale=self.scale, + ) + x = einops.rearrange(x, 'B H L D -> B L (H D)') + elif ATTENTION_MODE == 'math': + attn = (q @ k.transpose(-2, -1)) * self.scale + attn = attn.softmax(dim=-1) + attn = self.attn_drop(attn) + x = (attn @ v).transpose(1, 2).reshape(B, N, C) + + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Mlp(nn.Module): + + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) 
+ x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Block(nn.Module): + + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + init_values=None, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + attention_type='Attention', + ): + super().__init__() + self.norm1 = norm_layer(dim) + attn_type = globals()[attention_type] + self.attn = attn_type(dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + attn_drop=attn_drop, + proj_drop=drop) + self.ls1 = LayerScale( + dim, init_values=init_values) if init_values else nn.Identity() + + self.norm2 = norm_layer(dim) + self.mlp = Mlp(in_features=dim, + hidden_features=int(dim * mlp_ratio), + act_layer=act_layer, + drop=drop) + self.ls2 = LayerScale( + dim, init_values=init_values) if init_values else nn.Identity() + + def forward(self, x): + x = x + self.ls1(self.attn(self.norm1(x))) + x = x + self.ls2(self.mlp(self.norm2(x))) + return x + + +class AudioTransformerMAE_Encoder(nn.Module): + + def __init__(self, + patch_size: Tuple[int, int] = (64, 4), + patch_stride: Tuple[int, int] = (64, 4), + embed_dim: int = 768, + depth: int = 12, + num_heads=8, + mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + norm_layer=None, + act_layer=None, + init_values=None, + target_length=1008, + pooling='mean', + time_patch_out: Optional[float] = None, + freq_patch_out: Optional[float] = None, + block_type='Block', + attention_type='Attention', + eval_avg='cat', + n_fft: int = 512, + n_mels: int = 64, + hop_size: int = 160, + win_size: int = 512, + f_min: int = 0, + f_max: int = 8000, + center: bool = True, + **kwargs): + super().__init__() + self.pooling = pooling + self.embed_dim = embed_dim + self.patch_stride = patch_stride + self.patch_size = patch_size + self.n_mels = n_mels + self.eval_avg = eval_avg + self.time_patch_out = time_patch_out + self.freq_patch_out = freq_patch_out + + self.front_end = nn.Sequential( + 
class AudioTransformerMAE_Encoder(nn.Module):
    """Masked-autoencoder audio encoder (Dasheng-style).

    Waveform -> log-mel front end -> conv patch embedding -> random masking
    -> ViT blocks.  ``forward`` returns the encoded visible tokens, the
    binary mask (1 = removed), and the un-shuffle indices for the decoder.
    """

    def __init__(self,
                 patch_size: Tuple[int, int] = (64, 4),
                 patch_stride: Tuple[int, int] = (64, 4),
                 embed_dim: int = 768,
                 depth: int = 12,
                 num_heads=8,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 norm_layer=None,
                 act_layer=None,
                 init_values=None,
                 target_length=1008,
                 pooling='mean',
                 time_patch_out: Optional[float] = None,
                 freq_patch_out: Optional[float] = None,
                 block_type='Block',
                 attention_type='Attention',
                 eval_avg='cat',
                 n_fft: int = 512,
                 n_mels: int = 64,
                 hop_size: int = 160,
                 win_size: int = 512,
                 f_min: int = 0,
                 f_max: int = 8000,
                 center: bool = True,
                 **kwargs):
        super().__init__()
        self.pooling = pooling
        self.embed_dim = embed_dim
        self.patch_stride = patch_stride
        self.patch_size = patch_size
        self.n_mels = n_mels
        self.eval_avg = eval_avg
        self.time_patch_out = time_patch_out
        self.freq_patch_out = freq_patch_out

        # Log-mel front end; sample rate is hard-wired to 16 kHz.
        self.front_end = nn.Sequential(
            audio_transforms.MelSpectrogram(f_min=f_min,
                                            sample_rate=16000,
                                            win_length=win_size,
                                            center=center,
                                            n_fft=n_fft,
                                            f_max=f_max,
                                            hop_length=hop_size,
                                            n_mels=self.n_mels),
            audio_transforms.AmplitudeToDB(top_db=kwargs.get('top_db', 120)))

        # BatchNorm per mel bin: the mel axis is swapped into the channel
        # position so BatchNorm2d normalises each frequency band.
        self.init_bn = nn.Sequential(
            Rearrange('b c f t -> b f c t'),
            nn.BatchNorm2d(self.n_mels, momentum=0.01),
            Rearrange('b f c t -> b c f t'))

        self.target_length = target_length
        self.patch_embed = AudioPatchEmbed(input_size=(self.n_mels,
                                                       target_length),
                                           embed_dim=self.embed_dim,
                                           patch_size=self.patch_size,
                                           flatten=False,
                                           patch_stride=self.patch_stride)
        self.num_patches = self.patch_embed.num_patches

        if pooling == 'token':
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            self.token_pos_embed = nn.Parameter(
                torch.randn(1, embed_dim) * .02)

        # Separable positional embeddings: one along time, one along freq.
        self.time_pos_embed = nn.Parameter(
            torch.randn(1, embed_dim, 1, self.patch_embed.grid_size[1]) * .02)
        self.freq_pos_embed = nn.Parameter(
            torch.randn(1, embed_dim, self.patch_embed.grid_size[0], 1) * .02)

        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU
        self.pos_drop = nn.Dropout(p=drop_rate)
        block_function = globals()[block_type]
        self.blocks = nn.Sequential(*[
            block_function(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                init_values=init_values,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                norm_layer=norm_layer,
                act_layer=act_layer,
                attention_type=attention_type,
            ) for _ in range(depth)
        ])
        self.norm = norm_layer(embed_dim)
        self.apply(self.init_weights)
        if hasattr(self, 'cls_token') and self.cls_token is not None:
            nn.init.normal_(self.cls_token, std=1e-6)
        group_masking = kwargs.get('group_masking', False)
        if isinstance(group_masking, bool):
            self.masking_func = (self.random_masking_group
                                 if group_masking else self.random_masking)
        elif isinstance(group_masking, int):
            self.masking_func = partial(self.random_masking_group,
                                        group_factor=group_masking)
        else:
            # ROBUSTNESS: previously any other value (e.g. None from a
            # config file) left masking_func undefined and crashed later
            # in forward_features; fall back to plain random masking.
            self.masking_func = self.random_masking

    @torch.jit.ignore
    def no_weight_decay(self):
        # Positional/cls embeddings are excluded from weight decay.
        return {
            'time_pos_embed', 'cls_token', 'freq_pos_embed', 'token_pos_embed'
        }

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)

    def random_masking_group(self, x, mask_ratio, group_factor: int = 2):
        """Per-sample random masking in contiguous groups of ``group_factor``.

        x: [N, L, D] sequence.  Shuffling is done by argsort of random
        noise drawn per group, so adjacent patches are kept/removed together.
        Returns (x_masked, mask, ids_restore) with mask 1 = removed.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L // group_factor,
                           device=x.device)  # noise in [0, 1]
        indices = torch.arange(L, device=x.device).view(-1, group_factor)

        # sort noise for each sample; ascending: small is keep, large is remove
        ids_shuffle = torch.argsort(noise, dim=1)
        # expand group order to per-patch order
        ids_shuffle = indices[ids_shuffle].flatten(-2)
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x,
                                dim=1,
                                index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # binary mask: 0 is keep, 1 is remove; unshuffle to original order
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def random_masking(self, x, mask_ratio):
        """Per-sample random masking by argsort of uniform noise.

        x: [N, L, D] sequence.  Returns (x_masked, mask, ids_restore)
        with mask 1 = removed.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample; ascending: small is keep, large is remove
        ids_shuffle = torch.argsort(noise, dim=1)
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x,
                                dim=1,
                                index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # binary mask: 0 is keep, 1 is remove; unshuffle to original order
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def forward_features(self, x, mask_ratio):
        x = self.patch_embed(x)
        b, c, f, t = x.shape
        x = x + self.time_pos_embed[:, :, :, :t]
        x = x + self.freq_pos_embed[:, :, :, :]  # Just for sin pos embed
        # 'b c f t -> b (f t) c' without the einops dependency
        x = x.flatten(2).transpose(1, 2)
        x, mask, ids_restore = self.masking_func(x, mask_ratio)
        if self.pooling == 'token':
            cls_token = self.cls_token.expand(x.shape[0], -1, -1)
            cls_token = cls_token + self.token_pos_embed[:, :]
            x = torch.cat((cls_token, x), dim=1)
        x = self.pos_drop(x)
        x = self.blocks(x)
        x = self.norm(x)
        return x, mask, ids_restore

    def load_state_dict(self, state_dict, strict=True, **kwargs):
        # Resize pretrained positional embeddings on shape mismatch.
        if 'time_pos_embed' in state_dict and self.time_pos_embed.shape != state_dict[
                'time_pos_embed'].shape:
            print(
                "Positional Embedding shape not the same with model, resizing!"
            )
            self.change_pos_embedding(state_dict)
        super().load_state_dict(state_dict, strict=strict, **kwargs)

    def change_pos_embedding(self, state_dict):
        """Truncate or interpolate pretrained positional embeddings in-place
        in ``state_dict`` so they match this model's grid size."""
        target_time_pos_embed_length = self.time_pos_embed.shape[-1]
        target_freq_pos_embed_length = self.freq_pos_embed.shape[-2]

        pretrained_time_pos_embed = state_dict['time_pos_embed']
        pretrained_freq_pos_embed = state_dict['freq_pos_embed']

        # BUG FIX: the time branch previously compared the *frequency*
        # target length against the pretrained *time* length, so time
        # embeddings could be interpolated when plain truncation was
        # intended (and vice versa).
        if target_time_pos_embed_length <= pretrained_time_pos_embed.shape[-1]:
            state_dict['time_pos_embed'] = pretrained_time_pos_embed[
                ..., :target_time_pos_embed_length]
        else:
            state_dict['time_pos_embed'] = torch.nn.functional.interpolate(
                pretrained_time_pos_embed,
                size=(1, target_time_pos_embed_length),
                align_corners=False,
                mode='bilinear')
        if target_freq_pos_embed_length <= pretrained_freq_pos_embed.shape[-2]:
            state_dict[
                'freq_pos_embed'] = pretrained_freq_pos_embed[:, :, :
                                                              target_freq_pos_embed_length, :]
        else:
            state_dict['freq_pos_embed'] = torch.nn.functional.interpolate(
                pretrained_freq_pos_embed,
                size=(target_freq_pos_embed_length, 1),
                align_corners=False,
                mode='bilinear')

    def forward_to_spec(self, x):
        # Do not use fp16 for feature extraction, that is likely to get nan
        with autocast(enabled=False):
            X = self.front_end(x)
            X = X.unsqueeze(1)  # 'b f t -> b 1 f t'
            X = self.init_bn(X)
        return X

    def forward(self, x, mask_ratio: float = 0.75):
        x = self.forward_to_spec(x)
        x, mask, restore_idxs = self.forward_features(x, mask_ratio=mask_ratio)
        return x, mask, restore_idxs
class AudioTransformerMAE_Decoder(nn.Module):
    """MAE decoder: re-inserts mask tokens at the removed positions,
    un-shuffles tokens back to the original patch order, runs transformer
    blocks and projects each token to its patch's pixel values."""

    def __init__(self,
                 input_dim: int,
                 outputdim: int,
                 patch_size: int = 16,
                 patch_stride: int = 16,
                 embed_dim: int = 768,
                 num_patches: int = 100,
                 depth: int = 12,
                 num_heads: int = 12,
                 mlp_ratio: float = 4.,
                 qkv_bias: bool = True,
                 drop_rate: float = 0.,
                 attn_drop_rate: float = 0.,
                 norm_layer: Optional[torch.nn.Module] = None,
                 act_layer: Optional[torch.nn.Module] = None,
                 cls_token: bool = False,
                 attention_type='Attention',
                 init_values=None,
                 **kwargs):
        super().__init__()
        self.embed_dim = embed_dim
        self.patch_stride = patch_stride
        self.patch_size = patch_size
        self.input_dim = input_dim

        # Project encoder tokens into the (usually smaller) decoder width.
        self.input_proj = nn.Linear(input_dim, embed_dim)
        self.mask_token = nn.Parameter(torch.randn(1, 1, embed_dim) * .02)

        _norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        _act_layer = act_layer or nn.GELU
        self.use_cls = cls_token
        # NOTE(review): the +1 in the no-cls branch matches the
        # unconditional "+ 1" in the mask-token repeat of forward_features.
        num_patches_total = num_patches + 1 if not cls_token else num_patches
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches_total, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.blocks = nn.Sequential(*[
            Block(dim=embed_dim,
                  num_heads=num_heads,
                  mlp_ratio=mlp_ratio,
                  qkv_bias=qkv_bias,
                  init_values=init_values,
                  drop=drop_rate,
                  attn_drop=attn_drop_rate,
                  norm_layer=_norm_layer,
                  act_layer=_act_layer,
                  attention_type=attention_type) for _ in range(depth)
        ])
        self.norm = _norm_layer(embed_dim)
        self.outputlayer = nn.Linear(self.embed_dim, outputdim)
        self.apply(self.init_weights)
        torch.nn.init.normal_(self.mask_token, std=.02)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Embedding-like parameters are excluded from weight decay.
        return {
            'time_pos_embed', 'cls_token', 'freq_pos_embed', 'token_pos_embed'
        }

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            nn.init.trunc_normal_(module.weight, std=.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)

    def forward_features(self, x, ids_restore):
        x = self.input_proj(x)
        total = ids_restore.shape[1]
        # One mask token per removed patch (always total + 1 - kept).
        fill = self.mask_token.repeat(x.shape[0], total + 1 - x.shape[1], 1)
        # Drop the cls token (if any) before un-shuffling patch tokens.
        visible = x[:, 1:, :] if self.use_cls else x[:, :, :]
        shuffled = torch.cat([visible, fill], dim=1)
        unshuffled = torch.gather(shuffled,
                                  dim=1,
                                  index=ids_restore.unsqueeze(-1).repeat(
                                      1, 1, x.shape[2]))
        if self.use_cls:
            x = torch.cat([x[:, :1, :], unshuffled], dim=1)  # re-append cls
        else:
            x = unshuffled
        t = x.shape[1]

        x = x + self.pos_embed[:, :t, :]
        x = self.pos_drop(x)
        x = self.blocks(x)
        x = self.norm(x)
        return x

    def forward(self, x, restore_idxs):
        x = self.forward_features(x, restore_idxs)
        return self.outputlayer(x)


class AudioTransformerMAE(nn.Module):
    """Encoder + decoder pair computing the MAE reconstruction objective."""

    def __init__(self,
                 encoder: AudioTransformerMAE_Encoder,
                 decoder: AudioTransformerMAE_Decoder,
                 loss_fn: Optional[torch.nn.Module] = None):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Unfold cuts the target spectrogram into the same patch grid the
        # encoder embeds, so prediction and target align patch-for-patch.
        self.unfold = nn.Unfold(
            kernel_size=self.encoder.patch_embed.patch_size,
            stride=self.encoder.patch_embed.patch_size)
        self.loss_fn = MAELoss() if loss_fn is None else loss_fn

    def forward(self,
                x: torch.Tensor,
                mask_ratio: float = 0.75,
                return_loss: bool = False):
        latent, mask, restore_ids = self.encoder(x, mask_ratio=mask_ratio)
        pred = self.decoder(latent, restore_ids)
        with autocast(enabled=False):
            # Targets come straight from the (fp32) front end.
            targets = self.patchify(self.encoder.front_end(x))
        if return_loss:
            return self.loss_fn(pred, targets, mask)
        return pred, targets, mask

    def patchify(self, x):
        return self.unfold(x.unsqueeze(1)).transpose(-2, -1)


def _build_decoder(encoder, encoder_kwargs, embed_dim, depth, num_heads):
    """Build the MAE decoder matched to a given encoder's patch grid."""
    return AudioTransformerMAE_Decoder(
        embed_dim=embed_dim,
        depth=depth,
        num_heads=num_heads,
        input_dim=encoder_kwargs['embed_dim'],
        outputdim=encoder.patch_embed.patch_size[0] *
        encoder.patch_embed.patch_size[1],
        num_patches=encoder.patch_embed.num_patches)


def dasheng_base(**kwargs):
    """Base Dasheng MAE (768-dim, 12-layer encoder)."""
    encoder_kwargs = dict(embed_dim=768,
                          depth=12,
                          num_heads=12,
                          target_length=1008,
                          patch_size=[64, 4],
                          patch_stride=[64, 4])
    encoder_kwargs.update(
        (k, kwargs[k]) for k in set(kwargs).intersection(encoder_kwargs))
    encoder_kwargs = {**encoder_kwargs, **kwargs}
    encoder = AudioTransformerMAE_Encoder(**encoder_kwargs)
    decoder = _build_decoder(encoder, encoder_kwargs, 512, 8, 16)
    return AudioTransformerMAE(encoder, decoder)


def dasheng_06B(**kwargs):
    """0.6B-parameter Dasheng MAE (1536-dim, 24-layer encoder)."""
    encoder_kwargs = dict(
        patch_size=[64, 4],
        patch_stride=[64, 4],
        embed_dim=1536,
        depth=24,
        num_heads=24,
        mlp_ratio=4,
    )
    encoder_kwargs.update(
        (k, kwargs[k]) for k in set(kwargs).intersection(encoder_kwargs))
    encoder_kwargs = {**encoder_kwargs, **kwargs}
    encoder = AudioTransformerMAE_Encoder(**encoder_kwargs)
    decoder = _build_decoder(encoder, encoder_kwargs, 512, 8, 16)
    return AudioTransformerMAE(encoder, decoder)


def dasheng_12B(**kwargs):
    """1.2B-parameter Dasheng MAE (1536-dim, 40-layer encoder)."""
    encoder_kwargs = dict(
        patch_size=[64, 4],
        patch_stride=[64, 4],
        embed_dim=1536,
        depth=40,
        num_heads=24,
        mlp_ratio=4,
    )
    encoder_kwargs.update(
        (k, kwargs[k]) for k in set(kwargs).intersection(encoder_kwargs))
    encoder_kwargs = {**encoder_kwargs, **kwargs}
    encoder = AudioTransformerMAE_Encoder(**encoder_kwargs)
    decoder = _build_decoder(encoder, encoder_kwargs, 768, 8, 24)
    return AudioTransformerMAE(encoder, decoder)
class Decoder_Block(nn.Module):
    """Transformer block with optional adaLN conditioning on a class vector.

    With fusion='adaln' a per-sample conditioning vector ``c`` produces
    shift/scale/gate terms for both the attention and MLP branches;
    otherwise the block behaves as a plain pre-norm transformer block.
    """

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.,
        qkv_bias=False,
        drop=0.,
        attn_drop=0.,
        init_values=None,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        attention_type='Attention',
        fusion='adaln',
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim,
                              num_heads=num_heads,
                              qkv_bias=qkv_bias,
                              attn_drop=attn_drop,
                              proj_drop=drop)
        self.ls1 = LayerScale(
            dim, init_values=init_values) if init_values else nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        self.ls2 = LayerScale(
            dim, init_values=init_values) if init_values else nn.Identity()

        self.fusion = fusion
        if fusion == 'adaln':
            # Produces 6 modulation vectors: (scale, gate, shift) x 2.
            self.adaln = nn.Linear(dim, 6 * dim, bias=True)

    def forward(self, x, c=None):
        B, T, C = x.shape

        if self.fusion == 'adaln':
            ada = self.adaln(c)
            (scale_msa, gate_msa, shift_msa,
             scale_mlp, gate_mlp, shift_mlp) = ada.reshape(B, 6, -1).chunk(6, dim=1)
            # NOTE(review): with adaln zero-initialised the gates start at
            # tanh(1) ~ 0.76, not 0 — unlike classic adaLN-Zero. Looks
            # deliberate; confirm before changing.
            x_norm = self.norm1(x) * (1 + scale_msa) + shift_msa
            tanh_gate_msa = torch.tanh(1 - gate_msa)
            x = x + tanh_gate_msa * self.ls1(self.attn(x_norm))
            x_norm = self.norm2(x) * (1 + scale_mlp) + shift_mlp
            tanh_gate_mlp = torch.tanh(1 - gate_mlp)
            x = x + tanh_gate_mlp * self.ls2(self.mlp(x_norm))
        else:
            x = x + self.ls1(self.attn(self.norm1(x)))
            x = x + self.ls2(self.mlp(self.norm2(x)))
        return x


class Decoder(nn.Module):
    """Per-event SED decoder conditioned on class (text) embeddings.

    Each of the N class embeddings conditions a copy of the L-frame encoder
    sequence; the head emits one logit per frame per event.
    """

    def __init__(
        self,
        embed_dim: int = 768,
        depth: int = 2,
        num_heads=8,
        mlp_ratio=4.,
        qkv_bias=True,
        drop_rate=0.,
        attn_drop_rate=0.,
        cls_dim: int = 512,
        fusion: str = 'adaln',
        **kwargs
    ):
        super().__init__()

        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        act_layer = nn.GELU
        init_values = None

        block_function = Decoder_Block
        self.blocks = nn.ModuleList([
            block_function(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                init_values=init_values,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                norm_layer=norm_layer,
                act_layer=act_layer,
                attention_type="Attention",
                fusion=fusion,
            ) for _ in range(depth)
        ])

        self.fusion = fusion
        cls_out = embed_dim

        # MLP that maps the class embedding into the decoder width.
        self.cls_embed = nn.Sequential(
            nn.Linear(cls_dim, embed_dim, bias=True),
            nn.SiLU(),
            nn.Linear(embed_dim, cls_out, bias=True),)

        self.sed_head = nn.Linear(embed_dim, 1, bias=True)
        self.norm = norm_layer(embed_dim)
        self.apply(self.init_weights)
        # FIX: the adaLN zero-initialisation used to live inside
        # init_weights and was therefore re-executed once for EVERY module
        # visited by self.apply (O(modules x depth) redundant work, and it
        # silently coupled a per-module initializer to whole-model state).
        # Doing it once here yields the identical final parameter state.
        if self.fusion == 'adaln':
            for block in self.blocks:
                nn.init.constant_(block.adaln.weight, 0)
                nn.init.constant_(block.adaln.bias, 0)

    def init_weights(self, module):
        """Per-module initializer for ``self.apply`` (no global side effects)."""
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)

    def forward(self, x, cls):
        B, L, C = x.shape
        _, N, D = cls.shape
        # Expand x to shape (B, N, L, C): one sequence copy per event class.
        x = x.unsqueeze(1).expand(-1, N, -1, -1)
        # Reshape both tensors to (B*N, L, C) for processing
        x = x.reshape(B * N, L, C)
        cls = cls.reshape(B * N, D)

        cls = self.cls_embed(cls)

        shift = 0
        if self.fusion == 'adaln':
            pass  # conditioning is applied inside each block
        elif self.fusion == 'token':
            # Prepend the class vector as an extra token; drop it afterwards.
            cls = cls.unsqueeze(1)
            x = torch.cat([cls, x], dim=1)
            shift = 1
        else:
            raise NotImplementedError("unknown fusion")

        for block in self.blocks:
            x = block(x, cls)

        x = x[:, shift:]

        x = self.norm(x)

        strong = self.sed_head(x)
        return strong.transpose(1, 2)


class TSED_Wrapper(nn.Module):
    """Couples a (frozen) Dasheng encoder with the SED decoder.

    The encoder blocks listed in ``ft_blocks`` (plus the encoder's final
    norm) are copied into the decoder for fine-tuning and then deleted
    from the encoder, so they are not run twice.
    """

    def __init__(
        self,
        encoder,
        decoder,
        ft_blocks=(11, 12),  # FIX: was a mutable default list
        frozen_encoder=True
    ):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder

        print("Loading Dasheng weights for decoders...")
        for i, blk_idx in enumerate(ft_blocks):
            decoder_block = self.decoder.blocks[i]
            encoder_block = self.encoder.blocks[blk_idx]
            state_dict = copy.deepcopy(encoder_block.state_dict())
            # strict=False: decoder blocks have extra (adaln) parameters.
            missing, unexpected = decoder_block.load_state_dict(state_dict, strict=False)
            if missing or unexpected:
                print(f"Block {blk_idx}:")
                if missing:
                    print(f"✅ Expected missing keys: {missing}")
                if unexpected:
                    print(f" Unexpected keys: {unexpected}")
        # Copy norm_layer weights
        self.decoder.norm.load_state_dict(copy.deepcopy(self.encoder.norm.state_dict()))

        # Remove the injected blocks and norm_layer from the encoder.
        # Reverse order so earlier indices are not shifted by deletion.
        for blk_idx in sorted(ft_blocks, reverse=True):
            del self.encoder.blocks[blk_idx]
        del self.encoder.norm

        self.frozen_encoder = frozen_encoder
        if frozen_encoder:
            for param in self.encoder.parameters():
                param.requires_grad = False

    def forward_to_spec(self, x):
        return self.encoder.forward_to_spec(x)

    def forward_encoder(self, x):
        # no_grad also skips activation storage for the frozen trunk.
        if self.frozen_encoder:
            with torch.no_grad():
                x = self.encoder(x)
        else:
            x = self.encoder(x)
        return x

    def forward(self, x, cls):
        x = self.forward_encoder(x)
        pred = self.decoder(x, cls)
        return pred
class Dasheng_Encoder(nn.Module):
    """Inference-time Dasheng encoder (no masking, no MAE decoder).

    Unlike the MAE trainer, ``forward`` expects a magnitude mel
    spectrogram (b, f, t) — see ``forward_to_spec`` — and returns frame
    tokens WITHOUT the final LayerNorm (that norm is moved into the SED
    decoder by TSED_Wrapper).
    """

    def __init__(self,
                 patch_size: Tuple[int, int] = (64, 4),
                 patch_stride: Tuple[int, int] = (64, 4),
                 embed_dim: int = 768,
                 depth: int = 12,
                 num_heads=8,
                 mlp_ratio=4.,
                 qkv_bias=True,
                 drop_rate=0.,
                 attn_drop_rate=0.,
                 norm_layer=None,
                 act_layer=None,
                 init_values=None,
                 target_length=1008,
                 pooling='mean',
                 time_patch_out: Optional[float] = None,
                 freq_patch_out: Optional[float] = None,
                 block_type='Block',
                 attention_type='Attention',
                 eval_avg='cat',
                 n_fft: int = 512,
                 n_mels: int = 64,
                 hop_size: int = 160,
                 win_size: int = 512,
                 f_min: int = 0,
                 f_max: int = 8000,
                 center: bool = True,
                 **kwargs):
        super().__init__()
        self.pooling = pooling
        self.embed_dim = embed_dim
        self.patch_stride = patch_stride
        self.patch_size = patch_size
        self.n_mels = n_mels
        self.eval_avg = eval_avg
        self.time_patch_out = time_patch_out
        self.freq_patch_out = freq_patch_out

        # Magnitude (power=1) mel front end; dB conversion applied
        # separately so a raw spectrogram can be fed to ``forward``.
        self.front_end = nn.Sequential(
            audio_transforms.MelSpectrogram(f_min=f_min,
                                            sample_rate=16000,
                                            win_length=win_size,
                                            center=center,
                                            n_fft=n_fft,
                                            f_max=f_max,
                                            hop_length=hop_size,
                                            n_mels=self.n_mels,
                                            power=1))

        self.to_db = audio_transforms.AmplitudeToDB(
            stype='magnitude', top_db=kwargs.get('top_db', 120))

        # BatchNorm per mel bin (mel axis swapped into channel position).
        self.init_bn = nn.Sequential(
            Rearrange('b c f t -> b f c t'),
            nn.BatchNorm2d(self.n_mels, momentum=0.01),
            Rearrange('b f c t -> b c f t'))

        self.target_length = target_length
        self.patch_embed = AudioPatchEmbed(input_size=(self.n_mels,
                                                       target_length),
                                           embed_dim=self.embed_dim,
                                           patch_size=self.patch_size,
                                           flatten=False,
                                           patch_stride=self.patch_stride)
        self.num_patches = self.patch_embed.num_patches

        if pooling == 'token':
            self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
            self.token_pos_embed = nn.Parameter(
                torch.randn(1, embed_dim) * .02)

        # Separable positional embeddings: one along time, one along freq.
        self.time_pos_embed = nn.Parameter(
            torch.randn(1, embed_dim, 1, self.patch_embed.grid_size[1]) * .02)
        self.freq_pos_embed = nn.Parameter(
            torch.randn(1, embed_dim, self.patch_embed.grid_size[0], 1) * .02)

        norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        act_layer = act_layer or nn.GELU
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.blocks = nn.Sequential(*[
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                init_values=init_values,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                norm_layer=norm_layer,
                act_layer=act_layer,
                attention_type=attention_type,
            ) for _ in range(depth)
        ])
        # Constructed to receive pretrained weights; TSED_Wrapper deletes
        # it (and the fine-tuned blocks) after transferring them.
        self.norm = norm_layer(embed_dim)
        self.apply(self.init_weights)
        if hasattr(self, 'cls_token') and self.cls_token is not None:
            nn.init.normal_(self.cls_token, std=1e-6)

    def init_weights(self, module):
        if isinstance(module, nn.Linear):
            torch.nn.init.xavier_uniform_(module.weight)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)

    def forward_features(self, x):
        x = self.patch_embed(x)
        b, c, f, t = x.shape
        x = x + self.time_pos_embed[:, :, :, :t]
        x = x + self.freq_pos_embed[:, :, :, :]  # Just for sin pos embed
        # 'b c f t -> b (f t) c' without the einops dependency
        x = x.flatten(2).transpose(1, 2)
        if self.pooling == 'token':
            cls_token = self.cls_token.expand(x.shape[0], -1, -1)
            cls_token = cls_token + self.token_pos_embed[:, :]
            x = torch.cat((cls_token, x), dim=1)
        x = self.pos_drop(x)
        for block in self.blocks:
            x = block(x)
        # self.norm deliberately NOT applied: it is transferred to the SED
        # decoder path (see TSED_Wrapper), which normalises instead.
        return x

    def load_state_dict(self, state_dict, **kwargs):
        # Resize pretrained positional embeddings on shape mismatch, then
        # load non-strictly (the checkpoint may carry MAE-only keys) and
        # report what was skipped.
        if 'time_pos_embed' in state_dict and self.time_pos_embed.shape != state_dict[
                'time_pos_embed'].shape:
            print("Positional Embedding shape not the same with model, resizing!")
            self.change_pos_embedding(state_dict)
        missing_keys, unexpected_keys = super().load_state_dict(
            state_dict, strict=False, **kwargs)
        if missing_keys:
            print("Missing keys:", missing_keys)
        if unexpected_keys:
            print("Unexpected keys:", unexpected_keys)

    def change_pos_embedding(self, state_dict):
        """Truncate or interpolate pretrained positional embeddings in-place
        in ``state_dict`` so they match this model's grid size."""
        target_time_pos_embed_length = self.time_pos_embed.shape[-1]
        target_freq_pos_embed_length = self.freq_pos_embed.shape[-2]

        pretrained_time_pos_embed = state_dict['time_pos_embed']
        pretrained_freq_pos_embed = state_dict['freq_pos_embed']

        # BUG FIX: the time branch previously compared the *frequency*
        # target length against the pretrained *time* length (same defect
        # as in the MAE encoder), so the wrong resize strategy could be
        # chosen for the time embedding.
        if target_time_pos_embed_length <= pretrained_time_pos_embed.shape[-1]:
            state_dict['time_pos_embed'] = pretrained_time_pos_embed[
                ..., :target_time_pos_embed_length]
        else:
            state_dict['time_pos_embed'] = torch.nn.functional.interpolate(
                pretrained_time_pos_embed,
                size=(1, target_time_pos_embed_length),
                align_corners=False,
                mode='bilinear')
        if target_freq_pos_embed_length <= pretrained_freq_pos_embed.shape[-2]:
            state_dict[
                'freq_pos_embed'] = pretrained_freq_pos_embed[:, :, :
                                                              target_freq_pos_embed_length, :]
        else:
            state_dict['freq_pos_embed'] = torch.nn.functional.interpolate(
                pretrained_freq_pos_embed,
                size=(target_freq_pos_embed_length, 1),
                align_corners=False,
                mode='bilinear')

    def forward_to_spec(self, x):
        # Do not use fp16 for feature extraction, that is likely to get nan.
        # Returns the raw magnitude mel spectrogram; dB + BatchNorm are
        # applied in ``forward``.
        with autocast(enabled=False):
            X = self.front_end(x)
        return X

    def forward(self, x):
        # NOTE(review): input is assumed to be a magnitude mel spectrogram
        # (b, f, t), i.e. the output of forward_to_spec — confirm against
        # callers before changing.
        with autocast(enabled=False):
            x = self.to_db(x)
            x = x.unsqueeze(1)  # 'b f t -> b 1 f t'
            x = self.init_bn(x)
        x = self.forward_features(x)
        return x
"""Dasheng masked-autoencoder audio transformer: building blocks."""
import collections
from functools import partial
from itertools import repeat
from typing import Optional, Tuple, Union

# FIX: removed duplicate `import torch` and duplicate
# `from einops import rearrange` lines; grouped stdlib / third-party.
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.cuda.amp import autocast
import einops
from einops import rearrange
from einops.layers.torch import Rearrange
import torchaudio.transforms as audio_transforms

# Pick the attention implementation once at import time.
if hasattr(nn.functional, 'scaled_dot_product_attention'):
    ATTENTION_MODE = 'flash'
else:
    ATTENTION_MODE = 'math'
print(f'attention mode is {ATTENTION_MODE}')


def _ntuple(n):
    """Return a parser that passes iterables through and repeats scalars
    into an n-tuple (timm-style helper)."""

    def parse(x) -> Tuple:
        if isinstance(x, collections.abc.Iterable) and not isinstance(x, str):
            return tuple(x)
        return tuple(repeat(x, n))

    return parse


to_2tuple = _ntuple(2)


class MAELoss(torch.nn.Module):
    """MAE reconstruction loss over removed patches only.

    norm_pix_loss=True normalises each target patch to zero mean / unit
    variance; norm_pix_loss='global' normalises over the whole target.
    """

    def __init__(self, norm_pix_loss: bool = True):
        super().__init__()
        self.norm_pix_loss = norm_pix_loss

    @autocast(enabled=False)
    def forward(self, pred: torch.Tensor, target: torch.Tensor,
                mask: torch.Tensor) -> torch.Tensor:
        if self.norm_pix_loss is True:
            mean = target.mean(dim=-1, keepdim=True)
            var = target.var(dim=-1, keepdim=True)
            target = (target - mean) / (var + 1.e-6)**.5
        elif self.norm_pix_loss == 'global':
            mean = target.mean()
            var = target.var()
            target = (target - mean) / (var + 1.e-6)**.5
        loss = (pred - target)**2
        loss = loss.mean(dim=-1)  # [N, L], mean loss per patch
        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
        return loss


class AudioPatchEmbed(nn.Module):
    """Conv2d patchifier for (freq, time) spectrogram inputs."""

    def __init__(self,
                 input_size: Union[int, Tuple[int, int]] = (64, 100),
                 patch_size: Tuple[int, int] = (64, 4),
                 patch_stride: Tuple[int, int] = (64, 4),
                 in_chans=1,
                 embed_dim=768,
                 norm_layer=None,
                 flatten=False):
        super().__init__()
        patch_size = to_2tuple(patch_size)
        patch_stride = to_2tuple(patch_stride)
        self.input_size: Tuple[int, int] = to_2tuple(input_size)
        self.patch_size: Tuple[int, int] = to_2tuple(patch_size)
        self.patch_stride: Tuple[int, int] = to_2tuple(patch_stride)
        self.grid_size = (self.input_size[0] // self.patch_stride[0],
                          self.input_size[1] // self.patch_stride[1])
        self.num_patches = self.grid_size[0] * self.grid_size[1]
        self.flatten = flatten

        self.proj = nn.Conv2d(in_chans,
                              embed_dim,
                              kernel_size=patch_size,
                              stride=patch_stride)
        self.norm = norm_layer(embed_dim) if norm_layer else nn.Identity()

    def forward(self, x):
        x = self.proj(x)
        if self.flatten:
            x = rearrange(x, 'b c f t -> b (f t) c')
        x = self.norm(x)
        return x


class LayerScale(nn.Module):
    """Per-channel learnable residual scaling (CaiT-style layer scale)."""

    def __init__(self, dim: int, init_values=1e-5, inplace=False):
        super().__init__()
        self.inplace = inplace
        self.gamma = nn.Parameter(init_values * torch.ones(dim))

    def forward(self, x):
        # In-place multiply saves memory but mutates the caller's tensor.
        return x.mul_(self.gamma) if self.inplace else x * self.gamma


class Attention(nn.Module):
    """Multi-head self-attention; fused SDPA kernel when available."""

    def __init__(self,
                 dim,
                 num_heads=8,
                 qkv_bias=False,
                 attn_drop=0.,
                 proj_drop=0.):
        super().__init__()
        assert dim % num_heads == 0, 'dim should be divisible by num_heads'
        self.num_heads = num_heads
        head_dim = dim // num_heads
        self.scale = head_dim**-0.5

        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_drop)
        self.attn_drop_p = attn_drop
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_drop)

    def forward(self, x):
        B, N, C = x.shape
        qkv = self.qkv(x).reshape(B, N, 3, self.num_heads,
                                  C // self.num_heads).permute(2, 0, 3, 1, 4)
        q, k, v = qkv.unbind(
            0)  # make torchscript happy (cannot use tensor as tuple)

        if ATTENTION_MODE == 'flash':
            # BUG FIX: the fused kernel ignores module training state, so
            # the raw dropout probability kept attention dropout active in
            # eval mode; the 'math' branch (nn.Dropout) was already correct.
            x = F.scaled_dot_product_attention(
                q, k, v,
                dropout_p=self.attn_drop_p if self.training else 0.,
                scale=self.scale,
            )
            # 'B H L D -> B L (H D)' without the einops dependency.
            x = x.transpose(1, 2).reshape(B, N, C)
        elif ATTENTION_MODE == 'math':
            attn = (q @ k.transpose(-2, -1)) * self.scale
            attn = attn.softmax(dim=-1)
            attn = self.attn_drop(attn)
            x = (attn @ v).transpose(1, 2).reshape(B, N, C)

        x = self.proj(x)
        x = self.proj_drop(x)
        return x


class Mlp(nn.Module):
    """Two-layer transformer feed-forward block with dropout."""

    def __init__(self,
                 in_features,
                 hidden_features=None,
                 out_features=None,
                 act_layer=nn.GELU,
                 drop=0.):
        super().__init__()
        out_features = out_features or in_features
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)
        x = self.drop(x)
        return x


class Block(nn.Module):
    """Pre-norm transformer block: attention + MLP, optional LayerScale."""

    def __init__(
        self,
        dim,
        num_heads,
        mlp_ratio=4.,
        qkv_bias=False,
        drop=0.,
        attn_drop=0.,
        init_values=None,
        act_layer=nn.GELU,
        norm_layer=nn.LayerNorm,
        attention_type='Attention',
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        # Attention implementation is resolved by name from module globals.
        attn_type = globals()[attention_type]
        self.attn = attn_type(dim,
                              num_heads=num_heads,
                              qkv_bias=qkv_bias,
                              attn_drop=attn_drop,
                              proj_drop=drop)
        self.ls1 = LayerScale(
            dim, init_values=init_values) if init_values else nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        self.ls2 = LayerScale(
            dim, init_values=init_values) if init_values else nn.Identity()

    def forward(self, x):
        x = x + self.ls1(self.attn(self.norm1(x)))
        x = x + self.ls2(self.mlp(self.norm2(x)))
        return x
norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + self.pos_drop = nn.Dropout(p=drop_rate) + block_function = globals()[block_type] + self.blocks = nn.Sequential(*[ + block_function( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + init_values=init_values, + drop=drop_rate, + attn_drop=attn_drop_rate, + norm_layer=norm_layer, + act_layer=act_layer, + attention_type=attention_type, + ) for _ in range(depth) + ]) + self.norm = norm_layer(embed_dim) + self.apply(self.init_weights) + if hasattr(self, 'cls_token') and self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + group_masking = kwargs.get('group_masking', False) + if isinstance(group_masking, bool): + if group_masking is True: + self.masking_func = self.random_masking_group + else: + self.masking_func = self.random_masking + elif isinstance(group_masking, int): + self.masking_func = partial(self.random_masking_group, + group_factor=group_masking) + + @torch.jit.ignore + def no_weight_decay(self): + return { + 'time_pos_embed', 'cls_token', 'freq_pos_embed', 'token_pos_embed' + } + + def init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, 0) + nn.init.constant_(module.weight, 1.0) + + def random_masking_group(self, x, mask_ratio, group_factor: int = 2): + """ + Perform per-sample random masking by per-sample shuffling. + Per-sample shuffling is done by argsort random noise. 
        x: [N, L, D], sequence
        Groups of `group_factor` consecutive patches are kept or dropped
        together, so masked regions span whole groups rather than single
        patches.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        # One noise value per *group*; ranking groups instead of single
        # patches is what makes the masking contiguous within each group.
        noise = torch.rand(N, L // group_factor,
                           device=x.device)  # noise in [0, 1]
        # indices = torch.arange(L).view(1, 5, 4).repeat(N, 1, 1)
        # Rows of `indices` are the patch ids belonging to each group.
        indices = torch.arange(L, device=x.device).view(-1, group_factor)

        # sort noise for each sample
        ids_shuffle = torch.argsort(
            noise, dim=1)  # ascend: small is keep, large is remove
        # Expand the group permutation back to per-patch indices.
        ids_shuffle = indices[ids_shuffle].flatten(-2)
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x,
                                dim=1,
                                index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def random_masking(self, x, mask_ratio):
        """
        Perform per-sample random masking by per-sample shuffling.
        Per-sample shuffling is done by argsort random noise.
        x: [N, L, D], sequence
        Returns (x_masked, mask, ids_restore) where mask is 0=keep / 1=remove
        in the original (unshuffled) patch order.
        """
        N, L, D = x.shape  # batch, length, dim
        len_keep = int(L * (1 - mask_ratio))

        noise = torch.rand(N, L, device=x.device)  # noise in [0, 1]

        # sort noise for each sample
        ids_shuffle = torch.argsort(
            noise, dim=1)  # ascend: small is keep, large is remove
        ids_restore = torch.argsort(ids_shuffle, dim=1)

        # keep the first subset
        ids_keep = ids_shuffle[:, :len_keep]
        x_masked = torch.gather(x,
                                dim=1,
                                index=ids_keep.unsqueeze(-1).repeat(1, 1, D))

        # generate the binary mask: 0 is keep, 1 is remove
        mask = torch.ones([N, L], device=x.device)
        mask[:, :len_keep] = 0
        # unshuffle to get the binary mask
        mask = torch.gather(mask, dim=1, index=ids_restore)

        return x_masked, mask, ids_restore

    def forward_features(self, x, mask_ratio):
        """Patchify the spectrogram, add positional embeddings, mask, and
        run the transformer blocks. Returns (tokens, mask, ids_restore)."""
        x = self.patch_embed(x)
        b, c, f, t = x.shape
        # Separable positional embeddings: one along time, one along freq.
        # Sliced to `t` so shorter-than-target inputs still work.
        x = x + self.time_pos_embed[:, :, :, :t]
        x = x + self.freq_pos_embed[:, :, :, :]  # Just for sin pos embed
        x = rearrange(x, 'b c f t -> b (f t) c')
        # x, mask, ids_restore = self.random_masking(x, mask_ratio)
        # masking_func is selected in __init__ from the `group_masking` kwarg.
        x, mask, ids_restore = self.masking_func(x, mask_ratio)
        if self.pooling == 'token':
            cls_token = self.cls_token.expand(x.shape[0], -1, -1)
            cls_token = cls_token + self.token_pos_embed[:, :]
            x = torch.cat((cls_token, x), dim=1)
        x = self.pos_drop(x)
        x = self.blocks(x)
        x = self.norm(x)
        return x, mask, ids_restore

    def load_state_dict(self, state_dict, strict=True, **kwargs):
        # Resize checkpoint positional embeddings when they were trained for
        # a different input length / mel resolution than this model.
        if 'time_pos_embed' in state_dict and self.time_pos_embed.shape != state_dict[
                'time_pos_embed'].shape:
            print(
                "Positional Embedding shape not the same with model, resizing!"
+ ) + self.change_pos_embedding(state_dict) + super().load_state_dict(state_dict, strict=strict, **kwargs) + + def change_pos_embedding(self, state_dict): + target_time_pos_embed_length = self.time_pos_embed.shape[-1] + target_freq_pos_embed_length = self.freq_pos_embed.shape[-2] + + pretrained_time_pos_embed = state_dict['time_pos_embed'] + pretrained_freq_pos_embed = state_dict['freq_pos_embed'] + + if target_freq_pos_embed_length <= pretrained_time_pos_embed.shape[-1]: + state_dict['time_pos_embed'] = pretrained_time_pos_embed[ + ..., :target_time_pos_embed_length] + else: + state_dict['time_pos_embed'] = torch.nn.functional.interpolate( + pretrained_time_pos_embed, + size=(1, target_time_pos_embed_length), + align_corners=False, + mode='bilinear') + if target_freq_pos_embed_length <= pretrained_freq_pos_embed.shape[-2]: + state_dict[ + 'freq_pos_embed'] = pretrained_freq_pos_embed[:, :, : + target_freq_pos_embed_length, :] + else: + state_dict['freq_pos_embed'] = torch.nn.functional.interpolate( + pretrained_freq_pos_embed, + size=(target_freq_pos_embed_length, 1), + align_corners=False, + mode='bilinear') + + def forward_to_spec(self, x): + # Do not use fp16 for feature extraction, that is likely to get nan + with autocast(enabled=False): + X = self.front_end(x) + X = rearrange(X, 'b f t -> b 1 f t') + X = self.init_bn(X) + return X + + def forward(self, x, mask_ratio: float = 0.75): + x = self.forward_to_spec(x) + x, mask, restore_idxs = self.forward_features(x, mask_ratio=mask_ratio) + return x, mask, restore_idxs + + +class AudioTransformerMAE_Decoder(nn.Module): + + def __init__(self, + input_dim: int, + outputdim: int, + patch_size: int = 16, + patch_stride: int = 16, + embed_dim: int = 768, + num_patches: int = 100, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4., + qkv_bias: bool = True, + drop_rate: float = 0., + attn_drop_rate: float = 0., + norm_layer: Optional[torch.nn.Module] = None, + act_layer: Optional[torch.nn.Module] = None, 
                 cls_token: bool = False,
                 attention_type='Attention',
                 init_values=None,
                 **kwargs):
        super().__init__()
        self.embed_dim = embed_dim
        self.patch_stride = patch_stride
        self.patch_size = patch_size
        self.input_dim = input_dim

        # Project encoder latents (input_dim) into the decoder width.
        self.input_proj = nn.Linear(input_dim, embed_dim)

        # Learned placeholder inserted at every masked position.
        self.mask_token = nn.Parameter(torch.randn(1, 1, embed_dim) * .02)
        _norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6)
        _act_layer = act_layer or nn.GELU
        self.use_cls = cls_token
        # NOTE(review): the `+ 1` is applied when cls_token is *False*, yet
        # in forward_features the sequence only grows by one when use_cls is
        # True — this looks inverted. With use_cls=False the extra slot is
        # merely unused, so it is harmless, but verify before flipping it
        # (changing pos_embed's shape would break existing checkpoints).
        num_patches_total = num_patches + 1 if not cls_token else num_patches
        self.pos_embed = nn.Parameter(
            torch.zeros(1, num_patches_total, embed_dim))
        self.pos_drop = nn.Dropout(p=drop_rate)
        self.blocks = nn.Sequential(*[
            Block(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                init_values=init_values,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                norm_layer=_norm_layer,
                act_layer=_act_layer,
                attention_type=attention_type,
            ) for i in range(depth)
        ])
        self.norm = _norm_layer(embed_dim)
        # Each output token predicts one flattened spectrogram patch.
        self.outputlayer = nn.Linear(self.embed_dim, outputdim)
        self.apply(self.init_weights)
        torch.nn.init.normal_(self.mask_token, std=.02)

    @torch.jit.ignore
    def no_weight_decay(self):
        # Parameter names exempted from weight decay by the optimizer setup.
        return {
            'time_pos_embed', 'cls_token', 'freq_pos_embed', 'token_pos_embed'
        }

    def init_weights(self, module):
        # Truncated-normal linear init, standard LayerNorm init.
        if isinstance(module, nn.Linear):
            nn.init.trunc_normal_(module.weight, std=.02)
            if module.bias is not None:
                nn.init.zeros_(module.bias)
        elif isinstance(module, nn.LayerNorm):
            nn.init.constant_(module.bias, 0)
            nn.init.constant_(module.weight, 1.0)

    def forward_features(self, x, ids_restore):
        """Fill masked positions with mask tokens, unshuffle back to the
        original patch order, and run the decoder blocks."""
        x = self.input_proj(x)
        # ids_restore.shape[1] is the full patch count; append enough mask
        # tokens to pad the visible tokens back to full length (+1 accounts
        # for a cls slot — see NOTE in __init__).
        mask_tokens = self.mask_token.repeat(
            x.shape[0], ids_restore.shape[1] + 1 - x.shape[1], 1)
        if self.use_cls:
            x_ = torch.cat([x[:, 1:, :], mask_tokens], dim=1)  # no cls token
        else:
            x_ = torch.cat([x[:, :, :], mask_tokens], dim=1)
        x_ = torch.gather(x_,
                          dim=1,
                          index=ids_restore.unsqueeze(-1).repeat(
                              1, 1, x.shape[2]))  # unshuffle
        if self.use_cls:
            x = torch.cat([x[:, :1, :], x_], dim=1)  # append cls token
        else:
            x = x_
        t = x.shape[1]

        x = x + self.pos_embed[:, :t, :]
        x = self.pos_drop(x)
        x = self.blocks(x)
        x = self.norm(x)
        return x

    def forward(self, x, restore_idxs):
        x = self.forward_features(x, restore_idxs)
        x = self.outputlayer(x)
        return x


class AudioTransformerMAE(nn.Module):
    """Full MAE: encoder on visible patches, decoder reconstructs the rest."""

    def __init__(self,
                 encoder: AudioTransformerMAE_Encoder,
                 decoder: AudioTransformerMAE_Decoder,
                 loss_fn: Optional[torch.nn.Module] = None):
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
        # Unfold with stride == kernel gives non-overlapping target patches
        # matching the encoder's patch grid.
        self.unfold = nn.Unfold(
            kernel_size=self.encoder.patch_embed.patch_size,
            stride=self.encoder.patch_embed.patch_size)
        self.loss_fn = MAELoss() if loss_fn is None else loss_fn

    def forward(self,
                x: torch.Tensor,
                mask_ratio: float = 0.75,
                return_loss: bool = False):
        """x: raw waveform batch. Returns loss (if return_loss) or
        (pred, targets, mask)."""
        latent, mask, restore_ids = self.encoder(x, mask_ratio=mask_ratio)
        pred = self.decoder(latent, restore_ids)
        # Targets are mel patches computed in fp32 (front_end is unstable in
        # fp16 — see encoder.forward_to_spec).
        with autocast(enabled=False):
            targets = self.encoder.front_end(x)
            targets = self.patchify(targets)
        if return_loss:
            return self.loss_fn(pred, targets, mask)
        return pred, targets, mask

    def patchify(self, x):
        # (B, F, T) mel -> (B, num_patches, patch_elems), matching pred.
        return self.unfold(x.unsqueeze(1)).transpose(-2, -1)


def dasheng_base(**kwargs):
    """Build the base Dasheng MAE (768-dim, 12-layer encoder)."""
    encoder_kwargs = dict(embed_dim=768,
                          depth=12,
                          num_heads=12,
                          target_length=1008,
                          patch_size=[64, 4],
                          patch_stride=[64, 4])
    # Caller kwargs override the defaults, then everything is forwarded.
    encoder_kwargs.update(
        (k, kwargs[k]) for k in set(kwargs).intersection(encoder_kwargs))
    encoder_kwargs = {**encoder_kwargs, **kwargs}
    encoder = AudioTransformerMAE_Encoder(**encoder_kwargs)

    decoder_kwargs = dict(embed_dim=512,
                          depth=8,
                          num_heads=16,
                          input_dim=encoder_kwargs['embed_dim'],
                          outputdim=encoder.patch_embed.patch_size[0] *
                          encoder.patch_embed.patch_size[1],
                          num_patches=encoder.patch_embed.num_patches)
    decoder = AudioTransformerMAE_Decoder(**decoder_kwargs)
    return
AudioTransformerMAE(encoder, decoder) + + +def dasheng_06B(**kwargs): + encoder_kwargs = dict( + patch_size=[64, 4], + patch_stride=[64, 4], + embed_dim=1536, + depth=24, + num_heads=24, + mlp_ratio=4, + ) + encoder_kwargs.update( + (k, kwargs[k]) for k in set(kwargs).intersection(encoder_kwargs)) + encoder_kwargs = {**encoder_kwargs, **kwargs} + encoder = AudioTransformerMAE_Encoder(**encoder_kwargs) + + decoder_kwargs = dict(embed_dim=512, + depth=8, + num_heads=16, + input_dim=encoder_kwargs['embed_dim'], + outputdim=encoder.patch_embed.patch_size[0] * + encoder.patch_embed.patch_size[1], + num_patches=encoder.patch_embed.num_patches) + decoder = AudioTransformerMAE_Decoder(**decoder_kwargs) + return AudioTransformerMAE(encoder, decoder) + + +def dasheng_12B(**kwargs): + encoder_kwargs = dict( + patch_size=[64, 4], + patch_stride=[64, 4], + embed_dim=1536, + depth=40, + num_heads=24, + mlp_ratio=4, + ) + encoder_kwargs.update( + (k, kwargs[k]) for k in set(kwargs).intersection(encoder_kwargs)) + encoder_kwargs = {**encoder_kwargs, **kwargs} + encoder = AudioTransformerMAE_Encoder(**encoder_kwargs) + + decoder_kwargs = dict(embed_dim=768, + depth=8, + num_heads=24, + input_dim=encoder_kwargs['embed_dim'], + outputdim=encoder.patch_embed.patch_size[0] * + encoder.patch_embed.patch_size[1], + num_patches=encoder.patch_embed.num_patches) + decoder = AudioTransformerMAE_Decoder(**decoder_kwargs) + return AudioTransformerMAE(encoder, decoder) diff --git a/src/models/sed_decoder.py b/src/models/sed_decoder.py new file mode 100644 index 0000000000000000000000000000000000000000..2c86eb253aa8a0f898af58cd961c8b759960439b --- /dev/null +++ b/src/models/sed_decoder.py @@ -0,0 +1,215 @@ +import torch +import torch.nn as nn +import copy +from functools import partial +from .dasheng import LayerScale, Attention, Mlp + + +class Decoder_Block(nn.Module): + def __init__( + self, + dim, + num_heads, + mlp_ratio=4., + qkv_bias=False, + drop=0., + attn_drop=0., + init_values=None, + 
            act_layer=nn.GELU,
            norm_layer=nn.LayerNorm,
            attention_type='Attention',
            fusion='adaln',
    ):
        super().__init__()
        self.norm1 = norm_layer(dim)
        self.attn = Attention(dim,
                              num_heads=num_heads,
                              qkv_bias=qkv_bias,
                              attn_drop=attn_drop,
                              proj_drop=drop)
        self.ls1 = LayerScale(dim, init_values=init_values) if init_values else nn.Identity()

        self.norm2 = norm_layer(dim)
        self.mlp = Mlp(in_features=dim,
                       hidden_features=int(dim * mlp_ratio),
                       act_layer=act_layer,
                       drop=drop)
        self.ls2 = LayerScale(
            dim, init_values=init_values) if init_values else nn.Identity()

        # fusion='adaln': condition the block on a class embedding via
        # adaptive LayerNorm (scale/shift/gate per sub-layer, 6 chunks).
        self.fusion = fusion
        if fusion == 'adaln':
            self.adaln = nn.Linear(dim, 6 * dim, bias=True)

    def forward(self, x, c=None):
        """x: (B, T, C) tokens; c: (B, C) conditioning embedding
        (required when fusion == 'adaln', ignored otherwise)."""
        B, T, C = x.shape

        if self.fusion == 'adaln':
            ada = self.adaln(c)
            (scale_msa, gate_msa, shift_msa,
             scale_mlp, gate_mlp, shift_mlp) = ada.reshape(B, 6, -1).chunk(6, dim=1)
            # self attention
            x_norm = self.norm1(x) * (1 + scale_msa) + shift_msa
            # NOTE(review): `tanh(1 - gate)` with the zero-initialized adaln
            # (see Decoder.init_weights) yields tanh(1) ~= 0.76 at init, i.e.
            # the residual branch starts mostly *open*, unlike DiT-style
            # zero-gating. Presumably intentional — do not change without
            # retraining/confirming against released checkpoints.
            tanh_gate_msa = torch.tanh(1 - gate_msa)
            x = x + tanh_gate_msa * self.ls1(self.attn(x_norm))
            # mlp
            x_norm = self.norm2(x) * (1 + scale_mlp) + shift_mlp
            tanh_gate_mlp = torch.tanh(1 - gate_mlp)
            x = x + tanh_gate_mlp * self.ls2(self.mlp(x_norm))
        else:
            # Plain pre-norm transformer block; conditioning (if any) is
            # injected elsewhere (e.g. as a prepended token by the caller).
            x = x + self.ls1(self.attn(self.norm1(x)))
            x = x + self.ls2(self.mlp(self.norm2(x)))
        return x


class Decoder(nn.Module):
    """Stack of Decoder_Blocks that turns frame tokens + a class embedding
    into per-frame sound-event-detection logits."""

    def __init__(
            self,
            embed_dim: int = 768,
            depth: int = 2,
            num_heads=8,
            mlp_ratio=4.,
            qkv_bias=True,
            drop_rate=0.,
            attn_drop_rate=0.,
            cls_dim: int = 512,
            fusion: str = 'adaln',
            **kwargs
    ):
        super().__init__()

        norm_layer = partial(nn.LayerNorm, eps=1e-6)
        act_layer = nn.GELU
        init_values = None

        block_function = Decoder_Block
        self.blocks = nn.ModuleList([
            block_function(
                dim=embed_dim,
                num_heads=num_heads,
                mlp_ratio=mlp_ratio,
                qkv_bias=qkv_bias,
                init_values=init_values,
                drop=drop_rate,
                attn_drop=attn_drop_rate,
                norm_layer=norm_layer,
                act_layer=act_layer,
attention_type="Attention", + fusion=fusion, + ) for _ in range(depth) + ]) + + self.fusion = fusion + cls_out = embed_dim + + self.cls_embed = nn.Sequential( + nn.Linear(cls_dim, embed_dim, bias=True), + nn.SiLU(), + nn.Linear(embed_dim, cls_out, bias=True),) + + self.sed_head = nn.Linear(embed_dim, 1, bias=True) + self.norm = norm_layer(embed_dim) + self.apply(self.init_weights) + # self.energy_head = nn.Linear(embed_dim, 1, bias=True) + + def init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, 0) + nn.init.constant_(module.weight, 1.0) + + if self.fusion == 'adaln': + for block in self.blocks: + nn.init.constant_(block.adaln.weight, 0) + nn.init.constant_(block.adaln.bias, 0) + + def forward(self, x, cls): + B, L, C = x.shape + _, N, D = cls.shape + # Expand x to shape (B, N, L, C) + x = x.unsqueeze(1).expand(-1, N, -1, -1) + # Reshape both tensors to (B*N, L, C) for processing + x = x.reshape(B * N, L, C) + cls = cls.reshape(B * N, D) + + cls = self.cls_embed(cls) + + shift = 0 + if self.fusion == 'adaln': + pass + elif self.fusion == 'token': + cls = cls.unsqueeze(1) + x = torch.cat([cls, x], dim=1) + shift = 1 + else: + raise NotImplementedError("unknown fusion") + + for block in self.blocks: + x = block(x, cls) + + x = x[:, shift:] + + x = self.norm(x) + + strong = self.sed_head(x) + return strong.transpose(1, 2) + + +class TSED_Wrapper(nn.Module): + def __init__( + self, + encoder, + decoder, + ft_blocks=[11, 12], + frozen_encoder=True + ): + super().__init__() + + self.encoder = encoder + self.decoder = decoder + + print("Loading Dasheng weights for decoders...") + for i, blk_idx in enumerate(ft_blocks): + decoder_block = self.decoder.blocks[i] + encoder_block = self.encoder.blocks[blk_idx] + state_dict = copy.deepcopy(encoder_block.state_dict()) + missing, 
unexpected = decoder_block.load_state_dict(state_dict, strict=False) + if missing or unexpected: + print(f"Block {blk_idx}:") + if missing: + print(f"โœ… Expected missing keys: {missing}") + if unexpected: + print(f" Unexpected keys: {unexpected}") + # Copy norm_layer weights + self.decoder.norm.load_state_dict(copy.deepcopy(self.encoder.norm.state_dict())) + + # Remove the injected blocks and norm_layer from the encoder + for blk_idx in sorted(ft_blocks, reverse=True): + # Reverse to avoid index shift issues + del self.encoder.blocks[blk_idx] + # Remove encoder norm layer + del self.encoder.norm + + self.frozen_encoder = frozen_encoder + if frozen_encoder: + for param in self.encoder.parameters(): + param.requires_grad = False + + def forward_to_spec(self, x): + return self.encoder.forward_to_spec(x) + + def forward_encoder(self, x): + if self.frozen_encoder: + with torch.no_grad(): + x = self.encoder(x) + else: + x = self.encoder(x) + return x + + def forward(self, x, cls): + x = self.forward_encoder(x) + pred = self.decoder(x, cls) + return pred diff --git a/src/models/transformer.py b/src/models/transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..e327c1cc74ceeaa2a6c8ea55dd88cdf76f69cc3f --- /dev/null +++ b/src/models/transformer.py @@ -0,0 +1,214 @@ +from einops import rearrange +from torch.cuda.amp import autocast +from functools import partial +from typing import Optional, Tuple +import torchaudio.transforms as audio_transforms +from einops.layers.torch import Rearrange + +import torch +import torch.nn as nn +from .dasheng import AudioPatchEmbed, Block + +# if hasattr(nn.functional, 'scaled_dot_product_attention'): +# ATTENTION_MODE = 'flash' +# else: +# ATTENTION_MODE = 'math' +# print(f'attention mode is {ATTENTION_MODE}') + + +class Dasheng_Encoder(nn.Module): + def __init__(self, + patch_size: Tuple[int, int] = (64, 4), + patch_stride: Tuple[int, int] = (64, 4), + embed_dim: int = 768, + depth: int = 12, + num_heads=8, + 
mlp_ratio=4., + qkv_bias=True, + drop_rate=0., + attn_drop_rate=0., + norm_layer=None, + act_layer=None, + init_values=None, + target_length=1008, + pooling='mean', + time_patch_out: Optional[float] = None, + freq_patch_out: Optional[float] = None, + block_type='Block', + attention_type='Attention', + eval_avg='cat', + n_fft: int = 512, + n_mels: int = 64, + hop_size: int = 160, + win_size: int = 512, + f_min: int = 0, + f_max: int = 8000, + center: bool = True, + **kwargs): + super().__init__() + self.pooling = pooling + self.embed_dim = embed_dim + self.patch_stride = patch_stride + self.patch_size = patch_size + self.n_mels = n_mels + self.eval_avg = eval_avg + self.time_patch_out = time_patch_out + self.freq_patch_out = freq_patch_out + + self.front_end = nn.Sequential( + audio_transforms.MelSpectrogram(f_min=f_min, + sample_rate=16000, + win_length=win_size, + center=center, + n_fft=n_fft, + f_max=f_max, + hop_length=hop_size, + n_mels=self.n_mels, + power=1)) + + self.to_db = audio_transforms.AmplitudeToDB(stype='magnitude', top_db=kwargs.get('top_db', 120)) + + self.init_bn = nn.Sequential( + Rearrange('b c f t -> b f c t'), + nn.BatchNorm2d(self.n_mels, momentum=0.01), + Rearrange('b f c t -> b c f t')) + + self.target_length = target_length + self.patch_embed = AudioPatchEmbed(input_size=(self.n_mels, + target_length), + embed_dim=self.embed_dim, + patch_size=self.patch_size, + flatten=False, + patch_stride=self.patch_stride) + self.num_patches = self.patch_embed.num_patches + + if pooling == 'token': + self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim)) + self.token_pos_embed = nn.Parameter( + torch.randn(1, embed_dim) * .02) + + self.time_pos_embed = nn.Parameter( + torch.randn(1, embed_dim, 1, self.patch_embed.grid_size[1]) * .02) + self.freq_pos_embed = nn.Parameter( + torch.randn(1, embed_dim, self.patch_embed.grid_size[0], 1) * .02) + + norm_layer = norm_layer or partial(nn.LayerNorm, eps=1e-6) + act_layer = act_layer or nn.GELU + 
self.pos_drop = nn.Dropout(p=drop_rate) + self.blocks = nn.Sequential(*[ + Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + init_values=init_values, + drop=drop_rate, + attn_drop=attn_drop_rate, + norm_layer=norm_layer, + act_layer=act_layer, + attention_type=attention_type, + ) for _ in range(depth) + ]) + self.norm = norm_layer(embed_dim) + self.apply(self.init_weights) + if hasattr(self, 'cls_token') and self.cls_token is not None: + nn.init.normal_(self.cls_token, std=1e-6) + # group_masking = kwargs.get('group_masking', False) + # if isinstance(group_masking, bool): + # if group_masking is True: + # self.masking_func = self.random_masking_group + # else: + # self.masking_func = self.random_masking + # elif isinstance(group_masking, int): + # self.masking_func = partial(self.random_masking_group, + # group_factor=group_masking) + # @torch.jit.ignore + # def no_weight_decay(self): + # return { + # 'time_pos_embed', 'cls_token', 'freq_pos_embed', 'token_pos_embed' + # } + + def init_weights(self, module): + if isinstance(module, nn.Linear): + torch.nn.init.xavier_uniform_(module.weight) + if module.bias is not None: + nn.init.zeros_(module.bias) + elif isinstance(module, nn.LayerNorm): + nn.init.constant_(module.bias, 0) + nn.init.constant_(module.weight, 1.0) + + def forward_features(self, x): + x = self.patch_embed(x) + b, c, f, t = x.shape + x = x + self.time_pos_embed[:, :, :, :t] + x = x + self.freq_pos_embed[:, :, :, :] # Just for sin pos embed + x = rearrange(x, 'b c f t -> b (f t) c') + # x, mask, ids_restore = self.random_masking(x, mask_ratio) + # x, mask, ids_restore = self.masking_func(x, mask_ratio) + if self.pooling == 'token': + cls_token = self.cls_token.expand(x.shape[0], -1, -1) + cls_token = cls_token + self.token_pos_embed[:, :] + x = torch.cat((cls_token, x), dim=1) + x = self.pos_drop(x) + for block in self.blocks: + x = block(x) + # x = self.norm(x) + return x + + def load_state_dict(self, 
state_dict, **kwargs): + if 'time_pos_embed' in state_dict and self.time_pos_embed.shape != state_dict[ + 'time_pos_embed'].shape: + print("Positional Embedding shape not the same with model, resizing!") + self.change_pos_embedding(state_dict) + # Call the parent class method and capture the missing/unexpected keys + missing_keys, unexpected_keys = super().load_state_dict(state_dict, strict=False, **kwargs) + # Print missing and unexpected keys + if missing_keys: + print("Missing keys:", missing_keys) + if unexpected_keys: + print("Unexpected keys:", unexpected_keys) + + def change_pos_embedding(self, state_dict): + target_time_pos_embed_length = self.time_pos_embed.shape[-1] + target_freq_pos_embed_length = self.freq_pos_embed.shape[-2] + + pretrained_time_pos_embed = state_dict['time_pos_embed'] + pretrained_freq_pos_embed = state_dict['freq_pos_embed'] + + if target_freq_pos_embed_length <= pretrained_time_pos_embed.shape[-1]: + state_dict['time_pos_embed'] = pretrained_time_pos_embed[ + ..., :target_time_pos_embed_length] + else: + state_dict['time_pos_embed'] = torch.nn.functional.interpolate( + pretrained_time_pos_embed, + size=(1, target_time_pos_embed_length), + align_corners=False, + mode='bilinear') + if target_freq_pos_embed_length <= pretrained_freq_pos_embed.shape[-2]: + state_dict[ + 'freq_pos_embed'] = pretrained_freq_pos_embed[:, :, : + target_freq_pos_embed_length, :] + else: + state_dict['freq_pos_embed'] = torch.nn.functional.interpolate( + pretrained_freq_pos_embed, + size=(target_freq_pos_embed_length, 1), + align_corners=False, + mode='bilinear') + + def forward_to_spec(self, x): + # Do not use fp16 for feature extraction, that is likely to get nan + with autocast(enabled=False): + X = self.front_end(x) + # X = rearrange(X, 'b f t -> b 1 f t') + # X = self.init_bn(X) + return X + + def forward(self, x): + # x = self.forward_to_spec(x) + # print(x.shape) + with autocast(enabled=False): + x = self.to_db(x) + x = rearrange(x, 'b f t -> b 1 f t') 
+ x = self.init_bn(x) + x = self.forward_features(x) + return x \ No newline at end of file diff --git a/src/prepare_clap.py b/src/prepare_clap.py new file mode 100644 index 0000000000000000000000000000000000000000..d44c6daffa259eb74c37a469b490cfab5b69699b --- /dev/null +++ b/src/prepare_clap.py @@ -0,0 +1,39 @@ +import os +import pandas as pd +import torch +from transformers import AutoTokenizer, ClapTextModelWithProjection + +if __name__ == '__main__': + # Load the CLAP model and tokenizer + model = ClapTextModelWithProjection.from_pretrained("laion/clap-htsat-unfused") + model.eval() + tokenizer = AutoTokenizer.from_pretrained("laion/clap-htsat-unfused") + + # Path to the input CSV file + input_csv_path = '/home/user/SSD/Dataset/Audioset_SL/no_rule_all/label_to_id.csv' + output_path = 'clap_embedding/' # Replace with your desired output folder path + + # Create the output folder if it doesn't exist + os.makedirs(output_path, exist_ok=True) + + # Read the CSV file + df = pd.read_csv(input_csv_path) + + # Get unique event labels + events = df['label'].unique() + + with torch.no_grad(): # Disable gradient computation + # Process each event + for event in events: + text = event.replace('_', ' ') # Replace underscores with spaces + text = f'The sound of {text}' + print(text) + inputs = tokenizer([text], padding=True, return_tensors="pt") + outputs = model(**inputs) + text_embeds = outputs.text_embeds + + # Save the embeddings to a .pt file + output_file = os.path.join(output_path, f"{event}.pt") + torch.save(text_embeds, output_file) + + print("Embedding extraction and saving complete!") diff --git a/src/test.py b/src/test.py new file mode 100644 index 0000000000000000000000000000000000000000..1f6c313f46c9bec6fc0e8b39d417b2fb7bd49805 --- /dev/null +++ b/src/test.py @@ -0,0 +1,140 @@ +import random +import argparse +import os +import time +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + +import torch +import torch.nn as nn +import 
torch.nn.functional as F +from torch.utils.data import DataLoader + +from accelerate import Accelerator + +from models.transformer import Dasheng_Encoder +from models.sed_decoder import Decoder, TSED_Wrapper +from dataset.tsed import TSED_AS +from dataset.tsed_val import TSED_Val +from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler +from utils.data_aug import frame_shift, mixup, time_mask, feature_transformation +from val import val_psds + + +def parse_args(): + parser = argparse.ArgumentParser() + + # Config settings + parser.add_argument('--config-name', type=str, default='configs/model.yml') + parser.add_argument('--ckpt', type=str, default='20000.pt') + + # Training settings + parser.add_argument("--amp", type=str, default='fp16') + parser.add_argument('--epochs', type=int, default=20) + parser.add_argument('--num-workers', type=int, default=8) + parser.add_argument('--num-threads', type=int, default=1) + parser.add_argument('--eval-every-step', type=int, default=5000) + parser.add_argument('--save-every-step', type=int, default=5000) + # parser.add_argument('--dataloader', type=str, default='EACaps') + parser.add_argument("--logit-normal-indices", type=bool, default=False) + + # Log and random seed + parser.add_argument('--random-seed', type=int, default=2024) + parser.add_argument('--log-step', type=int, default=100) + parser.add_argument('--log-dir', type=str, default='../logs/') + parser.add_argument('--save-dir', type=str, default='../ckpts/') + return parser.parse_args() + + +def setup_directories(args, params): + args.log_dir = os.path.join(args.log_dir, params['model_name']) + '/' + args.save_dir = os.path.join(args.save_dir, params['model_name']) + '/' + + os.makedirs(args.log_dir, exist_ok=True) + os.makedirs(args.save_dir, exist_ok=True) + + +def set_device(args): + torch.set_num_threads(args.num_threads) + if torch.cuda.is_available(): + args.device = 'cuda' + torch.cuda.manual_seed_all(args.random_seed) + 
torch.backends.cuda.matmul.allow_tf32 = True + if torch.backends.cudnn.is_available(): + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + else: + args.device = 'cpu' + + +if __name__ == '__main__': + args = parse_args() + params = load_yaml_with_includes(args.config_name) + set_device(args) + setup_directories(args, params) + + random.seed(args.random_seed) + torch.manual_seed(args.random_seed) + + # use accelerator for multi-gpu training + accelerator = Accelerator(mixed_precision=args.amp, + gradient_accumulation_steps=params['opt']['accumulation_steps'], + step_scheduler_with_optimizer=False) + + train_set = TSED_AS(**params['data']['train_data']) + train_loader = DataLoader(train_set, batch_size=params['opt']['batch_size'], num_workers=args.num_workers) + + # val_set = TSED_Val(**params['data']['val_data']) + # val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False) + + test_set = TSED_Val(**params['data']['test_data']) + test_loader = DataLoader(test_set, num_workers=0, batch_size=1, shuffle=False) + + encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device) + pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1' + dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + model_parmeters = dump['model'] + # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1' + # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + # model_parmeters = dump + encoder.load_state_dict(model_parmeters) + + decoder = Decoder(**params['decoder']).to(accelerator.device) + + model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder']) + print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M") + + model.load_state_dict(torch.load(args.ckpt, map_location='cpu')['model']) + + if params['frozen_encoder']: + optimizer = 
torch.optim.AdamW( + model.parameters(), + lr=params['opt']['learning_rate'], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + else: + optimizer = torch.optim.AdamW( + [ + {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']}, + {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']} + ], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + + lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler']) + + strong_loss_func = nn.BCEWithLogitsLoss() + + model, optimizer, lr_scheduler, train_loader, test_loader = accelerator.prepare( + model, optimizer, lr_scheduler, train_loader, test_loader) + + global_step = 0.0 + losses = 0.0 + + if accelerator.is_main_process: + model_module = model.module if hasattr(model, 'module') else model + val_psds(model_module, test_loader, params, epoch='test_full', split='test', + save_path=args.log_dir + 'output/', device=accelerator.device) \ No newline at end of file diff --git a/src/train.py b/src/train.py new file mode 100644 index 0000000000000000000000000000000000000000..19d225165d77c205136b85e7ad0c3dec32b93ac2 --- /dev/null +++ b/src/train.py @@ -0,0 +1,208 @@ +import random +import argparse +import os +import time +import numpy as np +import matplotlib.pyplot as plt +from tqdm import tqdm + +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.utils.data import DataLoader + +from accelerate import Accelerator + +from models.transformer import Dasheng_Encoder +from models.sed_decoder import Decoder, TSED_Wrapper +from dataset.tsed import TSED_AS +from dataset.tsed_val import TSED_Val +from utils import load_yaml_with_includes, get_lr_scheduler, ConcatDatasetBatchSampler +from utils.data_aug import frame_shift, mixup, time_mask, 
def set_device(args):
    """Select cuda/cpu, seed CUDA RNGs and configure deterministic backends.

    Mutates `args` in place, setting `args.device` to 'cuda' or 'cpu'.
    """
    torch.set_num_threads(args.num_threads)
    if not torch.cuda.is_available():
        args.device = 'cpu'
        return
    args.device = 'cuda'
    torch.cuda.manual_seed_all(args.random_seed)
    # TF32 matmul is acceptable: training already runs mixed precision.
    torch.backends.cuda.matmul.allow_tf32 = True
    if torch.backends.cudnn.is_available():
        # Favor reproducibility over cudnn autotuning.
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False
Accelerator(mixed_precision=args.amp, + gradient_accumulation_steps=params['opt']['accumulation_steps'], + step_scheduler_with_optimizer=False) + + train_set = TSED_AS(**params['data']['train_data']) + train_loader = DataLoader(train_set, shuffle=True, + batch_size=params['opt']['batch_size'], + num_workers=args.num_workers) + + val_set = TSED_Val(**params['data']['val_data']) + val_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False) + + # test_set = TSED_Val(**params['data']['test_data']) + # test_loader = DataLoader(val_set, num_workers=0, batch_size=1, shuffle=False) + + encoder = Dasheng_Encoder(**params['encoder']).to(accelerator.device) + pretrained_url = 'https://zenodo.org/records/11511780/files/dasheng_base.pt?download=1' + dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + model_parmeters = dump['model'] + # pretrained_url = 'https://zenodo.org/records/13315686/files/dasheng_audioset_mAP497.pt?download=1' + # dump = torch.hub.load_state_dict_from_url(pretrained_url, map_location='cpu') + # model_parmeters = dump + encoder.load_state_dict(model_parmeters) + + decoder = Decoder(**params['decoder']).to(accelerator.device) + + model = TSED_Wrapper(encoder, decoder, params['ft_blocks'], params['frozen_encoder']) + print(f"Trainable Parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad) / 1e6:.2f}M") + + # model.load_state_dict(torch.load('../ckpts/TSED_AS_filter/20000.0.pt', map_location='cpu')['model']) + + if params['frozen_encoder']: + optimizer = torch.optim.AdamW( + model.parameters(), + lr=params['opt']['learning_rate'], + weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + else: + optimizer = torch.optim.AdamW( + [ + {'params': model.encoder.parameters(), 'lr': 0.1 * params['opt']['learning_rate']}, + {'params': model.decoder.parameters(), 'lr': params['opt']['learning_rate']} + ], + 
weight_decay=params['opt']['weight_decay'], + betas=(params['opt']['beta1'], params['opt']['beta2']), + eps=params['opt']['adam_epsilon']) + + lr_scheduler = get_lr_scheduler(optimizer, 'customized', **params['opt']['lr_scheduler']) + + strong_loss_func = nn.BCEWithLogitsLoss() + + model, optimizer, lr_scheduler, train_loader, val_loader = accelerator.prepare( + model, optimizer, lr_scheduler, train_loader, val_loader) + + global_step = 0.0 + losses = 0.0 + + if accelerator.is_main_process: + model_module = model.module if hasattr(model, 'module') else model + val_psds(model_module, val_loader, params, epoch='debug', split='val', + save_path=args.log_dir + 'output/', device=accelerator.device) + + for epoch in range(args.epochs): + model.train() + for step, batch in enumerate(tqdm(train_loader)): + with accelerator.accumulate(model): + audio, cls, label, _ = batch + mel = model.forward_to_spec(audio) + + # data aug + mel, label = frame_shift(mel, label, params['net_pooling']) + mel, label = time_mask(mel, label, params["net_pooling"], + mask_ratios=params['data_aug']["time_mask_ratios"]) + mel, _ = feature_transformation(mel, **params['data_aug']["transform"]) + + strong_pred = model(mel, cls) + + B, N, L = label.shape + label = label.reshape(B * N, L) + label = label.unsqueeze(1) + + loss = strong_loss_func(strong_pred, label) + + accelerator.backward(loss) + + # clip grad up + if accelerator.sync_gradients: + if 'grad_clip' in params['opt'] and params['opt']['grad_clip'] > 0: + accelerator.clip_grad_norm_(model.parameters(), + max_norm=params['opt']['grad_clip']) + optimizer.step() + lr_scheduler.step() + optimizer.zero_grad() + + global_step += 1/params['opt']['accumulation_steps'] + losses += loss.item()/params['opt']['accumulation_steps'] + + if accelerator.is_main_process: + if global_step % args.log_step == 0: + current_time = time.asctime(time.localtime(time.time())) + epoch_info = f'Epoch: [{epoch + 1}][{args.epochs}]' + batch_info = f'Global Step: 
{global_step}' + loss_info = f'Loss: {losses / args.log_step:.6f}' + + # Extract the learning rate from the optimizer + lr = optimizer.param_groups[0]['lr'] + lr_info = f'Learning Rate: {lr:.6f}' + + log_message = f'{current_time}\n{epoch_info} {batch_info} {loss_info} {lr_info}\n' + + with open(args.log_dir + 'log.txt', mode='a') as n: + n.write(log_message) + + losses = 0.0 + + # check performance + if (global_step + 1) % args.eval_every_step == 0: + if accelerator.is_main_process: + model_module = model.module if hasattr(model, 'module') else model + val_psds(model_module, val_loader, params, epoch=global_step+1, split='val', + save_path=args.log_dir + 'output/', device=accelerator.device) + # save model + unwrapped_model = accelerator.unwrap_model(model) + accelerator.save({ + "model": model.state_dict(), + }, args.save_dir + str(global_step+1) + '.pt') + accelerator.wait_for_everyone() + model.train() diff --git a/src/utils/.ipynb_checkpoints/utils-checkpoint.py b/src/utils/.ipynb_checkpoints/utils-checkpoint.py new file mode 100644 index 0000000000000000000000000000000000000000..0ce0fb22d47d396e8605e2a88627ae769862ee54 --- /dev/null +++ b/src/utils/.ipynb_checkpoints/utils-checkpoint.py @@ -0,0 +1,85 @@ +import torch +import numpy as np +import yaml +import os +from torch.utils.data import Sampler + + +def load_yaml_with_includes(yaml_file): + def loader_with_include(loader, node): + # Load the included file + include_path = os.path.join(os.path.dirname(yaml_file), loader.construct_scalar(node)) + with open(include_path, 'r') as f: + return yaml.load(f, Loader=yaml.FullLoader) + + yaml.add_constructor('!include', loader_with_include, Loader=yaml.FullLoader) + + with open(yaml_file, 'r') as f: + return yaml.load(f, Loader=yaml.FullLoader) + + +def customized_lr_scheduler(optimizer, warmup_steps=10000, decay_steps=1e6, end_factor=1e-4): + from torch.optim.lr_scheduler import LinearLR, SequentialLR + warmup_scheduler = LinearLR(optimizer, + start_factor=min(1 / 
class ConcatDatasetBatchSampler(Sampler):
    """Batch sampler for a ConcatDataset: each yielded batch contains
    batch_sizes[i] indices drawn from sampler i, offset into the
    concatenated index space. Iteration stops when the smallest
    sampler/batch-size pair is exhausted.
    """

    def __init__(self, samplers, batch_sizes, epoch=0):
        self.batch_sizes = batch_sizes
        self.samplers = samplers
        # offsets[i] = start index of dataset i inside the concatenated dataset.
        self.offsets = [0] + np.cumsum([len(x) for x in self.samplers]).tolist()[:-1]

        self.epoch = epoch
        self.set_epoch(self.epoch)

    def _iter_one_dataset(self, c_batch_size, c_sampler, c_offset):
        """Yield full batches of offset indices from a single sampler."""
        batch = []
        for idx in c_sampler:
            batch.append(c_offset + idx)
            if len(batch) == c_batch_size:
                yield batch
                # Bug fix: reset the batch after yielding; previously the list
                # kept growing, so only the first batch was ever produced.
                batch = []

    def set_epoch(self, epoch):
        # Propagate the epoch to distributed samplers so shuffling differs per epoch.
        if hasattr(self.samplers[0], "epoch"):
            for s in self.samplers:
                s.set_epoch(epoch)

    def __iter__(self):
        iterators = [iter(i) for i in self.samplers]
        tot_batch = []
        for b_num in range(len(self)):
            for samp_idx in range(len(self.samplers)):
                c_batch = []
                while len(c_batch) < self.batch_sizes[samp_idx]:
                    c_batch.append(self.offsets[samp_idx] + next(iterators[samp_idx]))
                tot_batch.extend(c_batch)
            yield tot_batch
            tot_batch = []

    def __len__(self):
        # Number of batches is limited by the dataset that runs out first.
        min_len = float("inf")
        for idx, sampler in enumerate(self.samplers):
            c_len = (len(sampler)) // self.batch_sizes[idx]
            min_len = min(c_len, min_len)
        return min_len
def frame_shift(features, label=None, net_pooling=None):
    """Randomly roll each clip along time (Gaussian shift, sigma=90 samples).

    When `label` is given, it is rolled by the same shift scaled down by
    `net_pooling` (floored toward zero for negative shifts). Draws exactly
    one random shift per batch item, in index order.
    """
    n_clips = features.shape[0]
    rolled_feats = []
    rolled_labels = []
    for i in range(n_clips):
        shift = int(random.gauss(0, 90))
        rolled_feats.append(torch.roll(features[i], shift, dims=-1))
        if label is not None:
            # Scale the sample-level shift to label frames.
            lbl_shift = -abs(shift) // net_pooling if shift < 0 else shift // net_pooling
            rolled_labels.append(torch.roll(label[i], lbl_shift, dims=-1))
    if label is None:
        return torch.stack(rolled_feats)
    return torch.stack(rolled_feats), torch.stack(rolled_labels)
def time_mask(features, labels=None, net_pooling=None, mask_ratios=(10, 20)):
    """Zero a random contiguous time span (SpecAugment-style time masking).

    With `labels`, the span is drawn in label-frame units and scaled by
    `net_pooling` on the feature time axis; both tensors are masked in place.
    Without labels, the span is drawn directly on the feature time axis.
    Returns the masked tensor(s).
    """
    reference = features if labels is None else labels
    n_frame = reference.shape[-1]
    # Mask width lies in [n_frame/mask_ratios[1], n_frame/mask_ratios[0]) frames.
    width = torch.randint(low=int(n_frame / mask_ratios[1]),
                          high=int(n_frame / mask_ratios[0]), size=(1,))
    start = torch.randint(low=0, high=n_frame - width[0], size=(1,))
    if labels is None:
        features[:, :, start:(start + width)] = 0
        return features
    features[:, :, start * net_pooling:(start + width) * net_pooling] = 0
    labels[:, :, start:start + width] = 0
    return features, labels
filter_type=filter_type) + if choice[1]: + features_temp = freq_mask(features_temp, mask_ratio=freq_mask_ratio) + if choice[2]: + features_temp = add_noise(features_temp, snrs=noise_snrs) + feature_list.append(features_temp) + return feature_list + elif n_transform == 1: + if choice[0]: + features = filt_aug(features, db_range=filter_db_range, n_band=filter_bands, + min_bw=filter_minimum_bandwidth, filter_type=filter_type) + if choice[1]: + features = freq_mask(features, mask_ratio=freq_mask_ratio) + if choice[2]: + features = add_noise(features, snrs=noise_snrs) + return [features, features] + else: + return [features, features] + + +def filt_aug(features, db_range=[-6, 6], n_band=[3, 6], min_bw=6, filter_type="linear"): + # this is updated FilterAugment algorithm used for ICASSP 2022 + if not isinstance(filter_type, str): + if torch.rand(1).item() < filter_type: + filter_type = "step" + n_band = [2, 5] + min_bw = 4 + else: + filter_type = "linear" + n_band = [3, 6] + min_bw = 6 + + batch_size, n_freq_bin, _ = features.shape + n_freq_band = torch.randint(low=n_band[0], high=n_band[1], size=(1,)).item() # [low, high) + if n_freq_band > 1: + while n_freq_bin - n_freq_band * min_bw + 1 < 0: + min_bw -= 1 + band_bndry_freqs = torch.sort(torch.randint(0, n_freq_bin - n_freq_band * min_bw + 1, + (n_freq_band - 1,)))[0] + \ + torch.arange(1, n_freq_band) * min_bw + band_bndry_freqs = torch.cat((torch.tensor([0]), band_bndry_freqs, torch.tensor([n_freq_bin]))) + + if filter_type == "step": + band_factors = torch.rand((batch_size, n_freq_band)).to(features) * (db_range[1] - db_range[0]) + db_range[0] + band_factors = 10 ** (band_factors / 20) + + freq_filt = torch.ones((batch_size, n_freq_bin, 1)).to(features) + for i in range(n_freq_band): + freq_filt[:, band_bndry_freqs[i]:band_bndry_freqs[i + 1], :] = band_factors[:, i].unsqueeze(-1).unsqueeze(-1) + + elif filter_type == "linear": + band_factors = torch.rand((batch_size, n_freq_band + 1)).to(features) * (db_range[1] - 
def freq_mask(features, mask_ratio=16):
    """Zero one random frequency band per clip, in place.

    Args:
        features: (batch, n_freq_bin, n_frames) tensor, modified in place.
        mask_ratio: the widest band is n_freq_bin // mask_ratio bins.
    Returns:
        The same `features` tensor with one band of rows zeroed per item.
    """
    batch_size, n_freq_bin, _ = features.shape
    max_mask = int(n_freq_bin / mask_ratio)
    if max_mask <= 1:
        # Bug fix: widths must be integer tensors. The original float
        # torch.ones() made `n_freq_bin - f_width` a float tensor, which
        # torch.randint rejects (only integer 0-dim tensors coerce to int).
        # `<= 1` also guards mask_ratio > n_freq_bin, where randint(1, 0)
        # would raise; a width-1 mask is used instead.
        f_widths = torch.ones(batch_size, dtype=torch.long)
    else:
        f_widths = torch.randint(low=1, high=max_mask, size=(batch_size,))  # [low, high)

    for i in range(batch_size):
        f_width = f_widths[i]
        f_low = torch.randint(low=0, high=n_freq_bin - f_width, size=(1,))
        features[i, f_low:f_low + f_width, :] = 0
    return features
def psds_results(psds_obj):
    """Print PSDS scores for the three standard (alpha_ct, alpha_st) settings.

    Args:
        psds_obj: psds_eval.PSDSEval object with operating points.
    Raises:
        EnvironmentError: if the PSDS computation fails.
    """
    try:
        # Same three settings, in the same order, as the DCASE baseline.
        for alpha_ct, alpha_st in ((0, 0), (1, 0), (0, 1)):
            psds_score = psds_obj.psds(alpha_ct=alpha_ct, alpha_st=alpha_st, max_efpr=100)
            print(f"\nPSD-Score ({alpha_ct}, {alpha_st}, 100): {psds_score.value:.5f}")
    except psds_eval.psds.PSDSEvalError:
        print("psds did not work ....")
        raise EnvironmentError
def segment_based_evaluation_df(reference, estimated, time_resolution=1.0):
    """Compute sed_eval segment-based metrics for predictions vs. reference.

    Args:
        reference: pd.DataFrame with "filename", "onset", "offset", "event_label"
            columns describing the reference events.
        estimated: pd.DataFrame with the same columns describing the
            predicted events to be compared with the reference.
        time_resolution: float, segment length in seconds.
    Returns:
        sed_eval.sound_event.SegmentBasedMetrics with the accumulated scores.
    """
    # Score over the union of labels seen in either dataframe.
    class_set = set(reference.event_label.dropna().unique())
    class_set.update(estimated.event_label.dropna().unique())

    segment_based_metric = sed_eval.sound_event.SegmentBasedMetrics(
        event_label_list=list(class_set), time_resolution=time_resolution
    )

    # Accumulate metrics file by file over every reference filename.
    for fname in reference["filename"].unique():
        segment_based_metric.evaluate(
            reference_event_list=get_event_list_current_file(reference, fname),
            estimated_event_list=get_event_list_current_file(estimated, fname),
        )

    return segment_based_metric
def compute_sed_eval_metrics(predictions, groundtruth):
    """Run both sed_eval protocols with the task's default parameters.

    Args:
        predictions: pd.DataFrame of predicted events.
        groundtruth: pd.DataFrame of reference events.
    Returns:
        tuple of (sed_eval.sound_event.EventBasedMetrics,
        sed_eval.sound_event.SegmentBasedMetrics).
    """
    event_metrics = event_based_evaluation_df(
        groundtruth, predictions, t_collar=0.200, percentage_of_length=0.2
    )
    segment_metrics = segment_based_evaluation_df(
        groundtruth, predictions, time_resolution=1.0
    )
    return event_metrics, segment_metrics
def compute_psds_from_operating_points(
    prediction_dfs,
    ground_truth_file,
    durations_file,
    dtc_threshold=0.5,
    gtc_threshold=0.5,
    cttc_threshold=0.3,
    alpha_ct=0,
    alpha_st=0,
    max_efpr=100,
    save_dir=None,
):
    """Add one PSDS operating point per threshold and compute the PSDS score.

    Args:
        prediction_dfs: dict mapping decision threshold -> predictions dataframe.
        ground_truth_file: path to the tab-separated groundtruth file.
        durations_file: path to the tab-separated file durations file.
        dtc_threshold, gtc_threshold, cttc_threshold: PSDSEval tolerances.
        alpha_ct, alpha_st, max_efpr: PSDS score parameters.
        save_dir: optional folder; per-threshold predictions and the ROC plot
            are written there when given.
    Returns:
        float, the PSDS score value.
    """
    gt = pd.read_csv(ground_truth_file, sep="\t")
    durations = pd.read_csv(durations_file, sep="\t")
    # Fix: the original local was named `psds_eval`, shadowing the imported
    # psds_eval module within this function.
    evaluator = PSDSEval(
        ground_truth=gt,
        metadata=durations,
        dtc_threshold=dtc_threshold,
        gtc_threshold=gtc_threshold,
        cttc_threshold=cttc_threshold,
    )

    for i, k in enumerate(prediction_dfs.keys()):
        det = prediction_dfs[k]
        # see issue https://github.com/audioanalytic/psds_eval/issues/3
        # NOTE(review): this adds an "index" column to the caller's dataframe
        # in place; copy first if that side effect is unwanted.
        det["index"] = range(1, len(det) + 1)
        det = det.set_index("index")
        evaluator.add_operating_point(
            det, info={"name": f"Op {i + 1:02d}", "threshold": k}
        )

    psds_score = evaluator.psds(alpha_ct=alpha_ct, alpha_st=alpha_st, max_efpr=max_efpr)

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)

        pred_dir = os.path.join(
            save_dir,
            f"predictions_dtc{dtc_threshold}_gtc{gtc_threshold}_cttc{cttc_threshold}",
        )
        os.makedirs(pred_dir, exist_ok=True)
        for k in prediction_dfs.keys():
            prediction_dfs[k].to_csv(
                os.path.join(pred_dir, f"predictions_th_{k:.2f}.tsv"),
                sep="\t",
                index=False,
            )

        plot_psd_roc(
            psds_score,
            filename=os.path.join(save_dir, f"PSDS_ct{alpha_ct}_st{alpha_st}_100.png"),
        )

    return psds_score.value
class Encoder:
    """Converts between event annotations (label/onset/offset in seconds) and
    frame-level binary label matrices at the network's pooled frame rate.
    """

    def __init__(self, labels, audio_len, frame_len, frame_hop, net_pooling=1, sr=16000):
        # NOTE(review): `np.array` is a function, not a type, so only the
        # `np.ndarray` member of this list can ever match.
        if type(labels) in [np.ndarray, np.array]:
            labels = labels.tolist()
        self.labels = labels  # ordered list of class names; index = class id
        self.audio_len = audio_len  # clip length in seconds
        self.frame_len = frame_len
        self.frame_hop = frame_hop  # STFT hop in samples
        self.sr = sr
        self.net_pooling = net_pooling  # temporal pooling factor of the network
        n_samples = self.audio_len * self.sr
        # Output frame count: even number of STFT frames, divided by pooling.
        self.n_frames = int(math.ceil(n_samples/2/self.frame_hop)*2 / self.net_pooling)

    def _time_to_frame(self, time):
        """Convert a time in seconds to a (fractional) pooled frame index,
        clipped to [0, n_frames]."""
        sample = time * self.sr
        frame = sample / self.frame_hop
        return np.clip(frame / self.net_pooling, a_min=0, a_max=self.n_frames)

    def _frame_to_time(self, frame):
        """Convert a pooled frame index back to seconds, clipped to the clip length."""
        time = frame * self.net_pooling * self.frame_hop / self.sr
        return np.clip(time, a_min=0, a_max=self.audio_len)

    def encode_strong_df(self, events_df):
        # from event dict, generate strong label tensor sized as [n_frame, n_class]
        true_labels = np.zeros((self.n_frames, len(self.labels)))
        for _, row in events_df.iterrows():
            if not pd.isna(row['event_label']):
                label_idx = self.labels.index(row["event_label"])
                # floor: a frame the onset falls into counts as active
                onset = int(self._time_to_frame(row["onset"]))
                # ceil: a frame the offset falls into counts as active
                offset = int(np.ceil(self._time_to_frame(row["offset"])))
                true_labels[onset:offset, label_idx] = 1
        return true_labels

    def encode_weak(self, events):
        # from event dict, generate weak label tensor sized as [n_class]
        labels = np.zeros((len(self.labels)))
        if len(events) == 0:
            return labels
        else:
            for event in events:
                labels[self.labels.index(event)] = 1
            return labels

    def decode_strong(self, outputs):
        # from the network output sized [n_frame, n_class], generate the label/onset/offset lists
        pred = []
        for i, label_column in enumerate(outputs.T):  # outputs.T size = [n_class, frames]
            # Each (start, end) frame pair of a contiguous active region
            # becomes one predicted event in seconds.
            change_indices = self.find_contiguous_regions(label_column)
            for row in change_indices:
                onset = self._frame_to_time(row[0])
                offset = self._frame_to_time(row[1])
                onset = np.clip(onset, a_min=0, a_max=self.audio_len)
                offset = np.clip(offset, a_min=0, a_max=self.audio_len)
                pred.append([self.labels[i], onset, offset])
        return pred

    def decode_weak(self, outputs):
        """Return the class names whose weak prediction equals 1."""
        result_labels = []
        for i, value in enumerate(outputs):
            if value == 1:
                result_labels.append(self.labels[i])
        return result_labels

    def find_contiguous_regions(self, array):
        # find at which frame the label changes in the array
        change_indices = np.logical_xor(array[1:], array[:-1]).nonzero()[0]
        # shift indices so they point to the frame after the change
        change_indices += 1
        if array[0]:
            # if the first element of array is True (1), prepend 0 as a region start
            change_indices = np.r_[0, change_indices]
        if array[-1]:
            # if the last element is True, append the array length as a region end
            change_indices = np.r_[change_indices, array.size]
        # reshape the result into two columns: (onset_frame, offset_frame) pairs
        return change_indices.reshape((-1, 2))
range(len(median_filter)): + output[:, mf_idx] = scipy.ndimage.filters.median_filter(output[:, mf_idx], (median_filter[mf_idx])) + pred = encoder.decode_strong(output) + pred = pd.DataFrame(pred, columns=["event_label", "onset", "offset"]) + pred["filename"] = Path(filenames[batch_idx]).stem + ".wav" + pred_dfs[c_th] = pred_dfs[c_th]._append(pred, ignore_index=True) + return pred_dfs + + +class ConcatDatasetBatchSampler(Sampler): + def __init__(self, samplers, batch_sizes, epoch=0): + self.batch_sizes = batch_sizes + self.samplers = samplers + self.offsets = [0] + np.cumsum([len(x) for x in self.samplers]).tolist()[:-1] + + self.epoch = epoch + self.set_epoch(self.epoch) + + def _iter_one_dataset(self, c_batch_size, c_sampler, c_offset): + batch = [] + for idx in c_sampler: + batch.append(c_offset + idx) + if len(batch) == c_batch_size: + yield batch + + def set_epoch(self, epoch): + if hasattr(self.samplers[0], "epoch"): + for s in self.samplers: + s.set_epoch(epoch) + + def __iter__(self): + iterators = [iter(i) for i in self.samplers] + tot_batch = [] + for b_num in range(len(self)): + for samp_idx in range(len(self.samplers)): + c_batch = [] + while len(c_batch) < self.batch_sizes[samp_idx]: + c_batch.append(self.offsets[samp_idx] + next(iterators[samp_idx])) + tot_batch.extend(c_batch) + yield tot_batch + tot_batch = [] + + def __len__(self): + min_len = float("inf") + for idx, sampler in enumerate(self.samplers): + c_len = (len(sampler)) // self.batch_sizes[idx] + min_len = min(c_len, min_len) + return min_len + + +class ExponentialWarmup(object): + def __init__(self, optimizer, max_lr, rampup_length, exponent=-5.0): + self.optimizer = optimizer + self.rampup_length = rampup_length + self.max_lr = max_lr + self.step_num = 1 + self.exponent = exponent + + def zero_grad(self): + self.optimizer.zero_grad() + + def _get_lr(self): + return self.max_lr * self._get_scaling_factor() + + def _set_lr(self, lr): + for param_group in self.optimizer.param_groups: + 
param_group["lr"] = lr

    def step(self):
        # advance the schedule and push the new learning rate into the optimizer
        self.step_num += 1
        lr = self._get_lr()
        self._set_lr(lr)

    # def load_state_dict(self, state_dict):
    #     self.__dict__.update(state_dict)
    #
    # def state_dict(self):
    #     return {key: value for key, value in self.__dict__.items() if key != "optimizer"}

    def _get_scaling_factor(self):
        # exponential ramp-up: exp(exponent * (1 - step/rampup)^2), reaching 1.0 at rampup_length
        if self.rampup_length == 0:
            return 1.0
        else:
            current = np.clip(self.step_num, 0.0, self.rampup_length)
            phase = 1.0 - current / self.rampup_length
            return float(np.exp(self.exponent * phase * phase))


def update_ema(net, ema_net, step, ema_factor):
    # update EMA model; alpha grows with step up to ema_factor, so early steps track the
    # student quickly while later steps average more slowly
    alpha = min(1 - 1 / step, ema_factor)
    for ema_params, params in zip(ema_net.parameters(), net.parameters()):
        ema_params.data.mul_(alpha).add_(params.data, alpha=1 - alpha)
    return ema_net


def log_sedeval_metrics(predictions, ground_truth, save_dir=None):
    """ Return the set of metrics from sed_eval
    Args:
        predictions: pd.DataFrame, the dataframe of predictions.
        ground_truth: str, path to the tab-separated ground-truth csv (read with pd.read_csv).
        save_dir: str, path to the folder where to save the event and segment based metrics outputs.
+
    Returns:
        tuple, event-based macro-F1 and micro-F1, segment-based macro-F1 and micro-F1
    """
    if predictions.empty:
        # nothing was predicted: return zero F1s instead of crashing sed_eval
        return 0.0, 0.0, 0.0, 0.0

    gt = pd.read_csv(ground_truth, sep="\t")

    event_res, segment_res = compute_sed_eval_metrics(predictions, gt)

    if save_dir is not None:
        os.makedirs(save_dir, exist_ok=True)
        with open(os.path.join(save_dir, "event_f1.txt"), "w") as f:
            f.write(str(event_res))

        with open(os.path.join(save_dir, "segment_f1.txt"), "w") as f:
            f.write(str(segment_res))

    return (
        event_res.results()["class_wise_average"]["f_measure"]["f_measure"],
        event_res.results()["overall"]["f_measure"]["f_measure"],
        segment_res.results()["class_wise_average"]["f_measure"]["f_measure"],
        segment_res.results()["overall"]["f_measure"]["f_measure"],
    )  # return also segment measures


class Scaler(nn.Module):
    """Normalizes input features using either dataset-level statistics ("dataset")
    or per-example statistics ("instance")."""

    def __init__(self, statistic="instance", normtype="minmax", dims=(0, 2), eps=1e-8):
        super(Scaler, self).__init__()
        self.statistic = statistic  # "dataset" or "instance"
        self.normtype = normtype  # "mean", "standard" or "minmax"
        self.dims = dims  # dims reduced when computing instance statistics
        self.eps = eps

    def load_state_dict(self, state_dict, strict=True):
        # only dataset-level statistics are persisted; instance mode carries no state
        if self.statistic == "dataset":
            super(Scaler, self).load_state_dict(state_dict, strict)

    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict,
                              missing_keys, unexpected_keys, error_msgs):
        if self.statistic == "dataset":
            super(Scaler, self)._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys,
                                                      unexpected_keys, error_msgs)

    def forward(self, input):
        if self.statistic == "dataset":
            # NOTE(review): assumes self.mean / self.mean_squared were computed and
            # attached elsewhere before forward is called — confirm against caller
            if self.normtype == "mean":
                return input - self.mean
            elif self.normtype == "standard":
                std = torch.sqrt(self.mean_squared - self.mean ** 2)
                return (input - self.mean) / (std + self.eps)
            else:
                raise NotImplementedError

        elif self.statistic =="instance":
            if self.normtype == "mean":
                return input - torch.mean(input, self.dims, keepdim=True)
            elif self.normtype == "standard":
                return (input - 
torch.mean(input, self.dims, keepdim=True)) / (
                        torch.std(input, self.dims, keepdim=True) + self.eps)
            elif self.normtype == "minmax":
                # scale each example into [0, 1] over the reduction dims
                return (input - torch.amin(input, dim=self.dims, keepdim=True)) / (
                    torch.amax(input, dim=self.dims, keepdim=True)
                    - torch.amin(input, dim=self.dims, keepdim=True) + self.eps)
            else:
                raise NotImplementedError

        else:
            raise NotImplementedError


class AsymmetricalFocalLoss(nn.Module):
    """Binary cross-entropy with asymmetric focal weighting: gamma modulates the
    positive (active) term, zeta the negative (inactive) term."""

    def __init__(self, gamma=0, zeta=0):
        super(AsymmetricalFocalLoss, self).__init__()
        self.gamma = gamma  # balancing between classes
        self.zeta = zeta  # balancing between active/inactive frames

    def forward(self, pred, target):
        # log terms are clamped at -100 to avoid -inf when pred saturates at 0 or 1
        losses = - (((1 - pred) ** self.gamma) * target * torch.clamp_min(torch.log(pred), -100) +
                    (pred ** self.zeta) * (1 - target) * torch.clamp_min(torch.log(1 - pred), -100))
        return torch.mean(losses)


def take_log(feature):
    # amplitude -> dB, clamped to [-50, 80]; amin=1e-5 sets the silence floor
    amp2db = torchaudio.transforms.AmplitudeToDB(stype="amplitude")
    amp2db.amin = 1e-5
    return amp2db(feature).clamp(min=-50, max=80)


def count_parameters(model):
    # total number of trainable parameters in the model
    total_params = 0
    for name, parameter in model.named_parameters():
        if not parameter.requires_grad:
            continue
        param = parameter.numel()
        total_params += param
    return total_params
diff --git a/src/utils/utils.py b/src/utils/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ce0fb22d47d396e8605e2a88627ae769862ee54
--- /dev/null
+++ b/src/utils/utils.py
@@ -0,0 +1,85 @@
+import torch
+import numpy as np
+import yaml
+import os
+from torch.utils.data import Sampler
+
+
+def load_yaml_with_includes(yaml_file):
+    # Load a yaml file whose documents may pull in other files via the `!include` tag;
+    # include paths are resolved relative to the including file's directory.
+    def loader_with_include(loader, node):
+        # Load the included file
+        include_path = os.path.join(os.path.dirname(yaml_file), loader.construct_scalar(node))
+        with open(include_path, 'r') as f:
+            return yaml.load(f, Loader=yaml.FullLoader)
+
+    yaml.add_constructor('!include', loader_with_include, Loader=yaml.FullLoader)
+
+    with open(yaml_file, 'r') as f:
+        return 
yaml.load(f, Loader=yaml.FullLoader)


def customized_lr_scheduler(optimizer, warmup_steps=10000, decay_steps=1e6, end_factor=1e-4):
    # linear warmup over warmup_steps, then linear decay to end_factor over decay_steps
    from torch.optim.lr_scheduler import LinearLR, SequentialLR
    warmup_scheduler = LinearLR(optimizer,
                                start_factor=min(1 / warmup_steps, 1),
                                end_factor=1.0, total_iters=warmup_steps)

    decay_scheduler = LinearLR(optimizer,
                               start_factor=1.0,
                               end_factor=end_factor,
                               total_iters=decay_steps)

    scheduler = SequentialLR(optimizer, schedulers=[warmup_scheduler, decay_scheduler],
                             milestones=[warmup_steps])
    return scheduler


def get_lr_scheduler(optimizer, name, **kwargs):
    # scheduler factory: name selects 'customized' (warmup+decay) or 'cosine'
    if name == 'customized':
        return customized_lr_scheduler(optimizer, **kwargs)
    elif name == 'cosine':
        from torch.optim.lr_scheduler import CosineAnnealingLR
        return CosineAnnealingLR(optimizer, **kwargs)
    else:
        raise NotImplementedError(name)


# NOTE(review): duplicate of ConcatDatasetBatchSampler in the sed utils module —
# consider importing one definition instead of maintaining two copies
class ConcatDatasetBatchSampler(Sampler):
    """Batch sampler drawing a fixed number of samples from each of several
    datasets per batch; indices are offset into the concatenated dataset."""

    def __init__(self, samplers, batch_sizes, epoch=0):
        self.batch_sizes = batch_sizes
        self.samplers = samplers
        # start offset of each dataset inside the concatenated index space
        self.offsets = [0] + np.cumsum([len(x) for x in self.samplers]).tolist()[:-1]

        self.epoch = epoch
        self.set_epoch(self.epoch)

    def _iter_one_dataset(self, c_batch_size, c_sampler, c_offset):
        # yield full batches of offset indices from a single dataset's sampler
        batch = []
        for idx in c_sampler:
            batch.append(c_offset + idx)
            if len(batch) == c_batch_size:
                yield batch

    def set_epoch(self, epoch):
        # forward the epoch to child samplers that support it (e.g. DistributedSampler)
        if hasattr(self.samplers[0], "epoch"):
            for s in self.samplers:
                s.set_epoch(epoch)

    def __iter__(self):
        iterators = [iter(i) for i in self.samplers]
        tot_batch = []
        for b_num in range(len(self)):
            for samp_idx in range(len(self.samplers)):
                c_batch = []
                while len(c_batch) < self.batch_sizes[samp_idx]:
                    c_batch.append(self.offsets[samp_idx] + next(iterators[samp_idx]))
                tot_batch.extend(c_batch)
            yield tot_batch
            tot_batch = []

    def __len__(self):
        # number of batches is limited by the smallest len(sampler)/batch_size ratio
        min_len = float("inf")
        for idx, sampler in enumerate(self.samplers):
            c_len = (len(sampler)) // self.batch_sizes[idx]
            min_len = 
min(c_len, min_len) + return min_len \ No newline at end of file diff --git a/src/val.py b/src/val.py new file mode 100644 index 0000000000000000000000000000000000000000..ea0dca20b51ab35666006c2824dce95af2a9ef0c --- /dev/null +++ b/src/val.py @@ -0,0 +1,141 @@ +import torch +import os +import pandas as pd +from tqdm import tqdm +import sed_scores_eval +from desed_task.evaluation.evaluation_measures import (compute_per_intersection_macro_f1, + compute_psds_from_operating_points, + compute_psds_from_scores) +from local.utils import (batched_decode_preds,) +from utils.sed import Encoder +import numpy as np + + +@torch.no_grad() +def val_psds(model, val_loader, params, epoch, split, save_path, device): + label_df = pd.read_csv(params['data'][split]['label']) + EVENTS = label_df['label'].tolist() + + clap_emb = [] + for event in EVENTS: + cls = torch.load(params['data']['train_data']['clap_dir'] + event + '.pt').to(device) + cls = cls.unsqueeze(1) + clap_emb.append(cls) + cls = torch.cat(clap_emb, dim=1) + + encoder = Encoder(EVENTS, audio_len=10, frame_len=160, frame_hop=160, net_pooling=4, sr=16000) + + model.eval() + test_csv = params['data'][split]["csv"] + test_dur = params['data'][split]["dur"] + + gt = pd.read_csv(test_csv, sep='\t') + + test_scores_postprocessed_buffer = {} + test_scores_postprocessed_buffer_tsed = {} + test_thresholds = [0.5] + test_psds_buffer = {k: pd.DataFrame() for k in test_thresholds} + test_psds_buffer_tsed = {k: pd.DataFrame() for k in test_thresholds} + + for batch in tqdm(val_loader): + audio, filenames = batch + B = audio.shape[0] + N = cls.shape[1] + cls = cls.expand(B, -1, -1) + + audio = audio.to(device) + mel = model.forward_to_spec(audio) + + preds = model(mel, cls) + preds = torch.sigmoid(preds) + preds = preds.reshape(B, N, -1) + preds_tsed = preds.clone() + # tsed assumes sound exitencance is known + for idx, filename in enumerate(filenames): + weak_label = list(gt[gt['filename'] == filename]['event_label'].unique()) + for j, 
event in enumerate(EVENTS): + if event not in weak_label: + preds_tsed[idx][j] = 0.0 + # preds = preds.transpose(1, 2) + + (_, scores_postprocessed_strong, _,) = \ + batched_decode_preds( + preds, + filenames, + encoder, + median_filter=9, + thresholds=list(test_psds_buffer.keys()), ) + test_scores_postprocessed_buffer.update(scores_postprocessed_strong) + + (_, scores_postprocessed_strong_tsed, _,) = \ + batched_decode_preds( + preds_tsed, + filenames, + encoder, + median_filter=9, + thresholds=list(test_psds_buffer_tsed.keys()), ) + test_scores_postprocessed_buffer_tsed.update(scores_postprocessed_strong_tsed) + + ground_truth = sed_scores_eval.io.read_ground_truth_events(test_csv) + audio_durations = sed_scores_eval.io.read_audio_durations(test_dur) + + ground_truth = { + audio_id: ground_truth[audio_id] + for audio_id in test_scores_postprocessed_buffer + } + audio_durations = { + audio_id: audio_durations[audio_id] + for audio_id in test_scores_postprocessed_buffer + } + + psds1_sed_scores_eval, psds1_cls = compute_psds_from_scores( + test_scores_postprocessed_buffer, + ground_truth, + audio_durations, + dtc_threshold=0.7, + gtc_threshold=0.7, + cttc_threshold=None, + alpha_ct=0.0, + alpha_st=0.0, + # save_dir=os.path.join(save_dir, "student", "scenario1"), + ) + psds1_cls['overall'] = psds1_sed_scores_eval + psds1_cls['macro_averaged'] = np.array([v for k, v in psds1_cls.items()]).mean() + psds1_cls['name'] = 'psds1' + + psds1_sed_scores_eval_tsed, psds1_cls_tsed = compute_psds_from_scores( + test_scores_postprocessed_buffer_tsed, + ground_truth, + audio_durations, + dtc_threshold=0.7, + gtc_threshold=0.7, + cttc_threshold=None, + alpha_ct=0.0, + alpha_st=0.0, + # save_dir=os.path.join(save_dir, "student", "scenario1"), + ) + + psds1_cls_tsed['overall'] = psds1_sed_scores_eval_tsed + psds1_cls_tsed['macro_averaged'] = np.array([v for k, v in psds1_cls_tsed.items()]).mean() + psds1_cls_tsed['name'] = 'psds1_tsed' + + # psds2_sed_scores_eval, psds2_cls = 
compute_psds_from_scores( + # test_scores_postprocessed_buffer, + # ground_truth, + # audio_durations, + # dtc_threshold=0.1, + # gtc_threshold=0.1, + # cttc_threshold=0.3, + # alpha_ct=0.5, + # alpha_st=1, + # # save_dir=os.path.join(save_dir, "student", "scenario1"), + # ) + # psds2_cls['overall'] = psds2_sed_scores_eval + # psds2_cls['macro_averaged'] = np.array([v for k, v in psds2_cls.items()]).mean() + # psds2_cls['name'] = 'psds2' + psds_cls = pd.DataFrame([psds1_cls, psds1_cls_tsed]) + # psds_cls = pd.DataFrame([psds1_cls, psds2_cls]) + os.makedirs(f'{save_path}/psds_cls/', exist_ok=True) + psds_cls.to_csv(f'{save_path}/psds_cls/{epoch}.csv', index=False) + + return psds1_sed_scores_eval, psds1_sed_scores_eval_tsed \ No newline at end of file