Upload folder using huggingface_hub
Browse files- README.md +13 -3
- config.json +162 -0
- handler.py +88 -0
- model.safetensors +3 -0
- preprocessor_config.json +0 -0
README.md
CHANGED
|
@@ -1,3 +1,13 @@
|
|
| 1 |
-
---
|
| 2 |
-
license: bigscience-openrail-m
|
| 3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
license: bigscience-openrail-m
|
| 3 |
+
datasets:
|
| 4 |
+
- DORI-SRKW/DORI-ONC
|
| 5 |
+
- DORI-SRKW/DORI-Orcasound
|
| 6 |
+
base_model: openai/whisper-tiny
|
| 7 |
+
pipeline_tag: audio-classification
|
| 8 |
+
tags:
|
| 9 |
+
- biology
|
| 10 |
+
---
|
| 11 |
+
|
| 12 |
+
This is a marine mammal detector, which classifies absence (0) or presence (1) of marine mammals. While it is trained on Orcasound and Ocean Networks Canada data, it has generalised well to Ocean Observatories Initiative and other nodes.
|
| 13 |
+
|
config.json
ADDED
|
@@ -0,0 +1,162 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"_name_or_path": "DORI-SRKW/whisper-tiny-mm",
|
| 3 |
+
"activation_dropout": 0.0,
|
| 4 |
+
"activation_function": "gelu",
|
| 5 |
+
"apply_spec_augment": false,
|
| 6 |
+
"architectures": [
|
| 7 |
+
"WhisperForAudioClassification"
|
| 8 |
+
],
|
| 9 |
+
"attention_dropout": 0.0,
|
| 10 |
+
"begin_suppress_tokens": [
|
| 11 |
+
220,
|
| 12 |
+
50257
|
| 13 |
+
],
|
| 14 |
+
"bos_token_id": 50257,
|
| 15 |
+
"classifier_proj_size": 256,
|
| 16 |
+
"d_model": 384,
|
| 17 |
+
"decoder_attention_heads": 6,
|
| 18 |
+
"decoder_ffn_dim": 1536,
|
| 19 |
+
"decoder_layerdrop": 0.0,
|
| 20 |
+
"decoder_layers": 4,
|
| 21 |
+
"decoder_start_token_id": 50258,
|
| 22 |
+
"dropout": 0.0,
|
| 23 |
+
"encoder_attention_heads": 6,
|
| 24 |
+
"encoder_ffn_dim": 1536,
|
| 25 |
+
"encoder_layerdrop": 0.0,
|
| 26 |
+
"encoder_layers": 4,
|
| 27 |
+
"eos_token_id": 50257,
|
| 28 |
+
"forced_decoder_ids": [
|
| 29 |
+
[
|
| 30 |
+
1,
|
| 31 |
+
50259
|
| 32 |
+
],
|
| 33 |
+
[
|
| 34 |
+
2,
|
| 35 |
+
50359
|
| 36 |
+
],
|
| 37 |
+
[
|
| 38 |
+
3,
|
| 39 |
+
50363
|
| 40 |
+
]
|
| 41 |
+
],
|
| 42 |
+
"id2label": {
|
| 43 |
+
"0": "absent",
|
| 44 |
+
"1": "present"
|
| 45 |
+
},
|
| 46 |
+
"init_std": 0.02,
|
| 47 |
+
"is_encoder_decoder": true,
|
| 48 |
+
"label2id": {
|
| 49 |
+
"absent": 0,
|
| 50 |
+
"present": 1
|
| 51 |
+
},
|
| 52 |
+
"mask_feature_length": 10,
|
| 53 |
+
"mask_feature_min_masks": 0,
|
| 54 |
+
"mask_feature_prob": 0.0,
|
| 55 |
+
"mask_time_length": 10,
|
| 56 |
+
"mask_time_min_masks": 2,
|
| 57 |
+
"mask_time_prob": 0.05,
|
| 58 |
+
"max_length": 448,
|
| 59 |
+
"max_source_positions": 1500,
|
| 60 |
+
"max_target_positions": 448,
|
| 61 |
+
"median_filter_width": 7,
|
| 62 |
+
"model_type": "whisper",
|
| 63 |
+
"num_hidden_layers": 4,
|
| 64 |
+
"num_mel_bins": 80,
|
| 65 |
+
"pad_token_id": 50257,
|
| 66 |
+
"scale_embedding": false,
|
| 67 |
+
"suppress_tokens": [
|
| 68 |
+
1,
|
| 69 |
+
2,
|
| 70 |
+
7,
|
| 71 |
+
8,
|
| 72 |
+
9,
|
| 73 |
+
10,
|
| 74 |
+
14,
|
| 75 |
+
25,
|
| 76 |
+
26,
|
| 77 |
+
27,
|
| 78 |
+
28,
|
| 79 |
+
29,
|
| 80 |
+
31,
|
| 81 |
+
58,
|
| 82 |
+
59,
|
| 83 |
+
60,
|
| 84 |
+
61,
|
| 85 |
+
62,
|
| 86 |
+
63,
|
| 87 |
+
90,
|
| 88 |
+
91,
|
| 89 |
+
92,
|
| 90 |
+
93,
|
| 91 |
+
359,
|
| 92 |
+
503,
|
| 93 |
+
522,
|
| 94 |
+
542,
|
| 95 |
+
873,
|
| 96 |
+
893,
|
| 97 |
+
902,
|
| 98 |
+
918,
|
| 99 |
+
922,
|
| 100 |
+
931,
|
| 101 |
+
1350,
|
| 102 |
+
1853,
|
| 103 |
+
1982,
|
| 104 |
+
2460,
|
| 105 |
+
2627,
|
| 106 |
+
3246,
|
| 107 |
+
3253,
|
| 108 |
+
3268,
|
| 109 |
+
3536,
|
| 110 |
+
3846,
|
| 111 |
+
3961,
|
| 112 |
+
4183,
|
| 113 |
+
4667,
|
| 114 |
+
6585,
|
| 115 |
+
6647,
|
| 116 |
+
7273,
|
| 117 |
+
9061,
|
| 118 |
+
9383,
|
| 119 |
+
10428,
|
| 120 |
+
10929,
|
| 121 |
+
11938,
|
| 122 |
+
12033,
|
| 123 |
+
12331,
|
| 124 |
+
12562,
|
| 125 |
+
13793,
|
| 126 |
+
14157,
|
| 127 |
+
14635,
|
| 128 |
+
15265,
|
| 129 |
+
15618,
|
| 130 |
+
16553,
|
| 131 |
+
16604,
|
| 132 |
+
18362,
|
| 133 |
+
18956,
|
| 134 |
+
20075,
|
| 135 |
+
21675,
|
| 136 |
+
22520,
|
| 137 |
+
26130,
|
| 138 |
+
26161,
|
| 139 |
+
26435,
|
| 140 |
+
28279,
|
| 141 |
+
29464,
|
| 142 |
+
31650,
|
| 143 |
+
32302,
|
| 144 |
+
32470,
|
| 145 |
+
36865,
|
| 146 |
+
42863,
|
| 147 |
+
47425,
|
| 148 |
+
49870,
|
| 149 |
+
50254,
|
| 150 |
+
50258,
|
| 151 |
+
50358,
|
| 152 |
+
50359,
|
| 153 |
+
50360,
|
| 154 |
+
50361,
|
| 155 |
+
50362
|
| 156 |
+
],
|
| 157 |
+
"torch_dtype": "float32",
|
| 158 |
+
"transformers_version": "4.44.1",
|
| 159 |
+
"use_cache": true,
|
| 160 |
+
"use_weighted_layer_sum": false,
|
| 161 |
+
"vocab_size": 51865
|
| 162 |
+
}
|
handler.py
ADDED
|
@@ -0,0 +1,88 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
handler.py
|
| 3 |
+
Set up the possibility for an inference endpoint on huggingface.
|
| 4 |
+
"""
|
| 5 |
+
from typing import Dict, Any
|
| 6 |
+
import torch
|
| 7 |
+
import torchaudio
|
| 8 |
+
from transformers import WhisperForAudioClassification, WhisperFeatureExtractor
|
| 9 |
+
import numpy as np
|
| 10 |
+
import base64
|
| 11 |
+
|
| 12 |
+
class EndpointHandler():
    """Hugging Face inference-endpoint wrapper for the whisper-tiny marine-mammal
    classifier.

    The endpoint runtime instantiates this class once, then calls it with a JSON
    payload per request. The wrapper decodes the audio, runs the Whisper audio
    classifier over 15-second segments and returns a JSON-serialisable dict.
    """

    def __init__(self, threshold=0.5):
        """Load the model and feature extractor once at endpoint start-up.

        Args:
            threshold: probability at or above which a clip is reported as
                'present' (default 0.5).
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # fp16 only on GPU; CPU inference stays in fp32.
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model_id = 'DORI-SRKW/whisper-tiny-mm'

        # Prefer the safetensors weights; fall back to the default loader when
        # they cannot be used. `except Exception` (not a bare `except:`) so
        # KeyboardInterrupt/SystemExit are not swallowed during start-up.
        try:
            self.model = WhisperForAudioClassification.from_pretrained(
                model_id, torch_dtype=torch_dtype,
                low_cpu_mem_usage=True, use_safetensors=True)
        except Exception:
            self.model = WhisperForAudioClassification.from_pretrained(
                model_id, torch_dtype=torch_dtype)
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)

        self.model.eval()
        self.model.to(self.device)
        self.threshold = threshold

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Classify presence/absence of marine mammals in one audio clip.

        Args:
            data: dict with
                'audio': base64-encoded raw float32 PCM bytes (produced with
                    ``base64.b64encode(filebytes).decode('utf-8')``),
                'sampling_rate': sample rate of the encoded audio in Hz.

        Returns:
            dict with
                'logit': maximum softmax probability of the 'present' class
                    over all 15-second segments (a probability in [0, 1],
                    despite the historical key name),
                'classification': 'present' if that probability is >= the
                    threshold, else 'absent'.
        """
        # Decode the base64 payload back into raw float32 samples.
        raw = base64.b64decode(data['audio'].encode('utf-8'))
        fs = data['sampling_rate']

        waveform = torch.tensor(np.frombuffer(raw, dtype=np.float32)).reshape(1, -1)

        # Resample to 32 kHz, then high-pass at 1000 Hz to suppress
        # low-frequency noise (vessel/flow noise).
        waveform = torchaudio.functional.resample(waveform, orig_freq=fs, new_freq=32000)
        waveform = torchaudio.functional.highpass_biquad(waveform, 32000, 1000, 0.707)

        # Split into 15-second segments (32000 samples/s * 15 s each).
        segment_len = 32000 * 15
        segments = [
            waveform[:, start:start + segment_len].squeeze().cpu().data.numpy()
            for start in range(0, waveform.shape[-1], segment_len)
        ]

        # NOTE(review): the extractor is told sampling_rate=16000 although the
        # audio was resampled to 32 kHz; this halves the apparent frequencies.
        # It appears to match the training pipeline — confirm it is intended.
        features = self.feature_extractor(
            segments, sampling_rate=16000, padding='max_length',
            max_length=segment_len, return_tensors='pt')

        # Whisper exposes 'input_features'; some other audio models use
        # 'input_values'. Probe by membership instead of a bare try/except.
        key = 'input_values' if 'input_values' in features else 'input_features'
        features[key] = features[key].squeeze(0)

        features = {k: v.to(self.device) for k, v in features.items()}
        with torch.amp.autocast(device_type=self.device):
            per_segment = []
            # Run each 15-second segment independently through the classifier.
            for segment in range(features['input_features'].shape[0]):
                output = self.model(features['input_features'][segment].unsqueeze(0))
                # Softmax probability of class 1 ('present') for this segment.
                per_segment.append({
                    'logit': torch.softmax(output.logits, dim=1)[0][1].float().cpu().data.numpy().max(),
                    'start_time_s': segment * 15,
                })

        best = max(x['logit'] for x in per_segment)
        return {
            'logit': best,
            'classification': 'present' if best >= self.threshold else 'absent',
        }
|
model.safetensors
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:5e84017582382993ffd6193969d122788d99dcd2018373d188c38bc16d9fa43d
|
| 3 |
+
size 33237160
|
preprocessor_config.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|