Audio Classification
ONNX
Safetensors
whisper
biology
bnestor commited on
Commit
555eac0
·
verified ·
1 Parent(s): bfcc0cb

Upload folder using huggingface_hub

Browse files
Files changed (5) hide show
  1. README.md +13 -3
  2. config.json +162 -0
  3. handler.py +88 -0
  4. model.safetensors +3 -0
  5. preprocessor_config.json +0 -0
README.md CHANGED
@@ -1,3 +1,13 @@
1
- ---
2
- license: bigscience-openrail-m
3
- ---
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: bigscience-openrail-m
3
+ datasets:
4
+ - DORI-SRKW/DORI-ONC
5
+ - DORI-SRKW/DORI-Orcasound
6
+ base_model: openai/whisper-tiny
7
+ pipeline_tag: audio-classification
8
+ tags:
9
+ - biology
10
+ ---
11
+
12
+ This is a marine mammal detector, which classifies the absence (0) or presence (1) of marine mammals. While it is trained on Orcasound and Ocean Networks Canada data, it has generalised well to Ocean Observatories Initiative and other nodes.
13
+
config.json ADDED
@@ -0,0 +1,162 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "DORI-SRKW/whisper-tiny-mm",
3
+ "activation_dropout": 0.0,
4
+ "activation_function": "gelu",
5
+ "apply_spec_augment": false,
6
+ "architectures": [
7
+ "WhisperForAudioClassification"
8
+ ],
9
+ "attention_dropout": 0.0,
10
+ "begin_suppress_tokens": [
11
+ 220,
12
+ 50257
13
+ ],
14
+ "bos_token_id": 50257,
15
+ "classifier_proj_size": 256,
16
+ "d_model": 384,
17
+ "decoder_attention_heads": 6,
18
+ "decoder_ffn_dim": 1536,
19
+ "decoder_layerdrop": 0.0,
20
+ "decoder_layers": 4,
21
+ "decoder_start_token_id": 50258,
22
+ "dropout": 0.0,
23
+ "encoder_attention_heads": 6,
24
+ "encoder_ffn_dim": 1536,
25
+ "encoder_layerdrop": 0.0,
26
+ "encoder_layers": 4,
27
+ "eos_token_id": 50257,
28
+ "forced_decoder_ids": [
29
+ [
30
+ 1,
31
+ 50259
32
+ ],
33
+ [
34
+ 2,
35
+ 50359
36
+ ],
37
+ [
38
+ 3,
39
+ 50363
40
+ ]
41
+ ],
42
+ "id2label": {
43
+ "0": "absent",
44
+ "1": "present"
45
+ },
46
+ "init_std": 0.02,
47
+ "is_encoder_decoder": true,
48
+ "label2id": {
49
+ "absent": 0,
50
+ "present": 1
51
+ },
52
+ "mask_feature_length": 10,
53
+ "mask_feature_min_masks": 0,
54
+ "mask_feature_prob": 0.0,
55
+ "mask_time_length": 10,
56
+ "mask_time_min_masks": 2,
57
+ "mask_time_prob": 0.05,
58
+ "max_length": 448,
59
+ "max_source_positions": 1500,
60
+ "max_target_positions": 448,
61
+ "median_filter_width": 7,
62
+ "model_type": "whisper",
63
+ "num_hidden_layers": 4,
64
+ "num_mel_bins": 80,
65
+ "pad_token_id": 50257,
66
+ "scale_embedding": false,
67
+ "suppress_tokens": [
68
+ 1,
69
+ 2,
70
+ 7,
71
+ 8,
72
+ 9,
73
+ 10,
74
+ 14,
75
+ 25,
76
+ 26,
77
+ 27,
78
+ 28,
79
+ 29,
80
+ 31,
81
+ 58,
82
+ 59,
83
+ 60,
84
+ 61,
85
+ 62,
86
+ 63,
87
+ 90,
88
+ 91,
89
+ 92,
90
+ 93,
91
+ 359,
92
+ 503,
93
+ 522,
94
+ 542,
95
+ 873,
96
+ 893,
97
+ 902,
98
+ 918,
99
+ 922,
100
+ 931,
101
+ 1350,
102
+ 1853,
103
+ 1982,
104
+ 2460,
105
+ 2627,
106
+ 3246,
107
+ 3253,
108
+ 3268,
109
+ 3536,
110
+ 3846,
111
+ 3961,
112
+ 4183,
113
+ 4667,
114
+ 6585,
115
+ 6647,
116
+ 7273,
117
+ 9061,
118
+ 9383,
119
+ 10428,
120
+ 10929,
121
+ 11938,
122
+ 12033,
123
+ 12331,
124
+ 12562,
125
+ 13793,
126
+ 14157,
127
+ 14635,
128
+ 15265,
129
+ 15618,
130
+ 16553,
131
+ 16604,
132
+ 18362,
133
+ 18956,
134
+ 20075,
135
+ 21675,
136
+ 22520,
137
+ 26130,
138
+ 26161,
139
+ 26435,
140
+ 28279,
141
+ 29464,
142
+ 31650,
143
+ 32302,
144
+ 32470,
145
+ 36865,
146
+ 42863,
147
+ 47425,
148
+ 49870,
149
+ 50254,
150
+ 50258,
151
+ 50358,
152
+ 50359,
153
+ 50360,
154
+ 50361,
155
+ 50362
156
+ ],
157
+ "torch_dtype": "float32",
158
+ "transformers_version": "4.44.1",
159
+ "use_cache": true,
160
+ "use_weighted_layer_sum": false,
161
+ "vocab_size": 51865
162
+ }
handler.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ handler.py
3
+ Set up the possibility for an inference endpoint on huggingface.
4
+ """
5
+ from typing import Dict, Any
6
+ import torch
7
+ import torchaudio
8
+ from transformers import WhisperForAudioClassification, WhisperFeatureExtractor
9
+ import numpy as np
10
+ import base64
11
+
12
class EndpointHandler():
    """
    Hugging Face inference-endpoint wrapper around the DORI-SRKW/whisper-tiny-mm
    audio classifier.

    Decodes base64-encoded float32 PCM audio, resamples and high-pass filters it,
    splits it into 15-second segments, classifies each segment with a Whisper
    audio-classification head, and returns the clip-level maximum "present"
    probability together with a thresholded label.
    """

    def __init__(self, threshold=0.5):
        """
        Args:
            threshold (float): probability at or above which a clip is labelled
                'present' (marine mammal detected). Defaults to 0.5.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        # Half precision only on GPU; CPU inference stays in float32.
        torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
        model_id = 'DORI-SRKW/whisper-tiny-mm'

        # Load the model. Prefer the safetensors / low-memory path; fall back to
        # the default loader if that fails (e.g. an older transformers version).
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit still propagate.
        try:
            self.model = WhisperForAudioClassification.from_pretrained(
                model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True)
        except Exception:
            self.model = WhisperForAudioClassification.from_pretrained(model_id, torch_dtype=torch_dtype)
        self.feature_extractor = WhisperFeatureExtractor.from_pretrained(model_id)

        self.model.eval()
        self.model.to(self.device)
        self.threshold = threshold

    def __call__(self, data: Dict[str, Any]) -> Dict[str, str]:
        """
        Args:
            data: dict with keys
                'audio': base64-encoded raw float32 PCM bytes
                    (produced via base64.b64encode(filebytes).decode('utf-8')),
                'sampling_rate': sample rate of that audio in Hz.
        Return:
            dict with:
                'logit': max softmax probability of the 'present' class over all
                    15-second segments,
                'classification': 'present' if that probability >= threshold,
                    else 'absent'.
        """
        # Decode the base64 payload back to raw bytes.
        audio = data['audio']
        # the payload was encoded with base64.b64encode(filebytes).decode('utf-8')
        audio = base64.b64decode(audio.encode('utf-8'))

        fs = data['sampling_rate']

        # Raw bytes are float32 samples; view as a (1, N) tensor.
        audio = np.frombuffer(audio, dtype=np.float32)
        audio = torch.tensor(audio)
        audio = audio.reshape(1, -1)

        # Resample to 32 kHz.
        audio = torchaudio.functional.resample(audio, orig_freq=fs, new_freq=32000)

        # High-pass biquad at 1000 Hz (Q = 0.707) to suppress low-frequency noise.
        audio = torchaudio.functional.highpass_biquad(audio, 32000, 1000, 0.707)

        # Split into 15-second (32000 * 15 sample) segments.
        audio3 = []
        for i in range(0, len(audio[-1]), 32000 * 15):
            audio3.append(audio[:, i:i + 32000 * 15].squeeze().cpu().data.numpy())

        # NOTE(review): the audio was resampled to 32 kHz above, but the feature
        # extractor is told sampling_rate=16000 — this effectively halves the
        # pitch (a common trick to bring low-frequency bioacoustic calls into
        # Whisper's band). Confirm this matches the training pipeline before
        # changing either number.
        data = self.feature_extractor(audio3, sampling_rate=16000, padding='max_length',
                                      max_length=32000 * 15, return_tensors='pt')

        try:
            data['input_values'] = data['input_values'].squeeze(0)
        except KeyError:
            # Whisper names the tensor 'input_features' rather than 'input_values'.
            data['input_features'] = data['input_features'].squeeze(0)

        data = {k: v.to(self.device) for k, v in data.items()}
        with torch.amp.autocast(device_type=self.device):
            outputs = []
            for segment in range(data['input_features'].shape[0]):
                # Classify each 15-second segment independently.
                output = self.model(data['input_features'][segment].unsqueeze(0))
                outputs.append({
                    'logit': torch.softmax(output.logits, dim=1)[0][1].float().cpu().data.numpy().max(),
                    'start_time_s': segment * 15,
                })

        # Collapse to the clip-level maximum once (the original recomputed it twice).
        best = max(x['logit'] for x in outputs)
        return {'logit': best,
                'classification': 'present' if best >= self.threshold else 'absent'}
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5e84017582382993ffd6193969d122788d99dcd2018373d188c38bc16d9fa43d
3
+ size 33237160
preprocessor_config.json ADDED
The diff for this file is too large to render. See raw diff