sumitranjan commited on
Commit
8beda60
·
verified ·
1 Parent(s): c087cfa

Upload 10 files

Browse files
config.json ADDED
@@ -0,0 +1,19 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "model_type": "voiceshield",
3
+ "architectures": ["VoiceShieldForAudioClassification"],
4
+ "num_labels": 2,
5
+ "id2label": {
6
+ "0": "safe",
7
+ "1": "malicious"
8
+ },
9
+ "label2id": {
10
+ "safe": 0,
11
+ "malicious": 1
12
+ },
13
+ "base_model": "openai/whisper-small",
14
+ "auto_map": {
15
+ "AutoConfig": "modeling_voiceshield.VoiceShieldConfig",
16
+ "AutoModelForAudioClassification": "modeling_voiceshield.VoiceShieldForAudioClassification",
17
+ "AutoPipelineForAudioClassification": "pipeline_voiceshield.VoiceShieldPipeline"
18
+ }
19
+ }
label_map.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"0": "safe", "1": "malicious"}
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f336c8e4b58752a12dd1687e5d0cacfc32cb3ccd359c85d03c9a500bcd19a42c
3
+ size 354475640
modeling_voiceshield.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn as nn
3
+ from transformers import WhisperModel, PreTrainedModel
4
+ from transformers.modeling_outputs import SequenceClassifierOutput
5
+ from transformers.configuration_utils import PretrainedConfig
6
+
7
+
8
class VoiceShieldConfig(PretrainedConfig):
    """Configuration for the VoiceShield audio classifier.

    Records the number of output classes and the name of the Whisper
    checkpoint whose encoder serves as the backbone; both values are
    persisted to ``config.json`` so the model can be rebuilt from disk.
    """

    model_type = "voiceshield"

    def __init__(self, num_labels=2, base_model="openai/whisper-small", **kwargs):
        # Let PretrainedConfig consume shared kwargs (id2label, label2id, ...)
        # first, then pin our two fields so they win over any kwarg defaults.
        super().__init__(**kwargs)
        self.num_labels, self.base_model = num_labels, base_model
15
+
16
+
17
class VoiceShieldForAudioClassification(PreTrainedModel):
    """Audio classifier built on the Whisper encoder.

    The Whisper decoder is discarded; encoder hidden states are mean-pooled
    over time and fed through a small MLP head that produces
    ``config.num_labels`` logits (here: safe vs. malicious).
    """

    config_class = VoiceShieldConfig

    def __init__(self, config):
        super().__init__(config)
        # Load the Whisper backbone and keep only its encoder.
        # NOTE(review): from_pretrained here means instantiating this model
        # always downloads the base checkpoint first; fine-tuned weights are
        # loaded on top of it afterwards by PreTrainedModel machinery.
        whisper = WhisperModel.from_pretrained(config.base_model)
        self.encoder = whisper.encoder
        d_model = self.encoder.config.d_model

        self.classifier = nn.Sequential(
            nn.Linear(d_model, 512),
            nn.GELU(),
            nn.Linear(512, 128),
            nn.GELU(),
            nn.Linear(128, config.num_labels),
        )

    def forward(self, input_features=None, labels=None, **kwargs):
        """Encode, pool, and classify a batch of log-mel features.

        Args:
            input_features: Log-mel spectrogram batch as produced by
                ``WhisperProcessor`` — presumably (batch, n_mels, frames);
                confirm against the feature extractor config.
            labels: Optional class-index tensor; when given, cross-entropy
                loss is computed over the logits.
            **kwargs: Ignored. Accepting (and dropping) extra keyword
                arguments keeps the model usable with HF Trainer / pipeline
                code paths that pass e.g. ``attention_mask``, which the
                original signature rejected with a TypeError.

        Returns:
            SequenceClassifierOutput with ``logits`` and optional ``loss``.
        """
        hidden = self.encoder(input_features).last_hidden_state
        # Mean-pool over the time axis: one embedding per clip.
        pooled = hidden.mean(dim=1)
        logits = self.classifier(pooled)

        loss = None
        if labels is not None:
            loss = nn.CrossEntropyLoss()(logits, labels)

        return SequenceClassifierOutput(loss=loss, logits=logits)
pipeline_voiceshield.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torch.nn.functional as F
3
+ import torchaudio
4
+ from transformers import Pipeline, WhisperProcessor, WhisperForConditionalGeneration
5
+
6
+
7
class VoiceShieldPipeline(Pipeline):
    """Custom pipeline: transcribes an audio file AND classifies it.

    Runs two models per call: the base Whisper seq2seq model for
    speech-to-text, and the VoiceShield classifier for the safe/malicious
    decision. Input is a path (or file-like object) accepted by
    ``torchaudio.load``; output is a dict with ``transcript``, ``label``
    and ``confidence``.
    """

    def __init__(self, model, **kwargs):
        super().__init__(model=model, **kwargs)

        # Both auxiliary components come from the classifier's recorded
        # base checkpoint so the feature extractor matches training.
        base_model = model.config.base_model
        self.processor = WhisperProcessor.from_pretrained(base_model)
        self.stt_model = WhisperForConditionalGeneration.from_pretrained(base_model)

        self.device = model.device
        self.stt_model.to(self.device)
        self.stt_model.eval()

    def _sanitize_parameters(self, **kwargs):
        # No user-tunable parameters: preprocess/_forward/postprocess take none.
        return {}, {}, {}

    def preprocess(self, inputs):
        """Load audio, resample to 16 kHz mono, extract Whisper features."""
        audio, sr = torchaudio.load(inputs)

        # Whisper's feature extractor expects 16 kHz input.
        if sr != 16000:
            audio = torchaudio.transforms.Resample(sr, 16000)(audio)

        # Down-mix multi-channel audio to mono by averaging channels.
        if audio.shape[0] > 1:
            audio = audio.mean(dim=0, keepdim=True)

        audio_np = audio.squeeze().numpy()

        features = self.processor(
            audio_np, sampling_rate=16000, return_tensors="pt"
        ).input_features.to(self.device)

        return {"features": features}

    def _forward(self, model_inputs):
        features = model_inputs["features"]

        # Transcription via the base Whisper seq2seq model.
        with torch.no_grad():
            ids = self.stt_model.generate(features)
            transcript = self.processor.batch_decode(ids, skip_special_tokens=True)[0]

        # Classification via the VoiceShield head on the same features.
        with torch.no_grad():
            logits = self.model(features).logits
            probs = F.softmax(logits, dim=-1)[0]

        return {
            "transcript": transcript,
            "probs": probs,
        }

    def postprocess(self, model_outputs):
        probs = model_outputs["probs"]
        transcript = model_outputs["transcript"]

        label_id = probs.argmax().item()
        score = probs[label_id].item()

        # BUG FIX: PretrainedConfig normalizes id2label keys to int when the
        # config is loaded through transformers, so the original
        # `id2label[str(label_id)]` raised KeyError. Accept int or str keys,
        # falling back to the raw id if the mapping is absent.
        id2label = self.model.config.id2label or {}
        label = id2label.get(label_id, id2label.get(str(label_id), str(label_id)))

        return {
            "transcript": transcript,
            "label": label,
            "confidence": score,
        }
preprocessor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor": {
3
+ "chunk_length": 30,
4
+ "dither": 0.0,
5
+ "feature_extractor_type": "WhisperFeatureExtractor",
6
+ "feature_size": 80,
7
+ "hop_length": 160,
8
+ "n_fft": 400,
9
+ "n_samples": 480000,
10
+ "nb_max_frames": 3000,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ },
16
+ "processor_class": "WhisperProcessor"
17
+ }
processor_config.json ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "feature_extractor": {
3
+ "chunk_length": 30,
4
+ "dither": 0.0,
5
+ "feature_extractor_type": "WhisperFeatureExtractor",
6
+ "feature_size": 80,
7
+ "hop_length": 160,
8
+ "n_fft": 400,
9
+ "n_samples": 480000,
10
+ "nb_max_frames": 3000,
11
+ "padding_side": "right",
12
+ "padding_value": 0.0,
13
+ "return_attention_mask": false,
14
+ "sampling_rate": 16000
15
+ },
16
+ "processor_class": "WhisperProcessor"
17
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,127 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_prefix_space": false,
3
+ "backend": "tokenizers",
4
+ "bos_token": "<|endoftext|>",
5
+ "clean_up_tokenization_spaces": true,
6
+ "eos_token": "<|endoftext|>",
7
+ "errors": "replace",
8
+ "extra_special_tokens": [
9
+ "<|endoftext|>",
10
+ "<|startoftranscript|>",
11
+ "<|en|>",
12
+ "<|zh|>",
13
+ "<|de|>",
14
+ "<|es|>",
15
+ "<|ru|>",
16
+ "<|ko|>",
17
+ "<|fr|>",
18
+ "<|ja|>",
19
+ "<|pt|>",
20
+ "<|tr|>",
21
+ "<|pl|>",
22
+ "<|ca|>",
23
+ "<|nl|>",
24
+ "<|ar|>",
25
+ "<|sv|>",
26
+ "<|it|>",
27
+ "<|id|>",
28
+ "<|hi|>",
29
+ "<|fi|>",
30
+ "<|vi|>",
31
+ "<|he|>",
32
+ "<|uk|>",
33
+ "<|el|>",
34
+ "<|ms|>",
35
+ "<|cs|>",
36
+ "<|ro|>",
37
+ "<|da|>",
38
+ "<|hu|>",
39
+ "<|ta|>",
40
+ "<|no|>",
41
+ "<|th|>",
42
+ "<|ur|>",
43
+ "<|hr|>",
44
+ "<|bg|>",
45
+ "<|lt|>",
46
+ "<|la|>",
47
+ "<|mi|>",
48
+ "<|ml|>",
49
+ "<|cy|>",
50
+ "<|sk|>",
51
+ "<|te|>",
52
+ "<|fa|>",
53
+ "<|lv|>",
54
+ "<|bn|>",
55
+ "<|sr|>",
56
+ "<|az|>",
57
+ "<|sl|>",
58
+ "<|kn|>",
59
+ "<|et|>",
60
+ "<|mk|>",
61
+ "<|br|>",
62
+ "<|eu|>",
63
+ "<|is|>",
64
+ "<|hy|>",
65
+ "<|ne|>",
66
+ "<|mn|>",
67
+ "<|bs|>",
68
+ "<|kk|>",
69
+ "<|sq|>",
70
+ "<|sw|>",
71
+ "<|gl|>",
72
+ "<|mr|>",
73
+ "<|pa|>",
74
+ "<|si|>",
75
+ "<|km|>",
76
+ "<|sn|>",
77
+ "<|yo|>",
78
+ "<|so|>",
79
+ "<|af|>",
80
+ "<|oc|>",
81
+ "<|ka|>",
82
+ "<|be|>",
83
+ "<|tg|>",
84
+ "<|sd|>",
85
+ "<|gu|>",
86
+ "<|am|>",
87
+ "<|yi|>",
88
+ "<|lo|>",
89
+ "<|uz|>",
90
+ "<|fo|>",
91
+ "<|ht|>",
92
+ "<|ps|>",
93
+ "<|tk|>",
94
+ "<|nn|>",
95
+ "<|mt|>",
96
+ "<|sa|>",
97
+ "<|lb|>",
98
+ "<|my|>",
99
+ "<|bo|>",
100
+ "<|tl|>",
101
+ "<|mg|>",
102
+ "<|as|>",
103
+ "<|tt|>",
104
+ "<|haw|>",
105
+ "<|ln|>",
106
+ "<|ha|>",
107
+ "<|ba|>",
108
+ "<|jw|>",
109
+ "<|su|>",
110
+ "<|translate|>",
111
+ "<|transcribe|>",
112
+ "<|startoflm|>",
113
+ "<|startofprev|>",
114
+ "<|nocaptions|>",
115
+ "<|notimestamps|>"
116
+ ],
117
+ "is_local": false,
118
+ "language": null,
119
+ "model_max_length": 1024,
120
+ "pad_token": "<|endoftext|>",
121
+ "predict_timestamps": false,
122
+ "processor_class": "WhisperProcessor",
123
+ "return_attention_mask": false,
124
+ "task": null,
125
+ "tokenizer_class": "WhisperTokenizer",
126
+ "unk_token": "<|endoftext|>"
127
+ }
training_config.json ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "mappings_dir": "/content/drive/MyDrive/voice_dataset/mappings",
3
+ "output_dir": "/content/whisper-security-model-full",
4
+ "drive_backup": "/content/drive/MyDrive/voice_dataset/model_output",
5
+ "model_name": "openai/whisper-small",
6
+ "num_batches": 17,
7
+ "max_duration": 25,
8
+ "train_ratio": 0.7,
9
+ "val_ratio": 0.15,
10
+ "test_ratio": 0.15,
11
+ "seed": 42,
12
+ "n_folds": 5,
13
+ "batch_size": 4,
14
+ "grad_accum": 8,
15
+ "learning_rate": 3e-05,
16
+ "warmup_steps": 200,
17
+ "max_steps": 3000,
18
+ "logging_steps": 50,
19
+ "eval_steps": 200,
20
+ "save_steps": 500,
21
+ "labels": {
22
+ "safe": 0,
23
+ "malicious": 1
24
+ }
25
+ }