Upload 10 files
- config.json +19 -0
- label_map.json +1 -0
- model.safetensors +3 -0
- modeling_voiceshield.py +43 -0
- pipeline_voiceshield.py +70 -0
- preprocessor_config.json +17 -0
- processor_config.json +17 -0
- tokenizer.json +0 -0
- tokenizer_config.json +127 -0
- training_config.json +25 -0
config.json
ADDED
@@ -0,0 +1,19 @@
+{
+  "model_type": "voiceshield",
+  "architectures": ["VoiceShieldForAudioClassification"],
+  "num_labels": 2,
+  "id2label": {
+    "0": "safe",
+    "1": "malicious"
+  },
+  "label2id": {
+    "safe": 0,
+    "malicious": 1
+  },
+  "base_model": "openai/whisper-small",
+  "auto_map": {
+    "AutoConfig": "modeling_voiceshield.VoiceShieldConfig",
+    "AutoModelForAudioClassification": "modeling_voiceshield.VoiceShieldForAudioClassification",
+    "AutoPipelineForAudioClassification": "pipeline_voiceshield.VoiceShieldPipeline"
+  }
+}
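
The auto_map block is what lets the stock AutoConfig / AutoModelForAudioClassification loaders resolve these custom classes from the repo's own .py files. A minimal loading sketch, assuming a placeholder repo id user/voiceshield (the real id is not shown here) and that you are willing to execute the remote code:

from transformers import AutoConfig, AutoModelForAudioClassification

# "voiceshield" is not a built-in model_type, so trust_remote_code=True is
# required; transformers then imports the classes named in auto_map from
# modeling_voiceshield.py inside the repo.
config = AutoConfig.from_pretrained("user/voiceshield", trust_remote_code=True)
model = AutoModelForAudioClassification.from_pretrained(
    "user/voiceshield", trust_remote_code=True
)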
label_map.json
ADDED
@@ -0,0 +1 @@
+{"0": "safe", "1": "malicious"}
model.safetensors
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f336c8e4b58752a12dd1687e5d0cacfc32cb3ccd359c85d03c9a500bcd19a42c
+size 354475640
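
These three lines are only a Git LFS pointer; the ~354 MB checkpoint itself lives in LFS storage. A sketch of fetching the resolved weights in Python, again with the placeholder repo id user/voiceshield:

from huggingface_hub import hf_hub_download

# Downloads the actual safetensors payload (not the pointer file) into the
# local Hugging Face cache and returns its path.
path = hf_hub_download(repo_id="user/voiceshield", filename="model.safetensors")
print(path)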
modeling_voiceshield.py
ADDED
@@ -0,0 +1,43 @@
+import torch
+import torch.nn as nn
+from transformers import WhisperModel, PreTrainedModel
+from transformers.modeling_outputs import SequenceClassifierOutput
+from transformers.configuration_utils import PretrainedConfig
+
+
+class VoiceShieldConfig(PretrainedConfig):
+    model_type = "voiceshield"
+
+    def __init__(self, num_labels=2, base_model="openai/whisper-small", **kwargs):
+        super().__init__(**kwargs)
+        self.num_labels = num_labels
+        self.base_model = base_model
+
+
+class VoiceShieldForAudioClassification(PreTrainedModel):
+    config_class = VoiceShieldConfig
+
+    def __init__(self, config):
+        super().__init__(config)
+        whisper = WhisperModel.from_pretrained(config.base_model)
+        self.encoder = whisper.encoder
+        d_model = self.encoder.config.d_model
+
+        self.classifier = nn.Sequential(
+            nn.Linear(d_model, 512),
+            nn.GELU(),
+            nn.Linear(512, 128),
+            nn.GELU(),
+            nn.Linear(128, config.num_labels),
+        )
+
+    def forward(self, input_features=None, labels=None):
+        hidden = self.encoder(input_features).last_hidden_state
+        pooled = hidden.mean(dim=1)
+        logits = self.classifier(pooled)
+
+        loss = None
+        if labels is not None:
+            loss = nn.CrossEntropyLoss()(logits, labels)
+
+        return SequenceClassifierOutput(loss=loss, logits=logits)
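
The head mean-pools the Whisper encoder states over time, so an entire clip collapses to a single d_model-dim vector (768 for whisper-small) before the MLP. A quick shape check, assuming the file above is on the import path (instantiating the model downloads the openai/whisper-small weights):

import torch
from modeling_voiceshield import VoiceShieldConfig, VoiceShieldForAudioClassification

# Whisper takes log-mel features of shape (batch, 80 mel bins, 3000 frames),
# matching feature_size / nb_max_frames in preprocessor_config.json below.
model = VoiceShieldForAudioClassification(VoiceShieldConfig()).eval()
with torch.no_grad():
    out = model(input_features=torch.randn(1, 80, 3000))
print(out.logits.shape)  # torch.Size([1, 2]) -> [safe, malicious] logits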
pipeline_voiceshield.py
ADDED
@@ -0,0 +1,70 @@
+import torch
+import torch.nn.functional as F
+import torchaudio
+from transformers import Pipeline, WhisperProcessor, WhisperForConditionalGeneration
+
+
+class VoiceShieldPipeline(Pipeline):
+    def __init__(self, model, **kwargs):
+        super().__init__(model=model, **kwargs)
+
+        base_model = model.config.base_model
+        self.processor = WhisperProcessor.from_pretrained(base_model)
+        self.stt_model = WhisperForConditionalGeneration.from_pretrained(base_model)
+
+        self.device = model.device
+        self.stt_model.to(self.device)
+        self.stt_model.eval()
+
+    def _sanitize_parameters(self, **kwargs):
+        return {}, {}, {}
+
+    def preprocess(self, inputs):
+        audio, sr = torchaudio.load(inputs)
+
+        if sr != 16000:
+            audio = torchaudio.transforms.Resample(sr, 16000)(audio)
+
+        if audio.shape[0] > 1:
+            audio = audio.mean(dim=0, keepdim=True)
+
+        audio_np = audio.squeeze().numpy()
+
+        features = self.processor(
+            audio_np, sampling_rate=16000, return_tensors="pt"
+        ).input_features.to(self.device)
+
+        return {"features": features}
+
+    def _forward(self, model_inputs):
+        features = model_inputs["features"]
+
+        # Transcription
+        with torch.no_grad():
+            ids = self.stt_model.generate(features)
+            transcript = self.processor.batch_decode(ids, skip_special_tokens=True)[0]
+
+        # Classification
+        with torch.no_grad():
+            logits = self.model(features).logits
+            probs = F.softmax(logits, dim=-1)[0]
+
+        return {
+            "transcript": transcript,
+            "probs": probs,
+        }
+
+    def postprocess(self, model_outputs):
+        probs = model_outputs["probs"]
+        transcript = model_outputs["transcript"]
+
+        label_id = probs.argmax().item()
+        score = probs[label_id].item()
+
+        label = self.model.config.id2label[str(label_id)]
+
+        return {
+            "transcript": transcript,
+            "label": label,
+            "confidence": score,
+        }
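
Each call therefore returns both a Whisper transcript and a safe/malicious verdict for the same clip. A usage sketch by direct instantiation, assuming a local checkout of this repo and a hypothetical input file call_recording.wav (torchaudio handles the decoding; preprocess() already resamples to 16 kHz and downmixes to mono):

from modeling_voiceshield import VoiceShieldForAudioClassification
from pipeline_voiceshield import VoiceShieldPipeline

# "." stands for the local checkout containing config.json + model.safetensors.
model = VoiceShieldForAudioClassification.from_pretrained(".")
pipe = VoiceShieldPipeline(model=model)

result = pipe("call_recording.wav")  # hypothetical clip
print(result["label"], round(result["confidence"], 3))
print(result["transcript"])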
preprocessor_config.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "feature_extractor": {
+    "chunk_length": 30,
+    "dither": 0.0,
+    "feature_extractor_type": "WhisperFeatureExtractor",
+    "feature_size": 80,
+    "hop_length": 160,
+    "n_fft": 400,
+    "n_samples": 480000,
+    "nb_max_frames": 3000,
+    "padding_side": "right",
+    "padding_value": 0.0,
+    "return_attention_mask": false,
+    "sampling_rate": 16000
+  },
+  "processor_class": "WhisperProcessor"
+}
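
These are the stock whisper-small extraction settings: 30 s chunks at 16 kHz (480000 samples), 80 mel bins, hop length 160, so every chunk becomes 3000 log-mel frames. A small sketch of what those numbers produce:

import numpy as np
from transformers import WhisperFeatureExtractor

fe = WhisperFeatureExtractor(
    feature_size=80, sampling_rate=16000, hop_length=160, chunk_length=30, n_fft=400
)
# One second of silence is still padded out to a full 30 s chunk.
feats = fe(np.zeros(16000), sampling_rate=16000, return_tensors="np").input_features
print(feats.shape)  # (1, 80, 3000)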
processor_config.json
ADDED
@@ -0,0 +1,17 @@
+{
+  "feature_extractor": {
+    "chunk_length": 30,
+    "dither": 0.0,
+    "feature_extractor_type": "WhisperFeatureExtractor",
+    "feature_size": 80,
+    "hop_length": 160,
+    "n_fft": 400,
+    "n_samples": 480000,
+    "nb_max_frames": 3000,
+    "padding_side": "right",
+    "padding_value": 0.0,
+    "return_attention_mask": false,
+    "sampling_rate": 16000
+  },
+  "processor_class": "WhisperProcessor"
+}
tokenizer.json
ADDED
The diff for this file is too large to render. See raw diff.
tokenizer_config.json
ADDED
@@ -0,0 +1,127 @@
+{
+  "add_prefix_space": false,
+  "backend": "tokenizers",
+  "bos_token": "<|endoftext|>",
+  "clean_up_tokenization_spaces": true,
+  "eos_token": "<|endoftext|>",
+  "errors": "replace",
+  "extra_special_tokens": [
+    "<|endoftext|>",
+    "<|startoftranscript|>",
+    "<|en|>",
+    "<|zh|>",
+    "<|de|>",
+    "<|es|>",
+    "<|ru|>",
+    "<|ko|>",
+    "<|fr|>",
+    "<|ja|>",
+    "<|pt|>",
+    "<|tr|>",
+    "<|pl|>",
+    "<|ca|>",
+    "<|nl|>",
+    "<|ar|>",
+    "<|sv|>",
+    "<|it|>",
+    "<|id|>",
+    "<|hi|>",
+    "<|fi|>",
+    "<|vi|>",
+    "<|he|>",
+    "<|uk|>",
+    "<|el|>",
+    "<|ms|>",
+    "<|cs|>",
+    "<|ro|>",
+    "<|da|>",
+    "<|hu|>",
+    "<|ta|>",
+    "<|no|>",
+    "<|th|>",
+    "<|ur|>",
+    "<|hr|>",
+    "<|bg|>",
+    "<|lt|>",
+    "<|la|>",
+    "<|mi|>",
+    "<|ml|>",
+    "<|cy|>",
+    "<|sk|>",
+    "<|te|>",
+    "<|fa|>",
+    "<|lv|>",
+    "<|bn|>",
+    "<|sr|>",
+    "<|az|>",
+    "<|sl|>",
+    "<|kn|>",
+    "<|et|>",
+    "<|mk|>",
+    "<|br|>",
+    "<|eu|>",
+    "<|is|>",
+    "<|hy|>",
+    "<|ne|>",
+    "<|mn|>",
+    "<|bs|>",
+    "<|kk|>",
+    "<|sq|>",
+    "<|sw|>",
+    "<|gl|>",
+    "<|mr|>",
+    "<|pa|>",
+    "<|si|>",
+    "<|km|>",
+    "<|sn|>",
+    "<|yo|>",
+    "<|so|>",
+    "<|af|>",
+    "<|oc|>",
+    "<|ka|>",
+    "<|be|>",
+    "<|tg|>",
+    "<|sd|>",
+    "<|gu|>",
+    "<|am|>",
+    "<|yi|>",
+    "<|lo|>",
+    "<|uz|>",
+    "<|fo|>",
+    "<|ht|>",
+    "<|ps|>",
+    "<|tk|>",
+    "<|nn|>",
+    "<|mt|>",
+    "<|sa|>",
+    "<|lb|>",
+    "<|my|>",
+    "<|bo|>",
+    "<|tl|>",
+    "<|mg|>",
+    "<|as|>",
+    "<|tt|>",
+    "<|haw|>",
+    "<|ln|>",
+    "<|ha|>",
+    "<|ba|>",
+    "<|jw|>",
+    "<|su|>",
+    "<|translate|>",
+    "<|transcribe|>",
+    "<|startoflm|>",
+    "<|startofprev|>",
+    "<|nocaptions|>",
+    "<|notimestamps|>"
+  ],
+  "is_local": false,
+  "language": null,
+  "model_max_length": 1024,
+  "pad_token": "<|endoftext|>",
+  "predict_timestamps": false,
+  "processor_class": "WhisperProcessor",
+  "return_attention_mask": false,
+  "task": null,
+  "tokenizer_class": "WhisperTokenizer",
+  "unk_token": "<|endoftext|>"
+}
training_config.json
ADDED
@@ -0,0 +1,25 @@
+{
+  "mappings_dir": "/content/drive/MyDrive/voice_dataset/mappings",
+  "output_dir": "/content/whisper-security-model-full",
+  "drive_backup": "/content/drive/MyDrive/voice_dataset/model_output",
+  "model_name": "openai/whisper-small",
+  "num_batches": 17,
+  "max_duration": 25,
+  "train_ratio": 0.7,
+  "val_ratio": 0.15,
+  "test_ratio": 0.15,
+  "seed": 42,
+  "n_folds": 5,
+  "batch_size": 4,
+  "grad_accum": 8,
+  "learning_rate": 3e-05,
+  "warmup_steps": 200,
+  "max_steps": 3000,
+  "logging_steps": 50,
+  "eval_steps": 200,
+  "save_steps": 500,
+  "labels": {
+    "safe": 0,
+    "malicious": 1
+  }
+}
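
The /content and /content/drive/MyDrive paths indicate a Google Colab run with the dataset mappings and checkpoint backups on a mounted Drive. A small sanity-check sketch over the hyperparameters above, assuming the file sits in the working directory:

import json

with open("training_config.json") as f:
    cfg = json.load(f)

# Gradient accumulation: 4 examples/step * 8 accumulated steps = 32 effective.
print("effective batch size:", cfg["batch_size"] * cfg["grad_accum"])
# The 70/15/15 split should cover the whole dataset.
assert abs(cfg["train_ratio"] + cfg["val_ratio"] + cfg["test_ratio"] - 1.0) < 1e-9
print("total optimizer steps:", cfg["max_steps"])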