Emvo-ai
/

voiceSHIELD-small

@@ -1,3 +1,6 @@
 import torch
 import torch.nn.functional as F
 import numpy as np
@@ -7,91 +10,147 @@ from transformers import Pipeline, WhisperProcessor, WhisperForConditionalGenera
 class VoiceShieldPipeline(Pipeline):
     def __init__(self, model, threshold=0.2, **kwargs):
         self.threshold = threshold
-        # FIX 1: tokenizer= must be passed to super().__init__() —
-        # Pipeline requires it even for audio tasks, pass None explicitly
-        # to prevent "tokenizer is required" crash
         kwargs.setdefault("tokenizer", None)
         super().__init__(model=model, **kwargs)
         base_model = model.config.base_model
-        # FIX 2: load processor AFTER super().__init__() so self.device is set
         self.processor = WhisperProcessor.from_pretrained(base_model)
         self.stt_model = WhisperForConditionalGeneration.from_pretrained(base_model)
         self.stt_model.to(self.device)
         self.stt_model.eval()
     def _sanitize_parameters(self, threshold=None, **kwargs):
         forward_kwargs = {}
         if threshold is not None:
             forward_kwargs["threshold"] = threshold
-        # FIX 3: must return exactly 3 dicts (preprocess, forward, postprocess)
-        return {}, forward_kwargs, {}
     def preprocess(self, inputs, **kwargs):
-        # FIX 4: soundfile.read returns (data, samplerate) — unpack correctly
-        audio_np, sr = sf.read(inputs)
-        # Stereo → mono
         if len(audio_np.shape) > 1:
             audio_np = np.mean(audio_np, axis=1)
         # Resample to 16kHz if needed
         if sr != 16000:
             num_samples = int(len(audio_np) * 16000 / sr)
             audio_np = resample(audio_np, num_samples).astype(np.float32)
         features = self.processor(
             audio_np,
             sampling_rate=16000,
             return_tensors="pt"
         ).input_features.to(self.device)
-        return {"features": features}
     def _forward(self, model_inputs, threshold=None, **kwargs):
-        # FIX 5: use instance threshold as fallback, not hardcoded 0.2
         threshold = threshold if threshold is not None else self.threshold
-        features  = model_inputs["features"]
-        # Transcription
-        attn_mask = torch.ones(
-            features.shape[:2], dtype=torch.long, device=self.device
-        )
         with torch.no_grad():
-            ids = self.stt_model.generate(
                 features,
-                attention_mask=attn_mask,       # FIX 6: prevents pad==eos warning
                 language="en",
                 task="transcribe",
-                suppress_tokens=[],             # FIX 7: prevents duplicate processor warning
             )
-        transcript = self.processor.batch_decode(
-            ids, skip_special_tokens=True
-        )[0].strip()
-        # Classification
         with torch.no_grad():
-            probs    = F.softmax(self.model(features).logits, dim=-1)[0]
-            mal_prob  = probs[1].item()
             safe_prob = probs[0].item()
-        label      = "malicious" if mal_prob >= threshold else "safe"
         confidence = mal_prob if label == "malicious" else safe_prob
         return {
-            "transcript":  transcript,
-            "label":       label,
-            "confidence":  round(confidence, 6),
             "p_malicious": round(mal_prob, 6),
-            "p_safe":      round(safe_prob, 6),
-            "threshold":   threshold,
         }
     def postprocess(self, model_outputs, **kwargs):
-        return model_outputs

+"""
+VoiceShield Pipeline for audio classification and transcription
+"""
 import torch
 import torch.nn.functional as F
 import numpy as np
 class VoiceShieldPipeline(Pipeline):
     """
     Pipeline for VoiceShield audio classification.

     Transcribes the audio with a Whisper speech-to-text model and scores the
     same Whisper features with the VoiceShield classifier.

     Args:
         model: VoiceShield classification model. ``model.config.base_model``
             names the Whisper checkpoint used to load the processor and the
             speech-to-text model.
         threshold: Minimum malicious-class probability for the "malicious"
             label (default: 0.2).

     Call-time parameters:
         threshold: Overrides the instance threshold for this call.
         sampling_rate: Sample rate of array inputs (ignored for file paths,
             whose rate is read from the file). Defaults to 16000.
     """

     # Sample rate the audio is resampled to before it is handed to the
     # Whisper processor.
     _TARGET_SR = 16000

     def __init__(self, model, threshold=0.2, **kwargs):
         self.threshold = threshold
         # Pipeline requires tokenizer parameter, pass None for audio tasks
         kwargs.setdefault("tokenizer", None)
         super().__init__(model=model, **kwargs)
         # Load processor and STT model after super().__init__() so self.device is set
         base_model = model.config.base_model
         self.processor = WhisperProcessor.from_pretrained(base_model)
         self.stt_model = WhisperForConditionalGeneration.from_pretrained(base_model)
         self.stt_model.to(self.device)
         self.stt_model.eval()

     def _sanitize_parameters(self, threshold=None, sampling_rate=None, **kwargs):
         """
         Split call-time parameters between preprocess, forward and postprocess.

         Must return exactly 3 dictionaries.

         Args:
             threshold: Optional per-call classification threshold; forwarded
                 to ``_forward``.
             sampling_rate: Optional sample rate of array inputs; forwarded to
                 ``preprocess`` (previously it was silently dropped here, so
                 arrays were always treated as 16 kHz).
         Returns:
             Tuple ``(preprocess_kwargs, forward_kwargs, postprocess_kwargs)``.
         """
         preprocess_kwargs = {}
         forward_kwargs = {}
         postprocess_kwargs = {}
         if sampling_rate is not None:
             preprocess_kwargs["sampling_rate"] = sampling_rate
         if threshold is not None:
             forward_kwargs["threshold"] = threshold
         return preprocess_kwargs, forward_kwargs, postprocess_kwargs

     def preprocess(self, inputs, sampling_rate=None, **kwargs):
         """
         Load the audio, down-mix and resample it, and extract Whisper features.

         Args:
             inputs: Path to an audio file (``str`` or ``os.PathLike``), or the
                 samples themselves as a numpy array / array-like.
             sampling_rate: Sample rate of array inputs; 16000 when omitted.
                 Ignored for file paths, where the rate comes from the file.
         Returns:
             Dictionary with the processor's ``input_features`` on the
             pipeline device (``"features"``) and the mono, resampled samples
             (``"audio"``).
         """
         import os  # local import: only needed for the PathLike check below

         target_sr = self._TARGET_SR
         if isinstance(inputs, (str, os.PathLike)):
             # soundfile returns (data, samplerate)
             audio_np, sr = sf.read(inputs)
         else:
             # Accept lists/tuples as well as arrays; a no-op for ndarrays.
             audio_np = np.asarray(inputs)
             sr = sampling_rate if sampling_rate is not None else target_sr
         # Convert stereo to mono
         # NOTE(review): averaging axis 1 assumes (frames, channels) layout, as
         # returned by soundfile - confirm for caller-supplied arrays.
         if audio_np.ndim > 1:
             audio_np = np.mean(audio_np, axis=1)
         # Resample to 16kHz if needed
         if sr != target_sr:
             num_samples = int(len(audio_np) * target_sr / sr)
             audio_np = resample(audio_np, num_samples).astype(np.float32)
         # Process with Whisper processor
         features = self.processor(
             audio_np,
             sampling_rate=target_sr,
             return_tensors="pt",
         ).input_features.to(self.device)
         return {"features": features, "audio": audio_np}

     def _forward(self, model_inputs, threshold=None, **kwargs):
         """
         Forward pass: transcribe and classify audio.

         Args:
             model_inputs: Output of ``preprocess``; only ``"features"`` is read.
             threshold: Classification threshold; falls back to the instance
                 threshold when ``None``.
         Returns:
             Dictionary with ``transcript``, ``label``, ``confidence``,
             ``p_safe``, ``p_malicious`` and the ``threshold`` that was applied.
             Only the first item of the batch is decoded and scored.
         """
         # Use instance threshold as fallback
         threshold = threshold if threshold is not None else self.threshold
         features = model_inputs["features"]
         with torch.no_grad():
             # All-ones attention mask over the first two feature dimensions.
             # NOTE(review): confirm this is the mask shape the generate() call
             # expects for these features.
             attn_mask = torch.ones(
                 features.shape[:2],
                 dtype=torch.long,
                 device=self.device,
             )
             # Generate transcript
             generated_ids = self.stt_model.generate(
                 features,
                 attention_mask=attn_mask,
                 language="en",
                 task="transcribe",
                 suppress_tokens=[],  # Prevents duplicate processor warning
             )
             transcript = self.processor.batch_decode(
                 generated_ids,
                 skip_special_tokens=True,
             )[0].strip()
             # Classify audio: index 0 is read as "safe", index 1 as "malicious"
             outputs = self.model(features)
             probs = F.softmax(outputs.logits, dim=-1)[0]
             safe_prob = probs[0].item()
             mal_prob = probs[1].item()
         # Determine label and confidence; confidence is the probability of
         # whichever label was chosen.
         label = "malicious" if mal_prob >= threshold else "safe"
         confidence = mal_prob if label == "malicious" else safe_prob
         return {
             "transcript": transcript,
             "label": label,
             "confidence": round(confidence, 6),
             "p_safe": round(safe_prob, 6),
             "p_malicious": round(mal_prob, 6),
             "threshold": threshold,
         }

     def postprocess(self, model_outputs, **kwargs):
         """
         Return the forward-pass result unchanged.

         Args:
             model_outputs: Outputs from forward pass
         Returns:
             The same ``model_outputs`` object.
         """
         return model_outputs