Spaces:
Sleeping
Sleeping
Update model_service.py
Browse files- model_service.py +92 -92
model_service.py
CHANGED
|
@@ -1,92 +1,92 @@
|
|
| 1 |
-
import torch
|
| 2 |
-
import librosa
|
| 3 |
-
import numpy as np
|
| 4 |
-
import io
|
| 5 |
-
import soundfile as sf
|
| 6 |
-
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
|
| 7 |
-
import torch.nn.functional as F
|
| 8 |
-
|
| 9 |
-
# Configuration.
# Primary checkpoint; "mo-thecreator/Deepfake-audio-detection" is a fallback
# if this one becomes private or unavailable (public models usually work).
MODEL_NAME = "Hemgg/Deepfake-audio-detection"
|
| 13 |
-
|
| 14 |
-
class ModelService:
    """Wraps a Hugging Face audio-classification checkpoint for deepfake detection.

    Loads the model named by the module-level ``MODEL_NAME`` constant and
    exposes ``predict`` which maps raw audio bytes to a normalized label.
    """

    def __init__(self):
        """Load the feature extractor and model, moving the model to GPU if available.

        Raises:
            Exception: whatever ``from_pretrained`` raises (e.g. network or
            missing-repo errors) is logged and re-raised unchanged.
        """
        print(f"Loading model: {MODEL_NAME}...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
            self.model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(self.device)
            print(f"Model loaded on {self.device}")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Bare `raise` preserves the original traceback (was `raise e`).
            raise

    def preprocess_audio(self, audio_bytes):
        """
        Load audio bytes and resample to 16000 Hz (required by Wav2Vec2).

        Args:
            audio_bytes: Encoded audio file contents in any format librosa reads.

        Returns:
            1-D numpy array of mono samples at 16 kHz (librosa downmixes to
            mono by default).

        Raises:
            ValueError: if the bytes cannot be decoded as audio.
        """
        try:
            # librosa.load accepts file-like objects, so wrap the bytes.
            audio_file = io.BytesIO(audio_bytes)
            speech, sr = librosa.load(audio_file, sr=16000)
            return speech
        except Exception as e:
            print(f"Error processing audio: {e}")
            # BUG FIX: the original string literal was unterminated; close it,
            # include the underlying cause, and chain it for debuggability.
            raise ValueError(f"Invalid audio format or corrupted file: {e}") from e

    def predict(self, audio_bytes):
        """
        Classify raw audio bytes as human or AI-generated speech.

        Args:
            audio_bytes: Encoded audio file contents.

        Returns:
            tuple: ``(normalized_label, confidence)`` where the label is
            "HUMAN", "AI_GENERATED", or the model's raw label when it matches
            neither convention, and confidence is the softmax probability of
            the predicted class.
        """
        speech = self.preprocess_audio(audio_bytes)

        # Extract features; padding=True handles variable-length input.
        inputs = self.feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {key: val.to(self.device) for key, val in inputs.items()}

        with torch.no_grad():
            logits = self.model(**inputs).logits

        probs = F.softmax(logits, dim=-1)

        # Label order varies between checkpoints ([real, fake] vs [fake, real]),
        # so consult the config's id2label mapping instead of assuming an index.
        id2label = self.model.config.id2label
        predicted_id = torch.argmax(probs, dim=-1).item()
        predicted_label = id2label[predicted_id]
        confidence = probs[0][predicted_id].item()

        # Normalize to the API contract ("HUMAN" / "AI_GENERATED") by matching
        # common label vocabularies (real/bonafide vs fake/spoof).
        lower_label = predicted_label.lower()
        if "real" in lower_label or "human" in lower_label or "bonafide" in lower_label:
            normalized_label = "HUMAN"
        elif "fake" in lower_label or "spoof" in lower_label or "ai" in lower_label:
            normalized_label = "AI_GENERATED"
        else:
            # Obscure label set: surface the model's own label rather than
            # guessing an index convention (0=real/1=fake is common but not
            # universal).
            normalized_label = predicted_label

        return normalized_label, confidence
|
| 84 |
-
|
| 85 |
-
# Singleton instance, created lazily on first access.
model_service = None


def get_model_service():
    """Return the process-wide ModelService, instantiating it on first use."""
    global model_service
    if model_service is None:
        model_service = ModelService()
    return model_service
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import librosa
|
| 3 |
+
import numpy as np
|
| 4 |
+
import io
|
| 5 |
+
import soundfile as sf
|
| 6 |
+
from transformers import AutoModelForAudioClassification, Wav2Vec2FeatureExtractor
|
| 7 |
+
import torch.nn.functional as F
|
| 8 |
+
|
| 9 |
+
# Configuration.
# Primary checkpoint; "mo-thecreator/Deepfake-audio-detection" is a fallback
# if this one becomes private or unavailable (public models usually work).
MODEL_NAME = "Hemgg/Deepfake-audio-detection"
|
| 13 |
+
|
| 14 |
+
class ModelService:
    """Wraps a Hugging Face audio-classification checkpoint for deepfake detection.

    Loads the model named by the module-level ``MODEL_NAME`` constant and
    exposes ``predict`` which maps raw audio bytes to a normalized label.
    """

    def __init__(self):
        """Load the feature extractor and model, moving the model to GPU if available.

        Raises:
            Exception: whatever ``from_pretrained`` raises (e.g. network or
            missing-repo errors) is logged and re-raised unchanged.
        """
        print(f"Loading model: {MODEL_NAME}...")
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        try:
            self.feature_extractor = Wav2Vec2FeatureExtractor.from_pretrained(MODEL_NAME)
            self.model = AutoModelForAudioClassification.from_pretrained(MODEL_NAME).to(self.device)
            print(f"Model loaded on {self.device}")
        except Exception as e:
            print(f"Error loading model: {e}")
            # Bare `raise` preserves the original traceback (was `raise e`).
            raise

    def preprocess_audio(self, audio_bytes):
        """
        Load audio bytes and resample to 16000 Hz (required by Wav2Vec2).

        Args:
            audio_bytes: Encoded audio file contents in any format librosa reads.

        Returns:
            1-D numpy array of mono samples at 16 kHz (librosa downmixes to
            mono by default).

        Raises:
            ValueError: if the bytes cannot be decoded as audio.
        """
        try:
            # librosa.load accepts file-like objects, so wrap the bytes.
            audio_file = io.BytesIO(audio_bytes)
            speech, sr = librosa.load(audio_file, sr=16000)
            return speech
        except Exception as e:
            print(f"Error processing audio: {e}")
            # BUG FIX: the original string lacked the f-prefix, so callers saw
            # the literal text "{str(e)}" instead of the cause. Also chain the
            # original exception for debuggability.
            raise ValueError(f"Invalid audio format or corrupted file: {e}") from e

    def predict(self, audio_bytes):
        """
        Classify raw audio bytes as human or AI-generated speech.

        Args:
            audio_bytes: Encoded audio file contents.

        Returns:
            tuple: ``(normalized_label, confidence)`` where the label is
            "HUMAN", "AI_GENERATED", or the model's raw label when it matches
            neither convention, and confidence is the softmax probability of
            the predicted class.
        """
        speech = self.preprocess_audio(audio_bytes)

        # Extract features; padding=True handles variable-length input.
        inputs = self.feature_extractor(speech, sampling_rate=16000, return_tensors="pt", padding=True)
        inputs = {key: val.to(self.device) for key, val in inputs.items()}

        with torch.no_grad():
            logits = self.model(**inputs).logits

        probs = F.softmax(logits, dim=-1)

        # Label order varies between checkpoints ([real, fake] vs [fake, real]),
        # so consult the config's id2label mapping instead of assuming an index.
        id2label = self.model.config.id2label
        predicted_id = torch.argmax(probs, dim=-1).item()
        predicted_label = id2label[predicted_id]
        confidence = probs[0][predicted_id].item()

        # Normalize to the API contract ("HUMAN" / "AI_GENERATED") by matching
        # common label vocabularies (real/bonafide vs fake/spoof).
        lower_label = predicted_label.lower()
        if "real" in lower_label or "human" in lower_label or "bonafide" in lower_label:
            normalized_label = "HUMAN"
        elif "fake" in lower_label or "spoof" in lower_label or "ai" in lower_label:
            normalized_label = "AI_GENERATED"
        else:
            # Obscure label set: surface the model's own label rather than
            # guessing an index convention (0=real/1=fake is common but not
            # universal).
            normalized_label = predicted_label

        return normalized_label, confidence
|
| 84 |
+
|
| 85 |
+
# Singleton instance, created lazily on first access.
model_service = None


def get_model_service():
    """Return the process-wide ModelService, instantiating it on first use."""
    global model_service
    if model_service is None:
        model_service = ModelService()
    return model_service
|