Spaces:

dj-dawgs-ipd
/

IPD-Audio-Model

Build error

App Files Community

Tirath5504 committed on Dec 8, 2024

Commit

54efbdc

verified ·

1 Parent(s): 382deb3

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -49

app.py CHANGED Viewed

@@ -3,58 +3,161 @@ import torch
 import librosa
 import numpy as np
 from sklearn.preprocessing import StandardScaler
-from transformers import AutoModelForSequenceClassification, AutoProcessor
-model = AutoModelForSequenceClassification.from_pretrained("Tirath5504/IPD_Audio_HuBERT")
-def extract_audio_features(audio_path):
-    y, sr = librosa.load(audio_path, sr=None)
-    features = {}
-    pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
-    pitches = pitches[pitches > 0]
-    features['pitch_mean'] = np.mean(pitches) if len(pitches) > 0 else 0
-    features['pitch_std'] = np.std(pitches) if len(pitches) > 0 else 0
-    spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
-    features['spectral_centroid_mean'] = np.mean(spectral_centroid)
-    features['spectral_centroid_std'] = np.std(spectral_centroid)
-    zcr = librosa.feature.zero_crossing_rate(y)
-    features['zcr_mean'] = np.mean(zcr)
-    features['zcr_std'] = np.std(zcr)
-    rms = librosa.feature.rms(y=y)
-    features['rms_mean'] = np.mean(rms)
-    features['rms_std'] = np.std(rms)
-    spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
-    features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
-    features['spectral_rolloff_std'] = np.std(spectral_rolloff)
-    duration = librosa.get_duration(y=y, sr=sr)
-    voiced_frames = librosa.effects.split(y, top_db=20)
-    speaking_rate = len(voiced_frames) / duration if duration > 0 else 0
-    features['speaking_rate'] = speaking_rate
-    scaler = StandardScaler()
-    features_array = np.array(list(features.values())).reshape(1, -1)
-    features_scaled = scaler.fit_transform(features_array)
-    return torch.tensor(features_scaled, dtype=torch.float32)
-def classify_audio(audio):
-    features = extract_audio_features(audio)
-    logits = model(features).logits
-    prediction = torch.argmax(logits, dim=1).item()
-    return "Hate Speech" if prediction == 1 else "Non-Hate Speech"
-interface = gr.Interface(
-    fn=classify_audio,
-    inputs=gr.Audio(source="upload", type="filepath"),
-    outputs="text",
-    title="Audio Hate Speech Classifier",
-    description="Upload a .wav audio file to determine if it contains hate speech."
 )
 if __name__ == "__main__":

 import librosa
 import numpy as np
 from sklearn.preprocessing import StandardScaler
+import joblib
+import parselmouth
+from parselmouth.praat import call
+from transformers import HubertForSequenceClassification
+import torch.nn as nn
+class HuBERTHateSpeechClassifier(nn.Module):
+    """Feed-forward classifier over a fixed-length acoustic feature vector.
+
+    A pretrained HuBERT sequence-classification model is also instantiated
+    and registered as ``self.hubert``, but ``forward`` never calls it; only
+    the ``classifier`` stack produces the output.
+    """
+
+    def __init__(self, input_dim, num_classes):
+        """Create the HuBERT submodule and the three-layer MLP head.
+
+        Args:
+            input_dim: Width of the feature vector passed to ``forward``.
+            num_classes: Number of values produced per input row.
+        """
+        super().__init__()
+        backbone_id = "facebook/hubert-base-ls960"
+        self.hubert = HubertForSequenceClassification.from_pretrained(backbone_id)
+        # The position of each layer fixes its index inside the Sequential,
+        # so the order below must not change (indices 0, 3 and 6 hold weights).
+        head_layers = [
+            nn.Linear(input_dim, 128),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(128, 64),
+            nn.ReLU(),
+            nn.Dropout(0.3),
+            nn.Linear(64, num_classes),
+        ]
+        self.classifier = nn.Sequential(*head_layers)
+
+    def forward(self, x):
+        """Return the classifier head's output for feature tensor ``x``."""
+        logits = self.classifier(x)
+        return logits
+class AudioFeatureExtractor:
+    """Turn an audio file into a scaled 13-value acoustic feature vector.
+
+    The vector holds, in order: pitch mean/std, spectral-centroid mean/std,
+    zero-crossing-rate mean/std, RMS mean/std, spectral-rolloff mean/std,
+    speaking rate, and harmonics-to-noise-ratio mean/std. It is passed
+    through the scaler loaded in ``__init__`` before being returned.
+    """
+
+    # Length of the feature vector; also the size of the all-zero fallback.
+    NUM_FEATURES = 13
+
+    def __init__(self, scaler_path='scaler.joblib'):
+        """Load the pre-fitted scaler from ``scaler_path`` with joblib."""
+        self.scaler = joblib.load(scaler_path)
+
+    def safe_mean(self, arr):
+        """Return the mean of the finite entries of ``arr``; 0.0 if none or on error."""
+        try:
+            arr = np.array(arr).flatten()
+            arr = arr[np.isfinite(arr)]
+            return float(np.mean(arr)) if len(arr) > 0 else 0.0
+        except Exception:
+            return 0.0
+
+    def safe_std(self, arr):
+        """Return the std of the finite entries of ``arr``; 0.0 if fewer than two or on error."""
+        try:
+            arr = np.array(arr).flatten()
+            arr = arr[np.isfinite(arr)]
+            return float(np.std(arr)) if len(arr) > 1 else 0.0
+        except Exception:
+            return 0.0
+
+    def _extract_hnr(self, audio_path):
+        """Return ``(hnr_mean, hnr_std)`` via Praat, or ``(0, 0)`` if that fails."""
+        try:
+            sound = parselmouth.Sound(audio_path)
+            pitch = call(sound, "To Pitch", 0.0, 75, 600)
+            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
+            # Sample harmonicity at the pitch track's time points and drop
+            # the NaN results.
+            hnr_values = []
+            for time in pitch.ts():
+                value = call(harmonicity, "Get value at time", time, "Linear")
+                if not np.isnan(value):
+                    hnr_values.append(value)
+            hnr_mean = sum(hnr_values) / len(hnr_values) if hnr_values else 0
+            hnr_std = np.std(hnr_values) if len(hnr_values) > 1 else 0
+            return hnr_mean, hnr_std
+        except Exception as e:
+            # Best effort: a failed HNR computation must not discard the
+            # other features.
+            print(f"Error calculating HNR: {e}")
+            return 0, 0
+
+    def extract_features(self, audio_path):
+        """Compute the scaled feature vector for the file at ``audio_path``.
+
+        Args:
+            audio_path: Path of the audio file to analyse.
+
+        Returns:
+            A 1-D array of ``NUM_FEATURES`` scaled values. If the file cannot
+            be loaded, is empty, or feature extraction fails, an all-zero
+            (unscaled) array of the same length is returned instead.
+        """
+        try:
+            # Only the first 5 seconds are loaded for the librosa features.
+            y, sr = librosa.load(audio_path, duration=5)
+        except Exception as e:
+            print(f"Error loading audio file: {e}")
+            return np.zeros(self.NUM_FEATURES)
+        if len(y) == 0:
+            return np.zeros(self.NUM_FEATURES)
+        try:
+            pitches, _ = librosa.piptrack(y=y, sr=sr)
+            pitches = pitches[pitches > 0]
+            pitch_mean = np.mean(pitches) if len(pitches) > 0 else 0
+            pitch_std = np.std(pitches) if len(pitches) > 0 else 0
+
+            # Each *_std below is a true standard deviation; previously these
+            # were computed with np.mean and simply duplicated the mean.
+            # NOTE(review): confirm the saved scaler and classifier were
+            # fitted on features computed the same way.
+            spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
+            spectral_centroid_mean = np.mean(spectral_centroid)
+            spectral_centroid_std = np.std(spectral_centroid)
+
+            zcr = librosa.feature.zero_crossing_rate(y)
+            zcr_mean = np.mean(zcr)
+            zcr_std = np.std(zcr)
+
+            rms = librosa.feature.rms(y=y)
+            rms_mean = np.mean(rms)
+            rms_std = np.std(rms)
+
+            spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
+            spectral_rolloff_mean = np.mean(spectral_rolloff)
+            spectral_rolloff_std = np.std(spectral_rolloff)
+
+            # Speaking rate = number of intervals returned by
+            # librosa.effects.split per second of loaded audio.
+            duration = librosa.get_duration(y=y, sr=sr)
+            voiced_frames = librosa.effects.split(y, top_db=20)
+            speaking_rate = len(voiced_frames) / duration if duration > 0 else 0
+
+            # NOTE(review): HNR is computed from the file as opened by
+            # parselmouth, with no 5-second limit applied here.
+            hnr_mean, hnr_std = self._extract_hnr(audio_path)
+
+            feature_vector = np.array([
+                pitch_mean, pitch_std,
+                spectral_centroid_mean, spectral_centroid_std,
+                zcr_mean, zcr_std,
+                rms_mean, rms_std,
+                spectral_rolloff_mean, spectral_rolloff_std,
+                speaking_rate,
+                hnr_mean, hnr_std
+            ])
+            scaled_features = self.scaler.transform(feature_vector.reshape(1, -1))[0]
+            return scaled_features
+        except Exception as e:
+            print(f"Error extracting features: {e}")
+            return np.zeros(self.NUM_FEATURES)
+# Filled on first use so the checkpoint, the HuBERT weights and the scaler
+# are loaded once per process instead of once per request.
+_PIPELINE_CACHE = {}
+
+
+def _load_pipeline():
+    """Return ``(model, feature_extractor)``, building both on first use."""
+    if not _PIPELINE_CACHE:
+        state_dict = torch.load(
+            "hate_speech_hubert_audio_classifier.pth",
+            map_location=torch.device('cpu'),
+        )
+        model = HuBERTHateSpeechClassifier(13, 2)
+        model.load_state_dict(state_dict)
+        # Switch to inference mode: torch.no_grad() alone leaves the Dropout
+        # layers active, so the same clip could score differently per call.
+        model.eval()
+        feature_extractor = AudioFeatureExtractor()
+        # Store both together so a failure above never leaves a half-filled
+        # cache behind.
+        _PIPELINE_CACHE.update(model=model, feature_extractor=feature_extractor)
+    return _PIPELINE_CACHE["model"], _PIPELINE_CACHE["feature_extractor"]
+
+
+def predict_hate_speech(audio_path):
+    """Classify the audio file at ``audio_path``.
+
+    Args:
+        audio_path: Path of the audio file to analyse.
+
+    Returns:
+        A dict with ``'Classification'`` (``'Hate Speech'`` when class 1 is
+        predicted, otherwise ``'Non-Hate Speech'``) and ``'Confidence'``
+        (the predicted class's softmax probability as a percentage string).
+    """
+    model, feature_extractor = _load_pipeline()
+    features = feature_extractor.extract_features(audio_path)
+    # Add a leading batch dimension of 1 for the classifier.
+    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)
+    with torch.no_grad():
+        outputs = model(input_tensor)
+        probabilities = torch.softmax(outputs, dim=1)
+        predicted_class = torch.argmax(probabilities, dim=1).item()
+        confidence = probabilities[0][predicted_class].item()
+    return {
+        # The stray trailing newline on the positive label has been removed.
+        'Classification': 'Hate Speech' if predicted_class == 1 else 'Non-Hate Speech',
+        'Confidence': f"{confidence:.2%}",
+    }
+# Gradio UI definition: predict_hate_speech is the handler, the input is an
+# audio component configured with type="filepath", and the handler's return
+# value is shown in a single text box.
+# NOTE(review): `gr` is not imported in the visible hunk; presumably
+# `import gradio as gr` sits above it. Confirm.
+iface = gr.Interface(
+    fn=predict_hate_speech,
+    inputs=gr.Audio(type="filepath", label="Upload Audio"),
+    outputs=gr.Textbox(label="Hate Speech Analysis"),
+    title="Hate Speech Audio Classifier",
+    description="Upload an audio file to detect potential hate speech content.",
+    # NOTE(review): presumably a clip stored next to app.py; confirm the
+    # file exists in the repository.
+    examples=[
+        ["hate_video_3_3_snippet2.wav"]
+    ],
+    allow_flagging="manual"
 )
 if __name__ == "__main__":