Tirath5504 committed on
Commit
54efbdc
·
verified ·
1 Parent(s): 382deb3

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -49
app.py CHANGED
@@ -3,58 +3,161 @@ import torch
3
  import librosa
4
  import numpy as np
5
  from sklearn.preprocessing import StandardScaler
6
- from transformers import AutoModelForSequenceClassification, AutoProcessor
 
 
 
 
7
 
8
- model = AutoModelForSequenceClassification.from_pretrained("Tirath5504/IPD_Audio_HuBERT")
9
 
10
- def extract_audio_features(audio_path):
11
- y, sr = librosa.load(audio_path, sr=None)
12
- features = {}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
 
14
- pitches, magnitudes = librosa.piptrack(y=y, sr=sr)
15
- pitches = pitches[pitches > 0]
16
- features['pitch_mean'] = np.mean(pitches) if len(pitches) > 0 else 0
17
- features['pitch_std'] = np.std(pitches) if len(pitches) > 0 else 0
18
-
19
- spectral_centroid = librosa.feature.spectral_centroid(y=y, sr=sr)
20
- features['spectral_centroid_mean'] = np.mean(spectral_centroid)
21
- features['spectral_centroid_std'] = np.std(spectral_centroid)
22
-
23
- zcr = librosa.feature.zero_crossing_rate(y)
24
- features['zcr_mean'] = np.mean(zcr)
25
- features['zcr_std'] = np.std(zcr)
26
-
27
- rms = librosa.feature.rms(y=y)
28
- features['rms_mean'] = np.mean(rms)
29
- features['rms_std'] = np.std(rms)
30
-
31
- spectral_rolloff = librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
32
- features['spectral_rolloff_mean'] = np.mean(spectral_rolloff)
33
- features['spectral_rolloff_std'] = np.std(spectral_rolloff)
34
-
35
- duration = librosa.get_duration(y=y, sr=sr)
36
- voiced_frames = librosa.effects.split(y, top_db=20)
37
- speaking_rate = len(voiced_frames) / duration if duration > 0 else 0
38
- features['speaking_rate'] = speaking_rate
39
-
40
- scaler = StandardScaler()
41
- features_array = np.array(list(features.values())).reshape(1, -1)
42
- features_scaled = scaler.fit_transform(features_array)
43
-
44
- return torch.tensor(features_scaled, dtype=torch.float32)
45
-
46
- def classify_audio(audio):
47
- features = extract_audio_features(audio)
48
- logits = model(features).logits
49
- prediction = torch.argmax(logits, dim=1).item()
50
- return "Hate Speech" if prediction == 1 else "Non-Hate Speech"
51
-
52
- interface = gr.Interface(
53
- fn=classify_audio,
54
- inputs=gr.Audio(source="upload", type="filepath"),
55
- outputs="text",
56
- title="Audio Hate Speech Classifier",
57
- description="Upload a .wav audio file to determine if it contains hate speech."
58
  )
59
 
60
  if __name__ == "__main__":
 
3
  import librosa
4
  import numpy as np
5
  from sklearn.preprocessing import StandardScaler
6
+ import joblib
7
+ import parselmouth
8
+ from parselmouth.praat import call
9
+ from transformers import HubertForSequenceClassification
10
+ import torch.nn as nn
11
 
 
12
 
13
class HuBERTHateSpeechClassifier(nn.Module):
    """MLP classifier over fixed-length audio feature vectors.

    A pretrained HuBERT model is instantiated and stored on ``self.hubert``,
    but ``forward`` never calls it: the output is produced solely by the
    fully connected head in ``self.classifier``.

    NOTE(review): ``self.hubert`` is presumably retained so that saved
    checkpoints (which would contain ``hubert.*`` keys) keep loading --
    confirm before removing it.

    Args:
        input_dim: Number of features in each input vector.
        num_classes: Number of output logits.
    """

    def __init__(self, input_dim, num_classes):
        super().__init__()
        self.hubert = HubertForSequenceClassification.from_pretrained(
            "facebook/hubert-base-ls960"
        )

        # The order of this list fixes the Sequential indices (0..6) and
        # therefore the parameter names of the head.
        head_layers = [
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the classifier head's logits for the feature batch ``x``."""
        logits = self.classifier(x)
        return logits
32
+
33
+
34
class AudioFeatureExtractor:
    """Turn an audio file into a scaled 13-element acoustic feature vector.

    Feature order (must match the order the scaler was fitted with):
    pitch mean/std, spectral-centroid mean/std, zero-crossing-rate mean/std,
    RMS mean/std, spectral-rolloff mean/std, speaking rate, HNR mean/std.

    NOTE(review): earlier revisions of this class filled the centroid, ZCR,
    RMS and rolloff ``*_std`` slots with ``np.mean``. They now hold real
    standard deviations. If ``scaler.joblib`` (or the classifier) was fitted
    on vectors produced by the buggy code, refit it -- confirm against the
    training pipeline.
    """

    # Length of the vector returned by ``extract_features``.
    NUM_FEATURES = 13

    def __init__(self, scaler_path='scaler.joblib'):
        """Load the pre-fitted scaler from ``scaler_path`` via joblib."""
        self.scaler = joblib.load(scaler_path)

    def safe_mean(self, arr):
        """Return the mean of the finite entries of ``arr`` (0.0 if none)."""
        try:
            values = np.array(arr).flatten()
            values = values[np.isfinite(values)]
        except (TypeError, ValueError):
            # Non-numeric or ragged input: fall back to a neutral value.
            return 0.0
        return float(np.mean(values)) if len(values) > 0 else 0.0

    def safe_std(self, arr):
        """Return the std of the finite entries of ``arr`` (0.0 if < 2)."""
        try:
            values = np.array(arr).flatten()
            values = values[np.isfinite(values)]
        except (TypeError, ValueError):
            # Non-numeric or ragged input: fall back to a neutral value.
            return 0.0
        return float(np.std(values)) if len(values) > 1 else 0.0

    def _mean_and_std(self, arr):
        """Return ``(safe_mean(arr), safe_std(arr))`` as a tuple."""
        return self.safe_mean(arr), self.safe_std(arr)

    def _harmonicity_stats(self, audio_path):
        """Return (mean, std) of the Praat harmonicity sampled at pitch frames.

        Any failure is reported on stdout and yields ``(0, 0)`` so the other
        features can still be used.
        """
        try:
            sound = parselmouth.Sound(audio_path)
            pitch = call(sound, "To Pitch", 0.0, 75, 600)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            sampled = (
                call(harmonicity, "Get value at time", time, "Linear")
                for time in pitch.ts()
            )
            hnr_values = [value for value in sampled if not np.isnan(value)]
        except Exception as e:  # best effort: Praat errors must not abort extraction
            print(f"Error calculating HNR: {e}")
            return 0, 0
        return self._mean_and_std(hnr_values)

    def extract_features(self, audio_path):
        """Extract and scale the feature vector for ``audio_path``.

        Only the first 5 seconds are loaded through librosa; the harmonicity
        features are computed by Praat from the file itself.

        Returns:
            A 1-D array of ``NUM_FEATURES`` scaled values, or an all-zero
            array of the same length when the file cannot be loaded, is
            empty, or feature extraction fails.
        """
        try:
            y, sr = librosa.load(audio_path, duration=5)
        except Exception as e:  # decoder backends raise many unrelated types
            print(f"Error loading audio file: {e}")
            return np.zeros(self.NUM_FEATURES)

        if len(y) == 0:
            return np.zeros(self.NUM_FEATURES)

        try:
            pitches, _ = librosa.piptrack(y=y, sr=sr)
            # Keep only the strictly positive pitch estimates.
            pitch_mean, pitch_std = self._mean_and_std(pitches[pitches > 0])

            spectral_centroid_mean, spectral_centroid_std = self._mean_and_std(
                librosa.feature.spectral_centroid(y=y, sr=sr)
            )
            zcr_mean, zcr_std = self._mean_and_std(
                librosa.feature.zero_crossing_rate(y)
            )
            rms_mean, rms_std = self._mean_and_std(librosa.feature.rms(y=y))
            spectral_rolloff_mean, spectral_rolloff_std = self._mean_and_std(
                librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
            )

            # Speaking rate: intervals found by effects.split per second.
            duration = librosa.get_duration(y=y, sr=sr)
            voiced_intervals = librosa.effects.split(y, top_db=20)
            speaking_rate = len(voiced_intervals) / duration if duration > 0 else 0

            hnr_mean, hnr_std = self._harmonicity_stats(audio_path)

            feature_vector = np.array([
                pitch_mean, pitch_std,
                spectral_centroid_mean, spectral_centroid_std,
                zcr_mean, zcr_std,
                rms_mean, rms_std,
                spectral_rolloff_mean, spectral_rolloff_std,
                speaking_rate,
                hnr_mean, hnr_std,
            ])

            # The scaler expects a 2-D (samples, features) array.
            return self.scaler.transform(feature_vector.reshape(1, -1))[0]

        except Exception as e:  # deliberate best effort: never crash the UI
            print(f"Error extracting features: {e}")
            return np.zeros(self.NUM_FEATURES)
126
+
127
+
128
# Lazily populated with the loaded model and feature extractor so that the
# checkpoint, the HuBERT backbone and the scaler are read once per process
# instead of once per request.
_PREDICTOR_CACHE = {}


def _get_predictor():
    """Return ``(model, feature_extractor)``, building them on first use."""
    if not _PREDICTOR_CACHE:
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        state_dict = torch.load(
            "hate_speech_hubert_audio_classifier.pth",
            map_location=torch.device('cpu'),
        )
        model = HuBERTHateSpeechClassifier(13, 2)
        model.load_state_dict(state_dict)
        # Inference mode: without this the Dropout layers stay active and
        # the same audio can yield different predictions on each call.
        model.eval()
        extractor = AudioFeatureExtractor()
        # Publish both entries together so a failure above leaves the cache
        # empty rather than half-filled.
        _PREDICTOR_CACHE.update(model=model, extractor=extractor)
    return _PREDICTOR_CACHE['model'], _PREDICTOR_CACHE['extractor']


def _summarize_logits(logits):
    """Convert a ``(1, 2)`` logits tensor into the result dictionary."""
    probabilities = torch.softmax(logits, dim=1)
    predicted_class = torch.argmax(probabilities, dim=1).item()
    confidence = probabilities[0][predicted_class].item()
    return {
        'Classification': 'Hate Speech' if predicted_class == 1 else 'Non-Hate Speech',
        'Confidence': f"{confidence:.2%}",
    }


def predict_hate_speech(audio_path):
    """Classify the audio file at ``audio_path``.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A dict with ``'Classification'`` (``'Hate Speech'`` when class 1 is
        predicted, otherwise ``'Non-Hate Speech'``) and ``'Confidence'``
        (the winning softmax probability formatted as a percentage).
    """
    model, feature_extractor = _get_predictor()
    features = feature_extractor.extract_features(audio_path)

    # Add the batch dimension expected by the classifier head.
    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        outputs = model(input_tensor)

    return _summarize_logits(outputs)
150
+
151
# Gradio front end: a single audio upload in, the prediction rendered as text.
iface = gr.Interface(
    title="Hate Speech Audio Classifier",
    description="Upload an audio file to detect potential hate speech content.",
    fn=predict_hate_speech,
    # type="filepath" matches the handler, which takes a path argument.
    inputs=gr.Audio(type="filepath", label="Upload Audio"),
    outputs=gr.Textbox(label="Hate Speech Analysis"),
    # One bundled example clip; each inner list is one set of inputs.
    examples=[["hate_video_3_3_snippet2.wav"]],
    allow_flagging="manual",
)
162
 
163
  if __name__ == "__main__":