Spaces:
Build error
Build error
| import gradio as gr | |
| import torch | |
| import librosa | |
| import numpy as np | |
| from sklearn.preprocessing import StandardScaler | |
| import joblib | |
| import parselmouth | |
| from parselmouth.praat import call | |
| from transformers import HubertForSequenceClassification | |
| import torch.nn as nn | |
class HuBERTHateSpeechClassifier(nn.Module):
    """Feed-forward classifier over fixed-size audio feature vectors.

    The module also instantiates the pretrained ``facebook/hubert-base-ls960``
    checkpoint as ``self.hubert``. ``forward`` does not route its input
    through that submodule; only the ``classifier`` stack is applied.
    """

    def __init__(self, input_dim, num_classes):
        """Create the HuBERT submodule and the classification head.

        Args:
            input_dim: Number of values in each input feature vector.
            num_classes: Number of logits produced per input.
        """
        super().__init__()
        self.hubert = HubertForSequenceClassification.from_pretrained(
            "facebook/hubert-base-ls960"
        )

        # The layer order fixes the Sequential indices (Linear layers at
        # positions 0, 3 and 6), which in turn fix the state-dict key names.
        first_width, second_width, drop_rate = 128, 64, 0.3
        head_layers = [
            nn.Linear(input_dim, first_width),
            nn.ReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(first_width, second_width),
            nn.ReLU(),
            nn.Dropout(drop_rate),
            nn.Linear(second_width, num_classes),
        ]
        self.classifier = nn.Sequential(*head_layers)

    def forward(self, x):
        """Return the classification head's output for ``x``."""
        logits = self.classifier(x)
        return logits
class AudioFeatureExtractor:
    """Turn an audio file into a scaled 13-value acoustic feature vector.

    Feature order (must match the order the scaler was fitted with):
    pitch mean/std, spectral-centroid mean/std, zero-crossing-rate mean/std,
    RMS mean/std, spectral-rolloff mean/std, speaking rate, HNR mean/std.
    """

    # Length of the feature vector, also used for the all-zero fallback.
    NUM_FEATURES = 13

    def __init__(self, scaler_path='scaler.joblib'):
        """Load the fitted scaler from ``scaler_path`` with joblib."""
        self.scaler = joblib.load(scaler_path)

    def safe_mean(self, arr):
        """Return the mean of the finite values in ``arr``, or 0.0.

        0.0 is returned when there are no finite values or when ``arr``
        cannot be interpreted as a numeric array.
        """
        try:
            values = np.array(arr).flatten()
            values = values[np.isfinite(values)]
        except (TypeError, ValueError):
            return 0.0
        return float(np.mean(values)) if len(values) > 0 else 0.0

    def safe_std(self, arr):
        """Return the population std of the finite values in ``arr``, or 0.0.

        0.0 is returned when fewer than two finite values remain or when
        ``arr`` cannot be interpreted as a numeric array.
        """
        try:
            values = np.array(arr).flatten()
            values = values[np.isfinite(values)]
        except (TypeError, ValueError):
            return 0.0
        return float(np.std(values)) if len(values) > 1 else 0.0

    def _mean_std(self, values):
        """Return ``(safe_mean(values), safe_std(values))``."""
        return self.safe_mean(values), self.safe_std(values)

    def _harmonicity_stats(self, audio_path):
        """Return (mean, std) of the harmonics-to-noise ratio, or (0, 0).

        Praat reads ``audio_path`` itself here, so this is computed from the
        file rather than from the samples librosa loaded.
        """
        try:
            sound = parselmouth.Sound(audio_path)
            pitch = call(sound, "To Pitch", 0.0, 75, 600)
            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
            # Sample harmonicity at the pitch analysis frame times.
            hnr_values = [
                call(harmonicity, "Get value at time", frame_time, "Linear")
                for frame_time in pitch.ts()
            ]
            # safe_mean / safe_std drop undefined (NaN) samples.
            return self._mean_std(hnr_values)
        except Exception as e:
            # Best effort: HNR is optional, the other features still apply.
            print(f"Error calculating HNR: {e}")
            return 0, 0

    def extract_features(self, audio_path):
        """Extract and scale the feature vector for one audio file.

        Args:
            audio_path: Path of the audio file to analyse. At most the first
                5 seconds are loaded for the librosa-based features.

        Returns:
            A 1-D array of ``NUM_FEATURES`` values transformed by
            ``self.scaler``. An unscaled all-zero array of the same length is
            returned when the file cannot be loaded, is empty, or feature
            extraction fails.
        """
        fallback = np.zeros(self.NUM_FEATURES)

        try:
            y, sr = librosa.load(audio_path, duration=5)
        except Exception as e:
            print(f"Error loading audio file: {e}")
            return fallback

        if len(y) == 0:
            return fallback

        try:
            pitches, _ = librosa.piptrack(y=y, sr=sr)
            # piptrack reports 0 for bins without a detected pitch.
            pitch_mean, pitch_std = self._mean_std(pitches[pitches > 0])

            # NOTE(review): the four *_std features below were previously
            # computed with np.mean (copy-paste slip), which made each std
            # equal to its mean. They are now true standard deviations;
            # confirm the scaler and model were fitted on true stds too.
            spectral_centroid_mean, spectral_centroid_std = self._mean_std(
                librosa.feature.spectral_centroid(y=y, sr=sr)
            )
            zcr_mean, zcr_std = self._mean_std(
                librosa.feature.zero_crossing_rate(y)
            )
            rms_mean, rms_std = self._mean_std(librosa.feature.rms(y=y))
            spectral_rolloff_mean, spectral_rolloff_std = self._mean_std(
                librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
            )

            # Number of non-silent intervals per second of audio.
            duration = librosa.get_duration(y=y, sr=sr)
            voiced_intervals = librosa.effects.split(y, top_db=20)
            speaking_rate = len(voiced_intervals) / duration if duration > 0 else 0

            hnr_mean, hnr_std = self._harmonicity_stats(audio_path)

            feature_vector = np.array([
                pitch_mean, pitch_std,
                spectral_centroid_mean, spectral_centroid_std,
                zcr_mean, zcr_std,
                rms_mean, rms_std,
                spectral_rolloff_mean, spectral_rolloff_std,
                speaking_rate,
                hnr_mean, hnr_std,
            ])
            # The scaler expects a 2-D (samples, features) array.
            return self.scaler.transform(feature_vector.reshape(1, -1))[0]
        except Exception as e:
            print(f"Error extracting features: {e}")
            return fallback
# Minimum probability of the hate-speech class (index 1) required to report
# "Hate Speech".
_HATE_SPEECH_THRESHOLD = 0.6

# Holds the loaded (model, feature_extractor) pair under a single key so the
# checkpoint, HuBERT weights and scaler are read once, not on every request.
_PIPELINE_CACHE = {}


def _get_pipeline():
    """Return the cached ``(model, feature_extractor)`` pair, loading it once."""
    if "pipeline" not in _PIPELINE_CACHE:
        state_dict = torch.load(
            "hate_speech_hubert_audio_classifier.pth",
            map_location=torch.device('cpu'),
        )
        model = HuBERTHateSpeechClassifier(13, 2)
        model.load_state_dict(state_dict)
        # Switch to inference mode: without this the Dropout layers stay
        # active and predictions vary randomly between calls.
        model.eval()
        _PIPELINE_CACHE["pipeline"] = (model, AudioFeatureExtractor())
    return _PIPELINE_CACHE["pipeline"]


def predict_hate_speech(audio_path):
    """Classify an audio file as hate speech or non-hate speech.

    Args:
        audio_path: Path of the audio file to analyse.

    Returns:
        A dict with two keys: "Classification" ("Hate Speech" or
        "Non-Hate Speech") and "Confidence", the model's probability for the
        reported label. "Hate Speech" is reported only when the probability
        of class 1 exceeds 0.6.
    """
    model, feature_extractor = _get_pipeline()

    features = feature_extractor.extract_features(audio_path)
    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

    with torch.no_grad():
        probabilities = torch.softmax(model(input_tensor), dim=1)

    non_hate_probability = probabilities[0][0].item()
    hate_probability = probabilities[0][1].item()

    if hate_probability > _HATE_SPEECH_THRESHOLD:
        return {
            "Classification": "Hate Speech",
            "Confidence": hate_probability
        }

    # Below the threshold the reported label is "Non-Hate Speech", so the
    # confidence must be that label's probability, even when class 1 was
    # narrowly the argmax (probability between 0.5 and 0.6).
    return {
        "Classification": "Non-Hate Speech",
        "Confidence": non_hate_probability
    }
# Gradio UI: one uploaded audio file in, the prediction rendered as text out.
audio_input = gr.Audio(type="filepath", label="Upload Audio")
analysis_output = gr.Textbox(label="Hate Speech Analysis")
example_clips = [
    ["hate_video_3_3_snippet2.wav"]
]

iface = gr.Interface(
    fn=predict_hate_speech,
    inputs=audio_input,
    outputs=analysis_output,
    title="Hate Speech Audio Classifier",
    description="Upload an audio file to detect potential hate speech content.",
    examples=example_clips,
    allow_flagging="manual",
)

if __name__ == "__main__":
    # Start the web app only when this file is run as a script.
    iface.launch()