Spaces:

dj-dawgs-ipd
/

IPD-Audio-Model

Build error

App Files Files Community

IPD-Audio-Model / app.py

siddhantuniyal

Update app.py

2158a6c verified 12 months ago

raw

history blame contribute delete

5.89 kB

	import gradio as gr
	import torch
	import librosa
	import numpy as np
	from sklearn.preprocessing import StandardScaler
	import joblib
	import parselmouth
	from parselmouth.praat import call
	from transformers import HubertForSequenceClassification
	import torch.nn as nn


	class HuBERTHateSpeechClassifier(nn.Module):
	    """Small feed-forward classifier applied to a fixed-size feature vector.

	    A pretrained HuBERT sequence-classification model is instantiated and
	    stored on ``self.hubert``, but ``forward`` never calls it: only the
	    ``classifier`` stack processes the input.

	    NOTE(review): presumably the backbone is kept so that saved checkpoints
	    still match this module's parameter names -- confirm before removing.
	    """

	    def __init__(self, input_dim, num_classes):
	        """Create the (unused) HuBERT backbone and the dense classifier.

	        Args:
	            input_dim: Width of the feature vector fed to ``forward``.
	            num_classes: Number of output logits.
	        """
	        super().__init__()
	        self.hubert = HubertForSequenceClassification.from_pretrained(
	            "facebook/hubert-base-ls960"
	        )

	        # Two hidden blocks (Linear -> ReLU -> Dropout) followed by the
	        # output projection; the layer order fixes the Sequential indices.
	        layers = []
	        in_features = input_dim
	        for width in (128, 64):
	            layers.append(nn.Linear(in_features, width))
	            layers.append(nn.ReLU())
	            layers.append(nn.Dropout(0.3))
	            in_features = width
	        layers.append(nn.Linear(in_features, num_classes))
	        self.classifier = nn.Sequential(*layers)

	    def forward(self, x):
	        """Return the classifier's raw logits for ``x``."""
	        logits = self.classifier(x)
	        return logits


	class AudioFeatureExtractor:
	    """Convert an audio file into the scaled 13-value feature vector.

	    Feature order: pitch mean/std, spectral-centroid mean/std,
	    zero-crossing-rate mean/std, RMS mean/std, spectral-rolloff mean/std,
	    speaking rate, and harmonicity (HNR) mean/std.
	    """

	    # Length of the vector returned by ``extract_features``.
	    NUM_FEATURES = 13

	    def __init__(self, scaler_path='scaler.joblib'):
	        """Load the fitted scaler from ``scaler_path`` with joblib."""
	        self.scaler = joblib.load(scaler_path)

	    def safe_mean(self, arr):
	        """Return the mean of the finite values in ``arr`` as a float.

	        Returns 0.0 when no finite values remain or when ``arr`` cannot be
	        treated as numeric data.
	        """
	        try:
	            values = np.array(arr).flatten()
	            values = values[np.isfinite(values)]
	            return float(np.mean(values)) if len(values) > 0 else 0.0
	        except (TypeError, ValueError):
	            # Non-numeric or ragged input: best-effort fallback.
	            return 0.0

	    def safe_std(self, arr):
	        """Return the standard deviation of the finite values in ``arr``.

	        Returns 0.0 when fewer than two finite values remain or when
	        ``arr`` cannot be treated as numeric data.
	        """
	        try:
	            values = np.array(arr).flatten()
	            values = values[np.isfinite(values)]
	            return float(np.std(values)) if len(values) > 1 else 0.0
	        except (TypeError, ValueError):
	            # Non-numeric or ragged input: best-effort fallback.
	            return 0.0

	    def _mean_std(self, values):
	        """Return ``(mean, std)`` of ``values`` via the safe helpers."""
	        return self.safe_mean(values), self.safe_std(values)

	    def _hnr_stats(self, audio_path):
	        """Return ``(mean, std)`` of the harmonicity values from Praat.

	        Harmonicity is sampled at every frame time of the pitch track.
	        Any failure is reported on stdout and yields ``(0, 0)``.
	        """
	        try:
	            sound = parselmouth.Sound(audio_path)
	            pitch = call(sound, "To Pitch", 0.0, 75, 600)
	            harmonicity = call(sound, "To Harmonicity (cc)", 0.01, 75, 0.1, 1.0)
	            samples = (
	                call(harmonicity, "Get value at time", t, "Linear")
	                for t in pitch.ts()
	            )
	            hnr_values = [value for value in samples if not np.isnan(value)]
	        except Exception as e:
	            # Deliberate best-effort: HNR is optional, keep the other features.
	            print(f"Error calculating HNR: {e}")
	            return 0, 0
	        return self._mean_std(hnr_values)

	    def extract_features(self, audio_path):
	        """Extract and scale the acoustic features of ``audio_path``.

	        Only the first 5 seconds are loaded through librosa; the HNR part
	        opens the file separately through parselmouth.

	        Args:
	            audio_path: Path of the audio file to analyse.

	        Returns:
	            A 1-D array of ``NUM_FEATURES`` scaled values, or an all-zero
	            array of that length when loading or extraction fails.

	        NOTE(review): every ``*_std`` feature is now a real standard
	        deviation (previously four of them repeated the mean). This assumes
	        the scaler and classifier were fitted on correctly computed
	        features -- confirm against the training pipeline.
	        """
	        try:
	            y, sr = librosa.load(audio_path, duration=5)
	        except Exception as e:
	            print(f"Error loading audio file: {e}")
	            return np.zeros(self.NUM_FEATURES)

	        if len(y) == 0:
	            return np.zeros(self.NUM_FEATURES)

	        try:
	            pitches, _ = librosa.piptrack(y=y, sr=sr)
	            # Only strictly positive bins of the pitch matrix are kept.
	            pitch_mean, pitch_std = self._mean_std(pitches[pitches > 0])

	            spectral_centroid_mean, spectral_centroid_std = self._mean_std(
	                librosa.feature.spectral_centroid(y=y, sr=sr)
	            )
	            zcr_mean, zcr_std = self._mean_std(
	                librosa.feature.zero_crossing_rate(y)
	            )
	            rms_mean, rms_std = self._mean_std(librosa.feature.rms(y=y))
	            spectral_rolloff_mean, spectral_rolloff_std = self._mean_std(
	                librosa.feature.spectral_rolloff(y=y, sr=sr, roll_percent=0.85)
	            )

	            # Speaking rate = number of non-silent intervals per second.
	            duration = librosa.get_duration(y=y, sr=sr)
	            voiced_intervals = librosa.effects.split(y, top_db=20)
	            speaking_rate = len(voiced_intervals) / duration if duration > 0 else 0

	            hnr_mean, hnr_std = self._hnr_stats(audio_path)

	            feature_vector = np.array([
	                pitch_mean, pitch_std,
	                spectral_centroid_mean, spectral_centroid_std,
	                zcr_mean, zcr_std,
	                rms_mean, rms_std,
	                spectral_rolloff_mean, spectral_rolloff_std,
	                speaking_rate,
	                hnr_mean, hnr_std,
	            ])

	            return self.scaler.transform(feature_vector.reshape(1, -1))[0]

	        except Exception as e:
	            print(f"Error extracting features: {e}")
	            return np.zeros(self.NUM_FEATURES)


	# Lazily filled holder so the checkpoint, backbone and scaler load only once.
	_PIPELINE_CACHE = {}

	# Minimum probability of class 1 required to report "Hate Speech".
	_HATE_SPEECH_THRESHOLD = 0.6


	def _load_pipeline():
	    """Return the cached ``(model, feature_extractor)`` pair.

	    The pair is built on first use: the checkpoint is loaded on CPU into a
	    13-input / 2-class classifier, and the feature extractor loads its
	    scaler. Later calls reuse the same objects.
	    """
	    if "pipeline" not in _PIPELINE_CACHE:
	        state_dict = torch.load(
	            "hate_speech_hubert_audio_classifier.pth",
	            map_location=torch.device('cpu'),
	        )
	        model = HuBERTHateSpeechClassifier(13, 2)
	        model.load_state_dict(state_dict)
	        # Inference mode: without this the Dropout layers stay active and
	        # the same audio yields different predictions on every call.
	        model.eval()
	        _PIPELINE_CACHE["pipeline"] = (model, AudioFeatureExtractor())
	    return _PIPELINE_CACHE["pipeline"]


	def predict_hate_speech(audio_path):
	    """Classify an audio file as hate speech or non-hate speech.

	    Args:
	        audio_path: Path of the audio file to analyse.

	    Returns:
	        A dict with keys ``"Classification"`` (``"Hate Speech"`` or
	        ``"Non-Hate Speech"``) and ``"Confidence"`` (the softmax
	        probability of the reported label).
	    """
	    model, feature_extractor = _load_pipeline()
	    features = feature_extractor.extract_features(audio_path)

	    input_tensor = torch.tensor(features, dtype=torch.float32).unsqueeze(0)

	    with torch.no_grad():
	        outputs = model(input_tensor)
	        probabilities = torch.softmax(outputs, dim=1)
	        predicted_class = torch.argmax(probabilities, dim=1).item()
	        confidence = probabilities[0][predicted_class].item()

	    if predicted_class == 1 and confidence > _HATE_SPEECH_THRESHOLD:
	        return {
	            "Classification": "Hate Speech",
	            "Confidence": confidence
	        }

	    if predicted_class == 1:
	        # Class 1 won the argmax but stayed under the threshold, so the
	        # label is downgraded; report the probability of the label shown
	        # (non-hate) instead of the hate probability.
	        confidence = 1 - confidence

	    return {
	        "Classification": "Non-Hate Speech",
	        "Confidence": confidence
	    }

	# Widgets: the audio input hands the callback a file path, and the
	# prediction is rendered in a text box.
	_audio_input = gr.Audio(type="filepath", label="Upload Audio")
	_analysis_output = gr.Textbox(label="Hate Speech Analysis")

	# Bundled sample clip offered as a one-click example in the UI.
	_example_clips = [["hate_video_3_3_snippet2.wav"]]

	# Gradio interface wiring the classifier to the widgets above.
	iface = gr.Interface(
	    fn=predict_hate_speech,
	    inputs=_audio_input,
	    outputs=_analysis_output,
	    title="Hate Speech Audio Classifier",
	    description="Upload an audio file to detect potential hate speech content.",
	    examples=_example_clips,
	    allow_flagging="manual",
	)

	# Start the web app only when this file is run as a script.
	if __name__ == "__main__":
	    iface.launch()