# Voice-Emotion / backend / model_manager.py
# Uploaded via huggingface_hub (author: aniruddhakumarpaul, revision f74654d verified).
import os
import numpy as np
import librosa
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping
import pickle
import joblib
from joblib import Parallel, delayed
from .utils import get_label_from_filename
# Parameters
MAX_PAD_LEN = 174  # fixed number of MFCC time frames per sample (pad/crop target)
N_MFCC = 40  # number of MFCC coefficients extracted per frame
DURATION = 3  # seconds of audio loaded from each file
SAMPLE_RATE = 22050  # target sample rate in Hz for all audio
MODEL_PATH = "model.h5"  # saved Keras model artifact (relative to working dir)
ENCODER_PATH = "encoder.pkl"  # pickled sklearn LabelEncoder artifact
def extract_features_static(file_path, duration=DURATION, sample_rate=SAMPLE_RATE, n_mfcc=N_MFCC, max_pad_len=MAX_PAD_LEN):
    """Extract a fixed-size MFCC matrix from a single audio file.

    Defined at module level (rather than as a method) so it can be pickled
    by joblib for parallel feature extraction.

    Returns:
        numpy array of shape (max_pad_len, n_mfcc), or None if the file
        could not be decoded or processed.
    """
    try:
        # Work with a canonical absolute path.
        file_path = os.path.normpath(os.path.abspath(file_path))
        audio = None
        sr = sample_rate
        try:
            # Primary decode path: librosa handles resampling and the
            # duration crop for us.
            audio, sr = librosa.load(file_path, res_type='kaiser_fast', duration=duration, sr=sample_rate)
        except Exception as e_librosa:
            print(f"Librosa load failed for {file_path}: {e_librosa}. Trying soundfile...")
            try:
                # Fallback decode path: raw read via soundfile, then manually
                # mimic librosa.load() (mono channel, resample, crop).
                import soundfile as sf
                audio, file_sr = sf.read(file_path)
                if len(audio.shape) > 1:
                    # Multi-channel audio: keep only the first channel.
                    audio = audio[:, 0]
                if file_sr != sample_rate:
                    audio = librosa.resample(audio, orig_sr=file_sr, target_sr=sample_rate)
                # Crop to at most `duration` seconds of samples.
                max_samples = int(duration * sample_rate)
                if len(audio) > max_samples:
                    audio = audio[:max_samples]
                sr = sample_rate
            except Exception as e_sf:
                print(f"Soundfile fallback also failed for {file_path}: {e_sf}")
                return None
        if audio is None:
            return None
        mfccs = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)
        # Force the time axis to exactly max_pad_len frames: zero-pad short
        # clips, truncate long ones.
        missing = max_pad_len - mfccs.shape[1]
        if missing > 0:
            mfccs = np.pad(mfccs, pad_width=((0, 0), (0, missing)), mode='constant')
        else:
            mfccs = mfccs[:, :max_pad_len]
        # Transpose to (time, coefficients) for the Conv1D model.
        return mfccs.T
    except Exception as e:
        print(f"Error extracting features from {file_path}: {e}")
        return None
def process_file(file):
    """Process one audio file into a (features, label) pair.

    Returns None when the label cannot be derived, feature extraction
    fails, or any unexpected error occurs. Kept at module level so joblib
    can pickle it on Windows.
    """
    try:
        label = get_label_from_filename(file)
        if not label:
            return None
        features = extract_features_static(file)
        if features is None:
            return None
        return (features, label)
    except Exception as e:
        print(f"Error processing {file}: {e}")
        return None
class EmotionClassifier:
    """Speech-emotion classifier built on a Keras Conv1D model.

    Wraps the full lifecycle: loading persisted artifacts (MODEL_PATH /
    ENCODER_PATH), training from a directory of .wav files, and predicting
    the emotion label for a single audio file.
    """

    def __init__(self):
        # model: Keras Sequential once trained or loaded; None until then.
        self.model = None
        # le: maps string emotion labels <-> integer class indices.
        self.le = LabelEncoder()
        # is_loaded: True only after a successful load_artifacts() or train.
        self.is_loaded = False
        # Eagerly try to restore a previously saved model + encoder.
        self.load_artifacts()

    def load_artifacts(self):
        """Load the saved model and label encoder from disk, if both exist.

        Sets self.is_loaded on success; on any failure the instance stays
        in the untrained state and only a message is printed.
        """
        if os.path.exists(MODEL_PATH) and os.path.exists(ENCODER_PATH):
            try:
                self.model = load_model(MODEL_PATH)
                # NOTE(review): pickle.load on a local artifact file; only
                # safe as long as ENCODER_PATH is trusted (written by us).
                with open(ENCODER_PATH, 'rb') as f:
                    self.le = pickle.load(f)
                # Consistency Check: the model's output width must match the
                # number of classes the encoder knows, otherwise predictions
                # can index labels the encoder never saw.
                expected_classes = self.model.output_shape[1]
                actual_classes = len(self.le.classes_)
                if expected_classes != actual_classes:
                    print(f"WARNING: Model expects {expected_classes} classes but encoder has {actual_classes}.")
                    print("This may cause 'unseen labels' errors. Please re-train or ensure files match.")
                self.is_loaded = True
                print("Model and encoder loaded successfully.")
            except Exception as e:
                print(f"Failed to load artifacts: {e}")
        else:
            print("No pre-trained model found. System ready for training.")

    def extract_features(self, file_path):
        """Wrapper for static extraction method."""
        return extract_features_static(file_path)

    def train_model(self, data_path, log_callback=None):
        """Trains the model from scratch using data in data_path.

        Walks data_path for .wav files, extracts MFCC features (using an
        on-disk .npy cache when available), trains a Conv1D network, and
        saves the model + label encoder artifacts.

        Args:
            data_path: directory tree containing labeled .wav files.
            log_callback: optional callable(msg) for progress reporting;
                falls back to print() when None.

        Returns:
            dict with the held-out test "accuracy".

        Raises:
            ValueError: if no .wav files are found or no features could be
                extracted.
        """
        def log(msg):
            # Route progress messages to the caller's callback when given.
            if log_callback:
                log_callback(msg)
            else:
                print(msg)
        # Cache paths: features/labels persisted next to the training data.
        features_cache_path = os.path.join(data_path, "features_cache.npy")
        labels_cache_path = os.path.join(data_path, "labels_cache.npy")
        X = None
        y = None
        # Check cache: skip the (slow) extraction pass when both files load.
        if os.path.exists(features_cache_path) and os.path.exists(labels_cache_path):
            log("Found cached features. Loading from disk...")
            try:
                X = np.load(features_cache_path)
                y = np.load(labels_cache_path)
                log(f"Loaded {len(X)} cached samples.")
            except Exception as e:
                # Corrupt/stale cache: fall through to recompute from audio.
                log(f"Failed to load cache: {e}. Recomputing...")
                X = None
                y = None
        if X is None or y is None:
            files = []
            # Walk through directory collecting every .wav file recursively.
            for root, _, filenames in os.walk(data_path):
                for f in filenames:
                    if f.endswith('.wav'):
                        files.append(os.path.join(root, f))
            if not files:
                log("DEBUG: No .wav files found in os.walk")
                raise ValueError("No .wav files found for training.")
            log(f"Processing {len(files)} files for training utilizing parallel processing...")
            # Run in parallel
            # n_jobs=1 avoids Windows multiprocessing issues
            results = Parallel(n_jobs=1, verbose=5)(delayed(process_file)(f) for f in files)
            # Filter None results (files that failed label/feature extraction).
            valid_results = [r for r in results if r is not None]
            if not valid_results:
                log("CRITICAL: No features extracted successfully!")
                raise ValueError("No features extracted. Check files and labels.")
            log(f"Successfully processed {len(valid_results)}/{len(files)} files.")
            features = [r[0] for r in valid_results]
            labels = [r[1] for r in valid_results]
            X = np.array(features, dtype='float32')
            y = np.array(labels)
            # Save cache so subsequent runs skip extraction.
            log("Saving features to cache...")
            np.save(features_cache_path, X)
            np.save(labels_cache_path, y)
        # Encode labels: fit the encoder here, then one-hot for softmax.
        y_encoded = to_categorical(self.le.fit_transform(y))
        # Split
        X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42, stratify=y) # stratify=y handles class imbalance better
        # Build Model: two Conv1D blocks over the MFCC time axis, then dense head.
        self.model = Sequential([
            Conv1D(128, kernel_size=5, padding="same", activation="relu", input_shape=(X_train.shape[1], X_train.shape[2])),
            MaxPooling1D(pool_size=2),
            BatchNormalization(),
            Conv1D(256, kernel_size=5, padding="same", activation="relu"),
            MaxPooling1D(pool_size=2),
            BatchNormalization(),
            Dropout(0.3),
            Flatten(),
            Dense(256, activation='relu'),
            Dropout(0.4),
            Dense(y_encoded.shape[1], activation='softmax')
        ])
        self.model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.0001), metrics=['accuracy'])
        # Stop early on stalled validation loss; keep the best weights seen.
        early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)
        log("Starting training...")

        class LogCallback(tf.keras.callbacks.Callback):
            # Per-epoch progress relay into the caller's log() closure
            # (fit() itself runs with verbose=0).
            def on_epoch_end(self, epoch, logs=None):
                log(f"Epoch {epoch+1}: loss={logs['loss']:.4f}, acc={logs['accuracy']:.4f}, val_loss={logs['val_loss']:.4f}")
        self.model.fit(X_train, y_train, epochs=50, batch_size=32, validation_data=(X_test, y_test), callbacks=[early_stop, LogCallback()], verbose=0)
        # Save artifacts so a fresh process can load_artifacts() later.
        self.model.save(MODEL_PATH)
        with open(ENCODER_PATH, 'wb') as f:
            pickle.dump(self.le, f)
        self.is_loaded = True
        log("Training complete and model saved.")
        # evaluate() returns [loss, accuracy]; report accuracy only.
        return {"accuracy": self.model.evaluate(X_test, y_test)[1]}

    def predict_emotion(self, file_path):
        """Predict the emotion label for one audio file.

        Returns a dict with "label", "confidence" (float probability of the
        predicted class), and "distribution" ({label: probability}).

        Raises:
            ValueError: if no model is loaded or features can't be extracted.
        """
        if not self.is_loaded:
            raise ValueError("Model not loaded. Train the model first.")
        mfcc = self.extract_features(file_path)
        if mfcc is None:
            raise ValueError("Could not extract features.")
        mfcc = mfcc[np.newaxis, :, :] # Add batch dimension
        prediction = self.model.predict(mfcc, verbose=0)
        # Get all probabilities for the single (batch of one) input.
        probs = prediction[0]
        classes = self.le.classes_
        # Determine predicted index
        predicted_index = np.argmax(prediction)
        # Safety check for unseen labels: guards the model/encoder class-count
        # mismatch warned about in load_artifacts().
        if predicted_index >= len(classes):
            print(f"ERROR: Predicted index {predicted_index} is out of bounds (0-{len(classes)-1}).")
            # Fallback: Get valid index with highest probability
            # We slice probs to match classes length just in case the model output is larger
            valid_probs = probs[:len(classes)]
            predicted_index = np.argmax(valid_probs)
            print(f"Fallback: Using index {predicted_index} ({classes[predicted_index]})")
        predicted_label = self.le.inverse_transform([predicted_index])[0]
        confidence = float(probs[predicted_index])
        # Create distribution dict {label: score}
        # Only zip up to the length of classes to avoid errors
        distribution = {label: float(score) for label, score in zip(classes, probs)}
        return {
            "label": predicted_label,
            "confidence": confidence,
            "distribution": distribution
        }