"""Train a small CNN on MFCC features and classify audio in sliding windows.

NOTE: the training section below runs at import time (it is not behind the
``__main__`` guard), so importing this module loads the dataset, trains the
model and writes ``sound_classification_model.h5`` / ``label_encoder.npy``.
"""

import os
import re

import librosa
import numpy as np
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

# Constants
SAMPLE_RATE = 44100  # Sample rate audio is resampled to on load
NUM_MFCC = 13        # Number of MFCC coefficients per frame
MAX_LEN = 1000       # Fixed number of MFCC frames fed to the model
WINDOW_SIZE = 1      # Window size in seconds
HOP_SIZE = 1         # Hop between window starts in seconds (== window: no overlap)


def _pad_or_truncate(mfccs, max_len=MAX_LEN):
    """Return *mfccs* with exactly *max_len* columns (frames).

    Shorter inputs are zero-padded on the right along axis 1; longer inputs
    are cut to the first *max_len* frames.
    """
    n_frames = mfccs.shape[1]
    if n_frames < max_len:
        return np.pad(mfccs, ((0, 0), (0, max_len - n_frames)), mode='constant')
    return mfccs[:, :max_len]


# Function to extract MFCC features
def extract_features(file_path):
    """Load *file_path* and return its MFCCs as a (NUM_MFCC, MAX_LEN) array."""
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    mfccs = librosa.feature.mfcc(y=y, sr=sr, n_mfcc=NUM_MFCC)
    return _pad_or_truncate(mfccs)


# Load dataset
def load_data(dataset_path):
    """Build feature/label arrays from the ``.wav`` files in *dataset_path*.

    The label of a file is its name without the ``.wav`` extension and
    without an optional trailing `` <digits>`` suffix
    (e.g. ``"Debris Wood 02.wav"`` -> ``"Debris Wood"``).

    Returns:
        A ``(features, labels)`` pair of NumPy arrays.

    Raises:
        ValueError: if no matching ``.wav`` file is found.
    """
    features = []
    labels = []

    # Regex pattern to extract class name from filename
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    # os.listdir() order is arbitrary; sorting keeps the sample order (and
    # therefore the seeded train/test split) reproducible across machines.
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue
        match = pattern.match(file_name)
        if not match:
            continue
        file_path = os.path.join(dataset_path, file_name)
        features.append(extract_features(file_path))
        labels.append(match.group(1))  # Class name without number

    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")

    return np.array(features), np.array(labels)


# Function to classify audio in sliding windows with overlapping handling
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Classify *file_path* window by window.

    The signal is cut into windows of *window_size* seconds whose starts are
    *hop_size* seconds apart. A window that overlaps the previously reported
    window is skipped, so reported windows never overlap each other.

    NOTE(review): training features come from whole files padded/truncated
    to MAX_LEN frames, while a window here is only *window_size* seconds of
    audio padded to MAX_LEN -- confirm this mismatch is intended.

    Args:
        file_path: path of the audio file to analyse.
        model: object whose ``predict`` returns per-class scores.
        label_encoder: fitted encoder used to map class indices to labels.
        window_size: window length in seconds.
        hop_size: distance between consecutive window starts in seconds.

    Returns:
        List of ``(start_time_seconds, class_label)`` tuples; empty when the
        audio is shorter than one window.

    Raises:
        ValueError: if the window or hop is shorter than one sample.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)
    if window_samples <= 0 or hop_samples <= 0:
        raise ValueError(
            "window_size and hop_size must each span at least one sample "
            f"(got window_size={window_size}, hop_size={hop_size})"
        )

    results = []
    # Windows are visited in increasing start order and all share one length,
    # so a window overlaps *some* earlier detection exactly when it starts
    # before the end of the most recent one. Checking this first also avoids
    # running MFCC extraction and inference on windows that would be dropped.
    last_detected_end = 0

    for start in range(0, len(y) - window_samples + 1, hop_samples):
        if start < last_detected_end:
            continue
        end = start + window_samples

        mfccs = librosa.feature.mfcc(y=y[start:end], sr=sr, n_mfcc=NUM_MFCC)
        mfccs = _pad_or_truncate(mfccs)

        # Add batch and channel axes: (1, NUM_MFCC, MAX_LEN, 1)
        batch = mfccs[np.newaxis, ..., np.newaxis]
        prediction = model.predict(batch)
        predicted_class = np.argmax(prediction, axis=1)
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        results.append((start / sr, class_label))
        last_detected_end = end

    return results


def load_model_and_encoder(model_path, label_encoder_path):
    """Load the saved Keras model and rebuild a LabelEncoder from saved classes."""
    model = tf.keras.models.load_model(model_path)
    # SECURITY: allow_pickle=True can execute arbitrary code when the file is
    # untrusted. Only load label files produced by this script.
    classes = np.load(label_encoder_path, allow_pickle=True)
    label_encoder = LabelEncoder()
    label_encoder.classes_ = classes
    return model, label_encoder


# Load data
dataset_path = 'dataset'
X, y = load_data(dataset_path)

# Encode labels
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# Save LabelEncoder
np.save('label_encoder.npy', label_encoder.classes_)

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y_categorical, test_size=0.2, random_state=42)

# Build model
model = Sequential([
    tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)),
    Conv2D(32, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Conv2D(64, kernel_size=(3, 3), activation='relu'),
    MaxPooling2D(pool_size=(2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

# Adjust learning rate if necessary
learning_rate = 0.0001  # Adjust as necessary
optimizer = Adam(learning_rate=learning_rate)

model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])

# Reshape data for the model (add the trailing channel axis)
X_train = np.expand_dims(X_train, axis=-1)
X_test = np.expand_dims(X_test, axis=-1)

# Train model
history = model.fit(X_train, y_train, epochs=20, validation_data=(X_test, y_test))

# Save model
model.save('sound_classification_model.h5')

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")


# Example usage
if __name__ == "__main__":
    model_path = 'sound_classification_model.h5'
    label_encoder_path = 'label_encoder.npy'
    audio_path = 'dataset/Debris Wood 02.wav'

    # Load model and label encoder
    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)
    sound_identifications = classify_audio(audio_path, model, label_encoder)

    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')