import os
import numpy as np
import librosa
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import re

# Constants shared by training (extract_features) and inference (classify_audio)
SAMPLE_RATE = 44100  # Sample rate passed as `sr` to librosa.load
NUM_MFCC = 13  # Number of MFCC coefficients requested (passed as `n_mfcc`)
MAX_LEN = 1000  # MFCC matrices are zero-padded or truncated to this many columns
WINDOW_SIZE = 1  # Default classification window length in seconds
HOP_SIZE = 1   # Default step between window starts in seconds; equal to WINDOW_SIZE, so windows do not overlap by default

# Function to extract MFCC features
def extract_features(file_path):
    """Load an audio file and return its MFCC matrix with exactly MAX_LEN columns.

    The file is read with librosa at SAMPLE_RATE and NUM_MFCC coefficients are
    computed. The second axis of the result is then fitted to MAX_LEN: shorter
    matrices are zero-padded on the right, longer ones keep only their first
    MAX_LEN columns.
    """
    signal, rate = librosa.load(file_path, sr=SAMPLE_RATE)
    coeffs = librosa.feature.mfcc(y=signal, sr=rate, n_mfcc=NUM_MFCC)

    n_frames = coeffs.shape[1]
    if n_frames >= MAX_LEN:
        # Long enough already: keep the leading MAX_LEN columns.
        return coeffs[:, :MAX_LEN]

    # Too short: append zero columns until the width is MAX_LEN.
    missing = MAX_LEN - n_frames
    return np.pad(coeffs, ((0, 0), (0, missing)), mode='constant')

# Load dataset
def load_data(dataset_path):
    """Build the feature and label arrays from the ``.wav`` files in a directory.

    The class label comes from the file name: the ``.wav`` extension and an
    optional trailing space-separated number are stripped, so
    ``"Debris Wood 02.wav"`` is labelled ``"Debris Wood"``.

    Files are visited in sorted name order. ``os.listdir`` returns entries in
    arbitrary order, which would otherwise let the sample order (and any
    seeded split performed on the returned arrays) differ between machines.

    Args:
        dataset_path: Directory to scan; subdirectories are not searched.

    Returns:
        A ``(features, labels)`` pair of NumPy arrays with one entry per
        matching file. Each feature is the value returned by
        ``extract_features`` for that file.

    Raises:
        ValueError: If the directory contains no matching ``.wav`` file.
    """
    # Regex pattern to extract class name from filename
    pattern = re.compile(r'^(.*?)(?: \d+)?\.wav$')

    features = []
    labels = []
    for file_name in sorted(os.listdir(dataset_path)):
        if not file_name.endswith('.wav'):
            continue
        match = pattern.match(file_name)
        if match is None:
            continue
        features.append(extract_features(os.path.join(dataset_path, file_name)))
        labels.append(match.group(1))  # class name without the trailing number

    if not features:
        raise ValueError("No data found. Ensure the dataset path is correct and contains .wav files.")

    return np.array(features), np.array(labels)

# ---------------------------------------------------------------------------
# Training pipeline. These statements sit at module level, so they execute
# every time the file is run or imported.
# ---------------------------------------------------------------------------
from tensorflow.keras.optimizers import Adam

# Load data
dataset_path = 'dataset'
X, y = load_data(dataset_path)

# Encode labels: class-name strings -> integer ids -> one-hot rows
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_categorical = tf.keras.utils.to_categorical(y_encoded)

# Save the fitted class names so the encoder can be rebuilt for inference
np.save('label_encoder.npy', label_encoder.classes_)

# Split data, then append the single channel axis declared in the Input layer
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_categorical,
    test_size=0.2,
    random_state=42,
)
X_train = X_train[..., np.newaxis]
X_test = X_test[..., np.newaxis]

# Build model: two conv/pool stages followed by a dense classifier head
model = Sequential([
    tf.keras.layers.Input(shape=(NUM_MFCC, MAX_LEN, 1)),
    Conv2D(32, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Conv2D(64, (3, 3), activation='relu'),
    MaxPooling2D((2, 2)),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(0.5),
    # One output unit per class seen by the label encoder
    Dense(len(label_encoder.classes_), activation='softmax'),
])

# Adjust learning rate if necessary
learning_rate = 1e-4  # Adjust as necessary
optimizer = Adam(learning_rate=learning_rate)
model.compile(
    loss='categorical_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy'],
)

# Train model
# NOTE: the held-out split doubles as validation data here, so the accuracy
# printed below is measured on data that was also monitored during training.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=20,
)

# Save model
model.save('sound_classification_model.h5')

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {accuracy}")

def _fit_frames(mfccs, target_len):
    """Zero-pad or truncate the second axis of ``mfccs`` to ``target_len`` columns."""
    n_frames = mfccs.shape[1]
    if n_frames < target_len:
        return np.pad(mfccs, ((0, 0), (0, target_len - n_frames)), mode='constant')
    return mfccs[:, :target_len]


# Function to classify audio in sliding windows with overlapping handling
def classify_audio(file_path, model, label_encoder, window_size=WINDOW_SIZE, hop_size=HOP_SIZE):
    """Classify an audio file window by window.

    The file is loaded at SAMPLE_RATE and cut into windows of ``window_size``
    seconds whose starts are ``hop_size`` seconds apart. Each reported window
    is converted to MFCCs, fitted to MAX_LEN columns (the same padding rule
    ``extract_features`` applies to whole files) and passed to
    ``model.predict``.

    A window that overlaps an already reported window is skipped, so when
    ``hop_size < window_size`` only non-overlapping windows are reported.
    Trailing audio that does not fill a whole window is not classified; a
    file shorter than one window yields an empty list.

    Args:
        file_path: Path of the audio file to classify.
        model: Object whose ``predict`` accepts a ``(1, NUM_MFCC, MAX_LEN, 1)``
            array and returns per-class scores.
        label_encoder: Object whose ``inverse_transform`` maps predicted class
            indices back to labels.
        window_size: Window length in seconds.
        hop_size: Distance between consecutive window starts in seconds.

    Returns:
        A list of ``(start_time_seconds, class_label)`` tuples in
        chronological order.

    Raises:
        ValueError: If ``window_size`` or ``hop_size`` covers less than one
            sample at the loaded sample rate.
    """
    y, sr = librosa.load(file_path, sr=SAMPLE_RATE)
    window_samples = int(window_size * sr)
    hop_samples = int(hop_size * sr)
    if window_samples < 1 or hop_samples < 1:
        raise ValueError(
            "window_size and hop_size must each span at least one sample "
            f"(got {window_samples} and {hop_samples} samples)."
        )

    results = []
    last_reported_end = 0  # end sample of the most recently reported window

    for start in range(0, len(y) - window_samples + 1, hop_samples):
        # Windows are visited in increasing start order and all share one
        # length, so a window overlaps some reported window exactly when it
        # starts before the end of the latest reported one. Checking first
        # avoids MFCC extraction and prediction for windows that are dropped.
        if start < last_reported_end:
            continue

        end = start + window_samples
        mfccs = librosa.feature.mfcc(y=y[start:end], sr=sr, n_mfcc=NUM_MFCC)
        mfccs = _fit_frames(mfccs, MAX_LEN)

        # Add the batch axis (front) and channel axis (back) the model expects.
        batch = mfccs[np.newaxis, ..., np.newaxis]
        prediction = model.predict(batch)
        predicted_class = np.argmax(prediction, axis=1)
        class_label = label_encoder.inverse_transform(predicted_class)[0]

        results.append((start / sr, class_label))
        last_reported_end = end

    return results

# Example usage
if __name__ == "__main__":
    def load_model_and_encoder(model_path, label_encoder_path):
        """Restore the saved model and rebuild a LabelEncoder from saved classes.

        Returns a ``(model, label_encoder)`` pair. The encoder is not refitted;
        its ``classes_`` attribute is assigned directly from the array stored
        at ``label_encoder_path``.
        """
        restored_model = tf.keras.models.load_model(model_path)
        encoder = LabelEncoder()
        # NOTE(review): allow_pickle=True permits unpickling object arrays;
        # only point this at class files produced by a trusted run.
        encoder.classes_ = np.load(label_encoder_path, allow_pickle=True)
        return restored_model, encoder

    # Artifacts written by the training section, plus a sample file to label
    model_path = 'sound_classification_model.h5'
    label_encoder_path = 'label_encoder.npy'
    audio_path = 'dataset/Debris Wood 02.wav'

    model, label_encoder = load_model_and_encoder(model_path, label_encoder_path)

    # Report one line per classified window
    sound_identifications = classify_audio(audio_path, model, label_encoder)
    for time, label in sound_identifications:
        print(f'[{time:.2f} seconds] Class: {label}')