import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from Exploration.inference import RespiratorySoundAnalysis
import seaborn as sns
import os

# Dataset locations. These are absolute paths on the development machine;
# everything below is derived from them.
BASE_PATH = 'D://github//AmpleHealth//data//Respiratory_Sound_Database//Respiratory_Sound_Database'

# The demographics table is stored in the top-level data folder rather than
# inside the sound database folder.
DEMOGRAPHIC_FILE = os.path.join('D://github//AmpleHealth//data', 'demographic_info.txt')

# Files/folders inside the sound database: the diagnosis CSV and the folder
# holding the audio samples.
AUDIO_PATH = os.path.join(BASE_PATH, 'testsample')
DIAGNOSIS_FILE = os.path.join(BASE_PATH, 'patient_diagnosis.csv')

# Shared analysis helper, constructed from the diagnosis CSV path and the
# audio folder. It is reused further down for audio preprocessing.
analysis = RespiratorySoundAnalysis(DIAGNOSIS_FILE, AUDIO_PATH)


@st.cache_data
def load_data():
    """Run the analysis loading steps and return the resulting tables.

    Calls ``load_diagnosis_data``, ``load_audio_files`` and
    ``analyze_audio_properties`` on the module-level ``analysis`` object, in
    that order, and then reads its ``diagnosis_df`` and ``audio_df``
    attributes.

    Returns:
        A ``(diagnosis_df, audio_df)`` tuple taken from ``analysis``.

    NOTE(review): because this function is wrapped in ``st.cache_data``, a
    cache hit presumably returns the stored result without re-running the
    three calls above, so ``analysis`` itself may not be populated on
    reruns — confirm nothing else relies on that state.
    """
    analysis.load_diagnosis_data()
    analysis.load_audio_files()
    analysis.analyze_audio_properties()

    diagnosis_table = analysis.diagnosis_df
    audio_table = analysis.audio_df
    return diagnosis_table, audio_table


# Module-level tables consumed by the tabs below
diagnosis_df, audio_df = load_data()

# Patient demographics
@st.cache_data
def load_patient_demographics():
    """Read the demographics text file into a DataFrame.

    The file at ``DEMOGRAPHIC_FILE`` is parsed with a single space as the
    field delimiter, and the six column names are supplied explicitly
    rather than taken from the file.

    Returns:
        pandas.DataFrame: one column per name listed in ``column_names``.
    """
    column_names = [
        'Patient number',
        'Age',
        'Sex',
        'Adult BMI (kg/m2)',
        'Child Weight (kg)',
        'Child Height (cm)',
    ]
    return pd.read_csv(DEMOGRAPHIC_FILE, names=column_names, delimiter=' ')


patient_df = load_patient_demographics()

# Page title
st.title("Respiratory Sound Data Explorer")

# One tab per section of the app. The sections below address their tab by
# position in `tabs`, so the order of the labels matters.
tab_labels = [
    "Overview",
    "Explore Data",
    "Patient Demographics",
    "Preprocessing & Audio Effects",
]
tabs = st.tabs(tab_labels)

# Overview Tab: headline statistics and per-disease bar charts built from
# the 'disease' column of diagnosis_df.
with tabs[0]:
    st.header("Dataset Overview")

    # Count patients per disease once; the statistics and both charts reuse it.
    if diagnosis_df is None or diagnosis_df.empty:
        disease_counts = None
    else:
        disease_counts = diagnosis_df['disease'].value_counts()

    if disease_counts is None or disease_counts.empty:
        # idxmax()/idxmin() raise on an empty Series, so show a notice instead
        # of crashing (same approach as the empty check in the Explore Data tab).
        st.warning("No diagnosis data available to display.")
    else:
        # Highlight key statistics
        total_patients = len(diagnosis_df)
        most_common_disease = disease_counts.idxmax()
        least_common_disease = disease_counts.idxmin()

        st.subheader("Key Statistics")
        st.markdown(f"""
        - **Total Patients:** {total_patients}
        - **Most Common Disease:** {most_common_disease} ({disease_counts.max()} patients)
        - **Least Common Disease:** {least_common_disease} ({disease_counts.min()} patients)
        """)

        # Diagnosis Distribution (absolute counts)
        st.subheader("Diagnosis Distribution")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(
            y=disease_counts.index,
            x=disease_counts.values,
            hue=disease_counts.index,
            palette="viridis",
            dodge=False,
            legend=False,
            ax=ax,
        )
        ax.set_title("Disease Distribution", fontsize=16, fontweight='bold')
        ax.set_xlabel("Number of Patients", fontsize=12)
        ax.set_ylabel("Disease", fontsize=12)
        # Annotate each bar with its count, slightly right of the bar end
        for i, v in enumerate(disease_counts.values):
            ax.text(v + 1, i, str(v), color='black', fontsize=10, va='center')
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; the script is
        # re-executed on each interaction, so close it to avoid piling them up.
        plt.close(fig)

        # Proportion of Diseases (percent of all counted diagnoses)
        st.subheader("Disease Proportion")
        disease_proportions = disease_counts / disease_counts.sum() * 100
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.barplot(
            y=disease_proportions.index,
            x=disease_proportions.values,
            hue=disease_proportions.index,
            palette="coolwarm",
            dodge=False,
            legend=False,
            ax=ax,
        )
        ax.set_title("Disease Proportion (%)", fontsize=16, fontweight='bold')
        ax.set_xlabel("Proportion (%)", fontsize=12)
        ax.set_ylabel("Disease", fontsize=12)
        for i, v in enumerate(disease_proportions.values):
            ax.text(v + 0.5, i, f"{v:.1f}%", color='black', fontsize=10, va='center')
        st.pyplot(fig)
        plt.close(fig)

# Explore Data Tab: duration statistics, histogram, outlier listing and a
# duration-range filter over audio_df.
with tabs[1]:
    st.header("Explore Data")

    if audio_df is not None and not audio_df.empty:
        durations = audio_df['duration_sec']
        # Computed once; used for the summary and for both slider bounds.
        max_duration = float(durations.max())
        shortest_name = audio_df.loc[durations.idxmin(), 'file_name']
        longest_name = audio_df.loc[durations.idxmax(), 'file_name']

        # Key Audio Insights
        st.subheader("Key Audio Insights")
        st.markdown(f"""
        - **Total Audio Files:** {len(audio_df)}
        - **Average Duration:** {durations.mean():.2f} seconds
        - **Shortest Audio File:** {shortest_name} ({durations.min():.2f} seconds)
        - **Longest Audio File:** {longest_name} ({durations.max():.2f} seconds)
        """)

        # Duration Distribution
        st.subheader("Audio Duration Distribution")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(durations, bins=20, kde=True, color='skyblue', ax=ax)
        ax.set_title("Audio Duration Distribution", fontsize=16, fontweight='bold')
        ax.set_xlabel("Duration (seconds)", fontsize=12)
        ax.set_ylabel("Frequency", fontsize=12)
        st.pyplot(fig)
        # pyplot keeps every figure alive until it is closed; the script is
        # re-executed on each interaction, so close it to avoid piling them up.
        plt.close(fig)

        # Highlight Outliers
        st.subheader("Audio Duration Outliers")
        threshold_floor = 1.0
        # Keep the slider range valid even when every clip is very short, and
        # keep the 25 s default inside the range when no clip is that long.
        threshold_ceiling = max(max_duration, threshold_floor + 0.5)
        default_threshold = min(25.0, threshold_ceiling)
        outlier_threshold = st.slider(
            "Set Outlier Threshold (seconds):",
            threshold_floor,
            threshold_ceiling,
            default_threshold,
            step=0.5,
        )
        outliers = audio_df[durations > outlier_threshold]
        st.write(outliers if not outliers.empty else "No outliers found above the threshold.")

        # Optional Filtering
        st.subheader("Filter Audio Files by Duration")
        min_range, max_range = st.slider(
            "Select Duration Range (seconds):",
            0.0,
            max_duration,
            (0.0, max_duration),
            step=0.5,
        )
        filtered_files = audio_df[(durations >= min_range) & (durations <= max_range)]
        st.write(f"**Number of Files in Range:** {len(filtered_files)}")
        st.dataframe(filtered_files[['file_name', 'duration_sec']])
    else:
        st.warning("No audio data available to display.")

# Patient Demographics Tab: raw table, missing-value counts, age statistics
# and age/sex charts built from patient_df.
with tabs[2]:
    st.header("Patient Demographics")
    st.subheader("Demographics Data")
    st.dataframe(patient_df)

    st.subheader("Missing Values Information")
    st.write(patient_df.isna().sum())

    st.subheader("Key Statistics")
    ages = patient_df['Age']
    avg_age, min_age, max_age = ages.mean(), ages.min(), ages.max()
    st.markdown(f"- **Average Age:** {avg_age:.1f} years\n- **Youngest Patient:** {min_age} years\n- **Oldest Patient:** {max_age} years")

    # Visualizations
    st.markdown("### Age Distribution")
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.histplot(ages.dropna(), bins=20, kde=True, color='skyblue', ax=ax)
    ax.set_title("Age Distribution", fontsize=16, fontweight='bold')
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; the script is
    # re-executed on each interaction, so close it to avoid piling them up.
    plt.close(fig)

    st.markdown("### Gender Distribution")
    # Count each 'Sex' value once and reuse it for x, y and hue.
    sex_counts = patient_df['Sex'].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        x=sex_counts.index,
        y=sex_counts.values,
        hue=sex_counts.index,
        palette="coolwarm",
        dodge=False,
        # hue only colours the bars; a legend would repeat the x-axis labels
        # (the other bar charts in this app suppress it the same way).
        legend=False,
        ax=ax,
    )
    ax.set_title("Gender Distribution", fontsize=16, fontweight='bold')
    st.pyplot(fig)
    plt.close(fig)


# Preprocessing & Audio Effects Tab: pick a .wav file from AUDIO_PATH, run it
# through analysis.preprocess_audio, plot several views of it, and play it.
with tabs[3]:
    st.header("Preprocessing & Audio Effects")

    # List all .wav files in the AUDIO_PATH directory. Sorted so the select
    # box order is stable (os.listdir order is arbitrary); the extension is
    # matched case-insensitively so ".WAV" files are also offered.
    try:
        wav_files = sorted(
            f for f in os.listdir(AUDIO_PATH) if f.lower().endswith('.wav')
        )
    except OSError as e:
        # Missing or unreadable directory: report it instead of crashing.
        st.error(f"Unable to read audio directory: {e}")
        wav_files = None

    if wav_files:
        selected_file_name = st.selectbox("Select an Audio File", wav_files)

        # Construct the full path of the selected file
        file_path = os.path.join(AUDIO_PATH, selected_file_name)

        try:
            # Load raw audio (librosa's default sample rate is used)
            y_raw, sr = librosa.load(file_path)
        except Exception as e:
            st.error(f"Error loading audio file: {e}")
            st.stop()

        # Preprocessing and Visualization
        fig = None
        try:
            y_processed, processed_sr = analysis.preprocess_audio(y_raw, sr)

            # Mel spectrogram (power spectrogram converted to dB relative to its peak)
            mel = librosa.feature.melspectrogram(
                y=y_processed, sr=processed_sr, n_fft=2048, hop_length=512, power=2.0
            )
            mel_db = librosa.power_to_db(mel, ref=np.max)

            # Frequency Spectrum: magnitude of the real FFT of the whole signal
            fft = np.abs(np.fft.rfft(y_processed))
            freqs = np.fft.rfftfreq(len(y_processed), 1 / processed_sr)

            # Zero-Crossing Rate (per frame)
            zcr = librosa.feature.zero_crossing_rate(y_processed)[0]

            # RMS Energy (per frame)
            rms = librosa.feature.rms(y=y_processed)[0]

            # Create subplots for visualizations: 3 rows x 2 columns
            fig, axs = plt.subplots(3, 2, figsize=(15, 12))

            # Raw waveform
            librosa.display.waveshow(y_raw, sr=sr, ax=axs[0, 0])
            axs[0, 0].set_title("Raw Waveform", fontsize=12)

            # Preprocessed waveform
            librosa.display.waveshow(y_processed, sr=processed_sr, ax=axs[0, 1])
            axs[0, 1].set_title("Preprocessed Waveform", fontsize=12)

            # Frequency spectrum
            axs[1, 0].plot(freqs, fft, color='blue')
            axs[1, 0].set_title("Frequency Spectrum", fontsize=12)
            axs[1, 0].set_xlabel("Frequency (Hz)")
            axs[1, 0].set_ylabel("Amplitude")

            # ZCR
            axs[1, 1].plot(zcr, color='green')
            axs[1, 1].set_title("Zero-Crossing Rate", fontsize=12)
            axs[1, 1].set_xlabel("Frames")
            axs[1, 1].set_ylabel("Rate")

            # RMS Energy
            axs[2, 0].plot(rms, color='red')
            axs[2, 0].set_title("RMS Energy", fontsize=12)
            axs[2, 0].set_xlabel("Frames")
            axs[2, 0].set_ylabel("RMS")

            # Mel spectrogram
            img_mel = librosa.display.specshow(
                mel_db, sr=processed_sr, x_axis='time', y_axis='mel', ax=axs[2, 1], cmap='viridis'
            )
            axs[2, 1].set_title("Mel Spectrogram", fontsize=12)
            fig.colorbar(img_mel, ax=axs[2, 1], format="%+2.0f dB")

            # Adjust layout on this figure explicitly rather than on
            # pyplot's implicit "current" figure.
            fig.tight_layout()
            st.pyplot(fig)

        except Exception as e:
            st.error(f"Error during audio preprocessing or visualization: {e}")
            st.stop()
        finally:
            # pyplot keeps every figure alive until it is closed; the script
            # is re-executed on each interaction, so always release it — also
            # when plotting failed part-way through.
            if fig is not None:
                plt.close(fig)

        # Play audio
        st.subheader("Listen to Audio")
        st.audio(file_path, format="audio/wav")
    elif wav_files is not None:
        # Directory was readable but holds no .wav files
        st.warning("No audio files found in the directory.")