amp / ExplorationApp.py
magnumical's picture
Upload 103 files
9c6b905 verified
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import librosa
import librosa.display
from Exploration.inference import RespiratorySoundAnalysis
import seaborn as sns
import os
# Define base paths
BASE_PATH = 'D://github//AmpleHealth//data//Respiratory_Sound_Database//Respiratory_Sound_Database'
DIAGNOSIS_FILE = os.path.join(BASE_PATH, 'patient_diagnosis.csv')
AUDIO_PATH = os.path.join(BASE_PATH, 'testsample')
DEMOGRAPHIC_FILE = os.path.join('D://github//AmpleHealth//data', 'demographic_info.txt')
# Initialize analysis object
analysis = RespiratorySoundAnalysis(DIAGNOSIS_FILE, AUDIO_PATH)
# Load data
@st.cache_data
def load_data():
analysis.load_diagnosis_data()
analysis.load_audio_files()
analysis.analyze_audio_properties()
return analysis.diagnosis_df, analysis.audio_df
diagnosis_df, audio_df = load_data()
# Load patient demographic data
@st.cache_data
def load_patient_demographics():
patient_df = pd.read_csv(
DEMOGRAPHIC_FILE,
names=['Patient number', 'Age', 'Sex', 'Adult BMI (kg/m2)', 'Child Weight (kg)', 'Child Height (cm)'],
delimiter=' '
)
return patient_df
patient_df = load_patient_demographics()
# Streamlit App
st.title("Respiratory Sound Data Explorer")
# Tabs for navigation
tabs = st.tabs(["Overview", "Explore Data", "Patient Demographics", "Preprocessing & Audio Effects"])
# Overview Tab
# Overview Tab
with tabs[0]:
st.header("Dataset Overview")
# Highlight key statistics
total_patients = len(diagnosis_df)
most_common_disease = diagnosis_df['disease'].value_counts().idxmax()
least_common_disease = diagnosis_df['disease'].value_counts().idxmin()
st.subheader("Key Statistics")
st.markdown(f"""
- **Total Patients:** {total_patients}
- **Most Common Disease:** {most_common_disease} ({diagnosis_df['disease'].value_counts().max()} patients)
- **Least Common Disease:** {least_common_disease} ({diagnosis_df['disease'].value_counts().min()} patients)
""")
# Diagnosis Distribution
st.subheader("Diagnosis Distribution")
disease_counts = diagnosis_df['disease'].value_counts()
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=disease_counts.index, x=disease_counts.values, palette="viridis", ax=ax, legend=False, hue=disease_counts.index, dodge=False)
ax.set_title("Disease Distribution", fontsize=16, fontweight='bold')
ax.set_xlabel("Number of Patients", fontsize=12)
ax.set_ylabel("Disease", fontsize=12)
for i, v in enumerate(disease_counts.values):
ax.text(v + 1, i, str(v), color='black', fontsize=10, va='center')
st.pyplot(fig)
# Proportion of Diseases
st.subheader("Disease Proportion")
disease_proportions = diagnosis_df['disease'].value_counts(normalize=True) * 100
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(y=disease_proportions.index, x=disease_proportions.values, hue=disease_proportions.index, dodge=False, palette="coolwarm", ax=ax, legend=False)
ax.set_title("Disease Proportion (%)", fontsize=16, fontweight='bold')
ax.set_xlabel("Proportion (%)", fontsize=12)
ax.set_ylabel("Disease", fontsize=12)
for i, v in enumerate(disease_proportions.values):
ax.text(v + 0.5, i, f"{v:.1f}%", color='black', fontsize=10, va='center')
st.pyplot(fig)
# Explore Data Tab
with tabs[1]:
st.header("Explore Data")
if audio_df is not None and not audio_df.empty:
# Key Audio Insights
st.subheader("Key Audio Insights")
st.markdown(f"""
- **Total Audio Files:** {len(audio_df)}
- **Average Duration:** {audio_df['duration_sec'].mean():.2f} seconds
- **Shortest Audio File:** {audio_df.loc[audio_df['duration_sec'].idxmin(), 'file_name']} ({audio_df['duration_sec'].min():.2f} seconds)
- **Longest Audio File:** {audio_df.loc[audio_df['duration_sec'].idxmax(), 'file_name']} ({audio_df['duration_sec'].max():.2f} seconds)
""")
# Duration Distribution
st.subheader("Audio Duration Distribution")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(audio_df['duration_sec'], bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title("Audio Duration Distribution", fontsize=16, fontweight='bold')
ax.set_xlabel("Duration (seconds)", fontsize=12)
ax.set_ylabel("Frequency", fontsize=12)
st.pyplot(fig)
# Highlight Outliers
st.subheader("Audio Duration Outliers")
outlier_threshold = st.slider("Set Outlier Threshold (seconds):", 1.0, float(audio_df['duration_sec'].max()), 25.0, step=0.5)
outliers = audio_df[audio_df['duration_sec'] > outlier_threshold]
st.write(outliers if not outliers.empty else "No outliers found above the threshold.")
# Optional Filtering
st.subheader("Filter Audio Files by Duration")
min_range, max_range = st.slider("Select Duration Range (seconds):", 0.0, float(audio_df['duration_sec'].max()), (0.0, float(audio_df['duration_sec'].max())), step=0.5)
filtered_files = audio_df[(audio_df['duration_sec'] >= min_range) & (audio_df['duration_sec'] <= max_range)]
st.write(f"**Number of Files in Range:** {len(filtered_files)}")
st.dataframe(filtered_files[['file_name', 'duration_sec']])
else:
st.warning("No audio data available to display.")
# Patient Demographics Tab
with tabs[2]:
st.header("Patient Demographics")
st.subheader("Demographics Data")
st.dataframe(patient_df)
st.subheader("Missing Values Information")
st.write(patient_df.isna().sum())
st.subheader("Key Statistics")
avg_age, min_age, max_age = patient_df['Age'].mean(), patient_df['Age'].min(), patient_df['Age'].max()
st.markdown(f"- **Average Age:** {avg_age:.1f} years\n- **Youngest Patient:** {min_age} years\n- **Oldest Patient:** {max_age} years")
# Visualizations
st.markdown("### Age Distribution")
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(patient_df['Age'].dropna(), bins=20, kde=True, color='skyblue', ax=ax)
ax.set_title("Age Distribution", fontsize=16, fontweight='bold')
st.pyplot(fig)
st.markdown("### Gender Distribution")
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=patient_df['Sex'].value_counts().index, y=patient_df['Sex'].value_counts().values, palette="coolwarm", ax=ax, hue=patient_df['Sex'].value_counts().index, dodge=False)
ax.set_title("Gender Distribution", fontsize=16, fontweight='bold')
st.pyplot(fig)
with tabs[3]:
st.header("Preprocessing & Audio Effects")
# List all .wav files in the AUDIO_PATH directory
wav_files = [f for f in os.listdir(AUDIO_PATH) if f.endswith('.wav')]
if wav_files:
selected_file_name = st.selectbox("Select an Audio File", wav_files)
# Construct the full path of the selected file
file_path = os.path.join(AUDIO_PATH, selected_file_name)
try:
# Load raw audio
y_raw, sr = librosa.load(file_path)
except Exception as e:
st.error(f"Error loading audio file: {e}")
st.stop()
# Preprocessing and Visualization
try:
y_processed, processed_sr = analysis.preprocess_audio(y_raw, sr)
# Mel spectrogram
mel = librosa.feature.melspectrogram(
y=y_processed, sr=processed_sr, n_fft=2048, hop_length=512, power=2.0
)
mel_db = librosa.power_to_db(mel, ref=np.max)
# STFT
stft = librosa.stft(y_processed, n_fft=2048, hop_length=512)
stft_db = librosa.amplitude_to_db(np.abs(stft), ref=np.max)
# Frequency Spectrum
fft = np.abs(np.fft.rfft(y_processed))
freqs = np.fft.rfftfreq(len(y_processed), 1 / processed_sr)
# Zero-Crossing Rate
zcr = librosa.feature.zero_crossing_rate(y_processed)[0]
# RMS Energy
rms = librosa.feature.rms(y=y_processed)[0]
# Create subplots for visualizations
fig, axs = plt.subplots(3, 2, figsize=(15, 12))
# Raw waveform
librosa.display.waveshow(y_raw, sr=sr, ax=axs[0, 0])
axs[0, 0].set_title("Raw Waveform", fontsize=12)
# Preprocessed waveform
librosa.display.waveshow(y_processed, sr=processed_sr, ax=axs[0, 1])
axs[0, 1].set_title("Preprocessed Waveform", fontsize=12)
# Frequency spectrum
axs[1, 0].plot(freqs, fft, color='blue')
axs[1, 0].set_title("Frequency Spectrum", fontsize=12)
axs[1, 0].set_xlabel("Frequency (Hz)")
axs[1, 0].set_ylabel("Amplitude")
# ZCR
axs[1, 1].plot(zcr, color='green')
axs[1, 1].set_title("Zero-Crossing Rate", fontsize=12)
axs[1, 1].set_xlabel("Frames")
axs[1, 1].set_ylabel("Rate")
# RMS Energy
axs[2, 0].plot(rms, color='red')
axs[2, 0].set_title("RMS Energy", fontsize=12)
axs[2, 0].set_xlabel("Frames")
axs[2, 0].set_ylabel("RMS")
# Mel spectrogram
img_mel = librosa.display.specshow(
mel_db, sr=processed_sr, x_axis='time', y_axis='mel', ax=axs[2, 1], cmap='viridis'
)
axs[2, 1].set_title("Mel Spectrogram", fontsize=12)
fig.colorbar(img_mel, ax=axs[2, 1], format="%+2.0f dB")
# Adjust layout
plt.tight_layout()
st.pyplot(fig)
except Exception as e:
st.error(f"Error during audio preprocessing or visualization: {e}")
st.stop()
# Play audio
st.subheader("Listen to Audio")
st.audio(file_path, format="audio/wav")
else:
st.warning("No audio files found in the directory.")