Spaces:

magnumical
/

amp

Sleeping

App Files Files Community

amp / ExplorationApp.py

magnumical

Upload 103 files

9c6b905 verified about 1 year ago

raw

history blame contribute delete

10.2 kB

	import streamlit as st
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import librosa
	import librosa.display
	from Exploration.inference import RespiratorySoundAnalysis
	import seaborn as sns
	import os

	# Define base paths
	# NOTE(review): these are absolute paths on a Windows D: drive, so the app
	# presumably only starts on a machine with this exact layout — confirm
	# before running or deploying anywhere else.
	BASE_PATH = 'D://github//AmpleHealth//data//Respiratory_Sound_Database//Respiratory_Sound_Database'
	# Diagnosis CSV; passed to RespiratorySoundAnalysis below.
	DIAGNOSIS_FILE = os.path.join(BASE_PATH, 'patient_diagnosis.csv')
	# Audio sample directory; passed to RespiratorySoundAnalysis below and also
	# listed directly for .wav files by the preprocessing tab.
	AUDIO_PATH = os.path.join(BASE_PATH, 'testsample')
	# Demographics file; sits directly under the data root, not under BASE_PATH.
	DEMOGRAPHIC_FILE = os.path.join('D://github//AmpleHealth//data', 'demographic_info.txt')

	# Initialize analysis object
	# Single module-level instance used by load_data() and the preprocessing tab.
	analysis = RespiratorySoundAnalysis(DIAGNOSIS_FILE, AUDIO_PATH)

	# Load data
	@st.cache_data
	def load_data():
	    """Populate the shared ``analysis`` object and return its data frames.

	    Runs, in order, the diagnosis loader, the audio-file loader and the
	    audio-property analysis on the module-level ``analysis`` instance.
	    The function is wrapped in ``st.cache_data``, so Streamlit reuses the
	    returned value instead of repeating the work on every rerun.

	    Returns:
	        A ``(diagnosis_df, audio_df)`` tuple read from ``analysis``.
	    """
	    analysis.load_diagnosis_data()
	    analysis.load_audio_files()
	    analysis.analyze_audio_properties()

	    diagnosis_frame = analysis.diagnosis_df
	    audio_frame = analysis.audio_df
	    return diagnosis_frame, audio_frame


	diagnosis_df, audio_df = load_data()

	# Load patient demographic data
	@st.cache_data
	def load_patient_demographics():
	    """Read the demographics text file into a DataFrame.

	    The file at ``DEMOGRAPHIC_FILE`` is parsed as single-space-delimited
	    text, with the column names supplied here rather than read from the
	    file. The result is cached by ``st.cache_data``.

	    Returns:
	        The parsed demographics table.
	    """
	    demographic_columns = [
	        'Patient number',
	        'Age',
	        'Sex',
	        'Adult BMI (kg/m2)',
	        'Child Weight (kg)',
	        'Child Height (cm)',
	    ]
	    return pd.read_csv(DEMOGRAPHIC_FILE, names=demographic_columns, sep=' ')


	patient_df = load_patient_demographics()

	# Streamlit App: page title shown above the tab strip.
	st.title("Respiratory Sound Data Explorer")

	# Tabs for navigation; the sections below index into `tabs` by position,
	# so the order of these labels must match tabs[0]..tabs[3].
	tab_labels = [
	    "Overview",
	    "Explore Data",
	    "Patient Demographics",
	    "Preprocessing & Audio Effects",
	]
	tabs = st.tabs(tab_labels)

	# Overview Tab
	# Shows headline statistics plus two horizontal bar charts (absolute counts
	# and percentage share) of the 'disease' column of diagnosis_df.
	with tabs[0]:
	    st.header("Dataset Overview")

	    # Tally patients per disease once; the statistics and the first chart
	    # all read from this Series instead of recounting the column each time.
	    disease_counts = diagnosis_df['disease'].value_counts()

	    # Highlight key statistics
	    total_patients = len(diagnosis_df)
	    most_common_disease = disease_counts.idxmax()
	    least_common_disease = disease_counts.idxmin()

	    st.subheader("Key Statistics")
	    st.markdown(f"""
	- Total Patients: {total_patients}
	- Most Common Disease: {most_common_disease} ({disease_counts.max()} patients)
	- Least Common Disease: {least_common_disease} ({disease_counts.min()} patients)
	""")

	    # Diagnosis Distribution
	    st.subheader("Diagnosis Distribution")
	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.barplot(
	        y=disease_counts.index,
	        x=disease_counts.values,
	        hue=disease_counts.index,
	        dodge=False,
	        palette="viridis",
	        legend=False,
	        ax=ax,
	    )
	    ax.set_title("Disease Distribution", fontsize=16, fontweight='bold')
	    ax.set_xlabel("Number of Patients", fontsize=12)
	    ax.set_ylabel("Disease", fontsize=12)
	    # Print each count just past the end of its bar.
	    for row, count in enumerate(disease_counts.values):
	        ax.text(count + 1, row, str(count), color='black', fontsize=10, va='center')
	    st.pyplot(fig)
	    # pyplot keeps a reference to every figure until it is closed; the script
	    # reruns on each interaction, so release the figure once it is rendered.
	    plt.close(fig)

	    # Proportion of Diseases
	    st.subheader("Disease Proportion")
	    disease_proportions = diagnosis_df['disease'].value_counts(normalize=True) * 100
	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.barplot(
	        y=disease_proportions.index,
	        x=disease_proportions.values,
	        hue=disease_proportions.index,
	        dodge=False,
	        palette="coolwarm",
	        legend=False,
	        ax=ax,
	    )
	    ax.set_title("Disease Proportion (%)", fontsize=16, fontweight='bold')
	    ax.set_xlabel("Proportion (%)", fontsize=12)
	    ax.set_ylabel("Disease", fontsize=12)
	    # Print each percentage just past the end of its bar.
	    for row, share in enumerate(disease_proportions.values):
	        ax.text(share + 0.5, row, f"{share:.1f}%", color='black', fontsize=10, va='center')
	    st.pyplot(fig)
	    plt.close(fig)

	# Explore Data Tab
	# Summarises the 'duration_sec' column of audio_df: headline numbers, a
	# histogram, a threshold-based outlier table and a range filter.
	with tabs[1]:
	    st.header("Explore Data")

	    if audio_df is not None and not audio_df.empty:
	        # Read the duration column and its maximum once; both are reused by
	        # the summary text and as the upper bound of the two sliders.
	        durations = audio_df['duration_sec']
	        max_duration = float(durations.max())
	        shortest_file = audio_df.loc[durations.idxmin(), 'file_name']
	        longest_file = audio_df.loc[durations.idxmax(), 'file_name']

	        # Key Audio Insights
	        st.subheader("Key Audio Insights")
	        st.markdown(f"""
	- Total Audio Files: {len(audio_df)}
	- Average Duration: {durations.mean():.2f} seconds
	- Shortest Audio File: {shortest_file} ({durations.min():.2f} seconds)
	- Longest Audio File: {longest_file} ({durations.max():.2f} seconds)
	""")

	        # Duration Distribution
	        st.subheader("Audio Duration Distribution")
	        fig, ax = plt.subplots(figsize=(10, 6))
	        sns.histplot(durations, bins=20, kde=True, color='skyblue', ax=ax)
	        ax.set_title("Audio Duration Distribution", fontsize=16, fontweight='bold')
	        ax.set_xlabel("Duration (seconds)", fontsize=12)
	        ax.set_ylabel("Frequency", fontsize=12)
	        st.pyplot(fig)
	        # pyplot keeps a reference to every figure until it is closed; moving
	        # a slider reruns the script, so release the figure once rendered.
	        plt.close(fig)

	        # Highlight Outliers: files strictly longer than the chosen threshold.
	        st.subheader("Audio Duration Outliers")
	        outlier_threshold = st.slider("Set Outlier Threshold (seconds):", 1.0, max_duration, 25.0, step=0.5)
	        outliers = audio_df[durations > outlier_threshold]
	        st.write(outliers if not outliers.empty else "No outliers found above the threshold.")

	        # Optional Filtering: inclusive duration range, defaulting to all files.
	        st.subheader("Filter Audio Files by Duration")
	        min_range, max_range = st.slider(
	            "Select Duration Range (seconds):",
	            0.0,
	            max_duration,
	            (0.0, max_duration),
	            step=0.5,
	        )
	        filtered_files = audio_df[(durations >= min_range) & (durations <= max_range)]
	        st.write(f"Number of Files in Range: {len(filtered_files)}")
	        st.dataframe(filtered_files[['file_name', 'duration_sec']])
	    else:
	        st.warning("No audio data available to display.")

	# Patient Demographics Tab
	# Shows the raw demographics table, per-column missing-value counts, age
	# statistics, and age / gender distribution charts built from patient_df.
	with tabs[2]:
	    st.header("Patient Demographics")
	    st.subheader("Demographics Data")
	    st.dataframe(patient_df)

	    st.subheader("Missing Values Information")
	    st.write(patient_df.isna().sum())

	    st.subheader("Key Statistics")
	    ages = patient_df['Age']
	    avg_age, min_age, max_age = ages.mean(), ages.min(), ages.max()
	    st.markdown(f"- Average Age: {avg_age:.1f} years\n- Youngest Patient: {min_age} years\n- Oldest Patient: {max_age} years")

	    # Visualizations
	    st.markdown("### Age Distribution")
	    fig, ax = plt.subplots(figsize=(10, 6))
	    # Missing ages are dropped before plotting.
	    sns.histplot(ages.dropna(), bins=20, kde=True, color='skyblue', ax=ax)
	    ax.set_title("Age Distribution", fontsize=16, fontweight='bold')
	    st.pyplot(fig)
	    # pyplot keeps a reference to every figure until it is closed; the script
	    # reruns on each interaction, so release the figure once it is rendered.
	    plt.close(fig)

	    st.markdown("### Gender Distribution")
	    # Count each value of 'Sex' once and reuse it for the x, y and hue inputs.
	    sex_counts = patient_df['Sex'].value_counts()
	    fig, ax = plt.subplots(figsize=(10, 6))
	    sns.barplot(
	        x=sex_counts.index,
	        y=sex_counts.values,
	        hue=sex_counts.index,
	        dodge=False,
	        palette="coolwarm",
	        # hue only colours the bars; the x axis already names each category,
	        # so suppress the duplicate legend like the other bar charts here do.
	        legend=False,
	        ax=ax,
	    )
	    ax.set_title("Gender Distribution", fontsize=16, fontweight='bold')
	    st.pyplot(fig)
	    plt.close(fig)


	# Preprocessing & Audio Effects Tab
	# Lets the user pick a .wav file from AUDIO_PATH, runs it through
	# analysis.preprocess_audio, plots raw vs. processed signal views in a 3x2
	# grid, and offers the original file for playback.
	with tabs[3]:
	    st.header("Preprocessing & Audio Effects")

	    # List all .wav files in the AUDIO_PATH directory. os.listdir returns
	    # entries in arbitrary order, so sort them to keep the dropdown stable.
	    wav_files = sorted(f for f in os.listdir(AUDIO_PATH) if f.endswith('.wav'))

	    if wav_files:
	        selected_file_name = st.selectbox("Select an Audio File", wav_files)

	        # Construct the full path of the selected file
	        file_path = os.path.join(AUDIO_PATH, selected_file_name)

	        try:
	            # Load raw audio (no sample rate is passed, so librosa's default applies)
	            y_raw, sr = librosa.load(file_path)
	        except Exception as e:
	            st.error(f"Error loading audio file: {e}")
	            st.stop()

	        # Preprocessing and Visualization
	        fig = None  # set once the figure exists so `finally` knows what to close
	        try:
	            y_processed, processed_sr = analysis.preprocess_audio(y_raw, sr)

	            # Mel spectrogram (power spectrogram converted to dB relative to its peak)
	            mel = librosa.feature.melspectrogram(
	                y=y_processed, sr=processed_sr, n_fft=2048, hop_length=512, power=2.0
	            )
	            mel_db = librosa.power_to_db(mel, ref=np.max)

	            # Frequency Spectrum: magnitude of the real FFT over the whole signal
	            fft = np.abs(np.fft.rfft(y_processed))
	            freqs = np.fft.rfftfreq(len(y_processed), 1 / processed_sr)

	            # Zero-Crossing Rate (one value per frame)
	            zcr = librosa.feature.zero_crossing_rate(y_processed)[0]

	            # RMS Energy (one value per frame)
	            rms = librosa.feature.rms(y=y_processed)[0]

	            # Create subplots for visualizations
	            fig, axs = plt.subplots(3, 2, figsize=(15, 12))

	            # Raw waveform
	            librosa.display.waveshow(y_raw, sr=sr, ax=axs[0, 0])
	            axs[0, 0].set_title("Raw Waveform", fontsize=12)

	            # Preprocessed waveform
	            librosa.display.waveshow(y_processed, sr=processed_sr, ax=axs[0, 1])
	            axs[0, 1].set_title("Preprocessed Waveform", fontsize=12)

	            # Frequency spectrum
	            axs[1, 0].plot(freqs, fft, color='blue')
	            axs[1, 0].set_title("Frequency Spectrum", fontsize=12)
	            axs[1, 0].set_xlabel("Frequency (Hz)")
	            axs[1, 0].set_ylabel("Amplitude")

	            # ZCR
	            axs[1, 1].plot(zcr, color='green')
	            axs[1, 1].set_title("Zero-Crossing Rate", fontsize=12)
	            axs[1, 1].set_xlabel("Frames")
	            axs[1, 1].set_ylabel("Rate")

	            # RMS Energy
	            axs[2, 0].plot(rms, color='red')
	            axs[2, 0].set_title("RMS Energy", fontsize=12)
	            axs[2, 0].set_xlabel("Frames")
	            axs[2, 0].set_ylabel("RMS")

	            # Mel spectrogram
	            img_mel = librosa.display.specshow(
	                mel_db, sr=processed_sr, x_axis='time', y_axis='mel', ax=axs[2, 1], cmap='viridis'
	            )
	            axs[2, 1].set_title("Mel Spectrogram", fontsize=12)
	            fig.colorbar(img_mel, ax=axs[2, 1], format="%+2.0f dB")

	            # Adjust layout on this figure explicitly rather than on whichever
	            # figure pyplot currently considers active.
	            fig.tight_layout()
	            st.pyplot(fig)

	        except Exception as e:
	            st.error(f"Error during audio preprocessing or visualization: {e}")
	            st.stop()
	        finally:
	            # pyplot keeps a reference to every figure until it is closed, and
	            # each file selection reruns the script; release the figure on
	            # both the success path and the error path.
	            if fig is not None:
	                plt.close(fig)

	        # Play audio
	        st.subheader("Listen to Audio")
	        st.audio(file_path, format="audio/wav")
	    else:
	        st.warning("No audio files found in the directory.")