Spaces:
Sleeping
Sleeping
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import librosa | |
| import librosa.display | |
| from Exploration.inference import RespiratorySoundAnalysis | |
| import seaborn as sns | |
| import os | |
# Define base paths.
# The defaults are the original developer-machine locations. Each one can be
# overridden with an environment variable so the app can be pointed at a
# different copy of the dataset without editing this file.
BASE_PATH = os.environ.get(
    'RESPIRATORY_BASE_PATH',
    'D://github//AmpleHealth//data//Respiratory_Sound_Database//Respiratory_Sound_Database',
)
DIAGNOSIS_FILE = os.path.join(BASE_PATH, 'patient_diagnosis.csv')
AUDIO_PATH = os.path.join(BASE_PATH, 'testsample')
DEMOGRAPHIC_FILE = os.environ.get(
    'RESPIRATORY_DEMOGRAPHIC_FILE',
    os.path.join('D://github//AmpleHealth//data', 'demographic_info.txt'),
)

# Initialize analysis object
analysis = RespiratorySoundAnalysis(DIAGNOSIS_FILE, AUDIO_PATH)
# Load data
@st.cache_data
def load_data():
    """Load the diagnosis and audio tables through the shared ``analysis`` object.

    Calls ``load_diagnosis_data``, ``load_audio_files`` and
    ``analyze_audio_properties`` on ``analysis`` in that order.

    The result is memoised with ``st.cache_data`` because Streamlit re-executes
    the whole script on every widget interaction; without the cache each
    slider move would reload and re-analyse every audio file.

    NOTE(review): on a cache hit the three ``analysis`` methods are skipped,
    so the ``analysis`` object itself is left unloaded for that run. The only
    other use of ``analysis`` in this file is ``preprocess_audio(y, sr)`` --
    confirm it does not depend on the loaded state.

    Returns:
        tuple: ``(analysis.diagnosis_df, analysis.audio_df)``.
    """
    analysis.load_diagnosis_data()
    analysis.load_audio_files()
    analysis.analyze_audio_properties()
    return analysis.diagnosis_df, analysis.audio_df


diagnosis_df, audio_df = load_data()
# Load patient demographic data
def load_patient_demographics():
    """Read ``DEMOGRAPHIC_FILE`` into a DataFrame with fixed column names.

    The file is parsed as single-space-delimited text, and the column names
    are supplied here rather than taken from the file.

    Returns:
        pandas.DataFrame: one row per line of the demographics file.
    """
    column_names = [
        'Patient number',
        'Age',
        'Sex',
        'Adult BMI (kg/m2)',
        'Child Weight (kg)',
        'Child Height (cm)',
    ]
    return pd.read_csv(DEMOGRAPHIC_FILE, names=column_names, delimiter=' ')


patient_df = load_patient_demographics()
# Streamlit App
st.title("Respiratory Sound Data Explorer")

# Tabs for navigation; the sections below index into `tabs` in this order.
tab_labels = [
    "Overview",
    "Explore Data",
    "Patient Demographics",
    "Preprocessing & Audio Effects",
]
tabs = st.tabs(tab_labels)
# Overview Tab
def _render_disease_bars(values, title, xlabel, palette, label_offset, label_format):
    """Render a horizontal per-disease bar chart in the active Streamlit container.

    Args:
        values: pandas Series indexed by disease name; one bar is drawn per entry.
        title: Title placed above the axes.
        xlabel: Label for the value (horizontal) axis.
        palette: Seaborn palette name used to colour the bars.
        label_offset: Horizontal gap between the end of a bar and its text label.
        label_format: ``str.format`` template applied to each bar's value.
    """
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        y=values.index,
        x=values.values,
        hue=values.index,
        dodge=False,
        palette=palette,
        ax=ax,
        legend=False,
    )
    ax.set_title(title, fontsize=16, fontweight='bold')
    ax.set_xlabel(xlabel, fontsize=12)
    ax.set_ylabel("Disease", fontsize=12)
    # Annotate every bar with its value just past the bar's end.
    for row, value in enumerate(values.values):
        ax.text(
            value + label_offset,
            row,
            label_format.format(value),
            color='black',
            fontsize=10,
            va='center',
        )
    st.pyplot(fig)
    # pyplot keeps each figure registered until it is closed; the script is
    # re-run on every interaction, so close it to avoid accumulating figures.
    plt.close(fig)


with tabs[0]:
    st.header("Dataset Overview")

    # Count once and reuse for the statistics and both charts.
    disease_counts = diagnosis_df['disease'].value_counts()

    if disease_counts.empty:
        # idxmax()/idxmin() raise on an empty Series, so bail out gracefully.
        st.warning("No diagnosis data available to display.")
    else:
        # Highlight key statistics
        st.subheader("Key Statistics")
        st.markdown(
            f"- **Total Patients:** {len(diagnosis_df)}\n"
            f"- **Most Common Disease:** {disease_counts.idxmax()} "
            f"({disease_counts.max()} patients)\n"
            f"- **Least Common Disease:** {disease_counts.idxmin()} "
            f"({disease_counts.min()} patients)"
        )

        # Diagnosis Distribution
        st.subheader("Diagnosis Distribution")
        _render_disease_bars(
            disease_counts,
            title="Disease Distribution",
            xlabel="Number of Patients",
            palette="viridis",
            label_offset=1,
            label_format="{}",
        )

        # Proportion of Diseases (share of counted patients, in percent)
        st.subheader("Disease Proportion")
        disease_proportions = disease_counts / disease_counts.sum() * 100
        _render_disease_bars(
            disease_proportions,
            title="Disease Proportion (%)",
            xlabel="Proportion (%)",
            palette="coolwarm",
            label_offset=0.5,
            label_format="{:.1f}%",
        )
# Explore Data Tab
with tabs[1]:
    st.header("Explore Data")
    if audio_df is not None and not audio_df.empty:
        # Pull the duration column and its extremes once; they are reused by
        # the summary text and by both sliders below.
        durations = audio_df['duration_sec']
        min_duration = durations.min()
        max_duration = durations.max()
        slider_max = float(max_duration)

        # Key Audio Insights
        st.subheader("Key Audio Insights")
        shortest_file = audio_df.loc[durations.idxmin(), 'file_name']
        longest_file = audio_df.loc[durations.idxmax(), 'file_name']
        st.markdown(
            f"- **Total Audio Files:** {len(audio_df)}\n"
            f"- **Average Duration:** {durations.mean():.2f} seconds\n"
            f"- **Shortest Audio File:** {shortest_file} ({min_duration:.2f} seconds)\n"
            f"- **Longest Audio File:** {longest_file} ({max_duration:.2f} seconds)"
        )

        # Duration Distribution
        st.subheader("Audio Duration Distribution")
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(durations, bins=20, kde=True, color='skyblue', ax=ax)
        ax.set_title("Audio Duration Distribution", fontsize=16, fontweight='bold')
        ax.set_xlabel("Duration (seconds)", fontsize=12)
        ax.set_ylabel("Frequency", fontsize=12)
        st.pyplot(fig)
        # pyplot keeps each figure registered until it is closed; the script
        # is re-run on every interaction, so close it to avoid a build-up.
        plt.close(fig)

        # Highlight Outliers: files strictly longer than the chosen threshold.
        st.subheader("Audio Duration Outliers")
        outlier_threshold = st.slider(
            "Set Outlier Threshold (seconds):", 1.0, slider_max, 25.0, step=0.5
        )
        outliers = audio_df[durations > outlier_threshold]
        st.write(outliers if not outliers.empty else "No outliers found above the threshold.")

        # Optional Filtering: keep files whose duration is inside the
        # inclusive range selected on the slider.
        st.subheader("Filter Audio Files by Duration")
        min_range, max_range = st.slider(
            "Select Duration Range (seconds):",
            0.0,
            slider_max,
            (0.0, slider_max),
            step=0.5,
        )
        filtered_files = audio_df[(durations >= min_range) & (durations <= max_range)]
        st.write(f"**Number of Files in Range:** {len(filtered_files)}")
        st.dataframe(filtered_files[['file_name', 'duration_sec']])
    else:
        st.warning("No audio data available to display.")
# Patient Demographics Tab
with tabs[2]:
    st.header("Patient Demographics")

    st.subheader("Demographics Data")
    st.dataframe(patient_df)

    st.subheader("Missing Values Information")
    st.write(patient_df.isna().sum())

    st.subheader("Key Statistics")
    ages = patient_df['Age']
    avg_age, min_age, max_age = ages.mean(), ages.min(), ages.max()
    st.markdown(f"- **Average Age:** {avg_age:.1f} years\n- **Youngest Patient:** {min_age} years\n- **Oldest Patient:** {max_age} years")

    # Visualizations
    st.markdown("### Age Distribution")
    fig, ax = plt.subplots(figsize=(10, 6))
    # Rows without an age are dropped before plotting.
    sns.histplot(ages.dropna(), bins=20, kde=True, color='skyblue', ax=ax)
    ax.set_title("Age Distribution", fontsize=16, fontweight='bold')
    st.pyplot(fig)
    # pyplot keeps each figure registered until it is closed; the script is
    # re-run on every interaction, so close it to avoid accumulating figures.
    plt.close(fig)

    st.markdown("### Gender Distribution")
    # Count once instead of recomputing value_counts() for x, y and hue.
    sex_counts = patient_df['Sex'].value_counts()
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(
        x=sex_counts.index,
        y=sex_counts.values,
        hue=sex_counts.index,
        dodge=False,
        palette="coolwarm",
        ax=ax,
        # hue only drives the per-bar colours; suppress the redundant legend,
        # matching the other bar charts in this app.
        legend=False,
    )
    ax.set_title("Gender Distribution", fontsize=16, fontweight='bold')
    st.pyplot(fig)
    plt.close(fig)
# Preprocessing & Audio Effects Tab
with tabs[3]:
    st.header("Preprocessing & Audio Effects")

    # List all .wav files in the AUDIO_PATH directory (extension matched
    # case-insensitively, names sorted for a stable selectbox order). A
    # missing directory is treated like an empty one instead of crashing.
    if os.path.isdir(AUDIO_PATH):
        wav_files = sorted(
            name for name in os.listdir(AUDIO_PATH) if name.lower().endswith('.wav')
        )
    else:
        wav_files = []

    if wav_files:
        selected_file_name = st.selectbox("Select an Audio File", wav_files)
        # Construct the full path of the selected file
        file_path = os.path.join(AUDIO_PATH, selected_file_name)

        try:
            # Load raw audio
            y_raw, sr = librosa.load(file_path)
        except Exception as e:
            st.error(f"Error loading audio file: {e}")
            st.stop()

        # Preprocessing and Visualization
        fig = None
        try:
            y_processed, processed_sr = analysis.preprocess_audio(y_raw, sr)

            # Mel spectrogram (power), converted to dB relative to its peak
            mel = librosa.feature.melspectrogram(
                y=y_processed, sr=processed_sr, n_fft=2048, hop_length=512, power=2.0
            )
            mel_db = librosa.power_to_db(mel, ref=np.max)

            # Frequency Spectrum: magnitude of the real FFT of the whole signal
            fft = np.abs(np.fft.rfft(y_processed))
            freqs = np.fft.rfftfreq(len(y_processed), 1 / processed_sr)

            # Zero-Crossing Rate
            zcr = librosa.feature.zero_crossing_rate(y_processed)[0]

            # RMS Energy
            rms = librosa.feature.rms(y=y_processed)[0]

            # Create subplots for visualizations (3 rows x 2 columns)
            fig, axs = plt.subplots(3, 2, figsize=(15, 12))

            # Raw waveform
            librosa.display.waveshow(y_raw, sr=sr, ax=axs[0, 0])
            axs[0, 0].set_title("Raw Waveform", fontsize=12)

            # Preprocessed waveform
            librosa.display.waveshow(y_processed, sr=processed_sr, ax=axs[0, 1])
            axs[0, 1].set_title("Preprocessed Waveform", fontsize=12)

            # Frequency spectrum
            axs[1, 0].plot(freqs, fft, color='blue')
            axs[1, 0].set_title("Frequency Spectrum", fontsize=12)
            axs[1, 0].set_xlabel("Frequency (Hz)")
            axs[1, 0].set_ylabel("Amplitude")

            # ZCR
            axs[1, 1].plot(zcr, color='green')
            axs[1, 1].set_title("Zero-Crossing Rate", fontsize=12)
            axs[1, 1].set_xlabel("Frames")
            axs[1, 1].set_ylabel("Rate")

            # RMS Energy
            axs[2, 0].plot(rms, color='red')
            axs[2, 0].set_title("RMS Energy", fontsize=12)
            axs[2, 0].set_xlabel("Frames")
            axs[2, 0].set_ylabel("RMS")

            # Mel spectrogram
            img_mel = librosa.display.specshow(
                mel_db, sr=processed_sr, x_axis='time', y_axis='mel', ax=axs[2, 1], cmap='viridis'
            )
            axs[2, 1].set_title("Mel Spectrogram", fontsize=12)
            fig.colorbar(img_mel, ax=axs[2, 1], format="%+2.0f dB")

            # Adjust layout on this figure explicitly rather than on
            # pyplot's implicit "current figure".
            fig.tight_layout()
            st.pyplot(fig)
        except Exception as e:
            st.error(f"Error during audio preprocessing or visualization: {e}")
            st.stop()
        finally:
            # pyplot keeps each figure registered until it is closed; release
            # it on both the success and the error path.
            if fig is not None:
                plt.close(fig)

        # Play audio
        st.subheader("Listen to Audio")
        st.audio(file_path, format="audio/wav")
    else:
        st.warning("No audio files found in the directory.")