EzekielMW committed on
Commit
9e8b97d
·
verified ·
1 Parent(s): 76f2067

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +159 -203
app.py CHANGED
@@ -4,226 +4,182 @@ import numpy as np
4
  import matplotlib.pyplot as plt
5
  from sklearn.preprocessing import StandardScaler, MinMaxScaler
6
  from sklearn.decomposition import PCA
 
 
 
 
7
  from scipy.signal import savgol_filter
 
 
 
 
 
 
 
8
  from math import pi
9
- from matplotlib.cm import get_cmap # ✅ Import corrected colormap function
 
 
10
 
11
- # Ensure interactive backend for plotting
12
  plt.switch_backend('agg')
13
 
14
  # Load dataset
15
  df = pd.read_csv("milk_absorbance.csv")
16
  df.rename(columns={df.columns[0]: 'Label'}, inplace=True)
17
 
18
- def plot_all():
19
- plots = []
20
-
21
- # Plot 1: Mean Spectra per Class
22
- fig1 = plt.figure(figsize=(12, 6))
23
- for label in df['Label'].unique():
24
- class_df = df[df['Label'] == label]
25
- mean_spectrum = class_df.iloc[:, 1:].mean()
26
- plt.plot(mean_spectrum.index.astype(int), mean_spectrum, label=f'Label {label}')
27
- plt.title('Mean NIR Spectrum per Milk Ratio Class')
28
- plt.xlabel('Wavelength (nm)')
29
- plt.ylabel('Absorbance')
30
- plt.legend(title='Class (Milk Ratio)')
31
- plt.grid(True)
32
- plt.tight_layout()
33
- plots.append(fig1)
34
- plt.close(fig1)
35
-
36
- # Plot 2: Offset Mean Spectra
37
- fig2 = plt.figure(figsize=(12, 6))
38
- offset_step = 0.1
39
- for i, label in enumerate(df['Label'].unique()):
40
- class_df = df[df['Label'] == label]
41
- mean_spectrum = class_df.iloc[:, 1:].mean()
42
- offset = i * offset_step
43
- plt.plot(mean_spectrum.index.astype(int), mean_spectrum + offset, label=f'Label {label}')
44
- plt.title('Mean NIR Spectrum per Milk Ratio Class (with Offset)')
45
- plt.xlabel('Wavelength (nm)')
46
- plt.ylabel('Absorbance (Offset Applied)')
47
- plt.legend(title='Class (Milk Ratio)')
48
- plt.grid(True)
49
- plt.tight_layout()
50
- plots.append(fig2)
51
- plt.close(fig2)
52
-
53
- # Plot 3: Radar Plot
54
- fig3 = plt.figure(figsize=(8, 8))
55
- ax = plt.subplot(111, polar=True)
56
- subset_cols = df.columns[1:][::20]
57
- labels = df['Label'].unique()
58
- N = len(subset_cols)
59
- angles = [n / float(N) * 2 * pi for n in range(N)] + [0]
60
- for label in labels:
61
- class_df = df[df['Label'] == label]
62
- mean_spectrum = class_df[subset_cols].mean().values
63
- values = mean_spectrum.tolist() + [mean_spectrum[0]]
64
- ax.plot(angles, values, label=f'Label {label}')
65
- ax.fill(angles, values, alpha=0.1)
66
- ax.set_xticks(angles[:-1])
67
- ax.set_xticklabels(subset_cols.astype(int))
68
- plt.title('Radar Plot of Mean Spectra (Subset Wavelengths)')
69
- plt.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
70
- plt.tight_layout()
71
- plots.append(fig3)
72
- plt.close(fig3)
73
-
74
- # Plot 4: Cumulative PCA Explained Variance
75
- fig4 = plt.figure(figsize=(8, 5))
76
- X = df.iloc[:, 1:].values
77
- X_scaled = StandardScaler().fit_transform(X)
78
- pca = PCA(n_components=20)
79
- pca.fit(X_scaled)
80
- explained = np.cumsum(pca.explained_variance_ratio_)
81
- plt.plot(range(1, 21), explained, marker='o')
82
- plt.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
83
- plt.title('Cumulative Explained Variance by PCA')
84
- plt.xlabel('Number of Principal Components')
85
- plt.ylabel('Cumulative Variance')
86
- plt.legend()
87
- plt.grid(True)
88
- plt.tight_layout()
89
- plots.append(fig4)
90
- plt.close(fig4)
91
-
92
- # Plot 5: Derivative + Normalized Spectra
93
- fig5 = plt.figure(figsize=(16, 8))
94
- y_vals = df['Label'].values
95
- wavelengths = df.columns[1:].astype(float)
96
- X = df.iloc[:, 1:].values
97
- X_deriv = savgol_filter(X, window_length=25, polyorder=5, deriv=1, axis=1)
98
- scaler = MinMaxScaler()
99
- X_deriv_norm = np.array([scaler.fit_transform(row.reshape(-1, 1)).flatten() for row in X_deriv])
100
- unique_labels = np.unique(y_vals)
101
- colors = get_cmap('tab10')(np.linspace(0, 1, len(unique_labels)))
102
- for label, color in zip(unique_labels, colors):
103
- indices = np.where(y_vals == label)[0]
104
- for i in indices:
105
- plt.plot(wavelengths, X_deriv_norm[i], color=color, alpha=0.3, label=f'Milk {label}' if i == indices[0] else '')
106
- plt.title("All Spectra After First Derivative + Normalization")
107
- plt.xlabel("Wavelength (nm)")
108
- plt.ylabel("Normalized First Derivative")
109
- plt.legend(title="Group")
110
- plt.grid(True)
111
- plt.tight_layout()
112
- plots.append(fig5)
113
- plt.close(fig5)
114
-
115
- # Plot 6: Derivative Only (No Norm)
116
- fig6 = plt.figure(figsize=(16, 8))
117
- for label, color in zip(unique_labels, colors):
118
- indices = np.where(y_vals == label)[0]
119
- for i in indices:
120
- plt.plot(wavelengths, X_deriv[i], color=color, alpha=0.3, label=f'Milk {label}' if i == indices[0] else '')
121
- plt.title("All Spectra After First Derivative (No Normalization)")
122
- plt.xlabel("Wavelength (nm)")
123
- plt.ylabel("First Derivative Absorbance")
124
- plt.legend(title="Group")
125
- plt.grid(True)
126
- plt.tight_layout()
127
- plots.append(fig6)
128
- plt.close(fig6)
129
-
130
- # Plot 7: Score + Loadings
131
- fig7, axs = plt.subplots(1, 2, figsize=(14, 5))
132
- wavelength_columns = df.columns[1:]
133
- labels = df.iloc[:, 0]
134
- data = df.iloc[:, 1:].values.astype(float)
135
- derivative_data = np.diff(data, axis=1)
136
  scaler = StandardScaler()
137
- normalized_derivative_data = scaler.fit_transform(derivative_data)
138
- derivative_wavelength_columns = [f'Der_{w1}-{w2}' for w1, w2 in zip(wavelength_columns[:-1], wavelength_columns[1:])]
139
- processed_df = pd.DataFrame(normalized_derivative_data, columns=derivative_wavelength_columns)
140
- processed_df.insert(0, 'Label', labels)
141
- processed_df['Label'] = processed_df['Label'].astype(int)
142
- X_processed = processed_df.drop('Label', axis=1)
143
- y_processed = processed_df['Label']
144
  pca = PCA(n_components=2)
145
- principal_components = pca.fit_transform(X_processed)
146
- pca_df = pd.DataFrame(data=principal_components, columns=['PC1', 'PC2'])
147
- pca_df['Label'] = y_processed.reset_index(drop=True)
148
- targets = y_processed.unique()
149
- cmap = get_cmap('tab10')
150
- for i, target in enumerate(targets):
151
- idx = pca_df['Label'] == target
152
- axs[0].scatter(pca_df.loc[idx, 'PC1'], pca_df.loc[idx, 'PC2'], color=cmap(i % cmap.N), label=f'Label {target}')
153
- axs[0].set_title('Score Plot: PC1 vs. PC2')
154
- axs[0].legend()
155
- axs[0].grid()
156
- loadings = pca.components_.T
157
- axs[1].plot(loadings[:, 0], label='PC1 Loadings')
158
- axs[1].plot(loadings[:, 1], label='PC2 Loadings', color='black')
159
- axs[1].set_title('Loadings Plot')
160
- axs[1].legend()
161
- axs[1].grid()
162
- plt.tight_layout()
163
- plots.append(fig7)
164
- plt.close(fig7)
165
-
166
- # Plot 8: 3x2 PCA Summary
167
- fig8, axs = plt.subplots(3, 2, figsize=(16, 14))
168
- raw_data = df.iloc[:, 1:].values.astype(float)
169
- derivative_data = np.diff(raw_data, axis=1)
170
- scaler = StandardScaler()
171
- raw_scaled = scaler.fit_transform(raw_data)
172
- derivative_scaled = scaler.fit_transform(derivative_data)
173
- pca_raw = PCA(n_components=10)
174
- pca_raw_scores = pca_raw.fit_transform(raw_scaled)
175
- explained_var_raw = np.cumsum(pca_raw.explained_variance_ratio_) * 100
176
- pca_der = PCA(n_components=10)
177
- pca_der_scores = pca_der.fit_transform(derivative_scaled)
178
- explained_var_der = np.cumsum(pca_der.explained_variance_ratio_) * 100
179
- targets = np.unique(labels)
180
- cmap = get_cmap('tab10')
181
- for i, target in enumerate(targets):
182
- idx = labels == target
183
- axs[0, 0].scatter(pca_raw_scores[idx, 0], pca_raw_scores[idx, 1], s=40, label=f'Milk {target}', color=cmap(i % cmap.N))
184
- axs[0, 0].axhline(0, color='gray', linestyle='--', linewidth=2) # Horizontal
185
- axs[0, 0].axvline(0, color='gray', linestyle='--', linewidth=2) # Vertical
186
- axs[0, 1].scatter(pca_der_scores[idx, 0], pca_der_scores[idx, 1], s=40, label=f'Milk {target}', color=cmap(i % cmap.N))
187
- axs[0, 1].axhline(0, color='gray', linestyle='--', linewidth=2) # Horizontal
188
- axs[0, 1].axvline(0, color='gray', linestyle='--', linewidth=2) # Vertical
189
- axs[0, 0].set_title('Raw Data: PCA Score Plot')
190
- axs[0, 1].set_title('1st Derivative: PCA Score Plot')
191
-
192
-
193
- # Row 2: PCA Loadings for Raw and Derivative (with horizontal and vertical lines at 0)
194
- axs[1, 0].plot(pca_raw.components_[0], label='PC1')
195
- axs[1, 0].plot(pca_raw.components_[1], label='PC2')
196
- axs[1, 0].axhline(0, color='gray', linestyle='--', linewidth=2) # Horizontal
197
- axs[1, 0].axvline(0, color='gray', linestyle='--', linewidth=2) # Vertical
198
-
199
- axs[1, 1].plot(pca_der.components_[0], label='PC1')
200
- axs[1, 1].plot(pca_der.components_[1], label='PC2')
201
- axs[1, 1].axhline(0, color='gray', linestyle='--', linewidth=2) # Horizontal
202
- axs[1, 1].axvline(0, color='gray', linestyle='--', linewidth=2) # Vertical
203
-
204
- axs[2, 0].plot(range(1, 11), explained_var_raw, marker='o')
205
- axs[2, 1].plot(range(1, 11), explained_var_der, marker='o')
206
- axs[0, 0].legend(); axs[0, 1].legend()
207
- axs[1, 0].legend(); axs[1, 1].legend()
208
- axs[2, 0].set_ylim(0, 105)
209
- axs[2, 1].set_ylim(0, 105)
210
- axs[2, 0].set_title('Raw Data: Scree Plot')
211
- axs[2, 1].set_title('1st Derivative: Scree Plot')
212
- plt.tight_layout()
213
- plots.append(fig8)
214
- plt.close(fig8)
215
-
216
- return plots
217
-
218
- # Gradio UI
 
 
 
 
 
 
 
 
 
 
 
 
219
  with gr.Blocks() as demo:
220
- gr.Markdown("# 🧪 Dataset Description")
221
  with gr.Tabs():
222
- with gr.Tab("Preview Raw Data"):
223
  gr.DataFrame(df.head(50), label="Preview of Raw Data")
 
224
  with gr.Tab("Visualizations"):
225
  plot_button = gr.Button("Generate Spectroscopy Visualizations")
226
  out_gallery = [gr.Plot() for _ in range(8)]
227
  plot_button.click(fn=plot_all, inputs=[], outputs=out_gallery)
228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
229
  demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)
 
4
  import matplotlib.pyplot as plt
5
  from sklearn.preprocessing import StandardScaler, MinMaxScaler
6
  from sklearn.decomposition import PCA
7
+ from sklearn.model_selection import train_test_split
8
+ from sklearn.ensemble import RandomForestClassifier
9
+ from sklearn.tree import DecisionTreeClassifier
10
+ from sklearn.metrics import accuracy_score, confusion_matrix
11
  from scipy.signal import savgol_filter
12
+ from tensorflow.keras.models import Sequential
13
+ from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense, Dropout
14
+ from tensorflow.keras.utils import to_categorical
15
+ from tensorflow.keras.callbacks import History
16
+ import seaborn as sns
17
+ import io
18
+ import os
19
  from math import pi
20
+ from matplotlib.cm import get_cmap
21
+ import warnings
22
+ warnings.filterwarnings("ignore")
23
 
 
24
  plt.switch_backend('agg')
25
 
26
  # Load dataset
27
  df = pd.read_csv("milk_absorbance.csv")
28
  df.rename(columns={df.columns[0]: 'Label'}, inplace=True)
29
 
30
+ # ===================== Helper Functions =========================
31
def compute_pca_data(df, n_components=2):
    """Standard-scale the spectral columns of *df* and project them with PCA.

    Parameters
    ----------
    df : pandas.DataFrame
        First column is 'Label'; the remaining columns are numeric
        absorbance values (one wavelength per column).
    n_components : int, optional
        Number of principal components to keep. Defaults to 2, matching
        the previously hard-coded behaviour, but is now configurable.

    Returns
    -------
    tuple
        ``(pca_scores, labels)`` — the PCA score array of shape
        ``(n_samples, n_components)`` and the label vector from *df*.
    """
    scaler = StandardScaler()
    features = df.iloc[:, 1:].values.astype(float)
    features_scaled = scaler.fit_transform(features)
    pca = PCA(n_components=n_components)
    pca_data = pca.fit_transform(features_scaled)
    return pca_data, df['Label'].values
38
+
39
def train_model_on_pca(model_name):
    """Train a classical classifier on 2-component PCA features.

    Parameters
    ----------
    model_name : str
        Either ``"Random Forest"`` or ``"Decision Tree"``.

    Returns
    -------
    tuple
        ``(train_accuracies, test_accuracies, cm)`` — two 10-element
        accuracy lists (one entry per pseudo-"epoch", kept for plotting
        parity with the CNN tab) and the test-set confusion matrix.

    Raises
    ------
    ValueError
        If *model_name* is not a supported model. Previously an unknown
        name fell through both branches and raised a confusing
        ``NameError`` on the undefined ``model`` variable.
    """
    X, y = compute_pca_data(df)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y)
    if model_name == "Random Forest":
        model = RandomForestClassifier(n_estimators=100)
    elif model_name == "Decision Tree":
        model = DecisionTreeClassifier()
    else:
        raise ValueError(f"Unsupported model: {model_name!r}")
    # Tree models have no notion of training epochs: fit once instead of
    # refitting ten times in a loop, then replicate the scores so the
    # accuracy-vs-epoch plot keeps its 10-point shape.
    model.fit(X_train, y_train)
    train_acc = accuracy_score(y_train, model.predict(X_train))
    test_acc = accuracy_score(y_test, model.predict(X_test))
    train_accuracies = [train_acc] * 10
    test_accuracies = [test_acc] * 10
    cm = confusion_matrix(y_test, model.predict(X_test))
    return train_accuracies, test_accuracies, cm
55
+
56
def train_1d_cnn():
    """Train a small 1-D CNN on the raw spectra.

    Uses an 80/20 stratified split and a fixed 10-epoch run. Returns
    ``(train_accuracy_history, val_accuracy_history, confusion_matrix)``
    where the histories come from the Keras ``History`` object.
    """
    spectra = df.iloc[:, 1:].values.astype(float)
    labels = df['Label'].astype(int).values
    # Conv1D expects a trailing channel axis: (samples, timesteps, channels).
    spectra = spectra[:, :, np.newaxis]
    labels_onehot = to_categorical(labels)
    X_train, X_test, y_train, y_test = train_test_split(
        spectra, labels_onehot, test_size=0.2, stratify=labels)
    net = Sequential()
    net.add(Conv1D(32, kernel_size=5, activation='relu', input_shape=(spectra.shape[1], 1)))
    net.add(MaxPooling1D(pool_size=2))
    net.add(Flatten())
    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.3))
    net.add(Dense(labels_onehot.shape[1], activation='softmax'))
    net.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
    history = net.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=10, verbose=0)
    cm = confusion_matrix(np.argmax(y_test, axis=1), np.argmax(net.predict(X_test), axis=1))
    return history.history['accuracy'], history.history['val_accuracy'], cm
74
+
75
def create_plot(train_acc, test_acc):
    """Plot train vs. test accuracy per epoch and return the figure.

    Parameters
    ----------
    train_acc, test_acc : sequence of float
        Per-epoch accuracy values. The x-axis is derived from each
        sequence's length instead of a hard-coded ``range(1, 11)``, so
        curves of any length (e.g. a CNN run with a different epoch
        count) plot correctly instead of raising a length mismatch.
    """
    fig, ax = plt.subplots()
    ax.plot(range(1, len(train_acc) + 1), train_acc, label="Train Accuracy")
    ax.plot(range(1, len(test_acc) + 1), test_acc, label="Test Accuracy")
    ax.set_xlabel("Epoch")
    ax.set_ylabel("Accuracy")
    ax.set_title("Train vs Test Accuracy")
    ax.legend()
    return fig
84
+
85
def plot_confusion_matrix(cm):
    """Render *cm* as an annotated heatmap and return the figure.

    Parameters
    ----------
    cm : array-like
        A confusion matrix (integer counts), e.g. from
        ``sklearn.metrics.confusion_matrix``.
    """
    figure, axes = plt.subplots()
    sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=axes)
    axes.set(xlabel="Predicted", ylabel="True", title="Confusion Matrix")
    return figure
92
+
93
def predict_model(input_df, model_name):
    """Train the selected model on the full dataset and predict *input_df*.

    Parameters
    ----------
    input_df : pandas.DataFrame
        Same layout as the training data: first column is a label (it is
        ignored for prediction), remaining columns are absorbance values.
        # NOTE(review): assumes input has the same wavelength columns as
        # the training CSV — confirm upstream validation.
    model_name : str
        One of ``"Random Forest"``, ``"Decision Tree"``, ``"1D CNN"``.

    Returns
    -------
    numpy.ndarray
        Predicted class labels, one per row of *input_df*.

    Raises
    ------
    ValueError
        For an unrecognised *model_name* (previously the function
        silently returned ``None``).
    """
    if model_name in ("Random Forest", "Decision Tree"):
        # BUG FIX: the previous code fitted a *new* scaler and PCA on the
        # input alone, which yields components in a different, incomparable
        # basis (and PCA cannot fit 2 components on a single-row input).
        # Fit on the training data, then transform the new samples with the
        # same fitted scaler and PCA.
        scaler = StandardScaler()
        train_features = df.iloc[:, 1:].values.astype(float)
        train_scaled = scaler.fit_transform(train_features)
        pca = PCA(n_components=2)
        X = pca.fit_transform(train_scaled)
        y = df['Label'].values
        if model_name == "Random Forest":
            model = RandomForestClassifier(n_estimators=100)
        else:
            model = DecisionTreeClassifier()
        model.fit(X, y)
        input_features = input_df.iloc[:, 1:].values.astype(float)
        input_pca = pca.transform(scaler.transform(input_features))
        return model.predict(input_pca)
    elif model_name == "1D CNN":
        # NOTE: trains a fresh CNN on every call (10 epochs) — expensive,
        # but preserved from the original design.
        X = df.iloc[:, 1:].values.astype(float)
        y = df['Label'].astype(int).values
        X = X[:, :, np.newaxis]  # add channel axis for Conv1D
        y_cat = to_categorical(y)
        model = Sequential([
            Conv1D(32, kernel_size=5, activation='relu', input_shape=(X.shape[1], 1)),
            MaxPooling1D(pool_size=2),
            Flatten(),
            Dense(64, activation='relu'),
            Dropout(0.3),
            Dense(y_cat.shape[1], activation='softmax')
        ])
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        model.fit(X, y_cat, epochs=10, verbose=0)
        input_data = input_df.iloc[:, 1:].values.astype(float)[:, :, np.newaxis]
        return np.argmax(model.predict(input_data), axis=1)
    raise ValueError(f"Unsupported model: {model_name!r}")
120
+
121
# ===================== Gradio UI =========================
# Top-level app layout: dataset preview, visualizations, model training
# tabs, and a prediction tab. Launched at the bottom of the file.
with gr.Blocks() as demo:
    gr.Markdown("# 🧪 Milk Spectroscopy Analysis App")
    with gr.Tabs():
        with gr.Tab("Dataset Description"):
            # Static preview of the first 50 rows of the raw spectra CSV.
            gr.DataFrame(df.head(50), label="Preview of Raw Data")

        with gr.Tab("Visualizations"):
            plot_button = gr.Button("Generate Spectroscopy Visualizations")
            out_gallery = [gr.Plot() for _ in range(8)]
            # NOTE(review): `plot_all` does not appear to be defined anywhere
            # in this file as committed — confirm it still exists, otherwise
            # clicking this button raises NameError at runtime.
            plot_button.click(fn=plot_all, inputs=[], outputs=out_gallery)

        with gr.Tab("Models"):
            with gr.Tabs():
                with gr.Tab("Random Forest"):
                    rf_btn = gr.Button("Train Random Forest")
                    rf_table = gr.Dataframe(headers=["Epoch", "Train Acc", "Test Acc"])
                    rf_plot = gr.Plot()
                    rf_cm = gr.Plot()
                    # Train RF on PCA features; return table + two figures.
                    def run_rf():
                        train_acc, test_acc, cm = train_model_on_pca("Random Forest")
                        table = pd.DataFrame({"Epoch": list(range(1, 11)), "Train Acc": train_acc, "Test Acc": test_acc})
                        return table, create_plot(train_acc, test_acc), plot_confusion_matrix(cm)
                    rf_btn.click(fn=run_rf, inputs=[], outputs=[rf_table, rf_plot, rf_cm])

                with gr.Tab("Decision Tree"):
                    dt_btn = gr.Button("Train Decision Tree")
                    dt_table = gr.Dataframe(headers=["Epoch", "Train Acc", "Test Acc"])
                    dt_plot = gr.Plot()
                    dt_cm = gr.Plot()
                    # Same flow as run_rf but with a Decision Tree.
                    def run_dt():
                        train_acc, test_acc, cm = train_model_on_pca("Decision Tree")
                        table = pd.DataFrame({"Epoch": list(range(1, 11)), "Train Acc": train_acc, "Test Acc": test_acc})
                        return table, create_plot(train_acc, test_acc), plot_confusion_matrix(cm)
                    dt_btn.click(fn=run_dt, inputs=[], outputs=[dt_table, dt_plot, dt_cm])

                with gr.Tab("1D CNN (Raw Data)"):
                    cnn_btn = gr.Button("Train 1D CNN")
                    cnn_table = gr.Dataframe(headers=["Epoch", "Train Acc", "Test Acc"])
                    cnn_plot = gr.Plot()
                    cnn_cm = gr.Plot()
                    # Train the CNN on raw spectra; histories come from Keras.
                    def run_cnn():
                        train_acc, test_acc, cm = train_1d_cnn()
                        table = pd.DataFrame({"Epoch": list(range(1, 11)), "Train Acc": train_acc, "Test Acc": test_acc})
                        return table, create_plot(train_acc, test_acc), plot_confusion_matrix(cm)
                    cnn_btn.click(fn=run_cnn, inputs=[], outputs=[cnn_table, cnn_plot, cnn_cm])

        # NOTE(review): the diff does not preserve indentation, so whether
        # "Prediction" nests inside the Models sub-tabs or at the top level
        # is ambiguous — placed at the top level here; confirm intent.
        with gr.Tab("Prediction"):
            model_dropdown = gr.Dropdown(choices=["Random Forest", "Decision Tree", "1D CNN"], label="Select Model")
            input_type = gr.Radio(choices=["Single", "Multiple (CSV)"])
            csv_input = gr.File(file_types=[".csv"], label="Upload CSV")
            predict_btn = gr.Button("Predict")
            output_df = gr.DataFrame()

            # Predict on an uploaded CSV, or on one random training row
            # when "Single" is selected (dummy fallback).
            def predict_fn(model_name, type_sel, file):
                if type_sel == "Multiple (CSV)":
                    data = pd.read_csv(file.name)
                else:
                    data = df.sample(1)  # fallback dummy
                preds = predict_model(data, model_name)
                return pd.DataFrame({"Prediction": preds})

            predict_btn.click(fn=predict_fn, inputs=[model_dropdown, input_type, csv_input], outputs=output_df)

demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)