# Spaces: EzekielMW / Spectroscopy (status: Sleeping)
# File size: 14,661 Bytes

# ✅ FULL INTEGRATED SCRIPT
# Includes the spectroscopy visualizations plus the Models and Takeaways tabs

import gradio as gr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import StandardScaler, MinMaxScaler,LabelEncoder
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, confusion_matrix
from scipy.signal import savgol_filter
from math import pi
from matplotlib.cm import get_cmap
import seaborn as sns
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
from sklearn.model_selection import train_test_split


# Use the non-interactive Agg backend: figures are only handed to the Gradio
# UI, never shown in a window.
plt.switch_backend('agg')

# Load dataset. The first CSV column is renamed to 'Label'; the rest of the
# script treats every remaining column as one spectral channel.
df = pd.read_csv("milk_absorbance.csv")
df = df.rename(columns={df.columns[0]: 'Label'})

# Label encoding is performed once, in the "Encode labels" section that
# follows the plotting code; fitting a second, identical encoder here was
# redundant because nothing in between reads `le` or `y`.
# ---------- Plotting Functions ----------
def _finish(fig):
    """Apply a tight layout to ``fig``, unregister it from pyplot, return it."""
    fig.tight_layout()
    # close() only drops pyplot's reference so figures do not pile up across
    # button clicks; the Figure object itself is still returned to the caller.
    plt.close(fig)
    return fig


def _class_means(columns=None):
    """Yield ``(label, mean_spectrum)`` per class, in first-appearance order.

    ``columns`` optionally restricts the spectrum to a subset of the
    wavelength columns; by default every column after 'Label' is used.
    """
    for label in df['Label'].unique():
        class_df = df[df['Label'] == label]
        spectra = class_df.iloc[:, 1:] if columns is None else class_df[columns]
        yield label, spectra.mean()


def _plot_mean_spectra(title, ylabel, offset_step=0.0):
    """Plot the mean spectrum of each class, shifted by ``i * offset_step``."""
    fig, ax = plt.subplots(figsize=(12, 6))
    for i, (label, mean_spectrum) in enumerate(_class_means()):
        ax.plot(
            mean_spectrum.index.astype(int),
            mean_spectrum + i * offset_step,
            label=f'Label {label}',
        )
    ax.set_title(title)
    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel(ylabel)
    ax.legend(title='Class (Milk Ratio)')
    ax.grid(True)
    return _finish(fig)


def _plot_radar(step=20):
    """Radar plot of the class means on every ``step``-th wavelength column."""
    fig = plt.figure(figsize=(8, 8))
    ax = fig.add_subplot(111, polar=True)
    subset_cols = df.columns[1:][::step]
    n_spokes = len(subset_cols)
    # Trailing 0 repeats the first angle so each polygon is closed.
    angles = [k / float(n_spokes) * 2 * pi for k in range(n_spokes)] + [0]
    for label, mean_spectrum in _class_means(subset_cols):
        values = mean_spectrum.values
        closed = values.tolist() + [values[0]]
        ax.plot(angles, closed, label=f'Label {label}')
        ax.fill(angles, closed, alpha=0.1)
    ax.set_xticks(angles[:-1])
    ax.set_xticklabels(subset_cols.astype(int))
    ax.set_title('Radar Plot of Mean Spectra (Subset Wavelengths)')
    ax.legend(loc='upper right', bbox_to_anchor=(1.3, 1.1))
    return _finish(fig)


def _plot_cumulative_variance(max_components=20):
    """Plot cumulative PCA explained variance of the standardized spectra."""
    fig, ax = plt.subplots(figsize=(8, 5))
    spectra_scaled = StandardScaler().fit_transform(df.iloc[:, 1:].values)
    # PCA cannot extract more components than min(n_samples, n_features);
    # cap the request so small datasets do not raise.
    n_components = min(max_components, *spectra_scaled.shape)
    pca = PCA(n_components=n_components).fit(spectra_scaled)
    explained = np.cumsum(pca.explained_variance_ratio_)
    # x-range is derived from the result rather than hard-coded.
    ax.plot(range(1, len(explained) + 1), explained, marker='o')
    ax.axhline(y=0.95, color='r', linestyle='--', label='95% Variance')
    ax.set_title('Cumulative Explained Variance by PCA')
    ax.set_xlabel('Number of Principal Components')
    ax.set_ylabel('Cumulative Variance')
    ax.legend()
    ax.grid(True)
    return _finish(fig)


def _plot_spectra_by_class(wavelengths, curves, class_labels, title, ylabel):
    """Overlay every row of ``curves``, coloured by its entry in ``class_labels``."""
    fig, ax = plt.subplots(figsize=(16, 8))
    unique_labels = np.unique(class_labels)
    # plt.get_cmap is used because matplotlib.cm.get_cmap was removed in
    # Matplotlib 3.9.
    colors = plt.get_cmap('tab10')(np.linspace(0, 1, len(unique_labels)))
    for label, color in zip(unique_labels, colors):
        indices = np.where(class_labels == label)[0]
        for i in indices:
            # Only the first curve of a class carries a legend entry.
            ax.plot(
                wavelengths,
                curves[i],
                color=color,
                alpha=0.3,
                label=f'Milk {label}' if i == indices[0] else '',
            )
    ax.set_title(title)
    ax.set_xlabel("Wavelength (nm)")
    ax.set_ylabel(ylabel)
    ax.legend(title="Group")
    ax.grid(True)
    return _finish(fig)


def _plot_scores_and_loadings():
    """PC1/PC2 score plot and loadings of the standardized first differences."""
    fig, axs = plt.subplots(1, 2, figsize=(14, 5))
    # Labels are cast to int exactly as before, so non-numeric labels still
    # raise here.
    labels = df.iloc[:, 0].astype(int).to_numpy()
    spectra = df.iloc[:, 1:].values.astype(float)
    derivative = StandardScaler().fit_transform(np.diff(spectra, axis=1))
    pca = PCA(n_components=2)
    scores = pca.fit_transform(derivative)
    cmap = plt.get_cmap('tab10')
    # pd.unique keeps first-appearance order, which fixes the colour order.
    for i, target in enumerate(pd.unique(labels)):
        mask = labels == target
        axs[0].scatter(
            scores[mask, 0],
            scores[mask, 1],
            color=cmap(i % cmap.N),
            label=f'Label {target}',
        )
    axs[0].set_title('Score Plot: PC1 vs. PC2')
    axs[0].legend()
    axs[0].grid()
    axs[1].plot(pca.components_[0], label='PC1 Loadings')
    axs[1].plot(pca.components_[1], label='PC2 Loadings', color='black')
    axs[1].set_title('Loadings Plot')
    axs[1].legend()
    axs[1].grid()
    return _finish(fig)


def _plot_pca_summary(max_components=10):
    """3x2 grid: scores, loadings and scree plot for raw and differenced data."""
    fig, axs = plt.subplots(3, 2, figsize=(16, 14))
    labels = df.iloc[:, 0].to_numpy()
    targets = np.unique(labels)
    raw = df.iloc[:, 1:].values.astype(float)
    cmap = plt.get_cmap('tab10')
    zero_line = dict(color='gray', linestyle='--', linewidth=2)

    panels = (('Raw Data', raw), ('1st Derivative', np.diff(raw, axis=1)))
    for col, (name, data) in enumerate(panels):
        scaled = StandardScaler().fit_transform(data)
        pca = PCA(n_components=min(max_components, *scaled.shape))
        scores = pca.fit_transform(scaled)
        explained = np.cumsum(pca.explained_variance_ratio_) * 100
        score_ax, loading_ax, scree_ax = axs[:, col]

        # Row 1: score plot, one colour per class.
        for i, target in enumerate(targets):
            mask = labels == target
            score_ax.scatter(
                scores[mask, 0],
                scores[mask, 1],
                s=40,
                label=f'Milk {target}',
                color=cmap(i % cmap.N),
            )
        # Reference lines and title are drawn once per axes instead of once
        # per class.
        score_ax.axhline(0, **zero_line)
        score_ax.axvline(0, **zero_line)
        score_ax.set_title(f'{name}: PCA Score Plot')
        score_ax.legend()

        # Row 2: loadings of the first two components.
        loading_ax.plot(pca.components_[0], label='PC1')
        loading_ax.plot(pca.components_[1], label='PC2')
        loading_ax.axhline(0, **zero_line)
        loading_ax.axvline(0, **zero_line)
        loading_ax.legend()

        # Row 3: cumulative explained variance in percent.
        scree_ax.plot(range(1, len(explained) + 1), explained, marker='o')
        scree_ax.set_ylim(0, 105)
        scree_ax.set_title(f'{name}: Scree Plot')
    return _finish(fig)


def plot_all():
    """Build the eight spectroscopy figures shown in the Visualizations tab.

    Reads the module-level ``df`` (column 0 = 'Label', remaining columns =
    absorbance per wavelength) and draws on explicit Axes objects rather than
    pyplot's implicit "current figure".

    Returns:
        list: eight Matplotlib figures, in this order: class means, offset
        class means, radar plot, cumulative PCA variance, normalized
        first-derivative spectra, first-derivative spectra, PCA
        score/loadings, and the 3x2 PCA summary.
    """
    class_labels = df['Label'].values
    wavelengths = df.columns[1:].astype(float)

    # Savitzky-Golay first derivative along the wavelength axis, plus a
    # per-spectrum min-max normalized copy (each row rescaled on its own).
    deriv = savgol_filter(
        df.iloc[:, 1:].values, window_length=25, polyorder=5, deriv=1, axis=1
    )
    scaler = MinMaxScaler()
    deriv_norm = np.array(
        [scaler.fit_transform(row.reshape(-1, 1)).flatten() for row in deriv]
    )

    return [
        _plot_mean_spectra('Mean NIR Spectrum per Milk Ratio Class', 'Absorbance'),
        _plot_mean_spectra(
            'Mean NIR Spectrum per Milk Ratio Class (with Offset)',
            'Absorbance (Offset Applied)',
            offset_step=0.1,
        ),
        _plot_radar(),
        _plot_cumulative_variance(),
        _plot_spectra_by_class(
            wavelengths,
            deriv_norm,
            class_labels,
            "All Spectra After First Derivative + Normalization",
            "Normalized First Derivative",
        ),
        _plot_spectra_by_class(
            wavelengths,
            deriv,
            class_labels,
            "All Spectra After First Derivative (No Normalization)",
            "First Derivative Absorbance",
        ),
        _plot_scores_and_loadings(),
        _plot_pca_summary(),
    ]


# --- Targets and features -------------------------------------------------
# Class labels become consecutive integers; every column after 'Label' is a
# feature.
le = LabelEncoder()
y = le.fit_transform(df['Label'].to_numpy())
X = df.iloc[:, 1:].to_numpy()

# Standardize each feature column.
# NOTE(review): the scaler and the PCA below are fitted on all rows before
# the train/test split, so held-out rows influence the preprocessing.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)

# --- PCA reduction: keep the first two principal components ----------------
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X_scaled)
X_train, X_test, y_train, y_test = train_test_split(
    X_pca,
    y,
    test_size=0.2,
    random_state=42,
)

# --- Classical models trained on the 2-D PCA scores -------------------------
rf = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)
dt = DecisionTreeClassifier(random_state=42).fit(X_train, y_train)

# === CNN ===
class MilkDataset(Dataset):
    """In-memory dataset pairing each spectrum with its encoded class label.

    ``X`` is converted to a float32 tensor and given an extra axis at
    position 1 (the channel axis expected by ``nn.Conv1d``); ``y`` is
    converted to an int64 tensor.
    """

    def __init__(self, X, y):
        features = torch.tensor(X, dtype=torch.float32)
        self.X = features.unsqueeze(1)
        self.y = torch.tensor(y, dtype=torch.long)

    def __len__(self):
        """Return the number of samples."""
        return self.X.shape[0]

    def __getitem__(self, idx):
        """Return the ``(features, label)`` pair stored at ``idx``."""
        sample = self.X[idx]
        target = self.y[idx]
        return sample, target

# Hold-out split of the standardized, non-PCA spectra for the CNN; it uses the
# same test_size and random_state as the PCA split used for the tree models.
X_train_raw, X_test_raw, y_train_raw, y_test_raw = train_test_split(
    X_scaled,
    y,
    test_size=0.2,
    random_state=42,
)

# Mini-batches of 16; only the training loader reshuffles between epochs.
train_loader = DataLoader(
    dataset=MilkDataset(X_train_raw, y_train_raw),
    batch_size=16,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=MilkDataset(X_test_raw, y_test_raw),
    batch_size=16,
)

class CNN1D(nn.Module):
    """Small 1-D convolutional classifier for single-channel spectra.

    Two width-3 convolutions (32 then 64 channels, each followed by ReLU) are
    averaged over the length axis by ``AdaptiveAvgPool1d(1)``, so inputs of
    any spectrum length are accepted, and a linear layer maps the 64 pooled
    features to class logits.

    Args:
        num_classes: Number of output logits. When omitted, it falls back to
            the number of distinct values in the module-level ``y``, which is
            what the class used unconditionally before this parameter existed.
    """

    def __init__(self, num_classes=None):
        super().__init__()
        if num_classes is None:
            # Backward-compatible default: one logit per encoded label.
            num_classes = len(np.unique(y))
        # Attribute name and layer order are kept so state_dict keys stay
        # the same.
        self.net = nn.Sequential(
            nn.Conv1d(1, 32, 3, padding=1),
            nn.ReLU(),
            nn.Conv1d(32, 64, 3, padding=1),
            nn.ReLU(),
            nn.AdaptiveAvgPool1d(1),
            nn.Flatten(),
            nn.Linear(64, num_classes),
        )

    def forward(self, x):
        """Return class logits for a batch ``x`` of shape (batch, 1, length)."""
        return self.net(x)

# Seed PyTorch before the model is built so weight initialisation and the
# DataLoader shuffle order are repeatable between launches, in line with the
# random_state=42 used for every scikit-learn step in this script.
torch.manual_seed(42)

model = CNN1D()
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train for a fixed 10 epochs over the shuffled mini-batches.
for epoch in range(10):
    model.train()
    for Xb, yb in train_loader:
        optimizer.zero_grad()
        loss = criterion(model(Xb), yb)
        loss.backward()
        optimizer.step()

# Evaluate on the full test and training splits in one forward pass each,
# with gradients disabled. Accuracy is the fraction of argmax predictions
# that match the encoded labels.
model.eval()
with torch.no_grad():
    X_test_tensor = torch.tensor(X_test_raw, dtype=torch.float32).unsqueeze(1)
    test_preds = model(X_test_tensor).argmax(dim=1)
    test_acc = (test_preds == torch.tensor(y_test_raw)).float().mean().item()

    X_train_tensor = torch.tensor(X_train_raw, dtype=torch.float32).unsqueeze(1)
    train_preds = model(X_train_tensor).argmax(dim=1)
    train_acc = (train_preds == torch.tensor(y_train_raw)).float().mean().item()

# Assemble the Gradio UI: one Blocks app with four top-level tabs
# (Preview Raw Data, Visualizations, Models, Takeaways).
with gr.Blocks() as demo:
    gr.Markdown("# 🧪 SPECTROSCOPY - YOUR HEALTH OUR CONCERN!!!")

    with gr.Tabs():
        # Static preview of the first 50 rows of the loaded CSV.
        with gr.Tab("Preview Raw Data"):
            gr.DataFrame(df.head(50), label="Preview of Raw Data")

        # Figures are generated on demand when the button is clicked.
        with gr.Tab("Visualizations"):
            plot_button = gr.Button("Generate Spectroscopy Visualizations")
            # Eight Plot outputs, one per figure in the list returned by
            # plot_all; the count must stay in sync with that function.
            out_gallery = [gr.Plot() for _ in range(8)]
            plot_button.click(fn=plot_all, inputs=[], outputs=out_gallery)

        # This tab displays image files referenced by path (rf.png,
        # tree_cm.png, tree.png, 1d.png); the rf, dt and CNN objects trained
        # earlier in this script are not referenced here.
        with gr.Tab("Models"):
            with gr.Tabs():
                with gr.Tab("Random Forest"):
                    gr.Image(value="rf.png", label="Random Forest Output")

                with gr.Tab("Decision Tree"):
                    gr.Markdown("**Confusion Matrix**")
                    gr.Image(value="tree_cm.png", label="Confusion Matrix")
                    gr.Markdown("**Decision Tree Visualization**")
                    gr.Image(value="tree.png", label="Tree Structure")

                with gr.Tab("1D CNN (Raw Data)"):
                    gr.Image(value="1d.png", label="1D CNN Output")

        # Static explanatory text only; no inputs or callbacks in this tab.
        with gr.Tab("Takeaways"):
            gr.Markdown("## 🌿 Why Spectroscopy Matters in the Dairy Ecosystem")

            gr.Markdown("### 👨‍🌾 Farmers")
            gr.Markdown("""
        - ✅ Enables **quick, non-destructive testing** of milk quality at the source.
        - ⚠️ Allows **early detection** of spoilage, contamination, or adulteration.
        - 💰 Supports **transparent and fair pricing** in cooperative and local markets.
            """)

            gr.Markdown("### 🏛️ Government & Regulators")
            gr.Markdown("""
        - 🛡️ Reinforces **food safety and public health** monitoring systems.
        - 📊 Ensures **consistency and traceability** across the dairy supply chain.
        - 🚀 Encourages **innovation in agricultural technologies** and rural development.
            """)

            gr.Markdown("### 🏭 Businesses & Cooperatives")
            gr.Markdown("""
        - ⏱️ Facilitates **real-time quality control** during production and logistics.
        - 💡 Reduces dependency on slow, expensive lab tests.
        - 🤝 Builds **consumer trust** through transparency and quality assurance.
            """)

            gr.Markdown("---")
            gr.Markdown("## 🧬 Parting Thought: Healthy Living Starts with Smart Choices")
            # The trailing double spaces inside this quote are part of the
            # string literal; leave them untouched.
            gr.Markdown("""
        > “Milk is nature’s first food – and spectroscopy helps us keep it honest, pure, and nutritious.”  
        >  
        > Embrace technology. Protect health.  
        > Let's make every drop of milk safe and reliable – for everyone.
            """)

# Start the server at import time, listening on 0.0.0.0:7860 with ssr_mode
# turned off. NOTE(review): there is no `if __name__ == "__main__"` guard, so
# importing this module also launches the app.
demo.launch(server_name="0.0.0.0", server_port=7860, ssr_mode=False)