Spaces:

metricspace
/

cabasus

Build error

App Files Files Community

arcan3 committed on May 13, 2023

Commit

46fcc2f

1 Parent(s): 03507e5

added labelling stages

Browse files

Files changed (11) hide show

.gitignore +3 -0
app.py +13 -7
funcs/dataloader.py +107 -0
funcs/plot_func.py +23 -11
funcs/processor.py +5 -5
funcs/som.py +425 -0
ml_inference.py +30 -0
models/cluster_som2.pkl +3 -0
models/r10d_2.pth +3 -0
requirements.txt +59 -4
test_plot.py +40 -0

.gitignore CHANGED Viewed

@@ -1,4 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
 *$py.class

 # Byte-compiled / optimized / DLL files
+*.zip
+Data-*
+drive-*
 __pycache__/
 *.py[cod]
 *$py.class

app.py CHANGED Viewed

@@ -10,8 +10,7 @@ with gr.Blocks(title='Cabasus') as cabasus_sensor:
         with gr.Row():
             processed_file_box = gr.File(label='Processed CSV File')
             json_file_box = gr.File(label='Generated Json file')
-        video_box = gr.PlayableVideo(label='Video box')
         with gr.Row():
             slice_size_slider = gr.inputs.Slider(16, 512, 1, 64, label="Slice Size")
             sample_rate = gr.inputs.Slider(1, 199, 1, 20, label="Sample rate")
@@ -26,13 +25,20 @@ with gr.Blocks(title='Cabasus') as cabasus_sensor:
             plot_box_overlay = gr.Plot(label="Overlay Signal Plot")
         with gr.Row():
-            slice_slider = gr.Slider(minimum=0, maximum=300, label='Current slice', step=1)
-        slices_per_leg = gr.Textbox(label="Number of slices found per LEG")
-        csv_file_box.change(process_data, inputs=[csv_file_box, slice_size_slider, sample_rate, window_size_slider], outputs=[processed_file_box, json_file_box, slices_per_leg, plot_box_leg, plot_box_overlay, slice_slider])
-        leg_dropdown.change(plot_sensor_data_from_json, inputs=[json_file_box, leg_dropdown], outputs=[plot_box_leg])
-        repeat_process.click(process_data, inputs=[csv_file_box, slice_size_slider, sample_rate, window_size_slider], outputs=[processed_file_box, json_file_box, slices_per_leg, plot_box_leg, plot_box_overlay, slice_slider])
 cabasus_sensor.queue(concurrency_count=2).launch(debug=True)

         with gr.Row():
             processed_file_box = gr.File(label='Processed CSV File')
             json_file_box = gr.File(label='Generated Json file')
         with gr.Row():
             slice_size_slider = gr.inputs.Slider(16, 512, 1, 64, label="Slice Size")
             sample_rate = gr.inputs.Slider(1, 199, 1, 20, label="Sample rate")
             plot_box_overlay = gr.Plot(label="Overlay Signal Plot")
         with gr.Row():
+            slice_slider = gr.Slider(minimum=1, maximum=300, label='Current slice', step=1)
+        with gr.Row():
+            plot_slice_leg = gr.Plot(label="Sliced Signal Plot")
+            get_real_slice = gr.Plot(label="Real Signal Plot")
+        with gr.Row():
+            animation = gr.PlayableVideo(label="Animated horse steps")
+        slices_per_leg = gr.Textbox(label="Number of slices found per LEG")
+        csv_file_box.change(process_data, inputs=[csv_file_box, slice_size_slider, sample_rate, window_size_slider], outputs=[processed_file_box, json_file_box, slices_per_leg, plot_box_leg, plot_box_overlay, slice_slider, plot_slice_leg, get_real_slice])
+        leg_dropdown.change(plot_sensor_data_from_json, inputs=[json_file_box, leg_dropdown, slice_slider], outputs=[plot_box_leg, plot_slice_leg, get_real_slice])
+        repeat_process.click(process_data, inputs=[csv_file_box, slice_size_slider, sample_rate, window_size_slider], outputs=[processed_file_box, json_file_box, slices_per_leg, plot_box_leg, plot_box_overlay, slice_slider, plot_slice_leg, get_real_slice])
+        slice_slider.change(plot_sensor_data_from_json, inputs=[json_file_box, leg_dropdown, slice_slider], outputs=[plot_box_leg, plot_slice_leg, get_real_slice])
 cabasus_sensor.queue(concurrency_count=2).launch(debug=True)

funcs/dataloader.py ADDED Viewed

	@@ -0,0 +1,107 @@

+import glob, json, os
+import torch
+import warnings
+from torch.utils.data import Dataset
+class BaseDataset2(Dataset):
+    """In-memory dataset that keeps features and targets as float tensors."""
+    def __init__(self, x, y):
+        """Build the dataset from two numpy arrays.
+        Args:
+            x(ndarray): Input features, converted to a float tensor.
+            y(ndarray): Targets, converted to a float tensor.
+        """
+        features, targets = (torch.from_numpy(array).float() for array in (x, y))
+        self.data = features
+        self.targets = targets
+        # Bookkeeping attributes; nothing in this class fills them in.
+        self.latents = None
+        self.labels = None
+        self.is_radial = []
+        self.partition = True
+    def __getitem__(self, index):
+        """Return ``(features, target, index)`` for the requested position."""
+        sample = self.data[index]
+        target = self.targets[index]
+        return sample, target, index
+    def __len__(self):
+        """Return the number of samples (first dimension of ``data``)."""
+        return len(self.data)
+    def numpy(self, idx=None):
+        """Export the dataset, or a subset of it, as numpy arrays.
+        Args:
+            idx(int, optional): Index or indices to select; the whole dataset when omitted.
+        Returns:
+            tuple: ``(features, targets)`` where every sample of ``features`` is flattened to one row.
+        """
+        flat = self.data.numpy().reshape((len(self), -1))
+        targets = self.targets
+        if idx is not None:
+            flat, targets = flat[idx], targets[idx]
+        return flat, targets.numpy()
+    def get_latents(self):
+        """Return the stored latent variables (``None`` until something assigns them)."""
+        return self.latents
+def load_json(file_path):
+    """Read the file at ``file_path`` and return its decoded JSON content."""
+    with open(file_path, 'r') as handle:
+        return json.load(handle)
+def read_json_files(file):
+    """Convert the samples of one sliced-sensor JSON file into tensors.
+    Samples with a null time-diff value are dropped silently; samples that do not
+    add up to the expected number of values are dropped with a warning.
+    Args:
+        file: Path of a JSON file holding a list of sample dicts.
+    Returns:
+        tuple: ``(x, y)`` where ``x`` stacks one float32 row per valid sample and
+        ``y`` is a long tensor of ones with one entry per row.
+    Raises:
+        ValueError: If the file contains no valid sample.
+    """
+    # AX1..AX4, AY1..AY4, AZ1..AZ4, GX1..GX4, GY1..GY4, GZ1..GZ4 (list-valued entries)
+    signal_keys = [f'{sensor}{axis}{leg}' for sensor in 'AG' for axis in 'XYZ' for leg in range(1, 5)]
+    # Scalar entries: rounded and multiplied by 20 before being appended
+    timing_keys = [f'GZ{leg}_precise_time_diff' for leg in range(1, 5)] + ['precise_time_diff']
+    all_keys = signal_keys + timing_keys
+    expected_length = 768*2 + 5  # 24 keys * 64 values each + 5 additional values
+    rows = []
+    for sample in load_json(file):
+        row = []
+        for key in all_keys:
+            if key not in sample:
+                warnings.warn(f"KeyError: {key} not found in JSON file: {file}")
+                continue
+            value = sample[key]
+            if key not in timing_keys:
+                row.extend(value)
+            elif value is None:
+                # Null time diff: abandon this sample without a warning
+                break
+            else:
+                row.append(round(value)*20)
+        else:
+            # Only reached when no null time diff interrupted the key loop
+            if len(row) == expected_length:
+                rows.append(torch.tensor(row, dtype=torch.float32))
+            else:
+                warnings.warn(f"Incomplete sample in JSON file: {file}")
+    if not rows:
+        warnings.warn(f"No valid samples found in JSON file: {file}")
+        raise ValueError("No valid samples found in all the JSON files.")
+    return torch.stack(rows), torch.ones(len(rows), dtype=torch.long)

funcs/plot_func.py CHANGED Viewed

@@ -1,10 +1,12 @@
 import json
 import matplotlib
 import pandas as pd
 import matplotlib.pyplot as plt
 matplotlib.use('Agg')
 def plot_sensor_data_from_json(json_file, sensor, slice_select=1):
     # Read the JSON file
@@ -18,35 +20,45 @@ def plot_sensor_data_from_json(json_file, sensor, slice_select=1):
     # Concatenate the slices and create a new timestamp series with 20ms intervals
     timestamps = []
     sensor_data = []
-    for slice_dict in slices:
         start_timestamp = slice_dict["timestamp"]
         slice_length = len(slice_dict[sensor])
-        slice_timestamps = [start_timestamp + 20 * i for i in range(slice_length)]
         timestamps.extend(slice_timestamps)
         sensor_data.extend(slice_dict[sensor])
     # Create a DataFrame with the sensor data
-    data = pd.DataFrame({sensor: sensor_data}, index=timestamps)
     # Plot the sensor data
     fig, ax = plt.subplots(figsize=(12, 6))
-    ax = plt.plot(data[sensor], label=sensor)
-    # Mark the slice start and end points
-    for slice_dict in slices:
-        start_timestamp = slice_dict["timestamp"]
-        end_timestamp = start_timestamp + 20 * (len(slice_dict[sensor]) - 1)
-        plt.axvline(x=start_timestamp, color='black', linestyle=':', label='Start' if start_timestamp == slices[0]["timestamp"] else None)
-        plt.axvline(x=end_timestamp, color='red', linestyle=':', label='End' if end_timestamp == slices[0]["timestamp"] + 20 * (len(slices[0][sensor]) - 1) else None)
     plt.xlabel("Timestamp")
     plt.ylabel(sensor)
     plt.legend()
     plt.tight_layout()
-    return fig
 def plot_overlay_data_from_json(json_file, sensors, use_precise_timestamp=False):
     # Read the JSON file

 import json
 import matplotlib
+import numpy as np
 import pandas as pd
 import matplotlib.pyplot as plt
 matplotlib.use('Agg')
+plt.style.use('ggplot')
 def plot_sensor_data_from_json(json_file, sensor, slice_select=1):
     # Read the JSON file
     # Concatenate the slices and create a new timestamp series with 20ms intervals
     timestamps = []
     sensor_data = []
+    slice_item = []
+    temp_end = 0
+    for slice_count, slice_dict in enumerate(slices):
         start_timestamp = slice_dict["timestamp"]
         slice_length = len(slice_dict[sensor])
+        slice_timestamps = [start_timestamp + 20 * i for i in range(temp_end, slice_length + temp_end)]
         timestamps.extend(slice_timestamps)
         sensor_data.extend(slice_dict[sensor])
+        temp_end += slice_length
+        slice_item.extend([slice_count+1]*len(slice_timestamps))
     # Create a DataFrame with the sensor data
+    data = pd.DataFrame({sensor: sensor_data, 'slice selection': slice_item, 'time': timestamps})
     # Plot the sensor data
     fig, ax = plt.subplots(figsize=(12, 6))
+    ax = plt.plot(data['time'].to_list(), data[sensor].to_list())
+    df_temp = data[data['slice selection'] == int(slice_select)].reset_index()
+    y = [np.NaN]*((int(slice_select)-1)*len(df_temp[sensor].to_list())) + df_temp[sensor].to_list() + [np.NaN]*((len(slices) - int(slice_select))*len(df_temp[sensor].to_list()))
+    x = data['time'].to_list()
+    ax = plt.plot(x, y, '-')
+    plt.xlabel("Timestamp")
+    plt.ylabel(sensor)
+    plt.legend()
+    plt.tight_layout()
+    fig1, ax1 = plt.subplots(figsize=(12, 6))
+    ax1 = plt.plot(df_temp['time'].to_list(), df_temp[sensor].to_list())
     plt.xlabel("Timestamp")
     plt.ylabel(sensor)
     plt.legend()
     plt.tight_layout()
+    return fig, fig1
 def plot_overlay_data_from_json(json_file, sensors, use_precise_timestamp=False):
     # Read the JSON file

funcs/processor.py CHANGED Viewed

@@ -9,11 +9,11 @@ def process_data(input_file, slice_size=64, min_slice_size=16, sample_rate=20, w
     # Read the data from the file, including the CRC column
     try:
         if input_file.name is None:
-            return None, None, None, None, None, None
         data = pd.read_csv(input_file.name, delimiter=";", index_col="NR", usecols=["NR", "TS", "LEG", "GX", "GY", "GZ", "AX", "AY", "AZ", "CRC"])
     except:
         if input_file is None:
-            return None, None, None, None, None, None
         data = pd.read_csv(input_file, delimiter=";", index_col="NR", usecols=["NR", "TS", "LEG", "GX", "GY", "GZ", "AX", "AY", "AZ", "CRC"])
@@ -69,7 +69,7 @@ def process_data(input_file, slice_size=64, min_slice_size=16, sample_rate=20, w
     if not no_significant_change_index.empty:
         # Save the data up to the point where no significant change appears in all channels
         data = data.loc[:no_significant_change_index[0]]
-        return None, None, f'Warning: gap of {gap_size} ms found at line {gap_start_index}', None, None, None
     # Save the resulting DataFrame to a new file
     data.to_csv('output.csv', sep=";", na_rep="NaN", float_format="%.0f")
@@ -77,10 +77,10 @@ def process_data(input_file, slice_size=64, min_slice_size=16, sample_rate=20, w
     file, len_ = slice_csv_to_json('output.csv', slice_size, min_slice_size, sample_rate, window_size=window_size)
     # get the plot automatically
-    sensor_fig = plot_sensor_data_from_json(file, "GZ1")
     overlay_fig = plot_overlay_data_from_json(file, ["GZ1", "GZ2", "GZ3", "GZ4"], use_precise_timestamp=True)
     #
-    return 'output.csv', file, f'{len_}', sensor_fig, overlay_fig, gr.Slider.update(interactive=True, maximum=len_, minimum=1, value=1)

     # Read the data from the file, including the CRC column
     try:
         if input_file.name is None:
+            return None, None, None, None, None, None, None, None
         data = pd.read_csv(input_file.name, delimiter=";", index_col="NR", usecols=["NR", "TS", "LEG", "GX", "GY", "GZ", "AX", "AY", "AZ", "CRC"])
     except:
         if input_file is None:
+            return None, None, None, None, None, None, None, None
         data = pd.read_csv(input_file, delimiter=";", index_col="NR", usecols=["NR", "TS", "LEG", "GX", "GY", "GZ", "AX", "AY", "AZ", "CRC"])
     if not no_significant_change_index.empty:
         # Save the data up to the point where no significant change appears in all channels
         data = data.loc[:no_significant_change_index[0]]
+        return None, None, f'Warning: gap of {gap_size} ms found at line {gap_start_index}', None, None, None, None, None
     # Save the resulting DataFrame to a new file
     data.to_csv('output.csv', sep=";", na_rep="NaN", float_format="%.0f")
     file, len_ = slice_csv_to_json('output.csv', slice_size, min_slice_size, sample_rate, window_size=window_size)
     # get the plot automatically
+    sensor_fig, slice_fig = plot_sensor_data_from_json(file, "GZ1")
     overlay_fig = plot_overlay_data_from_json(file, ["GZ1", "GZ2", "GZ3", "GZ4"], use_precise_timestamp=True)
     #
+    return 'output.csv', file, f'{len_}', sensor_fig, overlay_fig, gr.Slider.update(interactive=True, maximum=len_, minimum=1, value=1), slice_fig, None

funcs/som.py ADDED Viewed

	@@ -0,0 +1,425 @@

+import numpy as np
+import hdbscan
+from minisom import MiniSom
+import pickle
+from collections import Counter
+import matplotlib.pyplot as plt
+import phate
+import imageio
+from tqdm import tqdm
+import io
+import plotly.graph_objs as go
+import plotly.subplots as sp
+import umap
+from sklearn.datasets import make_blobs
+from sklearn.preprocessing import LabelEncoder
+from sklearn.cluster import KMeans
+from sklearn.semi_supervised import LabelSpreading
+from moviepy.editor import *
+class ClusterSOM:
+    """HDBSCAN clustering combined with one MiniSom map per retained cluster.
+    Trained SOMs live in ``som_models`` under 1-based keys; ``cluster_mapping``
+    maps each key back to the HDBSCAN label the SOM was fitted on, and
+    ``mean_values`` / ``sigma_values`` hold per-node distance statistics.
+    ``label_centroids`` / ``label_encodings`` only exist after ``train_label`` has
+    run or after loading a model that was saved with them.
+    """
+    def __init__(self):
+        self.hdbscan_model = None
+        self.som_models = {}
+        self.sigma_values = {}
+        self.mean_values = {}
+        self.cluster_mapping = {}
+        self.embedding = None
+        self.dim_red_op = None
+    def train(self, dataset, min_samples_per_cluster=100, n_clusters=None, som_size=(20, 20), sigma=1.0, learning_rate=0.5, num_iteration=200000, random_seed=42, n_neighbors=5, coverage=0.95):
+        """
+        Train HDBSCAN and SOM models on the given dataset.
+        Args:
+            dataset: 2-D array (samples x features) supporting ``.shape`` and boolean-mask indexing.
+            min_samples_per_cluster: Passed to HDBSCAN as ``min_cluster_size``.
+            n_clusters: Number of largest non-noise clusters that get a SOM. Derived from ``coverage`` when None.
+            som_size: ``(rows, cols)`` of every SOM grid.
+            sigma, learning_rate, random_seed: Forwarded to ``MiniSom``.
+            num_iteration: Iterations for ``MiniSom.train_random``.
+            n_neighbors: Forwarded to ``compute_sigma_values``.
+            coverage: Fraction of all points the most common HDBSCAN labels must cover when ``n_clusters`` is None.
+        """
+        # Train HDBSCAN model
+        print('Identifying clusters in the embedding ...')
+        self.hdbscan_model = hdbscan.HDBSCAN(min_cluster_size=min_samples_per_cluster)
+        self.hdbscan_model.fit(dataset)
+        # Labels ordered from most to least frequent (noise, label -1, included)
+        ranked = Counter(self.hdbscan_model.labels_).most_common()
+        # Calculate n_clusters if not provided
+        if n_clusters is None:
+            total_points = sum(count for _, count in ranked)
+            covered_points = 0
+            n_clusters = 0
+            for _, count in ranked:
+                covered_points += count
+                n_clusters += 1
+                if covered_points / total_points >= coverage:
+                    break
+        # Keep the n_clusters most common real clusters. Filtering the noise label
+        # first (instead of popping the last entry when no noise is present) avoids
+        # discarding a genuine cluster when fewer than n_clusters + 1 labels exist.
+        cluster_labels = [label for label, _ in ranked if label != -1][:n_clusters]
+        # Start from a clean slate so a re-train never keeps SOMs from an earlier run
+        self.som_models, self.sigma_values, self.mean_values, self.cluster_mapping = {}, {}, {}, {}
+        for i, label in tqdm(enumerate(cluster_labels), total=len(cluster_labels), desc="Fitting 2D maps"):
+            cluster_data = dataset[self.hdbscan_model.labels_ == label]
+            som = MiniSom(som_size[0], som_size[1], dataset.shape[1], sigma=sigma, learning_rate=learning_rate, random_seed=random_seed)
+            som.train_random(cluster_data, num_iteration)
+            self.som_models[i+1] = som
+            self.cluster_mapping[i+1] = label
+            # Compute sigma values
+            mean_cluster, sigma_cluster = self.compute_sigma_values(cluster_data, som_size, som, n_neighbors=n_neighbors)
+            self.sigma_values[i+1] = sigma_cluster
+            self.mean_values[i+1] = mean_cluster
+    def compute_sigma_values(self, cluster_data, som_size, som, n_neighbors=5):
+        """
+        Compute, for every SOM node, the mean and standard deviation of the distances
+        between the node weight and the samples whose best matching unit it is.
+        Nodes that win no sample keep 0 for both statistics.
+        ``n_neighbors`` is accepted for interface compatibility but not used here.
+        Returns:
+            tuple: ``(mean_cluster, sigma_cluster)``, two arrays of shape ``som_size``.
+        """
+        som_weights = som.get_weights()
+        # Assign each datapoint to its nearest node
+        partitions = {idx: [] for idx in np.ndindex(som_size[0], som_size[1])}
+        for sample in cluster_data:
+            x, y = som.winner(sample)
+            partitions[(x, y)].append(sample)
+        # Compute the mean distance and std deviation of these partitions
+        mean_cluster = np.zeros(som_size)
+        sigma_cluster = np.zeros(som_size)
+        for idx, members in partitions.items():
+            if not members:
+                continue  # statistics stay at 0 for nodes without samples
+            distances = np.linalg.norm(np.array(members) - som_weights[idx], axis=-1)
+            mean_cluster[idx] = np.mean(distances)
+            sigma_cluster[idx] = np.std(distances)
+        return mean_cluster, sigma_cluster
+    def train_label(self, labeled_data, labels):
+        """
+        Train on labeled data to find centroids and store the label encoding.
+        Sets ``label_centroids`` (KMeans cluster centres, one per distinct label) and
+        ``label_encodings`` (the fitted ``LabelEncoder``).
+        """
+        le = LabelEncoder()
+        encoded_labels = le.fit_transform(labels)
+        unique_labels = np.unique(encoded_labels)
+        # Use label spreading to propagate the labels
+        # NOTE(review): the fitted model is not used afterwards - confirm whether it is still needed.
+        label_prop_model = LabelSpreading(kernel='knn', n_neighbors=5)
+        label_prop_model.fit(labeled_data, encoded_labels)
+        # Find the centroids for each label using KMeans
+        # NOTE(review): KMeans is fitted without the labels, so centroid i is not
+        # guaranteed to belong to encoded label i - verify against the intended use.
+        kmeans = KMeans(n_clusters=len(unique_labels), random_state=42)
+        kmeans.fit(labeled_data)
+        # Store the label centroids and label encodings
+        self.label_centroids = kmeans.cluster_centers_
+        self.label_encodings = le
+    def predict(self, data, sigma_factor=1.5):
+        """
+        Predict the cluster and BMU SOM coordinate for each sample in the data.
+        A sample is accepted when its distance to the nearest BMU is at most
+        ``sigma_factor`` times the mean distance recorded for that node during
+        training; otherwise it is reported as noise.
+        Args:
+            data: Iterable of 1-D samples.
+            sigma_factor: Multiplier applied to the node's mean training distance.
+        Returns:
+            list: One tuple per sample - ``(-1, None)`` for noise,
+            ``(cluster, node)`` without trained labels, or
+            ``(cluster, node, label, label_distance)`` with trained labels.
+        """
+        results = []
+        has_labels = hasattr(self, 'label_centroids')
+        for sample in data:
+            min_distance = float('inf')
+            nearest_cluster_idx = None
+            nearest_node = None
+            for i, som in self.som_models.items():
+                x, y = som.winner(sample)
+                distance = np.linalg.norm(sample - som.get_weights()[x, y])
+                if distance < min_distance:
+                    min_distance = distance
+                    nearest_cluster_idx = i
+                    nearest_node = (x, y)
+            # Check if the nearest node is within the accepted distance
+            if min_distance <= self.mean_values[nearest_cluster_idx][nearest_node] * sigma_factor:
+                if has_labels:
+                    # SOM keys are 1-based, encoded labels 0-based. The centroid is
+                    # looked up with the encoded index; the decoded label is only
+                    # reported, never used as an array index.
+                    encoded_label = nearest_cluster_idx - 1
+                    label = self.label_encodings.inverse_transform([encoded_label])[0]
+                    label_distance = np.linalg.norm(sample - self.label_centroids[encoded_label])
+                    results.append((nearest_cluster_idx, nearest_node, label, label_distance))
+                else:
+                    results.append((nearest_cluster_idx, nearest_node))
+            else:
+                results.append((-1, None))  # Noise
+        return results
+    def plot_embedding(self, new_data=None, dim_reduction='umap', interactive=False):
+        """
+        Plot the clustered dataset and the SOM grid of each cluster in a 3-D embedding.
+        The first call fits the reducer on ``new_data``, which must be the dataset that
+        was passed to ``train`` (its rows are coloured with the HDBSCAN labels). Later
+        calls reuse the cached embedding; when ``new_data`` is given then, it is
+        projected with the fitted reducer and drawn on top as "New data".
+        Raises:
+            ValueError: If the models are not trained, ``dim_reduction`` is unknown,
+                or the data needed to fit the embedding is missing or has the wrong length.
+        """
+        if self.hdbscan_model is None:
+            raise ValueError("HDBSCAN model not trained yet.")
+        if len(self.som_models) == 0:
+            raise ValueError("SOM models not trained yet.")
+        if dim_reduction not in ['phate', 'umap']:
+            raise ValueError("Invalid dimensionality reduction method. Use 'phate' or 'umap'.")
+        n_components = 3
+        cluster_ids = self.hdbscan_model.labels_
+        extra_points = None
+        if self.dim_red_op is None or self.embedding is None:
+            if new_data is None:
+                raise ValueError("new_data is required on the first call to fit the embedding.")
+            if len(new_data) != len(cluster_ids):
+                raise ValueError("new_data must be the training dataset when the embedding is fitted.")
+            if dim_reduction == 'phate':
+                self.dim_red_op = phate.PHATE(n_components=n_components, random_state=42)
+            else:
+                self.dim_red_op = umap.UMAP(n_components=n_components, random_state=42)
+            self.embedding = self.dim_red_op.fit_transform(new_data)
+        elif new_data is not None:
+            extra_points = self.dim_red_op.transform(new_data)
+        embedding = self.embedding
+        if interactive:
+            fig = sp.make_subplots(rows=1, cols=1, specs=[[{'type': 'scatter3d'}]])
+        else:
+            fig = plt.figure(figsize=(30, 30))
+            ax = fig.add_subplot(111, projection='3d')
+        colors = plt.cm.rainbow(np.linspace(0, 1, len(self.som_models) + 1))
+        for reindexed_label, som in self.som_models.items():
+            original_label = self.cluster_mapping[reindexed_label]
+            cluster_data = embedding[cluster_ids == original_label]
+            som_weights = som.get_weights()
+            # Project every node weight into the embedding, then restore the grid layout
+            flat_weights = som_weights.reshape(-1, som_weights.shape[-1])
+            som_embedding = self.dim_red_op.transform(flat_weights).reshape(som_weights.shape[0], som_weights.shape[1], n_components)
+            if interactive:
+                # Plot the original data points
+                fig.add_trace(
+                    go.Scatter3d(
+                        x=cluster_data[:, 0],
+                        y=cluster_data[:, 1],
+                        z=cluster_data[:, 2],
+                        mode='markers',
+                        marker=dict(color=colors[reindexed_label], size=1),
+                        name=f"Cluster {reindexed_label}"
+                    )
+                )
+            else:
+                # Plot the original data points
+                ax.scatter(cluster_data[:, 0], cluster_data[:, 1], cluster_data[:, 2], c=[colors[reindexed_label]], alpha=0.3, s=5, label=f"Cluster {reindexed_label}")
+            for x in range(som_embedding.shape[0]):
+                for y in range(som_embedding.shape[1]):
+                    if interactive:
+                        # Plot the SOM grid
+                        fig.add_trace(
+                            go.Scatter3d(
+                                x=[som_embedding[x, y, 0]],
+                                y=[som_embedding[x, y, 1]],
+                                z=[som_embedding[x, y, 2]],
+                                mode='markers+text',
+                                marker=dict(color=colors[reindexed_label], size=3, symbol='circle'),
+                                text=[f"{x},{y}"],
+                                textposition="top center"
+                            )
+                        )
+                    else:
+                        # Plot the SOM grid
+                        ax.plot([som_embedding[x, y, 0]], [som_embedding[x, y, 1]], [som_embedding[x, y, 2]], '+', markersize=8, mew=2, zorder=10, c=colors[reindexed_label])
+            for i in range(som_embedding.shape[0] - 1):
+                for j in range(som_embedding.shape[1] - 1):
+                    if interactive:
+                        # Plot the SOM connections
+                        fig.add_trace(
+                            go.Scatter3d(
+                                x=np.append(som_embedding[i:i+2, j, 0], som_embedding[i, j:j+2, 0]),
+                                y=np.append(som_embedding[i:i+2, j, 1], som_embedding[i, j:j+2, 1]),
+                                z=np.append(som_embedding[i:i+2, j, 2], som_embedding[i, j:j+2, 2]),
+                                mode='lines',
+                                line=dict(color=colors[reindexed_label], width=2),
+                                showlegend=False
+                            )
+                        )
+                    else:
+                        # Plot the SOM connections
+                        ax.plot(som_embedding[i:i+2, j, 0], som_embedding[i:i+2, j, 1], som_embedding[i:i+2, j, 2], lw=1, c=colors[reindexed_label])
+                        ax.plot(som_embedding[i, j:j+2, 0], som_embedding[i, j:j+2, 1], som_embedding[i, j:j+2, 2], lw=1, c=colors[reindexed_label])
+        noise_data = embedding[cluster_ids == -1]
+        if interactive:
+            # Plot noise
+            if len(noise_data) > 0:
+                fig.add_trace(
+                    go.Scatter3d(
+                        x=noise_data[:, 0],
+                        y=noise_data[:, 1],
+                        z=noise_data[:, 2],
+                        mode='markers',
+                        marker=dict(color="gray", size=1),
+                        name="Noise"
+                    )
+                )
+            # Plot the projected new samples, if any
+            if extra_points is not None and len(extra_points) > 0:
+                fig.add_trace(
+                    go.Scatter3d(
+                        x=extra_points[:, 0],
+                        y=extra_points[:, 1],
+                        z=extra_points[:, 2],
+                        mode='markers',
+                        marker=dict(color="black", size=2),
+                        name="New data"
+                    )
+                )
+            fig.update_layout(scene=dict(xaxis_title='X', yaxis_title='Y', zaxis_title='Z'))
+            fig.show()
+        else:
+            # Plot noise
+            if len(noise_data) > 0:
+                ax.scatter(noise_data[:, 0], noise_data[:, 1], noise_data[:, 2], c="gray", label="Noise")
+            # Plot the projected new samples, if any
+            if extra_points is not None and len(extra_points) > 0:
+                ax.scatter(extra_points[:, 0], extra_points[:, 1], extra_points[:, 2], c="black", marker="x", label="New data")
+            ax.legend()
+            plt.show()
+    def plot_label_heatmap(self):
+        """
+        Plot a heatmap for each main cluster showing the best label for each coordinate in a single subplot layout.
+        Raises:
+            ValueError: If labels or SOM models have not been trained.
+        """
+        if not hasattr(self, 'label_centroids'):
+            raise ValueError("Labels not trained yet.")
+        if len(self.som_models) == 0:
+            raise ValueError("SOM models not trained yet.")
+        n_labels = len(self.label_centroids)
+        n_clusters = len(self.som_models)
+        # Create a subplot layout with a heatmap for each main cluster
+        n_rows = int(np.ceil(np.sqrt(n_clusters)))
+        n_cols = n_rows if n_rows * (n_rows - 1) < n_clusters else n_rows - 1
+        fig, axes = plt.subplots(n_rows, n_cols, figsize=(n_cols * 10, n_rows * 10), squeeze=False)
+        # Work on a copy so set_under() does not alter the shared plt.cm.rainbow instance
+        cmap = plt.cm.rainbow.copy()
+        cmap.set_under(color='white')
+        centroids = np.asarray(self.label_centroids)[:, np.newaxis, np.newaxis, :]
+        for i, (reindexed_label, som) in enumerate(self.som_models.items()):
+            som_weights = som.get_weights()
+            # Distance of every node to every label centroid; the closest centroid wins
+            # (argmin keeps the first label on ties, like a strict '<' comparison loop)
+            distances = np.linalg.norm(som_weights[np.newaxis] - centroids, axis=-1)
+            label_map = np.argmin(distances, axis=0)
+            row, col = i // n_cols, i % n_cols
+            ax = axes[row, col]
+            # NOTE(review): with vmin=0.5, nodes whose best label has index 0 fall below
+            # the colour range and are drawn in the 'under' colour (white) - confirm intended.
+            im = ax.imshow(label_map, cmap=cmap, origin='lower', interpolation='none', vmin=0.5)
+            ax.set_xticks(range(label_map.shape[1]))
+            ax.set_yticks(range(label_map.shape[0]))
+            ax.grid(True, linestyle='-', linewidth=0.5)
+            ax.set_title(f"Label Heatmap for Cluster {reindexed_label}")
+        # Add a colorbar for label colors
+        cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
+        cbar = fig.colorbar(im, cax=cbar_ax, ticks=range(n_labels))
+        cbar.ax.set_yticklabels(self.label_encodings.classes_)
+        # Adjust the layout to fit everything nicely
+        fig.subplots_adjust(wspace=0.5, hspace=0.5, right=0.9)
+        plt.show()
+    def plot_activation(self, data, filename='prediction_output', start=None, end=None):
+        """
+        Generate a GIF (and an MP4 copy) of the prediction output using the activation maps of individual SOMs.
+        One frame is rendered per non-noise sample in ``data[start:end]``; the files are
+        written to ``{filename}.gif`` and ``{filename}.mp4``.
+        Raises:
+            ValueError: If no SOM is trained or every selected sample is classified as noise.
+        """
+        if len(self.som_models) == 0:
+            raise ValueError("SOM models not trained yet.")
+        if start is None:
+            start = 0
+        if end is None:
+            end = len(data)
+        images = []
+        for sample in tqdm(data[start:end], desc="Visualizing prediction output"):
+            prediction = self.predict([sample])[0]
+            if prediction[0] == -1:  # Noise
+                continue
+            # squeeze=False keeps axes two-dimensional even when there is a single SOM
+            fig, axes = plt.subplots(1, len(self.som_models), figsize=(20, 5), sharex=True, sharey=True, squeeze=False)
+            fig.suptitle(f"Activation map for SOM {prediction[0]}, node {prediction[1]}", fontsize=16)
+            im_active = None
+            for idx, (som_key, som) in enumerate(self.som_models.items()):
+                ax = axes[0, idx]
+                # Distance of the sample to every node of this SOM
+                activation_map = np.linalg.norm(som.get_weights() - sample, axis=-1)
+                winner = som.winner(sample)  # Find the BMU for this SOM
+                activation_map[winner] = 0  # Force the BMU to the lowest value of the map
+                if som_key == prediction[0]:  # Active SOM
+                    im_active = ax.imshow(activation_map, cmap='viridis', origin='lower', interpolation='none')
+                    ax.plot(winner[1], winner[0], 'r+')  # Mark the BMU with a red plus sign
+                    ax.set_title(f"SOM {som_key}", color='blue', fontweight='bold')
+                    if hasattr(self, 'label_centroids'):
+                        label_idx = self.label_encodings.inverse_transform([som_key - 1])[0]
+                        ax.set_xlabel(f"Label: {label_idx}", fontsize=12)
+                else:  # Inactive SOM
+                    ax.imshow(activation_map, cmap='gray', origin='lower', interpolation='none')
+                    ax.set_title(f"SOM {som_key}")
+                ax.set_xticks(range(activation_map.shape[1]))
+                ax.set_yticks(range(activation_map.shape[0]))
+                ax.grid(True, linestyle='-', linewidth=0.5)
+            # Create a colorbar for each frame
+            fig.subplots_adjust(right=0.8)
+            cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
+            fig.colorbar(im_active, cax=cbar_ax)
+            # Save the plot to a buffer
+            buf = io.BytesIO()
+            fig.savefig(buf, format='png')
+            buf.seek(0)
+            images.append(imageio.imread(buf))
+            plt.close(fig)
+        if not images:
+            raise ValueError("No frames to render: every selected sample was classified as noise.")
+        # Save the images as a GIF
+        gif_file = f"{filename}.gif"
+        imageio.mimsave(gif_file, images, duration=500, loop=1)
+        # Convert the gif to mp4
+        mp4_file = f"{filename}.mp4"
+        clip = VideoFileClip(gif_file)
+        try:
+            clip.write_videofile(mp4_file, codec='libx264')
+        finally:
+            # Close the clip to release resources even if the conversion fails
+            clip.close()
+    def save(self, file_path):
+        """
+        Save the ClusterSOM model to a file.
+        The pickle holds a tuple of the HDBSCAN model, SOMs, mean values, sigma values
+        and cluster mapping, followed by the label centroids and encodings when present.
+        """
+        model_data = (self.hdbscan_model, self.som_models, self.mean_values, self.sigma_values, self.cluster_mapping)
+        if hasattr(self, 'label_centroids'):
+            model_data += (self.label_centroids, self.label_encodings)
+        with open(file_path, "wb") as f:
+            pickle.dump(model_data, f)
+    def load(self, file_path):
+        """
+        Load a ClusterSOM model from a file written by ``save``.
+        """
+        # SECURITY: pickle.load executes arbitrary code - only load files from a trusted source.
+        with open(file_path, "rb") as f:
+            model_data = pickle.load(f)
+        self.hdbscan_model, self.som_models, self.mean_values, self.sigma_values, self.cluster_mapping = model_data[:5]
+        if len(model_data) > 5:
+            self.label_centroids, self.label_encodings = model_data[5:]

ml_inference.py ADDED Viewed

	@@ -0,0 +1,30 @@

+import torch
+from phate import PHATEAE
+from funcs.som import ClusterSOM
+from funcs.dataloader import BaseDataset2, read_json_files
+# CPU device handle (not referenced again in this script)
+DEVICE = torch.device("cpu")
+# Restore the 10-dimensional reducer from its checkpoint
+reducer10d = PHATEAE(
+    epochs=30,
+    n_components=10,
+    lr=.0001,
+    batch_size=128,
+    t='auto',
+    knn=8,
+    relax=True,
+    metric='euclidean',
+)
+reducer10d.load('models/r10d_2.pth')
+# Restore the trained cluster/SOM model
+cluster_som = ClusterSOM()
+cluster_som.load("models/cluster_som2.pkl")
+train_x, train_y = read_json_files('output.json')
+# Convert tensors to numpy arrays if necessary
+train_x, train_y = (
+    value.numpy() if isinstance(value, torch.Tensor) else value
+    for value in (train_x, train_y)
+)
+# Load the time series slices: one flat row per slice, divided by 32768
+# (presumably 4 legs * 3 axes * 2 sensors * 64 samples + 5 time-diff values - confirm)
+flat_x = train_x.reshape(len(train_x), -1) / 32768
+data = BaseDataset2(flat_x, train_y)
+# Compute the 10-dimensional embedding vector
+embedding10d = reducer10d.transform(data)
+prediction = cluster_som.predict(embedding10d)
+cluster_som.plot_activation(embedding10d)

models/cluster_som2.pkl ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5282b68cae29910b6b38c03e0e7e9ab528fb67ef689812d6b02012950303c2d6
+size 8367290

models/r10d_2.pth ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:5c8272bde6d372c90002f6d6afe39584255a99371bdbf18c54f5f574725d9902
+size 13100259

requirements.txt CHANGED Viewed

@@ -3,66 +3,121 @@ aiohttp==3.8.4
 aiosignal==1.3.1
 altair==4.2.2
 anyio==3.6.2
 async-timeout==4.0.2
 attrs==23.1.0
 certifi==2022.12.7
 charset-normalizer==3.1.0
 click==8.1.3
 contourpy==1.0.7
 cycler==0.11.0
 entrypoints==0.4
 fastapi==0.95.1
 ffmpy==0.3.0
 filelock==3.12.0
 fonttools==4.39.3
 frozenlist==1.3.3
 fsspec==2023.4.0
-gradio==3.28.2
 gradio_client==0.1.4
 h11==0.14.0
 httpcore==0.17.0
 httpx==0.24.0
 huggingface-hub==0.14.1
 idna==3.4
 importlib-resources==5.12.0
 Jinja2==3.1.2
 jsonschema==4.17.3
 kiwisolver==1.4.4
 linkify-it-py==2.0.2
 markdown-it-py==2.2.0
 MarkupSafe==2.1.2
 matplotlib==3.7.1
 mdit-py-plugins==0.3.3
 mdurl==0.1.2
 multidict==6.0.4
 numpy==1.24.3
 orjson==3.8.11
 packaging==23.1
 pandas==2.0.1
 Pillow==9.5.0
 pkgutil_resolve_name==1.3.10
 pydantic==1.10.7
 pydub==0.25.1
 Pygments==2.15.1
 pyparsing==3.0.9
 pyrsistent==0.19.3
 python-dateutil==2.8.2
 python-multipart==0.0.6
 pytz==2023.3
 PyYAML==6.0
-requests==2.29.0
-scipy==1.10.1
 semantic-version==2.10.0
 six==1.16.0
 sniffio==1.3.0
 starlette==0.26.1
 toolz==0.12.0
 tqdm==4.65.0
 typing_extensions==4.5.0
 tzdata==2023.3
 uc-micro-py==1.0.2
-urllib3==1.26.15
 uvicorn==0.22.0
 websockets==11.0.2
 yarl==1.9.2
 zipp==3.15.0

 aiosignal==1.3.1
 altair==4.2.2
 anyio==3.6.2
+appnope==0.1.3
+asttokens==2.2.1
 async-timeout==4.0.2
 attrs==23.1.0
+babyplots==1.7.0
+backcall==0.2.0
 certifi==2022.12.7
 charset-normalizer==3.1.0
 click==8.1.3
 contourpy==1.0.7
 cycler==0.11.0
+Cython==0.29.34
+decorator==4.4.2
+Deprecated==1.2.13
 entrypoints==0.4
+executing==1.2.0
 fastapi==0.95.1
 ffmpy==0.3.0
 filelock==3.12.0
 fonttools==4.39.3
 frozenlist==1.3.3
 fsspec==2023.4.0
+future==0.18.3
+gradio==3.28.3
 gradio_client==0.1.4
+graphtools==1.5.3
 h11==0.14.0
+hdbscan==0.8.29
 httpcore==0.17.0
 httpx==0.24.0
 huggingface-hub==0.14.1
 idna==3.4
+imageio==2.28.1
+imageio-ffmpeg==0.4.8
+importlib-metadata==6.6.0
 importlib-resources==5.12.0
+ipython==8.12.2
+jedi==0.18.2
 Jinja2==3.1.2
+joblib==1.2.0
 jsonschema==4.17.3
 kiwisolver==1.4.4
+lazy_loader==0.2
 linkify-it-py==2.0.2
+llvmlite==0.40.0
 markdown-it-py==2.2.0
 MarkupSafe==2.1.2
 matplotlib==3.7.1
+matplotlib-inline==0.1.6
 mdit-py-plugins==0.3.3
 mdurl==0.1.2
+MiniSom==2.3.1
+moviepy==1.0.3
+mpmath==1.3.0
 multidict==6.0.4
+networkx==3.1
+numba==0.57.0
+numexpr==2.8.4
 numpy==1.24.3
 orjson==3.8.11
 packaging==23.1
 pandas==2.0.1
+parso==0.8.3
+pexpect==4.8.0
+phate @ git+https://github.com/metric-space-ai/phate.git@5fcb5bc29f6634391b0ad3831544b09a23123122
+pickleshare==0.7.5
 Pillow==9.5.0
 pkgutil_resolve_name==1.3.10
+plotly==5.14.1
+proglog==0.1.10
+prompt-toolkit==3.0.38
+ptyprocess==0.7.0
+pure-eval==0.2.2
 pydantic==1.10.7
+pydiffmap==0.2.0.1
 pydub==0.25.1
 Pygments==2.15.1
+PyGSP==0.5.1
+pynndescent==0.5.10
 pyparsing==3.0.9
 pyrsistent==0.19.3
 python-dateutil==2.8.2
 python-multipart==0.0.6
 pytz==2023.3
+PyWavelets==1.4.1
 PyYAML==6.0
+requests==2.30.0
+scikit-image==0.20.0
+scikit-learn==1.2.2
+scipy==1.9.1
+seaborn==0.12.2
 semantic-version==2.10.0
 six==1.16.0
 sniffio==1.3.0
+stack-data==0.6.2
 starlette==0.26.1
+sympy==1.12
+tasklogger==1.2.0
+tenacity==8.2.2
+threadpoolctl==3.1.0
+tifffile==2023.4.12
 toolz==0.12.0
+torch==2.0.1
+torchaudio==2.0.2
+torchvision==0.15.2
 tqdm==4.65.0
+traitlets==5.9.0
 typing_extensions==4.5.0
 tzdata==2023.3
 uc-micro-py==1.0.2
+umap-learn==0.5.3
+urllib3==2.0.2
 uvicorn==0.22.0
+wcwidth==0.2.6
 websockets==11.0.2
+wrapt==1.15.0
 yarl==1.9.2
 zipp==3.15.0

test_plot.py CHANGED Viewed

	@@ -0,0 +1,40 @@

+import matplotlib.pyplot as plt
+import json
+import pandas as pd
+import numpy as np
+plt.style.use('ggplot')
+def plot_overlay_data_from_json(json_file, sensors, use_precise_timestamp=False, slice_select=1):
+    # Read the JSON file
+    with open(json_file, "r") as f:
+        slices = json.load(f)
+    # Set up the colormap
+    cmap = plt.get_cmap('viridis')
+    # Create subplots for each sensor
+    fig, axs = plt.subplots(len(sensors), 1, figsize=(12, 2 * len(sensors)), sharex=True)
+    for idx, sensor in enumerate(sensors):
+        # Plot the overlay of the slices
+        for slice_idx, slice_dict in enumerate(slices):
+            slice_length = len(slice_dict[sensor])
+            # Create timestamp array starting from 0 for each slice
+            slice_timestamps = [20 * i for i in range(slice_length)]
+            sensor_data = slice_dict[sensor]
+            data = pd.DataFrame({sensor: sensor_data}, index=slice_timestamps)
+            color = cmap(slice_idx / len(slices))
+            axs[idx].plot(data[sensor], color=color, label=f'Slice {slice_idx + 1}')
+        axs[idx].set_ylabel(sensor)
+    axs[-1].set_xlabel("Timestamp")
+    axs[0].legend()
+    return fig
+plot_overlay_data_from_json('output.json', ["GZ1", "GZ2", "GZ3", "GZ4"], 4)