Spaces:

hawke84
/

logfiles_analysis

Running

App Files Files Community

Jasper Siebelink committed on Apr 14, 2024

Commit

06b052d

1 Parent(s): 9ab5be6

OC_SVM support

Browse files

Files changed (6) hide show

.gitignore +1 -0
app.py +43 -51
app_desktop.py +53 -81
dataset_content.py +37 -0
isolation_forest.py +11 -0
oc_svm.py +14 -0

.gitignore CHANGED Viewed

@@ -1,3 +1,4 @@
 .DS_Store
 .env

+__pycache__
 .DS_Store
 .env

app.py CHANGED Viewed

@@ -1,22 +1,36 @@
 import streamlit as st
 import numpy as np
-from sklearn.ensemble import IsolationForest
 import matplotlib.pyplot as plt
-from mpl_toolkits.mplot3d import Axes3D  # This import is necessary for 3D plotting, even if it seems unused
 import json
 # Title for Streamlit app
-st.title('Isolation Forest Anomaly Detection')
 col1, col2 = st.columns(2)
 # Content from upload
-json_content = None
 with col1:
     with st.container(border=True):
         uploaded_file = st.file_uploader("Upload JSON", type="json")
         if uploaded_file:
-            json_content = json.loads(uploaded_file.getvalue())
 # Content from local file
 with col2:
@@ -24,17 +38,14 @@ with col2:
         st.write('Load embedded JSON')
         if st.button('Load'):
             with open('cattle_log.json', 'r') as file:
-                json_content = json.load(file)
-if json_content:
-    # Select dimensions
-    num_dimensions = st.selectbox('Select number of dimensions:', [1, 2, 3], index=2)
     X = []
     # Iterate over each log entry in the log_content
-    for log_entry in json_content['logs']:
         # Extract and convert the necessary attributes
         total_today_str = log_entry['distanceTraveled']['totalToday'].rstrip('m')
         heart_rate = int(log_entry['healthData']['heartRate'])  # Assuming heart rate is always an integer
@@ -54,50 +65,31 @@ if json_content:
     # Generating synthetic data
     rng = np.random.RandomState(42)
-    # Fit the model
-    clf = IsolationForest(max_samples=100, random_state=rng)
-    clf.fit(X)
-    y_pred = clf.predict(X)
-    # Plotting based on the selected number of dimensions
     if num_dimensions == 3:
-        fig = plt.figure(figsize=(10, 7))
-        ax = fig.add_subplot(111, projection='3d')
-        ax.scatter(X[:, 0], X[:, 1], X[:, 2], color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
-        ax.set_title("Isolation Forest Anomaly Detection (3D)")
         ax.set_xlabel("Distance travelled")
         ax.set_ylabel("Heartrate")
         ax.set_zlabel("Weight")
-        st.pyplot(fig)
-    elif num_dimensions == 1:
-        fig, ax = plt.subplots()
-        # For 1D, ensure to select one dimension (e.g., X[:, 0] for distance travelled)
-        ax.scatter(X[:, 0], np.zeros_like(X[:, 0]), color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
-        ax.set_title("Isolation Forest Anomaly Detection (1D)")
-        ax.set_xlabel("Distance travelled")
-        st.pyplot(fig)
-    else:  # Default to 2D plotting
-        fig, ax = plt.subplots()
-        ax.scatter(X[:, 0], X[:, 1], color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
-        ax.set_title("Isolation Forest Anomaly Detection (2D)")
-        ax.set_xlabel("Distance travelled")
-        ax.set_ylabel("Heartrate")
-        st.pyplot(fig)
-# Random data
-    # Generating a dataset with points. 95 points are generated from a Gaussian distribution,
-    # and 5 points are anomalies added manually.
-    # X = 0.3 * rng.randn(95, num_dimensions)
-    # X = np.r_[X + 2, X - 2]
-    # X_outliers = rng.uniform(low=-4, high=4, size=(5, num_dimensions))
-    # X = np.r_[X, X_outliers]
-    # # # Fit the model
-    # clf = IsolationForest(max_samples=100, random_state=rng)
-    # clf.fit(X)
-    # Predictions
-    # y_pred = clf.predict(X)

 import streamlit as st
 import numpy as np
 import matplotlib.pyplot as plt
 import json
+from isolation_forest import apply_isolation_forest
+from oc_svm import apply_oc_svm
 # Title for Streamlit app
+st.title('Cattle logfile analysis')
 col1, col2 = st.columns(2)
 # Content from upload
+# Streamlit re-executes this script on every widget interaction, so the loaded
+# JSON lives in session_state; create the slot once, on the first run only.
+if 'json_content' not in st.session_state:
+    st.session_state.json_content = None
+# Select dimensions
+num_dimensions = st.selectbox('Select number of dimensions:', [1, 2, 3],
+                              index=2)
+# Select Algorithm
+algorithm = st.selectbox('Select algorithm:', ["Isolation Forest", "One-Class Support Vector Machine"],
+                         index=0)
 with col1:
     with st.container(border=True):
         uploaded_file = st.file_uploader("Upload JSON", type="json")
         if uploaded_file:
+            st.session_state.json_content = json.loads(uploaded_file.getvalue())
 # Content from local file
 with col2:
         st.write('Load embedded JSON')
         if st.button('Load'):
             with open('cattle_log.json', 'r') as file:
+                st.session_state.json_content = json.load(file)
+if st.session_state.json_content:
     X = []
     # Iterate over each log entry in the log_content
+    for log_entry in st.session_state.json_content['logs']:
         # Extract and convert the necessary attributes
         total_today_str = log_entry['distanceTraveled']['totalToday'].rstrip('m')
         heart_rate = int(log_entry['healthData']['heartRate'])  # Assuming heart rate is always an integer
     # Generating synthetic data
     rng = np.random.RandomState(42)
+    if algorithm == 'Isolation Forest':
+        plotted_result = apply_isolation_forest(rng,
+                                                X)
+    else:
+        plotted_result = apply_oc_svm(X)
+    # Create a figure
+    fig, ax = plt.subplots(figsize=(10, 7), subplot_kw={'projection': '3d'} if num_dimensions == 3 else {})
+    # Configure the plot based on the number of dimensions
     if num_dimensions == 3:
+        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=['red' if pred == -1 else 'blue' for pred in plotted_result], s=50)
         ax.set_xlabel("Distance travelled")
         ax.set_ylabel("Heartrate")
         ax.set_zlabel("Weight")
+    else:
+        x_axis = X[:, 0]
+        y_axis = np.zeros_like(X[:, 0]) if num_dimensions == 1 else X[:, 1]
+        ax.scatter(x_axis, y_axis, c=['red' if pred == -1 else 'blue' for pred in plotted_result], s=50)
+        ax.set_xlabel("Distance travelled")
+        ax.set_ylabel("Heartrate" if num_dimensions > 1 else "")
+    # Set common properties and show plot
+    ax.set_title(algorithm)
+    ax.grid(True)
+    st.pyplot(fig)

app_desktop.py CHANGED Viewed

@@ -1,90 +1,62 @@
 import numpy as np
-from sklearn.ensemble import IsolationForest
 import matplotlib.pyplot as plt
-from mpl_toolkits.mplot3d import Axes3D
-import json
-def get_data_from_json():
-    with open('cattle_log.json', 'r') as file:
-        log_content = json.load(file)['logs']
-    X = []
-    # Iterate over each log entry in the log_content
-    for log_entry in log_content:
-        # Extract and convert the necessary attributes
-        total_today_str = log_entry['distanceTraveled']['totalToday'].rstrip('m')
-        heart_rate = int(log_entry['healthData']['heartRate'])  # Assuming heart rate is always an integer
-        weight_str = log_entry['healthData']['weight'].rstrip('kg')
-        # Convert the distance and weight to floating-point values
-        total_today = float(total_today_str)  # Convert distance to float
-        weight = float(weight_str)  # Convert weight to float
-        # Create a 3D vector for the current log entry and append it to the list of vectors
-        vector_3d = [total_today, heart_rate, weight]
-        X.append(vector_3d)
-    # Convert X into a NumPy array for easier slicing
-    X = np.array(X)
-    return X
-def generate_random_data(num_dimensions, rng):
-    # Generating a dataset with 100 points. 95 points are generated from a Gaussian distribution,
-    # and 5 points are anomalies added manually.
-    X = 0.3 * rng.randn(95, num_dimensions)
-    X = np.r_[X + 2, X - 2]
-    X_outliers = rng.uniform(low=-4, high=4, size=(5, num_dimensions))
-    X = np.r_[X, X_outliers]
-    return X
-# Generating synthetic data
-rng = np.random.RandomState(42)
-# Ask the user for the number of dimensions
-num_dimensions = int(input("Select number of dimensions (1, 2, or 3): "))
-# Input data
-X = get_data_from_json()
-# X = generate_random_data(num_dimensions, rng)
-# Fit the model
-clf = IsolationForest(max_samples=100, random_state=rng)
-clf.fit(X)
-# # Predictions
-y_pred = clf.predict(X)
-if num_dimensions == 3:
-    # Plotting in 3D
-    fig = plt.figure(figsize=(10, 7))
-    ax = fig.add_subplot(111, projection='3d')  # Create a 3D subplot
-    # Extracting the three dimensions for plotting
-    x_axis = X[:, 0]
-    y_axis = X[:, 1]
-    z_axis = X[:, 2]
-    # Scatter plot for 3D data
-    ax.scatter(x_axis, y_axis, z_axis, color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
-    ax.set_title("Isolation Forest Anomaly Detection (3D)")
-    ax.set_xlabel("Distance travelled")
-    ax.set_ylabel("Heartrate")
-    ax.set_zlabel("Weight")
-    plt.show()
-else:
     # Plotting
-    plt.figure(figsize=(10, 7))
-    x_axis = X if num_dimensions == 1 else X[:, 0]
-    y_axis = np.zeros_like(X) if num_dimensions == 1 else X[:, 1]
-    plt.scatter(x_axis,y_axis, color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
-    plt.title("Isolation Forest Anomaly Detection")
-    plt.xlabel("Distance travelled")
-    plt.ylabel("Heartrate")
     plt.grid(True)
-    plt.show()

 import numpy as np
 import matplotlib.pyplot as plt
+from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
+import tkinter as tk
+from tkinter import ttk
+from dataset_content import get_data_from_json
+from isolation_forest import apply_isolation_forest
+from oc_svm import apply_oc_svm
+def plot_data(num_dimensions):
+    """Run the selected anomaly detector on the cattle log and plot the result.
+
+    The algorithm comes from the module-level ``combo_box_alg`` widget: index 0
+    selects Isolation Forest, any other index One-Class SVM. The detector is
+    always fed the full array returned by ``get_data_from_json``; only the
+    plot changes with ``num_dimensions``. Samples predicted as ``-1`` are drawn
+    in red, all others in blue.
+
+    Args:
+        num_dimensions: 3 draws a 3D scatter of columns 0, 1 and 2; 1 draws
+            column 0 along a zero baseline; any other value draws column 0
+            against column 1.
+
+    Returns:
+        The matplotlib figure containing the scatter plot.
+    """
+    X = get_data_from_json()
+    # Apply algorithm
+    if combo_box_alg.current() == 0:
+        plotted_result = apply_isolation_forest(np.random.RandomState(42), X)
+    else:
+        plotted_result = apply_oc_svm(X)
+    # One colour per sample, shared by both plot variants
+    point_colors = ['red' if pred == -1 else 'blue' for pred in plotted_result]
+    is_3d = num_dimensions == 3
     # Plotting
+    fig, ax = plt.subplots(figsize=(10, 7), subplot_kw={'projection': '3d'} if is_3d else {})
+    if is_3d:
+        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=point_colors, s=50)
+        ax.set_zlabel("Weight")
+        y_label = "Heartrate"
+    else:
+        # In 1D there is no second feature to show, so points sit on y = 0
+        y_values = X[:, 1] if num_dimensions != 1 else np.zeros_like(X[:, 0])
+        ax.scatter(X[:, 0], y_values, c=point_colors, s=50)
+        y_label = "Heartrate" if num_dimensions > 1 else ""
+    ax.set_xlabel("Distance travelled")
+    ax.set_ylabel(y_label)
     plt.grid(True)
+    return fig
+# Create the main window
+# root is the parent of both combo boxes and of the canvas built in update_plot.
+root = tk.Tk()
+root.title("Dimension Selector")
+def update_plot(event):
+    """Redraw the embedded plot for the current combo-box selections.
+
+    Bound to ``<<ComboboxSelected>>`` on both combo boxes and also called once
+    directly for the initial draw.
+
+    Args:
+        event: the Tk event from the binding, or ``None`` for the initial
+            draw; it is not used.
+    """
+    # The dimension labels start with their digit ("1 Dimension", ...)
+    num_dimensions = int(combo_box_dim.get()[0])
+    fig = plot_data(num_dimensions)
+    canvas = FigureCanvasTkAgg(fig, master=root)
+    canvas.get_tk_widget().grid(row=1, column=0, columnspan=4)
+    canvas.draw()
+    # Every redraw builds a new pyplot figure and Tk widget. Without an
+    # explicit teardown the old widget stays stacked under the new one and
+    # pyplot keeps the old figure alive, so memory grows with each selection.
+    previous = getattr(update_plot, "_canvas", None)
+    if previous is not None:
+        previous.get_tk_widget().destroy()
+        plt.close(previous.figure)
+    update_plot._canvas = canvas
+# Dimension selection
+# Each label starts with its digit; update_plot parses it via combo_box_dim.get()[0].
+combo_box_dim = ttk.Combobox(root, values=("1 Dimension", "2 Dimensions", "3 Dimensions"), state="readonly")
+combo_box_dim.grid(row=0, column=1, pady=10)
+combo_box_dim.current(2)  # preselect "3 Dimensions"
+combo_box_dim.bind("<<ComboboxSelected>>", update_plot)
+# Algorithm selection
+# plot_data reads the selected index: 0 -> Isolation Forest, otherwise One-Class SVM.
+combo_box_alg = ttk.Combobox(root, values=("Isolation Forest", "One-Class Support Vector Machine"), state="readonly")
+combo_box_alg.grid(row=0, column=2, pady=10)
+combo_box_alg.current(0)  # preselect "Isolation Forest"
+combo_box_alg.bind("<<ComboboxSelected>>", update_plot)
+# Draw the initial plot (update_plot does not use its event argument),
+# then hand control to the Tk event loop.
+update_plot(None)
+root.mainloop()

dataset_content.py ADDED Viewed

	@@ -0,0 +1,37 @@

+import numpy as np
+import json
+def get_data_from_json(path: str = 'cattle_log.json') -> np.ndarray:
+    """Load cattle log entries from a JSON file as an ``(n, 3)`` feature matrix.
+
+    Each entry under the top-level ``logs`` key contributes one row:
+    ``[distanceTraveled.totalToday, healthData.heartRate, healthData.weight]``.
+    Trailing ``m`` characters are stripped from the distance and trailing
+    ``k`` / ``g`` characters from the weight before conversion.
+
+    Args:
+        path: JSON file to read. Defaults to ``cattle_log.json`` relative to
+            the current working directory.
+
+    Returns:
+        A float array with one row per log entry and three columns. An empty
+        ``logs`` list yields shape ``(0, 3)`` so callers can still slice
+        columns with ``X[:, i]``.
+
+    Raises:
+        KeyError: if ``logs`` or one of the expected fields is missing.
+        ValueError: if a field cannot be converted to a number.
+    """
+    with open(path, 'r') as file:
+        log_content = json.load(file)['logs']
+    rows = []
+    # Iterate over each log entry in the log_content
+    for log_entry in log_content:
+        health = log_entry['healthData']
+        rows.append([
+            float(log_entry['distanceTraveled']['totalToday'].rstrip('m')),
+            int(health['heartRate']),  # Assuming heart rate is always an integer
+            float(health['weight'].rstrip('kg')),
+        ])
+    # reshape keeps the result two-dimensional even when there are no rows
+    return np.array(rows, dtype=float).reshape(-1, 3)
+def generate_random_data(num_dimensions, rng) -> np.ndarray:
+    """Build a synthetic dataset of two tight clusters plus uniform outliers.
+
+    95 Gaussian samples (scaled by 0.3) are drawn once and used twice, shifted
+    by +2 and by -2, giving 190 clustered points. 5 further points are drawn
+    uniformly from [-4, 4) and appended, for 195 rows in total.
+
+    Args:
+        num_dimensions: number of columns in the generated array.
+        rng: random generator providing ``randn`` and ``uniform``
+            (e.g. ``np.random.RandomState``).
+
+    Returns:
+        Array of shape ``(195, num_dimensions)``; the last 5 rows are the
+        uniformly drawn outliers.
+    """
+    cluster = 0.3 * rng.randn(95, num_dimensions)
+    inliers = np.concatenate((cluster + 2, cluster - 2), axis=0)
+    outliers = rng.uniform(low=-4, high=4, size=(5, num_dimensions))
+    return np.concatenate((inliers, outliers), axis=0)

isolation_forest.py ADDED Viewed

	@@ -0,0 +1,11 @@

+#Isolation Forest
+from matplotlib.pylab import RandomState
+from sklearn.ensemble import IsolationForest
+import numpy as np
+def apply_isolation_forest(rng: RandomState,
+                           all_data: np.ndarray) -> np.ndarray:
+    """Fit an Isolation Forest on ``all_data`` and label every sample.
+
+    Args:
+        rng: random state handed to the forest as ``random_state``.
+        all_data: samples to fit on and then classify, one row per sample.
+
+    Returns:
+        One prediction per row of ``all_data``; callers treat ``-1`` as an
+        anomaly.
+    """
+    forest = IsolationForest(max_samples=40, random_state=rng)
+    forest.fit(all_data)
+    return forest.predict(all_data)

oc_svm.py ADDED Viewed

	@@ -0,0 +1,14 @@

+#One-Class Support Vector Machine
+from sklearn import svm
+import numpy as np
+from sklearn.discriminant_analysis import StandardScaler
+def apply_oc_svm(all_data: np.ndarray) -> np.ndarray:
+    """Scale ``all_data`` and label every sample with a One-Class SVM.
+
+    The features are first passed through a ``StandardScaler`` fitted on the
+    same data, then an RBF-kernel ``OneClassSVM`` (``gamma='auto'``,
+    ``nu=0.2``) is fitted and used to predict on the scaled samples.
+
+    Args:
+        all_data: samples to fit on and then classify, one row per sample.
+
+    Returns:
+        One prediction per row of ``all_data``; callers treat ``-1`` as an
+        anomaly.
+    """
+    # Normalize
+    standardized = StandardScaler().fit_transform(all_data)
+    # Initialize One-Class SVM
+    detector = svm.OneClassSVM(kernel='rbf', gamma='auto', nu=0.2)
+    return detector.fit_predict(standardized)