Jasper Siebelink committed on
Commit
06b052d
·
1 Parent(s): 9ab5be6

OC_SVM support

Browse files
Files changed (6) hide show
  1. .gitignore +1 -0
  2. app.py +43 -51
  3. app_desktop.py +53 -81
  4. dataset_content.py +37 -0
  5. isolation_forest.py +11 -0
  6. oc_svm.py +14 -0
.gitignore CHANGED
@@ -1,3 +1,4 @@
 
1
  .DS_Store
2
  .env
3
 
 
1
+ __pycache__
2
  .DS_Store
3
  .env
4
 
app.py CHANGED
@@ -1,22 +1,36 @@
1
  import streamlit as st
2
  import numpy as np
3
- from sklearn.ensemble import IsolationForest
4
  import matplotlib.pyplot as plt
5
- from mpl_toolkits.mplot3d import Axes3D # This import is necessary for 3D plotting, even if it seems unused
6
  import json
7
 
 
 
 
8
  # Title for Streamlit app
9
- st.title('Isolation Forest Anomaly Detection')
10
 
11
  col1, col2 = st.columns(2)
12
 
13
  # Content from upload
14
- json_content = None
 
 
 
 
 
 
 
 
 
 
 
 
 
15
  with col1:
16
  with st.container(border=True):
17
  uploaded_file = st.file_uploader("Upload JSON", type="json")
18
  if uploaded_file:
19
- json_content = json.loads(uploaded_file.getvalue())
20
 
21
  # Content from local file
22
  with col2:
@@ -24,17 +38,14 @@ with col2:
24
  st.write('Load embedded JSON')
25
  if st.button('Load'):
26
  with open('cattle_log.json', 'r') as file:
27
- json_content = json.load(file)
28
-
29
 
30
- if json_content:
31
- # Select dimensions
32
- num_dimensions = st.selectbox('Select number of dimensions:', [1, 2, 3], index=2)
33
 
 
34
  X = []
35
 
36
  # Iterate over each log entry in the log_content
37
- for log_entry in json_content['logs']:
38
  # Extract and convert the necessary attributes
39
  total_today_str = log_entry['distanceTraveled']['totalToday'].rstrip('m')
40
  heart_rate = int(log_entry['healthData']['heartRate']) # Assuming heart rate is always an integer
@@ -54,50 +65,31 @@ if json_content:
54
  # Generating synthetic data
55
  rng = np.random.RandomState(42)
56
 
57
- # Fit the model
58
- clf = IsolationForest(max_samples=100, random_state=rng)
59
- clf.fit(X)
60
- y_pred = clf.predict(X)
 
61
 
62
- # Plotting based on the selected number of dimensions
 
 
 
63
  if num_dimensions == 3:
64
- fig = plt.figure(figsize=(10, 7))
65
- ax = fig.add_subplot(111, projection='3d')
66
- ax.scatter(X[:, 0], X[:, 1], X[:, 2], color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
67
- ax.set_title("Isolation Forest Anomaly Detection (3D)")
68
  ax.set_xlabel("Distance travelled")
69
  ax.set_ylabel("Heartrate")
70
  ax.set_zlabel("Weight")
71
- st.pyplot(fig)
72
- elif num_dimensions == 1:
73
- fig, ax = plt.subplots()
74
- # For 1D, ensure to select one dimension (e.g., X[:, 0] for distance travelled)
75
- ax.scatter(X[:, 0], np.zeros_like(X[:, 0]), color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
76
- ax.set_title("Isolation Forest Anomaly Detection (1D)")
77
- ax.set_xlabel("Distance travelled")
78
- st.pyplot(fig)
79
- else: # Default to 2D plotting
80
- fig, ax = plt.subplots()
81
- ax.scatter(X[:, 0], X[:, 1], color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
82
- ax.set_title("Isolation Forest Anomaly Detection (2D)")
83
- ax.set_xlabel("Distance travelled")
84
- ax.set_ylabel("Heartrate")
85
- st.pyplot(fig)
86
-
87
-
88
-
89
 
90
- # Random data
91
- # Generating a dataset with points. 95 points are generated from a Gaussian distribution,
92
- # and 5 points are anomalies added manually.
93
- # X = 0.3 * rng.randn(95, num_dimensions)
94
- # X = np.r_[X + 2, X - 2]
95
- # X_outliers = rng.uniform(low=-4, high=4, size=(5, num_dimensions))
96
- # X = np.r_[X, X_outliers]
97
-
98
- # # # Fit the model
99
- # clf = IsolationForest(max_samples=100, random_state=rng)
100
- # clf.fit(X)
101
 
102
- # Predictions
103
- # y_pred = clf.predict(X)
 
1
  import streamlit as st
2
  import numpy as np
 
3
  import matplotlib.pyplot as plt
 
4
  import json
5
 
6
+ from isolation_forest import apply_isolation_forest
7
+ from oc_svm import apply_oc_svm
8
+
9
  # Title for Streamlit app
10
+ st.title('Cattle logfile analysis')
11
 
12
  col1, col2 = st.columns(2)
13
 
14
  # Content from upload
15
# Create the session slot exactly once. The guard keeps content that was
# stored on an earlier run of this script from being overwritten with None.
if 'json_content' not in st.session_state:
    st.session_state.json_content = None
19
+
20
+
21
+ # Select dimensions
22
+ num_dimensions = st.selectbox('Select number of dimensions:', [1, 2, 3],
23
+ index=2)
24
+
25
+ # Select Algorithm
26
+ algorithm = st.selectbox('Select algorithm:', ["Isolation Forest", "One-Class Support Vector Machine"],
27
+ index=0)
28
+
29
  with col1:
30
  with st.container(border=True):
31
  uploaded_file = st.file_uploader("Upload JSON", type="json")
32
  if uploaded_file:
33
+ st.session_state.json_content = json.loads(uploaded_file.getvalue())
34
 
35
  # Content from local file
36
  with col2:
 
38
  st.write('Load embedded JSON')
39
  if st.button('Load'):
40
  with open('cattle_log.json', 'r') as file:
41
+ st.session_state.json_content = json.load(file)
 
42
 
 
 
 
43
 
44
+ if st.session_state.json_content:
45
  X = []
46
 
47
  # Iterate over each log entry in the log_content
48
+ for log_entry in st.session_state.json_content['logs']:
49
  # Extract and convert the necessary attributes
50
  total_today_str = log_entry['distanceTraveled']['totalToday'].rstrip('m')
51
  heart_rate = int(log_entry['healthData']['heartRate']) # Assuming heart rate is always an integer
 
65
  # Generating synthetic data
66
  rng = np.random.RandomState(42)
67
 
68
+ if algorithm == 'Isolation Forest':
69
+ plotted_result = apply_isolation_forest(rng,
70
+ X)
71
+ else:
72
+ plotted_result = apply_oc_svm(X)
73
 
74
+ # Create a figure
75
+ fig, ax = plt.subplots(figsize=(10, 7), subplot_kw={'projection': '3d'} if num_dimensions == 3 else {})
76
+
77
+ # Configure the plot based on the number of dimensions
78
  if num_dimensions == 3:
79
+ ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=['red' if pred == -1 else 'blue' for pred in plotted_result], s=50)
 
 
 
80
  ax.set_xlabel("Distance travelled")
81
  ax.set_ylabel("Heartrate")
82
  ax.set_zlabel("Weight")
83
+ else:
84
+ x_axis = X[:, 0]
85
+ y_axis = np.zeros_like(X[:, 0]) if num_dimensions == 1 else X[:, 1]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
 
87
+ ax.scatter(x_axis, y_axis, c=['red' if pred == -1 else 'blue' for pred in plotted_result], s=50)
88
+ ax.set_xlabel("Distance travelled")
89
+ ax.set_ylabel("Heartrate" if num_dimensions > 1 else "")
90
+
91
+ # Set common properties and show plot
92
+ ax.set_title(algorithm)
93
+ ax.grid(True)
94
+ st.pyplot(fig)
 
 
 
95
 
 
 
app_desktop.py CHANGED
@@ -1,90 +1,62 @@
1
  import numpy as np
2
- from sklearn.ensemble import IsolationForest
3
  import matplotlib.pyplot as plt
4
- from mpl_toolkits.mplot3d import Axes3D
5
- import json
 
6
 
7
- def get_data_from_json():
8
- with open('cattle_log.json', 'r') as file:
9
- log_content = json.load(file)['logs']
10
 
11
- X = []
 
12
 
13
- # Iterate over each log entry in the log_content
14
- for log_entry in log_content:
15
- # Extract and convert the necessary attributes
16
- total_today_str = log_entry['distanceTraveled']['totalToday'].rstrip('m')
17
- heart_rate = int(log_entry['healthData']['heartRate']) # Assuming heart rate is always an integer
18
- weight_str = log_entry['healthData']['weight'].rstrip('kg')
19
-
20
- # Convert the distance and weight to floating-point values
21
- total_today = float(total_today_str) # Convert distance to float
22
- weight = float(weight_str) # Convert weight to float
23
-
24
- # Create a 3D vector for the current log entry and append it to the list of vectors
25
- vector_3d = [total_today, heart_rate, weight]
26
- X.append(vector_3d)
27
 
28
- # Convert X into a NumPy array for easier slicing
29
- X = np.array(X)
30
- return X
31
 
32
-
33
- def generate_random_data(num_dimensions, rng):
34
- # Generating a dataset with 100 points. 95 points are generated from a Gaussian distribution,
35
- # and 5 points are anomalies added manually.
36
- X = 0.3 * rng.randn(95, num_dimensions)
37
- X = np.r_[X + 2, X - 2]
38
- X_outliers = rng.uniform(low=-4, high=4, size=(5, num_dimensions))
39
- X = np.r_[X, X_outliers]
40
- return X
41
-
42
-
43
-
44
- # Generating synthetic data
45
- rng = np.random.RandomState(42)
46
-
47
- # Ask the user for the number of dimensions
48
- num_dimensions = int(input("Select number of dimensions (1, 2, or 3): "))
49
-
50
- # Input data
51
- X = get_data_from_json()
52
- # X = generate_random_data(num_dimensions, rng)
53
-
54
- # Fit the model
55
- clf = IsolationForest(max_samples=100, random_state=rng)
56
- clf.fit(X)
57
-
58
- # # Predictions
59
- y_pred = clf.predict(X)
60
-
61
- if num_dimensions == 3:
62
- # Plotting in 3D
63
- fig = plt.figure(figsize=(10, 7))
64
- ax = fig.add_subplot(111, projection='3d') # Create a 3D subplot
65
-
66
- # Extracting the three dimensions for plotting
67
- x_axis = X[:, 0]
68
- y_axis = X[:, 1]
69
- z_axis = X[:, 2]
70
-
71
- # Scatter plot for 3D data
72
- ax.scatter(x_axis, y_axis, z_axis, color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
73
-
74
- ax.set_title("Isolation Forest Anomaly Detection (3D)")
75
- ax.set_xlabel("Distance travelled")
76
- ax.set_ylabel("Heartrate")
77
- ax.set_zlabel("Weight")
78
- plt.show()
79
-
80
- else:
81
  # Plotting
82
- plt.figure(figsize=(10, 7))
83
- x_axis = X if num_dimensions == 1 else X[:, 0]
84
- y_axis = np.zeros_like(X) if num_dimensions == 1 else X[:, 1]
85
- plt.scatter(x_axis,y_axis, color=['red' if pred == -1 else 'blue' for pred in y_pred], s=50)
86
- plt.title("Isolation Forest Anomaly Detection")
87
- plt.xlabel("Distance travelled")
88
- plt.ylabel("Heartrate")
 
 
 
 
 
 
 
89
  plt.grid(True)
90
- plt.show()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import numpy as np
 
2
  import matplotlib.pyplot as plt
3
+ from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
4
+ import tkinter as tk
5
+ from tkinter import ttk
6
 
7
+ from dataset_content import get_data_from_json
8
+ from isolation_forest import apply_isolation_forest
9
+ from oc_svm import apply_oc_svm
10
 
11
def plot_data(num_dimensions):
    """Run the selected anomaly detector on the cattle log and plot the result.

    The detector is chosen from ``combo_box_alg`` (index 0 selects the
    Isolation Forest, anything else the One-Class SVM) and is always fitted on
    the full array returned by ``get_data_from_json()``; ``num_dimensions``
    only controls how many of its columns are drawn.

    Args:
        num_dimensions: 3 draws a 3D scatter of columns 0, 1 and 2; 1 draws
            column 0 against a constant zero; any other value draws column 0
            against column 1.

    Returns:
        The matplotlib figure holding the scatter plot. Points whose
        prediction equals -1 are red, all others blue.
    """
    rng = np.random.RandomState(42)

    X = get_data_from_json()

    # Apply algorithm
    use_isolation_forest = combo_box_alg.current() == 0
    plotted_result = apply_isolation_forest(rng, X) if use_isolation_forest else apply_oc_svm(X)

    # One colour per sample, shared by the 3D and the 1D/2D branch.
    colours = ['red' if pred == -1 else 'blue' for pred in plotted_result]

    # Plotting
    is_3d = num_dimensions == 3
    fig, ax = plt.subplots(figsize=(10, 7), subplot_kw={'projection': '3d'} if is_3d else {})
    if is_3d:
        ax.scatter(X[:, 0], X[:, 1], X[:, 2], c=colours, s=50)
        ax.set_xlabel("Distance travelled")
        ax.set_ylabel("Heartrate")
        ax.set_zlabel("Weight")
    else:
        x_axis = X[:, 0]
        y_axis = np.zeros_like(X[:, 0]) if num_dimensions == 1 else X[:, 1]

        ax.scatter(x_axis, y_axis, c=colours, s=50)
        ax.set_xlabel("Distance travelled")
        ax.set_ylabel("Heartrate" if num_dimensions > 1 else "")

    # Address the axes object directly instead of pyplot's implicit "current
    # axes", and title the plot with the selected algorithm.
    ax.set_title(combo_box_alg.get())
    ax.grid(True)
    return fig
36
+
37
# Create the main window
root = tk.Tk()
root.title("Dimension Selector")

# Canvas currently embedded in the window (None until the first plot). It is
# remembered so the next redraw can release it instead of stacking a new
# widget and a new pyplot figure on top of it.
_active_canvas = None


def update_plot(event):
    """Redraw the plot for the current combobox selections.

    Bound to ``<<ComboboxSelected>>`` on both comboboxes and also called once
    with ``None`` at start-up; ``event`` itself is not used.
    """
    global _active_canvas

    # The entries read "1 Dimension", "2 Dimensions", ...; the leading
    # character is the dimension count.
    num_dimensions = int(combo_box_dim.get()[0])

    # Build the new figure first, so a failure leaves the old plot on screen.
    fig = plot_data(num_dimensions)

    if _active_canvas is not None:
        # Drop the previous Tk widget and unregister its figure from pyplot;
        # otherwise every selection leaks one widget and one figure.
        _active_canvas.get_tk_widget().destroy()
        plt.close(_active_canvas.figure)

    _active_canvas = FigureCanvasTkAgg(fig, master=root)
    _active_canvas.get_tk_widget().grid(row=1, column=0, columnspan=4)
    _active_canvas.draw()


# Dimension selection
combo_box_dim = ttk.Combobox(root, values=("1 Dimension", "2 Dimensions", "3 Dimensions"), state="readonly")
combo_box_dim.grid(row=0, column=1, pady=10)
combo_box_dim.current(2)
combo_box_dim.bind("<<ComboboxSelected>>", update_plot)

# Algorithm selection
combo_box_alg = ttk.Combobox(root, values=("Isolation Forest", "One-Class Support Vector Machine"), state="readonly")
combo_box_alg.grid(row=0, column=2, pady=10)
combo_box_alg.current(0)
combo_box_alg.bind("<<ComboboxSelected>>", update_plot)

# Draw the initial plot, then hand control to Tk.
update_plot(None)
root.mainloop()
dataset_content.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import json
3
+
4
def get_data_from_json(path: str = 'cattle_log.json') -> np.ndarray:
    """Load a cattle log file and return one feature row per log entry.

    Args:
        path: Location of the JSON log. Defaults to ``'cattle_log.json'`` in
            the current working directory, as before.

    Returns:
        A float array of shape ``(n_entries, 3)`` whose columns are
        ``[distance travelled today, heart rate, weight]``. An empty ``logs``
        list yields shape ``(0, 3)`` so column slicing such as ``X[:, 0]``
        still works.

    Raises:
        OSError: If the file cannot be opened.
        KeyError: If the document or an entry lacks an expected key.
        ValueError: If a value cannot be converted to a number.
    """
    with open(path, 'r', encoding='utf-8') as file:
        log_content = json.load(file)['logs']

    rows = [
        [
            # Distance and weight are stored as strings; trailing 'm' and
            # 'k'/'g' characters are stripped before conversion.
            float(log_entry['distanceTraveled']['totalToday'].rstrip('m')),
            int(log_entry['healthData']['heartRate']),  # Assuming heart rate is always an integer
            float(log_entry['healthData']['weight'].rstrip('kg')),
        ]
        for log_entry in log_content
    ]

    # reshape(-1, 3) keeps the result two-dimensional even when rows is empty.
    return np.array(rows, dtype=float).reshape(-1, 3)
28
+
29
+
30
def generate_random_data(num_dimensions, rng, *, n_inliers=95, n_outliers=5) -> np.ndarray:
    """Generate a synthetic two-cluster dataset with uniform outliers appended.

    ``n_inliers`` Gaussian points (standard deviation 0.3) are drawn once and
    used twice: shifted by +2 and by -2 in every dimension. ``n_outliers``
    points drawn uniformly from [-4, 4) are appended after them.

    Args:
        num_dimensions: Number of columns in the generated data.
        rng: A ``numpy.random.RandomState``-like object providing ``randn``
            and ``uniform``.
        n_inliers: Number of Gaussian points per cluster.
        n_outliers: Number of uniformly drawn points.

    Returns:
        An array of shape ``(2 * n_inliers + n_outliers, num_dimensions)``,
        i.e. 195 rows with the defaults (not 100, as the earlier comment
        claimed).
    """
    cluster = 0.3 * rng.randn(n_inliers, num_dimensions)
    inliers = np.r_[cluster + 2, cluster - 2]
    outliers = rng.uniform(low=-4, high=4, size=(n_outliers, num_dimensions))
    return np.r_[inliers, outliers]
isolation_forest.py ADDED
@@ -0,0 +1,11 @@
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ #Isolation Forest
3
+
4
+ from matplotlib.pylab import RandomState
5
+ from sklearn.ensemble import IsolationForest
6
+ import numpy as np
7
+
8
def apply_isolation_forest(rng: RandomState, all_data: np.ndarray) -> np.ndarray:
    """Fit an Isolation Forest on ``all_data`` and predict on the same samples.

    Args:
        rng: Random state handed to the estimator as ``random_state``.
        all_data: Samples to fit on and to label.

    Returns:
        The estimator's prediction for every row of ``all_data``; callers in
        this project treat -1 as "anomaly".
    """
    forest = IsolationForest(random_state=rng, max_samples=40)
    forest.fit(all_data)
    return forest.predict(all_data)
oc_svm.py ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #One-Class Support Vector Machine
2
+
3
+ from sklearn import svm
4
+ import numpy as np
5
+ from sklearn.discriminant_analysis import StandardScaler
6
+
7
def apply_oc_svm(all_data: np.ndarray) -> np.ndarray:
    """Standardise ``all_data`` and label it with a One-Class SVM.

    Args:
        all_data: Samples to fit on and to label.

    Returns:
        The estimator's prediction for every row of ``all_data``; callers in
        this project treat -1 as "anomaly".
    """
    # StandardScaler's documented home is sklearn.preprocessing. The name is
    # only reachable from sklearn.discriminant_analysis because that module
    # happens to import it, so bind the public one here.
    from sklearn.preprocessing import StandardScaler

    # Normalize
    X_scaled = StandardScaler().fit_transform(all_data)

    # Initialize One-Class SVM
    oc_svm = svm.OneClassSVM(kernel='rbf', gamma='auto', nu=0.2)
    return oc_svm.fit_predict(X_scaled)