Spaces:
Runtime error
Runtime error
none
committed on
Commit
·
045d7d4
0
Parent(s):
Working version of the streamlit animation
Browse files- README.md +1 -0
- streamlit_viz.py +254 -0
- train_classifier.py +86 -0
- viz_classifier.py +215 -0
README.md
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
The `id` column is baloney. There are lots of duplicates.
|
streamlit_viz.py
ADDED
|
@@ -0,0 +1,254 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import joblib
|
| 2 |
+
import time
|
| 3 |
+
|
| 4 |
+
import plotly.graph_objects as go
|
| 5 |
+
import streamlit as st
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import numpy as np
|
| 8 |
+
|
| 9 |
+
# Feature names, in the column order the classifier was trained on
# (train_classifier.py drops `id` before fitting); each tree node's
# `feature_idx` indexes into this list.
FEATS = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    #'state', I dropped this one when I trained the model
    'dur',
    'sbytes',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans_depth',
    'res_bdy_len',
    'Sjit',
    'Djit',
    'Stime',
    'Ltime',
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm',
    'ct_src_dport_ltm',
    'ct_dst_sport_ltm',
    'ct_dst_src_ltm',
]
|
| 58 |
+
|
| 59 |
+
# CSS named colors, indexed by feature (COLORS[FEATS.index(feat)]), so every
# split on the same feature gets the same color on the treemap.  See the
# longer note in viz_classifier.py: whites and colors that looked too close
# on the tree were removed by hand, so this is tuned to this specific model.
COLORS = [
    'aliceblue','aqua','aquamarine','azure',
    'bisque','black','blanchedalmond','blue',
    'blueviolet','brown','burlywood','cadetblue',
    'chartreuse','chocolate','coral','cornflowerblue',
    'cornsilk','crimson','cyan','darkblue','darkcyan',
    'darkgoldenrod','darkgray','darkgreen',
    'darkkhaki','darkmagenta','darkolivegreen','darkorange',
    'darkorchid','darkred','darksalmon','darkseagreen',
    'darkslateblue','darkslategray',
    'darkturquoise','darkviolet','deeppink','deepskyblue',
    'dimgray','dodgerblue',
    'forestgreen','fuchsia','gainsboro',
    'gold','goldenrod','gray','green',
    'greenyellow','honeydew','hotpink','indianred','indigo',
    'ivory','khaki','lavender','lavenderblush','lawngreen',
    'lemonchiffon','lightblue','lightcoral','lightcyan',
    'lightgoldenrodyellow','lightgray',
    'lightgreen','lightpink','lightsalmon','lightseagreen',
    'lightskyblue','lightslategray',
    'lightsteelblue','lightyellow','lime','limegreen',
    'linen','magenta','maroon','mediumaquamarine',
    'mediumblue','mediumorchid','mediumpurple',
    'mediumseagreen','mediumslateblue','mediumspringgreen',
    'mediumturquoise','mediumvioletred','midnightblue',
    'mintcream','mistyrose','moccasin','navy',
    'oldlace','olive','olivedrab','orange','orangered',
    'orchid','palegoldenrod','palegreen','paleturquoise',
    'palevioletred','papayawhip','peachpuff','peru','pink',
    'plum','powderblue','purple','red','rosybrown',
    'royalblue','saddlebrown','salmon','sandybrown',
    'seagreen','seashell','sienna','silver','skyblue',
    'slateblue','slategray','slategrey','snow','springgreen',
    'steelblue','tan','teal','thistle','tomato','turquoise',
    'violet','wheat','yellow','yellowgreen'
]
|
| 95 |
+
|
| 96 |
+
def build_parents(tree, visit_order, node_id2plot_id):
    """Find the parent of every node visited after the root.

    Returns three parallel lists, each starting with None for the root:
    the parent node ids, the parents' plot ids as strings (what plotly's
    `parents` argument needs), and 'l'/'r' flags saying whether each node
    is its parent's left or right child.
    """
    parents = [None]
    parent_plot_ids = [None]
    directions = [None]
    for node_id in visit_order[1:]:
        right_matches = tree.index[tree['right'] == node_id]
        if len(right_matches) > 0:
            parent_id = right_matches[0]
            side = 'r'
        else:
            # not anyone's right child, so it must be a left child
            parent_id = tree.index[tree['left'] == node_id][0]
            side = 'l'
        parents.append(parent_id)
        parent_plot_ids.append(str(node_id2plot_id[parent_id]))
        directions.append(side)
    return parents, parent_plot_ids, directions
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def build_labels_colors(tree, visit_order, parents, parent_plot_ids, directions):
    """Build the treemap label and color for every node.

    Each non-root node is labelled with the split rule on the edge from its
    parent: the parent's split feature and threshold, with `<=` for a left
    child and `>` for a right child.  Nodes are colored by that feature, so
    every split on the same feature shares a color.
    """
    labels = ['Histogram Gradient-Boosted Decision Tree']
    colors = ['white']
    for i, parent, parent_plot_id, direction in zip(
        visit_order,
        parents,
        parent_plot_ids,
        directions
    ):
        # skip the first one (the root): it has no parent edge to describe
        if i == 0:
            continue
        # the split rule lives on the parent node
        feat = FEATS[int(tree.loc[int(parent), 'feature_idx'])]
        thresh = tree.loc[int(parent), 'num_threshold']
        if direction == 'l':
            labels.append(f"[{parent_plot_id}.L] {feat} <= {thresh}")
        else:
            labels.append(f"[{parent_plot_id}.R] {feat} > {thresh}")

        # one color per feature index
        offset = FEATS.index(feat)
        colors.append(COLORS[offset])
    return labels, colors
|
| 139 |
+
|
| 140 |
+
|
| 141 |
+
def build_plot(tree):
    """Turn one predictor tree (a DataFrame of its nodes) into a go.Treemap.

    Plotly identifies treemap nodes by `ids`, and when `ids` is used the
    `parents` argument must also be expressed in terms of those ids, so
    nodes are renumbered by breadth-first visit order.
    https://stackoverflow.com/questions/64393535/python-plotly-treemap-ids-format-and-how-to-display-multiple-duplicated-labels-i
    """
    visit_order = breadth_first_traverse(tree)
    node_id2plot_id = {}
    for plot_id, node_id in enumerate(visit_order):
        node_id2plot_id[node_id] = plot_id
    parents, parent_plot_ids, directions = build_parents(
        tree, visit_order, node_id2plot_id)
    labels, colors = build_labels_colors(
        tree, visit_order, parents, parent_plot_ids, directions)
    # this should just be ['0', '1', '2', . . .]
    plot_ids = [str(node_id2plot_id[node_id]) for node_id in visit_order]

    treemap = go.Treemap(
        values=tree['count'].to_numpy(),
        labels=labels,
        ids=plot_ids,
        parents=parent_plot_ids,
        marker_colors=colors,
    )
    return treemap
|
| 158 |
+
|
| 159 |
+
|
| 160 |
+
def breadth_first_traverse(tree):
    """Return the node ids of `tree` in breadth-first (level) order.

    https://www.101computing.net/breadth-first-traversal-of-a-binary-tree/
    Iterative version makes more sense since I have the whole tree in a table
    instead of just nodes and pointers.

    The node table uses 0 to mean "no child" (leaves have left == right == 0),
    so a child id of 0 is skipped rather than queued — which is safe because
    node 0 is the root and can never be anyone's child.
    """
    # local import: list.pop(0) shifts the whole list (O(n) per pop);
    # deque.popleft() is O(1)
    from collections import deque

    queue = deque([0])
    visited_nodes = []
    while queue:
        cur = queue.popleft()
        visited_nodes.append(cur)

        if tree.loc[cur, 'left'] != 0:
            queue.append(tree.loc[cur, 'left'])

        if tree.loc[cur, 'right'] != 0:
            queue.append(tree.loc[cur, 'right'])

    return visited_nodes
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def main():
    """Streamlit app: load the trained HistGradientBoosting model and show
    each boosting stage's tree as a plotly treemap — a slider to step
    through stages, the raw node table, and an animated version below.
    """
    # load the data
    hgb = joblib.load('hgb_classifier.joblib')
    # one DataFrame of nodes per boosting stage (hgb._predictors is a list
    # of single-element lists — TODO confirm against sklearn internals)
    trees = [pd.DataFrame(x[0].nodes) for x in hgb._predictors]
    # make the plots
    graph_objs = [build_plot(tree) for tree in trees]
    figures = [go.Figure(graph_obj) for graph_obj in graph_objs]
    frames = [go.Frame(data=graph_obj) for graph_obj in graph_objs]
    # show them with streamlit

    # this puts them all on the screen at once
    # like each new one shows up below the previous one
    # instead of replacing the previous one
    #for fig in figures:
    #    st.plotly_chart(fig)
    #    time.sleep(1)

    # This works the way I want
    # but the plot is tiny
    # also it recalculates all of the plots
    # every time the slider value changes
    #
    # I tried to cache the plots but build_plot() takes
    # a DataFrame which is mutable and therefore unhashable I guess
    # so it won't let me cache that function
    # I could pack the dataframe bytes to smuggle them past that check
    # but whatever
    idx = st.slider(
        label='which step to show',
        min_value=0,
        max_value=len(figures)-1,
        value=0,
        step=1
    )
    st.plotly_chart(figures[idx])
    st.markdown(f'## Tree {idx}')
    st.dataframe(trees[idx])

    # Maybe just show a Plotly animated chart
    # https://plotly.com/python/animations/#using-a-slider-and-buttons
    # They don't really document the animation stuff on their website
    # but it's in here
    # https://raw.githubusercontent.com/plotly/plotly.js/master/dist/plot-schema.json
    # I guess it's only in the JS docs and hasn't made it to the Python docs yet
    # https://plotly.com/javascript/animations/
    # trying to find stuff here instead
    # https://plotly.com/python-api-reference/generated/plotly.graph_objects.layout.updatemenu.html?highlight=updatemenu

    # this one finally set the speed
    # no mention of how they figured this out but thank goodness I found it
    # https://towardsdatascience.com/basic-animation-with-matplotlib-and-plotly-5eef4ad6c5aa
    ani_fig = go.Figure(
        data=graph_objs[0],
        frames=frames,
        layout=go.Layout(
            updatemenus=[{
                'type':'buttons',
                'buttons':[{
                    'label':'Play',
                    'method': 'animate',
                    'args':[None, {
                        'frame': {'duration':5000},
                        'transition': {'duration': 2500}
                    }]
                }]
            }]
        )
    )
    st.plotly_chart(ani_fig)

if __name__=='__main__':
    main()
|
| 253 |
+
|
| 254 |
+
|
train_classifier.py
ADDED
|
@@ -0,0 +1,86 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import joblib
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
|
| 6 |
+
from sklearn.ensemble import RandomForestClassifier, HistGradientBoostingClassifier
|
| 7 |
+
from sklearn.metrics import classification_report
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def main():
    """Train a HistGradientBoostingClassifier on train_data.csv, save it to
    hgb_classifier.joblib, and print a classification report against
    test_data.csv.
    """
    # '-' is this dataset's missing-value marker
    train_df = pd.read_csv('train_data.csv', na_values='-')
    # `service` is about half-empty and the rest are completely full
    # one of the rows has `no` for `state` which isn't listed as an option in the description of the fields
    # I'm just going to delete that
    train_df = train_df.drop(columns=['id'])
    train_df = train_df.drop(index=train_df[train_df['state']=='no'].index)

    # It can predict `label` really well ~0.95 accuracy/f1/whatever other stat you care about
    # It does a lot worse trying to predict `attack_cat` b/c there are 10 classes
    # and some of them are not well-represented
    # so that might be more interesting to visualize
    # pop attack_cat out of the feature columns (kept in a variable in case
    # it's wanted later)
    cheating = train_df.pop('attack_cat')
    y_enc = LabelEncoder().fit(train_df['label'])
    train_y = y_enc.transform(train_df.pop('label'))
    # ordinal-encode every remaining column (NaNs pass through)
    x_enc = OrdinalEncoder().fit(train_df)
    train_df = x_enc.transform(train_df)

    # Random forest doesn't handle NaNs
    # I could drop the `service` column or I can use the HistGradientBoostingClassifier
    # super helpful error message from sklearn pointed me to this list
    # https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values
    # NOTE(review): if this is ever re-enabled, the variable is train_y, not y_train
    #rf = RandomForestClassifier()
    #rf.fit(train_df, y_train)

    # max_iter is the number of time it builds a gradient-boosted tree
    # so it's the number of estimators
    hgb = HistGradientBoostingClassifier(max_iter=10).fit(train_df, train_y)
    joblib.dump(hgb, 'hgb_classifier.joblib', compress=9)

    # apply the same cleanup and the encoders fit on the training data
    test_df = pd.read_csv('test_data.csv', na_values='-')
    test_df = test_df.drop(columns=['id', 'attack_cat'])
    test_y = y_enc.transform(test_df.pop('label'))
    test_df = x_enc.transform(test_df)
    test_preds = hgb.predict(test_df)
    print(classification_report(test_y, test_preds))

    # I guess they took out the RF feature importance
    # or maybe that's only in XGBoost
    # you can still kind of get to it
    # with RandomForestClassifier.feature_importances_
    # or like this
    # https://scikit-learn.org/stable/auto_examples/ensemble/plot_forest_importances.html
    # but there's really nothing for the HistGradientBoostingClassifier
    # but you can get to the actual nodes for each predictor/estimator like this
    # hgb._predictors[i][0].nodes
    # and that has information gain metric for each node which might be viz-able
    # so that might be an interesting viz
    # like plot the whole forest
    # maybe only do like 10 estimators to keep it smaller
    # or stick with 100 and figure out a good way to viz big models
    # the first two estimators are almost identical
    # so maybe like plot the first estimator
    # and then fuzz the nodes by how much the other estimators differ
    # assuming there's some things they all agree on exactly and others where they differ a little bit
    # idk I don't really know how the algorithm works
    # the 96th estimator looks pretty different (I'm assuming from boosting)
    # so maybe like an evolution animation from the first to the last
    # to see the effect of the boosting
    # like plot the points and show how the decision boundary shifts with each generation
    # alongside an animation of the actual decision tree morphing each step
    # That might look too much like an animation of the model being trained though
    # which I guess that's sort of what it is so idk

    # https://scikit-learn.org/stable/modules/ensemble.html#interpretation-with-feature-importance

    # also
    # you can see what path a data point takes through the forest
    # with RandomForestClassifier.decision_path()
    # which might be really cool
    # to see like 10 trees and the path through each tree and what each tree predicted

if __name__ == '__main__':
    main()
|
| 85 |
+
|
| 86 |
+
|
viz_classifier.py
ADDED
|
@@ -0,0 +1,215 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import joblib
|
| 2 |
+
|
| 3 |
+
import pandas as pd
|
| 4 |
+
|
| 5 |
+
import plotly.graph_objects as go
|
| 6 |
+
import plotly.express as px
|
| 7 |
+
|
| 8 |
+
# the model trained and saved by train_classifier.py
hgb = joblib.load('hgb_classifier.joblib')
# Feature names in the column order the model was trained on; each tree
# node's `feature_idx` indexes into this list.
FEATS = [
    'srcip',
    'sport',
    'dstip',
    'dsport',
    'proto',
    #'state', I dropped this one when I trained the model
    'dur',
    'sbytes',
    'dbytes',
    'sttl',
    'dttl',
    'sloss',
    'dloss',
    'service',
    'Sload',
    'Dload',
    'Spkts',
    'Dpkts',
    'swin',
    'dwin',
    'stcpb',
    'dtcpb',
    'smeansz',
    'dmeansz',
    'trans_depth',
    'res_bdy_len',
    'Sjit',
    'Djit',
    'Stime',
    'Ltime',
    'Sintpkt',
    'Dintpkt',
    'tcprtt',
    'synack',
    'ackdat',
    'is_sm_ips_ports',
    'ct_state_ttl',
    'ct_flw_http_mthd',
    'is_ftp_login',
    'ct_ftp_cmd',
    'ct_srv_src',
    'ct_srv_dst',
    'ct_dst_ltm',
    'ct_src_ltm',
    'ct_src_dport_ltm',
    'ct_dst_sport_ltm',
    'ct_dst_src_ltm',
]
|
| 58 |
+
|
| 59 |
+
# plotly only has the CSS named colors
# I don't think I can use xkcd colors
# I copied a bunch of CSS colors from somewhere online
# and then deleted whites and things that showed up too close on the tree
# this is not really a general solution, it just works for this specific tree
# I'll have to come up with a better colormap at some point
# (indexed by feature: COLORS[FEATS.index(feat)])
COLORS = [
    'aliceblue','aqua','aquamarine','azure',
    'bisque','black','blanchedalmond','blue',
    'blueviolet','brown','burlywood','cadetblue',
    'chartreuse','chocolate','coral','cornflowerblue',
    'cornsilk','crimson','cyan','darkblue','darkcyan',
    'darkgoldenrod','darkgray','darkgreen',
    'darkkhaki','darkmagenta','darkolivegreen','darkorange',
    'darkorchid','darkred','darksalmon','darkseagreen',
    'darkslateblue','darkslategray',
    'darkturquoise','darkviolet','deeppink','deepskyblue',
    'dimgray','dodgerblue',
    'forestgreen','fuchsia','gainsboro',
    'gold','goldenrod','gray','green',
    'greenyellow','honeydew','hotpink','indianred','indigo',
    'ivory','khaki','lavender','lavenderblush','lawngreen',
    'lemonchiffon','lightblue','lightcoral','lightcyan',
    'lightgoldenrodyellow','lightgray',
    'lightgreen','lightpink','lightsalmon','lightseagreen',
    'lightskyblue','lightslategray',
    'lightsteelblue','lightyellow','lime','limegreen',
    'linen','magenta','maroon','mediumaquamarine',
    'mediumblue','mediumorchid','mediumpurple',
    'mediumseagreen','mediumslateblue','mediumspringgreen',
    'mediumturquoise','mediumvioletred','midnightblue',
    'mintcream','mistyrose','moccasin','navy',
    'oldlace','olive','olivedrab','orange','orangered',
    'orchid','palegoldenrod','palegreen','paleturquoise',
    'palevioletred','papayawhip','peachpuff','peru','pink',
    'plum','powderblue','purple','red','rosybrown',
    'royalblue','saddlebrown','salmon','sandybrown',
    'seagreen','seashell','sienna','silver','skyblue',
    'slateblue','slategray','slategrey','snow','springgreen',
    'steelblue','tan','teal','thistle','tomato','turquoise',
    'violet','wheat','yellow','yellowgreen'
]
|
| 101 |
+
|
| 102 |
+
# one node table per boosting stage
trees = [x[0].nodes for x in hgb._predictors]

# the final tree definitely has a similar structure but is noticeably different
# that's really cool
# I think this will make a cool animation
# if I can figure it out
tree = pd.DataFrame(trees[0])
#tree = pd.DataFrame(trees[9])



# parents is going to be tricky
# I need to get the index of whichever node has the current node listed in either left or right

parents = [None]
# keep track of whether each node is a left or right child of the parent in the list
directions = [None]
# it uses 0 to say "no left/right child"
# so I have to skip searching for node 0
# which is fine b/c node 0 is the root
for i in tree.index[1:]:
    # it seems to make a very even tree
    # so just guess it's in the right side
    # and that will be right half the time
    parent = tree[tree['right']==i].index
    if parent.empty:
        parents.append(str(tree[tree['left']==i].index[0]))
        directions.append('l')
    else:
        parents.append(str(parent[0]))
        directions.append('r')


# generate the labels
# and the colors
labels = ['Histogram Gradient-Boosted Decision Tree']
colors = ['white']
for i, node, parent, direction in zip(
    tree.index.to_numpy(),
    tree.iterrows(),
    parents,
    directions
):
    # skip the first one (the root)
    if i == 0:
        continue
    # iterrows() yields (index, row) pairs; keep just the row
    node = node[1]
    # the split rule lives on the parent node
    feat = FEATS[int(tree.loc[int(parent), 'feature_idx'])]
    thresh = tree.loc[int(parent), 'num_threshold']
    if direction == 'l':
        labels.append(f"[{i}] {feat} <= {thresh}")
    else:
        labels.append(f"[{i}] {feat} > {thresh}")

    # colors
    offset = FEATS.index(feat)
    colors.append(COLORS[offset])


# actual plot
f = go.Figure(
    go.Treemap(
        values=tree['count'].to_numpy(),
        labels=labels,
        ids=tree.index.to_numpy(),
        parents=parents,
        marker_colors=colors,
    )
)

#f.update_layout(
#    treemapcolorway = ['pink']
#)

# NOTE(review): leftover debug stop — drops into pdb so `f`, `tree`, etc. can
# be inspected interactively; remove before running this script unattended
breakpoint()
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
# converting the ndarray with column names to a pandas df
|
| 180 |
+
# 3284 bytes as an ndarray
|
| 181 |
+
# 3300 bytes as a dataframe
|
| 182 |
+
# so they're the same size
|
| 183 |
+
# do I need to convert it to pandas? idk
|
| 184 |
+
# just curious
|
| 185 |
+
|
| 186 |
+
# https://linuxtut.com/en/ffb2e319db5545965933/
|
| 187 |
+
|
| 188 |
+
# https://github.com/scikit-learn/scikit-learn/blob/main/sklearn/ensemble/_hist_gradient_boosting/_predictor.pyx
|
| 189 |
+
# figuring out how the thing works
|
| 190 |
+
|
| 191 |
+
# `value` is the predicted class / value / whatever
|
| 192 |
+
# so if it's a leaf node, it returns that value as the prediction
|
| 193 |
+
# there are negative values in some of the leaves
|
| 194 |
+
# maybe the classes are +/-1 instead of 0/1?
|
| 195 |
+
|
| 196 |
+
# if the data value is <= `num_threshold` then it goes in the left node
|
| 197 |
+
# if it's > `num_threshold` then it goes in the right node
|
| 198 |
+
|
| 199 |
+
# okay and then all the leaves have feature_idx=0, num_threshold=0, left=0, right=0
|
| 200 |
+
# that makes sense
|
| 201 |
+
# still kind of annoying that they use 0 instead of np.nan but oh well
|
| 202 |
+
|
| 203 |
+
# also super super hard to figure out what the labels on the tree map should be
|
| 204 |
+
# like it has to check the parent's feature_idx and num_threshold
|
| 205 |
+
# which I guess isn't too bad once we have the list of parents already built
|
| 206 |
+
# except that I don't know whether a node is left or right from its parent
|
| 207 |
+
# hmmmm
|
| 208 |
+
|
| 209 |
+
|
| 210 |
+
|
| 211 |
+
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
|
| 215 |
+
|