minancy committed on
Commit
1d4f790
·
1 Parent(s): 617e380

update analyze

Browse files
Files changed (1) hide show
  1. sections/analyze.py +68 -240
sections/analyze.py CHANGED
@@ -1,3 +1,4 @@
 
1
  import polars as pl
2
  import plotly.express as px
3
  import streamlit as st
@@ -8,261 +9,88 @@ if "parsed_df" not in st.session_state:
8
  # Page title
9
  st.title("Data Analysis")
10
 
11
- # Loading data
12
  if st.session_state.parsed_df is None:
13
  st.info("Please upload a log file on the 'Upload' page.")
14
  st.stop()
15
 
16
  data = st.session_state.parsed_df
17
 
18
- # Sidebar for controls
19
- st.sidebar.header("Visualization Options")
20
-
21
- # Chart type options
22
- chart_options = ["Pie Chart", "Sunburst Chart", "Histogram"]
23
-
24
- chart_type = st.sidebar.selectbox("Choose chart type", chart_options)
25
-
26
- # Get categorical columns
27
- categorical_columns = [
28
- name
29
- for name, dtype in data.schema.items()
30
- if dtype == pl.Utf8 or dtype == pl.Categorical
31
- ]
32
- # Get numerical columns
33
- numeric_dtypes = [
34
- pl.Int8,
35
- pl.Int16,
36
- pl.Int32,
37
- pl.Int64,
38
- pl.UInt8,
39
- pl.UInt16,
40
- pl.UInt32,
41
- pl.UInt64,
42
- pl.Float32,
43
- pl.Float64,
44
- ]
45
- numerical_columns = [
46
- name for name, dtype in data.schema.items() if dtype in numeric_dtypes
47
- ]
48
-
49
- # Data filtering tools in main page
50
- st.header("Filter Data")
51
-
52
- filtered_data = data.clone()
53
- original_count = data.shape[0]
54
-
55
- col1, col2 = st.columns(2)
56
-
57
- with col1:
58
- # Look for accept/reject status columns
59
- status_cols = [
60
- col
61
- for col in categorical_columns
62
- if any(term in col.lower() for term in ["status", "action", "result"])
63
- ]
64
 
65
- if status_cols:
66
- status_col = st.selectbox("Status field:", status_cols)
67
- status_values = filtered_data[status_col].unique().to_list()
 
 
 
 
68
 
69
- # Identify accepted/rejected values
70
- accept_values = [
71
- val
72
- for val in status_values
73
- if any(
74
- term in str(val).lower()
75
- for term in ["accept", "allow", "permit", "pass"]
76
- )
77
- ]
78
- reject_values = [
79
- val
80
- for val in status_values
81
- if any(
82
- term in str(val).lower() for term in ["reject", "deny", "drop", "block"]
83
  )
84
- ]
85
 
86
- if accept_values or reject_values:
87
- flow_status = st.radio(
88
- "Flow status:", ["All", "Accepted", "Rejected"], horizontal=True
89
- )
90
 
91
- if flow_status == "Accepted" and accept_values:
 
92
  filtered_data = filtered_data.filter(
93
- pl.col(status_col).is_in(accept_values)
 
94
  )
95
- elif flow_status == "Rejected" and reject_values:
96
- filtered_data = filtered_data.filter(
97
- pl.col(status_col).is_in(reject_values)
98
- )
99
-
100
- with col2:
101
- # Port range filter according to RFC 6056
102
- port_cols = [col for col in numerical_columns if "port" in col.lower()]
103
-
104
- if port_cols:
105
- port_col = st.selectbox("Port field:", port_cols)
106
-
107
- # RFC 6056 port ranges
108
- rfc_ranges = {
109
- "Well-known ports (0-1023)": (0, 1023),
110
- "Windows ephemeral (1024-5000)": (1024, 5000),
111
- "Linux/BSD ephemeral (1024-65535)": (1024, 65535),
112
- "IANA ephemeral (49152-65535)": (49152, 65535),
113
- }
114
-
115
- selected_ranges = st.multiselect(
116
- "RFC 6056 port ranges:", options=list(rfc_ranges.keys())
117
- )
118
-
119
- if selected_ranges:
120
- range_filter = None
121
- for range_name in selected_ranges:
122
- min_port, max_port = rfc_ranges[range_name]
123
- current_filter = (pl.col(port_col) >= min_port) & (
124
- pl.col(port_col) <= max_port
125
- )
126
-
127
- if range_filter is None:
128
- range_filter = current_filter
129
- else:
130
- range_filter = range_filter | current_filter
131
-
132
- filtered_data = filtered_data.filter(range_filter)
133
-
134
- if filtered_data.shape[0] != original_count:
135
- st.write(f"Showing {filtered_data.shape[0]} of {original_count} records")
136
- data = filtered_data
137
-
138
- st.write("---")
139
 
 
 
 
140
 
141
- # Main area for visualization
142
- if chart_type == "Pie Chart":
143
- st.header("Pie Chart")
144
-
145
- # Select variable to visualize
146
- selected_column = st.sidebar.selectbox(
147
- "Select a categorical variable", categorical_columns
148
- )
149
-
150
- # Create and display pie chart
151
- fig = px.pie(
152
- data,
153
- names=selected_column,
154
- title=f"Distribution of '{selected_column}'",
155
- )
156
- st.plotly_chart(fig)
157
-
158
- # Display value table
159
- st.write("Value distribution:")
160
- st.write(data[selected_column].value_counts())
161
-
162
- elif chart_type == "Sunburst Chart":
163
- st.header("Sunburst Chart")
164
-
165
- selected_columns = st.sidebar.multiselect(
166
- "Select one or more categorical variables:",
167
- categorical_columns,
168
- default=categorical_columns[:1],
169
- )
170
-
171
- if not selected_columns:
172
- st.warning("Please select at least one variable.")
173
- st.stop()
174
-
175
- fig = px.sunburst(
176
- data,
177
- path=selected_columns,
178
- title="Sunburst Chart",
179
- )
180
- fig.update_traces(textinfo="label+percent parent")
181
- st.plotly_chart(fig)
182
-
183
- st.write("Value distribution:")
184
- group_counts = data.group_by(selected_columns).agg(pl.count().alias("Count"))
185
- st.write(group_counts)
186
-
187
- elif chart_type == "Histogram":
188
- st.header("Histogram")
189
-
190
- # Add option to choose between numeric values or counts
191
- hist_mode = st.sidebar.radio("Histogram type", ["Numeric Values", "Count Values"])
192
-
193
- if hist_mode == "Numeric Values" and numerical_columns:
194
- selected_column = st.sidebar.selectbox(
195
- "Select a numerical variable", numerical_columns
196
- )
197
- fig = px.histogram(data, x=selected_column)
198
- st.plotly_chart(fig)
199
- elif hist_mode == "Count Values" and categorical_columns:
200
- selected_column = st.sidebar.selectbox(
201
- "Select a categorical variable", categorical_columns
202
- )
203
- # Get counts and create histogram
204
- st.write(type(data.select(pl.col(selected_column))))
205
- counts = data.select(pl.col(selected_column)).value_counts()
206
-
207
- counts = counts.rename({selected_column: "value"})
208
- fig = px.bar(
209
- counts,
210
- x="value",
211
- y="count",
212
- labels={"value": selected_column, "count": "Count"},
213
- title=f"Count of {selected_column} values",
214
- )
215
- st.plotly_chart(fig)
216
  else:
217
- st.write("No suitable columns available for the selected histogram type.")
218
 
 
 
 
219
 
220
- # Option to display raw data
221
- if st.sidebar.checkbox("Show raw data"):
222
- st.subheader("Data")
223
-
224
- if chart_type == "Pie Chart":
225
- # For categorical charts, allow filtering by category
226
- filter_option = st.selectbox(
227
- f"Filter by {selected_column}:",
228
- ["Show all data"] + sorted(data[selected_column].unique().tolist()),
229
- )
230
-
231
- if filter_option != "Show all data":
232
- filtered_data = data[data[selected_column] == filter_option]
233
- st.write(filtered_data)
234
- else:
235
- st.write(data)
236
-
237
- elif chart_type == "Histogram":
238
- if hist_mode == "Numeric Values" and numerical_columns:
239
- # For histogram, allow filtering by value range
240
- min_val = float(data[selected_column].min())
241
- max_val = float(data[selected_column].max())
242
-
243
- selected_range = st.slider(
244
- f"Filter by {selected_column} range:",
245
- min_val,
246
- max_val,
247
- (min_val, max_val),
248
- )
249
-
250
- filtered_data = data[
251
- (data[selected_column] >= selected_range[0])
252
- & (data[selected_column] <= selected_range[1])
253
- ]
254
- st.write(filtered_data)
255
- else:
256
- # For categorical histogram
257
- filter_option = st.selectbox(
258
- f"Filter by {selected_column}:",
259
- ["Show all data"] + sorted(data[selected_column].unique().tolist()),
260
- )
261
-
262
- if filter_option != "Show all data":
263
- filtered_data = data[data[selected_column] == filter_option]
264
- st.write(filtered_data)
265
- else:
266
- st.write(data)
267
- else:
268
- st.write(data)
 
1
+ import pandas as pd
2
  import polars as pl
3
  import plotly.express as px
4
  import streamlit as st
 
9
  # Page title
10
  st.title("Data Analysis")
11
 
12
+ # Vérifier que les données sont chargées
13
  if st.session_state.parsed_df is None:
14
  st.info("Please upload a log file on the 'Upload' page.")
15
  st.stop()
16
 
17
  data = st.session_state.parsed_df
18
 
19
+ # Créer les onglets principaux
20
+ tab1, tab2 = st.tabs(["Analysis", "Sankey"])
21
+
22
+ # Onglet Analysis
23
+ with tab1:
24
+ st.subheader("Analysis")
25
+
26
+ # Vérifier que la colonne timestamp existe et est bien de type datetime
27
+ if "timestamp" in data.columns and data["timestamp"].dtype == pl.Datetime:
28
+ # Obtenir les valeurs min et max des dates
29
+ min_date = data["timestamp"].min().date()
30
+ max_date = data["timestamp"].max().date()
31
+
32
+ # Disposition des filtres en colonnes
33
+ col1, col2, col3 = st.columns(3)
34
+
35
+ # ---- FILTRE DATE ----
36
+ with col1:
37
+ st.markdown("### 📅 Date")
38
+ start_date = st.date_input("Date début", min_date)
39
+ end_date = st.date_input("Date fin", max_date)
40
+
41
+ # ---- FILTRE STATUS ----
42
+ with col2:
43
+ st.markdown("### 🔄 Status")
44
+ if "status" in data.columns:
45
+ unique_statuses = sorted(data["status"].unique().cast(pl.Utf8).to_list()) # S'assurer du bon format
46
+ selected_status = st.selectbox("Sélectionnez un status", ["Tous"] + unique_statuses)
47
+ else:
48
+ selected_status = "Tous"
49
+ st.warning("Colonne 'status' non trouvée.")
50
+
51
+ # ---- FILTRE PORTDEST ----
52
+ with col3:
53
+ st.markdown("### 🔢 Port")
54
+ if "portdest" in data.columns:
55
+ min_port, max_port = int(data["portdest"].min()), int(data["portdest"].max())
56
+ selected_port = st.slider("Sélectionnez un port destination", min_port, max_port, (min_port, max_port))
57
+ else:
58
+ min_port, max_port = 0, 600000 # Valeurs par défaut si la colonne est absente
59
+ selected_port = (min_port, max_port)
60
+ st.warning("Colonne 'portdest' non trouvée, valeurs par défaut appliquées.")
 
 
 
 
61
 
62
+ # Vérification des dates sélectionnées
63
+ if start_date > end_date:
64
+ st.error("La date de début ne peut pas être postérieure à la date de fin.")
65
+ else:
66
+ # Conversion des dates en datetime
67
+ start_datetime = pl.datetime(start_date.year, start_date.month, start_date.day)
68
+ end_datetime = pl.datetime(end_date.year, end_date.month, end_date.day, 23, 59, 59)
69
 
70
+ # ---- APPLICATION DES FILTRES ----
71
+ filtered_data = data.filter(
72
+ (pl.col("timestamp") >= start_datetime) & (pl.col("timestamp") <= end_datetime)
 
 
 
 
 
 
 
 
 
 
 
73
  )
 
74
 
75
+ # Correction du filtrage par status (forcer conversion Utf8)
76
+ if "status" in data.columns and selected_status != "Tous":
77
+ filtered_data = filtered_data.filter(pl.col("status").cast(pl.Utf8) == selected_status)
 
78
 
79
+ # Filtrer par portdest en prenant en compte min/max
80
+ if "portdest" in data.columns:
81
  filtered_data = filtered_data.filter(
82
+ (pl.col("portdest").cast(pl.Int64) >= selected_port[0]) &
83
+ (pl.col("portdest").cast(pl.Int64) <= selected_port[1])
84
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85
 
86
+ # Affichage des données filtrées
87
+ st.write("### 🔍 Data filtred :")
88
+ st.dataframe(filtered_data)
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  else:
91
+ st.warning("La colonne 'timestamp' n'existe pas ou n'est pas au format datetime.")
92
 
93
+ # Onglet Sankey
94
+ with tab2:
95
+ st.subheader("Sankey Diagram")
96