Elmira Manavi committed on
Commit 6a07b6c · 2 Parent(s): cbe1cd5 131b6cd

Merge branch 'SCRUM-59' into 'main'

Files changed (2)
  1. requirements.txt +0 -0
  2. src/pages/Test_Evaluation.py +245 -24
requirements.txt CHANGED
Binary files a/requirements.txt and b/requirements.txt differ
 
src/pages/Test_Evaluation.py CHANGED
@@ -1,5 +1,7 @@
 import matplotlib.pyplot as plt
 import pandas as pd
+import plotly.express as px
+import plotly.graph_objects as go
 import seaborn as sns
 import streamlit as st
 from bson import ObjectId
@@ -56,7 +58,6 @@ def create_data_metrics_df(overall_metrics: dict) -> pd.DataFrame:
         cleaned_metrics = metrics.copy()
         cleaned_metrics = {k: v for k, v in cleaned_metrics.items() if isinstance(v, float)}
         rows[field] = cleaned_metrics
-        print(rows[field])

     df = pd.DataFrame(rows).T
     return df
@@ -109,21 +110,64 @@ def create_fn_df(record_results: dict):

 def create_error_df(overall_metrics: dict, batchsize: int):
     rows = []
-    sum = 0
     for k, v in overall_metrics.get("error", {}).items():
         rows.append({
             'Error': k.upper(),
             'Anzahl': v,
             'Prozent': v / batchsize * 100
         })
-        sum += v
-    rows.append({
-        'Error': "Gesamt",
-        'Anzahl': sum,
-        'Prozent': sum / batchsize * 100
-    })
-    sum += v
-    df = pd.DataFrame(rows).style.format({'Prozent': '{:.1f}%'})
+    df = pd.DataFrame(rows)
+    return df
+
+
+def create_sunburst_chart(overall_metrics: dict, batchsize: int):
+    page_type_metrics = overall_metrics.get("page_type", {})
+    tp = page_type_metrics.get("tp", 0)
+    fn = page_type_metrics.get("fn", 0)
+    fp = page_type_metrics.get("fp", 0)
+    tn = page_type_metrics.get("tn", 0)
+    error = batchsize - tp - fn - fp - tn
+
+    correct = tp + tn
+    incorrect = fp + fn
+
+    error_df = create_error_df(overall_metrics, batchsize)
+
+    labels = ["Gesamt", "Korrekt", "Falsch", "Error", "True Positive", "True Negative", "False Positive",
+              "False Negative"]
+    parents = ["", "Gesamt", "Gesamt", "Gesamt", "Korrekt", "Korrekt", "Falsch", "Falsch"]
+    values = [batchsize, correct, incorrect, error, tp, tn, fp, fn]
+
+    for i, row in error_df.iterrows():
+        labels.append(row['Error'])
+        parents.append("Error")
+        values.append(row['Anzahl'])
+
+    colors = ["#FFFFFF", "#7FD1B9", "#FFB284", "#FF8585", "#5BC0BE", "#379683", "#F2881A", "#F7B32B"]
+    colors.extend(["#FF8585"] * len(error_df))
+
+    fig = go.Figure(go.Sunburst(
+        labels=labels,
+        parents=parents,
+        values=values,
+        branchvalues="total",
+        marker=dict(colors=colors),
+        hovertemplate='<b>%{label}</b><br>Anzahl: %{value}<br>Prozent: %{percentParent:.1%}<extra></extra>'
+    ))
+
+    fig.update_layout(margin=dict(t=0, b=0, l=0, r=0))
+    return fig
+
+
+def create_page_type_chart(overall_metrics: dict):
+    page_type_metrics = overall_metrics.get("page_type", {})
+    df = pd.DataFrame([{
+        "precision": page_type_metrics.get("precision", 0),
+        "recall": page_type_metrics.get("recall", 0),
+        "f1": page_type_metrics.get("f1", 0),
+        "accuracy": page_type_metrics.get("accuracy", 0),
+        "effective_accuracy": page_type_metrics.get("effective_accuracy", 0)
+    }]).T
     return df

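A note on branchvalues="total" in the new create_sunburst_chart: in this mode each node's value must cover the sum of its children, which is why error is derived as the remainder batchsize - tp - fn - fp - tn rather than summed independently. A minimal self-contained sketch with made-up counts (not from any real test run) that satisfies the invariant:

    import plotly.graph_objects as go

    # Made-up counts: 10 = 5 (Korrekt) + 3 (Falsch) + 2 (Error); every parent
    # equals the sum of its children, as branchvalues="total" requires.
    labels = ["Gesamt", "Korrekt", "Falsch", "Error", "TP", "TN", "FP", "FN"]
    parents = ["", "Gesamt", "Gesamt", "Gesamt", "Korrekt", "Korrekt", "Falsch", "Falsch"]
    values = [10, 5, 3, 2, 3, 2, 2, 1]

    fig = go.Figure(go.Sunburst(labels=labels, parents=parents, values=values,
                                branchvalues="total"))
    fig.show()  # if a child exceeded its parent, plotly would render a blank branch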
@@ -190,6 +234,51 @@ def create_detail_table(test: dict):
     return pd.DataFrame(rows)


+def create_event_score_chart(test: dict):
+    event_scores = [r["record_metrics"].get("event_score") for r in test["record_results"].values() if
+                    r["record_metrics"].get("event_score") is not None]
+    mean_score = test.get("overall_metrics", {}).get("event_score", 0)
+    fig = go.Figure()
+    fig.add_trace(go.Histogram(
+        x=event_scores,
+        name='control',
+        xbins=dict(
+            start=0.0,
+            end=1.1,
+            size=0.1
+        ),
+        marker=dict(
+            color="#43cd80",
+            line=dict(color='white', width=1)  # separation between bars
+
+        ),
+
+    ))
+    fig.update_layout(
+        xaxis=dict(tickvals=[i / 10 for i in range(11)]),
+        yaxis_title="Anzahl Events",
+        xaxis_title="Event Score",
+        title="Event Score",
+        annotations=[
+            dict(
+                x=0.02,
+                y=0.94,
+                xref="paper",
+                yref="paper",
+                text=f"Ø Event Score: {mean_score:.2f}",
+                showarrow=False,
+                align="left",
+                font=dict(size=13),
+                bgcolor="rgba(255,255,255,0.8)",
+                bordercolor="#ccc",
+                borderwidth=1
+            )
+        ]
+    )
+
+    return fig
+
+
 @st.dialog("Original Seite", width="medium")
 def show_website(url, html):
     st.info(f"Link zur Original Website: {url}")
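On the histogram bins in create_event_score_chart: xbins=dict(start=0.0, end=1.1, size=0.1) creates eleven buckets rather than ten, so a perfect score of 1.0 gets its own [1.0, 1.1) bin instead of being folded into [0.9, 1.0). A quick sketch with toy scores to verify the bucketing:

    import plotly.graph_objects as go

    scores = [0.05, 0.95, 1.0, 1.0]  # toy values, not real event scores
    fig = go.Figure(go.Histogram(x=scores, xbins=dict(start=0.0, end=1.1, size=0.1)))
    # expected counts: [0.0, 0.1) -> 1, [0.9, 1.0) -> 1, [1.0, 1.1) -> 2
    fig.show()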
@@ -203,7 +292,6 @@ tests = list(db.test_evaluation.find({}, {"_id": 1, "status": 1, "created_at": 1
 tests_sorted = sorted(tests, key=lambda t: t["created_at"], reverse=True)

 options = {str(t["_id"]): f"{t['status']} - {t['created_at'].strftime('%Y-%m-%d %H:%M:%S')}" for t in tests_sorted}
-
 selected_id = st.selectbox("Wähle einen Test aus", options=list(options.keys()), format_func=lambda x: options[x])

 if selected_id:
@@ -211,40 +299,106 @@ if selected_id:
     record_results = test.get("record_results", {})
     batchsize = len(record_results)

-    st.write(
-        f"**Test ID:** {selected_id} | **Status:** {test.get("status")} | **Batchsize:** {batchsize}")
+    st.success(
+        f"**Test ID:** {selected_id} | "
+        f"**Status:** {test.get('status')} | "
+        f"**Batchsize:** {batchsize}"
+    )

     overall_metrics = test.get("overall_metrics", {})
     if overall_metrics:
-        df_data_metrics = create_data_metrics_df(overall_metrics)
+        df_data_metrics = create_data_metrics_df(overall_metrics.get("event_metrics", {}))
         cm_fig = create_confusion_matrix(overall_metrics)
         df_fn = create_fn_df(record_results)
         df_error = create_error_df(overall_metrics, batchsize)
+        fig_event_score = create_event_score_chart(test)
+        overall_event_score = overall_metrics.get("event_score", {})
+        page_type_suburst_chart = create_sunburst_chart(overall_metrics, batchsize)
+        page_type_metrics = create_page_type_chart(overall_metrics)

-        st.write("# Overall Metrics")
+        st.write("## Page Type Metriken")

-        st.write("### Data Metrics")
-        st.bar_chart(df_data_metrics, width=400, stack=False, sort=False)
+        st.write(
+            "Klassifikation einer Website als Event- oder Nicht-Event-Seite während der Pipeline.")
+        col1, col2 = st.columns([2, 1.5])
+        with col1:
+            st.plotly_chart(page_type_suburst_chart)
+
+        with col2:
+            st.write("")
+            st.write("")
+            st.markdown("""
+                <span style="font-size:12px">
+                <span style="color:#5BC0BE">■</span> <b>True Positive (TP):</b> Event-Seite korrekt erkannt<br>
+                <span style="color:#379683">■</span> <b>True Negative (TN):</b> Nicht-Event korrekt erkannt<br>
+                <span style="color:#F2881A">■</span> <b>False Positive (FP):</b> Nicht-Event fälschlich als Event erkannt<br>
+                <span style="color:#F7B32B">■</span> <b>False Negative (FN):</b> Event-Seite nicht erkannt<br>
+                <span style="color:#FF8585">■</span> <b>Error:</b> Fehler während Verarbeitung<br>
+                &nbsp;&nbsp;<span style="color:#FF8585">●</span> RATE_LIMIT_ERROR: LLM API-Limit erreicht<br>
+                &nbsp;&nbsp;<span style="color:#FF8585">●</span> INVALID_EVENT: Event extrahiert, relevante Daten fehlten<br>
+                &nbsp;&nbsp;<span style="color:#FF8585">●</span> INVALID_FORMAT: Event extrahiert, aber falsches JSON<br>
+                &nbsp;&nbsp;<span style="color:#FF8585">●</span> ERROR: Andere Fehlerarten
+                </span>
+            """, unsafe_allow_html=True)

         col1, col2 = st.columns(2)
         with col1:
-            st.write("### Confusion Matrix für Page Classification (page_type)")
+            st.write("#### Confusion Matrix")
             st.pyplot(cm_fig, width=450)

         with col2:
-            st.write("### Falsch abgelehnte Seiten (false negatives fn)")
+            st.write("#### Gründe für False Negatives")
             st.dataframe(df_fn)

-        st.write("### Fehler in der Pipeline (error)")
-        st.dataframe(df_error)
+        col1, col2 = st.columns([2, 1])
+        with col1:
+            st.write("#### Scores")
+            st.bar_chart(page_type_metrics, height=450)
+        with col2:
+            st.space(size=100)
+
+            st.markdown("""
+                <span style="font-size:12px">
+                <span style="color:#5BC0BE">■</span> <b>Accuracy:</b> Anteil korrekt klassifizierter Seiten an allen klassifizierten Seiten (ohne Errors)<br>
+                <span style="color:#379683">■</span> <b>Effective Accuracy:</b> Anteil korrekt klassifizierter Seiten bezogen auf alle Testergebnisse (mit Errors)<br>
+                <span style="color:#F2881A">■</span> <b>F1:</b> Harmonic Mean aus Precision und Recall<br>
+                <span style="color:#F7B32B">■</span> <b>Precision:</b> Anteil der als Event erkannten Seiten, die tatsächlich Events sind<br>
+                <span style="color:#FF8585">■</span> <b>Recall:</b> Anteil der tatsächlichen Event-Seiten, die korrekt erkannt wurden<br>
+                </span>
+            """, unsafe_allow_html=True)
+
+
+        st.write("---")
+
+        st.write("## Event-Metriken")
+        st.write("Qualität und Korrektheit der extrahierten Event-Informationen.")
+
+        col1, col2 = st.columns([1, 2])
+        with col1:
+            st.space(size=100)
+            st.markdown("""
+                <span style="font-size:12px">
+                <span style="color:#43cd80">■</span> <b>Event Score:</b> Gesamtbewertung der Event-Qualität, berechnet aus F1-Score und Match Scores der einzelnen Felder<br>
+                <span style="color:#ff2b2b">■</span> <b>Precision:</b> Anteil korrekt extrahierter Informationen<br>
+                <span style="color:#ffabab">■</span> <b>Recall:</b> Anteil erkannter Informationen von allen erwarteten<br>
+                <span style="color:#0068c9">■</span> <b>F1-Score:</b> Harmonisches Mittel aus Precision und Recall<br>
+                <span style="color:#83c9ff">■</span> <b>Match Score:</b> Textähnlichkeit zweier Strings (Fuzzy Matching)
+                </span>
+            """, unsafe_allow_html=True, width=300)
+
+        with col2:
+            st.plotly_chart(fig_event_score)
+
+        st.write("**Ergebnisse der einzelnen Event-Informationen**")
+        st.bar_chart(df_data_metrics, stack=False, sort=False)


     else:
         st.info("Der Test läuft noch. Es konnte noch keine Metric erstellt werden")

-    st.write(f"# Testergebnisse im Detail")
-    df = create_detail_table(test)
-    st.dataframe(df, height=600)
+    with st.expander("Testergebnisse im Detail"):
+        df = create_detail_table(test)
+        st.dataframe(df, height=600)

     record_id = st.text_input(label="Gebe eine Record ID ein um die Original Website anzusehen.", value="")
     if record_id:
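The legend distinguishes Accuracy from Effective Accuracy only by the denominator: errored pages are excluded from the former and counted as wrong in the latter. The exact formula lives in the evaluation pipeline, which this diff does not touch, but under that reading the two metrics relate like this (toy counts, not from a real run):

    # Assumed definitions, inferred from the legend text above; the
    # authoritative computation is in the pipeline code, not in this page.
    tp, tn, fp, fn, error = 40, 30, 10, 10, 10
    batchsize = tp + tn + fp + fn + error           # 100 records in the batch

    accuracy = (tp + tn) / (tp + tn + fp + fn)      # 70 / 90 ≈ 0.78, errors excluded
    effective_accuracy = (tp + tn) / batchsize      # 70 / 100 = 0.70, errors count as wrong
    print(accuracy, effective_accuracy)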
@@ -254,3 +408,70 @@ if selected_id:
         if html:
             html = html.decode("utf-8")
             show_website(url, html)
+
+with st.expander("Ergebnisse aller Tests im Verlauf"):
+    pipeline = [
+        {"$match": {"status": "completed"}},
+        {"$project": {
+            "_id": 1,
+            "created_at": 1,
+            "overall_metrics": 1,
+            "pipeline_version": 1,
+            "batchsize": {
+                "$size": {
+                    "$objectToArray": {
+                        "$ifNull": ["$record_results", {}]
+                    }
+                }
+            }
+        }}
+    ]
+
+    tests = list(db.test_evaluation.aggregate(pipeline))
+    if not tests:
+        st.info("Es sind noch keine Testergebnisse vorhanden.")
+    else:
+        event_scores_time_series = pd.DataFrame([
+            {
+                "timestamp": pd.to_datetime(t.get("created_at")),
+                "pipeline_version": t.get("pipeline_version"),
+                "event_score": t.get("overall_metrics", {}).get("event_score"),
+                "errors": sum(t.get("overall_metrics", {}).get("error", {"error": 90}).values()) / t.get(
+                    "batchsize") * 100,
+                "page_type_effective_accuracy": t.get("overall_metrics", {}).get("page_type", {}).get(
+                    "effective_accuracy", 0) * 100,
+                "page_type_precision": t.get("overall_metrics", {}).get("page_type", {}).get("precision", 0) * 100,
+                "page_type_recall": t.get("overall_metrics", {}).get("page_type", {}).get("recall", 0) * 100,
+                "page_type_f1": t.get("overall_metrics", {}).get("page_type", {}).get("f1", 0) * 100,
+                "page_type_accuracy": t.get("overall_metrics", {}).get("page_type", {}).get("accuracy", 0) * 100,
+
+            }
+            for t in tests
+        ])
+
+        event_scores_time_series = (
+            event_scores_time_series
+            .sort_values("timestamp")
+            .set_index("timestamp")
+        )
+
+        df = event_scores_time_series.reset_index()
+
+        fig = px.line(
+            df,
+            x="timestamp",
+            y=["event_score", "errors", "page_type_effective_accuracy", "page_type_precision", "page_type_recall",
+               "page_type_f1",
+               "page_type_accuracy"],
+            hover_data=["pipeline_version"],
+            labels={
+                "value": "Prozent",
+                "variable": "Metrik"
+            },
+            markers=True
+        )
+
+        fig.update_yaxes(tick0=0, dtick=10, title="Wert in Prozent")
+
+        st.plotly_chart(fig, use_container_width=True)
+
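The batchsize projection in the history expander counts the keys of the record_results subdocument server-side: $ifNull substitutes an empty document when results are missing, $objectToArray converts the document into an array of key/value pairs, and $size takes its length. A standalone sketch against a hypothetical local MongoDB (database name and documents are made up):

    from pymongo import MongoClient

    client = MongoClient()                  # hypothetical local MongoDB instance
    col = client.demo_db.test_evaluation    # made-up database name
    col.delete_many({})
    col.insert_many([
        {"status": "completed", "record_results": {"a": {}, "b": {}, "c": {}}},
        {"status": "completed"},            # no record_results -> $ifNull supplies {}
    ])

    pipeline = [
        {"$match": {"status": "completed"}},
        {"$project": {"batchsize": {"$size": {"$objectToArray": {"$ifNull": ["$record_results", {}]}}}}},
    ]
    print([d["batchsize"] for d in col.aggregate(pipeline)])  # [3, 0]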