ninarg committed on
Commit
c5e72f6
·
verified ·
1 Parent(s): 037f332

Fix: drop NaT timestamps, avoid colon in altair column names

Browse files
Files changed (1) hide show
  1. app.py +28 -15
app.py CHANGED
@@ -15,10 +15,13 @@ def load_data():
15
  ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train")
16
  df = ds.to_pandas()
17
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
 
 
 
18
  df = df.rename(columns={
19
- "case_id": "case:concept:name",
20
- "activity_name": "concept:name",
21
- "timestamp": "time:timestamp",
22
  })
23
  return df
24
 
@@ -27,8 +30,8 @@ df = load_data()
27
 
28
  st.sidebar.header("Dataset")
29
  st.sidebar.metric("Events", f"{len(df):,}")
30
- st.sidebar.metric("Activities", df["concept:name"].nunique())
31
- st.sidebar.metric("Cases", df["case:concept:name"].nunique())
32
 
33
  tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"])
34
 
@@ -36,7 +39,13 @@ with tab1:
36
  st.subheader("Directly-Follows Graph")
37
  try:
38
  import pm4py
39
- event_log = pm4py.convert_to_event_log(df)
 
 
 
 
 
 
40
  dfg, sa, ea = pm4py.discover_dfg(event_log)
41
  from pm4py.visualization.dfg import visualizer as dfg_vis
42
  gviz = dfg_vis.apply(dfg, log=event_log, variant=dfg_vis.Variants.FREQUENCY,
@@ -48,13 +57,13 @@ with tab1:
48
  st.image(dfg_vis.serialize(gviz).decode("utf-8"), use_container_width=True)
49
  except Exception as e:
50
  st.warning(f"Could not render DFG: {e}")
51
- st.info("pm4py or graphviz may not be available. Try the Variants tab.")
52
 
53
  with tab2:
54
  st.subheader("Process Variants")
55
  variants = {}
56
- for cid, grp in df.sort_values("time:timestamp").groupby("case:concept:name"):
57
- variants[cid] = tuple(grp["concept:name"].tolist())
58
  vc = Counter(variants.values())
59
  total = len(variants)
60
  st.metric("Unique Variants", len(vc))
@@ -66,12 +75,16 @@ with tab2:
66
  st.info(f"**Happy path**: {' → '.join(hp[0])} ({hp[1]} cases, {hp[1]/total*100:.1f}%)")
67
 
68
  with tab3:
69
- st.subheader("Activity Statistics")
70
- ac = df["concept:name"].value_counts()
71
- st.bar_chart(ac)
72
- if "time:timestamp" in df.columns:
73
- st.subheader("Events Over Time")
74
- st.line_chart(df.set_index("time:timestamp").resample("W").size())
 
 
 
 
75
 
76
  with tab4:
77
  st.subheader("Raw Event Data")
 
15
  ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train")
16
  df = ds.to_pandas()
17
  df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
18
+ # Drop rows with NaT timestamps (pm4py can't handle them)
19
+ df = df.dropna(subset=["timestamp"])
20
+ # Use colon-free column names for charting (Altair); the pm4py names are restored just before conversion
21
  df = df.rename(columns={
22
+ "case_id": "case_id_pm",
23
+ "activity_name": "activity",
24
+ "timestamp": "ts",
25
  })
26
  return df
27
 
 
30
 
31
  st.sidebar.header("Dataset")
32
  st.sidebar.metric("Events", f"{len(df):,}")
33
+ st.sidebar.metric("Activities", df["activity"].nunique())
34
+ st.sidebar.metric("Cases", df["case_id_pm"].nunique())
35
 
36
  tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"])
37
 
 
39
  st.subheader("Directly-Follows Graph")
40
  try:
41
  import pm4py
42
+ # Convert to pm4py format
43
+ pm_df = df.rename(columns={
44
+ "case_id_pm": "case:concept:name",
45
+ "activity": "concept:name",
46
+ "ts": "time:timestamp",
47
+ })
48
+ event_log = pm4py.convert_to_event_log(pm_df)
49
  dfg, sa, ea = pm4py.discover_dfg(event_log)
50
  from pm4py.visualization.dfg import visualizer as dfg_vis
51
  gviz = dfg_vis.apply(dfg, log=event_log, variant=dfg_vis.Variants.FREQUENCY,
 
57
  st.image(dfg_vis.serialize(gviz).decode("utf-8"), use_container_width=True)
58
  except Exception as e:
59
  st.warning(f"Could not render DFG: {e}")
60
+ st.info("Try the Variants or Statistics tabs instead.")
61
 
62
  with tab2:
63
  st.subheader("Process Variants")
64
  variants = {}
65
+ for cid, grp in df.sort_values("ts").groupby("case_id_pm"):
66
+ variants[cid] = tuple(grp["activity"].tolist())
67
  vc = Counter(variants.values())
68
  total = len(variants)
69
  st.metric("Unique Variants", len(vc))
 
75
  st.info(f"**Happy path**: {' → '.join(hp[0])} ({hp[1]} cases, {hp[1]/total*100:.1f}%)")
76
 
77
  with tab3:
78
+ st.subheader("Activity Frequency")
79
+ ac = df["activity"].value_counts().reset_index()
80
+ ac.columns = ["Activity", "Count"]
81
+ st.bar_chart(ac, x="Activity", y="Count")
82
+
83
+ st.subheader("Events Over Time")
84
+ if "ts" in df.columns:
85
+ weekly = df.set_index("ts").resample("W").size().reset_index()
86
+ weekly.columns = ["Week", "Events"]
87
+ st.line_chart(weekly, x="Week", y="Events")
88
 
89
  with tab4:
90
  st.subheader("Raw Event Data")