Spaces:
Sleeping
Sleeping
Fix: drop NaT timestamps, avoid colon in altair column names
Browse files
app.py
CHANGED
|
@@ -15,10 +15,13 @@ def load_data():
|
|
| 15 |
ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train")
|
| 16 |
df = ds.to_pandas()
|
| 17 |
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
|
|
|
|
|
|
|
|
|
|
| 18 |
df = df.rename(columns={
|
| 19 |
-
"case_id": "
|
| 20 |
-
"activity_name": "
|
| 21 |
-
"timestamp": "
|
| 22 |
})
|
| 23 |
return df
|
| 24 |
|
|
@@ -27,8 +30,8 @@ df = load_data()
|
|
| 27 |
|
| 28 |
st.sidebar.header("Dataset")
|
| 29 |
st.sidebar.metric("Events", f"{len(df):,}")
|
| 30 |
-
st.sidebar.metric("Activities", df["
|
| 31 |
-
st.sidebar.metric("Cases", df["
|
| 32 |
|
| 33 |
tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"])
|
| 34 |
|
|
@@ -36,7 +39,13 @@ with tab1:
|
|
| 36 |
st.subheader("Directly-Follows Graph")
|
| 37 |
try:
|
| 38 |
import pm4py
|
| 39 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 40 |
dfg, sa, ea = pm4py.discover_dfg(event_log)
|
| 41 |
from pm4py.visualization.dfg import visualizer as dfg_vis
|
| 42 |
gviz = dfg_vis.apply(dfg, log=event_log, variant=dfg_vis.Variants.FREQUENCY,
|
|
@@ -48,13 +57,13 @@ with tab1:
|
|
| 48 |
st.image(dfg_vis.serialize(gviz).decode("utf-8"), use_container_width=True)
|
| 49 |
except Exception as e:
|
| 50 |
st.warning(f"Could not render DFG: {e}")
|
| 51 |
-
st.info("
|
| 52 |
|
| 53 |
with tab2:
|
| 54 |
st.subheader("Process Variants")
|
| 55 |
variants = {}
|
| 56 |
-
for cid, grp in df.sort_values("
|
| 57 |
-
variants[cid] = tuple(grp["
|
| 58 |
vc = Counter(variants.values())
|
| 59 |
total = len(variants)
|
| 60 |
st.metric("Unique Variants", len(vc))
|
|
@@ -66,12 +75,16 @@ with tab2:
|
|
| 66 |
st.info(f"**Happy path**: {' → '.join(hp[0])} ({hp[1]} cases, {hp[1]/total*100:.1f}%)")
|
| 67 |
|
| 68 |
with tab3:
|
| 69 |
-
st.subheader("Activity
|
| 70 |
-
ac = df["
|
| 71 |
-
|
| 72 |
-
|
| 73 |
-
|
| 74 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 75 |
|
| 76 |
with tab4:
|
| 77 |
st.subheader("Raw Event Data")
|
|
|
|
| 15 |
ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train")
|
| 16 |
df = ds.to_pandas()
|
| 17 |
df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce")
|
| 18 |
+
# Drop rows with NaT timestamps (pm4py can't handle them)
|
| 19 |
+
df = df.dropna(subset=["timestamp"])
|
| 20 |
+
# Rename to colon-free column names for display; the pm4py column names are applied later, just before conversion
|
| 21 |
df = df.rename(columns={
|
| 22 |
+
"case_id": "case_id_pm",
|
| 23 |
+
"activity_name": "activity",
|
| 24 |
+
"timestamp": "ts",
|
| 25 |
})
|
| 26 |
return df
|
| 27 |
|
|
|
|
| 30 |
|
| 31 |
st.sidebar.header("Dataset")
|
| 32 |
st.sidebar.metric("Events", f"{len(df):,}")
|
| 33 |
+
st.sidebar.metric("Activities", df["activity"].nunique())
|
| 34 |
+
st.sidebar.metric("Cases", df["case_id_pm"].nunique())
|
| 35 |
|
| 36 |
tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"])
|
| 37 |
|
|
|
|
| 39 |
st.subheader("Directly-Follows Graph")
|
| 40 |
try:
|
| 41 |
import pm4py
|
| 42 |
+
# Convert to pm4py format
|
| 43 |
+
pm_df = df.rename(columns={
|
| 44 |
+
"case_id_pm": "case:concept:name",
|
| 45 |
+
"activity": "concept:name",
|
| 46 |
+
"ts": "time:timestamp",
|
| 47 |
+
})
|
| 48 |
+
event_log = pm4py.convert_to_event_log(pm_df)
|
| 49 |
dfg, sa, ea = pm4py.discover_dfg(event_log)
|
| 50 |
from pm4py.visualization.dfg import visualizer as dfg_vis
|
| 51 |
gviz = dfg_vis.apply(dfg, log=event_log, variant=dfg_vis.Variants.FREQUENCY,
|
|
|
|
| 57 |
st.image(dfg_vis.serialize(gviz).decode("utf-8"), use_container_width=True)
|
| 58 |
except Exception as e:
|
| 59 |
st.warning(f"Could not render DFG: {e}")
|
| 60 |
+
st.info("Try the Variants or Statistics tabs instead.")
|
| 61 |
|
| 62 |
with tab2:
|
| 63 |
st.subheader("Process Variants")
|
| 64 |
variants = {}
|
| 65 |
+
for cid, grp in df.sort_values("ts").groupby("case_id_pm"):
|
| 66 |
+
variants[cid] = tuple(grp["activity"].tolist())
|
| 67 |
vc = Counter(variants.values())
|
| 68 |
total = len(variants)
|
| 69 |
st.metric("Unique Variants", len(vc))
|
|
|
|
| 75 |
st.info(f"**Happy path**: {' → '.join(hp[0])} ({hp[1]} cases, {hp[1]/total*100:.1f}%)")
|
| 76 |
|
| 77 |
with tab3:
|
| 78 |
+
st.subheader("Activity Frequency")
|
| 79 |
+
ac = df["activity"].value_counts().reset_index()
|
| 80 |
+
ac.columns = ["Activity", "Count"]
|
| 81 |
+
st.bar_chart(ac, x="Activity", y="Count")
|
| 82 |
+
|
| 83 |
+
st.subheader("Events Over Time")
|
| 84 |
+
if "ts" in df.columns:
|
| 85 |
+
weekly = df.set_index("ts").resample("W").size().reset_index()
|
| 86 |
+
weekly.columns = ["Week", "Events"]
|
| 87 |
+
st.line_chart(weekly, x="Week", y="Events")
|
| 88 |
|
| 89 |
with tab4:
|
| 90 |
st.subheader("Raw Event Data")
|