File size: 3,606 Bytes
037f332
3c7d587
 
 
 
 
 
 
037f332
3c7d587
 
 
 
 
037f332
3c7d587
037f332
c5e72f6
 
 
037f332
c5e72f6
 
 
037f332
3c7d587
 
 
 
 
 
 
c5e72f6
 
3c7d587
 
 
 
 
 
037f332
c5e72f6
 
 
 
 
 
 
037f332
 
 
3c7d587
037f332
 
 
 
 
3c7d587
 
c5e72f6
3c7d587
 
 
037f332
c5e72f6
 
037f332
 
 
 
 
 
 
 
 
3c7d587
 
c5e72f6
 
 
 
 
 
 
 
 
 
3c7d587
 
 
037f332
3c7d587
 
037f332
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""VynFi Γ— pm4py: Interactive Process Mining Demo"""

import streamlit as st
import pandas as pd
from collections import Counter

# Page chrome. The bar-chart emoji and the multiplication sign are written as
# real Unicode characters; their mis-decoded multi-character forms render as
# gibberish in the title and are not a valid emoji for `page_icon`.
st.set_page_config(page_title="VynFi Process Mining", page_icon="📊", layout="wide")
st.title("📊 VynFi × pm4py: Process Mining Demo")
st.caption("Synthetic supply-chain event log from [VynFi](https://vynfi.com)")


@st.cache_data
def load_data():
    """Fetch the VynFi supply-chain event table and normalise it for the app.

    Loads the ``events`` configuration (``train`` split) of the
    ``VynFi/vynfi-supply-chain-ocel`` dataset, parses the ``timestamp``
    column into datetimes, discards rows whose timestamp could not be
    parsed, and renames the three core columns to short names.

    Returns:
        pandas.DataFrame: the cleaned events, with ``case_id`` renamed to
        ``case_id_pm``, ``activity_name`` to ``activity`` and ``timestamp``
        to ``ts``.
    """
    from datasets import load_dataset

    events = load_dataset(
        "VynFi/vynfi-supply-chain-ocel", "events", split="train"
    ).to_pandas()

    # Unparseable timestamps are coerced to NaT and then removed, because
    # pm4py cannot handle them.
    events["timestamp"] = pd.to_datetime(events["timestamp"], errors="coerce")
    parsed = events.dropna(subset=["timestamp"])

    # Short, colon-free column names that are safe for display.
    short_names = {
        "case_id": "case_id_pm",
        "activity_name": "activity",
        "timestamp": "ts",
    }
    return parsed.rename(columns=short_names)


# Load the event table; `load_data` is wrapped in `st.cache_data`.
df = load_data()

# Sidebar: headline counts for the loaded log.
with st.sidebar:
    st.header("Dataset")
    st.metric("Events", f"{len(df):,}")
    st.metric("Activities", df["activity"].nunique())
    st.metric("Cases", df["case_id_pm"].nunique())

# One tab per view of the log; each is filled in by its own `with` block.
tab_labels = ["Process Model", "Variants", "Statistics", "Raw Data"]
tab1, tab2, tab3, tab4 = st.tabs(tab_labels)

with tab1:
    st.subheader("Directly-Follows Graph")
    # Discovery and rendering are best-effort: any failure (missing package,
    # conversion or rendering error) is reported as a warning so the other
    # tabs stay usable.
    try:
        import pm4py

        # Rename the short display columns to the names pm4py expects.
        pm4py_columns = {
            "case_id_pm": "case:concept:name",
            "activity": "concept:name",
            "ts": "time:timestamp",
        }
        log = pm4py.convert_to_event_log(df.rename(columns=pm4py_columns))
        graph, start_acts, end_acts = pm4py.discover_dfg(log)

        from pm4py.visualization.dfg import visualizer as dfg_vis

        # Frequency-annotated variant of the DFG visualiser, rendered as SVG.
        frequency = dfg_vis.Variants.FREQUENCY
        opts = frequency.value.Parameters
        render_params = {
            opts.START_ACTIVITIES: start_acts,
            opts.END_ACTIVITIES: end_acts,
            opts.FORMAT: "svg",
        }
        rendered = dfg_vis.apply(
            graph, log=log, variant=frequency, parameters=render_params
        )
        svg_markup = dfg_vis.serialize(rendered).decode("utf-8")
        st.image(svg_markup, use_container_width=True)
    except Exception as err:
        st.warning(f"Could not render DFG: {err}")
        st.info("Try the Variants or Statistics tabs instead.")

with tab2:
    st.subheader("Process Variants")
    # Order events chronologically before grouping. A stable sort keeps events
    # that share a timestamp in their original log order, so each trace (and
    # therefore the variant counts) is deterministic across reruns.
    ordered = df.sort_values("ts", kind="stable")
    # One tuple of activity names per case, in event order.
    variants = {
        cid: tuple(grp["activity"].tolist())
        for cid, grp in ordered.groupby("case_id_pm")
    }
    vc = Counter(variants.values())
    total = len(variants)
    st.metric("Unique Variants", len(vc))
    # Top 20 variants with their share of all cases. `total` is only used as
    # a divisor when at least one variant exists, so it is never zero here.
    rows = [
        {
            "Trace": " → ".join(trace),
            "Count": count,
            "Frequency": f"{count / total * 100:.1f}%",
        }
        for trace, count in vc.most_common(20)
    ]
    # Explicit columns keep the table headers visible even with no rows.
    st.dataframe(
        pd.DataFrame(rows, columns=["Trace", "Count", "Frequency"]),
        use_container_width=True,
        hide_index=True,
    )
    if vc:
        # The most frequent variant is presented as the "happy path".
        hp = vc.most_common(1)[0]
        st.info(f"**Happy path**: {' → '.join(hp[0])} ({hp[1]} cases, {hp[1] / total * 100:.1f}%)")

with tab3:
    st.subheader("Activity Frequency")
    # Events per activity as a two-column table for the bar chart.
    activity_counts = (
        df["activity"]
        .value_counts()
        .rename_axis("Activity")
        .reset_index(name="Count")
    )
    st.bar_chart(activity_counts, x="Activity", y="Count")

    st.subheader("Events Over Time")
    if "ts" in df.columns:
        # Number of events in each weekly ("W") resampling bucket.
        events_per_week = (
            df.set_index("ts")
            .resample("W")
            .size()
            .rename_axis("Week")
            .reset_index(name="Events")
        )
        st.line_chart(events_per_week, x="Week", y="Events")

with tab4:
    st.subheader("Raw Event Data")
    # Only the first 200 rows of the event table are displayed.
    preview = df.head(200)
    st.dataframe(preview, use_container_width=True)

st.divider()
# Footer links, separated by a middle dot written as a real Unicode character
# instead of its mis-decoded two-character form.
st.caption(
    "[VynFi](https://vynfi.com)"
    " · [pm4py](https://pm4py.fit.fraunhofer.de/)"
    " · [Dataset](https://huggingface.co/datasets/VynFi/vynfi-supply-chain-ocel)"
)