"""VynFi ร— pm4py: Interactive Process Mining Demo""" import streamlit as st import pandas as pd from collections import Counter st.set_page_config(page_title="VynFi Process Mining", page_icon="๐Ÿ“Š", layout="wide") st.title("๐Ÿ“Š VynFi ร— pm4py: Process Mining Demo") st.caption("Synthetic supply-chain event log from [VynFi](https://vynfi.com)") @st.cache_data def load_data(): from datasets import load_dataset ds = load_dataset("VynFi/vynfi-supply-chain-ocel", "events", split="train") df = ds.to_pandas() df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce") # Drop rows with NaT timestamps (pm4py can't handle them) df = df.dropna(subset=["timestamp"]) # Rename for pm4py โ€” use safe names without colons for display df = df.rename(columns={ "case_id": "case_id_pm", "activity_name": "activity", "timestamp": "ts", }) return df df = load_data() st.sidebar.header("Dataset") st.sidebar.metric("Events", f"{len(df):,}") st.sidebar.metric("Activities", df["activity"].nunique()) st.sidebar.metric("Cases", df["case_id_pm"].nunique()) tab1, tab2, tab3, tab4 = st.tabs(["Process Model", "Variants", "Statistics", "Raw Data"]) with tab1: st.subheader("Directly-Follows Graph") try: import pm4py # Convert to pm4py format pm_df = df.rename(columns={ "case_id_pm": "case:concept:name", "activity": "concept:name", "ts": "time:timestamp", }) event_log = pm4py.convert_to_event_log(pm_df) dfg, sa, ea = pm4py.discover_dfg(event_log) from pm4py.visualization.dfg import visualizer as dfg_vis gviz = dfg_vis.apply(dfg, log=event_log, variant=dfg_vis.Variants.FREQUENCY, parameters={ dfg_vis.Variants.FREQUENCY.value.Parameters.START_ACTIVITIES: sa, dfg_vis.Variants.FREQUENCY.value.Parameters.END_ACTIVITIES: ea, dfg_vis.Variants.FREQUENCY.value.Parameters.FORMAT: "svg", }) st.image(dfg_vis.serialize(gviz).decode("utf-8"), use_container_width=True) except Exception as e: st.warning(f"Could not render DFG: {e}") st.info("Try the Variants or Statistics tabs instead.") with tab2: st.subheader("Process Variants") variants = {} for cid, grp in df.sort_values("ts").groupby("case_id_pm"): variants[cid] = tuple(grp["activity"].tolist()) vc = Counter(variants.values()) total = len(variants) st.metric("Unique Variants", len(vc)) rows = [{"Trace": " โ†’ ".join(t), "Count": c, "Frequency": f"{c/total*100:.1f}%"} for t, c in vc.most_common(20)] st.dataframe(pd.DataFrame(rows), use_container_width=True, hide_index=True) if vc: hp = vc.most_common(1)[0] st.info(f"**Happy path**: {' โ†’ '.join(hp[0])} ({hp[1]} cases, {hp[1]/total*100:.1f}%)") with tab3: st.subheader("Activity Frequency") ac = df["activity"].value_counts().reset_index() ac.columns = ["Activity", "Count"] st.bar_chart(ac, x="Activity", y="Count") st.subheader("Events Over Time") if "ts" in df.columns: weekly = df.set_index("ts").resample("W").size().reset_index() weekly.columns = ["Week", "Events"] st.line_chart(weekly, x="Week", y="Events") with tab4: st.subheader("Raw Event Data") st.dataframe(df.head(200), use_container_width=True) st.divider() st.caption("[VynFi](https://vynfi.com) ยท [pm4py](https://pm4py.fit.fraunhofer.de/) ยท [Dataset](https://huggingface.co/datasets/VynFi/vynfi-supply-chain-ocel)")