# Spaces: AgenticFinLab / H2EPR-Bench-Explorer (status: Sleeping)
# File size: 8,226 Bytes

from __future__ import annotations

import json
from pathlib import Path
import sys

APP_ROOT = Path(__file__).resolve().parent
sys.path.insert(0, str(APP_ROOT / "src"))

import streamlit as st

from h2epr_explorer.constants import (
    CATALOG_COLUMNS,
    GOLD_COMPANION_REPO,
    PUBLIC_DATASET_REPO,
    RELEASE_BOUNDARY_NOTICE,
)
from h2epr_explorer.data_loader import load_catalog, load_event_graph, load_finalcascade_summary, load_stages
from h2epr_explorer.filters import event_description, event_display_label, event_name, filter_catalog
from h2epr_explorer.navigation import (
    build_event_links,
    filter_summary_text,
    query_param_event_id,
    resolve_selected_event_index,
)
from h2epr_explorer.render_gantt import build_timeline_figure


def _as_records(frame):
    """Return the rows of ``frame`` as a list of ``{column: value}`` dicts."""
    row_dicts = frame.to_dict(orient="records")
    return row_dicts


def _select_columns(frame, columns):
    """Restrict ``frame`` to the requested columns that it actually contains.

    Names in ``columns`` that are absent from ``frame.columns`` are skipped,
    and the requested order is kept. When none of the requested names is
    present, ``frame`` itself is returned unchanged.
    """
    available = frame.columns
    kept = []
    for name in columns:
        if name in available:
            kept.append(name)
    if not kept:
        return frame
    return frame[kept]


def _sort_stage_frame(frame):
    """Sort stage rows by whichever known ordering columns ``frame`` has.

    The candidate keys, in priority order, are ``stage_index``,
    ``stage_order`` and ``stage_id``. When none of them is a column of
    ``frame``, the frame is returned as-is.
    """
    candidate_keys = ("stage_index", "stage_order", "stage_id")
    keys = [key for key in candidate_keys if key in frame.columns]
    if not keys:
        return frame
    return frame.sort_values(keys)


def _safe_int(value, default=0):
    """Coerce ``value`` to ``int``, falling back to ``default`` on failure.

    Args:
        value: Anything that may be passed to ``int()``.
        default: Value returned when the conversion fails.

    Returns:
        ``int(value)``, or ``default`` when ``int()`` raises ``TypeError``
        (e.g. ``None``), ``ValueError`` (e.g. non-numeric text or NaN), or
        ``OverflowError`` (infinite floats).
    """
    try:
        return int(value)
    except (TypeError, ValueError, OverflowError):
        # int(float("inf")) raises OverflowError rather than ValueError; an
        # infinite metric cell should fall back like any other bad value.
        return default


# Page-level configuration (browser title, wide layout). This is the first
# st.* call in the script.
st.set_page_config(page_title="H2EPR-Bench Explorer", layout="wide")

# Inject the custom CSS for the metric cards and for the h2epr-* header
# classes used by the markdown calls below. unsafe_allow_html=True lets the
# raw <style> element through.
st.markdown(
    """
<style>
div[data-testid="stMetric"] {
    border: 1px solid #e5e7eb;
    border-radius: 8px;
    padding: 0.35rem 0.6rem;
    background: #fbfbf8;
}
.h2epr-kicker {
    color: #4b5563;
    font-size: 0.92rem;
    letter-spacing: 0;
    margin-bottom: 0.25rem;
}
.h2epr-title {
    font-size: 2.15rem;
    font-weight: 760;
    line-height: 1.12;
    margin-bottom: 0.25rem;
}
.h2epr-subtitle {
    color: #374151;
    max-width: 920px;
    margin-bottom: 0.75rem;
}
</style>
""",
    unsafe_allow_html=True,
)

# Page header: kicker, title, and subtitle, each emitted as a raw <div> so
# the CSS classes defined above apply.
st.markdown('<div class="h2epr-kicker">H²EPR-Bench · public release explorer</div>', unsafe_allow_html=True)
st.markdown('<div class="h2epr-title">Event-process graph browser</div>', unsafe_allow_html=True)
st.markdown(
    '<div class="h2epr-subtitle">Browse public event metadata, stage rows, FinalCascade summaries, and Gantt-style timelines for the H²EPR-Bench release.</div>',
    unsafe_allow_html=True,
)
# Release-boundary notice; the text is imported from h2epr_explorer.constants.
st.info(RELEASE_BOUNDARY_NOTICE)

# Load the three tabular inputs. The code below treats them as pandas-style
# frames (column indexing, boolean masks, .dropna()/.unique()/.max()).
catalog = load_catalog()
stages = load_stages()
summary = load_finalcascade_summary()

# Row-dict view of the catalog: the form passed to filter_catalog.
catalog_rows = _as_records(catalog)

# Sidebar: free-text search, two multiselects, two minimum-count sliders, and
# two external repository links.
with st.sidebar:
    st.header("Filter events")
    query = st.text_input("Search", placeholder="event name, ID, category, keyword")
    # Options are the sorted distinct non-null values of each column.
    domains = st.multiselect("Domain", sorted(catalog["domain"].dropna().unique().tolist()))
    categories = st.multiselect("Category", sorted(catalog["event_category"].dropna().unique().tolist()))
    # Slider upper bounds are the column maxima.
    # NOTE(review): int() raises if a maximum is NaN (e.g. an empty or
    # all-null column) -- confirm the loaders never produce that.
    min_source_count = st.slider("Minimum sources", 0, int(catalog["source_count"].max()), 0)
    min_stage_count = st.slider("Minimum stages", 0, int(catalog["stage_count"].max()), 0)
    st.divider()
    st.link_button("Dataset repository", f"https://huggingface.co/datasets/{PUBLIC_DATASET_REPO}", use_container_width=True)
    st.link_button("Request Gold access", f"https://huggingface.co/datasets/{GOLD_COMPANION_REPO}", use_container_width=True)

# Apply every sidebar control to the row dicts.
filtered_rows = filter_catalog(
    catalog_rows,
    query=query,
    domains=domains,
    categories=categories,
    min_source_count=min_source_count,
    min_stage_count=min_stage_count,
)

# Nothing matched: warn and halt this script run, so the code after this
# point never sees an empty selection list.
if not filtered_rows:
    st.warning("No event matches the current filters.")
    st.stop()

# Display labels are built from the full catalog (not just the filtered rows)
# and keyed by event_id for the selectbox's format_func.
event_labels = {row["event_id"]: event_display_label(row) for row in catalog_rows}
# Optional preselection from the URL query parameters; parsing and index
# resolution are delegated to h2epr_explorer.navigation.
requested_event_id = query_param_event_id(st.query_params)
selected_index = resolve_selected_event_index(filtered_rows, requested_event_id)

# The selectbox options are event IDs; format_func shows the label and falls
# back to the raw ID when no label is known.
selected_event = st.selectbox(
    "Selected event",
    [row["event_id"] for row in filtered_rows],
    index=selected_index,
    format_func=lambda event_id: event_labels.get(event_id, event_id),
)
# Best-effort write-back of the selection into the query parameters; any
# failure is deliberately ignored so the page still renders.
try:
    st.query_params["event_id"] = selected_event
except Exception:
    pass

# Per-event slices consumed by the tabs. .iloc[0] takes the first catalog row
# with this event_id (presumably always present, since the options come from
# the filtered catalog rows -- verify against filter_catalog).
event_row = catalog[catalog["event_id"] == selected_event].iloc[0]
event_record = event_row.to_dict()
event_stages = _sort_stage_frame(stages[stages["event_id"] == selected_event])
# May be empty; the detail tab checks .empty before using it.
summary_row = summary[summary["event_id"] == selected_event]
# A missing or falsy gantt_html_path is passed as "".
# NOTE(review): a NaN cell is truthy and would be passed as the string "nan"
# -- confirm the loader normalizes missing paths.
event_links = build_event_links(selected_event, str(event_record.get("gantt_html_path") or ""))

# Caption built from the filtered and total event counts.
st.caption(filter_summary_text(len(filtered_rows), len(catalog_rows)))

# The order of this list fixes the tabs[0]..tabs[5] indices used by the
# `with tabs[i]:` blocks in this script.
tabs = st.tabs(["Catalog", "Event detail", "Timeline", "Stages", "FinalCascade JSON", "Access and boundary"])

with tabs[0]:
    st.subheader("Event catalog")
    # Show only catalog rows whose event_id is among the filtered rows,
    # restricted to the CATALOG_COLUMNS that exist in the frame.
    st.dataframe(_select_columns(catalog[catalog["event_id"].isin([row["event_id"] for row in filtered_rows])], CATALOG_COLUMNS), use_container_width=True, height=520)

# "Event detail" tab: heading, description, count metrics, profile table,
# external links, and (when available) the public FinalCascade summary row.
with tabs[1]:
    st.subheader(event_name(event_record))
    st.write(event_description(event_record))
    # Five metric cards. Each count goes through _safe_int, so a value that
    # is None, NaN, or non-numeric text is shown as 0.
    c1, c2, c3, c4, c5 = st.columns(5)
    c1.metric("Sources", _safe_int(event_row.get("source_count", 0)))
    c2.metric("Stages", _safe_int(event_row.get("stage_count", 0)))
    c3.metric("Episodes", _safe_int(event_row.get("episode_count", 0)))
    c4.metric("Participants", _safe_int(event_row.get("participant_count", 0)))
    c5.metric("Relations", _safe_int(event_row.get("relation_count", 0)))

    st.markdown("#### Event profile")
    # Columns requested for the profile table; _select_columns skips any
    # that the catalog does not have.
    profile_columns = [
        "event_id",
        "display_name",
        "domain",
        "event_category",
        "event_scope_label",
        "keywords",
        "event_boundary_time_status",
        "temporal_anchor_summary",
        "gold_reference_access_level",
        "finalcascade_access_level",
    ]
    st.dataframe(_select_columns(catalog[catalog["event_id"] == selected_event], profile_columns), use_container_width=True)

    # Link buttons from build_event_links. The fourth one is shown only when
    # the mapping contains a "gantt_html" entry.
    link_cols = st.columns(4)
    link_cols[0].link_button("Open dataset", event_links["public_dataset"], use_container_width=True)
    link_cols[1].link_button("Gold access", event_links["gold_request"], use_container_width=True)
    link_cols[2].link_button("FinalCascade file", event_links["finalcascade_jsonl"], use_container_width=True)
    if "gantt_html" in event_links:
        link_cols[3].link_button("Gantt artifact", event_links["gantt_html"], use_container_width=True)

    # The summary section is rendered only when the summary frame has a row
    # for the selected event.
    if not summary_row.empty:
        st.markdown("#### Public FinalCascade summary")
        summary_columns = [
            "event_id",
            "stage_count",
            "episode_count",
            "participant_count",
            "transaction_count",
            "relation_count",
            "event_boundary_time_status",
            "known_action_time_anchor_count",
            "not_gold_warning",
        ]
        st.dataframe(_select_columns(summary_row, summary_columns), use_container_width=True)

# "Timeline" tab: a figure built from the selected event's sorted stage rows.
with tabs[2]:
    figure = build_timeline_figure(_as_records(event_stages), selected_event)
    # A None figure is treated as "no stage rows to plot".
    if figure is None:
        st.warning("No public stage rows are available for this event.")
    else:
        st.plotly_chart(figure, use_container_width=True)
    # Also show the raw artifact path when the catalog row has a truthy
    # gantt_html_path value.
    if "gantt_html_path" in event_row and event_row.get("gantt_html_path"):
        st.markdown(f"Gantt HTML artifact path: `{event_row.get('gantt_html_path')}`")

# "Stages" tab: the selected event's stage rows, in the order produced by
# _sort_stage_frame.
with tabs[3]:
    st.dataframe(event_stages, use_container_width=True, height=520)

# "FinalCascade JSON" tab: load the graph for the selected event, offer it as
# a JSON download, and show it in a collapsed JSON viewer.
with tabs[4]:
    graph = load_event_graph(selected_event)
    st.download_button(
        "Download selected public FinalCascade JSON",
        # ensure_ascii=False keeps non-ASCII characters unescaped in the file.
        data=json.dumps(graph, ensure_ascii=False, indent=2),
        file_name=f"{selected_event}_finalcascade_public.json",
        mime="application/json",
    )
    st.json(graph, expanded=False)

# "Access and boundary" tab: static markdown with the two repository names
# interpolated into link text and URLs.
with tabs[5]:
    st.markdown(
        f"""
### Release boundary

- Public dataset repo: [`{PUBLIC_DATASET_REPO}`](https://huggingface.co/datasets/{PUBLIC_DATASET_REPO})
- Manual-gated Gold companion: [`{GOLD_COMPANION_REPO}`](https://huggingface.co/datasets/{GOLD_COMPANION_REPO})
- This Explorer loads public event metadata, public stages, public sanitized FinalCascade records, and public visualization paths.
- It does not load gated Gold references.
- Public FinalCascade and Gantt views are supplementary inspection assets, not official scoring references.
"""
    )