# GitHub Actions
# Sync from GitHub
# 0be7900
"""
Dashboard — home page of the Distractor Annotation Tool.
Streamlit multipage: this file is the entry point.
All other pages live in pages/.
"""
import os
import uuid
from collections import Counter
from datetime import datetime
import pandas as pd
import streamlit as st
from utils.dataset_io import load_annotations, load_seed_data, save_annotations
from utils.ui import render_sidebar
# ── Page config (must be first Streamlit call) ─────────────────────────────
# The page-level settings are collected in one mapping and unpacked into the
# call, so the configuration reads as data rather than as an argument list.
_PAGE_SETTINGS = {
    "page_title": "Distractor Annotation Tool",
    "page_icon": "🎯",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# The sidebar helper returns the current annotator; the seed-import section
# further down treats a falsy value as "no name entered yet".
annotator = render_sidebar("Dashboard")
# ── Header ─────────────────────────────────────────────────────────────────
# Page title plus a two-line subtitle, followed by a horizontal rule.
st.title("🎯 Distractor Annotation Tool")
st.markdown(
    # The em dash was mis-encoded (rendered as "β€”"); it is restored here.
    # The two spaces before "\n" are what Markdown needs for a hard line
    # break, so the project name and the description stay on separate lines.
    "**MSc NLP — Keeping LLMs on Track in Task-Oriented Dialogue**  \n"
    "Collaborative annotation workspace for your research group."
)
st.divider()
# ── Load annotations ───────────────────────────────────────────────────────
# Start from an empty list so the rest of the dashboard can still render if
# the shared store cannot be read; the failure is reported on the page.
annotations = []
with st.spinner("Loading shared annotations…"):
    try:
        annotations = load_annotations()
    except Exception as load_error:
        st.error(f"Error loading annotations: {load_error}")
# ── Summary metrics ────────────────────────────────────────────────────────
# Five headline numbers across the top of the dashboard.
#
# Entries with no `_review_status` key are counted as drafts. The status
# breakdown table and the recent-activity list on this page already default
# a missing status to "draft", so the Draft metric now agrees with them.
status_counts = Counter(a.get("_review_status", "draft") for a in annotations)

# `or []` keeps an explicit null `distractors_multiturn` from crashing len().
distractor_total = sum(
    len(a.get("distractors_multiturn") or []) for a in annotations
)

# Distinct annotator names, excluding the seed-data tag and the "?" placeholder
# used for entries that carry no `_annotator` key.
human_annotators = {a.get("_annotator", "?") for a in annotations} - {"seed_data", "?"}

c1, c2, c3, c4, c5 = st.columns(5)
c1.metric("📝 Total Entries", len(annotations))
c2.metric("✅ Approved", status_counts["approved"])
c3.metric("📋 Draft", status_counts["draft"])
c4.metric("🎭 Distractors", distractor_total)
c5.metric("👥 Annotators", len(human_annotators))
st.divider()
# ── Two-column section ─────────────────────────────────────────────────────
# Left column: per-annotator totals. The right column is filled in below.
col_left, col_right = st.columns(2)

with col_left:
    st.subheader("📊 Annotator Progress")
    if annotations:
        # One row per annotator: entry count, distractor count, approved count.
        stats: dict[str, dict] = {}
        for a in annotations:
            ann = a.get("_annotator", "Unknown")
            row = stats.setdefault(
                ann, {"Entries": 0, "Distractors": 0, "Approved": 0}
            )
            row["Entries"] += 1
            # `or []` tolerates an explicit null in the stored entry.
            row["Distractors"] += len(a.get("distractors_multiturn") or [])
            if a.get("_review_status") == "approved":
                row["Approved"] += 1

        # Most productive annotators first.
        df = pd.DataFrame(
            [{"Annotator": k, **v} for k, v in stats.items()]
        ).sort_values("Entries", ascending=False)
        st.dataframe(df, use_container_width=True, hide_index=True)
    else:
        st.info("No annotations yet — import seed data below or go to **Annotate**.")
with col_right:
    # Right column: how many entries sit in each review status and domain.
    st.subheader("📋 Status & Domain Breakdown")
    if annotations:
        # Entries without a status are shown as "draft".
        statuses = Counter(a.get("_review_status", "draft") for a in annotations)
        # most_common() yields (status, count) pairs, most frequent first.
        df_s = pd.DataFrame(statuses.most_common(), columns=["Status", "Count"])
        st.dataframe(df_s, use_container_width=True, hide_index=True)

        st.markdown("**By domain:**")
        domains = Counter(a.get("domain", "unknown") for a in annotations)
        for dom, cnt in domains.most_common():
            st.write(f" • {dom}: **{cnt}**")
    else:
        st.info("No data yet.")
st.divider()
# ── Recent activity ────────────────────────────────────────────────────────
# The eight most recently updated entries, each in a collapsible expander.
if annotations:
    st.subheader("🕒 Recent Activity")

    # Built once rather than on every loop iteration.
    status_icons = {"approved": "✅", "draft": "📋", "failed": "❌"}

    # `_updated_at` is compared as text. A missing or null value becomes ""
    # so it sorts last instead of raising TypeError against real timestamps.
    recent = sorted(
        annotations,
        key=lambda x: str(x.get("_updated_at") or ""),
        reverse=True,
    )[:8]

    for entry in recent:
        status = entry.get("_review_status", "draft")
        # Null or non-string scenarios fall back to a sliceable string.
        scenario_preview = str(entry.get("scenario") or "Unknown")[:70]
        status_emoji = status_icons.get(status, "❓")
        with st.expander(
            f"{status_emoji} [{entry.get('domain', '?')}] {scenario_preview}"
        ):
            rc1, rc2, rc3 = st.columns(3)
            rc1.write(f"**Annotator:** {entry.get('_annotator', '?')}")
            rc2.write(f"**Status:** {status}")
            rc3.write(
                f"**Distractors:** {len(entry.get('distractors_multiturn') or [])}"
            )
            # Keep "YYYY-MM-DD HH:MM" from the leading part of the timestamp.
            updated = str(entry.get("_updated_at") or "")[:16].replace("T", " ")
            st.caption(f"Last updated: {updated}")

    # NOTE(review): the original indentation was lost. The divider is assumed
    # to belong to this section so an empty dashboard does not show two
    # rules in a row — confirm against the upstream file.
    st.divider()
# ── Seed data import ───────────────────────────────────────────────────────
# Offers a one-click import of the bundled seed entries into the shared store.


def _tag_seed_entries(seed_entries):
    """Return copies of the seed entries with bookkeeping fields filled in.

    Existing values are kept; only missing fields receive defaults. The
    exceptions are `_id`, which is also replaced when present but falsy, and
    `_updated_at`, which is always refreshed. Each entry uses one timestamp
    for both `_created_at` and `_updated_at`, so a freshly imported entry
    never looks "updated after creation".
    """
    tagged_entries = []
    for seed_entry in seed_entries:
        ec = dict(seed_entry)  # shallow copy; the loaded seed list is untouched
        stamp = datetime.now().isoformat()
        ec["_id"] = ec.get("_id") or str(uuid.uuid4())
        ec.setdefault("_annotator", "seed_data")
        ec.setdefault("_created_at", stamp)
        ec["_updated_at"] = stamp
        ec.setdefault("_review_status", "draft")
        ec.setdefault("_needs_human_review", True)
        ec.setdefault("_llm_test_results", [])
        tagged_entries.append(ec)
    return tagged_entries


st.subheader("📦 Seed Data (Initial Group Work)")
seed = load_seed_data()
if seed:
    st.write(
        f"**{len(seed)}** entries are bundled as seed data "
        "(20 legal-domain scenarios generated and reviewed by the group)."
    )
    if not annotations:
        st.info("The shared repo is empty. Import the seed data to get started.")

    repo_not_empty_msg = (
        "Shared repo already has entries. "
        "Clear all annotations on the **👥 Annotations** page first if you want to re-import."
    )

    col_imp, col_info = st.columns([1, 3])
    with col_imp:
        if st.button("🌱 Import Seed Data", use_container_width=True, type="primary"):
            if not annotator:
                st.error("Enter your name in the sidebar first.")
            elif annotations:
                st.warning(repo_not_empty_msg)
            else:
                # `annotations` is also empty when the page-load read FAILED,
                # and may be stale if someone saved entries since the page
                # rendered. save_annotations() is called with only the seed
                # entries (presumably replacing the stored list — verify), so
                # re-read the store right before writing and abort unless it
                # is confirmed empty.
                try:
                    latest = load_annotations()
                except Exception as exc:
                    st.error(f"Error loading annotations: {exc}")
                    st.warning(
                        "Import cancelled: could not confirm that the shared repo is empty."
                    )
                else:
                    if latest:
                        st.warning(repo_not_empty_msg)
                    else:
                        tagged = _tag_seed_entries(seed)
                        if save_annotations(tagged):
                            st.success(
                                f"✅ Imported {len(tagged)} seed entries into the shared repo!"
                            )
                            st.rerun()
    with col_info:
        st.caption(
            "Seed data was generated with Gemma-4-E2B running locally via LM Studio, "
            "then reviewed and partially rewritten by your groupmate."
        )
else:
    st.warning("seed_data/draft_distractors.json not found.")
st.divider()
# ── Configuration status ───────────────────────────────────────────────────
# Shows whether the two settings the app needs are available.


def _read_setting(name: str) -> "str | None":
    """Return setting *name* from the environment, else from Streamlit secrets.

    The environment variable wins when it is set and non-empty. Otherwise
    `st.secrets` is consulted. `st.secrets` exists as an attribute even when
    no secrets file is configured, and reading from it then raises (a
    FileNotFoundError or a subclass of it). That case is treated as "not set"
    so this status panel can report it instead of crashing the page.
    """
    env_value = os.environ.get(name)
    if env_value:
        return env_value
    secrets = getattr(st, "secrets", None)
    if secrets is None:
        return None
    try:
        return secrets.get(name)
    except FileNotFoundError:
        return None


st.subheader("⚙️ Configuration")
hf_token = _read_setting("HF_TOKEN")
ann_repo = _read_setting("ANNOTATIONS_REPO_ID")

cfg1, cfg2 = st.columns(2)
with cfg1:
    # Only presence is reported; the token value itself is never displayed.
    if hf_token:
        st.success("✅ `HF_TOKEN` is set")
    else:
        st.error("❌ `HF_TOKEN` not set — add it to your Space secrets")
with cfg2:
    if ann_repo:
        st.success(f"✅ Annotations repo: `{ann_repo}`")
    else:
        st.error("❌ `ANNOTATIONS_REPO_ID` not set — add it to your Space secrets")