Spaces:

keepingLLMontrack
/

llm-annotation-platform

Running

GitHub Actions

Sync from GitHub

0be7900 3 days ago

7.98 kB

	"""
	Dashboard — home page of the Distractor Annotation Tool.

	Streamlit multipage: this file is the entry point.
	All other pages live in pages/.
	"""

	import os
	import uuid
	from collections import Counter
	from datetime import datetime

	import pandas as pd
	import streamlit as st

	from utils.dataset_io import load_annotations, load_seed_data, save_annotations
	from utils.ui import render_sidebar

	# ── Page config (must be first Streamlit call) ─────────────────────────────
	# The settings are gathered in one mapping and unpacked into the call, so the
	# page configuration reads as data rather than as an argument list.
	page_settings = {
	    "page_title": "Distractor Annotation Tool",
	    "page_icon": "🎯",
	    "layout": "wide",
	    "initial_sidebar_state": "expanded",
	}
	st.set_page_config(**page_settings)

	# Draw the shared sidebar for this page. The returned value is tested for
	# truthiness before a seed import further down (presumably the annotator's
	# name as typed into the sidebar — confirm against utils.ui).
	annotator = render_sidebar("Dashboard")

	# ── Header ─────────────────────────────────────────────────────────────────
	# Title, a two-line tagline, and a rule separating the header from the body.
	st.title("🎯 Distractor Annotation Tool")
	st.markdown(
	    # Markdown only produces a hard line break when the line ends in two
	    # spaces (or a backslash) before the newline; with a single trailing
	    # space the two sentences are merged into one rendered line.
	    "MSc NLP — Keeping LLMs on Track in Task-Oriented Dialogue  \n"
	    "Collaborative annotation workspace for your research group."
	)
	st.divider()

	# ── Load annotations ───────────────────────────────────────────────────────
	# Start from an empty list so every section below can still render when the
	# load fails; the failure itself is reported on the page instead of raising.
	annotations = []
	with st.spinner("Loading shared annotations…"):
	    try:
	        annotations = load_annotations()
	    except Exception as load_error:
	        st.error(f"Error loading annotations: {load_error}")

	# ── Summary metrics ────────────────────────────────────────────────────────
	# Work out the five headline numbers first, then render them left to right.
	review_states = [record.get("_review_status") for record in annotations]
	distractor_total = sum(
	    len(record.get("distractors_multiturn", [])) for record in annotations
	)
	# "seed_data" and the "?" placeholder are left out of the annotator head-count.
	contributors = {record.get("_annotator", "?") for record in annotations}
	contributors = contributors - {"seed_data", "?"}

	metric_tiles = (
	    ("📝 Total Entries", len(annotations)),
	    ("✅ Approved", review_states.count("approved")),
	    ("📋 Draft", review_states.count("draft")),
	    ("🎭 Distractors", distractor_total),
	    ("👥 Annotators", len(contributors)),
	)
	for tile_column, (tile_label, tile_value) in zip(st.columns(5), metric_tiles):
	    tile_column.metric(tile_label, tile_value)

	st.divider()

	# ── Two-column section ─────────────────────────────────────────────────────
	col_left, col_right = st.columns(2)

	# Left column: one table row per annotator with their entry, distractor and
	# approved counts, busiest annotator first.
	with col_left:
	    st.subheader("📊 Annotator Progress")
	    if not annotations:
	        st.info("No annotations yet — import seed data below or go to Annotate.")
	    else:
	        per_annotator: dict[str, dict] = {}
	        for record in annotations:
	            # setdefault hands back the existing tally or installs a fresh one.
	            tally = per_annotator.setdefault(
	                record.get("_annotator", "Unknown"),
	                {"Entries": 0, "Distractors": 0, "Approved": 0},
	            )
	            tally["Entries"] += 1
	            tally["Distractors"] += len(record.get("distractors_multiturn", []))
	            if record.get("_review_status") == "approved":
	                tally["Approved"] += 1

	        progress_rows = [
	            {"Annotator": name, **tally} for name, tally in per_annotator.items()
	        ]
	        progress_table = pd.DataFrame(progress_rows).sort_values(
	            "Entries", ascending=False
	        )
	        st.dataframe(progress_table, use_container_width=True, hide_index=True)

	# Right column: how many entries sit in each review status, followed by a
	# bullet list of entry counts per domain (most frequent first in both).
	with col_right:
	    st.subheader("📋 Status & Domain Breakdown")
	    if not annotations:
	        st.info("No data yet.")
	    else:
	        # Entries without a review status are counted as "draft".
	        status_counts = Counter(
	            record.get("_review_status", "draft") for record in annotations
	        )
	        status_rows = [
	            {"Status": status, "Count": total}
	            for status, total in status_counts.most_common()
	        ]
	        st.dataframe(
	            pd.DataFrame(status_rows), use_container_width=True, hide_index=True
	        )

	        st.markdown("By domain:")
	        domain_counts = Counter(
	            record.get("domain", "unknown") for record in annotations
	        )
	        for domain_name, total in domain_counts.most_common():
	            st.write(f" • {domain_name}: {total}")

	st.divider()

	# ── Recent activity ────────────────────────────────────────────────────────
	# Show the eight most recently updated entries, newest first, each in its own
	# expander. Ordering is a plain string sort on "_updated_at"; entries without
	# that field sort as the empty string and therefore land last.
	if annotations:
	    st.subheader("🕒 Recent Activity")
	    status_icons = {"approved": "✅", "draft": "📋", "failed": "❌"}
	    newest_first = sorted(
	        annotations,
	        key=lambda record: record.get("_updated_at", ""),
	        reverse=True,
	    )
	    for entry in newest_first[:8]:
	        preview = entry.get("scenario", "Unknown")[:70]
	        review_status = entry.get("_review_status", "draft")
	        # Any status outside the known three gets the question-mark icon.
	        icon = status_icons.get(review_status, "❓")
	        domain = entry.get("domain", "?")
	        with st.expander(f"{icon} [{domain}] {preview}"):
	            who_col, status_col, count_col = st.columns(3)
	            who_col.write(f"Annotator: {entry.get('_annotator', '?')}")
	            status_col.write(f"Status: {review_status}")
	            count_col.write(
	                f"Distractors: {len(entry.get('distractors_multiturn', []))}"
	            )
	            # Trim the timestamp to "YYYY-MM-DD HH:MM" for display.
	            stamp = entry.get("_updated_at", "")[:16].replace("T", " ")
	            st.caption(f"Last updated: {stamp}")

	st.divider()

	# ── Seed data import ───────────────────────────────────────────────────────
	# Offers a one-click import of the bundled seed entries into the shared repo.
	# The import is refused when no annotator is set or when the repo already
	# holds entries.
	SEED_NOTICE_KEY = "_seed_import_notice"


	def _tag_seed_entry(entry, stamp):
	    """Return a copy of *entry* with the bookkeeping fields filled in.

	    Fields already present in the seed entry are kept; only ``_updated_at``
	    is always overwritten with *stamp*. The input mapping is not modified
	    (shallow copy).
	    """
	    tagged_entry = dict(entry)
	    tagged_entry["_id"] = tagged_entry.get("_id") or str(uuid.uuid4())
	    tagged_entry.setdefault("_annotator", "seed_data")
	    tagged_entry.setdefault("_created_at", stamp)
	    tagged_entry["_updated_at"] = stamp
	    tagged_entry.setdefault("_review_status", "draft")
	    tagged_entry.setdefault("_needs_human_review", True)
	    tagged_entry.setdefault("_llm_test_results", [])
	    return tagged_entry


	st.subheader("📦 Seed Data (Initial Group Work)")

	# st.rerun() stops the current run straight away, so a message drawn just
	# before it is wiped before anyone can read it. The confirmation is therefore
	# parked in session state by the import below and shown here on the next run.
	seed_notice = st.session_state.pop(SEED_NOTICE_KEY, None)
	if seed_notice:
	    st.success(seed_notice)

	seed = load_seed_data()
	if seed:
	    st.write(
	        f"{len(seed)} entries are bundled as seed data "
	        f"(20 legal-domain scenarios generated and reviewed by the group)."
	    )
	    if not annotations:
	        st.info("The shared repo is empty. Import the seed data to get started.")

	    col_imp, col_info = st.columns([1, 3])
	    with col_imp:
	        if st.button("🌱 Import Seed Data", use_container_width=True, type="primary"):
	            if not annotator:
	                st.error("Enter your name in the sidebar first.")
	            elif annotations:
	                st.warning(
	                    "Shared repo already has entries. "
	                    "Clear all annotations on the 👥 Annotations page first if you want to re-import."
	                )
	            else:
	                # One timestamp for the whole batch, so "_created_at" and
	                # "_updated_at" agree and every imported entry shares it.
	                imported_at = datetime.now().isoformat()
	                tagged = [_tag_seed_entry(entry, imported_at) for entry in seed]
	                if save_annotations(tagged):
	                    st.session_state[SEED_NOTICE_KEY] = (
	                        f"✅ Imported {len(tagged)} seed entries into the shared repo!"
	                    )
	                    st.rerun()
	                else:
	                    # A falsy result is treated as "not saved" (presumably how
	                    # save_annotations signals failure — confirm in
	                    # utils.dataset_io); say so instead of doing nothing.
	                    st.error(
	                        "Could not save the seed entries to the shared repo. "
	                        "Please try again."
	                    )
	    with col_info:
	        st.caption(
	            "Seed data was generated with Gemma-4-E2B running locally via LM Studio, "
	            "then reviewed and partially rewritten by your groupmate."
	        )
	else:
	    st.warning("seed_data/draft_distractors.json not found.")

	st.divider()

	# ── Configuration status ───────────────────────────────────────────────────
	# Reports whether the two settings the app relies on are available, looking
	# in the process environment first and in Streamlit secrets second.
	def _read_setting(name: str):
	    """Return setting *name* from the environment or Streamlit secrets.

	    Returns ``None`` when the setting is found in neither place, including
	    when Streamlit secrets cannot be read at all.
	    """
	    from_env = os.environ.get(name)
	    if from_env:
	        return from_env
	    try:
	        return st.secrets.get(name) if hasattr(st, "secrets") else None
	    except Exception:
	        # Reading st.secrets raises when no secrets file exists (the exact
	        # exception type differs between Streamlit versions). This panel is
	        # purely diagnostic, so treat that as "not configured" rather than
	        # letting the whole page crash.
	        return None


	st.subheader("⚙️ Configuration")
	hf_token = _read_setting("HF_TOKEN")
	ann_repo = _read_setting("ANNOTATIONS_REPO_ID")

	cfg1, cfg2 = st.columns(2)
	with cfg1:
	    if hf_token:
	        st.success("✅ `HF_TOKEN` is set")
	    else:
	        st.error("❌ `HF_TOKEN` not set — add it to your Space secrets")
	with cfg2:
	    if ann_repo:
	        st.success(f"✅ Annotations repo: `{ann_repo}`")
	    else:
	        st.error("❌ `ANNOTATIONS_REPO_ID` not set — add it to your Space secrets")