# Author: Gintarė Zokaitytė
# Commit: Cache validation logic fix
# Revision: c4ef01c
import re
import os
import pickle
from pathlib import Path
from concurrent.futures import ThreadPoolExecutor
import streamlit as st
import pandas as pd
import plotly.graph_objects as go
import requests
# Overall word-count goal for the whole corpus.
GOAL_WORDS = 2_200_000
# Per-category word-count goal (half of the overall goal).
CATEGORY_GOAL = 1_100_000
# Label Studio project IDs belonging to our team; all other projects are "Others".
OUR_TEAM_PROJECT_IDS = {29, 30, 31, 32, 33, 37}
# States counted as fully annotated ("ready") work.
ANNOTATED_STATES = ["Acceptable", "No Rating"]
# States that count toward the word-count goal (ready + fixable-with-entities).
GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
# Fixed chart colors keyed by anonymized member label (as produced by anonymize()).
TEAM_COLORS = {
    "A.K. (22)": "#0066cc",
    "J.Š. (23)": "#00cccc",
    "J.Š. (24)": "#00cc00",
    "G.Z. (25)": "#ff9900",
    "L.M. (26)": "#9933ff",
    "M.M. (27)": "#cc0000",
}
# Cache file location (persists between runs)
CACHE_FILE = Path(".cache.pkl")
st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
def fetch_project_data(proj, url, headers, page_size=100):
    """Fetch all tasks of one Label Studio project (runs in a worker thread).

    Args:
        proj: Project dict from the /api/projects listing (requires "id";
            "title" and "task_number" are optional).
        url: Base Label Studio URL, without a trailing slash.
        headers: Request headers, e.g. {"Authorization": "Token ..."}.
        page_size: Tasks requested per page; also used to detect the last
            page when the API returns a bare list.

    Returns:
        Tuple ``(project_id, total_task_count, submitted_task_count, rows)``
        where ``rows`` is a list of flat dicts ready for a DataFrame.
    """
    pid = proj["id"]
    name = proj.get("title", f"Project {pid}")
    task_count = proj.get("task_number", 0)
    group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
    rows = []
    submitted_count = 0  # tasks with at least one non-cancelled annotation
    page = 1
    while True:
        resp = requests.get(
            f"{url}/api/projects/{pid}/tasks",
            headers=headers,
            params={"page": page, "page_size": page_size},
            timeout=30,
        )
        resp.raise_for_status()
        data = resp.json()
        # The endpoint may return a bare list or a {"tasks": [...], "next": ...} dict.
        tasks = data if isinstance(data, list) else data.get("tasks", [])
        if not tasks:
            break
        for task in tasks:
            task_data = task.get("data", {})
            # Prefer an explicit word count; fall back to whitespace tokenization.
            words = task_data.get("words") or len(task_data.get("text", "").split())
            category = task_data.get("category")
            annots = [a for a in task.get("annotations", []) if not a.get("was_cancelled")]
            if not annots:
                rows.append(
                    {
                        "project_id": pid,
                        "project": name,
                        "project_group": group,
                        "date": None,
                        "state": "Not Annotated",
                        "words": int(words),
                        "category": category,
                    }
                )
                continue
            # Task has annotations - count as submitted
            submitted_count += 1
            ann = annots[0]  # only the first active annotation is inspected
            date = ann.get("created_at", "")[:10] or None
            # Extract the "text_rating" choice, if the annotator set one.
            rating = None
            for item in ann.get("result", []):
                if item.get("type") == "choices" and item.get("from_name") == "text_rating":
                    rating = item.get("value", {}).get("choices", [None])[0]
                    break
            has_entities = any(i.get("type") == "labels" for i in ann.get("result", []))
            if rating is None:
                state = "No Rating"
            elif rating == "Requires Attention":
                state = f"ReqAttn ({'entities' if has_entities else 'empty'})"
            elif rating == "Unacceptable":
                state = f"Unacceptable ({'entities' if has_entities else 'empty'})"
            else:
                state = "Acceptable"
            rows.append(
                {"project_id": pid, "project": name, "project_group": group, "date": date, "state": state, "words": int(words), "category": category}
            )
        # Stop at the last page: a short list page, or a dict response with no "next" link.
        if isinstance(data, list) and len(data) < page_size:
            break
        if isinstance(data, dict) and not data.get("next"):
            break
        page += 1
    return pid, task_count, submitted_count, rows
@st.cache_data(ttl=300)
def load_data(projects_hash):
    """Load annotation data from Label Studio with a per-project disk cache.

    Args:
        projects_hash: Hash of project states (task / submission counts),
            passed in solely so Streamlit's cache is invalidated whenever
            anything changes server-side.

    Returns:
        DataFrame with one row per task plus helper boolean columns
        ``is_annotated`` and ``is_goal_state``.
    """
    # Credentials: Streamlit secrets first, environment variables as fallback.
    try:
        url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
        key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
    except (KeyError, FileNotFoundError, AttributeError):
        url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
        key = os.getenv("LABEL_STUDIO_API_KEY", "")
    if not url or not key:
        st.error("Missing credentials. Set LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY.")
        st.stop()
    headers = {"Authorization": f"Token {key}"}
    # Fetch the project listing.
    resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
    resp.raise_for_status()
    projects = resp.json().get("results", [])
    # Load the on-disk cache. NOTE(review): pickle is acceptable only because
    # this file is produced locally by this app; never load untrusted pickles.
    cache = {}
    if CACHE_FILE.exists():
        try:
            with open(CACHE_FILE, "rb") as f:
                cache = pickle.load(f)
        except Exception:
            cache = {}  # corrupt or unreadable cache: refetch everything
    # Decide per project whether the cached rows are still valid. The cache is
    # reused only when BOTH the total task count and the submitted (annotated)
    # task count are unchanged; otherwise the project is refetched.
    projects_to_fetch = []
    all_rows = []
    for proj in projects:
        pid = proj["id"]
        task_count = proj.get("task_number", 0)
        # Submitted task count as reported by the Label Studio API.
        api_submitted_count = proj.get("num_tasks_with_annotations", 0)
        cached = cache.get(f"project_{pid}")
        use_cache = (
            cached is not None
            and cached.get("task_count") == task_count
            and cached.get("submitted_count") == api_submitted_count
        )
        if use_cache:
            all_rows.extend(cached["rows"])
        else:
            projects_to_fetch.append(proj)
    # Fetch stale/new projects in parallel.
    if projects_to_fetch:
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(fetch_project_data, proj, url, headers) for proj in projects_to_fetch]
            progress = st.progress(0, text=f"Loading {len(projects_to_fetch)} projects...")
            for i, future in enumerate(futures):
                pid, task_count, submitted_count, rows = future.result()
                all_rows.extend(rows)
                cache[f"project_{pid}"] = {"task_count": task_count, "submitted_count": submitted_count, "rows": rows}
                progress.progress((i + 1) / len(futures), text=f"Loaded {i + 1}/{len(futures)} projects")
            progress.empty()
    # Persist the cache; failure is non-fatal (read-only FS etc.).
    try:
        with open(CACHE_FILE, "wb") as f:
            pickle.dump(cache, f)
    except Exception:
        pass
    # Build the DataFrame. Guard the no-data case: pd.DataFrame([]) has no
    # columns, so the column accesses below would raise KeyError.
    df = pd.DataFrame(all_rows)
    if df.empty:
        df = pd.DataFrame(columns=["project_id", "project", "project_group", "date", "state", "words", "category"])
    df["words"] = df["words"].astype(int)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["is_annotated"] = df["state"].isin(ANNOTATED_STATES)
    df["is_goal_state"] = df["state"].isin(GOAL_STATES)
    return df
def get_projects_hash():
    """Return an MD5 digest summarizing every project's annotation state.

    The digest changes whenever any project's id, total task count, or
    submitted-task count changes, so passing it to ``load_data`` invalidates
    Streamlit's cache exactly when the server-side data moved.

    Returns:
        Hex digest string, or ``"no-credentials"`` when the Label Studio
        URL/API key are not configured.
    """
    import hashlib  # local import mirrors the original; module top level stays unchanged
    try:
        url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
        key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
    except (KeyError, FileNotFoundError, AttributeError):
        url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
        key = os.getenv("LABEL_STUDIO_API_KEY", "")
    if not url or not key:
        return "no-credentials"
    headers = {"Authorization": f"Token {key}"}
    resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
    resp.raise_for_status()
    projects = resp.json().get("results", [])
    # Join once instead of repeated += (avoids quadratic string building).
    # MD5 is fine here: the hash is a cache key, not a security boundary.
    state_string = "".join(
        f"{p['id']}:{p.get('task_number', 0)}:{p.get('num_tasks_with_annotations', 0)};"
        for p in projects
    )
    return hashlib.md5(state_string.encode()).hexdigest()
def anonymize(name):
    """Turn a project title like '26 [Name Lastname]' into initials: 'N.L. (26)'.

    'Others' passes through unchanged, as does any title that does not match
    the expected pattern or whose bracketed part has fewer than two words.
    """
    if name == "Others":
        return "Others"
    m = re.match(r"(\d+)\s+\[(.+?)\]", name)
    if m is None:
        return name
    num, full_name = m.group(1), m.group(2)
    words = full_name.split()
    if len(words) < 2:
        return name
    initials = f"{words[0][0]}.{words[-1][0]}."
    return f"{initials} ({num})"
st.title("📊 Annotation Progress Dashboard")
st.markdown("---")
# Load data. get_projects_hash() busts load_data's Streamlit cache whenever
# any project's task/submission counts changed server-side.
with st.spinner("Loading..."):
    projects_hash = get_projects_hash()
    df = load_data(projects_hash)
# Overview metrics: words in goal states vs. the 2.2M corpus target.
total = df[df["is_goal_state"]]["words"].sum()
remaining = GOAL_WORDS - total
progress = total / GOAL_WORDS * 100
col1, col2 = st.columns(2)
col1.metric("Progress toward 2.2M", f"{total:,}", f"{progress:.1f}%")
col2.metric("Remaining", f"{remaining:,}", f"{100 - progress:.1f}%")
st.markdown("---")
# Tabs
tab1, tab2 = st.tabs(["📊 Weekly Stats", "⏱️ Pacing"])
# ============== TAB 1: Weekly Stats ==============
with tab1:
    st.caption("Goal states (Acceptable + No Rating + ReqAttn with entities)")
    # Weeks before this date are collapsed into a single "Before ..." summary row.
    cutoff_date = pd.Timestamp("2025-12-22")
    # Filter data - use GOAL_STATES to match progress metrics
    df_week = df[df["is_goal_state"] & df["date"].notna()].copy()
    # Monday of each row's week (pandas dayofweek: Monday == 0).
    df_week["week_start"] = df_week["date"] - pd.to_timedelta(df_week["date"].dt.dayofweek, unit="d")
    # Our-team projects get anonymized initials; everything else pools into "Others".
    df_week["member"] = df_week.apply(lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others", axis=1)
    # Weekly word totals per member (rows: week start, columns: member).
    weekly_all = df_week.pivot_table(index="week_start", columns="member", values="words", aggfunc="sum", fill_value=0).astype(int)
    # Split into before and after cutoff
    weekly_before = weekly_all[weekly_all.index < cutoff_date]
    weekly_after = weekly_all[weekly_all.index >= cutoff_date]
    # Ensure consistent columns across both halves (fill missing members with 0).
    all_members = set(weekly_all.columns)
    if "Others" not in all_members:
        all_members.add("Others")
    for member in all_members:
        if member not in weekly_after.columns:
            weekly_after[member] = 0
        if member not in weekly_before.columns:
            weekly_before[member] = 0
    # Sort columns by total contribution (largest contributor first).
    totals = weekly_all.sum().sort_values(ascending=False)
    weekly_after = weekly_after[totals.index]
    weekly_after["Total"] = weekly_after.sum(axis=1)
    # Calculate "Before" summary row
    before_totals = weekly_before[totals.index].sum()
    before_totals["Total"] = before_totals.sum()
    # Format weekly data for display: "YYYY-MM-DD - YYYY-MM-DD" week labels.
    display = weekly_after.reset_index()
    display["Week"] = display["week_start"].dt.strftime("%Y-%m-%d") + " - " + (display["week_start"] + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d")
    display = display.drop("week_start", axis=1)
    display = display[["Week"] + list(totals.index) + ["Total"]]
    # Add "Before" row at the beginning
    before_row = pd.DataFrame([{"Week": f"Before {cutoff_date.strftime('%Y-%m-%d')}", **before_totals}])
    display = pd.concat([before_row, display], ignore_index=True)
    # Add TOTAL row at the end
    all_totals = weekly_all[totals.index].sum()
    all_totals["Total"] = all_totals.sum()
    total_row = pd.DataFrame([{"Week": "TOTAL", **all_totals}])
    display = pd.concat([display, total_row], ignore_index=True)
    # Format numbers with thousands separators (blank when missing).
    for col in display.columns:
        if col != "Week":
            display[col] = display[col].apply(lambda x: f"{int(x):,}" if pd.notna(x) else "")
    # Row styling: bold the TOTAL row, italicize the "Before" summary row.
    def style_row(row):
        if row["Week"] == "TOTAL":
            return ["font-weight: bold; background-color: #f0f0f0;"] * len(row)
        elif row["Week"].startswith("Before"):
            return ["font-style: italic; background-color: #f9f9f9;"] * len(row)
        return [""] * len(row)
    styled = display.style.apply(style_row, axis=1).set_properties(subset=["Total"], **{"font-weight": "bold"})
    st.dataframe(styled, hide_index=True, use_container_width=True)
# ============== TAB 2: Pacing ==============
with tab2:
    st.subheader("Category Breakdown")
    st.caption("Requirement: 1.1M words from each category")
    # Split by status: Ready vs Needs Fixing
    df_ready = df[df["is_annotated"]]  # Acceptable + No Rating
    df_needs_fixing = df[df["state"] == "ReqAttn (entities)"]
    df_total = df[df["is_goal_state"]]  # NOTE(review): unused below — candidate for removal
    # Word totals per category and status.
    mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
    mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
    mok_total = mok_ready + mok_fixing
    zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
    zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
    zin_total = zin_ready + zin_fixing
    total_ready = mok_ready + zin_ready
    total_fixing = mok_fixing + zin_fixing
    total_all = total_ready + total_fixing
    # Summary table: per-category progress against CATEGORY_GOAL, overall against GOAL_WORDS.
    cat_df = pd.DataFrame(
        {
            "Category": ["mokslinis", "ziniasklaida", "TOTAL"],
            "Ready": [f"{mok_ready:,}", f"{zin_ready:,}", f"{total_ready:,}"],
            "Needs Fixing": [f"{mok_fixing:,}", f"{zin_fixing:,}", f"{total_fixing:,}"],
            "Total": [f"{mok_total:,}", f"{zin_total:,}", f"{total_all:,}"],
            "Goal": [f"{CATEGORY_GOAL:,}", f"{CATEGORY_GOAL:,}", f"{GOAL_WORDS:,}"],
            "Progress": [
                f"{mok_total / CATEGORY_GOAL * 100:.1f}%",
                f"{zin_total / CATEGORY_GOAL * 100:.1f}%",
                f"{total_all / GOAL_WORDS * 100:.1f}%",
            ],
        }
    )
    st.dataframe(cat_df, hide_index=True, use_container_width=True)
    st.markdown("---")
    st.header("Cumulative Progress & Projection")
    # Cumulative word counts per member over time (goal states with a date only).
    df_cum = df[df["is_goal_state"] & df["date"].notna()].copy()
    df_cum["member"] = df_cum.apply(lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others", axis=1)
    daily = df_cum.groupby(["date", "member"])["words"].sum().reset_index()
    pivot = daily.pivot_table(index="date", columns="member", values="words", fill_value=0)
    cumulative = pivot.sort_index().cumsum()
    cumulative["Total"] = cumulative.sum(axis=1)
    # Chart window starts just before the project kickoff.
    cumulative = cumulative[cumulative.index >= pd.Timestamp("2025-12-18")]
    # Projection calculation.
    # NOTE(review): assumes `cumulative` is non-empty after the date filter —
    # the index access below raises IndexError otherwise; confirm upstream data.
    last_date = cumulative.index[-1]
    current = cumulative["Total"].iloc[-1]
    # Calculate the daily rate from the last 14 days of data.
    lookback = cumulative[cumulative.index >= last_date - pd.Timedelta(days=14)]
    if len(lookback) >= 2:
        days = (last_date - lookback.index[0]).days or 1  # avoid division by zero
        rate = (current - lookback["Total"].iloc[0]) / days
        days_left = (GOAL_WORDS - current) / rate if rate > 0 else 0
        completion = last_date + pd.Timedelta(days=days_left)
        weekly_rate = rate * 7
    else:
        rate = completion = weekly_rate = None  # not enough data to project
    # Chart
    fig = go.Figure()
    # Goal lines: midpoint and final target.
    fig.add_hline(y=1_100_000, line_dash="dot", line_color="orange", annotation_text="Midpoint: 1.1M", annotation_position="top left")
    fig.add_hline(y=GOAL_WORDS, line_dash="dot", line_color="red", annotation_text="Goal: 2.2M", annotation_position="top left")
    # One line per team member, ordered by current cumulative total (descending).
    members = [c for c in cumulative.columns if c not in ["Total", "Others"]]
    members = sorted(members, key=lambda x: cumulative[x].iloc[-1], reverse=True)
    if "Others" in cumulative.columns:
        fig.add_trace(
            go.Scatter(
                x=cumulative.index,
                y=cumulative["Others"],
                name=f"Others: {cumulative['Others'].iloc[-1]:,.0f}",
                mode="lines",
                line=dict(width=2, color="#7f8c8d"),
            )
        )
    for m in members:
        color = TEAM_COLORS.get(m, "#34495e")
        fig.add_trace(
            go.Scatter(x=cumulative.index, y=cumulative[m], name=f"{m}: {cumulative[m].iloc[-1]:,.0f}", mode="lines", line=dict(width=2, color=color))
        )
    # Total line with a soft fill underneath.
    fig.add_trace(
        go.Scatter(
            x=cumulative.index,
            y=cumulative["Total"],
            name=f"Total: {cumulative['Total'].iloc[-1]:,.0f}",
            mode="lines",
            line=dict(width=3, color="#d4af37"),
            fill="tozeroy",
            fillcolor="rgba(212, 175, 55, 0.1)",
        )
    )
    # Projection: dotted extrapolation at the current rate plus a diamond at the goal.
    if completion:
        proj_dates = pd.date_range(last_date, completion, freq="D")
        proj_vals = current + rate * (proj_dates - last_date).days
        fig.add_trace(
            go.Scatter(
                x=proj_dates, y=proj_vals, name=f"Projection ({int(weekly_rate):,}/wk)", mode="lines", line=dict(width=3, color="#d4af37", dash="dot")
            )
        )
        fig.add_trace(
            go.Scatter(
                x=[completion],
                y=[GOAL_WORDS],
                mode="markers+text",
                marker=dict(size=14, color="#d4af37", symbol="diamond"),
                text=[completion.strftime("%b %d")],
                textposition="top center",
                showlegend=False,
            )
        )
        title = f"Cumulative Progress → Est. {completion.strftime('%B %d, %Y')}"
    else:
        title = "Cumulative Progress"
    fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words", height=600, hovermode="x unified", template="plotly_white")
    fig.update_yaxes(tickformat=".2s")
    st.plotly_chart(fig, use_container_width=True)
    # Pacing metrics: only shown when a projection could be computed.
    if completion:
        st.markdown("### Pacing Estimates")
        c1, c2, c3 = st.columns(3)
        c1.metric("Per Week Rate", f"{int(weekly_rate):,} words")
        c2.metric("Weeks Remaining", f"{days_left / 7:.1f} weeks")
        c3.metric("Est. Completion", completion.strftime("%Y-%m-%d"))
# Footer: local (Vilnius) timestamp of this render.
st.markdown("---")
st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 5 min | Press 'R' to refresh")