Gintarė Zokaitytė committed
Commit 2806d4d · 1 Parent(s): c4ef01c

Per-annotator changes, cache update

Files changed (1)
  1. app.py +478 -236
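The cache update in this commit amounts to an incremental merge: rows cached per project are re-keyed by task_id and overwritten by rows fetched for tasks that changed since the cached last_updated timestamp. A minimal, self-contained sketch of that merge idea (illustrative names, not app.py's actual helpers):

```python
# Illustrative sketch (not part of app.py): merge cached task rows with freshly
# fetched rows, keyed by task_id, so unchanged tasks keep their cached data.
def merge_rows(cached_rows, updated_rows):
    by_id = {row["task_id"]: row for row in cached_rows}
    for row in updated_rows:  # newer data wins
        by_id[row["task_id"]] = row
    return list(by_id.values())

# Example: task 1 is refreshed, task 3 is newly annotated, task 2 stays cached.
cached = [{"task_id": 1, "state": "Not Annotated"}, {"task_id": 2, "state": "Acceptable"}]
updated = [{"task_id": 1, "state": "Acceptable"}, {"task_id": 3, "state": "No Rating"}]
print(merge_rows(cached, updated))
```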
app.py CHANGED
@@ -15,31 +15,106 @@ OUR_TEAM_PROJECT_IDS = {29, 30, 31, 32, 33, 37}
15
  ANNOTATED_STATES = ["Acceptable", "No Rating"]
16
  GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
17
 
18
  TEAM_COLORS = {
19
- "A.K. (22)": "#0066cc",
20
- "J.Š. (23)": "#00cccc",
21
- "J.Š. (24)": "#00cc00",
22
- "G.Z. (25)": "#ff9900",
23
- "L.M. (26)": "#9933ff",
24
- "M.M. (27)": "#cc0000",
25
  }
26
 
 
 
 
27
  # Cache file location (persists between runs)
28
  CACHE_FILE = Path(".cache.pkl")
29
 
30
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
31
 
32
 
33
- def fetch_project_data(proj, url, headers):
34
- """Fetch data from one project (for parallel execution)."""
35
  pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
36
  group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
37
 
38
  rows = []
39
  submitted_count = 0 # Track submitted (annotated) tasks
 
40
  page = 1
41
  while True:
42
- resp = requests.get(f"{url}/api/projects/{pid}/tasks", headers=headers, params={"page": page, "page_size": 100}, timeout=30)
 
43
  resp.raise_for_status()
44
  data = resp.json()
45
  tasks = data if isinstance(data, list) else data.get("tasks", [])
@@ -48,6 +123,11 @@ def fetch_project_data(proj, url, headers):
48
  break
49
 
50
  for task in tasks:
51
  task_data = task.get("data", {})
52
  words = task_data.get("words") or len(task_data.get("text", "").split())
53
  category = task_data.get("category")
@@ -56,9 +136,12 @@ def fetch_project_data(proj, url, headers):
56
  if not annots:
57
  rows.append(
58
  {
 
59
  "project_id": pid,
60
  "project": name,
61
  "project_group": group,
 
 
62
  "date": None,
63
  "state": "Not Annotated",
64
  "words": int(words),
@@ -73,6 +156,37 @@ def fetch_project_data(proj, url, headers):
73
  ann = annots[0]
74
  date = ann.get("created_at", "")[:10] or None
75
 
76
  rating = None
77
  for item in ann.get("result", []):
78
  if item.get("type") == "choices" and item.get("from_name") == "text_rating":
@@ -90,7 +204,18 @@ def fetch_project_data(proj, url, headers):
90
  state = "Acceptable"
91
 
92
  rows.append(
93
- {"project_id": pid, "project": name, "project_group": group, "date": date, "state": state, "words": int(words), "category": category}
94
  )
95
 
96
  if isinstance(data, list) and len(data) < 100:
@@ -99,10 +224,10 @@ def fetch_project_data(proj, url, headers):
99
  break
100
  page += 1
101
 
102
- return pid, task_count, submitted_count, rows
103
 
104
 
105
- @st.cache_data(ttl=300)
106
  def load_data(projects_hash):
107
  """Load annotation data from Label Studio with disk cache.
108
 
@@ -122,6 +247,9 @@ def load_data(projects_hash):
122
 
123
  headers = {"Authorization": f"Token {key}"}
124
 
 
 
 
125
  # Fetch all projects
126
  resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
127
  resp.raise_for_status()
@@ -138,6 +266,7 @@ def load_data(projects_hash):
138
 
139
  # Check which projects need updating
140
  projects_to_fetch = []
 
141
  all_rows = []
142
 
143
  for proj in projects:
@@ -148,34 +277,90 @@ def load_data(projects_hash):
148
 
149
  cache_key = f"project_{pid}"
150
 
151
- # Invalidate cache if:
152
- # 1. No cache exists for this project
153
- # 2. Total task count changed (new tasks added/removed)
154
- # 3. Submitted task count changed (new annotations/submissions)
155
- use_cache = False
156
- if cache_key in cache:
157
- cached = cache[cache_key]
158
- # Use cache only if BOTH counts match
159
- if (cached.get("task_count") == task_count and
160
- cached.get("submitted_count") == api_submitted_count):
161
- use_cache = True
162
-
163
- if use_cache:
164
- all_rows.extend(cache[cache_key]["rows"])
165
  else:
166
- projects_to_fetch.append(proj)
167
 
168
  # Fetch updated projects in parallel
169
- if projects_to_fetch:
 
 
170
  with ThreadPoolExecutor(max_workers=10) as executor:
171
- futures = [executor.submit(fetch_project_data, proj, url, headers) for proj in projects_to_fetch]
172
-
173
- progress = st.progress(0, text=f"Loading {len(projects_to_fetch)} projects...")
174
- for i, future in enumerate(futures):
175
- pid, task_count, submitted_count, rows = future.result()
176
- all_rows.extend(rows)
177
- cache[f"project_{pid}"] = {"task_count": task_count, "submitted_count": submitted_count, "rows": rows}
178
- progress.progress((i + 1) / len(futures), text=f"Loaded {i + 1}/{len(futures)} projects")
179
  progress.empty()
180
 
181
  # Save cache
@@ -251,230 +436,287 @@ total = df[df["is_goal_state"]]["words"].sum()
251
  remaining = GOAL_WORDS - total
252
  progress = total / GOAL_WORDS * 100
253
 
254
- col1, col2 = st.columns(2)
255
- col1.metric("Progress toward 2.2M", f"{total:,}", f"{progress:.1f}%")
256
- col2.metric("Remaining", f"{remaining:,}", f"{100 - progress:.1f}%")
257
 
258
- st.markdown("---")
259
 
260
- # Tabs
261
- tab1, tab2 = st.tabs(["📊 Weekly Stats", "⏱️ Pacing"])
262
-
263
- # ============== TAB 1: Weekly Stats ==============
264
- with tab1:
265
- st.caption("Goal states (Acceptable + No Rating + ReqAttn with entities)")
266
-
267
- cutoff_date = pd.Timestamp("2025-12-22")
268
-
269
- # Filter data - use GOAL_STATES to match progress metrics
270
- df_week = df[df["is_goal_state"] & df["date"].notna()].copy()
271
- df_week["week_start"] = df_week["date"] - pd.to_timedelta(df_week["date"].dt.dayofweek, unit="d")
272
- df_week["member"] = df_week.apply(lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others", axis=1)
273
-
274
- # Weekly pivot (all data)
275
- weekly_all = df_week.pivot_table(index="week_start", columns="member", values="words", aggfunc="sum", fill_value=0).astype(int)
276
-
277
- # Split into before and after cutoff
278
- weekly_before = weekly_all[weekly_all.index < cutoff_date]
279
- weekly_after = weekly_all[weekly_all.index >= cutoff_date]
280
-
281
- # Ensure consistent columns
282
- all_members = set(weekly_all.columns)
283
- if "Others" not in all_members:
284
- all_members.add("Others")
285
-
286
- for member in all_members:
287
- if member not in weekly_after.columns:
288
- weekly_after[member] = 0
289
- if member not in weekly_before.columns:
290
- weekly_before[member] = 0
291
-
292
- # Sort columns by total contribution
293
- totals = weekly_all.sum().sort_values(ascending=False)
294
- weekly_after = weekly_after[totals.index]
295
- weekly_after["Total"] = weekly_after.sum(axis=1)
296
-
297
- # Calculate "Before" summary row
298
- before_totals = weekly_before[totals.index].sum()
299
- before_totals["Total"] = before_totals.sum()
300
-
301
- # Format weekly data for display
302
- display = weekly_after.reset_index()
303
- display["Week"] = display["week_start"].dt.strftime("%Y-%m-%d") + " - " + (display["week_start"] + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d")
304
- display = display.drop("week_start", axis=1)
305
- display = display[["Week"] + list(totals.index) + ["Total"]]
306
-
307
- # Add "Before" row at the beginning
308
- before_row = pd.DataFrame([{"Week": f"Before {cutoff_date.strftime('%Y-%m-%d')}", **before_totals}])
309
- display = pd.concat([before_row, display], ignore_index=True)
310
-
311
- # Add TOTAL row at the end
312
- all_totals = weekly_all[totals.index].sum()
313
- all_totals["Total"] = all_totals.sum()
314
- total_row = pd.DataFrame([{"Week": "TOTAL", **all_totals}])
315
- display = pd.concat([display, total_row], ignore_index=True)
316
-
317
- # Format numbers
318
- for col in display.columns:
319
- if col != "Week":
320
- display[col] = display[col].apply(lambda x: f"{int(x):,}" if pd.notna(x) else "")
321
-
322
- # Style and show
323
- def style_row(row):
324
- if row["Week"] == "TOTAL":
325
- return ["font-weight: bold; background-color: #f0f0f0;"] * len(row)
326
- elif row["Week"].startswith("Before"):
327
- return ["font-style: italic; background-color: #f9f9f9;"] * len(row)
328
- return [""] * len(row)
329
-
330
- styled = display.style.apply(style_row, axis=1).set_properties(subset=["Total"], **{"font-weight": "bold"})
331
- st.dataframe(styled, hide_index=True, use_container_width=True)
332
-
333
- # ============== TAB 2: Pacing ==============
334
- with tab2:
335
- st.subheader("Category Breakdown")
336
- st.caption("Requirement: 1.1M words from each category")
337
-
338
- # Split by status: Ready vs Needs Fixing
339
- df_ready = df[df["is_annotated"]] # Acceptable + No Rating
340
- df_needs_fixing = df[df["state"] == "ReqAttn (entities)"]
341
- df_total = df[df["is_goal_state"]]
342
-
343
- # Calculate by category
344
- mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
345
- mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
346
- mok_total = mok_ready + mok_fixing
347
-
348
- zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
349
- zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
350
- zin_total = zin_ready + zin_fixing
351
-
352
- total_ready = mok_ready + zin_ready
353
- total_fixing = mok_fixing + zin_fixing
354
- total_all = total_ready + total_fixing
355
-
356
- cat_df = pd.DataFrame(
357
- {
358
- "Category": ["mokslinis", "ziniasklaida", "TOTAL"],
359
- "Ready": [f"{mok_ready:,}", f"{zin_ready:,}", f"{total_ready:,}"],
360
- "Needs Fixing": [f"{mok_fixing:,}", f"{zin_fixing:,}", f"{total_fixing:,}"],
361
- "Total": [f"{mok_total:,}", f"{zin_total:,}", f"{total_all:,}"],
362
- "Goal": [f"{CATEGORY_GOAL:,}", f"{CATEGORY_GOAL:,}", f"{GOAL_WORDS:,}"],
363
- "Progress": [
364
- f"{mok_total / CATEGORY_GOAL * 100:.1f}%",
365
- f"{zin_total / CATEGORY_GOAL * 100:.1f}%",
366
- f"{total_all / GOAL_WORDS * 100:.1f}%",
367
- ],
368
- }
369
- )
370
- st.dataframe(cat_df, hide_index=True, use_container_width=True)
371
-
372
- st.markdown("---")
373
- st.header("Cumulative Progress & Projection")
374
-
375
- # Cumulative data
376
- df_cum = df[df["is_goal_state"] & df["date"].notna()].copy()
377
- df_cum["member"] = df_cum.apply(lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others", axis=1)
378
-
379
- daily = df_cum.groupby(["date", "member"])["words"].sum().reset_index()
380
- pivot = daily.pivot_table(index="date", columns="member", values="words", fill_value=0)
381
- cumulative = pivot.sort_index().cumsum()
382
- cumulative["Total"] = cumulative.sum(axis=1)
383
- cumulative = cumulative[cumulative.index >= pd.Timestamp("2025-12-18")]
384
-
385
- # Projection calculation
386
- last_date = cumulative.index[-1]
387
- current = cumulative["Total"].iloc[-1]
388
 
389
  # Calculate rate from last 14 days
390
- lookback = cumulative[cumulative.index >= last_date - pd.Timedelta(days=14)]
391
  if len(lookback) >= 2:
392
  days = (last_date - lookback.index[0]).days or 1
393
- rate = (current - lookback["Total"].iloc[0]) / days
394
- days_left = (GOAL_WORDS - current) / rate if rate > 0 else 0
395
  completion = last_date + pd.Timedelta(days=days_left)
396
  weekly_rate = rate * 7
397
  else:
398
  rate = completion = weekly_rate = None
399
 
400
- # Chart
401
- fig = go.Figure()
402
-
403
- # Goal lines
404
- fig.add_hline(y=1_100_000, line_dash="dot", line_color="orange", annotation_text="Midpoint: 1.1M", annotation_position="top left")
405
- fig.add_hline(y=GOAL_WORDS, line_dash="dot", line_color="red", annotation_text="Goal: 2.2M", annotation_position="top left")
406
-
407
- # Members
408
- members = [c for c in cumulative.columns if c not in ["Total", "Others"]]
409
- members = sorted(members, key=lambda x: cumulative[x].iloc[-1], reverse=True)
410
-
411
- if "Others" in cumulative.columns:
412
- fig.add_trace(
413
- go.Scatter(
414
- x=cumulative.index,
415
- y=cumulative["Others"],
416
- name=f"Others: {cumulative['Others'].iloc[-1]:,.0f}",
417
- mode="lines",
418
- line=dict(width=2, color="#7f8c8d"),
419
- )
420
- )
421
 
422
- for m in members:
423
- color = TEAM_COLORS.get(m, "#34495e")
424
- fig.add_trace(
425
- go.Scatter(x=cumulative.index, y=cumulative[m], name=f"{m}: {cumulative[m].iloc[-1]:,.0f}", mode="lines", line=dict(width=2, color=color))
426
- )
427
 
428
- # Total
429
  fig.add_trace(
430
  go.Scatter(
431
  x=cumulative.index,
432
- y=cumulative["Total"],
433
- name=f"Total: {cumulative['Total'].iloc[-1]:,.0f}",
434
  mode="lines",
435
- line=dict(width=3, color="#d4af37"),
436
- fill="tozeroy",
437
- fillcolor="rgba(212, 175, 55, 0.1)",
438
  )
439
  )
440
 
441
- # Projection
442
- if completion:
443
- proj_dates = pd.date_range(last_date, completion, freq="D")
444
- proj_vals = current + rate * (proj_dates - last_date).days
445
- fig.add_trace(
446
- go.Scatter(
447
- x=proj_dates, y=proj_vals, name=f"Projection ({int(weekly_rate):,}/wk)", mode="lines", line=dict(width=3, color="#d4af37", dash="dot")
448
- )
449
  )
450
- fig.add_trace(
451
- go.Scatter(
452
- x=[completion],
453
- y=[GOAL_WORDS],
454
- mode="markers+text",
455
- marker=dict(size=14, color="#d4af37", symbol="diamond"),
456
- text=[completion.strftime("%b %d")],
457
- textposition="top center",
458
- showlegend=False,
459
- )
460
  )
461
- title = f"Cumulative Progress → Est. {completion.strftime('%B %d, %Y')}"
462
- else:
463
- title = "Cumulative Progress"
464
-
465
- fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words", height=600, hovermode="x unified", template="plotly_white")
466
- fig.update_yaxes(tickformat=".2s")
467
 
468
- st.plotly_chart(fig, use_container_width=True)
 
469
 
470
- # Metrics
471
- if completion:
472
- st.markdown("### Pacing Estimates")
473
- c1, c2, c3 = st.columns(3)
474
- c1.metric("Per Week Rate", f"{int(weekly_rate):,} words")
475
- c2.metric("Weeks Remaining", f"{days_left / 7:.1f} weeks")
476
- c3.metric("Est. Completion", completion.strftime("%Y-%m-%d"))
477
 
478
  # Footer
479
  st.markdown("---")
480
- st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 5 min | Press 'R' to refresh")
 
15
  ANNOTATED_STATES = ["Acceptable", "No Rating"]
16
  GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
17
 
18
+ # Map project IDs to annotator IDs (for admin-created annotations)
19
+ PROJECT_ANNOTATOR_MAP = {
20
+ 29: 27,
21
+ 30: 28,
22
+ 31: 29,
23
+ 32: 30,
24
+ 33: 31,
25
+ 37: 33,
26
+ }
27
+
28
+ ANNOTATOR_NAMES = {
29
+ 1: "Admin",
30
+ 27: "A.K.",
31
+ 28: "Jo.Š.",
32
+ 29: "Ju.Š.",
33
+ 30: "G.Z.",
34
+ 31: "L.M.",
35
+ 33: "M.M.",
36
+ }
37
+
38
  TEAM_COLORS = {
39
+ 27: "#0066cc", # A.K.
40
+ 28: "#00cccc", # Jo.Š.
41
+ 29: "#00cc00", # Ju.Š.
42
+ 30: "#ff9900", # G.Z.
43
+ 31: "#9933ff", # L.M.
44
+ 33: "#cc0000", # M.M.
45
  }
46
 
47
+ # Helper: map annotator names to colors (derived from TEAM_COLORS and ANNOTATOR_NAMES)
48
+ COLORS_BY_NAME = {ANNOTATOR_NAMES[aid]: color for aid, color in TEAM_COLORS.items() if aid in ANNOTATOR_NAMES}
49
+
50
  # Cache file location (persists between runs)
51
  CACHE_FILE = Path(".cache.pkl")
52
 
53
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
54
 
55
 
56
+ @st.cache_data(ttl=3600) # Cache users for 1 hour (users rarely change)
57
+ def fetch_users(url, key):
58
+ """Fetch all users and create a mapping of user_id -> user_name."""
59
+ try:
60
+ headers = {"Authorization": f"Token {key}"}
61
+ resp = requests.get(f"{url}/api/users", headers=headers, timeout=30)
62
+ resp.raise_for_status()
63
+ users = resp.json()
64
+
65
+ user_map = {}
66
+ for user in users:
67
+ user_id = user.get("id")
68
+ first_name = user.get("first_name", "")
69
+ email = user.get("email", "")
70
+ name = first_name or email or f"User {user_id}"
71
+ user_map[user_id] = name
72
+
73
+ return user_map
74
+ except Exception:
75
+ # If we can't fetch users, return empty map
76
+ return {}
77
+
78
+
79
+ def fetch_project_data(proj, url, headers, user_map, since_date=None):
80
+ """Fetch data from one project (for parallel execution).
81
+
82
+ Args:
83
+ proj: Project dict from API
84
+ url: Label Studio URL
85
+ headers: Auth headers
86
+ user_map: User ID to name mapping
87
+ since_date: If provided, only fetch tasks updated after this ISO datetime string
88
+ """
89
  pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
90
  group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
91
 
92
  rows = []
93
  submitted_count = 0 # Track submitted (annotated) tasks
94
+ max_updated_at = since_date # Track the latest updated_at we see
95
  page = 1
96
+
97
+ # Build query filter for incremental updates
98
+ params = {"page": page, "page_size": 100}
99
+ if since_date:
100
+ import json
101
+ query = {
102
+ "filters": {
103
+ "conjunction": "and",
104
+ "items": [{
105
+ "filter": "filter:tasks:updated_at",
106
+ "operator": "greater",
107
+ "type": "Datetime",
108
+ "value": since_date
109
+ }]
110
+ }
111
+ }
112
+ params["query"] = json.dumps(query)
113
+ print(f"[DEBUG] Incremental update for project {pid} since {since_date}")
114
+
115
  while True:
116
+ params["page"] = page
117
+ resp = requests.get(f"{url}/api/projects/{pid}/tasks", headers=headers, params=params, timeout=30)
118
  resp.raise_for_status()
119
  data = resp.json()
120
  tasks = data if isinstance(data, list) else data.get("tasks", [])
 
123
  break
124
 
125
  for task in tasks:
126
+ # Track the latest updated_at timestamp
127
+ task_updated = task.get("updated_at")
128
+ if task_updated and (not max_updated_at or task_updated > max_updated_at):
129
+ max_updated_at = task_updated
130
+
131
  task_data = task.get("data", {})
132
  words = task_data.get("words") or len(task_data.get("text", "").split())
133
  category = task_data.get("category")
 
136
  if not annots:
137
  rows.append(
138
  {
139
+ "task_id": task.get("id"), # Add task_id for merging updates
140
  "project_id": pid,
141
  "project": name,
142
  "project_group": group,
143
+ "annotator": None,
144
+ "annotator_email": None,
145
  "date": None,
146
  "state": "Not Annotated",
147
  "words": int(words),
 
156
  ann = annots[0]
157
  date = ann.get("created_at", "")[:10] or None
158
 
159
+ # Extract annotator info
160
+ # completed_by can be either a user ID (int) or a user object (dict)
161
+ completed_by = ann.get("completed_by")
162
+
163
+ if isinstance(completed_by, dict):
164
+ # Full user object
165
+ annotator_id = completed_by.get("id")
166
+ annotator_email = completed_by.get("email", "Unknown")
167
+ elif isinstance(completed_by, int):
168
+ # Just a user ID
169
+ annotator_id = completed_by
170
+ annotator_email = f"user_{completed_by}"
171
+ else:
172
+ # No completed_by info
173
+ annotator_id = None
174
+ annotator_email = "unknown"
175
+
176
+ # Backward compatibility: if admin annotated a team project, use project's default annotator
177
+ if group == "Our Team" and annotator_id == 1 and pid in PROJECT_ANNOTATOR_MAP:
178
+ mapped_id = PROJECT_ANNOTATOR_MAP[pid]
179
+ if mapped_id:
180
+ annotator_id = mapped_id
181
+
182
+ # Get display name from ANNOTATOR_NAMES mapping (or fallback to user_map)
183
+ if annotator_id in ANNOTATOR_NAMES:
184
+ annotator_name = ANNOTATOR_NAMES[annotator_id]
185
+ elif annotator_id in user_map:
186
+ annotator_name = user_map[annotator_id]
187
+ else:
188
+ annotator_name = f"User {annotator_id}" if annotator_id else "Unknown"
189
+
190
  rating = None
191
  for item in ann.get("result", []):
192
  if item.get("type") == "choices" and item.get("from_name") == "text_rating":
 
204
  state = "Acceptable"
205
 
206
  rows.append(
207
+ {
208
+ "task_id": task.get("id"), # Add task_id for merging updates
209
+ "project_id": pid,
210
+ "project": name,
211
+ "project_group": group,
212
+ "annotator": annotator_name,
213
+ "annotator_email": annotator_email,
214
+ "date": date,
215
+ "state": state,
216
+ "words": int(words),
217
+ "category": category,
218
+ }
219
  )
220
 
221
  if isinstance(data, list) and len(data) < 100:
 
224
  break
225
  page += 1
226
 
227
+ return pid, task_count, submitted_count, rows, max_updated_at
228
 
229
 
230
+ @st.cache_data(ttl=120) # Auto-refresh every 120 seconds (2 minutes)
231
  def load_data(projects_hash):
232
  """Load annotation data from Label Studio with disk cache.
233
 
 
247
 
248
  headers = {"Authorization": f"Token {key}"}
249
 
250
+ # Fetch all users first to map user IDs to names (cached for 1 hour)
251
+ user_map = fetch_users(url, key)
252
+
253
  # Fetch all projects
254
  resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
255
  resp.raise_for_status()
 
266
 
267
  # Check which projects need updating
268
  projects_to_fetch = []
269
+ projects_to_update_incrementally = []
270
  all_rows = []
271
 
272
  for proj in projects:
 
277
 
278
  cache_key = f"project_{pid}"
279
 
280
+ # Decide caching strategy:
281
+ # 1. No cache exists → full fetch
282
+ # 2. Task count changed → full fetch (tasks added/removed)
283
+ # 3. Submitted count changed + have last_updated → incremental update
284
+ # 4. Both counts match → use cache
285
+ if cache_key not in cache:
286
+ # No cache - need full fetch
287
+ projects_to_fetch.append((proj, None))
288
  else:
289
+ cached = cache[cache_key]
290
+ if cached.get("task_count") != task_count:
291
+ # Task count changed - full fetch required
292
+ projects_to_fetch.append((proj, None))
293
+ elif cached.get("submitted_count") != api_submitted_count:
294
+ # Annotations changed - try incremental update if we have a timestamp
295
+ last_updated = cached.get("last_updated")
296
+ if last_updated:
297
+ # Incremental update: fetch only changed tasks
298
+ projects_to_update_incrementally.append((proj, last_updated, cached["rows"]))
299
+ else:
300
+ # No timestamp - full fetch
301
+ projects_to_fetch.append((proj, None))
302
+ else:
303
+ # Both counts match - use cache
304
+ all_rows.extend(cached["rows"])
305
 
306
  # Fetch updated projects in parallel
307
+ total_fetches = len(projects_to_fetch) + len(projects_to_update_incrementally)
308
+
309
+ if total_fetches > 0:
310
  with ThreadPoolExecutor(max_workers=10) as executor:
311
+ futures = []
312
+
313
+ # Submit full fetches
314
+ for proj, _ in projects_to_fetch:
315
+ futures.append(("full", executor.submit(fetch_project_data, proj, url, headers, user_map, None)))
316
+
317
+ # Submit incremental updates
318
+ for proj, since_date, cached_rows in projects_to_update_incrementally:
319
+ futures.append(("incremental", executor.submit(fetch_project_data, proj, url, headers, user_map, since_date), cached_rows))
320
+
321
+ progress = st.progress(0, text=f"Loading {total_fetches} projects...")
322
+ for i, future_info in enumerate(futures):
323
+ if future_info[0] == "full":
324
+ _, future = future_info
325
+ pid, task_count, submitted_count, rows, max_updated_at = future.result()
326
+ all_rows.extend(rows)
327
+ cache[f"project_{pid}"] = {
328
+ "task_count": task_count,
329
+ "submitted_count": submitted_count,
330
+ "last_updated": max_updated_at,
331
+ "rows": rows
332
+ }
333
+ else: # incremental
334
+ _, future, cached_rows = future_info
335
+ pid, task_count, submitted_count, updated_rows, max_updated_at = future.result()
336
+
337
+ # Get the previous timestamp from cache
338
+ prev_timestamp = cache.get(f"project_{pid}", {}).get("last_updated")
339
+
340
+ # Merge: update existing tasks or add new ones
341
+ if updated_rows:
342
+ # Create a dict of cached tasks by task_id for fast lookup
343
+ cached_by_id = {row["task_id"]: row for row in cached_rows}
344
+
345
+ # Update with new data
346
+ for row in updated_rows:
347
+ cached_by_id[row["task_id"]] = row
348
+
349
+ # Convert back to list
350
+ merged_rows = list(cached_by_id.values())
351
+ else:
352
+ # No updates, use cached rows
353
+ merged_rows = cached_rows
354
+
355
+ all_rows.extend(merged_rows)
356
+ cache[f"project_{pid}"] = {
357
+ "task_count": task_count,
358
+ "submitted_count": submitted_count,
359
+ "last_updated": max_updated_at or prev_timestamp, # Keep previous if no new updates
360
+ "rows": merged_rows
361
+ }
362
+
363
+ progress.progress((i + 1) / total_fetches, text=f"Loaded {i + 1}/{total_fetches} projects")
364
  progress.empty()
365
 
366
  # Save cache
 
436
  remaining = GOAL_WORDS - total
437
  progress = total / GOAL_WORDS * 100
438
 
439
+ # Calculate category breakdowns for overview
440
+ df_ready = df[df["is_annotated"]] # Acceptable + No Rating
441
+ df_needs_fixing = df[df["state"] == "ReqAttn (entities)"]
442
 
443
+ # Calculate pacing estimates
444
+ df_pace = df[df["is_goal_state"] & df["date"].notna()].copy()
445
+ daily_totals = df_pace.groupby("date")["words"].sum().reset_index()
446
+ daily_totals = daily_totals.set_index("date").sort_index()
447
+ cumulative_total = daily_totals.cumsum()
448
+ cumulative_total = cumulative_total[cumulative_total.index >= pd.Timestamp("2025-12-18")]
449
 
450
+ if len(cumulative_total) > 0:
451
+ last_date = cumulative_total.index[-1]
452
+ current_total = cumulative_total.iloc[-1]["words"]
453
 
454
  # Calculate rate from last 14 days
455
+ lookback = cumulative_total[cumulative_total.index >= last_date - pd.Timedelta(days=14)]
456
  if len(lookback) >= 2:
457
  days = (last_date - lookback.index[0]).days or 1
458
+ rate = (current_total - lookback.iloc[0]["words"]) / days
459
+ days_left = (GOAL_WORDS - current_total) / rate if rate > 0 else 0
460
  completion = last_date + pd.Timedelta(days=days_left)
461
  weekly_rate = rate * 7
462
  else:
463
  rate = completion = weekly_rate = None
464
+ else:
465
+ rate = completion = weekly_rate = None
466
+
467
+ # Calculate category breakdowns for display
468
+ mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
469
+ mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
470
+ mok_total = mok_ready + mok_fixing
471
+ mok_remaining = CATEGORY_GOAL - mok_total
472
+
473
+ zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
474
+ zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
475
+ zin_total = zin_ready + zin_fixing
476
+ zin_remaining = CATEGORY_GOAL - zin_total
477
+
478
+ # Display metrics
479
+ col1, col2, col3 = st.columns(3)
480
+
481
+ # mokslinis category
482
+ mok_progress = mok_total / CATEGORY_GOAL * 100
483
+ col1.metric("mokslinis", f"{mok_total:,}")
484
+ if mok_remaining > 0:
485
+ col1.markdown(f"<small>{mok_progress:.1f}% of 1.1M • {mok_remaining:,} remaining</small>", unsafe_allow_html=True)
486
+ else:
487
+ col1.markdown(f"<small>{mok_progress:.1f}% of 1.1M • ✓ Complete</small>", unsafe_allow_html=True)
488
+
489
+ # ziniasklaida category
490
+ zin_progress = zin_total / CATEGORY_GOAL * 100
491
+ col2.metric("ziniasklaida", f"{zin_total:,}")
492
+ if zin_remaining > 0:
493
+ col2.markdown(f"<small>{zin_progress:.1f}% of 1.1M • {zin_remaining:,} remaining</small>", unsafe_allow_html=True)
494
+ else:
495
+ col2.markdown(f"<small>{zin_progress:.1f}% of 1.1M • ✓ Complete</small>", unsafe_allow_html=True)
496
+
497
+ # Completion estimate
498
+ if weekly_rate:
499
+ col3.metric("Est. Completion", completion.strftime("%Y-%m-%d"))
500
+ col3.markdown(f"<small>📊 {int(weekly_rate):,} words/week • {days_left / 7:.1f} weeks left</small>", unsafe_allow_html=True)
501
+ else:
502
+ col3.metric("Est. Completion", "N/A")
503
 
504
+ st.markdown("---")
505
 
506
+ # ============== Weekly Stats ==============
507
+ st.subheader("📊 Weekly Stats")
508
+ st.caption("Goal states (Acceptable + No Rating + ReqAttn with entities)")
509
+
510
+ cutoff_date = pd.Timestamp("2025-12-22")
511
+
512
+ # Filter data - use GOAL_STATES to match progress metrics
513
+ # Show annotators for our team's projects, "Others" for everything else
514
+ df_week = df[df["is_goal_state"] & df["date"].notna()].copy()
515
+ df_week["week_start"] = df_week["date"] - pd.to_timedelta(df_week["date"].dt.dayofweek, unit="d")
516
+ df_week["member"] = df_week.apply(
517
+ lambda r: (r["annotator"] if r["annotator"] else "Unknown") if r["project_group"] == "Our Team" else "Others",
518
+ axis=1
519
+ )
520
+
521
+ # Weekly pivot (all data)
522
+ weekly_all = df_week.pivot_table(index="week_start", columns="member", values="words", aggfunc="sum", fill_value=0).astype(int)
523
+
524
+ # Split into before and after cutoff
525
+ weekly_before = weekly_all[weekly_all.index < cutoff_date]
526
+ weekly_after = weekly_all[weekly_all.index >= cutoff_date]
527
+
528
+ # Ensure consistent columns
529
+ all_members = set(weekly_all.columns)
530
+ if "Others" not in all_members:
531
+ all_members.add("Others")
532
+
533
+ for member in all_members:
534
+ if member not in weekly_after.columns:
535
+ weekly_after[member] = 0
536
+ if member not in weekly_before.columns:
537
+ weekly_before[member] = 0
538
+
539
+ # Sort columns by total contribution
540
+ totals = weekly_all.sum().sort_values(ascending=False)
541
+ weekly_after = weekly_after[totals.index]
542
+ weekly_after["Total"] = weekly_after.sum(axis=1)
543
+
544
+ # Calculate "Before" summary row
545
+ before_totals = weekly_before[totals.index].sum()
546
+ before_totals["Total"] = before_totals.sum()
547
+
548
+ # Format weekly data for display
549
+ display = weekly_after.reset_index()
550
+ display["Week"] = display["week_start"].dt.strftime("%Y-%m-%d") + " - " + (display["week_start"] + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d")
551
+ display = display.drop("week_start", axis=1)
552
+ display = display[["Week"] + list(totals.index) + ["Total"]]
553
+
554
+ # Add "Before" row at the beginning
555
+ before_row = pd.DataFrame([{"Week": f"Before {cutoff_date.strftime('%Y-%m-%d')}", **before_totals}])
556
+ display = pd.concat([before_row, display], ignore_index=True)
557
+
558
+ # Add TOTAL row at the end
559
+ all_totals = weekly_all[totals.index].sum()
560
+ all_totals["Total"] = all_totals.sum()
561
+ total_row = pd.DataFrame([{"Week": "TOTAL", **all_totals}])
562
+ display = pd.concat([display, total_row], ignore_index=True)
563
+
564
+ # Format numbers
565
+ for col in display.columns:
566
+ if col != "Week":
567
+ display[col] = display[col].apply(lambda x: f"{int(x):,}" if pd.notna(x) else "")
568
+
569
+ # Style and show
570
+ def style_row(row):
571
+ if row["Week"] == "TOTAL":
572
+ return ["font-weight: bold; background-color: #f0f0f0;"] * len(row)
573
+ elif row["Week"].startswith("Before"):
574
+ return ["font-style: italic; background-color: #f9f9f9;"] * len(row)
575
+ return [""] * len(row)
576
+
577
+ styled = display.style.apply(style_row, axis=1).set_properties(subset=["Total"], **{"font-weight": "bold"})
578
+ st.dataframe(styled, hide_index=True, use_container_width=True, height='content')
579
+
580
+ st.markdown("---")
581
+
582
+ # ============== Category Breakdown ==============
583
+ st.subheader("📈 Category Breakdown")
584
+ st.caption("Requirement: 1.1M words from each category")
585
+
586
+ # df_ready and df_needs_fixing already defined in overview section
587
+ df_total = df[df["is_goal_state"]]
588
+
589
+ # Calculate by category
590
+ mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
591
+ mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
592
+ mok_total = mok_ready + mok_fixing
593
+
594
+ zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
595
+ zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
596
+ zin_total = zin_ready + zin_fixing
597
+
598
+ total_ready = mok_ready + zin_ready
599
+ total_fixing = mok_fixing + zin_fixing
600
+ total_all = total_ready + total_fixing
601
+
602
+ cat_df = pd.DataFrame(
603
+ {
604
+ "Category": ["mokslinis", "ziniasklaida"],
605
+ "Ready": [f"{mok_ready:,}", f"{zin_ready:,}"],
606
+ "Needs Fixing": [f"{mok_fixing:,}", f"{zin_fixing:,}"],
607
+ "Total": [f"{mok_total:,}", f"{zin_total:,}"],
608
+ "Goal": [f"{CATEGORY_GOAL:,}", f"{CATEGORY_GOAL:,}"],
609
+ "Progress": [
610
+ f"{mok_total / CATEGORY_GOAL * 100:.1f}%",
611
+ f"{zin_total / CATEGORY_GOAL * 100:.1f}%",
612
+ ],
613
+ }
614
+ )
615
+ st.dataframe(cat_df, hide_index=True, use_container_width=True, height='content')
616
+
617
+ st.markdown("---")
618
 
619
+ # ============== Cumulative Progress ==============
620
+ st.subheader("📊 Cumulative Progress & Projection")
621
+
622
+ # Cumulative data - show by annotator for our team, "Others" for rest
623
+ df_cum = df[df["is_goal_state"] & df["date"].notna()].copy()
624
+ df_cum["member"] = df_cum.apply(
625
+ lambda r: (r["annotator"] if r["annotator"] else "Unknown") if r["project_group"] == "Our Team" else "Others",
626
+ axis=1
627
+ )
628
+
629
+ daily = df_cum.groupby(["date", "member"])["words"].sum().reset_index()
630
+ pivot = daily.pivot_table(index="date", columns="member", values="words", fill_value=0)
631
+ cumulative = pivot.sort_index().cumsum()
632
+ cumulative["Total"] = cumulative.sum(axis=1)
633
+ cumulative = cumulative[cumulative.index >= pd.Timestamp("2025-12-18")]
634
+
635
+ # Projection calculation
636
+ last_date = cumulative.index[-1]
637
+ current = cumulative["Total"].iloc[-1]
638
+
639
+ # Calculate rate from last 14 days
640
+ lookback = cumulative[cumulative.index >= last_date - pd.Timedelta(days=14)]
641
+ if len(lookback) >= 2:
642
+ days = (last_date - lookback.index[0]).days or 1
643
+ rate = (current - lookback["Total"].iloc[0]) / days
644
+ days_left = (GOAL_WORDS - current) / rate if rate > 0 else 0
645
+ completion = last_date + pd.Timedelta(days=days_left)
646
+ weekly_rate = rate * 7
647
+ else:
648
+ rate = completion = weekly_rate = None
649
+
650
+ # Chart
651
+ fig = go.Figure()
652
+
653
+ # Goal lines
654
+ fig.add_hline(y=1_100_000, line_dash="dot", line_color="orange", annotation_text="Midpoint: 1.1M", annotation_position="top left")
655
+ fig.add_hline(y=GOAL_WORDS, line_dash="dot", line_color="red", annotation_text="Goal: 2.2M", annotation_position="top left")
656
+
657
+ # Members
658
+ members = [c for c in cumulative.columns if c not in ["Total", "Others"]]
659
+ members = sorted(members, key=lambda x: cumulative[x].iloc[-1], reverse=True)
660
+
661
+ if "Others" in cumulative.columns:
662
  fig.add_trace(
663
  go.Scatter(
664
  x=cumulative.index,
665
+ y=cumulative["Others"],
666
+ name=f"Others: {cumulative['Others'].iloc[-1]:,.0f}",
667
  mode="lines",
668
+ line=dict(width=2, color="#7f8c8d"),
 
 
669
  )
670
  )
671
 
672
+ for m in members:
673
+ color = COLORS_BY_NAME.get(m, "#34495e")
674
+ fig.add_trace(
675
+ go.Scatter(x=cumulative.index, y=cumulative[m], name=f"{m}: {cumulative[m].iloc[-1]:,.0f}", mode="lines", line=dict(width=2, color=color))
676
+ )
677
+
678
+ # Total
679
+ fig.add_trace(
680
+ go.Scatter(
681
+ x=cumulative.index,
682
+ y=cumulative["Total"],
683
+ name=f"Total: {cumulative['Total'].iloc[-1]:,.0f}",
684
+ mode="lines",
685
+ line=dict(width=3, color="#d4af37"),
686
+ fill="tozeroy",
687
+ fillcolor="rgba(212, 175, 55, 0.1)",
688
+ )
689
+ )
690
+
691
+ # Projection
692
+ if completion:
693
+ proj_dates = pd.date_range(last_date, completion, freq="D")
694
+ proj_vals = current + rate * (proj_dates - last_date).days
695
+ fig.add_trace(
696
+ go.Scatter(
697
+ x=proj_dates, y=proj_vals, name=f"Projection ({int(weekly_rate):,}/wk)", mode="lines", line=dict(width=3, color="#d4af37", dash="dot")
698
  )
699
+ )
700
+ fig.add_trace(
701
+ go.Scatter(
702
+ x=[completion],
703
+ y=[GOAL_WORDS],
704
+ mode="markers+text",
705
+ marker=dict(size=14, color="#d4af37", symbol="diamond"),
706
+ text=[completion.strftime("%b %d")],
707
+ textposition="top center",
708
+ showlegend=False,
709
  )
710
+ )
711
+ title = f"Cumulative Progress → Est. {completion.strftime('%B %d, %Y')}"
712
+ else:
713
+ title = "Cumulative Progress"
 
 
714
 
715
+ fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words", height=600, hovermode="x unified", template="plotly_white")
716
+ fig.update_yaxes(tickformat=".2s")
717
 
718
+ st.plotly_chart(fig, use_container_width=True)
 
719
 
720
  # Footer
721
  st.markdown("---")
722
+ st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 2 min | Press 'R' to refresh")