Gintarė Zokaitytė committed
Commit 2806d4d · 1 Parent(s): c4ef01c

Per-annotator changes, cache update

Files changed (1)
  1. app.py +478 -236
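The cache update in this commit amounts to an incremental merge: rows cached per project are re-keyed by task_id and overwritten by rows fetched for tasks that changed since the cached last_updated timestamp. A minimal, self-contained sketch of that merge idea (illustrative names, not app.py's actual helpers):

```python
# Illustrative sketch (not part of app.py): merge cached task rows with freshly
# fetched rows, keyed by task_id, so unchanged tasks keep their cached data.
def merge_rows(cached_rows, updated_rows):
    by_id = {row["task_id"]: row for row in cached_rows}
    for row in updated_rows:  # newer data wins
        by_id[row["task_id"]] = row
    return list(by_id.values())

# Example: task 1 is refreshed, task 3 is newly annotated, task 2 stays cached.
cached = [{"task_id": 1, "state": "Not Annotated"}, {"task_id": 2, "state": "Acceptable"}]
updated = [{"task_id": 1, "state": "Acceptable"}, {"task_id": 3, "state": "No Rating"}]
print(merge_rows(cached, updated))
```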
app.py CHANGED
@@ -15,31 +15,106 @@ OUR_TEAM_PROJECT_IDS = {29, 30, 31, 32, 33, 37}
15
  ANNOTATED_STATES = ["Acceptable", "No Rating"]
16
  GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
17
 
18
  TEAM_COLORS = {
19
- "A.K. (22)": "#0066cc",
20
- "J.Š. (23)": "#00cccc",
21
- "J.Š. (24)": "#00cc00",
22
- "G.Z. (25)": "#ff9900",
23
- "L.M. (26)": "#9933ff",
24
- "M.M. (27)": "#cc0000",
25
  }
26
 
 
 
 
27
  # Cache file location (persists between runs)
28
  CACHE_FILE = Path(".cache.pkl")
29
 
30
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
31
 
32
 
33
- def fetch_project_data(proj, url, headers):
34
- """Fetch data from one project (for parallel execution)."""
35
  pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
36
  group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
37
 
38
  rows = []
39
  submitted_count = 0 # Track submitted (annotated) tasks
 
40
  page = 1
41
  while True:
42
- resp = requests.get(f"{url}/api/projects/{pid}/tasks", headers=headers, params={"page": page, "page_size": 100}, timeout=30)
 
43
  resp.raise_for_status()
44
  data = resp.json()
45
  tasks = data if isinstance(data, list) else data.get("tasks", [])
@@ -48,6 +123,11 @@ def fetch_project_data(proj, url, headers):
48
  break
49
 
50
  for task in tasks:
51
  task_data = task.get("data", {})
52
  words = task_data.get("words") or len(task_data.get("text", "").split())
53
  category = task_data.get("category")
@@ -56,9 +136,12 @@ def fetch_project_data(proj, url, headers):
56
  if not annots:
57
  rows.append(
58
  {
 
59
  "project_id": pid,
60
  "project": name,
61
  "project_group": group,
 
 
62
  "date": None,
63
  "state": "Not Annotated",
64
  "words": int(words),
@@ -73,6 +156,37 @@ def fetch_project_data(proj, url, headers):
73
  ann = annots[0]
74
  date = ann.get("created_at", "")[:10] or None
75
 
76
  rating = None
77
  for item in ann.get("result", []):
78
  if item.get("type") == "choices" and item.get("from_name") == "text_rating":
@@ -90,7 +204,18 @@ def fetch_project_data(proj, url, headers):
90
  state = "Acceptable"
91
 
92
  rows.append(
93
- {"project_id": pid, "project": name, "project_group": group, "date": date, "state": state, "words": int(words), "category": category}
94
  )
95
 
96
  if isinstance(data, list) and len(data) < 100:
@@ -99,10 +224,10 @@ def fetch_project_data(proj, url, headers):
99
  break
100
  page += 1
101
 
102
- return pid, task_count, submitted_count, rows
103
 
104
 
105
- @st.cache_data(ttl=300)
106
  def load_data(projects_hash):
107
  """Load annotation data from Label Studio with disk cache.
108
 
@@ -122,6 +247,9 @@ def load_data(projects_hash):
122
 
123
  headers = {"Authorization": f"Token {key}"}
124
 
 
 
 
125
  # Fetch all projects
126
  resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
127
  resp.raise_for_status()
@@ -138,6 +266,7 @@ def load_data(projects_hash):
138
 
139
  # Check which projects need updating
140
  projects_to_fetch = []
 
141
  all_rows = []
142
 
143
  for proj in projects:
@@ -148,34 +277,90 @@ def load_data(projects_hash):
148
 
149
  cache_key = f"project_{pid}"
150
 
151
- # Invalidate cache if:
152
- # 1. No cache exists for this project
153
- # 2. Total task count changed (new tasks added/removed)
154
- # 3. Submitted task count changed (new annotations/submissions)
155
- use_cache = False
156
- if cache_key in cache:
157
- cached = cache[cache_key]
158
- # Use cache only if BOTH counts match
159
- if (cached.get("task_count") == task_count and
160
- cached.get("submitted_count") == api_submitted_count):
161
- use_cache = True
162
-
163
- if use_cache:
164
- all_rows.extend(cache[cache_key]["rows"])
165
  else:
166
- projects_to_fetch.append(proj)
167
 
168
  # Fetch updated projects in parallel
169
- if projects_to_fetch:
 
 
170
  with ThreadPoolExecutor(max_workers=10) as executor:
171
- futures = [executor.submit(fetch_project_data, proj, url, headers) for proj in projects_to_fetch]
172
-
173
- progress = st.progress(0, text=f"Loading {len(projects_to_fetch)} projects...")
174
- for i, future in enumerate(futures):
175
- pid, task_count, submitted_count, rows = future.result()
176
- all_rows.extend(rows)
177
- cache[f"project_{pid}"] = {"task_count": task_count, "submitted_count": submitted_count, "rows": rows}
178
- progress.progress((i + 1) / len(futures), text=f"Loaded {i + 1}/{len(futures)} projects")
179
  progress.empty()
180
 
181
  # Save cache
@@ -251,230 +436,287 @@ total = df[df["is_goal_state"]]["words"].sum()
251
  remaining = GOAL_WORDS - total
252
  progress = total / GOAL_WORDS * 100
253
 
254
- col1, col2 = st.columns(2)
255
- col1.metric("Progress toward 2.2M", f"{total:,}", f"{progress:.1f}%")
256
- col2.metric("Remaining", f"{remaining:,}", f"{100 - progress:.1f}%")
257
 
258
- st.markdown("---")
259
 
260
- # Tabs
261
- tab1, tab2 = st.tabs(["📊 Weekly Stats", "⏱️ Pacing"])
262
-
263
- # ============== TAB 1: Weekly Stats ==============
264
- with tab1:
265
- st.caption("Goal states (Acceptable + No Rating + ReqAttn with entities)")
266
-
267
- cutoff_date = pd.Timestamp("2025-12-22")
268
-
269
- # Filter data - use GOAL_STATES to match progress metrics
270
- df_week = df[df["is_goal_state"] & df["date"].notna()].copy()
271
- df_week["week_start"] = df_week["date"] - pd.to_timedelta(df_week["date"].dt.dayofweek, unit="d")
272
- df_week["member"] = df_week.apply(lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others", axis=1)
273
-
274
- # Weekly pivot (all data)
275
- weekly_all = df_week.pivot_table(index="week_start", columns="member", values="words", aggfunc="sum", fill_value=0).astype(int)
276
-
277
- # Split into before and after cutoff
278
- weekly_before = weekly_all[weekly_all.index < cutoff_date]
279
- weekly_after = weekly_all[weekly_all.index >= cutoff_date]
280
-
281
- # Ensure consistent columns
282
- all_members = set(weekly_all.columns)
283
- if "Others" not in all_members:
284
- all_members.add("Others")
285
-
286
- for member in all_members:
287
- if member not in weekly_after.columns:
288
- weekly_after[member] = 0
289
- if member not in weekly_before.columns:
290
- weekly_before[member] = 0
291
-
292
- # Sort columns by total contribution
293
- totals = weekly_all.sum().sort_values(ascending=False)
294
- weekly_after = weekly_after[totals.index]
295
- weekly_after["Total"] = weekly_after.sum(axis=1)
296
-
297
- # Calculate "Before" summary row
298
- before_totals = weekly_before[totals.index].sum()
299
- before_totals["Total"] = before_totals.sum()
300
-
301
- # Format weekly data for display
302
- display = weekly_after.reset_index()
303
- display["Week"] = display["week_start"].dt.strftime("%Y-%m-%d") + " - " + (display["week_start"] + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d")
304
- display = display.drop("week_start", axis=1)
305
- display = display[["Week"] + list(totals.index) + ["Total"]]
306
-
307
- # Add "Before" row at the beginning
308
- before_row = pd.DataFrame([{"Week": f"Before {cutoff_date.strftime('%Y-%m-%d')}", **before_totals}])
309
- display = pd.concat([before_row, display], ignore_index=True)
310
-
311
- # Add TOTAL row at the end
312
- all_totals = weekly_all[totals.index].sum()
313
- all_totals["Total"] = all_totals.sum()
314
- total_row = pd.DataFrame([{"Week": "TOTAL", **all_totals}])
315
- display = pd.concat([display, total_row], ignore_index=True)
316
-
317
- # Format numbers
318
- for col in display.columns:
319
- if col != "Week":
320
- display[col] = display[col].apply(lambda x: f"{int(x):,}" if pd.notna(x) else "")
321
-
322
- # Style and show
323
- def style_row(row):
324
- if row["Week"] == "TOTAL":
325
- return ["font-weight: bold; background-color: #f0f0f0;"] * len(row)
326
- elif row["Week"].startswith("Before"):
327
- return ["font-style: italic; background-color: #f9f9f9;"] * len(row)
328
- return [""] * len(row)
329
-
330
- styled = display.style.apply(style_row, axis=1).set_properties(subset=["Total"], **{"font-weight": "bold"})
331
- st.dataframe(styled, hide_index=True, use_container_width=True)
332
-
333
- # ============== TAB 2: Pacing ==============
334
- with tab2:
335
- st.subheader("Category Breakdown")
336
- st.caption("Requirement: 1.1M words from each category")
337
-
338
- # Split by status: Ready vs Needs Fixing
339
- df_ready = df[df["is_annotated"]] # Acceptable + No Rating
340
- df_needs_fixing = df[df["state"] == "ReqAttn (entities)"]
341
- df_total = df[df["is_goal_state"]]
342
-
343
- # Calculate by category
344
- mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
345
- mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
346
- mok_total = mok_ready + mok_fixing
347
-
348
- zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
349
- zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
350
- zin_total = zin_ready + zin_fixing
351
-
352
- total_ready = mok_ready + zin_ready
353
- total_fixing = mok_fixing + zin_fixing
354
- total_all = total_ready + total_fixing
355
-
356
- cat_df = pd.DataFrame(
357
- {
358
- "Category": ["mokslinis", "ziniasklaida", "TOTAL"],
359
- "Ready": [f"{mok_ready:,}", f"{zin_ready:,}", f"{total_ready:,}"],
360
- "Needs Fixing": [f"{mok_fixing:,}", f"{zin_fixing:,}", f"{total_fixing:,}"],
361
- "Total": [f"{mok_total:,}", f"{zin_total:,}", f"{total_all:,}"],
362
- "Goal": [f"{CATEGORY_GOAL:,}", f"{CATEGORY_GOAL:,}", f"{GOAL_WORDS:,}"],
363
- "Progress": [
364
- f"{mok_total / CATEGORY_GOAL * 100:.1f}%",
365
- f"{zin_total / CATEGORY_GOAL * 100:.1f}%",
366
- f"{total_all / GOAL_WORDS * 100:.1f}%",
367
- ],
368
- }
369
- )
370
- st.dataframe(cat_df, hide_index=True, use_container_width=True)
371
-
372
- st.markdown("---")
373
- st.header("Cumulative Progress & Projection")
374
-
375
- # Cumulative data
376
- df_cum = df[df["is_goal_state"] & df["date"].notna()].copy()
377
- df_cum["member"] = df_cum.apply(lambda r: anonymize(r["project"]) if r["project_group"] == "Our Team" else "Others", axis=1)
378
-
379
- daily = df_cum.groupby(["date", "member"])["words"].sum().reset_index()
380
- pivot = daily.pivot_table(index="date", columns="member", values="words", fill_value=0)
381
- cumulative = pivot.sort_index().cumsum()
382
- cumulative["Total"] = cumulative.sum(axis=1)
383
- cumulative = cumulative[cumulative.index >= pd.Timestamp("2025-12-18")]
384
-
385
- # Projection calculation
386
- last_date = cumulative.index[-1]
387
- current = cumulative["Total"].iloc[-1]
388
 
389
  # Calculate rate from last 14 days
390
- lookback = cumulative[cumulative.index >= last_date - pd.Timedelta(days=14)]
391
  if len(lookback) >= 2:
392
  days = (last_date - lookback.index[0]).days or 1
393
- rate = (current - lookback["Total"].iloc[0]) / days
394
- days_left = (GOAL_WORDS - current) / rate if rate > 0 else 0
395
  completion = last_date + pd.Timedelta(days=days_left)
396
  weekly_rate = rate * 7
397
  else:
398
  rate = completion = weekly_rate = None
399
 
400
- # Chart
401
- fig = go.Figure()
402
-
403
- # Goal lines
404
- fig.add_hline(y=1_100_000, line_dash="dot", line_color="orange", annotation_text="Midpoint: 1.1M", annotation_position="top left")
405
- fig.add_hline(y=GOAL_WORDS, line_dash="dot", line_color="red", annotation_text="Goal: 2.2M", annotation_position="top left")
406
-
407
- # Members
408
- members = [c for c in cumulative.columns if c not in ["Total", "Others"]]
409
- members = sorted(members, key=lambda x: cumulative[x].iloc[-1], reverse=True)
410
-
411
- if "Others" in cumulative.columns:
412
- fig.add_trace(
413
- go.Scatter(
414
- x=cumulative.index,
415
- y=cumulative["Others"],
416
- name=f"Others: {cumulative['Others'].iloc[-1]:,.0f}",
417
- mode="lines",
418
- line=dict(width=2, color="#7f8c8d"),
419
- )
420
- )
421
 
422
- for m in members:
423
- color = TEAM_COLORS.get(m, "#34495e")
424
- fig.add_trace(
425
- go.Scatter(x=cumulative.index, y=cumulative[m], name=f"{m}: {cumulative[m].iloc[-1]:,.0f}", mode="lines", line=dict(width=2, color=color))
426
- )
427
 
428
- # Total
429
  fig.add_trace(
430
  go.Scatter(
431
  x=cumulative.index,
432
- y=cumulative["Total"],
433
- name=f"Total: {cumulative['Total'].iloc[-1]:,.0f}",
434
  mode="lines",
435
- line=dict(width=3, color="#d4af37"),
436
- fill="tozeroy",
437
- fillcolor="rgba(212, 175, 55, 0.1)",
438
  )
439
  )
440
 
441
- # Projection
442
- if completion:
443
- proj_dates = pd.date_range(last_date, completion, freq="D")
444
- proj_vals = current + rate * (proj_dates - last_date).days
445
- fig.add_trace(
446
- go.Scatter(
447
- x=proj_dates, y=proj_vals, name=f"Projection ({int(weekly_rate):,}/wk)", mode="lines", line=dict(width=3, color="#d4af37", dash="dot")
448
- )
449
  )
450
- fig.add_trace(
451
- go.Scatter(
452
- x=[completion],
453
- y=[GOAL_WORDS],
454
- mode="markers+text",
455
- marker=dict(size=14, color="#d4af37", symbol="diamond"),
456
- text=[completion.strftime("%b %d")],
457
- textposition="top center",
458
- showlegend=False,
459
- )
460
  )
461
- title = f"Cumulative Progress → Est. {completion.strftime('%B %d, %Y')}"
462
- else:
463
- title = "Cumulative Progress"
464
-
465
- fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words", height=600, hovermode="x unified", template="plotly_white")
466
- fig.update_yaxes(tickformat=".2s")
467
 
468
- st.plotly_chart(fig, use_container_width=True)
 
469
 
470
- # Metrics
471
- if completion:
472
- st.markdown("### Pacing Estimates")
473
- c1, c2, c3 = st.columns(3)
474
- c1.metric("Per Week Rate", f"{int(weekly_rate):,} words")
475
- c2.metric("Weeks Remaining", f"{days_left / 7:.1f} weeks")
476
- c3.metric("Est. Completion", completion.strftime("%Y-%m-%d"))
477
 
478
  # Footer
479
  st.markdown("---")
480
- st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 5 min | Press 'R' to refresh")
 
15
  ANNOTATED_STATES = ["Acceptable", "No Rating"]
16
  GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
17
 
18
+ # Map project IDs to annotator IDs (for admin-created annotations)
19
+ PROJECT_ANNOTATOR_MAP = {
20
+ 29: 27,
21
+ 30: 28,
22
+ 31: 29,
23
+ 32: 30,
24
+ 33: 31,
25
+ 37: 33,
26
+ }
27
+
28
+ ANNOTATOR_NAMES = {
29
+ 1: "Admin",
30
+ 27: "A.K.",
31
+ 28: "Jo.Š.",
32
+ 29: "Ju.Š.",
33
+ 30: "G.Z.",
34
+ 31: "L.M.",
35
+ 33: "M.M.",
36
+ }
37
+
38
  TEAM_COLORS = {
39
+ 27: "#0066cc", # A.K.
40
+ 28: "#00cccc", # Jo.Š.
41
+ 29: "#00cc00", # Ju.Š.
42
+ 30: "#ff9900", # G.Z.
43
+ 31: "#9933ff", # L.M.
44
+ 33: "#cc0000", # M.M.
45
  }
46
 
47
+ # Helper: map annotator names to colors (derived from TEAM_COLORS and ANNOTATOR_NAMES)
48
+ COLORS_BY_NAME = {ANNOTATOR_NAMES[aid]: color for aid, color in TEAM_COLORS.items() if aid in ANNOTATOR_NAMES}
49
+
50
  # Cache file location (persists between runs)
51
  CACHE_FILE = Path(".cache.pkl")
52
 
53
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
54
 
55
 
56
+ @st.cache_data(ttl=3600) # Cache users for 1 hour (users rarely change)
57
+ def fetch_users(url, key):
58
+ """Fetch all users and create a mapping of user_id -> user_name."""
59
+ try:
60
+ headers = {"Authorization": f"Token {key}"}
61
+ resp = requests.get(f"{url}/api/users", headers=headers, timeout=30)
62
+ resp.raise_for_status()
63
+ users = resp.json()
64
+
65
+ user_map = {}
66
+ for user in users:
67
+ user_id = user.get("id")
68
+ first_name = user.get("first_name", "")
69
+ email = user.get("email", "")
70
+ name = first_name or email or f"User {user_id}"
71
+ user_map[user_id] = name
72
+
73
+ return user_map
74
+ except Exception:
75
+ # If we can't fetch users, return empty map
76
+ return {}
77
+
78
+
79
+ def fetch_project_data(proj, url, headers, user_map, since_date=None):
80
+ """Fetch data from one project (for parallel execution).
81
+
82
+ Args:
83
+ proj: Project dict from API
84
+ url: Label Studio URL
85
+ headers: Auth headers
86
+ user_map: User ID to name mapping
87
+ since_date: If provided, only fetch tasks updated after this ISO datetime string
88
+ """
89
  pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
90
  group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
91
 
92
  rows = []
93
  submitted_count = 0 # Track submitted (annotated) tasks
94
+ max_updated_at = since_date # Track the latest updated_at we see
95
  page = 1
96
+
97
+ # Build query filter for incremental updates
98
+ params = {"page": page, "page_size": 100}
99
+ if since_date:
100
+ import json
101
+ query = {
102
+ "filters": {
103
+ "conjunction": "and",
104
+ "items": [{
105
+ "filter": "filter:tasks:updated_at",
106
+ "operator": "greater",
107
+ "type": "Datetime",
108
+ "value": since_date
109
+ }]
110
+ }
111
+ }
112
+ params["query"] = json.dumps(query)
113
+ print(f"[DEBUG] Incremental update for project {pid} since {since_date}")
114
+
115
  while True:
116
+ params["page"] = page
117
+ resp = requests.get(f"{url}/api/projects/{pid}/tasks", headers=headers, params=params, timeout=30)
118
  resp.raise_for_status()
119
  data = resp.json()
120
  tasks = data if isinstance(data, list) else data.get("tasks", [])
 
123
  break
124
 
125
  for task in tasks:
126
+ # Track the latest updated_at timestamp
127
+ task_updated = task.get("updated_at")
128
+ if task_updated and (not max_updated_at or task_updated > max_updated_at):
129
+ max_updated_at = task_updated
130
+
131
  task_data = task.get("data", {})
132
  words = task_data.get("words") or len(task_data.get("text", "").split())
133
  category = task_data.get("category")
 
136
  if not annots:
137
  rows.append(
138
  {
139
+ "task_id": task.get("id"), # Add task_id for merging updates
140
  "project_id": pid,
141
  "project": name,
142
  "project_group": group,
143
+ "annotator": None,
144
+ "annotator_email": None,
145
  "date": None,
146
  "state": "Not Annotated",
147
  "words": int(words),
 
156
  ann = annots[0]
157
  date = ann.get("created_at", "")[:10] or None
158
 
159
+ # Extract annotator info
160
+ # completed_by can be either a user ID (int) or a user object (dict)
161
+ completed_by = ann.get("completed_by")
162
+
163
+ if isinstance(completed_by, dict):
164
+ # Full user object
165
+ annotator_id = completed_by.get("id")
166
+ annotator_email = completed_by.get("email", "Unknown")
167
+ elif isinstance(completed_by, int):
168
+ # Just a user ID
169
+ annotator_id = completed_by
170
+ annotator_email = f"user_{completed_by}"
171
+ else:
172
+ # No completed_by info
173
+ annotator_id = None
174
+ annotator_email = "unknown"
175
+
176
+ # Backward compatibility: if admin annotated a team project, use project's default annotator
177
+ if group == "Our Team" and annotator_id == 1 and pid in PROJECT_ANNOTATOR_MAP:
178
+ mapped_id = PROJECT_ANNOTATOR_MAP[pid]
179
+ if mapped_id:
180
+ annotator_id = mapped_id
181
+
182
+ # Get display name from ANNOTATOR_NAMES mapping (or fallback to user_map)
183
+ if annotator_id in ANNOTATOR_NAMES:
184
+ annotator_name = ANNOTATOR_NAMES[annotator_id]
185
+ elif annotator_id in user_map:
186
+ annotator_name = user_map[annotator_id]
187
+ else:
188
+ annotator_name = f"User {annotator_id}" if annotator_id else "Unknown"
189
+
190
  rating = None
191
  for item in ann.get("result", []):
192
  if item.get("type") == "choices" and item.get("from_name") == "text_rating":
 
204
  state = "Acceptable"
205
 
206
  rows.append(
207
+ {
208
+ "task_id": task.get("id"), # Add task_id for merging updates
209
+ "project_id": pid,
210
+ "project": name,
211
+ "project_group": group,
212
+ "annotator": annotator_name,
213
+ "annotator_email": annotator_email,
214
+ "date": date,
215
+ "state": state,
216
+ "words": int(words),
217
+ "category": category,
218
+ }
219
  )
220
 
221
  if isinstance(data, list) and len(data) < 100:
 
224
  break
225
  page += 1
226
 
227
+ return pid, task_count, submitted_count, rows, max_updated_at
228
 
229
 
230
+ @st.cache_data(ttl=120) # Auto-refresh every 120 seconds (2 minutes)
231
  def load_data(projects_hash):
232
  """Load annotation data from Label Studio with disk cache.
233
 
 
247
 
248
  headers = {"Authorization": f"Token {key}"}
249
 
250
+ # Fetch all users first to map user IDs to names (cached for 1 hour)
251
+ user_map = fetch_users(url, key)
252
+
253
  # Fetch all projects
254
  resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
255
  resp.raise_for_status()
 
266
 
267
  # Check which projects need updating
268
  projects_to_fetch = []
269
+ projects_to_update_incrementally = []
270
  all_rows = []
271
 
272
  for proj in projects:
 
277
 
278
  cache_key = f"project_{pid}"
279
 
280
+ # Decide caching strategy:
281
+ # 1. No cache exists → full fetch
282
+ # 2. Task count changed → full fetch (tasks added/removed)
283
+ # 3. Submitted count changed + have last_updated → incremental update
284
+ # 4. Both counts match → use cache
285
+ if cache_key not in cache:
286
+ # No cache - need full fetch
287
+ projects_to_fetch.append((proj, None))
288
  else:
289
+ cached = cache[cache_key]
290
+ if cached.get("task_count") != task_count:
291
+ # Task count changed - full fetch required
292
+ projects_to_fetch.append((proj, None))
293
+ elif cached.get("submitted_count") != api_submitted_count:
294
+ # Annotations changed - try incremental update if we have a timestamp
295
+ last_updated = cached.get("last_updated")
296
+ if last_updated:
297
+ # Incremental update: fetch only changed tasks
298
+ projects_to_update_incrementally.append((proj, last_updated, cached["rows"]))
299
+ else:
300
+ # No timestamp - full fetch
301
+ projects_to_fetch.append((proj, None))
302
+ else:
303
+ # Both counts match - use cache
304
+ all_rows.extend(cached["rows"])
305
 
306
  # Fetch updated projects in parallel
307
+ total_fetches = len(projects_to_fetch) + len(projects_to_update_incrementally)
308
+
309
+ if total_fetches > 0:
310
  with ThreadPoolExecutor(max_workers=10) as executor:
311
+ futures = []
312
+
313
+ # Submit full fetches
314
+ for proj, _ in projects_to_fetch:
315
+ futures.append(("full", executor.submit(fetch_project_data, proj, url, headers, user_map, None)))
316
+
317
+ # Submit incremental updates
318
+ for proj, since_date, cached_rows in projects_to_update_incrementally:
319
+ futures.append(("incremental", executor.submit(fetch_project_data, proj, url, headers, user_map, since_date), cached_rows))
320
+
321
+ progress = st.progress(0, text=f"Loading {total_fetches} projects...")
322
+ for i, future_info in enumerate(futures):
323
+ if future_info[0] == "full":
324
+ _, future = future_info
325
+ pid, task_count, submitted_count, rows, max_updated_at = future.result()
326
+ all_rows.extend(rows)
327
+ cache[f"project_{pid}"] = {
328
+ "task_count": task_count,
329
+ "submitted_count": submitted_count,
330
+ "last_updated": max_updated_at,
331
+ "rows": rows
332
+ }
333
+ else: # incremental
334
+ _, future, cached_rows = future_info
335
+ pid, task_count, submitted_count, updated_rows, max_updated_at = future.result()
336
+
337
+ # Get the previous timestamp from cache
338
+ prev_timestamp = cache.get(f"project_{pid}", {}).get("last_updated")
339
+
340
+ # Merge: update existing tasks or add new ones
341
+ if updated_rows:
342
+ # Create a dict of cached tasks by task_id for fast lookup
343
+ cached_by_id = {row["task_id"]: row for row in cached_rows}
344
+
345
+ # Update with new data
346
+ for row in updated_rows:
347
+ cached_by_id[row["task_id"]] = row
348
+
349
+ # Convert back to list
350
+ merged_rows = list(cached_by_id.values())
351
+ else:
352
+ # No updates, use cached rows
353
+ merged_rows = cached_rows
354
+
355
+ all_rows.extend(merged_rows)
356
+ cache[f"project_{pid}"] = {
357
+ "task_count": task_count,
358
+ "submitted_count": submitted_count,
359
+ "last_updated": max_updated_at or prev_timestamp, # Keep previous if no new updates
360
+ "rows": merged_rows
361
+ }
362
+
363
+ progress.progress((i + 1) / total_fetches, text=f"Loaded {i + 1}/{total_fetches} projects")
364
  progress.empty()
365
 
366
  # Save cache
 
436
  remaining = GOAL_WORDS - total
437
  progress = total / GOAL_WORDS * 100
438
 
439
+ # Calculate category breakdowns for overview
440
+ df_ready = df[df["is_annotated"]] # Acceptable + No Rating
441
+ df_needs_fixing = df[df["state"] == "ReqAttn (entities)"]
442
 
443
+ # Calculate pacing estimates
444
+ df_pace = df[df["is_goal_state"] & df["date"].notna()].copy()
445
+ daily_totals = df_pace.groupby("date")["words"].sum().reset_index()
446
+ daily_totals = daily_totals.set_index("date").sort_index()
447
+ cumulative_total = daily_totals.cumsum()
448
+ cumulative_total = cumulative_total[cumulative_total.index >= pd.Timestamp("2025-12-18")]
449
 
450
+ if len(cumulative_total) > 0:
451
+ last_date = cumulative_total.index[-1]
452
+ current_total = cumulative_total.iloc[-1]["words"]
453
 
454
  # Calculate rate from last 14 days
455
+ lookback = cumulative_total[cumulative_total.index >= last_date - pd.Timedelta(days=14)]
456
  if len(lookback) >= 2:
457
  days = (last_date - lookback.index[0]).days or 1
458
+ rate = (current_total - lookback.iloc[0]["words"]) / days
459
+ days_left = (GOAL_WORDS - current_total) / rate if rate > 0 else 0
460
  completion = last_date + pd.Timedelta(days=days_left)
461
  weekly_rate = rate * 7
462
  else:
463
  rate = completion = weekly_rate = None
464
+ else:
465
+ rate = completion = weekly_rate = None
466
+
467
+ # Calculate category breakdowns for display
468
+ mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
469
+ mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
470
+ mok_total = mok_ready + mok_fixing
471
+ mok_remaining = CATEGORY_GOAL - mok_total
472
+
473
+ zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
474
+ zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
475
+ zin_total = zin_ready + zin_fixing
476
+ zin_remaining = CATEGORY_GOAL - zin_total
477
+
478
+ # Display metrics
479
+ col1, col2, col3 = st.columns(3)
480
+
481
+ # mokslinis category
482
+ mok_progress = mok_total / CATEGORY_GOAL * 100
483
+ col1.metric("mokslinis", f"{mok_total:,}")
484
+ if mok_remaining > 0:
485
+ col1.markdown(f"<small>{mok_progress:.1f}% of 1.1M • {mok_remaining:,} remaining</small>", unsafe_allow_html=True)
486
+ else:
487
+ col1.markdown(f"<small>{mok_progress:.1f}% of 1.1M • ✓ Complete</small>", unsafe_allow_html=True)
488
+
489
+ # ziniasklaida category
490
+ zin_progress = zin_total / CATEGORY_GOAL * 100
491
+ col2.metric("ziniasklaida", f"{zin_total:,}")
492
+ if zin_remaining > 0:
493
+ col2.markdown(f"<small>{zin_progress:.1f}% of 1.1M • {zin_remaining:,} remaining</small>", unsafe_allow_html=True)
494
+ else:
495
+ col2.markdown(f"<small>{zin_progress:.1f}% of 1.1M • ✓ Complete</small>", unsafe_allow_html=True)
496
+
497
+ # Completion estimate
498
+ if weekly_rate:
499
+ col3.metric("Est. Completion", completion.strftime("%Y-%m-%d"))
500
+ col3.markdown(f"<small>📊 {int(weekly_rate):,} words/week • {days_left / 7:.1f} weeks left</small>", unsafe_allow_html=True)
501
+ else:
502
+ col3.metric("Est. Completion", "N/A")
503
 
504
+ st.markdown("---")
505
 
506
+ # ============== Weekly Stats ==============
507
+ st.subheader("📊 Weekly Stats")
508
+ st.caption("Goal states (Acceptable + No Rating + ReqAttn with entities)")
509
+
510
+ cutoff_date = pd.Timestamp("2025-12-22")
511
+
512
+ # Filter data - use GOAL_STATES to match progress metrics
513
+ # Show annotators for our team's projects, "Others" for everything else
514
+ df_week = df[df["is_goal_state"] & df["date"].notna()].copy()
515
+ df_week["week_start"] = df_week["date"] - pd.to_timedelta(df_week["date"].dt.dayofweek, unit="d")
516
+ df_week["member"] = df_week.apply(
517
+ lambda r: (r["annotator"] if r["annotator"] else "Unknown") if r["project_group"] == "Our Team" else "Others",
518
+ axis=1
519
+ )
520
+
521
+ # Weekly pivot (all data)
522
+ weekly_all = df_week.pivot_table(index="week_start", columns="member", values="words", aggfunc="sum", fill_value=0).astype(int)
523
+
524
+ # Split into before and after cutoff
525
+ weekly_before = weekly_all[weekly_all.index < cutoff_date]
526
+ weekly_after = weekly_all[weekly_all.index >= cutoff_date]
527
+
528
+ # Ensure consistent columns
529
+ all_members = set(weekly_all.columns)
530
+ if "Others" not in all_members:
531
+ all_members.add("Others")
532
+
533
+ for member in all_members:
534
+ if member not in weekly_after.columns:
535
+ weekly_after[member] = 0
536
+ if member not in weekly_before.columns:
537
+ weekly_before[member] = 0
538
+
539
+ # Sort columns by total contribution
540
+ totals = weekly_all.sum().sort_values(ascending=False)
541
+ weekly_after = weekly_after[totals.index]
542
+ weekly_after["Total"] = weekly_after.sum(axis=1)
543
+
544
+ # Calculate "Before" summary row
545
+ before_totals = weekly_before[totals.index].sum()
546
+ before_totals["Total"] = before_totals.sum()
547
+
548
+ # Format weekly data for display
549
+ display = weekly_after.reset_index()
550
+ display["Week"] = display["week_start"].dt.strftime("%Y-%m-%d") + " - " + (display["week_start"] + pd.Timedelta(days=6)).dt.strftime("%Y-%m-%d")
551
+ display = display.drop("week_start", axis=1)
552
+ display = display[["Week"] + list(totals.index) + ["Total"]]
553
+
554
+ # Add "Before" row at the beginning
555
+ before_row = pd.DataFrame([{"Week": f"Before {cutoff_date.strftime('%Y-%m-%d')}", **before_totals}])
556
+ display = pd.concat([before_row, display], ignore_index=True)
557
+
558
+ # Add TOTAL row at the end
559
+ all_totals = weekly_all[totals.index].sum()
560
+ all_totals["Total"] = all_totals.sum()
561
+ total_row = pd.DataFrame([{"Week": "TOTAL", **all_totals}])
562
+ display = pd.concat([display, total_row], ignore_index=True)
563
+
564
+ # Format numbers
565
+ for col in display.columns:
566
+ if col != "Week":
567
+ display[col] = display[col].apply(lambda x: f"{int(x):,}" if pd.notna(x) else "")
568
+
569
+ # Style and show
570
+ def style_row(row):
571
+ if row["Week"] == "TOTAL":
572
+ return ["font-weight: bold; background-color: #f0f0f0;"] * len(row)
573
+ elif row["Week"].startswith("Before"):
574
+ return ["font-style: italic; background-color: #f9f9f9;"] * len(row)
575
+ return [""] * len(row)
576
+
577
+ styled = display.style.apply(style_row, axis=1).set_properties(subset=["Total"], **{"font-weight": "bold"})
578
+ st.dataframe(styled, hide_index=True, use_container_width=True, height='content')
579
+
580
+ st.markdown("---")
581
+
582
+ # ============== Category Breakdown ==============
583
+ st.subheader("📈 Category Breakdown")
584
+ st.caption("Requirement: 1.1M words from each category")
585
+
586
+ # df_ready and df_needs_fixing already defined in overview section
587
+ df_total = df[df["is_goal_state"]]
588
+
589
+ # Calculate by category
590
+ mok_ready = df_ready[df_ready["category"] == "mokslinis"]["words"].sum()
591
+ mok_fixing = df_needs_fixing[df_needs_fixing["category"] == "mokslinis"]["words"].sum()
592
+ mok_total = mok_ready + mok_fixing
593
+
594
+ zin_ready = df_ready[df_ready["category"] == "ziniasklaida"]["words"].sum()
595
+ zin_fixing = df_needs_fixing[df_needs_fixing["category"] == "ziniasklaida"]["words"].sum()
596
+ zin_total = zin_ready + zin_fixing
597
+
598
+ total_ready = mok_ready + zin_ready
599
+ total_fixing = mok_fixing + zin_fixing
600
+ total_all = total_ready + total_fixing
601
+
602
+ cat_df = pd.DataFrame(
603
+ {
604
+ "Category": ["mokslinis", "ziniasklaida"],
605
+ "Ready": [f"{mok_ready:,}", f"{zin_ready:,}"],
606
+ "Needs Fixing": [f"{mok_fixing:,}", f"{zin_fixing:,}"],
607
+ "Total": [f"{mok_total:,}", f"{zin_total:,}"],
608
+ "Goal": [f"{CATEGORY_GOAL:,}", f"{CATEGORY_GOAL:,}"],
609
+ "Progress": [
610
+ f"{mok_total / CATEGORY_GOAL * 100:.1f}%",
611
+ f"{zin_total / CATEGORY_GOAL * 100:.1f}%",
612
+ ],
613
+ }
614
+ )
615
+ st.dataframe(cat_df, hide_index=True, use_container_width=True, height='content')
616
+
617
+ st.markdown("---")
618
 
619
+ # ============== Cumulative Progress ==============
620
+ st.subheader("📊 Cumulative Progress & Projection")
621
+
622
+ # Cumulative data - show by annotator for our team, "Others" for rest
623
+ df_cum = df[df["is_goal_state"] & df["date"].notna()].copy()
624
+ df_cum["member"] = df_cum.apply(
625
+ lambda r: (r["annotator"] if r["annotator"] else "Unknown") if r["project_group"] == "Our Team" else "Others",
626
+ axis=1
627
+ )
628
+
629
+ daily = df_cum.groupby(["date", "member"])["words"].sum().reset_index()
630
+ pivot = daily.pivot_table(index="date", columns="member", values="words", fill_value=0)
631
+ cumulative = pivot.sort_index().cumsum()
632
+ cumulative["Total"] = cumulative.sum(axis=1)
633
+ cumulative = cumulative[cumulative.index >= pd.Timestamp("2025-12-18")]
634
+
635
+ # Projection calculation
636
+ last_date = cumulative.index[-1]
637
+ current = cumulative["Total"].iloc[-1]
638
+
639
+ # Calculate rate from last 14 days
640
+ lookback = cumulative[cumulative.index >= last_date - pd.Timedelta(days=14)]
641
+ if len(lookback) >= 2:
642
+ days = (last_date - lookback.index[0]).days or 1
643
+ rate = (current - lookback["Total"].iloc[0]) / days
644
+ days_left = (GOAL_WORDS - current) / rate if rate > 0 else 0
645
+ completion = last_date + pd.Timedelta(days=days_left)
646
+ weekly_rate = rate * 7
647
+ else:
648
+ rate = completion = weekly_rate = None
649
+
650
+ # Chart
651
+ fig = go.Figure()
652
+
653
+ # Goal lines
654
+ fig.add_hline(y=1_100_000, line_dash="dot", line_color="orange", annotation_text="Midpoint: 1.1M", annotation_position="top left")
655
+ fig.add_hline(y=GOAL_WORDS, line_dash="dot", line_color="red", annotation_text="Goal: 2.2M", annotation_position="top left")
656
+
657
+ # Members
658
+ members = [c for c in cumulative.columns if c not in ["Total", "Others"]]
659
+ members = sorted(members, key=lambda x: cumulative[x].iloc[-1], reverse=True)
660
+
661
+ if "Others" in cumulative.columns:
662
  fig.add_trace(
663
  go.Scatter(
664
  x=cumulative.index,
665
+ y=cumulative["Others"],
666
+ name=f"Others: {cumulative['Others'].iloc[-1]:,.0f}",
667
  mode="lines",
668
+ line=dict(width=2, color="#7f8c8d"),
 
 
669
  )
670
  )
671
 
672
+ for m in members:
673
+ color = COLORS_BY_NAME.get(m, "#34495e")
674
+ fig.add_trace(
675
+ go.Scatter(x=cumulative.index, y=cumulative[m], name=f"{m}: {cumulative[m].iloc[-1]:,.0f}", mode="lines", line=dict(width=2, color=color))
676
+ )
677
+
678
+ # Total
679
+ fig.add_trace(
680
+ go.Scatter(
681
+ x=cumulative.index,
682
+ y=cumulative["Total"],
683
+ name=f"Total: {cumulative['Total'].iloc[-1]:,.0f}",
684
+ mode="lines",
685
+ line=dict(width=3, color="#d4af37"),
686
+ fill="tozeroy",
687
+ fillcolor="rgba(212, 175, 55, 0.1)",
688
+ )
689
+ )
690
+
691
+ # Projection
692
+ if completion:
693
+ proj_dates = pd.date_range(last_date, completion, freq="D")
694
+ proj_vals = current + rate * (proj_dates - last_date).days
695
+ fig.add_trace(
696
+ go.Scatter(
697
+ x=proj_dates, y=proj_vals, name=f"Projection ({int(weekly_rate):,}/wk)", mode="lines", line=dict(width=3, color="#d4af37", dash="dot")
698
  )
699
+ )
700
+ fig.add_trace(
701
+ go.Scatter(
702
+ x=[completion],
703
+ y=[GOAL_WORDS],
704
+ mode="markers+text",
705
+ marker=dict(size=14, color="#d4af37", symbol="diamond"),
706
+ text=[completion.strftime("%b %d")],
707
+ textposition="top center",
708
+ showlegend=False,
709
  )
710
+ )
711
+ title = f"Cumulative Progress → Est. {completion.strftime('%B %d, %Y')}"
712
+ else:
713
+ title = "Cumulative Progress"
 
 
714
 
715
+ fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words", height=600, hovermode="x unified", template="plotly_white")
716
+ fig.update_yaxes(tickformat=".2s")
717
 
718
+ st.plotly_chart(fig, use_container_width=True)
 
719
 
720
  # Footer
721
  st.markdown("---")
722
+ st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 2 min | Press 'R' to refresh")