Gintarė Zokaitytė committed on
Commit
62e9aac
·
1 Parent(s): 2806d4d

cache fix

Browse files
Files changed (3) hide show
  1. app.py +328 -300
  2. pyproject.toml +1 -0
  3. uv.lock +11 -0
app.py CHANGED
@@ -1,12 +1,17 @@
 
1
  import re
2
  import os
3
  import pickle
 
4
  from pathlib import Path
5
  from concurrent.futures import ThreadPoolExecutor
6
  import streamlit as st
7
  import pandas as pd
8
  import plotly.graph_objects as go
9
  import requests
 
 
 
10
 
11
  GOAL_WORDS = 2_200_000
12
  CATEGORY_GOAL = 1_100_000
@@ -17,12 +22,12 @@ GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
17
 
18
  # Map project IDs to annotator IDs (for admin-created annotations)
19
  PROJECT_ANNOTATOR_MAP = {
20
- 29: 27,
21
  30: 28,
22
- 31: 29,
23
  32: 30,
24
- 33: 31,
25
- 37: 33,
26
  }
27
 
28
  ANNOTATOR_NAMES = {
@@ -48,12 +53,93 @@ TEAM_COLORS = {
48
  COLORS_BY_NAME = {ANNOTATOR_NAMES[aid]: color for aid, color in TEAM_COLORS.items() if aid in ANNOTATOR_NAMES}
49
 
50
  # Cache file location (persists between runs)
51
- CACHE_FILE = Path(".cache.pkl")
52
 
53
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
54
 
55
 
56
- @st.cache_data(ttl=3600) # Cache users for 1 hour (users rarely change)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def fetch_users(url, key):
58
  """Fetch all users and create a mapping of user_id -> user_name."""
59
  try:
@@ -72,342 +158,247 @@ def fetch_users(url, key):
72
 
73
  return user_map
74
  except Exception:
75
- # If we can't fetch users, return empty map
76
  return {}
77
 
78
 
79
  def fetch_project_data(proj, url, headers, user_map, since_date=None):
80
- """Fetch data from one project (for parallel execution).
81
-
82
- Args:
83
- proj: Project dict from API
84
- url: Label Studio URL
85
- headers: Auth headers
86
- user_map: User ID to name mapping
87
- since_date: If provided, only fetch tasks updated after this ISO datetime string
88
- """
89
  pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
90
  group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
91
 
 
 
 
 
 
 
 
 
 
92
  rows = []
93
- submitted_count = 0 # Track submitted (annotated) tasks
94
- max_updated_at = since_date # Track the latest updated_at we see
95
- page = 1
96
-
97
- # Build query filter for incremental updates
98
- params = {"page": page, "page_size": 100}
99
- if since_date:
100
- import json
101
- query = {
102
- "filters": {
103
- "conjunction": "and",
104
- "items": [{
105
- "filter": "filter:tasks:updated_at",
106
- "operator": "greater",
107
- "type": "Datetime",
108
- "value": since_date
109
- }]
110
- }
111
- }
112
- params["query"] = json.dumps(query)
113
- print(f"[DEBUG] Incremental update for project {pid} since {since_date}")
114
 
115
- while True:
116
- params["page"] = page
117
- resp = requests.get(f"{url}/api/projects/{pid}/tasks", headers=headers, params=params, timeout=30)
118
- resp.raise_for_status()
119
- data = resp.json()
120
- tasks = data if isinstance(data, list) else data.get("tasks", [])
121
-
122
- if not tasks:
123
- break
124
-
125
- for task in tasks:
126
- # Track the latest updated_at timestamp
127
- task_updated = task.get("updated_at")
128
- if task_updated and (not max_updated_at or task_updated > max_updated_at):
129
- max_updated_at = task_updated
130
-
131
- task_data = task.get("data", {})
132
- words = task_data.get("words") or len(task_data.get("text", "").split())
133
- category = task_data.get("category")
134
-
135
- annots = [a for a in task.get("annotations", []) if not a.get("was_cancelled")]
136
- if not annots:
137
- rows.append(
138
- {
139
- "task_id": task.get("id"), # Add task_id for merging updates
140
- "project_id": pid,
141
- "project": name,
142
- "project_group": group,
143
- "annotator": None,
144
- "annotator_email": None,
145
- "date": None,
146
- "state": "Not Annotated",
147
- "words": int(words),
148
- "category": category,
149
- }
150
- )
151
- continue
152
-
153
- # Task has annotations - count as submitted
154
- submitted_count += 1
155
-
156
- ann = annots[0]
157
- date = ann.get("created_at", "")[:10] or None
158
-
159
- # Extract annotator info
160
- # completed_by can be either a user ID (int) or a user object (dict)
161
- completed_by = ann.get("completed_by")
162
-
163
- if isinstance(completed_by, dict):
164
- # Full user object
165
- annotator_id = completed_by.get("id")
166
- annotator_email = completed_by.get("email", "Unknown")
167
- elif isinstance(completed_by, int):
168
- # Just a user ID
169
- annotator_id = completed_by
170
- annotator_email = f"user_{completed_by}"
171
- else:
172
- # No completed_by info
173
- annotator_id = None
174
- annotator_email = "unknown"
175
-
176
- # Backward compatibility: if admin annotated a team project, use project's default annotator
177
- if group == "Our Team" and annotator_id == 1 and pid in PROJECT_ANNOTATOR_MAP:
178
- mapped_id = PROJECT_ANNOTATOR_MAP[pid]
179
- if mapped_id:
180
- annotator_id = mapped_id
181
-
182
- # Get display name from ANNOTATOR_NAMES mapping (or fallback to user_map)
183
- if annotator_id in ANNOTATOR_NAMES:
184
- annotator_name = ANNOTATOR_NAMES[annotator_id]
185
- elif annotator_id in user_map:
186
- annotator_name = user_map[annotator_id]
187
- else:
188
- annotator_name = f"User {annotator_id}" if annotator_id else "Unknown"
189
-
190
- rating = None
191
- for item in ann.get("result", []):
192
- if item.get("type") == "choices" and item.get("from_name") == "text_rating":
193
- rating = item.get("value", {}).get("choices", [None])[0]
194
- break
195
-
196
- has_entities = any(i.get("type") == "labels" for i in ann.get("result", []))
197
- if rating is None:
198
- state = "No Rating"
199
- elif rating == "Requires Attention":
200
- state = f"ReqAttn ({'entities' if has_entities else 'empty'})"
201
- elif rating == "Unacceptable":
202
- state = f"Unacceptable ({'entities' if has_entities else 'empty'})"
203
- else:
204
- state = "Acceptable"
205
 
 
 
 
 
 
 
206
  rows.append(
207
  {
208
- "task_id": task.get("id"), # Add task_id for merging updates
209
  "project_id": pid,
210
  "project": name,
211
  "project_group": group,
212
- "annotator": annotator_name,
213
- "annotator_email": annotator_email,
214
- "date": date,
215
- "state": state,
216
  "words": int(words),
217
  "category": category,
218
  }
219
  )
 
220
 
221
- if isinstance(data, list) and len(data) < 100:
222
- break
223
- if isinstance(data, dict) and not data.get("next"):
224
- break
225
- page += 1
226
 
227
- return pid, task_count, submitted_count, rows, max_updated_at
 
228
 
 
229
 
230
- @st.cache_data(ttl=120) # Auto-refresh every 120 seconds (2 minutes)
231
- def load_data(projects_hash):
232
- """Load annotation data from Label Studio with disk cache.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- Args:
235
- projects_hash: Hash of project states to invalidate Streamlit cache when projects change
236
- """
237
- try:
238
- url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
239
- key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
240
- except (KeyError, FileNotFoundError, AttributeError):
241
- url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
242
- key = os.getenv("LABEL_STUDIO_API_KEY", "")
243
 
 
 
 
244
  if not url or not key:
245
- st.error("Missing credentials. Set LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY.")
246
- st.stop()
247
 
248
  headers = {"Authorization": f"Token {key}"}
249
 
250
- # Fetch all users first to map user IDs to names (cached for 1 hour)
251
- user_map = fetch_users(url, key)
252
-
253
- # Fetch all projects
254
  resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
255
  resp.raise_for_status()
256
  projects = resp.json().get("results", [])
257
 
258
- # Load cache
259
- cache = {}
260
- if CACHE_FILE.exists():
261
- try:
262
- with open(CACHE_FILE, "rb") as f:
263
- cache = pickle.load(f)
264
- except Exception:
265
- cache = {}
266
 
267
- # Check which projects need updating
268
- projects_to_fetch = []
269
- projects_to_update_incrementally = []
270
- all_rows = []
271
 
272
  for proj in projects:
273
  pid = proj["id"]
 
274
  task_count = proj.get("task_number", 0)
275
- # Get submitted task count from Label Studio API
276
  api_submitted_count = proj.get("num_tasks_with_annotations", 0)
277
-
278
  cache_key = f"project_{pid}"
279
 
280
- # Decide caching strategy:
281
- # 1. No cache exists → full fetch
282
- # 2. Task count changed → full fetch (tasks added/removed)
283
- # 3. Submitted count changed + have last_updated → incremental update
284
- # 4. Both counts match → use cache
285
  if cache_key not in cache:
286
- # No cache - need full fetch
287
- projects_to_fetch.append((proj, None))
288
  else:
289
  cached = cache[cache_key]
290
- if cached.get("task_count") != task_count:
291
- # Task count changed - full fetch required
292
- projects_to_fetch.append((proj, None))
293
- elif cached.get("submitted_count") != api_submitted_count:
294
- # Annotations changed - try incremental update if we have a timestamp
 
 
 
295
  last_updated = cached.get("last_updated")
296
  if last_updated:
297
- # Incremental update: fetch only changed tasks
298
- projects_to_update_incrementally.append((proj, last_updated, cached["rows"]))
299
  else:
300
- # No timestamp - full fetch
301
- projects_to_fetch.append((proj, None))
302
  else:
303
- # Both counts match - use cache
304
- all_rows.extend(cached["rows"])
305
-
306
- # Fetch updated projects in parallel
307
- total_fetches = len(projects_to_fetch) + len(projects_to_update_incrementally)
308
-
309
- if total_fetches > 0:
310
- with ThreadPoolExecutor(max_workers=10) as executor:
311
- futures = []
312
-
313
- # Submit full fetches
314
- for proj, _ in projects_to_fetch:
315
- futures.append(("full", executor.submit(fetch_project_data, proj, url, headers, user_map, None)))
316
-
317
- # Submit incremental updates
318
- for proj, since_date, cached_rows in projects_to_update_incrementally:
319
- futures.append(("incremental", executor.submit(fetch_project_data, proj, url, headers, user_map, since_date), cached_rows))
320
-
321
- progress = st.progress(0, text=f"Loading {total_fetches} projects...")
322
- for i, future_info in enumerate(futures):
323
- if future_info[0] == "full":
324
- _, future = future_info
325
- pid, task_count, submitted_count, rows, max_updated_at = future.result()
326
- all_rows.extend(rows)
327
- cache[f"project_{pid}"] = {
328
- "task_count": task_count,
329
- "submitted_count": submitted_count,
330
- "last_updated": max_updated_at,
331
- "rows": rows
332
- }
333
- else: # incremental
334
- _, future, cached_rows = future_info
335
- pid, task_count, submitted_count, updated_rows, max_updated_at = future.result()
336
-
337
- # Get the previous timestamp from cache
338
- prev_timestamp = cache.get(f"project_{pid}", {}).get("last_updated")
339
-
340
- # Merge: update existing tasks or add new ones
341
- if updated_rows:
342
- # Create a dict of cached tasks by task_id for fast lookup
343
- cached_by_id = {row["task_id"]: row for row in cached_rows}
344
-
345
- # Update with new data
346
- for row in updated_rows:
347
- cached_by_id[row["task_id"]] = row
348
-
349
- # Convert back to list
350
- merged_rows = list(cached_by_id.values())
351
- else:
352
- # No updates, use cached rows
353
- merged_rows = cached_rows
354
-
355
- all_rows.extend(merged_rows)
356
- cache[f"project_{pid}"] = {
357
- "task_count": task_count,
358
- "submitted_count": submitted_count,
359
- "last_updated": max_updated_at or prev_timestamp, # Keep previous if no new updates
360
- "rows": merged_rows
361
- }
362
-
363
- progress.progress((i + 1) / total_fetches, text=f"Loaded {i + 1}/{total_fetches} projects")
364
- progress.empty()
365
-
366
- # Save cache
367
- try:
368
- with open(CACHE_FILE, "wb") as f:
369
- pickle.dump(cache, f)
370
- except Exception:
371
- pass
372
-
373
- # Create dataframe
374
- df = pd.DataFrame(all_rows)
375
- df["words"] = df["words"].astype(int)
376
- df["date"] = pd.to_datetime(df["date"], errors="coerce")
377
- df["is_annotated"] = df["state"].isin(ANNOTATED_STATES)
378
- df["is_goal_state"] = df["state"].isin(GOAL_STATES)
379
-
380
- return df
381
-
382
-
383
- def get_projects_hash():
384
- """Fetch projects and return a hash of their states for cache invalidation."""
385
- import hashlib
386
-
387
- try:
388
- url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
389
- key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
390
- except (KeyError, FileNotFoundError, AttributeError):
391
- url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
392
- key = os.getenv("LABEL_STUDIO_API_KEY", "")
393
 
394
- if not url or not key:
395
- return "no-credentials"
396
 
397
- headers = {"Authorization": f"Token {key}"}
398
- resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
399
- resp.raise_for_status()
400
- projects = resp.json().get("results", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
- # Create hash from project states (id, task_number, num_tasks_with_annotations)
403
- state_string = ""
404
- for proj in projects:
405
- pid = proj["id"]
406
- task_count = proj.get("task_number", 0)
407
- submitted_count = proj.get("num_tasks_with_annotations", 0)
408
- state_string += f"{pid}:{task_count}:{submitted_count};"
409
 
410
- return hashlib.md5(state_string.encode()).hexdigest()
 
 
 
411
 
412
 
413
  def anonymize(name):
@@ -423,15 +414,56 @@ def anonymize(name):
423
  return name
424
 
425
 
 
 
426
  st.title("📊 Annotation Progress Dashboard")
427
- st.markdown("---")
428
 
429
- # Load data
430
- with st.spinner("Loading..."):
431
- projects_hash = get_projects_hash()
432
- df = load_data(projects_hash)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
- # Overview metrics
435
  total = df[df["is_goal_state"]]["words"].sum()
436
  remaining = GOAL_WORDS - total
437
  progress = total / GOAL_WORDS * 100
@@ -716,7 +748,3 @@ fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words
716
  fig.update_yaxes(tickformat=".2s")
717
 
718
  st.plotly_chart(fig, use_container_width=True)
719
-
720
- # Footer
721
- st.markdown("---")
722
- st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 2 min | Press 'R' to refresh")
 
1
+ import gzip
2
  import re
3
  import os
4
  import pickle
5
+ from datetime import datetime
6
  from pathlib import Path
7
  from concurrent.futures import ThreadPoolExecutor
8
  import streamlit as st
9
  import pandas as pd
10
  import plotly.graph_objects as go
11
  import requests
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
 
16
  GOAL_WORDS = 2_200_000
17
  CATEGORY_GOAL = 1_100_000
 
22
 
23
  # Map project IDs to annotator IDs (for admin-created annotations)
24
  PROJECT_ANNOTATOR_MAP = {
25
+ 29: 27,
26
  30: 28,
27
+ 31: 29,
28
  32: 30,
29
+ 33: 31,
30
+ 37: 33,
31
  }
32
 
33
  ANNOTATOR_NAMES = {
 
53
  COLORS_BY_NAME = {ANNOTATOR_NAMES[aid]: color for aid, color in TEAM_COLORS.items() if aid in ANNOTATOR_NAMES}
54
 
55
  # Cache file location (persists between runs)
56
+ CACHE_FILE = Path(".cache.pkl.gz")
57
 
58
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
59
 
60
 
61
+ # ============== Data layer ==============
62
+
63
def _get_credentials():
    """Resolve the Label Studio base URL and API key.

    Streamlit secrets take priority; environment variables are the
    fallback both as `st.secrets.get` defaults and when no secrets
    file exists at all.
    """
    try:
        base_url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
        api_key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
    except (KeyError, FileNotFoundError, AttributeError):
        # No secrets.toml (or secrets object misbehaving) — env vars only.
        base_url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
        api_key = os.getenv("LABEL_STUDIO_API_KEY", "")
    return base_url, api_key
72
+
73
+
74
def _load_cache():
    """Read the persisted cache, migrating any legacy uncompressed file.

    Returns the cached dict, or an empty dict when nothing readable
    exists; never raises.
    """
    if CACHE_FILE.exists():
        try:
            with gzip.open(CACHE_FILE, "rb") as fh:
                return pickle.load(fh)
        except Exception:
            pass  # unreadable/corrupt — fall through to the legacy file
    # Migration path: pick up the old uncompressed cache if present.
    legacy = Path(".cache.pkl")
    if legacy.exists():
        try:
            with open(legacy, "rb") as fh:
                migrated = pickle.load(fh)
            _save_cache(migrated)  # re-persist in the new gzip format
            legacy.unlink()
            return migrated
        except Exception:
            pass
    return {}
94
+
95
+
96
def _save_cache(cache):
    """Persist the cache dict as a gzip-compressed pickle.

    Best-effort: any failure is swallowed — a missed save only means
    a slower next load, not an error for the user.
    """
    try:
        with gzip.open(CACHE_FILE, "wb") as fh:
            pickle.dump(cache, fh)
    except Exception:
        pass
103
+
104
+
105
def _build_df(all_rows):
    """Build the annotation DataFrame from cached row dicts.

    Args:
        all_rows: List of row dicts as produced by `fetch_project_data`.

    Returns:
        DataFrame with typed `words`/`date` columns and derived boolean
        `is_annotated`/`is_goal_state` flags. The same transformations are
        applied to the empty case so dtypes and columns are consistent
        whether or not rows exist (previously the empty frame kept
        object-dtype columns).
    """
    columns = [
        "task_id", "project_id", "project", "project_group",
        "annotator", "annotator_email", "date", "state", "words", "category",
    ]
    df = pd.DataFrame(all_rows) if all_rows else pd.DataFrame(columns=columns)
    df["words"] = df["words"].astype(int)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["is_annotated"] = df["state"].isin(ANNOTATED_STATES)
    df["is_goal_state"] = df["state"].isin(GOAL_STATES)
    return df
119
+
120
+
121
def load_df_from_cache():
    """Build the DataFrame purely from the disk cache — no API calls.

    Returns:
        (df, last_updated) where `last_updated` is the newest per-project
        timestamp seen, or (None, None) when nothing usable is cached.
    """
    cache = _load_cache()
    if not cache:
        return None, None

    rows = []
    newest = None
    for key, entry in cache.items():
        if not key.startswith("project_"):
            continue  # skip bookkeeping keys like "_last_checked"
        rows.extend(entry.get("rows", []))
        stamp = entry.get("last_updated")
        if stamp and (newest is None or stamp > newest):
            newest = stamp

    if not rows:
        return None, None
    return _build_df(rows), newest
140
+
141
+
142
+ @st.cache_data(ttl=3600)
143
  def fetch_users(url, key):
144
  """Fetch all users and create a mapping of user_id -> user_name."""
145
  try:
 
158
 
159
  return user_map
160
  except Exception:
 
161
  return {}
162
 
163
 
164
def fetch_project_data(proj, url, headers, user_map, since_date=None):
    """Fetch annotation rows for one project via the export API (excludes predictions).

    Args:
        proj: Project dict from the Label Studio projects API.
        url: Base Label Studio URL (no trailing slash).
        headers: Auth headers dict.
        user_map: user_id -> display-name mapping used as a naming fallback.
        since_date: ISO datetime string; when given, tasks updated at or
            before it are skipped (incremental mode). The export still
            downloads all tasks — filtering happens client-side.

    Returns:
        Tuple of (project_id, task_count, submitted_count, rows, max_updated_at).
    """
    pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
    group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"

    # Export endpoint returns every task in a single response (no pagination).
    resp = requests.get(
        f"{url}/api/projects/{pid}/export",
        headers=headers,
        params={"exportType": "JSON", "download_all_tasks": "true"},
        timeout=60,
    )
    resp.raise_for_status()
    tasks = resp.json()

    rows = []
    submitted_count = 0
    max_updated_at = since_date  # newest updated_at seen across all tasks

    for task in tasks:
        task_updated = task.get("updated_at")
        if task_updated and (not max_updated_at or task_updated > max_updated_at):
            max_updated_at = task_updated

        # Incremental mode: skip tasks unchanged since the last fetch.
        if since_date and task_updated and task_updated <= since_date:
            continue

        task_data = task.get("data", {})
        words = task_data.get("words") or len(task_data.get("text", "").split())
        category = task_data.get("category")

        annots = [a for a in task.get("annotations", []) if not a.get("was_cancelled")]
        if not annots:
            rows.append(
                {
                    "task_id": task.get("id"),
                    "project_id": pid,
                    "project": name,
                    "project_group": group,
                    "annotator": None,
                    "annotator_email": None,
                    "date": None,
                    "state": "Not Annotated",
                    "words": int(words),
                    "category": category,
                }
            )
            continue

        # Task has at least one live annotation — counts as submitted.
        submitted_count += 1

        ann = annots[0]
        # Guard: created_at may be missing OR explicitly None (None[:10] would crash).
        date = (ann.get("created_at") or "")[:10] or None

        # completed_by may be a full user object (dict) or just a user ID (int).
        completed_by = ann.get("completed_by")
        if isinstance(completed_by, dict):
            annotator_id = completed_by.get("id")
            annotator_email = completed_by.get("email", "Unknown")
        elif isinstance(completed_by, int):
            annotator_id = completed_by
            annotator_email = f"user_{completed_by}"
        else:
            annotator_id = None
            annotator_email = "unknown"

        # Backward compatibility: admin (user 1) annotations on team projects
        # are attributed to that project's default annotator.
        if group == "Our Team" and annotator_id == 1 and pid in PROJECT_ANNOTATOR_MAP:
            mapped_id = PROJECT_ANNOTATOR_MAP[pid]
            if mapped_id:
                annotator_id = mapped_id

        # Display name: curated mapping first, then API user map, then fallback.
        if annotator_id in ANNOTATOR_NAMES:
            annotator_name = ANNOTATOR_NAMES[annotator_id]
        elif annotator_id in user_map:
            annotator_name = user_map[annotator_id]
        else:
            annotator_name = f"User {annotator_id}" if annotator_id else "Unknown"

        # Pull the text_rating choice (first match wins).
        rating = None
        for item in ann.get("result", []):
            if item.get("type") == "choices" and item.get("from_name") == "text_rating":
                rating = item.get("value", {}).get("choices", [None])[0]
                break

        has_entities = any(i.get("type") == "labels" for i in ann.get("result", []))
        if rating is None:
            state = "No Rating"
        elif rating == "Requires Attention":
            state = f"ReqAttn ({'entities' if has_entities else 'empty'})"
        elif rating == "Unacceptable":
            state = f"Unacceptable ({'entities' if has_entities else 'empty'})"
        else:
            state = "Acceptable"

        rows.append(
            {
                "task_id": task.get("id"),
                "project_id": pid,
                "project": name,
                "project_group": group,
                "annotator": annotator_name,
                "annotator_email": annotator_email,
                "date": date,
                "state": state,
                "words": int(words),
                "category": category,
            }
        )

    return pid, task_count, submitted_count, rows, max_updated_at
273
 
 
 
 
 
 
 
 
 
 
274
 
275
def check_and_update(status_container):
    """Check for upstream changes and fetch only what changed.

    Compares each project's task/annotation counts against the disk cache:
    new projects and task-count changes get a full fetch; annotation-only
    changes get an incremental fetch merged over cached rows. Progress and
    status are rendered into `status_container`.

    Args:
        status_container: Streamlit container/placeholder for status output.

    Returns:
        True if the cache was updated, False otherwise (including on
        missing credentials).
    """
    url, key = _get_credentials()
    if not url or not key:
        status_container.error("Missing credentials. Set LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY.")
        return False

    headers = {"Authorization": f"Token {key}"}

    # Fetch project list to detect changes
    resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
    resp.raise_for_status()
    projects = resp.json().get("results", [])

    cache = _load_cache()
    user_map = fetch_users(url, key)

    def _describe(pid, reason):
        # Human-friendly label: annotator name for team projects, "#<id>" otherwise.
        if pid in PROJECT_ANNOTATOR_MAP:
            name = ANNOTATOR_NAMES.get(PROJECT_ANNOTATOR_MAP[pid], f"#{pid}")
            return f"{name} {reason}"
        return f"#{pid} ({reason})"

    projects_to_fetch = []  # (proj, since_date, cached_rows_or_None, reason)

    for proj in projects:
        pid = proj["id"]
        task_count = proj.get("task_number", 0)
        api_submitted_count = proj.get("num_tasks_with_annotations", 0)
        cache_key = f"project_{pid}"

        if cache_key not in cache:
            projects_to_fetch.append((proj, None, None, "new project"))
            continue

        cached = cache[cache_key]
        old_tasks = cached.get("task_count", 0)
        old_submitted = cached.get("submitted_count", 0)
        if old_tasks != task_count:
            # Tasks added/removed — full refetch required.
            diff = task_count - old_tasks
            projects_to_fetch.append((proj, None, None, f"{'+' if diff > 0 else ''}{diff} tasks"))
        elif old_submitted != api_submitted_count:
            diff = api_submitted_count - old_submitted
            reason = f"{'+' if diff > 0 else ''}{diff} annotations"
            last_updated = cached.get("last_updated")
            if last_updated:
                # Incremental: merge only tasks updated since last fetch.
                projects_to_fetch.append((proj, last_updated, cached["rows"], reason))
            else:
                # No timestamp recorded — full fetch.
                projects_to_fetch.append((proj, None, None, reason))
        # else: both counts match — cache is current for this project.

    total_fetches = len(projects_to_fetch)
    if total_fetches == 0:
        return False  # nothing changed

    summaries = [_describe(p[0]["id"], p[3]) for p in projects_to_fetch]
    status_container.info(f"Updating {total_fetches} project(s): {', '.join(summaries)}")

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for proj, since_date, cached_rows, reason in projects_to_fetch:
            api_sub = proj.get("num_tasks_with_annotations", 0)
            is_incremental = since_date is not None and cached_rows is not None
            futures.append((
                "incremental" if is_incremental else "full",
                executor.submit(fetch_project_data, proj, url, headers, user_map, since_date),
                cached_rows,
                proj["id"],
                reason,
                api_sub,
            ))

        progress = status_container.progress(0, text=f"Updating {total_fetches} projects...")
        for i, (mode, future, cached_rows, pid, reason, api_sub) in enumerate(futures):
            # Show what we're waiting on BEFORE blocking on the result
            # (previously the text appeared only after the fetch finished).
            progress.progress(i / total_fetches, text=f"Fetching: {_describe(pid, reason)}...")

            pid, task_count, _, rows, max_updated_at = future.result()

            if mode == "incremental" and cached_rows is not None:
                prev_timestamp = cache.get(f"project_{pid}", {}).get("last_updated")
                if rows:
                    # Merge: overwrite cached rows by task_id, keep the rest.
                    merged = {row["task_id"]: row for row in cached_rows}
                    for row in rows:
                        merged[row["task_id"]] = row
                    rows = list(merged.values())
                else:
                    rows = cached_rows
                # Keep the previous timestamp when nothing new arrived.
                max_updated_at = max_updated_at or prev_timestamp

            # Trust the API's submitted count — incremental rows are partial.
            cache[f"project_{pid}"] = {
                "task_count": task_count,
                "submitted_count": api_sub,
                "last_updated": max_updated_at,
                "rows": rows,
            }

            progress.progress((i + 1) / total_fetches, text=f"Done: {_describe(pid, reason)}")

    # Save updated timestamp
    cache["_last_checked"] = datetime.now().isoformat()
    _save_cache(cache)
    return True
402
 
403
 
404
  def anonymize(name):
 
414
  return name
415
 
416
 
417
# ============== Page layout ==============

st.title("📊 Annotation Progress Dashboard")

# Status bar placeholder at the very top (before any data)
status_bar = st.empty()

# Phase 1: load cached data instantly (no API calls).
df, cache_timestamp = load_df_from_cache()

if df is None:
    # No cache at all — must do a full fetch before we can show anything.
    status_bar.info("First load — fetching data from Label Studio...")
    check_and_update(status_bar)
    # Record this check so Phase 2 doesn't immediately re-run it
    # (previously the first load triggered two back-to-back checks).
    st.session_state["_last_update_check"] = datetime.now()
    df, cache_timestamp = load_df_from_cache()
    if df is None:
        st.error("Could not load any data. Check your Label Studio credentials.")
        st.stop()

# Phase 2: throttled update check (at most once every 5 minutes per session).
now = datetime.now()
last_check = st.session_state.get("_last_update_check")
needs_check = last_check is None or (now - last_check).total_seconds() > 300

if needs_check:
    updated = check_and_update(status_bar)
    st.session_state["_last_update_check"] = now
    if updated:
        status_bar.empty()
        st.rerun()  # rerun to display fresh data

# Show the "last updated" timestamp in the status bar.
if cache_timestamp:
    try:
        ts = pd.Timestamp(cache_timestamp)
        # Normalise to Vilnius time; naive timestamps are assumed to be UTC.
        if ts.tzinfo:
            ts = ts.tz_convert("Europe/Vilnius")
        else:
            ts = ts.tz_localize("UTC").tz_convert("Europe/Vilnius")
        updated_str = ts.strftime("%Y-%m-%d %H:%M")
    except Exception:
        updated_str = str(cache_timestamp)[:16]
    status_bar.caption(f"Last updated: {updated_str} | Auto-refresh: 5 min | Press 'R' to refresh")
else:
    status_bar.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M')} | Auto-refresh: 5 min | Press 'R' to refresh")

st.markdown("---")
465
 
466
+ # ============== Overview metrics ==============
467
  total = df[df["is_goal_state"]]["words"].sum()
468
  remaining = GOAL_WORDS - total
469
  progress = total / GOAL_WORDS * 100
 
748
  fig.update_yaxes(tickformat=".2s")
749
 
750
  st.plotly_chart(fig, use_container_width=True)
 
 
 
 
pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
7
  dependencies = [
8
  "pandas>=2.3.3",
9
  "plotly>=6.5.2",
 
10
  "requests>=2.32.5",
11
  "streamlit>=1.53.1",
12
  ]
 
7
  dependencies = [
8
  "pandas>=2.3.3",
9
  "plotly>=6.5.2",
10
+ "python-dotenv>=1.2.1",
11
  "requests>=2.32.5",
12
  "streamlit>=1.53.1",
13
  ]
uv.lock CHANGED
@@ -25,6 +25,7 @@ source = { virtual = "." }
25
  dependencies = [
26
  { name = "pandas" },
27
  { name = "plotly" },
 
28
  { name = "requests" },
29
  { name = "streamlit" },
30
  ]
@@ -33,6 +34,7 @@ dependencies = [
33
  requires-dist = [
34
  { name = "pandas", specifier = ">=2.3.3" },
35
  { name = "plotly", specifier = ">=6.5.2" },
 
36
  { name = "requests", specifier = ">=2.32.5" },
37
  { name = "streamlit", specifier = ">=1.53.1" },
38
  ]
@@ -577,6 +579,15 @@ wheels = [
577
  { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
578
  ]
579
 
 
 
 
 
 
 
 
 
 
580
  [[package]]
581
  name = "pytz"
582
  version = "2025.2"
 
25
  dependencies = [
26
  { name = "pandas" },
27
  { name = "plotly" },
28
+ { name = "python-dotenv" },
29
  { name = "requests" },
30
  { name = "streamlit" },
31
  ]
 
34
  requires-dist = [
35
  { name = "pandas", specifier = ">=2.3.3" },
36
  { name = "plotly", specifier = ">=6.5.2" },
37
+ { name = "python-dotenv", specifier = ">=1.2.1" },
38
  { name = "requests", specifier = ">=2.32.5" },
39
  { name = "streamlit", specifier = ">=1.53.1" },
40
  ]
 
579
  { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
580
  ]
581
 
582
+ [[package]]
583
+ name = "python-dotenv"
584
+ version = "1.2.1"
585
+ source = { registry = "https://pypi.org/simple" }
586
+ sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
587
+ wheels = [
588
+ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
589
+ ]
590
+
591
  [[package]]
592
  name = "pytz"
593
  version = "2025.2"