Gintarė Zokaitytė committed on
Commit
62e9aac
·
1 Parent(s): 2806d4d

cache fix

Browse files
Files changed (3) hide show
  1. app.py +328 -300
  2. pyproject.toml +1 -0
  3. uv.lock +11 -0
app.py CHANGED
@@ -1,12 +1,17 @@
 
1
  import re
2
  import os
3
  import pickle
 
4
  from pathlib import Path
5
  from concurrent.futures import ThreadPoolExecutor
6
  import streamlit as st
7
  import pandas as pd
8
  import plotly.graph_objects as go
9
  import requests
 
 
 
10
 
11
  GOAL_WORDS = 2_200_000
12
  CATEGORY_GOAL = 1_100_000
@@ -17,12 +22,12 @@ GOAL_STATES = ["Acceptable", "No Rating", "ReqAttn (entities)"]
17
 
18
  # Map project IDs to annotator IDs (for admin-created annotations)
19
  PROJECT_ANNOTATOR_MAP = {
20
- 29: 27,
21
  30: 28,
22
- 31: 29,
23
  32: 30,
24
- 33: 31,
25
- 37: 33,
26
  }
27
 
28
  ANNOTATOR_NAMES = {
@@ -48,12 +53,93 @@ TEAM_COLORS = {
48
  COLORS_BY_NAME = {ANNOTATOR_NAMES[aid]: color for aid, color in TEAM_COLORS.items() if aid in ANNOTATOR_NAMES}
49
 
50
  # Cache file location (persists between runs)
51
- CACHE_FILE = Path(".cache.pkl")
52
 
53
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
54
 
55
 
56
- @st.cache_data(ttl=3600) # Cache users for 1 hour (users rarely change)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  def fetch_users(url, key):
58
  """Fetch all users and create a mapping of user_id -> user_name."""
59
  try:
@@ -72,342 +158,247 @@ def fetch_users(url, key):
72
 
73
  return user_map
74
  except Exception:
75
- # If we can't fetch users, return empty map
76
  return {}
77
 
78
 
79
  def fetch_project_data(proj, url, headers, user_map, since_date=None):
80
- """Fetch data from one project (for parallel execution).
81
-
82
- Args:
83
- proj: Project dict from API
84
- url: Label Studio URL
85
- headers: Auth headers
86
- user_map: User ID to name mapping
87
- since_date: If provided, only fetch tasks updated after this ISO datetime string
88
- """
89
  pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
90
  group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"
91
 
 
 
 
 
 
 
 
 
 
92
  rows = []
93
- submitted_count = 0 # Track submitted (annotated) tasks
94
- max_updated_at = since_date # Track the latest updated_at we see
95
- page = 1
96
-
97
- # Build query filter for incremental updates
98
- params = {"page": page, "page_size": 100}
99
- if since_date:
100
- import json
101
- query = {
102
- "filters": {
103
- "conjunction": "and",
104
- "items": [{
105
- "filter": "filter:tasks:updated_at",
106
- "operator": "greater",
107
- "type": "Datetime",
108
- "value": since_date
109
- }]
110
- }
111
- }
112
- params["query"] = json.dumps(query)
113
- print(f"[DEBUG] Incremental update for project {pid} since {since_date}")
114
 
115
- while True:
116
- params["page"] = page
117
- resp = requests.get(f"{url}/api/projects/{pid}/tasks", headers=headers, params=params, timeout=30)
118
- resp.raise_for_status()
119
- data = resp.json()
120
- tasks = data if isinstance(data, list) else data.get("tasks", [])
121
-
122
- if not tasks:
123
- break
124
-
125
- for task in tasks:
126
- # Track the latest updated_at timestamp
127
- task_updated = task.get("updated_at")
128
- if task_updated and (not max_updated_at or task_updated > max_updated_at):
129
- max_updated_at = task_updated
130
-
131
- task_data = task.get("data", {})
132
- words = task_data.get("words") or len(task_data.get("text", "").split())
133
- category = task_data.get("category")
134
-
135
- annots = [a for a in task.get("annotations", []) if not a.get("was_cancelled")]
136
- if not annots:
137
- rows.append(
138
- {
139
- "task_id": task.get("id"), # Add task_id for merging updates
140
- "project_id": pid,
141
- "project": name,
142
- "project_group": group,
143
- "annotator": None,
144
- "annotator_email": None,
145
- "date": None,
146
- "state": "Not Annotated",
147
- "words": int(words),
148
- "category": category,
149
- }
150
- )
151
- continue
152
-
153
- # Task has annotations - count as submitted
154
- submitted_count += 1
155
-
156
- ann = annots[0]
157
- date = ann.get("created_at", "")[:10] or None
158
-
159
- # Extract annotator info
160
- # completed_by can be either a user ID (int) or a user object (dict)
161
- completed_by = ann.get("completed_by")
162
-
163
- if isinstance(completed_by, dict):
164
- # Full user object
165
- annotator_id = completed_by.get("id")
166
- annotator_email = completed_by.get("email", "Unknown")
167
- elif isinstance(completed_by, int):
168
- # Just a user ID
169
- annotator_id = completed_by
170
- annotator_email = f"user_{completed_by}"
171
- else:
172
- # No completed_by info
173
- annotator_id = None
174
- annotator_email = "unknown"
175
-
176
- # Backward compatibility: if admin annotated a team project, use project's default annotator
177
- if group == "Our Team" and annotator_id == 1 and pid in PROJECT_ANNOTATOR_MAP:
178
- mapped_id = PROJECT_ANNOTATOR_MAP[pid]
179
- if mapped_id:
180
- annotator_id = mapped_id
181
-
182
- # Get display name from ANNOTATOR_NAMES mapping (or fallback to user_map)
183
- if annotator_id in ANNOTATOR_NAMES:
184
- annotator_name = ANNOTATOR_NAMES[annotator_id]
185
- elif annotator_id in user_map:
186
- annotator_name = user_map[annotator_id]
187
- else:
188
- annotator_name = f"User {annotator_id}" if annotator_id else "Unknown"
189
-
190
- rating = None
191
- for item in ann.get("result", []):
192
- if item.get("type") == "choices" and item.get("from_name") == "text_rating":
193
- rating = item.get("value", {}).get("choices", [None])[0]
194
- break
195
-
196
- has_entities = any(i.get("type") == "labels" for i in ann.get("result", []))
197
- if rating is None:
198
- state = "No Rating"
199
- elif rating == "Requires Attention":
200
- state = f"ReqAttn ({'entities' if has_entities else 'empty'})"
201
- elif rating == "Unacceptable":
202
- state = f"Unacceptable ({'entities' if has_entities else 'empty'})"
203
- else:
204
- state = "Acceptable"
205
 
 
 
 
 
 
 
206
  rows.append(
207
  {
208
- "task_id": task.get("id"), # Add task_id for merging updates
209
  "project_id": pid,
210
  "project": name,
211
  "project_group": group,
212
- "annotator": annotator_name,
213
- "annotator_email": annotator_email,
214
- "date": date,
215
- "state": state,
216
  "words": int(words),
217
  "category": category,
218
  }
219
  )
 
220
 
221
- if isinstance(data, list) and len(data) < 100:
222
- break
223
- if isinstance(data, dict) and not data.get("next"):
224
- break
225
- page += 1
226
 
227
- return pid, task_count, submitted_count, rows, max_updated_at
 
228
 
 
229
 
230
- @st.cache_data(ttl=120) # Auto-refresh every 120 seconds (2 minutes)
231
- def load_data(projects_hash):
232
- """Load annotation data from Label Studio with disk cache.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
233
 
234
- Args:
235
- projects_hash: Hash of project states to invalidate Streamlit cache when projects change
236
- """
237
- try:
238
- url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
239
- key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
240
- except (KeyError, FileNotFoundError, AttributeError):
241
- url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
242
- key = os.getenv("LABEL_STUDIO_API_KEY", "")
243
 
 
 
 
244
  if not url or not key:
245
- st.error("Missing credentials. Set LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY.")
246
- st.stop()
247
 
248
  headers = {"Authorization": f"Token {key}"}
249
 
250
- # Fetch all users first to map user IDs to names (cached for 1 hour)
251
- user_map = fetch_users(url, key)
252
-
253
- # Fetch all projects
254
  resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
255
  resp.raise_for_status()
256
  projects = resp.json().get("results", [])
257
 
258
- # Load cache
259
- cache = {}
260
- if CACHE_FILE.exists():
261
- try:
262
- with open(CACHE_FILE, "rb") as f:
263
- cache = pickle.load(f)
264
- except Exception:
265
- cache = {}
266
 
267
- # Check which projects need updating
268
- projects_to_fetch = []
269
- projects_to_update_incrementally = []
270
- all_rows = []
271
 
272
  for proj in projects:
273
  pid = proj["id"]
 
274
  task_count = proj.get("task_number", 0)
275
- # Get submitted task count from Label Studio API
276
  api_submitted_count = proj.get("num_tasks_with_annotations", 0)
277
-
278
  cache_key = f"project_{pid}"
279
 
280
- # Decide caching strategy:
281
- # 1. No cache exists → full fetch
282
- # 2. Task count changed → full fetch (tasks added/removed)
283
- # 3. Submitted count changed + have last_updated → incremental update
284
- # 4. Both counts match → use cache
285
  if cache_key not in cache:
286
- # No cache - need full fetch
287
- projects_to_fetch.append((proj, None))
288
  else:
289
  cached = cache[cache_key]
290
- if cached.get("task_count") != task_count:
291
- # Task count changed - full fetch required
292
- projects_to_fetch.append((proj, None))
293
- elif cached.get("submitted_count") != api_submitted_count:
294
- # Annotations changed - try incremental update if we have a timestamp
 
 
 
295
  last_updated = cached.get("last_updated")
296
  if last_updated:
297
- # Incremental update: fetch only changed tasks
298
- projects_to_update_incrementally.append((proj, last_updated, cached["rows"]))
299
  else:
300
- # No timestamp - full fetch
301
- projects_to_fetch.append((proj, None))
302
  else:
303
- # Both counts match - use cache
304
- all_rows.extend(cached["rows"])
305
-
306
- # Fetch updated projects in parallel
307
- total_fetches = len(projects_to_fetch) + len(projects_to_update_incrementally)
308
-
309
- if total_fetches > 0:
310
- with ThreadPoolExecutor(max_workers=10) as executor:
311
- futures = []
312
-
313
- # Submit full fetches
314
- for proj, _ in projects_to_fetch:
315
- futures.append(("full", executor.submit(fetch_project_data, proj, url, headers, user_map, None)))
316
-
317
- # Submit incremental updates
318
- for proj, since_date, cached_rows in projects_to_update_incrementally:
319
- futures.append(("incremental", executor.submit(fetch_project_data, proj, url, headers, user_map, since_date), cached_rows))
320
-
321
- progress = st.progress(0, text=f"Loading {total_fetches} projects...")
322
- for i, future_info in enumerate(futures):
323
- if future_info[0] == "full":
324
- _, future = future_info
325
- pid, task_count, submitted_count, rows, max_updated_at = future.result()
326
- all_rows.extend(rows)
327
- cache[f"project_{pid}"] = {
328
- "task_count": task_count,
329
- "submitted_count": submitted_count,
330
- "last_updated": max_updated_at,
331
- "rows": rows
332
- }
333
- else: # incremental
334
- _, future, cached_rows = future_info
335
- pid, task_count, submitted_count, updated_rows, max_updated_at = future.result()
336
-
337
- # Get the previous timestamp from cache
338
- prev_timestamp = cache.get(f"project_{pid}", {}).get("last_updated")
339
-
340
- # Merge: update existing tasks or add new ones
341
- if updated_rows:
342
- # Create a dict of cached tasks by task_id for fast lookup
343
- cached_by_id = {row["task_id"]: row for row in cached_rows}
344
-
345
- # Update with new data
346
- for row in updated_rows:
347
- cached_by_id[row["task_id"]] = row
348
-
349
- # Convert back to list
350
- merged_rows = list(cached_by_id.values())
351
- else:
352
- # No updates, use cached rows
353
- merged_rows = cached_rows
354
-
355
- all_rows.extend(merged_rows)
356
- cache[f"project_{pid}"] = {
357
- "task_count": task_count,
358
- "submitted_count": submitted_count,
359
- "last_updated": max_updated_at or prev_timestamp, # Keep previous if no new updates
360
- "rows": merged_rows
361
- }
362
-
363
- progress.progress((i + 1) / total_fetches, text=f"Loaded {i + 1}/{total_fetches} projects")
364
- progress.empty()
365
-
366
- # Save cache
367
- try:
368
- with open(CACHE_FILE, "wb") as f:
369
- pickle.dump(cache, f)
370
- except Exception:
371
- pass
372
-
373
- # Create dataframe
374
- df = pd.DataFrame(all_rows)
375
- df["words"] = df["words"].astype(int)
376
- df["date"] = pd.to_datetime(df["date"], errors="coerce")
377
- df["is_annotated"] = df["state"].isin(ANNOTATED_STATES)
378
- df["is_goal_state"] = df["state"].isin(GOAL_STATES)
379
-
380
- return df
381
-
382
-
383
- def get_projects_hash():
384
- """Fetch projects and return a hash of their states for cache invalidation."""
385
- import hashlib
386
-
387
- try:
388
- url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
389
- key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
390
- except (KeyError, FileNotFoundError, AttributeError):
391
- url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
392
- key = os.getenv("LABEL_STUDIO_API_KEY", "")
393
 
394
- if not url or not key:
395
- return "no-credentials"
396
 
397
- headers = {"Authorization": f"Token {key}"}
398
- resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
399
- resp.raise_for_status()
400
- projects = resp.json().get("results", [])
 
 
 
 
 
 
 
 
 
 
 
 
 
401
 
402
- # Create hash from project states (id, task_number, num_tasks_with_annotations)
403
- state_string = ""
404
- for proj in projects:
405
- pid = proj["id"]
406
- task_count = proj.get("task_number", 0)
407
- submitted_count = proj.get("num_tasks_with_annotations", 0)
408
- state_string += f"{pid}:{task_count}:{submitted_count};"
409
 
410
- return hashlib.md5(state_string.encode()).hexdigest()
 
 
 
411
 
412
 
413
  def anonymize(name):
@@ -423,15 +414,56 @@ def anonymize(name):
423
  return name
424
 
425
 
 
 
426
  st.title("📊 Annotation Progress Dashboard")
427
- st.markdown("---")
428
 
429
- # Load data
430
- with st.spinner("Loading..."):
431
- projects_hash = get_projects_hash()
432
- df = load_data(projects_hash)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
433
 
434
- # Overview metrics
435
  total = df[df["is_goal_state"]]["words"].sum()
436
  remaining = GOAL_WORDS - total
437
  progress = total / GOAL_WORDS * 100
@@ -716,7 +748,3 @@ fig.update_layout(title=title, xaxis_title="Date", yaxis_title="Cumulative Words
716
  fig.update_yaxes(tickformat=".2s")
717
 
718
  st.plotly_chart(fig, use_container_width=True)
719
-
720
- # Footer
721
- st.markdown("---")
722
- st.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M:%S')} | Auto-refresh: 2 min | Press 'R' to refresh")
 
1
+ import gzip
2
  import re
3
  import os
4
  import pickle
5
+ from datetime import datetime
6
  from pathlib import Path
7
  from concurrent.futures import ThreadPoolExecutor
8
  import streamlit as st
9
  import pandas as pd
10
  import plotly.graph_objects as go
11
  import requests
12
+ from dotenv import load_dotenv
13
+
14
+ load_dotenv()
15
 
16
  GOAL_WORDS = 2_200_000
17
  CATEGORY_GOAL = 1_100_000
 
22
 
23
  # Map project IDs to annotator IDs (for admin-created annotations)
24
  PROJECT_ANNOTATOR_MAP = {
25
+ 29: 27,
26
  30: 28,
27
+ 31: 29,
28
  32: 30,
29
+ 33: 31,
30
+ 37: 33,
31
  }
32
 
33
  ANNOTATOR_NAMES = {
 
53
  COLORS_BY_NAME = {ANNOTATOR_NAMES[aid]: color for aid, color in TEAM_COLORS.items() if aid in ANNOTATOR_NAMES}
54
 
55
  # Cache file location (persists between runs)
56
+ CACHE_FILE = Path(".cache.pkl.gz")
57
 
58
  st.set_page_config(page_title="Annotation Progress", page_icon="📊", layout="wide")
59
 
60
 
61
+ # ============== Data layer ==============
62
+
63
def _get_credentials():
    """Resolve the Label Studio base URL and API key.

    Streamlit secrets take priority; environment variables are the
    fallback both as `st.secrets.get` defaults and when no secrets
    file exists at all.
    """
    try:
        base_url = st.secrets.get("LABEL_STUDIO_URL", os.getenv("LABEL_STUDIO_URL", "")).rstrip("/")
        api_key = st.secrets.get("LABEL_STUDIO_API_KEY", os.getenv("LABEL_STUDIO_API_KEY", ""))
    except (KeyError, FileNotFoundError, AttributeError):
        # No secrets.toml (or secrets object misbehaving) — env vars only.
        base_url = os.getenv("LABEL_STUDIO_URL", "").rstrip("/")
        api_key = os.getenv("LABEL_STUDIO_API_KEY", "")
    return base_url, api_key
72
+
73
+
74
def _load_cache():
    """Read the persisted cache, migrating any legacy uncompressed file.

    Returns the cached dict, or an empty dict when nothing readable
    exists; never raises.
    """
    if CACHE_FILE.exists():
        try:
            with gzip.open(CACHE_FILE, "rb") as fh:
                return pickle.load(fh)
        except Exception:
            pass  # unreadable/corrupt — fall through to the legacy file
    # Migration path: pick up the old uncompressed cache if present.
    legacy = Path(".cache.pkl")
    if legacy.exists():
        try:
            with open(legacy, "rb") as fh:
                migrated = pickle.load(fh)
            _save_cache(migrated)  # re-persist in the new gzip format
            legacy.unlink()
            return migrated
        except Exception:
            pass
    return {}
94
+
95
+
96
def _save_cache(cache):
    """Persist the cache dict as a gzip-compressed pickle.

    Best-effort: any failure is swallowed — a missed save only means
    a slower next load, not an error for the user.
    """
    try:
        with gzip.open(CACHE_FILE, "wb") as fh:
            pickle.dump(cache, fh)
    except Exception:
        pass
103
+
104
+
105
def _build_df(all_rows):
    """Build the annotation DataFrame from cached row dicts.

    Args:
        all_rows: List of row dicts as produced by `fetch_project_data`.

    Returns:
        DataFrame with typed `words`/`date` columns and derived boolean
        `is_annotated`/`is_goal_state` flags. The same transformations are
        applied to the empty case so dtypes and columns are consistent
        whether or not rows exist (previously the empty frame kept
        object-dtype columns).
    """
    columns = [
        "task_id", "project_id", "project", "project_group",
        "annotator", "annotator_email", "date", "state", "words", "category",
    ]
    df = pd.DataFrame(all_rows) if all_rows else pd.DataFrame(columns=columns)
    df["words"] = df["words"].astype(int)
    df["date"] = pd.to_datetime(df["date"], errors="coerce")
    df["is_annotated"] = df["state"].isin(ANNOTATED_STATES)
    df["is_goal_state"] = df["state"].isin(GOAL_STATES)
    return df
119
+
120
+
121
def load_df_from_cache():
    """Build the DataFrame purely from the disk cache — no API calls.

    Returns:
        (df, last_updated) where `last_updated` is the newest per-project
        timestamp seen, or (None, None) when nothing usable is cached.
    """
    cache = _load_cache()
    if not cache:
        return None, None

    rows = []
    newest = None
    for key, entry in cache.items():
        if not key.startswith("project_"):
            continue  # skip bookkeeping keys like "_last_checked"
        rows.extend(entry.get("rows", []))
        stamp = entry.get("last_updated")
        if stamp and (newest is None or stamp > newest):
            newest = stamp

    if not rows:
        return None, None
    return _build_df(rows), newest
140
+
141
+
142
+ @st.cache_data(ttl=3600)
143
  def fetch_users(url, key):
144
  """Fetch all users and create a mapping of user_id -> user_name."""
145
  try:
 
158
 
159
  return user_map
160
  except Exception:
 
161
  return {}
162
 
163
 
164
def fetch_project_data(proj, url, headers, user_map, since_date=None):
    """Fetch annotation rows for one project via the export API (excludes predictions).

    Args:
        proj: Project dict from the Label Studio projects API.
        url: Base Label Studio URL (no trailing slash).
        headers: Auth headers dict.
        user_map: user_id -> display-name mapping used as a naming fallback.
        since_date: ISO datetime string; when given, tasks updated at or
            before it are skipped (incremental mode). The export still
            downloads all tasks — filtering happens client-side.

    Returns:
        Tuple of (project_id, task_count, submitted_count, rows, max_updated_at).
    """
    pid, name, task_count = proj["id"], proj.get("title", f"Project {proj['id']}"), proj.get("task_number", 0)
    group = "Our Team" if pid in OUR_TEAM_PROJECT_IDS else "Others"

    # Export endpoint returns every task in a single response (no pagination).
    resp = requests.get(
        f"{url}/api/projects/{pid}/export",
        headers=headers,
        params={"exportType": "JSON", "download_all_tasks": "true"},
        timeout=60,
    )
    resp.raise_for_status()
    tasks = resp.json()

    rows = []
    submitted_count = 0
    max_updated_at = since_date  # newest updated_at seen across all tasks

    for task in tasks:
        task_updated = task.get("updated_at")
        if task_updated and (not max_updated_at or task_updated > max_updated_at):
            max_updated_at = task_updated

        # Incremental mode: skip tasks unchanged since the last fetch.
        if since_date and task_updated and task_updated <= since_date:
            continue

        task_data = task.get("data", {})
        words = task_data.get("words") or len(task_data.get("text", "").split())
        category = task_data.get("category")

        annots = [a for a in task.get("annotations", []) if not a.get("was_cancelled")]
        if not annots:
            rows.append(
                {
                    "task_id": task.get("id"),
                    "project_id": pid,
                    "project": name,
                    "project_group": group,
                    "annotator": None,
                    "annotator_email": None,
                    "date": None,
                    "state": "Not Annotated",
                    "words": int(words),
                    "category": category,
                }
            )
            continue

        # Task has at least one live annotation — counts as submitted.
        submitted_count += 1

        ann = annots[0]
        # Guard: created_at may be missing OR explicitly None (None[:10] would crash).
        date = (ann.get("created_at") or "")[:10] or None

        # completed_by may be a full user object (dict) or just a user ID (int).
        completed_by = ann.get("completed_by")
        if isinstance(completed_by, dict):
            annotator_id = completed_by.get("id")
            annotator_email = completed_by.get("email", "Unknown")
        elif isinstance(completed_by, int):
            annotator_id = completed_by
            annotator_email = f"user_{completed_by}"
        else:
            annotator_id = None
            annotator_email = "unknown"

        # Backward compatibility: admin (user 1) annotations on team projects
        # are attributed to that project's default annotator.
        if group == "Our Team" and annotator_id == 1 and pid in PROJECT_ANNOTATOR_MAP:
            mapped_id = PROJECT_ANNOTATOR_MAP[pid]
            if mapped_id:
                annotator_id = mapped_id

        # Display name: curated mapping first, then API user map, then fallback.
        if annotator_id in ANNOTATOR_NAMES:
            annotator_name = ANNOTATOR_NAMES[annotator_id]
        elif annotator_id in user_map:
            annotator_name = user_map[annotator_id]
        else:
            annotator_name = f"User {annotator_id}" if annotator_id else "Unknown"

        # Pull the text_rating choice (first match wins).
        rating = None
        for item in ann.get("result", []):
            if item.get("type") == "choices" and item.get("from_name") == "text_rating":
                rating = item.get("value", {}).get("choices", [None])[0]
                break

        has_entities = any(i.get("type") == "labels" for i in ann.get("result", []))
        if rating is None:
            state = "No Rating"
        elif rating == "Requires Attention":
            state = f"ReqAttn ({'entities' if has_entities else 'empty'})"
        elif rating == "Unacceptable":
            state = f"Unacceptable ({'entities' if has_entities else 'empty'})"
        else:
            state = "Acceptable"

        rows.append(
            {
                "task_id": task.get("id"),
                "project_id": pid,
                "project": name,
                "project_group": group,
                "annotator": annotator_name,
                "annotator_email": annotator_email,
                "date": date,
                "state": state,
                "words": int(words),
                "category": category,
            }
        )

    return pid, task_count, submitted_count, rows, max_updated_at
273
 
 
 
 
 
 
 
 
 
 
274
 
275
def check_and_update(status_container):
    """Check for upstream changes and fetch only what changed.

    Compares each project's task/annotation counts against the disk cache:
    new projects and task-count changes get a full fetch; annotation-only
    changes get an incremental fetch merged over cached rows. Progress and
    status are rendered into `status_container`.

    Args:
        status_container: Streamlit container/placeholder for status output.

    Returns:
        True if the cache was updated, False otherwise (including on
        missing credentials).
    """
    url, key = _get_credentials()
    if not url or not key:
        status_container.error("Missing credentials. Set LABEL_STUDIO_URL and LABEL_STUDIO_API_KEY.")
        return False

    headers = {"Authorization": f"Token {key}"}

    # Fetch project list to detect changes
    resp = requests.get(f"{url}/api/projects", headers=headers, timeout=30)
    resp.raise_for_status()
    projects = resp.json().get("results", [])

    cache = _load_cache()
    user_map = fetch_users(url, key)

    def _describe(pid, reason):
        # Human-friendly label: annotator name for team projects, "#<id>" otherwise.
        if pid in PROJECT_ANNOTATOR_MAP:
            name = ANNOTATOR_NAMES.get(PROJECT_ANNOTATOR_MAP[pid], f"#{pid}")
            return f"{name} {reason}"
        return f"#{pid} ({reason})"

    projects_to_fetch = []  # (proj, since_date, cached_rows_or_None, reason)

    for proj in projects:
        pid = proj["id"]
        task_count = proj.get("task_number", 0)
        api_submitted_count = proj.get("num_tasks_with_annotations", 0)
        cache_key = f"project_{pid}"

        if cache_key not in cache:
            projects_to_fetch.append((proj, None, None, "new project"))
            continue

        cached = cache[cache_key]
        old_tasks = cached.get("task_count", 0)
        old_submitted = cached.get("submitted_count", 0)
        if old_tasks != task_count:
            # Tasks added/removed — full refetch required.
            diff = task_count - old_tasks
            projects_to_fetch.append((proj, None, None, f"{'+' if diff > 0 else ''}{diff} tasks"))
        elif old_submitted != api_submitted_count:
            diff = api_submitted_count - old_submitted
            reason = f"{'+' if diff > 0 else ''}{diff} annotations"
            last_updated = cached.get("last_updated")
            if last_updated:
                # Incremental: merge only tasks updated since last fetch.
                projects_to_fetch.append((proj, last_updated, cached["rows"], reason))
            else:
                # No timestamp recorded — full fetch.
                projects_to_fetch.append((proj, None, None, reason))
        # else: both counts match — cache is current for this project.

    total_fetches = len(projects_to_fetch)
    if total_fetches == 0:
        return False  # nothing changed

    summaries = [_describe(p[0]["id"], p[3]) for p in projects_to_fetch]
    status_container.info(f"Updating {total_fetches} project(s): {', '.join(summaries)}")

    with ThreadPoolExecutor(max_workers=10) as executor:
        futures = []
        for proj, since_date, cached_rows, reason in projects_to_fetch:
            api_sub = proj.get("num_tasks_with_annotations", 0)
            is_incremental = since_date is not None and cached_rows is not None
            futures.append((
                "incremental" if is_incremental else "full",
                executor.submit(fetch_project_data, proj, url, headers, user_map, since_date),
                cached_rows,
                proj["id"],
                reason,
                api_sub,
            ))

        progress = status_container.progress(0, text=f"Updating {total_fetches} projects...")
        for i, (mode, future, cached_rows, pid, reason, api_sub) in enumerate(futures):
            # Show what we're waiting on BEFORE blocking on the result
            # (previously the text appeared only after the fetch finished).
            progress.progress(i / total_fetches, text=f"Fetching: {_describe(pid, reason)}...")

            pid, task_count, _, rows, max_updated_at = future.result()

            if mode == "incremental" and cached_rows is not None:
                prev_timestamp = cache.get(f"project_{pid}", {}).get("last_updated")
                if rows:
                    # Merge: overwrite cached rows by task_id, keep the rest.
                    merged = {row["task_id"]: row for row in cached_rows}
                    for row in rows:
                        merged[row["task_id"]] = row
                    rows = list(merged.values())
                else:
                    rows = cached_rows
                # Keep the previous timestamp when nothing new arrived.
                max_updated_at = max_updated_at or prev_timestamp

            # Trust the API's submitted count — incremental rows are partial.
            cache[f"project_{pid}"] = {
                "task_count": task_count,
                "submitted_count": api_sub,
                "last_updated": max_updated_at,
                "rows": rows,
            }

            progress.progress((i + 1) / total_fetches, text=f"Done: {_describe(pid, reason)}")

    # Save updated timestamp
    cache["_last_checked"] = datetime.now().isoformat()
    _save_cache(cache)
    return True
402
 
403
 
404
  def anonymize(name):
 
414
  return name
415
 
416
 
417
# ============== Page layout ==============

st.title("📊 Annotation Progress Dashboard")

# Status bar placeholder at the very top (before any data)
status_bar = st.empty()

# Phase 1: load cached data instantly (no API calls).
df, cache_timestamp = load_df_from_cache()

if df is None:
    # No cache at all — must do a full fetch before we can show anything.
    status_bar.info("First load — fetching data from Label Studio...")
    check_and_update(status_bar)
    # Record this check so Phase 2 doesn't immediately re-run it
    # (previously the first load triggered two back-to-back checks).
    st.session_state["_last_update_check"] = datetime.now()
    df, cache_timestamp = load_df_from_cache()
    if df is None:
        st.error("Could not load any data. Check your Label Studio credentials.")
        st.stop()

# Phase 2: throttled update check (at most once every 5 minutes per session).
now = datetime.now()
last_check = st.session_state.get("_last_update_check")
needs_check = last_check is None or (now - last_check).total_seconds() > 300

if needs_check:
    updated = check_and_update(status_bar)
    st.session_state["_last_update_check"] = now
    if updated:
        status_bar.empty()
        st.rerun()  # rerun to display fresh data

# Show the "last updated" timestamp in the status bar.
if cache_timestamp:
    try:
        ts = pd.Timestamp(cache_timestamp)
        # Normalise to Vilnius time; naive timestamps are assumed to be UTC.
        if ts.tzinfo:
            ts = ts.tz_convert("Europe/Vilnius")
        else:
            ts = ts.tz_localize("UTC").tz_convert("Europe/Vilnius")
        updated_str = ts.strftime("%Y-%m-%d %H:%M")
    except Exception:
        updated_str = str(cache_timestamp)[:16]
    status_bar.caption(f"Last updated: {updated_str} | Auto-refresh: 5 min | Press 'R' to refresh")
else:
    status_bar.caption(f"Updated: {pd.Timestamp.now(tz='Europe/Vilnius').strftime('%Y-%m-%d %H:%M')} | Auto-refresh: 5 min | Press 'R' to refresh")

st.markdown("---")
465
 
466
+ # ============== Overview metrics ==============
467
  total = df[df["is_goal_state"]]["words"].sum()
468
  remaining = GOAL_WORDS - total
469
  progress = total / GOAL_WORDS * 100
 
748
  fig.update_yaxes(tickformat=".2s")
749
 
750
  st.plotly_chart(fig, use_container_width=True)
 
 
 
 
pyproject.toml CHANGED
@@ -7,6 +7,7 @@ requires-python = ">=3.12"
7
  dependencies = [
8
  "pandas>=2.3.3",
9
  "plotly>=6.5.2",
 
10
  "requests>=2.32.5",
11
  "streamlit>=1.53.1",
12
  ]
 
7
  dependencies = [
8
  "pandas>=2.3.3",
9
  "plotly>=6.5.2",
10
+ "python-dotenv>=1.2.1",
11
  "requests>=2.32.5",
12
  "streamlit>=1.53.1",
13
  ]
uv.lock CHANGED
@@ -25,6 +25,7 @@ source = { virtual = "." }
25
  dependencies = [
26
  { name = "pandas" },
27
  { name = "plotly" },
 
28
  { name = "requests" },
29
  { name = "streamlit" },
30
  ]
@@ -33,6 +34,7 @@ dependencies = [
33
  requires-dist = [
34
  { name = "pandas", specifier = ">=2.3.3" },
35
  { name = "plotly", specifier = ">=6.5.2" },
 
36
  { name = "requests", specifier = ">=2.32.5" },
37
  { name = "streamlit", specifier = ">=1.53.1" },
38
  ]
@@ -577,6 +579,15 @@ wheels = [
577
  { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
578
  ]
579
 
 
 
 
 
 
 
 
 
 
580
  [[package]]
581
  name = "pytz"
582
  version = "2025.2"
 
25
  dependencies = [
26
  { name = "pandas" },
27
  { name = "plotly" },
28
+ { name = "python-dotenv" },
29
  { name = "requests" },
30
  { name = "streamlit" },
31
  ]
 
34
  requires-dist = [
35
  { name = "pandas", specifier = ">=2.3.3" },
36
  { name = "plotly", specifier = ">=6.5.2" },
37
+ { name = "python-dotenv", specifier = ">=1.2.1" },
38
  { name = "requests", specifier = ">=2.32.5" },
39
  { name = "streamlit", specifier = ">=1.53.1" },
40
  ]
 
579
  { url = "https://files.pythonhosted.org/packages/ec/57/56b9bcc3c9c6a792fcbaf139543cee77261f3651ca9da0c93f5c1221264b/python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427", size = 229892, upload-time = "2024-03-01T18:36:18.57Z" },
580
  ]
581
 
582
+ [[package]]
583
+ name = "python-dotenv"
584
+ version = "1.2.1"
585
+ source = { registry = "https://pypi.org/simple" }
586
+ sdist = { url = "https://files.pythonhosted.org/packages/f0/26/19cadc79a718c5edbec86fd4919a6b6d3f681039a2f6d66d14be94e75fb9/python_dotenv-1.2.1.tar.gz", hash = "sha256:42667e897e16ab0d66954af0e60a9caa94f0fd4ecf3aaf6d2d260eec1aa36ad6", size = 44221, upload-time = "2025-10-26T15:12:10.434Z" }
587
+ wheels = [
588
+ { url = "https://files.pythonhosted.org/packages/14/1b/a298b06749107c305e1fe0f814c6c74aea7b2f1e10989cb30f544a1b3253/python_dotenv-1.2.1-py3-none-any.whl", hash = "sha256:b81ee9561e9ca4004139c6cbba3a238c32b03e4894671e181b671e8cb8425d61", size = 21230, upload-time = "2025-10-26T15:12:09.109Z" },
589
+ ]
590
+
591
  [[package]]
592
  name = "pytz"
593
  version = "2025.2"