raylim Claude Opus 4.6 committed on
Commit
e297f75
·
1 Parent(s): 2e7e79a

feat: add cache-hit tracking to telemetry

Browse files

Add from_cache boolean to UsageEvent so telemetry distinguishes cache
hits from fresh GPU analyses. Cached analyses are excluded from session
analysis_time_sec and from the report's average duration to avoid
skewing usage metrics.

- Add from_cache to UsageEvent dataclass
- Add from_cache parameter to TelemetryTracker.log_usage_event()
- Track and report cache hits in analyze_slides()
- Update telemetry report (text + HTML) with cached analysis count
- Add tests for all new functionality

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>

scripts/telemetry_report.py CHANGED
@@ -204,9 +204,13 @@ def generate_text_report(telemetry_dir: Path, date: Optional[str] = None) -> str
204
  set(u.get("session_hash") for u in usage if u.get("session_hash"))
205
  )
206
 
207
- # Calculate average duration
 
 
 
 
208
  durations = [
209
- c.get("duration_sec", 0) for c in completes if c.get("duration_sec")
210
  ]
211
  avg_duration = sum(durations) / len(durations) if durations else 0
212
 
@@ -215,6 +219,8 @@ def generate_text_report(telemetry_dir: Path, date: Optional[str] = None) -> str
215
  lines.append(f"Analyses completed: {len(completes)}")
216
  lines.append(f"Successful analyses: {len(successful)}")
217
  lines.append(f"Total slides processed: {total_slides}")
 
 
218
  lines.append(f"Unique sessions: {unique_sessions}")
219
  if avg_duration > 0:
220
  lines.append(f"Average analysis duration: {avg_duration:.1f}s")
@@ -431,6 +437,9 @@ def generate_html_report(telemetry_dir: Path, date: Optional[str] = None) -> str
431
  set(u.get("session_hash") for u in usage if u.get("session_hash"))
432
  )
433
 
 
 
 
434
  html.append("<h2>Usage Summary</h2>")
435
  html.append("<table>")
436
  html.append(f"<tr><td>Analyses started</td><td>{len(starts)}</td></tr>")
@@ -439,6 +448,10 @@ def generate_html_report(telemetry_dir: Path, date: Optional[str] = None) -> str
439
  f"<tr><td>Successful analyses</td><td class='success'>{len(successful)}</td></tr>"
440
  )
441
  html.append(f"<tr><td>Total slides</td><td>{total_slides}</td></tr>")
 
 
 
 
442
  html.append(f"<tr><td>Unique sessions</td><td>{unique_sessions}</td></tr>")
443
  html.append("</table>")
444
 
 
204
  set(u.get("session_hash") for u in usage if u.get("session_hash"))
205
  )
206
 
207
+ # Cache hit tracking
208
+ cached_analyses = [c for c in completes if c.get("from_cache")]
209
+ fresh_analyses = [c for c in completes if not c.get("from_cache")]
210
+
211
+ # Calculate average duration (excluding cached analyses)
212
  durations = [
213
+ c.get("duration_sec", 0) for c in fresh_analyses if c.get("duration_sec")
214
  ]
215
  avg_duration = sum(durations) / len(durations) if durations else 0
216
 
 
219
  lines.append(f"Analyses completed: {len(completes)}")
220
  lines.append(f"Successful analyses: {len(successful)}")
221
  lines.append(f"Total slides processed: {total_slides}")
222
+ if cached_analyses:
223
+ lines.append(f"Cached analyses: {len(cached_analyses)}")
224
  lines.append(f"Unique sessions: {unique_sessions}")
225
  if avg_duration > 0:
226
  lines.append(f"Average analysis duration: {avg_duration:.1f}s")
 
437
  set(u.get("session_hash") for u in usage if u.get("session_hash"))
438
  )
439
 
440
+ # Cache hit tracking
441
+ cached_analyses = [c for c in completes if c.get("from_cache")]
442
+
443
  html.append("<h2>Usage Summary</h2>")
444
  html.append("<table>")
445
  html.append(f"<tr><td>Analyses started</td><td>{len(starts)}</td></tr>")
 
448
  f"<tr><td>Successful analyses</td><td class='success'>{len(successful)}</td></tr>"
449
  )
450
  html.append(f"<tr><td>Total slides</td><td>{total_slides}</td></tr>")
451
+ if cached_analyses:
452
+ html.append(
453
+ f"<tr><td>Cached analyses</td><td>{len(cached_analyses)}</td></tr>"
454
+ )
455
  html.append(f"<tr><td>Unique sessions</td><td>{unique_sessions}</td></tr>")
456
  html.append("</table>")
457
 
src/mosaic/tcga.py CHANGED
@@ -696,9 +696,7 @@ def save_metadata_cache(
696
  )
697
  except Exception as e:
698
  # Truly unexpected errors
699
- logger.exception(
700
- f"Unexpected error saving metadata cache for {file_uuid}: {e}"
701
- )
702
 
703
 
704
  def load_metadata_cache(
 
696
  )
697
  except Exception as e:
698
  # Truly unexpected errors
699
+ logger.exception(f"Unexpected error saving metadata cache for {file_uuid}: {e}")
 
 
700
 
701
 
702
  def load_metadata_cache(
src/mosaic/telemetry/events.py CHANGED
@@ -65,6 +65,7 @@ class UsageEvent:
65
  gpu_type: Optional[str] = None # From hardware.GPU_TYPE
66
  duration_sec: Optional[float] = None # On completion only
67
  success: Optional[bool] = None # On completion only
 
68
  is_logged_in: Optional[bool] = None # True if HF user logged in
69
  hf_username: Optional[str] = None # Raw HF username (HF Spaces only)
70
  event_id: str = field(default_factory=_generate_event_id)
 
65
  gpu_type: Optional[str] = None # From hardware.GPU_TYPE
66
  duration_sec: Optional[float] = None # On completion only
67
  success: Optional[bool] = None # On completion only
68
+ from_cache: Optional[bool] = None # On completion only
69
  is_logged_in: Optional[bool] = None # True if HF user logged in
70
  hf_username: Optional[str] = None # Raw HF username (HF Spaces only)
71
  event_id: str = field(default_factory=_generate_event_id)
src/mosaic/telemetry/tracker.py CHANGED
@@ -215,6 +215,7 @@ class TelemetryTracker:
215
  gpu_type: Optional[str] = None,
216
  duration_sec: Optional[float] = None,
217
  success: Optional[bool] = None,
 
218
  is_logged_in: Optional[bool] = None,
219
  hf_username: Optional[str] = None,
220
  ) -> None:
@@ -231,6 +232,7 @@ class TelemetryTracker:
231
  gpu_type: GPU type string
232
  duration_sec: Analysis duration (for analysis_complete only)
233
  success: Whether analysis succeeded (for analysis_complete only)
 
234
  is_logged_in: True if HF user logged in
235
  hf_username: HF username (HF Spaces only)
236
  """
@@ -248,6 +250,7 @@ class TelemetryTracker:
248
  gpu_type=gpu_type,
249
  duration_sec=duration_sec,
250
  success=success,
 
251
  is_logged_in=is_logged_in,
252
  hf_username=hf_username,
253
  )
@@ -257,7 +260,9 @@ class TelemetryTracker:
257
  if event_type == "analysis_complete" and duration_sec is not None:
258
  with self._session_lock:
259
  self._analysis_count += 1
260
- self._analysis_time_sec += duration_sec
 
 
261
 
262
  # =========================================================================
263
  # Resource Events
 
215
  gpu_type: Optional[str] = None,
216
  duration_sec: Optional[float] = None,
217
  success: Optional[bool] = None,
218
+ from_cache: Optional[bool] = None,
219
  is_logged_in: Optional[bool] = None,
220
  hf_username: Optional[str] = None,
221
  ) -> None:
 
232
  gpu_type: GPU type string
233
  duration_sec: Analysis duration (for analysis_complete only)
234
  success: Whether analysis succeeded (for analysis_complete only)
235
+ from_cache: True if result was served from cache (for analysis_complete only)
236
  is_logged_in: True if HF user logged in
237
  hf_username: HF username (HF Spaces only)
238
  """
 
250
  gpu_type=gpu_type,
251
  duration_sec=duration_sec,
252
  success=success,
253
+ from_cache=from_cache,
254
  is_logged_in=is_logged_in,
255
  hf_username=hf_username,
256
  )
 
260
  if event_type == "analysis_complete" and duration_sec is not None:
261
  with self._session_lock:
262
  self._analysis_count += 1
263
+ # Don't count duration if result was served from cache
264
+ if not from_cache:
265
+ self._analysis_time_sec += duration_sec
266
 
267
  # =========================================================================
268
  # Resource Events
src/mosaic/ui/app.py CHANGED
@@ -289,6 +289,7 @@ def analyze_slides(
289
  all_slide_masks = []
290
  all_aeon_results = []
291
  all_paladin_results = []
 
292
 
293
  # Log analysis start event
294
  tracker.log_usage_event(
@@ -401,6 +402,7 @@ def analyze_slides(
401
  cached = load_analysis_results(file_uuid, settings_hash)
402
  if cached is not None:
403
  slide_mask, aeon_results, paladin_results = cached
 
404
  logger.info(
405
  f"Using cached results for {file_uuid}/{settings_hash}"
406
  )
@@ -429,9 +431,7 @@ def analyze_slides(
429
  logger.info(
430
  f"Batch mode: Loading models once for {len(slides)} slides"
431
  )
432
- progress(
433
- 0.0, desc="Loading models for batch processing"
434
- )
435
  model_cache = load_all_models(
436
  use_gpu=True, aggressive_memory_mgmt=None
437
  )
@@ -514,6 +514,7 @@ def analyze_slides(
514
  slide_count=len(slides),
515
  duration_sec=duration_sec,
516
  success=True,
 
517
  gpu_type=GPU_TYPE,
518
  is_logged_in=user_info.is_logged_in,
519
  hf_username=user_info.username,
 
289
  all_slide_masks = []
290
  all_aeon_results = []
291
  all_paladin_results = []
292
+ from_cache = False
293
 
294
  # Log analysis start event
295
  tracker.log_usage_event(
 
402
  cached = load_analysis_results(file_uuid, settings_hash)
403
  if cached is not None:
404
  slide_mask, aeon_results, paladin_results = cached
405
+ from_cache = True
406
  logger.info(
407
  f"Using cached results for {file_uuid}/{settings_hash}"
408
  )
 
431
  logger.info(
432
  f"Batch mode: Loading models once for {len(slides)} slides"
433
  )
434
+ progress(0.0, desc="Loading models for batch processing")
 
 
435
  model_cache = load_all_models(
436
  use_gpu=True, aggressive_memory_mgmt=None
437
  )
 
514
  slide_count=len(slides),
515
  duration_sec=duration_sec,
516
  success=True,
517
+ from_cache=from_cache if from_cache else None,
518
  gpu_type=GPU_TYPE,
519
  is_logged_in=user_info.is_logged_in,
520
  hf_username=user_info.username,
tests/telemetry/test_events.py CHANGED
@@ -108,6 +108,37 @@ class TestUsageEvent:
108
  assert data["slide_count"] == 3
109
  assert data["session_hash"] is None
110
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
111
  def test_user_info_fields(self):
112
  """Test is_logged_in and hf_username fields in UsageEvent."""
113
  event = UsageEvent(
 
108
  assert data["slide_count"] == 3
109
  assert data["session_hash"] is None
110
 
111
+ def test_from_cache_true(self):
112
+ """Test from_cache field on analysis_complete event."""
113
+ event = UsageEvent(
114
+ event_type="analysis_complete",
115
+ analysis_id="test-123",
116
+ session_hash=None,
117
+ slide_count=1,
118
+ duration_sec=2.0,
119
+ success=True,
120
+ from_cache=True,
121
+ )
122
+
123
+ assert event.from_cache is True
124
+ data = event.to_dict()
125
+ assert data["from_cache"] is True
126
+
127
+ def test_from_cache_default_none(self):
128
+ """Test that from_cache defaults to None for backward compat."""
129
+ event = UsageEvent(
130
+ event_type="analysis_complete",
131
+ analysis_id="test-123",
132
+ session_hash=None,
133
+ slide_count=1,
134
+ duration_sec=60.0,
135
+ success=True,
136
+ )
137
+
138
+ assert event.from_cache is None
139
+ data = event.to_dict()
140
+ assert data["from_cache"] is None
141
+
142
  def test_user_info_fields(self):
143
  """Test is_logged_in and hf_username fields in UsageEvent."""
144
  event = UsageEvent(
tests/telemetry/test_report.py CHANGED
@@ -287,6 +287,81 @@ class TestGenerateTextReport:
287
  report = generate_text_report(tmp_path, date="2026-01-20")
288
  assert "for 2026-01-20" in report
289
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
290
  def test_report_with_resources(self, tmp_path):
291
  """Test report with resource data."""
292
  daily_dir = tmp_path / "daily"
@@ -347,6 +422,34 @@ class TestGenerateHtmlReport:
347
  assert "Analyses started" in report
348
  assert "<td>1</td>" in report # 1 analysis started
349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
350
  def test_html_with_failures(self, tmp_path):
351
  """Test HTML report with failures."""
352
  daily_dir = tmp_path / "daily"
 
287
  report = generate_text_report(tmp_path, date="2026-01-20")
288
  assert "for 2026-01-20" in report
289
 
290
+ def test_report_with_cached_analysis(self, tmp_path):
291
+ """Test report shows cached analysis count when present."""
292
+ daily_dir = tmp_path / "daily"
293
+ daily_dir.mkdir()
294
+
295
+ usage = [
296
+ {
297
+ "event_type": "analysis_start",
298
+ "slide_count": 1,
299
+ "session_hash": "s1",
300
+ "site_type": "Primary",
301
+ "seg_config": "TCGA",
302
+ },
303
+ {
304
+ "event_type": "analysis_complete",
305
+ "success": True,
306
+ "duration_sec": 2.0,
307
+ "slide_count": 1,
308
+ "from_cache": True,
309
+ },
310
+ {
311
+ "event_type": "analysis_start",
312
+ "slide_count": 1,
313
+ "session_hash": "s2",
314
+ "site_type": "Primary",
315
+ "seg_config": "TCGA",
316
+ },
317
+ {
318
+ "event_type": "analysis_complete",
319
+ "success": True,
320
+ "duration_sec": 120.0,
321
+ "slide_count": 1,
322
+ },
323
+ ]
324
+
325
+ file_path = daily_dir / "usage_2026-01-20.jsonl"
326
+ with open(file_path, "w", encoding="utf-8") as f:
327
+ for event in usage:
328
+ f.write(json.dumps(event) + "\n")
329
+
330
+ report = generate_text_report(tmp_path)
331
+ assert "Cached analyses: 1" in report
332
+ # Average duration should exclude cached analysis (2s),
333
+ # only include the fresh one (120s)
334
+ assert "Average analysis duration: 120.0s" in report
335
+
336
+ def test_report_without_cached_analysis(self, tmp_path):
337
+ """Test report omits cache stats when no cache hits."""
338
+ daily_dir = tmp_path / "daily"
339
+ daily_dir.mkdir()
340
+
341
+ usage = [
342
+ {
343
+ "event_type": "analysis_start",
344
+ "slide_count": 1,
345
+ "session_hash": "s1",
346
+ "site_type": "Primary",
347
+ "seg_config": "Biopsy",
348
+ },
349
+ {
350
+ "event_type": "analysis_complete",
351
+ "success": True,
352
+ "duration_sec": 60.0,
353
+ "slide_count": 1,
354
+ },
355
+ ]
356
+
357
+ file_path = daily_dir / "usage_2026-01-20.jsonl"
358
+ with open(file_path, "w", encoding="utf-8") as f:
359
+ for event in usage:
360
+ f.write(json.dumps(event) + "\n")
361
+
362
+ report = generate_text_report(tmp_path)
363
+ assert "Cached analyses" not in report
364
+
365
  def test_report_with_resources(self, tmp_path):
366
  """Test report with resource data."""
367
  daily_dir = tmp_path / "daily"
 
422
  assert "Analyses started" in report
423
  assert "<td>1</td>" in report # 1 analysis started
424
 
425
+ def test_html_with_cached_analysis(self, tmp_path):
426
+ """Test HTML report shows cached analysis count."""
427
+ daily_dir = tmp_path / "daily"
428
+ daily_dir.mkdir()
429
+
430
+ usage = [
431
+ {
432
+ "event_type": "analysis_start",
433
+ "slide_count": 1,
434
+ "session_hash": "s1",
435
+ },
436
+ {
437
+ "event_type": "analysis_complete",
438
+ "success": True,
439
+ "duration_sec": 2.0,
440
+ "slide_count": 1,
441
+ "from_cache": True,
442
+ },
443
+ ]
444
+
445
+ file_path = daily_dir / "usage_2026-01-20.jsonl"
446
+ with open(file_path, "w", encoding="utf-8") as f:
447
+ for event in usage:
448
+ f.write(json.dumps(event) + "\n")
449
+
450
+ report = generate_html_report(tmp_path)
451
+ assert "Cached analyses" in report
452
+
453
  def test_html_with_failures(self, tmp_path):
454
  """Test HTML report with failures."""
455
  daily_dir = tmp_path / "daily"
tests/telemetry/test_tracker.py CHANGED
@@ -196,6 +196,55 @@ class TestUsageEvents:
196
  assert metrics["analysis_count"] == 2
197
  assert metrics["analysis_time_sec"] == 105.0
198
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
199
 
200
  class TestResourceEvents:
201
  """Tests for resource event logging."""
 
196
  assert metrics["analysis_count"] == 2
197
  assert metrics["analysis_time_sec"] == 105.0
198
 
199
+ def test_log_usage_event_with_from_cache(self, tracker, temp_dir):
200
+ """Test that from_cache is persisted in usage events."""
201
+ tracker.log_usage_event(
202
+ event_type="analysis_complete",
203
+ analysis_id="test-cache",
204
+ slide_count=1,
205
+ duration_sec=2.0,
206
+ success=True,
207
+ from_cache=True,
208
+ )
209
+
210
+ usage_files = list((temp_dir / "daily").glob("usage_*.jsonl"))
211
+ with open(usage_files[0]) as f:
212
+ event = json.loads(f.read().strip())
213
+
214
+ assert event["from_cache"] is True
215
+
216
+ def test_cached_analysis_excludes_duration_from_metrics(self, tracker):
217
+ """Test that cached analyses don't count toward analysis_time_sec."""
218
+ tracker.log_app_start()
219
+ tracker.log_usage_event(
220
+ event_type="analysis_complete",
221
+ analysis_id="test-cache",
222
+ slide_count=1,
223
+ duration_sec=2.0,
224
+ success=True,
225
+ from_cache=True,
226
+ )
227
+
228
+ metrics = tracker._get_session_metrics()
229
+ assert metrics["analysis_count"] == 1
230
+ assert metrics["analysis_time_sec"] == 0.0
231
+
232
+ def test_fresh_analysis_includes_duration_in_metrics(self, tracker):
233
+ """Test that fresh analyses count toward analysis_time_sec."""
234
+ tracker.log_app_start()
235
+ tracker.log_usage_event(
236
+ event_type="analysis_complete",
237
+ analysis_id="test-fresh",
238
+ slide_count=1,
239
+ duration_sec=60.0,
240
+ success=True,
241
+ from_cache=False,
242
+ )
243
+
244
+ metrics = tracker._get_session_metrics()
245
+ assert metrics["analysis_count"] == 1
246
+ assert metrics["analysis_time_sec"] == 60.0
247
+
248
 
249
  class TestResourceEvents:
250
  """Tests for resource event logging."""
tests/test_tcga.py CHANGED
@@ -766,7 +766,11 @@ class TestMetadataCache:
766
  mode = kwargs.get("mode")
767
  if mode is None and len(args) > 1:
768
  mode = args[1]
769
- if "metadata.json" in str(args[0]) and mode is not None and "w" in str(mode):
 
 
 
 
770
  raise PermissionError("No permission")
771
  return original_open(*args, **kwargs)
772
 
 
766
  mode = kwargs.get("mode")
767
  if mode is None and len(args) > 1:
768
  mode = args[1]
769
+ if (
770
+ "metadata.json" in str(args[0])
771
+ and mode is not None
772
+ and "w" in str(mode)
773
+ ):
774
  raise PermissionError("No permission")
775
  return original_open(*args, **kwargs)
776