raylim Claude (claude-sonnet-4.5) committed on
Commit
aa47748
·
unverified ·
1 Parent(s): 248a8b1

refactor: optimize OncoTree API usage in TCGA mapping

Browse files

Optimized _find_paladin_subtype_for_tcga to minimize API calls:
- Check direct match in CANCER_TYPE_TO_INT_MAP first (0 API calls)
- Query OncoTree for TCGA code's ancestors only (1 API call max)
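- Removed loop over all 183 Paladin codes (avoids up to 183 API calls).
  Note: this drops the "TCGA code is an ancestor of a Paladin code" lookup,
  so a TCGA code that matches neither directly nor via its own ancestors
  now returns None.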

Removed redundant _TCGA_TO_PALADIN_MAP as all codes were already
in CANCER_TYPE_TO_INT_MAP. Updated tests to reflect this.

Performance: common TCGA codes use 0 API calls; rare codes use 1 cached
API call, with graceful degradation on failure.

Co-Authored-By: Claude (claude-sonnet-4.5) <noreply@anthropic.com>

Files changed (2) hide show
  1. src/mosaic/tcga.py +10 -11
  2. tests/test_tcga.py +2 -10
src/mosaic/tcga.py CHANGED
@@ -520,8 +520,8 @@ def _get_oncotree_ancestors(code: str) -> list[str]:
520
  def _find_paladin_subtype_for_tcga(tcga_code: str) -> str | None:
521
  """Find the best matching Paladin cancer subtype for a TCGA code.
522
 
523
- Checks if the TCGA code or any of its OncoTree descendants/ancestors
524
- have Paladin models available.
525
 
526
  Args:
527
  tcga_code: TCGA project suffix (e.g., "BRCA", "LUAD")
@@ -529,24 +529,23 @@ def _find_paladin_subtype_for_tcga(tcga_code: str) -> str | None:
529
  Returns:
530
  Paladin cancer subtype code if found, or None
531
  """
532
- # Direct match - TCGA code has Paladin models
533
  if tcga_code in CANCER_TYPE_TO_INT_MAP:
534
  return tcga_code
535
 
536
- # Check if any Paladin subtype is a descendant of the TCGA code
537
- # by checking if TCGA code is an ancestor of the Paladin subtype
538
- for paladin_code in CANCER_TYPE_TO_INT_MAP.keys():
539
- ancestors = _get_oncotree_ancestors(paladin_code)
540
- if tcga_code in ancestors:
541
- # Found a Paladin subtype that is a descendant of the TCGA code
542
- return paladin_code
543
-
544
  # Check if TCGA code is a descendant of any Paladin subtype
 
545
  tcga_ancestors = _get_oncotree_ancestors(tcga_code)
546
  for ancestor in tcga_ancestors:
547
  if ancestor in CANCER_TYPE_TO_INT_MAP:
 
 
 
548
  return ancestor
549
 
 
 
 
550
  return None
551
 
552
 
 
520
  def _find_paladin_subtype_for_tcga(tcga_code: str) -> str | None:
521
  """Find the best matching Paladin cancer subtype for a TCGA code.
522
 
523
+ Uses a fast direct match first, then falls back to OncoTree hierarchy
524
+ lookup only if needed (checking TCGA's ancestors, not all Paladin codes).
525
 
526
  Args:
527
  tcga_code: TCGA project suffix (e.g., "BRCA", "LUAD")
 
529
  Returns:
530
  Paladin cancer subtype code if found, or None
531
  """
532
+ # Direct match - TCGA code has Paladin models (most common case)
533
  if tcga_code in CANCER_TYPE_TO_INT_MAP:
534
  return tcga_code
535
 
 
 
 
 
 
 
 
 
536
  # Check if TCGA code is a descendant of any Paladin subtype
537
+ # This makes only 1 API call (for tcga_code) instead of 183 (for all Paladin codes)
538
  tcga_ancestors = _get_oncotree_ancestors(tcga_code)
539
  for ancestor in tcga_ancestors:
540
  if ancestor in CANCER_TYPE_TO_INT_MAP:
541
+ logger.info(
542
+ f"Mapped TCGA '{tcga_code}' to Paladin ancestor '{ancestor}' via OncoTree hierarchy"
543
+ )
544
  return ancestor
545
 
546
+ # Note: We removed the loop checking if TCGA is an ancestor of Paladin codes,
547
+ # as that would require 183 API calls. If a TCGA code doesn't match directly
548
+ # or via its ancestors, we return None (user can still run analysis with "Unknown")
549
  return None
550
 
551
 
tests/test_tcga.py CHANGED
@@ -30,7 +30,6 @@ from mosaic.tcga import (
30
  _map_sample_type_to_site_type,
31
  _find_paladin_subtype_for_tcga,
32
  _get_oncotree_ancestors,
33
- _TCGA_TO_PALADIN_MAP,
34
  _oncotree_ancestors_cache,
35
  )
36
 
@@ -230,20 +229,13 @@ class TestMetadataMapping:
230
  class TestOncoTreeMapping:
231
  """Tests for OncoTree hierarchy and cancer subtype mapping."""
232
 
233
- def test_precomputed_tcga_map_coverage(self):
234
- """Test that the precomputed TCGA map has expected common codes."""
235
- # Verify some of the most common TCGA codes are in the precomputed map
236
- common_codes = ["LUAD", "BRCA", "PRAD", "COAD", "BLCA", "LUSC", "HNSC", "STAD"]
237
- for code in common_codes:
238
- assert code in _TCGA_TO_PALADIN_MAP, f"{code} should be in precomputed map"
239
-
240
  def test_find_paladin_subtype_direct_match(self):
241
  """Test _find_paladin_subtype_for_tcga with direct match in CANCER_TYPE_TO_INT_MAP."""
242
- # LUAD is in CANCER_TYPE_TO_INT_MAP and precomputed map
243
  result = _find_paladin_subtype_for_tcga("LUAD")
244
  assert result == "LUAD"
245
 
246
- # BRCA is in CANCER_TYPE_TO_INT_MAP and precomputed map
247
  result = _find_paladin_subtype_for_tcga("BRCA")
248
  assert result == "BRCA"
249
 
 
30
  _map_sample_type_to_site_type,
31
  _find_paladin_subtype_for_tcga,
32
  _get_oncotree_ancestors,
 
33
  _oncotree_ancestors_cache,
34
  )
35
 
 
229
  class TestOncoTreeMapping:
230
  """Tests for OncoTree hierarchy and cancer subtype mapping."""
231
 
 
 
 
 
 
 
 
232
  def test_find_paladin_subtype_direct_match(self):
233
  """Test _find_paladin_subtype_for_tcga with direct match in CANCER_TYPE_TO_INT_MAP."""
234
+ # LUAD is in CANCER_TYPE_TO_INT_MAP
235
  result = _find_paladin_subtype_for_tcga("LUAD")
236
  assert result == "LUAD"
237
 
238
+ # BRCA is in CANCER_TYPE_TO_INT_MAP
239
  result = _find_paladin_subtype_for_tcga("BRCA")
240
  assert result == "BRCA"
241