| """Tests for TCGA/GDC API client functionality.""" |
|
|
| import os |
| import tempfile |
| from pathlib import Path |
| from unittest.mock import MagicMock, patch, Mock |
| import pytest |
|
|
| from mosaic.tcga import ( |
| is_valid_uuid, |
| is_valid_tcga_barcode, |
| validate_slide_id, |
| query_slide_uuid, |
| get_slide_metadata, |
| get_case_metadata, |
| convert_gdc_metadata_to_mosaic, |
| is_slide_cached, |
| get_cached_slide_path, |
| fetch_slide_by_uuid, |
| fetch_slide, |
| save_metadata_cache, |
| load_metadata_cache, |
| compute_settings_hash, |
| save_analysis_results, |
| load_analysis_results, |
| upload_results_to_hf, |
| download_results_from_hf, |
| METADATA_CACHE_FILENAME, |
| METADATA_CACHE_VERSION, |
| TCGAMetadata, |
| GDCError, |
| GDCInvalidIDError, |
| GDCNotFoundError, |
| GDCAccessDeniedError, |
| GDCNetworkError, |
| _map_gdc_gender_to_sex, |
| _map_gdc_site_to_tissue_site, |
| _map_project_id_to_cancer_subtype, |
| _map_sample_type_to_site_type, |
| _find_paladin_subtype_for_tcga, |
| _get_oncotree_ancestors, |
| _oncotree_ancestors_cache, |
| ) |
|
|
|
|
class TestUUIDValidation:
    """Exercise ``is_valid_uuid`` with well-formed and malformed inputs."""

    def test_valid_uuid_lowercase(self):
        """A UUID written entirely in lowercase hex is accepted."""
        candidate = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
        assert is_valid_uuid(candidate)

    def test_valid_uuid_uppercase(self):
        """A UUID written entirely in uppercase hex is accepted."""
        candidate = "A1B2C3D4-E5F6-7890-ABCD-EF1234567890"
        assert is_valid_uuid(candidate)

    def test_valid_uuid_mixed_case(self):
        """A UUID mixing upper- and lowercase hex digits is accepted."""
        candidate = "A1b2C3d4-E5f6-7890-AbCd-Ef1234567890"
        assert is_valid_uuid(candidate)

    def test_invalid_uuid_too_short(self):
        """A UUID missing its final character is rejected."""
        candidate = "a1b2c3d4-e5f6-7890-abcd-ef123456789"
        assert not is_valid_uuid(candidate)

    def test_invalid_uuid_too_long(self):
        """A UUID with one extra trailing character is rejected."""
        candidate = "a1b2c3d4-e5f6-7890-abcd-ef12345678901"
        assert not is_valid_uuid(candidate)

    def test_invalid_uuid_no_hyphens(self):
        """The 32 hex digits without any hyphens are rejected."""
        candidate = "a1b2c3d4e5f67890abcdef1234567890"
        assert not is_valid_uuid(candidate)

    def test_invalid_uuid_wrong_format(self):
        """A UUID whose hyphen grouping is wrong is rejected."""
        candidate = "a1b2c3d4-e5f67890-abcd-ef1234567890"
        assert not is_valid_uuid(candidate)

    def test_invalid_uuid_empty(self):
        """The empty string is rejected."""
        candidate = ""
        assert not is_valid_uuid(candidate)

    def test_invalid_uuid_random_string(self):
        """An arbitrary non-UUID string is rejected."""
        candidate = "not-a-uuid"
        assert not is_valid_uuid(candidate)
|
|
|
|
class TestTCGABarcodeValidation:
    """Exercise ``is_valid_tcga_barcode`` with accepted and rejected inputs."""

    def test_valid_barcode_full(self):
        """A full slide-level barcode is accepted."""
        barcode = "TCGA-A1-A0SB-01Z-00-DX1"
        assert is_valid_tcga_barcode(barcode)

    def test_valid_barcode_short(self):
        """A three-segment barcode is accepted."""
        barcode = "TCGA-A1-A0SB"
        assert is_valid_tcga_barcode(barcode)

    def test_valid_barcode_lowercase(self):
        """A barcode typed in lowercase is still accepted."""
        barcode = "tcga-a1-a0sb-01z-00-dx1"
        assert is_valid_tcga_barcode(barcode)

    def test_invalid_barcode_wrong_prefix(self):
        """A barcode that does not start with the TCGA prefix is rejected."""
        barcode = "NOTC-A1-A0SB-01Z-00-DX1"
        assert not is_valid_tcga_barcode(barcode)

    def test_invalid_barcode_empty(self):
        """The empty string is rejected."""
        barcode = ""
        assert not is_valid_tcga_barcode(barcode)

    def test_invalid_barcode_random_string(self):
        """An arbitrary hyphenated string is rejected."""
        barcode = "random-string"
        assert not is_valid_tcga_barcode(barcode)
|
|
|
|
class TestValidateSlideID:
    """Exercise ``validate_slide_id``: normalisation of good IDs, errors for bad ones."""

    def test_valid_uuid_normalized(self):
        """An uppercase UUID comes back lowercased."""
        normalized = validate_slide_id("A1B2C3D4-E5F6-7890-ABCD-EF1234567890")
        expected = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
        assert normalized == expected

    def test_valid_barcode_normalized(self):
        """A lowercase barcode comes back uppercased."""
        normalized = validate_slide_id("tcga-a1-a0sb-01z-00-dx1")
        expected = "TCGA-A1-A0SB-01Z-00-DX1"
        assert normalized == expected

    def test_whitespace_stripped(self):
        """Leading and trailing spaces are removed from the ID."""
        padded = " a1b2c3d4-e5f6-7890-abcd-ef1234567890 "
        normalized = validate_slide_id(padded)
        assert normalized == "a1b2c3d4-e5f6-7890-abcd-ef1234567890"

    def test_invalid_id_raises_error(self):
        """An ID that is neither UUID nor barcode raises ``GDCInvalidIDError``."""
        with pytest.raises(GDCInvalidIDError) as exc_info:
            validate_slide_id("invalid-id")
        # Inspect the message only after the context manager has captured it.
        message = str(exc_info.value)
        assert "Invalid slide ID format" in message
|
|
|
|
class TestMetadataMapping:
    """Tests for the helpers that translate GDC fields into Mosaic values."""

    def test_map_gender_male(self):
        """Any casing of "male" maps to "Male"."""
        for raw in ("male", "Male", "MALE"):
            assert _map_gdc_gender_to_sex(raw) == "Male"

    def test_map_gender_female(self):
        """Any casing of "female" maps to "Female"."""
        for raw in ("female", "Female", "FEMALE"):
            assert _map_gdc_gender_to_sex(raw) == "Female"

    def test_map_gender_none(self):
        """A missing gender maps to None."""
        assert _map_gdc_gender_to_sex(None) is None

    def test_map_gender_unknown(self):
        """Gender values other than male/female map to None."""
        for raw in ("unknown", "other"):
            assert _map_gdc_gender_to_sex(raw) is None

    def test_map_tissue_site_direct_match(self):
        """A site that matches a supported name (ignoring case) is returned as listed."""
        supported = ["Lung", "Breast", "Brain", "Kidney"]
        for raw in ("Lung", "lung", "LUNG"):
            assert _map_gdc_site_to_tissue_site(raw, supported) == "Lung"

    def test_map_tissue_site_gdc_mapping(self):
        """GDC-specific site names are translated to the supported name."""
        supported = ["Lung", "Liver", "Prostate"]
        assert _map_gdc_site_to_tissue_site("Bronchus and lung", supported) == "Lung"
        assert _map_gdc_site_to_tissue_site("Prostate gland", supported) == "Prostate"

    def test_map_tissue_site_unknown(self):
        """Unsupported or missing sites fall back to "Unknown"."""
        supported = ["Lung", "Breast"]
        assert _map_gdc_site_to_tissue_site("Spleen", supported) == "Unknown"
        assert _map_gdc_site_to_tissue_site(None, supported) == "Unknown"

    def test_map_project_id_direct_match(self):
        """Project IDs whose suffix is itself a subtype code map to that code."""
        for code in ("LUAD", "PRAD", "COAD", "BLCA"):
            assert _map_project_id_to_cancer_subtype(f"TCGA-{code}") == code

    @patch("mosaic.tcga._get_oncotree_ancestors")
    def test_map_project_id_oncotree_hierarchy(self, mock_get_ancestors):
        """Test TCGA project IDs that map via the OncoTree hierarchy.

        The ancestor lookup is replaced with a small fake tree (IDC is a
        child of BRCA) so the matching logic runs without network access.
        """

        def mock_ancestors(code):
            if code == "IDC":
                return ["BRCA", "TISSUE"]
            elif code == "BRCA":
                return ["TISSUE"]
            return []

        mock_get_ancestors.side_effect = mock_ancestors

        result = _map_project_id_to_cancer_subtype("TCGA-BRCA")

        # Either the code itself or its child is an acceptable mapping.
        assert result in ["BRCA", "IDC"], f"Expected BRCA or IDC, got {result}"

    @patch("mosaic.tcga._get_oncotree_ancestors")
    def test_map_project_id_invalid(self, mock_get_ancestors):
        """Unrecognised, non-TCGA and missing project IDs map to "Unknown"."""
        # The hierarchy test above stubs this lookup to avoid real network
        # calls; an unrecognised code presumably reaches the same lookup, so
        # stub it here too and report "no ancestors" for every code.
        mock_get_ancestors.return_value = []

        assert _map_project_id_to_cancer_subtype("TCGA-XYZINVALID") == "Unknown"
        assert _map_project_id_to_cancer_subtype("NOT-TCGA") == "Unknown"
        assert _map_project_id_to_cancer_subtype(None) == "Unknown"

    def test_map_sample_type_primary(self):
        """Primary, recurrent and normal-tissue samples all map to "Primary"."""
        for raw in ("Primary Tumor", "Recurrent Tumor", "Solid Tissue Normal"):
            assert _map_sample_type_to_site_type(raw) == "Primary"

    def test_map_sample_type_metastatic(self):
        """Metastatic sample types map to "Metastatic"."""
        for raw in ("Metastatic", "Additional Metastatic"):
            assert _map_sample_type_to_site_type(raw) == "Metastatic"

    def test_map_sample_type_default(self):
        """Missing or empty sample types default to "Primary"."""
        for raw in (None, ""):
            assert _map_sample_type_to_site_type(raw) == "Primary"
|
|
|
|
class TestOncoTreeMapping:
    """Tests for OncoTree hierarchy and cancer subtype mapping.

    Several tests populate the module-level ``_oncotree_ancestors_cache``
    with mocked data. ``setup_method``/``teardown_method`` give every test
    an empty cache and put the previous contents back afterwards, so mocked
    entries never leak into tests that run later.
    """

    def setup_method(self):
        """Snapshot the shared ancestor cache and start the test with it empty."""
        self._cache_snapshot = dict(_oncotree_ancestors_cache)
        _oncotree_ancestors_cache.clear()

    def teardown_method(self):
        """Drop entries created by the test and restore the snapshot."""
        _oncotree_ancestors_cache.clear()
        _oncotree_ancestors_cache.update(self._cache_snapshot)

    def test_find_paladin_subtype_direct_match(self):
        """Codes that are themselves known subtypes are returned unchanged."""
        result = _find_paladin_subtype_for_tcga("LUAD")
        assert result == "LUAD"

        result = _find_paladin_subtype_for_tcga("BRCA")
        assert result == "BRCA"

    @patch("mosaic.tcga._get_oncotree_ancestors")
    def test_find_paladin_subtype_via_ancestors(self, mock_get_ancestors):
        """An unknown code resolves to the first ancestor that is a known subtype."""
        mock_get_ancestors.return_value = ["LUAD", "NSCLC", "TISSUE"]

        result = _find_paladin_subtype_for_tcga("TESTCODE")
        assert result == "LUAD"
        mock_get_ancestors.assert_called_once_with("TESTCODE")

    @patch("mosaic.tcga._get_oncotree_ancestors")
    def test_find_paladin_subtype_no_match(self, mock_get_ancestors):
        """None is returned when neither the code nor its ancestors match."""
        mock_get_ancestors.return_value = ["UNKNOWN_PARENT", "TISSUE"]

        result = _find_paladin_subtype_for_tcga("UNKNOWN_CODE")
        assert result is None

    @patch("mosaic.tcga.requests.get")
    def test_get_oncotree_ancestors_success(self, mock_requests_get):
        """Successful API responses yield the ancestor list and fill the cache."""
        # One canned response per expected request: first IDC, then its parent.
        mock_responses = [
            Mock(status_code=200, json=lambda: [{"code": "IDC", "parent": "BRCA"}]),
            Mock(status_code=200, json=lambda: [{"code": "BRCA", "parent": "TISSUE"}]),
        ]
        mock_requests_get.side_effect = mock_responses

        result = _get_oncotree_ancestors("IDC")
        assert result == ["BRCA"]

        # The computed ancestors must have been stored under the queried code.
        assert "IDC" in _oncotree_ancestors_cache
        assert _oncotree_ancestors_cache["IDC"] == ["BRCA"]

    @patch("mosaic.tcga.requests.get")
    def test_get_oncotree_ancestors_api_failure(self, mock_requests_get):
        """An exception raised by the HTTP call results in an empty list."""
        mock_requests_get.side_effect = Exception("API error")

        result = _get_oncotree_ancestors("TESTCODE")
        assert result == []

    @patch("mosaic.tcga.requests.get")
    def test_get_oncotree_ancestors_caching(self, mock_requests_get):
        """A second lookup of the same code issues no further HTTP requests."""
        mock_requests_get.return_value = Mock(
            status_code=200, json=lambda: [{"code": "IDC", "parent": "TISSUE"}]
        )

        # First lookup must hit the (mocked) API at least once.
        result1 = _get_oncotree_ancestors("IDC")
        initial_call_count = mock_requests_get.call_count
        assert initial_call_count >= 1

        # Second lookup must be answered from the cache.
        result2 = _get_oncotree_ancestors("IDC")
        assert mock_requests_get.call_count == initial_call_count
        assert result1 == result2

    @patch("mosaic.tcga.requests.get")
    def test_get_oncotree_ancestors_not_found(self, mock_requests_get):
        """A 404 response results in an empty ancestor list."""
        mock_requests_get.return_value = Mock(status_code=404)

        result = _get_oncotree_ancestors("INVALID_CODE")
        assert result == []
|
|
|
|
class TestConvertGDCMetadata:
    """End-to-end checks of ``convert_gdc_metadata_to_mosaic``."""

    def test_convert_complete_metadata(self):
        """A fully populated GDC record is converted field by field.

        The fixture uses project ``TCGA-LUAD``, so the expected subtype is
        ``LUAD``.
        """
        case_record = {
            "demographic": {"gender": "female"},
            "primary_site": "Breast",
            "project": {"project_id": "TCGA-LUAD"},
            "samples": [{"sample_type": "Primary Tumor"}],
        }
        raw_record = {
            "file_name": "test_slide.svs",
            "file_size": 1000000,
            "cases": [case_record],
        }

        converted = convert_gdc_metadata_to_mosaic(
            raw_record, ["Breast", "Lung", "Brain"]
        )

        assert converted.sex == "Female"
        assert converted.tissue_site == "Breast"
        assert converted.cancer_subtype == "LUAD"
        assert converted.site_type == "Primary"
        assert converted.filename == "test_slide.svs"
        assert converted.file_size == 1000000

    def test_convert_empty_metadata(self):
        """An empty GDC record yields the fallback value for every field."""
        converted = convert_gdc_metadata_to_mosaic({}, ["Breast", "Lung"])

        assert converted.sex is None
        assert converted.tissue_site == "Unknown"
        assert converted.cancer_subtype == "Unknown"
        assert converted.site_type == "Primary"

    def test_convert_partial_metadata(self):
        """A record carrying only demographics fills the rest with fallbacks."""
        only_demographics = {"demographic": {"gender": "male"}}
        raw_record = {"cases": [only_demographics]}

        converted = convert_gdc_metadata_to_mosaic(raw_record, ["Breast", "Lung"])

        assert converted.sex == "Male"
        assert converted.tissue_site == "Unknown"
        assert converted.cancer_subtype == "Unknown"
        assert converted.site_type == "Primary"
|
|
|
|
class TestCaching:
    """Checks for the on-disk slide cache lookups."""

    def test_is_slide_cached_not_cached(self):
        """An empty cache directory reports the slide as not cached."""
        with tempfile.TemporaryDirectory() as scratch:
            assert not is_slide_cached("test-uuid", Path(scratch))

    def test_is_slide_cached_cached(self):
        """A slide file inside the UUID directory counts as cached."""
        with tempfile.TemporaryDirectory() as scratch:
            cache_root = Path(scratch)
            uuid_dir = cache_root / "test-uuid"
            uuid_dir.mkdir(parents=True)
            uuid_dir.joinpath("test.svs").touch()

            assert is_slide_cached("test-uuid", cache_root)

    def test_get_cached_slide_path_not_cached(self):
        """Looking up a slide in an empty cache gives None."""
        with tempfile.TemporaryDirectory() as scratch:
            found = get_cached_slide_path("test-uuid", Path(scratch))
            assert found is None

    def test_get_cached_slide_path_cached(self):
        """Looking up a cached slide gives the path of its file."""
        with tempfile.TemporaryDirectory() as scratch:
            cache_root = Path(scratch)
            expected_file = cache_root / "test-uuid" / "test.svs"
            expected_file.parent.mkdir(parents=True)
            expected_file.touch()

            found = get_cached_slide_path("test-uuid", cache_root)
            assert found == expected_file
|
|
|
|
class TestGDCAPIIntegration:
    """GDC API entry points exercised against mocked HTTP and cache helpers."""

    @patch("mosaic.tcga._make_request_with_retry")
    def test_query_slide_uuid_success(self, mock_request):
        """A barcode query with one hit returns that hit's file ID."""
        hit = {"file_id": "test-uuid-12345", "file_name": "slide.svs"}
        api_reply = MagicMock()
        api_reply.json.return_value = {"data": {"hits": [hit]}}
        mock_request.return_value = api_reply

        resolved = query_slide_uuid("TCGA-A1-A0SB-01Z-00-DX1")
        assert resolved == "test-uuid-12345"

    @patch("mosaic.tcga._make_request_with_retry")
    def test_query_slide_uuid_not_found(self, mock_request):
        """A barcode query with no hits raises ``GDCNotFoundError``."""
        api_reply = MagicMock()
        api_reply.json.return_value = {"data": {"hits": []}}
        mock_request.return_value = api_reply

        with pytest.raises(GDCNotFoundError):
            query_slide_uuid("TCGA-XX-XXXX")

    @patch("mosaic.tcga._make_request_with_retry")
    def test_get_slide_metadata_success(self, mock_request):
        """The ``data`` payload of the API reply is returned to the caller."""
        payload = {
            "file_id": "test-uuid",
            "file_name": "slide.svs",
            "file_size": 1000000,
            "cases": [
                {
                    "demographic": {"gender": "female"},
                    "primary_site": "Breast",
                }
            ],
        }
        api_reply = MagicMock()
        api_reply.json.return_value = {"data": payload}
        mock_request.return_value = api_reply

        fetched = get_slide_metadata("test-uuid")
        assert fetched["file_name"] == "slide.svs"
        assert fetched["file_size"] == 1000000

    @patch("mosaic.tcga._make_request_with_retry")
    def test_get_slide_metadata_not_found(self, mock_request):
        """An API reply without any payload raises ``GDCNotFoundError``."""
        api_reply = MagicMock()
        api_reply.json.return_value = {}
        mock_request.return_value = api_reply

        with pytest.raises(GDCNotFoundError):
            get_slide_metadata("nonexistent-uuid")

    @patch("mosaic.tcga.get_slide_metadata")
    @patch("mosaic.tcga.get_cached_slide_path")
    def test_fetch_slide_by_uuid_cached(self, mock_cached, mock_metadata):
        """A cache hit is returned directly, without any metadata lookup."""
        with tempfile.TemporaryDirectory() as scratch:
            cache_root = Path(scratch)
            existing = cache_root / "test-uuid" / "slide.svs"
            existing.parent.mkdir(parents=True)
            existing.touch()
            mock_cached.return_value = existing

            fetched = fetch_slide_by_uuid("test-uuid", cache_root)

            assert fetched == existing
            mock_metadata.assert_not_called()

    @patch("mosaic.tcga._make_request_with_retry")
    @patch("mosaic.tcga.get_slide_metadata")
    @patch("mosaic.tcga.get_cached_slide_path")
    def test_fetch_slide_by_uuid_download(
        self, mock_cached, mock_metadata, mock_request
    ):
        """A cache miss downloads the slide and stores it under its file name."""
        with tempfile.TemporaryDirectory() as scratch:
            cache_root = Path(scratch)
            mock_cached.return_value = None
            mock_metadata.return_value = {"file_name": "slide.svs", "file_size": 100}

            # Fake streaming download: a single chunk of content.
            download = MagicMock()
            download.headers = {"content-length": "100"}
            download.iter_content.return_value = [b"test content"]
            mock_request.return_value = download

            fetched = fetch_slide_by_uuid("test-uuid", cache_root)

            assert fetched.name == "slide.svs"
            assert fetched.exists()

    @patch("mosaic.tcga.fetch_slide_by_uuid")
    def test_fetch_slide_with_uuid(self, mock_fetch):
        """A UUID input is passed through and returned as the slide's UUID."""
        mock_fetch.return_value = Path("/cache/test-uuid/slide.svs")
        requested = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"

        _slide_path, resolved = fetch_slide(requested)

        assert resolved == "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
        mock_fetch.assert_called_once()

    @patch("mosaic.tcga.query_slide_uuid")
    @patch("mosaic.tcga.fetch_slide_by_uuid")
    def test_fetch_slide_with_barcode(self, mock_fetch, mock_query):
        """A barcode input is resolved to a UUID through ``query_slide_uuid``."""
        mock_query.return_value = "resolved-uuid"
        mock_fetch.return_value = Path("/cache/resolved-uuid/slide.svs")

        _slide_path, resolved = fetch_slide("TCGA-A1-A0SB-01Z-00-DX1")

        assert resolved == "resolved-uuid"
        mock_query.assert_called_once_with("TCGA-A1-A0SB-01Z-00-DX1", None)
|
|
|
|
class TestTCGAMetadataDataclass:
    """Construction checks for the ``TCGAMetadata`` dataclass."""

    def test_create_metadata(self):
        """Every field passed to the constructor is readable as an attribute."""
        field_values = {
            "sex": "Female",
            "tissue_site": "Breast",
            "cancer_subtype": "BRCA",
            "site_type": "Primary",
            "filename": "slide.svs",
            "file_size": 1000000,
        }

        record = TCGAMetadata(**field_values)

        assert record.sex == "Female"
        assert record.tissue_site == "Breast"
        assert record.cancer_subtype == "BRCA"
        assert record.site_type == "Primary"
        assert record.filename == "slide.svs"
        assert record.file_size == 1000000

    def test_create_metadata_defaults(self):
        """``filename`` and ``file_size`` are None when not supplied."""
        record = TCGAMetadata(
            sex=None, tissue_site=None, cancer_subtype=None, site_type=None
        )

        assert record.sex is None
        assert record.filename is None
        assert record.file_size is None
|
|
|
|
class TestErrorClasses:
    """Checks on the GDC exception hierarchy."""

    def test_gdc_error_base(self):
        """``GDCError`` is an ``Exception`` whose text is its message."""
        raised = GDCError("Test error")
        assert str(raised) == "Test error"
        assert isinstance(raised, Exception)

    def test_gdc_not_found_error(self):
        """``GDCNotFoundError`` is a kind of ``GDCError``."""
        raised = GDCNotFoundError("File not found")
        assert isinstance(raised, GDCError)

    def test_gdc_access_denied_error(self):
        """``GDCAccessDeniedError`` is a kind of ``GDCError``."""
        raised = GDCAccessDeniedError("Access denied")
        assert isinstance(raised, GDCError)

    def test_gdc_network_error(self):
        """``GDCNetworkError`` is a kind of ``GDCError``."""
        raised = GDCNetworkError("Network error")
        assert isinstance(raised, GDCError)

    def test_gdc_invalid_id_error(self):
        """``GDCInvalidIDError`` is a kind of ``GDCError``."""
        raised = GDCInvalidIDError("Invalid ID")
        assert isinstance(raised, GDCError)
|
|
|
|
class TestMetadataCache:
    """Checks for ``save_metadata_cache`` / ``load_metadata_cache``."""

    def test_save_and_load_roundtrip(self, tmp_path):
        """Metadata written to the cache is read back with every field intact."""
        file_uuid = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
        (tmp_path / file_uuid).mkdir()
        original = TCGAMetadata(
            sex="Male",
            tissue_site="Breast",
            cancer_subtype="BRCA",
            site_type="Primary",
            filename="test.svs",
            file_size=1024000,
        )

        save_metadata_cache(file_uuid, original, cache_dir=tmp_path)
        restored = load_metadata_cache(file_uuid, cache_dir=tmp_path)

        assert restored is not None
        assert restored.sex == "Male"
        assert restored.tissue_site == "Breast"
        assert restored.cancer_subtype == "BRCA"
        assert restored.site_type == "Primary"
        assert restored.filename == "test.svs"
        assert restored.file_size == 1024000

    def test_load_missing_cache_returns_none(self, tmp_path):
        """Loading for a UUID that has no cache entry gives None."""
        restored = load_metadata_cache("missing-uuid", cache_dir=tmp_path)
        assert restored is None

    def test_load_corrupt_json_returns_none(self, tmp_path):
        """A cache file that is not valid JSON is treated as absent."""
        file_uuid = "corrupt-uuid"
        uuid_dir = tmp_path / file_uuid
        uuid_dir.mkdir()
        (uuid_dir / METADATA_CACHE_FILENAME).write_text("not valid json{{{")

        restored = load_metadata_cache(file_uuid, cache_dir=tmp_path)
        assert restored is None

    def test_load_wrong_version_returns_none(self, tmp_path):
        """A cache file carrying a different version number is ignored."""
        import json

        file_uuid = "old-version-uuid"
        uuid_dir = tmp_path / file_uuid
        uuid_dir.mkdir()

        # Same fields as a valid entry, but a version the loader does not expect.
        mismatched_entry = {
            "cache_version": METADATA_CACHE_VERSION + 1,
            "sex": "Male",
            "tissue_site": "Breast",
            "cancer_subtype": "BRCA",
            "site_type": "Primary",
        }
        (uuid_dir / METADATA_CACHE_FILENAME).write_text(json.dumps(mismatched_entry))

        restored = load_metadata_cache(file_uuid, cache_dir=tmp_path)
        assert restored is None

    def test_roundtrip_with_none_values(self, tmp_path):
        """Fields set to None survive a save/load cycle as None."""
        file_uuid = "none-values-uuid"
        (tmp_path / file_uuid).mkdir()
        original = TCGAMetadata(
            sex=None,
            tissue_site=None,
            cancer_subtype=None,
            site_type=None,
            filename=None,
            file_size=None,
        )

        save_metadata_cache(file_uuid, original, cache_dir=tmp_path)
        restored = load_metadata_cache(file_uuid, cache_dir=tmp_path)

        assert restored is not None
        assert restored.sex is None
        assert restored.tissue_site is None
        assert restored.cancer_subtype is None
        assert restored.site_type is None

    def test_save_no_slide_dir_is_noop(self, tmp_path):
        """Saving for a UUID without a slide directory writes nothing."""
        file_uuid = "no-dir-uuid"
        original = TCGAMetadata(
            sex="Male",
            tissue_site="Breast",
            cancer_subtype="BRCA",
            site_type="Primary",
        )

        # The slide directory was never created; the call must simply return.
        save_metadata_cache(file_uuid, original, cache_dir=tmp_path)

        would_be_file = tmp_path / file_uuid / METADATA_CACHE_FILENAME
        assert not would_be_file.exists()

    def test_save_permission_error_is_silent(self, tmp_path, monkeypatch):
        """A ``PermissionError`` while writing the cache does not propagate."""
        import builtins

        file_uuid = "perm-error-uuid"
        (tmp_path / file_uuid).mkdir()
        original = TCGAMetadata(
            sex="Male",
            tissue_site="Breast",
            cancer_subtype="BRCA",
            site_type="Primary",
        )

        original_open = builtins.open

        def deny_metadata_writes(*args, **kwargs):
            # The mode may arrive as a keyword or as the second positional argument.
            requested_mode = kwargs.get("mode")
            if requested_mode is None and len(args) > 1:
                requested_mode = args[1]
            targets_metadata = "metadata.json" in str(args[0])
            is_write = requested_mode is not None and "w" in str(requested_mode)
            if targets_metadata and is_write:
                raise PermissionError("No permission")
            return original_open(*args, **kwargs)

        monkeypatch.setattr(builtins, "open", deny_metadata_writes)

        # Passing means no exception escaped from the save call.
        save_metadata_cache(file_uuid, original, cache_dir=tmp_path)
|
|
|
|
class TestAnalysisResultsCache:
    """Tests for analysis results caching functionality."""

    def test_save_and_load_roundtrip(self, tmp_path):
        """Saved Aeon/Paladin tables and the slide mask load back unchanged."""
        import pandas as pd
        from PIL import Image

        file_uuid = "results-uuid"
        settings_hash = "abc123def456"

        aeon_results = pd.DataFrame(
            {"Confidence": [0.95, 0.85]}, index=["BRCA", "LUAD"]
        )
        aeon_results.index.name = "Cancer Subtype"

        paladin_results = pd.DataFrame(
            {"Biomarker": ["HER2", "ER"], "Probability": [0.9, 0.8]}
        )

        slide_mask = Image.new("RGB", (100, 100), color="red")

        save_analysis_results(
            file_uuid,
            settings_hash,
            aeon_results,
            paladin_results,
            slide_mask,
            cache_dir=tmp_path,
        )

        loaded = load_analysis_results(file_uuid, settings_hash, cache_dir=tmp_path)

        assert loaded is not None
        loaded_mask, loaded_aeon, loaded_paladin = loaded

        assert loaded_aeon is not None
        assert loaded_aeon.index.name == "Cancer Subtype"
        assert list(loaded_aeon.index) == ["BRCA", "LUAD"]
        # The tables are stored as CSV files, so compare floats with a
        # tolerance rather than relying on an exact text round trip.
        assert list(loaded_aeon["Confidence"]) == pytest.approx([0.95, 0.85])

        assert loaded_paladin is not None
        assert list(loaded_paladin["Biomarker"]) == ["HER2", "ER"]
        assert list(loaded_paladin["Probability"]) == pytest.approx([0.9, 0.8])

        assert loaded_mask is not None
        assert loaded_mask.size == (100, 100)

    def test_load_missing_dir_returns_none(self, tmp_path):
        """Loading results that were never saved returns None."""
        loaded = load_analysis_results(
            "missing-uuid", "missing-hash", cache_dir=tmp_path
        )
        assert loaded is None

    def test_load_partial_results_returns_none(self, tmp_path):
        """A results directory holding only the Aeon CSV is treated as a miss."""
        import pandas as pd

        file_uuid = "partial-uuid"
        settings_hash = "partial-hash"
        results_dir = tmp_path / file_uuid / "results" / settings_hash
        results_dir.mkdir(parents=True)

        # Write just one of the result files by hand.
        aeon_results = pd.DataFrame({"Confidence": [0.95]}, index=["BRCA"])
        aeon_path = results_dir / "aeon_results.csv"
        aeon_results.to_csv(aeon_path, index=True)

        loaded = load_analysis_results(file_uuid, settings_hash, cache_dir=tmp_path)
        assert loaded is None

    def test_settings_hash_deterministic(self):
        """Identical settings hash to the same 12-character value."""
        seg_config = {"threshold": 0.5, "min_size": 100}
        hash1 = compute_settings_hash(
            seg_config, "Primary", "Male", "Breast", "BRCA", "HER2+"
        )
        hash2 = compute_settings_hash(
            seg_config, "Primary", "Male", "Breast", "BRCA", "HER2+"
        )
        assert hash1 == hash2
        assert len(hash1) == 12

    def test_settings_hash_varies_with_input(self):
        """Changing the site type or the sex changes the hash."""
        seg_config = {"threshold": 0.5, "min_size": 100}

        hash1 = compute_settings_hash(
            seg_config, "Primary", "Male", "Breast", "BRCA", None
        )
        hash2 = compute_settings_hash(
            seg_config, "Metastatic", "Male", "Breast", "BRCA", None
        )
        hash3 = compute_settings_hash(
            seg_config, "Primary", "Female", "Breast", "BRCA", None
        )

        assert hash1 != hash2
        assert hash1 != hash3
        assert hash2 != hash3

    def test_save_with_none_components(self, tmp_path):
        """Saving with every component None creates the directory but no files."""
        file_uuid = "none-components-uuid"
        settings_hash = "none-hash"

        save_analysis_results(
            file_uuid, settings_hash, None, None, None, cache_dir=tmp_path
        )

        results_dir = tmp_path / file_uuid / "results" / settings_hash
        assert results_dir.exists()

        assert not (results_dir / "aeon_results.csv").exists()
        assert not (results_dir / "paladin_results.csv").exists()
        assert not (results_dir / "slide_mask.png").exists()

    def test_aeon_index_restored(self, tmp_path):
        """The Aeon table's named index is restored on load."""
        import pandas as pd
        from PIL import Image

        file_uuid = "index-uuid"
        settings_hash = "index-hash"

        aeon_results = pd.DataFrame({"Confidence": [0.95]}, index=["BRCA"])
        aeon_results.index.name = "Cancer Subtype"

        paladin_results = pd.DataFrame({"Biomarker": ["HER2"]})
        slide_mask = Image.new("RGB", (50, 50))

        save_analysis_results(
            file_uuid,
            settings_hash,
            aeon_results,
            paladin_results,
            slide_mask,
            cache_dir=tmp_path,
        )

        loaded = load_analysis_results(file_uuid, settings_hash, cache_dir=tmp_path)
        # Fail with a clear assertion instead of a TypeError on unpacking
        # if the cache lookup unexpectedly misses.
        assert loaded is not None
        _, loaded_aeon, _ = loaded

        # Same expectation as the full round-trip test: the saved index name.
        assert loaded_aeon.index.name == "Cancer Subtype"
        assert list(loaded_aeon.index) == ["BRCA"]
|
|
|
|
| class TestHFDatasetSync: |
| """Tests for HuggingFace Dataset sync functionality.""" |
|
|
def test_upload_results_calls_hf_api(self, tmp_path, monkeypatch):
    """Test that upload_results_to_hf uploads at least one file via the HF API.

    A stand-in ``huggingface_hub`` module is installed in ``sys.modules``
    through ``monkeypatch.setitem``, so whatever was registered under that
    name before the test (including nothing) is restored afterwards.
    """
    import json
    import sys
    from types import ModuleType

    file_uuid = "upload-uuid"
    slide_dir = tmp_path / file_uuid
    slide_dir.mkdir()

    # Local cache content for the upload: a metadata file ...
    metadata_path = slide_dir / METADATA_CACHE_FILENAME
    metadata_path.write_text(json.dumps({"cache_version": 1, "sex": "Male"}))

    # ... and one results file under a settings-hash directory.
    results_dir = slide_dir / "results" / "hash1"
    results_dir.mkdir(parents=True)
    (results_dir / "aeon_results.csv").write_text("test")

    mock_api = MagicMock()

    def mock_hf_api():
        return mock_api

    mock_hf_hub = ModuleType("huggingface_hub")
    mock_hf_hub.HfApi = mock_hf_api
    # setitem (instead of assigning and later deleting the key) keeps a real,
    # already-imported huggingface_hub from being discarded by this test.
    monkeypatch.setitem(sys.modules, "huggingface_hub", mock_hf_hub)

    monkeypatch.setenv("MOSAIC_TCGA_CACHE_HF_REPO", "test-org/test-repo")

    upload_results_to_hf(file_uuid, cache_dir=tmp_path)

    assert mock_api.upload_file.call_count >= 1
|
|
def test_upload_noop_without_env_var(self, tmp_path, monkeypatch):
    """With the HF repo variable unset, the upload call returns without raising."""
    monkeypatch.delenv("MOSAIC_TCGA_CACHE_HF_REPO", raising=False)

    # No assertion needed: the test passes as long as nothing is raised.
    upload_results_to_hf("noop-uuid", cache_dir=tmp_path)
|
|
def test_download_results_populates_cache(self, tmp_path, monkeypatch):
    """Test that download_results_from_hf populates the local cache.

    A stand-in ``huggingface_hub`` module is installed in ``sys.modules``
    through ``monkeypatch.setitem``, so whatever was registered under that
    name before the test (including nothing) is restored afterwards.
    """
    import sys
    from types import ModuleType

    file_uuid = "download-uuid"

    mock_api = MagicMock()

    class MockRepoFile:
        """Minimal repo-tree entry: only the ``path`` attribute is set."""

        def __init__(self, path):
            self.path = path

    # Remote listing: one metadata file and one results file for this UUID.
    mock_api.list_repo_tree.return_value = [
        MockRepoFile(f"{file_uuid}/metadata.json"),
        MockRepoFile(f"{file_uuid}/results/hash1/aeon_results.csv"),
    ]

    def mock_hf_api():
        return mock_api

    def mock_download(repo_id, filename, repo_type):
        # Write a placeholder file locally and return its path as a string.
        temp_file = tmp_path / "temp" / filename.replace("/", "_")
        temp_file.parent.mkdir(parents=True, exist_ok=True)
        temp_file.write_text("test content")
        return str(temp_file)

    mock_hf_hub = ModuleType("huggingface_hub")
    mock_hf_hub.HfApi = mock_hf_api
    mock_hf_hub.hf_hub_download = mock_download
    # setitem (instead of assigning and later deleting the key) keeps a real,
    # already-imported huggingface_hub from being discarded by this test.
    monkeypatch.setitem(sys.modules, "huggingface_hub", mock_hf_hub)

    monkeypatch.setenv("MOSAIC_TCGA_CACHE_HF_REPO", "test-org/test-repo")

    result = download_results_from_hf(file_uuid, cache_dir=tmp_path)

    assert result is True
    assert (tmp_path / file_uuid / "metadata.json").exists()
|
|
| def test_download_noop_without_env_var(self, tmp_path, monkeypatch): |
| """Test that download is no-op when env var not set.""" |
| monkeypatch.delenv("MOSAIC_TCGA_CACHE_HF_REPO", raising=False) |
|
|
| result = download_results_from_hf("test-uuid", cache_dir=tmp_path) |
| assert result is False |
|
|
| def test_download_handles_api_error_gracefully(self, tmp_path, monkeypatch): |
| """Test that download handles API errors gracefully.""" |
| |
| mock_api = MagicMock() |
| mock_api.list_repo_tree.side_effect = Exception("API error") |
|
|
| def mock_hf_api(): |
| return mock_api |
|
|
| |
| import sys |
| from types import ModuleType |
|
|
| mock_hf_hub = ModuleType("huggingface_hub") |
| mock_hf_hub.HfApi = mock_hf_api |
| sys.modules["huggingface_hub"] = mock_hf_hub |
|
|
| monkeypatch.setenv("MOSAIC_TCGA_CACHE_HF_REPO", "test-org/test-repo") |
|
|
| try: |
| result = download_results_from_hf("test-uuid", cache_dir=tmp_path) |
| assert result is False |
| finally: |
| |
| if "huggingface_hub" in sys.modules: |
| del sys.modules["huggingface_hub"] |
|
|
|
|
class TestGetCaseMetadataWithCache:
    """Exercise get_case_metadata together with the on-disk metadata cache."""

    @patch("mosaic.tcga.get_slide_metadata")
    def test_cache_hit_skips_api(self, mock_get_metadata, tmp_path):
        """A pre-seeded cache entry is returned without calling get_slide_metadata."""
        file_uuid = "cached-uuid"
        (tmp_path / file_uuid).mkdir()

        # Seed the cache with a known record before asking for metadata.
        seeded = TCGAMetadata(
            sex="Male",
            tissue_site="Breast",
            cancer_subtype="BRCA",
            site_type="Primary",
        )
        save_metadata_cache(file_uuid, seeded, cache_dir=tmp_path)

        fetched = get_case_metadata(file_uuid, ["Breast"], cache_dir=tmp_path)

        # The patched API function must not have been touched.
        mock_get_metadata.assert_not_called()

        # Values come straight from the seeded record.
        assert fetched.sex == "Male"
        assert fetched.cancer_subtype == "BRCA"

    @patch("mosaic.tcga.get_slide_metadata")
    @patch("mosaic.tcga.convert_gdc_metadata_to_mosaic")
    def test_cache_miss_fetches_and_saves(
        self, mock_convert, mock_get_metadata, tmp_path
    ):
        """With no cache entry, the API is called once and the result is cached."""
        file_uuid = "uncached-uuid"
        (tmp_path / file_uuid).mkdir()

        # Canned responses for the two patched functions.
        converted = TCGAMetadata(
            sex="Female",
            tissue_site="Lung",
            cancer_subtype="LUAD",
            site_type="Primary",
        )
        mock_convert.return_value = converted
        mock_get_metadata.return_value = {"file_name": "test.svs"}

        fetched = get_case_metadata(file_uuid, ["Lung"], cache_dir=tmp_path)

        # Exactly one API lookup happened.
        mock_get_metadata.assert_called_once()

        # The caller sees the converted record.
        assert fetched.sex == "Female"

        # The same record can now be read back from the cache.
        reloaded = load_metadata_cache(file_uuid, cache_dir=tmp_path)
        assert reloaded is not None
        assert reloaded.sex == "Female"
|
|
|
|
class TestTCGACacheDirHFSpaces:
    """Test TCGA cache directory detection on HF Spaces."""

    def test_tcga_cache_dir_uses_data_on_hf_spaces(self, monkeypatch, tmp_path):
        """Test that HF Spaces uses /data for persistent storage.

        All module state touched here (``SPACE_ID``, ``data_directory.Path``
        and ``data_directory._TCGA_CACHE_DIR``) is changed through
        ``monkeypatch`` so that it is restored after the test and a directory
        resolved under the patched ``Path`` cannot leak into later tests.
        """
        from mosaic import data_directory
        from mosaic.data_directory import get_tcga_cache_directory

        # NOTE(review): SPACE_ID is presumably how the module detects HF Spaces
        # -- confirm against mosaic.data_directory.
        monkeypatch.setenv("SPACE_ID", "test-space-id")

        # Temporary directory standing in for the real "/data" mount.
        mock_data_dir = tmp_path / "data"
        mock_data_dir.mkdir()

        original_path = Path

        class MockPath:
            """Path factory that redirects exactly "/data" to the temp directory."""

            def __new__(cls, path_str):
                if path_str == "/data":
                    return mock_data_dir
                return original_path(path_str)

        monkeypatch.setattr(data_directory, "Path", MockPath)

        # Reset the module-level value so the directory is resolved afresh.
        # raising=False keeps this working even if the attribute has not been
        # created yet; monkeypatch puts the previous state back on teardown.
        monkeypatch.setattr(data_directory, "_TCGA_CACHE_DIR", None, raising=False)

        cache_dir = get_tcga_cache_directory()

        assert "mosaic_tcga_slides" in str(cache_dir)
|
|
|
|
class TestUIAnalysisCachingIntegration:
    """Integration tests for TCGA caching through the UI analysis flow."""

    def test_cache_hit_skips_analysis(self, tmp_path):
        """Test that saved results load back for the same UUID and settings hash.

        A successful load is the cache hit that makes re-running the analysis
        unnecessary; the pipeline itself is not invoked here.
        """
        import pandas as pd
        from PIL import Image

        file_uuid = "ui-cache-hit-uuid"
        settings_hash = compute_settings_hash(
            seg_config="TCGA",
            site_type="Primary",
            sex="Male",
            tissue_site="Lung",
            cancer_subtype="LUAD",
            ihc_subtype="",
        )

        # Small, recognisable result set to store in the cache.
        aeon_results = pd.DataFrame(
            {"Confidence": [0.95, 0.85]}, index=["LUAD", "LUSC"]
        )
        aeon_results.index.name = "Cancer Subtype"
        paladin_results = pd.DataFrame({"Biomarker": ["PD-L1"], "Probability": [0.7]})
        slide_mask = Image.new("RGB", (100, 100), color="blue")

        save_analysis_results(
            file_uuid,
            settings_hash,
            aeon_results,
            paladin_results,
            slide_mask,
            cache_dir=tmp_path,
        )

        # Loading with the same key must find the stored results.
        cached = load_analysis_results(file_uuid, settings_hash, cache_dir=tmp_path)
        assert cached is not None

        loaded_mask, loaded_aeon, loaded_paladin = cached
        assert list(loaded_aeon.index) == ["LUAD", "LUSC"]
        assert loaded_paladin["Biomarker"].iloc[0] == "PD-L1"
        assert loaded_mask.size == (100, 100)

    def test_cache_miss_returns_none(self, tmp_path):
        """Test that a cache miss returns None, requiring fresh analysis."""
        settings_hash = compute_settings_hash(
            seg_config="TCGA",
            site_type="Primary",
            sex="Female",
            tissue_site="Breast",
            cancer_subtype="BRCA",
            ihc_subtype="HER2+",
        )

        cached = load_analysis_results(
            "nonexistent-uuid", settings_hash, cache_dir=tmp_path
        )
        assert cached is None

    def test_uuid_extraction_from_tcga_cache_path(self, tmp_path):
        """Test extracting UUID from a slide path under TCGA cache directory."""
        file_uuid = "a1b2c3d4-e5f6-7890-abcd-ef1234567890"
        slide_dir = tmp_path / file_uuid
        slide_dir.mkdir()
        slide_path = slide_dir / "test_slide.svs"
        slide_path.touch()

        # Rebuild the path from its string form, then recover the UUID from
        # the name of the directory that contains the slide.
        slide_path_obj = Path(str(slide_path))

        # Assert the precondition instead of guarding with `if`: a guard would
        # let the test pass without checking anything when it is false.
        assert tmp_path in slide_path_obj.parents
        extracted_uuid = slide_path_obj.parent.name
        assert extracted_uuid == file_uuid

    def test_settings_hash_consistency_across_calls(self):
        """Test that settings hash is consistent across multiple calls."""
        kwargs = dict(
            seg_config="TCGA",
            site_type="Primary",
            sex="Male",
            tissue_site="Lung",
            cancer_subtype="LUAD",
            ihc_subtype=None,
        )
        # Ten calls with identical arguments must collapse to one distinct hash.
        hashes = {compute_settings_hash(**kwargs) for _ in range(10)}
        assert len(hashes) == 1
|
|
|
|
class TestCacheEdgeCases:
    """Boundary and failure scenarios for the cache helpers."""

    def test_save_and_load_empty_dataframes(self, tmp_path):
        """Zero-row result frames survive a save/load round trip."""
        import pandas as pd
        from PIL import Image

        file_uuid, settings_hash = "empty-df-uuid", "empty-df-hash"

        empty_aeon = pd.DataFrame({"Confidence": []})
        empty_aeon.index.name = "Cancer Subtype"
        empty_paladin = pd.DataFrame({"Biomarker": [], "Probability": []})
        mask = Image.new("RGB", (50, 50))

        save_analysis_results(
            file_uuid,
            settings_hash,
            empty_aeon,
            empty_paladin,
            mask,
            cache_dir=tmp_path,
        )

        round_trip = load_analysis_results(
            file_uuid, settings_hash, cache_dir=tmp_path
        )
        assert round_trip is not None
        _, aeon_back, paladin_back = round_trip
        assert len(aeon_back) == 0
        assert len(paladin_back) == 0

    def test_load_corrupt_csv_returns_none(self, tmp_path):
        """A malformed aeon CSV yields either None or a well-formed 3-tuple.

        The assertion deliberately accepts both outcomes; it only rules out
        any other return shape.
        """
        from PIL import Image

        file_uuid, settings_hash = "corrupt-csv-uuid", "corrupt-csv-hash"
        results_dir = tmp_path / file_uuid / "results" / settings_hash
        results_dir.mkdir(parents=True)

        # aeon CSV ends inside an unterminated quoted field.
        (results_dir / "aeon_results.csv").write_text('not,valid\ncsv,"broken')
        (results_dir / "paladin_results.csv").write_text("col1\nval1")
        # The mask image itself is valid.
        Image.new("RGB", (10, 10)).save(results_dir / "slide_mask.png")

        outcome = load_analysis_results(file_uuid, settings_hash, cache_dir=tmp_path)

        is_valid_triple = isinstance(outcome, tuple) and len(outcome) == 3
        assert outcome is None or is_valid_triple, "Expected None or valid 3-tuple"

    def test_load_corrupt_image_returns_none(self, tmp_path):
        """Valid CSVs next to an unreadable mask image load as None."""
        import pandas as pd

        file_uuid, settings_hash = "corrupt-img-uuid", "corrupt-img-hash"
        results_dir = tmp_path / file_uuid / "results" / settings_hash
        results_dir.mkdir(parents=True)

        # Both CSV files are written through pandas.
        pd.DataFrame({"Confidence": [0.9]}, index=["BRCA"]).to_csv(
            results_dir / "aeon_results.csv", index=True
        )
        pd.DataFrame({"Biomarker": ["HER2"]}).to_csv(
            results_dir / "paladin_results.csv", index=False
        )

        # The "PNG" is plain text.
        (results_dir / "slide_mask.png").write_text("not a valid PNG file")

        outcome = load_analysis_results(file_uuid, settings_hash, cache_dir=tmp_path)

        assert outcome is None

    def test_settings_hash_with_nested_dict_ordering(self):
        """Dict-valued seg_config hashes the same regardless of key order."""
        seg_config_1 = {"threshold": 0.5, "min_size": 100, "method": "otsu"}
        seg_config_2 = {"method": "otsu", "threshold": 0.5, "min_size": 100}

        # Remaining positional arguments are identical for both calls.
        shared_args = ("Primary", "Male", "Lung", "LUAD", None)

        first = compute_settings_hash(seg_config_1, *shared_args)
        second = compute_settings_hash(seg_config_2, *shared_args)
        assert first == second

    def test_metadata_cache_with_special_characters(self, tmp_path):
        """Fields containing '&', dots and dashes round-trip unchanged."""
        file_uuid = "special-chars-uuid"
        (tmp_path / file_uuid).mkdir()

        tissue = "Liver & Intrahepatic Bile Ducts"
        slide_filename = (
            "TCGA-W5-AA2R-01Z-00-DX1.C7E20B0C-5F4F-4352-B8E8-4D2EBEE48CD1.svs"
        )

        record = TCGAMetadata(
            sex="Male",
            tissue_site=tissue,
            cancer_subtype="CHOL",
            site_type="Primary",
            filename=slide_filename,
            file_size=2147483647,
        )

        save_metadata_cache(file_uuid, record, cache_dir=tmp_path)
        restored = load_metadata_cache(file_uuid, cache_dir=tmp_path)

        assert restored is not None
        assert restored.tissue_site == tissue
        assert restored.filename == slide_filename

    def test_metadata_cache_invalid_structure_returns_none(self, tmp_path):
        """A cache file holding only the version and ``sex`` loads as None."""
        import json

        file_uuid = "bad-structure-uuid"
        slide_dir = tmp_path / file_uuid
        slide_dir.mkdir()

        # Only two keys are written; every other metadata field is absent.
        incomplete_payload = {
            "cache_version": METADATA_CACHE_VERSION,
            "sex": "Male",
        }
        (slide_dir / METADATA_CACHE_FILENAME).write_text(
            json.dumps(incomplete_payload)
        )

        restored = load_metadata_cache(file_uuid, cache_dir=tmp_path)

        assert restored is None

    def test_save_analysis_results_disk_full_error(self, tmp_path, monkeypatch):
        """Saving must not raise when Path.mkdir fails with errno 28."""
        import pandas as pd
        from PIL import Image

        file_uuid, settings_hash = "disk-full-uuid", "disk-full-hash"

        aeon_results = pd.DataFrame({"Confidence": [0.9]}, index=["BRCA"])
        paladin_results = pd.DataFrame({"Biomarker": ["HER2"]})
        slide_mask = Image.new("RGB", (10, 10))

        real_mkdir = Path.mkdir

        def failing_mkdir(self, *args, **kwargs):
            # Paths unrelated to this test keep the real behaviour.
            if "disk-full" not in str(self):
                return real_mkdir(self, *args, **kwargs)
            enospc = OSError("No space left on device")
            enospc.errno = 28
            raise enospc

        monkeypatch.setattr(Path, "mkdir", failing_mkdir)

        # Completing without an exception is the whole assertion.
        save_analysis_results(
            file_uuid,
            settings_hash,
            aeon_results,
            paladin_results,
            slide_mask,
            cache_dir=tmp_path,
        )
|
|
|
|
class TestHFAuthenticationFailures:
    """Tests for HF Dataset authentication and permission error handling.

    Since loguru is mocked in conftest.py, these tests verify that the correct
    logger methods are called with appropriate error messages by inspecting
    the mock logger's call args.

    The stub ``huggingface_hub`` module is installed with
    ``monkeypatch.setitem``, so the previous ``sys.modules`` entry (if any) is
    restored automatically after each test; no manual cleanup is needed.
    """

    @staticmethod
    def _get_mock_logger():
        """Get the mock logger used by mosaic.tcga."""
        import mosaic.tcga

        return mosaic.tcga.logger

    @staticmethod
    def _get_log_messages(mock_logger):
        """Extract all log message strings from mock logger calls.

        Collects the first positional argument of every recorded call to
        ``error``, ``warning``, ``info`` and ``debug`` (in that order),
        converted with ``str``.  Calls without positional arguments are
        ignored.
        """
        return [
            str(call.args[0])
            for method_name in ("error", "warning", "info", "debug")
            for call in getattr(mock_logger, method_name).call_args_list
            if call.args
        ]

    @staticmethod
    def _setup_hf_mock(mock_api, monkeypatch):
        """Set up HF hub mock module.

        Args:
            mock_api: Object returned by the stub's zero-argument ``HfApi``.
            monkeypatch: pytest ``monkeypatch`` fixture; it reverts the
                ``sys.modules`` entry and the env var on teardown.
        """
        import sys
        from types import ModuleType

        mock_hf_hub = ModuleType("huggingface_hub")
        mock_hf_hub.HfApi = lambda: mock_api
        mock_hf_hub.hf_hub_download = MagicMock()
        # setitem (not plain assignment) so a real huggingface_hub that was
        # already imported is put back instead of being deleted.
        monkeypatch.setitem(sys.modules, "huggingface_hub", mock_hf_hub)
        monkeypatch.setenv("MOSAIC_TCGA_CACHE_HF_REPO", "test-org/test-repo")

    @staticmethod
    def _make_cached_slide(tmp_path, file_uuid):
        """Create ``tmp_path/<file_uuid>/`` holding a minimal metadata cache file."""
        import json

        slide_dir = tmp_path / file_uuid
        slide_dir.mkdir()
        metadata_path = slide_dir / METADATA_CACHE_FILENAME
        metadata_path.write_text(json.dumps({"cache_version": 1, "sex": "Male"}))
        return slide_dir

    def test_upload_auth_error_logged(self, tmp_path, monkeypatch):
        """Test that 401 auth errors are logged with actionable message."""
        file_uuid = "auth-error-uuid"
        self._make_cached_slide(tmp_path, file_uuid)

        mock_api = MagicMock()
        mock_api.upload_file.side_effect = Exception("401 Unauthorized: Invalid token")

        self._setup_hf_mock(mock_api, monkeypatch)
        mock_logger = self._get_mock_logger()
        # Drop calls recorded by earlier tests on the shared mock logger.
        mock_logger.reset_mock()

        upload_results_to_hf(file_uuid, cache_dir=tmp_path)

        messages = self._get_log_messages(mock_logger)
        assert any(
            "401" in msg or "Authentication" in msg for msg in messages
        ), f"Expected '401' or 'Authentication' in log messages, got: {messages}"

    def test_upload_permission_denied_logged(self, tmp_path, monkeypatch):
        """Test that 403 permission errors are logged with actionable message."""
        file_uuid = "perm-denied-uuid"
        self._make_cached_slide(tmp_path, file_uuid)

        mock_api = MagicMock()
        mock_api.upload_file.side_effect = Exception(
            "403 Forbidden: You don't have permission"
        )

        self._setup_hf_mock(mock_api, monkeypatch)
        mock_logger = self._get_mock_logger()
        mock_logger.reset_mock()

        upload_results_to_hf(file_uuid, cache_dir=tmp_path)

        messages = self._get_log_messages(mock_logger)
        assert any(
            "403" in msg or "Permission denied" in msg for msg in messages
        ), f"Expected '403' or 'Permission denied' in log messages, got: {messages}"

    def test_download_auth_error_logged(self, tmp_path, monkeypatch):
        """Test that download auth errors are logged with actionable message."""
        mock_api = MagicMock()
        mock_api.list_repo_tree.side_effect = Exception(
            "401 Unauthorized: Invalid token"
        )

        self._setup_hf_mock(mock_api, monkeypatch)
        mock_logger = self._get_mock_logger()
        mock_logger.reset_mock()

        result = download_results_from_hf("test-uuid", cache_dir=tmp_path)

        assert result is False
        messages = self._get_log_messages(mock_logger)
        assert any(
            "401" in msg or "authentication" in msg.lower() for msg in messages
        ), f"Expected '401' or 'authentication' in log messages, got: {messages}"

    def test_download_repo_not_found_logged(self, tmp_path, monkeypatch):
        """Test that 404 repo not found errors are logged with actionable message."""
        mock_api = MagicMock()
        mock_api.list_repo_tree.side_effect = Exception(
            "404 Not Found: Repository not found"
        )

        self._setup_hf_mock(mock_api, monkeypatch)
        mock_logger = self._get_mock_logger()
        mock_logger.reset_mock()

        result = download_results_from_hf("test-uuid", cache_dir=tmp_path)

        assert result is False
        messages = self._get_log_messages(mock_logger)
        assert any(
            "404" in msg or "not found" in msg.lower() for msg in messages
        ), f"Expected '404' or 'not found' in log messages, got: {messages}"

    def test_upload_rate_limit_logged_as_warning(self, tmp_path, monkeypatch):
        """Test that rate limit errors are logged as warnings.

        NOTE(review): the check below scans messages from every log level, not
        only ``warning``; tighten it if the level itself must be enforced.
        """
        file_uuid = "rate-limit-uuid"
        self._make_cached_slide(tmp_path, file_uuid)

        mock_api = MagicMock()
        mock_api.upload_file.side_effect = Exception(
            "429 Too Many Requests: Rate limit exceeded"
        )

        self._setup_hf_mock(mock_api, monkeypatch)
        mock_logger = self._get_mock_logger()
        mock_logger.reset_mock()

        upload_results_to_hf(file_uuid, cache_dir=tmp_path)

        messages = self._get_log_messages(mock_logger)
        assert any(
            "429" in msg or "rate limit" in msg.lower() for msg in messages
        ), f"Expected '429' or 'rate limit' in log messages, got: {messages}"
|
|
|
|
| |
@pytest.mark.skip(reason="Integration test - requires network access")
class TestGDCAPIRealIntegration:
    """Placeholders for live GDC API checks; the whole class is skipped."""

    def test_fetch_public_slide_metadata(self):
        """Placeholder: fetch metadata for a known public TCGA slide.

        Intentionally has no body yet.
        """

    def test_query_known_barcode(self):
        """Placeholder: query a known TCGA barcode.

        Intentionally has no body yet.
        """
|
|