raylim Claude Sonnet 4.5 committed on
Commit
8607da4
·
unverified ·
1 Parent(s): d1e1666

Bundle metadata files in repo to skip download on startup

Browse files

- Add data/*.csv and data/metadata/ to repo
- Update .gitignore to allow metadata files while ignoring model files
- Update Dockerfile to copy metadata files into image
- Update download_metadata() to use local files if available
- Fall back to HuggingFace download if local files are not found
- Reduces startup time by ~2 seconds (no network call needed)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

.gitignore CHANGED
@@ -11,7 +11,10 @@ tmp*
11
  *.swp
12
  .idea/
13
  .vscode/
14
- data/
 
 
 
15
  .pytest_cache/
16
  .coverage
17
  htmlcov/
 
11
  *.swp
12
  .idea/
13
  .vscode/
14
+ data/*
15
+ # But include metadata files
16
+ !data/*.csv
17
+ !data/metadata/
18
  .pytest_cache/
19
  .coverage
20
  htmlcov/
Dockerfile CHANGED
@@ -15,6 +15,7 @@ WORKDIR /app
15
  # Copy project files
16
  COPY pyproject.toml README.md app.py ./
17
  COPY src/ ./src/
 
18
 
19
  # Create venv and install with dependencies using GH_TOKEN secret
20
  # Use BuildKit secret mount to securely pass GH_TOKEN
 
15
  # Copy project files
16
  COPY pyproject.toml README.md app.py ./
17
  COPY src/ ./src/
18
+ # Bundle metadata into the image. A directory source in COPY contributes only
+ # its contents, so metadata/ gets its own instruction to stay a subdirectory.
+ COPY data/*.csv ./data/
+ COPY data/metadata/ ./data/metadata/
19
 
20
  # Create venv and install with dependencies using GH_TOKEN secret
21
  # Use BuildKit secret mount to securely pass GH_TOKEN
data/metadata/int_to_name_class_mapping.tsv ADDED
@@ -0,0 +1,160 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 0 ACC
2
+ 1 ACRM
3
+ 2 ACYC
4
+ 3 ALUCA
5
+ 4 ANGS
6
+ 5 ANSC
7
+ 6 ARMM
8
+ 7 ARMS
9
+ 8 ASTR
10
+ 9 ATM
11
+ 10 BA
12
+ 11 BCC
13
+ 12 BLAD
14
+ 13 BLCA
15
+ 14 BMGCT
16
+ 15 CCOV
17
+ 16 CCRCC
18
+ 17 CESC
19
+ 18 CHDM
20
+ 19 CHRCC
21
+ 20 CHS
22
+ 21 COAD
23
+ 22 CSCC
24
+ 23 DA
25
+ 24 DASTR
26
+ 25 DDLS
27
+ 26 DES
28
+ 27 DSRCT
29
+ 28 ECAD
30
+ 29 EGC
31
+ 30 EHAE
32
+ 31 EHCH
33
+ 32 EMPD
34
+ 33 EOV
35
+ 34 EPIS
36
+ 35 EPM
37
+ 36 ERMS
38
+ 37 ES
39
+ 38 ESCA
40
+ 39 ESCC
41
+ 40 GBAD
42
+ 41 GBM
43
+ 42 GCCAP
44
+ 43 GEJ
45
+ 44 GIST
46
+ 45 GRCT
47
+ 46 HCC
48
+ 47 HGNEC
49
+ 48 HGSOC
50
+ 49 HNMUCM
51
+ 50 HNSC
52
+ 51 IDC
53
+ 52 IHCH
54
+ 53 ILC
55
+ 54 LGSOC
56
+ 55 LMS
57
+ 56 LUAD
58
+ 57 LUCA
59
+ 58 LUNE
60
+ 59 LUPC
61
+ 60 LUSC
62
+ 61 MAAP
63
+ 62 MACR
64
+ 63 MBC
65
+ 64 MCC
66
+ 65 MFH
67
+ 66 MFS
68
+ 67 MNG
69
+ 68 MOV
70
+ 69 MPNST
71
+ 70 MRLS
72
+ 71 NBL
73
+ 72 NPC
74
+ 73 NSGCT
75
+ 74 OCS
76
+ 75 ODG
77
+ 76 OPHSC
78
+ 77 OS
79
+ 78 PAAC
80
+ 79 PAAD
81
+ 80 PAMPCA
82
+ 81 PANET
83
+ 82 PAST
84
+ 83 PECOMA
85
+ 84 PEMESO
86
+ 85 PHC
87
+ 86 PLMESO
88
+ 87 PRAD
89
+ 88 PRCC
90
+ 89 PTAD
91
+ 90 RBL
92
+ 91 READ
93
+ 92 SBOV
94
+ 93 SBWDNET
95
+ 94 SCBC
96
+ 95 SCHW
97
+ 96 SCLC
98
+ 97 SDCA
99
+ 98 SEM
100
+ 99 SFT
101
+ 100 SKCM
102
+ 101 SSRCC
103
+ 102 STAD
104
+ 103 SYNS
105
+ 104 TAC
106
+ 105 THAP
107
+ 106 THHC
108
+ 107 THME
109
+ 108 THPA
110
+ 109 THPD
111
+ 110 THYC
112
+ 111 THYM
113
+ 112 UCCC
114
+ 113 UCP
115
+ 114 UCS
116
+ 115 UEC
117
+ 116 ULMS
118
+ 117 UM
119
+ 118 USC
120
+ 119 UTUC
121
+ 120 VMM
122
+ 121 VSC
123
+ 122 WDLS
124
+ 123 WT
125
+ 124 RCC
126
+ 125 ADNOS
127
+ 126 CUPNOS
128
+ 127 NOT
129
+ 128 MDLC
130
+ 129 URCC
131
+ 130 NVRINT
132
+ 131 PAASC
133
+ 132 MEL
134
+ 133 GBC
135
+ 134 BRCNOS
136
+ 135 LUAS
137
+ 136 UDMN
138
+ 137 UMEC
139
+ 138 NETNOS
140
+ 139 COADREAD
141
+ 140 CHOL
142
+ 141 PDC
143
+ 142 GNOS
144
+ 143 GINET
145
+ 144 NSCLC
146
+ 145 CSCLC
147
+ 146 SBC
148
+ 147 NECNOS
149
+ 148 BRCA
150
+ 149 SCCNOS
151
+ 150 AMPCA
152
+ 151 CUP
153
+ 152 SARCNOS
154
+ 153 BRCANOS
155
+ 154 APAD
156
+ 155 NSCLCPD
157
+ 156 DIFG
158
+ 157 MXOV
159
+ 158 UCEC
160
+ 159 MUP
data/metadata/longest_sequence.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 20000
data/metadata/n_classes.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 160
data/metadata/ontology_embedding_dim.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ 32
data/metadata/target_dict.tsv ADDED
@@ -0,0 +1 @@
 
 
1
+ {'histologies': ['ACC', 'ACRM', 'ACYC', 'ALUCA', 'ANGS', 'ANSC', 'ARMM', 'ARMS', 'ASTR', 'ATM', 'BA', 'BCC', 'BLAD', 'BLCA', 'BMGCT', 'CCOV', 'CCRCC', 'CESC', 'CHDM', 'CHRCC', 'CHS', 'COAD', 'CSCC', 'DA', 'DASTR', 'DDLS', 'DES', 'DSRCT', 'ECAD', 'EGC', 'EHAE', 'EHCH', 'EMPD', 'EOV', 'EPIS', 'EPM', 'ERMS', 'ES', 'ESCA', 'ESCC', 'GBAD', 'GBM', 'GCCAP', 'GEJ', 'GIST', 'GRCT', 'HCC', 'HGNEC', 'HGSOC', 'HNMUCM', 'HNSC', 'IDC', 'IHCH', 'ILC', 'LGSOC', 'LMS', 'LUAD', 'LUCA', 'LUNE', 'LUPC', 'LUSC', 'MAAP', 'MACR', 'MBC', 'MCC', 'MFH', 'MFS', 'MNG', 'MOV', 'MPNST', 'MRLS', 'NBL', 'NPC', 'NSGCT', 'OCS', 'ODG', 'OPHSC', 'OS', 'PAAC', 'PAAD', 'PAMPCA', 'PANET', 'PAST', 'PECOMA', 'PEMESO', 'PHC', 'PLMESO', 'PRAD', 'PRCC', 'PTAD', 'RBL', 'READ', 'SBOV', 'SBWDNET', 'SCBC', 'SCHW', 'SCLC', 'SDCA', 'SEM', 'SFT', 'SKCM', 'SSRCC', 'STAD', 'SYNS', 'TAC', 'THAP', 'THHC', 'THME', 'THPA', 'THPD', 'THYC', 'THYM', 'UCCC', 'UCP', 'UCS', 'UEC', 'ULMS', 'UM', 'USC', 'UTUC', 'VMM', 'VSC', 'WDLS', 'WT', 'RCC', 'ADNOS', 'CUPNOS', 'NOT', 'MDLC', 'URCC', 'NVRINT', 'PAASC', 'MEL', 'GBC', 'BRCNOS', 'LUAS', 'UDMN', 'UMEC', 'NETNOS', 'COADREAD', 'CHOL', 'PDC', 'GNOS', 'GINET', 'NSCLC', 'CSCLC', 'SBC', 'NECNOS', 'BRCA', 'SCCNOS', 'AMPCA', 'CUP', 'SARCNOS', 'BRCANOS', 'APAD', 'NSCLCPD', 'DIFG', 'MXOV', 'UCEC', 'MUP'], 'oncotree_mapping': {'MGCT': 'NSGCT', 'PLBMESO': 'PLMESO', 'PLEMESO': 'PLMESO', 'DSTAD': 'SSRCC', 'HGSFT': 'HGSOC', 'PSEC': 'HGSOC', 'GB': 'GBM', 'AODG': 'ODG', 'AASTR': 'ASTR', 'LXSC': 'HNSC', 'OCSC': 'HNSC'}, 'sites': ['Primary', 'Metastasis'], 'target': ['oncotree_code_aeon'], 'target_type': ['other'], 'task': ['multiclass-classification'], 'class_weights': [3.685110294117647, 2.9138081395348836, 1.606330128205128, 6.111890243902439, 3.13234375, 1.8700559701492536, 4.818990384615384, 6.425320512820512, 0.9564408396946565, 7.830859375, 8.352916666666667, 5.966369047619048, 7.830859375, 0.1324458245243129, 8.949553571428572, 1.7772163120567377, 0.4427340989399293, 1.8562037037037038, 
6.425320512820512, 3.5798214285714285, 3.9154296875, 0.08294852697782192, 2.5311868686868686, 3.977579365079365, 6.425320512820512, 1.1287725225225225, 4.3204741379310345, 2.088229166666667, 2.880316091954023, 1.3768543956043957, 7.593560606060606, 2.694489247311828, 3.0559451219512197, 2.7537087912087914, 8.949553571428572, 8.352916666666667, 4.3204741379310345, 2.298967889908257, 0.5556263858093127, 2.053995901639344, 2.8155898876404493, 0.2762816979051819, 4.640509259259259, 0.8325166112956811, 0.44351769911504424, 5.447554347826087, 0.9456132075471698, 5.695170454545455, 0.15545130272952853, 5.966369047619048, 0.7391961651917404, 0.07947589597209008, 0.5264443277310924, 0.46491187384044524, 2.198135964912281, 0.9943948412698412, 0.049318539657547726, 2.1236228813559324, 2.409495192307692, 5.01175, 0.31881361323155216, 1.912881679389313, 2.3640330188679246, 6.772635135135135, 2.386547619047619, 1.2592336683417085, 2.557015306122449, 3.796780303030303, 5.331648936170213, 3.5798214285714285, 3.432705479452055, 0.5708143507972665, 7.593560606060606, 1.5187121212121213, 3.13234375, 1.6817953020134229, 2.637763157894737, 1.3329122340425532, 6.425320512820512, 0.09549828506097562, 7.370220588235294, 0.8981630824372759, 5.966369047619048, 10.0235, 3.5798214285714285, 10.0235, 1.0397821576763486, 0.081998527486911, 3.432705479452055, 4.041733870967742, 3.855192307692308, 0.19041603343465047, 7.830859375, 2.983184523809524, 8.352916666666667, 6.960763888888889, 0.6846653005464481, 3.855192307692308, 3.0191265060240964, 3.796780303030303, 0.33590817694369973, 3.5294014084507044, 0.4197445561139028, 2.5833762886597937, 6.960763888888889, 2.6102864583333334, 5.966369047619048, 3.0559451219512197, 0.7639862804878049, 1.6705833333333333, 9.281018518518518, 5.447554347826087, 3.4803819444444444, 10.0235, 0.9179029304029304, 0.18027877697841727, 1.5468364197530864, 2.5833762886597937, 0.6327967171717171, 0.6629298941798942, 6.594407894736842, 5.01175, 3.171993670886076, 
3.4803819444444444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
src/mosaic/gradio_app.py CHANGED
@@ -29,26 +29,36 @@ from mosaic.model_manager import load_all_models
29
 
30
 
31
  def download_metadata():
32
- """Download only metadata files (CSV, tissue site mappings).
33
 
34
- Fast synchronous download of small files needed for UI initialization.
 
35
 
36
  Returns:
37
  tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
38
  """
39
- logger.info("Downloading metadata from HuggingFace Hub...")
40
- cache_dir = snapshot_download(
41
- repo_id="PDM-Group/paladin-aeon-models",
42
- allow_patterns=[
43
- "*.csv", # Model maps and metadata
44
- "tissue_site_*", # Tissue site mappings
45
- "metadata/*", # Metadata files (including target_dict.tsv)
46
- ],
47
- )
48
- logger.info(f"Metadata downloaded to: {cache_dir}")
 
 
 
 
 
 
 
 
 
49
 
50
  # Set the data directory for other modules to use
51
- set_data_directory(cache_dir)
52
 
53
  model_map = pd.read_csv(Path(cache_dir) / "paladin_model_map.csv")
54
  cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
 
29
 
30
 
31
  def download_metadata():
32
+ """Load metadata files from local data/ directory or download from HuggingFace.
33
 
34
+ Fast loading of small files needed for UI initialization.
35
+ Prioritizes local files in data/ directory (bundled with Docker image).
36
 
37
  Returns:
38
  tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
39
  """
40
+ # Check if metadata exists in local data/ directory (Docker deployment)
41
+ local_data_dir = Path(__file__).parent.parent.parent / "data"
42
+ local_model_map = local_data_dir / "paladin_model_map.csv"
43
+
44
+ if local_model_map.exists():
45
+ logger.info(f"Using local metadata from: {local_data_dir}")
46
+ cache_dir = local_data_dir
47
+ else:
48
+ # Fall back to downloading from HuggingFace Hub
49
+ logger.info("Downloading metadata from HuggingFace Hub...")
50
+ cache_dir = Path(snapshot_download(
51
+ repo_id="PDM-Group/paladin-aeon-models",
52
+ allow_patterns=[
53
+ "*.csv", # Model maps and metadata
54
+ "tissue_site_*", # Tissue site mappings
55
+ "metadata/*", # Metadata files (including target_dict.tsv)
56
+ ],
57
+ ))
58
+ logger.info(f"Metadata downloaded to: {cache_dir}")
59
 
60
  # Set the data directory for other modules to use
61
+ set_data_directory(str(cache_dir))
62
 
63
  model_map = pd.read_csv(Path(cache_dir) / "paladin_model_map.csv")
64
  cancer_subtypes = model_map["cancer_subtype"].unique().tolist()