Bundle metadata files in repo to skip download on startup
Browse files
- Add data/*.csv and data/metadata/ to repo
- Update .gitignore to allow metadata files while ignoring model files
- Update Dockerfile to copy metadata files into image
- Update download_metadata() to use local files if available
- Fall back to HuggingFace download if local files are not found
- Reduce startup time by ~2 seconds (no network call needed)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- .gitignore +4 -1
- Dockerfile +1 -0
- data/metadata/int_to_name_class_mapping.tsv +160 -0
- data/metadata/longest_sequence.txt +1 -0
- data/metadata/n_classes.txt +1 -0
- data/metadata/ontology_embedding_dim.txt +1 -0
- data/metadata/target_dict.tsv +1 -0
- src/mosaic/gradio_app.py +23 -13
.gitignore
CHANGED
|
@@ -11,7 +11,10 @@ tmp*
|
|
| 11 |
*.swp
|
| 12 |
.idea/
|
| 13 |
.vscode/
|
| 14 |
-
data
|
|
|
|
|
|
|
|
|
|
| 15 |
.pytest_cache/
|
| 16 |
.coverage
|
| 17 |
htmlcov/
|
|
|
|
| 11 |
*.swp
|
| 12 |
.idea/
|
| 13 |
.vscode/
|
| 14 |
+
data/*
|
| 15 |
+
# But include metadata files
|
| 16 |
+
!data/*.csv
|
| 17 |
+
!data/metadata/
|
| 18 |
.pytest_cache/
|
| 19 |
.coverage
|
| 20 |
htmlcov/
|
Dockerfile
CHANGED
|
@@ -15,6 +15,7 @@ WORKDIR /app
|
|
| 15 |
# Copy project files
|
| 16 |
COPY pyproject.toml README.md app.py ./
|
| 17 |
COPY src/ ./src/
|
|
|
|
| 18 |
|
| 19 |
# Create venv and install with dependencies using GH_TOKEN secret
|
| 20 |
# Use BuildKit secret mount to securely pass GH_TOKEN
|
|
|
|
| 15 |
# Copy project files
|
| 16 |
COPY pyproject.toml README.md app.py ./
|
| 17 |
COPY src/ ./src/
|
| 18 |
+
COPY data/*.csv data/metadata/ ./data/
|
| 19 |
|
| 20 |
# Create venv and install with dependencies using GH_TOKEN secret
|
| 21 |
# Use BuildKit secret mount to securely pass GH_TOKEN
|
data/metadata/int_to_name_class_mapping.tsv
ADDED
|
@@ -0,0 +1,160 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
0 ACC
|
| 2 |
+
1 ACRM
|
| 3 |
+
2 ACYC
|
| 4 |
+
3 ALUCA
|
| 5 |
+
4 ANGS
|
| 6 |
+
5 ANSC
|
| 7 |
+
6 ARMM
|
| 8 |
+
7 ARMS
|
| 9 |
+
8 ASTR
|
| 10 |
+
9 ATM
|
| 11 |
+
10 BA
|
| 12 |
+
11 BCC
|
| 13 |
+
12 BLAD
|
| 14 |
+
13 BLCA
|
| 15 |
+
14 BMGCT
|
| 16 |
+
15 CCOV
|
| 17 |
+
16 CCRCC
|
| 18 |
+
17 CESC
|
| 19 |
+
18 CHDM
|
| 20 |
+
19 CHRCC
|
| 21 |
+
20 CHS
|
| 22 |
+
21 COAD
|
| 23 |
+
22 CSCC
|
| 24 |
+
23 DA
|
| 25 |
+
24 DASTR
|
| 26 |
+
25 DDLS
|
| 27 |
+
26 DES
|
| 28 |
+
27 DSRCT
|
| 29 |
+
28 ECAD
|
| 30 |
+
29 EGC
|
| 31 |
+
30 EHAE
|
| 32 |
+
31 EHCH
|
| 33 |
+
32 EMPD
|
| 34 |
+
33 EOV
|
| 35 |
+
34 EPIS
|
| 36 |
+
35 EPM
|
| 37 |
+
36 ERMS
|
| 38 |
+
37 ES
|
| 39 |
+
38 ESCA
|
| 40 |
+
39 ESCC
|
| 41 |
+
40 GBAD
|
| 42 |
+
41 GBM
|
| 43 |
+
42 GCCAP
|
| 44 |
+
43 GEJ
|
| 45 |
+
44 GIST
|
| 46 |
+
45 GRCT
|
| 47 |
+
46 HCC
|
| 48 |
+
47 HGNEC
|
| 49 |
+
48 HGSOC
|
| 50 |
+
49 HNMUCM
|
| 51 |
+
50 HNSC
|
| 52 |
+
51 IDC
|
| 53 |
+
52 IHCH
|
| 54 |
+
53 ILC
|
| 55 |
+
54 LGSOC
|
| 56 |
+
55 LMS
|
| 57 |
+
56 LUAD
|
| 58 |
+
57 LUCA
|
| 59 |
+
58 LUNE
|
| 60 |
+
59 LUPC
|
| 61 |
+
60 LUSC
|
| 62 |
+
61 MAAP
|
| 63 |
+
62 MACR
|
| 64 |
+
63 MBC
|
| 65 |
+
64 MCC
|
| 66 |
+
65 MFH
|
| 67 |
+
66 MFS
|
| 68 |
+
67 MNG
|
| 69 |
+
68 MOV
|
| 70 |
+
69 MPNST
|
| 71 |
+
70 MRLS
|
| 72 |
+
71 NBL
|
| 73 |
+
72 NPC
|
| 74 |
+
73 NSGCT
|
| 75 |
+
74 OCS
|
| 76 |
+
75 ODG
|
| 77 |
+
76 OPHSC
|
| 78 |
+
77 OS
|
| 79 |
+
78 PAAC
|
| 80 |
+
79 PAAD
|
| 81 |
+
80 PAMPCA
|
| 82 |
+
81 PANET
|
| 83 |
+
82 PAST
|
| 84 |
+
83 PECOMA
|
| 85 |
+
84 PEMESO
|
| 86 |
+
85 PHC
|
| 87 |
+
86 PLMESO
|
| 88 |
+
87 PRAD
|
| 89 |
+
88 PRCC
|
| 90 |
+
89 PTAD
|
| 91 |
+
90 RBL
|
| 92 |
+
91 READ
|
| 93 |
+
92 SBOV
|
| 94 |
+
93 SBWDNET
|
| 95 |
+
94 SCBC
|
| 96 |
+
95 SCHW
|
| 97 |
+
96 SCLC
|
| 98 |
+
97 SDCA
|
| 99 |
+
98 SEM
|
| 100 |
+
99 SFT
|
| 101 |
+
100 SKCM
|
| 102 |
+
101 SSRCC
|
| 103 |
+
102 STAD
|
| 104 |
+
103 SYNS
|
| 105 |
+
104 TAC
|
| 106 |
+
105 THAP
|
| 107 |
+
106 THHC
|
| 108 |
+
107 THME
|
| 109 |
+
108 THPA
|
| 110 |
+
109 THPD
|
| 111 |
+
110 THYC
|
| 112 |
+
111 THYM
|
| 113 |
+
112 UCCC
|
| 114 |
+
113 UCP
|
| 115 |
+
114 UCS
|
| 116 |
+
115 UEC
|
| 117 |
+
116 ULMS
|
| 118 |
+
117 UM
|
| 119 |
+
118 USC
|
| 120 |
+
119 UTUC
|
| 121 |
+
120 VMM
|
| 122 |
+
121 VSC
|
| 123 |
+
122 WDLS
|
| 124 |
+
123 WT
|
| 125 |
+
124 RCC
|
| 126 |
+
125 ADNOS
|
| 127 |
+
126 CUPNOS
|
| 128 |
+
127 NOT
|
| 129 |
+
128 MDLC
|
| 130 |
+
129 URCC
|
| 131 |
+
130 NVRINT
|
| 132 |
+
131 PAASC
|
| 133 |
+
132 MEL
|
| 134 |
+
133 GBC
|
| 135 |
+
134 BRCNOS
|
| 136 |
+
135 LUAS
|
| 137 |
+
136 UDMN
|
| 138 |
+
137 UMEC
|
| 139 |
+
138 NETNOS
|
| 140 |
+
139 COADREAD
|
| 141 |
+
140 CHOL
|
| 142 |
+
141 PDC
|
| 143 |
+
142 GNOS
|
| 144 |
+
143 GINET
|
| 145 |
+
144 NSCLC
|
| 146 |
+
145 CSCLC
|
| 147 |
+
146 SBC
|
| 148 |
+
147 NECNOS
|
| 149 |
+
148 BRCA
|
| 150 |
+
149 SCCNOS
|
| 151 |
+
150 AMPCA
|
| 152 |
+
151 CUP
|
| 153 |
+
152 SARCNOS
|
| 154 |
+
153 BRCANOS
|
| 155 |
+
154 APAD
|
| 156 |
+
155 NSCLCPD
|
| 157 |
+
156 DIFG
|
| 158 |
+
157 MXOV
|
| 159 |
+
158 UCEC
|
| 160 |
+
159 MUP
|
data/metadata/longest_sequence.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
20000
|
data/metadata/n_classes.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
160
|
data/metadata/ontology_embedding_dim.txt
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
32
|
data/metadata/target_dict.tsv
ADDED
|
@@ -0,0 +1 @@
|
|
|
|
|
|
|
| 1 |
+
{'histologies': ['ACC', 'ACRM', 'ACYC', 'ALUCA', 'ANGS', 'ANSC', 'ARMM', 'ARMS', 'ASTR', 'ATM', 'BA', 'BCC', 'BLAD', 'BLCA', 'BMGCT', 'CCOV', 'CCRCC', 'CESC', 'CHDM', 'CHRCC', 'CHS', 'COAD', 'CSCC', 'DA', 'DASTR', 'DDLS', 'DES', 'DSRCT', 'ECAD', 'EGC', 'EHAE', 'EHCH', 'EMPD', 'EOV', 'EPIS', 'EPM', 'ERMS', 'ES', 'ESCA', 'ESCC', 'GBAD', 'GBM', 'GCCAP', 'GEJ', 'GIST', 'GRCT', 'HCC', 'HGNEC', 'HGSOC', 'HNMUCM', 'HNSC', 'IDC', 'IHCH', 'ILC', 'LGSOC', 'LMS', 'LUAD', 'LUCA', 'LUNE', 'LUPC', 'LUSC', 'MAAP', 'MACR', 'MBC', 'MCC', 'MFH', 'MFS', 'MNG', 'MOV', 'MPNST', 'MRLS', 'NBL', 'NPC', 'NSGCT', 'OCS', 'ODG', 'OPHSC', 'OS', 'PAAC', 'PAAD', 'PAMPCA', 'PANET', 'PAST', 'PECOMA', 'PEMESO', 'PHC', 'PLMESO', 'PRAD', 'PRCC', 'PTAD', 'RBL', 'READ', 'SBOV', 'SBWDNET', 'SCBC', 'SCHW', 'SCLC', 'SDCA', 'SEM', 'SFT', 'SKCM', 'SSRCC', 'STAD', 'SYNS', 'TAC', 'THAP', 'THHC', 'THME', 'THPA', 'THPD', 'THYC', 'THYM', 'UCCC', 'UCP', 'UCS', 'UEC', 'ULMS', 'UM', 'USC', 'UTUC', 'VMM', 'VSC', 'WDLS', 'WT', 'RCC', 'ADNOS', 'CUPNOS', 'NOT', 'MDLC', 'URCC', 'NVRINT', 'PAASC', 'MEL', 'GBC', 'BRCNOS', 'LUAS', 'UDMN', 'UMEC', 'NETNOS', 'COADREAD', 'CHOL', 'PDC', 'GNOS', 'GINET', 'NSCLC', 'CSCLC', 'SBC', 'NECNOS', 'BRCA', 'SCCNOS', 'AMPCA', 'CUP', 'SARCNOS', 'BRCANOS', 'APAD', 'NSCLCPD', 'DIFG', 'MXOV', 'UCEC', 'MUP'], 'oncotree_mapping': {'MGCT': 'NSGCT', 'PLBMESO': 'PLMESO', 'PLEMESO': 'PLMESO', 'DSTAD': 'SSRCC', 'HGSFT': 'HGSOC', 'PSEC': 'HGSOC', 'GB': 'GBM', 'AODG': 'ODG', 'AASTR': 'ASTR', 'LXSC': 'HNSC', 'OCSC': 'HNSC'}, 'sites': ['Primary', 'Metastasis'], 'target': ['oncotree_code_aeon'], 'target_type': ['other'], 'task': ['multiclass-classification'], 'class_weights': [3.685110294117647, 2.9138081395348836, 1.606330128205128, 6.111890243902439, 3.13234375, 1.8700559701492536, 4.818990384615384, 6.425320512820512, 0.9564408396946565, 7.830859375, 8.352916666666667, 5.966369047619048, 7.830859375, 0.1324458245243129, 8.949553571428572, 1.7772163120567377, 0.4427340989399293, 1.8562037037037038, 
6.425320512820512, 3.5798214285714285, 3.9154296875, 0.08294852697782192, 2.5311868686868686, 3.977579365079365, 6.425320512820512, 1.1287725225225225, 4.3204741379310345, 2.088229166666667, 2.880316091954023, 1.3768543956043957, 7.593560606060606, 2.694489247311828, 3.0559451219512197, 2.7537087912087914, 8.949553571428572, 8.352916666666667, 4.3204741379310345, 2.298967889908257, 0.5556263858093127, 2.053995901639344, 2.8155898876404493, 0.2762816979051819, 4.640509259259259, 0.8325166112956811, 0.44351769911504424, 5.447554347826087, 0.9456132075471698, 5.695170454545455, 0.15545130272952853, 5.966369047619048, 0.7391961651917404, 0.07947589597209008, 0.5264443277310924, 0.46491187384044524, 2.198135964912281, 0.9943948412698412, 0.049318539657547726, 2.1236228813559324, 2.409495192307692, 5.01175, 0.31881361323155216, 1.912881679389313, 2.3640330188679246, 6.772635135135135, 2.386547619047619, 1.2592336683417085, 2.557015306122449, 3.796780303030303, 5.331648936170213, 3.5798214285714285, 3.432705479452055, 0.5708143507972665, 7.593560606060606, 1.5187121212121213, 3.13234375, 1.6817953020134229, 2.637763157894737, 1.3329122340425532, 6.425320512820512, 0.09549828506097562, 7.370220588235294, 0.8981630824372759, 5.966369047619048, 10.0235, 3.5798214285714285, 10.0235, 1.0397821576763486, 0.081998527486911, 3.432705479452055, 4.041733870967742, 3.855192307692308, 0.19041603343465047, 7.830859375, 2.983184523809524, 8.352916666666667, 6.960763888888889, 0.6846653005464481, 3.855192307692308, 3.0191265060240964, 3.796780303030303, 0.33590817694369973, 3.5294014084507044, 0.4197445561139028, 2.5833762886597937, 6.960763888888889, 2.6102864583333334, 5.966369047619048, 3.0559451219512197, 0.7639862804878049, 1.6705833333333333, 9.281018518518518, 5.447554347826087, 3.4803819444444444, 10.0235, 0.9179029304029304, 0.18027877697841727, 1.5468364197530864, 2.5833762886597937, 0.6327967171717171, 0.6629298941798942, 6.594407894736842, 5.01175, 3.171993670886076, 
3.4803819444444444, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0]}
|
src/mosaic/gradio_app.py
CHANGED
|
@@ -29,26 +29,36 @@ from mosaic.model_manager import load_all_models
|
|
| 29 |
|
| 30 |
|
| 31 |
def download_metadata():
|
| 32 |
-
"""
|
| 33 |
|
| 34 |
-
Fast
|
|
|
|
| 35 |
|
| 36 |
Returns:
|
| 37 |
tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
|
| 38 |
"""
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 49 |
|
| 50 |
# Set the data directory for other modules to use
|
| 51 |
-
set_data_directory(cache_dir)
|
| 52 |
|
| 53 |
model_map = pd.read_csv(Path(cache_dir) / "paladin_model_map.csv")
|
| 54 |
cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
|
|
|
|
| 29 |
|
| 30 |
|
| 31 |
def download_metadata():
|
| 32 |
+
"""Load metadata files from local data/ directory or download from HuggingFace.
|
| 33 |
|
| 34 |
+
Fast loading of small files needed for UI initialization.
|
| 35 |
+
Prioritizes local files in data/ directory (bundled with Docker image).
|
| 36 |
|
| 37 |
Returns:
|
| 38 |
tuple: (cancer_subtype_name_map, reversed_cancer_subtype_name_map, cancer_subtypes)
|
| 39 |
"""
|
| 40 |
+
# Check if metadata exists in local data/ directory (Docker deployment)
|
| 41 |
+
local_data_dir = Path(__file__).parent.parent.parent / "data"
|
| 42 |
+
local_model_map = local_data_dir / "paladin_model_map.csv"
|
| 43 |
+
|
| 44 |
+
if local_model_map.exists():
|
| 45 |
+
logger.info(f"Using local metadata from: {local_data_dir}")
|
| 46 |
+
cache_dir = local_data_dir
|
| 47 |
+
else:
|
| 48 |
+
# Fall back to downloading from HuggingFace Hub
|
| 49 |
+
logger.info("Downloading metadata from HuggingFace Hub...")
|
| 50 |
+
cache_dir = Path(snapshot_download(
|
| 51 |
+
repo_id="PDM-Group/paladin-aeon-models",
|
| 52 |
+
allow_patterns=[
|
| 53 |
+
"*.csv", # Model maps and metadata
|
| 54 |
+
"tissue_site_*", # Tissue site mappings
|
| 55 |
+
"metadata/*", # Metadata files (including target_dict.tsv)
|
| 56 |
+
],
|
| 57 |
+
))
|
| 58 |
+
logger.info(f"Metadata downloaded to: {cache_dir}")
|
| 59 |
|
| 60 |
# Set the data directory for other modules to use
|
| 61 |
+
set_data_directory(str(cache_dir))
|
| 62 |
|
| 63 |
model_map = pd.read_csv(Path(cache_dir) / "paladin_model_map.csv")
|
| 64 |
cancer_subtypes = model_map["cancer_subtype"].unique().tolist()
|