#!/bin/bash
#
# Site 2 embedding extraction.
#
# Runs extract-embeddings.py over the site-2 train and test patient lists,
# then stages the resulting parquet files and label CSVs under
# ${OUTPUT_BASE}/fl_ready for federated learning.
#
# NOTE: SITE_SPLITS, OUTPUT_BASE and extract-embeddings.py are all relative
# paths, so the script must be launched from the directory that contains
# extract-embeddings.py.

# Strict mode: abort on any failing command (-e), on use of an unset
# variable (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Directory passed to the extractor as --root-dir.
ROOT_DIR="/roshare/nlst_global/data_23Dec2024/manifest-NLST_allCT/NLST/"
# Base directory for everything this script writes (train/, test/, fl_ready/).
OUTPUT_BASE="site2_data"
# Directory holding the per-site train/test split CSVs.
SITE_SPLITS="../subsets/site_splits"
readonly ROOT_DIR OUTPUT_BASE SITE_SPLITS

echo "============================================"
echo "SITE 2 - Embedding Extraction"
echo "============================================"
echo ""

# Create the per-split output directories up front. -p makes this a no-op
# when they already exist; the paths are quoted so a value containing
# spaces or glob characters stays a single argument.
mkdir -p "${OUTPUT_BASE}/train" "${OUTPUT_BASE}/test"

# --- Training split ---------------------------------------------------------
# Run the extractor over the site-2 training patient list.
# NOTE(review): the status glyphs in this section were mis-decoded in the
# source; the emoji below are the most likely originals -- confirm.
echo "📦 Extracting TRAINING embeddings..."
# The extractor is expected to leave all_embeddings.parquet in --output-dir
# (that file is copied into fl_ready later in this script).
python extract-embeddings.py \
  --root-dir "${ROOT_DIR}" \
  --pid-csv "${SITE_SPLITS}/train_pid_labelsT0T7_split_2.csv" \
  --output-dir "${OUTPUT_BASE}/train" \
  --num-workers 8 \
  --checkpoint-interval 500

echo ""
echo "✓ Training embeddings complete!"
echo ""

# --- Test split -------------------------------------------------------------
# Run the extractor over the site-2 test patient list, with the same worker
# count and checkpoint interval as the training run.
# NOTE(review): the status glyphs in this section were mis-decoded in the
# source; the emoji below are the most likely originals -- confirm.
echo "📦 Extracting TEST embeddings..."
# The extractor is expected to leave all_embeddings.parquet in --output-dir
# (that file is copied into fl_ready later in this script).
python extract-embeddings.py \
  --root-dir "${ROOT_DIR}" \
  --pid-csv "${SITE_SPLITS}/test_pid_labelsT0T7_split_2.csv" \
  --output-dir "${OUTPUT_BASE}/test" \
  --num-workers 8 \
  --checkpoint-interval 500

echo ""
echo "✓ Test embeddings complete!"
echo ""

# --- Stage embeddings for federated learning --------------------------------
# NOTE(review): the leading glyph was mis-decoded in the source and only its
# first byte survived; the folder emoji is a best guess -- confirm.
echo "📁 Preparing files for federated learning..."
mkdir -p "${OUTPUT_BASE}/fl_ready"

# Copy each split's embeddings into fl_ready under a site-prefixed name.
cp "${OUTPUT_BASE}/train/all_embeddings.parquet" \
  "${OUTPUT_BASE}/fl_ready/site2_embeddings_train.parquet"
cp "${OUTPUT_BASE}/test/all_embeddings.parquet" \
  "${OUTPUT_BASE}/fl_ready/site2_embeddings_test.parquet"

#######################################
# Write field 1 and fields 14-20 of a comma-separated file to a new file.
# The header row is treated like any other row, so a single pass over the
# file replaces the separate head/tail passes.
# NOTE(review): presumably field 1 is the patient id and 14-20 are the label
# columns -- confirm against the split CSV header. cut splits on every
# comma, so this assumes no quoted fields containing commas.
# Arguments: $1 - source CSV path
#            $2 - destination CSV path (overwritten)
# Returns:   exit status of cut (non-zero if the source is unreadable or
#            the destination cannot be written)
#######################################
extract_label_columns() {
  local src_csv=$1
  local dest_csv=$2
  cut -d, -f1,14-20 "${src_csv}" > "${dest_csv}"
}

echo "Creating site2_labels-train.csv..."
extract_label_columns \
  "${SITE_SPLITS}/train_pid_labelsT0T7_split_2.csv" \
  "${OUTPUT_BASE}/fl_ready/site2_labels-train.csv"

echo "Creating site2_labels-test.csv..."
extract_label_columns \
  "${SITE_SPLITS}/test_pid_labelsT0T7_split_2.csv" \
  "${OUTPUT_BASE}/fl_ready/site2_labels-test.csv"

# --- Summary ----------------------------------------------------------------
# NOTE(review): the status glyphs in this section were mis-decoded in the
# source (one even split the COMPLETE line in two); the emoji below are the
# most likely originals -- confirm.
echo ""
echo "============================================"
echo "SITE 2 - COMPLETE! ✅"
echo "============================================"
echo ""
echo "FL-ready files in: ${OUTPUT_BASE}/fl_ready/"
# Show what was actually written, with human-readable sizes.
ls -lh "${OUTPUT_BASE}/fl_ready/"
echo ""
echo "Files ready for federated learning:"
echo " ✓ site2_embeddings_train.parquet"
echo " ✓ site2_embeddings_test.parquet"
echo " ✓ site2_labels-train.csv"
echo " ✓ site2_labels-test.csv"
echo ""