| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
|
|
# Job body: fine-tune a dense retriever with Tevatron's `train_dualdistil`
# driver, launched through `accelerate` inside a Singularity container.
#
# Required environment:
#   SIF   path to the Singularity image passed to `singularity exec`.
#         It is not set anywhere in this script, so it must come from the
#         submitting environment (or from a module that exports it).
#   HOME  used to derive the dataset root and the model output directory.

# Add the site-wide and the course-specific modulefile trees to the module
# search path.
module use /appl/local/csc/modulefiles/
module use /appl/local/training/modules/AI-20241126/

# Exported for child processes; presumably silences the Hugging Face
# tokenizers fork/parallelism warning when dataloader workers are used.
export TOKENIZERS_PARALLELISM=false
# Not referenced again in this script; presumably read by the training code.
export CRUX_ROOT="${HOME}/datasets/crux"

# Hyper-parameters and output location.
lr=5e-5
model_dir="${HOME}/models/CoveR/relevance-ms-pft.cover-10k"

# Keep a copy of the launching script next to the checkpoints so the exact
# configuration of the run can be recovered later.
mkdir -p -- "${model_dir}"
cp -- "$0" "${model_dir}"

# Process topology handed to `accelerate launch`: one process per GPU.
GPUS_PER_NODE=4
NUM_NODES=1
NUM_PROCESSES=$(( NUM_NODES * GPUS_PER_NODE ))
PRETRAINED=DylanJHJ/modernbert-base.relevance-10k

# Abort with a clear message instead of letting `singularity exec` treat the
# next word ("accelerate") as the image path when SIF is unset or empty.
: "${SIF:?SIF must be set to the path of the Singularity image}"

# NOTE(review): the original `--run_name` expansion was unterminated
# ("${model_dir" with no closing brace), which is a fatal parse error. It is
# closed here as "${model_dir}"; if only the last path component was intended
# (e.g. a basename-style expansion), adjust accordingly.
srun singularity exec "${SIF}" \
  accelerate launch -m \
  --multi_gpu --mixed_precision=bf16 \
  --num_processes "${NUM_PROCESSES}" --num_machines "${NUM_NODES}" \
  tevatron.retriever.driver.train_dualdistil \
  --exclude_title \
  --output_dir "${model_dir}" \
  --model_name_or_path "${PRETRAINED}" \
  --save_steps 1000 \
  --dataset_name DylanJHJ/crux-researchy-kdnew-ext \
  --corpus_name DylanJHJ/crux-researchy-corpus \
  --request_as_query True \
  --dataset_split pos_half.neu_low.neg_zero \
  --per_device_train_batch_size 16 \
  --train_group_size 8 \
  --prediction_loss_only True \
  --bf16 --pooling mean --normalize \
  --passage_prefix "search_document: " \
  --query_prefix "search_query: " \
  --subquery_prefix "search_query: " \
  --temperature 0.02 \
  --use_crossentropy 1.0 \
  --use_kld 0.0 \
  --contrastive_lambda 1.0 \
  --sq_contrastive_lambda 0.0 \
  --covdistil_method KLD \
  --covdistil_lambda 0.1 \
  --eval_steps 500 \
  --learning_rate "${lr}" \
  --query_max_len 180 \
  --passage_max_len 512 \
  --dataloader_num_workers 8 \
  --lr_scheduler_type 'cosine' \
  --weight_decay 0.01 \
  --max_steps 10000 \
  --warmup_steps 1000 \
  --logging_steps 10 \
  --overwrite_output_dir \
  --run_name "${model_dir}"
|
|