#!/bin/bash -l
#SBATCH --job-name=cover
#SBATCH --output=logs/cover.out
#SBATCH --error=logs/cover.err
#SBATCH --partition=small-g
#SBATCH --ntasks-per-node=1
#SBATCH --nodes=1               # Total number of nodes
#SBATCH --cpus-per-task=16
#SBATCH --gpus-per-node=4       # GPUs allocated on each node
#SBATCH --array=0
#SBATCH --mem=128G
#SBATCH --time=12:00:00         # Run time (d-hh:mm:ss)
#SBATCH --account=project_465002532  # Project for billing

# Slurm batch job: runs tevatron.retriever.driver.train_dualdistil through
# `accelerate launch` inside a Singularity container, starting from the
# checkpoint named in PRETRAINED and writing results to ${model_dir}.
#
# Required environment:
#   SIF - path to the Singularity image handed to `singularity exec`.
#         It is not assigned anywhere in this script, so it has to come from
#         the environment (presumably a module or the login profile -- confirm).
#
# NOTE(review): the logs/ directory used by --output/--error is relative to
# the submission directory and is expected to exist before `sbatch` is run.

module use /appl/local/csc/modulefiles/
module use /appl/local/training/modules/AI-20241126/

# Fail early with a clear message instead of letting `singularity exec`
# misinterpret "accelerate" as the image path when SIF is empty.
: "${SIF:?SIF must be set to the Singularity image used for training}"

export TOKENIZERS_PARALLELISM=false
export CRUX_ROOT="${HOME}/datasets/crux"

lr=5e-5
model_dir="${HOME}/models/CoveR/relevance-ms-pft.cover-10k"

# Keep a copy of the launching script next to the model outputs.
# NOTE(review): under sbatch, $0 is usually Slurm's spooled copy of this
# script, so the copy may not carry the original file name -- confirm.
mkdir -p -- "${model_dir}"
cp -- "$0" "${model_dir}"

# Keep these two values in sync with the --nodes / --gpus-per-node directives.
GPUS_PER_NODE=4
NUM_NODES=1
NUM_PROCESSES=$(( NUM_NODES * GPUS_PER_NODE ))

PRETRAINED=DylanJHJ/modernbert-base.relevance-10k

# Start experiments
# The run name is the last path component of the output directory.
srun singularity exec "${SIF}" \
  accelerate launch -m \
  --multi_gpu --mixed_precision=bf16 \
  --num_processes "${NUM_PROCESSES}" --num_machines "${NUM_NODES}" \
  tevatron.retriever.driver.train_dualdistil \
  --exclude_title \
  --output_dir "${model_dir}" \
  --model_name_or_path "${PRETRAINED}" \
  --save_steps 1000 \
  --dataset_name DylanJHJ/crux-researchy-kdnew-ext \
  --corpus_name DylanJHJ/crux-researchy-corpus \
  --request_as_query True \
  --dataset_split pos_half.neu_low.neg_zero \
  --per_device_train_batch_size 16 \
  --train_group_size 8 \
  --prediction_loss_only True \
  --bf16 --pooling mean --normalize \
  --passage_prefix "search_document: " \
  --query_prefix "search_query: " \
  --subquery_prefix "search_query: " \
  --temperature 0.02 \
  --use_crossentropy 1.0 \
  --use_kld 0.0 \
  --contrastive_lambda 1.0 \
  --sq_contrastive_lambda 0.0 \
  --covdistil_method KLD \
  --covdistil_lambda 0.1 \
  --eval_steps 500 \
  --learning_rate "${lr}" \
  --query_max_len 180 \
  --passage_max_len 512 \
  --dataloader_num_workers 8 \
  --lr_scheduler_type 'cosine' \
  --weight_decay 0.01 \
  --max_steps 10000 \
  --warmup_steps 1000 \
  --logging_steps 10 \
  --overwrite_output_dir \
  --run_name "${model_dir##*/}"