| seed = 6 |
| sampling_seed = 6 |
| train_dataset_seed = 11 |
| val_dataset_seed = 43 |
| normalize_embeddings = true |
| mixed_precision = "bf16" |
| weight_init = "kaiming" |
| dataset = "nq" |
| max_seq_length = 32 |
| unsup_emb = "stella" |
| sup_emb = "gte" |
| n_embs_per_batch = 1 |
| finetune_mode = false |
| noise_level = 0.0 |
| style = "res_mlp" |
| norm_style = "batch" |
| depth = 3 |
| transform_depth = 4 |
| d_adapter = 1024 |
| d_hidden = 1024 |
| d_transform = 1024 |
| use_small_output_adapters = false |
| use_residual_adapters = true |
| gan_style = "least_squares" |
| disc_depth = 5 |
| disc_dim = 1024 |
| use_residual = true |
| bs = 256 |
| gradient_accumulation_steps = 1 |
| lr = 2e-5 |
| no_scheduler = true |
| max_grad_norm = 1.0 |
| loss_coefficient_reverse_rec = 0.0 |
| loss_coefficient_rec = 1.0 |
| loss_coefficient_vsp = 1.0 |
| loss_coefficient_cc_trans = 10.0 |
| loss_coefficient_cc_rec = 0.0 |
| loss_coefficient_cc_vsp = 10.0 |
| loss_coefficient_r1_penalty = 0.01 |
| warmup_length = 2000 |
| patience = 20 |
| min_delta = 0.0 |
| min_epochs = 80 |
| disc_lr = 1e-5 |
| eps = 6.25e-10 |
| smooth = 0.9 |
| loss_coefficient_disc = 1.0 |
| loss_coefficient_gen = 1.0 |
| loss_coefficient_latent_gen = 1.0 |
| loss_coefficient_similarity_gen = 0.0 |
| val_size = 4096 |
| val_bs = 1024 |
| top_k_size = 1024 |
| top_k_batches = 4 |
| k = 16 |
| heatmap_size = 64 |
| use_wandb = true |
| wandb_project = "unsupervised_disc" |
| wandb_name = "e:100,n:400000" |
| load_dir = "./finetuning_unsupervised/n:100000,e:10,s:n_double,d:4,d:4,d:64/" |
| save_dir = "./finetuning_unsupervised/{}/" |
| force_dump = true |
| epochs = 100 |
| num_points = 400000 |
| num_params = 21004800 |
| num_disc_params = 3943425 |
| num_sup_disc_params = 3943425 |
| num_latent_disc_params = 4205569 |
| num_similarity_disc_params = 3419137 |
|
|