Upload folder using huggingface_hub
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .gitattributes +20 -0
- evaluation/analysis/artifact_index.json +131 -0
- evaluation/plots/cka_subspace_matrix.png +0 -0
- evaluation/plots/llm_detection_rates_bar.png +3 -0
- evaluation/plots/test_main_cal_calibration.png +3 -0
- evaluation/plots/test_main_calibration.png +3 -0
- evaluation/plots/test_main_confusion.png +0 -0
- evaluation/plots/test_main_f1_sensitivity.png +3 -0
- evaluation/plots/test_main_pca_z_lex.png +3 -0
- evaluation/plots/test_main_pca_z_sem.png +3 -0
- evaluation/plots/test_main_pca_z_syn.png +3 -0
- evaluation/plots/test_main_pca_zsem.png +3 -0
- evaluation/plots/test_main_pr.png +0 -0
- evaluation/plots/test_main_roc.png +3 -0
- evaluation/plots/test_main_score_dist.png +3 -0
- evaluation/plots/test_main_violin_dist.png +3 -0
- evaluation/plots/val_cosine_heatmap_zsem.png +3 -0
- evaluation/plots/val_main_cal_calibration.png +3 -0
- evaluation/plots/val_main_calibration.png +3 -0
- evaluation/plots/val_main_confusion.png +0 -0
- evaluation/plots/val_main_pca_z_lex.png +3 -0
- evaluation/plots/val_main_pca_z_sem.png +3 -0
- evaluation/plots/val_main_pca_z_syn.png +3 -0
- evaluation/plots/val_main_pca_zsem.png +3 -0
- evaluation/plots/val_main_pr.png +0 -0
- evaluation/plots/val_main_roc.png +3 -0
- evaluation/plots/val_main_score_dist.png +3 -0
- evaluation/tables/ablations_weights_regularizers.csv +15 -0
- evaluation/tables/baselines_simple.csv +5 -0
- evaluation/tables/baselines_trained.csv +7 -0
- evaluation/tables/bootstrap_ci_main_test.csv +9 -0
- evaluation/tables/calibrated_metrics.csv +3 -0
- evaluation/tables/cka_disentanglement.json +7 -0
- evaluation/tables/final_overall_table.csv +29 -0
- evaluation/tables/hard_correct_low_margin.csv +0 -0
- evaluation/tables/latency_comparison.csv +5 -0
- evaluation/tables/llm_paraphrase_detection_results.csv +37 -0
- evaluation/tables/main_metrics.csv +3 -0
- evaluation/tables/misclassified_examples.csv +0 -0
- evaluation/tables/robustness_summary.csv +2 -0
- evaluation/tables/significance_tests.csv +4 -0
- evaluation/tables/structural_probing.json +20 -0
- model/ablations/ablation_weights_no_aux.pt +3 -0
- model/ablations/ablation_weights_no_ortho.pt +3 -0
- model/ablations/ablation_weights_no_simplex.pt +3 -0
- model/ablations/ablation_weights_w050_050.pt +3 -0
- model/ablations/ablation_weights_w060_040.pt +3 -0
- model/ablations/ablation_weights_w070_030.pt +3 -0
- model/ablations/ablation_weights_w080_020.pt +3 -0
- model/baselines/baseline_single_a.pt +3 -0
.gitattributes
CHANGED
|
@@ -33,3 +33,23 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
*.zip filter=lfs diff=lfs merge=lfs -text
|
| 34 |
*.zst filter=lfs diff=lfs merge=lfs -text
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
+
evaluation/plots/llm_detection_rates_bar.png filter=lfs diff=lfs merge=lfs -text
|
| 37 |
+
evaluation/plots/test_main_cal_calibration.png filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
evaluation/plots/test_main_calibration.png filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
evaluation/plots/test_main_f1_sensitivity.png filter=lfs diff=lfs merge=lfs -text
|
| 40 |
+
evaluation/plots/test_main_pca_z_lex.png filter=lfs diff=lfs merge=lfs -text
|
| 41 |
+
evaluation/plots/test_main_pca_z_sem.png filter=lfs diff=lfs merge=lfs -text
|
| 42 |
+
evaluation/plots/test_main_pca_z_syn.png filter=lfs diff=lfs merge=lfs -text
|
| 43 |
+
evaluation/plots/test_main_pca_zsem.png filter=lfs diff=lfs merge=lfs -text
|
| 44 |
+
evaluation/plots/test_main_roc.png filter=lfs diff=lfs merge=lfs -text
|
| 45 |
+
evaluation/plots/test_main_score_dist.png filter=lfs diff=lfs merge=lfs -text
|
| 46 |
+
evaluation/plots/test_main_violin_dist.png filter=lfs diff=lfs merge=lfs -text
|
| 47 |
+
evaluation/plots/val_cosine_heatmap_zsem.png filter=lfs diff=lfs merge=lfs -text
|
| 48 |
+
evaluation/plots/val_main_cal_calibration.png filter=lfs diff=lfs merge=lfs -text
|
| 49 |
+
evaluation/plots/val_main_calibration.png filter=lfs diff=lfs merge=lfs -text
|
| 50 |
+
evaluation/plots/val_main_pca_z_lex.png filter=lfs diff=lfs merge=lfs -text
|
| 51 |
+
evaluation/plots/val_main_pca_z_sem.png filter=lfs diff=lfs merge=lfs -text
|
| 52 |
+
evaluation/plots/val_main_pca_z_syn.png filter=lfs diff=lfs merge=lfs -text
|
| 53 |
+
evaluation/plots/val_main_pca_zsem.png filter=lfs diff=lfs merge=lfs -text
|
| 54 |
+
evaluation/plots/val_main_roc.png filter=lfs diff=lfs merge=lfs -text
|
| 55 |
+
evaluation/plots/val_main_score_dist.png filter=lfs diff=lfs merge=lfs -text
|
evaluation/analysis/artifact_index.json
ADDED
|
@@ -0,0 +1,131 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"out_dir": "/mnt/heirarchy/models_stage2_hier/acl_final_package",
|
| 3 |
+
"plots_dir": "/mnt/heirarchy/models_stage2_hier/acl_final_package/plots",
|
| 4 |
+
"tables_dir": "/mnt/heirarchy/models_stage2_hier/acl_final_package/tables",
|
| 5 |
+
"checkpoints_dir": "/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints",
|
| 6 |
+
"main_ckpt_used": "/mnt/heirarchy/models_stage2_hier/best_model.pt",
|
| 7 |
+
"main_threshold": 0.5544444444444444,
|
| 8 |
+
"calibration_temperature": 1.2828316688537598,
|
| 9 |
+
"cka_disentanglement": {
|
| 10 |
+
"CKA(z_lex,z_syn)": 0.0008444060222245753,
|
| 11 |
+
"CKA(z_lex,z_sem)": 0.0004650297632906586,
|
| 12 |
+
"CKA(z_syn,z_sem)": 0.0006945761269889772,
|
| 13 |
+
"orthogonality_penalty": 3.980763267463772e-06,
|
| 14 |
+
"simplex_penalty": 0.0007237753015942872
|
| 15 |
+
},
|
| 16 |
+
"structural_probing": {
|
| 17 |
+
"z_lex_vs_lexical_overlap_spearman": 0.47922946593127974,
|
| 18 |
+
"z_lex_vs_lexical_overlap_pval": 0.0,
|
| 19 |
+
"z_lex_vs_length_ratio_spearman": 0.4421186338704228,
|
| 20 |
+
"z_lex_vs_length_ratio_pval": 0.0,
|
| 21 |
+
"z_syn_vs_lexical_overlap_spearman": 0.532769976054425,
|
| 22 |
+
"z_syn_vs_lexical_overlap_pval": 0.0,
|
| 23 |
+
"z_syn_vs_length_ratio_spearman": 0.4493643625740219,
|
| 24 |
+
"z_syn_vs_length_ratio_pval": 0.0,
|
| 25 |
+
"z_sem_vs_lexical_overlap_spearman": 0.22826571007995736,
|
| 26 |
+
"z_sem_vs_lexical_overlap_pval": 1.0401520845752815e-234,
|
| 27 |
+
"z_sem_vs_length_ratio_spearman": 0.3032353994433464,
|
| 28 |
+
"z_sem_vs_length_ratio_pval": 0.0,
|
| 29 |
+
"z_lex_probe_acc": 0.9825,
|
| 30 |
+
"z_lex_probe_f1": 0.9868421052631579,
|
| 31 |
+
"z_syn_probe_acc": 0.9825,
|
| 32 |
+
"z_syn_probe_f1": 0.9868371568258744,
|
| 33 |
+
"z_sem_probe_acc": 0.9815,
|
| 34 |
+
"z_sem_probe_f1": 0.9860902255639098
|
| 35 |
+
},
|
| 36 |
+
"parameter_counts": {
|
| 37 |
+
"Main_Hier": 361542923,
|
| 38 |
+
"DeBERTa_single": 184423682,
|
| 39 |
+
"RoBERTa_single": 124647170,
|
| 40 |
+
"Fusion_NoDis": 309134594
|
| 41 |
+
},
|
| 42 |
+
"files": {
|
| 43 |
+
"tables": [
|
| 44 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/ablations_weights_regularizers.csv",
|
| 45 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/baselines_simple.csv",
|
| 46 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/baselines_trained.csv",
|
| 47 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/bootstrap_ci_main_test.csv",
|
| 48 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/calibrated_metrics.csv",
|
| 49 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/cka_disentanglement.json",
|
| 50 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/final_overall_table.csv",
|
| 51 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/hard_correct_low_margin.csv",
|
| 52 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/latency_comparison.csv",
|
| 53 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/main_metrics.csv",
|
| 54 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/misclassified_examples.csv",
|
| 55 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/robustness_summary.csv",
|
| 56 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/significance_tests.csv",
|
| 57 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/tables/structural_probing.json"
|
| 58 |
+
],
|
| 59 |
+
"plots": [
|
| 60 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/cka_subspace_matrix.png",
|
| 61 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_cal_calibration.png",
|
| 62 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_calibration.png",
|
| 63 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_confusion.png",
|
| 64 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_pca_z_lex.png",
|
| 65 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_pca_z_sem.png",
|
| 66 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_pca_z_syn.png",
|
| 67 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_pca_zsem.png",
|
| 68 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_pr.png",
|
| 69 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_roc.png",
|
| 70 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/test_main_score_dist.png",
|
| 71 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_cosine_heatmap_zsem.png",
|
| 72 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_cal_calibration.png",
|
| 73 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_calibration.png",
|
| 74 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_confusion.png",
|
| 75 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_pca_z_lex.png",
|
| 76 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_pca_z_sem.png",
|
| 77 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_pca_z_syn.png",
|
| 78 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_pca_zsem.png",
|
| 79 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_pr.png",
|
| 80 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_roc.png",
|
| 81 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/plots/val_main_score_dist.png"
|
| 82 |
+
],
|
| 83 |
+
"checkpoints": [
|
| 84 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_no_aux.pt",
|
| 85 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_no_ortho.pt",
|
| 86 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_no_simplex.pt",
|
| 87 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_w050_050.pt",
|
| 88 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_w060_040.pt",
|
| 89 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_w070_030.pt",
|
| 90 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/ablation_weights_w080_020.pt",
|
| 91 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/baseline_single_a.pt",
|
| 92 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/baseline_single_b.pt",
|
| 93 |
+
"/mnt/heirarchy/models_stage2_hier/acl_final_package/checkpoints/fusion_no_disentangle.pt"
|
| 94 |
+
]
|
| 95 |
+
},
|
| 96 |
+
"cfg": {
|
| 97 |
+
"data_root": "/mnt/heirarchy/data_stage1",
|
| 98 |
+
"model_root": "/mnt/heirarchy/models_stage2_hier",
|
| 99 |
+
"main_ckpt_name": "best_model.pt",
|
| 100 |
+
"out_dir_name": "acl_final_package",
|
| 101 |
+
"seed": 42,
|
| 102 |
+
"device": "cuda",
|
| 103 |
+
"enc_a": "microsoft/deberta-v3-base",
|
| 104 |
+
"enc_b": "roberta-base",
|
| 105 |
+
"max_len": 128,
|
| 106 |
+
"batch_size_eval": 128,
|
| 107 |
+
"num_workers_eval": 4,
|
| 108 |
+
"d_lex": 256,
|
| 109 |
+
"d_syn": 256,
|
| 110 |
+
"d_sem": 256,
|
| 111 |
+
"threshold_grid_n": 199,
|
| 112 |
+
"bootstrap_n": 1000,
|
| 113 |
+
"bootstrap_seed": 123,
|
| 114 |
+
"noise_trials": 20,
|
| 115 |
+
"noise_std": 0.1,
|
| 116 |
+
"train_baselines": true,
|
| 117 |
+
"train_ablations": true,
|
| 118 |
+
"epochs_small": 2,
|
| 119 |
+
"lr_enc": 2e-06,
|
| 120 |
+
"lr_head": 1e-05,
|
| 121 |
+
"weight_decay": 0.01,
|
| 122 |
+
"grad_clip": 1.0,
|
| 123 |
+
"grad_accum": 2,
|
| 124 |
+
"warmup_frac": 0.1,
|
| 125 |
+
"use_amp_bf16": true,
|
| 126 |
+
"baseline_batch_start": 96,
|
| 127 |
+
"baseline_num_workers": 4,
|
| 128 |
+
"latency_warmup_batches": 3,
|
| 129 |
+
"latency_measure_batches": 20
|
| 130 |
+
}
|
| 131 |
+
}
|
evaluation/plots/cka_subspace_matrix.png
ADDED
|
evaluation/plots/llm_detection_rates_bar.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_cal_calibration.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_calibration.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_confusion.png
ADDED
|
evaluation/plots/test_main_f1_sensitivity.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_pca_z_lex.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_pca_z_sem.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_pca_z_syn.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_pca_zsem.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_pr.png
ADDED
|
evaluation/plots/test_main_roc.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_score_dist.png
ADDED
|
Git LFS Details
|
evaluation/plots/test_main_violin_dist.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_cosine_heatmap_zsem.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_cal_calibration.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_calibration.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_confusion.png
ADDED
|
evaluation/plots/val_main_pca_z_lex.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_pca_z_sem.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_pca_z_syn.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_pca_zsem.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_pr.png
ADDED
|
evaluation/plots/val_main_roc.png
ADDED
|
Git LFS Details
|
evaluation/plots/val_main_score_dist.png
ADDED
|
Git LFS Details
|
evaluation/tables/ablations_weights_regularizers.csv
ADDED
|
@@ -0,0 +1,15 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,F1,Accuracy,Precision,Recall,AUC_ROC,AUC_PR,MCC,Brier,Threshold,PosRate,PredPosRate,ECE
|
| 2 |
+
Ablation_w060_040,val,0.9843691477939525,0.9791333333333333,0.9831662427053718,0.985575,0.997693409375,0.9988274203102888,0.9529988669495449,0.01671784785590838,0.6385858585858586,0.6666666666666666,0.6683,0.32218480102844405
|
| 3 |
+
Ablation_w060_040,test,0.9847921088775128,0.9797,0.9836867049139436,0.9859,0.99779508234375,0.9988660309149758,0.9542790049343938,0.016326491194564294,0.6385858585858586,0.6666666666666666,0.6681666666666667,0.32282863776160947
|
| 4 |
+
Ablation_w080_020,val,0.9844113846211473,0.9791916666666667,0.9833002831094648,0.985525,0.99770540703125,0.9988325662771161,0.9531337276390448,0.01670249371391625,0.6534343434343434,0.6666666666666666,0.668175,0.32183524033744926
|
| 5 |
+
Ablation_w080_020,test,0.9847852010014172,0.9796916666666666,0.983722699661981,0.98585,0.9978058028124999,0.9988708003116085,0.9542618213721543,0.01631232422072963,0.6534343434343434,0.6666666666666666,0.6681083333333333,0.3224618584667339
|
| 6 |
+
Ablation_w050_050,val,0.9844119685118018,0.9791916666666667,0.9832641201192214,0.9855625,0.9976962893750001,0.9988292112881609,0.9531323390988441,0.016732426177827286,0.6534343434343434,0.6666666666666666,0.668225,0.32185190810303865
|
| 7 |
+
Ablation_w050_050,test,0.9847414029917838,0.9796333333333334,0.9836850770842688,0.9858,0.997800121875,0.9988692136142501,0.9541306575870855,0.016351432126203075,0.6534343434343434,0.6666666666666666,0.6681,0.32251145017204375
|
| 8 |
+
Ablation_w070_030,val,0.9844859649779708,0.9792833333333333,0.9830014207023753,0.985975,0.9977294720312498,0.9988471970819475,0.9533267355818308,0.01663759113924582,0.6088888888888889,0.6666666666666666,0.6686833333333333,0.3220832024911769
|
| 9 |
+
Ablation_w070_030,test,0.9847286797516148,0.9796083333333333,0.9832865956253505,0.986175,0.997827079375,0.9988843568012699,0.9540605516647764,0.016256614222825583,0.6088888888888889,0.6666666666666666,0.668625,0.3230683782519989
|
| 10 |
+
Ablation_no_ortho,val,0.9843961711613269,0.979175,0.9834566391775729,0.9853375,0.99768869140625,0.9988243736670946,0.9531028302312351,0.016731035523894367,0.6534343434343434,0.6666666666666666,0.6679416666666667,0.3220496838060625
|
| 11 |
+
Ablation_no_ortho,test,0.9847385445417193,0.9796333333333334,0.9838661376057498,0.9856125,0.9977883821875,0.9988622933644848,0.9541376820992606,0.01631981225885173,0.6534343434343434,0.6666666666666666,0.66785,0.3227769878038881
|
| 12 |
+
Ablation_no_simplex,val,0.9843804624681616,0.97915,0.9832510226479098,0.9855125,0.9977131526562498,0.9988352887862108,0.9530391733707425,0.016612307611900797,0.6237373737373737,0.6666666666666666,0.6682,0.3220004786138208
|
| 13 |
+
Ablation_no_simplex,test,0.9848097946543963,0.979725,0.983771782110292,0.98585,0.99781884484375,0.9988788825066155,0.9543378156377528,0.01623362765250846,0.6237373737373737,0.6666666666666666,0.668075,0.32267068606601873
|
| 14 |
+
Ablation_no_aux,val,0.9844582870410825,0.97925,0.9831450868925624,0.985775,0.9977012100000001,0.9988304959228492,0.9532576662357405,0.016633437165139008,0.6286868686868686,0.6666666666666666,0.66845,0.32205011862544175
|
| 15 |
+
Ablation_no_aux,test,0.9847878602505602,0.9796916666666666,0.9835538210246755,0.986025,0.9977947654687499,0.9988639976879095,0.9542555689801461,0.01626560819270326,0.6286868686868686,0.6666666666666666,0.6683416666666666,0.3226737070147825
|
evaluation/tables/baselines_simple.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,F1,Accuracy,Precision,Recall,AUC_ROC,AUC_PR,MCC,Brier,Threshold,PosRate,PredPosRate,ECE
|
| 2 |
+
Majority,val,0.8,0.6666666666666666,0.6666666666666666,1.0,0.5,0.6666666666666666,0.0,0.3333333333333333,0.5,0.6666666666666666,1.0,0.33333333333333337
|
| 3 |
+
Majority,test,0.8,0.6666666666666666,0.6666666666666666,1.0,0.5,0.6666666666666666,0.0,0.3333333333333333,0.5,0.6666666666666666,1.0,0.33333333333333337
|
| 4 |
+
Random,val,0.5716080312341635,0.4998416666666667,0.6662229838776766,0.500525,0.4996836871875,0.6671614714263645,-0.0009428104307880674,0.3339217430811074,0.5,0.6666666666666666,0.5008583333333333,0.12943867729580721
|
| 5 |
+
Random,test,0.5728642498733075,0.5013083333333334,0.6676871370813436,0.501625,0.5004068259375,0.6663609042103221,0.002168463990812555,0.3336011523544602,0.5,0.6666666666666666,0.5008583333333333,0.12985453809448294
|
evaluation/tables/baselines_trained.csv
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,F1,Accuracy,Precision,Recall,AUC_ROC,AUC_PR,MCC,Brier,Threshold,PosRate,PredPosRate,ECE
|
| 2 |
+
single_a,val,0.9663489690400549,0.9549833333333333,0.963169005339625,0.96955,0.9912929257812501,0.9957359907202555,0.8984205278079552,0.03380162120526994,0.5544444444444444,0.6666666666666666,0.6710833333333334,0.29008581158202973
|
| 3 |
+
single_a,test,0.9661497083159527,0.9546916666666667,0.9624406157357447,0.9698875,0.9913094723437501,0.995762771322309,0.8977212598004605,0.034033367701225774,0.5544444444444444,0.6666666666666666,0.671825,0.2904249676381496
|
| 4 |
+
single_b,val,0.9630006073602142,0.95025,0.9549868476042973,0.97115,0.9901091703125,0.9951647831887384,0.8873980693561916,0.03825369490427699,0.26242424242424245,0.6666666666666666,0.67795,0.30390818855774443
|
| 5 |
+
single_b,test,0.9619146799568564,0.9488,0.9540960625660952,0.9698625,0.9897823395312499,0.9950351191004415,0.8841164271113443,0.03911290967531241,0.26242424242424245,0.6666666666666666,0.6776833333333333,0.30312035976796636
|
| 6 |
+
fusion_no_disentangle,val,0.9721020680370477,0.9627,0.9694313915616221,0.9747875,0.9937600273437499,0.9969086009180431,0.9158741423249567,0.028123683592866097,0.5197979797979798,0.6666666666666666,0.67035,0.3029591020420244
|
| 7 |
+
fusion_no_disentangle,test,0.9715210960610307,0.9618916666666667,0.9680546075085324,0.9750125,0.99359549921875,0.9968455256211879,0.9140005020365057,0.028654657741795552,0.5197979797979798,0.6666666666666666,0.6714583333333334,0.30299720453313167
|
evaluation/tables/bootstrap_ci_main_test.csv
ADDED
|
@@ -0,0 +1,9 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Metric,Mean,Std,CI_2p5,CI_97p5
|
| 2 |
+
F1,0.9845918766604145,0.00030190005853397884,0.9840091178021428,0.9852012456834911
|
| 3 |
+
Accuracy,0.9794132249999999,0.0003976940426926965,0.9786414583333333,0.9802166666666666
|
| 4 |
+
Precision,0.9825064592380237,0.00046424626114902463,0.9816129046395479,0.9833785105651656
|
| 5 |
+
Recall,0.9866863639486735,0.000401180822596787,0.9859045059557503,0.9874505767904969
|
| 6 |
+
AUC_ROC,0.9978006175390444,6.991096433653672e-05,0.9976582487022725,0.9979278452430971
|
| 7 |
+
AUC_PR,0.9988723777435939,4.607768384939104e-05,0.9987745492939044,0.9989541009887469
|
| 8 |
+
MCC,0.9536026940037996,0.0008926468344904707,0.9519090282731164,0.9553955430476658
|
| 9 |
+
Brier,0.01592975333410666,0.0002803061094560436,0.015386495012964699,0.016499933598053452
|
evaluation/tables/calibrated_metrics.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,Temperature,F1,Accuracy,Precision,Recall,AUC_ROC,AUC_PR,MCC,Brier,Threshold,PosRate,PredPosRate,ECE
|
| 2 |
+
Main_Hier_TempScaled,val,1.2828316688537598,0.984432571275382,0.9791916666666667,0.9819898257441013,0.9868875,0.9976992392187499,0.9988337505797158,0.9530910095840649,0.015995269378800053,0.5395959595959596,0.6666666666666666,0.6699916666666667,0.313200017021315
|
| 3 |
+
Main_Hier_TempScaled,test,1.2828316688537598,0.9846032963538062,0.979425,0.9824039622195399,0.9868125,0.9978019790624999,0.9988735303622573,0.9536238465534232,0.015635657094649303,0.5395959595959596,0.6666666666666666,0.6696583333333334,0.313609491239351
|
evaluation/tables/cka_disentanglement.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"CKA(z_lex,z_syn)": 0.0008444060222245753,
|
| 3 |
+
"CKA(z_lex,z_sem)": 0.0004650297632906586,
|
| 4 |
+
"CKA(z_syn,z_sem)": 0.0006945761269889772,
|
| 5 |
+
"orthogonality_penalty": 3.980763267463772e-06,
|
| 6 |
+
"simplex_penalty": 0.0007237753015942872
|
| 7 |
+
}
|
evaluation/tables/final_overall_table.csv
ADDED
|
@@ -0,0 +1,29 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,F1,Accuracy,Precision,Recall,AUC_ROC,AUC_PR,MCC,Brier,ECE,Threshold,PosRate,PredPosRate
|
| 2 |
+
Main_Hier,val,0.9844434883996434,0.9792083333333333,0.9820982048443079,0.9868,0.9976992098437498,0.9988336959906345,0.9531311937410741,0.016322911332254584,0.3190471318293634,0.5544444444444444,0.6666666666666666,0.6698583333333333
|
| 3 |
+
Main_Hier,test,0.9845952351253586,0.9794166666666667,0.9825118247448344,0.9866875,0.9978019731250002,0.9988734821827461,0.9536083367983766,0.015926154131562547,0.3196304980424112,0.5544444444444444,0.6666666666666666,0.6695
|
| 4 |
+
Main_Hier_TempScaled,val,0.984432571275382,0.9791916666666667,0.9819898257441013,0.9868875,0.9976992392187499,0.9988337505797158,0.9530910095840649,0.015995269378800053,0.313200017021315,0.5395959595959596,0.6666666666666666,0.6699916666666667
|
| 5 |
+
Main_Hier_TempScaled,test,0.9846032963538062,0.979425,0.9824039622195399,0.9868125,0.9978019790624999,0.9988735303622573,0.9536238465534232,0.015635657094649303,0.313609491239351,0.5395959595959596,0.6666666666666666,0.6696583333333334
|
| 6 |
+
Majority,val,0.8,0.6666666666666666,0.6666666666666666,1.0,0.5,0.6666666666666666,0.0,0.3333333333333333,0.33333333333333337,0.5,0.6666666666666666,1.0
|
| 7 |
+
Majority,test,0.8,0.6666666666666666,0.6666666666666666,1.0,0.5,0.6666666666666666,0.0,0.3333333333333333,0.33333333333333337,0.5,0.6666666666666666,1.0
|
| 8 |
+
Random,val,0.5716080312341635,0.4998416666666667,0.6662229838776766,0.500525,0.4996836871875,0.6671614714263645,-0.0009428104307880674,0.3339217430811074,0.12943867729580721,0.5,0.6666666666666666,0.5008583333333333
|
| 9 |
+
Random,test,0.5728642498733075,0.5013083333333334,0.6676871370813436,0.501625,0.5004068259375,0.6663609042103221,0.002168463990812555,0.3336011523544602,0.12985453809448294,0.5,0.6666666666666666,0.5008583333333333
|
| 10 |
+
single_a,val,0.9663489690400549,0.9549833333333333,0.963169005339625,0.96955,0.9912929257812501,0.9957359907202555,0.8984205278079552,0.03380162120526994,0.29008581158202973,0.5544444444444444,0.6666666666666666,0.6710833333333334
|
| 11 |
+
single_a,test,0.9661497083159527,0.9546916666666667,0.9624406157357447,0.9698875,0.9913094723437501,0.995762771322309,0.8977212598004605,0.034033367701225774,0.2904249676381496,0.5544444444444444,0.6666666666666666,0.671825
|
| 12 |
+
single_b,val,0.9630006073602142,0.95025,0.9549868476042973,0.97115,0.9901091703125,0.9951647831887384,0.8873980693561916,0.03825369490427699,0.30390818855774443,0.26242424242424245,0.6666666666666666,0.67795
|
| 13 |
+
single_b,test,0.9619146799568564,0.9488,0.9540960625660952,0.9698625,0.9897823395312499,0.9950351191004415,0.8841164271113443,0.03911290967531241,0.30312035976796636,0.26242424242424245,0.6666666666666666,0.6776833333333333
|
| 14 |
+
fusion_no_disentangle,val,0.9721020680370477,0.9627,0.9694313915616221,0.9747875,0.9937600273437499,0.9969086009180431,0.9158741423249567,0.028123683592866097,0.3029591020420244,0.5197979797979798,0.6666666666666666,0.67035
|
| 15 |
+
fusion_no_disentangle,test,0.9715210960610307,0.9618916666666667,0.9680546075085324,0.9750125,0.99359549921875,0.9968455256211879,0.9140005020365057,0.028654657741795552,0.30299720453313167,0.5197979797979798,0.6666666666666666,0.6714583333333334
|
| 16 |
+
Ablation_w060_040,val,0.9843691477939525,0.9791333333333333,0.9831662427053718,0.985575,0.997693409375,0.9988274203102888,0.9529988669495449,0.01671784785590838,0.32218480102844405,0.6385858585858586,0.6666666666666666,0.6683
|
| 17 |
+
Ablation_w060_040,test,0.9847921088775128,0.9797,0.9836867049139436,0.9859,0.99779508234375,0.9988660309149758,0.9542790049343938,0.016326491194564294,0.32282863776160947,0.6385858585858586,0.6666666666666666,0.6681666666666667
|
| 18 |
+
Ablation_w080_020,val,0.9844113846211473,0.9791916666666667,0.9833002831094648,0.985525,0.99770540703125,0.9988325662771161,0.9531337276390448,0.01670249371391625,0.32183524033744926,0.6534343434343434,0.6666666666666666,0.668175
|
| 19 |
+
Ablation_w080_020,test,0.9847852010014172,0.9796916666666666,0.983722699661981,0.98585,0.9978058028124999,0.9988708003116085,0.9542618213721543,0.01631232422072963,0.3224618584667339,0.6534343434343434,0.6666666666666666,0.6681083333333333
|
| 20 |
+
Ablation_w050_050,val,0.9844119685118018,0.9791916666666667,0.9832641201192214,0.9855625,0.9976962893750001,0.9988292112881609,0.9531323390988441,0.016732426177827286,0.32185190810303865,0.6534343434343434,0.6666666666666666,0.668225
|
| 21 |
+
Ablation_w050_050,test,0.9847414029917838,0.9796333333333334,0.9836850770842688,0.9858,0.997800121875,0.9988692136142501,0.9541306575870855,0.016351432126203075,0.32251145017204375,0.6534343434343434,0.6666666666666666,0.6681
|
| 22 |
+
Ablation_w070_030,val,0.9844859649779708,0.9792833333333333,0.9830014207023753,0.985975,0.9977294720312498,0.9988471970819475,0.9533267355818308,0.01663759113924582,0.3220832024911769,0.6088888888888889,0.6666666666666666,0.6686833333333333
|
| 23 |
+
Ablation_w070_030,test,0.9847286797516148,0.9796083333333333,0.9832865956253505,0.986175,0.997827079375,0.9988843568012699,0.9540605516647764,0.016256614222825583,0.3230683782519989,0.6088888888888889,0.6666666666666666,0.668625
|
| 24 |
+
Ablation_no_ortho,val,0.9843961711613269,0.979175,0.9834566391775729,0.9853375,0.99768869140625,0.9988243736670946,0.9531028302312351,0.016731035523894367,0.3220496838060625,0.6534343434343434,0.6666666666666666,0.6679416666666667
|
| 25 |
+
Ablation_no_ortho,test,0.9847385445417193,0.9796333333333334,0.9838661376057498,0.9856125,0.9977883821875,0.9988622933644848,0.9541376820992606,0.01631981225885173,0.3227769878038881,0.6534343434343434,0.6666666666666666,0.66785
|
| 26 |
+
Ablation_no_simplex,val,0.9843804624681616,0.97915,0.9832510226479098,0.9855125,0.9977131526562498,0.9988352887862108,0.9530391733707425,0.016612307611900797,0.3220004786138208,0.6237373737373737,0.6666666666666666,0.6682
|
| 27 |
+
Ablation_no_simplex,test,0.9848097946543963,0.979725,0.983771782110292,0.98585,0.99781884484375,0.9988788825066155,0.9543378156377528,0.01623362765250846,0.32267068606601873,0.6237373737373737,0.6666666666666666,0.668075
|
| 28 |
+
Ablation_no_aux,val,0.9844582870410825,0.97925,0.9831450868925624,0.985775,0.9977012100000001,0.9988304959228492,0.9532576662357405,0.016633437165139008,0.32205011862544175,0.6286868686868686,0.6666666666666666,0.66845
|
| 29 |
+
Ablation_no_aux,test,0.9847878602505602,0.9796916666666666,0.9835538210246755,0.986025,0.9977947654687499,0.9988639976879095,0.9542555689801461,0.01626560819270326,0.3226737070147825,0.6286868686868686,0.6666666666666666,0.6683416666666666
|
evaluation/tables/hard_correct_low_margin.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evaluation/tables/latency_comparison.csv
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,ms_per_sample,samples_per_sec,total_batches,total_samples,Params_M
|
| 2 |
+
Main_Hier,0.5217993179726932,1916.4455865623268,20,2560,361.542923
|
| 3 |
+
DeBERTa_single,0.2041170511688506,4899.149748997577,20,2560,184.423682
|
| 4 |
+
RoBERTa_single,0.09236764336009173,10826.301977864025,20,2560,124.64717
|
| 5 |
+
Fusion_NoDis,0.28973708710822166,3451.404892555171,20,2560,309.134594
|
evaluation/tables/llm_paraphrase_detection_results.csv
ADDED
|
@@ -0,0 +1,37 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Evaluator_Model,Generator_LLM,Recall_Detection_Rate,Mean_Confidence
|
| 2 |
+
Main_Hier,Claude Haiku 3.5,1.0,0.9999719858169556
|
| 3 |
+
Main_Hier,Claude Opus,1.0,0.9999507665634155
|
| 4 |
+
Main_Hier,Claude Sonnet 4.5,1.0,0.9999481439590454
|
| 5 |
+
Main_Hier,Gpt 5,1.0,0.999935507774353
|
| 6 |
+
Main_Hier,Grok 4,1.0,0.9999352693557739
|
| 7 |
+
Main_Hier,Llama 4,1.0,0.9999626278877258
|
| 8 |
+
DeBERTa_Single,Claude Haiku 3.5,1.0,0.9999567270278931
|
| 9 |
+
DeBERTa_Single,Claude Opus,1.0,0.9999545812606812
|
| 10 |
+
DeBERTa_Single,Claude Sonnet 4.5,1.0,0.9999490976333618
|
| 11 |
+
DeBERTa_Single,Gpt 5,1.0,0.999960720539093
|
| 12 |
+
DeBERTa_Single,Grok 4,1.0,0.9999508857727051
|
| 13 |
+
DeBERTa_Single,Llama 4,1.0,0.9999526739120483
|
| 14 |
+
RoBERTa_Single,Claude Haiku 3.5,1.0,0.8706628084182739
|
| 15 |
+
RoBERTa_Single,Claude Opus,1.0,0.8086847066879272
|
| 16 |
+
RoBERTa_Single,Claude Sonnet 4.5,1.0,0.7935842275619507
|
| 17 |
+
RoBERTa_Single,Gpt 5,1.0,0.790298581123352
|
| 18 |
+
RoBERTa_Single,Grok 4,1.0,0.8576690554618835
|
| 19 |
+
RoBERTa_Single,Llama 4,1.0,0.8422956466674805
|
| 20 |
+
Fusion_NoDisentangle,Claude Haiku 3.5,1.0,0.9995506405830383
|
| 21 |
+
Fusion_NoDisentangle,Claude Opus,1.0,0.9996359944343567
|
| 22 |
+
Fusion_NoDisentangle,Claude Sonnet 4.5,1.0,0.9996067881584167
|
| 23 |
+
Fusion_NoDisentangle,Gpt 5,1.0,0.9996180534362793
|
| 24 |
+
Fusion_NoDisentangle,Grok 4,1.0,0.9996291995048523
|
| 25 |
+
Fusion_NoDisentangle,Llama 4,1.0,0.9996486902236938
|
| 26 |
+
Ablation_w050_050,Claude Haiku 3.5,1.0,0.9999241828918457
|
| 27 |
+
Ablation_w050_050,Claude Opus,1.0,0.9998364448547363
|
| 28 |
+
Ablation_w050_050,Claude Sonnet 4.5,1.0,0.9998245239257812
|
| 29 |
+
Ablation_w050_050,Gpt 5,1.0,0.9997270703315735
|
| 30 |
+
Ablation_w050_050,Grok 4,1.0,0.9997703433036804
|
| 31 |
+
Ablation_w050_050,Llama 4,1.0,0.999880313873291
|
| 32 |
+
Ablation_no_aux,Claude Haiku 3.5,1.0,0.9999264478683472
|
| 33 |
+
Ablation_no_aux,Claude Opus,1.0,0.9998310208320618
|
| 34 |
+
Ablation_no_aux,Claude Sonnet 4.5,1.0,0.9998253583908081
|
| 35 |
+
Ablation_no_aux,Gpt 5,1.0,0.9996639490127563
|
| 36 |
+
Ablation_no_aux,Grok 4,1.0,0.9997631907463074
|
| 37 |
+
Ablation_no_aux,Llama 4,1.0,0.9998739361763
|
evaluation/tables/main_metrics.csv
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,F1,Accuracy,Precision,Recall,AUC_ROC,AUC_PR,MCC,Brier,Threshold,PosRate,PredPosRate,ECE,F1_aux_lex,F1_aux_syn,F1_aux_sem
|
| 2 |
+
Main_Hier,val,0.9844434883996434,0.9792083333333333,0.9820982048443079,0.9868,0.9976992098437498,0.9988336959906345,0.9531311937410741,0.016322911332254584,0.5544444444444444,0.6666666666666666,0.6698583333333333,0.3190471318293634,0.9843777273520684,0.984332981849719,0.9844664312717564
|
| 3 |
+
Main_Hier,test,0.9845952351253586,0.9794166666666667,0.9825118247448344,0.9866875,0.9978019731250002,0.9988734821827461,0.9536083367983766,0.015926154131562547,0.5544444444444444,0.6666666666666666,0.6695,0.3196304980424112,0.9845543714807725,0.9845746805111821,0.9846499438412579
|
evaluation/tables/misclassified_examples.csv
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
evaluation/tables/robustness_summary.csv
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Model,Split,mean_neg_prob,mean_pos_prob,median_neg_prob,median_pos_prob,KS_statistic,KS_p_value,noise_mean_acc,noise_std_acc,noise_mean_f1,noise_std_f1
|
| 2 |
+
Main_Hier,test,0.04507184028625488,0.9831141829490662,0.00018522574100643396,0.9999657869338989,0.9559375,0.0,0.9792304166666668,0.00013952486337567092,0.9844527953126972,0.0001042871009220201
|
evaluation/tables/significance_tests.csv
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
Comparison,Delta_F1,p_value,Significant_005
|
| 2 |
+
Main_Hier vs DeBERTa_single,0.01844552680940592,0.0,True
|
| 3 |
+
Main_Hier vs RoBERTa_single,0.022680555168502203,0.0,True
|
| 4 |
+
Main_Hier vs Fusion_NoDis,0.013074139064327972,0.0,True
|
evaluation/tables/structural_probing.json
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"z_lex_vs_lexical_overlap_spearman": 0.47922946593127974,
|
| 3 |
+
"z_lex_vs_lexical_overlap_pval": 0.0,
|
| 4 |
+
"z_lex_vs_length_ratio_spearman": 0.4421186338704228,
|
| 5 |
+
"z_lex_vs_length_ratio_pval": 0.0,
|
| 6 |
+
"z_syn_vs_lexical_overlap_spearman": 0.532769976054425,
|
| 7 |
+
"z_syn_vs_lexical_overlap_pval": 0.0,
|
| 8 |
+
"z_syn_vs_length_ratio_spearman": 0.4493643625740219,
|
| 9 |
+
"z_syn_vs_length_ratio_pval": 0.0,
|
| 10 |
+
"z_sem_vs_lexical_overlap_spearman": 0.22826571007995736,
|
| 11 |
+
"z_sem_vs_lexical_overlap_pval": 1.0401520845752815e-234,
|
| 12 |
+
"z_sem_vs_length_ratio_spearman": 0.3032353994433464,
|
| 13 |
+
"z_sem_vs_length_ratio_pval": 0.0,
|
| 14 |
+
"z_lex_probe_acc": 0.9825,
|
| 15 |
+
"z_lex_probe_f1": 0.9868421052631579,
|
| 16 |
+
"z_syn_probe_acc": 0.9825,
|
| 17 |
+
"z_syn_probe_f1": 0.9868371568258744,
|
| 18 |
+
"z_sem_probe_acc": 0.9815,
|
| 19 |
+
"z_sem_probe_f1": 0.9860902255639098
|
| 20 |
+
}
|
model/ablations/ablation_weights_no_aux.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:360c64d3cdc6f5946db804d07cd8de34abea94453a34d775dfe10309dd4f0df3
|
| 3 |
+
size 1446327303
|
model/ablations/ablation_weights_no_ortho.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d69e6b739e7e557514d73cf410c9e61a60997de466f4515a00f1e39b2c981c6f
|
| 3 |
+
size 1446328235
|
model/ablations/ablation_weights_no_simplex.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:04531aa03b16a67ccad2c89ddbc48659dab77359ee08363a81f425522ae05b31
|
| 3 |
+
size 1446329167
|
model/ablations/ablation_weights_w050_050.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7cd7f2602fe467139658fbc560db5c03214f78c26f9635dcf7bc2b0da4db9335
|
| 3 |
+
size 1446328235
|
model/ablations/ablation_weights_w060_040.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:4837ea4e0afe62d25f27b92b2e593e4d77ed7c15944d8d3e8cbd545b18b5b20c
|
| 3 |
+
size 1446328235
|
model/ablations/ablation_weights_w070_030.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:7ac042d00e5dc6fc167ee84ce0978e081941208816cbc3f0fbf7a62913e1c633
|
| 3 |
+
size 1446328235
|
model/ablations/ablation_weights_w080_020.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:70f3fc73262bab0265569faf8817a55f8f619a3a3336753e87c49e48e7391835
|
| 3 |
+
size 1446328235
|
model/baselines/baseline_single_a.pt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:51c7e62d8a85aecc37d289a58d1cf980d15f5fcf3b8ae18d08e4b3d57016ca84
|
| 3 |
+
size 737763275
|