lestienne commited on
Commit
386cc98
·
verified ·
1 Parent(s): 5ea870c

Add files using upload-large-folder tool

Browse files
Files changed (49) hide show
  1. lists/banking77/size=32/seed=0/0.0-0.3.txt +93 -0
  2. lists/banking77/size=32/seed=0/0.0-0.7.txt +216 -0
  3. lists/banking77/size=32/seed=0/0.0-1.0.txt +308 -0
  4. lists/banking77/size=32/seed=0/0.7-1.0.txt +92 -0
  5. lists/banking77/size=32/seed=1/0.0-0.3.txt +93 -0
  6. lists/banking77/size=32/seed=1/0.0-0.7.txt +216 -0
  7. lists/banking77/size=32/seed=1/0.0-1.0.txt +308 -0
  8. lists/banking77/size=32/seed=1/0.7-1.0.txt +92 -0
  9. lists/banking77/size=32/seed=5/0.0-0.3.txt +93 -0
  10. lists/banking77/size=32/seed=5/0.0-0.7.txt +216 -0
  11. lists/banking77/size=32/seed=5/0.7-1.0.txt +92 -0
  12. lists/banking77/size=32/seed=7/0.0-0.3.txt +93 -0
  13. lists/banking77/size=32/seed=7/0.0-0.7.txt +216 -0
  14. lists/banking77/size=32/seed=7/0.0-1.0.txt +308 -0
  15. lists/banking77/size=32/seed=7/0.7-1.0.txt +92 -0
  16. prompts/basic_20newsgroups.yaml +22 -0
  17. prompts/basic_agnews.yaml +6 -0
  18. prompts/basic_banking77.yaml +79 -0
  19. prompts/basic_dbpedia.yaml +16 -0
  20. prompts/basic_sst2.yaml +4 -0
  21. src/llmcal/__init__.py +0 -0
  22. src/llmcal/scripts/__init__.py +0 -0
  23. src/llmcal/scripts/affine_calibration.old.py +219 -0
  24. src/llmcal/scripts/affine_calibration.py +203 -0
  25. src/llmcal/scripts/affine_prediction.py +36 -0
  26. src/llmcal/scripts/compare_models.py +110 -0
  27. src/llmcal/scripts/compute_matched_results.py +309 -0
  28. src/llmcal/scripts/create_lists_new.py +54 -0
  29. src/llmcal/scripts/evals.py +42 -0
  30. src/llmcal/scripts/prepare_data.py +64 -0
  31. src/llmcal/scripts/results_bars.py +184 -0
  32. src/llmcal/scripts/results_table.py +167 -0
  33. src/llmcal/scripts/results_vs_samples.py +181 -0
  34. src/llmcal/scripts/run_posteriors.py +193 -0
  35. src/llmcal/scripts/train_lora.py +418 -0
  36. src/llmcal/src/__init__.py +0 -0
  37. src/llmcal/src/evaluation/calibration.py +84 -0
  38. src/llmcal/src/evaluation/metrics.py +86 -0
  39. src/llmcal/src/loggers.py +41 -0
  40. src/llmcal/src/prompts/__init__.py +6 -0
  41. src/llmcal/src/prompts/gemma.py +39 -0
  42. src/llmcal/src/prompts/llama3.py +38 -0
  43. src/llmcal/src/prompts/phi.py +38 -0
  44. src/llmcal/src/prompts/pythia.py +38 -0
  45. src/llmcal/src/prompts/qwen.py +37 -0
  46. src/llmcal/src/prompts/tinyllama.py +40 -0
  47. src/llmcal/src/utils.py +93 -0
  48. src/llmcal/tests/__init__.py +0 -0
  49. src/llmcal/tests/check_lists.py +64 -0
lists/banking77/size=32/seed=0/0.0-0.3.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 8200
2
+ 723
3
+ 832
4
+ 7702
5
+ 10558
6
+ 2724
7
+ 3991
8
+ 8087
9
+ 4580
10
+ 5829
11
+ 1223
12
+ 11974
13
+ 8991
14
+ 10115
15
+ 11593
16
+ 8587
17
+ 1287
18
+ 8882
19
+ 3913
20
+ 6998
21
+ 12163
22
+ 1284
23
+ 12023
24
+ 12975
25
+ 11480
26
+ 4042
27
+ 4016
28
+ 9031
29
+ 11034
30
+ 10159
31
+ 10039
32
+ 6372
33
+ 8787
34
+ 12040
35
+ 8743
36
+ 7326
37
+ 7556
38
+ 8780
39
+ 9351
40
+ 12522
41
+ 380
42
+ 6307
43
+ 9173
44
+ 4480
45
+ 11113
46
+ 3453
47
+ 5507
48
+ 12060
49
+ 3120
50
+ 2596
51
+ 3557
52
+ 3806
53
+ 9793
54
+ 802
55
+ 6929
56
+ 11922
57
+ 3416
58
+ 6763
59
+ 4630
60
+ 532
61
+ 1651
62
+ 7104
63
+ 6631
64
+ 12496
65
+ 9678
66
+ 12991
67
+ 10606
68
+ 5669
69
+ 11729
70
+ 6276
71
+ 720
72
+ 4933
73
+ 732
74
+ 5432
75
+ 12399
76
+ 703
77
+ 12141
78
+ 6878
79
+ 1120
80
+ 3015
81
+ 5379
82
+ 4540
83
+ 6314
84
+ 8604
85
+ 8896
86
+ 3014
87
+ 7668
88
+ 1800
89
+ 8216
90
+ 3996
91
+ 10239
92
+ 8844
93
+ 9544
lists/banking77/size=32/seed=0/0.0-0.7.txt ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 8200
2
+ 723
3
+ 832
4
+ 7702
5
+ 10558
6
+ 2724
7
+ 3991
8
+ 8087
9
+ 4580
10
+ 5829
11
+ 1223
12
+ 11974
13
+ 8991
14
+ 10115
15
+ 11593
16
+ 8587
17
+ 1287
18
+ 8882
19
+ 3913
20
+ 6998
21
+ 12163
22
+ 1284
23
+ 12023
24
+ 12975
25
+ 11480
26
+ 4042
27
+ 4016
28
+ 9031
29
+ 11034
30
+ 10159
31
+ 10039
32
+ 6372
33
+ 8787
34
+ 12040
35
+ 8743
36
+ 7326
37
+ 7556
38
+ 8780
39
+ 9351
40
+ 12522
41
+ 380
42
+ 6307
43
+ 9173
44
+ 4480
45
+ 11113
46
+ 3453
47
+ 5507
48
+ 12060
49
+ 3120
50
+ 2596
51
+ 3557
52
+ 3806
53
+ 9793
54
+ 802
55
+ 6929
56
+ 11922
57
+ 3416
58
+ 6763
59
+ 4630
60
+ 532
61
+ 1651
62
+ 7104
63
+ 6631
64
+ 12496
65
+ 9678
66
+ 12991
67
+ 10606
68
+ 5669
69
+ 11729
70
+ 6276
71
+ 720
72
+ 4933
73
+ 732
74
+ 5432
75
+ 12399
76
+ 703
77
+ 12141
78
+ 6878
79
+ 1120
80
+ 3015
81
+ 5379
82
+ 4540
83
+ 6314
84
+ 8604
85
+ 8896
86
+ 3014
87
+ 7668
88
+ 1800
89
+ 8216
90
+ 3996
91
+ 10239
92
+ 8844
93
+ 9544
94
+ 4185
95
+ 8509
96
+ 9747
97
+ 5551
98
+ 6861
99
+ 2508
100
+ 8876
101
+ 4049
102
+ 8649
103
+ 2897
104
+ 10500
105
+ 7933
106
+ 12190
107
+ 9294
108
+ 4577
109
+ 11197
110
+ 5034
111
+ 5740
112
+ 9929
113
+ 1076
114
+ 10826
115
+ 9482
116
+ 7239
117
+ 11458
118
+ 8430
119
+ 3317
120
+ 3801
121
+ 3892
122
+ 4620
123
+ 958
124
+ 9871
125
+ 6561
126
+ 7308
127
+ 6105
128
+ 12094
129
+ 9134
130
+ 7103
131
+ 1677
132
+ 81
133
+ 3702
134
+ 2250
135
+ 6531
136
+ 6483
137
+ 10062
138
+ 3815
139
+ 2512
140
+ 4387
141
+ 12747
142
+ 12246
143
+ 7676
144
+ 9474
145
+ 178
146
+ 6299
147
+ 354
148
+ 4428
149
+ 11401
150
+ 8852
151
+ 4722
152
+ 8793
153
+ 9823
154
+ 896
155
+ 13073
156
+ 11185
157
+ 11257
158
+ 7631
159
+ 3027
160
+ 2042
161
+ 6576
162
+ 1960
163
+ 3903
164
+ 10724
165
+ 10663
166
+ 141
167
+ 6986
168
+ 3985
169
+ 4417
170
+ 8987
171
+ 1236
172
+ 1082
173
+ 9826
174
+ 11057
175
+ 4219
176
+ 2678
177
+ 1323
178
+ 12575
179
+ 9321
180
+ 10018
181
+ 12394
182
+ 1636
183
+ 9990
184
+ 10476
185
+ 1691
186
+ 11576
187
+ 8843
188
+ 4986
189
+ 3414
190
+ 10343
191
+ 9605
192
+ 1929
193
+ 8469
194
+ 7339
195
+ 6385
196
+ 4176
197
+ 2444
198
+ 4996
199
+ 3766
200
+ 8447
201
+ 4314
202
+ 4582
203
+ 4971
204
+ 1221
205
+ 123
206
+ 4172
207
+ 7916
208
+ 10882
209
+ 7519
210
+ 9802
211
+ 4262
212
+ 4697
213
+ 10498
214
+ 4836
215
+ 7568
216
+ 8406
lists/banking77/size=32/seed=0/0.0-1.0.txt ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 8200
2
+ 723
3
+ 832
4
+ 7702
5
+ 10558
6
+ 2724
7
+ 3991
8
+ 8087
9
+ 4580
10
+ 5829
11
+ 1223
12
+ 11974
13
+ 8991
14
+ 10115
15
+ 11593
16
+ 8587
17
+ 1287
18
+ 8882
19
+ 3913
20
+ 6998
21
+ 12163
22
+ 1284
23
+ 12023
24
+ 12975
25
+ 11480
26
+ 4042
27
+ 4016
28
+ 9031
29
+ 11034
30
+ 10159
31
+ 10039
32
+ 6372
33
+ 8787
34
+ 12040
35
+ 8743
36
+ 7326
37
+ 7556
38
+ 8780
39
+ 9351
40
+ 12522
41
+ 380
42
+ 6307
43
+ 9173
44
+ 4480
45
+ 11113
46
+ 3453
47
+ 5507
48
+ 12060
49
+ 3120
50
+ 2596
51
+ 3557
52
+ 3806
53
+ 9793
54
+ 802
55
+ 6929
56
+ 11922
57
+ 3416
58
+ 6763
59
+ 4630
60
+ 532
61
+ 1651
62
+ 7104
63
+ 6631
64
+ 12496
65
+ 9678
66
+ 12991
67
+ 10606
68
+ 5669
69
+ 11729
70
+ 6276
71
+ 720
72
+ 4933
73
+ 732
74
+ 5432
75
+ 12399
76
+ 703
77
+ 12141
78
+ 6878
79
+ 1120
80
+ 3015
81
+ 5379
82
+ 4540
83
+ 6314
84
+ 8604
85
+ 8896
86
+ 3014
87
+ 7668
88
+ 1800
89
+ 8216
90
+ 3996
91
+ 10239
92
+ 8844
93
+ 9544
94
+ 4185
95
+ 8509
96
+ 9747
97
+ 5551
98
+ 6861
99
+ 2508
100
+ 8876
101
+ 4049
102
+ 8649
103
+ 2897
104
+ 10500
105
+ 7933
106
+ 12190
107
+ 9294
108
+ 4577
109
+ 11197
110
+ 5034
111
+ 5740
112
+ 9929
113
+ 1076
114
+ 10826
115
+ 9482
116
+ 7239
117
+ 11458
118
+ 8430
119
+ 3317
120
+ 3801
121
+ 3892
122
+ 4620
123
+ 958
124
+ 9871
125
+ 6561
126
+ 7308
127
+ 6105
128
+ 12094
129
+ 9134
130
+ 7103
131
+ 1677
132
+ 81
133
+ 3702
134
+ 2250
135
+ 6531
136
+ 6483
137
+ 10062
138
+ 3815
139
+ 2512
140
+ 4387
141
+ 12747
142
+ 12246
143
+ 7676
144
+ 9474
145
+ 178
146
+ 6299
147
+ 354
148
+ 4428
149
+ 11401
150
+ 8852
151
+ 4722
152
+ 8793
153
+ 9823
154
+ 896
155
+ 13073
156
+ 11185
157
+ 11257
158
+ 7631
159
+ 3027
160
+ 2042
161
+ 6576
162
+ 1960
163
+ 3903
164
+ 10724
165
+ 10663
166
+ 141
167
+ 6986
168
+ 3985
169
+ 4417
170
+ 8987
171
+ 1236
172
+ 1082
173
+ 9826
174
+ 11057
175
+ 4219
176
+ 2678
177
+ 1323
178
+ 12575
179
+ 9321
180
+ 10018
181
+ 12394
182
+ 1636
183
+ 9990
184
+ 10476
185
+ 1691
186
+ 11576
187
+ 8843
188
+ 4986
189
+ 3414
190
+ 10343
191
+ 9605
192
+ 1929
193
+ 8469
194
+ 7339
195
+ 6385
196
+ 4176
197
+ 2444
198
+ 4996
199
+ 3766
200
+ 8447
201
+ 4314
202
+ 4582
203
+ 4971
204
+ 1221
205
+ 123
206
+ 4172
207
+ 7916
208
+ 10882
209
+ 7519
210
+ 9802
211
+ 4262
212
+ 4697
213
+ 10498
214
+ 4836
215
+ 7568
216
+ 8406
217
+ 4673
218
+ 5543
219
+ 6448
220
+ 6818
221
+ 7075
222
+ 10106
223
+ 4743
224
+ 5779
225
+ 9052
226
+ 9100
227
+ 11452
228
+ 9203
229
+ 232
230
+ 129
231
+ 11705
232
+ 3924
233
+ 4110
234
+ 2775
235
+ 10424
236
+ 4254
237
+ 153
238
+ 12729
239
+ 11522
240
+ 12384
241
+ 3645
242
+ 8064
243
+ 1817
244
+ 7204
245
+ 2115
246
+ 9209
247
+ 6685
248
+ 3951
249
+ 5877
250
+ 9518
251
+ 7617
252
+ 11777
253
+ 5418
254
+ 401
255
+ 2447
256
+ 10679
257
+ 961
258
+ 10032
259
+ 263
260
+ 5308
261
+ 12198
262
+ 6637
263
+ 5749
264
+ 4684
265
+ 11457
266
+ 4403
267
+ 10389
268
+ 2016
269
+ 10707
270
+ 12159
271
+ 11978
272
+ 2990
273
+ 1993
274
+ 12951
275
+ 1978
276
+ 8162
277
+ 3548
278
+ 7764
279
+ 3678
280
+ 10087
281
+ 1759
282
+ 865
283
+ 3791
284
+ 4399
285
+ 12408
286
+ 6950
287
+ 11981
288
+ 6749
289
+ 4764
290
+ 6274
291
+ 3184
292
+ 6574
293
+ 2128
294
+ 4103
295
+ 11835
296
+ 9963
297
+ 7927
298
+ 833
299
+ 7475
300
+ 11897
301
+ 12568
302
+ 9281
303
+ 8815
304
+ 729
305
+ 10501
306
+ 7441
307
+ 146
308
+ 6673
lists/banking77/size=32/seed=0/0.7-1.0.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 4673
2
+ 5543
3
+ 6448
4
+ 6818
5
+ 7075
6
+ 10106
7
+ 4743
8
+ 5779
9
+ 9052
10
+ 9100
11
+ 11452
12
+ 9203
13
+ 232
14
+ 129
15
+ 11705
16
+ 3924
17
+ 4110
18
+ 2775
19
+ 10424
20
+ 4254
21
+ 153
22
+ 12729
23
+ 11522
24
+ 12384
25
+ 3645
26
+ 8064
27
+ 1817
28
+ 7204
29
+ 2115
30
+ 9209
31
+ 6685
32
+ 3951
33
+ 5877
34
+ 9518
35
+ 7617
36
+ 11777
37
+ 5418
38
+ 401
39
+ 2447
40
+ 10679
41
+ 961
42
+ 10032
43
+ 263
44
+ 5308
45
+ 12198
46
+ 6637
47
+ 5749
48
+ 4684
49
+ 11457
50
+ 4403
51
+ 10389
52
+ 2016
53
+ 10707
54
+ 12159
55
+ 11978
56
+ 2990
57
+ 1993
58
+ 12951
59
+ 1978
60
+ 8162
61
+ 3548
62
+ 7764
63
+ 3678
64
+ 10087
65
+ 1759
66
+ 865
67
+ 3791
68
+ 4399
69
+ 12408
70
+ 6950
71
+ 11981
72
+ 6749
73
+ 4764
74
+ 6274
75
+ 3184
76
+ 6574
77
+ 2128
78
+ 4103
79
+ 11835
80
+ 9963
81
+ 7927
82
+ 833
83
+ 7475
84
+ 11897
85
+ 12568
86
+ 9281
87
+ 8815
88
+ 729
89
+ 10501
90
+ 7441
91
+ 146
92
+ 6673
lists/banking77/size=32/seed=1/0.0-0.3.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 7371
2
+ 7722
3
+ 237
4
+ 1508
5
+ 8320
6
+ 10115
7
+ 10484
8
+ 7562
9
+ 11005
10
+ 11117
11
+ 12370
12
+ 8585
13
+ 10533
14
+ 1363
15
+ 7018
16
+ 6432
17
+ 12928
18
+ 12218
19
+ 3576
20
+ 9175
21
+ 3738
22
+ 9515
23
+ 9613
24
+ 6629
25
+ 5388
26
+ 3175
27
+ 5751
28
+ 10649
29
+ 1211
30
+ 10064
31
+ 3991
32
+ 656
33
+ 240
34
+ 8779
35
+ 9704
36
+ 3440
37
+ 4614
38
+ 6730
39
+ 12625
40
+ 8543
41
+ 8183
42
+ 1709
43
+ 8358
44
+ 2282
45
+ 12485
46
+ 10770
47
+ 12682
48
+ 12421
49
+ 10565
50
+ 6029
51
+ 5919
52
+ 4254
53
+ 325
54
+ 7110
55
+ 1783
56
+ 4549
57
+ 7609
58
+ 5403
59
+ 12647
60
+ 1573
61
+ 10490
62
+ 11056
63
+ 1163
64
+ 5285
65
+ 4996
66
+ 1065
67
+ 164
68
+ 9242
69
+ 11766
70
+ 4236
71
+ 8946
72
+ 445
73
+ 2060
74
+ 1272
75
+ 490
76
+ 11155
77
+ 870
78
+ 9137
79
+ 10214
80
+ 5465
81
+ 3050
82
+ 3521
83
+ 12526
84
+ 3693
85
+ 1604
86
+ 926
87
+ 12884
88
+ 7293
89
+ 8235
90
+ 11592
91
+ 9853
92
+ 12160
93
+ 3358
lists/banking77/size=32/seed=1/0.0-0.7.txt ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 7371
2
+ 7722
3
+ 237
4
+ 1508
5
+ 8320
6
+ 10115
7
+ 10484
8
+ 7562
9
+ 11005
10
+ 11117
11
+ 12370
12
+ 8585
13
+ 10533
14
+ 1363
15
+ 7018
16
+ 6432
17
+ 12928
18
+ 12218
19
+ 3576
20
+ 9175
21
+ 3738
22
+ 9515
23
+ 9613
24
+ 6629
25
+ 5388
26
+ 3175
27
+ 5751
28
+ 10649
29
+ 1211
30
+ 10064
31
+ 3991
32
+ 656
33
+ 240
34
+ 8779
35
+ 9704
36
+ 3440
37
+ 4614
38
+ 6730
39
+ 12625
40
+ 8543
41
+ 8183
42
+ 1709
43
+ 8358
44
+ 2282
45
+ 12485
46
+ 10770
47
+ 12682
48
+ 12421
49
+ 10565
50
+ 6029
51
+ 5919
52
+ 4254
53
+ 325
54
+ 7110
55
+ 1783
56
+ 4549
57
+ 7609
58
+ 5403
59
+ 12647
60
+ 1573
61
+ 10490
62
+ 11056
63
+ 1163
64
+ 5285
65
+ 4996
66
+ 1065
67
+ 164
68
+ 9242
69
+ 11766
70
+ 4236
71
+ 8946
72
+ 445
73
+ 2060
74
+ 1272
75
+ 490
76
+ 11155
77
+ 870
78
+ 9137
79
+ 10214
80
+ 5465
81
+ 3050
82
+ 3521
83
+ 12526
84
+ 3693
85
+ 1604
86
+ 926
87
+ 12884
88
+ 7293
89
+ 8235
90
+ 11592
91
+ 9853
92
+ 12160
93
+ 3358
94
+ 10800
95
+ 736
96
+ 6101
97
+ 6314
98
+ 213
99
+ 2236
100
+ 6328
101
+ 7958
102
+ 1900
103
+ 8852
104
+ 8099
105
+ 825
106
+ 9076
107
+ 2657
108
+ 379
109
+ 467
110
+ 5597
111
+ 1295
112
+ 12639
113
+ 11607
114
+ 6065
115
+ 1862
116
+ 2118
117
+ 7983
118
+ 259
119
+ 1082
120
+ 2765
121
+ 3786
122
+ 7856
123
+ 3212
124
+ 4324
125
+ 48
126
+ 7946
127
+ 6557
128
+ 6112
129
+ 12501
130
+ 11824
131
+ 113
132
+ 12887
133
+ 11921
134
+ 7881
135
+ 127
136
+ 1918
137
+ 12086
138
+ 13011
139
+ 12008
140
+ 10473
141
+ 12614
142
+ 1764
143
+ 12801
144
+ 6706
145
+ 12657
146
+ 4177
147
+ 12220
148
+ 5716
149
+ 6404
150
+ 10950
151
+ 5222
152
+ 8941
153
+ 3074
154
+ 9228
155
+ 4702
156
+ 1070
157
+ 5390
158
+ 9078
159
+ 7906
160
+ 648
161
+ 10184
162
+ 6956
163
+ 3729
164
+ 4558
165
+ 9058
166
+ 11021
167
+ 10202
168
+ 2161
169
+ 661
170
+ 8950
171
+ 1964
172
+ 6264
173
+ 7259
174
+ 2816
175
+ 2238
176
+ 4006
177
+ 12610
178
+ 6687
179
+ 6103
180
+ 6295
181
+ 1580
182
+ 1434
183
+ 2363
184
+ 10470
185
+ 7450
186
+ 770
187
+ 12124
188
+ 3476
189
+ 4998
190
+ 3026
191
+ 7244
192
+ 3125
193
+ 9718
194
+ 979
195
+ 3438
196
+ 1879
197
+ 7735
198
+ 1673
199
+ 8734
200
+ 7242
201
+ 5291
202
+ 10405
203
+ 3962
204
+ 5945
205
+ 8776
206
+ 1471
207
+ 5522
208
+ 12474
209
+ 1279
210
+ 6535
211
+ 12479
212
+ 1574
213
+ 11172
214
+ 2583
215
+ 6136
216
+ 2731
lists/banking77/size=32/seed=1/0.0-1.0.txt ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 7371
2
+ 7722
3
+ 237
4
+ 1508
5
+ 8320
6
+ 10115
7
+ 10484
8
+ 7562
9
+ 11005
10
+ 11117
11
+ 12370
12
+ 8585
13
+ 10533
14
+ 1363
15
+ 7018
16
+ 6432
17
+ 12928
18
+ 12218
19
+ 3576
20
+ 9175
21
+ 3738
22
+ 9515
23
+ 9613
24
+ 6629
25
+ 5388
26
+ 3175
27
+ 5751
28
+ 10649
29
+ 1211
30
+ 10064
31
+ 3991
32
+ 656
33
+ 240
34
+ 8779
35
+ 9704
36
+ 3440
37
+ 4614
38
+ 6730
39
+ 12625
40
+ 8543
41
+ 8183
42
+ 1709
43
+ 8358
44
+ 2282
45
+ 12485
46
+ 10770
47
+ 12682
48
+ 12421
49
+ 10565
50
+ 6029
51
+ 5919
52
+ 4254
53
+ 325
54
+ 7110
55
+ 1783
56
+ 4549
57
+ 7609
58
+ 5403
59
+ 12647
60
+ 1573
61
+ 10490
62
+ 11056
63
+ 1163
64
+ 5285
65
+ 4996
66
+ 1065
67
+ 164
68
+ 9242
69
+ 11766
70
+ 4236
71
+ 8946
72
+ 445
73
+ 2060
74
+ 1272
75
+ 490
76
+ 11155
77
+ 870
78
+ 9137
79
+ 10214
80
+ 5465
81
+ 3050
82
+ 3521
83
+ 12526
84
+ 3693
85
+ 1604
86
+ 926
87
+ 12884
88
+ 7293
89
+ 8235
90
+ 11592
91
+ 9853
92
+ 12160
93
+ 3358
94
+ 10800
95
+ 736
96
+ 6101
97
+ 6314
98
+ 213
99
+ 2236
100
+ 6328
101
+ 7958
102
+ 1900
103
+ 8852
104
+ 8099
105
+ 825
106
+ 9076
107
+ 2657
108
+ 379
109
+ 467
110
+ 5597
111
+ 1295
112
+ 12639
113
+ 11607
114
+ 6065
115
+ 1862
116
+ 2118
117
+ 7983
118
+ 259
119
+ 1082
120
+ 2765
121
+ 3786
122
+ 7856
123
+ 3212
124
+ 4324
125
+ 48
126
+ 7946
127
+ 6557
128
+ 6112
129
+ 12501
130
+ 11824
131
+ 113
132
+ 12887
133
+ 11921
134
+ 7881
135
+ 127
136
+ 1918
137
+ 12086
138
+ 13011
139
+ 12008
140
+ 10473
141
+ 12614
142
+ 1764
143
+ 12801
144
+ 6706
145
+ 12657
146
+ 4177
147
+ 12220
148
+ 5716
149
+ 6404
150
+ 10950
151
+ 5222
152
+ 8941
153
+ 3074
154
+ 9228
155
+ 4702
156
+ 1070
157
+ 5390
158
+ 9078
159
+ 7906
160
+ 648
161
+ 10184
162
+ 6956
163
+ 3729
164
+ 4558
165
+ 9058
166
+ 11021
167
+ 10202
168
+ 2161
169
+ 661
170
+ 8950
171
+ 1964
172
+ 6264
173
+ 7259
174
+ 2816
175
+ 2238
176
+ 4006
177
+ 12610
178
+ 6687
179
+ 6103
180
+ 6295
181
+ 1580
182
+ 1434
183
+ 2363
184
+ 10470
185
+ 7450
186
+ 770
187
+ 12124
188
+ 3476
189
+ 4998
190
+ 3026
191
+ 7244
192
+ 3125
193
+ 9718
194
+ 979
195
+ 3438
196
+ 1879
197
+ 7735
198
+ 1673
199
+ 8734
200
+ 7242
201
+ 5291
202
+ 10405
203
+ 3962
204
+ 5945
205
+ 8776
206
+ 1471
207
+ 5522
208
+ 12474
209
+ 1279
210
+ 6535
211
+ 12479
212
+ 1574
213
+ 11172
214
+ 2583
215
+ 6136
216
+ 2731
217
+ 10635
218
+ 2969
219
+ 1296
220
+ 4234
221
+ 4315
222
+ 8738
223
+ 1136
224
+ 10197
225
+ 11782
226
+ 5045
227
+ 10508
228
+ 4615
229
+ 1168
230
+ 7283
231
+ 3084
232
+ 7959
233
+ 6021
234
+ 12930
235
+ 11934
236
+ 4737
237
+ 6411
238
+ 2343
239
+ 10562
240
+ 1985
241
+ 8863
242
+ 10839
243
+ 5481
244
+ 3007
245
+ 9785
246
+ 3434
247
+ 4022
248
+ 2037
249
+ 11609
250
+ 348
251
+ 3069
252
+ 11783
253
+ 4367
254
+ 6096
255
+ 12665
256
+ 3573
257
+ 9385
258
+ 12224
259
+ 5476
260
+ 188
261
+ 7511
262
+ 10482
263
+ 2503
264
+ 982
265
+ 3357
266
+ 10371
267
+ 3766
268
+ 3403
269
+ 7971
270
+ 10859
271
+ 10099
272
+ 5980
273
+ 9315
274
+ 5394
275
+ 1005
276
+ 1572
277
+ 8014
278
+ 3843
279
+ 8243
280
+ 7102
281
+ 2266
282
+ 9030
283
+ 8820
284
+ 1634
285
+ 1287
286
+ 11855
287
+ 5641
288
+ 4943
289
+ 1978
290
+ 9467
291
+ 7116
292
+ 2100
293
+ 205
294
+ 9279
295
+ 12274
296
+ 12322
297
+ 9549
298
+ 10845
299
+ 1217
300
+ 12860
301
+ 4173
302
+ 8858
303
+ 1624
304
+ 11055
305
+ 292
306
+ 4825
307
+ 1246
308
+ 7758
lists/banking77/size=32/seed=1/0.7-1.0.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 10635
2
+ 2969
3
+ 1296
4
+ 4234
5
+ 4315
6
+ 8738
7
+ 1136
8
+ 10197
9
+ 11782
10
+ 5045
11
+ 10508
12
+ 4615
13
+ 1168
14
+ 7283
15
+ 3084
16
+ 7959
17
+ 6021
18
+ 12930
19
+ 11934
20
+ 4737
21
+ 6411
22
+ 2343
23
+ 10562
24
+ 1985
25
+ 8863
26
+ 10839
27
+ 5481
28
+ 3007
29
+ 9785
30
+ 3434
31
+ 4022
32
+ 2037
33
+ 11609
34
+ 348
35
+ 3069
36
+ 11783
37
+ 4367
38
+ 6096
39
+ 12665
40
+ 3573
41
+ 9385
42
+ 12224
43
+ 5476
44
+ 188
45
+ 7511
46
+ 10482
47
+ 2503
48
+ 982
49
+ 3357
50
+ 10371
51
+ 3766
52
+ 3403
53
+ 7971
54
+ 10859
55
+ 10099
56
+ 5980
57
+ 9315
58
+ 5394
59
+ 1005
60
+ 1572
61
+ 8014
62
+ 3843
63
+ 8243
64
+ 7102
65
+ 2266
66
+ 9030
67
+ 8820
68
+ 1634
69
+ 1287
70
+ 11855
71
+ 5641
72
+ 4943
73
+ 1978
74
+ 9467
75
+ 7116
76
+ 2100
77
+ 205
78
+ 9279
79
+ 12274
80
+ 12322
81
+ 9549
82
+ 10845
83
+ 1217
84
+ 12860
85
+ 4173
86
+ 8858
87
+ 1624
88
+ 11055
89
+ 292
90
+ 4825
91
+ 1246
92
+ 7758
lists/banking77/size=32/seed=5/0.0-0.3.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 9151
2
+ 4098
3
+ 12941
4
+ 8514
5
+ 2177
6
+ 3841
7
+ 3254
8
+ 11480
9
+ 9917
10
+ 6270
11
+ 9678
12
+ 2985
13
+ 8279
14
+ 11508
15
+ 9748
16
+ 10502
17
+ 6084
18
+ 10441
19
+ 8371
20
+ 4804
21
+ 12406
22
+ 8361
23
+ 12765
24
+ 2900
25
+ 1513
26
+ 11047
27
+ 100
28
+ 6154
29
+ 9317
30
+ 1092
31
+ 9808
32
+ 6627
33
+ 4111
34
+ 8865
35
+ 3155
36
+ 3062
37
+ 6220
38
+ 10334
39
+ 6618
40
+ 8101
41
+ 10724
42
+ 4806
43
+ 6714
44
+ 6002
45
+ 12015
46
+ 1501
47
+ 2733
48
+ 8538
49
+ 3792
50
+ 633
51
+ 3498
52
+ 3557
53
+ 2328
54
+ 9128
55
+ 9832
56
+ 8211
57
+ 10844
58
+ 13041
59
+ 5174
60
+ 3227
61
+ 10922
62
+ 1765
63
+ 11687
64
+ 5293
65
+ 6805
66
+ 4557
67
+ 9247
68
+ 12375
69
+ 3946
70
+ 9754
71
+ 2673
72
+ 9204
73
+ 7378
74
+ 7888
75
+ 8893
76
+ 9805
77
+ 11982
78
+ 5847
79
+ 5242
80
+ 11006
81
+ 954
82
+ 8193
83
+ 8644
84
+ 3074
85
+ 12140
86
+ 9429
87
+ 2137
88
+ 8592
89
+ 5615
90
+ 8239
91
+ 10454
92
+ 1637
93
+ 12218
lists/banking77/size=32/seed=5/0.0-0.7.txt ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 9151
2
+ 4098
3
+ 12941
4
+ 8514
5
+ 2177
6
+ 3841
7
+ 3254
8
+ 11480
9
+ 9917
10
+ 6270
11
+ 9678
12
+ 2985
13
+ 8279
14
+ 11508
15
+ 9748
16
+ 10502
17
+ 6084
18
+ 10441
19
+ 8371
20
+ 4804
21
+ 12406
22
+ 8361
23
+ 12765
24
+ 2900
25
+ 1513
26
+ 11047
27
+ 100
28
+ 6154
29
+ 9317
30
+ 1092
31
+ 9808
32
+ 6627
33
+ 4111
34
+ 8865
35
+ 3155
36
+ 3062
37
+ 6220
38
+ 10334
39
+ 6618
40
+ 8101
41
+ 10724
42
+ 4806
43
+ 6714
44
+ 6002
45
+ 12015
46
+ 1501
47
+ 2733
48
+ 8538
49
+ 3792
50
+ 633
51
+ 3498
52
+ 3557
53
+ 2328
54
+ 9128
55
+ 9832
56
+ 8211
57
+ 10844
58
+ 13041
59
+ 5174
60
+ 3227
61
+ 10922
62
+ 1765
63
+ 11687
64
+ 5293
65
+ 6805
66
+ 4557
67
+ 9247
68
+ 12375
69
+ 3946
70
+ 9754
71
+ 2673
72
+ 9204
73
+ 7378
74
+ 7888
75
+ 8893
76
+ 9805
77
+ 11982
78
+ 5847
79
+ 5242
80
+ 11006
81
+ 954
82
+ 8193
83
+ 8644
84
+ 3074
85
+ 12140
86
+ 9429
87
+ 2137
88
+ 8592
89
+ 5615
90
+ 8239
91
+ 10454
92
+ 1637
93
+ 12218
94
+ 7735
95
+ 9755
96
+ 7741
97
+ 2512
98
+ 4198
99
+ 12283
100
+ 2710
101
+ 4701
102
+ 7681
103
+ 10950
104
+ 8175
105
+ 864
106
+ 5084
107
+ 5228
108
+ 11493
109
+ 8104
110
+ 5961
111
+ 3719
112
+ 9848
113
+ 5783
114
+ 5543
115
+ 5856
116
+ 3459
117
+ 12433
118
+ 8383
119
+ 1565
120
+ 3611
121
+ 5234
122
+ 997
123
+ 8520
124
+ 5371
125
+ 9315
126
+ 1886
127
+ 3141
128
+ 9721
129
+ 3008
130
+ 5945
131
+ 9011
132
+ 12214
133
+ 8162
134
+ 11645
135
+ 5735
136
+ 12287
137
+ 12118
138
+ 4269
139
+ 5197
140
+ 10857
141
+ 7711
142
+ 10184
143
+ 4645
144
+ 12987
145
+ 8465
146
+ 8599
147
+ 12509
148
+ 4488
149
+ 1249
150
+ 6350
151
+ 4856
152
+ 4026
153
+ 7495
154
+ 11101
155
+ 108
156
+ 5663
157
+ 6254
158
+ 12611
159
+ 10597
160
+ 13065
161
+ 2965
162
+ 8648
163
+ 6900
164
+ 11077
165
+ 8585
166
+ 2450
167
+ 2749
168
+ 2738
169
+ 5239
170
+ 8460
171
+ 3725
172
+ 12718
173
+ 10914
174
+ 4596
175
+ 8745
176
+ 3406
177
+ 5527
178
+ 2
179
+ 3099
180
+ 3675
181
+ 8331
182
+ 9685
183
+ 12838
184
+ 5506
185
+ 5075
186
+ 10994
187
+ 1966
188
+ 3036
189
+ 34
190
+ 8000
191
+ 11625
192
+ 9500
193
+ 5456
194
+ 12236
195
+ 2497
196
+ 10170
197
+ 7396
198
+ 5751
199
+ 2984
200
+ 6809
201
+ 1186
202
+ 2322
203
+ 6489
204
+ 7885
205
+ 11127
206
+ 6145
207
+ 2858
208
+ 5625
209
+ 6044
210
+ 10496
211
+ 4756
212
+ 11825
213
+ 5291
214
+ 7115
215
+ 9199
216
+ 8223
lists/banking77/size=32/seed=5/0.7-1.0.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 3920
2
+ 9749
3
+ 9549
4
+ 12269
5
+ 1941
6
+ 9574
7
+ 9130
8
+ 12726
9
+ 9993
10
+ 6748
11
+ 12666
12
+ 5323
13
+ 10057
14
+ 12294
15
+ 7938
16
+ 9448
17
+ 10814
18
+ 11019
19
+ 5905
20
+ 8576
21
+ 4598
22
+ 5675
23
+ 6820
24
+ 816
25
+ 3649
26
+ 7154
27
+ 11695
28
+ 2652
29
+ 5629
30
+ 10227
31
+ 6224
32
+ 1862
33
+ 2682
34
+ 12522
35
+ 7695
36
+ 4294
37
+ 11061
38
+ 11293
39
+ 9359
40
+ 8580
41
+ 151
42
+ 11078
43
+ 9792
44
+ 12324
45
+ 11690
46
+ 6309
47
+ 11451
48
+ 1346
49
+ 12969
50
+ 4896
51
+ 1653
52
+ 6142
53
+ 4962
54
+ 9261
55
+ 230
56
+ 5327
57
+ 7936
58
+ 7578
59
+ 6236
60
+ 2864
61
+ 7403
62
+ 12023
63
+ 1702
64
+ 5954
65
+ 10963
66
+ 8016
67
+ 4350
68
+ 12573
69
+ 6610
70
+ 3876
71
+ 1709
72
+ 3865
73
+ 88
74
+ 8993
75
+ 577
76
+ 10642
77
+ 1949
78
+ 2516
79
+ 3246
80
+ 8212
81
+ 10051
82
+ 5230
83
+ 2702
84
+ 12963
85
+ 5984
86
+ 4736
87
+ 3007
88
+ 2795
89
+ 7801
90
+ 7548
91
+ 9273
92
+ 4867
lists/banking77/size=32/seed=7/0.0-0.3.txt ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 13000
2
+ 7386
3
+ 6913
4
+ 6923
5
+ 4771
6
+ 8791
7
+ 7496
8
+ 6009
9
+ 9291
10
+ 8127
11
+ 8364
12
+ 2058
13
+ 5769
14
+ 263
15
+ 497
16
+ 7121
17
+ 5646
18
+ 5452
19
+ 8807
20
+ 11304
21
+ 10972
22
+ 11111
23
+ 11112
24
+ 1455
25
+ 12932
26
+ 73
27
+ 11694
28
+ 37
29
+ 2071
30
+ 6805
31
+ 9312
32
+ 1363
33
+ 1070
34
+ 5918
35
+ 4126
36
+ 4370
37
+ 460
38
+ 188
39
+ 861
40
+ 5737
41
+ 9981
42
+ 11911
43
+ 7569
44
+ 8347
45
+ 4018
46
+ 1714
47
+ 9266
48
+ 11128
49
+ 4921
50
+ 9072
51
+ 6213
52
+ 9223
53
+ 8050
54
+ 4225
55
+ 11824
56
+ 6687
57
+ 9959
58
+ 5900
59
+ 3696
60
+ 5504
61
+ 11545
62
+ 5731
63
+ 9731
64
+ 11566
65
+ 9910
66
+ 8338
67
+ 12669
68
+ 1439
69
+ 9584
70
+ 6315
71
+ 11492
72
+ 5969
73
+ 10499
74
+ 10993
75
+ 9847
76
+ 6115
77
+ 4718
78
+ 62
79
+ 9150
80
+ 8352
81
+ 11740
82
+ 4990
83
+ 9997
84
+ 405
85
+ 8467
86
+ 783
87
+ 2879
88
+ 4409
89
+ 11641
90
+ 1719
91
+ 7403
92
+ 9560
93
+ 10739
lists/banking77/size=32/seed=7/0.0-0.7.txt ADDED
@@ -0,0 +1,216 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 13000
2
+ 7386
3
+ 6913
4
+ 6923
5
+ 4771
6
+ 8791
7
+ 7496
8
+ 6009
9
+ 9291
10
+ 8127
11
+ 8364
12
+ 2058
13
+ 5769
14
+ 263
15
+ 497
16
+ 7121
17
+ 5646
18
+ 5452
19
+ 8807
20
+ 11304
21
+ 10972
22
+ 11111
23
+ 11112
24
+ 1455
25
+ 12932
26
+ 73
27
+ 11694
28
+ 37
29
+ 2071
30
+ 6805
31
+ 9312
32
+ 1363
33
+ 1070
34
+ 5918
35
+ 4126
36
+ 4370
37
+ 460
38
+ 188
39
+ 861
40
+ 5737
41
+ 9981
42
+ 11911
43
+ 7569
44
+ 8347
45
+ 4018
46
+ 1714
47
+ 9266
48
+ 11128
49
+ 4921
50
+ 9072
51
+ 6213
52
+ 9223
53
+ 8050
54
+ 4225
55
+ 11824
56
+ 6687
57
+ 9959
58
+ 5900
59
+ 3696
60
+ 5504
61
+ 11545
62
+ 5731
63
+ 9731
64
+ 11566
65
+ 9910
66
+ 8338
67
+ 12669
68
+ 1439
69
+ 9584
70
+ 6315
71
+ 11492
72
+ 5969
73
+ 10499
74
+ 10993
75
+ 9847
76
+ 6115
77
+ 4718
78
+ 62
79
+ 9150
80
+ 8352
81
+ 11740
82
+ 4990
83
+ 9997
84
+ 405
85
+ 8467
86
+ 783
87
+ 2879
88
+ 4409
89
+ 11641
90
+ 1719
91
+ 7403
92
+ 9560
93
+ 10739
94
+ 7914
95
+ 2178
96
+ 6281
97
+ 3317
98
+ 8013
99
+ 10323
100
+ 7974
101
+ 6993
102
+ 1124
103
+ 9135
104
+ 2727
105
+ 3173
106
+ 3187
107
+ 3997
108
+ 12941
109
+ 10080
110
+ 3557
111
+ 1845
112
+ 390
113
+ 6406
114
+ 1058
115
+ 3439
116
+ 6828
117
+ 2593
118
+ 8350
119
+ 6862
120
+ 11809
121
+ 10470
122
+ 3086
123
+ 2048
124
+ 4366
125
+ 6729
126
+ 12244
127
+ 8945
128
+ 6469
129
+ 2143
130
+ 8790
131
+ 1252
132
+ 12153
133
+ 10093
134
+ 5914
135
+ 6056
136
+ 8720
137
+ 1809
138
+ 11414
139
+ 139
140
+ 4808
141
+ 3016
142
+ 8704
143
+ 11306
144
+ 9157
145
+ 5233
146
+ 1459
147
+ 98
148
+ 2449
149
+ 11750
150
+ 4541
151
+ 1272
152
+ 7637
153
+ 8616
154
+ 7205
155
+ 8599
156
+ 12872
157
+ 4083
158
+ 8591
159
+ 6337
160
+ 5711
161
+ 5771
162
+ 9057
163
+ 11667
164
+ 9548
165
+ 10941
166
+ 11294
167
+ 9670
168
+ 6073
169
+ 925
170
+ 4463
171
+ 2425
172
+ 11915
173
+ 2232
174
+ 6041
175
+ 2282
176
+ 12767
177
+ 2191
178
+ 6649
179
+ 11067
180
+ 10988
181
+ 4690
182
+ 10717
183
+ 288
184
+ 5403
185
+ 2116
186
+ 10815
187
+ 2249
188
+ 6329
189
+ 7290
190
+ 10531
191
+ 12888
192
+ 13071
193
+ 10318
194
+ 8373
195
+ 4462
196
+ 6876
197
+ 7204
198
+ 7362
199
+ 2835
200
+ 8353
201
+ 4432
202
+ 11354
203
+ 8852
204
+ 4629
205
+ 12266
206
+ 8970
207
+ 10152
208
+ 56
209
+ 3277
210
+ 4593
211
+ 13077
212
+ 6348
213
+ 9217
214
+ 2934
215
+ 9546
216
+ 2161
lists/banking77/size=32/seed=7/0.0-1.0.txt ADDED
@@ -0,0 +1,308 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 13000
2
+ 7386
3
+ 6913
4
+ 6923
5
+ 4771
6
+ 8791
7
+ 7496
8
+ 6009
9
+ 9291
10
+ 8127
11
+ 8364
12
+ 2058
13
+ 5769
14
+ 263
15
+ 497
16
+ 7121
17
+ 5646
18
+ 5452
19
+ 8807
20
+ 11304
21
+ 10972
22
+ 11111
23
+ 11112
24
+ 1455
25
+ 12932
26
+ 73
27
+ 11694
28
+ 37
29
+ 2071
30
+ 6805
31
+ 9312
32
+ 1363
33
+ 1070
34
+ 5918
35
+ 4126
36
+ 4370
37
+ 460
38
+ 188
39
+ 861
40
+ 5737
41
+ 9981
42
+ 11911
43
+ 7569
44
+ 8347
45
+ 4018
46
+ 1714
47
+ 9266
48
+ 11128
49
+ 4921
50
+ 9072
51
+ 6213
52
+ 9223
53
+ 8050
54
+ 4225
55
+ 11824
56
+ 6687
57
+ 9959
58
+ 5900
59
+ 3696
60
+ 5504
61
+ 11545
62
+ 5731
63
+ 9731
64
+ 11566
65
+ 9910
66
+ 8338
67
+ 12669
68
+ 1439
69
+ 9584
70
+ 6315
71
+ 11492
72
+ 5969
73
+ 10499
74
+ 10993
75
+ 9847
76
+ 6115
77
+ 4718
78
+ 62
79
+ 9150
80
+ 8352
81
+ 11740
82
+ 4990
83
+ 9997
84
+ 405
85
+ 8467
86
+ 783
87
+ 2879
88
+ 4409
89
+ 11641
90
+ 1719
91
+ 7403
92
+ 9560
93
+ 10739
94
+ 7914
95
+ 2178
96
+ 6281
97
+ 3317
98
+ 8013
99
+ 10323
100
+ 7974
101
+ 6993
102
+ 1124
103
+ 9135
104
+ 2727
105
+ 3173
106
+ 3187
107
+ 3997
108
+ 12941
109
+ 10080
110
+ 3557
111
+ 1845
112
+ 390
113
+ 6406
114
+ 1058
115
+ 3439
116
+ 6828
117
+ 2593
118
+ 8350
119
+ 6862
120
+ 11809
121
+ 10470
122
+ 3086
123
+ 2048
124
+ 4366
125
+ 6729
126
+ 12244
127
+ 8945
128
+ 6469
129
+ 2143
130
+ 8790
131
+ 1252
132
+ 12153
133
+ 10093
134
+ 5914
135
+ 6056
136
+ 8720
137
+ 1809
138
+ 11414
139
+ 139
140
+ 4808
141
+ 3016
142
+ 8704
143
+ 11306
144
+ 9157
145
+ 5233
146
+ 1459
147
+ 98
148
+ 2449
149
+ 11750
150
+ 4541
151
+ 1272
152
+ 7637
153
+ 8616
154
+ 7205
155
+ 8599
156
+ 12872
157
+ 4083
158
+ 8591
159
+ 6337
160
+ 5711
161
+ 5771
162
+ 9057
163
+ 11667
164
+ 9548
165
+ 10941
166
+ 11294
167
+ 9670
168
+ 6073
169
+ 925
170
+ 4463
171
+ 2425
172
+ 11915
173
+ 2232
174
+ 6041
175
+ 2282
176
+ 12767
177
+ 2191
178
+ 6649
179
+ 11067
180
+ 10988
181
+ 4690
182
+ 10717
183
+ 288
184
+ 5403
185
+ 2116
186
+ 10815
187
+ 2249
188
+ 6329
189
+ 7290
190
+ 10531
191
+ 12888
192
+ 13071
193
+ 10318
194
+ 8373
195
+ 4462
196
+ 6876
197
+ 7204
198
+ 7362
199
+ 2835
200
+ 8353
201
+ 4432
202
+ 11354
203
+ 8852
204
+ 4629
205
+ 12266
206
+ 8970
207
+ 10152
208
+ 56
209
+ 3277
210
+ 4593
211
+ 13077
212
+ 6348
213
+ 9217
214
+ 2934
215
+ 9546
216
+ 2161
217
+ 3302
218
+ 11311
219
+ 6134
220
+ 10786
221
+ 4451
222
+ 3519
223
+ 10932
224
+ 6309
225
+ 4710
226
+ 5751
227
+ 231
228
+ 8558
229
+ 1275
230
+ 154
231
+ 11966
232
+ 12113
233
+ 6060
234
+ 7269
235
+ 2979
236
+ 7270
237
+ 11919
238
+ 5222
239
+ 88
240
+ 1592
241
+ 8725
242
+ 6583
243
+ 4792
244
+ 2713
245
+ 9258
246
+ 11816
247
+ 2268
248
+ 7014
249
+ 10837
250
+ 9493
251
+ 219
252
+ 10660
253
+ 11781
254
+ 7854
255
+ 3742
256
+ 7040
257
+ 11961
258
+ 39
259
+ 12412
260
+ 6119
261
+ 12132
262
+ 2897
263
+ 12583
264
+ 7671
265
+ 5126
266
+ 11689
267
+ 1107
268
+ 5472
269
+ 10630
270
+ 7562
271
+ 8901
272
+ 179
273
+ 8693
274
+ 3908
275
+ 9583
276
+ 8069
277
+ 1847
278
+ 902
279
+ 421
280
+ 7544
281
+ 8953
282
+ 9438
283
+ 11537
284
+ 8004
285
+ 11547
286
+ 12557
287
+ 8439
288
+ 349
289
+ 8924
290
+ 4111
291
+ 228
292
+ 10192
293
+ 323
294
+ 10135
295
+ 12743
296
+ 2137
297
+ 10546
298
+ 10814
299
+ 1490
300
+ 7723
301
+ 6345
302
+ 6475
303
+ 3069
304
+ 9827
305
+ 1064
306
+ 1532
307
+ 4926
308
+ 4797
lists/banking77/size=32/seed=7/0.7-1.0.txt ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ 3302
2
+ 11311
3
+ 6134
4
+ 10786
5
+ 4451
6
+ 3519
7
+ 10932
8
+ 6309
9
+ 4710
10
+ 5751
11
+ 231
12
+ 8558
13
+ 1275
14
+ 154
15
+ 11966
16
+ 12113
17
+ 6060
18
+ 7269
19
+ 2979
20
+ 7270
21
+ 11919
22
+ 5222
23
+ 88
24
+ 1592
25
+ 8725
26
+ 6583
27
+ 4792
28
+ 2713
29
+ 9258
30
+ 11816
31
+ 2268
32
+ 7014
33
+ 10837
34
+ 9493
35
+ 219
36
+ 10660
37
+ 11781
38
+ 7854
39
+ 3742
40
+ 7040
41
+ 11961
42
+ 39
43
+ 12412
44
+ 6119
45
+ 12132
46
+ 2897
47
+ 12583
48
+ 7671
49
+ 5126
50
+ 11689
51
+ 1107
52
+ 5472
53
+ 10630
54
+ 7562
55
+ 8901
56
+ 179
57
+ 8693
58
+ 3908
59
+ 9583
60
+ 8069
61
+ 1847
62
+ 902
63
+ 421
64
+ 7544
65
+ 8953
66
+ 9438
67
+ 11537
68
+ 8004
69
+ 11547
70
+ 12557
71
+ 8439
72
+ 349
73
+ 8924
74
+ 4111
75
+ 228
76
+ 10192
77
+ 323
78
+ 10135
79
+ 12743
80
+ 2137
81
+ 10546
82
+ 10814
83
+ 1490
84
+ 7723
85
+ 6345
86
+ 6475
87
+ 3069
88
+ 9827
89
+ 1064
90
+ 1532
91
+ 4926
92
+ 4797
prompts/basic_20newsgroups.yaml ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt_template: "Determine the category of the posted document given by the user."
2
+ answers_templates:
3
+ - "Atheism"
4
+ - "Graphics"
5
+ - "Microsoft"
6
+ - "IBM Hardware"
7
+ - "Mac Hardware"
8
+ - "X Window System"
9
+ - "Sales"
10
+ - "Cars"
11
+ - "Motorcycles"
12
+ - "Baseball"
13
+ - "Hockey"
14
+ - "Cryptography"
15
+ - "Electronics"
16
+ - "Medicine"
17
+ - "Space"
18
+ - "Christianity"
19
+ - "Guns"
20
+ - "Middle East"
21
+ - "Politics"
22
+ - "Religion"
prompts/basic_agnews.yaml ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ prompt_template: "Determine the category of the news article given by the user."
2
+ answers_templates:
3
+ - "World"
4
+ - "Sports"
5
+ - "Business"
6
+ - "Science and Technology"
prompts/basic_banking77.yaml ADDED
@@ -0,0 +1,79 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt_template: "Classify the intent of the question input by the user."
2
+ answers_templates:
3
+ - "Active my card"
4
+ - "Age limit"
5
+ - "Apple pay or google pay"
6
+ - "ATM support"
7
+ - "Automatic top up"
8
+ - "Balance not updated after bank transfer"
9
+ - "Balance not updated after cheque or cash deposit"
10
+ - "Beneficiary not allowed"
11
+ - "Cancel transfer"
12
+ - "Card about to expire"
13
+ - "Card acceptance"
14
+ - "Card arrival"
15
+ - "Card delivery estimate"
16
+ - "Card linking"
17
+ - "Card not working"
18
+ - "Card payment fee charged"
19
+ - "Card payment not recognised"
20
+ - "Card payment wrong exchange rate"
21
+ - "Card swallowed"
22
+ - "Cash withdrawal charge"
23
+ - "Cash withdrawal not recognised"
24
+ - "Change pin"
25
+ - "Compromised card"
26
+ - "Contactless not working"
27
+ - "Country support"
28
+ - "Declined card payment"
29
+ - "Declined cash withdrawal"
30
+ - "Declined transfer"
31
+ - "Direct debit payment not recognised"
32
+ - "Disposable card limits"
33
+ - "Edit personal details"
34
+ - "Exchange charge"
35
+ - "Exchange rate"
36
+ - "Exchange via app"
37
+ - "Extra charge on statement"
38
+ - "Failed transfer"
39
+ - "Fiat currency support"
40
+ - "Get disposable virtual card"
41
+ - "Get physical card"
42
+ - "Getting spare card"
43
+ - "Getting virtual card"
44
+ - "Lost or stolen card"
45
+ - "Lost or stolen phone"
46
+ - "Order physical card"
47
+ - "Passcode forgotten"
48
+ - "Pending card payment"
49
+ - "Pending cash withdrawal"
50
+ - "Pending top up"
51
+ - "Pending transfer"
52
+ - "Pin blocked"
53
+ - "Receiving money"
54
+ - "Refund not showing up"
55
+ - "Request refund"
56
+ - "Reverted card payment?"
57
+ - "Supported cards and currencies"
58
+ - "Terminate account"
59
+ - "Top up by bank transfer charge"
60
+ - "Top up by card charge"
61
+ - "Top up by cash or cheque"
62
+ - "Top up failed"
63
+ - "Top up limits"
64
+ - "Top up reverted"
65
+ - "Topping up by card"
66
+ - "Transaction charged twice"
67
+ - "Transfer fee charged"
68
+ - "Transfer into account"
69
+ - "Transfer not received by recipient"
70
+ - "Transfer timing"
71
+ - "Unable to verify identity"
72
+ - "Verify my identity"
73
+ - "Verify source of funds"
74
+ - "Verify top up"
75
+ - "Virtual card not working"
76
+ - "Visa or mastercard"
77
+ - "Why verify identity"
78
+ - "Wrong amount of cash received"
79
+ - "Wrong exchange rate for cash withdrawal"
prompts/basic_dbpedia.yaml ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ prompt_template: "Determine the category of the article given by the user."
2
+ answers_templates:
3
+ - "Company"
4
+ - "Educational Institution"
5
+ - "Artist"
6
+ - "Athlete"
7
+ - "Office Holder"
8
+ - "Mean Of Transportation"
9
+ - "Building"
10
+ - "Natural Place"
11
+ - "Village"
12
+ - "Animal"
13
+ - "Plant"
14
+ - "Album"
15
+ - "Film"
16
+ - "Written Work"
prompts/basic_sst2.yaml ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ prompt_template: "Determine if the following review is positive or negative, based on the input given by the user."
2
+ answers_templates:
3
+ - "Negative"
4
+ - "Positive"
src/llmcal/__init__.py ADDED
File without changes
src/llmcal/scripts/__init__.py ADDED
File without changes
src/llmcal/scripts/affine_calibration.old.py ADDED
@@ -0,0 +1,219 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import warnings
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import torch
7
+ from torch.utils.data import DataLoader, TensorDataset
8
+ from torch.optim.lbfgs import LBFGS
9
+ import torch.nn.functional as F
10
+ from typing import Literal
11
+
12
+ from ..src.loggers import TBLogger, CSVLogger
13
+ from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, GroupKFold, KFold
14
+
15
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*Experiment logs directory outputs*")
16
+
17
+
18
+
19
+ class AffineCalibrator(torch.nn.Module):
20
+
21
+ def __init__(self, method: str, num_classes: int):
22
+ super().__init__()
23
+ self.method = method
24
+ self.num_classes = num_classes
25
+ self._init_params(method)
26
+
27
+ def _init_params(self, method):
28
+ if method == "dp_calibration":
29
+ self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=True)
30
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
31
+ elif method == "vector_scaling":
32
+ self.alpha = torch.nn.Parameter(torch.ones(self.num_classes), requires_grad=True)
33
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
34
+ elif method == "temp_scaling":
35
+ self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=True)
36
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=False)
37
+ elif method == "bias_shift":
38
+ self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=False)
39
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
40
+ else:
41
+ raise ValueError(f"Invalid method: {method}")
42
+
43
+ def forward(self, logits):
44
+ return logits * self.alpha + self.beta
45
+
46
+
47
+
48
+ def main(
49
+ output_dir: str = 'output',
50
+ log_dir: str = 'output/logs',
51
+ checkpoint_dir: str = 'output/checkpoints',
52
+ train_logits: str = 'logits.csv',
53
+ train_labels: str = 'labels.csv',
54
+ predict_logits: str = 'logits.csv',
55
+ predict_labels: str = 'labels.csv',
56
+ method: Literal["dp_calibration", "temp_scaling", "bias_only"] = "dp_calibration",
57
+ learning_rate: float = 1e-3,
58
+ tolerance: float = 1e-4,
59
+ max_ls: int = 100,
60
+ seed: int = 0,
61
+ ):
62
+ torch.set_float32_matmul_precision("high")
63
+ output_dir = Path(output_dir)
64
+ checkpoint_dir = Path(checkpoint_dir)
65
+
66
+ # Load train data
67
+ train_logits = torch.log_softmax(torch.from_numpy(pd.read_csv(train_logits, index_col=0, header=None).values).float(), dim=1)
68
+ train_labels = torch.from_numpy(pd.read_csv(train_labels, index_col=0, header=None).values.flatten()).long()
69
+
70
+ # Load predict data
71
+ df_predict_logits = pd.read_csv(predict_logits, index_col=0, header=None)
72
+ predict_logits = torch.log_softmax(torch.from_numpy(df_predict_logits.values).float(), dim=1)
73
+ df_predict_labels = pd.read_csv(predict_labels, index_col=0, header=None)
74
+ predict_labels = torch.from_numpy(df_predict_labels.values.flatten()).long()
75
+
76
+ state = fit(method, train_logits, train_labels, log_dir, tolerance, train_logits.shape[1], learning_rate, max_ls, seed)
77
+
78
+ # Predict
79
+ model = AffineCalibrator(method=method, num_classes=train_logits.shape[1])
80
+ model.load_state_dict(state['model'])
81
+ cal_logits = predict(model, predict_logits)
82
+
83
+ # Save results
84
+ pd.DataFrame(cal_logits, index=df_predict_logits.index).to_csv(output_dir / 'logits.csv', index=True, header=False)
85
+ df_predict_labels.to_csv(output_dir / 'labels.csv', index=True, header=False)
86
+ torch.save(state, checkpoint_dir / 'last.ckpt')
87
+
88
+
89
+ def fit(method, logits, labels, log_dir, tolerance, num_classes, learning_rate, max_ls, seed):
90
+
91
+ # Create folds
92
+ steps = []
93
+ rs = torch.Generator().manual_seed(seed)
94
+ for i in range(5):
95
+ ids = torch.randperm(logits.shape[0], generator=rs)
96
+ trni = ids[:int(0.7*len(ids))]
97
+ tsti = ids[int(0.7*len(ids)):]
98
+
99
+ # Train model
100
+ model = AffineCalibrator(method=method, num_classes=num_classes)
101
+ optimizer = LBFGS(
102
+ params=(param for param in model.parameters() if param.requires_grad),
103
+ lr=learning_rate,
104
+ max_iter=max_ls,
105
+ tolerance_change=tolerance,
106
+ )
107
+ train_dataset = TensorDataset(logits[trni], labels[trni])
108
+ train_loader = DataLoader(
109
+ train_dataset,
110
+ batch_size=len(train_dataset),
111
+ shuffle=False,
112
+ )
113
+ val_dataset = TensorDataset(logits[tsti], labels[tsti])
114
+ val_loader = DataLoader(
115
+ val_dataset,
116
+ batch_size=len(val_dataset),
117
+ shuffle=False,
118
+ )
119
+ state = _fit_to_fold(model, optimizer, train_loader, val_loader, os.path.join(log_dir,f"fold_{i}"), float('inf'), tolerance, patience=10)
120
+ steps.append(state['step_count'])
121
+
122
+ print(f"Fitting final model with {max(steps)} steps. All steps: {steps}")
123
+ model = AffineCalibrator(method=method, num_classes=num_classes)
124
+ optimizer = LBFGS(
125
+ params=(param for param in model.parameters() if param.requires_grad),
126
+ lr=learning_rate,
127
+ max_iter=max_ls,
128
+ tolerance_change=tolerance,
129
+ )
130
+ train_dataset = TensorDataset(logits[trni], labels[trni])
131
+ train_loader = DataLoader(
132
+ train_dataset,
133
+ batch_size=len(train_dataset),
134
+ shuffle=False,
135
+ )
136
+ state = _fit_to_fold(model, optimizer, train_loader, None, os.path.join(log_dir,'final'), max(steps), tolerance, patience=None)
137
+ return state
138
+
139
+ @torch.no_grad()
140
+ def validate(model, val_loader):
141
+ logits, labels = next(iter(val_loader))
142
+ cal_logits = model(logits)
143
+ loss = F.cross_entropy(cal_logits, labels)
144
+ er = (cal_logits.argmax(dim=1) != labels).float().mean().item()
145
+ return loss.item(), er
146
+
147
+ def _fit_to_fold(model, optimizer, train_loader, val_loader, log_dir, max_step_count, tolerance=1e-4, patience=10):
148
+ if val_loader is None:
149
+ val_loader = train_loader
150
+
151
+ model.train()
152
+ loggers = [
153
+ TBLogger(log_dir),
154
+ CSVLogger(log_dir),
155
+ ]
156
+ logits, labels = next(iter(train_loader))
157
+ priors = torch.bincount(labels, minlength=logits.shape[1]).float() / len(labels)
158
+ priors_ce = -torch.log(priors[labels]).mean().item()
159
+ if priors_ce == 0:
160
+ priors_ce = 1.
161
+ priors_er = (priors.argmax() != labels).float().mean().item()
162
+ if priors_er == 0:
163
+ priors_er = 1.
164
+
165
+ state = {
166
+ 'model': model.state_dict(),
167
+ 'best_val_loss': float('inf'),
168
+ 'step_count': 0,
169
+ 'best_step_count': 0,
170
+ 'patience': 0,
171
+ }
172
+ while state['step_count'] < max_step_count:
173
+
174
+ logits, labels = next(iter(train_loader))
175
+ def closure():
176
+ optimizer.zero_grad()
177
+ cal_logits = model(logits)
178
+ loss = F.cross_entropy(cal_logits, labels)
179
+ er = (cal_logits.argmax(dim=1) != labels).float().mean().item()
180
+ for logger in loggers:
181
+ logger.log_metrics({
182
+ "train/NCE": loss.item() / priors_ce,
183
+ "train/NER": er / priors_er,
184
+ }, step=state['step_count'])
185
+ loss.backward()
186
+ state['step_count'] += 1
187
+ return loss
188
+
189
+ optimizer.step(closure)
190
+
191
+ val_loss, val_er = validate(model, val_loader)
192
+ norm_val_loss = val_loss / priors_ce
193
+ for logger in loggers:
194
+ logger.log_metrics({
195
+ "val/NCE": norm_val_loss,
196
+ "val/NER": val_er / priors_er,
197
+ }, step=state['step_count'])
198
+
199
+ if abs(state['best_val_loss'] - norm_val_loss) <= tolerance and patience is not None:
200
+ if state['patience'] >= patience:
201
+ break
202
+ state['patience'] += 1
203
+ else:
204
+ state['model'] = model.state_dict()
205
+ state['best_val_loss'] = norm_val_loss
206
+ state['best_step_count'] = state['step_count']
207
+ return state
208
+
209
+ @torch.no_grad()
210
+ def predict(model, logits):
211
+ model.eval()
212
+ cal_logits = model(logits)
213
+ cal_logits = torch.log_softmax(cal_logits, dim=1).numpy()
214
+ return cal_logits
215
+
216
+
217
+ if __name__ == '__main__':
218
+ from fire import Fire
219
+ Fire(main)
src/llmcal/scripts/affine_calibration.py ADDED
@@ -0,0 +1,203 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ import warnings
4
+ from pathlib import Path
5
+ import pandas as pd
6
+ import torch
7
+ from torch.utils.data import DataLoader, TensorDataset
8
+ from torch.optim.lbfgs import LBFGS
9
+ import torch.nn.functional as F
10
+ from typing import Literal
11
+
12
+ from ..src.loggers import TBLogger, CSVLogger
13
+
14
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*Experiment logs directory outputs*")
15
+
16
+
17
+
18
+ class AffineCalibrator(torch.nn.Module):
19
+
20
+ def __init__(self, method: str, num_classes: int):
21
+ super().__init__()
22
+ self.method = method
23
+ self.num_classes = num_classes
24
+ self._init_params(method)
25
+
26
+ def _init_params(self, method):
27
+ if method == "dp_calibration":
28
+ self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=True)
29
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
30
+ elif method == "vector_scaling":
31
+ self.alpha = torch.nn.Parameter(torch.ones(self.num_classes), requires_grad=True)
32
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
33
+ elif method == "temp_scaling":
34
+ self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=True)
35
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=False)
36
+ elif method == "bias_shift":
37
+ self.alpha = torch.nn.Parameter(torch.ones(1), requires_grad=False)
38
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
39
+ elif method == "matrix_scaling":
40
+ self.alpha = torch.nn.Parameter(torch.eye(self.num_classes), requires_grad=True)
41
+ self.beta = torch.nn.Parameter(torch.zeros(self.num_classes), requires_grad=True)
42
+ else:
43
+ raise ValueError(f"Invalid method: {method}")
44
+
45
+ def forward(self, logits):
46
+ if self.method != "matrix_scaling":
47
+ return logits * self.alpha + self.beta
48
+ return logits @ self.alpha.T + self.beta
49
+
50
+
51
+
52
+ def main(
53
+ output_dir: str = 'output',
54
+ log_dir: str = 'output/logs',
55
+ checkpoint_dir: str = 'output/checkpoints',
56
+ train_logits: str = 'logits.csv',
57
+ train_labels: str = 'labels.csv',
58
+ predict_logits: str = 'logits.csv',
59
+ predict_labels: str = 'labels.csv',
60
+ method: Literal["dp_calibration", "temp_scaling", "bias_only"] = "dp_calibration",
61
+ learning_rate: float = 1e-3,
62
+ tolerance: float = 1e-4,
63
+ max_ls: int = 100,
64
+ seed: int = 0,
65
+ ):
66
+ torch.set_float32_matmul_precision("high")
67
+ output_dir = Path(output_dir)
68
+ checkpoint_dir = Path(checkpoint_dir)
69
+
70
+ # Load train data
71
+ train_logits = torch.log_softmax(torch.from_numpy(pd.read_csv(train_logits, index_col=0, header=None).values).float(), dim=1)
72
+ train_labels = torch.from_numpy(pd.read_csv(train_labels, index_col=0, header=None).values.flatten()).long()
73
+
74
+ # Load predict data
75
+ df_predict_logits = pd.read_csv(predict_logits, index_col=0, header=None)
76
+ predict_logits = torch.log_softmax(torch.from_numpy(df_predict_logits.values).float(), dim=1)
77
+ df_predict_labels = pd.read_csv(predict_labels, index_col=0, header=None)
78
+ predict_labels = torch.from_numpy(df_predict_labels.values.flatten()).long()
79
+
80
+ num_classes = train_logits.shape[1]
81
+ model = AffineCalibrator(method=method, num_classes=num_classes)
82
+ state = fit(model, train_logits, train_labels, log_dir, tolerance, learning_rate, max_ls)
83
+ torch.save(state, checkpoint_dir / 'state.ckpt')
84
+ model.load_state_dict(state['best_model'])
85
+
86
+ # Predict
87
+ cal_logits = predict(model, predict_logits)
88
+
89
+ # Save results
90
+ pd.DataFrame(cal_logits, index=df_predict_logits.index).to_csv(output_dir / 'logits.csv', index=True, header=False)
91
+ df_predict_labels.to_csv(output_dir / 'labels.csv', index=True, header=False)
92
+
93
+
94
+ def fit(model, logits, labels, log_dir, tolerance, learning_rate, max_ls):
95
+
96
+ # Train model
97
+ optimizer = LBFGS(
98
+ params=(param for param in model.parameters() if param.requires_grad),
99
+ lr=learning_rate,
100
+ max_iter=max_ls,
101
+ tolerance_change=tolerance,
102
+ )
103
+ train_dataset = TensorDataset(logits, labels)
104
+ train_loader = DataLoader(
105
+ train_dataset,
106
+ batch_size=len(train_dataset),
107
+ shuffle=False,
108
+ )
109
+ val_dataset = TensorDataset(logits, labels)
110
+ val_loader = DataLoader(
111
+ val_dataset,
112
+ batch_size=len(val_dataset),
113
+ shuffle=False,
114
+ )
115
+ state = _fit(model, optimizer, train_loader, val_loader, log_dir, float('inf'), tolerance, 10)
116
+ return state
117
+
118
+ @torch.no_grad()
119
+ def validate(model, val_loader):
120
+ logits, labels = next(iter(val_loader))
121
+ cal_logits = model(logits)
122
+ loss = F.cross_entropy(cal_logits, labels)
123
+ er = (cal_logits.argmax(dim=1) != labels).float().mean().item()
124
+ return loss.item(), er
125
+
126
+ def _fit(model, optimizer, train_loader, val_loader, log_dir, max_step_count, tolerance=1e-4, patience=10):
127
+
128
+ model.train()
129
+ loggers = [
130
+ TBLogger(log_dir),
131
+ CSVLogger(log_dir),
132
+ ]
133
+ logits, labels = next(iter(train_loader))
134
+ priors = torch.bincount(labels, minlength=logits.shape[1]).float() / len(labels)
135
+ priors_ce = -torch.log(priors[labels]).mean().item()
136
+ if priors_ce == 0:
137
+ priors_ce = 1.
138
+ priors_er = (priors.argmax() != labels).float().mean().item()
139
+ if priors_er == 0:
140
+ priors_er = 1.
141
+
142
+ state = {
143
+ 'last_model': model.state_dict(),
144
+ 'best_model': model.state_dict(),
145
+ 'best_val_loss': float('inf'),
146
+ 'step_count': 0,
147
+ 'best_step_count': 0,
148
+ 'patience': 0,
149
+ }
150
+ should_stop = False
151
+ while not should_stop:
152
+
153
+ logits, labels = next(iter(train_loader))
154
+ def closure():
155
+ optimizer.zero_grad()
156
+ cal_logits = model(logits)
157
+ loss = F.cross_entropy(cal_logits, labels)
158
+ er = (cal_logits.argmax(dim=1) != labels).float().mean().item()
159
+ for logger in loggers:
160
+ logger.log_metrics({
161
+ "train/NCE": loss.item() / priors_ce,
162
+ "train/NER": er / priors_er,
163
+ }, step=state['step_count'])
164
+ loss.backward()
165
+ state['step_count'] += 1
166
+ return loss
167
+
168
+ optimizer.step(closure)
169
+
170
+ val_loss, val_er = validate(model, val_loader)
171
+ norm_val_loss = val_loss / priors_ce
172
+ for logger in loggers:
173
+ logger.log_metrics({
174
+ "val/NCE": norm_val_loss,
175
+ "val/NER": val_er / priors_er,
176
+ }, step=state['step_count'])
177
+
178
+ if (state['best_val_loss'] - norm_val_loss) / norm_val_loss <= tolerance:
179
+ if patience is not None:
180
+ if state['patience'] >= patience:
181
+ should_stop = True
182
+ state['patience'] += 1
183
+ else:
184
+ state['best_model'] = model.state_dict()
185
+ state['best_val_loss'] = norm_val_loss
186
+ state['best_step_count'] = state['step_count']
187
+
188
+ state['last_model'] = model.state_dict()
189
+ should_stop = should_stop or state['step_count'] >= max_step_count
190
+
191
+ return state
192
+
193
+ @torch.no_grad()
194
+ def predict(model, logits):
195
+ model.eval()
196
+ cal_logits = model(logits)
197
+ cal_logits = torch.log_softmax(cal_logits, dim=1).numpy()
198
+ return cal_logits
199
+
200
+
201
+ if __name__ == '__main__':
202
+ from fire import Fire
203
+ Fire(main)
src/llmcal/scripts/affine_prediction.py ADDED
@@ -0,0 +1,36 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+ from .affine_calibration import AffineCalibrator, predict
4
+
5
+ import pandas as pd
6
+ import torch
7
+
8
+ def main(
9
+ checkpoint_path: str,
10
+ method: str,
11
+ predict_logits: str,
12
+ predict_labels: str,
13
+ output_dir: str = 'output',
14
+ ):
15
+ # Load logits
16
+ predict_logits = torch.log_softmax(torch.from_numpy(pd.read_csv(predict_logits, index_col=0, header=None).values).float(), dim=1)
17
+ df_predict_labels = pd.read_csv(predict_labels, index_col=0, header=None)
18
+
19
+ # Load model
20
+ model = AffineCalibrator(method=method, num_classes=predict_logits.shape[1])
21
+ state = torch.load(checkpoint_path, weights_only=False)
22
+ model.load_state_dict(state['best_model'])
23
+
24
+ # Predict
25
+ cal_logits = predict(model, predict_logits)
26
+
27
+ # Save
28
+ output_dir = Path(output_dir)
29
+ pd.DataFrame(cal_logits, index=df_predict_labels.index).to_csv(output_dir / 'logits.csv', index=True, header=False)
30
+ df_predict_labels.to_csv(output_dir / 'labels.csv', index=True, header=False)
31
+
32
+
33
+
34
+ if __name__ == "__main__":
35
+ from fire import Fire
36
+ Fire(main)
src/llmcal/scripts/compare_models.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+
4
+ from matplotlib import pyplot as plt
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from ..src.utils import load_yaml
9
+ from .results_vs_samples import DATASETS, metric2name, process_data, plot_metric_vs_samples
10
+
11
+ method2style = {
12
+ "no_adaptation": "-",
13
+ "lora_1.0_no_es": "-",
14
+ "lora_1.0_no_es_plus_tempscaling": "--",
15
+ "dp_calibration": "-.",
16
+ "temp_scaling": "-.",
17
+ "vector_scaling": "-.",
18
+ }
19
+
20
+ model2style = {
21
+ "llama3.2-1b-instruct": "-",
22
+ "qwen2.5-7b-instruct": "--",
23
+ }
24
+
25
+ model2name = {
26
+ "llama3.2-1b-instruct": "LLama3.2-1B",
27
+ "qwen2.5-7b-instruct": "Qwen2.5-7B",
28
+ }
29
+
30
+ def main(
31
+ datasets,
32
+ metrics,
33
+ sizes,
34
+ methods_config,
35
+ output_path,
36
+ models,
37
+ results_dirs,
38
+ intervals,
39
+ methods,
40
+ ):
41
+ datasets = list(map(str, datasets.split()))
42
+ sizes = list(map(int, sizes.split()))
43
+ models = list(map(str, models.split()))
44
+ methods = list(map(str, methods.split()))
45
+ methods_config = load_yaml(methods_config)
46
+ metrics = list(map(str, metrics.split()))
47
+ output_path = Path(output_path)
48
+ output_dir = output_path.parent
49
+ results_dirs = list(map(Path, results_dirs.split()))
50
+
51
+ fig, axs = plt.subplots(len(metrics), len(datasets), figsize=(6 * len(datasets), 12))
52
+ processed_data = {}
53
+ custom_handles = []
54
+ all_data = []
55
+ for i, (model, results_dir) in enumerate(zip(models, results_dirs)):
56
+ for method in methods:
57
+ # methods_config[method]["color"] = f"C{i}"
58
+ methods_config[method]["linestyle"] = model2style[model]
59
+ for ax, metric in zip(axs,metrics):
60
+ data = pd.read_json(results_dir / f"{metric}.jsonl", orient='records', lines=True)
61
+ processed_data[metric] = data
62
+ data = process_data(data, datasets, sizes, methods)
63
+ plot_metric_vs_samples(ax, data, methods, methods_config, datasets, sizes, intervals=intervals, pos=i/10, no_adaptation="text", modelname_noa=model2name[model], fontsize_noa=16)
64
+ data["model"] = model
65
+ data["metric"] = metric
66
+ all_data.append(data)
67
+ data.to_csv(output_dir / f"{metric}.csv", index=False)
68
+ ax[0].set_ylabel(f"{metric2name[metric]}", fontsize=22)
69
+
70
+ all_data = pd.concat(all_data)
71
+ for j, dataset in enumerate(datasets):
72
+ axs[0,j].set_title(DATASETS[dataset]["name"], fontsize=22)
73
+ for ax, metric in zip(axs,metrics):
74
+ min_y = all_data.loc[
75
+ (all_data["dataset"] == dataset) & \
76
+ (all_data["metric"] == metric) & \
77
+ (all_data["method"].isin(set(methods) - {"no_adaptation"})),"median"].min()
78
+ max_y = all_data.loc[
79
+ (all_data["dataset"] == dataset) & \
80
+ (all_data["metric"] == metric) & \
81
+ (all_data["method"].isin(set(methods) - {"no_adaptation"})),"median"].max()
82
+ ax[j].set_ylim(min_y*0.99, max_y*1.2)
83
+ ax[j].set_yticks(np.round(ax[j].get_yticks(),3))
84
+ ax[j].set_yticklabels(ax[j].get_yticks(), fontsize=16)
85
+ ax[j].grid(axis="y")
86
+
87
+
88
+ fig.text(0.5, 0.04, 'Number of train samples', ha='center', fontsize=22)
89
+
90
+ # Gather handles and labels from all axes
91
+ custom_handles = []
92
+ for i, model in enumerate(models):
93
+ custom_handles.append(
94
+ plt.Line2D([0], [0], color="black", linestyle=model2style[model], label=model2name[model])
95
+ )
96
+ for method in methods:
97
+ if method == "no_adaptation":
98
+ continue
99
+ custom_handles.append(
100
+ plt.Line2D([0], [0], color=methods_config[method]["color"], linestyle="none", marker="o", markersize=10, label=methods_config[method]["label"])
101
+ )
102
+ # fig.legend(handles=custom_handles, loc='upper right', bbox_to_anchor=(1.08, .95), title_fontsize=24, fontsize=22)
103
+ fig.legend(handles=custom_handles, loc='lower center', bbox_to_anchor=(0.5, -0.1), fontsize=24, ncol=4)
104
+
105
+ plt.savefig(output_path, bbox_inches="tight", dpi=300)
106
+ plt.close(fig)
107
+
108
+ if __name__ == "__main__":
109
+ from fire import Fire
110
+ Fire(main)
src/llmcal/scripts/compute_matched_results.py ADDED
@@ -0,0 +1,309 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+
4
+ import pandas as pd
5
+ from tqdm import tqdm
6
+
7
+ from ..src.evaluation.metrics import compute_psr_with_mincal
8
+
9
+
10
+ METHODS = [
11
+ "no_adaptation",
12
+ "dpcal",
13
+ "tempscaling",
14
+ "vectorscaling",
15
+ "biasshift",
16
+ "finetunne_lora",
17
+ "lora_plus_dpcal",
18
+ "lora_plus_tempscaling",
19
+ "lora_plus_biasshfit",
20
+ "lora_plus_vectorscaling",
21
+ ]
22
+
23
+
24
+ def read_finetuning_results(root_results_dir: Path):
25
+ data = []
26
+ for dataset_dir in root_results_dir.iterdir():
27
+ train_dataset = dataset_dir.name
28
+ for size_dir in dataset_dir.iterdir():
29
+ size = int(size_dir.name.split("=")[1])
30
+ for seed_dir in size_dir.iterdir():
31
+ seed = int(seed_dir.name.split("=")[1])
32
+ for method in seed_dir.iterdir():
33
+ method_name = method.name
34
+ for train_lst in method.iterdir():
35
+ train_lst_name = train_lst.name
36
+ for val_lst in train_lst.iterdir():
37
+ val_lst_name = val_lst.name
38
+ for test_dataset_dir in val_lst.iterdir():
39
+ if not test_dataset_dir.name.startswith("test="):
40
+ continue
41
+ test_dataset = test_dataset_dir.name.split("=")[1]
42
+ for test_lst in test_dataset_dir.iterdir():
43
+ if not test_lst.name.startswith("list=test"):
44
+ continue
45
+ test_lst_name = test_lst.name.split("=")[1]
46
+ if not (logits_path := test_lst / "logits.csv").exists():
47
+ continue
48
+ if not (labels_path := test_lst / "labels.csv").exists():
49
+ continue
50
+ data.append({
51
+ "train_dataset": train_dataset,
52
+ "size": size,
53
+ "seed": seed,
54
+ "method": method_name,
55
+ "train_lst": train_lst_name,
56
+ "val_lst": val_lst_name,
57
+ "cal_lst": None,
58
+ "test_dataset": test_dataset,
59
+ "test_lst": test_lst_name,
60
+ "logits": logits_path,
61
+ "labels": labels_path,
62
+ })
63
+
64
+ return pd.DataFrame(data)
65
+
66
+ def read_lora_plus_calibration_results(root_results_dir: Path):
67
+ data = []
68
+ for dataset_dir in root_results_dir.iterdir():
69
+ train_dataset = dataset_dir.name
70
+ for size_dir in dataset_dir.iterdir():
71
+ size = int(size_dir.name.split("=")[1])
72
+ for seed_dir in size_dir.iterdir():
73
+ seed = int(seed_dir.name.split("=")[1])
74
+ for method in seed_dir.iterdir():
75
+ method_name = method.name
76
+ for train_lst in method.iterdir():
77
+ train_lst_name = train_lst.name
78
+ for val_lst in train_lst.iterdir():
79
+ val_lst_name = val_lst.name
80
+ for cal_lst in val_lst.iterdir():
81
+ cal_lst_name = cal_lst.name
82
+ for test_dataset_dir in cal_lst.iterdir():
83
+ if not test_dataset_dir.name.startswith("test="):
84
+ continue
85
+ test_dataset = test_dataset_dir.name.split("=")[1]
86
+ for test_lst in test_dataset_dir.iterdir():
87
+ if not test_lst.name.startswith("list=test"):
88
+ continue
89
+ test_lst_name = test_lst.name.split("=")[1]
90
+ if not (logits_path := test_lst / "logits.csv").exists():
91
+ continue
92
+ if not (labels_path := test_lst / "labels.csv").exists():
93
+ continue
94
+ data.append({
95
+ "train_dataset": train_dataset,
96
+ "size": size,
97
+ "seed": seed,
98
+ "method": method_name,
99
+ "train_lst": train_lst_name,
100
+ "val_lst": val_lst_name,
101
+ "cal_lst": cal_lst_name,
102
+ "test_dataset": test_dataset,
103
+ "test_lst": test_lst_name,
104
+ "logits": logits_path,
105
+ "labels": labels_path,
106
+ })
107
+
108
+ return pd.DataFrame(data)
109
+
110
+
111
+
112
def read_calibration_results(root_results_dir: Path):
    """Walk a calibration-only results tree and collect test predictions.

    Same layout as the LoRA+calibration tree but without the calibration
    list level; the `cal_lst` column is therefore always None.
    """
    rows = []
    for ds_dir in root_results_dir.iterdir():
        for size_dir in ds_dir.iterdir():
            for seed_dir in size_dir.iterdir():
                for method_dir in seed_dir.iterdir():
                    for train_dir in method_dir.iterdir():
                        for val_dir in train_dir.iterdir():
                            for test_ds_dir in val_dir.iterdir():
                                if not test_ds_dir.name.startswith("test="):
                                    continue
                                for lst_dir in test_ds_dir.iterdir():
                                    if not lst_dir.name.startswith("list=test"):
                                        continue
                                    logits_path = lst_dir / "logits.csv"
                                    labels_path = lst_dir / "labels.csv"
                                    # Skip incomplete runs (missing predictions).
                                    if not (logits_path.exists() and labels_path.exists()):
                                        continue
                                    rows.append({
                                        "train_dataset": ds_dir.name,
                                        "size": int(size_dir.name.split("=")[1]),
                                        "seed": int(seed_dir.name.split("=")[1]),
                                        "method": method_dir.name,
                                        "train_lst": train_dir.name,
                                        "val_lst": val_dir.name,
                                        "cal_lst": None,
                                        "test_dataset": test_ds_dir.name.split("=")[1],
                                        "test_lst": lst_dir.name.split("=")[1],
                                        "logits": logits_path,
                                        "labels": labels_path,
                                    })
    return pd.DataFrame(rows)
153
+
154
+
155
def read_no_adaptation_results(root_results_dir: Path):
    """Collect baseline (no-adaptation) test predictions.

    Layout: <root>/<dataset>/size=all/seed=all/test=<ds>/list=test*/...
    Size and seed are reported as the literal string "all".
    """
    rows = []
    for ds_dir in root_results_dir.iterdir():
        for test_ds_dir in (ds_dir / "size=all/seed=all").iterdir():
            if not test_ds_dir.name.startswith("test="):
                continue
            for lst_dir in test_ds_dir.iterdir():
                if not lst_dir.name.startswith("list=test"):
                    continue
                logits_path = lst_dir / "logits.csv"
                labels_path = lst_dir / "labels.csv"
                # Skip incomplete runs (missing predictions).
                if not (logits_path.exists() and labels_path.exists()):
                    continue
                rows.append({
                    "train_dataset": ds_dir.name,
                    "size": "all",
                    "seed": "all",
                    "method": "no_adaptation",
                    "train_lst": None,
                    "val_lst": None,
                    "cal_lst": None,
                    "test_dataset": test_ds_dir.name.split("=")[1],
                    "test_lst": lst_dir.name.split("=")[1],
                    "logits": logits_path,
                    "labels": labels_path,
                })
    return pd.DataFrame(rows)
186
+
187
def compute_metrics(data, metric):
    """Evaluate `metric` on every row's stored logits/labels CSV files.

    Adds `result` and `min_result` (post-calibration minimum) columns
    and drops the file-path columns afterwards.
    """
    out = data.copy()
    for idx, row in tqdm(data.iterrows(), total=len(data)):
        logits = pd.read_csv(row["logits"], index_col=0, header=None).values.astype(float)
        labels = pd.read_csv(row["labels"], index_col=0, header=None).values.flatten().astype(int)
        result, min_result = compute_psr_with_mincal(logits, labels, metric, "none")
        out.loc[idx, "result"] = result
        out.loc[idx, "min_result"] = min_result
    return out.drop(columns=["logits", "labels"])
197
+
198
+
199
def extract_method(row):
    """Map (method_type, method, train_lst) to a canonical method name.

    For LoRA variants the train-list range "s-e" encodes the fraction of
    data used for training (e - s), and "_no_es" marks runs without
    early stopping.
    """
    mtype = row["method_type"]
    if mtype == "no_adaptation":
        return mtype
    if mtype == "calibration":
        return row["method"]

    lora_plus_types = (
        "lora_plus_dpcal", "lora_plus_tempscaling", "lora_plus_biasshift",
        "lora_plus_vectorscaling", "lora_plus_dpcal_trainontest",
        "lora_plus_tempscaling_trainontest", "lora_plus_dpcal_naive",
        "lora_plus_tempscaling_naive",
    )
    if mtype == "finetune_lora" or mtype in lora_plus_types:
        start, end = map(float, row["train_lst"].split("-"))
        name = f"lora_{end - start:.1f}"
        if row["method"] == "lora_ans_no_es":
            name += "_no_es"
        if mtype in lora_plus_types:
            name += "_plus_" + mtype.split("_plus_")[1]
        return name

    raise ValueError(f"Unknown method: {row['method_type']}, {row['method']}")
223
+
224
+
225
+
226
def process_data(data, reduced = False):
    """Filter matched-domain rows, resolve method names, and tidy columns.

    Keeps only runs evaluated on their own training dataset, selects the
    relevant test list(s), expands the method identifier via
    `extract_method`, and returns the canonical column order.
    """
    # Keep only runs evaluated on their own training dataset.
    matched = data[data["train_dataset"] == data["test_dataset"]]
    matched = matched.drop(columns=["train_dataset"])
    matched = matched.rename(columns={"test_dataset": "dataset"})

    # Reduced runs use the "test_*" subset lists; otherwise the single
    # full "test" list.
    if reduced:
        matched = matched[matched["test_lst"].str.startswith("test_")]
    else:
        matched = matched[matched["test_lst"] == "test"]
    matched = matched.drop(columns=["test_lst"])

    # Expand the method identifier into its full description.
    matched["method"] = matched.apply(extract_method, axis=1)
    matched = matched.drop(columns=["method_type", "train_lst", "val_lst", "cal_lst"])

    # Canonical column order.
    return matched.loc[:, ["dataset", "method", "size", "seed", "result", "min_result"]]
248
+
249
+
250
+
251
def main(
    metric: str,
    finetuning_root_results_dirs: str = None,
    lora_plus_cal_root_results_dirs: str = None,
    lora_plus_cal_naive_root_results_dirs: str = None,
    cal_root_results_dirs: str = None,
    trainontest_root_results_dirs: str = None,
    no_adaptation_root_results_dirs: str = None,
    output_path: str = "outputs",
    reduced: bool = False,
):
    """Collect every result tree, score it with `metric`, save as JSONL.

    Each `*_root_results_dirs` argument is a comma-separated list of
    result roots; the method type of a root is taken from its parent
    directory name.
    """

    def _to_paths(dirs):
        # Comma-separated CLI argument -> list of Paths (empty when unset).
        return [Path(d) for d in dirs.split(",")] if dirs is not None else []

    all_data = []
    for root in _to_paths(finetuning_root_results_dirs):
        df = read_finetuning_results(root)
        df["method_type"] = str(root).split("/")[-2]
        all_data.append(df)
    for root in _to_paths(cal_root_results_dirs):
        df = read_calibration_results(root)
        df["method_type"] = str(root).split("/")[-2]
        all_data.append(df)
    for root in _to_paths(lora_plus_cal_root_results_dirs):
        df = read_lora_plus_calibration_results(root)
        df["method_type"] = str(root).split("/")[-2]
        all_data.append(df)
    for root in _to_paths(lora_plus_cal_naive_root_results_dirs):
        df = read_lora_plus_calibration_results(root)
        df["method_type"] = str(root).split("/")[-2]
        all_data.append(df)
    for root in _to_paths(trainontest_root_results_dirs):
        df = read_lora_plus_calibration_results(root)
        df["method_type"] = str(root).split("/")[-2]
        all_data.append(df)
    for root in _to_paths(no_adaptation_root_results_dirs):
        df = read_no_adaptation_results(root)
        df["method_type"] = "no_adaptation"
        all_data.append(df)

    data = pd.concat(all_data, ignore_index=True)

    # Score every run, then normalize to the final output schema.
    data_with_metrics = compute_metrics(data, metric)
    data_with_metrics = process_data(data_with_metrics, reduced)
    data_with_metrics.to_json(output_path, orient="records", lines=True)


if __name__ == "__main__":
    from fire import Fire
    Fire(main)
src/llmcal/scripts/create_lists_new.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import os
3
+ from pathlib import Path
4
+ import numpy as np
5
+ import yaml
6
+ from tqdm import tqdm
7
+
8
# Dataset name -> number of classes.
DATASETS = {"sst2": 2, "agnews": 4, "dbpedia": 14, "20newsgroups": 20, "banking77": 77}
# Dataset name -> size of the reduced test list copied below.
TEST_SAMPLES = {"sst2": 400, "agnews": 400, "dbpedia": 700, "20newsgroups": 800, "banking77": 1000}
# Number of random list splits generated per dataset/size.
N_SEEDS = 10
# Nominal adaptation sizes; converted to per-dataset sample counts in main().
FACTORS = [8, 16, 32, 64, 128, 256, 512]
12
+
13
def main():
    """Regenerate train/val/calibration index lists per dataset/size/seed.

    Existing llmcal2 splits are reused when present so lists stay
    comparable across projects; otherwise a fresh seeded permutation of
    the full train list is drawn.
    """
    rs = np.random.RandomState(8364)  # NOTE(review): unused; kept for parity with original
    for dataset in tqdm(DATASETS):
        n_classes = DATASETS[dataset]
        for factor in FACTORS:
            # Map the nominal size to a sample count: round
            # factor/log2(C) to the nearest power of two, times C.
            per_class = 2 ** np.round(np.log2(factor / np.log2(n_classes)))
            n_samples = int(per_class * n_classes)

            for seed in range(N_SEEDS):
                out_dir = f"lists/{dataset}/size={factor}/seed={seed}"
                os.makedirs(out_dir, exist_ok=True)

                full_trainlist = np.loadtxt(f"../llmcal2/lists/{dataset}/train.txt", dtype=int)
                train_path = Path(f"../llmcal2/lists/{dataset}/train_{n_samples}_0.3_{seed}.txt")
                val_path = Path(f"../llmcal2/lists/{dataset}/val_{n_samples}_0.3_{seed}.txt")
                if train_path.exists() and val_path.exists():
                    # Reuse the existing train/val split.
                    samples = np.hstack([
                        np.loadtxt(train_path, dtype=int),
                        np.loadtxt(val_path, dtype=int),
                    ])
                else:
                    seed_rs = np.random.RandomState(2834 + seed)
                    samples = seed_rs.permutation(full_trainlist)[:n_samples]

                # Split points: last 30% is the held-out chunk, first 30%
                # the reduced training chunk (int truncation kept as-is).
                cut70 = n_samples - int(n_samples * 0.3)
                cut30 = n_samples - int(n_samples * 0.7)
                np.savetxt(f"lists/{dataset}/size={factor}/seed={seed}/0.0-0.7.txt", samples[:cut70], fmt="%d")
                np.savetxt(f"lists/{dataset}/size={factor}/seed={seed}/0.7-1.0.txt", samples[cut70:], fmt="%d")
                np.savetxt(f"lists/{dataset}/size={factor}/seed={seed}/0.0-0.3.txt", samples[:cut30], fmt="%d")
                np.savetxt(f"lists/{dataset}/size={factor}/seed={seed}/0.0-1.0.txt", samples, fmt="%d")

    # Copy the canonical train/test lists verbatim.
    for dataset in tqdm(DATASETS):
        full_train_list = np.loadtxt(f"../llmcal2/lists/{dataset}/train.txt", dtype=int)
        np.savetxt(f"lists/{dataset}/train.txt", full_train_list, fmt="%d")
        full_test_list = np.loadtxt(f"../llmcal2/lists/{dataset}/test.txt", dtype=int)
        np.savetxt(f"lists/{dataset}/test.txt", full_test_list, fmt="%d")
        partial_test_list = np.loadtxt(f"../llmcal2/lists/{dataset}/test_{TEST_SAMPLES[dataset]}.txt", dtype=int)
        np.savetxt(f"lists/{dataset}/test_{TEST_SAMPLES[dataset]}.txt", partial_test_list, fmt="%d")


if __name__ == "__main__":
    main()
src/llmcal/scripts/evals.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from datasets import load_dataset
3
+ import numpy as np
4
+ from scipy.special import log_softmax
5
+
6
def compute_nce(scores, labels):
    """Return (cross-entropy, normalized cross-entropy) of scores vs labels.

    NCE divides the model's CE by the CE of the empirical label prior, so
    a value of 1.0 means "no better than predicting class frequencies".
    """
    log_posteriors = log_softmax(scores, axis=1)
    ce = -np.mean(log_posteriors[np.arange(len(labels)), labels])
    prior = np.bincount(labels) / len(labels)
    prior_ce = -np.log(prior[labels]).mean()
    return ce, ce / prior_ce
13
+
14
def compute_ner(scores, labels):
    """Return (error rate, normalized error rate) of argmax predictions.

    NER divides the classifier's error rate by the error rate of the
    trivial majority-class predictor, so 1.0 means "no better than always
    predicting the most frequent label".
    """
    preds = np.argmax(scores, axis=1)
    er = np.mean(preds != labels)
    majority = np.bincount(labels).argmax()
    # Fix: the naive baseline's error is how often the TRUE labels differ
    # from the majority class. The original compared the model's
    # predictions to the majority class, which is not a baseline error
    # rate and mis-scales NER whenever preds != labels.
    naive_er = np.mean(labels != majority)
    ner = er / naive_er
    return er, ner
20
+
21
+
22
+
23
def main():
    """Score the Llama MMLU eval logs with normalized CE / error metrics."""
    data = load_dataset("meta-llama/Llama-3.2-1B-evals", "Llama-3.2-1B-evals__mmlu__details", split="latest")
    # data = load_dataset("meta-llama/Llama-3.1-405B-evals", "Llama-3.1-405B-evals__mmlu__details", split="latest")
    choices = ["A", "B", "C", "D"]
    # Keep only the per-choice scores and the correct-answer column.
    df = (
        data.select_columns(["output_choice_negative_log_likelihoods", "input_correct_responses"])
        .to_pandas()
        .rename(columns={"output_choice_negative_log_likelihoods": "score", "input_correct_responses": "label"})
    )
    # NOTE(review): scores are negative log-likelihoods, but compute_nce /
    # compute_ner treat larger as better — confirm the sign convention.
    scores = np.vstack(df["score"].apply(lambda x: np.array(x["raw"])))
    # Labels presumably look like "Answer: X"; keep the letter after the space.
    labels = df["label"].apply(lambda x: choices.index(x[0].split(" ")[1])).astype(int).values.flatten()
    ce, nce = compute_nce(scores, labels)
    er, ner = compute_ner(scores, labels)
    goodness = nce * ner
    print(f"NCE: {nce}")
    print(f"CE: {ce}")
    print(f"NER: {ner}")
    print(f"ER: {er}")
    print(f"Goodness: {goodness}")

if __name__ == "__main__":
    main()
src/llmcal/scripts/prepare_data.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+ import numpy as np
4
+ import pandas as pd
5
+ from ..src.utils import load_yaml
6
+ from ..src.prompts import *
7
+
8
def load_dataset(dataset_path):
    """Read a dataset CSV (first column is the index) into a DataFrame."""
    return pd.read_csv(dataset_path, header=0, index_col=0)
10
+
11
def load_shots(dataset, shots_list, answers):
    """Load few-shot examples for the rows listed in `shots_list`.

    Returns [] when no list file is given; otherwise the selected rows as
    record dicts, with numeric labels replaced by their answer strings.
    """
    if shots_list is None:
        return []
    indices = np.loadtxt(shots_list, dtype=int)
    selected = dataset.loc[indices]
    selected["label"] = selected["label"].apply(lambda lbl: answers[lbl])
    return selected.to_dict(orient="records")
18
+
19
+
20
def select_prompt(model):
    """Return the prompt class for the model family named in `model`.

    Matches by substring; raises ValueError for unrecognized names.
    """
    if "llama3" in model:
        return Llama3Prompt
    if "gemma" in model:
        return GemmaPrompt
    if "qwen" in model:
        return QwenPrompt
    if "tinyllama" in model:
        return TinyLlamaPrompt
    if "phi3" in model:
        return Phi3Prompt
    if "pythia" in model:
        return PythiaPrompt
    raise ValueError(f"Unknown model: {model}")
35
+
36
+
37
def main(dataset_path, prompt_template, model, output_path, shots_list=None, max_characters=400):
    """Render prompts for every dataset row and dump them as JSONL.

    Output records have fields idx, prompt, answer (the full answer-template
    list) and label.
    """
    dataset_path = Path(dataset_path)
    prompt_template = Path(prompt_template)
    output_path = Path(output_path)

    dataset = load_dataset(dataset_path)

    # Build the prompt object: template + few-shot examples + model format.
    config = load_yaml(prompt_template)
    template = config["prompt_template"]
    answers = config["answers_templates"]
    shots = load_shots(dataset, shots_list, answers)
    prompt = select_prompt(model)(max_characters=max_characters)
    prompt.fit(template, shots)

    # Every row carries the whole answer list plus its rendered prompt.
    dataset["answer"] = [answers for _ in range(len(dataset))]
    dataset["prompt"] = prompt.apply(dataset["text"])
    dataset = (
        dataset.loc[:, ["prompt", "answer", "label"]]
        .reset_index(drop=False)
        .rename(columns={"index": "idx"})
    )

    dataset.to_json(output_path, orient="records", lines=True)


if __name__ == "__main__":
    from fire import Fire
    Fire(main)
src/llmcal/scripts/results_bars.py ADDED
@@ -0,0 +1,184 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+ from typing import List
4
+ import numpy as np
5
+ import pandas as pd
6
+ import matplotlib.pyplot as plt
7
+ from ..src.utils import load_yaml
8
+ from .results_vs_samples import compute_num_samples
9
+
10
+
11
# Display name and class count for each supported dataset (keyed by the
# identifier used in result files).
DATASETS = {
    "sst2": {"name": "SST-2", "num_classes": 2},
    "agnews": {"name": "AGNews", "num_classes": 4},
    "dbpedia": {"name": "DBPedia", "num_classes": 14},
    "20newsgroups": {"name": "20 Newsgroups", "num_classes": 20},
    "banking77": {"name": "Banking77", "num_classes": 77},
}

# Metric identifier -> axis/display label.
metric2name = {
    "nce": "NCE",
    "ner": "NER",
}
23
+
24
def read_data(results_dir: Path, metrics: List[str]):
    """Load one JSONL results file per metric and merge them.

    Each file has columns [dataset, method, size, seed, result, min_result];
    the merged frame has one named column per metric instead of `result`.
    """
    frames = []
    for metric in metrics:
        frame = (
            pd.read_json(results_dir / f"{metric}.jsonl", orient='records', lines=True)
            .rename(columns={"result": f"{metric}"})
            .drop(columns=["min_result"])
        )
        frames.append(frame)

    # Outer-merge on the run identity so partially missing metrics survive.
    merged = frames[0]
    for frame in frames[1:]:
        merged = merged.merge(frame, on=["dataset", "method", "size", "seed"], how="outer")
    return merged
37
+
38
def plot_bars(data, methods_config, output_path, datasets, metrics, methods, sizes, no_adaptation="auto", fontsize_noa=22, pos=0):
    """Grouped bar chart of per-method metric medians with IQR error bars.

    One row of axes per metric, one column per dataset; within each panel
    one bar group per size and one bar per adapted method. The
    no-adaptation baseline is drawn as a horizontal line when its value
    fits inside the panel's y-range, otherwise printed as text
    (`no_adaptation` in {"plot", "text", "auto"} controls this). The
    figure is written to `output_path`.
    """
    fig, ax = plt.subplots(len(metrics), len(datasets), figsize=(6 * len(datasets), 12), sharex=False)

    # Separate the baseline rows from the adapted-method rows.
    data_adapted = data.loc[data["method"] != "no_adaptation"]
    adapted_methods = [m for m in methods if m != "no_adaptation"]
    data_no_adapt = data.loc[data["method"] == "no_adaptation"]

    # (i_ax, j_ax) remembers the last panel that drew the baseline line so
    # the legend below can pull its handles from a populated axes object.
    i_ax, j_ax = 0, 0
    n_methods = len(methods) + 1
    medians = {}  # metric -> all plotted medians, used later for y-limits
    for i, metric in enumerate(metrics):
        medians[metric] = []
        for j, dataset in enumerate(datasets):
            # plot bar groups. One group per size. Each bar is a method
            for k, method in enumerate(adapted_methods):
                for s, size in enumerate(sizes):
                    method_data = data_adapted.loc[data_adapted["method"] == method]
                    method_data = method_data.loc[method_data["dataset"] == dataset]
                    method_data = method_data.loc[method_data["size"] == size]
                    # Median / quartiles across seeds for this (method, size).
                    median = method_data.groupby("size")[metric].median().values[0]
                    medians[metric].append(median.max())
                    q1 = method_data.groupby("size")[metric].quantile(0.25).values[0]
                    q3 = method_data.groupby("size")[metric].quantile(0.75).values[0]
                    # alpha = 0.5 if "SFT+PHC" in methods_config[method]["label"] else 1.0
                    # alpha = 0.7 if "SFT " in methods_config[method]["label"] else 1.0
                    # if "SFT " in methods_config[method]["label"]:
                    #     hatch = "x"
                    # elif "SFT+PHC" in methods_config[method]["label"]:
                    #     hatch = "/"
                    # else:
                    #     hatch = None
                    alpha = None
                    hatch = None
                    # Asymmetric error bars from the interquartile range.
                    ax[i, j].bar(s + k / n_methods, median, yerr=[[median - q1], [q3 - median]], label=methods_config[method]["label"], width=0.8 / n_methods, color=methods_config[method]["color"], alpha=alpha, hatch=hatch)
                    # ax[i,j].set_xticks(range(len(sizes)))

    # plot no adaptation
    for i, metric in enumerate(metrics):
        # Cap the panel height at 1.4 or 5% above the tallest bar.
        y_max = np.round(min(1.4, 1.05 * max(medians[metric])),1)
        for j, dataset in enumerate(datasets):
            dataset_data = data_no_adapt.loc[data_no_adapt["dataset"] == dataset]
            method_data = dataset_data.loc[dataset_data["method"] == "no_adaptation"]
            # min_q1 = data_adapted[data_adapted["dataset"] == dataset].groupby("size")[metric].quantile(0.25).min()
            # max_median = data_adapted[data_adapted["dataset"] == dataset].groupby("size")[metric].median().max()
            if method_data.loc[:, metric].item() < y_max and no_adaptation in ["plot", "auto"]:
                # Baseline fits inside the panel: draw a horizontal line
                # spanning the full bar-group range.
                num_samples = [-1/n_methods, len(sizes) - 1 + len(adapted_methods)/n_methods]
                noa_medians = [method_data.loc[:, metric].item()] * len(num_samples)
                ax[i,j].plot(num_samples, noa_medians, label=methods_config["no_adaptation"]["label"], color=methods_config["no_adaptation"]["color"], linestyle=methods_config["no_adaptation"]["linestyle"])
                i_ax = i
                j_ax = j
                # print(metric, dataset)
            elif no_adaptation in ["text", "auto"]:
                # Baseline off-scale: annotate its value in the corner instead.
                text = f"{methods_config['no_adaptation']['label']}"
                ax[i,j].text(.95, .95-pos,
                    f"{text} = {method_data.loc[:, metric].item():.2f}",
                    fontsize=fontsize_noa, ha="right", va="top", transform=ax[i,j].transAxes, color=methods_config["no_adaptation"]["color"]
                )

            # no_adapt_value = data_no_adapt.loc[data_no_adapt["dataset"] == dataset,metric].values[0]
            xlims = [-1/n_methods, len(sizes) - 1 + len(adapted_methods)/n_methods]
            # ax[i,j].plot(xlims, [no_adapt_value] * len(sizes), label=methods_config["no_adaptation"]["label"], color=methods_config["no_adaptation"]["color"], linestyle="--")
            ax[i,j].set_xlim(xlims)
            # ax[i,j].set_xticks(range(len(sizes)))
            # ax[i,j].set_xticklabels(ax[i,j].get_xticklabels(), fontsize=26)

            # Only the leftmost column gets a y-axis; others are blanked.
            ax[i,j].set_ylim(0, y_max)
            if j == 0:
                ax[i,j].set_ylabel(f"{metric2name[metric]}", fontsize=30)
                ax[i,j].set_yticks(np.arange(0,int(y_max*10+1),2)/10)
                ax[i,j].set_yticklabels([f"{d:.1f}" for d in np.arange(0,int(y_max*10+1),2)/10], fontsize=24)
                # ax[i,j].set_yticks(ax[i,j].get_yticks())
                # ax[i,j].set_yticklabels([f"{d:.1f}" for d in ax[i,j].get_yticks()], fontsize=24)
            else:
                # ax[i,j].sharey(ax[i,0])
                ax[i,j].set_yticks([])
                ax[i,j].set_yticklabels([])

            # ax[i,j].set_yticks(ax[i,j].get_yticks())
            # ax[i,j].set_yticklabels(ax[i,j].get_yticklabels(), fontsize=24)
            # ax[i,j].grid(axis="y")

    #YES
    # Titles on the top row; per-dataset sample counts on the bottom row.
    for i, dataset in enumerate(datasets):
        ax[0, i].set_title(DATASETS[dataset]["name"], fontsize=30)
        ax[0, i].set_xticks([])
        ax[-1, i].set_xticks(range(len(sizes)))
        ax[-1, i].set_xticklabels([f"{' '*15}N = {size}" for size in compute_num_samples(sizes, dataset)], fontsize=26)

    # for j, metric in enumerate(metrics):
    #     ax[j, 0].set_ylabel(f"{metric2name[metric]}", fontsize=30)
    #     ax[j, 0].set_yticks(ax[j, 0].get_yticks())
    #     ax[j, 0].set_yticklabels(ax[j, 0].get_yticklabels(), fontsize=24)

    fig.text(0.5, -0.05, 'Adaptation sizes', ha='center', fontsize=26)

    # Gather handles and labels from all axes
    # Re-order legend handles to match the requested `methods` order.
    labels = [methods_config[method]["label"] for method in methods]
    handles = []
    hs, ls = ax[i_ax,j_ax].get_legend_handles_labels()
    for l in labels:
        i = 0
        while ls[i] != l:
            i += 1
        handles.append(hs[i])
    # Pad with invisible entries so the legend grid stays a multiple of 4.
    remaining = len(handles) % 4
    for i in range(remaining):
        handles.append(plt.Line2D([], [], color='none', label=''))
        labels.append('')

    fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(0.72, -0.3), title="Method", ncol=4, title_fontsize=30, fontsize=28)

    fig.tight_layout(pad=1.0, h_pad=1.0, w_pad=-8.0)
    plt.savefig(output_path, bbox_inches="tight", dpi=300)
    plt.close(fig)
159
+
160
+
161
+
162
def main(
    datasets,
    metrics,
    sizes,
    methods_config,
    results_dir,
    output_path,
    methods,
):
    """CLI entry point: parse space-separated arguments and render bars."""
    metrics = metrics.split()
    datasets = datasets.split()
    methods = methods.split()
    sizes = [int(s) for s in sizes.split()]
    # Only the smallest and largest adaptation sizes are plotted.
    sizes = [sizes[0], sizes[-1]]
    results_dir = Path(results_dir)
    methods_config = load_yaml(methods_config)

    data = read_data(results_dir, metrics)
    plot_bars(data, methods_config, output_path, datasets, metrics, methods, sizes)

if __name__ == "__main__":
    from fire import Fire
    Fire(main)
src/llmcal/scripts/results_table.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+
4
+ from matplotlib import pyplot as plt
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from ..src.utils import load_yaml
9
+ from .results_vs_samples import DATASETS, metric2name, process_data
10
+
11
def highlight_local(x):
    """Bold a result (LaTeX) unless it is in the no-adaptation ('noa') group."""
    result = x['result']
    if x['group'] == "noa":
        return str(result)
    return f"$\\mathbf{{{result}}}$"
15
+
16
def highlight_global(x):
    """Underline a result (LaTeX) to mark the global best in its column."""
    value = x['result']
    return f"\\underline{{{value}}}"
18
+
19
def get_all_mins(x):
    """Return the indices of all entries tied for the minimum after 2-dp rounding.

    Fix: removed a leftover `import pdb; pdb.set_trace()` debugging
    breakpoint that halted every caller.
    """
    rounded = x.round(2)
    return x[rounded == rounded.min()].index.to_numpy()
22
+
23
def method2group(method):
    """Bucket a method name into a table group.

    Groups: "noa" (no adaptation), "phc" (post-hoc calibration),
    "sft" (plain LoRA), "sft+phc" (LoRA plus calibration).
    Returns None for names outside these four groups.
    """
    if method == "no_adaptation":
        return "noa"
    if method in ("temp_scaling", "dp_calibration", "bias_shift", "vector_scaling"):
        return "phc"
    if method.startswith("lora"):
        return "sft+phc" if "plus" in method else "sft"
    return None
32
+
33
def create_table(results_dir, methods, metrics, methods_config, datasets, sizes):
    """Build the LaTeX results table as a DataFrame of formatted strings.

    For the smallest and largest sizes, each (dataset, metric) cell holds
    the median over seeds formatted to two decimals, bolded when best in
    its method group and underlined when best overall. A no-adaptation
    baseline row is prepended. Relies on `process_data` (imported from
    results_vs_samples) plus the sibling helpers `method2group`,
    `highlight_local` and `highlight_global`.
    """

    methods = [m for m in methods if m != "no_adaptation"]
    # data_all = pd.DataFrame([
    #     {"dataset": dataset, "size": size, "method": method, "group": method2group(method), "result": ""}
    #     for size in [sizes[0], sizes[-1]]
    #     for method in methods
    #     for dataset in datasets
    # ]).pivot(index=["size","group", "method"], columns="dataset", values="result")
    # index = [(size, method2group(method), method) for size in [sizes[0],sizes[-1]] for method in methods]
    # data_all = data_all.reindex(index)

    data_all = []
    for metric in metrics:
        data = pd.read_json(results_dir / f"{metric}.jsonl", orient='records', lines=True)
        data = process_data(data, datasets, [sizes[0],sizes[-1]], methods)
        # no_adaptation = data[data["method"] == "no_adaptation"]
        # min_size_no_adaptation = no_adaptation.copy()
        # min_size_no_adaptation["size"] = sizes[0]
        # max_size_no_adaptation = no_adaptation.copy()
        # max_size_no_adaptation["size"] = sizes[-1]
        data = data[data["method"] != "no_adaptation"]
        # data = pd.concat([data, min_size_no_adaptation, max_size_no_adaptation], ignore_index=True)
        data = data[data["size"].isin([sizes[0],sizes[-1]])].reset_index(drop=True)
        data["group"] = data["method"].apply(method2group)

        data["result"] = data["median"].apply(lambda x: f"{x:.2f}")
        # Bold the best method(s) within each (dataset, size, group),
        # with ties decided after rounding to two decimals.
        best_idx = []
        for (dataset, size, group), group_data in data.groupby(["dataset","size","group"]):
            med = group_data["median"]
            best_idx.extend(med[med.round(2) == med.round(2).min()].index.to_list())
        # best_idx = data.groupby(["dataset","size","group"])["median"].idxmin().values
        # best_idx = data[data["median"].isin(data.loc[best_idx,"median"])].index.to_numpy()
        data.loc[best_idx,"result"] = data.loc[best_idx,:].apply(highlight_local, axis=1)
        # Underline the overall best method(s) per (dataset, size).
        best_idx = []
        for (dataset, size), group_data in data.groupby(["dataset","size"]):
            med = group_data["median"]
            best_idx.extend(med[med.round(2) == med.round(2).min()].index.to_list())
        # best_idx = data.groupby(["dataset","size"])["median"].idxmin().values
        # best_idx = data[data["median"].isin(data.loc[best_idx,"median"])].index.to_numpy()
        data.loc[best_idx,"result"] = data.loc[best_idx,:].apply(highlight_global, axis=1)
        # Rows: (size, group, method); columns: dataset. Missing cells -> "N/A".
        data = data.pivot(index=["size","group","method"], columns="dataset", values="result")
        index = [(size, method2group(method), method) for size in [sizes[0],sizes[-1]] for method in methods]
        data = data.reindex(index)
        data = data.fillna("N/A")
        data_all.append(data)

    # for size in [sizes[0],sizes[-1]]:
    #     for method in methods:
    #         for dataset in datasets:
    #             if data_all.loc[(size, method2group(method), method), dataset] == "":
    #                 data_all.loc[(size, method2group(method), method), dataset] += data.loc[(size, method2group(method), method), dataset]
    #             else:
    #                 data_all.loc[(size, method2group(method), method), dataset] += " / " + data.loc[(size, method2group(method), method), dataset]

    # Interleave metrics under each dataset: columns become (dataset, metric).
    data_all = pd.concat(data_all, axis=1, keys=metrics)
    data_all.columns = data_all.columns.swaplevel(0,1)
    data_all = data_all.loc[:,[(dataset,metric) for dataset in datasets for metric in metrics]]
    # data_all.index = data.index.map(lambda x: ({sizes[0]: f"min (N = {int(np.log2(sizes[0]))})", sizes[-1]: f"max (N = {int(np.log2(sizes[-1]))})"}[x[0]], methods_config[x[1]]["label"].replace("%","\\%").replace("\n", " &" * len(datasets) + " \\\\\n & ") ))
    # Replace numeric sizes with rotated LaTeX "T' = log2(size)" labels and
    # method keys with their display labels.
    smallest = f"$T' = {int(np.log2(sizes[0]))}$"
    largest = f"$T' = {int(np.log2(sizes[-1]))}$"
    data_all.index = data.index.map(lambda x: ({sizes[0]: "\\rotatebox[origin=c]{{90}}{smallest}".format(smallest="{" + smallest + "}"), sizes[-1]: "\\rotatebox[origin=c]{{90}}{largest}".format(largest="{" + largest + "}")}[x[0]], method2group(x[2]), methods_config[x[2]]["label"] ))
    data_all = data_all.reset_index(level=1,drop=True)
    data_all = data_all.loc[:,datasets]
    data_all.columns = data_all.columns.map(lambda x: (DATASETS[x[0]]["name"], metric2name[x[1]]))
    data_all.columns.name = None
    data_all.index.names = [None, None]

    # noa_data = pd.DataFrame({
    #     "dataset": datasets,
    #     "method": [methods_config["no_adaptation"]["label"]] * len(datasets),
    #     "size": [""] * len(datasets),
    #     "result": [""] * len(datasets),
    #     "metric": [""] * len(datasets),
    # }).pivot(index=["size","method"], columns=["dataset","metric"], values="result").loc[:,datasets]
    # noa_data.columns.name = None
    # noa_data.index.names = [None, None]

    # Build the no-adaptation baseline row with the same column layout.
    noa_data = []
    for metric in metrics:
        data = pd.read_json(results_dir / f"{metric}.jsonl", orient='records', lines=True)
        data = process_data(data, datasets, [sizes[0],sizes[-1]], ["no_adaptation"])
        data["metric"] = metric
        noa_data.append(data)
    # for dataset in datasets:
    #     m = data[(data["dataset"] == dataset)]["median"].values[0]
    #     noa_data.loc[:, dataset] += f"{m:.2f} \ "
    # noa_data.iloc[0,:] = noa_data.iloc[0,:].apply(lambda x: x[:-2])
    # noa_data.columns = noa_data.columns.map(lambda x: (DATASETS[x[0]]["name"], metric2name[x[1]]))
    noa_data = pd.concat(noa_data, axis=0)
    noa_data = noa_data.pivot(index="method",columns=["dataset","metric"],values=["median"])
    noa_data.columns = noa_data.columns.droplevel(0)
    noa_data = noa_data.loc[:,[(dataset,metric) for dataset in datasets for metric in metrics]]
    noa_data.columns = noa_data.columns.map(lambda x: (DATASETS[x[0]]["name"], metric2name[x[1]]))
    noa_data.columns.names = [None, None]
    noa_data.index = noa_data.index.map(lambda x: ("",methods_config[x]["label"]))
    noa_data.index.names = ["size", "method"]
    noa_data = noa_data.apply(lambda x: x.apply(lambda y: f"{y:.2f}"), axis=1)
    data_all = pd.concat([noa_data, data_all], axis=0)

    return data_all
134
+
135
def main(
    datasets,
    sizes,
    metrics,
    methods,
    methods_config,
    results_dir,
    output_path
):
    """CLI entry point: build the LaTeX results table and write it to disk."""
    datasets = datasets.split()
    sizes = [int(s) for s in sizes.split()]
    methods = methods.split()
    methods_config = load_yaml(methods_config)
    metrics = metrics.split()
    output_path = Path(output_path)
    output_dir = output_path.parent
    results_dir = Path(results_dir)

    table = create_table(results_dir, methods, metrics, methods_config, datasets, sizes)
    table_str = table.to_latex(escape=False, column_format="ll" + "||c|c" * len(datasets))
    # Post-process pandas' LaTeX output: center multirow cells and keep the
    # double rule under multicolumn headers.
    table_str = table_str.replace("multirow[t]", "multirow[c]")
    table_str = table_str.replace("multicolumn{2}{r}", "multicolumn{2}{c||}")
    with open(output_path, "w") as f:
        f.write(table_str)


if __name__ == '__main__':
    from fire import Fire
    Fire(main)
src/llmcal/scripts/results_vs_samples.py ADDED
@@ -0,0 +1,181 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from pathlib import Path
3
+
4
+ from matplotlib import pyplot as plt
5
+ import numpy as np
6
+ import pandas as pd
7
+
8
+ from ..src.utils import load_yaml
9
+
10
# Display name and number of classes for each evaluated dataset, keyed by the
# identifier used in the results files and CLI arguments.
DATASETS = {
    "sst2": {"name": "SST-2", "num_classes": 2},
    "agnews": {"name": "AGNews", "num_classes": 4},
    "dbpedia": {"name": "DBPedia", "num_classes": 14},
    "20newsgroups": {"name": "20 Newsgroups", "num_classes": 20},
    "banking77": {"name": "Banking77", "num_classes": 77},
}

# Pretty y-axis labels for the metrics plotted by this script.
metric2name = {
    "nce": "NCE",
    "ner": "NER",
}
22
+
23
+
24
+
25
+
26
def process_data(data, datasets, sizes, methods):
    """Filter raw results to the selected datasets/sizes/methods and summarize.

    Rows with ``size == 'all'`` (the unadapted baseline) are always kept.
    Returns one row per (dataset, method, size) with the median and the
    inter-quartile bounds (q1/q3) of the "result" column.
    """
    keep = (
        data["dataset"].isin(datasets)
        & (data["size"].isin(sizes) | (data["size"] == 'all'))
        & data["method"].isin(methods)
    )
    subset = data[keep]

    summary = (
        subset
        .groupby(["dataset", "method", "size"])["result"]
        .agg(
            median=lambda s: s.median(),
            q1=lambda s: s.quantile(0.25),
            q3=lambda s: s.quantile(0.75),
            count=lambda s: s.count(),
        )
        .reset_index()
    )

    # Every surviving group must contain at least one measurement.
    assert (summary["count"] > 0).all()
    return summary.drop(columns=["count"])
49
+
50
+
51
def compute_num_samples(sizes, dataset):
    """Translate per-class budget factors into actual train-set sizes.

    Each entry of ``sizes`` is rescaled by log2(num_classes), snapped to the
    nearest power of two, and multiplied back by the number of classes of
    ``dataset`` (looked up in the module-level ``DATASETS`` table).
    """
    num_classes = DATASETS[dataset]["num_classes"]
    per_class = 2 ** np.round(np.log2(sizes / np.log2(num_classes)))
    return (per_class * num_classes).astype(int)
57
+
58
+
59
+
60
def plot_metric_vs_samples(ax, data, all_methods, methods_config, datasets, sizes, intervals=False, pos=0, no_adaptation="plot", modelname_noa=None, fontsize_noa=18):
    """Plot metric-vs-number-of-train-samples curves, one subplot per dataset.

    Args:
        ax: sequence of matplotlib Axes, one per entry of ``datasets``.
        data: summary frame from ``process_data`` (columns: dataset, method,
            size, median, q1, q3).
        all_methods: ordered list of method names to consider.
        methods_config: per-method plot kwargs (must at least contain
            "color" and "label").
        datasets: dataset identifiers (keys of ``DATASETS``).
        sizes: per-class size factors forming the x grid.
        intervals: if True, shade the q1-q3 band around each curve.
        pos: vertical offset for the "no_adaptation" text annotation.
        no_adaptation: "plot", "text", "auto" or "skip" — how to render the
            unadapted baseline (flat line, text box, whichever fits, nothing).
        modelname_noa: optional model name appended to the baseline label.
        fontsize_noa: font size of the baseline text annotation.

    Returns:
        dict mapping each dataset name to its filtered summary frame.
    """
    datasets_data = {}
    for i, dataset in enumerate(datasets):

        # All methods present for this dataset, in the caller-given order.
        dataset_data = data[data["dataset"] == dataset]
        methods = [m for m in all_methods if m in dataset_data["method"].unique()]

        for j, method in enumerate(methods):
            # Get data for method, indexed by train-set size.
            method_data = dataset_data[dataset_data["method"] == method].set_index("size").drop(columns=["dataset", "method"])

            # Fill missing sizes with NaN so every curve spans the same x grid
            # (the baseline only has the single "all" row, so skip it).
            missing_sizes = set(sizes) - set(method_data.index) if method != "no_adaptation" else set()
            for size in missing_sizes:
                method_data.loc[size] = [np.nan, np.nan, np.nan]

            # Sort by size
            method_data = method_data.sort_index()

            # Plot
            if method == "no_adaptation":
                # Draw the baseline as a flat line only when its median falls
                # inside the y-range spanned by the adapted methods; otherwise
                # ("auto") fall back to a text annotation.
                if dataset_data.loc[:, "q1"].min() < method_data.loc["all", "median"] < dataset_data.loc[:, "median"].max() and no_adaptation in ["plot", "auto"]:
                    num_samples = compute_num_samples(sizes, dataset)
                    medians = [method_data.loc["all", "median"]] * len(num_samples)
                    q1 = [method_data.loc["all", "q1"]] * len(num_samples)
                    q3 = [method_data.loc["all", "q3"]] * len(num_samples)
                    kwargs = methods_config[method]
                    ax[i].plot(num_samples, medians, **kwargs)
                elif no_adaptation in ["text", "auto"]:
                    if modelname_noa is not None:
                        text = f"{methods_config['no_adaptation']['label']} ({modelname_noa})"
                    else:
                        text = f"{methods_config['no_adaptation']['label']}"
                    ax[i].text(.95, .95-pos,
                        f"{text} = {method_data.loc['all', 'median']:.2f}",
                        fontsize=fontsize_noa, ha="right", va="top", transform=ax[i].transAxes, color=methods_config[method]["color"]
                    )
                elif no_adaptation == "skip":
                    pass

            else:
                num_samples = compute_num_samples(method_data.index.astype(int), dataset)
                medians = method_data["median"]
                q1 = method_data["q1"]
                q3 = method_data["q3"]
                kwargs = methods_config[method]
                ax[i].plot(num_samples, medians, **kwargs)
                if intervals:
                    ax[i].fill_between(num_samples, q1, q3, alpha=0.3, color=kwargs["color"])

        # NOTE(review): ``num_samples`` leaks out of the method loop — if a
        # dataset had no plotted method this raises NameError; presumably
        # callers always supply at least one adapted method. Verify.
        ax[i].set_xscale("log")
        ax[i].set_xticks(num_samples)
        ax[i].set_xticklabels(num_samples, fontsize=18)
        ax[i].set_xlim([min(num_samples)*0.9, max(num_samples)*1.1])

        datasets_data[dataset] = dataset_data

    return datasets_data
122
+
123
+
124
def main(
    datasets,
    sizes,
    metrics,
    methods,
    methods_config,
    results_dir,
    output_path,
    intervals = False,
):
    """Render the metric-vs-samples figure (one row per metric, one column per
    dataset) and save it to ``output_path``.

    Reads ``results_dir/<metric>.jsonl`` per metric, writes the processed
    summary next to the figure as ``<metric>.csv``. List-like arguments arrive
    as whitespace-separated strings (python-fire CLI).
    """
    datasets = list(map(str, datasets.split()))
    sizes = list(map(int, sizes.split()))
    methods = list(map(str, methods.split()))
    methods_config = load_yaml(methods_config)
    metrics = list(map(str, metrics.split()))
    output_path = Path(output_path)
    output_dir = output_path.parent
    results_dir = Path(results_dir)

    fig, axs = plt.subplots(len(metrics), len(datasets), figsize=(6 * len(datasets), 12))
    processed_data = {}
    for ax, metric in zip(axs,metrics):
        data = pd.read_json(results_dir / f"{metric}.jsonl", orient='records', lines=True)
        processed_data[metric] = data  # NOTE(review): stores the *raw* frame; never read afterwards
        data = process_data(data, datasets, sizes, methods)
        datasets_data = plot_metric_vs_samples(ax, data, methods, methods_config, datasets, sizes, intervals=intervals, no_adaptation="auto")
        for i, dataset in enumerate(datasets):
            # y-limits span only the adapted methods (baseline excluded).
            min_y, max_y = datasets_data[dataset].loc[datasets_data[dataset]["method"].isin(set(methods) - {"no_adaptation"}),"median"].min(), datasets_data[dataset].loc[datasets_data[dataset]["method"].isin(set(methods) - {"no_adaptation"}),"median"].max()
            ax[i].set_ylim(min_y*0.99, max_y*1.01)
            ax[i].set_yticks(np.round(ax[i].get_yticks(),3))
            ax[i].set_yticklabels(ax[i].get_yticks(), fontsize=16)
            # ax[i].grid(axis="y")

        data.to_csv(output_dir / f"{metric}.csv", index=False)
        ax[0].set_ylabel(f"{metric2name[metric]}", fontsize=22)
    for j, dataset in enumerate(datasets):
        axs[0,j].set_title(DATASETS[dataset]["name"], fontsize=22)

    fig.text(0.5, 0.04, 'Number of train samples', ha='center', fontsize=22)
    # axs[0,-1].legend(loc="upper right", bbox_to_anchor=(2.4, 1), title="Method", title_fontsize=24, fontsize=22)

    # Gather handles and labels from all axes, de-duplicated by label, so the
    # shared figure legend lists each method exactly once.
    handles, labels = [], []
    for ax in axs.flat:
        hs, ls = ax.get_legend_handles_labels()
        for h, l in zip(hs, ls):
            if l not in labels:
                handles.append(h)
                labels.append(l)
    fig.legend(handles, labels, loc='lower center', bbox_to_anchor=(.5, -0.1), ncols=5, title="Method", title_fontsize=28, fontsize=26)

    plt.savefig(output_path, bbox_inches="tight", dpi=300)
    plt.close(fig)


if __name__ == '__main__':
    from fire import Fire
    Fire(main)
src/llmcal/scripts/run_posteriors.py ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ import pandas as pd
4
+ import torch
5
+ from pathlib import Path
6
+ from typing import Optional, Union, Literal
7
+ import warnings
8
+
9
+ from litgpt.tokenizer import Tokenizer
10
+ from litgpt.utils import (
11
+ check_valid_checkpoint_dir,
12
+ get_default_supported_precision,
13
+ check_nvlink_connectivity,
14
+ load_checkpoint
15
+ )
16
+ import lightning as L
17
+ from lightning_utilities.core.imports import RequirementCache
18
+ from lightning.fabric.plugins import BitsandbytesPrecision
19
+ from lightning.fabric.strategies import FSDPStrategy
20
+ from tqdm import tqdm
21
+
22
+ from ..src.utils import get_dataloader
23
+
24
+
25
def setup(
    base_checkpoint_dir: str,
    checkpoint_dir,
    data_path: str,
    output_dir: str,
    prediction_lists: Optional[str] = None,
    peft: Union[Literal["lora", "adapter"], None] = None,
    precision: Optional[str] = None,
    devices: Union[int, str] = 1,
    num_nodes: int = 1,
    batch_size: int = 1,
    max_seq_length: int = 1024,
    **peft_kwargs,
):
    """CLI entry point: configure Fabric and launch posterior computation.

    Args:
        base_checkpoint_dir: directory with the base ``lit_model.pth``.
        checkpoint_dir: directory with ``model_config.yaml`` and, for PEFT,
            the adapter/LoRA weights.
        data_path: dataset directory passed to ``get_dataloader``.
        output_dir: where ``logits.csv``/``labels.csv`` will be written.
        prediction_lists: comma-separated paths to index files; concatenated
            into one prediction list.
        peft: None (full model), "lora" or "adapter".
        peft_kwargs: forwarded to the PEFT ``Config``.
    """
    # Basic setup
    torch.set_float32_matmul_precision("high")
    data_path = Path(data_path)
    output_dir = Path(output_dir)
    base_checkpoint_dir = Path(base_checkpoint_dir)
    checkpoint_dir = Path(checkpoint_dir)
    # NOTE(review): default prediction_lists=None crashes here on .split(",")
    # — the argument is effectively required; confirm and drop the Optional.
    prediction_list = np.hstack([np.loadtxt(prediction_list, dtype=int) for prediction_list in prediction_lists.split(",")])

    # Load config file; which Config/Block flavor depends on the PEFT type.
    check_valid_checkpoint_dir(base_checkpoint_dir)
    if peft is None:
        from litgpt.config import Config
        from litgpt.model import Block
    elif peft == "lora":
        from litgpt.lora import Config, Block
    elif peft == "adapter":
        from litgpt.adapter import Config, Block
    else:
        raise ValueError(f"Unknown peft type: {peft}")
    config = Config.from_file(checkpoint_dir / "model_config.yaml", **peft_kwargs)

    # Precision
    precision = precision or get_default_supported_precision(training=True)

    # Strategy: plain DDP for multi-device inference, let Fabric decide otherwise.
    if devices * num_nodes > 1:
        strategy = "ddp"
    else:
        strategy = "auto"

    # Init fabric
    fabric = L.Fabric(
        devices=devices,
        num_nodes=num_nodes,
        strategy=strategy,
        precision=precision,
    )
    if torch.cuda.is_available() and devices > 1:
        check_nvlink_connectivity(fabric)

    # Launch the per-rank main() with all resolved arguments.
    fabric.launch(main, peft, config, base_checkpoint_dir, checkpoint_dir, data_path, output_dir, prediction_list, batch_size, max_seq_length)
81
+
82
+
83
def main(
    fabric: L.Fabric,
    peft,
    config,
    base_checkpoint_dir,
    checkpoint_dir,
    data_path,
    output_dir,
    prediction_list,
    batch_size,
    max_seq_length,
):
    """Per-rank worker: build the model, run prediction, dump CSVs.

    Writes ``logits.csv`` (per-class answer log-likelihoods) and
    ``labels.csv`` (reference labels), both indexed by sample id, on rank 0.
    """

    # Seed everything (fixed seed: predictions should not vary across runs)
    fabric.seed_everything(92837)

    # Pick the GPT implementation matching the PEFT type of the checkpoint.
    if peft is None:
        from litgpt.model import GPT
    elif peft == "lora":
        from litgpt.lora import GPT
    elif peft == "adapter":
        from litgpt.adapter import GPT
    else:
        raise ValueError(f"Unknown peft type: {peft}")
    checkpoint_path = base_checkpoint_dir / "lit_model.pth"
    with fabric.init_module(empty_init=(fabric.world_size > 1)):
        model = GPT(config)
        model.max_seq_length = max_seq_length
        # KV cache is required because predict_step feeds the prompt and the
        # answers as separate forward passes with explicit input_pos.
        model.set_kv_cache(batch_size=batch_size, max_seq_length=max_seq_length)
    model = fabric.setup_module(model)
    load_checkpoint(fabric, model, checkpoint_path, strict=False)

    # Overlay PEFT weights on top of the base checkpoint.
    if peft == "lora":
        from litgpt.lora import merge_lora_weights
        lora_checkpoint_path = checkpoint_dir / "lit_model.pth.lora"
        load_checkpoint(fabric, model, lora_checkpoint_path, strict=False)
        merge_lora_weights(model)
    elif peft == "adapter":
        adapter_checkpoint_path = checkpoint_dir / "lit_model.pth.adapter"
        load_checkpoint(fabric, model, adapter_checkpoint_path, strict=False)

    # Load tokenizer
    tokenizer = Tokenizer(checkpoint_dir)

    # Predict over the requested sample indices only.
    dataloader = get_dataloader([data_path], [prediction_list], tokenizer, batch_size, pad_token_id=0, max_seq_length=model.max_seq_length, shuffle = False)
    dataloader = fabric.setup_dataloaders(dataloader)
    predictions = predict(fabric, model, dataloader)
    if fabric.global_rank == 0:
        pd.DataFrame(predictions["logits"], index=predictions["idx"]).to_csv(output_dir / f"logits.csv", index=True, header=False)
        pd.DataFrame(predictions["label"], index=predictions["idx"]).to_csv(output_dir / f"labels.csv", index=True, header=False)
135
+
136
+
137
def predict_step(fabric, model, indices, prompt_ids, prompt_mask, answers_ids, labels):
    """Score every candidate answer for each prompt in the micro-batch.

    The prompt is run once (filling the KV cache via ``input_pos``); each
    candidate answer is then scored by summing the log-probabilities of its
    tokens given the cached prompt. The per-sample "logits" are these summed
    answer log-likelihoods, one per class.
    """
    logits = []
    for input_ids, attention_mask, answers in zip(prompt_ids, prompt_mask, answers_ids):
        # Drop padding; keep only the real prompt tokens.
        input_ids = input_ids[attention_mask == 1].unsqueeze(0)
        T = torch.sum(attention_mask)
        with fabric.init_tensor():
            input_pos = torch.arange(0, T)
        output = model(idx=input_ids, input_pos=input_pos)
        answers_logits = []
        for answer in answers:
            answer = answer.unsqueeze(0)
            # Positions continue right after the cached prompt; each answer
            # reuses the same prompt cache (assumes positions >= T are simply
            # overwritten between answers — confirm with litgpt's KV cache).
            input_pos = torch.arange(T, answer.shape[1] + T, device=answer.device, dtype=answer.dtype)
            ans_out = model(idx=answer, input_pos=input_pos)
            # Shift by one: the first answer token is predicted from the last
            # prompt position, the rest from the preceding answer positions.
            logprobs = torch.cat([output[:,-1:,:], ans_out[:,:-1,:]], dim=1).log_softmax(dim=2)
            index = answer.unsqueeze(2)
            gather_probs = torch.gather(logprobs, -1, index).squeeze(2)
            ans_logit = gather_probs.sum()
            answers_logits.append(ans_logit)
        logits.append(torch.stack(answers_logits, dim=0))
    logits = torch.stack(logits, dim=0)
    return {"idx": indices, "logits": logits, "label": labels}
158
+
159
+
160
@torch.no_grad()
def predict(fabric, model, dataloader):
    """Run ``predict_step`` over the whole dataloader and gather results.

    Returns (on rank 0) a dict of numpy arrays: sample indices, per-class
    logits and reference labels. Non-zero ranks return the dict with its
    lists left empty.
    """
    predict_outputs = {"idx": [], "logits": [], "label": []}
    model.eval()
    for i, batch in enumerate(dataloader):
        # Progress line roughly every 2% of the epoch.
        if i % max(len(dataloader) // 50,1) == 0:
            fabric.print(f"Predicting batch {i+1}/{len(dataloader)}")
        outputs = predict_step(fabric, model, batch["idx"], batch["prompt_ids"], batch["prompt_mask"], batch["answers_ids"], batch["label"])
        fabric.barrier()
        gathered_outputs = fabric.all_gather(outputs)
        if fabric.global_rank == 0:
            for k, v in gathered_outputs.items():
                if fabric.world_size > 1:
                    # all_gather adds a leading world-size dimension; flatten
                    # it so all ranks' samples are interleaved in one axis.
                    v = v.view(-1, *v.shape[2:]).cpu()
                else:
                    v = v.cpu()
                if k in ["idx", "label"]:
                    predict_outputs[k].append(v.long())
                else:
                    predict_outputs[k].append(v.float())

    if fabric.global_rank == 0:
        for k, v in predict_outputs.items():
            predict_outputs[k] = torch.cat(v, dim=0).numpy()

    return predict_outputs


if __name__ == "__main__":
    from fire import Fire
    Fire(setup)
src/llmcal/scripts/train_lora.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import numpy as np
3
+ import torch
4
+ from pathlib import Path
5
+ from typing import Literal, Optional, Union
6
+ import warnings
7
+
8
+ from litgpt.tokenizer import Tokenizer
9
+ from litgpt.utils import (
10
+ check_valid_checkpoint_dir,
11
+ get_default_supported_precision,
12
+ check_nvlink_connectivity,
13
+ load_checkpoint,
14
+ CycleIterator,
15
+ )
16
+ from litgpt.lora import Config, Block, GPT, mark_only_lora_as_trainable, lora_filter
17
+ import lightning as L
18
+ from lightning_utilities.core.imports import RequirementCache
19
+ from lightning.fabric.utilities.load import _lazy_load as lazy_load
20
+ from lightning.fabric.plugins import BitsandbytesPrecision
21
+ from lightning.fabric.strategies import FSDPStrategy
22
+ from tqdm import tqdm
23
+
24
+ from ..src.utils import get_dataloader, save_yaml
25
+ from ..src.loggers import TBLogger, CSVLogger
26
+
27
+ warnings.filterwarnings("ignore", category=UserWarning, message=".*Experiment logs directory outputs*")
28
+
29
def setup(
    base_checkpoint_dir: str,
    lora_checkpoint_dir: Optional[str] = None,
    data_paths: str = None,
    train_lists: str = None,
    val_lists: str = None,
    output_dir: str = None,
    output_checkpoint_dir: str = None,
    log_dir: str = None,
    precision: Optional[str] = None,
    quantize: Optional[Literal["bnb.nf4", "bnb.nf4-dq", "bnb.fp4", "bnb.fp4-dq", "bnb.int8-training"]] = None,
    devices: Union[int, str] = 1,
    num_nodes: int = 1,
    global_batch_size: int = 16,
    micro_batch_size: int = 1,
    val_check_interval = 16,
    learning_rate = 0.0001,
    optimizer: Literal["sgd", "adamw"] = "adamw",
    weight_decay = 0.0,
    loss: Literal["fs", "ans", "norm"] = "fs",
    patience: int = 10,
    max_steps: int = -1,
    seed = 0,
    max_seq_length: int = 1024,
    **lora_kwargs,
):
    """CLI entry point: configure Fabric and launch LoRA fine-tuning.

    BUGFIX: ``optimizer`` was previously declared as
    ``optimizer = Literal["sgd", "adamw"]`` — i.e. its *default value* was the
    typing object itself, so any invocation that omitted ``--optimizer``
    crashed later in ``main`` with "Unknown optimizer". It is now a proper
    annotation with a usable default ("adamw", matching the AdamW-scale
    learning-rate default); callers that pass the flag explicitly are
    unaffected.

    Args:
        base_checkpoint_dir: directory with the base ``lit_model.pth``.
        lora_checkpoint_dir: optional directory with LoRA weights to resume from.
        data_paths / train_lists / val_lists: comma-separated paths; when
            ``val_lists`` is omitted a small validation subset is sampled from
            each train list.
        loss: "fs" (full sentence), "ans" (answer only), "norm" or "norm-K"
            (discriminative over K sampled candidate answers).
        lora_kwargs: forwarded to the LoRA ``Config``.
    """

    # Basic setup
    torch.set_float32_matmul_precision("high")
    data_paths = [Path(data_path) for data_path in data_paths.split(",")]
    output_dir = Path(output_dir)
    output_checkpoint_dir = Path(output_checkpoint_dir)
    base_checkpoint_dir = Path(base_checkpoint_dir)
    lora_checkpoint_dir = Path(lora_checkpoint_dir) if lora_checkpoint_dir is not None else None
    train_lists = [np.loadtxt(train_list, dtype=int) for train_list in train_lists.split(",")]

    if val_lists is None:
        # No explicit validation split: sample up to 10 train indices per list.
        rs = np.random.RandomState(seed)
        val_lists = [rs.choice(train_list, min(len(train_list) // 10, 10), replace=False) for train_list in train_lists]
    else:
        val_lists = [np.loadtxt(val_list, dtype=int) for val_list in val_lists.split(",")]

    # Load config file
    check_valid_checkpoint_dir(base_checkpoint_dir)
    config = Config.from_file(base_checkpoint_dir / "model_config.yaml", **lora_kwargs)

    # Precision and quantization
    precision = precision or get_default_supported_precision(training=True)
    plugins = None
    if quantize is not None and quantize.startswith("bnb."):
        if "mixed" in precision:
            raise ValueError("Quantization and mixed precision is not supported.")
        if RequirementCache("bitsandbytes != 0.42.0"):
            warnings.warn(
                "LitGPT only supports bitsandbytes v0.42.0. "
                "This may result in errors when using quantization."
            )
        dtype = {"16-true": torch.float16, "bf16-true": torch.bfloat16, "32-true": torch.float32}[precision]
        plugins = BitsandbytesPrecision(quantize[4:], dtype)
        precision = None

    if devices * num_nodes > 1:
        if quantize:
            raise NotImplementedError(
                "Quantization is currently not supported for multi-GPU training. Please set devices=1 and num_nodes=1"
                " when using the --quantize flag."
            )
        strategy = FSDPStrategy(
            auto_wrap_policy={Block},
            activation_checkpointing_policy={Block},
            state_dict_type="full",
            limit_all_gathers=True,
            cpu_offload=False,
        )
    else:
        strategy = "auto"

    # Init fabric
    fabric = L.Fabric(
        devices=devices,
        num_nodes=num_nodes,
        strategy=strategy,
        precision=precision,
        plugins=plugins,
        loggers=[
            TBLogger(save_dir=log_dir),
            CSVLogger(save_dir=log_dir),
        ]
    )
    if torch.cuda.is_available() and devices > 1:
        check_nvlink_connectivity(fabric)

    # Launch: "norm-K" is normalized to loss="norm" plus its K.
    train_args = {
        "loss": "norm" if loss.startswith("norm-") else loss,
        "K": int(loss.split("-")[-1]) if loss.startswith("norm-") else None,
        "global_batch_size": global_batch_size,
        "micro_batch_size": micro_batch_size,
        "val_check_interval": val_check_interval,
        "learning_rate": learning_rate,
        "optimizer_name": optimizer,
        "weight_decay": weight_decay,
        "patience": patience,
        "max_steps": max_steps,
    }
    fabric.launch(main, config, base_checkpoint_dir, lora_checkpoint_dir, data_paths, output_dir, output_checkpoint_dir, train_lists, val_lists, train_args, devices, seed, max_seq_length)
135
+
136
+
137
def main(
    fabric: L.Fabric,
    config: Config,
    base_checkpoint_dir: Path,
    lora_checkpoint_dir: Optional[Path],
    data_paths: Path,
    output_dir: Path,
    output_checkpoint_dir: Path,
    train_lists: np.ndarray,
    val_lists: np.ndarray,
    train_args: dict,
    devices: int,
    seed: int,
    max_seq_length: int,
):
    """Per-rank worker: build data/model/optimizer, run ``fit``, save LoRA weights.

    On completion writes ``train_args.yaml`` (rank 0) and the trained LoRA
    parameters to ``output_checkpoint_dir/lit_model.pth.lora``.
    """
    fabric.seed_everything(seed)

    # Init dataloaders
    tokenizer = Tokenizer(base_checkpoint_dir)
    train_dataloader = get_dataloader(data_paths, train_lists, tokenizer, train_args["micro_batch_size"], 0, max_seq_length, shuffle = True, seed = seed)
    val_dataloader = get_dataloader(data_paths, val_lists, tokenizer, train_args["micro_batch_size"], 0, max_seq_length, shuffle = False, seed = seed)
    train_dataloader, val_dataloader = fabric.setup_dataloaders(train_dataloader, val_dataloader)

    # Init Base model; only LoRA parameters stay trainable.
    base_checkpoint_path = base_checkpoint_dir / "lit_model.pth"
    with fabric.init_module(empty_init=(fabric.world_size > 1)):
        model = GPT(config)
        model.max_seq_length = max_seq_length
        mark_only_lora_as_trainable(model)
    model = fabric.setup_module(model)

    # Init optimizer over the trainable (LoRA) parameters only.
    trainable_params = [p for p in model.parameters() if p.requires_grad]
    if train_args["optimizer_name"] == "sgd":
        optimizer = torch.optim.SGD(trainable_params, lr=train_args["learning_rate"], weight_decay=train_args["weight_decay"])
    elif train_args["optimizer_name"] == "adamw":
        optimizer = torch.optim.AdamW(trainable_params, lr=train_args["learning_rate"], weight_decay=train_args["weight_decay"])
    else:
        raise ValueError(f"Unknown optimizer: {train_args['optimizer_name']}")
    optimizer = fabric.setup_optimizers(optimizer)

    # Load weights (base checkpoint, then optional LoRA warm start).
    load_checkpoint(fabric, model, base_checkpoint_path, strict=False)
    if lora_checkpoint_dir is not None:
        lora_checkpoint_path = lora_checkpoint_dir / "lit_model.pth.lora"
        load_checkpoint(fabric, model, lora_checkpoint_path, strict=False)

    # Train
    fit(fabric, model, optimizer, train_dataloader, val_dataloader, devices, output_dir, seed, **train_args)
    if fabric.device.type == "cuda":
        fabric.print(f"Memory used: {torch.cuda.max_memory_allocated() / 1e9:.02f} GB")
    fabric.print("Training finished.")

    if fabric.global_rank == 0:
        save_yaml(train_args, output_dir / "train_args.yaml")

    # Persist only the LoRA parameters.
    fabric.save(output_checkpoint_dir / "lit_model.pth.lora", {k: v for k, v in model.state_dict().items() if lora_filter(k,v)})
195
+
196
+
197
def fit(fabric, model, optimizer, train_dataloader, val_dataloader, devices, output_dir, seed, **train_args):
    """Gradient-accumulated training loop with early stopping and resumption.

    Resumes from ``output_dir/last.ckpt`` when present (replaying the data
    iterator up to the saved step). Saves ``best.ckpt`` whenever validation
    loss improves and ``last.ckpt`` after every optimizer step. Stops after
    ``max_steps`` steps if positive, otherwise when validation loss has not
    improved for ``patience`` checks.
    """

    if (state_dict_path := output_dir / "last.ckpt").exists():
        state = lazy_load(state_dict_path)
        model.load_state_dict(state["model"], strict=False) # only lora params are saved
    else:
        state = {
            "model": {k: v for k, v in model.state_dict().items() if lora_filter(k,v)},
            "step_count": 0,
            "iter_num": 0,
            "best_val_loss": float("inf"),
            "last_val_loss": float("inf"),
            "patience_count": 0,
            "cum_train_loss": 0,
            "cum_train_num_tokens": 0,
            "start_time": time.perf_counter(),
            "end_time": None,
        }

    train_iterator = CycleIterator(train_dataloader)
    rs = np.random.RandomState(seed)  # NOTE(review): unused — candidate for removal
    gradient_accumulation_iters = (train_args["global_batch_size"] // devices) // train_args["micro_batch_size"]

    # Define the loss
    if train_args["loss"] == "fs":
        loss_fn = FullSentenceLoss()
    elif train_args["loss"] == "ans":
        loss_fn = LossOnAnswer()
    elif train_args["loss"] == "norm":
        loss_fn = LossNormByAnswers(len(train_dataloader.dataset[0]["answers_ids"]), train_args["K"], seed)
    else:
        raise ValueError(f"Unknown loss: {train_args['loss']}")

    # Advance until state: replay the (seeded) data stream — and, for the
    # "norm" loss, its candidate-sampling RNG — so a resumed run sees the
    # same batches it would have seen without interruption.
    # NOTE(review): use_ids() is called with the whole batch label tensor
    # here, while forward() calls it per-sample — equivalent only for
    # micro_batch_size == 1; confirm.
    step_count = 0
    iter_num = 0
    while step_count < state["step_count"]:
        iter_num += 1
        batch = next(train_iterator)
        if train_args["loss"] == "norm":
            loss_fn.use_ids(batch["label"])
        is_accumulating = iter_num % gradient_accumulation_iters != 0
        if not is_accumulating:
            step_count += 1

    # Continue training
    model.train()
    stop_training = False
    while not stop_training:
        state["iter_num"] += 1
        batch = next(train_iterator)
        iter_t0 = time.perf_counter()

        # Perform forward and backward pass; gradients sync only on the
        # final micro-batch of each accumulation window.
        is_accumulating = state["iter_num"] % gradient_accumulation_iters != 0
        with fabric.no_backward_sync(model, enabled=is_accumulating):
            loss, num_tokens = loss_fn(model, batch["prompt_ids"], batch["prompt_mask"], batch["answers_ids"], batch["label"])
            fabric.backward(loss / num_tokens / gradient_accumulation_iters)

        # Accumulate loss for logging
        state["cum_train_loss"] += loss.item()
        state["cum_train_num_tokens"] += num_tokens

        # Perform optimizer step
        if not is_accumulating:
            optimizer.step()
            optimizer.zero_grad()
            state["step_count"] += 1

        # Log train loss (per-token average since the last checkpointed step)
        if not is_accumulating or state["iter_num"] == 1:
            t1 = time.perf_counter()
            metrics = {
                "train/loss": state["cum_train_loss"] / state["cum_train_num_tokens"],
                "iter": state["iter_num"],
                "step": state["step_count"],
                "epoch": train_iterator.epoch,
                "iter_time": t1 - iter_t0,
            }
            fabric.print(
                f"Epoch {metrics['epoch']+1} | iter {metrics['iter']} step {metrics['step']} |"
                f" train loss: {metrics['train/loss']:.3f},"
                f" val loss: {state['last_val_loss']:.3f} |"
                f" best val loss: {state['best_val_loss']:.3f} |"
                f" patience: {state['patience_count']} |"
                f" iter time: {metrics['iter_time'] * 1000:.2f} ms"
            )
            fabric.log_dict(metrics, step=state["step_count"])

        # Validate every val_check_interval optimizer steps.
        if not is_accumulating and state["step_count"] % train_args["val_check_interval"] == 0:
            val_loss, val_num_tokens = validate(fabric, model, val_dataloader, train_args, seed)
            state["last_val_loss"] = val_loss.item() / val_num_tokens
            fabric.log_dict({
                "val/loss": state["last_val_loss"],
            }, step=state["step_count"])
            if state["last_val_loss"] < state["best_val_loss"]:
                # New best: snapshot LoRA weights into the checkpoint state.
                state.update({
                    "model": {k: v for k, v in model.state_dict().items() if lora_filter(k,v)},
                    "end_time": time.perf_counter(),
                    "best_val_loss": state["last_val_loss"],
                    "patience_count": 0,
                })
                fabric.save(output_dir / "best.ckpt", state)
            else:
                state["patience_count"] += 1
            fabric.barrier()

        # Save last checkpoint after every optimizer step (enables resume).
        if not is_accumulating:
            state["model"] = {k: v for k, v in model.state_dict().items() if lora_filter(k,v)}
            state["end_time"] = time.perf_counter()
            fabric.save(output_dir / "last.ckpt", state)
            state["cum_train_loss"] = 0
            state["cum_train_num_tokens"] = 0

        # Check if training should stop
        if train_args["max_steps"] > 0:
            stop_training = state["step_count"] >= train_args["max_steps"]
        else:
            stop_training = state["patience_count"] >= train_args["patience"]
318
+
319
+
320
@torch.no_grad()
def validate(fabric, model, val_dataloader, train_args, seed):
    """Compute total validation loss and token count over the whole loader.

    Uses the same loss flavor as training; restores ``model.train()`` before
    returning. Returns (total_loss tensor, total token count) — the caller
    normalizes.
    """
    if train_args["loss"] == "fs":
        loss_fn = FullSentenceLoss()
    elif train_args["loss"] == "ans":
        loss_fn = LossOnAnswer()
    elif train_args["loss"] == "norm":
        loss_fn = LossNormByAnswers(len(val_dataloader.dataset[0]["answers_ids"]), train_args["K"], seed)
    else:
        # Mirror fit(): fail loudly instead of hitting a NameError on loss_fn
        # below (the original had no else-branch here).
        raise ValueError(f"Unknown loss: {train_args['loss']}")

    total_loss = 0
    total_num_tokens = 0
    model.eval()
    fabric.print("Validating...")
    for batch in val_dataloader:
        loss, num_tokens = loss_fn(model, batch["prompt_ids"], batch["prompt_mask"], batch["answers_ids"], batch["label"])
        total_loss += loss
        total_num_tokens += num_tokens
    model.train()
    return total_loss, total_num_tokens
339
+
340
class FullSentenceLoss(torch.nn.Module):
    """Negative log-likelihood over the whole sequence (prompt + gold answer).

    ``forward`` returns the summed NLL and the number of scored tokens so the
    caller can normalize.
    """

    def __init__(self):
        super().__init__()

    def forward(self, model, prompt_ids, prompt_mask, answers_ids, labels):
        total_nll = 0
        token_count = 0
        for ids, mask, answers, label in zip(prompt_ids, prompt_mask, answers_ids, labels):
            # Strip padding, then append the gold answer to the prompt.
            prompt = ids[mask == 1].unsqueeze(0)
            gold = answers[label.item()].unsqueeze(0)
            sequence = torch.cat([prompt, gold], dim=1)
            # Next-token log-probabilities for every position but the last.
            logprobs = model(sequence, None)[:, :-1, :].log_softmax(dim=2)
            targets = sequence[:, 1:].unsqueeze(2)
            token_logprobs = torch.gather(logprobs, -1, targets).squeeze(2)
            total_nll = total_nll - token_logprobs.sum()
            token_count = token_count + targets.size(1)
        return total_nll, token_count
356
+
357
class LossOnAnswer(torch.nn.Module):
    """Negative log-likelihood restricted to the gold answer tokens.

    Like ``FullSentenceLoss`` but the prompt tokens contribute no loss: only
    positions predicting answer tokens are scored.
    """

    def __init__(self):
        super().__init__()

    def forward(self, model, prompt_ids, prompt_mask, answers_ids, labels):
        total_nll = 0
        token_count = 0
        for ids, mask, answers, label in zip(prompt_ids, prompt_mask, answers_ids, labels):
            prompt = ids[mask == 1].unsqueeze(0)
            gold = answers[label.item()].unsqueeze(0)
            sequence = torch.cat([prompt, gold], dim=1)
            prompt_len = prompt.shape[1]
            # Keep only the positions that predict answer tokens (the last
            # prompt position predicts the first answer token).
            logprobs = model(sequence, None)[:, prompt_len - 1:-1, :].log_softmax(dim=2)
            targets = sequence[:, prompt_len:].unsqueeze(2)
            token_logprobs = torch.gather(logprobs, -1, targets).squeeze(2)
            total_nll = total_nll - token_logprobs.sum()
            token_count = token_count + targets.size(1)
        return total_nll, token_count
373
+
374
+
375
class LossNormByAnswers(torch.nn.Module):
    """Discriminative loss: softmax over candidate-answer log-likelihoods.

    Each candidate answer is scored by the summed log-probability of its
    tokens given the prompt; a cross-entropy over these scores rewards the
    gold answer. With ``K`` set, only K-1 random negatives plus the gold
    answer are scored (others get -inf); with ``K=None`` all answers are used.
    """

    def __init__(self, total_num_answers, K = 5, seed = None):
        super().__init__()
        self.total_num_answers = total_num_answers
        self.K = K
        self._rs = np.random.RandomState(seed)

    def use_ids(self, label):
        """Return the candidate indices to score for one sample."""
        if self.K is None:
            return np.arange(self.total_num_answers)
        negatives = self._rs.choice(
            [i for i in range(self.total_num_answers) if i != label],
            min(self.K - 1, self.total_num_answers - 1),
            replace=False,
        )
        return np.hstack((negatives, [label.item()]))

    def forward(self, model, prompt_ids, prompt_mask, answers_ids, labels):
        total_loss = 0
        token_count = 0
        for ids, mask, answers, label in zip(prompt_ids, prompt_mask, answers_ids, labels):
            prompt = ids[mask == 1].unsqueeze(0)
            prompt_len = prompt.shape[1]
            selected = self.use_ids(label)
            candidate_scores = []
            for i, ans_ids in enumerate(answers):
                if i not in selected:
                    # Unselected candidates are masked out of the softmax.
                    candidate_scores.append(torch.tensor(-float("inf"), device=prompt.device))
                    continue
                sequence = torch.cat([prompt, ans_ids.unsqueeze(0)], dim=1)
                logprobs = model(sequence, None)[:, prompt_len - 1:-1, :].log_softmax(dim=2)
                targets = sequence[:, prompt_len:].unsqueeze(2)
                candidate_scores.append(torch.gather(logprobs, -1, targets).squeeze(2).sum())
            logits = torch.stack(candidate_scores, dim=0)
            token_count = token_count + answers[label.item()].size(0)
            total_loss = total_loss + torch.nn.functional.cross_entropy(logits.unsqueeze(0), label.unsqueeze(0), reduction="sum")
        return total_loss, token_count
414
+
415
+
416
if __name__ == '__main__':
    # CLI entry point: expose setup() via python-fire.
    from fire import Fire
    Fire(setup)
src/llmcal/src/__init__.py ADDED
File without changes
src/llmcal/src/evaluation/calibration.py ADDED
@@ -0,0 +1,84 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import torch
4
+ from torch import nn
5
+ import torch.nn.functional as F
6
+ from sklearn.model_selection import StratifiedKFold, StratifiedGroupKFold, GroupKFold, KFold
7
+
8
+
9
class DPCalibrator(nn.Module):
    """Affine calibrator: a single scalar scale `alpha` (temperature-like)
    plus a per-class bias `beta`, applied to log-probabilities."""

    def __init__(self, n_classes):
        super().__init__()
        self.n_classes = n_classes
        # Shared scalar scale, initialized to identity (1.0).
        self.alpha = nn.Parameter(torch.tensor(1.0))
        # Per-class additive bias, initialized to zero.
        self.beta = nn.Parameter(torch.zeros(n_classes))

    def forward(self, x):
        # Affine map in score space; renormalization happens in calibrate().
        return self.alpha * x + self.beta

    def calibrate(self, logprobs):
        """Apply the fitted affine map and renormalize to log-probabilities."""
        self.eval()
        with torch.no_grad():
            cal_logprobs = torch.log_softmax(self(logprobs), dim=1)
        return cal_logprobs

    def fit(self, logprobs, labels):
        """Fit alpha/beta with LBFGS until the normalized CE plateaus.

        Convergence: stop when the change in cross-entropy normalized by the
        prior cross-entropy is below 1e-5. Returns self (fluent API).
        """
        self.train()
        optimizer = torch.optim.LBFGS(self.parameters(), lr=1e-1, max_iter=40)

        # Cross-entropy of the empirical class priors, used as the normalizer.
        priors = torch.bincount(labels, minlength=logprobs.shape[1]).float() / len(labels)
        priors_ce = -torch.log(priors[labels]).mean().item()

        last_nce = float("inf")
        while True:

            def closure():
                # LBFGS re-evaluates the loss/gradient via this closure.
                optimizer.zero_grad()
                cal_logits = self(logprobs)
                loss = F.cross_entropy(cal_logits, labels)
                loss.backward()
                return loss

            loss = optimizer.step(closure)

            nce = loss.item() / priors_ce
            if abs(last_nce - nce) < 1e-5:
                break
            last_nce = nce

        return self
51
+
52
+
53
def train_cal_on_test(logits, labels):
    """Calibrate `logits` with a DPCalibrator fitted on the same data.

    This is the "oracle" / train-on-test calibration used to estimate the
    best achievable calibrated loss.

    Args:
        logits: (N, C) numpy array of uncalibrated scores.
        labels: (N,) numpy array of integer class labels.
    Returns:
        (N, C) numpy array of calibrated log-probabilities.
    """
    logprobs = torch.log_softmax(torch.from_numpy(logits).float(), dim=1)
    targets = torch.from_numpy(labels).long()
    calibrator = DPCalibrator(n_classes=logits.shape[1]).fit(logprobs, targets)
    return calibrator.calibrate(logprobs).numpy()
60
+
61
+
62
def calibrate_xval(logits, targets, seed=0, condition_ids=None, stratified=True, nfolds=5):
    """Calibrate `logits` via k-fold cross-validation.

    Each fold's samples are calibrated by a DPCalibrator fitted on the
    remaining folds, so no sample is calibrated by a model trained on it.

    Args:
        logits: (N, C) numpy array of uncalibrated scores.
        targets: (N,) numpy array of integer labels.
        seed: RNG seed for fold shuffling.
        condition_ids: optional group ids; if given, a group-aware splitter
            keeps all samples of a group in the same fold.
        stratified: preserve label proportions across folds.
        nfolds: number of folds.
    Returns:
        (N, C) torch tensor of calibrated log-probabilities.
    """
    logprobs = torch.log_softmax(torch.from_numpy(logits).float(), dim=1)
    targets = torch.from_numpy(targets).long()
    logprobscal = torch.zeros(logprobs.size())

    # Pick the sklearn splitter matching the stratification/grouping request.
    if stratified:
        if condition_ids is not None:
            skf = StratifiedGroupKFold(n_splits=nfolds, shuffle=True, random_state=seed)
        else:
            skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=seed)
    else:
        if condition_ids is not None:
            # NOTE(review): GroupKFold ignores `seed` (it has no shuffle) —
            # confirm this asymmetry with the other branches is intended.
            skf = GroupKFold(n_splits=nfolds)
        else:
            skf = KFold(n_splits=nfolds, shuffle=True, random_state=seed)

    for trni, tsti in skf.split(logprobs, targets, condition_ids):
        # Fit on the training folds, calibrate the held-out fold in place.
        model = DPCalibrator(n_classes=logprobs.shape[1])
        model.fit(logprobs[trni], targets[trni])
        with torch.no_grad():
            logprobscal[tsti] = torch.log_softmax(model.forward(logprobs[tsti]), dim=1)

    return logprobscal
src/llmcal/src/evaluation/metrics.py ADDED
@@ -0,0 +1,86 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ from scipy.special import softmax, log_softmax
4
+ from .calibration import train_cal_on_test, calibrate_xval
5
+
6
def compute_ner(logits, labels):
    """Normalized error rate: argmax error rate divided by the error rate of
    the majority-class ("naive") predictor."""
    predictions = logits.argmax(axis=1)
    error_rate = np.mean(predictions != labels)
    majority_class = np.bincount(labels, minlength=logits.shape[1]).argmax()
    baseline_error = np.mean(labels != majority_class)
    return error_rate / baseline_error
11
+
12
def compute_nce(logits, labels):
    """Normalized cross-entropy: CE of the (softmaxed) logits divided by the
    CE of the empirical class priors."""
    logprobs = log_softmax(logits, axis=1)
    ce = -logprobs[np.arange(labels.size), labels].mean()
    priors = np.bincount(labels, minlength=logits.shape[1]) / labels.size
    baseline_ce = -np.log(priors[labels]).mean()
    return ce / baseline_ce
17
+
18
def compute_nbrier(logits, labels):
    """Normalized Brier score: Brier of the softmaxed logits divided by the
    Brier score of always predicting the empirical class priors."""
    n = len(labels)
    one_hot = np.zeros(logits.shape)
    one_hot[np.arange(n), labels] = 1
    probs = softmax(logits, axis=1)
    brier = np.mean((one_hot - probs) ** 2)
    priors = np.bincount(labels, minlength=logits.shape[1]) / n
    baseline_brier = np.mean((one_hot - priors) ** 2)
    return brier / baseline_brier
25
+
26
def compute_cal_loss(logits, labels, mode="trainontest", metric="nce"):
    """Relative improvement in `metric` obtained by post-hoc calibration.

    Args:
        logits: (N, C) numpy array of scores.
        labels: (N,) numpy array of labels.
        mode: "trainontest" (oracle calibration) or "xval" (cross-validated).
        metric: name understood by compute_metric (e.g. "nce").
    Returns:
        (raw - calibrated) / raw, i.e. the fraction of the loss removable by
        calibration.
    Raises:
        ValueError: if `mode` is not recognized.
    """
    if mode == "trainontest":
        cal_logprobs = train_cal_on_test(logits, labels)
    elif mode == "xval":
        cal_logprobs = calibrate_xval(logits, labels, seed=1234, condition_ids=None, stratified=True, nfolds=5)
    else:
        raise ValueError(f"Unknown mode: {mode}")
    raw_loss = compute_metric(logits, labels, metric)
    calibrated_loss = compute_metric(cal_logprobs, labels, metric)
    return (raw_loss - calibrated_loss) / raw_loss
36
+
37
def compute_ece(logits, labels):
    """Expected Calibration Error with 10 equal-width confidence bins.

    ECE = sum over bins of |mean confidence - accuracy| weighted by the
    fraction of samples in the bin.

    Fix: bins are half-open intervals (lower, upper]. The original used two
    strict inequalities, so a sample whose confidence sits exactly on a bin
    boundary — in particular confidence exactly 1.0 — was never counted in
    any bin and silently dropped from the ECE.

    Args:
        logits: (N, C) numpy array of scores.
        labels: (N,) numpy array of integer labels.
    Returns:
        float ECE in [0, 1].
    """
    n_bins = 10
    bin_boundaries = np.linspace(0, 1, n_bins + 1)
    bin_lowers = bin_boundaries[:-1]
    bin_uppers = bin_boundaries[1:]

    softmaxes = softmax(logits, axis=1)
    confidences = softmaxes.max(axis=1)
    predictions = softmaxes.argmax(axis=1)
    accuracies = predictions == labels

    ece = 0
    for bin_lower, bin_upper in zip(bin_lowers, bin_uppers):
        # Half-open bin (lower, upper] so boundary confidences are binned.
        in_bin = (confidences > bin_lower) & (confidences <= bin_upper)
        prop_in_bin = in_bin.mean()
        if prop_in_bin > 0:
            accuracy_in_bin = accuracies[in_bin].mean()
            avg_confidence_in_bin = confidences[in_bin].mean()
            ece += np.abs(avg_confidence_in_bin - accuracy_in_bin) * prop_in_bin
    return ece
57
+
58
def compute_metric(logits, labels, metric):
    """Dispatch to the metric implementation named by `metric`.

    Plain metrics: "ner", "nce", "nbrier", "ece". Calibration-loss metrics
    are encoded as "calloss_<psr>_<mode>" (e.g. "calloss_nce_xval").

    Raises:
        ValueError: if `metric` is not recognized.
    """
    if "calloss" in metric:
        _, base_metric, mode = metric.split("_")
        return compute_cal_loss(logits, labels, mode, base_metric)
    dispatch = {
        "ner": compute_ner,
        "nce": compute_nce,
        "nbrier": compute_nbrier,
        "ece": compute_ece,
    }
    if metric not in dispatch:
        raise ValueError(f"Unknown metric: {metric}")
    return dispatch[metric](logits, labels)
72
+
73
+
74
def compute_psr_with_mincal(logits, labels, psr, mode):
    """Compute a proper scoring rule before and after post-hoc calibration.

    Args:
        logits: (N, C) numpy array of scores.
        labels: (N,) numpy array of labels.
        psr: metric name understood by compute_metric (e.g. "nce").
        mode: "trainontest", "xval", or "none" (no calibration; the raw
            scores are reused as-is).
    Returns:
        (loss, cal_loss) tuple of the raw and calibrated metric values.
    Raises:
        ValueError: if `mode` is not recognized.
    """
    if mode == "none":
        cal_logprobs = logits
    elif mode == "trainontest":
        cal_logprobs = train_cal_on_test(logits, labels)
    elif mode == "xval":
        cal_logprobs = calibrate_xval(logits, labels, seed=1234, condition_ids=None, stratified=True, nfolds=5)
    else:
        raise ValueError(f"Unknown mode: {mode}")
    loss = compute_metric(logits, labels, psr)
    cal_loss = compute_metric(cal_logprobs, labels, psr)
    return loss, cal_loss
src/llmcal/src/loggers.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ import os
4
+ from lightning.pytorch.loggers import TensorBoardLogger, CSVLogger as _CSVLogger
5
+ import pandas as pd
6
+ from .utils import save_yaml
7
+
8
class TBLogger(TensorBoardLogger):
    """TensorBoard logger addressed by a single run directory.

    Lightning's TensorBoardLogger wants (save_dir, name, version); this
    wrapper splits a full path "<parent>/<leaf>" into those pieces so logs
    land exactly in `save_dir`, and additionally persists hyperparams to a
    YAML file in the run directory.
    """

    def __init__(self, save_dir):
        # Split the path into parent dir and final component (the "version").
        _save_dir = "/".join(save_dir.split("/")[:-1])
        _version = save_dir.split("/")[-1]
        super().__init__(
            save_dir=_save_dir,
            name="",
            version=_version,
            log_graph=False,
            default_hp_metric=False,
            prefix="",
            sub_dir=None,
        )

    def log_hyperparams(self, hyperparams, metrics = None):
        # Log to TensorBoard as usual, then also dump a human-readable YAML
        # copy next to the event files.
        super().log_hyperparams(hyperparams, metrics)
        save_yaml(hyperparams, os.path.join(self.log_dir, "hyperparams.yaml"))
26
+
27
+
28
class CSVLogger(_CSVLogger):
    """CSV logger addressed by a single run directory, resume-friendly.

    Splits "<parent>/<leaf>" into Lightning's (save_dir, version) pair, and
    if a metrics.csv already exists in the run directory, preloads it so a
    resumed run appends to (instead of overwriting) previous metrics.
    """

    def __init__(self, save_dir):
        # Split the path into parent dir and final component (the "version").
        _save_dir = "/".join(save_dir.split("/")[:-1])
        _version = save_dir.split("/")[-1]
        super().__init__(
            save_dir=_save_dir,
            name="",
            version=_version,
            prefix="",
            flush_logs_every_n_steps=1,
        )
        # Resume support: seed the experiment's in-memory metrics with any
        # rows already on disk so they are not lost on the next flush.
        if os.path.exists(os.path.join(self.log_dir, "metrics.csv")):
            self.experiment.metrics = pd.read_csv(os.path.join(self.log_dir, "metrics.csv")).to_dict(orient="records")
src/llmcal/src/prompts/__init__.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ from .llama3 import Llama3Prompt
2
+ from .phi import Phi3Prompt
3
+ from .tinyllama import TinyLlamaPrompt
4
+ from .pythia import PythiaPrompt
5
+ from .gemma import GemmaPrompt
6
+ from .qwen import QwenPrompt
src/llmcal/src/prompts/gemma.py ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
class GemmaPrompt:
    """Builds Gemma chat-formatted prompts, optionally few-shot.

    `fit` assembles a template containing an "{inpt}" placeholder; `apply`
    fills it with (truncated) input texts.
    """

    def __init__(self, max_characters=400):
        # Input and shot texts are truncated to this many characters.
        self.max_characters = max_characters
        # Template with the "{inpt}" placeholder; set by fit().
        self.prompt = None

    def apply(self, text):
        """Return one filled prompt per input text."""
        limit = self.max_characters
        return [self.prompt.replace("{inpt}", t[:limit]) for t in text]

    def fit(self, prompt_template, shots):
        """Assemble the template from the instruction and few-shot examples."""
        preface = (
            "<start_of_turn>user\n"
            f"{prompt_template}\n\n"
        )
        output_preface = (
            "{inpt}<end_of_turn>\n"
            "<start_of_turn>model\n"
        )

        if len(shots) == 0:
            self.prompt = preface + output_preface
            return self

        shot_template = (
            "{shot_inpt}<end_of_turn>\n"
            "<start_of_turn>model\n{shot_label}<end_of_turn>\n<start_of_turn>user\n"
        )
        rendered_shots = "".join(
            shot_template.format(shot_inpt=shot["text"][:self.max_characters], shot_label=shot["label"])
            for shot in shots
        )
        self.prompt = preface + rendered_shots + output_preface
        return self
src/llmcal/src/prompts/llama3.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
class Llama3Prompt:
    """Builds Llama-3 chat-formatted prompts, optionally few-shot.

    `fit` assembles a template containing an "{inpt}" placeholder; `apply`
    fills it with (truncated) input texts.
    """

    def __init__(self, max_characters=400):
        # Input and shot texts are truncated to this many characters.
        self.max_characters = max_characters
        # Template with the "{inpt}" placeholder; set by fit().
        self.prompt = None

    def apply(self, text):
        """Return one filled prompt per input text."""
        limit = self.max_characters
        return [self.prompt.replace("{inpt}", t[:limit]) for t in text]

    def fit(self, prompt_template, shots):
        """Assemble the template from the instruction and few-shot examples."""
        preface = (
            "<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\n"
            f"{prompt_template}<|eot_id|>"  # No newline
        )
        output_preface = (
            "<|start_header_id|>user<|end_header_id|>\n\n{inpt}<|eot_id|>"  # No newline
            "<|start_header_id|>assistant<|end_header_id|>\n\n"
        )

        if len(shots) == 0:
            self.prompt = preface + output_preface
            return self

        shot_template = (
            "<|start_header_id|>user<|end_header_id|>\n\n{shot_inpt}<|eot_id|>"  # No newline
            "<|start_header_id|>assistant<|end_header_id|>\n\n{shot_label}<|eot_id|>"  # No newline
        )
        rendered_shots = "".join(
            shot_template.format(shot_inpt=shot["text"][:self.max_characters], shot_label=shot["label"])
            for shot in shots
        )
        self.prompt = preface + rendered_shots + output_preface
        return self
src/llmcal/src/prompts/phi.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
class Phi3Prompt:
    """Builds Phi-3 chat-formatted prompts, optionally few-shot.

    Fix: `apply` substitutes the "{inpt}" placeholder with `str.replace`
    instead of `str.format`. The old `.format` call raised (KeyError /
    IndexError / ValueError) whenever the input text — or a few-shot text or
    label already inserted into the template by `fit` — contained literal
    braces. This also matches the sibling Gemma/Llama3/Qwen prompt classes.
    """

    def __init__(self, max_characters=400):
        # Input and shot texts are truncated to this many characters.
        self.max_characters = max_characters
        # Template with the "{inpt}" placeholder; set by fit().
        self.prompt = None

    def apply(self, text):
        """Return one filled prompt per input text."""
        filled_prompts = []
        for t in text:
            # replace, not format: robust to braces in the surrounding text.
            filled_prompts.append(self.prompt.replace("{inpt}", t[:self.max_characters]))
        return filled_prompts

    def fit(self, prompt_template, shots):
        """Assemble the template from the instruction and few-shot examples."""
        preface = (
            f'<|system|>\n{prompt_template}<|end|>\n'
        )
        output_preface = (
            "<|user|>\n{inpt}<|end|>\n"
            "<|assistant|>\n"
        )

        if len(shots) == 0:
            self.prompt = preface + output_preface
            return self

        shot_template = (
            "<|user|>\n{shot_inpt}<|end|>\n"
            "<|assistant|>\n{shot_label}<|end|>\n"
        )
        shots_prompt = ""
        for shot in shots:
            shots_prompt += shot_template.format(shot_inpt=shot["text"][:self.max_characters], shot_label=shot["label"])
        self.prompt = preface + shots_prompt + output_preface

        return self
src/llmcal/src/prompts/pythia.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
class PythiaPrompt:
    """Builds chat-formatted prompts for Pythia, optionally few-shot.

    NOTE(review): this template reuses the Phi-style <|system|>/<|user|>/
    <|assistant|>/<|end|> tokens — confirm that is intentional for Pythia.

    Fix: `apply` substitutes the "{inpt}" placeholder with `str.replace`
    instead of `str.format`. The old `.format` call raised whenever the
    input text — or a few-shot text or label already inserted into the
    template by `fit` — contained literal braces. This also matches the
    sibling Gemma/Llama3/Qwen prompt classes.
    """

    def __init__(self, max_characters=400):
        # Input and shot texts are truncated to this many characters.
        self.max_characters = max_characters
        # Template with the "{inpt}" placeholder; set by fit().
        self.prompt = None

    def apply(self, text):
        """Return one filled prompt per input text."""
        filled_prompts = []
        for t in text:
            # replace, not format: robust to braces in the surrounding text.
            filled_prompts.append(self.prompt.replace("{inpt}", t[:self.max_characters]))
        return filled_prompts

    def fit(self, prompt_template, shots):
        """Assemble the template from the instruction and few-shot examples."""
        preface = (
            f'<|system|>\n{prompt_template}<|end|>\n'
        )
        output_preface = (
            "<|user|>\n{inpt}<|end|>\n"
            "<|assistant|>\n"
        )

        if len(shots) == 0:
            self.prompt = preface + output_preface
            return self

        shot_template = (
            "<|user|>\n{shot_inpt}<|end|>\n"
            "<|assistant|>\n{shot_label}<|end|>\n"
        )
        shots_prompt = ""
        for shot in shots:
            shots_prompt += shot_template.format(shot_inpt=shot["text"][:self.max_characters], shot_label=shot["label"])
        self.prompt = preface + shots_prompt + output_preface

        return self
src/llmcal/src/prompts/qwen.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
class QwenPrompt:
    """Builds Qwen (ChatML) formatted prompts, optionally few-shot.

    `fit` assembles a template containing an "{inpt}" placeholder; `apply`
    fills it with (truncated) input texts.
    """

    def __init__(self, max_characters=400):
        # Input and shot texts are truncated to this many characters.
        self.max_characters = max_characters
        # Template with the "{inpt}" placeholder; set by fit().
        self.prompt = None

    def apply(self, text):
        """Return one filled prompt per input text."""
        limit = self.max_characters
        return [self.prompt.replace("{inpt}", t[:limit]) for t in text]

    def fit(self, prompt_template, shots):
        """Assemble the template from the instruction and few-shot examples."""
        preface = (
            f"<|im_start|>system\n{prompt_template}<|im_end|>\n"
        )
        output_preface = (
            "<|im_start|>user\n{inpt}<|im_end|>\n"
            "<|im_start|>assistant\n"
        )

        if len(shots) == 0:
            self.prompt = preface + output_preface
            return self

        shot_template = (
            "<|im_start|>user\n{shot_inpt}<|im_end|>\n"
            "<|im_start|>assistant\n{shot_label}<|im_end|>\n"
        )
        rendered_shots = "".join(
            shot_template.format(shot_inpt=shot["text"][:self.max_characters], shot_label=shot["label"])
            for shot in shots
        )
        self.prompt = preface + rendered_shots + output_preface
        return self
src/llmcal/src/prompts/tinyllama.py ADDED
@@ -0,0 +1,40 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+
3
+
4
class TinyLlamaPrompt:
    """Builds TinyLlama (Zephyr-style) chat prompts, optionally few-shot.

    Fix: `apply` substitutes the "{inpt}" placeholder with `str.replace`
    instead of `str.format`. The old `.format` call raised whenever the
    input text — or a few-shot text or label already inserted into the
    template by `fit` — contained literal braces. This also matches the
    sibling Gemma/Llama3/Qwen prompt classes.
    """

    def __init__(self, max_characters=400):
        # Input and shot texts are truncated to this many characters.
        self.max_characters = max_characters
        # Template with the "{inpt}" placeholder; set by fit().
        self.prompt = None

    def apply(self, text):
        """Return one filled prompt per input text."""
        filled_prompts = []
        for t in text:
            # replace, not format: robust to braces in the surrounding text.
            filled_prompts.append(self.prompt.replace("{inpt}", t[:self.max_characters]))
        return filled_prompts

    def fit(self, prompt_template, shots):
        """Assemble the template from the instruction and few-shot examples."""
        preface = (
            "<|system|>\n"
            f"{prompt_template}</s>\n"
        )
        output_preface = (
            "<|user|>\n{inpt}</s>\n"
            "<|assistant|>\n"
        )

        if len(shots) == 0:
            self.prompt = preface + output_preface
            return self

        shot_template = (
            "<|user|>\n{shot_inpt}</s>\n"
            "<|assistant|>\n{shot_label}</s>\n"
        )
        shots_prompt = ""
        for shot in shots:
            shots_prompt += shot_template.format(shot_inpt=shot["text"][:self.max_characters], shot_label=shot["label"])
        self.prompt = preface + shots_prompt + output_preface

        return self
40
+
src/llmcal/src/utils.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ from functools import partial
3
+ from typing import Dict, List
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch import Tensor
8
+ from torch.utils.data import Dataset, DataLoader
9
+ import pandas as pd
10
+
11
+ import yaml
12
+
13
+ from litgpt import Tokenizer
14
+
15
def load_yaml(path):
    """Load a YAML file; an empty document yields {} instead of None."""
    with open(path, "r") as f:
        content = yaml.safe_load(f)
    return {} if content is None else content
21
+
22
def save_yaml(data: dict, path) -> None:
    """Serialize `data` as YAML to the file at `path` (overwrites)."""
    with open(path, "w") as handle:
        yaml.dump(data, handle)
25
+
26
class JSONDataset(Dataset):
    """Dataset of pre-tokenized prompts and candidate answers from JSONL files.

    Each path in `paths` is a JSON-lines file whose rows have at least
    `idx`, `prompt`, `answer` (a list of candidate answer strings) and
    `label`; `lsts` gives, per file, which `idx` values to keep and in what
    order. Rows from the i-th file are tagged task_id=i.
    """

    def __init__(self, paths, lsts, tokenizer):
        self.lsts = lsts
        self.paths = paths
        self.tokenizer = tokenizer  # litgpt Tokenizer
        data = []
        for i, (path, lst) in enumerate(zip(paths, lsts)):
            # Select and order rows according to the requested idx list.
            d = pd.read_json(path, lines=True).set_index("idx").loc[lst].reset_index(drop=False)
            d["task_id"] = i
            d = d.apply(self._transform, axis=1)
            data.append(d)
        self.data = pd.concat(data, ignore_index=False)

    def _transform(self, sample):
        # Tokenize the prompt with BOS; tokenize each candidate answer with
        # BOS then drop it ([1:]) so answers concatenate cleanly after the
        # prompt at scoring time.
        idx = torch.tensor(sample["idx"], dtype=torch.long)
        prompt_ids = self.tokenizer.encode(sample["prompt"], bos=True).long()
        answers_ids = [self.tokenizer.encode(ans, bos=True)[1:].long() for ans in sample["answer"]]
        label = torch.tensor(sample["label"], dtype=torch.long)
        task_id = torch.tensor(sample["task_id"], dtype=torch.long)
        return pd.Series({"idx": idx, "prompt_ids": prompt_ids, "answers_ids": answers_ids, "label": label, "task_id": task_id})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Positional access; returns a dict of tensors (see _transform).
        return self.data.iloc[idx].to_dict()
53
+
54
class Collator:
    """Batch collator that LEFT-pads prompt token sequences.

    The common prompt length is capped so that the longest candidate answer
    in the batch still fits within `max_seq_len`; prompts longer than the
    cap are truncated from the left (keeping their most recent tokens).
    Answers are passed through as ragged lists of tensors.
    """

    def __init__(self, pad_token_id, max_seq_len):
        self.pad_token_id = pad_token_id
        self.max_seq_len = max_seq_len

    def __call__(self, batch):
        longest_answer = max(max(ans.shape[0] for ans in sample["answers_ids"]) for sample in batch)
        longest_prompt = max(sample["prompt_ids"].shape[0] for sample in batch)
        # Leave room for the longest answer within the sequence budget.
        prompt_len = min(self.max_seq_len - longest_answer, longest_prompt)

        padded_prompts, masks, answers = [], [], []
        for sample in batch:
            seq = sample["prompt_ids"][-prompt_len:]  # keep the rightmost tokens
            n_pad = prompt_len - seq.shape[0]
            padded_prompts.append(torch.cat([torch.full((n_pad,), self.pad_token_id, dtype=torch.long), seq]))
            masks.append(torch.cat([torch.zeros(n_pad, dtype=torch.long), torch.ones(seq.shape[0], dtype=torch.long)]))
            answers.append(sample["answers_ids"])

        return {
            "idx": torch.stack([sample["idx"] for sample in batch]),
            "prompt_ids": torch.stack(padded_prompts),
            "prompt_mask": torch.stack(masks),
            "answers_ids": answers,
            "task_id": torch.stack([sample["task_id"] for sample in batch]),
            "label": torch.stack([sample["label"] for sample in batch]),
        }
81
+
82
+
83
def get_dataloader(data_paths, lsts, tokenizer, batch_size = 1, pad_token_id = 0, max_seq_length = 2048, shuffle = False, seed = 42):
    """Build a DataLoader over a JSONDataset with left-padding collation.

    Args:
        data_paths: JSONL files to read.
        lsts: per-file lists of `idx` values to keep (see JSONDataset).
        tokenizer: litgpt tokenizer used to encode prompts/answers.
        batch_size, pad_token_id, max_seq_length, shuffle: usual knobs.
        seed: seeds the shuffling generator for reproducibility.
    """
    collate = Collator(pad_token_id=pad_token_id, max_seq_len=max_seq_length)
    return DataLoader(
        JSONDataset(data_paths, lsts, tokenizer),
        batch_size=batch_size,
        collate_fn=collate,
        shuffle=shuffle,
        generator=torch.Generator().manual_seed(seed),
    )
src/llmcal/tests/__init__.py ADDED
File without changes
src/llmcal/tests/check_lists.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import numpy as np
3
+ import pandas as pd
4
+ from pathlib import Path
5
+ import yaml
6
+ from tqdm import tqdm
7
+
8
+ DATASETS = {"sst2": 2, "agnews": 4, "dbpedia": 14, "20newsgroups": 20, "banking77": 77}
9
+ N_SHOTS = [0, 1, 2, 4, 8, 16, 32, 64]
10
+ N_SEEDS = 5
11
+ FACTORS = [8, 32, 64, 128, 256, 512]
12
+ VAL_PROPS = [0.0, 0.3]
13
+ TEST_SAMPLES = {"sst2": 400, "agnews": 400, "dbpedia": 700, "20newsgroups": 800, "banking77": 1000}
14
+
15
def main():
    """Sanity-check the index lists under lists/<dataset>/ for every dataset.

    Verifies that the train/test/test_N lists reference existing rows with no
    duplicates and no train/test overlap, and that the per-(size, valprop,
    seed) matched/mismatched YAML lists have the expected sizes and
    disjointness. Fails with AssertionError on the first inconsistency.
    """
    for dataset in tqdm(DATASETS):
        num_classes = DATASETS[dataset]
        for factor in FACTORS:
            # Convert the "factor" budget into a sample count: samples per
            # class rounded to the nearest power of two, times num_classes.
            scale = factor / np.log2(num_classes)
            nearest_power_of_2 = 2 ** np.round(np.log2(scale)) # round to nearest power of 2
            num_samples = int(nearest_power_of_2 * num_classes)

            # Read data
            data = pd.read_csv(f"data/{dataset}/all.csv")

            # check train, test and test_nsamples lists are ok:
            # every listed index must exist in the data, with no duplicates.
            train_list = np.loadtxt(f"lists/{dataset}/train.txt")
            assert data.index.isin(train_list).sum() == len(train_list) and np.unique(train_list).size == len(train_list)

            test_list = np.loadtxt(f"lists/{dataset}/test.txt")
            assert data.index.isin(test_list).sum() == len(test_list) and np.unique(test_list).size == len(test_list)

            test_nsamples_list = np.loadtxt(f"lists/{dataset}/test_{TEST_SAMPLES[dataset]}.txt")
            assert data.index.isin(test_nsamples_list).sum() == len(test_nsamples_list) and np.unique(test_nsamples_list).size == len(test_nsamples_list)

            # Check no overlap between train and test, and train and test_nsamples
            assert len(np.intersect1d(train_list, test_list)) == 0
            assert len(np.intersect1d(train_list, test_nsamples_list)) == 0

            for valprop in VAL_PROPS:
                for seed in range(N_SEEDS):
                    with open(f"lists/{dataset}/size={factor}/valprop={valprop}/seed={seed}/matched.yaml", 'r') as file:
                        matched = yaml.load(file, Loader=yaml.FullLoader)

                    # Matched split: sizes follow valprop; the val split must
                    # be disjoint from train and test. When valprop == 0 the
                    # val list is expected to be a copy of the train list.
                    val_size = int(valprop * num_samples)
                    train_size = num_samples - val_size
                    assert len(matched["train"][dataset]) == train_size
                    if val_size > 0:
                        assert len(matched["val"][dataset]) == val_size
                        assert not np.isin(matched["val"][dataset], matched["train"][dataset]).any()
                        assert not np.isin(matched["val"][dataset], test_list).any()
                    else:
                        assert np.isin(matched["val"][dataset], matched["train"][dataset]).all()

                    with open(f"lists/{dataset}/size={factor}/valprop={valprop}/seed={seed}/mismatched.yaml", 'r') as file:
                        mismatched = yaml.load(file, Loader=yaml.FullLoader)

                    # Mismatched split must never train on the eval dataset.
                    assert all([train_dataset != dataset for train_dataset in mismatched["train"]])


    print("All lists are ok!")

if __name__ == '__main__':
    main()