"""
Unified evaluation script for semantic (LLM-based) and auto_metric (rule-based) evaluation.

This script:
1. Reads eval_rubrics.json (from 1_generate_review_based_rubrics.py) containing rubrics for each paper
2. Reads input JSON file containing model reviews (supports multiple formats)
3. Supports three evaluation modes:
   - semantic: LLM-based rubrics evaluation (from 2_evaluate_direct.py)
   - auto_metric: Rule-based metrics evaluation (from 3_rule_evaluate.py)
   - both: Run both evaluations separately
4. Supports strict mode: normalize scores to discrete scales before computing metrics (--strict_mode)
5. Outputs separate JSON files for results and summaries

Usage:
    # Semantic evaluation only
    python 2_evaluate.py \
        --rubrics_path eval_rubrics.json \
        --reviews_path model_reviews.json \
        --mode semantic \
        --yaml_path prompts.yaml \
        --config_path configs.yaml \
        --semantic_output semantic_results.json \
        --max_workers 5

    # Auto-metric evaluation only
    python 2_evaluate.py \
        --rubrics_path eval_rubrics.json \
        --reviews_path model_reviews.json \
        --mode auto_metric \
        --auto_metric_output auto_metric_results.json

    # Auto-metric evaluation with strict mode (normalize scores to discrete scales)
    python 2_evaluate.py \
        --rubrics_path eval_rubrics.json \
        --reviews_path model_reviews.json \
        --mode auto_metric \
        --auto_metric_output auto_metric_results.json \
        --strict_mode

    # Auto-metric evaluation with manually specified input format (refined)
    python 2_evaluate.py \
        --rubrics_path eval_rubrics.json \
        --reviews_path model_reviews.json \
        --mode auto_metric \
        --auto_metric_output auto_metric_results.json \
        --input_format refined

    # Auto-metric evaluation with manually specified input format (original)
    python 2_evaluate.py \
        --rubrics_path eval_rubrics.json \
        --reviews_path ours.json \
        --mode auto_metric \
        --auto_metric_output auto_metric_results.json \
        --input_format original

    # Both evaluations
    python 2_evaluate.py \
        --rubrics_path eval_rubrics.json \
        --reviews_path model_reviews.json \
        --mode both \
        --yaml_path prompts.yaml \
        --config_path configs.yaml \
        --semantic_output semantic_results.json \
        --auto_metric_output auto_metric_results.json \
        --max_workers 32
"""
from __future__ import annotations

import json
import os
import sys
import argparse
import yaml
import math
from typing import Dict, List, Any, Optional
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from itertools import combinations
from scipy.stats import spearmanr
from sklearn.metrics import precision_recall_fscore_support

# Add parent directory to path
sys.path.insert(0, os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Import parse_llm_response from local llm_service module
import llm_service as local_llm_service
parse_llm_response = local_llm_service.parse_llm_response

# Import from shared/utils for gpt/vllm support
project_root = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
if project_root not in sys.path:
    sys.path.insert(0, project_root)

from shared.utils.llm_service import LLMService
from shared.utils.vllm_service import VLLMService
from shared.utils.gpt_service import GPTService
sys.path.insert(0, os.path.join(project_root, 'shared', 'utils'))
from json_parser import parse_review_markdown

class ReviewProcessor:
    """Handles the extraction and processing of reviews from different sources."""

    @staticmethod
    def extract_review_content(pred_context):
        """
        Extract the review content from the prediction context.

        Args:
            pred_context: Raw prediction data that contains the review

        Returns:
            str: Extracted review content
        """
        try:
            # First attempt to extract from boxed format
            return pred_context.split(r'\boxed_review{')[-1].split('\n}')[0]
        except Exception:
            # Alternative extraction if the first method fails
            if isinstance(pred_context, dict) and 'output' in pred_context:
                return pred_context['output'].split(r'\boxed_review{')[-1].split('\n}')[0]
            else:
                # Return as is if extraction fails
                return pred_context


# ============================================================================
# Semantic Evaluation Functions (from 2_evaluate_direct.py)
# ============================================================================

def load_prompt_template(yaml_path: str) -> str:
    """Load the evaluator prompt from YAML file."""
    with open(yaml_path, 'r', encoding='utf-8') as f:
        prompts = yaml.safe_load(f)
    return prompts.get('v1_evaluator_prompt', '')


def build_evaluation_prompt(
    rubrics: List[Dict[str, Any]],
    paper_content: str,
    review: str,
    prompt_template: str
) -> str:
    """Build the evaluation prompt by replacing placeholders."""
    rubrics_json = json.dumps(rubrics, indent=4, ensure_ascii=False)
    prompt = prompt_template.replace('{rubrics_json}', rubrics_json)
    prompt = prompt.replace('<<paper_content>>', paper_content)
    prompt = prompt.replace('<<review>>', review)
    return prompt
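
# Example (illustrative): the prompt template loaded from the YAML file is assumed to
# contain the literal placeholders '{rubrics_json}', '<<paper_content>>' and '<<review>>',
# e.g. "Score the review against these rubrics:\n{rubrics_json}\nPaper:\n<<paper_content>>\nReview:\n<<review>>"
# build_evaluation_prompt() substitutes the serialized rubrics, the paper text and the model review.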


def calculate_weighted_scores(
    raw_scores: Dict[str, Dict[str, Any]], 
    rubrics: List[Dict[str, Any]]
) -> Dict[str, float]:
    """Calculate weighted scores for each rubric."""
    rubric_weights = {r['title']: r['weight'] for r in rubrics}
    weighted_scores = {}
    
    for rubric_title, rubric_data in raw_scores.items():
        if rubric_title not in rubric_weights:
            continue
        
        rubric_score = rubric_data.get('score', 0)
        if isinstance(rubric_score, str):
            try:
                rubric_score = int(rubric_score)
            except ValueError:
                rubric_score = 0
        
        if rubric_score not in [0, 1]:
            rubric_score = 1 if rubric_score > 0 else 0
        
        weight = rubric_weights[rubric_title]
        weighted_scores[rubric_title] = rubric_score * weight
    
    return weighted_scores
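
# Illustrative example for calculate_weighted_scores() (made-up rubric titles and weights):
#   rubrics    = [{'title': 'Clarity', 'weight': 2.0}, {'title': 'Novelty', 'weight': 1.0}]
#   raw_scores = {'Clarity': {'score': 1}, 'Novelty': {'score': 0}}
#   -> {'Clarity': 2.0, 'Novelty': 0.0}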


def calculate_scores(raw_scores: Dict[str, Dict[str, Any]]) -> Dict[str, float]:
    """Calculate scores for each rubric."""
    scores = {}
    for rubric_title, rubric_data in raw_scores.items():
        scores[rubric_title] = rubric_data.get('score', 0)
    return scores


def evaluate_review_semantic(
    entry: Dict[str, Any],
    paper_content: str,
    prompt_template: str,
    llm_service: LLMService
) -> Dict[str, Any]:
    """Evaluate a single review using article-specific rubrics."""
    entry_id = entry.get('id', 'unknown')
    rubrics = entry.get('rubrics', [])
    model_review = entry.get('model_review', '')
    
    if not rubrics:
        return {
            'id': entry_id,
            'raw_scores': {},
            'weighted_scores': {},
            'total_score': 0.0,
            'error': 'No valid rubrics found',
            'raw_response': ''
        }
    
    # Build prompt
    prompt = build_evaluation_prompt(rubrics, paper_content, model_review, prompt_template)
    
    # Call LLM
    try:
        messages = [{"role": "user", "content": prompt}]
        response = llm_service.generate(messages=messages)
        
        # Parse response
        raw_scores = parse_llm_response(response)
        weighted_scores = calculate_scores(raw_scores)
        total_score = sum(weighted_scores.values())
        
        return {
            'id': entry_id,
            'raw_scores': raw_scores,
            'weighted_scores': weighted_scores,
            'total_score': total_score,
            'raw_response': response
        }
    except Exception as e:
        print(f"[ERROR] Error evaluating review {entry_id}: {e}")
        return {
            'id': entry_id,
            'raw_scores': {},
            'weighted_scores': {},
            'total_score': 0.0,
            'error': str(e),
            'raw_response': ''
        }


def calculate_per_rubric_statistics(
    valid_results: List[Dict[str, Any]],
    rubric_titles: List[str]
) -> Dict[str, Dict[str, float]]:
    """Calculate per-rubric statistics from evaluation results."""
    rubric_scores = {title: [] for title in rubric_titles}
    
    for result in valid_results:
        weighted_scores = result.get('weighted_scores', {})
        if not isinstance(weighted_scores, dict):
            continue
        
        for rubric_title in rubric_titles:
            if rubric_title in weighted_scores:
                score = weighted_scores[rubric_title]
                if isinstance(score, str):
                    try:
                        score = float(score)
                    except ValueError:
                        continue
                elif isinstance(score, (int, float)):
                    score = float(score)
                else:
                    continue
                rubric_scores[rubric_title].append(score)
    
    per_rubric_stats = {}
    for rubric_title in rubric_titles:
        scores = rubric_scores[rubric_title]
        if not scores:
            continue
        
        mean_score = sum(scores) / len(scores)
        min_score = min(scores)
        max_score = max(scores)
        count = len(scores)
        
        if rubric_title == "False or Contradictory Claims":
            pass_count = sum(1 for s in scores if s >= 0)
        else:
            pass_count = sum(1 for s in scores if s >= 1)
        pass_rate = pass_count / count if count > 0 else 0.0
        
        per_rubric_stats[rubric_title] = {
            'mean': mean_score,
            'min': min_score,
            'max': max_score,
            'count': count,
            'pass_rate': pass_rate
        }
    
    return per_rubric_stats


# ============================================================================
# Auto-Metric Evaluation Functions (from 3_rule_evaluate.py)
# ============================================================================

def extract_scores_from_review(review_text: str) -> Dict[str, Any]:
    """Extract numeric scores and decision from a review markdown text."""
    if not review_text:
        return {'soundness': None, 'presentation': None, 'rating': None, 'confidence': None, 'decision': None}
    
    try:
        parsed = parse_review_markdown(review_text)
        decision = parsed.get('decision', '')
        if decision:
            decision_lower = decision.lower().strip()
            if 'accept' in decision_lower:
                decision = 'accept'
            elif 'reject' in decision_lower:
                decision = 'reject'
            elif 'undecided' in decision_lower:
                decision = 'undecided'
            else:
                decision = decision_lower
        else:
            decision = None
        
        return {
            'soundness': parsed.get('soundness'),
            'presentation': parsed.get('presentation'),
            'rating': parsed.get('rating'),
            'confidence': parsed.get('confidence'),
            'decision': decision
        }
    except Exception as e:
        print(f"Warning: Failed to parse review text: {e}")
        return {'soundness': None, 'presentation': None, 'rating': None, 'confidence': None, 'decision': None}
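
# Illustrative return value of extract_scores_from_review() for a parseable review
# (values made up; the fields mirror the keys read from parse_review_markdown above):
#   {'soundness': 3, 'presentation': 2, 'rating': 6, 'confidence': 4, 'decision': 'accept'}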


def calculate_mse(predicted: float, ground_truth: float) -> Optional[float]:
    """Calculate Mean Squared Error for a single value."""
    if predicted is None or ground_truth is None:
        return None
    return (predicted - ground_truth) ** 2


def calculate_mae(predicted: float, ground_truth: float) -> Optional[float]:
    """Calculate Mean Absolute Error for a single value."""
    if predicted is None or ground_truth is None:
        return None
    return abs(predicted - ground_truth)


def normalize_to_discrete_scale(score: Optional[float], scale_type: str) -> Optional[float]:
    """
    Normalize a float score to the nearest discrete value based on scale type.
    Uses round-half-up tie-breaking (e.g., 3.5 rounds to 4, 1.5 rounds to 2).
    
    Args:
        score: The float score to normalize (can be None)
        scale_type: Either '0-5' for 0-5 scale (discrete: 0,1,2,3,4,5) 
                    or '0-10' for 0-10 scale (discrete: 0,2,4,6,8,10)
    
    Returns:
        Normalized discrete score, or None if input is None
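
    Examples (illustrative):
        >>> normalize_to_discrete_scale(3.5, '0-5')   # tie between 3 and 4 -> round half up
        4
        >>> normalize_to_discrete_scale(7.2, '0-10')  # nearest even step on the 0-10 scale
        8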
    """
    if score is None:
        return None
    
    try:
        score = float(score)
    except (ValueError, TypeError):
        return None
    
    if scale_type == '0-5':
        # Discrete values: 0, 1, 2, 3, 4, 5
        discrete_values = [0, 1, 2, 3, 4, 5]
        score = max(0, min(5, score))
    elif scale_type == '0-10':
        # Discrete values: 0, 2, 4, 6, 8, 10
        discrete_values = [0, 2, 4, 6, 8, 10]
        score = max(0, min(10, score))
    else:
        raise ValueError(f"Unknown scale_type: {scale_type}. Must be '0-5' or '0-10'")

    # Find the nearest discrete value; on ties prefer the higher value (round-half-up)
    best_value = None
    best_distance = float('inf')
    for val in discrete_values:
        distance = abs(val - score)
        if distance < best_distance or (distance == best_distance and val > best_value):
            best_distance = distance
            best_value = val
    return best_value


def normalize_scores_dict(scores: Dict[str, Optional[float]]) -> Dict[str, Optional[float]]:
    """
    Normalize all scores in a dictionary to their appropriate discrete scales.
    
    Args:
        scores: Dictionary with keys 'soundness', 'presentation', 'rating', 'confidence'
    
    Returns:
        Dictionary with normalized scores
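
    Example (illustrative):
        {'soundness': 2.75, 'presentation': 3.4, 'rating': 6.8, 'confidence': None}
        -> {'soundness': 3, 'presentation': 3, 'rating': 6, 'confidence': None}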
    """
    normalized = {}
    
    # soundness, presentation, confidence use 0-5 scale
    for key in ['soundness', 'presentation', 'confidence']:
        normalized[key] = normalize_to_discrete_scale(scores.get(key), '0-5')
    
    # rating uses 0-10 scale
    normalized['rating'] = normalize_to_discrete_scale(scores.get('rating'), '0-10')
    
    return normalized


def calculate_score_metrics(
    model_scores: Dict[str, float],
    ground_truth_scores: Dict[str, float],
    normalize: bool = False
) -> Dict[str, Any]:
    """
    Calculate MSE and MAE metrics for each scoring dimension.
    
    Args:
        model_scores: Dictionary with model scores
        ground_truth_scores: Dictionary with ground truth scores
        normalize: If True, normalize scores to discrete scales before computing metrics
    
    Returns:
        Dictionary with MSE, MAE metrics and optionally normalized scores
    """
    dimensions = ['soundness', 'presentation', 'rating', 'confidence']
    
    # Normalize scores to discrete scales if requested
    if normalize:
        model_scores_normalized = normalize_scores_dict(model_scores)
        gt_scores_normalized = normalize_scores_dict(ground_truth_scores)
    else:
        model_scores_normalized = model_scores
        gt_scores_normalized = ground_truth_scores
    
    mse_values = {}
    mae_values = {}
    valid_count = 0
    
    for dim in dimensions:
        # Use normalized scores for metric calculation
        mse = calculate_mse(model_scores_normalized.get(dim), gt_scores_normalized.get(dim))
        mae = calculate_mae(model_scores_normalized.get(dim), gt_scores_normalized.get(dim))
        mse_values[f'{dim}_mse'] = mse
        mae_values[f'{dim}_mae'] = mae
        if mse is not None:
            valid_count += 1
    
    overall_error = sum([v for v in mse_values.values() if v is not None])
    
    result = {
        **mse_values,
        **mae_values,
        'overall_error': overall_error if valid_count > 0 else None,
        'valid_dimensions': valid_count
    }
    
    # Include normalized scores in result for transparency (only if normalize=True)
    if normalize:
        result['model_scores_normalized'] = model_scores_normalized
        result['gt_scores_normalized'] = gt_scores_normalized
    
    return result
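
# Illustrative example for calculate_score_metrics() with normalize=False (made-up scores):
#   model_scores = {'soundness': None, 'presentation': None, 'rating': 8, 'confidence': None}
#   gt_scores    = {'soundness': None, 'presentation': None, 'rating': 6, 'confidence': None}
#   -> rating_mse = 4, rating_mae = 2, all other *_mse/*_mae = None,
#      overall_error = 4, valid_dimensions = 1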


def normalize_score_value(value):
    """Normalize score value to float, handling string representations."""
    if value is None:
        return None
    if isinstance(value, (int, float)):
        return float(value)
    if isinstance(value, str):
        # Try to extract a numeric value from the string (e.g., "2.75" -> 2.75)
        try:
            import re
            match = re.search(r'(\d+\.?\d*)', value)
            if match:
                return float(match.group(1))
        except Exception:
            pass
    return None


def normalize_decision(decision):
    """Normalize decision string to standard format."""
    if decision is None:
        return None
    decision_lower = str(decision).lower().strip()
    if 'accept' in decision_lower:
        return 'accept'
    elif 'reject' in decision_lower:
        return 'reject'
    elif 'undecided' in decision_lower:
        return 'undecided'
    else:
        return decision_lower


def extract_scores_from_dict(scores_dict: Dict[str, Any]) -> Dict[str, Any]:
    """
    Extract scores from a structured dictionary (scores or initial_scores format).
    
    Args:
        scores_dict: Dict containing scores (e.g., {'rating': 5.75, 'soundness': '2.75', ...})
    
    Returns:
        Dict with normalized scores: {'soundness', 'presentation', 'rating', 'confidence', 'decision'}
    """
    if not scores_dict:
        return {
            'soundness': None,
            'presentation': None,
            'rating': None,
            'confidence': None,
            'decision': None
        }
    
    return {
        'soundness': normalize_score_value(scores_dict.get('soundness')),
        'presentation': normalize_score_value(scores_dict.get('presentation')),
        'rating': normalize_score_value(scores_dict.get('rating')),
        'confidence': normalize_score_value(scores_dict.get('confidence')),
        'decision': normalize_decision(scores_dict.get('decision'))
    }


def evaluate_review_auto_metric(entry: Dict[str, Any], use_initial_scores: bool = False, strict_mode: bool = False) -> Dict[str, Any]:
    """
    Evaluate a single entry by extracting scores and calculating metrics.
    
    Args:
        entry: Evaluation entry containing model_review, scores, initial_scores, etc.
        use_initial_scores: If True, use initial_scores instead of refined scores (for refined format)
        strict_mode: If True, normalize scores to discrete scales before computing MSE/MAE
    
    Returns:
        Dict containing evaluation metrics
    """
    entry_id = entry.get('id', 'unknown')
    model_review = entry.get('model_review', '')
    format_type = entry.get('format', 'unknown')
    
    # Extract scores based on format
    model_scores = {}
    model_decision = None
    
    if format_type == 'refined' and not use_initial_scores:
        # Use refined scores from structured data
        scores_dict = entry.get('scores', {})
        model_data = extract_scores_from_dict(scores_dict)
        model_scores = {
            'soundness': model_data.get('soundness'),
            'presentation': model_data.get('presentation'),
            'rating': model_data.get('rating'),
            'confidence': model_data.get('confidence')
        }
        model_decision = model_data.get('decision')
    elif format_type == 'refined' and use_initial_scores:
        # Use initial scores from structured data
        initial_scores_dict = entry.get('initial_scores', {})
        model_data = extract_scores_from_dict(initial_scores_dict)
        model_scores = {
            'soundness': model_data.get('soundness'),
            'presentation': model_data.get('presentation'),
            'rating': model_data.get('rating'),
            'confidence': model_data.get('confidence')
        }
        model_decision = model_data.get('decision')
    elif format_type == 'original':
        # Use initial scores from structured data
        initial_scores_dict = entry.get('initial_scores', {})
        model_data = extract_scores_from_dict(initial_scores_dict)
        model_scores = {
            'soundness': model_data.get('soundness'),
            'presentation': model_data.get('presentation'),
            'rating': model_data.get('rating'),
            'confidence': model_data.get('confidence')
        }
        model_decision = model_data.get('decision')
        
        # Fallback: If confidence is missing from structured data, try to extract from review text
        # (meta_review may not have confidence field, but review text might)
        if model_scores.get('confidence') is None and model_review:
            try:
                review_data = extract_scores_from_review(model_review)
                if review_data.get('confidence') is not None:
                    model_scores['confidence'] = review_data.get('confidence')
            except Exception:
                pass  # Keep confidence as None if extraction fails
    else:
        # Fallback: extract from markdown review text
        model_data = extract_scores_from_review(model_review)
        model_scores = {
            'soundness': model_data.get('soundness'),
            'presentation': model_data.get('presentation'),
            'rating': model_data.get('rating'),
            'confidence': model_data.get('confidence')
        }
        model_decision = model_data.get('decision')
    
    # Get ground truth scores from golden_review ONLY
    # Ground truth must ONLY come from golden_review, never from model output
    # If extraction fails, leave fields as None (do not use model_review as fallback)
    ground_truth_review = entry.get('golden_review', '')
    ground_truth_scores = {}
    gt_decision = None
    
    if not ground_truth_review:
        print(f"Warning: No golden_review found for entry {entry_id}. Ground truth scores will be empty.")
    else:
        try:
            # Extract scores from golden_review markdown text
            gt_data = extract_scores_from_review(ground_truth_review)
            if not gt_data:
                print(f"Warning: Failed to parse golden_review for entry {entry_id}. Ground truth scores will be empty.")
            else:
                ground_truth_scores = {
                    'soundness': gt_data.get('soundness'),
                    'presentation': gt_data.get('presentation'),
                    'rating': gt_data.get('rating'),
                    'confidence': gt_data.get('confidence')
                }
                gt_decision = normalize_decision(gt_data.get('decision'))
                # Note: If any field is None, it stays None - we do NOT use model_review as fallback
                # Using model output as ground truth would inflate evaluation scores
        except Exception as e:
            print(f"Warning: Failed to extract scores from golden_review for {entry_id}: {e}")
            print(f"  Ground truth scores will be empty. Error: {str(e)}")
    
    # Calculate MSE and MAE metrics (with optional normalization in strict mode)
    score_metrics = calculate_score_metrics(model_scores, ground_truth_scores, normalize=strict_mode)
    
    # Calculate decision accuracy
    decision_match = False
    decision_accuracy = None
    if model_decision is not None and gt_decision is not None:
        model_decision_normalized = normalize_decision(model_decision)
        decision_match = (model_decision_normalized == gt_decision)
        decision_accuracy = 1.0 if decision_match else 0.0
    
    result = {
        'id': entry_id,
        'format': format_type,
        'model_soundness': model_scores.get('soundness'),
        'model_presentation': model_scores.get('presentation'),
        'model_rating': model_scores.get('rating'),
        'model_confidence': model_scores.get('confidence'),
        'model_decision': model_decision,
        'gt_soundness': ground_truth_scores.get('soundness'),
        'gt_presentation': ground_truth_scores.get('presentation'),
        'gt_rating': ground_truth_scores.get('rating'),
        'gt_confidence': ground_truth_scores.get('confidence'),
        'gt_decision': gt_decision,
        'decision_match': decision_match,
        'decision_accuracy': decision_accuracy,
        **score_metrics
    }
    
    # Add prefix to indicate which scores were used
    if format_type == 'refined':
        if use_initial_scores:
            result['score_type'] = 'initial'
        else:
            result['score_type'] = 'refined'
    else:
        result['score_type'] = 'auto'
    
    return result


def calculate_pairwise_accuracies(paper_scores: List[Dict[str, float]]) -> Dict[str, float]:
    """Calculate pairwise accuracy for each metric by comparing rankings."""
    if len(paper_scores) < 2:
        return {}
    
    total_valid_pairs = {'rating': 0, 'soundness': 0, 'presentation': 0, 'confidence': 0}
    correct_pairs = {'rating': 0, 'soundness': 0, 'presentation': 0, 'confidence': 0}
    
    for paper1, paper2 in combinations(paper_scores, 2):
        # Compare predicted vs. ground-truth ordering for every metric that has
        # values in both papers of the pair
        for metric in ['rating', 'soundness', 'presentation', 'confidence']:
            true_key = f'true_{metric}'
            pred_key = f'pred_{metric}'
            if (paper1.get(true_key) is not None and paper2.get(true_key) is not None and
                paper1.get(pred_key) is not None and paper2.get(pred_key) is not None):
                total_valid_pairs[metric] += 1
                true_order = paper1[true_key] > paper2[true_key]
                pred_order = paper1[pred_key] > paper2[pred_key]
                if true_order == pred_order:
                    correct_pairs[metric] += 1
    
    pairwise_accuracies = {
        metric: correct_pairs[metric] / total_valid_pairs[metric] if total_valid_pairs[metric] > 0 else 0.0
        for metric in ['rating', 'soundness', 'presentation', 'confidence']
    }
    
    return pairwise_accuracies
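
# Illustrative input for calculate_pairwise_accuracies() (keys inferred from the lookups
# above, values made up):
#   [{'true_rating': 6, 'pred_rating': 8}, {'true_rating': 3, 'pred_rating': 5}]
# The predicted rating order matches the true order for the only valid pair, so the
# returned 'rating' accuracy is 1.0; metrics without any valid pairs are reported as 0.0.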


# ============================================================================
# Data Loading Functions
# ============================================================================

def load_rubrics_json(rubrics_path: str) -> Dict[str, Dict[str, Any]]:
    """Load rubrics JSON and create lookup by id."""
    with open(rubrics_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    if isinstance(data, list):
        return {item['id']: item for item in data}
    elif isinstance(data, dict):
        return data
    else:
        raise ValueError(f"Invalid rubrics JSON format: expected list or dict, got {type(data)}")


def load_model_reviews_json(reviews_path: str, format_override: Optional[str] = None) -> Dict[str, Dict[str, Any]]:
    """
    Load model reviews JSON and extract reviews by id.
    
    Supports two input formats:
    1. Refined format: Contains 'scores' and 'initial_scores' fields (from refinement pipeline)
    2. Original format: Contains 'model_prediction' with 'meta_review' and 'decision' (like ours.json)
    
    Args:
        reviews_path: Path to JSON file containing model reviews
        format_override: Optional format override ('refined', 'original', or None for auto-detect)
    
    Returns:
        Dict mapping paper_id to dict containing:
        - 'review': review text (markdown)
        - 'scores': refined scores dict (if available)
        - 'initial_scores': initial scores dict (if available)
        - 'format': 'refined' or 'original'
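
    Minimal illustrative inputs (field names taken from the parsing logic below, values made up):
        refined:  {"paper_id": "p1", "review_markdown": "...", "scores": {...}, "initial_scores": {...}}
        original: {"id": "p1", "model_prediction": {"meta_review": {"content": "...", "rating": 6},
                                                    "decision": "accept"}}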
    """
    with open(reviews_path, 'r', encoding='utf-8') as f:
        data = json.load(f)
    
    if isinstance(data, dict):
        data = list(data.values())
    
    reviews_dict = {}
    for item in data:
        item_id = None
        review_text = ''
        scores = None
        initial_scores = None
        format_type = None
        
        # Use format override if provided, otherwise auto-detect
        if format_override and format_override != 'auto':
            # Force use specified format
            if format_override == 'refined':
                item_id = item.get('paper_id') or item.get('id')
                if not item_id:
                    continue
                format_type = 'refined'
                review_text = item.get('review_markdown', '') or item.get('review', '')
                scores = item.get('scores', {})
                initial_scores = item.get('initial_scores', {})
            elif format_override == 'original':
                item_id = item.get('id')
                if not item_id:
                    continue
                format_type = 'original'
                model_prediction = item.get('model_prediction', {})
                meta_review = model_prediction.get('meta_review', {})
                review_text = meta_review.get('content', '') or model_prediction.get('raw_text', '')
                initial_scores = {
                    'rating': meta_review.get('rating'),
                    'soundness': meta_review.get('soundness'),
                    'presentation': meta_review.get('presentation'),
                    'contribution': meta_review.get('contribution'),
                    'decision': model_prediction.get('decision'),
                }
            else:
                raise ValueError(f"Unknown format_override: {format_override}. Must be 'refined', 'original', or 'auto'")
        else:
            # Auto-detect format
            if "paper_id" in item:
                # Refined format (from refinement pipeline)
                item_id = item.get('paper_id')
                if not item_id:
                    continue
                
                # Check if this is refined format (has scores and initial_scores)
                if 'scores' in item and 'initial_scores' in item:
                    format_type = 'refined'
                    review_text = item.get('review_markdown', '') or item.get('review', '')
                    scores = item.get('scores', {})
                    initial_scores = item.get('initial_scores', {})
                else:
                    # Standard format with paper_id
                    format_type = 'standard'
                    review_text = item.get('review_markdown', '') or item.get('review', '')
            elif "model_prediction" in item:
                # Original format (like ours.json)
                item_id = item.get('id')
                if not item_id:
                    continue
                
                format_type = 'original'
                model_prediction = item.get('model_prediction', {})
                meta_review = model_prediction.get('meta_review', {})
                
                # Extract review content (prefer meta_review.content, fallback to raw_text)
                review_text = meta_review.get('content', '') or model_prediction.get('raw_text', '')    
                    
                # Extract initial scores
                initial_scores = {
                    'rating': meta_review.get('rating'),
                    'soundness': meta_review.get('soundness'),
                    'presentation': meta_review.get('presentation'),
                    'contribution': meta_review.get('contribution'),
                    'decision': model_prediction.get('decision'),
                }
            else:
                # Legacy format (pred_fast_mode)
                item_id = item.get('id')
                if not item_id:
                    continue
                
                format_type = 'legacy'
                review_dict = item.get('pred_fast_mode', {})
                if isinstance(review_dict, dict):
                    # Pass the dict through; ReviewProcessor.extract_review_content
                    # handles dicts that carry the review under an 'output' key
                    review_text = review_dict
                else:
                    review_text = str(review_dict)
        
        # Extract review content from the review text field
        try:
            if review_text:
                extracted_review = ReviewProcessor.extract_review_content(review_text)
            else:
                extracted_review = ''
            
            reviews_dict[item_id] = {
                'review': extracted_review,
                'scores': scores,
                'initial_scores': initial_scores,
                'format': format_type
            }
        except Exception as e:
            print(f"[WARN] Failed to extract review for {item_id}: {e}")
            continue
    
    return reviews_dict


def combine_rubrics_and_reviews(
    rubrics_data: Dict[str, Dict[str, Any]],
    reviews_dict: Dict[str, Dict[str, Any]]
) -> List[Dict[str, Any]]:
    """
    Combine rubrics and reviews into evaluation entries.
    
    Args:
        rubrics_data: Dict mapping paper_id to rubric entry
        reviews_dict: Dict mapping paper_id to dict containing 'review', 'scores', 'initial_scores', 'format'
    
    Returns:
        List of evaluation entries with model_review, scores, initial_scores, and format info
    """
    combined = []
    missing_reviews = []
    
    for paper_id, rubric_entry in rubrics_data.items():
        review_data = reviews_dict.get(paper_id)
        if not review_data or not review_data.get('review'):
            missing_reviews.append(paper_id)
            continue
        
        entry = {
            'id': paper_id,
            'paper_context': rubric_entry.get('paper_context', ''),
            'decision': rubric_entry.get('decision', ''),
            'golden_review': rubric_entry.get('golden_review', ''),
            'rubrics': rubric_entry.get('rubrics', []),
            'model_review': review_data.get('review', ''),
            'scores': review_data.get('scores'),  # Refined scores (if available)
            'initial_scores': review_data.get('initial_scores'),  # Initial scores (if available)
            'format': review_data.get('format', 'unknown')  # Format type
        }
        combined.append(entry)
    
    if missing_reviews:
        print(f"[WARN] {len(missing_reviews)} papers have no model review, skipping them")
    
    return combined


# ============================================================================
# LLM Service Configuration
# ============================================================================

def load_llm_config(config_path: str) -> Dict[str, Any]:
    """Load LLM configuration from YAML file."""
    with open(config_path, 'r', encoding='utf-8') as f:
        config = yaml.safe_load(f)
    return config


def create_llm_service_from_config(config: Dict[str, Any]) -> LLMService:
    """Create LLM service from configuration."""
    mode = config.get('mode', 'gpt').lower()
    
    if mode == 'gpt':
        gpt_config = config.get('gpt', {})
        api_key = gpt_config.get('api_key') or os.getenv('OPENAI_API_KEY')
        if not api_key:
            raise ValueError("GPT mode requires api_key in configs.yaml or OPENAI_API_KEY environment variable")
        
        service = GPTService(
            api_key=api_key,
            model_name=gpt_config.get('model_name', 'gpt-4o'),
            base_url=gpt_config.get('base_url'),
            timeout=gpt_config.get('timeout', 300)
        )
        return service
        
    elif mode == 'vllm':
        vllm_config = config.get('vllm', {})
        service = VLLMService(
            base_url=vllm_config.get('base_url', 'http://localhost:8000/v1'),
            api_key=vllm_config.get('api_key', 'dummy-key'),
            model_name=vllm_config.get('model_name'),
            timeout=vllm_config.get('timeout', 300),
            max_concurrent_requests=vllm_config.get('max_concurrent_requests', 64),
            max_retries=vllm_config.get('max_retries', 3),
            retry_delay=vllm_config.get('retry_delay', 1.0),
            retry_backoff=vllm_config.get('retry_backoff', 2.0)
        )
        return service
        
    else:
        raise ValueError(f"Unknown mode: {mode}. Must be 'gpt' or 'vllm'")


# ============================================================================
# Main Evaluation Functions
# ============================================================================

def run_semantic_evaluation(
    evaluation_data: List[Dict[str, Any]],
    prompt_template: str,
    llm_service: LLMService,
    max_workers: int
) -> tuple:
    """Run semantic evaluation and return results and summary."""
    print(f"\n{'='*80}")
    print("RUNNING SEMANTIC EVALUATION")
    print(f"{'='*80}")
    print(f"Evaluating {len(evaluation_data)} reviews using {max_workers} workers...")
    
    results = []
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        future_to_entry = {
            executor.submit(
                evaluate_review_semantic,
                entry,
                entry['paper_context'],
                prompt_template,
                llm_service
            ): entry
            for entry in evaluation_data
        }
        
        for future in tqdm(as_completed(future_to_entry), total=len(evaluation_data), desc="Semantic evaluation"):
            try:
                result = future.result()
                results.append(result)
            except Exception as e:
                entry = future_to_entry[future]
                print(f"\n[ERROR] Failed to process entry {entry.get('id', 'unknown')}: {e}")
                results.append({
                    'id': entry.get('id', 'unknown'),
                    'raw_scores': {},
                    'weighted_scores': {},
                    'total_score': 0.0,
                    'error': str(e),
                    'raw_response': ''
                })
    
    # Calculate statistics
    valid_results = [r for r in results if 'error' not in r and r.get('weighted_scores')]
    review_scores = [r.get('total_score', 0.0) for r in valid_results]
    
    summary = {
        'total_entries': len(results),
        'valid_entries': len(valid_results),
        'failed_entries': len(results) - len(valid_results)
    }
    
    if review_scores:
        summary['overall_score'] = {
            'mean': sum(review_scores) / len(review_scores),
            'min': min(review_scores),
            'max': max(review_scores)
        }
    
    # Calculate per-rubric statistics (extract rubric titles from first entry)
    if evaluation_data and evaluation_data[0].get('rubrics'):
        rubric_titles = [r['title'] for r in evaluation_data[0]['rubrics']]
        per_rubric_stats = calculate_per_rubric_statistics(valid_results, rubric_titles)
        summary['per_rubric_statistics'] = per_rubric_stats
    
    return results, summary


def run_auto_metric_evaluation(
    evaluation_data: List[Dict[str, Any]],
    strict_mode: bool = False
) -> tuple:
    """
    Run auto-metric evaluation and return results and summary.
    
    For refined format (has scores and initial_scores), evaluates both:
    - Refined scores evaluation
    - Initial scores evaluation
    
    For original format (only initial_scores), evaluates:
    - Initial scores evaluation only

    Args:
        evaluation_data: Combined rubric/review entries (as produced by combine_rubrics_and_reviews)
        strict_mode: If True, normalize scores to discrete scales before computing MSE/MAE
    
    Returns:
        Tuple of (results_list, summary_dict)
        - results_list: List of evaluation results (may contain both refined and initial results for refined format)
        - summary_dict: Summary statistics
    """
    print(f"\n{'='*80}")
    print("RUNNING AUTO-METRIC EVALUATION")
    print(f"{'='*80}")
    print(f"Evaluating {len(evaluation_data)} entries...")
    
    # Detect format types
    refined_format_count = sum(1 for e in evaluation_data if e.get('format') == 'refined')
    original_format_count = sum(1 for e in evaluation_data if e.get('format') == 'original')
    
    if refined_format_count > 0:
        print(f"Detected {refined_format_count} entries in refined format (will evaluate both refined and initial scores)")
    if original_format_count > 0:
        print(f"Detected {original_format_count} entries in original format (will evaluate initial scores only)")
    
    results = []
    for entry in tqdm(evaluation_data, desc="Auto-metric evaluation"):
        format_type = entry.get('format', 'unknown')
        
        if format_type == 'refined':
            # Evaluate both refined scores and initial scores
            try:
                entry_id = entry.get('id', 'unknown')
                
                # Evaluate refined scores
                refined_result = evaluate_review_auto_metric(entry, use_initial_scores=False, strict_mode=strict_mode)
                refined_result['paper_id'] = entry_id  # Keep original paper_id
                refined_result['id'] = f"{entry_id}_refined"
                results.append(refined_result)
                
                # Evaluate initial scores
                initial_result = evaluate_review_auto_metric(entry, use_initial_scores=True, strict_mode=strict_mode)
                initial_result['paper_id'] = entry_id  # Keep original paper_id
                initial_result['id'] = f"{entry_id}_initial"
                results.append(initial_result)
            except Exception as e:
                print(f"Error evaluating entry {entry.get('id', 'unknown')}: {e}")
                results.append({
                    'id': entry.get('id', 'unknown'),
                    'error': str(e)
                })
        else:
            # Evaluate initial scores only (or extract from markdown)
            try:
                result = evaluate_review_auto_metric(entry, use_initial_scores=False, strict_mode=strict_mode)
                results.append(result)
            except Exception as e:
                print(f"Error evaluating entry {entry.get('id', 'unknown')}: {e}")
                results.append({
                    'id': entry.get('id', 'unknown'),
                    'error': str(e)
                })
    
    # Calculate statistics
    valid_results = [r for r in results if 'error' not in r]
    mse_results = [r for r in valid_results if r.get('overall_error') is not None]
    
    # Separate refined and initial results for refined format
    refined_results = [r for r in valid_results if r.get('score_type') == 'refined']
    initial_results = [r for r in valid_results if r.get('score_type') == 'initial']
    auto_results = [r for r in valid_results if r.get('score_type') == 'auto' or r.get('score_type') is None]
    
    summary = {
        'total_entries': len(results),
        'valid_entries': len(valid_results),
        'mse_entries': len(mse_results),
        'refined_results_count': len(refined_results),
        'initial_results_count': len(initial_results),
        'auto_results_count': len(auto_results)
    }
    
    # Calculate MSE/MAE statistics
    # For refined format, only use refined results for overall statistics (avoid double counting)
    # For other formats, use all results
    if refined_format_count > 0:
        # Refined format: use only refined results for overall statistics
        stats_results = [r for r in refined_results if r.get('overall_error') is not None]
    else:
        # Original/other formats: use all results
        stats_results = mse_results
    
    if stats_results:
        dimensions = ['soundness', 'presentation', 'confidence', 'rating']
        mse_stats = {}
        mae_stats = {}
        
        for dim in dimensions:
            mse_list = [r.get(f'{dim}_mse') for r in stats_results if r.get(f'{dim}_mse') is not None]
            mae_list = [r.get(f'{dim}_mae') for r in stats_results if r.get(f'{dim}_mae') is not None]
            
            mse_clean = [x for x in mse_list if x is not None and not (isinstance(x, float) and math.isnan(x))]
            mae_clean = [x for x in mae_list if x is not None and not (isinstance(x, float) and math.isnan(x))]
            
            if mse_clean:
                mse_stats[dim] = {
                    'mean': sum(mse_clean) / len(mse_clean),
                    'count': len(mse_clean)
                }
            if mae_clean:
                mae_stats[dim] = {
                    'mean': sum(mae_clean) / len(mae_clean),
                    'count': len(mae_clean)
                }
        
        overall_errors = [r.get('overall_error') for r in stats_results if r.get('overall_error') is not None]
        overall_clean = [x for x in overall_errors if x is not None and not (isinstance(x, float) and math.isnan(x))]
        
        if overall_clean:
            summary['overall_error'] = {
                'mean': sum(overall_clean) / len(overall_clean),
                'count': len(overall_clean)
            }
        
        summary['mse_statistics'] = mse_stats
        summary['mae_statistics'] = mae_stats
        
        # Calculate separate statistics for refined and initial results
        if refined_results:
            refined_mse_results = [r for r in refined_results if r.get('overall_error') is not None]
            if refined_mse_results:
                refined_mse_stats = {}
                refined_mae_stats = {}
                for dim in dimensions:
                    mse_list = [r.get(f'{dim}_mse') for r in refined_mse_results if r.get(f'{dim}_mse') is not None]
                    mae_list = [r.get(f'{dim}_mae') for r in refined_mse_results if r.get(f'{dim}_mae') is not None]
                    mse_clean = [x for x in mse_list if x is not None and not (isinstance(x, float) and math.isnan(x))]
                    mae_clean = [x for x in mae_list if x is not None and not (isinstance(x, float) and math.isnan(x))]
                    if mse_clean:
                        refined_mse_stats[dim] = {'mean': sum(mse_clean) / len(mse_clean), 'count': len(mse_clean)}
                    if mae_clean:
                        refined_mae_stats[dim] = {'mean': sum(mae_clean) / len(mae_clean), 'count': len(mae_clean)}
                summary['refined_mse_statistics'] = refined_mse_stats
                summary['refined_mae_statistics'] = refined_mae_stats
        
        if initial_results:
            initial_mse_results = [r for r in initial_results if r.get('overall_error') is not None]
            if initial_mse_results:
                initial_mse_stats = {}
                initial_mae_stats = {}
                for dim in dimensions:
                    mse_list = [r.get(f'{dim}_mse') for r in initial_mse_results if r.get(f'{dim}_mse') is not None]
                    mae_list = [r.get(f'{dim}_mae') for r in initial_mse_results if r.get(f'{dim}_mae') is not None]
                    mse_clean = [x for x in mse_list if x is not None and not (isinstance(x, float) and math.isnan(x))]
                    mae_clean = [x for x in mae_list if x is not None and not (isinstance(x, float) and math.isnan(x))]
                    if mse_clean:
                        initial_mse_stats[dim] = {'mean': sum(mse_clean) / len(mse_clean), 'count': len(mse_clean)}
                    if mae_clean:
                        initial_mae_stats[dim] = {'mean': sum(mae_clean) / len(mae_clean), 'count': len(mae_clean)}
                summary['initial_mse_statistics'] = initial_mse_stats
                summary['initial_mae_statistics'] = initial_mae_stats
    
    # Helper: drop pairs where either value is missing or NaN before computing correlations
    def filter_valid_pairs(true_list, pred_list):
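        """Keep only (true, pred) pairs where both values are present and not NaN."""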
        filtered_true = []
        filtered_pred = []
        for t, p in zip(true_list, pred_list):
            if (t is not None and p is not None and 
                not (isinstance(t, float) and math.isnan(t)) and
                not (isinstance(p, float) and math.isnan(p))):
                filtered_true.append(t)
                filtered_pred.append(p)
        return filtered_true, filtered_pred
    
    # Calculate Spearman correlations
    # For refined format, calculate separately for refined and initial, and use refined for overall
    # For other formats, use all results
    if refined_format_count > 0:
        # Calculate refined spearman correlations
        refined_spearman_stats = {}
        dimensions = ['soundness', 'presentation', 'confidence', 'rating']
        for dim in dimensions:
            true_values = [r.get(f'gt_{dim}') for r in refined_results]
            pred_values = [r.get(f'model_{dim}') for r in refined_results]
            true_clean, pred_clean = filter_valid_pairs(true_values, pred_values)
            
            if len(true_clean) >= 2 and len(pred_clean) >= 2:
                try:
                    corr, _ = spearmanr(true_clean, pred_clean)
                    if not math.isnan(corr):
                        refined_spearman_stats[dim] = {
                            'correlation': corr,
                            'count': len(true_clean)
                        }
                except Exception:
                    pass
        
        # Calculate initial spearman correlations
        initial_spearman_stats = {}
        for dim in dimensions:
            true_values = [r.get(f'gt_{dim}') for r in initial_results]
            pred_values = [r.get(f'model_{dim}') for r in initial_results]
            true_clean, pred_clean = filter_valid_pairs(true_values, pred_values)
            
            if len(true_clean) >= 2 and len(pred_clean) >= 2:
                try:
                    corr, _ = spearmanr(true_clean, pred_clean)
                    if not math.isnan(corr):
                        initial_spearman_stats[dim] = {
                            'correlation': corr,
                            'count': len(true_clean)
                        }
                except Exception:
                    pass
        
        # Use refined for overall statistics (avoid double counting)
        summary['spearman_correlations'] = refined_spearman_stats
        summary['refined_spearman_correlations'] = refined_spearman_stats
        summary['initial_spearman_correlations'] = initial_spearman_stats
    else:
        # Original/other formats: use all results
        correlation_results = valid_results
        spearman_stats = {}
        dimensions = ['soundness', 'presentation', 'confidence', 'rating']
        for dim in dimensions:
            true_values = [r.get(f'gt_{dim}') for r in correlation_results]
            pred_values = [r.get(f'model_{dim}') for r in correlation_results]
            true_clean, pred_clean = filter_valid_pairs(true_values, pred_values)
            
            if len(true_clean) >= 2 and len(pred_clean) >= 2:
                try:
                    corr, _ = spearmanr(true_clean, pred_clean)
                    if not math.isnan(corr):
                        spearman_stats[dim] = {
                            'correlation': corr,
                            'count': len(true_clean)
                        }
                except Exception:
                    pass
        
        summary['spearman_correlations'] = spearman_stats
    
    # Calculate Decision metrics
    # For refined format, calculate separately for refined and initial, and use refined for overall
    # For other formats, use all results
    if refined_format_count > 0:
        # Calculate refined decision metrics
        refined_decision_results = [r for r in refined_results if r.get('gt_decision') is not None and r.get('model_decision') is not None]
        if refined_decision_results:
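            # Binarize decisions for macro-F1: any label containing 'accept' maps to 1,
            # everything else (including 'reject' and unparsed labels) maps to 0.
            # The same convention is used for the initial and original-format blocks below.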
            true_decisions = []
            pred_decisions = []
            decision_acc = []
            
            for r in refined_decision_results:
                gt_decision = str(r.get('gt_decision', '')).lower().strip()
                pred_decision = str(r.get('model_decision', '')).lower().strip()
                
                if 'accept' in pred_decision:
                    pred_binary = 1
                else:
                    pred_binary = 0
                
                if 'accept' in gt_decision:
                    gt_binary = 1
                else:
                    gt_binary = 0
                
                true_decisions.append(gt_binary)
                pred_decisions.append(pred_binary)
                
                if pred_decision == gt_decision or ('accept' in pred_decision and 'accept' in gt_decision) or ('reject' in pred_decision and 'reject' in gt_decision):
                    decision_acc.append(1.0)
                else:
                    decision_acc.append(0.0)
            
            if decision_acc:
                decision_accuracy = sum(decision_acc) / len(decision_acc)
                try:
                    _, _, f1_score, _ = precision_recall_fscore_support(true_decisions, pred_decisions, average='macro')
                    refined_decision_metrics = {
                        'accuracy': decision_accuracy,
                        'f1_macro': f1_score,
                        'count': len(decision_acc)
                    }
                except Exception:
                    refined_decision_metrics = {
                        'accuracy': decision_accuracy,
                        'count': len(decision_acc)
                    }
                summary['refined_decision_metrics'] = refined_decision_metrics
                summary['decision_metrics'] = refined_decision_metrics  # Use refined for overall
        
        # Calculate initial decision metrics
        initial_decision_results = [r for r in initial_results if r.get('gt_decision') is not None and r.get('model_decision') is not None]
        if initial_decision_results:
            true_decisions = []
            pred_decisions = []
            decision_acc = []
            
            for r in initial_decision_results:
                gt_decision = str(r.get('gt_decision', '')).lower().strip()
                pred_decision = str(r.get('model_decision', '')).lower().strip()
                
                if 'accept' in pred_decision:
                    pred_binary = 1
                else:
                    pred_binary = 0
                
                if 'accept' in gt_decision:
                    gt_binary = 1
                else:
                    gt_binary = 0
                
                true_decisions.append(gt_binary)
                pred_decisions.append(pred_binary)
                
                if pred_decision == gt_decision or ('accept' in pred_decision and 'accept' in gt_decision) or ('reject' in pred_decision and 'reject' in gt_decision):
                    decision_acc.append(1.0)
                else:
                    decision_acc.append(0.0)
            
            if decision_acc:
                decision_accuracy = sum(decision_acc) / len(decision_acc)
                try:
                    _, _, f1_score, _ = precision_recall_fscore_support(true_decisions, pred_decisions, average='macro')
                    initial_decision_metrics = {
                        'accuracy': decision_accuracy,
                        'f1_macro': f1_score,
                        'count': len(decision_acc)
                    }
                except Exception:
                    initial_decision_metrics = {
                        'accuracy': decision_accuracy,
                        'count': len(decision_acc)
                    }
                summary['initial_decision_metrics'] = initial_decision_metrics
    else:
        # Original/other formats: use all results
        decision_results = [r for r in valid_results if r.get('gt_decision') is not None and r.get('model_decision') is not None]
        if decision_results:
            true_decisions = []
            pred_decisions = []
            decision_acc = []
            
            for r in decision_results:
                gt_decision = str(r.get('gt_decision', '')).lower().strip()
                pred_decision = str(r.get('model_decision', '')).lower().strip()
                
                if 'accept' in pred_decision:
                    pred_binary = 1
                else:
                    pred_binary = 0
                
                if 'accept' in gt_decision:
                    gt_binary = 1
                else:
                    gt_binary = 0
                
                true_decisions.append(gt_binary)
                pred_decisions.append(pred_binary)
                
                if pred_decision == gt_decision or ('accept' in pred_decision and 'accept' in gt_decision) or ('reject' in pred_decision and 'reject' in gt_decision):
                    decision_acc.append(1.0)
                else:
                    decision_acc.append(0.0)
            
            if decision_acc:
                decision_accuracy = sum(decision_acc) / len(decision_acc)
                try:
                    _, _, f1_score, _ = precision_recall_fscore_support(true_decisions, pred_decisions, average='macro')
                    summary['decision_metrics'] = {
                        'accuracy': decision_accuracy,
                        'f1_macro': f1_score,
                        'count': len(decision_acc)
                    }
                except Exception:
                    summary['decision_metrics'] = {
                        'accuracy': decision_accuracy,
                        'count': len(decision_acc)
                    }
    
    # Calculate Pairwise comparison
    # For refined format, only use refined results (avoid double counting)
    # For other formats, use all results
    if refined_format_count > 0:
        pairwise_results = refined_results
    else:
        pairwise_results = valid_results
    
    paper_scores = []
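    # An entry qualifies for pairwise comparison if it has a complete rating pair
    # or a complete soundness pair; the remaining dimensions are carried along as-is.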
    for r in pairwise_results:
        if (r.get('gt_rating') is not None and r.get('model_rating') is not None) or \
           (r.get('gt_soundness') is not None and r.get('model_soundness') is not None):
            paper_scores.append({
                'true_rating': r.get('gt_rating'),
                'pred_rating': r.get('model_rating'),
                'true_soundness': r.get('gt_soundness'),
                'pred_soundness': r.get('model_soundness'),
                'true_presentation': r.get('gt_presentation'),
                'pred_presentation': r.get('model_presentation'),
                'true_confidence': r.get('gt_confidence'),
                'pred_confidence': r.get('model_confidence')
            })
    
    if len(paper_scores) >= 2:
        pairwise_accuracies = calculate_pairwise_accuracies(paper_scores)
        summary['pairwise_accuracies'] = pairwise_accuracies
    
    return results, summary
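
# Depending on the input format, the summary returned above may include entry counts,
# 'mse_statistics' / 'mae_statistics', 'spearman_correlations', 'decision_metrics',
# 'pairwise_accuracies', and the 'refined_*' / 'initial_*' variants of these
# statistics when the refined format is detected.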


# ============================================================================
# Main Function
# ============================================================================

def parse_args():
    """Parse command line arguments."""
    parser = argparse.ArgumentParser(description="Unified evaluation script for semantic and auto-metric evaluation")
    
    # Input paths
    parser.add_argument("--rubrics_path", type=str, required=True,
                       help="Path to eval_rubrics.json file (from 1_generate_review_based_rubrics.py)")
    parser.add_argument("--reviews_path", type=str, required=True,
                       help="Path to JSON file with model reviews (contains pred_fast_mode)")
    
    # Evaluation mode
    parser.add_argument("--mode", type=str, choices=["semantic", "auto_metric", "both"], default="both",
                       help="Evaluation mode: semantic (LLM-based), auto_metric (rule-based), or both")
    
    # Output paths
    parser.add_argument("--semantic_output", type=str, default=None,
                       help="Path to output JSON file for semantic evaluation results (required if mode is semantic or both)")
    parser.add_argument("--auto_metric_output", type=str, default=None,
                       help="Path to output JSON file for auto-metric evaluation results (required if mode is auto_metric or both)")
    
    # Semantic evaluation settings
    parser.add_argument("--yaml_path", type=str, default=None,
                       help="Path to prompts.yaml file (required for semantic evaluation)")
    parser.add_argument("--config_path", type=str, default=None,
                       help="Path to configs.yaml file (required for semantic evaluation)")
    
    # Multi-threading
    parser.add_argument("--max_workers", type=int, default=None,
                       help="Maximum number of worker threads for semantic evaluation (default: 5)")
    
    # Strict mode (normalize scores to discrete scales)
    parser.add_argument("--strict_mode", action="store_true", default=False,
                       help="Enable strict mode: normalize scores to discrete scales before computing metrics (default: False)")
    
    # Input format override
    parser.add_argument("--input_format", type=str, choices=['auto', 'refined', 'original'], default='auto',
                       help="Manually specify input JSON format: 'refined' (has scores and initial_scores), 'original' (has model_prediction), or 'auto' for auto-detection (default: 'auto')")
    
    return parser.parse_args()
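
# Example invocation (the script and file names below are illustrative):
#   python evaluate_reviews.py \
#       --rubrics_path data/eval_rubrics.json \
#       --reviews_path outputs/model_reviews.json \
#       --mode both \
#       --semantic_output results/semantic_eval.json \
#       --auto_metric_output results/auto_metric_eval.json \
#       --yaml_path prompts.yaml \
#       --config_path configs.yaml \
#       --max_workers 5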


def main():
    """Main execution function."""
    args = parse_args()
    
    script_dir = os.path.dirname(os.path.abspath(__file__))
    
    # Resolve paths
    rubrics_path = args.rubrics_path
    if not os.path.isabs(rubrics_path):
        rubrics_path = os.path.join(script_dir, rubrics_path)
    
    reviews_path = args.reviews_path
    if not os.path.isabs(reviews_path):
        reviews_path = os.path.join(script_dir, reviews_path)
    
    max_workers = args.max_workers or int(os.getenv("MAX_WORKERS", "5"))
    
    # Validate mode and output paths
    if args.mode in ["semantic", "both"]:
        if not args.semantic_output:
            raise ValueError("--semantic_output is required when mode is 'semantic' or 'both'")
        if not args.yaml_path:
            raise ValueError("--yaml_path is required for semantic evaluation")
        if not args.config_path:
            raise ValueError("--config_path is required for semantic evaluation")
    
    if args.mode in ["auto_metric", "both"]:
        if not args.auto_metric_output:
            raise ValueError("--auto_metric_output is required when mode is 'auto_metric' or 'both'")
    
    # Check if files exist
    if not os.path.exists(rubrics_path):
        raise FileNotFoundError(f"Rubrics file not found: {rubrics_path}")
    if not os.path.exists(reviews_path):
        raise FileNotFoundError(f"Reviews file not found: {reviews_path}")
    
    # Load data
    print(f"Loading rubrics from {rubrics_path}...")
    rubrics_data = load_rubrics_json(rubrics_path)
    print(f"Loaded {len(rubrics_data)} rubrics entries")
    
    print(f"Loading model reviews from {reviews_path}...")
    if args.input_format != 'auto':
        print(f"Using manually specified format: {args.input_format}")
    else:
        print("Auto-detecting input format...")
    reviews_dict = load_model_reviews_json(reviews_path, format_override=args.input_format if args.input_format != 'auto' else None)
    print(f"Loaded {len(reviews_dict)} model reviews")
    
    # Combine rubrics and reviews
    print("Combining rubrics and reviews...")
    evaluation_data = combine_rubrics_and_reviews(rubrics_data, reviews_dict)
    print(f"Prepared {len(evaluation_data)} entries for evaluation")
    
    # Run evaluations based on mode
    if args.mode in ["semantic", "both"]:
        # Resolve semantic evaluation paths
        yaml_path = args.yaml_path
        if not os.path.isabs(yaml_path):
            yaml_path = os.path.join(script_dir, yaml_path)
        
        config_path = args.config_path
        if not os.path.isabs(config_path):
            config_path = os.path.join(script_dir, config_path)
        
        if not os.path.exists(yaml_path):
            raise FileNotFoundError(f"YAML file not found: {yaml_path}")
        if not os.path.exists(config_path):
            raise FileNotFoundError(f"Config file not found: {config_path}")
        
        # Load prompt template
        print(f"Loading prompt template from {yaml_path}...")
        prompt_template = load_prompt_template(yaml_path)
        if not prompt_template:
            raise ValueError("Could not find 'v1_evaluator_prompt' in YAML file")
        
        # Initialize LLM service
        print(f"Loading LLM configuration from {config_path}...")
        llm_config = load_llm_config(config_path)
        llm_service = create_llm_service_from_config(llm_config)
        mode = llm_config.get('mode', 'gpt')
        print(f"LLM service initialized (mode: {mode})")
        if hasattr(llm_service, 'model_name'):
            print(f"Using model: {llm_service.model_name}")
        
        # Run semantic evaluation
        semantic_results, semantic_summary = run_semantic_evaluation(
            evaluation_data, prompt_template, llm_service, max_workers
        )
        
        # Save semantic results
        semantic_output = args.semantic_output
        if not os.path.isabs(semantic_output):
            semantic_output = os.path.join(script_dir, semantic_output)
        
        output_dir = os.path.dirname(semantic_output)
        os.makedirs(output_dir, exist_ok=True)
        
        with open(semantic_output, 'w', encoding='utf-8') as f:
            json.dump(semantic_results, f, ensure_ascii=False, indent=2)
        print(f"\nSemantic evaluation results saved to {semantic_output}")
        
        # Save semantic summary
        semantic_summary_path = os.path.splitext(semantic_output)[0] + '_summary.json'
        with open(semantic_summary_path, 'w', encoding='utf-8') as f:
            json.dump(semantic_summary, f, ensure_ascii=False, indent=2)
        print(f"Semantic evaluation summary saved to {semantic_summary_path}")
        
        # Print semantic summary
        print("\n" + "="*80)
        print("SEMANTIC EVALUATION SUMMARY")
        print("="*80)
        print(f"Total entries: {semantic_summary['total_entries']}")
        print(f"Valid entries: {semantic_summary['valid_entries']}")
        print(f"Failed entries: {semantic_summary['failed_entries']}")
        if 'overall_score' in semantic_summary:
            score = semantic_summary['overall_score']
            print("\nOverall Score:")
            print(f"  Mean: {score['mean']:.2f}")
            print(f"  Min: {score['min']:.2f}")
            print(f"  Max: {score['max']:.2f}")
    
    if args.mode in ["auto_metric", "both"]:
        # Run auto-metric evaluation
        auto_metric_results, auto_metric_summary = run_auto_metric_evaluation(
            evaluation_data, 
            strict_mode=args.strict_mode
        )
        
        # Save auto-metric results
        auto_metric_output = args.auto_metric_output
        if not os.path.isabs(auto_metric_output):
            auto_metric_output = os.path.join(script_dir, auto_metric_output)
        
        output_dir = os.path.dirname(auto_metric_output)
        os.makedirs(output_dir, exist_ok=True)
        
        with open(auto_metric_output, 'w', encoding='utf-8') as f:
            json.dump(auto_metric_results, f, ensure_ascii=False, indent=2)
        print(f"\nAuto-metric evaluation results saved to {auto_metric_output}")
        
        # Save auto-metric summary
        auto_metric_summary_path = os.path.splitext(auto_metric_output)[0] + '_summary.json'
        with open(auto_metric_summary_path, 'w', encoding='utf-8') as f:
            json.dump(auto_metric_summary, f, ensure_ascii=False, indent=2)
        print(f"Auto-metric evaluation summary saved to {auto_metric_summary_path}")
        
        # Print auto-metric summary
        print("\n" + "="*80)
        print("AUTO-METRIC EVALUATION SUMMARY")
        print("="*80)
        print(f"Total entries: {auto_metric_summary['total_entries']}")
        print(f"Valid entries: {auto_metric_summary['valid_entries']}")
        print(f"MSE entries: {auto_metric_summary['mse_entries']}")
        
        if 'mse_statistics' in auto_metric_summary:
            print("\nMSE Statistics:")
            for dim, stats in auto_metric_summary['mse_statistics'].items():
                print(f"  {dim.capitalize()}: Mean={stats['mean']:.4f}, Count={stats['count']}")
        
        if 'mae_statistics' in auto_metric_summary:
            print("\nMAE Statistics:")
            for dim, stats in auto_metric_summary['mae_statistics'].items():
                print(f"  {dim.capitalize()}: Mean={stats['mean']:.4f}, Count={stats['count']}")
        
        # Print refined and initial statistics if available
        if 'refined_mse_statistics' in auto_metric_summary:
            print("\nRefined Scores - MSE Statistics:")
            for dim, stats in auto_metric_summary['refined_mse_statistics'].items():
                print(f"  {dim.capitalize()}: Mean={stats['mean']:.4f}, Count={stats['count']}")
        
        if 'refined_mae_statistics' in auto_metric_summary:
            print("\nRefined Scores - MAE Statistics:")
            for dim, stats in auto_metric_summary['refined_mae_statistics'].items():
                print(f"  {dim.capitalize()}: Mean={stats['mean']:.4f}, Count={stats['count']}")
        
        if 'initial_mse_statistics' in auto_metric_summary:
            print("\nInitial Scores - MSE Statistics:")
            for dim, stats in auto_metric_summary['initial_mse_statistics'].items():
                print(f"  {dim.capitalize()}: Mean={stats['mean']:.4f}, Count={stats['count']}")
        
        if 'initial_mae_statistics' in auto_metric_summary:
            print("\nInitial Scores - MAE Statistics:")
            for dim, stats in auto_metric_summary['initial_mae_statistics'].items():
                print(f"  {dim.capitalize()}: Mean={stats['mean']:.4f}, Count={stats['count']}")
        
        if 'spearman_correlations' in auto_metric_summary:
            print("\nSpearman Correlations:")
            for dim, stats in auto_metric_summary['spearman_correlations'].items():
                print(f"  {dim.capitalize()}: {stats['correlation']:.4f} (n={stats['count']})")
        
        # Print refined and initial spearman correlations if available
        if 'refined_spearman_correlations' in auto_metric_summary:
            print("\nRefined Scores - Spearman Correlations:")
            for dim, stats in auto_metric_summary['refined_spearman_correlations'].items():
                print(f"  {dim.capitalize()}: {stats['correlation']:.4f} (n={stats['count']})")
        
        if 'initial_spearman_correlations' in auto_metric_summary:
            print("\nInitial Scores - Spearman Correlations:")
            for dim, stats in auto_metric_summary['initial_spearman_correlations'].items():
                print(f"  {dim.capitalize()}: {stats['correlation']:.4f} (n={stats['count']})")
        
        if 'decision_metrics' in auto_metric_summary:
            dm = auto_metric_summary['decision_metrics']
            print("\nDecision Metrics:")
            print(f"  Accuracy: {dm['accuracy']:.4f} (n={dm['count']})")
            if 'f1_macro' in dm:
                print(f"  F1 (macro): {dm['f1_macro']:.4f}")
        
        # Print refined and initial decision metrics if available
        if 'refined_decision_metrics' in auto_metric_summary:
            print("\nRefined Scores - Decision Metrics:")
            rdm = auto_metric_summary['refined_decision_metrics']
            print(f"  Accuracy: {rdm['accuracy']:.4f} (n={rdm['count']})")
            if 'f1_macro' in rdm:
                print(f"  F1 (macro): {rdm['f1_macro']:.4f}")
        
        if 'initial_decision_metrics' in auto_metric_summary:
            print("\nInitial Scores - Decision Metrics:")
            idm = auto_metric_summary['initial_decision_metrics']
            print(f"  Accuracy: {idm['accuracy']:.4f} (n={idm['count']})")
            if 'f1_macro' in idm:
                print(f"  F1 (macro): {idm['f1_macro']:.4f}")
    
    print("\n" + "="*80)
    print("EVALUATION COMPLETE")
    print("="*80)


if __name__ == "__main__":
    main()