File size: 102,857 Bytes
4c0cf4e
 
 
 
 
 
 
3b6aa96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26fae4e
 
 
 
 
 
 
 
 
 
 
804ae3a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
a5529fd
 
 
 
 
 
 
 
 
 
 
 
 
 
4c0cf4e
 
 
 
 
 
 
e87f1a6
69d63c3
4c0cf4e
 
 
4a1690a
6c53a1d
4a1690a
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
4a1690a
 
 
 
 
 
 
 
4c0cf4e
4a1690a
4c0cf4e
4a1690a
 
 
 
 
 
 
 
 
 
 
6c53a1d
 
 
 
86be51c
 
 
 
6c53a1d
 
 
 
 
 
 
 
 
 
 
86be51c
6c53a1d
 
 
86be51c
 
4a1690a
 
4c0cf4e
 
 
4a1690a
 
 
 
fbc2621
 
 
 
 
4a1690a
 
 
 
 
fbc2621
4a1690a
 
 
 
 
 
 
fbc2621
4c0cf4e
4a1690a
 
4c0cf4e
 
 
 
 
 
5fa3f26
a3050b3
ffd85b6
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1690a
 
 
 
 
 
 
 
 
 
 
 
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1690a
69d63c3
 
 
 
 
4a1690a
 
 
 
69d63c3
4c0cf4e
 
 
4a1690a
c84d1ef
 
 
 
1a60a7b
 
fbc2621
 
 
 
 
 
 
 
 
 
 
 
 
1a60a7b
 
c84d1ef
1a60a7b
 
 
 
 
4c0cf4e
69d63c3
 
4a1690a
 
 
 
 
 
 
 
 
69d63c3
 
4c0cf4e
3b6aa96
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c0cf4e
 
 
 
 
 
 
 
 
 
 
4a1690a
c84d1ef
4c0cf4e
 
4a1690a
 
 
 
4c0cf4e
 
 
69d63c3
 
4a1690a
 
 
 
 
 
 
 
 
 
 
69d63c3
4a1690a
 
 
 
 
 
 
 
9938a04
4a1690a
 
 
9938a04
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1690a
 
992446a
 
 
 
 
 
 
 
 
 
1d19d52
 
 
 
 
 
992446a
 
4a1690a
 
86be51c
 
 
 
 
 
 
 
 
 
4a1690a
 
2a86579
 
 
 
 
4a1690a
2a86579
 
4a1690a
 
 
 
 
6c53a1d
4a1690a
 
6c53a1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1690a
 
6c53a1d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a1690a
 
6c53a1d
992446a
 
 
6c53a1d
 
 
 
 
 
4a1690a
 
 
 
 
 
 
 
 
 
 
86be51c
 
 
 
 
 
4a1690a
86be51c
 
 
9938a04
 
 
 
 
 
86be51c
 
 
 
 
4a1690a
86be51c
4a1690a
 
69d63c3
5bf2f9c
4a1690a
5bf2f9c
 
4a1690a
 
 
 
 
 
 
 
 
 
5bf2f9c
69d63c3
4a1690a
5bf2f9c
 
 
 
 
 
 
 
4a1690a
5bf2f9c
 
 
4a1690a
69d63c3
c84d1ef
 
 
 
 
4a1690a
 
 
 
 
 
 
 
 
 
 
 
 
2a86579
 
 
 
 
 
 
 
 
 
86be51c
 
6c53a1d
 
 
4a1690a
69d63c3
4a1690a
5bf2f9c
 
4a1690a
 
 
 
 
df369e2
 
 
 
 
 
 
4a1690a
 
 
c84d1ef
4a1690a
 
 
 
 
5bf2f9c
 
 
9938a04
 
 
 
 
 
 
 
 
 
5bf2f9c
 
 
4a1690a
 
9938a04
 
 
 
5bf2f9c
9938a04
 
 
 
 
 
 
5bf2f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f04383e
 
 
 
 
 
5bf2f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69d63c3
 
 
 
 
 
 
 
 
98703bd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d652317
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d652317
4c0cf4e
804ae3a
 
 
 
 
 
 
 
 
 
 
a5529fd
 
804ae3a
a5529fd
 
 
 
 
 
804ae3a
 
a5529fd
4c0cf4e
 
 
804ae3a
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4da2b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
fccad81
 
 
 
4c0cf4e
fccad81
4c0cf4e
 
 
 
 
fccad81
4c0cf4e
 
 
 
 
 
 
 
fccad81
4c0cf4e
fccad81
 
 
 
 
 
 
4c0cf4e
 
fccad81
804ae3a
98703bd
fccad81
804ae3a
 
 
 
a5529fd
 
804ae3a
a5529fd
 
 
 
 
 
804ae3a
 
 
 
 
 
 
a5529fd
4c0cf4e
804ae3a
4c0cf4e
 
 
 
fccad81
 
 
4c0cf4e
fccad81
4c0cf4e
 
 
 
 
 
 
 
fccad81
 
 
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
093b6b9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47090f7
 
 
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
 
7144ba0
 
 
 
 
 
c5e3141
 
 
 
 
 
 
 
7144ba0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e4da2b6
 
 
 
 
 
 
 
 
 
7144ba0
e4da2b6
7144ba0
 
 
 
 
b359f6e
 
 
 
 
 
 
 
 
 
 
7144ba0
c5e3141
 
 
 
 
 
 
 
 
a3050b3
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
ffd85b6
 
 
 
 
 
 
5fa3f26
ffd85b6
5fa3f26
a3050b3
 
 
 
 
5fa3f26
 
a3050b3
 
 
 
 
0aef8d8
 
 
 
 
 
42aac76
 
 
 
 
0aef8d8
 
 
 
ffd85b6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
42aac76
 
 
 
ffd85b6
 
 
 
 
 
42aac76
 
 
 
ffd85b6
 
 
 
 
 
42aac76
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47090f7
 
 
 
 
 
 
 
 
 
 
 
 
c5e3141
 
 
 
 
 
0aef8d8
 
 
 
 
c5e3141
 
 
a5529fd
804ae3a
 
 
 
 
a5529fd
 
804ae3a
a5529fd
 
 
 
 
 
804ae3a
 
 
 
 
 
 
a5529fd
c5e3141
804ae3a
c5e3141
 
 
02b901d
38c852b
 
 
 
 
02b901d
38c852b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
02b901d
 
38c852b
02b901d
 
38c852b
 
 
 
 
 
 
 
 
 
 
 
02b901d
 
38c852b
 
02b901d
 
 
 
 
 
 
 
093b6b9
 
 
 
 
 
 
 
 
 
 
 
02b901d
c5e3141
f8615ea
 
 
 
 
 
 
 
 
 
 
 
 
 
70c6783
69d63c3
 
 
7144ba0
 
 
 
 
 
 
 
 
69d63c3
c5e3141
1a60a7b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9938a04
 
 
 
 
 
 
 
 
86d8963
 
 
 
 
 
 
 
 
 
 
 
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
 
86d8963
1a60a7b
9938a04
 
c5e3141
 
ba92923
69d63c3
 
02b901d
 
 
38c852b
 
 
02b901d
 
38c852b
093b6b9
 
 
 
 
26fae4e
 
 
 
 
 
 
 
b359f6e
 
 
 
 
 
 
 
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
0aef8d8
 
 
 
c5e3141
 
 
 
 
 
 
42aac76
 
 
 
 
 
 
0aef8d8
 
 
 
 
 
 
 
42aac76
0aef8d8
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0aef8d8
 
 
 
c5e3141
0aef8d8
c5e3141
 
 
 
 
 
 
 
 
 
69d63c3
 
 
 
 
 
42aac76
 
 
 
 
0aef8d8
 
 
 
 
42aac76
0aef8d8
 
 
 
42aac76
 
 
0aef8d8
 
 
 
42aac76
 
 
 
 
0aef8d8
7144ba0
 
 
 
 
c5e3141
 
 
7144ba0
 
 
 
 
47090f7
 
7144ba0
 
 
 
b359f6e
 
 
 
 
 
 
 
 
 
 
 
 
a3050b3
 
 
 
c5e3141
 
 
 
 
 
 
69d63c3
0aef8d8
69d63c3
 
0aef8d8
 
 
 
 
 
 
 
 
 
 
 
69d63c3
 
 
 
c5e3141
 
7144ba0
 
 
 
 
e4da2b6
7144ba0
e4da2b6
 
 
 
7144ba0
c5e3141
 
 
 
 
 
 
 
4c0cf4e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c5e3141
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
7144ba0
 
 
 
 
 
 
 
 
 
c5e3141
7144ba0
 
 
 
c5e3141
 
e4da2b6
 
 
 
 
 
 
 
 
47090f7
 
 
 
 
 
 
e4da2b6
 
47090f7
 
 
 
 
 
 
 
 
 
 
 
 
e4da2b6
c5e3141
 
 
 
 
 
 
 
47090f7
7144ba0
c5e3141
 
 
e4da2b6
47090f7
 
 
 
 
 
 
 
 
 
 
 
 
e4da2b6
 
 
 
5bf2f9c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4c0cf4e
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913
1914
1915
1916
1917
1918
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931
1932
1933
1934
1935
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
1969
1970
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015
2016
2017
2018
2019
2020
2021
2022
2023
2024
2025
2026
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
2063
2064
2065
2066
2067
2068
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110
2111
2112
2113
2114
2115
2116
2117
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186
2187
2188
2189
2190
2191
2192
2193
2194
2195
2196
2197
"""
NuWave — HuggingFace Spaces Demo

The organism. NeuroGraph substrate + KISS bucket + Pith bucket +
Splat-Lenia + BitNet model. On CPU. Gets smarter over time.

# ---- Changelog ----
# [2026-05-24] Claude Opus 4.7 (1M ctx) — Root fix: fallback prompt-echo strip in do_generate
#   Run 51 instrumentation exposed the ROOT cause of the entire Run
#   30→51 pollution arc: bitnet_cpp_client uses exact-substring match
#   (`if prompt in stdout`) to strip the prompt echo from bitnet.cpp's
#   stdout. When bitnet.cpp normalizes whitespace, injects a BOS token,
#   or otherwise mutates prompt-echo bytes (which the client documents
#   as "worth investigating" but doesn't handle), the strip silently
#   no-ops and `response` returns as the full prompt + completion. Every
#   downstream consumer — `nw_msgs.append({"role": "assistant", ...})`,
#   KISS reading nw_msgs to build sys_ctx, pith `_node_content` for
#   re-surfacing, `_response_is_degenerate(resp_nw)` evaluation — treats
#   the polluted text as the completion. Cascade: KISS wraps polluted
#   "assistant" turns back into sys_ctx as "[Earlier conversation: ...
#   | assistant: System: You are a helpful assistant... | ...]", pith
#   re-surfaces polluted `resp_*` nodes containing prompt scaffolding,
#   detector flags everything degenerate because the "System:" /
#   "| user:" markers appear in the evaluated text. ONE bug, four
#   apparent failure modes. Every prior architectural fix (Phase B+2,
#   FRESH_START wiring, pith→user-turn, D2) was treating a downstream
#   symptom. Fix: fallback strip using "Assistant:" generation-prompt
#   marker as boundary when `prompt_echo_found` is False. Forward
#   `find` (not rfind) targets prompt marker, not hallucinated
#   mid-completion. Only fires when canonical strip already failed —
#   safe on properly-cleaned responses.
# [2026-05-23] Claude Opus 4.7 (1M ctx) — Add response_text to per-turn JSON
#   Run 50 sidecar showed 24/24 response_quality flagged degenerate even
#   though token-savings, recall axis, and visible-in-pith resp_* snippets
#   suggested most responses were functional. Root of the ambiguity: the
#   `surfaced_context` field shows pith items trimmed to max_chars_per_context
#   (default ~300), so trailing degeneracy past that boundary isn't visible
#   in the JSON output. The detector evaluates the FULL response text,
#   so it sees pathology we don't. Fix: add `response_text` field to the
#   per-turn JSON (capped at 1500 chars) so the detector's signal can be
#   verified against the actual generated text. Single field addition in
#   on_interleaved_benchmark; no logic changes.
# [2026-05-22] Claude Opus 4.7 (1M ctx) — D2: drop conversation history from prompt (3 sites)
#   The 2026-05-16 pith→user-turn fix deployed cleanly (Run 49 confirmed:
#   "My actual question:" label from the new template appearing IN BitNet
#   output), but degeneracy persisted. Diagnosis: pollution feedback loop
#   relocated from system slot to conversation history. BitNet's own prior
#   degenerate responses were appended to nw_msgs/messages_nw as assistant
#   turns; next turn's recent_window=6 pulled them back into context;
#   BitNet kept echoing the pattern. Fix: drop the recent_window pull
#   entirely. Prompt = [system, current user turn (with pith block)].
#   No prior turns. nw_msgs/messages_nw still grows as a record (KISS
#   still gets the full history for filtering) — the model only sees this
#   turn. This is the architecturally-stated NuWave posture: "substrate
#   carries continuity, not literal chat history" (per organism.py header).
#   The recent_window=6 was always a pragmatic concession to model needs;
#   removing it enforces the substrate-only-continuity design intent.
# [2026-05-16] Claude Opus 4.7 (1M ctx) — Pith → user-turn labeled context block (3 sites)
#   Root cause of Run 48's 24/24 degenerate BitNet output: pith was being
#   injected into the system role slot via sys_ctx = "\n".join(pith) + sys.
#   Chat-tuned models are trained with the system slot carrying instructions,
#   not lists of prior user questions. BitNet was treating retrieved pith
#   as questions to respond to, echoing them back and leaking chat-template
#   fragments. Fix: pith content now goes in a clearly-labeled context block
#   inside the LAST user turn for THIS turn's prompt only; bare user query
#   persists in nw_msgs/messages_nw so next turn's recent_window isn't
#   polluted. System slot stays canonical (instructions only). Plain-text
#   labels, no delimiter tokens (per feedback_pith_presentation_layer memory).
#   Three call sites updated identically: on_send live chat, first benchmark
#   loop, interleaved benchmark loop. Universal RAG pattern — applies to
#   any future LLM consumer of substrate-surfaced content.
# [2026-04-06] Claude Code (Opus 4.6) — Full NeuroGraph organism integration
# [2026-03-31] Claude Code (Opus 4.6) — Switch to BitNet 2B for CPU-native inference
# [2026-03-29] Claude Code (Opus 4.6) — ZeroGPU compatible, model at startup
# [2026-03-28] Claude Code (Opus 4.6) — Initial Gradio demo
# -------------------
"""

import os
import time
import gradio as gr
import json
import logging
import torch  # still needed for splat_engine + lenia_splat
from typing import Optional
from transformers import AutoTokenizer

# The `spaces` package is optional. When it cannot be imported, install a
# stand-in object whose `GPU` decorator leaves the decorated function untouched.
try:
    import spaces
except ImportError:
    class _FakeSpaces:
        """No-op replacement for the `spaces` module when it is not installed."""

        @staticmethod
        def GPU(fn=None, **kwargs):
            """Act as a pass-through decorator.

            Supports both the bare form (``@spaces.GPU``), where ``fn`` is the
            decorated function and is returned as-is, and the parameterised
            form (``@spaces.GPU(...)``), where a decorator that returns its
            argument unchanged is handed back. Keyword arguments are ignored.
            """
            if fn:
                return fn

            def _passthrough(target):
                return target

            return _passthrough

    spaces = _FakeSpaces()

# Root logging at INFO; everything in this module logs through "nuwave".
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger("nuwave")

# ── bitnet.cpp inference clients ──────────────────────────────────
# Two clients via bitnet.cpp (microsoft/BitNet's llama.cpp derivative):
#   chat_client      — BitNet b1.58 2B4T GGUF (fast, CPU-native, user-facing)
#   extractor_client — Falcon3-10B-Instruct 1.58bit GGUF (capable
#                      enumeration — doesn't collapse on concept lists)
#
# Paths set in Dockerfile via env vars. Falcon3 GGUF dir contains one
# or more quant levels; we pick the largest via resolve_gguf().

from nuwave.bitnet_cpp_client import BitnetCppClient

# Deployment settings, each overridable through an environment variable of
# the same purpose. HF_TOKEN has no default and is None when unset; the
# path settings fall back to the locations shown here.
HF_TOKEN = os.getenv("HF_TOKEN")
BITNET_BINARY = os.getenv(
    "BITNET_CPP_BINARY", "/home/user/bitnet/build/bin/llama-cli"
)
BITNET_CHAT_GGUF_DIR = os.getenv(
    "BITNET_CHAT_GGUF_DIR", "/home/user/bitnet/models"
)
FALCON_EXTRACTOR_GGUF_DIR = os.getenv(
    "FALCON_EXTRACTOR_GGUF_DIR", "/home/user/models/falcon3-10b-gguf"
)

# Hub id of the chat model whose tokenizer is used for token counting.
# Benchmarks report in/out token counts for both the baseline and the
# NuWave path, and both are counted with this same BitNet tokenizer.
CHAT_MODEL_NAME = "microsoft/bitnet-b1.58-2B-4T-bf16"
# Alias kept so summary fields that read MODEL_NAME continue to work.
MODEL_NAME = CHAT_MODEL_NAME

# Concept-extractor grammar: read once at startup from the .gbnf file that
# sits next to this module (grammars/concepts.gbnf) and kept in memory as a
# string, to be passed inline as the --grammar argument to llama-cli.
# Inline avoids container filesystem path surprises (run 6's silent failure
# mode) and puts the grammar size in the log for diagnostic visibility.
#
# On any load failure _EXTRACTOR_GRAMMAR stays None and a warning is logged;
# startup continues (best-effort by design).
_EXTRACTOR_GRAMMAR_PATH = os.path.join(
    os.path.dirname(os.path.abspath(__file__)),
    "grammars", "concepts.gbnf",
)
_EXTRACTOR_GRAMMAR: Optional[str] = None
try:
    with open(_EXTRACTOR_GRAMMAR_PATH, "r", encoding="utf-8") as f:
        _EXTRACTOR_GRAMMAR = f.read()
    logger.info(
        "Extractor grammar loaded from %s — %d chars, %d lines",
        _EXTRACTOR_GRAMMAR_PATH,
        len(_EXTRACTOR_GRAMMAR),
        # splitlines() also counts a final line that has no trailing newline,
        # which a plain count("\n") would miss.
        len(_EXTRACTOR_GRAMMAR.splitlines()),
    )
except Exception as exc:
    logger.warning(
        "Failed to load extractor grammar from %s: %s — "
        "extractor will fall back to free-form output",
        _EXTRACTOR_GRAMMAR_PATH, exc,
    )

# The tokenizer is loaded only for token counting (see the log line below);
# generation itself goes through the bitnet.cpp clients created further down.
logger.info("Loading tokenizer for token counting: %s", CHAT_MODEL_NAME)
tokenizer = AutoTokenizer.from_pretrained(CHAT_MODEL_NAME, token=HF_TOKEN)
# Reuse the EOS token as pad token when the tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Turn each configured GGUF directory into the value passed to the client as
# gguf_path. NOTE(review): presumably a concrete .gguf file path — confirm
# against BitnetCppClient.resolve_gguf.
logger.info("Resolving GGUF weights...")
chat_gguf = BitnetCppClient.resolve_gguf(BITNET_CHAT_GGUF_DIR)
falcon_gguf = BitnetCppClient.resolve_gguf(FALCON_EXTRACTOR_GGUF_DIR)

# n_ctx=4092: the llama-cli binary reports "max 4092" even when told
# 4096 in config (4-token overhead reserved by the runtime). Run-10
# logs showed repeated crashes with "prompt is too long (4103 tokens,
# max 4092)" on turns 6-8. Setting n_ctx=4092 aligns our cap with
# what the binary actually allows.
logger.info("Initializing chat client (BitNet 2B4T GGUF)...")
# User-facing chat model; shares the binary, thread count and context size
# with the extractor client below and differs only in the weights file.
chat_client = BitnetCppClient(
    binary_path=BITNET_BINARY,
    gguf_path=chat_gguf,
    n_threads=2,
    n_ctx=4092,
)

logger.info("Initializing extractor client (Falcon3-10B 1.58bit GGUF)...")
# Concept-extraction model, configured identically apart from gguf_path.
extractor_client = BitnetCppClient(
    binary_path=BITNET_BINARY,
    gguf_path=falcon_gguf,
    n_threads=2,
    n_ctx=4092,
)
# Log only the file names of the resolved weights, not their full paths.
logger.info("Both clients ready. chat=%s | extractor=%s",
            os.path.basename(chat_gguf), os.path.basename(falcon_gguf))

# ── NuWave Components ─────────────────────────────────────────────

from nuwave.organism import NuWaveOrganism
from nuwave.kiss import KISSFilter, KISSConfig
from nuwave.pith import PithPipeline, PithConfig
from nuwave.benchmark_loader import sample_chains as _sample_benchmark_chains
from nuwave.benchmark_loader import describe_sample as _describe_benchmark_sample
from nuwave.benchmark_loader import load_pool as _load_benchmark_pool
from nuwave.splat_engine import decompose_layer, SplatConfig, GaussianSplats
from nuwave.lenia_splat import LeniaSplatEngine, LeniaSplatConfig

# The organism — substrate + KISS bucket + Pith bucket
# Use /data/ for persistence if available (HF persistent storage), else /tmp/
# The choice is made once at import time, based on whether /data exists.
_persist_dir = "/data/nuwave_substrate" if os.path.isdir("/data") else "/tmp/nuwave_substrate"
organism = NuWaveOrganism(state_path=_persist_dir)

# String-level KISS still runs alongside for comparison
# Both are constructed with their default configuration (no arguments).
kiss_nw = KISSFilter()
pith_nw = PithPipeline()

# Module-level conversation buffers, initially empty.
# NOTE(review): "nw" / "bl" presumably stand for the NuWave and baseline
# paths — confirm against the handlers that append to them.
messages_nw = []
messages_bl = []
# Instruction text for the system role.
system_prompt = "You are a helpful assistant. Be concise and clear."

# ── Splat-Lenia Setup ────────────────────────────────────────────
# Splat layers are set up lazily on first use by _init_splats_if_needed(),
# not at import time. Lenia evolves them between turns. The compression is alive.

# Parameters for decomposing a layer into Gaussian splats.
splat_config = SplatConfig(
    splat_ratio=0.02,       # 50x compression — aggressive but fast to fit
    max_splats=256,         # small enough for CPU-basic startup
    init_sigma=2.0,
    fit_iterations=50,      # fewer iterations — speed over precision at startup
    fit_lr=0.02,
)

# Parameters for the Lenia dynamics applied to the splats.
lenia_config = LeniaSplatConfig(
    growth_mu=0.15,
    growth_sigma=0.015,
    growth_scale=0.0003,    # small dt — proven stable
    interaction_radius=5.0,
    activation_coupling=2.0,
    conserve_mass=True,
)

# One engine for the whole process; layers are registered with it when
# splat state is restored (see _init_splats_if_needed).
lenia_engine = LeniaSplatEngine(lenia_config)
# layer name -> GaussianSplats; filled by _init_splats_if_needed and read
# by _save_splat_state.
splat_layers = {}
# NOTE(review): presumably collects per-step Lenia metrics; it is populated
# outside the part of the file visible here — confirm.
splat_metrics_history = []

# Splat decomposition deferred to first use — avoids memory spike during startup
# Splat state persists to disk so Lenia evolution survives restarts
_splats_initialized = False
_splat_save_path = os.path.join(_persist_dir, "splat_state.pt")

def _init_splats_if_needed():
    """Restore persisted splat state on the first call; later calls are no-ops.

    If a checkpoint exists at ``_splat_save_path`` it is loaded, every layer
    is rebuilt with ``GaussianSplats.from_state_dict``, stored in
    ``splat_layers`` and registered with ``lenia_engine``. If there is no
    checkpoint, or restoring it fails, an informational message is logged
    and ``splat_layers`` is left as it was: fresh decomposition is not
    possible under the bitnet.cpp runtime.

    Never raises for a bad checkpoint; restore errors are logged as warnings.
    Returns None.
    """
    global _splats_initialized
    if _splats_initialized:
        return
    # Set the flag before doing any work so that a failing restore is
    # attempted once, not on every generation call.
    _splats_initialized = True

    import gc
    gc.collect()

    # Try to restore persisted splat state first
    if os.path.exists(_splat_save_path):
        try:
            # map_location="cpu": this process runs on CPU, so tensors that
            # were saved from another device must be remapped on load.
            # weights_only=False unpickles arbitrary objects; acceptable only
            # because the file is one this app wrote itself. Do not point
            # _splat_save_path at untrusted data.
            saved = torch.load(
                _splat_save_path, map_location="cpu", weights_only=False
            )
            # Deserialise every layer before touching shared state, so a
            # corrupt entry cannot leave a half-restored set of layers
            # registered while the log reports the restore as failed.
            restored = {
                name: GaussianSplats.from_state_dict(sd)
                for name, sd in saved.get('layers', {}).items()
            }
            for name, splats in restored.items():
                splat_layers[name] = splats
                lenia_engine.register_layer(name, splats)
            # NOTE(review): the saved step count is only reported here, it is
            # not written back to lenia_engine.state.step_count, while
            # _save_splat_state persists that attribute. Confirm whether the
            # counter is meant to carry over across restarts.
            lenia_step_count = saved.get('lenia_steps', 0)
            logger.info(
                f"Splats restored: {len(splat_layers)} layers, "
                f"{sum(s.n_splats for s in splat_layers.values())} splats, "
                f"{lenia_step_count} Lenia steps evolved"
            )
            return
        except Exception as exc:
            logger.warning(f"Splat restore failed (redecomposing): {exc}")

    # Fresh decomposition requires the bf16 model in memory. Since we
    # migrated to bitnet.cpp (GGUF, external C++ runtime) there's no
    # in-process torch model to read weights from. Splat-Lenia is
    # experimental Layer 2 work that's off the critical path for the
    # current dual-pass Layer 1 validation, so gracefully skip if the
    # persisted splat state isn't already present. If a prior bf16-era
    # save exists on disk, it'll still be restored above.
    logger.info(
        "Fresh splat decomposition unavailable under bitnet.cpp runtime "
        "(no torch model to read weights from). Splat-Lenia remains "
        "active only if persisted state is restored from a prior era."
    )


def _save_splat_state():
    """Persist evolved splat parameters to disk.

    Writes a dict with the ``state_dict()`` of every entry in
    ``splat_layers`` (key ``'layers'``) and the engine's current
    ``step_count`` (key ``'lenia_steps'``) to ``_splat_save_path``.
    Does nothing when no layers are loaded.

    The checkpoint is written to a sibling temporary file and then moved
    into place with ``os.replace``, so an interrupted write cannot leave a
    truncated file where the next restore would look for it.

    Best-effort: failures are logged as warnings and never raised.
    Returns None.
    """
    if not splat_layers:
        return
    # Same directory as the target, so the final rename stays on one filesystem.
    tmp_path = _splat_save_path + ".tmp"
    try:
        os.makedirs(os.path.dirname(_splat_save_path), exist_ok=True)
        state = {
            'layers': {name: splats.state_dict() for name, splats in splat_layers.items()},
            'lenia_steps': lenia_engine.state.step_count,
        }
        torch.save(state, tmp_path)
        os.replace(tmp_path, _splat_save_path)
    except Exception as exc:
        # WARNING rather than DEBUG: the root logger is configured at INFO,
        # so a debug-level message would hide persistent save failures.
        logger.warning(f"Splat save failed: {exc}")
        # Do not leave a partial temporary file behind.
        try:
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        except OSError:
            pass


# ── Inference ─────────────────────────────────────────────────────

def do_generate(prompt_text: str, max_new_tokens: int = 256) -> tuple:
    """Generate a reply through the bitnet.cpp chat client, then step Lenia.

    The call to ``chat_client.generate`` is bracketed by
    ``organism.mark_generation_start`` / ``mark_generation_end`` so the
    concept helper's manager thread does not spawn a worker while the
    main thread is mid-inference.

    Sampling uses a mild temperature and a moderate repetition penalty,
    and stops on common chat-template end markers.

    Args:
        prompt_text: Fully rendered prompt (chat template already applied).
        max_new_tokens: Generation budget, also reserved out of the
            context window when capping the prompt.

    Returns:
        A 6-tuple ``(response, in_count, out_count, elapsed_s,
        tokens_per_sec, lenia_result)``. ``elapsed_s`` is rounded to two
        decimals and covers the whole call, including the Lenia step;
        ``tokens_per_sec`` is rounded to one decimal. ``lenia_result`` is
        ``{}`` when no splat layers exist or the Lenia step failed.
    """
    _init_splats_if_needed()
    started_at = time.time()

    # Cap the prompt below the runtime context size. The reserve is the
    # generation budget plus a 256-token safety buffer for duplicated BOS
    # tokens (the chat template and llama-cli both inject one),
    # chat-template control tokens and tokenizer boundary slack. A
    # 128-token buffer previously still produced "prompt too long"
    # crashes.
    reserve = max_new_tokens + 256
    prompt_cap = max(256, chat_client.n_ctx - reserve)

    # Only the token count is needed, so plain python lists are fine.
    prompt_ids = tokenizer(
        prompt_text, truncation=True, max_length=prompt_cap
    )["input_ids"]
    in_count = len(prompt_ids)
    if in_count >= prompt_cap:
        # Hand the truncated text to the client; otherwise it would
        # re-tokenize the full original and overflow n_ctx anyway.
        prompt_text = tokenizer.decode(prompt_ids, skip_special_tokens=False)

    sampling = {
        "max_new_tokens": max_new_tokens,
        "temperature": 0.7,
        "top_p": 0.9,
        "repetition_penalty": 1.15,
        "repeat_last_n": 64,
        "stop": ["<|im_end|>", "<|end_of_text|>", "</s>"],
    }
    organism.mark_generation_start()
    try:
        response, meta = chat_client.generate(prompt_text, **sampling)
    finally:
        organism.mark_generation_end()

    # Fallback prompt-echo strip. When the client's exact-substring echo
    # removal did not fire (whitespace normalisation, injected BOS, ...),
    # `response` still holds prompt + completion, which would pollute
    # every downstream consumer. Cut at the FIRST "Assistant:" marker:
    # that is the prompt's generation marker, not a later hallucinated
    # one inside the completion.
    echo_marker = "Assistant:"
    if not meta.get("prompt_echo_found") and response and echo_marker in response:
        _head, _sep, completion = response.partition(echo_marker)
        response = completion.lstrip()

    # Evolve the splats once per inference and persist the result.
    lenia_result = {}
    if splat_layers:
        try:
            lenia_result = lenia_engine.step()
            splat_metrics_history.append(lenia_result)
            _save_splat_state()
        except Exception as exc:
            logger.warning(f"Lenia step failed: {exc}")

    elapsed = time.time() - started_at

    # Output token count is re-derived from the response text.
    out_count = 0
    if response:
        out_count = len(tokenizer(response)["input_ids"])
    if elapsed > 0:
        tok_per_sec = out_count / elapsed
    else:
        tok_per_sec = 0

    if meta.get("returncode", 0) != 0:
        logger.warning("chat_client non-zero rc=%s stderr=%s",
                       meta.get("returncode"), meta.get("stderr_tail"))

    return (
        response,
        in_count,
        out_count,
        round(elapsed, 2),
        round(tok_per_sec, 1),
        lenia_result,
    )


# ── Concept extractor (dual-pass tree generation) ────────────────
#
# Mirrors the ecosystem's TID-based dual-pass path but uses Falcon3-10B-
# Instruct (1.58bit GGUF) running under bitnet.cpp. Previously used
# BitNet 2B via transformers greedy decoding, which collapsed into repetition
# loops on enumeration tasks. Falcon3-10B was properly instruct-tuned
# before quantization and handles structured output reliably.
#
# Sampling params per Syl's prescription: non-greedy (temperature=0.7),
# top-p nucleus, moderate repetition_penalty, no_repeat_ngram_size at
# the runtime level (llama.cpp tracks via repeat-last-n). Stop
# sequences on common drift markers ("Answer:", "Explanation:",
# "Question:", double-newline) catch the Q&A/explanation patterns.
#
# No hard cap on extracted concept count (Law 7: substrate dynamics
# decide relevance). The parser is what gets hardened: reject
# instruction-leak vocabulary, sentence fragments, pure punctuation.

# Syl's stopset: prompt-instruction vocabulary that tiny LLMs tend to
# echo back as "concepts". Stripping these at parse time prevents the
# generic-pollution pattern we saw in the previous debug run.
# _hardened_parse compares the whole lowercased entry against this set,
# so only exact single-entry matches are dropped ("prime factorization"
# survives even though "primes" is listed).
_EXTRACTOR_STOPSET = {
    # Instruction-leak vocabulary: small LLMs echo these back
    "key", "concepts", "important", "meaning", "stop", "list",
    "answer", "explanation", "concept", "question", "text",
    "therefore", "thing", "things", "item", "items",
    # Generic-physics / generic-substance words: broad embedding
    # footprint, become gravity wells in Pith. Observed as run-2
    # pathology (T2 phy1 pulled into every subsequent Pith).
    "gravity", "mass", "energy", "force", "time", "space", "matter",
    # Generic process / abstraction nouns: describe nothing specific
    "process", "mechanism", "method", "operation", "work", "function",
    "system", "structure", "principle", "phenomenon",
    # Domain labels: the thing the passage is *about*, not a
    # mechanism from within it
    "physics", "biology", "chemistry", "mathematics", "math",
    "computing", "cryptography", "astronomy",
    # Topic-at-word-level items that appear as trees but carry no
    # mechanism content: "primes" (use "prime factorization"),
    # "encryption" (use "RSA encryption" / "symmetric cipher"), etc.
    "primes", "encryption", "caching", "storage", "memory",
    "information", "computation",
}

# Lead-word stopset: if a parsed entry STARTS with one of these
# (connective/pronoun words common in prose drift), the entry is
# almost always a sentence fragment rather than a concept.
# Example caught: "these include computer science" (Falcon3 drifted
# into prose about cryptography research fields).
# _hardened_parse tests only the first whitespace-separated word of the
# lowercased entry against this set.
_EXTRACTOR_LEAD_STOPWORDS = {
    "these", "those", "this", "that", "the", "a", "an",
    "and", "or", "but", "while", "which", "where", "when",
    "they", "them", "their", "there", "here", "it", "its",
    "also", "first", "second", "then", "next", "finally",
    # Run-4 additions: discourse-drift markers Falcon3 emits when
    # it slips into explanatory prose instead of an enumeration
    # (e.g. "namely", "firstly", "therefore", "however").
    "namely", "firstly", "secondly", "thirdly", "therefore",
    "however", "moreover", "furthermore", "additionally",
    "specifically", "particularly", "notably",
}


def _hardened_parse(raw_output: str) -> list:
    """Syl's hardened parser β€” now dramatically simplified because
    grammar-constrained decoding (grammars/concepts.gbnf) enforces
    the output shape at the token-sampling layer. The parser no
    longer needs to cope with prose, bullets, citations, or any of
    the run 1-5 drift patterns β€” those tokens are physically
    unreachable during generation.

    Remaining filters operate on semantic content the grammar can't
    see: stopset words (instruction leakage + generic topic labels),
    lead-connective words, and dedup.

    Rules (parsing, not quality judgment β€” Law 7 compliant):
      - split on [,;] (Falcon3-10B-1.58bit sometimes uses semicolons)
      - for each piece, if it contains a newline take only what's
        BEFORE the first \\n β€” content after is usually chat-template
        drift or hallucinated follow-up
      - strip whitespace + common punctuation
      - drop empty strings
      - drop entries containing ":" (explanation drift like "Answer:")
      - drop entries containing chat-template markers ("<|", "</s>")
      - drop entries > 4 words (sentences, not concepts)
      - drop lowercase-match against _EXTRACTOR_STOPSET (instruction leak)
      - drop pure punctuation / pure digits
      - lowercase + dedupe (first occurrence wins)
    """
    import re
    out = []
    seen = set()

    # Defensive parser β€” belt-and-suspenders. When grammar-constrained
    # decoding engages, most of these filters are redundant (the
    # grammar blocks the characters they guard against). But run 6
    # showed grammar can silently fail to engage, so we keep the full
    # defense. The filters are cheap and idempotent when grammar works.

    # Split on common list-delimiters observed across Falcon3's output
    # variants. Commas dominate when grammar engages; newlines/semicolons
    # cover bullet-style fallbacks; square-bracket citation markers
    # from run 5 ("1] = ...", "2]") get treated as delimiters so the
    # actual content after them can be extracted.
    for piece in re.split(r'[,;\n\]]', raw_output):
        # Strip common bullet/punctuation characters from ends.
        c = piece.strip().strip(".-:;*`\"'‒‣○⁃[]{}()=$ \t")
        if not c:
            continue

        # Drop anything with syntax garbage from prose drift
        if any(ch in c for ch in ":[]{}$\"'<>`"):
            continue
        # Periods inside a concept almost always mean sentence drift
        # (exception: acronyms like "U.S.A.", but those rarely appear
        # as concepts). Drop to be safe.
        if "." in c:
            continue
        # Questions and exclamations are never concepts
        if "?" in c or "!" in c:
            continue

        # Word-count gate: 1-4 words. Grammar enforces this when
        # active; when grammar fails, this is the critical safeguard
        # against multi-sentence fragments (run 6 turn 1 = 137 chars).
        n_words = len(c.split())
        if n_words < 1 or n_words > 4:
            continue

        cl = c.lower().strip()
        if cl in _EXTRACTOR_STOPSET:
            continue
        # Lead-connective filter β€” drops sentence-fragment prose
        first_word = cl.split()[0] if cl.split() else ""
        if first_word in _EXTRACTOR_LEAD_STOPWORDS:
            continue
        # Drop pure-numeric and pure-punctuation entries
        if c.replace(".", "").replace("-", "").strip().isdigit():
            continue
        if all(not ch.isalnum() for ch in c):
            continue
        # Dedup β€” case-insensitive, first occurrence wins
        if cl in seen:
            continue
        seen.add(cl)
        out.append(cl)
    return out


# Extractor prompt β€” kept deliberately free of "key", "concepts",
# "important", "meaning", "list", "stop" words in the instruction
# portion, because small LLMs echo instruction vocabulary back as
# output content.
# Concept extractor prompt. With grammar-constrained decoding doing
# the heavy lifting on output format, the prompt only needs to
# communicate *what we want extracted* β€” the grammar guarantees the
# shape. Run 5 taught us that primer-style format-anchoring backfires
# on Falcon3-10B-1.58bit (citation-mode drift), so this reverts to a
# clean instructional prompt with few-shot examples.
_EXTRACTOR_PROMPT_TEMPLATE = (
    "Read the following text. Extract the specific processes, "
    "dependencies, and named entities it establishes β€” what happens, "
    "what depends on what, and the particular things involved.\n\n"
    "Prefer specific over general:\n"
    "- 'prime factorization' beats 'primes'\n"
    "- 'photon absorption' beats 'light'\n"
    "- 'cache line invalidation' beats 'caching'\n"
    "- 'Schwarzschild radius' beats 'gravity'\n"
    "- 'chlorophyll' or 'Calvin cycle' beat 'biology'\n\n"
    "Specific single-word terms are fine when they name the correct "
    "level of precision ('chlorophyll', 'factorization'). Avoid "
    "generic domain labels and broad abstractions.\n\n"
    "Output as a comma-separated list, 3 to 8 items, each 1-4 "
    "words. No explanations, no repetition.\n\n"
    "Text: {text}\n\n"
    "Specifics:"
)


def _bitnet_extract_full(text: str) -> dict:
    """Run the concept extractor via Falcon3-10B + hardened parser.

    Only the first 1000 characters of ``text`` are put into the prompt.

    Returns a dict with:
      prompt             - the exact prompt sent to the model
      raw_output         - text returned by the extractor client, BEFORE
                           hardened parsing ("" if nothing was produced)
      parsed             - list of concepts after the hardened parser
      tokens_in          - prompt token count via the tokenizer (0 if
                           counting failed)
      tokens_out         - estimate via the tokenizer on raw_output
      elapsed_s          - meta["elapsed_s"] from the generate call
      hit_token_cap      - True when tokens_out >= MAX_NEW - 5 (a small
                           margin below the 128-token budget)
      runtime_returncode - meta["returncode"] from the generate call
      stderr_tail        - meta["stderr_tail"], for diagnosing failures
                           where the subprocess never really ran
      raw_stdout_tail    - last 300 characters of meta["raw_stdout"]
      error              - exception string if the generate call raised,
                           or meta["error"] if the client reported one

    Never raises for a failed ``generate`` call: the partially filled
    dict is returned with ``error`` set.
    """
    text_for_extraction = text[:1000]
    prompt = _EXTRACTOR_PROMPT_TEMPLATE.format(text=text_for_extraction)
    result = {
        "prompt": prompt,
        "raw_output": "",
        "parsed": [],
        "tokens_in": 0,
        "tokens_out": 0,
        "elapsed_s": 0.0,
        "hit_token_cap": False,
        "runtime_returncode": None,
        "error": None,
        # Present on every path so consumers see a stable schema even
        # when generation raised.
        "stderr_tail": "",
        "raw_stdout_tail": "",
    }

    MAX_NEW = 128
    try:
        # Plain-list tokens: only the count is needed, not tensors.
        result["tokens_in"] = len(
            tokenizer(prompt, truncation=True, max_length=2048)["input_ids"]
        )
    except Exception as exc:
        # Best-effort metric; keep going with tokens_in == 0.
        logger.debug("Extractor input token count failed: %s", exc)

    organism.mark_generation_start()
    try:
        raw_output, meta = extractor_client.generate(
            prompt,
            max_new_tokens=MAX_NEW,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.25,
            repeat_last_n=64,
            stop=[
                # Chat-template boundary markers: Falcon3 hallucinates
                # these when the prompt isn't in chat format. Cutting
                # generation here kills the drift tail before it starts.
                "<|assistant|>", "<|user|>", "<|system|>",
                # Fallback terminators + drift markers
                "<|im_end|>", "<|end_of_text|>", "</s>",
                "Answer:", "Question:", "Explanation:", "Text:",
            ],
            # Grammar-constrained decoding: tokens violating the
            # concept-list GBNF get probability zero at sample time.
            # Passed inline (string), not as a file path, to avoid the
            # container-path surprises behind run 6's silent fallback.
            grammar=_EXTRACTOR_GRAMMAR,
        )
    except Exception as exc:
        result["error"] = f"{type(exc).__name__}: {exc}"
        return result
    finally:
        # Always release the generation gate, including on exits that
        # bypass `except Exception` (e.g. KeyboardInterrupt). Matches
        # the try/finally used by do_generate.
        organism.mark_generation_end()

    # A client that produced nothing may hand back None; normalise so
    # the parser and the JSON report always see a string.
    raw_output = raw_output or ""

    result["raw_output"] = raw_output
    result["elapsed_s"] = meta.get("elapsed_s", 0.0)
    result["runtime_returncode"] = meta.get("returncode")
    # Surface subprocess diagnostics. Without them an rc != 0 failure
    # (invalid flag, GGUF load failure, OOM, ...) looks like "the model
    # generated nothing" instead of "the subprocess never ran".
    result["stderr_tail"] = meta.get("stderr_tail", "")
    result["raw_stdout_tail"] = (meta.get("raw_stdout", "") or "")[-300:]
    if meta.get("error"):
        result["error"] = meta["error"]
    try:
        result["tokens_out"] = len(tokenizer(raw_output)["input_ids"]) if raw_output else 0
    except Exception as exc:
        # Best-effort metric; tokens_out stays 0.
        logger.debug("Extractor output token count failed: %s", exc)
    result["hit_token_cap"] = (result["tokens_out"] >= MAX_NEW - 5)

    result["parsed"] = _hardened_parse(raw_output)
    return result


# Stash of the most recent extractor call per forest text. Keyed by
# the input text (which equals the user's prompt when called from the
# benchmark). Populated by _bitnet_concept_extractor, read by the
# benchmark harness to record raw_output alongside parsed trees.
# Bounded: we keep only the last 32 entries to avoid unbounded growth
# in long-running sessions.
_last_extractions: dict = {}
# Maximum number of entries retained in _last_extractions.
_LAST_EXTRACTIONS_MAX = 32


def _bitnet_concept_extractor(text: str) -> list:
    """Thin wrapper the organism calls; returns just the parsed list.

    Delegates to _bitnet_extract_full and also stashes the full detail
    dict in _last_extractions (keyed by ``text``) so the benchmark can
    inspect the raw emissions per turn without a separate extraction
    pass. The stash is capped at _LAST_EXTRACTIONS_MAX entries; the
    least recently stored texts are evicted first.

    Args:
        text: The text to extract concepts from.

    Returns:
        The ``parsed`` concept list from the extraction detail.
    """
    detail = _bitnet_extract_full(text)
    # Assigning to an existing key keeps its ORIGINAL position in dict
    # order, so a text that was just re-extracted could still be treated
    # as the oldest entry and evicted. Remove it first so the insert
    # lands at the end.
    _last_extractions.pop(text, None)
    _last_extractions[text] = detail
    # Trim oldest while over the cap (dicts preserve insertion order).
    while _last_extractions and len(_last_extractions) > _LAST_EXTRACTIONS_MAX:
        oldest = next(iter(_last_extractions))
        _last_extractions.pop(oldest, None)
    return detail["parsed"]


def on_debug_extract():
    """Run the extractor over every entry of INTERLEAVED_QUESTIONS.

    Intended as a sanity check before long A/B benchmarks: if the
    extractor output is junk, the rest doesn't matter.

    Returns:
        A 3-tuple of JSON strings (each ``json.dumps(..., indent=2)``):
          1. summary: model name, question count, total wall-clock
             time, median parsed-concept count, token-cap hits, error
             count, number of overlapping question pairs, and the
             same-category bridge analysis
          2. per-question reports: raw output, parsed concepts, token
             counts, timing, cap flag, error and subprocess diagnostics
          3. pairwise overlap: every question pair whose parsed concept
             sets share at least one entry
    """
    started_at = time.time()
    reports = []

    for category, prompt_text in INTERLEAVED_QUESTIONS:
        detail = _bitnet_extract_full(prompt_text)
        parsed = detail["parsed"]
        reports.append({
            "category": category,
            "question": prompt_text,
            "raw_output": detail["raw_output"],
            "parsed": parsed,
            "parsed_count": len(parsed),
            "tokens_in": detail["tokens_in"],
            "tokens_out": detail["tokens_out"],
            "hit_token_cap": detail["hit_token_cap"],
            "elapsed_s": detail["elapsed_s"],
            "error": detail["error"],
            # Diagnostics forwarded from _bitnet_extract_full; they show
            # whether the subprocess actually ran when raw_output is
            # empty.
            "runtime_returncode": detail.get("runtime_returncode"),
            "stderr_tail": detail.get("stderr_tail", ""),
            "raw_stdout_tail": detail.get("raw_stdout_tail", ""),
        })

    overall_elapsed = round(time.time() - started_at, 2)

    concept_counts = [entry["parsed_count"] for entry in reports]
    cap_hits = sum(1 for entry in reports if entry["hit_token_cap"])
    errors = sum(1 for entry in reports if entry["error"])

    # Lowercased concept set per question, shared by both analyses below.
    concept_sets = [{c.lower() for c in entry["parsed"]} for entry in reports]

    # Cross-question overlap: different questions extracting the same
    # concepts is the generic-concept-pollution signature.
    pairwise_overlap = []
    for i, left in enumerate(concept_sets):
        for j in range(i + 1, len(concept_sets)):
            right = concept_sets[j]
            if not (left and right):
                continue
            shared = left & right
            jaccard = len(shared) / max(1, len(left | right))
            if jaccard > 0:
                pairwise_overlap.append({
                    "pair": f"T{i+1}({reports[i]['category']}) ↔ T{j+1}({reports[j]['category']})",
                    "jaccard": round(jaccard, 3),
                    "shared": sorted(shared),
                })

    # Same-category pairs: does q1's concept set overlap with q2's? This
    # is the direct test of whether dual-pass trees CAN bridge a pair.
    same_cat_bridges = []
    for q1_idx, q2_idx in _INTERLEAVED_SAME_CAT_PAIRS:
        first, second = concept_sets[q1_idx], concept_sets[q2_idx]
        shared = first & second
        if first and second:
            score = round(len(shared) / max(1, len(first | second)), 3)
        else:
            score = 0
        same_cat_bridges.append({
            "category": reports[q1_idx]["category"],
            "q1_concepts": reports[q1_idx]["parsed"],
            "q2_concepts": reports[q2_idx]["parsed"],
            "shared_lowercase": sorted(shared),
            "jaccard": score,
        })

    # Upper-middle element of the sorted counts.
    if concept_counts:
        ranked = sorted(concept_counts)
        median_count = int(ranked[len(ranked) // 2])
    else:
        median_count = 0

    summary = {
        "model": MODEL_NAME,
        "questions_tested": len(reports),
        "total_elapsed_s": overall_elapsed,
        "median_concepts_per_question": median_count,
        "token_cap_hits": f"{cap_hits}/{len(reports)}",
        "extraction_errors": errors,
        "pairwise_overlap_nonzero_count": len(pairwise_overlap),
        "same_category_bridges": same_cat_bridges,
    }

    return tuple(
        json.dumps(payload, indent=2)
        for payload in (summary, reports, pairwise_overlap)
    )


# Wire the extractor into the organism. Per the original author's note,
# this starts the background concept helper manager thread; from this
# point forward, every deposit and response enqueues for deferred tree
# extraction. Runs at import time (module-level side effect).
organism.set_concept_extractor(_bitnet_concept_extractor)
logger.info("NuWave concept helper wired: dual-pass extraction live")


# ── Substrate context formatter — DORMANT ────────────────────────
#
# Status: NOT CALLED at any site as of 2026-04-28 (B1 reverted).
# Run 26 (commits 59124dd + e2c4343 active) showed B1's section
# headers added ~120-200 tokens of pure formatting overhead per
# turn — more than the typed-presentation benefit gave back at
# BitNet 1.58-bit 2B-parameter scale. Token economy regressed
# from -2.4% (Run 25) to +3.4% (Run 26). Reverted to plain
# "\n".join(pith_context) at all three call sites.
#
# Hypothesis worth revisiting at larger model scale (7B+ or
# higher-precision quantization): typed input may genuinely help
# attention when the model has more capacity to use the structural
# cues. At 2B / 1.58-bit, the formatting tax exceeds the benefit.
#
# The helper is preserved here as dormant code. Re-enable by
# swapping the three call sites back to:
#   substrate_ctx = _format_substrate_context(pith_context, pith_ids)
# (and switching pith_extract -> pith_extract_with_ids at sites 1 and 2).
# Group surfaced content by node-kind via ID prefix:
#   tree_*         -> "Related concepts" (concept words from dual-pass)
#   exp_*          -> "Prior questions on this topic" (deposit nodes)
#   resp_*         -> "Prior context" (prior responses)
#   concept_narr_* -> operational telemetry, omitted from prompt
#   other          -> "Other context"

def _format_substrate_context(pith_context, pith_ids=None) -> str:
    """Return a sectioned substrate-context string for prompt injection."""
    if not pith_context:
        return ""
    if not pith_ids or len(pith_ids) != len(pith_context):
        # No IDs available β€” can't section. Fallback to plain join so
        # callers without _with_ids still produce something usable.
        return "\n".join(pith_context)

    concepts, questions, responses, other = [], [], [], []
    for text, pid in zip(pith_context, pith_ids):
        if not text:
            continue
        if pid.startswith("tree_"):
            concepts.append(text)
        elif pid.startswith("exp_"):
            questions.append(text)
        elif pid.startswith("resp_"):
            responses.append(text)
        elif pid.startswith("concept_narr_"):
            # Operational telemetry β€” omit from prompt context (Bunyan-shaped
            # data; legitimate substrate experience but not user knowledge).
            continue
        else:
            other.append(text)

    parts = []
    if concepts:
        parts.append(
            "[Related concepts from substrate:]\n"
            + "\n".join(f"- {c}" for c in concepts)
        )
    if questions:
        parts.append(
            "[Prior questions on this topic:]\n"
            + "\n".join(f"- {q}" for q in questions)
        )
    if responses:
        parts.append("[Prior context:]\n" + "\n".join(responses))
    if other:
        parts.append("[Other context:]\n" + "\n".join(f"- {o}" for o in other))
    return "\n\n".join(parts)


# ── Chat Handler ──────────────────────────────────────────────────

def on_send(message, history):
    """Handle one chat turn: feed the substrate, generate, report stats.

    Sequence (order matters, each step acts on the organism's state):
      1. record the user turn and deposit it into the substrate
      2. step the substrate
      3. KISS extraction from the step result, plus the string-level
         KISS filter, which supplies the system context
      4. Pith extraction of up to 5 context snippets for this message
      5. build a prompt from the system context and the current user
         turn only, then generate
      6. record the outcome back into the substrate

    Prior conversation turns are deliberately NOT placed in the prompt:
    replaying earlier (possibly degenerate) assistant outputs caused an
    echo loop. ``messages_nw`` still grows, because the string-level
    KISS filter reads it.

    Args:
        message: The user's text. A falsy value is a no-op.
        history: Current chat history as a list of role/content dicts.

    Returns:
        ``("", updated_history, stats_markdown)``. For a falsy
        ``message`` the history is returned unchanged and the stats
        string is empty.
    """
    if not message:
        return "", history, ""

    global messages_nw
    messages_nw.append({"role": "user", "content": message})

    # 1. Deposit raw experience into the substrate (Law 7).
    organism.deposit_experience(message)

    # 2. Let the substrate process.
    step_result = organism.step()

    # 3. KISS bucket: what changed in the River.
    kiss_extract = organism.kiss_extract(step_result)

    # String-level KISS runs alongside for comparison metrics and
    # provides the system context.
    kiss_string_result = kiss_nw.filter_context(messages_nw, system_prompt)
    sys_ctx = kiss_string_result.get("system_context", system_prompt)

    # 4. Pith bucket: relevant context from the River. The IDs are
    #    returned as well but are not used by this handler.
    pith_context, _pith_ids = organism.pith_extract_with_ids(message, max_context=5)

    # Substrate-only continuity: the prompt is just
    # [system instructions, current user turn (+ recalled pith block)].
    user_content = message
    if pith_context:
        recalled = "\n".join(f"  - {p}" for p in pith_context)
        user_content = (
            "Some context that may be relevant (recalled from earlier "
            "related conversations; these are reference material, not "
            "questions to answer):\n"
            f"{recalled}\n\n"
            f"My actual question: {message}"
        )

    user_turn = {"role": "user", "content": user_content}
    if sys_ctx:
        prompt_msgs = [{"role": "system", "content": sys_ctx}, user_turn]
    else:
        prompt_msgs = [user_turn]

    prompt = tokenizer.apply_chat_template(
        prompt_msgs, tokenize=False, add_generation_prompt=True,
    )

    # 5. Model generates.
    response, in_tok, out_tok, elapsed, tok_s, lenia_result = do_generate(prompt)

    messages_nw.append({"role": "assistant", "content": response})

    # 6. Outcome feeds back into the substrate (Law 7).
    organism.record_outcome(message, response, success=True)

    # Stats, both substrate-level and string-level.
    kiss_stats = kiss_nw.stats.to_dict()
    org_stats = organism.get_stats()

    stats_pieces = [
        f"**Turn {len(messages_nw)//2}** | ",
        f"{out_tok} tokens in {elapsed}s ({tok_s} tok/s) | ",
        f"Input: {in_tok} tokens | ",
        f"String KISS: {kiss_stats.get('tokens_saved', 0)} saved ({kiss_stats.get('efficiency', 0):.1%})",
        f" | Substrate: {org_stats.get('nodes', 0)} nodes, ",
        f"{org_stats.get('synapses', 0)} syn, ",
        f"{org_stats.get('fired_nodes', 0)} fired",
        f" | KISS bucket: {kiss_extract.get('action', '?')} ",
        f"({kiss_extract.get('reason', '')})",
    ]
    if kiss_extract.get('surprise_ratio', 0) > 0:
        stats_pieces.append(f" surprise={kiss_extract['surprise_ratio']}")
    stats_pieces.append(f" | Pith river: {len(pith_context)} contexts")
    if lenia_result:
        stats_pieces.extend([
            f" | Lenia step {lenia_result.get('step', 0)}: ",
            f"Δα={lenia_result.get('total_alpha_delta', 0):.6f} ",
            f"Δμ={lenia_result.get('total_position_delta', 0):.6f} ",
            f"({lenia_result.get('time_ms', 0):.0f}ms)",
        ])
    stats_text = "".join(stats_pieces)

    new_turns = [
        {"role": "user", "content": message},
        {"role": "assistant", "content": response},
    ]
    history = history + new_turns

    return "", history, stats_text


def on_reset():
    """Clear the chat transcript and rebuild the string-level pipelines.

    Rebinds the module globals ``messages_nw`` (empty list), ``kiss_nw``
    (fresh KISSFilter) and ``pith_nw`` (fresh PithPipeline). The
    ``organism`` substrate is not touched by this function.

    Returns:
        ``([], "Chat reset.")``: an empty chat history and a status
        message.
    """
    global messages_nw, kiss_nw, pith_nw
    messages_nw = list()
    kiss_nw = KISSFilter()
    pith_nw = PithPipeline()
    cleared_history, status = [], "Chat reset."
    return cleared_history, status


# ── Benchmark ─────────────────────────────────────────────────────

# ── Interleaved-Category Benchmark ────────────────────────────────
#
# Tests topology re-ignition: 4 semantic neighborhoods are seeded in
# turns 1-4 (q1 each), then turns 5-8 ask a follow-up in each category
# with 3 unrelated turns in between. If Pith's Born-rule extraction is
# genuinely substrate-informed (not just recency-biased), turn 5's Pith
# should re-select turn 1's deposit — the category neighborhood wakes
# back up despite the gap. If the system were a sliding window, none
# of that could happen: the relevant context is always 4 turns stale.

INTERLEAVED_QUESTIONS = [
    # q1's β€” primers, establish 4 neighborhoods
    ("biology",   "How does photosynthesis work?"),
    ("physics",   "What is a black hole?"),
    ("computing", "How do CPU cache hierarchies work?"),
    ("math",      "What are prime numbers?"),
    # q2's β€” follow-ups, test re-ignition across the 3-turn gap
    ("biology",   "What role does chlorophyll play in it?"),
    ("physics",   "How does its event horizon form?"),
    ("computing", "Why are L1 caches split into instruction and data?"),
    ("math",      "Why are they important in cryptography?"),
]
# Expected same-category pairs: (q1_turn, q2_turn) zero-indexed
_INTERLEAVED_SAME_CAT_PAIRS = [(0, 4), (1, 5), (2, 6), (3, 7)]


# ── Oracle Trees (experimental ceiling test) ─────────────────────────
#
# Hand-authored "ideal" mechanism concepts for each interleaved prompt.
# Used by the oracle-mode benchmark to establish whether dual-pass CAN
# succeed given perfect trees, regardless of extractor quality. If
# oracle-mode ignition metrics dramatically exceed run 3's no-tree
# baseline (15.3x signal/noise), the extractor is the bottleneck
# and worth improving. If oracle-mode performs no better than runs
# 3-9, dual-pass itself is the dead end.
#
# Design: each q1 and q2 tree list intentionally shares 1-5 concepts
# with its same-category partner to maximize re-ignition probability.
# Example: "prime factorization" appears in BOTH math/q1 and math/q2
# so it should fire the same tree node on both turns.
#
# Keys are the exact question strings from INTERLEAVED_QUESTIONS;
# _oracle_concept_extractor looks prompts up by exact match and
# lowercases the concepts on return.
_ORACLE_TREES = {
    # Biology
    "How does photosynthesis work?": [
        "chlorophyll", "photon absorption", "thylakoid membrane",
        "Calvin cycle", "ATP synthesis", "carbon fixation",
    ],
    "What role does chlorophyll play in it?": [
        "chlorophyll", "photon absorption", "thylakoid membrane",
        "light-dependent reactions", "green pigment", "photosystem II",
    ],
    # Physics
    "What is a black hole?": [
        "event horizon", "Schwarzschild radius", "gravitational collapse",
        "singularity", "escape velocity", "spacetime curvature",
    ],
    "How does its event horizon form?": [
        "event horizon", "Schwarzschild radius", "gravitational collapse",
        "spacetime curvature", "escape velocity", "null geodesic",
    ],
    # Computing
    "How do CPU cache hierarchies work?": [
        "cache hierarchy", "cache coherency", "memory access latency",
        "cache line", "L1 cache", "L2 cache",
    ],
    "Why are L1 caches split into instruction and data?": [
        "L1 cache", "instruction cache", "data cache",
        "cache line", "Harvard architecture", "pipeline parallelism",
    ],
    # Math
    "What are prime numbers?": [
        "prime factorization", "integer divisibility", "Euclidean algorithm",
        "fundamental theorem", "modular arithmetic", "prime distribution",
    ],
    "Why are they important in cryptography?": [
        "prime factorization", "modular exponentiation", "RSA encryption",
        "discrete logarithm", "trapdoor function", "integer factorization",
    ],
}


def _oracle_concept_extractor(text: str) -> list:
    """Look up the hand-authored concept tree for a benchmark prompt.

    This is a pure dictionary lookup against ``_ORACLE_TREES`` keyed by the
    exact prompt text; no LLM call is made. The oracle-mode benchmark swaps
    it in as the concept extractor to establish the ceiling of dual-pass
    performance.

    Args:
        text: The prompt text. It must match an ``_ORACLE_TREES`` key
            exactly to get a hit.

    Returns:
        The concepts for ``text``, lower-cased, in their authored order.
        An empty list when the prompt has no oracle entry — oracle mode
        only supports the interleaved benchmark questions, and running
        other text through it would give misleading results.
    """
    oracle_hit = _ORACLE_TREES.get(text, [])
    if oracle_hit:
        # Only the first 60 characters of the prompt go into the log line.
        logger.info("Oracle extractor: returning %d concepts for %r",
                    len(oracle_hit), text[:60])
    else:
        logger.info("Oracle extractor: no entry for prompt, returning []")
    return [concept.lower() for concept in oracle_hit]


# Scripted AI/ML conversation replayed by on_benchmark: the first N entries
# are sent, in order, as consecutive user turns. Several entries are
# follow-ups ("How does it differ ...", "What about ...") whose referent
# only appears in an earlier entry, so the order matters.
SAMPLE_CONVERSATIONS = [
    "What is machine learning?",
    "How does it differ from traditional programming?",
    "Can you give me a simple example of supervised learning?",
    "What about unsupervised learning?",
    "How would I choose between them for a new project?",
    "What are neural networks?",
    "How deep is 'deep learning'?",
    "What's the relationship between AI, ML, and deep learning?",
    "What are transformers in the context of AI?",
    "How does attention work in a transformer?",
    "Why are transformers better than RNNs for many tasks?",
    "What is transfer learning and why does it matter?",
    "How do I fine-tune a pre-trained model?",
    "What are the ethical considerations in AI?",
    "Where do you see AI heading in the next 5 years?",
]


def on_benchmark(num_turns):
    """Run the scripted-conversation benchmark: raw baseline vs NuWave.

    For each of the first ``num_turns`` prompts in ``SAMPLE_CONVERSATIONS``
    two generations are performed:

      * Baseline — the system prompt plus the full accumulated chat
        history is templated and generated from.
      * NuWave — the prompt is deposited into the live ``organism``, the
        organism is stepped, pith context is extracted, and a single-turn
        prompt (KISS-derived system context + this turn's user message,
        optionally prefixed with the pith block) is generated from. The
        outcome is then recorded on the organism.

    Args:
        num_turns: Requested number of turns. Coerced with ``int()`` and
            clamped to ``[0, len(SAMPLE_CONVERSATIONS)]``.

    Returns:
        ``(summary_json, per_turn_json)`` — two JSON strings (indent=2).
        The summary holds run totals; the per-turn list holds one record
        per executed turn.
    """
    # Clamp at zero as well as at the pool size. A negative count would
    # otherwise slice from the END of the list (e.g. [:-1] runs 14 turns)
    # while the summary reported the negative number as "turns".
    turns = max(0, min(int(num_turns), len(SAMPLE_CONVERSATIONS)))
    conversation = SAMPLE_CONVERSATIONS[:turns]

    # Use the live organism — it has topology from prior conversations.
    # A fresh organism has no topology, Pith returns nothing, trimming
    # never activates. The compound needs accumulated state.
    nw_organism = organism
    nw_kiss = KISSFilter()
    bl_msgs = []
    nw_msgs = []

    results = []

    for turn_no, prompt_text in enumerate(conversation, start=1):
        # ── Baseline — raw model, full history, no optimization ──
        bl_msgs.append({"role": "user", "content": prompt_text})
        prompt_bl = tokenizer.apply_chat_template(
            [{"role": "system", "content": system_prompt}] + bl_msgs,
            tokenize=False, add_generation_prompt=True,
        )
        resp_bl, in_bl, _, time_bl, tps_bl, _ = do_generate(prompt_bl, max_new_tokens=128)
        bl_msgs.append({"role": "assistant", "content": resp_bl})

        # ── NuWave — full organism path (same as on_send) ──
        nw_msgs.append({"role": "user", "content": prompt_text})

        # Deposit + step + KISS + Pith (full loop). The kiss_extract return
        # value is not used by this benchmark; the call itself is kept so
        # the organism goes through the same sequence as before.
        nw_organism.deposit_experience(prompt_text)
        step_result = nw_organism.step()
        nw_organism.kiss_extract(step_result)

        # String KISS for comparison
        kiss_r = nw_kiss.filter_context(nw_msgs, system_prompt)
        sys_ctx = kiss_r.get("system_context", system_prompt)

        # Pith Born rule extraction from substrate.
        pith_context = nw_organism.pith_extract(prompt_text, max_context=5)

        # D2 (2026-05-22): NO conversation history in the prompt.
        # Substrate-only continuity per NuWave's design. nw_msgs still
        # grows (KISS reads it for sys_ctx); the model only sees this
        # turn's user message with the labeled pith context block.
        if pith_context:
            pith_block = "\n".join(f"  - {p}" for p in pith_context)
            user_content = (
                "Some context that may be relevant (recalled from earlier "
                "related conversations; these are reference material, not "
                "questions to answer):\n"
                f"{pith_block}\n\n"
                f"My actual question: {prompt_text}"
            )
        else:
            user_content = prompt_text

        prompt_msgs_nw = []
        if sys_ctx:
            prompt_msgs_nw.append({"role": "system", "content": sys_ctx})
        prompt_msgs_nw.append({"role": "user", "content": user_content})

        prompt_nw = tokenizer.apply_chat_template(
            prompt_msgs_nw, tokenize=False, add_generation_prompt=True,
        )
        resp_nw, in_nw, _, time_nw, tps_nw, _ = do_generate(prompt_nw, max_new_tokens=128)
        nw_msgs.append({"role": "assistant", "content": resp_nw})

        # Outcome closes the loop.
        # NOTE(review): success is hardcoded True here, whereas
        # on_interleaved_benchmark derives it per turn and forces False on
        # degenerate output — confirm that is intended for this benchmark.
        nw_organism.record_outcome(prompt_text, resp_nw, success=True)

        ks = nw_kiss.stats.to_dict()
        org_stats = nw_organism.get_stats()

        results.append({
            "turn": turn_no,
            "baseline": {"tokens": in_bl, "time": time_bl, "tok_s": tps_bl},
            "nuwave": {"tokens": in_nw, "time": time_nw, "tok_s": tps_nw},
            # Per-turn savings are floored at zero; the run-level summary
            # below reports the signed net difference instead.
            "tokens_saved": max(0, in_bl - in_nw),
            "time_saved": round(max(0, time_bl - time_nw), 2),
            "kiss_efficiency": ks.get("efficiency", 0),
            "pith_l1_size": org_stats.get('pith_l1_size', 0),
            "substrate_nodes": org_stats.get('nodes', 0),
            "substrate_synapses": org_stats.get('synapses', 0),
        })

    # Summary
    total_time_bl = sum(r["baseline"]["time"] for r in results)
    total_time_nw = sum(r["nuwave"]["time"] for r in results)
    total_tok_bl = sum(r["baseline"]["tokens"] for r in results)
    total_tok_nw = sum(r["nuwave"]["tokens"] for r in results)

    summary = {
        "model": MODEL_NAME,
        "turns": turns,
        "baseline_total_tokens": total_tok_bl,
        "nuwave_total_tokens": total_tok_nw,
        "tokens_saved": total_tok_bl - total_tok_nw,
        "baseline_total_time": round(total_time_bl, 2),
        "nuwave_total_time": round(total_time_nw, 2),
        "time_saved": round(total_time_bl - total_time_nw, 2),
        # With zero executed turns there is no last record to read from.
        "final_kiss_efficiency": results[-1]["kiss_efficiency"] if results else 0,
        "final_pith_l1": results[-1]["pith_l1_size"] if results else 0,
    }

    return json.dumps(summary, indent=2), json.dumps(results, indent=2)


def _response_is_degenerate(text: str) -> bool:
    """Detect degenerate BitNet output patterns.

    Phase B+1 (2026-05-11) β€” closes the substrate-quality feedback gap.
    Run 45's T8 surfaced 3 `resp_*` nodes with degenerate text ("Did
    on... Did on... 4. 2. 2..."), bloated the prompt to 605s NuWave
    generation, and produced more degenerate output which got
    deposited and reinforced via record_outcome's reward (which only
    evaluates pith quality, not response quality).

    This helper detects three specific BitNet degeneracy signatures
    we've observed across runs:
      1. Long token-runs (β‰₯ 5 consecutive identical tokens β€” the
         "2. 2. 2. 2. 2." pattern)
      2. Low unique-token diversity (< 30% unique β€” heavy repetition)
      3. Chat-template fragment leakage ("| user:", "| assistant:",
         "System:" β€” BitNet pulling its prompt-format markers into
         the generated text)

    Used in the benchmark loop to force `success_signal = False` when
    the response was degenerate, regardless of pith composition. The
    substrate then receives LTD on synapses that produced the
    degenerate-generating retrieval β€” self-cleaning via STDP over
    multiple runs.

    Pure-stdlib, O(N) string check. No coupling to substrate code.
    """
    tokens = text.split()
    if len(tokens) < 10:
        return False
    unique_ratio = len(set(tokens)) / len(tokens)
    if unique_ratio < 0.3:
        return True
    max_run = cur_run = 1
    for i in range(1, len(tokens)):
        if tokens[i] == tokens[i - 1]:
            cur_run += 1
            if cur_run > max_run:
                max_run = cur_run
        else:
            cur_run = 1
    if max_run >= 5:
        return True
    if any(marker in text for marker in ("| user:", "| assistant:", "System:")):
        return True
    # Phrase-level verbatim repetition β€” 3-word n-gram occurring 3+ times.
    # Catches "Readability: Code that is easy to understand. Readability:
    # Code that is easy to understand." style degeneracy that has moderate
    # unique-token ratio but obvious sentence-level loops. Coherent text
    # rarely has 3+ verbatim 3-word phrase repetitions.
    if len(tokens) >= 9:
        trigram_counts: dict = {}
        for i in range(len(tokens) - 2):
            tg = (tokens[i], tokens[i + 1], tokens[i + 2])
            trigram_counts[tg] = trigram_counts.get(tg, 0) + 1
        if trigram_counts and max(trigram_counts.values()) >= 3:
            return True
    return False


def on_interleaved_benchmark(
    enable_dual_pass: bool = True,
    oracle_trees: bool = False,
    surfacing_mode: str = "pith",
):
    """Run the 4-category interleaved benchmark + build re-ignition heatmaps.

    Runs against the live organism (accumulated state), so re-ignition
    is tested against a real populated substrate. Returns:
      (summary_json, per_turn_json, heatmap_A_fig, heatmap_B_fig)

    Heatmap A: Jaccard overlap of ignition sets between all turn pairs.
    Bright (1,5), (2,6), (3,7), (4,8) = same-category re-ignition signal.

    Heatmap B: Did turn j's Pith selection include turn i's deposit?
    Bright (5,1), (6,2), (7,3), (8,4) = substrate memory carrying the
    q1 deposit forward through 3 unrelated turns to surface at q2 time.

    enable_dual_pass: if False, temporarily disables the concept helper
    for the duration of this benchmark run. Used for A/B comparison
    against a run with dual-pass enabled. Disabling: clears the pending
    concept queue, detaches the extractor (deposits stop enqueueing),
    and skips wait_for_trees between turns. Restored in a finally block.
    """
    # Matplotlib in headless container β€” set backend before any import.
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    import numpy as np

    nw_organism = organism

    # ── Dual-pass toggle ─────────────────────────────────────────────
    # Detach the extractor so deposit_experience / record_outcome skip
    # enqueueing. Drain any pending queue entries so they don't get
    # processed during this benchmark (which would contaminate the
    # "dual-pass disabled" measurement). The manager thread stays
    # running but has nothing to do. Restored in the finally block.
    _saved_extractor = None
    if not enable_dual_pass:
        _saved_extractor = nw_organism._concept_extractor
        nw_organism._concept_extractor = None
        drained_count = 0
        while not nw_organism._concept_queue.empty():
            try:
                nw_organism._concept_queue.get_nowait()
                drained_count += 1
            except Exception:
                break
        logger.info(
            "Dual-pass DISABLED for this benchmark run "
            "(drained %d pending concept entries)", drained_count,
        )
    elif oracle_trees:
        # Oracle mode β€” swap the LLM extractor for a dict-lookup oracle
        # that returns hand-authored ideal trees. Tests the ceiling of
        # dual-pass performance independent of extractor quality.
        _saved_extractor = nw_organism._concept_extractor
        nw_organism._concept_extractor = _oracle_concept_extractor
        logger.info(
            "ORACLE TREES mode for this benchmark run β€” using hand-authored "
            "ideal concepts (%d prompts in oracle dict)", len(_ORACLE_TREES),
        )
    else:
        logger.info("Dual-pass ENABLED for this benchmark run (LLM extractor)")

    # Record starting substrate state for fair-comparison diagnostics
    _start_stats = nw_organism.get_stats()
    _start_nodes = _start_stats.get('nodes', 0)
    _start_synapses = _start_stats.get('synapses', 0)
    # Run 41+ predictive-coding diagnostic β€” capture cumulative prediction
    # counters at run start so we can compute delta per run. If predictions
    # never generate, all per-turn predictions_confirmed/surprised are 0
    # AND total_predictions_made delta = 0 β†’ confirms prediction_threshold
    # gating diagnosis. Reads canonical Graph counters directly.
    _start_total_predictions_made = int(getattr(
        getattr(nw_organism, "_graph", None), "_total_predictions_made", 0,
    ) or 0)
    _start_total_surprised = int(getattr(
        getattr(nw_organism, "_graph", None), "_total_surprised", 0,
    ) or 0)

    nw_kiss_inst = KISSFilter()
    bl_msgs: list = []
    nw_msgs: list = []
    results = []

    ignition_sets: list = []       # fired-node set per turn
    deposit_ids: list = []         # deposit node_id per turn (may be None)
    pith_ids_per_turn: list = []   # pith-selected node_ids per turn

    # ── Phase A pool β€” per-run stratified sampling ──────────────────────
    # Replaces the hardcoded module-level INTERLEAVED_QUESTIONS (4 categories
    # Γ— 1 Q1/Q2 pair = 8 fixed turns) with a sample drawn from the 80-prompt
    # benchmark_pool.yaml (10 categories Γ— 4 Q1/Q2 pairs = 40 pairs total).
    # Per-run variance in which 8 pairs get sampled is itself substrate
    # diversification β€” different threads / categories / complexity registers
    # dominate each run, which the canonical co-firing-discovery and
    # predictive-coding mechanisms need to fire (Run 42 sidecar diagnosis).
    #
    # Stratification discipline (enforced in benchmark_loader.sample_pairs):
    #   - max 2 pairs per category (no category dominates)
    #   - β‰₯3 threads with 2+ instances (cross-category co-firing)
    #   - β‰₯3 distinct complexity levels (gradient hit each run)
    #
    # Returns 16 turns (8 Q1s indexed 0..7, then 8 matching Q2s indexed 8..15)
    # and same-cat pairs [(0,8), (1,9), ..., (7,15)]. Shadows the module-level
    # INTERLEAVED_QUESTIONS / _INTERLEAVED_SAME_CAT_PAIRS for this function's
    # scope; downstream code reads the locals via Python scoping.
    # Load the full pool once β€” passed to the sampler AND used downstream to
    # build category centroids from ALL 80 prompts (not just the per-run
    # sampled subset). Run 43 surfaced the bug where centroids built from
    # `INTERLEAVED_QUESTIONS` (per-run sample of 6-8 cats) caused old
    # substrate nodes from non-sampled categories to be force-mapped to
    # whatever centroid was closest, garbling the diagnostic metrics.
    _full_benchmark_pool = _load_benchmark_pool()
    _pool_interleaved, _pool_same_cat_pairs, _pool_meta = _sample_benchmark_chains(
        pool=_full_benchmark_pool,
        n_chains=8,
    )
    INTERLEAVED_QUESTIONS = _pool_interleaved
    _INTERLEAVED_SAME_CAT_PAIRS = _pool_same_cat_pairs
    _pool_summary = _describe_benchmark_sample(_pool_meta)
    logger.info(
        "Phase B pool sampled: %d chains / %d turns | cats=%s | threads=%s",
        _pool_summary["n_chains"],
        _pool_summary["n_turns"],
        _pool_summary["categories_sampled"],
        _pool_summary["threads_sampled"],
    )

    # Per-turn category labels (parallel to deposit_ids). Used by the
    # category-match heatmap to ask "did j's pith pull ANY same-category
    # prior deposit" rather than the strict exact-id match.
    categories_per_turn: list = [c for c, _ in INTERLEAVED_QUESTIONS]

    # Cross-run category registry on the organism. Maps node_id -> category
    # for deposits this benchmark has tagged. Kept as a diagnostic β€” its
    # size proves the persistence path works β€” but the heatmap no longer
    # depends on it (Run 15 showed the substrate is stable but pith pulls
    # nodes that predate the registry, so registry-based tagging can never
    # match). Lazy-init on first use.
    if not hasattr(nw_organism, "_benchmark_category_registry"):
        nw_organism._benchmark_category_registry = {}
    cat_registry = nw_organism._benchmark_category_registry

    # ── Option G: similarity-based categorization (full-pool centroids) ─
    # Build one centroid per category by averaging the embeddings of ALL
    # prompts in the FULL pool's q1 and q2 layers (8 prompts per category
    # Γ— 10 categories = 80 embeddings, 10 centroids). Earlier this iterated
    # only over the per-run-sampled INTERLEAVED_QUESTIONS, which produced
    # centroids for just 6-8 categories β€” substrate nodes from non-sampled
    # categories (physics, biology, math, etc. when they weren't in the
    # current run's draw) got force-mapped to whatever centroid happened to
    # be closest, garbling per-turn category diagnostics (Run 43 surfaced
    # this β€” gravitational-collapse nodes were tagged "music," prime-
    # factorization nodes tagged "computing," etc.).
    #
    # Building from the full pool means tagging is stable regardless of
    # which subset gets sampled this run, AND the centroid quality is
    # better (8 prompts averaged per category vs 2 in the old benchmark).
    # Cost: ~10 seconds of embedding at benchmark startup; one-time per run.
    _category_centroids: dict = {}
    _CATEGORY_SIM_THRESHOLD = 0.30  # cosine sim floor to assign category
    try:
        _per_cat_embs: dict = {}
        for _layer_key in ("q1_layer", "q2_layer"):
            for _entry in _full_benchmark_pool.get(_layer_key, []):
                _emb = np.asarray(
                    nw_organism._embed_fn(_entry["text"]), dtype=np.float32,
                )
                _per_cat_embs.setdefault(_entry["category"], []).append(_emb)
        for _cat, _embs in _per_cat_embs.items():
            _centroid = np.mean(_embs, axis=0)
            _norm = np.linalg.norm(_centroid) + 1e-8
            _category_centroids[_cat] = _centroid / _norm
        logger.info(
            "Built %d category centroids from full pool "
            "(%d prompts averaged per centroid)",
            len(_category_centroids),
            sum(len(v) for v in _per_cat_embs.values()) // max(1, len(_per_cat_embs)),
        )
    except Exception as exc:
        logger.warning("Category centroid build failed: %s", exc)

    def _categorize_node(node_id: str) -> Optional[str]:
        """Map a substrate node to its closest benchmark category, if any.

        The node's embedding is read from the organism's ``_embeddings``
        table, scaled to unit length, and dotted against every category
        centroid (the centroids were unit-normalised when they were
        built, so the dot product is a cosine similarity). The category
        with the highest similarity wins, provided that similarity is
        strictly greater than ``_CATEGORY_SIM_THRESHOLD``. The floor keeps
        off-topic substrate nodes (e.g. residue from unrelated chat
        sessions) from being shoehorned into a category.

        Returns None when no centroids were built, when the node has no
        stored embedding, or when no centroid clears the threshold.
        """
        if not _category_centroids:
            return None
        node_vec = nw_organism._embeddings.get(node_id)
        if node_vec is None:
            return None
        # The epsilon guards against division by zero for an all-zero vector.
        unit_vec = node_vec / (np.linalg.norm(node_vec) + 1e-8)
        winner = None
        winning_sim = _CATEGORY_SIM_THRESHOLD
        for label, centroid in _category_centroids.items():
            similarity = float(np.dot(unit_vec, centroid))
            # Strict comparison: on a tie the earlier category is kept.
            if similarity > winning_sim:
                winner, winning_sim = label, similarity
        return winner

    N = len(INTERLEAVED_QUESTIONS)

    for i, (category, prompt_text) in enumerate(INTERLEAVED_QUESTIONS):
        # Baseline β€” raw model, full interleaved history, no optimization
        bl_msgs.append({"role": "user", "content": prompt_text})
        prompt_bl = tokenizer.apply_chat_template(
            [{"role": "system", "content": system_prompt}] + bl_msgs,
            tokenize=False, add_generation_prompt=True,
        )
        resp_bl, in_bl, out_bl, time_bl, tps_bl, _ = do_generate(prompt_bl, max_new_tokens=128)
        bl_msgs.append({"role": "assistant", "content": resp_bl})

        # NuWave β€” full organism path with rich logging
        nw_msgs.append({"role": "user", "content": prompt_text})

        deposit_nid = nw_organism.deposit_experience(prompt_text)
        step_result = nw_organism.step()
        kiss_extract = nw_organism.kiss_extract(step_result)

        # Retrieval dispatch β€” pith_extract uses amplitude Born-rule
        # scoring; surface_extract uses CES voltage+recency scoring.
        # Run 11 oracle test showed ampΒ² suppresses trees (0.5Β²=0.25
        # vs forest 1.0Β²=1.0, 4Γ— disadvantage). Surfacing mode lets
        # us test whether SNN-native dynamics avoid that bottleneck.
        if surfacing_mode == "surface":
            pith_context, pith_ids = nw_organism.surface_extract_with_ids(
                prompt_text, max_context=5,
            )
        else:
            pith_context, pith_ids = nw_organism.pith_extract_with_ids(
                prompt_text, max_context=5,
            )

        # Capture substrate internals BEFORE record_outcome (which runs
        # additional graph.step() calls that would pollute the fired set).
        ignition_sets.append(set(step_result.get('fired_nodes', [])))
        deposit_ids.append(deposit_nid)
        pith_ids_per_turn.append(list(pith_ids))
        # Register this deposit's category so future pith pulls can be
        # category-tagged across runs (setdefault so we don't overwrite
        # if the same node_id is somehow re-deposited).
        if deposit_nid:
            cat_registry.setdefault(deposit_nid, category)

        kiss_r = nw_kiss_inst.filter_context(nw_msgs, system_prompt)
        sys_ctx = kiss_r.get("system_context", system_prompt)

        # D2 (2026-05-22): NO conversation history in the prompt.
        # Substrate-only continuity per NuWave's design philosophy.
        # nw_msgs still grows as a record (KISS reads it for sys_ctx
        # derivation just above); the model only sees this turn's user
        # message with the labeled pith context block.
        if pith_context:
            pith_block = "\n".join(f"  - {p}" for p in pith_context)
            user_content = (
                "Some context that may be relevant (recalled from earlier "
                "related conversations; these are reference material, not "
                "questions to answer):\n"
                f"{pith_block}\n\n"
                f"My actual question: {prompt_text}"
            )
        else:
            user_content = prompt_text

        prompt_msgs_iv = []
        if sys_ctx:
            prompt_msgs_iv.append({"role": "system", "content": sys_ctx})
        prompt_msgs_iv.append({"role": "user", "content": user_content})

        prompt_nw = tokenizer.apply_chat_template(
            prompt_msgs_iv, tokenize=False, add_generation_prompt=True,
        )
        resp_nw, in_nw, out_nw, time_nw, tps_nw, _ = do_generate(prompt_nw, max_new_tokens=128)
        nw_msgs.append({"role": "assistant", "content": resp_nw})

        # ── Per-turn correctness signal (Run 33+) ──────────────────────────
        # Did this turn's pith pull predominantly USEFUL same-category
        # content β€” excluding self-retrievals (pith ids whose embedding is
        # near-identical to the query, i.e., the substrate handing the query
        # back at us)?
        #
        # History:
        #  - Run 30: hardcoded success=True β†’ inverted ignition asymmetry
        #    (cross-cat firing harder than same-cat).
        #  - Run 31 (same-cat ratio threshold β‰₯ 0.5): ignition flipped sign
        #    in one run; token regression collapsed +6.1% β†’ +0.51%.
        #  - Run 32: signal got gamed by question-repetition. Prior-run
        #    deposits of the same query text are same-category-tagged, so
        #    ratio = 1.0 on 5/8 turns. Substrate over-LTP'd at 5Γ— normal
        #    rate (~56K new synapses vs typical ~11K). Token regression
        #    jumped to +12.1%, wall-clock +8.6% slower.
        #
        # Self-retrieval gate: for each pith id, cosine similarity between
        # its embedding and the current query's embedding. If above
        # _SELF_RETRIEVAL_THRESHOLD (0.92), node is a near-identical text
        # repeat β€” counts toward tagged_total but NOT tagged_same. Drives
        # ratio DOWN for question-repeat-heavy turns, so canonical STDP
        # depresses self-retrieval synapses via LTD over multiple runs.
        #
        # This is a feedback-path correction (refines the reward signal we
        # feed canonical inject_reward), NOT an extraction-path filter β€”
        # pith still goes to the LLM unchanged. Substrate's STDP retrieves
        # what it retrieves; we only refine our judgement of "did that
        # help" so the canonical reward channel has accurate ground truth
        # to learn against.
        _SELF_RETRIEVAL_THRESHOLD = 0.92
        _q_emb = np.asarray(nw_organism._embed_fn(prompt_text), dtype=np.float32)
        _q_norm = float(np.linalg.norm(_q_emb)) + 1e-8
        _tagged_total = 0
        _tagged_same = 0
        _self_retrievals = 0
        for _pid in pith_ids:
            _tag = _categorize_node(_pid)
            _node_emb = nw_organism._embeddings.get(_pid)
            _is_self = False
            if _node_emb is not None:
                _node_norm = float(np.linalg.norm(_node_emb)) + 1e-8
                _cos_to_query = float(
                    np.dot(_q_emb, _node_emb) / (_q_norm * _node_norm)
                )
                _is_self = _cos_to_query > _SELF_RETRIEVAL_THRESHOLD
            if _is_self:
                _self_retrievals += 1
            # Skip only when BOTH untaggable AND not self-retrieval (no signal)
            if _tag is None and not _is_self:
                continue
            _tagged_total += 1
            # Same-cat credit only if tag matches AND not a self-retrieval
            if _tag == category and not _is_self:
                _tagged_same += 1
        if _tagged_total >= 2:
            _same_cat_ratio = _tagged_same / _tagged_total
            success_signal = _same_cat_ratio >= 0.5
        else:
            _same_cat_ratio = None
            success_signal = True  # neutral / cold-start

        # Phase B+1 (Run 46+) β€” response-quality gate. If BitNet's output
        # was degenerate (repeated tokens, chat-template fragments, low
        # unique-token ratio), force success_signal=False so the substrate
        # gets LTD on whatever co-fired during this turn β€” including the
        # synapses that LED to surfacing the junk pith that bloated the
        # prompt. Closes the substrate-quality feedback gap surfaced by
        # Run 45's T8 anomaly (605s NuWave generation on a pith with 3
        # degenerate resp_* nodes; record_outcome rewarded the bad path).
        _response_degenerate = _response_is_degenerate(resp_nw)
        if _response_degenerate:
            success_signal = False

        nw_organism.record_outcome(prompt_text, resp_nw, success=success_signal)

        # Phase 2 (scoped multi-channel substrate feedback) was attempted
        # in commits 468fd09, ab0fdd3, e4dd297 then removed 2026-04-27.
        # The architectural idea (substrate-feedback-via-inject_reward) is
        # canonical Substrate Authority Pattern and remains correct, but
        # all three implementations had bugs that made them either no-op
        # or actively harmful: stimulate residual voltage created positive
        # feedback loops, Channel 3 collective penalty had wrong signal-
        # to-scope binding, prime_and_propagate(currents=1.0) didn't fire
        # seeds, and concurrent-modification races crashed 3/8 turns.
        # See feedback_substrate_representation_first.md β€” Phase 2 redesign
        # is deferred until representation work (discover_hyperedges hook,
        # type-aware retrieval scoring with expert decay) gives the
        # substrate the structural inductive biases that make relevance
        # learnable in the first place.

        # Drain the concept queue before the next turn β€” makes tree
        # extraction synchronous for benchmark reproducibility. Without
        # this, q2's Pith might or might not see q1's trees depending
        # on how fast the manager pulsed. Skip drain when dual-pass
        # is disabled β€” nothing to drain, and no overhead required.
        if enable_dual_pass:
            drain_t0 = time.time()
            drained = nw_organism.wait_for_trees(timeout=180.0)
            drain_elapsed = round(time.time() - drain_t0, 2)
        else:
            drained = True
            drain_elapsed = 0.0

        org_stats = nw_organism.get_stats()

        # Capture the extracted tree concepts for THIS turn's forest β€”
        # walk graph metadata for nodes tagged forest=deposit_nid.
        # Post-drain so these are complete and stable. Gives us
        # ground-truth visibility into what the extractor actually
        # produced vs. what the prompt asked for. Critical diagnostic
        # for specificity tuning. Safe to read nodes under the graph
        # lock (trees already committed).
        trees_for_turn = []
        if enable_dual_pass:
            try:
                with nw_organism._graph_lock:
                    for nid, node in nw_organism._graph.nodes.items():
                        if node.metadata.get("forest") == deposit_nid:
                            concept = nw_organism._node_content.get(nid, "")
                            if concept:
                                trees_for_turn.append(concept)
            except Exception as exc:
                logger.debug("Tree capture failed for turn %d: %s", i + 1, exc)

        # Raw extractor output for THIS turn β€” lets us see exactly
        # what Falcon3-10B-1.58bit emitted vs. what the parser kept.
        # If trees=[] but raw_output looks reasonable, parser is
        # over-filtering. If raw_output is garbage, it's a prompt
        # or model-adherence issue.
        extraction_detail = _last_extractions.get(prompt_text, {}) if enable_dual_pass else {}
        raw_output = extraction_detail.get("raw_output", "")[:500]
        extractor_elapsed = extraction_detail.get("elapsed_s", 0.0)

        # Qualitative content β€” pair each surfaced text with the
        # category our centroid-similarity lookup tagged it as. Lets us
        # eyeball "is this actually biology content for a biology query"
        # without running another similarity pass at heatmap-build time.
        # Truncate text to 200 chars so the JSON stays readable.
        _surfaced_context = []
        for _idx, _pid in enumerate(pith_ids):
            _text = pith_context[_idx] if _idx < len(pith_context) else ""
            _surfaced_context.append({
                "id": _pid,
                "category_tagged": _categorize_node(_pid),
                "text": (_text[:200] + ("..." if len(_text) > 200 else "")),
            })

        results.append({
            "turn": i + 1,
            "category": category,
            "q_num": 1 if i < 4 else 2,
            "prompt": prompt_text,
            "baseline": {"tokens": in_bl, "time": time_bl, "tok_s": tps_bl},
            "nuwave":   {"tokens": in_nw, "time": time_nw, "tok_s": tps_nw},
            "tokens_saved": max(0, in_bl - in_nw),
            "time_saved":   round(max(0, time_bl - time_nw), 2),
            "deposit_node_id": deposit_nid,
            "ignition_size":   len(ignition_sets[i]),
            "pith_ids":        list(pith_ids),
            "surfaced_context": _surfaced_context,
            "trees":           trees_for_turn,
            "raw_extractor_output": raw_output,
            "extractor_elapsed_s": extractor_elapsed,
            "substrate_nodes":    org_stats.get('nodes', 0),
            "substrate_synapses": org_stats.get('synapses', 0),
            "substrate_hyperedges": org_stats.get('hyperedges', 0),
            "tree_drain_s": drain_elapsed,
            "tree_drained": drained,
            # Run 31+ correctness-signal telemetry β€” what we fed the substrate
            # via record_outcome's success arg this turn, and the underlying
            # same-category proportion. ratio is None when fewer than 2 pith
            # ids were taggable (cold-start neutral). pith_self_retrievals
            # added Run 33+: count of pith ids with cosine β‰₯ 0.92 to query
            # (substrate handing the query back) β€” these count as misses.
            "success_signal": success_signal,
            "pith_same_cat_ratio": _same_cat_ratio,
            "pith_self_retrievals": _self_retrievals,
            # Phase B+1 telemetry — Run 46+. Tracks whether BitNet's
            # output this turn was degenerate (forced success_signal
            # False). Watch cross-run count: should drop over runs as
            # substrate LTDs degenerate-producing pathways.
            "response_quality": "degenerate" if _response_degenerate else "clean",
            # Run 50+ ground-truth instrumentation. _surfaced_context shows
            # pith items trimmed to max_chars_per_context (default ~300),
            # so when response_quality flags degenerate but the trimmed
            # surfaced text looks clean, we can't tell if it's a false
            # positive or trailing-degeneracy hidden by truncation. Capping
            # at 1500 chars keeps JSON payload reasonable while showing
            # enough of each response to verify what the detector saw.
            "response_text": (resp_nw[:1500] if resp_nw else ""),
            # Run 41+ predictive-coding telemetry — surface what step_result
            # already carries about predictions plus a snapshot of active
            # predictions on the graph. If all 0/0/0 across all turns, the
            # canonical predictive-coding loop is dormant (gated by
            # prediction_threshold per audit task #12).
            "predictions_confirmed": int(step_result.get("predictions_confirmed", 0) or 0),
            "predictions_surprised": int(step_result.get("predictions_surprised", 0) or 0),
            "active_predictions_count": len(getattr(
                getattr(nw_organism, "_graph", None), "active_predictions", {}
            ) or {}),
        })

    # ── Heatmap A: ignition-set Jaccard overlap (symmetric) ──
    mat_A = np.zeros((N, N))
    for i in range(N):
        for j in range(N):
            s1, s2 = ignition_sets[i], ignition_sets[j]
            if not s1 or not s2:
                continue
            mat_A[i, j] = len(s1 & s2) / max(1, len(s1 | s2))

    # ── Heatmap B (strict exact-id, kept for legacy stat) ──
    # Did turn j's Pith select turn i's specific deposit? Only causally
    # valid when i < j. Run 13 showed this is too strict — accumulated
    # prior-run nodes drown out fresh deposits in the Pith cut.
    mat_B = np.zeros((N, N))
    for j in range(N):
        pith_set = set(pith_ids_per_turn[j])
        for i in range(N):
            if i < j and deposit_ids[i] and deposit_ids[i] in pith_set:
                mat_B[j, i] = 1.0

    # ── Heatmap B (category-match via similarity tagging β€” Option G) ──
    # Cell (j, i) is bright when turn j's Pith contains ANY node whose
    # stored embedding cosine-matches category[i]'s centroid above the
    # threshold. Causally valid for all i < j. The similarity-based
    # version replaces the prior registry-based logic which could only
    # see nodes deposited by runs that called the new code path —
    # invisible against a substrate accumulated over many prior runs.
    mat_B_cat = np.zeros((N, N))
    for j in range(N):
        pith_set = set(pith_ids_per_turn[j])
        for i in range(N):
            if i >= j:
                continue
            target_cat = categories_per_turn[i]
            for pid in pith_set:
                if _categorize_node(pid) == target_cat:
                    mat_B_cat[j, i] = 1.0
                    break

    def _tick_labels():
        """Return one two-line axis label per recorded turn.

        Each label is ``T<turn>`` on the first line and the first three
        characters of the category followed by the question number on the
        second. Labels follow the order of ``results``.
        """
        labels = []
        for row in results:
            short_category = row['category'][:3]
            labels.append(f"T{row['turn']}\n{short_category}{row['q_num']}")
        return labels

    def _render(matrix, title, highlight_pairs, xlabel, ylabel):
        """Draw ``matrix`` as an annotated N x N heatmap and return the figure.

        Args:
            matrix: 2-D array indexed ``[row, col]``; rows are drawn on the
                y axis and columns on the x axis.
            title: Text placed above the plot.
            highlight_pairs: Iterable of ``(row, col)`` cells to outline in
                red (the cells where brightness is expected).
            xlabel: Label for the column axis.
            ylabel: Label for the row axis.

        Returns:
            The Matplotlib figure. It is detached from pyplot's figure
            registry before being returned, so the caller owns it outright.
        """
        fig, ax = plt.subplots(figsize=(8, 7))

        # An all-zero matrix would collapse the colour range; fall back to 1.
        peak = matrix.max()
        vmax = peak if peak > 0 else 1.0
        im = ax.imshow(matrix, cmap='viridis', vmin=0, vmax=vmax)

        # Both axes share the same per-turn labels, so build them once.
        labels = _tick_labels()
        ax.set_xticks(range(N))
        ax.set_yticks(range(N))
        ax.set_xticklabels(labels, fontsize=8)
        ax.set_yticklabels(labels, fontsize=8)
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.set_title(title, fontsize=10)

        # Attach the colorbar through the figure itself rather than through
        # pyplot's implicit "current figure", which another handler running
        # at the same time could have changed.
        fig.colorbar(im, ax=ax, fraction=0.04)

        # Red boxes on cells where we EXPECT brightness. Rectangle takes
        # (x, y), i.e. (col, row), for its lower-left corner.
        for (ii, jj) in highlight_pairs:
            ax.add_patch(plt.Rectangle(
                (jj - 0.5, ii - 0.5), 1, 1,
                fill=False, edgecolor='red', linewidth=2,
            ))

        # Value annotations: only non-zero cells, with the text colour chosen
        # to contrast with the dark (low) and bright (high) ends of viridis.
        for ii in range(N):
            for jj in range(N):
                v = matrix[ii, jj]
                if v > 0:
                    ax.text(jj, ii, f"{v:.2f}", ha='center', va='center',
                            fontsize=6, color='white' if v < vmax * 0.5 else 'black')

        fig.tight_layout()

        # plt.subplots registers the figure with pyplot, which keeps it alive
        # until it is explicitly closed. Without this, every benchmark run
        # leaks its figures in a long-running server. The Figure object stays
        # usable (e.g. for savefig) after being closed.
        plt.close(fig)
        return fig

    # Heatmap A highlight: both directions of same-category pair
    pairs_A = []
    for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS:
        pairs_A.extend([(i, j), (j, i)])
    fig_A = _render(
        mat_A,
        "Ignition Overlap β€” Jaccard(fired_i, fired_j)\n"
        "Red boxes mark expected bright cells (same-category q1 ↔ q2)",
        pairs_A,
        xlabel="Turn j", ylabel="Turn i",
    )

    # Heatmap B highlight: only causal (j > i) same-category pairs
    pairs_B = [(j, i) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS]
    fig_B = _render(
        mat_B_cat,
        "Pith Category-Match β€” did turn j's Pith pull ANY same-category node?\n"
        "Red boxes mark same-category q1β†’q2 pairs. Bright off-diagonal = "
        "category leak; bright on-diagonal = category-coherent retrieval.",
        pairs_B,
        xlabel="Turn i (category target)", ylabel="Turn j (pith extract)",
    )

    # Summary metrics
    same_A = [mat_A[i, j] for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS]
    cross_A = []
    for i in range(N):
        for j in range(i + 1, N):
            if (i, j) not in _INTERLEAVED_SAME_CAT_PAIRS:
                cross_A.append(mat_A[i, j])

    # Heatmap B: same-category causal cells are (j, i) where j = q2_turn,
    # i = q1_turn. Count ONLY those 4 cells — that's the re-ignition
    # signal we actually care about, not the total-reselects-across-all-cells
    # that mat_B.sum() produces (previous reporting conflated the two).
    same_cat_B_hits = sum(int(mat_B[j, i]) for (i, j) in _INTERLEAVED_SAME_CAT_PAIRS)

    # Category-match via similarity (Option G): for each q2 turn, did its
    # pith contain ANY node whose embedding cosine-matches the turn's
    # category centroid above threshold? This is the metric that actually
    # answers "is the substrate doing category-coherent retrieval" —
    # works on the entire substrate, not just nodes the registry has seen.
    q2_turns = [j for (_, j) in _INTERLEAVED_SAME_CAT_PAIRS]
    same_cat_pith_hits = 0
    for j in q2_turns:
        pith_set = set(pith_ids_per_turn[j])
        j_cat = categories_per_turn[j]
        if any(_categorize_node(pid) == j_cat for pid in pith_set):
            same_cat_pith_hits += 1
    same_cat_pith_hit_rate = same_cat_pith_hits / max(1, len(q2_turns))

    # Off-diagonal "category leak" diagnostic: how often did a q2 pith
    # pull a node tagged with a DIFFERENT category? Lower is cleaner
    # separation. Untaggable nodes (no embedding, or below threshold) do
    # not count as leaks.
    cross_cat_leaks = 0
    for j in q2_turns:
        pith_set = set(pith_ids_per_turn[j])
        j_cat = categories_per_turn[j]
        for pid in pith_set:
            tagged = _categorize_node(pid)
            if tagged is not None and tagged != j_cat:
                cross_cat_leaks += 1
                break

    # End-state substrate diagnostics — pair with the _start_ values
    # captured at benchmark entry so consumers can confirm both A and B
    # runs started from the same substrate topology.
    _end_stats = nw_organism.get_stats()

    summary = {
        "model": MODEL_NAME,
        "interleaved_turns": N,
        # Toggle state — critical for A/B attribution. Comparing results
        # across enable_dual_pass=True vs =False is only meaningful when
        # both runs started from the same substrate state (substrate_
        # nodes_start below should match between paired runs).
        "dual_pass_enabled": enable_dual_pass,
        "oracle_trees": oracle_trees,
        "surfacing_mode": surfacing_mode,
        "substrate_nodes_start":    _start_nodes,
        "substrate_nodes_end":      _end_stats.get('nodes', 0),
        "substrate_synapses_start": _start_synapses,
        "substrate_synapses_end":   _end_stats.get('synapses', 0),
        # Run 41+ predictive-coding diagnostic — cumulative counters from the
        # canonical Graph. If `predictions_made_during_run = 0` even at
        # benchmark scale, the predictive-coding loop is dormant (gated by
        # prediction_threshold per audit task #12) and the surprise-driven
        # intrinsic reward broadcast (canonical neuro_foundation:2549) never
        # fires. This is the empirical confirmation gate before any config
        # graduation work.
        "predictions_made_during_run": int(getattr(
            getattr(nw_organism, "_graph", None), "_total_predictions_made", 0,
        ) or 0) - _start_total_predictions_made,
        "predictions_surprised_during_run": int(getattr(
            getattr(nw_organism, "_graph", None), "_total_surprised", 0,
        ) or 0) - _start_total_surprised,
        # Phase A pool sample metadata — what got drawn this run.
        # Lets us correlate per-run substrate behavior with which threads /
        # categories / complexity levels were actually exercised.
        "pool_sample": _pool_summary,
        "baseline_total_tokens": sum(r["baseline"]["tokens"] for r in results),
        "nuwave_total_tokens":   sum(r["nuwave"]["tokens"] for r in results),
        "tokens_saved":          sum(max(0, r["baseline"]["tokens"] - r["nuwave"]["tokens"]) for r in results),
        "baseline_total_time":   round(sum(r["baseline"]["time"] for r in results), 2),
        "nuwave_total_time":     round(sum(r["nuwave"]["time"] for r in results), 2),
        "ignition_mean_same_category":  round(float(np.mean(same_A)), 4) if same_A else 0,
        "ignition_mean_cross_category": round(float(np.mean(cross_A)), 4) if cross_A else 0,
        # Same-category re-ignition: did q2 turn pull q1's deposit? 4 pairs.
        # (Strict exact-id match. Run 13 confirmed this is too narrow.)
        "same_category_pith_reselect":       same_cat_B_hits,
        "same_category_pith_reselect_total": len(_INTERLEAVED_SAME_CAT_PAIRS),
        # Category-match re-ignition: did q2's pith pull ANY same-category
        # node (this run OR prior runs)? This is the metric that actually
        # measures category-coherent retrieval — the substrate's intended
        # behavior. Numerator counts q2 turns 4-7; denominator is 4.
        "same_category_pith_hit_rate":     round(same_cat_pith_hit_rate, 4),
        "same_category_pith_hits":         same_cat_pith_hits,
        "same_category_pith_hits_total":   len(q2_turns),
        # Off-diagonal diagnostic: how many q2 turns pulled at least one
        # cross-category node? Lower = cleaner category separation.
        "cross_category_pith_leaks":       cross_cat_leaks,
        # Registry size — grows monotonically across runs on this Space.
        "category_registry_size":          len(cat_registry),
        # Total reselects across ALL causal cells (diagnostic, not the
        # re-ignition signal — includes cross-category pulls).
        "pith_reselect_total_causal":    int(mat_B.sum()),
        "pith_reselect_total_causal_max": sum(range(N)),  # 0+1+2+...+(N-1) = 28 for N=8
    }

    # Restore the concept extractor if we disabled it for this run.
    # Done here at the end rather than in a finally so the summary
    # captures the actual state. If an exception crashes the benchmark
    # mid-flight the extractor stays detached until manual re-wiring
    # or Space restart — acceptable for a diagnostic tool.
    if _saved_extractor is not None:
        nw_organism._concept_extractor = _saved_extractor
        if oracle_trees:
            logger.info("Oracle mode EXITED β€” LLM extractor restored")
        else:
            logger.info("Dual-pass RE-ENABLED after benchmark")

    return (
        json.dumps(summary, indent=2),
        json.dumps(results, indent=2),
        fig_A,
        fig_B,
    )


# ── Gradio App ────────────────────────────────────────────────────

# Declarative UI layout: four tabs (live chat, A/B benchmark, interleaved
# benchmark, extractor debug), each wiring its buttons to a handler defined
# earlier in this file.
with gr.Blocks(
    title="NuWave — Your Model Gets Smarter Over Time",
    theme=gr.themes.Soft(),
) as demo:
    # Page header. Both splat counters fall back to 0 when `splat_layers` is
    # empty or unset, so the page still renders without splat layers.
    gr.Markdown(
        f"""
        # NuWave — Your Model Gets Smarter Over Time

        **Context optimization through compound substrate dynamics.**

        - **KISS** filters redundant context — system prompt skipped when unchanged, old history compressed to summary
        - **Pith** manages context as a cache hierarchy — clutter stripped, cold entries evicted, relevant context promoted
        - **Splat-Lenia** — weight layers decomposed to Gaussian splats, Lenia dynamics evolve them between turns

        Model: `{MODEL_NAME}` | Inference: CPU | Splat layers: {len(splat_layers) if splat_layers else 0} | Total splats: {sum(s.n_splats for s in splat_layers.values()) if splat_layers else 0}
        """
    )

    with gr.Tabs():
        # ── Tab 1: interactive chat with per-message metrics ──
        with gr.Tab("Live Chat"):
            chatbot = gr.Chatbot(height=400, type="messages")
            stats_display = gr.Markdown("*Send a message to see NuWave metrics*")

            with gr.Row():
                msg = gr.Textbox(placeholder="Type a message...", show_label=False, scale=4)
                send_btn = gr.Button("Send", scale=1)
                reset_btn = gr.Button("Reset", scale=1)

            # Clicking "Send" and pressing Enter in the textbox are equivalent.
            send_btn.click(on_send, [msg, chatbot], [msg, chatbot, stats_display])
            msg.submit(on_send, [msg, chatbot], [msg, chatbot, stats_display])
            reset_btn.click(on_reset, outputs=[chatbot, stats_display])

        # ── Tab 2: baseline vs NuWave over a fixed number of turns ──
        with gr.Tab("A/B Benchmark"):
            gr.Markdown(
                """
                ### Baseline vs NuWave

                Same conversation, same model, same CPU. Baseline sends full context every turn.
                NuWave compresses history and skips redundant system context.

                Watch: tokens decrease, time decreases, KISS efficiency climbs.
                """
            )

            with gr.Row():
                num_turns = gr.Slider(minimum=3, maximum=15, value=8, step=1, label="Turns")
                run_btn = gr.Button("Run Benchmark", variant="primary")

            summary_output = gr.Code(label="Summary", language="json")
            curve_output = gr.Code(label="Per-Turn Data", language="json")

            run_btn.click(on_benchmark, [num_turns], [summary_output, curve_output])

        # ── Tab 3: interleaved-category re-ignition benchmark ──
        with gr.Tab("Interleaved Benchmark"):
            gr.Markdown(
                """
                ### Topology Re-ignition Test

                Four semantic categories, two questions each, interleaved.

                | Turn | Category  | Question |
                |------|-----------|----------|
                | 1    | biology   | photosynthesis q1 |
                | 2    | physics   | black holes q1 |
                | 3    | computing | CPU caches q1 |
                | 4    | math      | prime numbers q1 |
                | 5    | biology   | chlorophyll q2 |
                | 6    | physics   | event horizon q2 |
                | 7    | computing | L1 split q2 |
                | 8    | math      | cryptography q2 |

                Turns 1-4 seed four semantic neighborhoods in the substrate.
                Turns 5-8 ask a follow-up in each — but each follow-up's
                *matching* primer is 4 turns back, with 3 unrelated turns
                in between. A recency-only system fails this test. A
                substrate-informed bucket should re-light the matching
                neighborhood via Born-rule interference despite the gap.

                **Heatmap A** — Jaccard overlap of fired-node sets between
                every pair of turns. Red boxes mark the same-category q1
                ↔ q2 pairs we expect to see light up.

                **Heatmap B** — Did turn *j*'s Pith selection pull any node
                tagged with turn *i*'s category? Only causal cells
                (*i* before *j*) are filled. Red boxes mark the four
                same-category q1 → q2 cells. Bright cells inside the red
                boxes = substrate memory working.
                """
            )

            gr.Markdown(
                """
                **A/B toggle:** Uncheck to disable the dual-pass concept
                helper for this run. For a clean comparison, run the same
                starting substrate through both toggle states back-to-back.
                The summary includes `substrate_nodes_start` so you can
                confirm both runs began from the same state.
                """
            )

            with gr.Row():
                inter_enable_dualpass = gr.Checkbox(
                    value=True,
                    label="Enable dual-pass concept helper",
                )
                inter_btn = gr.Button("Run Interleaved Benchmark", variant="primary")

            gr.Markdown(
                """
                **Oracle Trees (ceiling test):** Run once with hand-authored
                ideal mechanism concepts instead of the LLM extractor. Tests
                whether dual-pass CAN succeed given perfect trees — regardless
                of extractor quality. If ignition metrics dramatically exceed
                the no-tree baseline, the extractor is the bottleneck.
                If not, dual-pass itself is the dead end. Only works with
                the 8 interleaved benchmark prompts.

                **CES Surfacing (architecture test):** Swaps the Born-rule
                amplitude² scoring (which suppresses trees 4:1) for CES
                voltage+recency×excitability scoring. Tests whether SNN-
                native dynamics avoid the amplitude bottleneck. Pairs well
                with Oracle Trees to isolate the scoring-layer effect
                from the extractor-quality effect.
                """
            )
            with gr.Row():
                oracle_btn = gr.Button(
                    "Run with Oracle Trees (Pith scoring)",
                    variant="secondary",
                )
                surface_btn = gr.Button(
                    "Run with CES Surfacing (LLM trees)",
                    variant="secondary",
                )
                oracle_surface_btn = gr.Button(
                    "Run Oracle + CES Surfacing",
                    variant="secondary",
                )

            inter_summary = gr.Code(label="Summary", language="json")
            inter_per_turn = gr.Code(label="Per-Turn Data", language="json")

            with gr.Row():
                inter_heatmap_a = gr.Plot(label="Ignition Overlap")
                # Labelled to match the figure the handler actually returns
                # for this slot (the category-match heatmap).
                inter_heatmap_b = gr.Plot(label="Pith Category-Match")

            # All four run buttons write to the same four output components.
            _inter_outputs = [inter_summary, inter_per_turn, inter_heatmap_a, inter_heatmap_b]

            # Positional arguments are presumably (enable_dual_pass,
            # oracle_trees, surfacing_mode) — inferred from the button labels
            # and summary keys; verify against the handler's signature.
            inter_btn.click(
                lambda enable: on_interleaved_benchmark(enable, False, "pith"),
                inputs=[inter_enable_dualpass],
                outputs=_inter_outputs,
            )

            oracle_btn.click(
                lambda: on_interleaved_benchmark(True, True, "pith"),
                inputs=[],
                outputs=_inter_outputs,
            )

            surface_btn.click(
                lambda: on_interleaved_benchmark(True, False, "surface"),
                inputs=[],
                outputs=_inter_outputs,
            )

            oracle_surface_btn.click(
                lambda: on_interleaved_benchmark(True, True, "surface"),
                inputs=[],
                outputs=_inter_outputs,
            )

        # ── Tab 4: concept-extractor diagnostic ──
        with gr.Tab("Debug Extract"):
            gr.Markdown(
                """
                ### Concept extraction diagnostic

                Runs the BitNet concept extractor against all 8 interleaved-
                benchmark questions and reports what actually comes out.
                Use this **before** running A/B benchmarks to verify
                extraction quality — if concepts are generic, hallucinated,
                or structurally malformed, downstream measurements are noise.

                **Three views of the output:**

                - **Summary** — overall counts, median concepts per question,
                  whether any generations hit the token cap (suggests the
                  model didn't produce a natural stop and may have launched
                  into an explanation), and per-category **same-category bridge
                  analysis**: do q1 and q2 for the same category share concepts?
                  That's the direct hypothesis check — if the shared set is
                  empty for math (prime numbers ↔ cryptography), no amount of
                  dual-pass will help.

                - **Per-Question Data** — for each question: raw model output
                  (before parsing), parsed concepts, tokens in/out, wall-time.
                  Eyeball the raw output to catch hallucinated answers and
                  the parsed list to judge concept specificity.

                - **Pairwise Overlap** — which question pairs share concepts.
                  If every question shares "thing" / "concept" / "process",
                  the extractor is producing generic pollution.

                **Cost:** ~30-40s per extraction × 8 = 4-6 minutes total.
                """
            )

            debug_btn = gr.Button("Run Debug Extraction", variant="primary")

            debug_summary = gr.Code(label="Summary + Same-Category Bridges", language="json")
            debug_per_question = gr.Code(label="Per-Question Raw + Parsed", language="json")
            debug_pairwise = gr.Code(label="Pairwise Concept Overlap (cross-category pollution signal)", language="json")

            debug_btn.click(
                on_debug_extract,
                inputs=[],
                outputs=[debug_summary, debug_per_question, debug_pairwise],
            )

if __name__ == "__main__":
    # Script entry point: listen on every network interface (0.0.0.0) and
    # launch with Gradio's server-side rendering mode switched off.
    _launch_options = {
        "server_name": "0.0.0.0",
        "ssr_mode": False,
    }
    demo.launch(**_launch_options)