File size: 62,710 Bytes
f0cf803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
870b6c3
f0cf803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
#!/usr/bin/env python
# coding: utf-8
from IPython.core.debugger import set_trace
import random
import matplotlib.pyplot as plt 
import numpy as np
import math
import copy
import sys
from scipy.stats import pearsonr # for computing correlation
from functools import reduce #for flattening lists
from operator import concat  #for flattening lists
from scipy.stats import trim_mean # for ensemble evaluation
from scipy.stats import differential_entropy
import warnings
import time
import dill
import os
from sklearn.cluster import KMeans #for clustering in ensemble definition
from scipy.optimize import minimize #for uncertainty maximization
from sympy import symbols, simplify, expand
import sympy as sym
try:
    from IPython.display import display, clear_output
except:
    pass

import signal #for timing out functions
from contextlib import contextmanager #for timing out functions

warnings.filterwarnings('ignore', '.*invalid value.*' )
warnings.filterwarnings('ignore', '.*overflow.*' )
warnings.filterwarnings('ignore', '.*divide by.*' )
warnings.filterwarnings('ignore', '.*is constant.*' )
warnings.filterwarnings('ignore', '.*nearly constant.*' )
warnings.filterwarnings('ignore', '.*Polyfit may be.*' )
warnings.filterwarnings('ignore', '.*Number of.*')

## Get the arrays for the models
model_distributions = []
def protectDiv(a,b):
    if (type(b)==int or type(b)==float or type(b)==np.float64) and b==0:
        return a/math.nan
    if (type(b)==np.ndarray) and (0 in b):
        return a/np.where(b==0,math.nan,b)
    return a/b
def add(a,b):
    return a+b
def sub(a,b):
    return a-b
def mult(a,b):
    return a*b
def exp(a):
    safe_a = np.clip(a, -90.0, 90.0)
    return np.exp(safe_a)
# def sine(a,b):
#     return np.sin(a)
def power(a,b):
    if (type(a)==int or type(a)==float or type(a)==np.float64) and a==0:
        return a/math.nan
    if (type(a)==np.ndarray) and (0 in a):
        return a/np.where(a==0,math.nan,a)
    return a**b
def sqrt(a):
    return np.sqrt(abs(a))
def sqrd(a):
    return a**2
def inv(a):
    return np.array(a).astype(float)**(-1)
def neg(a):
    return -a
def sin(a):
    return np.sin(a)
def cos(a):
    return np.cos(a)
def tan(a):
    return np.tan(a)
def arccos(a):
    return np.arccos(a)
def arcsin(a):
    return np.arcsin(a)
def arctan(a):
    return np.arctan(a)
def tanh(a):
    return np.tanh(a)
def log(a):
    a = np.maximum(np.abs(a), 1e-9)
    return np.log(a)
def log10(a):
    return np.log10(a)
def log2(a):
    return np.log2(a)
def abs1(a):
    return np.abs(a)

def and1(a,b):
    return np.logical_and(a,b)
def or1(a,b):
    return np.logical_or(a,b)
def xor1(a,b):
    return np.logical_xor(a,b)
def nand1(a,b):
    return np.logical_not(np.logical_and(a,b))
def nor1(a,b):
    return np.logical_not(np.logical_or(a,b))
def xnor1(a,b):
    return np.logical_not(np.logical_xor(a,b))
def not1(a):
    return np.logical_not(a)


def defaultOps():
    return [protectDiv,add,sub,mult,exp,sqrd,sqrt,inv,neg,"pop","pop","pop","pop","pop","pop"]
def allOps():
    return [protectDiv,add,sub,mult,exp,sqrd,sqrt,inv,neg,cos,sin,tan,arccos,arcsin,arctan,tanh,log,"pop","pop","pop","pop","pop","pop","pop","pop","pop","pop"]

def customOps():
    return [protectDiv,add,sub,mult,exp,sqrd,sqrt,inv,neg,cos,sin,log,"pop","pop","pop","pop","pop","pop","pop","pop"]
def booleanOps():
    return [and1,or1,xor1,nand1,nor1,xnor1,not1,"pop","pop","pop","pop","pop","pop","pop"]
def randomInt(a=-3,b=3):
    return random.randint(a,b)
def defaultConst():
    return [np.pi, np.e, randomInt,ranReal ]
def booleanConst():
    return [1,0]
def ranReal(a=20,b=-10):
    return random.random()*a-b


##### MODEL DISTRIBUTION CODE ##### 

import sympy as sym

import sympy as sym
from sympy import preorder_traversal
from collections import defaultdict


def extract_genotype(population_buffer):
    """
    Extracts in terms of the geneotypes instead of using pheno
    """
    stats = {
        "operators": [],
        "terminals": [],
    }
    
    for prog in population_buffer:
        print(prog[0], prog[1])
        ops = [op.__name__ if hasattr(op, '__name__') else str(op) for op in prog[0]]
        stats["operators"].extend(ops)
        #(Features and Constants)
        stats["terminals"].extend([str(t) for t in prog[1]])
    return stats

def extract_model_stats(expr):
    stats = {
        "constants": [],
        "operators": [],
        "features": [],
        "powers": []
    }

    for node in preorder_traversal(expr):
        if isinstance(node, sym.Number):
            stats["constants"].append(float(node))
        elif isinstance(node, sym.Symbol):
            stats["features"].append(str(node))
        # POWERS 
        elif isinstance(node, sym.Pow):
            base, exp = node.args
            stats["operators"].append("pow")
            if exp.is_number:
                stats["powers"].append(float(exp))
        elif getattr(node, 'is_Function', False):
            op_name = node.func.__name__
            stats["operators"].append(op_name)
          
           
        elif isinstance(node, sym.Add):
            has_sub = any(arg.could_extract_minus_sign() for arg in node.args)

            stats["operators"].append("add")
            if has_sub:
                stats["operators"].append("sub")

        # MUL → detect * vs /
        elif isinstance(node, sym.Mul):
            has_div = False

            for arg in node.args:
                if isinstance(arg, sym.Pow):
                    base, exp = arg.args
                    if exp.is_number and exp < 0:
                        has_div = True
                        break

            if has_div:
                stats["operators"].append("div")
            else:
                stats["operators"].append("mul")
    return stats


def get_model_distributions(models):
    '''
    Returns list of dictionaries, where each dict is breakdown of each model from list of models
    '''
    all_models_dist, gene_dist = [], []
    for i, mod in enumerate(models):
        expr = printGPModel(mod)
        if isinstance(expr, sym.Expr):
            expr = sym.simplify(expr)
            #expr = sym.expand(expr)
        stats = extract_model_stats(expr)
        stats["DEBUG_INDEX"] = i 
        all_models_dist.append(stats)
        print(f"--- MODEL INDEX {i} ---")
        print(stats)
        gene_dist.append(extract_genotype(stats))

    return all_models_dist, gene_dist

# def get_model_distribution(models):
#     # We will store the distributions in a dictionary where the key is the feature
#     # and the value is a list of terms across all models from pareto front

#     feature_distributions = {}
#     all_models_expanded = []

#     for mod in models:
#         expr = printGPModel(mod) 
#         print(expr)
#         if isinstance(expr, sym.Expr):
#             expanded_expr = sym.expand(expr)
#         else:
#             expanded_expr = expr

#         terms = expanded_expr.args if hasattr(expanded_expr, 'args') and expanded_expr.func == sym.Add else (expanded_expr,)
#         print("WE ARE PRINTING OUT THE TERMS FOR ONE MODEL")
#         print(terms)
#         all_models_expanded.append(terms)
      

#         for term in terms:
#             print("EACH INDIVIDUAL term FOR ONE MODEL", term)
#             if hasattr(term, 'free_symbols'):
#                 print("HAS FROZEN SYMBOLS")
#                 sig = frozenset(term.free_symbols)
#                 print("sig", sig)
#                 if sig not in feature_distributions:
#                     #THIS REPRESENTS THE MODEL REPRESENTATION
#                     feature_distributions[sig] = []

#     for terms in all_models_expanded:
#         current_model_contributions = {sig: 0 for sig in feature_distributions.keys()}
#         for term in terms:
#             if hasattr(term, 'free_symbols'):
#                 sig = frozenset(term.free_symbols)
#                 current_model_contributions[sig] += term
#             else:
#                 sig = frozenset()
#                 current_model_contributions[sig] += term
                
#         # Append this model's contributions to the global buckets
#         for sig in feature_distributions.keys():
#             feature_distributions[sig].append(current_model_contributions[sig])

#     return feature_distributions

        
############################
#Data Subsampling Methods
############################
def randomSubsample(x,y, *args, **kwargs):
    n=max(int(np.ceil(len(y)**(3/5))),3)
    idx=np.random.choice(range(x.shape[1]),n,replace=False)
    return np.array([i[idx] for i in x]),y[idx]

def generationProportionalSample(x,y,generation=100,generations=100):
    n=max(int(np.ceil(len(y)*(generation/generations)**(3/5))),3)
    idx=np.random.choice(range(x.shape[1]),n,replace=False)
    return np.array([i[idx] for i in x]),y[idx]

def ordinalSample(x,y,generation=100,generations=100):
    n=max(int(len(y)*generation/generations),3)
    sortedIdx=np.argsort(y)
    step=len(y)/(n-1)
    idx=[sortedIdx[max(int(i*step)-1,0)] for i in range(n)]
    return np.array([i[idx] for i in x]),y[idx]

def orderedSample(x,y,generation=100,generations=100):
    n=max(int(len(y)*generation/generations),3)
    idx=[i for i in range(n)]
    return np.array([i[idx] for i in x]),y[idx]

def ordinalBalancedSample(x,y,generation=100,generations=100):
    n=max(int(len(y)*generation/generations),3)
    numBins=int(max(np.ceil(np.sqrt(n)),3))
    bins=np.linspace(min(y),max(y),numBins+1)
    binIdx=np.digitize(y,bins)-1
    samplesPerBin=max(int(n/numBins),1)
    idx=[]
    for i in range(numBins):
        binMembers=[j for j in range(len(y)) if binIdx[j]==i]
        if len(binMembers)>0:
            chosen=np.random.choice(binMembers,min(samplesPerBin,len(binMembers)),replace=False)
            idx=idx+chosen.tolist()
    return np.array([i[idx] for i in x]),y[idx]

def balancedSample(x,y, *args, **kwargs):
    n=int(np.ceil(len(y)**(3/5)))
    numBins=max(round(n**(2/5)),3)
    bins=np.linspace(min(y),max(y),numBins+1)
    binIdx=np.digitize(y,bins)-1
    samplesPerBin=max(int(n/numBins),1)
    idx=[]
    for i in range(numBins):
        binMembers=[j for j in range(len(y)) if binIdx[j]==i]
        if len(binMembers)>0:
            chosen=np.random.choice(binMembers,min(samplesPerBin,len(binMembers)),replace=False)
            idx=idx+chosen.tolist()
    return np.array([i[idx] for i in x]),y[idx]

import inspect
def getArity(func): #Returns the arity of a function: used for model evaluations
    if func=="pop":
        return 1
    return len(inspect.signature(func).parameters)

getArity.__doc__ = "getArity(func) takes a function and returns the function arity"
def modelArity(model): #Returns the total arity of a model
    return 1+sum([getArity(i)-1 for i in model[0]])

modelArity.__doc__ = "modelArity(model) returns the total arity of a model"
def listArity(data): #Returns arity of evaluating a list of operators
    if len(data)==0:
        return 0
    return 1+sum([getArity(i)-1 for i in data])
listArity.__doc__ = "listArity(list) returns the arity of evaluating a list of operators"
def buildEmptyModel(): # Generates an empty model
    return [[],[],[]]
buildEmptyModel.__doc__ = "buildEmptyModel() takes no inputs and generates an empty GP model"
def variableSelect(num): #Function that creates a function to select a specific variable
    return lambda variables: variables[num]
variableSelect.__doc__ = "variableSelect(n) is a function that creates a function to select the nth variable"
def modelToListForm(model):
    model[0]=model[0].tolist()
def modelRestoreForm(model):
    model[0]=np.array(model[0],dtype=object)

def generateRandomModel(variables,ops,const,maxLength):  #Generates a random GP model
    prog = buildEmptyModel()                             #Generate an empty model with correct structure
    varChoices=[variableSelect(i) for i in range(variables)]+const                           #All variable and constants choices
    prog[0]=np.array(np.random.choice(ops,random.randint(1,maxLength)),dtype=object) #Choose random operators
    countVars=modelArity(prog)    #Count how many variables/constants are needed
    prog[1]=np.random.choice(varChoices,countVars)       #Choose random variables/constants
    # if all vars are constants then replace one random term
    if all(t in const for t in prog[1]):
        replace_idx = random.randrange(countVars)
        prog[1][replace_idx] = random.choice(varChoices[:variables]) #Replace with a variable
    prog[1]=[i() if (callable(i) and i.__name__!='<lambda>' )else i for i in prog[1]] #If function then evaluate
    return prog
generateRandomModel.__doc__ = "generateRandomModel() takes as input the variables, operators, constants, and max program length and returns a random program"
def initializeGPModels(variables,ops=defaultOps(),const=defaultConst(),numberOfModels=100,maxLength=10): # generate random linear program
    prog=[[],[],[]]
    # prog stores [Operators, VarConst, QualityMetrics]
    
    models=[generateRandomModel(variables,ops,const,maxLength) for i in range(numberOfModels)] #Generate models
    
    return models
initializeGPModels.__doc__ = "initializeGPModels(countOfVariables, operators, constants, numberOfModels=100, maxLength=10) returns a set of randomly generated models"

def reverseList(data): #Returns a list reversed
    return [i for i in reversed(data)]
reverseList.__doc__ = "reverseList(data) returns the data list reversed"
def varReplace(data,variables): #Replaces variable references with data during model evaluation
    return [i(variables) if callable(i) else i for i in data]
varReplace.__doc__ = "varReplace(data,variables) replaces references to variables in data with actual values"
def inputLen(data): #Returns the number of data records in a data set
    el1=data[0]
    if type(el1)==list or type(el1)==np.ndarray:
        return len(el1)
    else:
        return 1
inputLen.__doc__ = "inputLen(data) determines the number of data records in a data set"
def varCount(data): #Returns the number of variables in a data set
    return len(data)
varCount.__doc__ = "varCount(data) determines the number of variables in a data set"
def evaluateGPModel(model,inputData): #Evaluates a model numerically
    response=evModHelper(model[1],model[0],[],np.array(inputData).astype(float))[2][0]
    if not type(response)==np.ndarray and inputLen(inputData)>1:
        response=np.array([response for i in range(inputLen(inputData))])
    return response
evaluateGPModel.__doc__ = "evaluateGPModel(model,data) numerically evaluates a model using the data stored in inputData"

def evModHelper(varStack,opStack,tempStack,data): #Recursive helper function for evaluateGPModel
    stack1=varStack
    stack2=opStack
    stack3=tempStack
    
    if len(stack2)==0:
        return [stack3,stack2,stack1]
    op=stack2[0]
    stack2=stack2[1:]
    
    if callable(op):
        
        patt=getArity(op)
        while patt>len(stack3):
            stack3=[stack1[0]]+stack3
            stack1=stack1[1:]
        try:
            temp=op(*varReplace(reverseList(stack3[:patt]),data))
        except TypeError:
            print("stack3: ", stack3, " patt: ", patt, " data: ", data)
            temp=np.nan
        except OverflowError:
            temp=np.nan
        stack3=stack3[patt:]
        stack3=[temp]+stack3
        
    else:
        if len(stack1)>0:
            stack3=varReplace([stack1[0]],data)+stack3
            stack1=stack1[1:]
    if len(stack2)>0:
        stack1,stack2,stack3=evModHelper(stack1,stack2,stack3,data)
        
    return [stack1,stack2,stack3]
evModHelper.__doc__ = "evModHelper(varStack,opStack,tempStack,data) is a helper function for evaluateGPModel"
def rmse(model, inputData, response):
    predictions = evaluateGPModel(model, inputData)
    if not all(np.isfinite(predictions)) or any(np.iscomplex(predictions)):
        return np.nan
    return np.sqrt(np.mean((predictions - response) ** 2))
rmse.__doc__ = "rmse(model, input, response) is a fitness objective that evaluates the root mean squared error"
def binaryError(model, input, response):
    prediction=evaluateGPModel(model,input)
    error=np.mean(np.abs(prediction-response))
    if np.isnan(error) or np.isinf(error) or error > 1 or error < 0:
        return 0.5
    return min(error,1 - error)

def fitness(prog,data,response): # Fitness function using correlation
    predicted=evaluateGPModel(prog,np.array(data))
    if type(predicted)!=list and type(predicted)!=np.ndarray:
        predicted=np.array([predicted for i in range(inputLen(data))])
    try:    
        if np.isnan(predicted).any() or np.isinf(predicted).any():
            return np.nan
    except TypeError:
        #print(predicted)
        return np.nan
    except OverflowError:
        return np.nan
    if (not all(np.isfinite(np.array(predicted,dtype=np.float32)))) or np.all(predicted==predicted[0]):
        return np.nan
    try:
        fit=1-pearsonr(predicted,np.array(response))[0]**2  # 1-R^2
    except ValueError:
        return 1
    if math.isnan(fit):
        return 1 # If nan return 1 as fitness
    return fit   # Else return actual fitness 1-R^2
fitness.__doc__ = "fitness(program,data,response) returns the 1-R^2 value of a model"
def stackGPModelComplexity(model,*args):
    return len(model[0])+len(model[1])-model[0].tolist().count("pop")
stackGPModelComplexity.__doc__ = "stackGPModelComplexity(model) returns the complexity of the model"

###################### Timeout function for model complexity ######################
class TimeoutException(Exception): pass

@contextmanager
def time_limit(seconds):
    def signal_handler(signum, frame):
        raise TimeoutException("Timed out!")
    signal.signal(signal.SIGALRM, signal_handler)
    signal.alarm(seconds)
    try:
        yield
    finally:
        signal.alarm(0)
####################################################################################

# Compute Hess
def ComputeSymbolicHess(model,vars):
    printedModel=sym.simplify(printGPModel(model))
    if type(printedModel)==float:
        return sym.matrices.dense.MutableDenseMatrix(np.zeros((vars,vars)))
    hess=sym.hessian(printedModel, [symbols('x'+str(i)) for i in range(vars)])
    return hess

def EvaluateHess(hess,vars,values):
    numHess=hess.subs({symbols('x'+str(j)):values[j] for j in range(vars)})
    hessN = np.array(numHess).astype(float)
    rankN=np.linalg.matrix_rank(hessN,tol=0.0001*0.0001*10)
    return rankN

def Approx2Deriv(model,values,diff1,diff2,positions): #maybe diff should be relative to the variation of each feature
    term1=[values[i]+diff1 if i == positions[0] else values[i] for i in range(len(values))]
    term1=[term1[i]+diff2 if i == positions[1] else term1[i] for i in range(len(term1))]
    term2=[values[i]-diff1 if i == positions[0] else values[i] for i in range(len(values))]
    term2=[term2[i]+diff2 if i == positions[1] else term2[i] for i in range(len(term2))]
    term3=[values[i]+diff1 if i == positions[0] else values[i] for i in range(len(values))]
    term3=[term3[i]-diff2 if i == positions[1] else term3[i] for i in range(len(term3))]
    term4=[values[i]-diff1 if i == positions[0] else values[i] for i in range(len(values))]
    term4=[term4[i]-diff2 if i == positions[1] else term4[i] for i in range(len(term4))]
    return ((evaluateGPModel(model,term1)-evaluateGPModel(model,term2))/((2*diff1))
            -(evaluateGPModel(model,term3)-evaluateGPModel(model,term4))/((2*diff1)))/(2*diff2)

def ApproxHessRank(model,vars,values,diff1=0.001,diff2=0.001):
    hess=[[Approx2Deriv(model,values,diff1,diff2,[i,j]) for i in range(vars)] for j in range(vars)]
    hessN = np.array(hess).astype(float)
    rankN=np.linalg.matrix_rank(hessN,tol=0.0001*0.0001*10)
    return rankN

#def HessRank(model,vars,values):
#    try: 
#        with time_limit(.01):
#            hess=ComputeSymbolicHess(model,vars)
#            hess = EvaluateHess(hess,vars,values)
#            #print(hess)
#            return hess
#    except TimeoutException as e:
#        hess=ApproxHessRank(model,vars,values)
        #print(hess)
#        return hess

def HessRank(model,vars,values):
    hess=ApproxHessRank(model,vars,values)
    return hess

# Counts basis terms in a model
def count_basis_terms(equation, expand=False):
    try:
        with time_limit(2):


            if expand:
                # Simplify the equation to standardize the expression
                simplified_eq = simplify(equation)
                # Expand the expression to identify additive terms clearly
                expanded_eq = expand(simplified_eq)
            
                # Separate the terms of the expression
                terms = expanded_eq.as_ordered_terms()
            else:
                terms = equation.as_ordered_terms()
            #print(terms)
            
    except TimeoutException as e:
        return 1000
    return len(terms)

# Determines the number of basis functions in a model by counting +s and -s
def basisFunctionComplexity(model,vars, values,*args):
    try: # values should be max, min, and median with respect to response variable
        return HessRank(model,vars,values)#count_basis_terms(printGPModel(model))
    except:
        return 1000

# Creates a lambda function to be used as a complexity metric when given a target dimensionality and deviation
def basisFunctionComplexityDiff(target, deviation, vars, low, mid, high):
    return lambda model,*args: max(np.mean([abs(basisFunctionComplexity(model,vars,low)-target),abs(basisFunctionComplexity(model,vars,mid)-target) ,abs(basisFunctionComplexity(model,vars,high)-target)] ),(deviation))-deviation


def setModelQuality(model,inputData,response,modelEvaluationMetrics=[fitness,stackGPModelComplexity]):
    model[2]=[i(model,inputData,response) for i in modelEvaluationMetrics]

    
setModelQuality.__doc__ = "setModelQuality(model, inputdata, response, metrics=[r2,size]) is an inplace operator that sets a models quality"
def stackPass(model,pt):
    i=0
    t=0
    p=0
    s=model[0]
    if i <pt:
        t+=1
    while i<pt:
        if s[i]=="pop":
            t+=1
            p+=1
        else:
            p+=max(0,getArity(s[i])-t)
            t=max(1,t-getArity(s[i])+1)
        i+=1
    stack1=model[1][p:]
    stack2=reverseList(model[1][:p])[:t+1]
    return [stack1,stack2]
def stackGrab(stack1, stack2, num):
    tStack1=copy.deepcopy(stack1)
    tStack2=copy.deepcopy(stack2)
    newStack=[]
    if len(stack2)<num:
        newStack=stack2+stack1[:(num-len(stack2))]
        tStack1=tStack1[num-len(tStack2):]
        tStack2=[]
    else:
        newStack=stack2[:num]
        tStack2=tStack2[num:]
    return [newStack,tStack1,tStack2]
def fragmentVariables(model,pts):
    stack1,stack2=stackPass(model,pts[0])
    opStack=model[0]
    newStack=[]
    i=pts[0]
    while i<=pts[1]:
        if opStack[i]=="pop" and len(stack1)>0:
            stack2=[stack1[0]]+stack2
            stack1=stack1[1:]
        else:
            if len(newStack)==0 and pts[0]==0:
                tStack,stack1,stack2=stackGrab(stack1,stack2,getArity(opStack[i]))
            else:
                tStack,stack1,stack2=stackGrab(stack1,stack2,getArity(opStack[i])-1)
            newStack=newStack+tStack
        i+=1
    return newStack
                                            
def recombination2pt(model1,model2): #2 point recombination
    pts1=np.sort(random.sample(range(0,len(model1[0])+1),2))
    pts2=np.sort(random.sample(range(0,len(model2[0])+1),2))
    #pts1=[4,5]
    #pts2=[2,4]
    #pts1=[0,3]
    #pts2=[1,3]
    #print(pts1,pts2)
    child1=buildEmptyModel()
    child2=buildEmptyModel()
    
    parent1=copy.deepcopy(model1)
    parent2=copy.deepcopy(model2)
    parent1[0]=np.array(parent1[0],dtype=object).tolist()
    parent2[0]=np.array(parent2[0],dtype=object).tolist()
    
    child1[0]=np.array(parent1[0][0:pts1[0]]+parent2[0][pts2[0]:pts2[1]]+parent1[0][pts1[1]:],dtype=object)
    child2[0]=np.array(parent2[0][0:pts2[0]]+parent1[0][pts1[0]:pts1[1]]+parent2[0][pts2[1]:],dtype=object)
        
    varPts1=[listArity(parent1[0][:(pts1[0])])+0,listArity(parent2[0][:(pts2[0])])+0,listArity(parent2[0][pts2[0]:pts2[1]]),listArity(parent1[0][pts1[0]:pts1[1]])]
    if pts1[0]==0:
        varPts1[0]+=1
    if pts2[0]==0:
        varPts1[1]+=1
    child1[1]=parent1[1][:varPts1[0]]+parent2[1][varPts1[1]:(varPts1[1]+varPts1[2]-1)]+parent1[1][(varPts1[0]+varPts1[3]-1):]
    
    varPts2=[listArity(parent2[0][:(pts2[0])])+0,listArity(parent1[0][:(pts1[0])])+0,listArity(parent1[0][pts1[0]:pts1[1]]),listArity(parent2[0][pts2[0]:pts2[1]])]
    if pts1[0]==0:
        varPts2[1]+=1
    if pts2[0]==0:
        varPts2[0]+=1
    child2[1]=parent2[1][:varPts2[0]]+parent1[1][varPts2[1]:(varPts2[1]+varPts2[2]-1)]+parent2[1][(varPts2[0]+varPts2[3]-1):]
    #print(varPts1,varPts2)
    
    return [child1,child2]
recombination2pt.__doc__ = "recombination2pt(model1,model2) does 2 point crossover and returns two children models"

def get_numeric_indices(l): #Returns indices of list that are numeric
    return [i for i in range(len(l)) if type(l[i]) in [int,float]]


def mutate(model,variables,ops=defaultOps(),const=defaultConst(),maxLength=10):
    newModel=copy.deepcopy(model)
    newModel[0]=np.array(newModel[0],dtype=object).tolist()
    mutationType=random.randint(0,7) 
    varChoices=[variableSelect(i) for i in range(variables)]+const
    opChoice=0
    varChoice=0
    
    tmp=0
    
    if mutationType==0: #single operator mutation
        opChoice=random.randint(0,len(newModel[0])-1)
        if len(newModel[0])>0:
            newModel[0][opChoice]=np.random.choice([i for i in ops] )
               
    elif mutationType==1: #single variable mutation
        varChoice=np.random.choice(varChoices)
        if callable(varChoice) and varChoice.__name__!='<lambda>':
            varChoice=varChoice()
        newModel[1][random.randint(0,len(newModel[1])-1)]=varChoice
    
    elif mutationType==2: #insertion mutation to top of stack
        opChoice=np.random.choice(ops)
        newModel[0]=[opChoice]+newModel[0]
        while modelArity(newModel)>len(newModel[1]):
            varChoice=np.random.choice(varChoices)
            if callable(varChoice) and varChoice.__name__!='<lambda>':
                varChoice=varChoice()
            newModel[1]=[varChoice]+newModel[1]
        
    elif mutationType==3: #deletion mutation from top of stack
        if len(newModel[0])>1:
            opChoice=random.randint(1,len(newModel[0])-1)
            newModel[0]=newModel[0][-opChoice:]
            newModel[1]=newModel[1][-listArity(newModel[0]):]
            
    elif mutationType==4: #insertion mutation to bottom of stack
        opChoice=np.random.choice([i for i in ops])
        newModel[0].append(opChoice)
        
    elif mutationType==5: #mutation via crossover with random model
        newModel=recombination2pt(newModel,generateRandomModel(variables,ops,const,maxLength))[0]
            
    elif mutationType==6: #single operator insertion mutation
        singleOps=[op for op in ops if getArity(op)==1 and op!='pop']
        singleOps.append('pop')
        pos=random.randint(0,len(newModel[0])-1)
        newModel[0].insert(pos,np.random.choice(singleOps))

    elif mutationType==7: #nudge numeric constant
        pos=get_numeric_indices(newModel[1])
        if(len(pos)>0): #If there are numeric constants
            pos=random.choice(pos)
            newModel[1][pos]=newModel[1][pos]+np.random.normal(-1,1) 
            
    if modelArity(newModel)<len(newModel[1]):
        newModel[1]=newModel[1][:modelArity(newModel)]
    elif modelArity(newModel)>len(newModel[1]):
        newModel[1]=newModel[1]+[np.random.choice(varChoices) for i in range(modelArity(newModel)-len(newModel[1]))]
    newModel[1]=[varChoice() if callable(varChoice) and varChoice.__name__!='<lambda>' else varChoice for varChoice in newModel[1]]         
    newModel[0]=np.array(newModel[0],dtype=object)
    return newModel
    
mutate.__doc__ = "mutate(model,variableCount,ops,constants,maxLength) mutates a model"
def paretoFront(fitValues): #Returns Boolean list of Pareto front elements
    onFront = np.ones(fitValues.shape[0], dtype = bool)
    for i, j in enumerate(fitValues):
        if onFront[i]:
            onFront[onFront] = np.any(fitValues[onFront]<j, axis=1)  
            onFront[i] = True  
    return onFront
def paretoTournament(pop): # selects the Pareto front of a model set
    fitnessValues=np.array([mod[2] for mod in pop])
    return (np.array(pop,dtype=object)[paretoFront(fitnessValues)]).tolist()
def tournamentModelSelection(models, popSize=100,tourneySize=5):
    selectedModels=[]
    selectionSize=popSize
    while len(selectedModels)<popSize:
        tournament=random.sample(models,tourneySize)
        winners=paretoTournament(tournament)
        selectedModels=selectedModels+winners
    
    return selectedModels
paretoTournament.__doc__ = "paretoTournament(models, inputData, responseData) returns the Pareto front of a model set"
def modelSameQ(model1,model2): #Checks if two models are the same
    return len(model1[0])==len(model2[0]) and len(model1[1]) == len(model2[1]) and all(model1[0]==model2[0]) and model1[1]==model2[1]
modelSameQ.__doc__ = "modelSameQ(model1,model2) checks if model1 and model2 are the same and returns True if so, else False"
def deleteDuplicateModels(models): #Removes any models that are the same, does not consider simplified form
    uniqueMods = [models[0]]
     
    for mod in models:
        test=False
        for checkMod in uniqueMods:
            if modelSameQ(mod,checkMod):
                test=True
        if not test:
            uniqueMods.append(mod)
    
    return uniqueMods
deleteDuplicateModels.__doc__ = "deleteDuplicateModels(models) deletes models that have the same form without simplifying"

def deleteDuplicateModelsPhenotype(models): #Removes any models that are the same regarding phenotype, does not consider simplified form
    uniqueMods = [printGPModel(models[0])]
    remainingMods=[printGPModel(mod) for mod in models[1:]]
    uniquePos = [0]
    currPos=1
    for mod in remainingMods:
        test=False
        for checkMod in uniqueMods:
            if mod==checkMod:
                test=True
        if not test:
            uniqueMods.append(mod)
            uniquePos.append(currPos)
        currPos+=1
    
    return [models[i] for i in uniquePos]

def removeIndeterminateModels(models): #Removes models from the population that evaluate to nonreal values
    return [i for i in models if (not any(np.isnan(i[2]))) and all(np.isfinite(np.isnan(i[2])))]
removeIndeterminateModels.__doc__ = "removeIndeterminateModels(models) removes models that have a fitness that results from inf or nan values"
def sortModels(models):
    return sorted(models, key=lambda m:m[2])
sortModels.__doc__ = "sortModels(models) sorts a model population by the models' accuracies"
def selectModels(models, selectionSize=0.5, thresholds=None):
    tMods=copy.deepcopy(models)
    [modelToListForm(mod) for mod in tMods]
    if thresholds is not None:
        tMods=[mod for mod in tMods if all([mod[2][i]<=thresholds[i] for i in range(len(thresholds))])]
    paretoModels=[]
    if selectionSize<=1:
        selection=selectionSize*len(models)
    else:
        selection=selectionSize
    
    while len(paretoModels)<selection and len(tMods)>0:
        front=paretoTournament(tMods)
        paretoModels=paretoModels+front
        for i in front:
            tMods.remove(i)
    [modelRestoreForm(mod) for mod in paretoModels]
    return paretoModels
selectModels.__doc__ = "selectModels(models, selectionSize=0.5) iteratively selects the Pareto front of a model population until n or n*popSize models are selected"
def stackVarUsage(opStack): #Counts how many variables are used by the operator stack
    pos=getArity(opStack[0])
    for j in range(1,len(opStack)):
        pos+=getArity(opStack[j])-1
        if opStack[j]=='pop':
            pos+=1
    return pos
stackVarUsage.__doc__ = "stackVarUsage(opStack) is a helper function that determines how many variables/constants are needed by the operator stack"
def trimModel(mod): #Removes extra pop operators that do nothing
    model=copy.deepcopy(mod)
    i=0
    varStack=len(mod[1])
    tempStack=0
    varStack-=getArity(model[0][i])
    tempStack+=1
    i+=1
    while varStack>0:
        if model[0][i]=='pop':
            varStack-=1
            tempStack+=1
        else:
            
            take=getArity(model[0][i])-tempStack
            if take>0:
                varStack-=take
                tempStack=1
            else:
                tempStack-=getArity(model[0][i])-1
        i+=1
    model[0]=np.array(model[0][:i].tolist()+[j for j in model[0][i:] if not j=='pop'],dtype=object)
    return model
trimModel.__doc__ = "trimModel(model) trims extra pop operators off the operator stack so that further modifications such as a model alignment aren't altered by those pop operators"
def alignGPModel(model, data, response): #Aligns a model
    prediction=evaluateGPModel(model,data)
    if (not all(np.isfinite(np.array(prediction)))) or np.all(prediction==prediction[0]):
        return model
    if np.isnan(np.array(prediction)).any() or np.isnan(np.array(response)).any() or not np.isfinite(np.array(prediction,dtype=np.float32)).all():
        return model
    # Variance guards
    if np.std(prediction) < 1e-12:
        return model
    if np.ptp(prediction) < 1e-12:
        return model
    try:
        align=np.polyfit(prediction,response,1,rcond=1e-16)#np.round(np.polyfit(prediction,response,1,rcond=1e-16),decimals=14)
    except np.linalg.LinAlgError:
        #print("Alignment failed for: ", model, " with prediction: ", prediction, "and reference data: ", response)
        return model
    newModel=trimModel(model)
    newModel[0]=np.array(newModel[0].tolist()+[mult,add],dtype=object)
    newModel[1]=newModel[1]+align.tolist()
    #setModelQuality(newModel,data,response)
    return newModel
alignGPModel.__doc__ = "alignGPModel(model, input, response) aligns a model such that response-a*f(x)+b are minimized over a and b"
def evolve(inputData, responseData, generations=100, ops=defaultOps(), const=defaultConst(), variableNames=[], mutationRate=79, crossoverRate=11, spawnRate=10, extinction=False,extinctionRate=10,elitismRate=10,popSize=300,maxComplexity=100,align=True,initialPop=[],timeLimit=300,capTime=False,tourneySize=5,tracking=False,returnTracking=False,liveTracking=False,liveTrackingInterval=1,modelEvaluationMetrics=[fitness,stackGPModelComplexity],dataSubsample=False,samplingMethod=randomSubsample,alternateObjectives=[],alternateObjFrequency=10,allowEarlyTermination=False,earlyTerminationThreshold=0):
    evolution_hisotry = []
    alternatingFlag = False
    if callable(modelEvaluationMetrics):
        metrics=[modelEvaluationMetrics]
        allMetrics=[modelEvaluationMetrics]+alternateObjectives
    elif isinstance(modelEvaluationMetrics, list) and callable(modelEvaluationMetrics[0]):
        metrics=modelEvaluationMetrics
        allMetrics=modelEvaluationMetrics+alternateObjectives
    elif isinstance(modelEvaluationMetrics, list) and isinstance(modelEvaluationMetrics[0], list):
        metrics=modelEvaluationMetrics[0]
        allMetrics=[item for sublist in modelEvaluationMetrics for item in sublist]+alternateObjectives
        alternatingFlag = True
    else:
        raise ValueError("modelEvaluationMetrics must be a function, list of functions, or a list of lists of functions")

    fullInput,fullResponse=copy.deepcopy(inputData),copy.deepcopy(responseData)
    inData=copy.deepcopy(fullInput)
    resData=copy.deepcopy(fullResponse)
    variableCount=varCount(inData)
    models=initializeGPModels(variableCount,ops,const,popSize)
    models=models+initialPop
    startTime=time.perf_counter()
    bestFits=[]
    gene_dists = []
    if liveTracking:
        fig, ax = plt.subplots(figsize=(20,10))
        ckTime=time.perf_counter()
    for i in range(generations):

        if capTime and time.perf_counter()-startTime>timeLimit:
            break
        if len(alternateObjectives)>0 and (i+1)%alternateObjFrequency==0:
            metrics=modelEvaluationMetrics[:1]+alternateObjectives
        else:
            if alternatingFlag:
                metrics=modelEvaluationMetrics[i%len(modelEvaluationMetrics)]
            else:
                metrics=modelEvaluationMetrics
        if dataSubsample:
            inData,resData=samplingMethod(fullInput,fullResponse,generations=generations,generation=i)


        for mods in models:
            setModelQuality(mods,inData,resData,modelEvaluationMetrics=metrics)
        models=removeIndeterminateModels(models)
        if allowEarlyTermination and min([mods[2][0] for mods in models])<=earlyTerminationThreshold:
            print("Early termination at generation ", i)
            break

        if tracking or liveTracking or returnTracking:
            bestFits.append(min([mods[2][0] for mods in paretoTournament(models)]))
        if liveTracking and time.perf_counter()-ckTime>liveTrackingInterval:
            ax.clear()
            ax.plot(bestFits)
            ax.set_title(f"Best Model: {bestFits[-1]:.2f} at Generation {(i+1)}")
            ax.set_xlabel("Generations")
            ax.set_ylabel("Fitness")
            clear_output(wait=True) 
            display(fig)    
            #plt.show()        
            plt.close(fig)
            ckTime=time.perf_counter()        

        #get distribution of the models from the last generation and use the models from the Pareto Front

 
        paretoModels=selectModels(models,elitismRate/100*popSize if elitismRate/100*popSize<len(models) else len(models))

    
        if extinction and i%extinctionRate==0 and i>0:
            models=initializeGPModels(variableCount,ops,const,popSize)
            for mods in models:
                setModelQuality(mods,inData,resData,modelEvaluationMetrics=metrics)
        
        models=tournamentModelSelection(models,popSize,tourneySize)
        
        crossoverPairs=random.sample(models,round(crossoverRate/100*popSize))
        toMutate=random.sample(models,round(mutationRate/100*popSize))
        
        childModels=paretoModels
        
        for j in range(round(len(crossoverPairs)/2)-1):
            childModels=childModels+recombination2pt(crossoverPairs[j],crossoverPairs[j+round(len(crossoverPairs)/2)])
        
        for j in toMutate:
            childModels=childModels+[mutate(j,variableCount,ops,const)]
        
        childModels=childModels+initializeGPModels(variableCount,ops,const,round(spawnRate/100*popSize))
        
        childModels=deleteDuplicateModels(childModels)
        childModels=[model for model in childModels if stackGPModelComplexity(model)<maxComplexity]
        
        #for mods in childModels:
        #    setModelQuality(mods,inData,resData,modelEvaluationMetrics=modelEvaluationMetrics)
        #childModels=removeIndeterminateModels(childModels)
        
        if len(childModels)<popSize:
            childModels=childModels+initializeGPModels(variableCount,ops,const,popSize-len(childModels))
        
        models=copy.deepcopy(childModels)
        if ( i + 5 >= generations):
            evolution_hisotry.append(models)
        yield models
        
    
    for mods in models:
        setModelQuality(mods,fullInput,fullResponse,modelEvaluationMetrics=allMetrics)
    models=[trimModel(mod) for mod in models]
    models=deleteDuplicateModels(models)
    models=removeIndeterminateModels(models)
    models=sortModels(models)
    if align:
        models=[alignGPModel(mods,fullInput,fullResponse) for mods in models]
        for mods in models:
            setModelQuality(mods,fullInput,fullResponse,modelEvaluationMetrics=allMetrics)
    
    if tracking or returnTracking:
        bestFits.append(min([mods[2][0] for mods in paretoTournament(models)]))
        if returnTracking:
            return models, bestFits
        plt.figure()
        plt.plot(bestFits)
        plt.title("Fitness over Time")
        plt.xlabel("Generations")
        plt.ylabel("Fitness")
        plt.show()
    
    return models,evolution_hisotry
    #model_distributions
    

def replaceFunc(stack,f1,f2):
    return [i if i!=f1 else f2 for i in stack]
def printGPModel(mod,inputData=symbols(["x"+str(i) for i in range(100)])): #Evaluates a model algebraically
    def inv1(a):
        return a**(-1)
    from sympy import tan as tan1, exp as exp1, sqrt as sqrt1, sin as sin1, cos as cos1, acos, asin, atan, tanh as tanh1, log as log1
    def sqrt2(a):
        return sqrt1(a)
    def log2(a):
        return log1(a)
    model = copy.deepcopy(mod)
    model[0] = replaceFunc(model[0],exp,exp1)
    model[0] = replaceFunc(model[0],tan,tan1)
    model[0] = replaceFunc(model[0],sqrt,sqrt2)
    model[0] = replaceFunc(model[0],inv,inv1)
    model[0] = replaceFunc(model[0],sin,sin1)
    model[0] = replaceFunc(model[0],cos,cos1)
    model[0] = replaceFunc(model[0],arccos,acos)
    model[0] = replaceFunc(model[0],arcsin,asin)
    model[0] = replaceFunc(model[0],arctan,atan)
    model[0] = replaceFunc(model[0],tanh,tanh1)
    model[0] = replaceFunc(model[0],log,log2)
    try:
        response=evModHelper(model[1],model[0],[],np.array(inputData))[2][0]
    except:
        return np.nan
    return response

def ensembleSelect(models, inputData, responseData, numberOfClusters=10): #Generates a model ensemble using input data partitions
    data=np.transpose(inputData)
    if len(data)<numberOfClusters:
        numberOfClusters=len(data)
    clusters=KMeans(n_clusters=numberOfClusters).fit_predict(data)
    if numberOfClusters>len(set(clusters)):
        numberOfClusters=len(set(clusters))
        clusters=KMeans(n_clusters=numberOfClusters).fit_predict(data)
    dataParts=[]
    partsResponse=[]
    for i in range(numberOfClusters):
        dataParts.append([])
        partsResponse.append([])
    
    for i in range(len(clusters)):
        dataParts[clusters[i]].append(data[i])
        partsResponse[clusters[i]].append(responseData[i])
        
    modelResiduals=[]
    
    for i in range(len(models)):
        modelResiduals.append([])
    for i in range(len(models)):
        for j in range(numberOfClusters):
            modelResiduals[i].append(fitness(models[i],np.transpose(dataParts[j]),partsResponse[j]))
    
    best=[]
    for i in range(numberOfClusters):
        ordering=np.argsort(modelResiduals[i])
        j=0
        while ordering[j] in best:
            j+=1
        best.append(ordering[j])
    ensemble=[models[best[i]] for i in range(numberOfClusters)]
    
    return ensemble
def uncertainty(data,trim=0.3):
    wl=None
    if len(data)<=4:
        wl=1
    h=differential_entropy(data,window_length=wl)
    if np.isfinite(h):
        return h
    else:
        return 0

def evaluateModelEnsemble(ensemble, inputData):
    responses=[evaluateGPModel(mod, inputData) for mod in ensemble]
    if type(responses[0])==np.ndarray:
        responses=np.transpose(responses)
        predictions=[np.median(res) for res in responses]
    else:
        predictions=[np.median(responses)]
    
    return predictions
    
def evaluateModelEnsembleUncertainty(ensemble, inputData):
    responses=[evaluateGPModel(mod, inputData) for mod in ensemble]
    if type(responses[0])==np.ndarray:
        responses=np.transpose(responses)
        uncertainties=[uncertainty(res,0) for res in responses]
    else:
        uncertainties=[uncertainty(responses,0)]
    return uncertainties

def relativeEnsembleUncertainty(ensemble,inputData):
    output=evaluateModelEnsembleUncertainty(ensemble,inputData)
    return np.array(output)
    
def createUncertaintyFunc(ensemble):
    return lambda x: -relativeEnsembleUncertainty(ensemble,x)
    
def maximizeUncertainty(ensemble,varCount,bounds=[]): #Used to select a new point of maximum uncertainty
    func=createUncertaintyFunc(ensemble)
    x0=[np.mean(bounds[i]) for i in range(varCount)]
    if bounds==[]:
        pt=minimize(func,x0).x
    else:
        pt=minimize(func,x0,bounds=bounds).x
    return pt
def extendData(data,newPoint):
    return np.concatenate((data.T,np.array([newPoint]))).T

def activeLearningCheckpoint(eqNum,version,i,inputData,response,testInput,testResponse,errors,models,minerr):
    path=os.path.join(str(eqNum),str(version))
    file=open(path,"wb+")
    dill.dump([i,inputData,response,testInput,testResponse,errors,models,minerr],file)
    file.close()
def activeLearningCheckpointLoad(eqNum,version,i,inputData,response,testInput,testResponse,errors,models,minerr):
    path=os.path.join(str(eqNum),str(version))
    try:
        with open(path,'rb') as f:
            i,inputData,response,testInput,testResponse,errors,models,minerr=dill.load(f)
    except FileNotFoundError:
        return i,inputData,response,testInput,testResponse,errors,models,minerr
    return i,inputData,response,testInput,testResponse,errors,models,minerr
def subSampleSpace(space):
    newSpace=copy.deepcopy(space)
    newSpace=list(newSpace)
    for i in range(len(newSpace)):
        pts=sorted([np.random.uniform(newSpace[i][0],newSpace[i][1]),np.random.uniform(newSpace[i][0],newSpace[i][1])])
        newSpace[i]=tuple(pts)
    return tuple(newSpace)

def activeLearning(func, dims, ranges,rangesP,eqNum=1,version=1,iterations=100): #func should be a lamda function of form lambda data: f(data[0],data[1],...)
    try:
        with open(os.path.join(str(eqNum),str(version))+".txt",'rb') as f:
            return -1
    except FileNotFoundError:
        pass
    inputData=[]
    testInput=[]
    found=False
    for i in range(dims):
        inputData.append(np.random.uniform(ranges[i][0],ranges[i][1],3))
        testInput.append(np.random.uniform(ranges[i][0],ranges[i][1],200))
    inputData=np.array(inputData)
    testInput=np.array(testInput)
    response=func(inputData)
    testResponse=func(testInput)
    errors=[]
    models=[]
    minerr=1
    for i in range(iterations):
        print("input: ",inputData)
        print("\n response: ",response)
        i,inputData,response,testInput,testResponse,errors,models,minerr=activeLearningCheckpointLoad(eqNum,version,i,inputData,response,testInput,testResponse,errors,models,minerr)
        if i>iterations-1:
            break
        i+=1            
        models1=evolve(inputData,response,initialPop=models,generations=1000,tracking=False,popSize=300,ops=allOps(),timeLimit=120,capTime=True,align=False,elitismRate=10)
        models2=evolve(inputData,response,initialPop=models,generations=1000,tracking=False,popSize=300,ops=allOps(),timeLimit=120,capTime=True,align=False,elitismRate=10)
        models3=evolve(inputData,response,initialPop=models,generations=1000,tracking=False,popSize=300,ops=allOps(),timeLimit=120,capTime=True,align=False,elitismRate=10)
        models4=evolve(inputData,response,initialPop=models,generations=1000,tracking=False,popSize=300,ops=allOps(),timeLimit=120,capTime=True,align=False,elitismRate=10)
        models=models1+models2+models3+models4
        models=selectModels(models,20)
        alignedModels=[alignGPModel(mods,inputData,response) for mods in models]
        ensemble=ensembleSelect(alignedModels,inputData,response)
        out=maximizeUncertainty(ensemble,dims,rangesP)
        while out in inputData.T:
            out=maximizeUncertainty(ensemble,dims,subSampleSpace(rangesP))
        inputData=extendData(inputData,out)
        response=func(inputData)
        fitList=np.array([fitness(mod,testInput,testResponse) for mod in alignedModels])
        errors.append(min(fitList[np.logical_not(np.isnan(fitList))]))
        minerr=errors[-1]
        if minerr<1e-14:
            #print("Points needed in round", j,": ",3+i, " Time needed: ", time.perf_counter()-roundTime)
            if not os.path.exists(str(eqNum)):
                os.makedirs(str(eqNum))
            path=os.path.join(str(eqNum),str(version))
            file=open(path,"wb+")
            dill.dump([i,inputData,response,testInput,testResponse,errors,models,minerr],file)
            file.close()
            file=open(path+'.txt','w+')
            file.write(str(i+3)+'\n')
            file.write(str(errors))
            file.close()
            return 3+i
            found=True
            ptsNeeded.append(3+i)
            break
        activeLearningCheckpoint(eqNum,version,i,inputData,response,testInput,testResponse,errors,models,minerr)
    if found==False:
        #print("Points needed in round",j,": NA (model not found)")
        path=os.path.join(str(eqNum),str(version))
        file=open(path,"wb")
        dill.dump([-1,inputData,response,testInput,testResponse,errors,models,minerr],file)
        file.close()
        file=open(path+'.txt',"w+")
        file.write(str(i+3)+"\n")
        file.write(str(errors))
        file.close()
        return -1

def plotModels(models, modelExpression=False):
    tMods=copy.deepcopy(models)
    if len(tMods[0][2])<2:
        # add complexity as second value
        for mod in tMods:
            mod[2]=[mod[2][0],stackGPModelComplexity(mod)]
    [modelToListForm(mod) for mod in tMods]
    paretoModels=paretoTournament(tMods)
    for i in paretoModels:
        tMods.remove(i)
    [modelRestoreForm(mod) for mod in paretoModels]
    [modelRestoreForm(mod) for mod in tMods]
    
    pAccuracies=[mod[2][0] for mod in paretoModels]
    pComplexities=[mod[2][1] for mod in paretoModels]
    
    accuracies=[mod[2][0] for mod in tMods]+pAccuracies
    complexities=[mod[2][1] for mod in tMods]+pComplexities
    colors=['blue' for i in range(len(tMods))]+['red' for i in range(len(pAccuracies))]
    
    fig,ax = plt.subplots()

    sc=plt.scatter(complexities,accuracies,color=colors)
    plt.xlabel("Complexity")
    plt.ylabel("1-R**2")

    if modelExpression:
        names=[str(printGPModel(mod)) for mod in tMods]+[str(printGPModel(mod)) for mod in paretoModels]
    else:
        names = [str(mod) for mod in tMods]+[str(mod) for mod in paretoModels]
    
    label = ax.annotate("", xy=(0,0), xytext=(np.min(complexities),np.mean([np.max(accuracies),np.min(accuracies)])),
                        bbox=dict(boxstyle="round", fc="w"),
                        arrowprops=dict(arrowstyle="->"))
    label.set_visible(False)
    
    def update_labels(ind):

        pos = sc.get_offsets()[ind["ind"][0]]
        label.xy = pos
        text = "{}".format(" ".join([names[n] for n in [ind["ind"][0]]]))
        label.set_text(text)
        label.get_bbox_patch().set_facecolor('grey')
        label.get_bbox_patch().set_alpha(0.9)


    def hover(event):
        vis = label.get_visible()
        if event.inaxes == ax:
            cont, ind = sc.contains(event)
            if cont:
                update_labels(ind)
                label.set_visible(True)
                fig.canvas.draw_idle()
            else:
                if vis:
                    label.set_visible(False)
                    fig.canvas.draw_idle()

    fig.canvas.mpl_connect("motion_notify_event", hover)

    plt.show()

def plotModelResponseComparison(model,inputData,response,sort=False):
    plt.scatter(range(len(response)),response,label="True Response")
    plt.scatter(range(len(response)),evaluateGPModel(model,inputData),label="Model Prediction")
    plt.legend()
    plt.xlabel("Data Index")
    plt.ylabel("Response Value")
    plt.show()
def plotPredictionResponseCorrelation(model,inputData,response):
    plt.scatter(response,evaluateGPModel(model,inputData),label="Model")
    plt.plot(response,response,label="Perfect Correlation",color='green')
    plt.xlabel("True Response")
    plt.ylabel("Predicted Response")
    plt.legend()
    plt.show()
#Plot model complexity distribution
def plotModelComplexityDistribution(models):
    tMods=copy.deepcopy(models)
    [modelToListForm(mod) for mod in tMods]
    paretoModels=paretoTournament(tMods)
    for i in paretoModels:
        tMods.remove(i)
    [modelRestoreForm(mod) for mod in paretoModels]
    [modelRestoreForm(mod) for mod in tMods]
    pComplexities=[mod[2][1] for mod in paretoModels]
    tComplexities=[mod[2][1] for mod in tMods]
    plt.hist(tComplexities,label="Non-Pareto Models")
    plt.hist(pComplexities,label="Pareto Models")
    plt.xlabel("Model Complexity")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()
#Plot model accuracy distribution
def plotModelAccuracyDistribution(models):
    tMods=copy.deepcopy(models)
    [modelToListForm(mod) for mod in tMods]
    paretoModels=paretoTournament(tMods)
    for i in paretoModels:
        tMods.remove(i)
    [modelRestoreForm(mod) for mod in paretoModels]
    [modelRestoreForm(mod) for mod in tMods]
    pAccuracies=[mod[2][0] for mod in paretoModels]
    tAccuracies=[mod[2][0] for mod in tMods]
    plt.hist(tAccuracies,label="Non-Pareto Models")
    plt.hist(pAccuracies,label="Pareto Models")
    plt.xlabel("Model Accuracy")
    plt.ylabel("Frequency")
    plt.legend()
    plt.show()


#Plot model residuals relative to response
def plotModelResiduals(model,input,response):
    plt.scatter(response,evaluateGPModel(model,input)-response)
    plt.xlabel("Response")
    plt.ylabel("Residual")
    plt.show()
#Plot model residual distribution
def plotModelResidualDistribution(model,input,response):
    plt.hist(evaluateGPModel(model,input)-response)
    plt.xlabel("Residual")
    plt.ylabel("Frequency")
    plt.show()


##CAN USE THESE FOR UNCERTAINITY FURTHERMORE
#Plot the presence of variables in a model population
def plotVariablePresence(models,variables=["x"+str(i) for i in range(100)],sort=False):
    vars=[varReplace(model[1],variables) for model in models]
    #Remove all numeric entries in vars
    vars=[[i for i in var if type(i)!=int and type(i)!=float] for var in vars]
    #Merge into one list
    vars=[j for i in vars for j in i]
    #Count frequency of each variable in vars
    varFreqs=[vars.count(i) for i in variables]
    #Keep only variables that appear at least once
    variablesUsed=[variables[i] for i in range(len(varFreqs)) if varFreqs[i]>0]
    varFreqs=[varFreqs[i] for i in range(len(varFreqs)) if varFreqs[i]>0]
    if sort:
        order=np.argsort(varFreqs)[::-1]
        variablesUsed=[variablesUsed[i] for i in order]
        varFreqs=[varFreqs[i] for i in order]
    #Plot variable frequency
    plt.bar(variablesUsed,varFreqs)
    plt.xlabel("Variable")
    plt.ylabel("Frequency")
    plt.show()
def replaceOpsWithStrings(opStack):
    model = copy.deepcopy(opStack)
    model = replaceFunc(model,exp,str("exp"))
    model = replaceFunc(model,tan,str("tan"))
    model = replaceFunc(model,sqrt,str("sqrt"))
    model = replaceFunc(model,inv,str("1/#"))
    model = replaceFunc(model,sin,str("sin"))
    model = replaceFunc(model,cos,str("cos"))
    model = replaceFunc(model,arccos,str("acos"))
    model = replaceFunc(model,arcsin,str("asin"))
    model = replaceFunc(model,arctan,str("atan"))
    model = replaceFunc(model,tanh,str("tanh"))
    model = replaceFunc(model,log,str("log"))
    model = replaceFunc(model,add,"+")
    model = replaceFunc(model,mult,"*")
    model = replaceFunc(model,sub,"-")
    model = replaceFunc(model,protectDiv,"/")
    model = replaceFunc(model,sqrd,"^2")
    return model
#Plot the presence of operators in a model population
def plotOperatorPresence(models,sort=False,excludePop=True):
    ops=[replaceOpsWithStrings(model[0]) for model in models]
    #Merge into one list
    ops=[j for i in ops for j in i]
    #Remove duplicates in ops
    uniqueOps=list(set(ops))
    if excludePop:
        #Remove pop operator
        uniqueOps.remove('pop')
    #Count frequency of each operator in ops
    opFreqs=[ops.count(i) for i in uniqueOps]
    #Keep only operators that appear at least once
    opsUsed=[str(uniqueOps[i]) for i in range(len(opFreqs)) if opFreqs[i]>0]
    opFreqs=[opFreqs[i] for i in range(len(opFreqs)) if opFreqs[i]>0]
    if sort:
        order=np.argsort(opFreqs)[::-1]
        opsUsed=[opsUsed[i] for i in order]
        opFreqs=[opFreqs[i] for i in order]
    #Plot operator frequency
    plt.bar(opsUsed,opFreqs)
    #Rotate x axis labels
    plt.xticks(rotation=0)
    plt.xlabel("Operator")
    plt.ylabel("Frequency")
    plt.show()

def diverse_models(best_model, filtered_model_population, trainInput, trainResponse):
    ###
    ## Residuals allow us to capture different diverse models since we can fit the models in multiple different ways
    ###
    predicted_data = evaluateGPModel(best_model, trainInput)
    candidate_preds = []
    for model in filtered_model_population:
        candidate_preds.append(np.array(evaluateGPModel(model, trainInput)).flatten())

    candidate_preds = np.array(candidate_preds)
    trainResponse = np.array(trainResponse).flatten()
    best_preds = np.array(evaluateGPModel(best_model, trainInput)).flatten()
    selected_preds = [best_preds]
    

    ensemble_cluster = [best_model]
    picked_indices = set()

    for r in range(10):
        #choose up to 10 best models from this ensemble
        if len(picked_indices) >= len(filtered_model_population):
            break

        current_ensemble_mean = np.median(selected_preds, axis=0)
        ensemble_residuals = trainResponse - current_ensemble_mean

        best_candidate_idx = -1
        anti_correlation = float('inf')
        for i, cur_model in enumerate(candidate_preds):
            if i in picked_indices:continue 
            cand_residual = trainResponse - cur_model
            corr_val = corr = pearsonr(cand_residuals, ensemble_residuals)[0]
            if np.nan(corr_val):
                corr_val = 1.0
            if corr_val < anti_correlation:
                anti_correlation = corr_val
                best_candidate_idx = i 


        ##add the choosen best model to our current ensemble
        picked_indices.add(best_candidate_idx)
        selected_preds.append(candidate_preds[best_candidate_idx])
        ensemble_cluster.append(filtered_model_population[best_candidate_idx])
        print(f"Added Model {best_candidate_idx} (Residual Correlation: {lowest_correlation:.4f})")

    return ensemble_cluster

    

############################
#Sharpness Computations
############################

def sharpnessConstants(model,inputData,responseData,numPerturbations=10,percentPerturbation=0.2):

    fits=[]

    #For each model parameter, if numeric, randomly perturb by x% and see how much the model changes
    for i in range(numPerturbations):
        tempModel=copy.deepcopy(model)
        newParameters=[param if callable(param) else param*(1+percentPerturbation*(np.random.uniform()-0.5)) for param in model[1]]
        tempModel[1]=newParameters
        fits.append(fitness(tempModel,inputData,responseData))
    return np.std(fits)

def sharpnessData(model,inputData,responseData,numPerturbations=10,percentPerturbation=0.2,preserveSign=False):

    fits=[]

    #For each vector, randomly perturb by x% of the standard deviation and see how much the model fitness changes
    for i in range(numPerturbations):
        tempData=copy.deepcopy(inputData)
        tempData=np.array([(vec+percentPerturbation*np.std(vec)*(np.random.uniform(size=len(vec))-0.5)) for vec in tempData])
        if preserveSign:
            signs=[np.unique(var) for var in np.sign(inputData)]
            tempData=[signs[i]*abs(tempData[i]) if len(signs[i])==1 else tempData[i] for i in range(len(signs))]
        fits.append(fitness(model,tempData,responseData))
    return np.std(fits)

def totalSharpness(model,inputData,responseData,numPerturbations=10,percentPerturbation=0.2,preserveSign=False):

    return sharpnessConstants(model,inputData,responseData,numPerturbations=numPerturbations,percentPerturbation=percentPerturbation)+sharpnessData(model,inputData,responseData,numPerturbations=numPerturbations,percentPerturbation=percentPerturbation,preserveSign=preserveSign)

############################
#Multiple Independent Searches
############################
def runEpochs(x,y,epochs=5,**kwargs):
    models=[]
    for i in range(epochs):
        models+=evolve(x,y,**kwargs)

    return sortModels(models)


############################
#Parallelization
############################
from joblib import Parallel, delayed
def parallelEvolve(*args,n_jobs=-1,avail_cores=-1, **kwargs):
    if avail_cores==-1:
        try:
            avail_cores=len(os.sched_getaffinity(0))
        except:
            avail_cores=os.cpu_count()
    if n_jobs==-1:
        try:
            n_jobs=len(os.sched_getaffinity(0))
        except:
            n_jobs=os.cpu_count()

    if "tracking" in kwargs and kwargs["tracking"]:
        kwargs["returnTracking"]=True

    print(f"Running parallel evolution with {n_jobs} jobs.")
    if "liveTracking" in kwargs and kwargs["liveTracking"]:
        print("Live tracking is not supported in parallel evolution, disabling live tracking.")
        kwargs["liveTracking"]=False
        
    runs = Parallel(n_jobs=avail_cores, backend="loky")(delayed(evolve)(*args, **kwargs) for _ in range(n_jobs))
    runs, tracking_histories = zip(*runs)
    if ("tracking" in kwargs and kwargs["tracking"]):
        runs, tracking = zip(*runs)
        # plot tracking for each job
        plt.figure(figsize=(12, 6))
        for i, track in enumerate(tracking):
            plt.plot(track, label=f'Job {i+1}')
        plt.title('Best Fitness Over Generations for Each Parallel Run')
        plt.xlabel('Generations')
        plt.ylabel('Best Fitness')
        if n_jobs <= 16:  # Only show legend if there are a reasonable number of jobs
            plt.legend()
        plt.show()
    flat = [model for sublist in runs for model in sublist]
    return sortModels(flat), tracking_histories


############################
#Benchmarking
############################
def generateRandomBenchmark(numVars=5, numSamples=100, noiseLevel=0, opsChoices=defaultOps(), constChoices=defaultConst(), maxLength=10):

    # Generate random input data
    inputData = np.random.rand(numVars, numSamples)

    # Generate a random target function
    randomModel = generateRandomModel(numVars, opsChoices, constChoices, maxLength)

    # Evaluate the model to get response data
    responseData = evaluateGPModel(randomModel, inputData)

    # Add noise if specified
    if noiseLevel > 0:
        noise = np.random.normal(0, noiseLevel, size=responseData.shape)
        responseData += noise

    return inputData, responseData, randomModel