File size: 61,955 Bytes
f945b6a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
import gradio as gr
import pandas as pd
import numpy as np
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
# Scikit-learn Models
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.svm import SVC, SVR
# Metrics
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, r2_score
# Dataset generators
from sklearn.datasets import make_classification, make_regression

import joblib
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
import torchvision # For transforms, even if data is basic
import torchvision.transforms as T

# ONNX specific imports
import skl2onnx
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType, Int64TensorType, StringTensorType
import onnxruntime as rt

import traceback
import tempfile
import json
import math
import collections.abc # For Gradio issue with new Python versions

# --- Global Variables / Constants ---
# Directory (relative to the working directory) that get_temp_filepath() places
# generated datasets and exported models in. It is created at import time.
TEMP_DIR = "temp_outputs"
os.makedirs(TEMP_DIR, exist_ok=True)
# Row count above which dataset generation logs a "may be slow" warning.
MAX_DATASET_ROWS_WARN = 30000 # Reduced slightly due to increased complexity
# Hard upper bounds that dataset generation clamps requested/suggested sizes to.
MAX_GENERATED_ROWS = 50000 # Max rows for generation
MAX_GENERATED_COLS = 100   # Max cols for generation

# --- Helper Functions ---
def count_sklearn_parameters(model):
    """Estimate how many learned values a fitted scikit-learn model holds.

    The estimate depends on which fitted attributes the model exposes:

    * ``coef_`` (linear models): number of coefficients plus intercept terms.
    * ``support_vectors_`` (SVMs): number of stored support-vector entries.
    * random forests: total node count over all trees.

    Args:
        model: A fitted estimator (or any object exposing the attributes above).

    Returns:
        An ``int`` count, or the string ``"N/A"`` / ``"N/A (Complex Ensemble)"``
        when no count can be derived.
    """
    if hasattr(model, 'coef_'):
        # np.size() also accepts plain Python scalars; intercept_ is not always
        # an ndarray (it can be a bare float), so ``.size`` is not safe here.
        intercept_count = np.size(model.intercept_) if hasattr(model, 'intercept_') else 0
        return int(np.size(model.coef_) + intercept_count)
    if hasattr(model, 'support_vectors_'):
        return int(np.size(model.support_vectors_))
    if isinstance(model, (RandomForestClassifier, RandomForestRegressor)):
        try:
            return sum(tree.tree_.node_count for tree in model.estimators_)
        except Exception:
            # Best effort: e.g. an unfitted forest has no estimators_ yet.
            return "N/A (Complex Ensemble)"
    return "N/A"

def count_pytorch_parameters(model):
    """Return the number of trainable scalar values in a PyTorch module.

    Parameters whose ``requires_grad`` flag is false (frozen weights) are
    left out of the total.
    """
    trainable_total = 0
    for param in model.parameters():
        if param.requires_grad:
            trainable_total += param.numel()
    return trainable_total

def get_temp_filepath(filename_base, extension):
    """Build a timestamped file path inside ``TEMP_DIR``.

    Args:
        filename_base: Leading part of the file name.
        extension: File extension, with or without leading dots.

    Returns:
        ``TEMP_DIR/<filename_base>_<YYYYmmdd-HHMMSS>.<extension>``. The file
        itself is not created.
    """
    # Callers pass both "csv" and ".csv"; normalise so the dot is added once.
    suffix = extension.lstrip('.')
    stamp = time.strftime('%Y%m%d-%H%M%S')
    file_name = f"{filename_base}_{stamp}.{suffix}"
    return os.path.join(TEMP_DIR, file_name)


# --- PyTorch Model Definitions ---
class SimpleMLP(nn.Module):
    """Fully connected network assembled from a comma-separated layer spec.

    Args:
        input_dim: Number of input features; must be a positive ``int``.
        hidden_layers_str: Hidden layer widths such as ``"64,32"``. An empty,
            blank or non-string value yields a network with no hidden layers.
        output_dim: Width of the final linear layer.
        activation_fn_str: ``"relu"``, ``"tanh"`` or ``"sigmoid"``
            (case-insensitive); any other value falls back to ReLU.
        task_type: When ``"classification"``, the output is passed through
            Sigmoid (``output_dim == 1``) or Softmax (``output_dim > 1``), so
            the network emits probabilities rather than logits. Any other
            value leaves the final linear output untouched.

    Raises:
        ValueError: If ``input_dim`` is not a positive int, or the layer spec
            contains a non-integer or non-positive width.
    """

    def __init__(self, input_dim, hidden_layers_str, output_dim, activation_fn_str="relu", task_type="classification"):
        super().__init__()
        if not isinstance(input_dim, int) or input_dim <= 0:
            raise ValueError(f"Input dimension must be a positive integer, got {input_dim}")

        # Parse the layer spec; blank tokens (e.g. a trailing comma) are ignored.
        widths = []
        if isinstance(hidden_layers_str, str) and hidden_layers_str.strip():
            try:
                for token in hidden_layers_str.split(','):
                    token = token.strip()
                    if token:
                        widths.append(int(token))
                if not all(width > 0 for width in widths):
                    raise ValueError("Hidden layer units must be positive integers.")
            except ValueError as e:
                raise ValueError(f"Invalid hidden layer string '{hidden_layers_str}'. Error: {e}")

        activations = {"relu": nn.ReLU, "tanh": nn.Tanh, "sigmoid": nn.Sigmoid}

        modules = []
        fan_in = input_dim
        for width in widths:
            modules.append(nn.Linear(fan_in, width))
            # Unknown activation names deliberately fall back to ReLU.
            modules.append(activations.get(activation_fn_str.lower(), nn.ReLU)())
            fan_in = width

        modules.append(nn.Linear(fan_in, output_dim))

        if task_type == "classification":
            if output_dim == 1:
                modules.append(nn.Sigmoid())  # Binary
            elif output_dim > 1:
                modules.append(nn.Softmax(dim=-1))  # Multi-class

        self.network = nn.Sequential(*modules)

    def forward(self, x):
        """Run ``x`` through the stacked layers."""
        return self.network(x)

class SimpleCNN(nn.Module):
    """Small CNN: two conv -> ReLU -> max-pool stages followed by a two-layer head.

    Args:
        input_channels: Number of channels in the input images.
        img_size_wh: Two-element image size. NOTE: despite the ``_wh`` suffix
            the pair is unpacked as ``(height, width)``.
        num_classes: Width of the final linear layer.
        c_out1, k1, s1, p1: Output channels, kernel size, stride and padding
            of the first convolution.
        pool1_k, pool1_s: Kernel size and stride of the first max-pool.
        c_out2, k2, s2, p2: Same as above for the second convolution.
        pool2_k, pool2_s: Kernel size and stride of the second max-pool.
        fc_hidden: Width of the hidden fully connected layer.
        task_type: Only consulted when ``num_classes == 1``:
            ``"classification"`` applies a Sigmoid to the single output, any
            other value leaves it as a raw regression output.

    Raises:
        ValueError: If the configured kernels/strides shrink the feature map
            to a non-positive size for the given image size.
    """

    def __init__(self, input_channels, img_size_wh, num_classes=10,
                 c_out1=16, k1=3, s1=1, p1=1, pool1_k=2, pool1_s=2,
                 c_out2=32, k2=3, s2=1, p2=1, pool2_k=2, pool2_s=2,
                 fc_hidden=128, task_type="classification"):
        super().__init__()
        self.input_channels = input_channels
        self.img_h, self.img_w = img_size_wh
        self.num_classes = num_classes

        self.conv1 = nn.Conv2d(self.input_channels, c_out1, kernel_size=k1, stride=s1, padding=p1)
        self.relu1 = nn.ReLU()
        self.pool1 = nn.MaxPool2d(kernel_size=pool1_k, stride=pool1_s)

        h_out_conv1 = self._window_output_size(self.img_h, k1, s1, p1)
        w_out_conv1 = self._window_output_size(self.img_w, k1, s1, p1)
        h_pool1 = self._window_output_size(h_out_conv1, pool1_k, pool1_s)
        w_pool1 = self._window_output_size(w_out_conv1, pool1_k, pool1_s)

        self.conv2 = nn.Conv2d(c_out1, c_out2, kernel_size=k2, stride=s2, padding=p2)
        self.relu2 = nn.ReLU()
        self.pool2 = nn.MaxPool2d(kernel_size=pool2_k, stride=pool2_s)

        h_out_conv2 = self._window_output_size(h_pool1, k2, s2, p2)
        w_out_conv2 = self._window_output_size(w_pool1, k2, s2, p2)
        h_pool2 = self._window_output_size(h_out_conv2, pool2_k, pool2_s)
        w_pool2 = self._window_output_size(w_out_conv2, pool2_k, pool2_s)

        # Size of the flattened feature map fed to the first linear layer.
        self.flattened_size = c_out2 * h_pool2 * w_pool2
        if self.flattened_size <= 0:
            raise ValueError(f"Calculated flattened size is {self.flattened_size}. Check CNN params and image size. Conv1_out:({h_out_conv1},{w_out_conv1}), Pool1_out:({h_pool1},{w_pool1}), Conv2_out:({h_out_conv2},{w_out_conv2}), Pool2_out:({h_pool2},{w_pool2})")

        self.fc1 = nn.Linear(self.flattened_size, fc_hidden)
        self.relu3 = nn.ReLU()
        self.fc2 = nn.Linear(fc_hidden, num_classes)

        # task_type used to be read here without being defined anywhere, which
        # raised NameError for num_classes == 1; it is now a keyword argument.
        if num_classes > 1:
            self.final_activation = nn.Softmax(dim=1)
        elif num_classes == 1 and task_type == "classification":
            self.final_activation = nn.Sigmoid()
        else:  # Regression output from fc2
            self.final_activation = nn.Identity()

    @staticmethod
    def _window_output_size(size, kernel, stride, padding=0):
        """Output length of a conv/pool window along one spatial dimension."""
        return (size - kernel + 2 * padding) // stride + 1

    def forward(self, x):
        """Apply both conv stages, flatten, then the fully connected head."""
        x = self.pool1(self.relu1(self.conv1(x)))
        x = self.pool2(self.relu2(self.conv2(x)))
        x = x.view(-1, self.flattened_size)
        x = self.relu3(self.fc1(x))
        x = self.fc2(x)
        x = self.final_activation(x)
        return x

# --- Parameter Target Helpers ---
# Size buckets for targeting a neural-network parameter count. Each label maps
# to a (min_params, max_params) pair; suggest_mlp_layers_for_range() treats
# both bounds as inclusive when checking a candidate architecture.
PARAM_RANGES = collections.OrderedDict([ # Ordered for consistent UI
    ("Tiny (<10k)", (0, 10000)),
    ("Small (10k-50k)", (10000, 50000)),
    ("Medium (50k-250k)", (50000, 250000)),
    ("Large (250k-1M)", (250000, 1000000)),
])

def suggest_mlp_layers_for_range(input_dim, output_dim, target_range_str, current_logs=""):
    """Propose a hidden-layer spec whose parameter count lands in a target bucket.

    The midpoint of the selected ``PARAM_RANGES`` bucket is the target. A
    single hidden layer is tried first, then a two-layer layout; if neither
    estimate falls inside the bucket a rough heuristic layout is returned
    instead (its parameter count is not checked).

    Args:
        input_dim: Number of input features.
        output_dim: Number of network outputs.
        target_range_str: A key of ``PARAM_RANGES``.
        current_logs: Log text to append messages to.

    Returns:
        ``(layers, logs)`` where ``layers`` is a comma-separated width string
        (``""`` when the range or the dimensions are invalid).
    """
    log_text = current_logs
    if not target_range_str or target_range_str not in PARAM_RANGES:
        log_text += "Invalid parameter range selected for MLP suggestion.\n"
        return "", log_text

    lower, upper = PARAM_RANGES[target_range_str]
    midpoint = (lower + upper) // 2
    if input_dim <= 0 or output_dim <= 0:
        log_text += "Input/Output dims must be positive for MLP suggestion.\n"
        return "", log_text

    layout = ""
    io_total = input_dim + output_dim + 1e-6

    # Attempt 1: one hidden layer sized so that weights ~= midpoint.
    single_width = max(1, int(midpoint / io_total))
    single_params = (input_dim * single_width + single_width) + (single_width * output_dim + output_dim)

    if lower <= single_params <= upper and single_width > 0:
        layout = str(single_width)
        log_text += f"Suggested 1 hidden layer: {single_width} units (Est. Params: {single_params})\n"
    else:
        # Attempt 2: two hidden layers, skewed towards the larger of input/output.
        base = max(1, int(math.sqrt(midpoint / 2.0)))
        first = min(2048, max(1, int(base * (input_dim / io_total) * 2 + base / 2)))
        second = min(2048, max(1, int(base * (output_dim / io_total) * 2 + base / 2)))
        pair_params = (input_dim * first + first) + (first * second + second) + (second * output_dim + output_dim)

        if lower <= pair_params <= upper and first > 0 and second > 0:
            layout = f"{first},{second}"
            log_text += f"Suggested 2 hidden layers: {first},{second} units (Est. Params: {pair_params})\n"
        else:
            # Attempt 3: unverified heuristic keyed on the size of the target.
            if midpoint < 50000:
                width = max(1, int(midpoint / (input_dim + output_dim + 100)))
                layout = str(width) or "32"
            elif midpoint < 250000:
                width = max(1, int(math.sqrt(midpoint / 1.5)))
                if width > 0 and width // 2 > 0:
                    layout = f"{width},{width // 2}"
                else:
                    layout = "128,64"
            else:
                width = max(1, int(math.sqrt(midpoint / 2.0)))
                if width > 0 and width // 2 > 0:
                    layout = f"{width},{width},{width // 2}"
                else:
                    layout = "256,256,128"
            log_text += f"Fallback suggestion: {layout} (Verify params).\n"

    if not layout:
        layout = "64"
        log_text += "Defaulting to '64'.\n"
    return layout, log_text

def estimate_current_mlp_params(input_dim_str, hidden_layers_str, output_dim_str, current_logs=""):
    """Count the trainable parameters of the MLP described by the given fields.

    A throwaway ``SimpleMLP`` (default activation and task type) is built
    purely to count its parameters.

    Args:
        input_dim_str: Input feature count; anything ``int()`` accepts.
        hidden_layers_str: Comma-separated hidden layer widths.
        output_dim_str: Output width; anything ``int()`` accepts.
        current_logs: Log text to append error messages to.

    Returns:
        ``(text, logs)`` where ``text`` is the count formatted with thousands
        separators, a validation message, or ``"Error"`` if anything raised.
    """
    log_text = current_logs
    try:
        n_in = int(input_dim_str)
        n_out = int(output_dim_str)
        if min(n_in, n_out) <= 0:
            return "Input/Output dims must be > 0", log_text
        probe = SimpleMLP(n_in, hidden_layers_str, n_out)
        n_params = count_pytorch_parameters(probe)
        del probe
        return f"{n_params:,}", log_text
    except Exception as e:
        log_text += f"Error estimating MLP params: {e}\n"
        return "Error", log_text

def estimate_cnn_params(img_h_str, img_w_str, num_classes_str, current_logs=""):
    """Count the trainable parameters of a default ``SimpleCNN`` for an image size.

    The probe network always uses one input channel and ``SimpleCNN``'s
    default convolution/pooling settings.

    Args:
        img_h_str: Image height; anything ``int()`` accepts.
        img_w_str: Image width; anything ``int()`` accepts.
        num_classes_str: Number of output classes; anything ``int()`` accepts.
        current_logs: Log text to append error messages to.

    Returns:
        ``(text, logs)`` where ``text`` is the count formatted with thousands
        separators, a validation message, or ``"Error"`` if anything raised.
    """
    log_text = current_logs
    try:
        height = int(img_h_str)
        width = int(img_w_str)
        class_count = int(num_classes_str)
        if height <= 0 or width <= 0 or class_count <= 0:
            return "Image dims/classes must be > 0", log_text
        # Using default SimpleCNN params here. A real app would pass them.
        probe = SimpleCNN(input_channels=1, img_size_wh=(height, width), num_classes=class_count)
        n_params = count_pytorch_parameters(probe)
        del probe
        return f"{n_params:,}", log_text
    except Exception as e:
        log_text += f"Error estimating CNN params: {e}\n"
        return "Error", log_text

# --- Dataset and Preprocessing ---
def generate_dataset_backend(task_type, n_samples_str, n_features_str,
                             n_classes_or_informative_str, dataset_format,
                             ai_suggest_ds_shape, target_param_range_str, model_type_selection,
                             current_logs=""):
    """Generate a synthetic dataset, save it under ``TEMP_DIR`` and return it.

    Args:
        task_type: ``"Tabular Classification"``, ``"Tabular Regression"`` or
            ``"Basic Image Classification"``.
        n_samples_str: Requested row count; anything ``int()`` accepts.
        n_features_str: Requested feature count (tabular tasks only).
        n_classes_or_informative_str: Class count for classification tasks,
            informative-feature count for regression.
        dataset_format: ``".csv"``, ``".json"`` or ``".parquet"``; any other
            value is logged and saved as CSV.
        ai_suggest_ds_shape: When truthy, the three numeric inputs are
            replaced by heuristic defaults (they must still parse).
        target_param_range_str: ``PARAM_RANGES`` key used to scale the
            suggested shape for neural-network models.
        model_type_selection: Model label; a label containing ``"Network"``
            marks a neural network.
        current_logs: Log text to append messages to.

    Returns:
        ``(preview, data, logs, file_path)``. On success ``preview`` is
        ``df.head()`` and ``data`` the full DataFrame. On failure ``preview``
        and ``file_path`` are ``None`` and ``data`` is an error string.
    """
    logs = current_logs + "\n--- Generating Dataset ---\n"
    try:
        n_samples = int(n_samples_str)
        n_features = int(n_features_str)
        n_classes_or_informative = int(n_classes_or_informative_str)
    except (TypeError, ValueError):
        # TypeError covers None (an empty UI field); ValueError covers bad text.
        logs += "Invalid numbers for dataset generation.\n"
        return None, "Error", logs, None

    if ai_suggest_ds_shape:
        n_samples_sugg, n_features_sugg, n_classes_or_informative_sugg = 5000, 10, 2
        if task_type == "Tabular Regression":
            n_classes_or_informative_sugg = min(n_features_sugg // 2, 5)
        elif task_type == "Basic Image Classification":
            n_samples_sugg, n_features_sugg = 500, 0  # features not tabular

        # model_type_selection may be None when nothing is selected yet.
        is_nn = "Network" in (model_type_selection or "")
        if is_nn and target_param_range_str in PARAM_RANGES:
            min_p, max_p = PARAM_RANGES[target_param_range_str]
            avg_p = (min_p + max_p) / 2
            is_tabular = task_type.startswith("Tabular")
            if avg_p > 200000:
                n_samples_sugg = min(MAX_GENERATED_ROWS, n_samples_sugg * 2)
                if is_tabular:
                    n_features_sugg = min(MAX_GENERATED_COLS, n_features_sugg * 2)
            elif avg_p < 50000:
                n_samples_sugg = max(100, n_samples_sugg // 2)
                if is_tabular:
                    n_features_sugg = max(3, n_features_sugg // 2)

        n_samples, n_features, n_classes_or_informative = n_samples_sugg, n_features_sugg, n_classes_or_informative_sugg
        logs += f"AI Suggested Dataset: Samples={n_samples}, Feats={n_features}, Classes/Informative={n_classes_or_informative}\n"

    n_samples = max(10, min(n_samples, MAX_GENERATED_ROWS))
    if task_type.startswith("Tabular"):
        n_features = max(1, min(n_features, MAX_GENERATED_COLS))
    if n_samples > MAX_DATASET_ROWS_WARN:
        logs += f"Warning: Generating {n_samples} rows. May be slow.\n"

    try:
        if task_type == "Tabular Classification":
            n_cls = max(2, n_classes_or_informative)
            n_inf = max(1, n_features // 2)
            # make_classification requires
            #   n_classes * n_clusters_per_class <= 2 ** n_informative.
            # Half the features is not always enough, so widen n_informative,
            # then drop to one cluster per class, then cap the class count.
            clusters_per_class = 2  # make_classification's default
            if n_cls * clusters_per_class > 2 ** n_inf:
                needed_inf = (n_cls * clusters_per_class - 1).bit_length()  # ceil(log2(...))
                n_inf = min(n_features, max(n_inf, needed_inf))
                if n_cls * clusters_per_class > 2 ** n_inf:
                    clusters_per_class = 1
                if n_cls > 2 ** n_inf:
                    n_cls = 2 ** n_inf
                    logs += f"Warning: {n_features} feature(s) cannot separate the requested classes; using {n_cls} classes.\n"
                logs += f"Adjusted generator settings: n_informative={n_inf}, clusters_per_class={clusters_per_class}.\n"
            X_data, y_data = make_classification(
                n_samples=n_samples, n_features=n_features, n_informative=n_inf,
                n_redundant=max(0, n_features - n_inf) // 2, n_classes=n_cls,
                n_clusters_per_class=clusters_per_class, flip_y=0.05, random_state=42)
            df = pd.DataFrame(X_data, columns=[f'feature_{i}' for i in range(n_features)])
            df['target'] = y_data
        elif task_type == "Tabular Regression":
            n_inf = max(1, min(n_features, n_classes_or_informative))
            X_data, y_data = make_regression(n_samples=n_samples, n_features=n_features, n_informative=n_inf, noise=10, random_state=42)
            df = pd.DataFrame(X_data, columns=[f'feature_{i}' for i in range(n_features)])
            df['target'] = y_data
        elif task_type == "Basic Image Classification":
            # For SimpleCNN, let's generate 28x28 "images" (random noise)
            img_h, img_w = 28, 28
            num_pixels = img_h * img_w
            X_data = np.random.randint(0, 256, size=(n_samples, num_pixels), dtype=np.uint8)
            y_data = np.random.randint(0, max(2, n_classes_or_informative), n_samples)
            df = pd.DataFrame(X_data, columns=[f'pixel_{i}' for i in range(num_pixels)])
            df['target'] = y_data
            logs += f"Generated {img_h}x{img_w} Image placeholder data.\n"
        else:
            logs += f"Dataset generation for '{task_type}' not fully implemented.\n"
            return None, "Task not implemented", logs, None

        logs += f"Generated data: {df.shape}\n"
        file_path = get_temp_filepath("generated_dataset", dataset_format)
        if dataset_format == ".csv":
            df.to_csv(file_path, index=False)
        elif dataset_format == ".json":
            df.to_json(file_path, orient='records', lines=True)
        elif dataset_format == ".parquet":
            df.to_parquet(file_path, index=False)
        else:
            logs += f"Unsupported format {dataset_format}. Defaulting to CSV.\n"
            file_path = get_temp_filepath("generated_dataset", "csv")
            df.to_csv(file_path, index=False)
        logs += f"Dataset saved to {file_path}\n"
        return df.head(), df, logs, file_path
    except Exception:
        error_msg = f"Error generating dataset: {traceback.format_exc()}"
        logs += error_msg + "\n"
        return None, error_msg, logs, None

def preprocess_tabular_data(df_or_X, y_if_X_is_numpy, target_column_name, task_type, current_logs=""):
    """Impute, scale and encode tabular features and prepare the target.

    Numeric columns are mean-imputed and standardised; ``object`` columns are
    mode-imputed and one-hot encoded; any other column is passed through
    unchanged. The preprocessor is fitted on all rows supplied.

    Args:
        df_or_X: A DataFrame that contains the target column, or a 2-D numpy
            feature array.
        y_if_X_is_numpy: Target values; required when ``df_or_X`` is an array.
        target_column_name: Target column name (DataFrame input only).
        task_type: A task label ending in ``"Classification"`` label-encodes
            the target; anything else casts it to float.
        current_logs: Log text to append messages to.

    Returns:
        ``(X, y, preprocessor, logs, input_dim, output_dim_nn, feature_names)``.
        ``output_dim_nn`` is 1 for regression and for two-class problems,
        otherwise the number of classes.

    Raises:
        ValueError: If the target column is missing or the input types are
            not one of the two supported combinations.
    """
    log_text = current_logs

    if isinstance(df_or_X, pd.DataFrame):
        if target_column_name not in df_or_X.columns:
            raise ValueError(f"Target column '{target_column_name}' not found.")
        features = df_or_X.drop(target_column_name, axis=1)
        target = df_or_X[target_column_name]
    elif isinstance(df_or_X, np.ndarray) and y_if_X_is_numpy is not None:
        # Wrap the array in a temporary DataFrame so the column-based pipeline applies.
        generated_names = [f'feature_{i}' for i in range(df_or_X.shape[1])]
        features = pd.DataFrame(df_or_X, columns=generated_names)
        target = pd.Series(y_if_X_is_numpy)
    else:
        raise ValueError("Invalid input for preprocess_tabular_data.")

    numeric_cols = features.select_dtypes(include=np.number).columns.tolist()
    categorical_cols = features.select_dtypes(include='object').columns.tolist()
    log_text += f"Numerical: {numeric_cols}, Categorical: {categorical_cols}\n"

    numeric_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler()),
    ])
    categorical_steps = Pipeline([
        ('imputer', SimpleImputer(strategy='most_frequent')),
        # Dense output keeps the downstream numpy handling simple.
        ('onehot', OneHotEncoder(handle_unknown='ignore', sparse_output=False)),
    ])
    preprocessor = ColumnTransformer(
        transformers=[
            ('num', numeric_steps, numeric_cols),
            ('cat', categorical_steps, categorical_cols),
        ],
        remainder='passthrough',  # keep columns neither branch handles
    )

    X_processed_np = preprocessor.fit_transform(features)

    try:
        feature_names_out = preprocessor.get_feature_names_out()
    except AttributeError:
        # Older scikit-learn: rebuild the names from the one-hot encoder.
        onehot = preprocessor.named_transformers_['cat'].named_steps['onehot']
        if hasattr(onehot, 'get_feature_names_out'):
            encoded_names = onehot.get_feature_names_out(categorical_cols)
        elif hasattr(onehot, 'get_feature_names'):
            encoded_names = onehot.get_feature_names(categorical_cols)
        else:
            encoded_names = [f"cat_feat_{i}" for i in range(X_processed_np.shape[1] - len(numeric_cols))]
        feature_names_out = numeric_cols + list(encoded_names)

    processed_input_dim = X_processed_np.shape[1]
    log_text += f"Tabular data preprocessed. X shape: {X_processed_np.shape}, Processed input dim: {processed_input_dim}\n"

    if task_type.endswith("Classification"):
        encoder = LabelEncoder()
        y_processed_np = encoder.fit_transform(target)
        class_count = len(encoder.classes_)
        log_text += f"Target encoded. Classes: {class_count} ({encoder.classes_})\n"
        # Two classes map to a single output unit; more classes get one unit each.
        output_dim_nn = 1 if class_count == 2 else class_count
    else:
        y_processed_np = target.astype(float).values
        output_dim_nn = 1

    return X_processed_np, y_processed_np, preprocessor, log_text, processed_input_dim, output_dim_nn, feature_names_out


# --- Training Functions ---
def train_model_sklearn(data_input_obj, target_column, task_type, model_name, model_output_format, current_logs=""):
    """Train, evaluate and export a scikit-learn model on tabular data.

    The data is preprocessed with ``preprocess_tabular_data``, split 80/20,
    fitted, scored on the held-out part, and saved together with the fitted
    preprocessor as one pipeline.

    Args:
        data_input_obj: A DataFrame, or a path ending in ``.csv``, ``.json``
            (JSON lines) or ``.parquet``.
        target_column: Name of the target column.
        task_type: ``"Tabular Classification"`` or ``"Tabular Regression"``.
        model_name: Display name of the estimator to train.
        model_output_format: ``".pkl (Scikit-learn)"`` or ``".onnx (ONNX)"``;
            anything else is logged and saved as pickle.
        current_logs: Log text to append messages to.

    Returns:
        ``(logs, metrics, model_path, model_params)``. ``model_path`` is
        ``None`` when nothing was saved; ``metrics`` carries an error
        description on failure.
    """
    logs = current_logs + f"\n--- Training Scikit-learn Model: {model_name} ---\n"
    model_path_out = None
    metrics_out = "Training failed."
    model_params_out = "N/A"

    # --- Resolve the input into a DataFrame ---
    if isinstance(data_input_obj, pd.DataFrame):
        df = data_input_obj
    elif isinstance(data_input_obj, str):  # Filepath
        loaders = (
            ('.csv', lambda path: pd.read_csv(path)),
            ('.json', lambda path: pd.read_json(path, lines=True)),
            ('.parquet', lambda path: pd.read_parquet(path)),
        )
        try:
            loader = next((load for suffix, load in loaders if data_input_obj.endswith(suffix)), None)
            if loader is None:
                logs += f"Unsupported file: {data_input_obj}\n"
                return logs, "Error: Unsupported file.", None, "N/A"
            df = loader(data_input_obj)
        except Exception as e:
            logs += f"Error reading {data_input_obj}: {e}\n"
            return logs, f"Error reading: {e}", None, "N/A"
    else:
        logs += "Invalid data for training.\n"
        return logs, "Error: Invalid data.", None, "N/A"

    if target_column not in df.columns:
        logs += f"Target '{target_column}' not found.\n"
        return logs, f"Error: Target '{target_column}' not found.", None, "N/A"

    # --- Preprocess and split ---
    try:
        X_processed_np, y_processed_np, preprocessor, logs, _, _, feature_names = preprocess_tabular_data(
            df, None, target_column, task_type, logs)
    except ValueError as e:
        logs += f"Preprocessing error: {e}\n"
        return logs, f"Error: {e}", None, "N/A"

    X_train, X_test, y_train, y_test = train_test_split(X_processed_np, y_processed_np, test_size=0.2, random_state=42)
    logs += f"Train/Test split. Train: {X_train.shape}, Test: {X_test.shape}\n"

    # --- Pick the estimator; factories keep construction lazy ---
    model_factories = {
        ("Tabular Classification", "Logistic Regression"):
            lambda: LogisticRegression(max_iter=1000, random_state=42),
        ("Tabular Classification", "Random Forest Classifier"):
            lambda: RandomForestClassifier(random_state=42),
        # probability=True for ONNX if it needs predict_proba
        ("Tabular Classification", "Support Vector Machine (SVM) Classifier"):
            lambda: SVC(random_state=42, probability=True),
        ("Tabular Regression", "Linear Regression"):
            lambda: LinearRegression(),
        ("Tabular Regression", "Random Forest Regressor"):
            lambda: RandomForestRegressor(random_state=42),
        ("Tabular Regression", "Support Vector Machine (SVR) Regressor"):
            lambda: SVR(),
    }
    factory = model_factories.get((task_type, model_name))
    if factory is None:
        logs += f"Model {model_name} or task {task_type} not supported.\n"
        return logs, "Model/Task Error", None, "N/A"
    model = factory()

    try:
        logs += f"Starting training for {model_name}...\n"
        start_time = time.time()
        model.fit(X_train, y_train)
        logs += f"Training completed in {time.time() - start_time:.2f}s.\n"
        model_params_out = str(count_sklearn_parameters(model))
        logs += f"Est. Model Params: {model_params_out}\n"
        y_pred = model.predict(X_test)

        # --- Evaluate on the held-out split ---
        if task_type == "Tabular Classification":
            acc = accuracy_score(y_test, y_pred)
            report = classification_report(y_test, y_pred, zero_division=0)
            metrics_out = f"Accuracy: {acc:.4f}\n\nClassification Report:\n{report}"
        elif task_type == "Tabular Regression":
            mse = mean_squared_error(y_test, y_pred)
            r2 = r2_score(y_test, y_pred)
            metrics_out = f"Mean Squared Error: {mse:.4f}\nR2 Score: {r2:.4f}"
        logs += "\n--- Evaluation Metrics ---\n" + metrics_out + "\n"

        # --- Export: preprocessor + model as one inference pipeline ---
        full_pipeline_for_saving = Pipeline([('preprocessor', preprocessor), ('model', model)])
        model_filename_base = f"sklearn_{model_name.replace(' ', '_').lower()}"

        if model_output_format == ".pkl (Scikit-learn)":
            model_path_out = get_temp_filepath(model_filename_base, "pkl")
            joblib.dump(full_pipeline_for_saving, model_path_out)
            logs += f"Model (with preprocessor) saved to {model_path_out} as PKL.\n"

        elif model_output_format == ".onnx (ONNX)":
            model_path_out = get_temp_filepath(model_filename_base, "onnx")

            # Type of the already-preprocessed matrix. Not used by the export
            # below, which describes the raw (pre-preprocessing) columns instead
            # because the exported pipeline starts at the preprocessor.
            _processed_matrix_type = [('float_input', FloatTensorType([None, X_train.shape[1]]))]

            # One ONNX input per raw column: numeric -> float, text -> string.
            raw_X_for_types = df.drop(target_column, axis=1).infer_objects()
            onnx_initial_types = []
            for col_name in raw_X_for_types.columns:
                col_dtype = raw_X_for_types[col_name].dtype
                if pd.api.types.is_numeric_dtype(col_dtype):
                    # Forcing float32 for ONNX compatibility
                    onnx_initial_types.append((col_name, FloatTensorType([None, 1])))
                elif pd.api.types.is_string_dtype(col_dtype) or col_dtype == 'object':
                    onnx_initial_types.append((col_name, StringTensorType([None, 1])))
                else:
                    logs += f"Warning: Unsupported dtype {col_dtype} for column {col_name} in ONNX conversion. Skipping.\n"

            if not onnx_initial_types:
                logs += "Error: Could not determine ONNX initial types for raw input. Aborting ONNX export.\n"
                raise ValueError("ONNX initial types failed.")

            try:
                # Disable zipmap for classifier output
                options = {id(full_pipeline_for_saving): {'zipmap': False}}
                onnx_model = convert_sklearn(
                    full_pipeline_for_saving,
                    initial_types=onnx_initial_types,
                    target_opset=12,
                    options=options,
                )
                with open(model_path_out, "wb") as f:
                    f.write(onnx_model.SerializeToString())
                logs += f"Model (with preprocessor) saved to {model_path_out} as ONNX.\n"

                # Optional: verify that ONNX Runtime can load the exported file.
                sess = rt.InferenceSession(model_path_out, providers=rt.get_available_providers())
                logs += f"ONNX model loaded successfully with ONNX Runtime. Input names: {[inp.name for inp in sess.get_inputs()]}\n"
            except Exception:
                logs += f"Error during ONNX conversion/saving: {traceback.format_exc()}\n"
                model_path_out = None  # Clear path if saving failed
                metrics_out += "\nONNX EXPORT FAILED."

        else:
            logs += f"Unsupported format '{model_output_format}'. Saving as .pkl\n"
            model_path_out = get_temp_filepath(model_filename_base, "pkl")
            joblib.dump(full_pipeline_for_saving, model_path_out)

    except Exception:
        error_msg = f"Error during sklearn training/eval: {traceback.format_exc()}"
        logs += error_msg + "\n"
        metrics_out = error_msg
    return logs, metrics_out, model_path_out, model_params_out


def train_model_pytorch(data_input_obj, target_column, task_type, model_type_pt,
                        mlp_hidden_layers_str, mlp_activation,
                        # CNN specific (using defaults in SimpleCNN for now)
                        # cnn_img_h_str, cnn_img_w_str, # Now derived from data
                        epochs_str, batch_size_str, lr_str,
                        model_output_format, current_logs=""):
    """Train a PyTorch MLP or CNN on CPU and save the result to a temp file.

    Args:
        data_input_obj: Path to a ``.csv`` / ``.json`` (read as JSON lines) /
            ``.parquet`` file, a ``pandas.DataFrame``, or an ``(X, y)`` tuple
            of NumPy arrays.
        target_column: Name of the label column (used for file/DataFrame
            input and forwarded to ``preprocess_tabular_data``).
        task_type: UI task name, e.g. "Tabular Classification",
            "Tabular Regression" or "Basic Image Classification".
        model_type_pt: "Simple Neural Network (MLP)" or
            "Simple Convolutional Network (CNN)".
        mlp_hidden_layers_str: Hidden-layer spec forwarded to ``SimpleMLP``.
        mlp_activation: Activation name forwarded to ``SimpleMLP``.
        epochs_str: Number of epochs; must parse to an int > 0.
        batch_size_str: Batch size; must parse to an int > 0.
        lr_str: Learning rate; must parse to a float > 0.
        model_output_format: Requested format; anything other than
            ".pt (PyTorch)" falls back to saving a plain ``state_dict``.
        current_logs: Log text accumulated so far; new lines are appended.

    Returns:
        A 5-tuple ``(logs, metrics_out, model_path_out, model_params_out,
        plot_out)``. On failure ``metrics_out`` carries a short error
        message, ``model_path_out`` and ``plot_out`` are ``None``.
    """
    logs = current_logs + f"\n--- Training PyTorch Model: {model_type_pt} ---\n"
    model_path_out, metrics_out, model_params_out, plot_out = None, "Training failed.", "N/A", None

    # ---- 1. Resolve the input into a DataFrame or an (X, y) array pair ----
    df_for_pytorch = None
    X_numpy_for_pytorch = None
    y_numpy_for_pytorch = None
    if isinstance(data_input_obj, str):  # Filepath
        try:
            if data_input_obj.endswith('.csv'):
                df_for_pytorch = pd.read_csv(data_input_obj)
            elif data_input_obj.endswith('.json'):
                df_for_pytorch = pd.read_json(data_input_obj, lines=True)
            elif data_input_obj.endswith('.parquet'):
                df_for_pytorch = pd.read_parquet(data_input_obj)
            else:
                logs += f"Unsupported file: {data_input_obj}\n"
                return logs, "Error", None, "N/A", None
        except Exception as e:
            logs += f"Error reading {data_input_obj}: {e}\n"
            return logs, f"Error: {e}", None, "N/A", None
    elif isinstance(data_input_obj, pd.DataFrame):
        df_for_pytorch = data_input_obj
    elif (isinstance(data_input_obj, tuple) and len(data_input_obj) == 2
          and isinstance(data_input_obj[0], np.ndarray)
          and isinstance(data_input_obj[1], np.ndarray)):
        # Data was handed over as (X, y), e.g. from dataset generation.
        X_numpy_for_pytorch, y_numpy_for_pytorch = data_input_obj
    else:
        logs += "Invalid data for PyTorch training.\n"
        return logs, "Error", None, "N/A", None

    # ---- 2. Parse hyper-parameters ----
    try:
        epochs = int(epochs_str)
        batch_size = int(batch_size_str)
        lr = float(lr_str)
        if not (epochs > 0 and batch_size > 0 and lr > 0):
            raise ValueError("Params must be >0.")
    except (TypeError, ValueError) as e:
        # TypeError covers None / non-string inputs coming from empty UI fields.
        logs += f"Invalid training params: {e}\n"
        return logs, f"Error: {e}", None, "N/A", None

    processed_input_dim_actual = -1
    nn_output_dim_actual = -1
    preprocessor_pipeline = None
    X_processed_np = None
    y_processed_np = None

    # ---- 3. Model-specific preprocessing ----
    if model_type_pt == "Simple Neural Network (MLP)":
        if not task_type.startswith("Tabular"):
            logs += "MLP requires Tabular task.\n"
            return logs, "MLP Task Error", None, "N/A", None
        try:
            # Pass df_for_pytorch or (X_numpy_for_pytorch, y_numpy_for_pytorch)
            data_arg1 = df_for_pytorch if df_for_pytorch is not None else X_numpy_for_pytorch
            data_arg2 = y_numpy_for_pytorch if df_for_pytorch is None else None
            (X_processed_np, y_processed_np, preprocessor_pipeline, logs,
             processed_input_dim_actual, nn_output_dim_actual, _) = \
                preprocess_tabular_data(data_arg1, data_arg2, target_column, task_type, logs)
        except ValueError as e:
            logs += f"MLP Preprocessing error: {e}\n"
            return logs, f"Error: {e}", None, "N/A", None

    elif model_type_pt == "Simple Convolutional Network (CNN)":
        if task_type != "Basic Image Classification":
            logs += "Warning: CNN selected, but task is not Basic Image Classification. Output may be unexpected.\n"

        if df_for_pytorch is not None:
            if target_column not in df_for_pytorch.columns:
                logs += f"Target '{target_column}' not found for CNN.\n"
                return logs, "CNN Target Error", None, "N/A", None
            X_raw = df_for_pytorch.drop(target_column, axis=1).values
            y_raw = df_for_pytorch[target_column].values
        elif X_numpy_for_pytorch is not None and y_numpy_for_pytorch is not None:
            X_raw = X_numpy_for_pytorch
            y_raw = y_numpy_for_pytorch
        else:
            logs += "No valid data found for CNN.\n"
            return logs, "CNN Data Error", None, "N/A", None

        try:
            le = LabelEncoder()
            y_processed_np = le.fit_transform(y_raw)
            nn_output_dim_actual = len(le.classes_)
            if nn_output_dim_actual == 2:
                nn_output_dim_actual = 1  # Binary output for NN

            input_channels = 1  # Only single-channel images are supported here.
            if X_raw.ndim < 2:
                raise ValueError(
                    f"CNN expects at least 2-D input (samples x pixels), got shape {X_raw.shape}.")
            if X_raw.ndim == 3:
                # Already (samples, height, width): keep the real geometry.
                img_h, img_w = X_raw.shape[1], X_raw.shape[2]
            else:
                # Flattened samples: the pixel count must be a perfect square.
                # Guessing a default size here would either fail in reshape or
                # silently change the sample count and misalign X with y.
                pixels_per_sample = int(np.prod(X_raw.shape[1:]))
                side = math.isqrt(pixels_per_sample)
                if side == 0 or side * side != pixels_per_sample:
                    raise ValueError(
                        f"Cannot infer square image from {pixels_per_sample} pixels per sample.")
                img_h = img_w = side

            # Reshape and normalize (basic); sample count is kept explicit.
            X_processed_np = X_raw.reshape(
                X_raw.shape[0], input_channels, img_h, img_w).astype(np.float32) / 255.0
        except (TypeError, ValueError) as e:
            logs += f"CNN Preprocessing error: {e}\n"
            return logs, f"Error: {e}", None, "N/A", None

        processed_input_dim_actual = (input_channels, img_h, img_w)  # For CNN constructor
        logs += f"CNN Data: X reshaped to {X_processed_np.shape}, y: {y_processed_np.shape}, NN Output Dim: {nn_output_dim_actual}\n"
    else:
        logs += f"Unknown PyTorch model: {model_type_pt}\n"
        return logs, "Unknown PyTorch model", None, "N/A", None

    # Treat CNN as classification regardless of the selected task so that the
    # label tensor layout below always matches the loss chosen further down.
    is_classification_task = (task_type.endswith("Classification")
                              or model_type_pt == "Simple Convolutional Network (CNN)")
    uses_class_indices = is_classification_task and nn_output_dim_actual != 1

    # ---- 4. Tensors and data loader ----
    try:
        X_tensor = torch.tensor(X_processed_np, dtype=torch.float32)
        if uses_class_indices:
            # CrossEntropyLoss: integer class indices of shape [N].
            y_tensor = torch.tensor(y_processed_np, dtype=torch.long)
        else:
            # BCELoss / MSELoss: float targets of shape [N, 1].
            y_tensor = torch.tensor(y_processed_np, dtype=torch.float32).unsqueeze(1)

        dataset = TensorDataset(X_tensor, y_tensor)
        # Use num_workers=0 on free tier to avoid issues with multiprocessing
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=0)
    except Exception as data_e:
        logs += f"Error preparing PyTorch tensors: {traceback.format_exc()}\n"
        return logs, f"Tensor Preparation Error: {data_e}", None, "N/A", None

    # ---- 5. Model construction ----
    pytorch_model = None
    try:
        if model_type_pt == "Simple Neural Network (MLP)":
            pytorch_model = SimpleMLP(
                input_dim=processed_input_dim_actual,
                hidden_layers_str=mlp_hidden_layers_str,
                output_dim=nn_output_dim_actual,
                activation_fn_str=mlp_activation,
                task_type="classification" if task_type.endswith("Classification") else "regression")
        elif model_type_pt == "Simple Convolutional Network (CNN)":
            channels, h, w = processed_input_dim_actual
            pytorch_model = SimpleCNN(input_channels=channels, img_size_wh=(h, w),
                                      num_classes=nn_output_dim_actual)
    except Exception as model_e:
        logs += f"Error creating PyTorch model: {traceback.format_exc()}\n"
        return logs, f"Model Creation Error: {model_e}", None, "N/A", None

    if pytorch_model is None:
        logs += "Failed to instantiate PyTorch model.\n"
        return logs, "Model instantiate fail", None, "N/A", None

    model_params_val = count_pytorch_parameters(pytorch_model)
    model_params_out = f"{model_params_val:,}"
    logs += f"PyTorch Model: {model_params_out} params.\n"
    if model_params_val > 500000:
        logs += "Warning: >500k params on CPU will be SLOW.\n"

    if is_classification_task:
        criterion = nn.BCELoss() if nn_output_dim_actual == 1 else nn.CrossEntropyLoss()
    else:  # Regression
        criterion = nn.MSELoss()
    optimizer = optim.Adam(pytorch_model.parameters(), lr=lr)

    # ---- 6. Training and simplified evaluation ----
    epoch_losses = []
    avg_epoch_loss = 0.0
    try:
        logs += f"Starting PyTorch training for {epochs} epochs...\n"
        start_time = time.time()
        pytorch_model.train()
        for epoch in range(epochs):
            epoch_loss_sum = 0.0
            num_batches = 0
            for batch_X, batch_y in dataloader:
                optimizer.zero_grad()
                outputs = pytorch_model(batch_X)
                loss = criterion(outputs, batch_y)
                loss.backward()
                optimizer.step()
                epoch_loss_sum += loss.item()
                num_batches += 1
            avg_epoch_loss = epoch_loss_sum / num_batches if num_batches > 0 else 0
            epoch_losses.append(avg_epoch_loss)
            logs += f"Epoch {epoch+1}/{epochs}, Avg Loss: {avg_epoch_loss:.4f}\n"

        training_time = time.time() - start_time
        logs += f"PyTorch training completed in {training_time:.2f} seconds.\n"

        # Simplified evaluation: report the final training loss and, for
        # classification, accuracy on ONE freshly drawn (shuffled) training
        # batch. A proper eval loop on a held-out test set would be better.
        pytorch_model.eval()
        with torch.no_grad():
            if is_classification_task:
                if len(dataset) > 0:
                    try:
                        sample_X, sample_y = next(iter(dataloader))  # Get one batch
                        outputs = pytorch_model(sample_X)
                        if nn_output_dim_actual == 1:  # Binary
                            predicted = (outputs > 0.5).float()
                        else:  # Multi-class
                            _, predicted = torch.max(outputs.data, 1)
                        correct = (predicted == sample_y.view_as(predicted)).sum().item()
                        total = sample_y.size(0)
                        acc = correct / total if total > 0 else 0
                        metrics_out = f"Final Training Loss: {avg_epoch_loss:.4f}\nApprox. Accuracy on a batch: {acc*100:.2f}% (Note: Proper eval needs a test set)"
                    except StopIteration:  # Dataloader was empty
                        metrics_out = f"Final Training Loss: {avg_epoch_loss:.4f}\n (Dataloader empty, cannot get batch accuracy)"
                else:
                    metrics_out = f"Final Training Loss: {avg_epoch_loss:.4f}\n (No data for batch accuracy)"
            else:  # Regression
                metrics_out = f"Final Training Loss (MSE): {avg_epoch_loss:.4f}"
    except Exception as train_e:
        # Mirror the sklearn trainer: surface the traceback in the logs and
        # return the error tuple instead of letting the exception escape.
        logs += f"Error during PyTorch training/eval: {traceback.format_exc()}\n"
        return logs, f"Training Error: {train_e}", None, model_params_out, None
    logs += "\n--- PyTorch Metrics (Simplified) ---\n" + metrics_out + "\n"

    # ---- 7. Loss plot ----
    if epoch_losses:
        import matplotlib.pyplot as plt
        fig, ax = plt.subplots()
        ax.plot(range(1, epochs + 1), epoch_losses, marker='o')
        ax.set_xlabel("Epoch")
        ax.set_ylabel("Average Loss")
        ax.set_title("Training Loss Curve")
        plot_out = fig  # Gradio can display matplotlib figures
        logs += "Loss curve generated.\n"

    # ---- 8. Save model (and preprocessor if MLP) ----
    model_filename_base = f"pytorch_{model_type_pt.replace(' ', '_').lower()}"
    if model_output_format == ".pt (PyTorch)":
        model_path_out = get_temp_filepath(model_filename_base, "pt")
        if model_type_pt == "Simple Neural Network (MLP)" and preprocessor_pipeline:
            # Bundle everything needed to rebuild the MLP and preprocess inputs.
            torch.save({
                'model_state_dict': pytorch_model.state_dict(),
                'preprocessor': preprocessor_pipeline,
                'input_dim': processed_input_dim_actual,  # From preprocessing
                'output_dim': nn_output_dim_actual,  # From preprocessing
                'hidden_layers_str': mlp_hidden_layers_str,
                'activation_fn': mlp_activation,
                'task_type': task_type
            }, model_path_out)
            logs += f"PyTorch MLP (model + preprocessor) saved to {model_path_out}\n"
        else:  # CNN or MLP without preprocessor explicitly bundled (less common)
            torch.save(pytorch_model.state_dict(), model_path_out)
            logs += f"PyTorch {model_type_pt} (model state_dict) saved to {model_path_out}\n"
    # Add ONNX export for PyTorch later if needed (torch.onnx.export)
    else:
        logs += f"Unsupported format '{model_output_format}' for PyTorch. Saving as .pt\n"
        model_path_out = get_temp_filepath(model_filename_base, "pt")
        torch.save(pytorch_model.state_dict(), model_path_out)  # Fallback to state_dict

    return logs, metrics_out, model_path_out, model_params_out, plot_out


# --- Gradio UI Definition ---
# Option lists shared by the dropdowns and by the update callbacks below.
TASK_CHOICES = [
    "Tabular Classification",
    "Tabular Regression",
    "Basic Image Classification",
]  # Simple Text removed for focus
MODEL_FAMILIES = [
    "Scikit-learn (Classical ML)",
    "PyTorch (Neural Networks)",
]
SKLEARN_MODELS_CLASSIFICATION = [
    "Logistic Regression",
    "Random Forest Classifier",
    "Support Vector Machine (SVM) Classifier",
]
SKLEARN_MODELS_REGRESSION = [
    "Linear Regression",
    "Random Forest Regressor",
    "Support Vector Machine (SVR) Regressor",
]
# Order matters: index 0 is the tabular model, index 1 the image model.
PYTORCH_MODELS = [
    "Simple Neural Network (MLP)",
    "Simple Convolutional Network (CNN)",
]
DATASET_FORMATS = [
    ".csv",
    ".json",
    ".parquet",
]
MODEL_OUTPUT_FORMATS_SKLEARN = [
    ".pkl (Scikit-learn)",
    ".onnx (ONNX)",
]
MODEL_OUTPUT_FORMATS_PYTORCH = [
    ".pt (PyTorch)",
]  # ".onnx (ONNX)" can be added later
MLP_ACTIVATIONS = [
    "relu",
    "tanh",
    "sigmoid",
]

# TODO: the body of this guide is still a placeholder; fill in the real
# cloning/upgrade instructions.
CLONE_GUIDE_TEXT = """
## How to Clone & Upgrade This Space for More Power:
(Instructions as provided in previous response - omitted here for brevity but should be included)
"""

def update_model_options(task_choice, model_family_choice):
    """Return a Gradio update for the "specific model" dropdown.

    The offered models depend on both the task and the model family; when
    the combination has no models the dropdown is emptied and hidden.
    """
    available = None

    if model_family_choice == "Scikit-learn (Classical ML)":
        # Sklearn is only offered for the two tabular tasks.
        if task_choice == "Tabular Classification":
            available = SKLEARN_MODELS_CLASSIFICATION
        elif task_choice == "Tabular Regression":
            available = SKLEARN_MODELS_REGRESSION
    elif model_family_choice == "PyTorch (Neural Networks)":
        if task_choice.startswith("Tabular"):
            available = [PYTORCH_MODELS[0]]  # Only MLP for tabular
        elif task_choice == "Basic Image Classification":
            available = [PYTORCH_MODELS[1]]  # Only CNN for image

    if available is None:
        return gr.update(choices=[], value=None, visible=False)
    return gr.update(choices=available, value=available[0], visible=True)

def update_param_range_visibility(model_family_choice):
    """Show the parameter-range control only for the PyTorch model family."""
    is_pytorch_family = model_family_choice == "PyTorch (Neural Networks)"
    return gr.update(visible=is_pytorch_family)

def update_pytorch_specific_options_visibility(model_choice_pytorch):
    """Return visibility updates for the (MLP group, CNN group) option panels.

    Each panel is visible only when its own model is the selected one.
    """
    mlp_group_update = gr.update(
        visible=(model_choice_pytorch == "Simple Neural Network (MLP)"))
    cnn_group_update = gr.update(
        visible=(model_choice_pytorch == "Simple Convolutional Network (CNN)"))
    return mlp_group_update, cnn_group_update

def update_model_output_formats(model_family_choice):
    """Return the export-format dropdown update for the given model family.

    The first format of the matching list becomes the selected value; an
    unrecognised family clears the dropdown.
    """
    if model_family_choice == "Scikit-learn (Classical ML)":
        formats = MODEL_OUTPUT_FORMATS_SKLEARN
    elif model_family_choice == "PyTorch (Neural Networks)":
        formats = MODEL_OUTPUT_FORMATS_PYTORCH
    else:
        return gr.update(choices=[], value=None)
    return gr.update(choices=formats, value=formats[0])


# Custom stylesheet handed to gr.Blocks(css=...): sets the container font,
# renders .gr-button elements black with white text, and rounds the corners
# of .gr-input / .gr-output elements.
css = """
.gradio-container { font-family: 'IBM Plex Sans', sans-serif; }
.gr-button { color: white; border-color: black; background: black; }
.gr-input { border-radius: 8px; }
.gr-output { border-radius: 8px; }
"""

# Build the whole UI (layout, nested callbacks and event wiring) inside one Blocks context.
with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue", secondary_hue="orange"), css=css) as demo:
    gr.Markdown("# 🧠 Universal AI Model Trainer (CPU Edition)")
    gr.Markdown("Create, train, and download AI models. Optimized for CPU - expect longer training for complex models.")
    
    # State holding the dataset handed from the dataset tab to training:
    # written by the generate-dataset handler (data object) and by the upload
    # handler (file path), read by the train handler.
    # This helps pass data between dataset generation and training without re-upload
    # For DataFrames, it's better to pass them directly if possible, or save/load paths.
    generated_data_state = gr.State(None) 
    # Intended to accumulate logs across actions.
    # NOTE(review): no handler in this section lists this state as an output,
    # so it keeps its initial empty-string value.
    current_logs_state = gr.State("") # To accumulate logs

    # All tabs of the app live under this container.
    with gr.Tabs():
        with gr.TabItem("1. Define Task & Model"):
            # Task type and model family sit side by side; the specific-model
            # dropdown below is populated by the update_model_options callback.
            with gr.Row():
                task_type_dd = gr.Dropdown(
                    TASK_CHOICES,
                    value=TASK_CHOICES[0],
                    label="Select Task Type",
                )
                model_family_dd = gr.Dropdown(
                    MODEL_FAMILIES,
                    value=MODEL_FAMILIES[0],
                    label="Select Model Family",
                )

            model_specific_dd = gr.Dropdown(interactive=True, label="Select Specific Model")

            # Parameter-range selector: hidden until the PyTorch family is chosen.
            pytorch_param_range_dd = gr.Dropdown(
                list(PARAM_RANGES),
                value=list(PARAM_RANGES)[1],
                label="Target Parameter Range (for NNs)",
                info="Guides NN architecture suggestions. Training >250k params on CPU is slow.",
                visible=False,
            )

            # MLP-only controls. Input and output dimensions are not entered
            # here: they depend on the dataset, so the parameter estimate is
            # filled in by the suggestion button or reported after training.
            with gr.Group(visible=False) as pt_mlp_specific_group:
                gr.Markdown("#### MLP Configuration")
                pt_mlp_hidden_layers_txt = gr.Textbox(
                    value="64,32",
                    label="Hidden Layer Sizes (comma-separated, e.g., 128,64)",
                )
                pt_mlp_activation_dd = gr.Dropdown(
                    MLP_ACTIVATIONS,
                    value="relu",
                    label="Activation Function",
                )
                pt_mlp_suggest_btn = gr.Button("Suggest MLP Layers for Target Range")
                pt_mlp_param_count_txt = gr.Textbox(
                    interactive=False,
                    label="Estimated MLP Parameters",
                )

            # CNN-only controls (placeholder). A parameter estimate would need
            # image height/width and class count from the dataset step.
            with gr.Group(visible=False) as pt_cnn_specific_group:
                gr.Markdown("#### CNN Configuration (Simplified for Demo)")
                gr.Markdown(
                    "SimpleCNN uses fixed architecture for now (2 conv layers, 1 FC). Parameters mainly come from image size/classes."
                )
                pt_cnn_param_count_txt = gr.Textbox(
                    interactive=False,
                    label="Estimated CNN Parameters",
                )


        with gr.TabItem("2. Configure Dataset"):
            # The two option strings are matched verbatim by toggle_dataset_source_groups.
            dataset_source_rb = gr.Radio(["Generate new dataset", "Upload my own dataset (CSV, JSON, Parquet)"],
                                         label="Dataset Source", value="Generate new dataset")

            # Synthetic-data controls; this group is the one shown by default.
            with gr.Group(visible=True) as generate_dataset_group:
                gr.Markdown("#### Generate Synthetic Dataset")
                with gr.Row():
                    # precision=0: these are counts, so have Gradio round them
                    # and pass integers to the generation handler.
                    ds_gen_samples_num = gr.Number(label="Number of Rows (Samples)", value=1000, precision=0)
                    ds_gen_features_num = gr.Number(label="Number of Features (Columns, if tabular)", value=10, precision=0)
                ds_gen_classes_informative_num = gr.Number(label="Num Classes (for Classification) / Num Informative Features (for Regression)", value=2, precision=0)
                ds_gen_ai_suggest_cb = gr.Checkbox(label="Let AI suggest optimal rows/columns based on model type & param range?", value=False)
                ds_gen_format_dd = gr.Dropdown(DATASET_FORMATS, label="Generated Dataset Download Format", value=".csv")
                generate_dataset_btn = gr.Button("Generate & Preview Dataset", variant="secondary")

            # Upload controls; hidden until the radio switches to the upload option.
            with gr.Group(visible=False) as upload_dataset_group:
                gr.Markdown("#### Upload Dataset")
                ds_upload_file = gr.File(label="Upload your dataset file", file_types=[".csv", ".json", ".parquet"])

            # Shared by both sources: target column, preview table and the
            # download slot for a generated dataset file.
            target_column_name_txt = gr.Textbox(label="Target Column Name (Case-Sensitive)", placeholder="e.g., 'target' or 'label'")
            dataset_preview_df = gr.DataFrame(label="Dataset Preview (First 5 Rows)", interactive=False)
            generated_dataset_download_file = gr.File(label="Download Generated Dataset", interactive=False)

        with gr.TabItem("3. Train Model & Get Results"):
            gr.Markdown("Ensure Model and Dataset are configured before training.")
            with gr.Row():
                # Training hyperparameters, mainly for the PyTorch networks;
                # Scikit-learn models run with their defaults.
                # precision=0: epochs and batch size are counts, so have Gradio
                # round them and pass integers to the training handler.
                train_epochs_num = gr.Number(label="Epochs (for NNs)", value=10, precision=0)
                train_batch_size_num = gr.Number(label="Batch Size (for NNs)", value=32, precision=0)
                # Learning rate stays a float.
                train_learning_rate_num = gr.Number(label="Learning Rate (for NNs)", value=0.001)

            # Starts with the Scikit-learn formats; update_model_output_formats
            # swaps the choices when the model family changes.
            model_output_format_dd = gr.Dropdown(label="Select Model Output Format", choices=MODEL_OUTPUT_FORMATS_SKLEARN, value=MODEL_OUTPUT_FORMATS_SKLEARN[0])
            train_model_btn = gr.Button("🚀 Train Model", variant="primary")

            gr.Markdown("---")
            gr.Markdown("### Training Progress & Results")
            # Output widgets filled by the handlers wired further down.
            training_log_txt = gr.Textbox(label="Training Log & Status", lines=15, interactive=False, max_lines=50)
            model_param_count_output_txt = gr.Textbox(label="Actual Trained Model Parameters", interactive=False)
            evaluation_metrics_txt = gr.Textbox(label="Evaluation Metrics", lines=7, interactive=False)
            loss_plot_img = gr.Plot(label="Training Loss Curve (PyTorch NNs)")
            download_trained_model_file = gr.File(label="Download Trained Model", interactive=False)

        with gr.TabItem("ℹ️ Guide & Info"):
            # Static help text: a heading, five bullet notes (one Markdown
            # component each), then the clone/upgrade guide.
            gr.Markdown("### Using This Space")
            gr.Markdown(
                "- **Free CPU Tier:** Training large or complex models will be slow. Memory is also limited (around 15GB RAM)."
            )
            gr.Markdown(
                "- **Workflow:** 1. Define Task/Model -> 2. Configure Dataset -> 3. Train."
            )
            gr.Markdown(
                "- **Dataset Generation:** For 'Basic Image Classification', random pixel data is generated (not real images)."
            )
            gr.Markdown(
                "- **Parameters:** For Neural Networks, the 'Target Parameter Range' helps suggest architectures. 1M params is already large for CPU training."
            )
            gr.Markdown(
                "- **ONNX Export (Scikit-learn):** Converts Scikit-learn pipelines (preprocessor + model) to ONNX. Input to the ONNX model should be raw data matching the original training DataFrame structure."
            )
            gr.Markdown(CLONE_GUIDE_TEXT)
            
    # --- Event Handlers ---
    # Update model choices based on task and family
    task_type_dd.change(fn=update_model_options, inputs=[task_type_dd, model_family_dd], outputs=model_specific_dd)
    model_family_dd.change(fn=update_model_options, inputs=[task_type_dd, model_family_dd], outputs=model_specific_dd)
    # The specific-model dropdown is created without choices and the .change
    # listeners above only run once the user edits a dropdown, so populate it
    # for the default task/family as soon as the page loads.
    demo.load(fn=update_model_options, inputs=[task_type_dd, model_family_dd], outputs=model_specific_dd)

    # Show/hide PyTorch parameter range dropdown
    model_family_dd.change(fn=update_param_range_visibility, inputs=model_family_dd, outputs=pytorch_param_range_dd)
    
    # Show/hide the PyTorch-only widgets (parameter range, MLP group, CNN group).
    # model_specific_dd doubles as the PyTorch model selector, so its value is
    # only meaningful here while the PyTorch family is selected.
    def combined_pytorch_ui_update(model_family_choice, pytorch_model_choice):
        """Return visibility updates for (param range, MLP group, CNN group).

        Everything is hidden unless the PyTorch family is selected; within
        that family the MLP / CNN group is shown when its model name is chosen.
        """
        uses_pytorch = model_family_choice == "PyTorch (Neural Networks)"
        show_mlp = uses_pytorch and pytorch_model_choice == "Simple Neural Network (MLP)"
        show_cnn = uses_pytorch and pytorch_model_choice == "Simple Convolutional Network (CNN)"
        return tuple(gr.update(visible=flag) for flag in (uses_pytorch, show_mlp, show_cnn))

    # Re-evaluate whenever either the family or the specific model changes.
    model_family_dd.change(
        fn=combined_pytorch_ui_update,
        inputs=[model_family_dd, model_specific_dd],
        outputs=[pytorch_param_range_dd, pt_mlp_specific_group, pt_cnn_specific_group],
    )
    model_specific_dd.change(
        fn=combined_pytorch_ui_update,
        inputs=[model_family_dd, model_specific_dd],
        outputs=[pytorch_param_range_dd, pt_mlp_specific_group, pt_cnn_specific_group],
    )

    # Suggest MLP Layers
    def mlp_suggest_proxy(target_range_str, current_logs, dataset_preview_df, target_col_name, task_type):
        """Suggest hidden-layer sizes for the selected parameter range.

        Input/output dimensions are estimated from the dataset preview when
        one is available (numeric columns count once, object columns count as
        up to 10 one-hot features each); otherwise defaults are used.

        Args:
            target_range_str: Selected entry of the parameter-range dropdown.
            current_logs: Log text to append to (may be empty or None).
            dataset_preview_df: Current value of the preview table.
            target_col_name: Name of the target column in the preview.
            task_type: Selected task type string.

        Returns:
            Tuple ``(suggested_layers, logs, param_count)`` for the hidden
            layer textbox, the log textbox and the parameter-count textbox.
        """
        logs = current_logs or ""
        is_classification = bool(task_type) and task_type.endswith("Classification")
        input_dim_est = 10  # default if no data
        output_dim_est = 2 if is_classification else 1  # default

        preview = dataset_preview_df
        if isinstance(preview, pd.DataFrame) and not preview.empty and target_col_name:
            try:
                # Simplified estimate; a full preprocessing run is too heavy here.
                features = preview.drop(target_col_name, axis=1, errors='ignore')
                num_cols = len(features.select_dtypes(include=np.number).columns)
                cat_cols = features.select_dtypes(include='object').columns
                # Rough estimate of one-hot encoded features, capped per column.
                one_hot_est = sum(min(10, features[col].nunique()) for col in cat_cols)
                est_input = max(1, num_cols + one_hot_est)  # Ensure > 0

                est_output = output_dim_est
                if is_classification:
                    est_output = max(1, preview[target_col_name].nunique())
                    if est_output == 2:
                        est_output = 1  # For binary an output of 1 is common in NNs

                # Commit only after every step succeeded, so a failure really
                # leaves the defaults in place as the log message below says.
                input_dim_est, output_dim_est = est_input, est_output
                logs += f"Estimated input_dim: {input_dim_est}, output_dim: {output_dim_est} for MLP suggestion.\n"
            except Exception as e:
                # Best-effort estimate: report the problem and keep the defaults.
                logs += f"Could not estimate dims from preview for MLP suggestion: {e}. Using defaults.\n"
        else:
            logs += "Dataset preview not available for MLP dimension estimation. Using defaults.\n"

        suggested_str, logs = suggest_mlp_layers_for_range(input_dim_est, output_dim_est, target_range_str, logs)

        # Also estimate params for the suggestion
        param_count_str = "Error"
        if suggested_str:
            param_count_str, logs = estimate_current_mlp_params(str(input_dim_est), suggested_str, str(output_dim_est), logs)

        return suggested_str, logs, param_count_str

    # The log textbox is both input and output so suggestion messages are
    # appended to what is already displayed (same pattern as the train button)
    # instead of starting from an always-empty state.
    pt_mlp_suggest_btn.click(
        fn=mlp_suggest_proxy,
        inputs=[pytorch_param_range_dd, training_log_txt, dataset_preview_df, target_column_name_txt, task_type_dd],
        outputs=[pt_mlp_hidden_layers_txt, training_log_txt, pt_mlp_param_count_txt]
    )
    
    # Estimate MLP params when hidden layers text changes (might be too slow if hooked to .change)
    # A button is safer for this. For now, rely on suggestion button or post-training report.
    # We can add an "Estimate Current MLP Params" button if needed.

    # Switch between the "generate" and "upload" control groups.
    def toggle_dataset_source_groups(source_choice):
        """Return visibility updates for (generate group, upload group).

        Each group is visible only when the radio value equals its option text.
        """
        show_generate = source_choice == "Generate new dataset"
        show_upload = source_choice == "Upload my own dataset (CSV, JSON, Parquet)"
        return gr.update(visible=show_generate), gr.update(visible=show_upload)

    dataset_source_rb.change(
        fn=toggle_dataset_source_groups,
        inputs=dataset_source_rb,
        outputs=[generate_dataset_group, upload_dataset_group],
    )

    # Keep the export-format dropdown in step with the selected model family.
    model_family_dd.change(
        fn=update_model_output_formats,
        inputs=model_family_dd,
        outputs=model_output_format_dd,
    )

    # Dataset Generation Button
    def generate_dataset_wrapper(task_type, n_samples, n_features, n_classes_info, ds_format, ai_sugg, param_range, model_type, logs_in):
        """Run dataset generation and route its results to the UI.

        Thin adapter around ``generate_dataset_backend``: all arguments are
        forwarded unchanged (a missing log string is treated as empty).

        Returns:
            Tuple ``(preview, data_obj, logs, file)`` for the preview table,
            the dataset state, the log textbox and the download slot. The
            generated data object itself is kept in state so training can use
            it without another disk round trip; it is ``None`` when the
            backend returned no data.
        """
        preview, data_obj, logs_out, file_out = generate_dataset_backend(
            task_type, n_samples, n_features, n_classes_info, ds_format, ai_sugg, param_range, model_type, logs_in or ""
        )
        return preview, data_obj, logs_out, file_out

    # The log textbox is both input and output so generation messages are
    # appended to what is already displayed (same pattern as the train button)
    # instead of starting from an always-empty state.
    generate_dataset_btn.click(
        fn=generate_dataset_wrapper,
        inputs=[task_type_dd, ds_gen_samples_num, ds_gen_features_num, ds_gen_classes_informative_num,
                ds_gen_format_dd, ds_gen_ai_suggest_cb, pytorch_param_range_dd, model_specific_dd, training_log_txt],
        outputs=[dataset_preview_df, generated_data_state, training_log_txt, generated_dataset_download_file]
    )

    # Handle dataset upload
    def process_uploaded_file(file_obj, logs_in):
        """Preview an uploaded dataset and remember its path for training.

        Args:
            file_obj: Value delivered by the upload component, or None.
            logs_in: Log text to append to (may be empty or None).

        Returns:
            Tuple ``(preview_df, logs, file_path)`` for the preview table, the
            log textbox and the dataset state. Status messages are appended to
            ``logs``. ``preview_df`` is None when no preview could be built;
            ``file_path`` is still returned in that case so training can try
            to read the file, and is None only when nothing was uploaded.
        """
        logs = logs_in or ""
        if file_obj is None:
            return None, logs + "Please upload a file first.\n", None

        # The original code reads the path from `.name`; fall back to the
        # value itself in case the component delivers a plain path string.
        # NOTE(review): confirm against the Gradio version in use.
        file_path = getattr(file_obj, "name", file_obj)
        logs += f"Uploaded file: {file_path}\n"

        # Compare extensions case-insensitively so e.g. "DATA.CSV" is previewed too.
        lowered = str(file_path).lower()
        df_preview = None
        try:
            if lowered.endswith(".csv"):
                df_preview = pd.read_csv(file_path, nrows=5)
            elif lowered.endswith(".json"):  # Assuming JSONL
                df_preview = pd.read_json(file_path, lines=True, nrows=5)
            elif lowered.endswith(".parquet"):
                # Reading only 5 rows from parquet is not straightforward,
                # so load the file and keep the head.
                df_preview = pd.read_parquet(file_path).head()
        except Exception as e:
            # Best-effort preview: keep the path even if the preview fails.
            logs += f"Could not generate preview for {file_path}: {e}\n"
            return None, logs, file_path

        if df_preview is None:
            logs += f"No preview available for {file_path}: unrecognized file extension.\n"
        else:
            logs += "Preview generated for uploaded file.\n"
        logs += "File ready for training.\n"
        return df_preview, logs, file_path  # Path is stored in generated_data_state

    # Each output component appears once: the status text is part of the log
    # string, so it no longer competes with the log for the same textbox. The
    # log textbox is also the log input so messages accumulate.
    ds_upload_file.upload(
        fn=process_uploaded_file,
        inputs=[ds_upload_file, training_log_txt],
        outputs=[dataset_preview_df, training_log_txt, generated_data_state]
    )


    # Train Model Button
    def train_model_wrapper(data_state_val, # DataFrame, (X,y) tuple, or filepath string per the dataset handlers
                            target_col, task_type, model_family, model_name, # Common params
                            # Sklearn specific (none for now beyond model_name)
                            # PyTorch specific
                            pt_model_type, pt_mlp_hidden, pt_mlp_activ, #pt_cnn_params (later)
                            epochs, batch_size, lr,
                            model_out_format,
                            logs_in): # Accumulate logs
        """Validate the inputs and dispatch training to the selected family.

        Args:
            data_state_val: Dataset taken from the dataset state.
            target_col: Target column name; required for tabular tasks and for
                DataFrame data unless the CNN model is selected.
            task_type: Selected task type string.
            model_family: Selected model family string.
            model_name: Selected specific model.
            pt_model_type: Wired to the same dropdown as ``model_name``; only
                used for the target-column check.
            pt_mlp_hidden, pt_mlp_activ: MLP settings forwarded to PyTorch training.
            epochs, batch_size, lr: Hyperparameters forwarded to PyTorch training.
            model_out_format: Selected export format.
            logs_in: Existing log text to append to (may be empty or None).

        Returns:
            Tuple ``(logs, metrics, model_file, param_count, plot)``. On a
            validation failure ``model_file`` and ``plot`` are None and
            ``param_count`` is "N/A"; ``plot`` is also None for Scikit-learn.
        """
        current_logs = (logs_in or "") + "\n--- Initiating Training ---\n"
        current_logs += f"Data state type: {type(data_state_val)}\n"

        def abort(log_line, status):
            # Shared shape of every early-exit result.
            return current_logs + log_line, status, None, "N/A", None

        if data_state_val is None:
            return abort("Error: No dataset loaded or generated. Please go to Tab 2.\n", "No data available.")

        # Target col needed for tabular tasks, and for DataFrame data unless a CNN is used.
        needs_target = (task_type or "").startswith("Tabular") or (
            isinstance(data_state_val, pd.DataFrame)
            and pt_model_type != "Simple Convolutional Network (CNN)"
        )
        if not target_col and needs_target:
            return abort("Error: Target column name is required for this task/data.\n", "Target column needed.")

        if model_family == "Scikit-learn (Classical ML)":
            logs, metrics, model_file, params = train_model_sklearn(
                data_state_val, target_col, task_type, model_name, model_out_format, current_logs
            )
            return logs, metrics, model_file, params, None  # No plot for sklearn here

        if model_family == "PyTorch (Neural Networks)":
            # model_name here is the PyTorch model type (MLP or CNN)
            logs, metrics, model_file, params, plot = train_model_pytorch(
                data_state_val, target_col, task_type, model_name,
                pt_mlp_hidden, pt_mlp_activ,
                epochs, batch_size, lr,
                model_out_format, current_logs
            )
            return logs, metrics, model_file, params, plot

        return abort(f"Unknown model family: {model_family}\n", "Unknown model family.")

    train_model_btn.click(
        fn=train_model_wrapper,
        inputs=[
            generated_data_state, target_column_name_txt, task_type_dd, model_family_dd, model_specific_dd,
            # PyTorch specific inputs (passed for every family)
            model_specific_dd, # This is pt_model_type if family is PyTorch
            pt_mlp_hidden_layers_txt, pt_mlp_activation_dd,
            train_epochs_num, train_batch_size_num, train_learning_rate_num,
            model_output_format_dd,
            training_log_txt # Pass current log content to append
        ],
        # One entry per returned value; the download component is listed once.
        outputs=[
            training_log_txt, evaluation_metrics_txt, download_trained_model_file,
            model_param_count_output_txt, loss_plot_img
        ]
    )

    # Clear logs button (optional)
    # clear_logs_btn = gr.Button("Clear Logs")
    # def clear_logs_func(): return "", "" # Clears current_logs_state and training_log_txt
    # clear_logs_btn.click(clear_logs_func, [], [current_logs_state, training_log_txt])


# Start the app at import/run time: queue() is enabled so long-running
# handlers (training) go through Gradio's request queue, then the server is
# launched with debug and in-UI error reporting switched on.
demo.queue().launch(debug=True, show_error=True) # Enable queue for longer tasks, debug for local testing