<!DOCTYPE html>
<html lang="en">

<head>
  <meta charset="UTF-8" />
  <meta name="viewport" content="width=device-width, initial-scale=1.0" />
  <title>Feature Engineering Explorer</title>

  <!-- MathJax for rendering LaTeX formulas -->
  <script src="https://polyfill.io/v3/polyfill.min.js?features=es6"></script>
  <script>
    MathJax = {
      tex: {
        inlineMath: [['$', '$'], ['\\(', '\\)']]
      }
    };
  </script>
  <script id="MathJax-script" async src="https://cdn.jsdelivr.net/npm/mathjax@3/es5/tex-mml-chtml.js"></script>

  <link rel="stylesheet" href="style.css" />
</head>

<body>
  <div class="app flex">
    <!-- Sidebar Navigation -->
    <aside class="sidebar" id="sidebar">
      <h1 class="sidebar__title">🛠️ Feature Engineering</h1>
      <nav>
        <ul class="nav__list" id="navList">
          <li><a href="#intro" class="nav__link">🎯 Introduction</a></li>
          <li><a href="#missing-data" class="nav__link">🔍 Missing Data</a></li>
          <li><a href="#outliers" class="nav__link">📊 Outliers</a></li>
          <li><a href="#scaling" class="nav__link">⚖️ Scaling</a></li>
          <li><a href="#encoding" class="nav__link">🔢 Encoding</a></li>
          <li><a href="#feature-selection" class="nav__link">🎯 Feature Selection</a></li>
          <li><a href="#imbalanced-data" class="nav__link">⚖️ Imbalanced Data</a></li>
          <li><a href="#eda" class="nav__link">📈 EDA Overview</a></li>
          <li><a href="#feature-transformation" class="nav__link">🔄 Feature Transformation</a></li>
          <li><a href="#feature-creation" class="nav__link">🛠️ Feature Creation</a></li>
          <li><a href="#dimensionality-reduction" class="nav__link">📉 Dimensionality Reduction</a></li>
          <li><a href="#text-data" class="nav__link">📝 Text Data (NLP)</a></li>
          <li><a href="#time-series" class="nav__link">⏳ Time-Series</a></li>
          <li><a href="#target-leakage" class="nav__link">⚠️ Target Leakage</a></li>
          <li><a href="#automated-fe" class="nav__link">🤖 Automated FE</a></li>
        </ul>
      </nav>
    </aside>

    <!-- Main Content -->
    <main class="content" id="content">
      <!-- ============================ 1. INTRO ============================ -->
      <section id="intro" class="topic-section">
        <h2>Introduction to Feature Engineering</h2>
        <p>Feature Engineering is the process of transforming raw data into meaningful inputs that boost
          machine-learning model performance. A well-crafted feature set can improve accuracy by 10-30% without changing
          the underlying algorithm.</p>

        <div class="info-card">
          <strong>Key Idea:</strong> 💡 Thoughtful features provide the model with clearer patterns, like lenses
          sharpening a blurry picture.
        </div>

        <!-- Canvas Visual -->
        <div class="canvas-wrapper">
          <canvas id="canvas-intro" width="600" height="280"></canvas>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>setup.py - Pandas Basics</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import pandas as pd
import numpy as np

# Load the dataset
df = pd.read_csv('housing_data.csv')

# Inspect raw data types and missing values
df.info()

# View summary statistics
print(df.describe())</code></pre>
        </div>
      </section>

      <!-- ====================== 2. HANDLING MISSING DATA ================== -->
      <section id="missing-data" class="topic-section">
        <h2>Handling Missing Data</h2>
        <p>Missing values come in three flavors: MCAR (Missing Completely At Random), MAR (Missing At Random), and MNAR
          (Missing Not At Random). Each demands different treatment to avoid bias.</p>

        <!-- Real-world Example -->
        <div class="info-card">
          <strong>Real Example:</strong> A hospital's patient records often have absent <em>cholesterol</em> values
          because certain tests were not ordered for healthy young adults.
        </div>

        <!-- Controls -->
        <div class="form-group">
          <button id="btn-mean-impute" class="btn btn--primary">Mean Imputation</button>
          <button id="btn-median-impute" class="btn btn--primary">Median Imputation</button>
          <button id="btn-knn-impute" class="btn btn--primary">KNN Imputation</button>
        </div>

        <!-- Canvas -->
        <div class="canvas-wrapper">
          <canvas id="canvas-missing-data" width="800" height="500"></canvas>
        </div>

        <!-- Callouts -->
        <div class="callout callout--insight">💡 Mean/Median work best when data is MCAR or MAR.</div>
        <div class="callout callout--mistake">⚠️ Using mean imputation on skewed data can distort distributions.</div>
        <div class="callout callout--tip">✅ Always impute <strong>after</strong> splitting into train and test to avoid
          leakage.</div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Imputation Math</h3>
          <p><strong>KNN Imputation</strong> predicts missing values by finding the $k$ closest neighbors using a
            distance metric like Euclidean distance. For two samples $x$ and $y$ with $n$ features, ignoring missing
            dimensions:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ d(x, y) = \sqrt{\sum_{i=1}^{n} w_i (x_i - y_i)^2} $$
          </div>
          <p style="margin-bottom: 0;">Once the $k$ neighbors are found, their values are averaged (or weighted by
            distance) to fill the missing slot. This preserves local cluster distributions better than global mean
            imputation.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>missing_data.py - Scikit-Learn Imputers</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.impute import SimpleImputer, KNNImputer

# 1. Simple Imputation (Mean/Median/Most Frequent)
# Good for MCAR (Missing Completely At Random)
mean_imputer = SimpleImputer(strategy='mean')
df['age_imputed'] = mean_imputer.fit_transform(df[['age']])

# 2. KNN Imputation (Distance-based)
# Good for MAR (Missing At Random) when variables are correlated
knn_imputer = KNNImputer(n_neighbors=5, weights='distance')
df_imputed = knn_imputer.fit_transform(df)

# Note: Tree-based models like XGBoost can handle NaNs natively!</code></pre>
        </div>
      </section>

      <!-- ======================= 3. HANDLING OUTLIERS ===================== -->
      <section id="outliers" class="topic-section">
        <h2>Handling Outliers</h2>
        <p>Outliers are data points that deviate markedly from others. Detecting and treating them prevents skewed
          models.</p>

        <div class="form-group">
          <button id="btn-detect-iqr" class="btn btn--primary">IQR Method</button>
          <button id="btn-detect-zscore" class="btn btn--primary">Z-Score Method</button>
          <button id="btn-winsorize" class="btn btn--primary">Winsorization</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-outliers" width="600" height="300"></canvas>
        </div>

        <div class="callout callout--insight">💡 The IQR method is robust to non-normal data.</div>
        <div class="callout callout--mistake">⚠️ Removing legitimate extreme values can erase important signals.</div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Outlier Math</h3>
          <p><strong>Z-Score</strong> measures how many standard deviations $\sigma$ a point is from the mean $\mu$. It
            assumes the data is normally distributed:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ z = \frac{x - \mu}{\sigma} \quad \text{(Threshold: } |z| > 3 \text{)} $$
          </div>
          <p style="margin-bottom: 0;"><strong>IQR (Interquartile Range)</strong> is non-parametric. It defines fences
            based on the 25th ($Q1$) and 75th ($Q3$) percentiles: $[Q1 - 1.5 \times \text{IQR},\ Q3 + 1.5 \times
            \text{IQR}]$. <em>Winsorization</em> caps values at these percentiles instead of dropping them.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>outliers.py - Z-Score and Winsorization</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import numpy as np
from scipy import stats

# 1. Z-Score Method (Dropping Outliers)
z_scores = np.abs(stats.zscore(df['income']))
# Keep only rows where z-score is less than 3
df_clean = df[z_scores < 3]

# 2. Percentile Capping (Winsorization)
# Caps values at the 5th and 95th percentiles instead of dropping rows
lower_limit = df['income'].quantile(0.05)
upper_limit = df['income'].quantile(0.95)

df['income_capped'] = np.clip(df['income'], lower_limit, upper_limit)</code></pre>
        </div>
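        <p>The "Under the Hood" card defines the classic $1.5 \times \text{IQR}$ fences, while the snippet above caps at
          fixed percentiles instead. As a minimal sketch (assuming the same hypothetical <code>df['income']</code>
          column), the exact fences can be computed directly with Pandas:</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>iqr_fences.py - IQR Fences (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code># Assumes the same hypothetical df['income'] column used above
Q1 = df['income'].quantile(0.25)
Q3 = df['income'].quantile(0.75)
IQR = Q3 - Q1

lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR

# Flag (rather than drop) points outside the fences
is_outlier = ~df['income'].between(lower_fence, upper_fence)
print(is_outlier.sum(), "points fall outside the IQR fences")</code></pre>
        </div>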
      </section>

      <!-- ========================== 4. SCALING ============================ -->
      <section id="scaling" class="topic-section">
        <h2>Feature Scaling</h2>
        <p>Algorithms that rely on distance, like KNN, demand comparable feature magnitudes.</p>

        <div class="form-group">
          <button id="btn-minmax" class="btn btn--primary">Min-Max Scaling</button>
          <button id="btn-standardize" class="btn btn--primary">Standardization</button>
          <button id="btn-robust" class="btn btn--primary">Robust Scaler</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-scaling" width="600" height="300"></canvas>
        </div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Scaling Math</h3>
          <p><strong>Min-Max Scaling (Normalization)</strong> scales data to a fixed range, usually $[0, 1]$:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ X_{norm} = \frac{X - X_{min}}{X_{max} - X_{min}} $$
          </div>
          <p><strong>Standardization (Z-Score Scaling)</strong> centers the data around a mean of 0 with a standard
            deviation of 1. It does not bound data to a specific range, handling outliers better than Min-Max:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ X_{std} = \frac{X - \mu}{\sigma} $$
          </div>
          <p style="margin-bottom: 0;"><strong>Robust Scaling</strong> uses statistics that are robust to outliers, like
            the median and Interquartile Range (IQR): $X_{robust} = \frac{X - \text{median}}{Q3 - Q1}$.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>scaling.py - Scikit-Learn Scalers</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

# 1. Min-Max Scaler (Best for Neural Networks/Images)
minmax = MinMaxScaler()
df[['age_minmax', 'income_minmax']] = minmax.fit_transform(df[['age', 'income']])

# 2. Standard Scaler (Best for PCA, SVM, Logistic Regression)
standard = StandardScaler()
df_scaled = standard.fit_transform(df)

# 3. Robust Scaler (Best when dataset has many outliers)
robust = RobustScaler()
df_robust = robust.fit_transform(df)</code></pre>
        </div>
      </section>

      <!-- ========================== 5. ENCODING =========================== -->
      <section id="encoding" class="topic-section">
        <h2>Data Encoding</h2>
        <p>Transform categorical variables into numbers so models can interpret them.</p>

        <div class="form-group">
          <button id="btn-label-encode" class="btn btn--primary">Label Encoding</button>
          <button id="btn-onehot-encode" class="btn btn--primary">One-Hot Encoding</button>
          <button id="btn-target-encode" class="btn btn--primary">Target Encoding</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-encoding" width="600" height="300"></canvas>
        </div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Target Encoding Math</h3>
          <p><strong>One-Hot Encoding</strong> creates $N$ sparse binary columns for $N$ categories, which can cause the
            "Curse of Dimensionality" for high-cardinality features.</p>
          <p><strong>Target Encoding</strong> replaces a categorical value with the average target value for that
            category. To prevent overfitting (especially on rare categories), a <em>Bayesian Smoothing</em> average is
            applied:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ S = \lambda \cdot \bar{y}_{cat} + (1 - \lambda) \cdot \bar{y}_{global} $$
          </div>
          <p style="margin-bottom: 0;">Where $\bar{y}_{cat}$ is the mean of the target for the specific category,
            $\bar{y}_{global}$ is the global target mean, and $\lambda$ is a weight between 0 and 1 determined by the
            category's frequency.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>encoding.py - Category Encoders</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from category_encoders import TargetEncoder

# 1. One-Hot Encoding (Best for nominal variables with few categories)
ohe = OneHotEncoder(sparse_output=False, drop='first') # drop='first' avoids multicollinearity
color_encoded = ohe.fit_transform(df[['color']])

# Pandas alternative (easy but not ideal for pipelines):
# pd.get_dummies(df, columns=['color'], drop_first=True)

# 2. Target Encoding (Best for high-cardinality nominal variables like zipcodes)
# Requires 'category_encoders' library
te = TargetEncoder(smoothing=10) # Higher smoothing pulls estimates closer to global mean
df['zipcode_encoded'] = te.fit_transform(df['zipcode'], df['target'])</code></pre>
        </div>
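        <p>To make the smoothing formula above concrete, here is a minimal hand-rolled sketch in Pandas, assuming the
          same hypothetical <code>zipcode</code> and <code>target</code> columns. It illustrates the weighted average
          only; the <code>category_encoders</code> library additionally handles unseen categories and pipeline
          integration:</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>target_encoding_manual.py - Smoothed Target Encoding (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code># Assumes hypothetical 'zipcode' and 'target' columns, as above
global_mean = df['target'].mean()
stats = df.groupby('zipcode')['target'].agg(['mean', 'count'])

m = 10  # smoothing strength: higher m pulls rare categories toward the global mean
lam = stats['count'] / (stats['count'] + m)            # weight lambda in [0, 1]
smoothed = lam * stats['mean'] + (1 - lam) * global_mean

df['zipcode_encoded_manual'] = df['zipcode'].map(smoothed)</code></pre>
        </div>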
      </section>

      <!-- ===================== 6. FEATURE SELECTION ======================= -->
      <section id="feature-selection" class="topic-section">
        <h2>Feature Selection</h2>
        <p>Pick features that matter, drop those that don't.</p>

        <div class="form-group">
          <button id="btn-backward-elim" class="btn btn--primary">Backward Elimination</button>
          <button id="btn-forward-select" class="btn btn--primary">Forward Selection</button>
          <button id="btn-rfe" class="btn btn--primary">RFE</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-selection" width="600" height="300"></canvas>
        </div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Selection Math</h3>
          <p>Feature selection can be filter-based, wrapper-based, or intrinsic.</p>
          <p><strong>Filter Method (ANOVA F-Value):</strong> Scikit-Learn's <code>f_classif</code> computes the ANOVA F-value
            between numerical features and a categorical target. The F-statistic measures the ratio of variance
            <em>between</em> groups to the variance <em>within</em> groups:
          </p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ F = \frac{\text{Between-group variability}}{\text{Within-group variability}} $$
          </div>
          <p style="margin-bottom: 0;"><strong>Wrapper Method (RFE):</strong> Recursive Feature Elimination fits a model
            (e.g., Logistic Regression or Random Forest), ranks features by importance coefficients, drops the weakest
            feature, and repeats until the desired $N$ features remain.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>selection.py - Feature Selection</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.feature_selection import SelectKBest, f_classif, RFE
from sklearn.linear_model import LogisticRegression

X = df.drop('target', axis=1)
y = df['target']

# 1. Filter Method: SelectKBest (ANOVA F-value)
# Keeps the 5 features with the highest ANOVA F-scores
selector = SelectKBest(score_func=f_classif, k=5)
X_top_5 = selector.fit_transform(X, y)
selected_columns = X.columns[selector.get_support()]

# 2. Wrapper Method: Recursive Feature Elimination (RFE)
# Iteratively prunes using the model's feature importances / coefficients
estimator = LogisticRegression()
rfe = RFE(estimator, n_features_to_select=5, step=1)
X_rfe = rfe.fit_transform(X, y)
rfe_columns = X.columns[rfe.support_]</code></pre>
        </div>
      </section>

      <!-- =================== 7. IMBALANCED DATA =========================== -->
      <section id="imbalanced-data" class="topic-section">
        <h2>Handling Imbalanced Data</h2>
        <p>Class imbalance leads to biased predictions. Balancing techniques can fix this.</p>

        <div class="form-group">
          <button id="btn-rus" class="btn btn--primary">Random Under-Sampling</button>
          <button id="btn-ros" class="btn btn--primary">Random Over-Sampling</button>
          <button id="btn-smote" class="btn btn--primary">SMOTE</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-imbalanced" width="600" height="300"></canvas>
        </div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: SMOTE Math</h3>
          <p><strong>SMOTE (Synthetic Minority Over-sampling Technique)</strong> doesn't just duplicate data (like
            Random Over-Sampling). It creates novel synthetic examples by interpolating between existing minority
            instances.</p>
          <p>For a minority class point $x_i$, SMOTE finds its $k$-nearest minority neighbors. It picks one neighbor
            $x_{zi}$ and generates a synthetic point $x_{new}$ along the line segment joining them:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ x_{new} = x_i + \lambda \times (x_{zi} - x_i) $$
          </div>
          <p style="margin-bottom: 0;">Where $\lambda$ is a random number between 0 and 1. This creates a denser, more
            generalized decision region for the minority class.</p>
        </div>
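        <p>To see the interpolation formula in isolation, here is a tiny NumPy sketch (toy values, not the real
          imblearn internals) that synthesizes one new point between a minority sample and one of its neighbors:</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>smote_interpolation.py - SMOTE Interpolation (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import numpy as np

rng = np.random.default_rng(42)

# Toy minority-class points (2 features each)
x_i = np.array([2.0, 3.0])    # an existing minority sample
x_zi = np.array([2.5, 3.8])   # one of its k-nearest minority neighbors

# Interpolate: x_new = x_i + lambda * (x_zi - x_i), with lambda ~ U(0, 1)
lam = rng.uniform(0, 1)
x_new = x_i + lam * (x_zi - x_i)
print("Synthetic point:", x_new)</code></pre>
        </div>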

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>imbalanced.py - Imblearn Resampling</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

# 1. SMOTE (Over-sampling the minority class)
smote = SMOTE(sampling_strategy='auto', k_neighbors=5, random_state=42)
X_smote, y_smote = smote.fit_resample(X, y)

# 2. Random Under-Sampling (Reducing the majority class)
rus = RandomUnderSampler(sampling_strategy='auto', random_state=42)
X_rus, y_rus = rus.fit_resample(X, y)

# 3. Best Practice Pipeline: Under-sample majority THEN SMOTE minority
# Prevents creating too many synthetic points if the imbalance is extreme
resample_pipe = Pipeline([
    ('rus', RandomUnderSampler(sampling_strategy=0.1)), # Reduce majority until minority is 10%
    ('smote', SMOTE(sampling_strategy=0.5))             # SMOTE minority until it's 50%
])
X_resampled, y_resampled = resample_pipe.fit_resample(X, y)</code></pre>
        </div>
      </section>

      <!-- ========================== 8. EDA ================================ -->
      <section id="eda" class="topic-section">
        <h2>Exploratory Data Analysis (EDA)</h2>
        <p><strong>Exploratory Data Analysis (EDA)</strong> is a critical step in the machine learning pipeline that
          comes BEFORE feature engineering. EDA helps you understand your data, discover patterns, identify anomalies,
          detect outliers, test hypotheses, and check assumptions through summary statistics and graphical
          representations.</p>

        <div class="info-card">
          <strong>Key Questions EDA Answers:</strong>
          <ul>
            <li>How many columns are numerical vs. categorical?</li>
            <li>What does the data distribution look like?</li>
            <li>Are there missing values?</li>
            <li>Are there outliers?</li>
            <li>Is the data imbalanced (for classification problems)?</li>
            <li>What are the correlations between features?</li>
            <li>Are there any trends or patterns?</li>
          </ul>
        </div>

        <div class="info-card">
          <strong>Real-World Example:</strong> Imagine you're analyzing customer data for a bank to predict loan
          defaults. EDA helps you understand:
          <ul>
            <li>Age distribution of customers (histogram)</li>
            <li>Income levels (box plot for outliers)</li>
            <li>Correlation between income and loan amount (scatter plot)</li>
            <li>Missing values in employment history</li>
            <li>Class imbalance (5% defaults vs 95% non-defaults)</li>
          </ul>
        </div>

        <h3>Two Main Types of EDA</h3>

        <h4>1. Descriptive Statistics</h4>
        <p><strong>Purpose:</strong> Summarize and visualize what the data looks like</p>

        <div class="info-card">
          <strong>A. Central Tendency:</strong><br><strong>Mean (Average):</strong> μ = Σxᵢ / n<br>
          &nbsp;&nbsp;Example: Average income = $50,000 (Sensitive to outliers)<br><strong>Median:</strong> Middle value when sorted<br>
          &nbsp;&nbsp;Example: Median income = $45,000 (Robust to outliers)<br><strong>Mode:</strong> Most frequent value<br>
          &nbsp;&nbsp;Example: Most common age = 35 years<br><br>

          <strong>B. Variability (Spread):</strong><br><strong>Variance:</strong> σ² = Σ(xᵢ - μ)² / n (Measures how spread out data is)<br><strong>Standard Deviation:</strong> σ = √variance<br>
          &nbsp;&nbsp;68% of data within 1σ, 95% within 2σ, 99.7% within 3σ (for normal distribution)<br><strong>Interquartile Range (IQR):</strong> Q3 - Q1<br>
          &nbsp;&nbsp;Middle 50% of data, robust to outliers<br><br>

          <strong>C. Correlation &amp; Associations:</strong><br><strong>Pearson Correlation:</strong> r = Cov(X,Y) / (σₓ × σᵧ)<br>
          &nbsp;&nbsp;Range: -1 to +1<br>
          &nbsp;&nbsp;r = +1: Perfect positive correlation<br>
          &nbsp;&nbsp;r = 0: No linear correlation<br>
          &nbsp;&nbsp;r = -1: Perfect negative correlation<br><strong>Thresholds:</strong> |r| &gt; 0.7: Strong, |r| = 0.5-0.7: Moderate, |r| &lt; 0.3: Weak
        </div>
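        <p>These descriptive measures map directly onto Pandas one-liners. A quick sketch, assuming a hypothetical
          <code>df</code> with <code>income</code> and <code>loan_amount</code> columns from the bank example:</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>descriptive_stats.py - Central Tendency, Spread, Correlation (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code># Assumes a hypothetical df with 'income' and 'loan_amount' columns
mean_income = df['income'].mean()
median_income = df['income'].median()
std_income = df['income'].std()

# Interquartile range: spread of the middle 50% of the data
iqr_income = df['income'].quantile(0.75) - df['income'].quantile(0.25)

# Pearson correlation r = Cov(X, Y) / (std_x * std_y)
r = df['income'].corr(df['loan_amount'], method='pearson')

print(f"mean={mean_income:.1f}, median={median_income:.1f}, "
      f"std={std_income:.1f}, IQR={iqr_income:.1f}, r={r:.2f}")</code></pre>
        </div>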

        <h4>2. Inferential Statistics</h4>
        <p><strong>Purpose:</strong> Make inferences or generalizations about the population from the sample</p>
        <p><strong>Key Question:</strong> Can we claim this effect exists in the larger population, or is it just by
          chance?</p>

        <div class="info-card">
          <strong>A. Hypothesis Testing:</strong><br><strong>Null Hypothesis (H₀):</strong> No effect exists (e.g., "Mean of Group A = Mean of Group B")<br><strong>Alternative Hypothesis (H₁):</strong> Effect exists (e.g., "Mean of Group A ≠ Mean of Group B")<br><strong>P-value:</strong> Probability of observing data if H₀ is true<br>
          &nbsp;&nbsp;p &lt; 0.05: Reject H₀ (effect is statistically significant)<br>
          &nbsp;&nbsp;p &gt; 0.05: Fail to reject H₀ (not enough evidence)<br><br>

          <strong>Example:</strong><br>
          • H₀: "There is no difference between positive and negative movie review lengths"<br>
          • H₁: "Negative reviews are longer than positive reviews"<br>
          • After t-test: p = 0.003 (&lt; 0.05)<br>
          • Conclusion: Reject H₀ → Negative reviews ARE significantly longer<br><br>

          <strong>B. Confidence Intervals:</strong><br>
          • Range where true population parameter likely lies<br>
          • 95% CI: We're 95% confident the true value is within this range<br>
          • Example: "Average customer age is 35 ± 2 years (95% CI: [33, 37])"<br><br>

          <strong>C. Effect Size:</strong><br>
          • Cohen's d = (mean₁ - mean₂) / pooled_std<br>
          • Small effect: d = 0.2, Medium: d = 0.5, Large: d = 0.8
        </div>
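        <p>A minimal SciPy sketch of the movie-review example above, using hypothetical review-length samples: it runs
          a two-sample t-test and computes Cohen's d exactly as defined in the card.</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>inferential_stats.py - t-Test and Effect Size (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import numpy as np
from scipy import stats

# Hypothetical review-length samples (word counts)
neg_lengths = np.array([120, 150, 200, 180, 210, 160, 190])
pos_lengths = np.array([100, 110, 130, 105, 125, 115, 120])

# Two-sample t-test: H0 = "mean lengths are equal"
t_stat, p_value = stats.ttest_ind(neg_lengths, pos_lengths, equal_var=False)
print(f"t = {t_stat:.2f}, p = {p_value:.4f}")

# Effect size: Cohen's d = (mean1 - mean2) / pooled std
pooled_std = np.sqrt((neg_lengths.var(ddof=1) + pos_lengths.var(ddof=1)) / 2)
cohens_d = (neg_lengths.mean() - pos_lengths.mean()) / pooled_std
print(f"Cohen's d = {cohens_d:.2f}")</code></pre>
        </div>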

        <h3>Algorithm Steps for EDA</h3>
        <div class="info-card">
          <strong>1. Load and Inspect Data:</strong> df.head(), df.info(), df.describe()<br>
          <strong>2. Handle Missing Values:</strong> Identify (df.isnull().sum()), Visualize, Decide<br>
          <strong>3. Analyze Distributions:</strong> Histograms, count plots, box plots<br>
          <strong>4. Check for Imbalance:</strong> Count target classes, plot distribution<br>
          <strong>5. Correlation Analysis:</strong> Correlation matrix, heatmap, identify multicollinearity<br>
          <strong>6. Statistical Testing:</strong> Compare groups (t-test, ANOVA), test assumptions, calculate effect
          sizes
        </div>

        <h3>Interactive EDA Dashboard</h3>
        <div class="form-group">
          <label for="edaFeature" class="form-label">Select Feature:</label>
          <select id="edaFeature" class="form-control w-100">
            <option value="age">Age</option>
            <option value="income">Income</option>
            <option value="credit">Credit Score</option>
          </select>
        </div>

        <div class="form-group">
          <label for="confidenceLevel" class="form-label">Confidence Level: <span
              id="confidenceValue">95</span>%</label>
          <input type="range" id="confidenceLevel" min="90" max="99" step="1" value="95" class="form-control" />
        </div>

        <div class="form-group">
          <button id="btn-histogram" class="btn btn--primary">Show Histogram</button>
          <button id="btn-boxplot" class="btn btn--primary">Show Box Plot</button>
          <button id="btn-correlation" class="btn btn--primary">Show Correlation</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-eda" width="800" height="500"></canvas>
        </div>

        <div class="callout callout--insight">💡 EDA typically takes 30-40% of total project time. Good EDA reveals
          which features to engineer.</div>
        <div class="callout callout--mistake">⚠️ Common Mistakes: Skipping EDA, not checking outliers before scaling,
          ignoring missing value patterns, overlooking class imbalance, ignoring multicollinearity.</div>
        <div class="callout callout--tip">✅ Best Practices: ALWAYS start with EDA, visualize EVERY feature, check
          correlations with target, document insights, use both descriptive and inferential statistics.</div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Skewness & Kurtosis</h3>
          <p>Beyond mean and variance, we examine the geometric shape of our distributions using the 3rd and 4th
            statistical moments.</p>
          <p><strong>Skewness ($s$)</strong> measures asymmetry. Positive means right-tailed, negative means
            left-tailed:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ s = \frac{\frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^3}{\sigma^3} $$
          </div>
          <p><strong>Kurtosis ($k$)</strong> measures "tailedness" (presence of outliers). A normal distribution has a
            kurtosis of 3. High kurtosis means heavy tails:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ k = \frac{\frac{1}{n} \sum_{i=1}^{n} (x_i - \bar{x})^4}{\sigma^4} $$
          </div>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>eda.py - Automated & Visual EDA</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# 1. Deep Descriptive Stats (includes skewness)
print(df.describe().T)
print("Skewness:\n", df.skew())
print("\nMissing Values:\n", df.isnull().sum())

# 2. Visual Distributions (Pairplot)
# Plots histograms on the diagonal and scatter plots for every relationship
sns.pairplot(df, hue='target_class', diag_kind='kde', corner=True)
plt.show()

# 3. Correlation Heatmap
plt.figure(figsize=(10, 8))
corr_matrix = df.corr(method='spearman') # Spearman captures monotonic relationships and is robust to outliers
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title("Spearman Correlation Heatmap")
plt.show()</code></pre>
        </div>

        <h3>Use Cases and Applications</h3>
        <ul>
          <li><strong>Healthcare:</strong> Analyzing patient data before building disease prediction models</li>
          <li><strong>Finance:</strong> Understanding customer demographics before credit scoring</li>
          <li><strong>E-commerce:</strong> Analyzing purchase patterns before recommendation systems</li>
          <li><strong>Marketing:</strong> Understanding customer segments before targeted campaigns</li>
          <li><strong>Time Series:</strong> Checking for seasonality and trends in sales data</li>
        </ul>

        <h3>Summary &amp; Key Takeaways</h3>
        <p>Exploratory Data Analysis is the foundation of any successful machine learning project. It combines
          <strong>descriptive statistics</strong> (mean, median, variance, correlation) with <strong>inferential
            statistics</strong> (hypothesis testing, confidence intervals) to understand data deeply.
        </p>
        <p><strong>Descriptive EDA</strong> answers: "What is happening in the dataset?"<br>
          <strong>Inferential EDA</strong> answers: "Can we claim this effect exists in the larger population?"
        </p>
        <p>Remember: <strong>Data → EDA → Feature Engineering → ML → Deployment</strong></p>
      </section>

      <!-- =================== 9. FEATURE TRANSFORMATION ==================== -->
      <section id="feature-transformation" class="topic-section">
        <h2>Feature Transformation</h2>
        <p>Feature transformation creates new representations of data to capture non-linear patterns. Techniques like
          polynomial features, binning, and mathematical transformations unlock hidden relationships.</p>

        <div class="info-card">
          <strong>Real Example:</strong> Predicting house prices with polynomial features (adding x² terms) improves
          model fit for non-linear relationships between square footage and price.
        </div>

        <h3>Mathematical Foundations</h3>
        <div class="info-card">
          <strong>Polynomial Features:</strong> Transform (x₁, x₂) → (1, x₁, x₂, x₁², x₁x₂, x₂²)<br>
          • Degree 2 example: For features (x, y) → (1, x, y, x², xy, y²)<br>
          • 2 features with degree=2 creates 6 features total<br><br>
          <strong>Binning:</strong> Convert continuous → categorical<br>
          • Equal-width: Divide range into equal intervals<br>
          • Quantile: Each bin has equal number of samples<br>
          • Example: Age (0-100) → [0-18], [19-35], [36-60], [61+]<br><br>
          <strong>Mathematical Transformations:</strong><br>
          • Square Root: √x (reduces right skew)<br>
          • Log Transform: log(1 + x)<br>
          • Box-Cox: λ = 0: log(x), λ ≠ 0: (x^λ - 1)/λ
        </div>

        <div class="form-group">
          <button id="btn-polynomial" class="btn btn--primary">Add Polynomial Features</button>
          <button id="btn-binning" class="btn btn--primary">Apply Binning</button>
          <button id="btn-log" class="btn btn--primary">Log Transform</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-transformation" width="700" height="350"></canvas>
        </div>

        <div class="callout callout--insight">💡 Polynomial features capture curve fitting, but degree=3 on 10 features
          creates 286 features!</div>
        <div class="callout callout--mistake">⚠️ Always scale features after polynomial transformation to prevent
          magnitude issues.</div>
        <div class="callout callout--tip">✅ Start with degree=2 and visualize distributions before/after transformation.
        </div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Power Transforms</h3>
          <p>When log transformations $\ln(1+x)$ aren't enough to fix severe skewness, we use parametric Power
            Transformations like <strong>Box-Cox</strong> (requires $x > 0$) or <strong>Yeo-Johnson</strong> (supports
            negative values). They automatically find the optimal $\lambda$ parameter using Maximum Likelihood
            Estimation.</p>
          <p><strong>Box-Cox Transformation Formula:</strong></p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ x^{(\lambda)} = \begin{cases} \frac{x^\lambda - 1}{\lambda} & \text{if } \lambda \neq 0 \\ \ln(x) &
            \text{if } \lambda = 0 \end{cases} $$
          </div>
          <p style="margin-bottom: 0;">These transforms stretch and compress the variable to map it as closely to a
            Gaussian (Normal) distribution as mathematically possible.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>transformation.py - Power Transforms & Binning</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import numpy as np
from sklearn.preprocessing import PowerTransformer, KBinsDiscretizer

# 1. Power Transformation (Yeo-Johnson)
# Attempts to map skewed feature to a Gaussian distribution
pt = PowerTransformer(method='yeo-johnson', standardize=True)
df['income_gaussian'] = pt.fit_transform(df[['income']])

# 2. Log Transformation (np.log1p handles zeros safely by doing log(1+x))
df['revenue_log'] = np.log1p(df['revenue'])

# 3. Discretization / Binning
# Converts continuous age into 5 ordinal bins (strategy='quantile' gives roughly equal frequency per bin)
binner = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='quantile')
df['age_group'] = binner.fit_transform(df[['age']])</code></pre>
        </div>

        <h3>Use Cases</h3>
        <ul>
          <li>Polynomial features for non-linear house price prediction</li>
          <li>Binning age into groups for marketing segmentation</li>
          <li>Log transformation for right-skewed income data</li>
        </ul>
      </section>

      <!-- =================== 10. FEATURE CREATION ========================= -->
      <section id="feature-creation" class="topic-section">
        <h2>Feature Creation</h2>
        <p>Creating new features from existing ones based on domain knowledge. Interaction terms, ratios, and
          domain-specific calculations enhance model performance.</p>

        <div class="info-card">
          <strong>Real Example:</strong> E-commerce revenue = price × quantity. Profit margin = (selling_price -
          cost_price) / cost_price. These derived features often have stronger predictive power than raw features.
        </div>

        <h3>Mathematical Foundations</h3>
        <div class="info-card">
          <strong>Interaction Terms:</strong> feature₁ × feature₂<br>
          • Example: advertising_budget × seasonality → total_impact<br>
          • Why: Captures how one feature's effect depends on another<br><br>
          <strong>Ratio Features:</strong> feature₁ / feature₂<br>
          • Example: price/sqft, income/age<br><br>
          <strong>Domain-Specific Features:</strong><br>
          • BMI = weight(kg) / height²(m²)<br>
          • Speed = distance / time<br>
          • Profit margin = (revenue - cost) / cost<br><br>
          <strong>Time-Based Features:</strong><br>
          • Extract: year, month, day, weekday, hour<br>
          • Create: is_weekend, is_holiday, season
        </div>

        <div class="form-group">
          <button id="btn-interaction" class="btn btn--primary">Create Interaction</button>
          <button id="btn-ratio" class="btn btn--primary">Create Ratio</button>
          <button id="btn-bmi" class="btn btn--primary">Calculate BMI</button>
        </div>

        <div class="canvas-wrapper">
          <canvas id="canvas-creation" width="700" height="350"></canvas>
        </div>

        <div class="callout callout--insight">💡 Interaction terms are especially powerful in linear models - neural
          networks learn them automatically.</div>
        <div class="callout callout--mistake">⚠️ Creating features without domain knowledge leads to meaningless
          combinations.</div>
        <div class="callout callout--tip">✅ Always check correlation between new and existing features to avoid
          redundancy.</div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: Polynomial Combinations</h3>
          <p>Scikit-Learn's <code>PolynomialFeatures</code> generates a new feature matrix consisting of all polynomial
            combinations of the features with degree less than or equal to the specified degree.</p>
          <p>For two features $X = [x_1, x_2]$ and a degree of 2, the expanded polynomial vector is:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ [1,\; x_1,\; x_2,\; x_1^2,\; x_1 \cdot x_2,\; x_2^2] $$
          </div>
          <p style="margin-bottom: 0;">Notice the $x_1 \cdot x_2$ term. This is an <strong>interaction term</strong>,
            which lets a linear model learn conditional relationships (e.g., "if $x_1$ is high, the effect of $x_2$
            changes").</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>creation.py - Automated Polynomial Features</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.preprocessing import PolynomialFeatures
import pandas as pd

# Assume df has two features: 'length' and 'width'
X = df[['length', 'width']]

# Create polynomial and interaction features up to degree 2
# include_bias=False prevents adding a column of 1s (intercept)
poly = PolynomialFeatures(degree=2, include_bias=False)
X_poly = poly.fit_transform(X)

# Get the names of the new features (e.g., 'length^2', 'length width')
feature_names = poly.get_feature_names_out(['length', 'width'])
df_poly = pd.DataFrame(X_poly, columns=feature_names)

print(df_poly.head())</code></pre>
        </div>
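        <p>Ratio, domain, and time-based features usually come straight from Pandas rather than a transformer. The
          sketch below assumes hypothetical columns (<code>price</code>, <code>sqft</code>, <code>weight_kg</code>,
          <code>height_m</code>, <code>order_date</code>) and complements the automated polynomial expansion above:</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>manual_features.py - Ratio, Domain, and Date Features (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import pandas as pd

# Hypothetical columns: price, sqft, weight_kg, height_m, order_date
# 1. Ratio feature
df['price_per_sqft'] = df['price'] / df['sqft']

# 2. Domain-specific feature: BMI = weight(kg) / height(m)^2
df['bmi'] = df['weight_kg'] / df['height_m'] ** 2

# 3. Time-based features from a datetime column
df['order_date'] = pd.to_datetime(df['order_date'])
df['order_month'] = df['order_date'].dt.month
df['order_weekday'] = df['order_date'].dt.dayofweek
df['is_weekend'] = df['order_weekday'].isin([5, 6]).astype(int)</code></pre>
        </div>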

        <h3>Use Cases</h3>
        <ul>
          <li>BMI from height and weight in healthcare prediction</li>
          <li>Click-through rate = clicks / impressions in digital marketing</li>
          <li>Revenue = price × quantity in retail analytics</li>
        </ul>
      </section>

      <!-- ================ 11. DIMENSIONALITY REDUCTION ==================== -->
      <section id="dimensionality-reduction" class="topic-section">
        <h2>Dimensionality Reduction</h2>
        <p>Reducing the number of features while preserving information. PCA (Principal Component Analysis) projects
          high-dimensional data onto lower dimensions by finding directions of maximum variance.</p>

        <div class="info-card">
          <strong>Real Example:</strong> Image compression and genome analysis with thousands of genes benefit from PCA.
          First 2-3 principal components often capture 80%+ of variance.
        </div>

        <h3>PCA Mathematical Foundations</h3>
        <div class="info-card">
          <strong>Algorithm Steps:</strong><br>
          1. Standardize data: $X_{scaled} = \frac{X - \mu}{\sigma}$<br>
          2. Compute covariance matrix: $\Sigma = \frac{1}{n-1} X^T X$<br>
          3. Calculate eigenvalues $\lambda$ and eigenvectors $v$<br>
          4. Sort eigenvectors by eigenvalues (descending)<br>
          5. Select top $k$ eigenvectors (principal components)<br>
          6. Transform: $X_{new} = X \times v_k$<br><br>
          <strong>Explained Variance:</strong> $\frac{\lambda_i}{\sum \lambda_j}$<br>
          <strong>Cumulative Variance:</strong> Shows total information preserved<br><br>
          <strong>Why PCA Works:</strong><br>
          • Removes correlated features<br>
          • Captures maximum variance in fewer dimensions<br>
          • Components are orthogonal (no correlation)
        </div>

        <div class="form-group">
          <label for="slider-components" class="form-label">Number of Components: <span id="pcaValue">2</span></label>
          <input type="range" id="slider-components" min="1" max="3" step="1" value="2" class="form-control" />
        </div>

        <button id="btn-pca-apply" class="btn btn--primary">Apply PCA</button>

        <div class="canvas-wrapper">
          <canvas id="canvas-pca" width="700" height="400"></canvas>
        </div>

        <div class="callout callout--insight">💡 PCA is unsupervised - it doesn't use the target variable. First PC
          always captures most variance.</div>
        <div class="callout callout--mistake">⚠️ Not standardizing before PCA is a critical error - features with large
          scales will dominate.</div>
        <div class="callout callout--tip">✅ Aim for 95% cumulative explained variance when choosing number of
          components.</div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #9900ff;">
          <h3 style="margin-top: 0; color: #9900ff;">🧠 Under the Hood: PCA Math</h3>
          <p>PCA finds the directions (Principal Components) that maximize the variance of the data. Mathematically, it
            works by computing the covariance matrix of the standardized dataset $X$:</p>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ \Sigma = \frac{1}{n-1} X^T X $$
          </div>
          <p>Then we solve the eigenvalue equation $\Sigma v = \lambda v$ for the eigenvectors $v$ and eigenvalues $\lambda$.</p>
          <ul style="margin-top: 10px; margin-bottom: 0;">
            <li><strong>Eigenvectors</strong> ($v_i$) are the axes of the new feature space (the directions).</li>
            <li><strong>Eigenvalues</strong> ($\lambda_i$) represent the magnitude of variance captured by each vector.
            </li>
          </ul>
        </div>
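        <p>To connect the algorithm steps to the math, here is a minimal from-scratch NumPy sketch on toy standardized
          data (random values standing in for your real <code>X_scaled</code>). It mirrors what the Scikit-Learn code
          below does internally:</p>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>pca_from_scratch.py - Eigen-decomposition PCA (sketch)</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import numpy as np

rng = np.random.default_rng(0)
X_scaled = rng.standard_normal((100, 3))  # stand-in for your standardized data

# 1. Covariance matrix (features x features)
cov = np.cov(X_scaled, rowvar=False)

# 2. Eigen-decomposition (eigh handles symmetric matrices)
eigenvalues, eigenvectors = np.linalg.eigh(cov)

# 3. Sort by descending eigenvalue and keep the top k components
order = np.argsort(eigenvalues)[::-1]
eigenvalues, eigenvectors = eigenvalues[order], eigenvectors[:, order]

k = 2
explained_ratio = eigenvalues[:k] / eigenvalues.sum()
X_reduced = X_scaled @ eigenvectors[:, :k]
print("Explained variance ratio:", explained_ratio)</code></pre>
        </div>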

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>pca.py - Principal Component Analysis</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import numpy as np

# 1. ALWAYS scale data before PCA
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# 2. Fit PCA without specifying components to see all variance
pca_full = PCA()
pca_full.fit(X_scaled)

# 3. Plot Cumulative Explained Variance
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
plt.plot(cumulative_variance, marker='o')
plt.axhline(y=0.95, color='r', linestyle='--') # 95% threshold
plt.xlabel('Number of Components')
plt.ylabel('Cumulative Explained Variance')
plt.show()

# 4. Apply PCA retaining 95% variance
# Float between 0 and 1 selects components covering that % of variance
pca = PCA(n_components=0.95)
X_pca = pca.fit_transform(X_scaled)
print(f"Reduced from {X.shape[1]} to {X_pca.shape[1]} features.")</code></pre>
        </div>

        <h3>Use Cases</h3>
        <ul>
          <li>Image compression (reduce pixel dimensions)</li>
          <li>Genomics (thousands of genes → few principal components)</li>
          <li>Visualization (project high-D data to 2D for plotting)</li>
          <li>Speed up training (fewer features = faster models)</li>
        </ul>

        <h3>Common Mistakes</h3>
        <ul>
          <li>⚠️ Applying PCA before train-test split (data leakage)</li>
          <li>⚠️ Using PCA with categorical features (PCA is for numerical data)</li>
          <li>⚠️ Losing interpretability (PCs are linear combinations)</li>
        </ul>
      </section>

      <!-- ================== 12. TEXT DATA (NLP BASICS) ==================== -->
      <section id="text-data" class="topic-section">
        <h2>Text Data (NLP Basics)</h2>
        <p>Real-world tabular data often contains unstructured text (e.g., reviews, titles). Algorithms require numbers,
          so we must vectorize this text into numerical representations.</p>

        <div class="info-card">
          <strong>Real Example:</strong> Converting thousands of Amazon product reviews into numeric features allows a
          classification model to predict positive vs. negative sentiment.
        </div>

        <h3>Mathematical Foundations</h3>
        <div class="info-card">
          <strong>Bag of Words (BoW):</strong> Represents text by counting the frequency of each word, ignoring grammar
          and order.<br><br>
          <strong>TF-IDF (Term Frequency - Inverse Document Frequency):</strong><br>
          Penalizes frequent, uninformative words (like "the", "and") while boosting rare, meaningful words.<br><br>
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ \text{TF-IDF}(t, d, D) = \text{TF}(t, d) \times \text{IDF}(t, D) $$
          </div>
          • $\text{TF}$: (count of term $t$ in document $d$) / (total terms in $d$)<br>
          • $\text{IDF}$: $\log \left( \frac{\text{Total Documents } N}{\text{Documents containing term } t} \right)$
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>text_features.py - Scikit-Learn Vectorizers</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import pandas as pd

# Sample text column
corpus = [
    "Machine learning is amazing",
    "Deep learning is the future of learning",
    "Data science and artificial intelligence"
]

# 1. Bag of Words (CountVectorizer)
# Creates a column for every unique word in the corpus
vectorizer = CountVectorizer(stop_words='english')
X_bow = vectorizer.fit_transform(corpus)

# 2. TF-IDF (TfidfVectorizer)
# Converts words to continuous weights between 0 and 1
tfidf = TfidfVectorizer(stop_words='english', max_features=100)
X_tfidf = tfidf.fit_transform(corpus)

# Quick way to view features as a DataFrame
tfidf_df = pd.DataFrame(X_tfidf.toarray(), columns=tfidf.get_feature_names_out())
print(tfidf_df.head())</code></pre>
        </div>

        <h3>Meta-Features</h3>
        <p>Before throwing text into a vectorizer, you can extract powerful <strong>meta-features</strong> using pure
          Python or Pandas:</p>
        <ul>
          <li><strong>Word count:</strong> <code>df['text'].apply(lambda x: len(str(x).split()))</code></li>
          <li><strong>Character count:</strong> <code>df['text'].apply(lambda x: len(str(x)))</code></li>
          <li><strong>Count of punctuation/capitals:</strong> often strongly correlated with spam or fake reviews
            (see the sketch below).</li>
        </ul>
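        <p>A short sketch of these meta-features in Pandas, assuming a DataFrame <code>df</code> with a
          <code>text</code> column (both names are placeholders):</p>
        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>meta_features.py - Simple text statistics</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import string
import pandas as pd

# Assumes df already exists with a 'text' column
df['word_count'] = df['text'].apply(lambda x: len(str(x).split()))
df['char_count'] = df['text'].apply(lambda x: len(str(x)))

# Punctuation and capital-letter counts, often useful for spam / fake-review detection
df['punct_count'] = df['text'].apply(lambda x: sum(ch in string.punctuation for ch in str(x)))
df['capital_count'] = df['text'].apply(lambda x: sum(ch.isupper() for ch in str(x)))

# Average word length, guarding against division by zero for empty strings
df['avg_word_len'] = df['char_count'] / df['word_count'].clip(lower=1)</code></pre>
        </div>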
      </section>

      <!-- ================= 13. TIME-SERIES ENGINEERING ==================== -->
      <section id="time-series" class="topic-section">
        <h2>Time-Series Feature Engineering</h2>
        <p>Time-series data assumes that past values influence future values. We cannot simply shuffle rows; order
          matters. We must engineer features that capture chronological patterns.</p>

        <h3>Mathematical Foundations</h3>
        <div class="info-card">
          <strong>Lag Features:</strong> Shifting the target variable back by $k$ steps. "What was yesterday's
          sales?"<br>
          $X_{lag\_k,\,t} = Y_{t-k}$, e.g. $X_{lag\_1,\,t} = Y_{t-1}$<br><br>
          <strong>Rolling Windows:</strong> Computing statistics over a moving window of past data. Smooths out
          short-term fluctuations to reveal trends.<br>
          • Simple Moving Average (SMA) for window $w$:
          <div
            style="background: rgba(0,0,0,0.2); padding: 15px; border-radius: 8px; text-align: center; margin: 15px 0; font-size: 1.1em; color: #e4e6eb;">
            $$ SMA_t = \frac{1}{w} \sum_{i=1}^{w} Y_{t-i} $$
          </div>
          <strong>Expanding Windows:</strong> Computes statistics from the very beginning of the dataset up to the
          current point $t$ (e.g., cumulative sum or cumulative max).
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>time_series.py - Lags and Rolling Windows</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import pandas as pd

# Assuming 'df' is sorted chronologically and indexed by Date
# 1. Lag Features (Looking back in time)
# What was the value 1 day ago? 7 days ago?
df['sales_lag_1'] = df['sales'].shift(1)
df['sales_lag_7'] = df['sales'].shift(7)

# 2. Rolling Window Features
# The average and standard deviation over the last 7 days
df['sales_rolling_mean_7d'] = df['sales'].rolling(window=7).mean()
df['sales_rolling_std_7d'] = df['sales'].rolling(window=7).std()

# 3. Expanding Window Features
# Running (cumulative) maximum of sales from the start of the series
df['sales_expanding_max'] = df['sales'].expanding().max()

# Drop NaNs generated by shifting/rolling
df.dropna(inplace=True)</code></pre>
        </div>
      </section>

      <!-- ===================== 14. TARGET LEAKAGE ========================= -->
      <section id="target-leakage" class="topic-section">
        <h2>Target Leakage (Data Leakage)</h2>
        <p>Data leakage occurs when information that will not be available at prediction time (or that belongs to the
          test set) is used to create the model. The result is deceptively strong performance during
          training/validation followed by failure in the real world.</p>

        <div class="callout callout--mistake">⚠️ The most common cause of leakage is performing feature engineering
          (Scaling, Imputing, TF-IDF) on the ENTIRE dataset <strong>before</strong> calling train_test_split.</div>

        <div class="info-card" style="margin-top: 20px; border-left-color: #ff3366;">
          <h3 style="margin-top: 0; color: #ff3366;">🧠 Under the Hood: The Contamination Problem</h3>
          <p>Imagine using <code>StandardScaler</code> on your entire dataset. The scaler calculates the global $\mu$
            (mean) and $\sigma$ (standard deviation) to scale the data.</p>
          <p>If you split the data <em>after</em> scaling, your Training Data has been transformed using a mean and
            standard deviation computed partly from the Test Data. The Test Data is supposed to be completely unseen,
            but you just "leaked" its summary statistics into the training process.</p>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>leakage.py - The Golden Rule of Fit vs Transform</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# ❌ BAD PRACTICE (Creates Leakage)
scaler_bad = StandardScaler()
X_scaled_bad = scaler_bad.fit_transform(X) # Entire dataset sees the scaler
X_train_bad, X_test_bad = train_test_split(X_scaled_bad)

# ✅ GOOD PRACTICE (No Leakage)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

scaler = StandardScaler()
# Fit ONLY on the training data to learn parameters (mean, std)
X_train_scaled = scaler.fit_transform(X_train) 

# Transform test data using the parameters learned from the training data
X_test_scaled = scaler.transform(X_test)</code></pre>
        </div>
        <div class="callout callout--tip">✅ The easiest way to mathematically prevent leakage in production is to
          package all your feature engineering steps inside a <strong>Scikit-Learn Pipeline</strong>.</div>
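        <p>A minimal sketch of that idea, assuming a numeric feature matrix <code>X</code> and labels <code>y</code>
          (placeholders): because the scaler lives inside the pipeline, every cross-validation fold re-fits it on that
          fold's training portion only.</p>
        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>pipeline.py - Packaging preprocessing inside a Pipeline</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# X, y: placeholder numeric features and labels
# All preprocessing lives inside the pipeline, so fitting only ever sees training folds
pipe = Pipeline([
    ("scaler", StandardScaler()),
    ("model", LogisticRegression(max_iter=1000)),
])

# cross_val_score clones and re-fits the whole pipeline on each fold: no leakage
scores = cross_val_score(pipe, X, y, cv=5)
print(scores.mean())</code></pre>
        </div>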
      </section>

      <!-- ================ 15. AUTOMATED FEATURE ENGINEERING =============== -->
      <section id="automated-fe" class="topic-section">
        <h2>Automated Feature Engineering</h2>
        <p>In complex, multi-table relational databases, manually creating features is incredibly tedious. Automated
          Feature Engineering relies on algorithms to automatically synthesize hundreds of new features from relational
          datasets.</p>

        <h3>Deep Feature Synthesis (DFS)</h3>
        <p>DFS stacks mathematical primitives (like computing sums, counts, averages, and time-since-last-event) across
          entity relationships (e.g., Customers $\xrightarrow{\text{1 to M}}$ Orders $\xrightarrow{\text{1 to M}}$
          Order_Items).</p>

        <div class="info-card">
          <strong>Real Example:</strong> From a raw database of e-commerce transactions, DFS can automatically generate
          complex features like: <em>"The average value of a customer's orders over the last 30 days"</em> or <em>"The
            standard deviation of time between a user's logins."</em>
        </div>

        <div class="code-block" style="margin-top: 20px;">
          <div class="code-header">
            <span>autofe.py - Featuretools Library</span>
            <button class="copy-btn" onclick="copyCode(this)">Copy</button>
          </div>
          <pre><code>import featuretools as ft

# Assume we have two Pandas DataFrames: clients_df and loans_df
# Step 1: Create an EntitySet (a representation of your database)
es = ft.EntitySet(id="banking")

# Step 2: Add dataframes to the EntitySet with primary keys
es = es.add_dataframe(dataframe_name="clients", dataframe=clients_df, index="client_id")
es = es.add_dataframe(dataframe_name="loans", dataframe=loans_df, index="loan_id")

# Step 3: Define relational joins (Foreign Keys)
es = es.add_relationship("clients", "client_id", "loans", "client_id")

# Step 4: Run Deep Feature Synthesis!
# Automatically generates aggregate features for each client based on their loan history
feature_matrix, feature_defs = ft.dfs(
    entityset=es,
    target_dataframe_name="clients",
    agg_primitives=["mean", "sum", "mode", "std"],
    trans_primitives=["month", "hour"],
    max_depth=2 # Stacks primitives up to 2 layers deep
)

print(f"Automatically generated {len(feature_defs)} features!")</code></pre>
        </div>
      </section>
    </main>
  </div>

  <script src="app.js" defer></script>
</body>

</html>