diff --git a/.gitattributes b/.gitattributes
index a6344aac8c09253b3b630fb776ae94478aa0275b..8289721585bddb0cc4a8219829b9fbedaba81923 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
+Static/decision_tree.png filter=lfs diff=lfs merge=lfs -text
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000000000000000000000000000000000000..bc74ffd48e6878ef23682886273df98b60631298
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,57 @@
+# # Use official Python base image
+# FROM python:3.10-slim
+
+# # Avoid Python buffering
+# ENV PYTHONUNBUFFERED=1
+
+# # Set work directory
+# WORKDIR /app
+
+# # Install system dependencies
+# RUN apt-get update && apt-get install -y \
+# build-essential \
+# git \
+# curl \
+# && rm -rf /var/lib/apt/lists/*
+
+# # Copy requirements.txt and install
+# COPY requirements.txt .
+# RUN pip install --upgrade pip && pip install -r requirements.txt
+
+# # Copy project files
+# COPY . .
+
+# # Expose port (Hugging Face expects 7860 by default, but Flask usually runs 5000)
+# EXPOSE 5000
+
+# # Set environment variable for Flask
+# ENV PORT=5000
+# ENV FLASK_APP=app.py
+
+# # Run Flask
+# CMD ["flask", "run", "--host", "0.0.0.0", "--port", "5000"]
+
+# Use lightweight Python image
+FROM python:3.10-slim
+
+# Environment variables
+ENV PYTHONUNBUFFERED=1
+ENV TF_CPP_MIN_LOG_LEVEL=2
+
+# Set working directory
+WORKDIR /app
+
+# Copy and install dependencies first (cache-friendly)
+COPY requirements.txt .
+RUN pip install --no-cache-dir --upgrade pip \
+ && pip install --no-cache-dir -r requirements.txt
+
+# Copy app code
+COPY . .
+
+# Render provides PORT automatically — DO NOT hardcode
+CMD ["python", "app.py"]
+
+
+
+
diff --git a/Models/label_encoder.joblib b/Models/label_encoder.joblib
new file mode 100644
index 0000000000000000000000000000000000000000..1680efdb8ebedd1b8e022c15d5ad345db74e42dc
--- /dev/null
+++ b/Models/label_encoder.joblib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:865b0dbea4a93bf730753224d4e047c046ae29bf9b2aea0c7be7d49117a886bc
+size 585
diff --git a/Models/label_encoder.pkl b/Models/label_encoder.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..24ba994a9672fe8431087c6af85cbffe12077c93
--- /dev/null
+++ b/Models/label_encoder.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:42d9d0139ea16bc79a275b08e1e97c8c3075f91279b211fa3a635786f26c015e
+size 592
diff --git a/Models/liar_vectorizer.joblib b/Models/liar_vectorizer.joblib
new file mode 100644
index 0000000000000000000000000000000000000000..5612a4f440b18ee4bea389c46b1a4b98ed1fe66a
--- /dev/null
+++ b/Models/liar_vectorizer.joblib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c26d15ecdbe5770f3b01b015d4ebb565d20e9e3a9a477b397a875857812a7cf4
+size 184539
diff --git a/Models/linear_model.pkl b/Models/linear_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e5c499c475cbcea73a89e68c987b7cd1eb246dd1
--- /dev/null
+++ b/Models/linear_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e686db9126ad24dbdd3eaee6b9915cce209e0c703e3279c23787cdb3f1fa6e7a
+size 577
diff --git a/Models/logistic_model.pkl b/Models/logistic_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1836891ba91c462078c9c94072c0585321e17cff
--- /dev/null
+++ b/Models/logistic_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:57c8921a04cc148eb213bc4e1d21bf7d4e027401ea0dbe272567d6d6dd12d920
+size 40863
diff --git a/Models/logvectorizer.pkl b/Models/logvectorizer.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..6d2e05deb3c9e89749d2027c79ff50bc508278cd
--- /dev/null
+++ b/Models/logvectorizer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e51b1d8b6c8975d5469c9c7540af43fab5ac2bdce0008d7109cfdab4fd481917
+size 160142
diff --git a/Models/nb_url_model.pkl b/Models/nb_url_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1ccc3f73210bcab0041967645b8572bab5f3c7e4
--- /dev/null
+++ b/Models/nb_url_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c88eb0fb2fb0b99144d1f59e4a9868a5a09c2143649a2e5611931f9271fadf11
+size 22222423
diff --git a/Models/nb_url_vectorizer.pkl b/Models/nb_url_vectorizer.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..e0372ec956b4d1681e1521183572c5f58df789b9
--- /dev/null
+++ b/Models/nb_url_vectorizer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1930da4d5837bdbf45094b03047a1c9a4febd8d37871b08dc4259fe7d723e852
+size 14448425
diff --git a/Models/poly_model.pkl b/Models/poly_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..1b316c1bfb2bc142f15c6858147b95ac25675f95
--- /dev/null
+++ b/Models/poly_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:56c2a0cbd48a3349e662adb7120e361a3d31c11e457690a3315f778c5eac10f2
+size 609
diff --git a/Models/poly_transform.pkl b/Models/poly_transform.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..fd9e20f58be4252119847a942ae2e401a0eab396
--- /dev/null
+++ b/Models/poly_transform.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9dc1dfc5979d069bdb7c33289547d02668adfe29739f31519cef264c1bb1b57
+size 255
diff --git a/Models/rf_model.pkl b/Models/rf_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..641b8e1acd1013166d8bc7e705c91694a02090fa
--- /dev/null
+++ b/Models/rf_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:cf25c22e41534fe505d74c6c5cd7e6e6cf5a0d76fa75f1bb58df2c949ee58a5a
+size 102017
diff --git a/Models/ridge_model.pkl b/Models/ridge_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..95d9fb8131371113f8edb965b6336363f76da737
--- /dev/null
+++ b/Models/ridge_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec7a5044c24138f0ce707e2a9b0e936c8a44a87009dbe2039fffa52dfd6ddab2
+size 593
diff --git a/Models/ridge_scaler.pkl b/Models/ridge_scaler.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..89de9cccb44f605b3d9152c621afb381bb9e845f
--- /dev/null
+++ b/Models/ridge_scaler.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:700c6f8cd087cd8183e3e923406b37414e106068de5d335378bce049081b1862
+size 1039
diff --git a/Models/supervised_model.pkl b/Models/supervised_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..b8c1114bf690a87fe508a0db5ba5d754885394a7
--- /dev/null
+++ b/Models/supervised_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:563a76099009bb0d525d7178e6901903bb38037b80e354ceb0fed0697e755f92
+size 576
diff --git a/Models/svr_model.pkl b/Models/svr_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..4dde34fc8d62db6e78015d0f86d26cf815e4f966
--- /dev/null
+++ b/Models/svr_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a53c8cb144b241a532575dd98f7e0a3a00cb96e0e1b86ca3865aa420a08fd47c
+size 42141
diff --git a/Models/svr_scaler_X.pkl b/Models/svr_scaler_X.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..f92df23e8f9c63be87886a3f84767df6decb3364
--- /dev/null
+++ b/Models/svr_scaler_X.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd01177f9512e5b7165080eac192c4434001a0650911d1af49603245dd395372
+size 722
diff --git a/Models/svr_scaler_y.pkl b/Models/svr_scaler_y.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..c0546e7b5b26b7313e8bec7da396c78567d80e7b
--- /dev/null
+++ b/Models/svr_scaler_y.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2e9f5568bef89410981f949a8af69c55bd631fc0a5166d48ff52014629bc6956
+size 474
diff --git a/Models/tfidf_vectorizer.pkl b/Models/tfidf_vectorizer.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..6e45483690ff53c6311cb4837827c93cdaf043c9
--- /dev/null
+++ b/Models/tfidf_vectorizer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:909c6d43daee911d09fc013149f2a7cbf2da5afbdb8ae01f8057641bde4f8ce7
+size 226415
diff --git a/Models/url_vectorizer.pkl b/Models/url_vectorizer.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..8bceed947c866781a933159939a9cb75186f3468
--- /dev/null
+++ b/Models/url_vectorizer.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1137c32cf449a9820f0128cb5b170e480b38169639c94e75f15fff578abb9df8
+size 140312
diff --git a/Models/vectorizer.joblib b/Models/vectorizer.joblib
new file mode 100644
index 0000000000000000000000000000000000000000..695b529410c6c93f5d2c48bdab0d201bca8a402a
--- /dev/null
+++ b/Models/vectorizer.joblib
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b69e7d0c042a50411c148bec8240a3756aa7d2057931c55a85659d673c1bc8e6
+size 183179
diff --git a/Models/voting_url_model.pkl b/Models/voting_url_model.pkl
new file mode 100644
index 0000000000000000000000000000000000000000..17dc686d3c58748ec6dcbf8b1818634a01c06c91
--- /dev/null
+++ b/Models/voting_url_model.pkl
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7eeb355c7e7339439d73909b118de4befd5257d2a04208d1d0a36bd71f52f57c
+size 8767014
diff --git a/README.md b/README.md
index ef1c7a87e436865942a24a52d75bc88c27c590b7..ef7d322d9ccc4d0bb947042b016b5c21301df5ee 100644
--- a/README.md
+++ b/README.md
@@ -1,11 +1,2 @@
----
-title: Machinelearningalgorithms
-emoji: 🏢
-colorFrom: yellow
-colorTo: green
-sdk: docker
-pinned: false
-short_description: machinelearningalgor
----
-
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+# machine-learning
+it is a machine learning website
diff --git a/Static/decision_tree.png b/Static/decision_tree.png
new file mode 100644
index 0000000000000000000000000000000000000000..04372501b0ac3f37db52cfda60bbcaf04ec500ac
--- /dev/null
+++ b/Static/decision_tree.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6b8cdfb3ac950b37d03f7ddabd674789f9509bb23b2d3ff61bcd228a72bfafd8
+size 328793
diff --git a/Static/js/lasso_charts.js b/Static/js/lasso_charts.js
new file mode 100644
index 0000000000000000000000000000000000000000..47564f51a2b893988826c403efbef8b5d0d20df6
--- /dev/null
+++ b/Static/js/lasso_charts.js
@@ -0,0 +1,225 @@
+document.addEventListener('DOMContentLoaded', function() {
+ console.log("lasso_charts.js loaded and DOM fully parsed.");
+
+ const form = document.getElementById('predictionForm');
+ const loadingSpinner = document.getElementById('loadingSpinner');
+
+ form.addEventListener('submit', function() {
+ loadingSpinner.classList.remove('hidden'); // Show loading spinner
+ });
+
+ // --- Example for a Coefficient Path Chart (Conceptual with Chart.js) ---
+ const alphaValues = [0.01, 0.1, 0.5, 1, 2, 5, 10];
+ const dummyCoefficients = {
+ 'OverallQual': [0.8, 0.7, 0.5, 0.3, 0.1, 0, 0],
+ 'GrLivArea': [1.2, 1.1, 0.9, 0.7, 0.5, 0.2, 0.1],
+ 'GarageCars': [0.5, 0.4, 0.3, 0.1, 0, 0, 0],
+ 'TotalBsmtSF': [0.6, 0.5, 0.4, 0.2, 0.1, 0.05, 0],
+ 'YearBuilt': [0.3, 0.2, 0.1, 0.05, 0, 0, 0]
+ };
+
+ const ctxCoeff = document.getElementById('coefficientPathChart');
+ if (ctxCoeff) {
+ new Chart(ctxCoeff, {
+ type: 'line',
+ data: {
+ labels: alphaValues.map(a => `λ=${a}`),
+ datasets: Object.keys(dummyCoefficients).map(feature => ({
+ label: feature,
+ data: dummyCoefficients[feature],
+ borderColor: getRandomColor(),
+ fill: false,
+ tension: 0.1
+ }))
+ },
+ options: {
+ responsive: true,
+ maintainAspectRatio: false,
+ plugins: {
+ title: {
+ display: true,
+ text: 'Coefficient Path for Different Lambda (α) Values'
+ },
+ tooltip: {
+ mode: 'index',
+ intersect: false,
+ },
+ },
+ scales: {
+ x: {
+ title: {
+ display: true,
+ text: 'Regularization Strength (λ)'
+ }
+ },
+ y: {
+ title: {
+ display: true,
+ text: 'Coefficient Value'
+ }
+ }
+ }
+ }
+ });
+ }
+
+ // --- Example for Feature Importance Bar Chart (Conceptual with Chart.js) ---
+ const finalCoefficients = {
+ 'OverallQual': 0.65,
+ 'GrLivArea': 0.82,
+ 'GarageCars': 0.15,
+ 'TotalBsmtSF': 0.38,
+ 'YearBuilt': 0.07
+ };
+ const featureLabels = Object.keys(finalCoefficients);
+ const featureValues = Object.values(finalCoefficients).map(Math.abs);
+
+ const ctxFeature = document.getElementById('featureImportanceChart');
+ if (ctxFeature) {
+ new Chart(ctxFeature, {
+ type: 'bar',
+ data: {
+ labels: featureLabels,
+ datasets: [{
+ label: 'Absolute Coefficient Value',
+ data: featureValues,
+ backgroundColor: 'rgba(54, 162, 235, 0.7)',
+ borderColor: 'rgba(54, 162, 235, 1)',
+ borderWidth: 1
+ }]
+ },
+ options: {
+ responsive: true,
+ maintainAspectRatio: false,
+ plugins: {
+ title: {
+ display: true,
+ text: 'Feature Importance (Absolute Coefficients)'
+ },
+ legend: {
+ display: false
+ }
+ },
+ scales: {
+ y: {
+ beginAtZero: true,
+ title: {
+ display: true,
+ text: 'Absolute Coefficient Value'
+ }
+ },
+ x: {
+ title: {
+ display: true,
+ text: 'Features'
+ }
+ }
+ }
+ }
+ });
+ }
+
+ // --- Example for Predicted vs Actual Chart (Conceptual with Chart.js) ---
+ const actualPrices = [200000, 250000, 180000, 300000, 220000, 270000, 190000, 310000];
+ const predictedPrices = [210000, 245000, 175000, 310000, 215000, 280000, 195000, 300000];
+ const dataPoints = actualPrices.map((actual, index) => ({
+ x: actual,
+ y: predictedPrices[index]
+ }));
+
+ const ctxPredActual = document.getElementById('predictionActualChart');
+ if (ctxPredActual) {
+ new Chart(ctxPredActual, {
+ type: 'scatter',
+ data: {
+ datasets: [{
+ label: 'Predicted vs. Actual',
+ data: dataPoints,
+ backgroundColor: 'rgba(75, 192, 192, 0.8)',
+ pointRadius: 5
+ }, {
+ label: 'Ideal Prediction',
+ data: [{x: Math.min(...actualPrices, ...predictedPrices), y: Math.min(...actualPrices, ...predictedPrices)},
+ {x: Math.max(...actualPrices, ...predictedPrices), y: Math.max(...actualPrices, ...predictedPrices)}],
+ borderColor: 'rgba(255, 99, 132, 0.8)',
+ borderWidth: 2,
+ pointRadius: 0,
+ type: 'line',
+ fill: false,
+ tension: 0
+ }]
+ },
+ options: {
+ responsive: true,
+ maintainAspectRatio: false,
+ plugins: {
+ title: {
+ display: true,
+ text: 'Predicted vs. Actual Prices'
+ },
+ tooltip: {
+ callbacks: {
+ label: function(context) {
+ return `Actual: $${context.parsed.x}, Predicted: $${context.parsed.y}`;
+ }
+ }
+ }
+ },
+ scales: {
+ x: {
+ type: 'linear',
+ position: 'bottom',
+ title: {
+ display: true,
+ text: 'Actual Price ($)'
+ }
+ },
+ y: {
+ type: 'linear',
+ position: 'left',
+ title: {
+ display: true,
+ text: 'Predicted Price ($)'
+ }
+ }
+ }
+ }
+ });
+ }
+
+ // Helper function to get a random color for line charts
+ function getRandomColor() {
+ const letters = '0123456789ABCDEF';
+ let color = '#';
+ for (let i = 0; i < 6; i++) {
+ color += letters[Math.floor(Math.random() * 16)];
+ }
+ return color;
+ }
+
+ // --- IMPORTANT: How to get real data from your Flask/Python backend ---
+ // You would typically fetch data using JavaScript's Fetch API after the page loads,
+ // or by embedding data directly into the HTML from your Jinja2 template.
+
+ // Example of fetching data (if your Flask app has an /api/charts endpoint)
+ /*
+ fetch('/api/charts/coefficient_path_data')
+ .then(response => response.json())
+ .then(data => {
+ // Use 'data' to render your coefficient path chart
+ // e.g., update the Chart.js data object and call chart.update()
+ console.log("Received coefficient path data:", data);
+ })
+ .catch(error => console.error('Error fetching chart data:', error));
+ */
+
+ // Example of embedding data (if passed directly from Flask view)
+ // In your Flask view:
+ // return render_template('lasso_regression.html', prediction=..., chart_data_json=json.dumps(your_data))
+ // In lasso_regression.html:
+    // <script>const chartData = {{ chart_data_json | safe }};</script>
+ // In lasso_charts.js:
+ // console.log(chartData); // Use this data directly for charts
+
+
+});
diff --git a/Static/js/linear.js b/Static/js/linear.js
new file mode 100644
index 0000000000000000000000000000000000000000..4a64b1b217caf4f6f4ec649fa696f704fc7f5cdc
--- /dev/null
+++ b/Static/js/linear.js
@@ -0,0 +1,263 @@
+// Get canvas and context
+const canvas = document.getElementById('regressionCanvas');
+const ctx = canvas.getContext('2d');
+
+// Data from your Python script (X, y)
+// These are hardcoded here for visualization purposes.
+// In a real advanced app, these might be dynamically loaded.
+const X_data = [1, 2, 3, 4, 5];
+const y_data = [35, 45, 55, 65, 75];
+
+// --- Understanding Slope (m) and Intercept (b) ---
+// For a perfect linear relationship as in your data,
+// we can manually calculate slope (m) and intercept (b).
+// In a real-world scenario with scattered data, the scikit-learn
+// LinearRegression model uses more advanced statistical methods
+// (like Ordinary Least Squares) to find the 'best fit' line
+// that minimizes the squared differences between actual and predicted y values.
+
+// Calculate Slope (m):
+// m = (y2 - y1) / (x2 - x1)
+// Using points (1, 35) and (2, 45):
+// m = (45 - 35) / (2 - 1) = 10 / 1 = 10
+const slope = 10;
+
+// Calculate Intercept (b):
+// b = y - m * x
+// Using point (1, 35) and calculated slope m=10:
+// b = 35 - (10 * 1) = 35 - 10 = 25
+const intercept = 25;
+
+// Display slope and intercept values in the HTML
+document.getElementById('slopeValue').textContent = slope.toFixed(2);
+document.getElementById('interceptValue').textContent = intercept.toFixed(2);
+
+// Canvas dimensions and padding
+let canvasWidth, canvasHeight;
+const padding = 50;
+
+// Scale factors for drawing data onto the canvas
+let xScale, yScale;
+let xMin, xMax, yMin, yMax;
+
+// Prediction variables (these will be updated when the user inputs hours)
+let predictedHours = null;
+let predictedScore = null;
+
+// Function to set up scaling based on data range and canvas size
+function setupScaling() {
+ canvasWidth = canvas.width;
+ canvasHeight = canvas.height;
+
+ // Determine data ranges for X and Y axes
+ xMin = Math.min(...X_data, 0); // Always start X-axis at 0
+ // Set xMax to at least 10 (as per the last request) and ensure it covers any new predicted hours
+ xMax = Math.max(...X_data, predictedHours !== null ? predictedHours : 0, 10) + 1; // Extend x-axis slightly beyond 10
+
+ yMin = Math.min(...y_data, 0); // Always start Y-axis at 0
+ // Calculate the predicted score for the determined xMax to ensure the y-axis covers the line
+ const maxPredictedY = slope * xMax + intercept;
+ yMax = Math.max(...y_data, predictedScore !== null ? predictedScore : 0, maxPredictedY) + 20; // Extend y-axis slightly beyond max needed
+
+ // Calculate scaling factors to fit data within the canvas padding
+ xScale = (canvasWidth - 2 * padding) / (xMax - xMin);
+ yScale = (canvasHeight - 2 * padding) / (yMax - yMin);
+}
+
+// Convert data coordinates (e.g., hours, score) to canvas pixel coordinates
+function toCanvasX(x) {
+ return padding + (x - xMin) * xScale;
+}
+
+function toCanvasY(y) {
+ return canvasHeight - padding - (y - yMin) * yScale;
+}
+
+// Function to draw the entire graph, including data points, regression line, and predictions
+function drawGraph() {
+ ctx.clearRect(0, 0, canvasWidth, canvasHeight); // Clear the entire canvas
+
+ // Draw axes
+ ctx.beginPath();
+ ctx.strokeStyle = '#64748b'; // Slate gray for axes
+ ctx.lineWidth = 2;
+
+ // X-axis (horizontal line)
+ ctx.moveTo(padding, toCanvasY(yMin));
+ ctx.lineTo(canvasWidth - padding, toCanvasY(yMin));
+ // Y-axis (vertical line)
+ ctx.moveTo(toCanvasX(xMin), padding);
+ ctx.lineTo(toCanvasX(xMin), canvasHeight - padding);
+ ctx.stroke();
+
+ // Draw axis labels and ticks
+ ctx.fillStyle = '#475569'; // Darker gray for labels
+ ctx.font = '14px Inter';
+ ctx.textAlign = 'center';
+ ctx.textBaseline = 'top';
+
+ // X-axis labels (Hours Studied)
+ // Dynamic tick step for clarity on different scales
+ const xTickStep = 1; // Every 1 hour for a graph up to 10
+ for (let i = Math.ceil(xMin / xTickStep) * xTickStep; i <= Math.floor(xMax); i += xTickStep) {
+ if (i >= 0) {
+ ctx.fillText(i + 'h', toCanvasX(i), canvasHeight - padding + 10);
+ ctx.beginPath();
+ ctx.moveTo(toCanvasX(i), canvasHeight - padding);
+ ctx.lineTo(toCanvasX(i), canvasHeight - padding - 5);
+ ctx.stroke();
+ }
+ }
+ // X-axis title
+ ctx.fillText('Hours Studied', canvasWidth / 2, canvasHeight - 20);
+
+ ctx.textAlign = 'right';
+ ctx.textBaseline = 'middle';
+ // Y-axis labels (Score)
+ // Dynamic tick step for clarity on different scales
+ const yTickStep = (yMax - yMin) / 10 > 20 ? 50 : 20; // Example: every 20 or 50 points
+ for (let i = Math.ceil(yMin / yTickStep) * yTickStep; i <= Math.floor(yMax); i += yTickStep) {
+ if (i >= 0) {
+ ctx.fillText(i.toFixed(0), padding - 10, toCanvasY(i));
+ ctx.beginPath();
+ ctx.moveTo(padding, toCanvasY(i));
+ ctx.lineTo(padding + 5, toCanvasY(i));
+ ctx.stroke();
+ }
+ }
+ // Y-axis title (rotated)
+ ctx.save();
+ ctx.translate(20, canvasHeight / 2);
+ ctx.rotate(-Math.PI / 2);
+ ctx.textAlign = 'center';
+ ctx.fillText('Score', 0, 0);
+ ctx.restore();
+
+
+ // Draw data points (blue circles)
+ ctx.fillStyle = '#3b82f6'; // Blue for data points
+ X_data.forEach((x, i) => {
+ ctx.beginPath();
+ ctx.arc(toCanvasX(x), toCanvasY(y_data[i]), 5, 0, Math.PI * 2); // Radius 5
+ ctx.fill();
+ });
+
+ // Draw regression line (red line)
+ ctx.beginPath();
+ ctx.strokeStyle = '#ef4444'; // Red for regression line
+ ctx.lineWidth = 3;
+ // Draw line across the entire X-axis range based on the model equation
+ ctx.moveTo(toCanvasX(xMin), toCanvasY(slope * xMin + intercept));
+ ctx.lineTo(toCanvasX(xMax), toCanvasY(slope * xMax + intercept));
+ ctx.stroke();
+
+ // Draw predicted point and lines if available (green point and dashed lines)
+ if (predictedHours !== null && predictedScore !== null) {
+ const predX = toCanvasX(predictedHours);
+ const predY = toCanvasY(predictedScore);
+
+ // Predicted point
+ ctx.fillStyle = '#22c55e'; // Green for predicted point
+ ctx.beginPath();
+ ctx.arc(predX, predY, 6, 0, Math.PI * 2); // Slightly larger radius
+ ctx.fill();
+
+ // Dotted lines to axes
+ ctx.strokeStyle = '#22c55e'; // Green for dotted lines
+ ctx.lineWidth = 1.5;
+ ctx.setLineDash([5, 5]); // Dotted line style
+
+ // Line from predicted point to X-axis
+ ctx.beginPath();
+ ctx.moveTo(predX, predY);
+ ctx.lineTo(predX, toCanvasY(yMin));
+ ctx.stroke();
+
+ // Line from predicted point to Y-axis
+ ctx.beginPath();
+ ctx.moveTo(predX, predY);
+ ctx.lineTo(toCanvasX(xMin), predY);
+ ctx.stroke();
+
+ ctx.setLineDash([]); // Reset line dash to solid for subsequent drawings
+ }
+}
+
+// Event listener for the "Predict Score" button click
+document.getElementById('predictBtn').addEventListener('click', () => {
+ // Get the value from the input field and parse it as a floating-point number
+ const hoursInput = parseFloat(document.getElementById('hoursInput').value);
+
+ // Check if the input is a valid number
+ if (!isNaN(hoursInput)) {
+ // Update global prediction variables
+ predictedHours = hoursInput;
+ predictedScore = slope * predictedHours + intercept;
+
+ // Display the predicted score in the HTML
+ document.getElementById('predictedScore').textContent = predictedScore.toFixed(2);
+ // Make the prediction output box visible
+ document.getElementById('predictionOutput').classList.remove('hidden');
+
+ // Recalculate scaling and redraw the graph to accommodate new prediction if it extends axes
+ setupScaling();
+ drawGraph();
+ } else {
+ // If input is invalid, display an error message
+ const outputDiv = document.getElementById('predictionOutput');
+        outputDiv.innerHTML =
+            '<p class="text-red-500">' +
+            'Please enter a valid number for hours studied.</p>';
+ outputDiv.classList.remove('hidden');
+ }
+});
+
+// Function to handle canvas resizing and redraw the graph
+function resizeCanvas() {
+ // Get the device pixel ratio for sharper rendering on high-DPI screens
+ const dpi = window.devicePixelRatio;
+ // Get the actual rendered size of the canvas element from its CSS styles
+ const rect = canvas.getBoundingClientRect();
+
+ // Set the internal drawing buffer size of the canvas
+ canvas.width = rect.width * dpi;
+ canvas.height = rect.height * dpi;
+
+ // Scale the drawing context to match the DPI, ensuring crisp lines and text
+ ctx.scale(dpi, dpi);
+
+ // Re-setup scaling for data to canvas coordinates and redraw
+ setupScaling();
+ drawGraph();
+}
+
+// Initial setup and draw when the window loads
+window.addEventListener('load', () => {
+ resizeCanvas(); // Set initial canvas size and draw
+ // Also trigger an initial prediction for the default value in the input field
+ const initialHours = parseFloat(document.getElementById('hoursInput').value);
+ if (!isNaN(initialHours)) {
+ predictedHours = initialHours;
+ predictedScore = slope * initialHours + intercept;
+ document.getElementById('predictedScore').textContent = predictedScore.toFixed(2);
+ document.getElementById('predictionOutput').classList.remove('hidden');
+ setupScaling();
+ drawGraph();
+ }
+});
+
+// Redraw the graph whenever the window is resized
+window.addEventListener('resize', resizeCanvas);
+
+// Optional: Allow clicking on canvas to set hours input (for quick testing)
+canvas.addEventListener('click', (event) => {
+ // Get mouse click coordinates relative to the canvas
+ const rect = canvas.getBoundingClientRect();
+ const mouseX = (event.clientX - rect.left) / (canvas.width / canvas.getBoundingClientRect().width);
+ const mouseY = (event.clientY - rect.top) / (canvas.height / canvas.getBoundingClientRect().height); // Corrected this line
+
+ // Convert canvas X coordinate back to data X (hours studied)
+ const clickedHours = xMin + (mouseX - padding) / xScale;
+ // Update the input field with the clicked hours
+ document.getElementById('hoursInput').value = clickedHours.toFixed(1);
+ // Trigger the prediction immediately
+ document.getElementById('predictBtn').click();
+});
diff --git a/Static/js/poly.js b/Static/js/poly.js
new file mode 100644
index 0000000000000000000000000000000000000000..9fad29d6c8a6ca9110f44fb7515ac46df8e52652
--- /dev/null
+++ b/Static/js/poly.js
@@ -0,0 +1,85 @@
+const canvas = document.getElementById("polyCanvas");
+const ctx = canvas.getContext("2d");
+
+const X_data = [1, 2, 3, 4, 5];
+const y_data = [3, 8, 15, 24, 35];
+
+function toCanvasX(x, xScale, padding) {
+ return padding + x * xScale;
+}
+
+function toCanvasY(y, yScale, padding, canvasHeight) {
+ return canvasHeight - padding - y * yScale;
+}
+
+function setupAndDraw(predX = null, predY = null) {
+ const padding = 50;
+ const canvasWidth = canvas.width = canvas.clientWidth;
+ const canvasHeight = canvas.height = canvas.clientHeight;
+
+ const xMax = 6;
+ const yMax = 40;
+
+ const xScale = (canvasWidth - 2 * padding) / xMax;
+ const yScale = (canvasHeight - 2 * padding) / yMax;
+
+ // Clear
+ ctx.clearRect(0, 0, canvasWidth, canvasHeight);
+
+ // Axes
+ ctx.beginPath();
+ ctx.moveTo(padding, toCanvasY(0, yScale, padding, canvasHeight));
+ ctx.lineTo(canvasWidth - padding, toCanvasY(0, yScale, padding, canvasHeight));
+ ctx.moveTo(toCanvasX(0, xScale, padding), padding);
+ ctx.lineTo(toCanvasX(0, xScale, padding), canvasHeight - padding);
+ ctx.strokeStyle = "#475569";
+ ctx.stroke();
+
+ // Points
+ ctx.fillStyle = "#3b82f6";
+ X_data.forEach((x, i) => {
+ ctx.beginPath();
+ ctx.arc(toCanvasX(x, xScale, padding), toCanvasY(y_data[i], yScale, padding, canvasHeight), 5, 0, 2 * Math.PI);
+ ctx.fill();
+ });
+
+ // Curve
+ ctx.beginPath();
+ ctx.moveTo(toCanvasX(0, xScale, padding), toCanvasY(0, yScale, padding, canvasHeight));
+ for (let x = 0; x <= xMax; x += 0.1) {
+ const y = x * x + 2 * x; // match your data (x^2 + 2x)
+ ctx.lineTo(toCanvasX(x, xScale, padding), toCanvasY(y, yScale, padding, canvasHeight));
+ }
+ ctx.strokeStyle = "#ef4444";
+ ctx.lineWidth = 2;
+ ctx.stroke();
+
+ // Predicted point
+ if (predX !== null && predY !== null) {
+ ctx.fillStyle = "#22c55e";
+ ctx.beginPath();
+ ctx.arc(toCanvasX(predX, xScale, padding), toCanvasY(predY, yScale, padding, canvasHeight), 6, 0, 2 * Math.PI);
+ ctx.fill();
+ }
+}
+
+// Prediction handler
+function predict() {
+ const hours = parseFloat(document.getElementById("hoursInput").value);
+ fetch("/predict_poly", {
+ method: "POST",
+ body: JSON.stringify({ hours }),
+ headers: {
+ "Content-Type": "application/json"
+ }
+ })
+ .then(res => res.json())
+ .then(data => {
+ const score = data.prediction;
+ document.getElementById("predictedScore").textContent = score;
+ document.getElementById("predictionOutput").classList.remove("hidden");
+ setupAndDraw(hours, score);
+ });
+}
+
+window.onload = () => setupAndDraw();
diff --git a/Static/knn.js b/Static/knn.js
new file mode 100644
index 0000000000000000000000000000000000000000..05d871b00be6c85c065877c1029f791367941467
--- /dev/null
+++ b/Static/knn.js
@@ -0,0 +1,71 @@
+let points = [
+ [2, 3, 0], [3, 4, 0], [1, 1, 0],
+ [7, 8, 1], [6, 9, 1], [8, 7, 1]
+]; // (x, y, label)
+let testPoint = [4.5, 5.5];
+
+const ctx = document.getElementById('knnChart').getContext('2d');
+const colors = ['#1f77b4', '#ff7f0e', '#2ca02c'];
+
+let chart = new Chart(ctx, {
+ type: 'scatter',
+ data: {
+ datasets: [
+ {
+ label: 'Class 0',
+ data: points.filter(p => p[2] === 0).map(p => ({ x: p[0], y: p[1] })),
+ backgroundColor: colors[0]
+ },
+ {
+ label: 'Class 1',
+ data: points.filter(p => p[2] === 1).map(p => ({ x: p[0], y: p[1] })),
+ backgroundColor: colors[1]
+ },
+ {
+ label: 'Test Point',
+ data: [{ x: testPoint[0], y: testPoint[1] }],
+ backgroundColor: 'black',
+ pointStyle: 'triangle',
+ radius: 7
+ }
+ ]
+ },
+ options: {
+ responsive: true,
+ plugins: {
+ legend: { position: 'top' },
+ title: { display: true, text: 'KNN Classification Plot' }
+ },
+ scales: {
+ x: { type: 'linear', position: 'bottom' },
+ y: { type: 'linear' }
+ }
+ }
+});
+
+async function sendToServer() {
+ const k = document.getElementById('k-value').value;
+
+ const response = await fetch('/knn_visual_predict', {
+ method: 'POST',
+ headers: { 'Content-Type': 'application/json' },
+ body: JSON.stringify({ points, test_point: testPoint, k })
+ });
+
+ const result = await response.json();
+
+ document.getElementById('output').innerHTML =
+ `Prediction: Class ${result.prediction}`;
+
+ // Highlight neighbors
+ const neighborLayer = {
+ label: 'Nearest Neighbors',
+ data: result.neighbors.map(p => ({ x: p[0], y: p[1] })),
+ backgroundColor: '#d62728',
+ pointStyle: 'rect',
+ radius: 6
+ };
+
+ chart.data.datasets = chart.data.datasets.slice(0, 3).concat([neighborLayer]);
+ chart.update();
+}
\ No newline at end of file
diff --git a/Static/svr_linear.png b/Static/svr_linear.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc45bd001e812fd36710618f14aaf9bb8564893f
Binary files /dev/null and b/Static/svr_linear.png differ
diff --git a/Static/svr_poly.png b/Static/svr_poly.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6f8e5140ce1740498889e4f78a0c5f0e2ed6f2f
Binary files /dev/null and b/Static/svr_poly.png differ
diff --git a/Static/svr_rbf.png b/Static/svr_rbf.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e8c10970323503afff5961485f8947bc178d006
Binary files /dev/null and b/Static/svr_rbf.png differ
diff --git a/Static/uploads/Figure_1.png b/Static/uploads/Figure_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..d28d7c2040d32a342c45002a5c61d160809c28dc
Binary files /dev/null and b/Static/uploads/Figure_1.png differ
diff --git a/Static/uploads/compressed_clean.jpg b/Static/uploads/compressed_clean.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..83c29b8410f4c79e3340bf986f72c3b16968b798
Binary files /dev/null and b/Static/uploads/compressed_clean.jpg differ
diff --git a/Static/uploads/digit_0.png b/Static/uploads/digit_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..b6d83e2532cf8600c113d87f7235809feada5261
Binary files /dev/null and b/Static/uploads/digit_0.png differ
diff --git a/Static/uploads/digit_4.png b/Static/uploads/digit_4.png
new file mode 100644
index 0000000000000000000000000000000000000000..cc6a668b2dbca47f95564049bac912587861baed
Binary files /dev/null and b/Static/uploads/digit_4.png differ
diff --git a/Static/uploads/download.jpg b/Static/uploads/download.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1d841d232b82936e30214276abc355c216566d97
Binary files /dev/null and b/Static/uploads/download.jpg differ
diff --git a/Static/uploads/download.png b/Static/uploads/download.png
new file mode 100644
index 0000000000000000000000000000000000000000..72884c4f6a7334db4247da7f83a72c917aff6179
Binary files /dev/null and b/Static/uploads/download.png differ
diff --git a/Static/uploads/download_1.jpg b/Static/uploads/download_1.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..4268a9f07c74cb50dad1be9690dbe29b5c1d8467
Binary files /dev/null and b/Static/uploads/download_1.jpg differ
diff --git a/Static/uploads/download_2.jpg b/Static/uploads/download_2.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..fb7e5a2c7943f3756ed2f75d6cb4658e0650b757
Binary files /dev/null and b/Static/uploads/download_2.jpg differ
diff --git a/Static/uploads/input.jpg b/Static/uploads/input.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..f8dba2828491ffd79ebc3cef45f42aca4158de43
Binary files /dev/null and b/Static/uploads/input.jpg differ
diff --git a/Static/uploads/kmeans.png b/Static/uploads/kmeans.png
new file mode 100644
index 0000000000000000000000000000000000000000..8bc454d4a69b5e791b2de5d7f305c1147eac6053
Binary files /dev/null and b/Static/uploads/kmeans.png differ
diff --git a/Static/uploads/test_digit.png b/Static/uploads/test_digit.png
new file mode 100644
index 0000000000000000000000000000000000000000..c0239664ca898de966ef32440cc2f8fa6e4c007c
Binary files /dev/null and b/Static/uploads/test_digit.png differ
diff --git a/Static/uploads/test_digit_8.png b/Static/uploads/test_digit_8.png
new file mode 100644
index 0000000000000000000000000000000000000000..65fd4e1bfb8201f82699ce08797014425b41b627
Binary files /dev/null and b/Static/uploads/test_digit_8.png differ
diff --git a/Static/uploads/test_digit_8_1.png b/Static/uploads/test_digit_8_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..ab7b29556adb894692bbe53059d78646b3a2c896
Binary files /dev/null and b/Static/uploads/test_digit_8_1.png differ
diff --git a/app.py b/app.py
new file mode 100644
index 0000000000000000000000000000000000000000..df8ef478037bc681b860ed1f7cadb76b5b867bb4
--- /dev/null
+++ b/app.py
@@ -0,0 +1,2373 @@
+from flask import Flask, render_template, request, jsonify
+import numpy as np
+import pandas as pd
+import joblib
+import os
+from sklearn.svm import SVR
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import mean_squared_error, r2_score
+from sklearn.neighbors import KNeighborsClassifier
+from sklearn.preprocessing import StandardScaler
+from sklearn.ensemble import RandomForestClassifier
+from sklearn.tree import DecisionTreeClassifier
+from sklearn import svm
+from sklearn.naive_bayes import GaussianNB # <--- Add this import
+from sklearn.feature_extraction.text import CountVectorizer
+from textblob import TextBlob
+import traceback
+from flask_cors import CORS
+from werkzeug.utils import secure_filename # For secure file names
+import io # To read CSV from memory
+import re
+from sklearn.cluster import KMeans, DBSCAN
+from PIL import Image
+import matplotlib.pyplot as plt
+from joblib import load # ✅ This is the missing line
+import traceback
+import pickle
+from sklearn.svm import SVC
+from sklearn.datasets import make_classification
+import plotly.graph_objs as go
+import json
+import requests
+from PIL import Image
+
+
+# from transformers import pipeline
+from dotenv import load_dotenv
+import os
+from urllib.parse import urlparse
+import tldextract
+import string
+
+
+#chatbotcode
+import zipfile
+import gdown
+import torch
+from transformers import AutoTokenizer, AutoModelForCausalLM
+from peft import PeftModel
+
+# #login
+# from flask import Flask
+# from flask_jwt_extended import JWTManager
+# from flask_login import LoginManager
+# from flask_mail import Mail
+# from flask_login import LoginManager
+# from flask_sqlalchemy import SQLAlchemy
+# from flask_mail import Mail
+# from auth.models import db, User
+# from auth.routes import auth
+# from flask_login import login_required
+
+
+
+
+#chatbotcode
+
+# from transformers import AutoTokenizer, AutoModelForSequenceClassification, pipeline
+
+# model_name = "microsoft/deberta-v3-small"
+
+# tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+# model = AutoModelForSequenceClassification.from_pretrained(model_name)
+
+# bert_checker = pipeline("text-classification", model=model, tokenizer=tokenizer)
+
# Load environment variables from .env
load_dotenv()
# Imports / setup for the spam-URL detection feature
import nltk, os

# Tell NLTK to also check the local nltk_data folder
nltk.data.path.append(os.path.join(os.path.dirname(__file__), "nltk_data"))

from nltk.corpus import words

# Load the English words corpus (used to spot gibberish tokens in URLs)
valid_words = set(words.words())
print("engineering" in valid_words) # ✅ Should be True (startup sanity check)
print("engineerigfnnxng" in valid_words) # ❌ Should be False (startup sanity check)
+import wordninja # Function to split words into valid parts
+import re
+from urllib.parse import urlparse
+from spellchecker import SpellChecker
+
+import wordninja
+# end urlspam
+import google.generativeai as genai
+
+# app.py
+# import streamlit as st
+# from load_file import load_file
+
+# st.title("Download HuggingFace Repo Files in Streamlit")
+
+# filename = st.text_input("Enter filename from repo:", "model.safetensors")
+
+# if st.button("Download"):
+# try:
+# local_path = load_file(filename)
+# st.success(f"✅ File downloaded to: {local_path}")
+# st.write("You can now use this file in your app.")
+# except Exception as e:
+# st.error(f"❌ Error: {str(e)}")
+
+
+# Set API key (no need to assign OpenAI() to client like that)
+# openai.api_key = os.getenv("OPENAI_API_KEY")
+
+# def ask_openai_scientific_validation(statement):
+# prompt = f"""Assess the scientific accuracy of: "{statement}"\nRespond with ✅ (possible) or ❌ (impossible), and explain simply."""
+
+# try:
+# client = OpenAI() # This is correct placement
+# response = client.chat.completions.create(
+# model="gpt-3.5-turbo",
+# messages=[
+# {"role": "system", "content": "You are a scientific fact-checker."},
+# {"role": "user", "content": prompt}
+# ],
+# temperature=0.7,
+# max_tokens=150
+# )
+
+
+# return response.choices[0].message.content.strip()
+
+# except Exception as e:
+# return f"⚠️ Could not verify:\n\n{str(e)}"
+
+
+ #huggung face code start
+
+
+# # =====================
+# # Replace your old model loads with this:
+# # =====================
+
+# # Models
+# knn_model = load_file("Models/knn_model.pkl")
+# lasso_model = load_file("Models/lasso_model.pkl")
+# liar_model = load_file("Models/liar_model.joblib")
+# linear_model = load_file("Models/linear_model.pkl")
+# logistic_model = load_file("Models/logistic_model.pkl")
+# nb_url_model = load_file("Models/nb_url_model.pkl")
+# poly_model = load_file("Models/poly_model.pkl")
+# rf_model = load_file("Models/rf_model.pkl")
+# ridge_model = load_file("Models/ridge_model.pkl")
+# supervised_model = load_file("Models/supervised_model.pkl")
+# svr_model = load_file("Models/svr_model.pkl")
+# voting_url_model = load_file("Models/voting_url_model.pkl")
+
+# # Vectorizers / Encoders / Scalers
+# label_classes = load_file("Models/label_classes.npy")
+# label_encoder = load_file("Models/label_encoder.pkl")
+# lasso_scaler = load_file("Models/lasso_scaler.pkl")
+# liar_vectorizer = load_file("Models/liar_vectorizer.joblib")
+# nb_url_vectorizer = load_file("Models/nb_url_vectorizer.pkl")
+# poly_transform = load_file("Models/poly_transform.pkl")
+# ridge_scaler = load_file("Models/ridge_scaler.pkl")
+# svr_scaler_X = load_file("Models/svr_scaler_X.pkl")
+# svr_scaler_y = load_file("Models/svr_scaler_y.pkl")
+# tfidf_vectorizer = load_file("Models/tfidf_vectorizer.pkl")
+# url_vectorizer = load_file("Models/url_vectorizer.pkl")
+# vectorizer_joblib = load_file("Models/vectorizer.joblib")
+# vectorizer_pkl = load_file("Models/vectorizer.pkl")
+# # huggung face code end
+
# Base directories used across the app.
MODEL_DIR = "Models"
DATA_DIR = "housedata" # Assuming your house data is here
UPLOAD_FOLDER = 'static/uploads' # NEW: Folder for temporary user uploads

app = Flask(__name__)
app.config['UPLOAD_FOLDER'] = UPLOAD_FOLDER
CORS(app)  # allow cross-origin requests from the frontend


REPO_ID = "deedrop1140/nero-ml"  # Hugging Face Hub repo used as model fallback
# NOTE(review): duplicate assignment (MODEL_DIR is already "Models" above);
# it is also reassigned later in this file to the chatbot adapter directory,
# which any code reading the global afterwards silently inherits.
MODEL_DIR = "Models"
+
def load_file(filename, model_dir="Models"):
    """Load a model/artifact by filename, preferring a local copy.

    Looks for ``filename`` under ``model_dir`` first; if it is absent, the
    file is downloaded from the Hugging Face Hub repo ``REPO_ID`` (the
    deployed/Render case). The object is deserialized according to the file
    extension; unknown extensions return the plain file path.

    BUG FIX: the original read the module-global ``MODEL_DIR``, which is
    reassigned later in this file to the chatbot adapter directory
    ("qwen-ml-tutor-best"), silently breaking every subsequent model load.
    The directory is now a keyword argument defaulting to "Models", which is
    backward-compatible for all existing one-argument callers.
    """
    local_path = os.path.join(model_dir, filename)

    # 1) Prefer the local copy when it exists.
    if os.path.exists(local_path):
        file_path = local_path
    else:
        # 2) Fall back to downloading from the Hugging Face Hub.
        file_path = hf_hub_download(repo_id=REPO_ID, filename=filename)

    # 3) Deserialize based on the file extension.
    if filename.endswith((".pkl", ".joblib")):
        return joblib.load(file_path)
    elif filename.endswith(".npy"):
        return np.load(file_path, allow_pickle=True)
    elif filename.endswith((".pt", ".pth")):
        return torch.load(file_path, map_location="cpu")
    else:
        # Unknown extension: hand back the path and let the caller decide.
        return file_path
+
+
+#flasklogin
+
+
+# app.config["JWT_SECRET_KEY"] = "jwt-secret-key"
+# jwt = JWTManager(app)
+
+
+
+#authstart
+# app.config["SECRET_KEY"] = "super-secret"
+# app.config["SQLALCHEMY_DATABASE_URI"] = "sqlite:///users.db"
+
+# Mail
+# app.config["MAIL_SERVER"] = "smtp.gmail.com"
+# app.config["MAIL_PORT"] = 587
+# app.config["MAIL_USE_TLS"] = True
+# app.config["MAIL_USERNAME"] = "your_email@gmail.com"
+# app.config["MAIL_PASSWORD"] = "app_password"
+
+# mail = Mail(app)
+
+# login_manager = LoginManager(app)
+# login_manager.login_view = "auth.login"
+# db.init_app(app)
+# app.register_blueprint(auth)
+# jwt = JWTManager(app)
+# mail = Mail(app)
+
+# @login_manager.user_loader
+# def load_user(user_id):
+# return User.query.get(int(user_id))
+
+# with app.app_context():
+# db.create_all()
+#authend
+
+
+#chatbotcode
+# deedrop1140/qwen-ml-tutor-assets
+from transformers import (
+ AutoTokenizer,
+ AutoModelForCausalLM,
+ StoppingCriteria,
+ StoppingCriteriaList
+)
+from peft import PeftModel
+from huggingface_hub import hf_hub_download
+import zipfile
+from transformers import TextIteratorStreamer
+import threading
+from flask import Response
+
+
+# ======================
+# CONFIG
+# ======================
BASE_MODEL = "Qwen/Qwen2.5-1.5B"  # base LLM the LoRA adapter was trained on
DATASET_REPO = "deedrop1140/qwen-ml-tutor-assets"  # HF dataset repo holding the adapter zip
ZIP_NAME = "qwen-ml-tutor-best-20251213T015537Z-1-001.zip"
# NOTE(review): this clobbers the MODEL_DIR = "Models" defined earlier in the
# file; any later code reading the global MODEL_DIR gets this adapter folder.
MODEL_DIR = "qwen-ml-tutor-best"

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+
+# ======================
+# FLASK APP
+# ======================
+app = Flask(__name__)
+
+# ======================
+# DOWNLOAD MODEL ASSETS
+# ======================
+if not os.path.exists(MODEL_DIR):
+ print("⬇️ Downloading LoRA adapter...")
+ zip_path = hf_hub_download(
+ repo_id=DATASET_REPO,
+ filename=ZIP_NAME,
+ repo_type="dataset"
+ )
+ print("📦 Extracting adapter...")
+ with zipfile.ZipFile(zip_path, "r") as z:
+ z.extractall(".")
+ print("✅ Adapter ready")
+
+# ======================
+# TOKENIZER (BASE MODEL)
+# ======================
+# ======================
+# LOAD TOKENIZER (FROM LORA MODEL)
+# ======================
+tokenizer = AutoTokenizer.from_pretrained(
+ MODEL_DIR,
+ trust_remote_code=True
+)
+
+if tokenizer.pad_token_id is None:
+ tokenizer.pad_token = tokenizer.eos_token
+
+# ======================
+# LOAD BASE MODEL
+# ======================
+base_model = AutoModelForCausalLM.from_pretrained(
+ BASE_MODEL,
+ torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+ trust_remote_code=True
+)
+
+# 🔥 THIS LINE IS THE FIX (DO NOT SKIP)
+base_model.resize_token_embeddings(len(tokenizer))
+
+# MOVE MODEL TO DEVICE
+device = "cuda" if torch.cuda.is_available() else "cpu"
+base_model = base_model.to(device)
+
+# ======================
+# LOAD LORA ADAPTER
+# ======================
+llm_model = PeftModel.from_pretrained(
+ base_model,
+ MODEL_DIR,
+ is_trainable=False
+)
+
+llm_model.eval()
+
+print("✅ Model loaded successfully")
+
+# ======================
+# STOPPING CRITERIA
+# ======================
class StopOnStrings(StoppingCriteria):
    """Stop generation as soon as the output ends with any stop string."""

    def __init__(self, tokenizer, stop_strings):
        self.tokenizer = tokenizer
        # Pre-encode every stop string once (no special tokens added).
        self.stop_ids = []
        for text in stop_strings:
            self.stop_ids.append(tokenizer.encode(text, add_special_tokens=False))

    def __call__(self, input_ids, scores, **kwargs):
        sequence = input_ids[0]
        for candidate in self.stop_ids:
            width = len(candidate)
            if len(sequence) >= width and sequence[-width:].tolist() == candidate:
                return True
        return False
+
# Halt generation whenever the model starts emitting a new dialogue turn.
stop_criteria = StoppingCriteriaList([
    StopOnStrings(
        tokenizer,
        stop_strings=["User:", "Instruction:", "Question:"]
    )
])
+
+# =============================
+# ROUTES
+# =============================
+@app.route("/chatbot")
+def chatbot():
+ return render_template("chatbot.html", active_page="chatbot")
+
@app.route("/chat", methods=["POST"])
def chat():
    """Answer an ML question via the LoRA-tuned LLM, streamed as SSE.

    Expects JSON ``{"message": "..."}``; returns a ``text/event-stream``
    response where each generated token arrives as a ``data:`` event,
    terminated by a ``data: [DONE]`` sentinel.
    """
    data = request.json
    user_msg = data.get("message", "").strip()

    if not user_msg:
        return jsonify({"reply": "Please ask a machine learning question."})

    prompt = f"""Instruction: Answer the following question clearly.
Do NOT ask follow-up questions.
Do NOT continue the conversation.
Question: {user_msg}
Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(DEVICE)

    # Streamer yields decoded tokens as they are generated.
    streamer = TextIteratorStreamer(
        tokenizer,
        skip_prompt=True,
        skip_special_tokens=True
    )

    generation_kwargs = dict(
        **inputs,
        max_new_tokens=200,
        temperature=0.3,
        top_p=0.9,
        do_sample=True,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.eos_token_id,
        stopping_criteria=stop_criteria,
        streamer=streamer
    )

    # Run generation in background thread so we can stream tokens while the
    # model is still producing them.
    thread = threading.Thread(
        target=llm_model.generate,
        kwargs=generation_kwargs
    )
    thread.start()

    def event_stream():
        # NOTE(review): tokens containing newlines would break SSE framing
        # ("data:" lines) — confirm the frontend tolerates this.
        for token in streamer:
            yield f"data: {token}\n\n"

        yield "data: [DONE]\n\n"

    return Response(
        event_stream(),
        mimetype="text/event-stream"
    )
+
+
+
+#chatbotcode
+
genai.configure(api_key=os.getenv("GEMINI_API_KEY"))

def ask_gemini(statement):
    """Send *statement* to Gemini for fact-checking and return the reply text."""
    gemini = genai.GenerativeModel("gemini-2.0-flash-001")
    result = gemini.generate_content(f"Verify this statement for truth: {statement}")
    return result.text
+
+#rfc
+# model = load("Models/liar_model.joblib")
+# vectorizer = load("Models/liar_vectorizer.joblib")
+
+# Load BERT fact-checker pipeline (local model)
+# bert_checker = pipeline("text-classification", model="microsoft/deberta-v3-small")
+
+#endrfc
+
#svm

# ==== SVM Setup ====
# Toy 2-feature binary dataset for the SVM visualisation demo; the fixed
# random_state keeps the decision boundary stable between restarts.
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, n_classes=2, random_state=42)
scaler = StandardScaler()
X = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train SVM
svm_model = SVC(kernel="linear")
svm_model.fit(X_train, y_train)

#endsvm
# Decision-tree page uses the Gemini REST endpoint directly.
GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
GEMINI_URL = "https://generativelanguage.googleapis.com/v1beta/models/gemini-2.0-flash:generateContent"
#end decision tree

# Ensure directories exist
# NOTE(review): MODEL_DIR was reassigned above to the chatbot adapter folder,
# so this creates that folder rather than "Models" — confirm intent.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(DATA_DIR, exist_ok=True)
os.makedirs(UPLOAD_FOLDER, exist_ok=True) # NEW: Create upload folder
+
def clean_text(text):
    """Lowercase *text* and strip URLs, punctuation, digits and extra spaces.

    Returns "" for null/NaN input. NOTE(review): this function is redefined
    twice later in the module; the last definition wins at runtime.
    """
    if pd.isnull(text):
        return ""
    cleaned = text.lower()
    cleaned = re.sub(r"http\S+|www\S+|https\S+", '', cleaned)
    cleaned = cleaned.translate(str.maketrans('', '', string.punctuation))
    cleaned = re.sub(r'\d+', '', cleaned)
    return re.sub(r'\s+', ' ', cleaned).strip()
+
+# --- Helper functions for data generation (conceptual for demo) ---
def generate_linear_data(n_samples=100, noise=0.5):
    """Return (X, y) sampled from y = 2x + 5 plus Gaussian noise.

    X is a sorted (n_samples, 1) array of values in [0, 10).
    """
    features = np.sort(np.random.rand(n_samples) * 10).reshape(-1, 1)
    targets = 2 * features.squeeze() + 5 + noise * np.random.randn(n_samples)
    return features, targets
+
def generate_non_linear_data(n_samples=100, noise=0.5):
    """Return (X, y) sampled from y = 10*sin(x) plus Gaussian noise.

    X is a sorted (n_samples, 1) array of values in [0, 10).
    """
    features = np.sort(np.random.rand(n_samples) * 10).reshape(-1, 1)
    targets = np.sin(features.squeeze()) * 10 + noise * np.random.randn(n_samples)
    return features, targets
+
def generate_noisy_data(n_samples=100, noise_factor=3.0):
    """Return (X, y) from y = 2x + 5 with deliberately strong Gaussian noise.

    Same model as generate_linear_data but with a larger noise amplitude.
    """
    features = np.sort(np.random.rand(n_samples) * 10).reshape(-1, 1)
    targets = 2 * features.squeeze() + 5 + noise_factor * np.random.randn(n_samples)
    return features, targets
+
+# Function to generate house price data (using your existing data structure for consistency)
def get_house_data():
    """Load (X, y) for the house-price demo from ``DATA_DIR/train.csv``.

    X holds five numeric feature columns; y is the SalePrice column.
    Returns (None, None) when the file or any required column is missing.
    """
    feature_cols = ['GrLivArea', 'OverallQual', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
    try:
        df = pd.read_csv(os.path.join(DATA_DIR, 'train.csv'))
        # All feature columns plus the target must be present.
        if not all(col in df.columns for col in feature_cols + ['SalePrice']):
            print("Warning: Missing one or more required columns in train.csv for house data.")
            return None, None
        return df[feature_cols], df['SalePrice']
    except FileNotFoundError:
        print(f"Error: train.csv not found in {DATA_DIR}. Please ensure your data is there.")
        return None, None
    except Exception as e:
        print(f"Error loading house data: {e}")
        return None, None
+
# Dictionary to hold all loaded models
# (registry keyed by model name, e.g. loaded_models['supervised'])
loaded_models = {}
+
+# Load logistic model and vectorizer for SMS
+# vectorizer = joblib.load("Models/logvectorizer.pkl")
+# model = joblib.load("Models/logistic_model.pkl")
+# vectorizer = load_file("Models/logvectorizer.pkl")
+# model = load_file("Models/logistic_model.pkl")
+
+
+# # Load models once NB+DT+SVM is trained
+# try:
+# model = load_file("Models/logistic_model.pkl")
+# # vectorizer = joblib.load("Models/logvectorizer.pkl")
+# # model = joblib.load("Models/logistic_model.pkl")
+# vectorizer = load_file("Models/vectorizer.pkl")
+# print("✅ Model and vectorizer loaded into memory successfully!")
+# except Exception as e:
+# vectorizer = None
+# model = None
+# print(f"❌ Error: Could not load model or vectorizer. Please check your file paths. Error: {e}")
+# #END NB+DT+SVM
+
+# === Naive Bayes URL Spam Classifier (NB_spam.html) ===
+# === Load Model & Vectorizer ===
+
+
+
+# VT_API_KEY = os.getenv("VT_API_KEY")
+# nb_model = load_file("Models/nb_url_model.pkl")
+# vectorizer = load_file("Models/nb_url_vectorizer.pkl")
+
+# if nb_model is not None and vectorizer is not None:
+# print("✅ Loaded model and vectorizer.")
+# else:
+# print("❌ Model or vectorizer not found.")
+
+
+
+
+
+
def load_all_models():
    """Load every model the app needs into the ``loaded_models`` registry.

    Currently loads only the supervised (linear) model; on any failure the
    registry entry is set to ``None`` so routes can report a clean error.

    BUG FIX: in the original, the ``try`` block was dedented to module level,
    so the function body was effectively empty and the loading code ran at
    import time instead. The ``except FileNotFoundError`` branch also
    referenced ``supervised_model_path`` which could be unbound when
    ``load_file`` itself raised. Both issues are fixed here.
    """
    global loaded_models

    try:
        model_obj = load_file("linear_model.pkl")

        # load_file may return either a ready model object or a filesystem
        # path (for unrecognized extensions) — handle both.
        if isinstance(model_obj, str):
            loaded_models['supervised'] = joblib.load(model_obj)
        else:
            loaded_models['supervised'] = model_obj

        print("Supervised model loaded successfully")

    except FileNotFoundError:
        print("Error: Supervised model file 'linear_model.pkl' not found. "
              "Please run train_model.py first.")
        loaded_models['supervised'] = None  # Mark as not loaded
    except Exception as e:
        print(f"Error loading supervised model: {e}")
        loaded_models['supervised'] = None
+
+
# Load models when Flask app context is ready (runs once, at import time).
with app.app_context():
    load_all_models()
+
# --- Static page routes: each simply renders its template ---
@app.route('/')
def frontpage():
    return render_template('frontpage.html')
@app.route('/home')
def home():
    return render_template('home.html')

@app.route('/Optimization')
def Optimization():
    return render_template('Optimization.html', active_page='Optimization')

@app.route('/supervise')
def supervise():
    return render_template('supervise.html', active_page='supervise')


@app.route('/unsupervised')
def unsupervised():
    return render_template('unsupervised.html', active_page='unsupervised')

# Semi-Supervised Learning page
@app.route('/semi-supervised')
def semi_supervised():
    return render_template('semi_supervised.html', active_page='semi_supervised')

# Reinforcement Learning page
@app.route('/reinforcement')
def reinforcement():
    return render_template('reinforcement.html', active_page='reinforcement')

# Ensemble Learning page
@app.route('/ensemble')
def ensemble():
    return render_template('ensemble.html', active_page='ensemble')
+
+
@app.route('/supervised', methods=['GET', 'POST'])
def supervised():
    """Predict an exam score from hours studied with the linear model.

    GET renders the form; POST reads ``hours`` from the form, runs the
    supervised model, and re-renders the page with the rounded prediction
    (or an error string on bad input).
    """
    prediction = None
    hours_studied_input = None

    # BUG FIX: use .get() — if model loading failed before the key was ever
    # written, plain indexing raised KeyError instead of returning the
    # intended 500 response.
    if loaded_models.get('supervised') is None:
        return "Error: Supervised model could not be loaded. Please check server logs.", 500

    if request.method == 'POST':
        try:
            hours_studied_input = float(request.form['hours'])
            input_data = np.array([[hours_studied_input]])

            predicted_score = loaded_models['supervised'].predict(input_data)[0]
            prediction = round(predicted_score, 2)

        except ValueError:
            # Non-numeric form input.
            print("Invalid input for hours studied.")
            prediction = "Error: Please enter a valid number."
        except Exception as e:
            print(f"An error occurred during prediction: {e}")
            prediction = "Error during prediction."

    return render_template('supervised.html', prediction=prediction, hours_studied_input=hours_studied_input)
+
+
@app.route('/polynomial', methods=['GET', 'POST'])
def polynomial():
    """Polynomial-regression demo: predict a score from hours studied."""
    if request.method != 'POST':
        return render_template("poly.html")

    try:
        hours = float(request.form['hours'])

        # Load the fitted polynomial model and its feature transformer.
        model = load_file("poly_model.pkl")
        poly = load_file("poly_transform.pkl")

        expanded = poly.transform([[hours]])
        predicted = model.predict(expanded)[0]

        return render_template("poly.html", prediction=round(predicted, 2), hours=hours)

    except Exception as e:
        print(f"Error: {e}")
        return render_template("poly.html", error="Something went wrong.")
+
+
@app.route('/random_forest', methods=['GET', 'POST'])
def random_forest():
    """Random-forest demo: predict a score from hours studied."""
    if request.method != 'POST':
        return render_template("rf.html")

    try:
        hours = float(request.form['hours'])
        forest = load_file("rf_model.pkl")
        predicted = forest.predict([[hours]])[0]
        return render_template("rf.html", prediction=round(predicted, 2), hours=hours)
    except Exception as e:
        print(f"[ERROR] {e}")
        return render_template("rf.html", error="Prediction failed. Check your input.")
+
# Static explainer page for the prediction workflow.
@app.route('/prediction_flow')
def prediction_flow():
    return render_template('prediction_flow.html')
+
@app.route("/lasso", methods=["GET", "POST"])
def lasso():
    """Lasso house-price demo: predict SalePrice from five form fields."""
    if request.method == "POST":
        try:
            feature_names = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
            values = [float(request.form.get(name)) for name in feature_names]

            # Load the fitted model and the scaler used at training time.
            model = load_file("lasso_model.pkl")
            scaler = load_file("lasso_scaler.pkl")

            predicted = model.predict(scaler.transform([values]))[0]
            return render_template("lasso.html", prediction=round(predicted, 2))

        except Exception as e:
            return render_template("lasso.html", error=str(e))

    return render_template("lasso.html")
+
+
@app.route('/ridge', methods=['GET', 'POST'])
def ridge():
    """Ridge house-price demo: load model+scaler, predict SalePrice on POST."""
    prediction = None
    error = None

    try:
        model = load_file("ridge_model.pkl")
        scaler = load_file("ridge_scaler.pkl")


    except Exception as e:
        # Model files unavailable — fail the whole request.
        return f"❌ Error loading Ridge model: {e}", 500

    if request.method == 'POST':
        try:
            # Same five features the model was trained on, in training order.
            features = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'YearBuilt']
            input_data = [float(request.form[feature]) for feature in features]
            input_scaled = scaler.transform([input_data])
            prediction = model.predict(input_scaled)[0]
        except Exception as e:
            error = str(e)

    return render_template('ridge.html', prediction=prediction, error=error)
+
@app.route('/dtr', methods=['GET', 'POST'])
def dtr():
    """Decision-tree regression playground: GET renders, POST echoes points."""
    if request.method == 'GET':
        return render_template('dtr.html')

    if request.method == 'POST':
        data = request.get_json()
        # Frontend posts {'dataPoints': [...]}; echo it back as confirmation.
        data_points = data.get('dataPoints') if data else None
        print("Received data:", data_points)
        return jsonify({'message': 'Data received successfully!', 'receivedData': data_points})


@app.route('/dtrg')
def drg():
    """Render the decision-tree game page."""
    return render_template('desiciongame.html')
+
# --- SVR Routes ---
@app.route('/svr') # This route is for the initial GET request to load the page
def svr_page():
    """Render the interactive SVR demo page."""
    return render_template('svr.html')
+
+# @app.route('/decision-tree')
+# def decision_tree():
+# return render_template('decision-Tree.html')
+
+# @app.route('/decision-tree-game')
+# def decision_tree_game():
+# return render_template('Decision-Tree-Game.html')
+
+
@app.route('/run_svr_demo', methods=['POST'])
def run_svr_demo():
    """Train an SVR on the chosen dataset and return metrics + Plotly spec.

    Accepts either JSON (predefined datasets) or multipart FormData (custom
    CSV upload). Kernel and C/gamma/epsilon hyperparameters come from the
    request; the JSON response carries MSE, R², the support-vector count,
    and a full Plotly figure description for the frontend to render.
    """
    try:
        # Check if the request contains JSON (for predefined datasets) or FormData (for file uploads)
        if request.is_json:
            data = request.json
        else:
            # For FormData, data is accessed via request.form for fields, request.files for files
            data = request.form

        dataset_type = data.get('dataset_type', 'linear')
        kernel_type = data.get('kernel', 'rbf')
        C_param = float(data.get('C', 1.0))
        gamma_param = float(data.get('gamma', 0.1))
        epsilon_param = float(data.get('epsilon', 0.1))

        X, y = None, None

        # Select/generate the training data.
        if dataset_type == 'linear':
            X, y = generate_linear_data()
        elif dataset_type == 'non_linear':
            X, y = generate_non_linear_data()
        elif dataset_type == 'noisy':
            X, y = generate_noisy_data()
        elif dataset_type == 'house_data':
            X_house, y_house = get_house_data()
            if X_house is not None and not X_house.empty:
                X = X_house[['GrLivArea']].values # Only GrLivArea for simple 1D plotting
                y = y_house.values
            else:
                X, y = generate_linear_data() # Fallback if house data is missing/invalid
        elif dataset_type == 'custom_csv': # NEW: Handle custom CSV upload
            uploaded_file = request.files.get('file')
            x_column_name = data.get('x_column_name')
            y_column_name = data.get('y_column_name')

            if not uploaded_file or uploaded_file.filename == '':
                return jsonify({'error': 'No file uploaded for custom CSV.'}), 400
            if not x_column_name or not y_column_name:
                return jsonify({'error': 'X and Y column names are required for custom CSV.'}), 400

            try:
                # Read CSV into a pandas DataFrame from in-memory BytesIO object
                df = pd.read_csv(io.BytesIO(uploaded_file.read()))

                if x_column_name not in df.columns or y_column_name not in df.columns:
                    missing_cols = []
                    if x_column_name not in df.columns: missing_cols.append(x_column_name)
                    if y_column_name not in df.columns: missing_cols.append(y_column_name)
                    return jsonify({'error': f"Missing columns in uploaded CSV: {', '.join(missing_cols)}"}), 400

                X = df[[x_column_name]].values # Ensure X is 2D for scikit-learn
                y = df[y_column_name].values
            except Exception as e:
                return jsonify({'error': f"Error reading or processing custom CSV: {str(e)}"}), 400
        else: # Fallback for unknown dataset types
            X, y = generate_linear_data()


        if X is None or y is None or len(X) == 0:
            return jsonify({'error': 'Failed to generate or load dataset.'}), 500

        # Scale data — SVR is sensitive to both feature and target scale.
        scaler_X = StandardScaler()
        scaler_y = StandardScaler()

        X_scaled = scaler_X.fit_transform(X)
        y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).flatten()

        X_train, X_test, y_train, y_test = train_test_split(X_scaled, y_scaled, test_size=0.2, random_state=42)

        # Train SVR model
        svr_model = SVR(kernel=kernel_type, C=C_param, gamma=gamma_param, epsilon=epsilon_param)
        svr_model.fit(X_train, y_train)

        # Make predictions
        y_pred_scaled = svr_model.predict(X_test)

        # Inverse transform predictions to original scale for metrics
        y_pred = scaler_y.inverse_transform(y_pred_scaled.reshape(-1, 1)).flatten()
        y_test_original = scaler_y.inverse_transform(y_test.reshape(-1, 1)).flatten()

        # Calculate metrics
        mse = mean_squared_error(y_test_original, y_pred)
        r2 = r2_score(y_test_original, y_pred)
        support_vectors_count = len(svr_model.support_vectors_)

        # Prepare data for plotting
        plot_X_original = scaler_X.inverse_transform(X_scaled)
        plot_y_original = scaler_y.inverse_transform(y_scaled.reshape(-1, 1)).flatten()

        # Dense grid across the feature range for a smooth prediction curve.
        x_plot = np.linspace(plot_X_original.min(), plot_X_original.max(), 500).reshape(-1, 1)
        x_plot_scaled = scaler_X.transform(x_plot)
        y_plot_scaled = svr_model.predict(x_plot_scaled)
        y_plot_original = scaler_y.inverse_transform(y_plot_scaled.reshape(-1, 1)).flatten()

        # Epsilon tube is constant-width in *scaled* space, then mapped back.
        y_upper_scaled = y_plot_scaled + epsilon_param
        y_lower_scaled = y_plot_scaled - epsilon_param
        y_upper_original = scaler_y.inverse_transform(y_upper_scaled.reshape(-1, 1)).flatten()
        y_lower_original = scaler_y.inverse_transform(y_lower_scaled.reshape(-1, 1)).flatten()

        plot_data = {
            'data': [
                {
                    'x': plot_X_original.flatten().tolist(),
                    'y': plot_y_original.tolist(),
                    'mode': 'markers',
                    'type': 'scatter',
                    'name': 'Original Data'
                },
                {
                    'x': x_plot.flatten().tolist(),
                    'y': y_plot_original.tolist(),
                    'mode': 'lines',
                    'type': 'scatter',
                    'name': 'SVR Prediction',
                    'line': {'color': 'red'}
                },
                {
                    'x': x_plot.flatten().tolist(),
                    'y': y_upper_original.tolist(),
                    'mode': 'lines',
                    'type': 'scatter',
                    'name': 'Epsilon Tube (Upper)',
                    'line': {'dash': 'dash', 'color': 'green'},
                    'fill': 'tonexty',
                    'fillcolor': 'rgba(0,128,0,0.1)'
                },
                {
                    'x': x_plot.flatten().tolist(),
                    'y': y_lower_original.tolist(),
                    'mode': 'lines',
                    'type': 'scatter',
                    'name': 'Epsilon Tube (Lower)',
                    'line': {'dash': 'dash', 'color': 'green'}
                }
            ],
            'layout': {
                'title': f'SVR Regression (Kernel: {kernel_type.upper()})',
                'xaxis': {'title': 'Feature Value'},
                'yaxis': {'title': 'Target Value'},
                'hovermode': 'closest'
            }
        }

        return jsonify({
            'mse': mse,
            'r2_score': r2,
            'support_vectors_count': support_vectors_count,
            'plot_data': plot_data
        })

    except Exception as e:
        print(f"Error in SVR demo: {e}")
        return jsonify({'error': str(e)}), 500
+
+
def clean_text(text):
    """Normalize text to lowercase with surrounding whitespace removed.

    NOTE(review): this redefines (and shadows) the richer URL/punctuation
    cleaner defined earlier in the module — confirm which one callers expect.
    """
    lowered = text.lower()
    return lowered.strip()
+
# --- Supervised-algorithm demo pages: each route renders a static template ---
# Gradient-descent route
@app.route('/gradient-descent')
def gradient_descent():
    return render_template('Gradient-Descen.html')
#new

@app.route('/gradient-descent-three')
def gradient_descent_three():
    return render_template('gradient-descent-three.html')


# Gradient-boosting route
@app.route('/gradient-boosting')
def gradient_boosting():
    return render_template('Gradient-Boosting.html')
#new
@app.route('/gradient-boosting-three')
def gradient_boosting_three():
    return render_template('gradient-boosting-three.html')



# XGBoost routes
@app.route('/xgboost-regression')
def xgboost_regression():
    return render_template('XGBoost-Regression.html')

@app.route('/xgboost-tree-three')
def xgboost_regression_three():
    return render_template('xboost-tree-three.html')

@app.route('/xgboost-graph-three2')
def xgboost_regression_three2():
    return render_template('xbost-graph-three.html')



# LightGBM route
@app.route('/lightgbm')
def lightgbm():
    return render_template('LightGBM-Regression.html')


@app.route('/Naive-Bayes-Simulator')
def Naive_Bayes_Simulator():
    return render_template('Naive-Bayes-Simulator.html')

@app.route('/svm-model-three')
def svm_model_three():
    return render_template('SVM_Simulator_3D.html')



# Neural-network routes for classification
@app.route('/neural-network-classification')
def neural_network_classification():
    return render_template('Neural-Networks-for-Classification.html')

@app.route('/Neural-Networks-for-Classification-three')
def Neural_Networks_for_Classification_three():
    return render_template('Neural-Networks-for-Classification-three.html')
+
+
+
# --- Unsupervised-learning demo pages: each route renders a static template ---
# Hierarchical clustering routes
@app.route('/hierarchical-clustering')
def hierarchical_clustering():
    return render_template('Hierarchical-Clustering.html')

@app.route('/hierarchical-three')
def hierarchical_three():
    return render_template('Hierarchical-three.html')


# Gaussian-mixture-models routes
@app.route('/gaussian-mixture-models')
def gaussian_mixture_models():
    return render_template('Gaussian-Mixture-Models.html')

@app.route('/gaussian-mixture-three')
def gaussian_mixture_three():
    return render_template('gmm-threejs.html')




# Principal-Component-Analysis
@app.route('/pca')
def pca():
    return render_template('Principal-Component-Analysis.html')

@app.route('/pca-three')
def pca_three():
    return render_template('pca-threejs.html')



# t-SNE
@app.route('/t-sne')
def tsne():
    return render_template('t-SNE.html')

@app.route('/t-sne-three')
def tsne_three():
    return render_template('t-sne-three.html')


# Linear-discriminant-analysis
@app.route('/lda')
def lda():
    return render_template('Linear-Discriminant-Analysis.html')


@app.route('/lda-three')
def lda_three():
    return render_template('lda-three.html')


# Independent-Component-Analysis
@app.route('/ica')
def ica():
    return render_template('Independent-Component-Analysis.html')



@app.route('/ica-three')
def ica_three():
    return render_template('ica-threejs.html')


# Apriori
@app.route('/apriori')
def apriori():
    return render_template('Apriori-Algorithm.html')

@app.route('/apriori-three')
def apriori_three():
    return render_template('Apriori-Simulator-three.html')


# Eclat Algorithm
@app.route('/eclat')
def eclat():
    return render_template('Eclat-Algorithm.html')

@app.route('/eclat-three')
def eclat_three():
    return render_template('Eclat-Algorithm-three.html')
+
+#genrative models
+@app.route('/generative-models')
+def generative_models():
+ return render_template('Generative-Models.html')
+
+#self training
+@app.route('/self-training')
+def self_training():
+ return render_template('Self-Training.html')
+
+
@app.route('/transductive-svm')
def transductive_svm():
    """Render the transductive SVM page."""
    page = 'Transductive-SVM.html'
    return render_template(page)
+
+
@app.route('/graph-based-methods')
def graph_based_methods():
    """Render the graph-based semi-supervised methods page."""
    page = 'Graph-Based-Method.html'
    return render_template(page)
+
@app.route('/agent-environment-state')
def agent_environment_state():
    """Render the RL agent/environment/state page."""
    page = 'Agent-Environment-State.html'
    return render_template(page)
+
@app.route('/action-and-policy')
def action_and_policy():
    """Render the RL action-and-policy page."""
    page = 'Action-and-Policy.html'
    return render_template(page)
+
@app.route('/reward-valuefunction')
def reward_valuefunction():
    """Render the RL reward/value-function page."""
    page = 'Reward-ValueFunction.html'
    return render_template(page)
+
@app.route('/q-learning')
def q_learning():
    """Render the Q-learning page."""
    page = 'Q-Learning.html'
    return render_template(page)
+
@app.route('/deep-reinforcement-learning')
def deep_reinforcement_learning():
    """Render the deep reinforcement learning page."""
    page = 'Deep-Reinforcement-Learning.html'
    return render_template(page)
+
+
@app.route('/bagging')
def bagging():
    """Render the bagging (ensemble) page."""
    page = 'Bagging.html'
    return render_template(page)
+
@app.route('/boosting')
def boosting():
    """Render the boosting (ensemble) page."""
    page = 'Boosting.html'
    return render_template(page)
+
@app.route('/stacking')
def stacking():
    """Render the stacking (ensemble) page."""
    page = 'Stacking.html'
    return render_template(page)
+
@app.route('/voting')
def voting():
    """Render the voting (ensemble) page."""
    page = 'Voting.html'
    return render_template(page)
+
+import re
+
+# Load saved model and vectorizer
+# model = joblib.load("Models/logistic_model.pkl")
+# vectorizer = joblib.load("Models/logvectorizer.pkl")
+
+
# Text cleaning
def clean_text(text):
    """Normalize a message for the spam classifier.

    Lowercases, replaces non-word characters with spaces, drops single
    letters that stand alone between whitespace, and collapses whitespace.
    """
    lowered = text.lower()
    no_punct = re.sub(r'\W', ' ', lowered)
    no_singles = re.sub(r'\s+[a-zA-Z]\s+', ' ', no_punct)
    return re.sub(r'\s+', ' ', no_singles).strip()
+
@app.route('/logistic', methods=['GET', 'POST'])
def logistic():
    """Spam/ham demo backed by a logistic-regression text classifier.

    GET renders the empty form; POST cleans and vectorizes the submitted
    message and reports the spam probability.  Model artifacts are loaded
    per request via load_file (defined elsewhere in this module).
    """
    prediction, confidence_percentage, cleaned, tokens, probability = None, None, None, None, None

    model = load_file("logistic_model.pkl")
    vectorizer = load_file("logvectorizer.pkl")

    if request.method == "POST":
        msg = request.form.get('message', '')
        cleaned = clean_text(msg)
        tokens = cleaned.split()

        try:
            vector = vectorizer.transform([cleaned])
            probability = model.predict_proba(vector)[0][1]
            prediction = "Spam" if probability >= 0.5 else "Not Spam"
            confidence_percentage = round(probability * 100, 2)
        except Exception as e:
            print("Error predicting:", e)
            prediction = "Error"
            confidence_percentage = 0

    return render_template(
        "logistic.html",
        prediction=prediction,
        confidence_percentage=confidence_percentage,
        cleaned=cleaned,
        tokens=tokens,
        # FIX: `if probability` treated a legitimate 0.0 probability as
        # "missing"; compare against None explicitly instead.
        probability=round(probability, 4) if probability is not None else None,
        source="sms"
    )
+
@app.route('/logistic-sms', methods=['POST'])
def logistic_sms():
    """JSON twin of /logistic: classify one SMS message as spam/ham.

    FIX: previously read module-level `model`/`vectorizer`, whose module-level
    loading is commented out in this file; load the same artifacts the
    /logistic route uses so this endpoint works on its own.
    """
    try:
        data = request.get_json()
        msg = data.get('message', '')
        cleaned = clean_text(msg)
        tokens = cleaned.split()

        # Same artifacts as the /logistic route (kept consistent).
        model = load_file("logistic_model.pkl")
        vectorizer = load_file("logvectorizer.pkl")

        vector = vectorizer.transform([cleaned])
        probability = model.predict_proba(vector)[0][1]
        prediction = "Spam" if probability >= 0.5 else "Not Spam"
        confidence_percentage = round(probability * 100, 2)

        return jsonify({
            "prediction": prediction,
            "confidence": confidence_percentage,
            "probability": round(probability, 4),
            "cleaned": cleaned,
            "tokens": tokens,
            "source": "json"
        })

    except Exception as e:
        print("Error in /logistic-sms:", e)
        return jsonify({"error": "Internal server error", "details": str(e)}), 500
+
+
+
+# @app.route("/logistic", methods=["GET", "POST"])
+# def logistic():
+# prediction = None
+# error = None
+# if request.method == "POST":
+# try:
+# input_text = request.form.get("message")
+
+# # Load the vectorizer and logistic model from Models folder
+# vectorizer = joblib.load("Models/vectorizer.pkl")
+# model = joblib.load("Models/logistic_model.pkl")
+
+# # Transform input and make prediction
+# input_vector = vectorizer.transform([input_text])
+# result = model.predict(input_vector)[0]
+
+# prediction = "✅ Not Spam" if result == 0 else "🚨 Spam"
+# except Exception as e:
+# error = str(e)
+
+# return render_template("logistic.html", prediction=prediction, error=error)
+
+
+
+
+
+
+ #---------- LOAD MODEL & LABELS ONCE (startup) ----------
+# MODEL_PATH = os.path.join("Models", "knnmodel.joblib") # adjust if your filename is different
+# LABELS_PATH = os.path.join("Models", "label_classes.npy")
+
+# try:
+# model = joblib.load(MODEL_PATH)
+# except Exception as e:
+# # Keep model as None so routes can return clear error if it's missing
+# current_app.logger if hasattr(current_app, "logger") else print
+# print(f"Failed to load model from {MODEL_PATH}: {e}")
+# model = None
+
+# try:
+# label_classes = np.load(LABELS_PATH, allow_pickle=True)
+# except Exception as e:
+# print(f"Failed to load label_classes from {LABELS_PATH}: {e}")
+# label_classes = None
+
# Hugging Face dataset repo hosting the pre-trained model artifacts
# downloaded by load_knn_assets() below.
HF_DATASET_REPO = "deedrop1140/qwen-ml-tutor-assets"
+
+
def load_knn_assets():
    """Download the KNN model and its label classes from the HF dataset repo.

    Returns (model, label_classes), or (None, None) when any step fails —
    callers must handle the None pair.
    """
    try:
        def _fetch(fname):
            # One cached download per artifact from the shared dataset repo.
            return hf_hub_download(repo_id=HF_DATASET_REPO,
                                   filename=fname,
                                   repo_type="dataset")

        knn_model = joblib.load(_fetch("knnmodel.joblib"))
        classes = np.load(_fetch("label_classes.npy"), allow_pickle=True)
        return knn_model, classes

    except Exception as e:
        print("❌ Failed to load KNN assets from Hugging Face:", e)
        return None, None
+
+
# ---------- KNN VISUAL ROUTES ----------
@app.route("/knn")
def knn_visual():
    """Render the interactive KNN visualization page."""
    page = "knn.html"
    return render_template(page)
+
@app.route('/knn_visual_predict', methods=['POST'])
def knn_visual_predict():
    """Fit a fresh KNN on the submitted points, classify the test point,
    and return the coordinates of its k nearest neighbours."""
    payload = request.get_json()
    pts = np.array(payload['points'])        # rows look like (x, y, label) — TODO confirm with frontend
    query = np.array(payload['test_point'])
    k = int(payload['k'])

    coords = pts[:, :2]
    classes = pts[:, 2].astype(int)

    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(coords, classes)
    predicted = clf.predict([query])[0]

    # Euclidean distances to every training point; keep the k closest.
    distances = np.linalg.norm(coords - query, axis=1)
    nearest = coords[np.argsort(distances)[:k]]

    return jsonify({
        'prediction': int(predicted),
        'neighbors': nearest.tolist()
    })
+
# ---------- IMAGE PREDICTION ROUTE ----------
@app.route("/knn_image")
def knn_image_page():
    """Render the KNN image-classification upload page."""
    page = "knn_image.html"
    return render_template(page)
+
@app.route("/predict_image", methods=["POST"])
def predict_image():
    """Classify an uploaded image with the KNN model from Hugging Face.

    FIX: the module-level model/label loading is commented out and
    load_knn_assets() was never called anywhere, so `model` and
    `label_classes` were unbound here; fetch them explicitly per request
    (hf_hub_download caches, so repeat calls are cheap).
    """
    model, label_classes = load_knn_assets()
    if model is None or label_classes is None:
        return jsonify({"error": "Model not loaded"}), 500

    if "image" not in request.files:
        return jsonify({"error": "No image uploaded"}), 400

    file = request.files["image"]

    try:
        # Grayscale 28x28, flattened — the layout the KNN model was trained on
        # (presumably MNIST-style; confirm against the training pipeline).
        image = Image.open(file.stream).convert("L")
        image = image.resize((28, 28))
        img_array = np.array(image).reshape(1, -1).astype("float32")
    except Exception as e:
        return jsonify({"error": f"Invalid image. {str(e)}"}), 400

    probs = model.predict_proba(img_array)[0]
    pred_index = np.argmax(probs)
    pred_label = label_classes[pred_index]
    confidence = round(float(probs[pred_index]) * 100, 2)

    return jsonify({
        "prediction": str(pred_label),
        "confidence": f"{confidence}%",
        "all_probabilities": {
            str(label_classes[i]): round(float(probs[i]) * 100, 2)
            for i in range(len(probs))
        }
    })
+
+
@app.route("/rfc")
def random_forest_page():
    """Render the Random Forest classifier visualization page."""
    page = "Random_Forest_Classifier.html"
    return render_template(page)
+
@app.route('/rf_visual_predict', methods=['POST'])
def rf_visual_predict():
    """Train a small random forest on user-supplied points and return the
    test-point prediction plus a 100x100 decision-boundary grid."""
    try:
        payload = request.get_json()
        print("📦 Incoming JSON data:", payload)

        point_rows = payload.get('points')
        query_point = payload.get('test_point')
        if not point_rows or not query_point:
            return jsonify({"error": "Missing points or test_point"}), 400

        frame = pd.DataFrame(point_rows, columns=['X1', 'X2', 'Class'])
        features = frame[['X1', 'X2']]
        classes = frame['Class']

        forest = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)
        forest.fit(features, classes)

        query = np.array(query_point).reshape(1, -1)
        predicted_class = int(forest.predict(query)[0])

        # Plot window: the data extent padded by one unit on every side.
        x_lo = features['X1'].min() - 1
        x_hi = features['X1'].max() + 1
        y_lo = features['X2'].min() - 1
        y_hi = features['X2'].max() + 1
        grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                     np.linspace(y_lo, y_hi, 100))

        grid_z = forest.predict(np.c_[grid_x.ravel(), grid_y.ravel()]).reshape(grid_x.shape)

        return jsonify({
            'prediction': predicted_class,
            'decision_boundary_z': grid_z.tolist(),
            'decision_boundary_x_coords': grid_x[0, :].tolist(),
            'decision_boundary_y_coords': grid_y[:, 0].tolist()
        })

    except Exception as e:
        import traceback
        print("❌ Exception in /rf_visual_predict:")
        traceback.print_exc()
        return jsonify({"error": str(e)}), 500
+
@app.route("/liar")
def liar_input_page():
    """Render the LIAR statement-input form page."""
    page = "rfc_liar_predict.html"
    return render_template(page)
+
+
+
+
+
+
+
@app.route("/ref/liar/predictor", methods=["POST"])
def liar_predictor():
    """Rate a statement's truthfulness (LIAR-style 0-5 scale) and attach a
    BERT-based scientific plausibility check.

    NOTE(review): reads module-level `model`, `vectorizer`, `bert_checker`
    and `ask_gemini`, none of which are defined in this section of the file;
    `vectorizer` is also rebound to the NB URL vectorizer further down at
    import time — confirm the intended artifacts are in scope at runtime.
    """
    try:
        data = request.get_json()
        statement = data.get("statement", "")

        if not statement:
            return jsonify({"success": False, "error": "Missing statement"}), 400

        try:
            # 🔍 LIAR Model Prediction from the vectorized statement.
            features = vectorizer.transform([statement])
            prediction = model.predict(features)[0]

            # Map the model's integer class to a human-readable verdict.
            liar_label_map = {
                0: "It can be false 🔥",
                1: "False ❌",
                2: "Mostly false but can be true 🤏",
                3: "Half True 🌓",
                4: "Mostly True 👍",
                5: "True ✅"
            }

            prediction_label = liar_label_map.get(int(prediction), "Unknown")

        except ValueError as ve:
            # A feature-dimension mismatch (message mentions "features") means
            # the wrong vectorizer was used; fall back to the Gemini API.
            # Any other ValueError propagates to the outer handler.
            if "features" in str(ve):
                # Fallback to Gemini API
                prediction_label = ask_gemini(statement)
            else:
                raise ve

        # 🧠 BERT-Based Scientific Check: classify the statement's plausibility.
        bert_result = bert_checker(statement)[0]
        bert_label = bert_result["label"]
        bert_score = round(bert_result["score"] * 100, 2)

        # Raw classifier labels mapped to display strings; unknown labels
        # fall through unchanged.
        science_label_map = {
            "LABEL_0": "✅ Scientifically Possible",
            "LABEL_1": "❌ Scientifically Impossible"
        }

        scientific_check = f"{science_label_map.get(bert_label, bert_label)} ({bert_score:.2f}%)"

        return jsonify({
            "success": True,
            "prediction": prediction_label,
            "reason": "Predicted from linguistic and content-based patterns, or Gemini fallback.",
            "scientific_check": scientific_check
        })

    except Exception as e:
        traceback.print_exc()
        return jsonify({"success": False, "error": str(e)}), 500
+
+
+
#svm
@app.route("/svm")
def svm_page():
    """Render the SVM visualization page."""
    page = "svm.html"
    return render_template(page)
+
@app.route('/svm_visual_predict', methods=['POST'])
def svm_visual_predict():
    """Fit an SVC (linear or RBF) on user points, classify the test point,
    and return the decision-boundary grid plus the support vectors."""
    payload = request.json
    point_rows = payload['points']
    query_point = payload['test_point']
    kernel_choice = payload['svm_type']
    c_value = float(payload['c_param'])
    gamma_value = float(payload['gamma_param'])  # ignored by the linear kernel

    frame = pd.DataFrame(point_rows, columns=['X1', 'X2', 'Class'])
    features = frame[['X1', 'X2']]
    classes = frame['Class']

    # Build the classifier for the requested kernel.
    if kernel_choice == 'linear':
        clf = svm.SVC(kernel='linear', C=c_value, random_state=42)
    elif kernel_choice == 'rbf':
        clf = svm.SVC(kernel='rbf', C=c_value, gamma=gamma_value, random_state=42)
    else:
        return jsonify({'error': 'Invalid SVM type'}), 400

    clf.fit(features, classes)

    query = np.array(query_point).reshape(1, -1)
    predicted_class = int(clf.predict(query)[0])

    # Actual support vectors (not just their indices) for the frontend.
    support_vectors = clf.support_vectors_.tolist()

    # Plot window: data extent padded by 1, widened further so an outlying
    # test point still falls inside the boundary grid.
    x_lo = min(features['X1'].min() - 1, query[0, 0] - 1)
    x_hi = max(features['X1'].max() + 1, query[0, 0] + 1)
    y_lo = min(features['X2'].min() - 1, query[0, 1] - 1)
    y_hi = max(features['X2'].max() + 1, query[0, 1] + 1)

    grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                 np.linspace(y_lo, y_hi, 100))

    # Class prediction for each meshgrid point, reshaped back to the grid.
    grid_z = clf.predict(np.c_[grid_x.ravel(), grid_y.ravel()]).reshape(grid_x.shape)

    return jsonify({
        'prediction': predicted_class,
        'decision_boundary_z': grid_z.tolist(),
        'decision_boundary_x_coords': grid_x[0, :].tolist(),
        'decision_boundary_y_coords': grid_y[:, 0].tolist(),
        'support_vectors': support_vectors
    })
+
+
+
+
+
+
+
@app.route('/api/explain', methods=['POST'])
def explain():
    """Proxy the posted payload to the Gemini generateContent endpoint.

    An empty GEMINI_API_KEY is tolerated only in development (FLASK_ENV),
    where the key is expected to be injected by the host environment.
    """
    if not GEMINI_API_KEY and not os.getenv("FLASK_ENV") == "development":
        return jsonify({'error': 'Missing API key'}), 500

    payload = request.get_json()

    try:
        response = requests.post(
            f"{GEMINI_URL}?key={GEMINI_API_KEY}",
            headers={"Content-Type": "application/json"},
            json=payload,
            # FIX: requests has no default timeout; without one a stalled
            # upstream call hangs this worker indefinitely.
            timeout=30,
        )
        response.raise_for_status()  # surface 4xx/5xx as RequestException
        return jsonify(response.json())
    except requests.exceptions.RequestException as e:
        app.logger.error(f"Error calling Gemini API: {e}")
        return jsonify({'error': str(e)}), 500
+
@app.route('/decision_tree')
def decision_tree_page():
    """Render the decision-tree visualization page."""
    page = 'decision_tree.html'
    return render_template(page)
+
+
@app.route('/game')
def decision_tree_game():
    """Render the interactive decision-tree game page."""
    page = 'decision_tree_game.html'
    return render_template(page)
+
@app.route('/dt_visual_predict', methods=['POST'])
def dt_visual_predict():
    """Train a depth-limited decision tree on user points and return the
    test-point prediction plus a 100x100 decision-boundary grid."""
    try:
        payload = request.json
        point_rows = payload['points']
        query_point = payload['test_point']
        depth_limit = int(payload['max_depth'])

        frame = pd.DataFrame(point_rows, columns=['X1', 'X2', 'Class'])
        features = frame[['X1', 'X2']]
        classes = frame['Class']

        # Refuse to train on fewer than two samples.
        if features.empty or len(features) < 2:
            return jsonify({'error': 'Not enough data points to train the model.'}), 400

        tree = DecisionTreeClassifier(max_depth=depth_limit, random_state=42)
        tree.fit(features, classes)

        query = np.array(query_point).reshape(1, -1)
        predicted_class = int(tree.predict(query)[0])

        # 10% padding around the data extent (fixed 1.0 when the range is
        # degenerate), then widen so the query point sits well inside.
        x_lo, x_hi = features['X1'].min(), features['X1'].max()
        y_lo, y_hi = features['X2'].min(), features['X2'].max()
        pad_x = 1.0 if (x_hi - x_lo) == 0 else (x_hi - x_lo) * 0.1
        pad_y = 1.0 if (y_hi - y_lo) == 0 else (y_hi - y_lo) * 0.1
        x_lo, x_hi = x_lo - pad_x, x_hi + pad_x
        y_lo, y_hi = y_lo - pad_y, y_hi + pad_y
        x_lo = min(x_lo, query[0, 0] - 0.5)
        x_hi = max(x_hi, query[0, 0] + 0.5)
        y_lo = min(y_lo, query[0, 1] - 0.5)
        y_hi = max(y_hi, query[0, 1] + 0.5)

        grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                     np.linspace(y_lo, y_hi, 100))

        grid_z = tree.predict(np.c_[grid_x.ravel(), grid_y.ravel()]).reshape(grid_x.shape)

        return jsonify({
            'prediction': predicted_class,
            'decision_boundary_z': grid_z.tolist(),
            'decision_boundary_x_coords': grid_x[0, :].tolist(),
            'decision_boundary_y_coords': grid_y[:, 0].tolist()
        })
    except Exception as e:
        print(f"An error occurred in /dt_visual_predict: {e}")
        return jsonify({'error': f'Backend Error: {str(e)}. Check server console for details.'}), 500
+
+ # --- Naive Bayes Routes ---
+
+from urllib.parse import urlparse
+from sklearn.naive_bayes import GaussianNB
+from nltk.corpus import words
+
# Pre-load the Naive Bayes URL-classifier artifacts at import time.
# NOTE(review): this rebinds the module-level name `vectorizer`, which the
# /ref/liar/predictor route also reads — after this line the global holds the
# NB URL vectorizer, not the LIAR one. Confirm this is intended.
nb_model = load_file("nb_url_model.pkl")
vectorizer = load_file("nb_url_vectorizer.pkl")
+
+# if nb_model is not None and vectorizer is not None:
+# print("✅ Loaded Naive Bayes URL model")
+# else:
+# nb_model, vectorizer = None, None
+# print("❌ vectorizer not found")
+
+
+
@app.route('/nb_spam')
def nb_spam_page():
    """Render the Naive Bayes spam-detection page."""
    page = 'NB_spam.html'
    return render_template(page)
+
+
+import re
+from urllib.parse import urlparse
+from spellchecker import SpellChecker
+import wordninja
+
+
+
# ---- Whitelist (your full one, unchanged) ----
# Known-good brand/domain words: URLs containing these are treated as
# trusted by the spell-check based detector below.
# NOTE(review): 'phonepe' and 'paytm' appear twice (Indian services and
# Financial sections) — harmless in a set, but could be deduplicated.
whitelist = set([
    # Search Engines
    'google', 'bing', 'yahoo', 'duckduckgo', 'baidu', 'ask',

    # Social Media
    'facebook', 'instagram', 'twitter', 'linkedin', 'snapchat', 'tiktok',
    'threads', 'pinterest', 'reddit', 'quora',

    # Communication Tools
    'whatsapp', 'telegram', 'skype', 'zoom', 'meet', 'discord',
    'teams', 'signal', 'messenger',

    # Global E-commerce
    'amazon', 'ebay', 'shopify', 'alibaba', 'walmart', 'target',
    'etsy', 'shein', 'bestbuy', 'costco', 'newegg',

    # Indian E-commerce / Services
    'flipkart', 'myntra', 'ajio', 'nykaa', 'meesho', 'snapdeal',
    'paytm', 'phonepe', 'mobikwik', 'zomato', 'swiggy', 'ola', 'uber', 'bookmyshow',
    'ixigo', 'makemytrip', 'yatra', 'redbus', 'bigbasket', 'grofers', 'blinkit',
    'universalcollegeofengineering',

    # Education / Productivity
    'youtube', 'docs', 'drive', 'calendar', 'photos', 'gmail', 'notion',
    'edx', 'coursera', 'udemy', 'khanacademy', 'byjus', 'unacademy',

    # News / Media / Tech
    'bbc', 'cnn', 'nyt', 'forbes', 'bloomberg', 'reuters',
    'ndtv', 'indiatimes', 'thehindu', 'hindustantimes', 'indiatoday',
    'techcrunch', 'verge', 'wired',

    # Streaming / Entertainment
    'netflix', 'hotstar', 'primevideo', 'spotify', 'gaana', 'wynk', 'saavn', 'voot',

    # Dev & Tools
    'github', 'stackoverflow', 'medium', 'gitlab', 'bitbucket',
    'adobe', 'figma', 'canva',

    # Financial / Banking
    'hdfcbank', 'icicibank', 'sbi', 'axisbank', 'kotak', 'boi', 'upi',
    'visa', 'mastercard', 'paypal', 'stripe', 'razorpay', 'phonepe', 'paytm',

    # Government / Utilities
    'gov', 'nic', 'irctc', 'uidai', 'mygov', 'incometax', 'aadhar', 'rbi',

    # Others Common
    'airtel', 'jio', 'bsnl', 'vi', 'speedtest', 'cricbuzz', 'espn', 'espncricinfo',
    'wikipedia', 'mozilla', 'opera', 'chrome', 'android', 'apple', 'windows', 'microsoft'
])
+
+ # ... your full whitelist from before ...
+
+
# ---- Trusted & Bad TLDs ----
# TLD suffixes treated as reputable by the URL heuristics.
trusted_tlds = [
    '.gov', '.nic.in', '.edu', '.ac.in', '.mil', '.org', '.int',
    '.co.in', '.gov.in', '.res.in', '.net.in', '.nic.gov.in'
]

# Expanded Bad TLDs (Rule 4) — suffixes commonly abused for spam/phishing.
bad_tlds = [
    '.xyz', '.tk', '.ml', '.ga', '.cf', '.top', '.gq', '.cn',
    '.ru', '.pw', '.bid', '.link', '.loan', '.party', '.science',
    '.stream', '.webcam', '.online', '.site', '.website', '.space',
    '.club', '.buzz', '.info'
]

# Suspicious extensions (Rule 13) — file types often used to deliver payloads.
suspicious_extensions = ['.exe', '.zip', '.rar', '.js', '.php', '.asp', '.aspx', '.jsp', '.sh']

# Phishing keywords (Rule 11, your full list) — bait words checked in URLs.
phishing_keywords = [
    'login', 'verify', 'secure', 'account', 'update', 'confirm', 'authenticate',
    'free', 'bonus', 'offer', 'prize', 'winner', 'gift', 'coupon', 'discount',
    'bank', 'paypal', 'creditcard', 'mastercard', 'visa', 'amex', 'westernunion',
    'signin', 'click', 'password', 'unlock', 'recover', 'validate', 'urgency',
    'limitedtime', 'expires', 'suspicious', 'alert', 'important', 'actionrequired'
]

# ---- Rules 5–14 ----
# Regex rules keyed by rule number:
#  5: raw IPv4 address as the host        6: embedded email-like token
#  7: classic bait phrases                8: link to .ru/.cn/.tk hosts
#  9: very short host and path           10: long digit runs
# 12: '@' inside the URL (userinfo trick) 13: double slash in the path
# 14: query string with 5+ chained parameters
rules = {
    5: r"https?://\d{1,3}(\.\d{1,3}){3}",
    6: r"@[A-Za-z0-9.-]+\.[A-Za-z]{2,}",
    7: r"(free money|win now|click here)",
    8: r"https?://[^\s]*\.(ru|cn|tk)",
    9: r"https?://.{0,6}\..{2,6}/.{0,6}",
    10: r"[0-9]{10,}",
    12: r"https?://[^\s]*@[^\s]+",
    13: r"https?://[^\s]*//[^\s]+",
    14: r"https?://[^\s]*\?(?:[^=]+=[^&]*&){5,}",
}
+
+
# ---- Gibberish Check Helper (Rule 15) ----
def is_gibberish_word(word):
    """Heuristic: a word is 'gibberish' when under 25% of its letters are vowels.

    FIX: guard the empty string, which previously raised ZeroDivisionError.
    """
    if not word:
        return False
    vowels = "aeiou"
    v_count = sum(c in vowels for c in word)
    return v_count / len(word) < 0.25
+
+# # ---- Utility: Extract words from URL ----
+# def extract_words(url):
+# parsed = urlparse(url if url.startswith(("http://", "https://")) else "http://" + url)
+# raw = parsed.netloc.replace('-', '') + parsed.path.replace('-', '')
+# # Split using wordninja
+# words = wordninja.split(raw.lower())
+# # Keep only alphabetic words of length >= 3
+# words = [w for w in words if w.isalpha() and len(w) >= 3]
+# return words
# ---- Extract words from URL ----
def extract_words(url):
    """Split a URL's host+path into lowercase alphabetic words.

    Alphabetic segments of 3+ characters are further split with wordninja
    (e.g. 'paypalsecure' -> ['paypal', 'secure']); when wordninja cannot
    split a segment, the lowercased segment itself is kept.
    """
    normalized = url if url.startswith(("http://", "https://")) else "http://" + url
    parsed = urlparse(normalized)
    segments = re.split(r'\W+', parsed.netloc + parsed.path)

    collected = []
    for segment in segments:
        if len(segment) > 2 and segment.isalpha():
            pieces = wordninja.split(segment.lower())
            collected.extend(pieces if len(pieces) > 1 else [segment.lower()])
    return collected
+
+
# --- Your original predict function, now inside the Flask app ---
@app.route("/predict", methods=["POST"])
def predict():
    """Spell-check the words embedded in a URL and flag it when any are unknown.

    Returns {"prediction": 1} (suspicious) with the misspelled words when the
    spell checker finds unknown words, otherwise {"prediction": 0}; `steps`
    reports per-word validity for the frontend walkthrough.
    """
    try:
        data = request.get_json()
        url = data.get("url", "").lower()
        if not url:
            return jsonify({'error': 'No URL provided'}), 400

        # SpellChecker with its built-in dictionary; edit distance 1 keeps
        # lookups fast for a per-request instance.
        spell = SpellChecker(distance=1)

        words = extract_words(url)

        # Ignore known TLD fragments when spell-checking.
        # FIX: str.replace takes an int count as its third argument — the old
        # call tld.replace('.', '', "/") raised TypeError on every request.
        tlds_to_ignore = [tld.replace('.', '') for tld in trusted_tlds + bad_tlds]
        words_for_spellcheck = [w for w in words if w not in tlds_to_ignore]

        misspelled = spell.unknown(words_for_spellcheck)
        steps = [{"word": w, "valid": (w not in misspelled) or (w in tlds_to_ignore)} for w in words]

        if misspelled:
            return jsonify({
                "prediction": 1,
                "reason": f"🧾 Spelling errors: {', '.join(misspelled)}",
                "steps": steps
            })
        else:
            return jsonify({
                "prediction": 0,
                "reason": "✅ No spelling issues",
                "steps": steps
            })

    except Exception as e:
        return jsonify({'error': f"An issue occurred during spell checking: {str(e)}"}), 500
+
+
+
+
@app.route('/naive_bayes')
def naive_bayes_page():
    """Render the Naive Bayes visualization page."""
    page = 'naive_bayes_viz.html'
    return render_template(page)
+
# --- New Naive Bayes Prediction Route ---
@app.route('/nb_visual_predict', methods=['POST'])
def nb_visual_predict():
    """Fit Gaussian Naive Bayes on user points and return the test-point
    prediction plus a 100x100 decision-boundary grid."""
    try:
        payload = request.json
        point_rows = payload['points']
        query_point = payload['test_point']

        frame = pd.DataFrame(point_rows, columns=['X1', 'X2', 'Class'])
        features = frame[['X1', 'X2']]
        classes = frame['Class']

        # Guards: at least two samples and at least two distinct classes.
        if features.empty or len(features) < 2:
            return jsonify({'error': 'Not enough data points to train the model.'}), 400
        if len(classes.unique()) < 2:
            return jsonify({'error': 'Need at least two different classes to classify.'}), 400

        # GaussianNB suits continuous 2-D coordinates.
        classifier = GaussianNB()
        classifier.fit(features, classes)

        query = np.array(query_point).reshape(1, -1)
        predicted_class = int(classifier.predict(query)[0])

        # 10% padding around the data (fixed 1.0 for a degenerate range),
        # then widen so the query point sits comfortably inside.
        x_lo, x_hi = features['X1'].min(), features['X1'].max()
        y_lo, y_hi = features['X2'].min(), features['X2'].max()
        pad_x = 1.0 if x_hi - x_lo == 0 else (x_hi - x_lo) * 0.1
        pad_y = 1.0 if y_hi - y_lo == 0 else (y_hi - y_lo) * 0.1
        x_lo, x_hi = x_lo - pad_x, x_hi + pad_x
        y_lo, y_hi = y_lo - pad_y, y_hi + pad_y
        x_lo = min(x_lo, query[0, 0] - 0.5)
        x_hi = max(x_hi, query[0, 0] + 0.5)
        y_lo = min(y_lo, query[0, 1] - 0.5)
        y_hi = max(y_hi, query[0, 1] + 0.5)

        grid_x, grid_y = np.meshgrid(np.linspace(x_lo, x_hi, 100),
                                     np.linspace(y_lo, y_hi, 100))

        if grid_x.size == 0 or grid_y.size == 0:
            return jsonify({'error': 'Meshgrid could not be created. Data range too narrow.'}), 400

        grid_z = classifier.predict(np.c_[grid_x.ravel(), grid_y.ravel()]).reshape(grid_x.shape)

        return jsonify({
            'prediction': predicted_class,
            'decision_boundary_z': grid_z.tolist(),
            'decision_boundary_x_coords': grid_x[0, :].tolist(),
            'decision_boundary_y_coords': grid_y[:, 0].tolist()
        })
    except Exception as e:
        print(f"An error occurred in /nb_visual_predict: {e}")
        return jsonify({'error': f'Backend Error: {str(e)}. Check server console for details.'}), 500
+
def check_with_virustotal(url):
    """Best-effort VirusTotal lookup for a URL.

    Returns (True, reason) when any engine flags the URL as malicious,
    otherwise (False, None). Errors are swallowed (logged) so a VT outage
    never breaks the caller — this is deliberately best-effort.
    """
    try:
        headers = {"x-apikey": VT_API_KEY}
        submit_url = "https://www.virustotal.com/api/v3/urls"

        # Submit the URL for scanning.
        # FIX: both calls now carry timeouts; requests otherwise waits forever
        # on a stalled connection and ties up the worker.
        response = requests.post(submit_url, headers=headers, data={"url": url}, timeout=30)
        url_id = response.json()["data"]["id"]

        # Fetch the analysis result for the submitted URL.
        result = requests.get(f"{submit_url}/{url_id}", headers=headers, timeout=30)
        data = result.json()

        stats = data["data"]["attributes"]["last_analysis_stats"]
        malicious_count = stats.get("malicious", 0)

        if malicious_count > 0:
            return True, f"☣️ VirusTotal flagged it as malicious ({malicious_count} engines)"
        return False, None
    except Exception as e:
        print(f"⚠️ VirusTotal error: {e}")
        return False, None
+
+
+
+
+
+
+
+
+
+
@app.route('/kmeans-clustering')
def clustering():
    """Render the K-Means clustering page."""
    page = 'clustering.html'
    return render_template(page)
+
#image code
@app.route('/kmeans-Dbscan-image', methods=['GET', 'POST'])
def compress_and_clean():
    """Compress an uploaded image with K-Means colour quantization and,
    optionally, denoise the quantized pixels with DBSCAN.

    Form fields: mode ('compress' = quantize only; anything else also runs
    DBSCAN cleanup), k (palette size), eps / min_samples (DBSCAN params),
    image (the upload). The result is saved to UPLOAD_FOLDER and the
    template shows it via `final_image`.
    """
    final_image = None

    if request.method == 'POST':
        try:
            # Get form values (permissive defaults).
            mode = request.form.get('mode', 'compress')
            k = int(request.form.get('k', 8))
            eps = float(request.form.get('eps', 0.6))
            min_samples = int(request.form.get('min_samples', 50))
            image_file = request.files.get('image')

            if image_file and image_file.filename != '':
                # Load and downscale to bound memory/CPU cost.
                img = Image.open(image_file).convert('RGB')
                max_size = (518, 518)
                img.thumbnail(max_size, Image.Resampling.LANCZOS)

                img_np = np.array(img)
                h, w, d = img_np.shape
                pixels = img_np.reshape(-1, d)

                # K-Means quantization: snap every pixel to its cluster centre.
                kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
                kmeans.fit(pixels)
                clustered_pixels = kmeans.cluster_centers_[kmeans.labels_].astype(np.uint8)

                # Mode 1: Just Compress
                if mode == 'compress':
                    final_pixels = clustered_pixels.reshape(h, w, d)

                # Mode 2: Compress + Clean (KMeans + DBSCAN)
                else:
                    # Subsample to avoid MemoryError in DBSCAN's pairwise work.
                    max_dbscan_pixels = 10000
                    if len(clustered_pixels) > max_dbscan_pixels:
                        idx = np.random.choice(len(clustered_pixels), max_dbscan_pixels, replace=False)
                        dbscan_input = clustered_pixels[idx]
                    else:
                        dbscan_input = clustered_pixels

                    # NOTE(review): duplicate assignment kept from the original;
                    # the cap above already applied it.
                    max_dbscan_pixels = 10000

                    scaler = StandardScaler()
                    pixels_scaled = scaler.fit_transform(dbscan_input)
                    db = DBSCAN(eps=eps, min_samples=min_samples)
                    labels = db.fit_predict(pixels_scaled)

                    # DBSCAN label -1 marks noise: paint those pixels black.
                    clean_pixels = []
                    for i in range(len(dbscan_input)):
                        label = labels[i]
                        clean_pixels.append([0, 0, 0] if label == -1 else dbscan_input[i])

                    # Pad with black so the array reshapes back to (h, w, d)
                    # when sampling dropped pixels.
                    # NOTE(review): the sampled pixels lose their original
                    # positions, so the cleaned geometry is approximate.
                    if len(clustered_pixels) > max_dbscan_pixels:
                        clean_pixels.extend([[0, 0, 0]] * (len(clustered_pixels) - len(clean_pixels)))

                    final_pixels = np.array(clean_pixels, dtype=np.uint8).reshape(h, w, d)

                # Save the final image for the template to display.
                final_img = Image.fromarray(final_pixels)
                final_image = 'compressed_clean.jpg'
                final_img.save(os.path.join(app.config['UPLOAD_FOLDER'], final_image), optimize=True, quality=90)

        except Exception as e:
            return f"⚠️ Error: {str(e)}", 500

    return render_template('kmean-dbscan-image.html', final_image=final_image)
+
# FIX: the view function was named `DBSCAN`, which rebinds the module-level
# name at import time and shadows sklearn's DBSCAN class used inside
# compress_and_clean (the `db = DBSCAN(eps=..., min_samples=...)` call would
# invoke this route function and crash at request time). The function is
# renamed; `endpoint='DBSCAN'` preserves existing url_for('DBSCAN') usage.
@app.route('/DBscan', endpoint='DBSCAN')
def dbscan_page():
    """Render the DBSCAN theory page."""
    return render_template('DBSCAN.html')
+
+
+#test routs start here
+
+
@app.route('/Test-layout')
def test():
    """Render the quiz/test layout page."""
    page = 'Test-layout.html'
    return render_template(page)
+
@app.route('/Test-home')
def Test_home():
    """Render the test-section home page."""
    return render_template('Test-home.html', active_page='Test-home')
+
@app.route('/Test-supervise')
def Test_supervise():
    """Render the supervised-learning test page."""
    return render_template('Test/Test-supervise.html', active_page='Test-supervise')
+
+
@app.route('/Test-unsupervised')
def Test_unsupervised():
    """Render the unsupervised-learning test page."""
    return render_template('Test/Test-unsupervised.html', active_page='Test-unsupervised')
+
# Semi-Supervised Learning page
@app.route('/Test-semi-supervised')
def Test_semi_supervised():
    """Render the semi-supervised-learning test page."""
    return render_template('Test/Test-semi_supervised.html', active_page='Test-semi_supervised')
+
# Reinforcement Learning page
@app.route('/Test-reinforcement')
def Test_reinforcement():
    """Render the reinforcement-learning test page."""
    return render_template('Test/Test-reinforcement.html', active_page='Test-reinforcement')
+
# Ensemble Learning page
@app.route('/Test-ensemble')
def Test_ensemble():
    """Render the ensemble-learning test page."""
    return render_template('Test/Test-ensemble.html', active_page='Test-ensemble')
+
#Templates/Test/Quiz-Overview-Page.html
@app.route('/linear-Quiz-Overview-Page')
def linear_Test_quiz_overview():
    """Render the linear-regression quiz overview page."""
    return render_template('Test/linear-Quiz-Overview-Page.html', active_page='linear-Quiz-Overview-Page')
+
+
@app.route('/Quiz-test')
def Quiz_test():
    """Render the generic quiz-test page."""
    return render_template('Test/Quiz-test.html', active_page='Quiz-test')
+#if the dtat file doesnt show or dsiapay use render_data like this render_template('data/yourfile.json')
+
+# @app.route('/Quiz-test/')
+# def quiz_topic(topic):
+# import json, os
+# count = int(request.args.get('count', 10))
+# try:
+# json_path = os.path.join(app.root_path, 'data', f'{topic}.json')
+# with open(json_path, 'r', encoding='utf-8') as f:
+# data = json.load(f) # This is your JSON array
+
+# # Transform the JSON to match frontend expectations
+# transformed = []
+# for q in data[:count]:
+# transformed.append({
+# "id": q.get("id"),
+# "question": q.get("questionText"),
+# "options": q.get("options"),
+# "answer": q.get("options")[q.get("correctAnswerIndex")],
+# "explanation": q.get("explanation")
+# })
+
+# return jsonify(transformed)
+
+# except FileNotFoundError:
+# return "Topic not found", 404
+# except json.JSONDecodeError:
+# # return "Invalid JSON file", 500
+
+# @app.route('/Quiz-test/')
+# def quiz_topic(topic):
+# import os, json
+# count = int(request.args.get('count', 10))
+# json_path = os.path.join(app.root_path, 'data', f'{topic}.json')
+
+# try:
+# with open(json_path, 'r', encoding='utf-8') as f:
+# data = json.load(f)
+
+# # If JSON is a dict with "questions" key
+# if isinstance(data, dict) and "questions" in data:
+# questions = data["questions"][:count]
+# elif isinstance(data, list):
+# questions = data[:count]
+# else:
+# return "Invalid JSON structure", 400
+
+# return jsonify(questions)
+# except FileNotFoundError:
+# return "Topic not found", 404
+# except json.JSONDecodeError:
+# return "Invalid JSON file", 400
+
# ✅ API Route: Send JSON quiz data
# FIX: the rule was '/api/quiz/' with no <topic> converter, so Flask could
# never supply the `topic` argument and every request failed with a
# TypeError. The converter is added, and `topic` is validated before being
# used in a filesystem path (it comes from the URL, i.e. untrusted input).
@app.route('/api/quiz/<topic>')
def get_quiz(topic):
    """Return up to `count` (query param, default 10) questions for `topic`."""
    count = int(request.args.get('count', 10))

    # Only simple slugs may reach os.path.join — blocks '../' traversal.
    if not re.fullmatch(r'[A-Za-z0-9_-]+', topic):
        return jsonify({'error': 'Topic not found'}), 404

    file_path = os.path.join('data', f'{topic}.json')

    if not os.path.exists(file_path):
        return jsonify({'error': 'Topic not found'}), 404

    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    questions = data.get('questions', [])[:count]
    return jsonify({'questions': questions})
+
+
+@app.route('/polynomial-Quiz')
+def polynomial_Test_quiz():
+ return render_template('Test/polynomial-Quiz.html', active_page='polynomial-Quiz')
+
+# -------------------------------
+# Regression Algorithms
+# -------------------------------
+@app.route('/ridge-regression-test')
+def ridge_regression_test():
+ return render_template('Test/ridge-regression-test.html', active_page='ridge-regression-test')
+
+@app.route('/lasso-regression-test')
+def lasso_regression_test():
+ return render_template('Test/lasso-regression-test.html', active_page='lasso-regression-test')
+
+@app.route('/svr-test')
+def svr_test():
+ return render_template('Test/svr-r-test.html', active_page='svr-r-test')
+
+@app.route('/decision-tree-regression-test')
+def decision_tree_regression_test():
+ return render_template('Test/decision-tree-regression-test.html', active_page='decision-tree-regression-test')
+
+@app.route('/random-forest-regression-test')
+def random_forest_regression_test():
+ return render_template('Test/random-forest-regression-test.html', active_page='random-forest-regression-test')
+
+
+# -------------------------------
+# Classification Algorithms
+# -------------------------------
+@app.route('/logistic-regression-test')
+def logistic_regression_test():
+ return render_template('Test/logistic-regression-test.html', active_page='logistic-regression-test')
+
+@app.route('/svm-c-test')
+def svm_test():
+ return render_template('Test/svm-c-test.html', active_page='svm-c-test')
+
+@app.route('/decision-trees-c-test')
+def decision_trees_test():
+ return render_template('Test/decision-trees-c-test.html', active_page='decision-trees-c-test')
+
+@app.route('/random-forest-c-test')
+def random_forest_test():
+ return render_template('Test/random-forest-c-test.html', active_page='random-forest-c-test')
+
+@app.route('/gradient-descent-test')
+def gradient_descent_test():
+ return render_template('Test/gradient-descent-test.html', active_page='gradient-descent-test')
+
+@app.route('/gradient-boosting-test')
+def gradient_boosting_test():
+ return render_template('Test/gradient-boosting-test.html', active_page='gradient-boosting-test')
+
+@app.route('/xgboost-regression-test')
+def xgboost_regression_test():
+ return render_template('Test/xgboost-regression-test.html', active_page='xgboost-regression-test')
+
+@app.route('/lightgbm-test')
+def lightgbm_test():
+ return render_template('Test/lightgbm-test.html', active_page='lightgbm-test')
+
+@app.route('/knn-test')
+def knn_test():
+ return render_template('Test/knn-test.html', active_page='knn-test')
+
+@app.route('/naive-bayes-test')
+def naive_bayes_test():
+ return render_template('Test/naive-bayes-test.html', active_page='naive-bayes-test')
+
+@app.route('/neural-networks-test')
+def neural_networks_test():
+ return render_template('Test/neural-networks-test.html', active_page='neural-networks-test')
+
+
+# -------------------------------
+# Clustering
+# -------------------------------
+@app.route('/k-means-test')
+def k_means_test():
+ return render_template('Test/k-means-test.html', active_page='k-means-test')
+
+@app.route('/hierarchical-clustering-test')
+def hierarchical_clustering_test():
+ return render_template('Test/hierarchical-clustering-test.html', active_page='hierarchical-clustering-test')
+
+@app.route('/dbscan-test')
+def dbscan_test():
+ return render_template('Test/dbscan-test.html', active_page='dbscan-test')
+
+@app.route('/gmm-test')
+def gmm_test():
+ return render_template('Test/gmm-test.html', active_page='gmm-test')
+
+
+# -------------------------------
+# Dimensionality Reduction
+# -------------------------------
+@app.route('/pca-test')
+def pca_test():
+ return render_template('Test/pca-test.html', active_page='pca-test')
+
+@app.route('/tsne-test')
+def tsne_test():
+ return render_template('Test/tsne-test.html', active_page='tsne-test')
+
+@app.route('/lda-test')
+def lda_test():
+ return render_template('Test/lda-test.html', active_page='lda-test')
+
+@app.route('/ica-test')
+def ica_test():
+ return render_template('Test/ica-test.html', active_page='ica-test')
+
+
+# -------------------------------
+# Association Rule Learning
+# -------------------------------
+@app.route('/apriori-test')
+def apriori_test():
+ return render_template('Test/apriori-test.html', active_page='apriori-test')
+
+@app.route('/eclat-test')
+def eclat_test():
+ return render_template('Test/eclat-test.html', active_page='eclat-test')
+
+
+# -------------------------------
+# Semi-Supervised Learning
+# -------------------------------
+@app.route('/generative-models-test')
+def generative_models_test():
+ return render_template('Test/generative-models-test.html', active_page='generative-models-test')
+
+@app.route('/self-training-test')
+def self_training_test():
+ return render_template('Test/self-training-test.html', active_page='self-training-test')
+
+@app.route('/transductive-svm-test')
+def transductive_svm_test():
+ return render_template('Test/transductive-svm-test.html', active_page='transductive-svm-test')
+
+@app.route('/graph-based-methods-test')
+def graph_based_methods_test():
+ return render_template('Test/graph-based-methods-test.html', active_page='graph-based-methods-test')
+
+
+# -------------------------------
+# Reinforcement Learning
+# -------------------------------
+@app.route('/agent-environment-state-test')
+def agent_environment_state_test():
+ return render_template('Test/agent-environment-state-test.html', active_page='agent-environment-state-test')
+
+@app.route('/action-policy-test')
+def action_policy_test():
+ return render_template('Test/action-policy-test.html', active_page='action-policy-test')
+
+@app.route('/reward-value-function-test')
+def reward_value_function_test():
+ return render_template('Test/reward-value-function-test.html', active_page='reward-value-function-test')
+
+@app.route('/q-learning-test')
+def q_learning_test():
+ return render_template('Test/q-learning-test.html', active_page='q-learning-test')
+
+@app.route('/deep-reinforcement-learning-test')
+def deep_reinforcement_learning_test():
+ return render_template('Test/deep-reinforcement-learning-test.html', active_page='deep-reinforcement-learning-test')
+
+
+# -------------------------------
+# Ensemble Methods
+# -------------------------------
+@app.route('/bagging-test')
+def bagging_test():
+ return render_template('Test/bagging-test.html', active_page='bagging-test')
+
+@app.route('/boosting-test')
+def boosting_test():
+ return render_template('Test/boosting-test.html', active_page='boosting-test')
+
+@app.route('/stacking-test')
+def stacking_test():
+ return render_template('Test/stacking-test.html', active_page='stacking-test')
+
+@app.route('/voting-test')
+def voting_test():
+ return render_template('Test/voting-test.html', active_page='voting-test')
+
+
+
+
+
+# if __name__ == "__main__":
+# app.run(host="0.0.0.0", port=5000)
+
+if __name__ == "__main__":
+ port = int(os.environ.get("PORT", 5000))
+ app.run(host="0.0.0.0", port=port)
+
+
+
+
+
+
+
+
diff --git a/auth/__init__.py b/auth/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/auth/email.py b/auth/email.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf9b79a4ca3a1d9c88b09d28817e7e1df3cf1c34
--- /dev/null
+++ b/auth/email.py
@@ -0,0 +1,12 @@
+from flask_mail import Message
+from flask import current_app
+from .extensions import mail
+
+def send_otp(email, otp):
+ msg = Message(
+ subject="Your OTP Code",
+ sender=current_app.config["MAIL_USERNAME"],
+ recipients=[email]
+ )
+ msg.body = f"Your OTP is {otp}"
+ mail.send(msg)
diff --git a/auth/extensions.py b/auth/extensions.py
new file mode 100644
index 0000000000000000000000000000000000000000..61d8382bc438fe2a07b05f82c7c81167dc2be1a0
--- /dev/null
+++ b/auth/extensions.py
@@ -0,0 +1,3 @@
+from flask_mail import Mail
+
+mail = Mail()
diff --git a/auth/jwt_utils.py b/auth/jwt_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..629eabf8e59586f2c821cf6d023992ecffe0528b
--- /dev/null
+++ b/auth/jwt_utils.py
@@ -0,0 +1,4 @@
+from flask_jwt_extended import create_access_token
+
+def generate_jwt(user_id):
+ return create_access_token(identity=user_id)
diff --git a/auth/models.py b/auth/models.py
new file mode 100644
index 0000000000000000000000000000000000000000..e5a1109a1947a026e0d6312d1af6b717c99b6f5b
--- /dev/null
+++ b/auth/models.py
@@ -0,0 +1,13 @@
+from flask_sqlalchemy import SQLAlchemy
+from flask_login import UserMixin
+from datetime import datetime
+
+db = SQLAlchemy()
+
+class User(UserMixin, db.Model):
+ id = db.Column(db.Integer, primary_key=True)
+ email = db.Column(db.String(150), unique=True, nullable=False)
+ password = db.Column(db.String(200), nullable=False)
+ is_verified = db.Column(db.Boolean, default=False)
+ otp = db.Column(db.String(6))
+ otp_expiry = db.Column(db.DateTime)
diff --git a/auth/routes.py b/auth/routes.py
new file mode 100644
index 0000000000000000000000000000000000000000..f54f106aee0c8f24c4a1d7ab2d5e8b7a91c8784d
--- /dev/null
+++ b/auth/routes.py
@@ -0,0 +1,91 @@
+from flask import Blueprint, render_template, request, redirect, url_for
+from werkzeug.security import generate_password_hash, check_password_hash
+from flask_login import login_user, logout_user, login_required
+from datetime import datetime, timedelta
+from flask_jwt_extended import create_access_token
+from flask import jsonify
+
+import random
+
+from .models import db, User
+from .email import send_otp
+
+auth = Blueprint("auth", __name__, url_prefix="/auth")
+
+def gen_otp():
+ return str(random.randint(100000, 999999))
+
+
+@auth.route("/register", methods=["GET", "POST"])
+def register():
+ if request.method == "POST":
+ otp = gen_otp()
+ user = User(
+ email=request.form["email"],
+ password=generate_password_hash(request.form["password"]),
+ otp=otp,
+ otp_expiry=datetime.now() + timedelta(minutes=5)
+ )
+ db.session.add(user)
+ db.session.commit()
+ send_otp(user.email, otp)
+ return redirect(url_for("auth.verify", email=user.email))
+ return render_template("auth/register.html")
+
+
+# @auth.route("/verify/", methods=["GET", "POST"])
+# def verify(email):
+# user = User.query.filter_by(email=email).first()
+# if request.method == "POST":
+# if user.otp == request.form["otp"] and user.otp_expiry > datetime.now():
+# user.is_verified = True
+# user.otp = None
+# db.session.commit()
+# return redirect(url_for("auth.login"))
+# return render_template("auth/verify_otp.html")
+
+@auth.route("/verify/", methods=["GET", "POST"])
+def verify(email):
+ user = User.query.filter_by(email=email).first()
+
+ if request.method == "POST":
+ if user.otp != request.form["otp"]:
+ return render_template("auth/verify_otp.html", error="Invalid OTP")
+
+ if user.otp_expiry < datetime.now():
+ return render_template("auth/verify_otp.html", error="OTP expired")
+
+ user.is_verified = True
+ user.otp = None
+ db.session.commit()
+
+ return render_template(
+ "auth/login.html",
+ success="Email verified successfully"
+ )
+
+ return render_template("auth/verify_otp.html")
+
+
+@auth.route("/login", methods=["GET", "POST"])
+def login():
+ if request.method == "POST":
+ user = User.query.filter_by(email=request.form["email"]).first()
+
+ if user and check_password_hash(user.password, request.form["password"]) and user.is_verified:
+ login_user(user)
+
+ token = create_access_token(identity=user.id)
+
+ return jsonify({
+ "message": "login success",
+ "access_token": token
+ })
+
+ return render_template("auth/login.html")
+
+@auth.route("/logout")
+@login_required
+def logout():
+ logout_user()
+ return redirect("/")
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..27ddcdee0eda7e0550964f98ed4ec586a9730308
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,51 @@
+# Core ML / NLP
+scikit-learn>=1.4.2
+scipy>=1.13.1
+numpy>=1.26.4
+pandas>=2.2.2
+joblib>=1.4.2
+nltk>=3.9.1
+textblob>=0.19.0
+tldextract>=5.1.2
+wordninja>=2.0.0
+pyspellchecker>=0.8.3
+
+# Deep learning
+tensorflow>=2.15.0
+torch>=2.2.2
+torchvision>=0.17.2
+torchaudio>=2.2.2
+transformers>=4.41.2
+sentencepiece>=0.2.0
+
+# Visualization
+matplotlib>=3.9.0
+seaborn>=0.13.2
+plotly>=5.20.0
+altair>=5.3.0
+
+# Web framework
+Flask>=3.0.3
+flask-cors>=4.0.0
+Flask-Login
+Flask-Mail
+Flask-SQLAlchemy
flask-jwt-extended
+
+# Utils
+requests>=2.32.3
+python-dotenv>=1.0.1
+tqdm>=4.66.4
+regex>=2024.5.15
+PyYAML>=6.0.1
+
+# Google AI client libs (optional if using)
+google-generativeai>=0.8.3
+google-api-python-client>=2.136.0
+google-auth>=2.29.0
+
+accelerate
peft
gdown
\ No newline at end of file
diff --git a/templates/Action-and-Policy.html b/templates/Action-and-Policy.html
new file mode 100644
index 0000000000000000000000000000000000000000..339c3510f91af625eb9c34b8887af9e07507fa6c
--- /dev/null
+++ b/templates/Action-and-Policy.html
@@ -0,0 +1,296 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: RL Action & Policy
+
+
+
+
+
+
+
+
+
🧠 Study Guide: Action & Policy in Reinforcement Learning
+
+
🔹 1. Introduction
+
+
Story-style intuition: The Video Game Character
+
Think of a character in a video game. At any moment, the character has a set of possible moves they can make—jump, run, duck, attack. These are the character's Actions. The player controlling the character has a strategy in their head: "If a monster is close, I should attack. If there's a pit, I should jump." This strategy, this set of rules that dictates which action to take in any situation, is the Policy. In Reinforcement Learning, our goal is to teach the agent (the character) to learn the best possible policy on its own to win the game (maximize rewards).
+
+
In the world of RL, the Action is the "what" (what the agent does) and the Policy is the "how" (how the agent decides what to do). Together, they form the core of the agent's behavior.
+
+
🔹 2. Action (A)
+
An Action is one of the possible moves an agent can make in a given state. The set of all possible actions in a state is called the action space.
+
Types of Action Spaces:
+
+
Discrete Actions: There is a finite, limited set of distinct actions the agent can choose from.
+
Example: In a maze, the actions are {Up, Down, Left, Right}. In a game of tic-tac-toe, the actions are placing your mark in one of the empty squares.
+
+
Continuous Actions: The actions are described by real-valued numbers within a certain range.
+
Example: For a self-driving car, the action of steering can be any angle between -45.0 and +45.0 degrees. For a thermostat, the action is setting a temperature, which can be any value like 20.5°C.
+
+
+
The set of available actions can depend on the current state, denoted as \( A(s) \).
+
+
🔹 3. Policy (π)
+
A Policy is the agent's strategy or "brain." It is a rule that maps a state to an action. The ultimate goal of RL is to find an optimal policy—a policy that maximizes the total expected reward over time.
+
Mathematically, a policy is a distribution over actions given a state: \( \pi(a|s) = P(A_t = a \mid S_t = s) \)
+
Types of Policies:
+
+
Deterministic Policy: The policy always outputs the same action for a given state. There is no randomness.
+
Story Example: A self-driving car's policy is deterministic: "If the traffic light state is 'Red', the action is always 'Brake'." There is no chance it will do something else.
+
Formula: \( a = \pi(s) \)
+
+
Stochastic Policy: The policy outputs a probability distribution over actions for a given state. The agent then samples from this distribution to choose its next action.
+
Story Example: A poker-playing bot might have a stochastic policy. In a certain state, its policy might be: "70% chance of 'Raising', 30% chance of 'Folding'." This randomness makes the agent's behavior less predictable to opponents and is crucial for exploration.
+
Formula: \( a \sim \pi(\cdot|s) \)
+
+
+
+
🔹 4. Policy vs. Value Function
+
It's crucial to distinguish between a policy and a value function, as they work together to guide the agent.
+
+
+
Policy (The "How-To" Guide): The policy tells you what to do in a state.
+
Example: "You are at a crossroads. The policy says: Turn Left."
+
+
Value Function (The "Evaluation Map"): The value function tells you how good it is to be in a certain state or to take a certain action in a state.
+
Example: "You are at a crossroads. The value function tells you: The path to the left has a high value because it leads to treasure. The path to the right has a low value because it leads to a dragon."
+
+
+
Modern RL algorithms often learn both. They use the value function to evaluate how good their actions are, which in turn helps them improve their policy.
+
+
🔹 5. Interaction Flow with Action & Policy
+
The Action and Policy are at the heart of the agent's decision-making in the RL loop.
+
+
Agent observes state (s): "I am at a crossroad."
+
Agent follows its policy (π) to choose an action (a): "My policy tells me to go left."
+
Environment transitions and gives reward (r): The agent moves left, finds a gold coin (+10 reward), and arrives at a new state.
+
Agent improves its policy: The agent thinks, "That was a great outcome! My policy was right to tell me to go left from that crossroad. I should strengthen that rule."
+
+
+
🔹 6. Detailed Examples
+
+
Example 1: Chess
+
+
Actions: The set of all legal moves for the current player's pieces (e.g., move pawn e2 to e4, move knight g1 to f3). The action space changes with every state.
+
Policy: A very complex strategy. A simple policy might be a set of human-written rules: "If my king is in check, my first priority is to move out of check." An advanced policy (like AlphaGo's) is a deep neural network that takes the board state as input and outputs a probability for every possible move.
+
+
+
+
Example 2: Self-Driving Car
+
+
Actions: A continuous action space, often represented as a vector: `[steering_angle, acceleration, braking]`. For example, `[-5.2, 0.8, 0.0]` means steer 5.2 degrees left, accelerate at 80%, and don't brake.
+
Policy: A highly sophisticated function that takes sensor data (camera, LiDAR) as input and outputs the continuous action vector. A simple part of the policy might be: "If the distance to the car in front is less than 10 meters and decreasing, the braking component of my action vector should be high."
+
+
+
+
🔹 7. Challenges
+
+
Huge Action Spaces:
+
Example: In a real-time strategy game like StarCraft, an action could be commanding any one of hundreds of units to do any one of a dozen things, leading to millions of possible actions at any moment.
+
+
Designing Effective Policies (Exploration): How do you design a policy that not only exploits what it knows but also explores new actions to discover better strategies? This is the exploration-exploitation dilemma.
+
Learning Stable Policies: In complex, dynamic environments, the feedback from actions can be noisy and delayed, making it very difficult for the policy to learn stable and reliable behaviors.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the difference between a discrete and a continuous action space? Give an example of each.
+
What is the difference between a deterministic and a stochastic policy? When might a stochastic policy be useful?
+
Can an agent have a good policy without knowing the value function?
+
+
+
Answers
+
1. A discrete action space has a finite number of distinct options (e.g., move left/right). A continuous action space has actions represented by real numbers in a range (e.g., turning a steering wheel by 15.7 degrees).
+
2. A deterministic policy always chooses the same action for a state. A stochastic policy outputs a probability distribution over actions. A stochastic policy is very useful for exploration (trying new things) and for games where unpredictability is an advantage (like poker).
+
3. Yes, but it's harder. Some algorithms, called "policy-gradient" methods, can directly search for a good policy without learning a value function. However, many of the most successful modern algorithms learn both, using the value function to help guide improvements to the policy.
🤖 Study Guide: Core Concepts of Reinforcement Learning
+
+
🔹 Introduction to RL
+
+
Story-style intuition: Training a Dog
+
Imagine you are training a new puppy. You don't give it a textbook on how to behave. Instead, you use a system of rewards and consequences. When the puppy sits on command, you give it a treat (a positive reward). When it chews on the furniture, you say "No!" (a negative reward). Through a process of trial-and-error, the puppy gradually learns a set of behaviors (a "policy") that maximizes the number of treats it receives over its lifetime. This is the essence of Reinforcement Learning (RL). It's about learning what to do—how to map situations to actions—so as to maximize a numerical reward signal.
+
+
Reinforcement Learning (RL) is a type of machine learning where an agent learns to make a sequence of decisions in an environment to achieve a long-term goal. It is fundamentally different from other learning paradigms:
+
+
vs. Supervised Learning: In supervised learning, you have a labeled dataset (the "answer key"). The model learns by comparing its predictions to the correct answers.
+
Example: This is like a student studying for a test with a complete set of practice questions and the correct answers. They learn by correcting their mistakes.
+
+
vs. Unsupervised Learning: In unsupervised learning, the goal is to find hidden structure in unlabeled data. There are no right or wrong answers, just patterns.
+
Example: This is like a historian being given a thousand ancient, untranslated texts and trying to group them by language or topic, without any prior knowledge.
+
+
Reinforcement Learning: The agent learns from the consequences of its actions, not from being told what to do. The feedback is a scalar reward, which is often delayed.
+
Example: This is like a person learning to play a video game. They don't have an answer key. They learn that certain actions lead to points (rewards) and others lead to losing a life (negative rewards), and their goal is to get the highest score possible.
+
+
+
+
🔹 Core Components of RL
+
The "Training a Dog" analogy helps us define the core building blocks of any RL problem.
+
+
Agent: The learner and decision-maker. It perceives the environment and chooses actions.
+
Example: The puppy is the agent. In a video game, the character you control is the agent.
+
+
Environment: Everything the agent interacts with. It represents the world or the task the agent is trying to solve.
+
Example: Your house, including the furniture, your commands, and the treats, is the environment. The game world, including its rules, levels, and enemies, is the environment.
+
+
State (S): A complete description of the environment at a specific moment. It's the information the agent uses to make a decision.
+
Example: A state for the puppy could be a snapshot: "in the living room, toy is on the floor, owner is holding a treat." For a chess game, the state is the position of every piece on the board.
+
+
Action (A): A choice the agent can make from a set of possibilities.
+
Example: In the given state, the puppy's available actions might be "sit," "bark," "run," or "chew toy."
+
+
Reward (R): The immediate feedback signal from the environment after the agent performs an action. The agent's sole objective is to maximize the total reward it accumulates.
+
Example: If the puppy sits, it gets a +10 reward (a treat). If it barks, it gets a -1 reward (a stern look).
+
+
Policy (π): The agent's strategy or "brain." It's a function that maps a state to an action. A good policy will consistently choose actions that lead to high rewards.
+
Example: An initial, untrained policy for the puppy is random. A final, well-trained policy is a smart set of rules: "If I see my owner holding a treat, the best action is to sit immediately."
+
+
Value Function (V): A prediction of the total future reward an agent can expect to get, starting from a particular state. It represents the long-term desirability of a state.
+
Example: The puppy learns that the state "sitting by the front door in the evening" has a high value. While this state itself doesn't give an immediate reward, it often leads to a highly rewarding future state: going for a walk.
+
+
+
+
🔹 The Interaction Flow (Agent–Environment Loop)
+
RL is a continuous loop of interaction between the agent and the environment, where each step refines the agent's understanding.
+
+
+
The agent observes the current State (S_t).
+
Based on its Policy (π), the agent chooses an Action (A_t).
+
The environment receives the action, transitions to a new State (S_{t+1}), and gives the agent a Reward (R_{t+1}).
+
The agent uses this reward and new state to update its knowledge (its policy and value functions).
+
This loop repeats, allowing the agent to learn from experience and adapt its behavior over time.
+
+
+
🔹 Mathematical Foundations
+
+
To formalize this process, mathematicians use a framework called a Markov Decision Process (MDP). It's simply a way of writing down all the rules of the "game" the agent is playing, assuming that the future depends only on the current state and action, not on the past (the Markov Property).
+
+
An MDP is defined by a tuple: \( (S, A, P, R, \gamma) \)
+
+
\( S \): A set of all possible states (all possible configurations of the environment).
+
\( A \): A set of all possible actions.
+
\( P \): The state transition probability function, \( P(s'|s, a) \). This is the "physics" of the environment.
+
Example: In a slippery, icy world, if a robot in state "at square A" takes the action "move North," the transition probability might be: 80% chance of ending up in the state "at square B (north of A)," 10% chance of slipping and ending up "at square C (east of A)," and 10% chance of not moving at all ("at square A").
+
+
\( R \): The reward function, \( R(s, a) \). This defines the goal of the problem.
+
Example: In a maze, the reward is -1 for every step taken (to encourage finishing quickly) and +100 for taking the action that leads to the exit state.
+
+
\( \gamma \): The discount factor (a number between 0 and 1). It determines the present value of future rewards.
+
Example: A reward of 100 you receive in two steps is worth \(100 \times \gamma^2\) to you right now. If γ=0.9, that future reward is worth 81 now. If γ=0.1, it's worth only 1 now. This prevents infinite loops and makes the agent prioritize rewards that are closer in time.
+
+
+
+
🔹 Detailed Examples
+
Chess
+
+
Agent: The chess-playing program (e.g., AlphaZero).
+
Environment: The chessboard and the rules of chess, including the opponent's moves. The opponent is considered part of the environment because the agent cannot control their actions.
+
State: The exact position of all pieces on the board, plus whose turn it is.
+
Action: Making a legal move with one of the pieces.
+
Reward: A large positive reward (+1) for winning, a large negative reward (-1) for losing, and a small reward (0) for all other moves until the end of the game. This is an example of a sparse reward because most actions do not receive immediate feedback.
+
+
Self-Driving Car
+
+
Agent: The car's control system (the AI).
+
Environment: The road, other cars, pedestrians, traffic lights, and weather conditions.
+
State: A combination of the car's current speed, position, steering angle, and processed data from its sensors (e.g., detected lane lines from the camera, distances to obstacles from LiDAR).
+
Action: Can be discrete (turn left, turn right) or continuous (adjust the steering wheel by 3.5 degrees, accelerate by 5%).
+
Reward: The reward function is carefully designed ("reward shaping") to encourage good behavior: a small positive reward for every meter it moves forward safely, a small negative reward for jerky movements, and a large negative reward for any collision or traffic violation.
+
+
+
🔹 Advantages & Challenges
+
+
+
+
Advantages of RL
+
Challenges in RL
+
+
+
+
+
✅ Can solve complex problems that are difficult to program explicitly. Example: It's nearly impossible to write rules by hand for all situations a self-driving car might face. RL allows the car to learn these rules from experience.
+
❌ Large State Spaces: For problems like Go, the number of possible board states is greater than the number of atoms in the universe, making it impossible to explore them all.
+
+
+
✅ The agent can adapt to dynamic, changing environments. Example: A trading bot can adapt its strategy as market conditions change over time.
+
❌ Sparse Rewards: In many problems, rewards are only given at the very end (like winning a game). This is the "credit assignment problem" - it's hard for the agent to figure out which of its many early actions were actually responsible for the final win.
+
+
+
✅ A very general framework that can be applied to many different fields.
+
❌ Exploration vs. Exploitation: This is a fundamental trade-off.
+
Example: When choosing a restaurant, do you exploit your knowledge and go to your favorite place that you know is great? Or do you explore a new restaurant that might be even better, but also risks being terrible?
+
+
+
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the main difference between the feedback an agent gets in Reinforcement Learning versus Supervised Learning?
+
What is a "policy" in RL? Give a simple real-world analogy.
+
In the MDP formulation, what does the discount factor (gamma, γ) control? What would γ = 0 mean?
+
What is the "Exploration vs. Exploitation" dilemma? Provide an example from your own life.
+
+
+
Answers
+
1. In Supervised Learning, the feedback is the "correct answer" from a labeled dataset. In Reinforcement Learning, the feedback is a scalar "reward" signal, which only tells the agent how good its action was, not what the best action would have been.
+
2. A policy is the agent's strategy for choosing an action in a given state. A simple analogy is a recipe: for a given state ("I have eggs, flour, and sugar"), the policy (recipe) tells you which action to take ("mix them together").
+
3. The discount factor controls how much the agent cares about future rewards versus immediate rewards. A γ = 0 would mean the agent is completely "myopic" or short-sighted, only caring about the immediate reward from its next action and ignoring any long-term consequences.
+
4. It's the dilemma of choosing between trying something new (exploration) to potentially find a better outcome, versus sticking with what you know works well (exploitation). An example is choosing a restaurant: do you go to your favorite restaurant that you know is great (exploitation), or do you try a new one that might be even better, or might be terrible (exploration)?
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Dog Trainer's Manual
+
+
+
+ Policy (π):
+
+ What it is: The agent's brain or strategy. It's the rulebook the agent uses to decide what action to take in any given state.
+
+ Story Example: The puppy's final, well-trained policy is a set of rules like: "If my human is home, and it's 6 PM, and my bowl is empty, then the best action is to go sit by my bowl."
+
+
+ Markov Decision Process (MDP):
+
+ What it is: The mathematical framework used to describe an RL problem. It formalizes the agent, environment, states, actions, and rewards.
+
+ Story Example: The MDP is the complete "rulebook of the universe" for the puppy. It contains a list of every possible room configuration (states), every possible puppy action, the rules of what happens after each action, and the rewards for each action.
+
+
+ Discount Factor (γ):
+
+ What it is: A number between 0 and 1 that represents the importance of future rewards.
+
+ Story Example: A puppy with a high discount factor is patient. It's willing to perform a series of less-rewarding actions (like "come," "heel," "stay") because it knows it will lead to a very big treat at the end. A puppy with a low discount factor is impatient and will always choose the action that gets it a small treat *right now*.
+
Imagine you're a detective hired by a supermarket. Your mission is to analyze thousands of shopping receipts (transactions) to find hidden patterns. You soon notice a classic pattern: "Customers who buy bread also tend to buy butter." This is a valuable clue! The store can place bread and butter closer together to increase sales. The Apriori Algorithm is the systematic method this detective uses to sift through all the receipts and find these "frequently bought together" item combinations and turn them into powerful rules. This whole process is called Market Basket Analysis.
+
+
The Apriori Algorithm is a classic algorithm used for association rule mining. Its main goal is to find relationships and patterns between items in large transactional datasets. It generates rules in the format "If A, then B," helping businesses understand customer behavior and make smarter decisions.
+
+
🔹 Key Definitions
+
To be a good supermarket detective, you need to know the lingo. The three most important metrics are Support, Confidence, and Lift.
+
+
Example Scenario: Let's say we have 100 shopping receipts.
+
+
80 receipts contain {Bread}.
+
70 receipts contain {Butter}.
+
60 receipts contain both {Bread, Butter}.
+
+
+
+
+ Support: The popularity of an itemset. It's the fraction of total transactions that contain that itemset.
+
Example: The support for {Bread, Butter} is 60/100 = 0.6 or 60%. This tells us that 60% of all shoppers bought bread and butter together. High support means the itemset is frequent.
+
+
+ Confidence: The reliability of a rule. For a rule {Bread} => {Butter}, it's the probability of finding butter in a basket that already has bread.
+
Example: Confidence({Bread} => {Butter}) = Support({Bread, Butter}) / Support({Bread}) = 60 / 80 = 0.75 or 75%. This means that 75% of customers who bought bread also bought butter. High confidence makes the rule strong.
+
+
+ Lift: The strength of a rule compared to random chance. It tells you how much more likely customers are to buy Y when they buy X.
+
Example: Lift({Bread} => {Butter}) = Confidence({Bread} => {Butter}) / Support({Butter}) = 0.75 / 0.70 ≈ 1.07. A lift greater than 1 means the items appear together more often than random chance would predict, so the rule is genuinely meaningful; a lift of exactly 1 means the items are independent.
The Detective's Golden Rule: Our detective quickly realizes a simple but powerful truth: If customers rarely buy {Milk}, then they will *definitely* rarely buy the combination {Milk, Bread, Eggs}. Why waste time checking the records for a combination containing an already unpopular item? This is the Apriori Principle.
+
+
The principle states: "All non-empty subsets of a frequent itemset must also be frequent." This is the core idea that makes the Apriori algorithm efficient. It allows the algorithm to "prune" the search space by eliminating a huge number of candidate itemsets. If {Milk} is infrequent, any larger itemset containing {Milk} is guaranteed to be infrequent and can be ignored.
+
+
🔹 Algorithm Steps
+
The algorithm works iteratively, building up larger and larger frequent itemsets level by level.
+
+
+
Set a Minimum Support Threshold: The detective decides they only care about itemsets that appear in at least, say, 50% of receipts.
+
Find Frequent 1-Itemsets (L1): Scan all receipts and find every individual item that meets the minimum support. These are your "frequent items."
+
Generate and Prune (Iterate):
+
+
Join: Take the frequent itemsets from the previous step (Lk-1) and combine them to create candidate k-itemsets (Ck). E.g., combine {Bread} and {Butter} to make {Bread, Butter}.
+
Prune: This is where the Apriori Principle comes in. Check every candidate. If any of its subsets is not in the frequent list (Lk-1), discard it immediately.
+
Scan: For the remaining candidates, scan the database to count their support. Keep only those that meet the minimum support threshold. This new list is Lk.
+
+
+
Repeat Step 3 until no new frequent itemsets can be found.
+
Generate Rules: Once you have all frequent itemsets, generate association rules (like {Bread} => {Butter}) from them that meet a minimum confidence threshold.
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ Simple and Intuitive: The logic is easy to understand and explain.
+
✅ Guaranteed to Find All Rules: It is a complete algorithm that will find all frequent itemsets and rules if they exist.
+
+
Disadvantages:
+
+
❌ Computationally Expensive: It requires multiple scans of the entire database, which can be very slow for large datasets.
+
❌ Many Candidate Itemsets: It can generate a huge number of candidate itemsets, especially in early passes, which consumes a lot of memory.
+
❌ Requires Tuning: Finding the right `min_support` and `min_confidence` can be tricky and requires trial and error.
+
+
+
🔹 Python Implementation (Beginner Example with `mlxtend`)
+
+
Here, we'll be a supermarket detective with a small set of receipts. We need to prepare our data in a specific way (a one-hot encoded format) where each row is a transaction and each column is an item. Then, we'll use the `apriori` function to find frequent itemsets and `association_rules` to find the strong relationships.
+
+
import pandas as pd
from mlxtend.frequent_patterns import apriori, association_rules

# --- 1. Create a Sample Dataset ---
# This represents 5 shopping receipts (one inner list per transaction).
dataset = [['Milk', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Dill', 'Onion', 'Nutmeg', 'Kidney Beans', 'Eggs', 'Yogurt'],
           ['Milk', 'Apple', 'Kidney Beans', 'Eggs'],
           ['Milk', 'Unicorn', 'Corn', 'Kidney Beans', 'Yogurt'],
           # Fixed: 'Onion' was accidentally listed twice in this receipt.
           ['Corn', 'Onion', 'Kidney Beans', 'Ice cream', 'Eggs']]

# --- 2. Prepare Data in One-Hot Encoded Format ---
# mlxtend's apriori needs the data as a DataFrame of True/False values:
# one row per transaction, one column per distinct item.
from mlxtend.preprocessing import TransactionEncoder
te = TransactionEncoder()
te_ary = te.fit(dataset).transform(dataset)
df = pd.DataFrame(te_ary, columns=te.columns_)

# --- 3. Find Frequent Itemsets with Apriori ---
# We set min_support to 0.6, meaning we only want itemsets
# that appear in at least 60% of the transactions (3 out of 5).
frequent_itemsets = apriori(df, min_support=0.6, use_colnames=True)
print("--- Frequent Itemsets (Support >= 60%) ---")
print(frequent_itemsets)

# --- 4. Generate Association Rules ---
# We generate rules that have a confidence of at least 70%,
# then sort by "lift" so the strongest relationships come first.
rules = association_rules(frequent_itemsets, metric="confidence", min_threshold=0.7)
sorted_rules = rules.sort_values(by='lift', ascending=False)
print("\n--- Strong Association Rules (Confidence >= 70%) ---")
print(sorted_rules[['antecedents', 'consequents', 'support', 'confidence', 'lift']])
+
+
+
🔹 Best Practices
+
+
Data Formatting: Ensure your data is properly formatted (one-hot encoded) before applying the algorithm.
+
Setting Thresholds: Start with a higher `min_support` and gradually lower it. If it's too low on a large dataset, you might run out of memory.
+
Use Lift: Don't just rely on confidence. A rule might have high confidence just because the consequent is a very popular item. Lift tells you if the rule is truly meaningful.
+
Consider Alternatives: For very large datasets, algorithms like FP-Growth are often much faster than Apriori because they don't require candidate generation.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the Apriori Principle, and why is it important?
+
If Support({A}) = 30%, Support({B}) = 40%, and Support({A, B}) = 20%, what is the Confidence of the rule {A} => {B}?
+
A rule {Diapers} => {Beer} has a Lift of 3.0. What does this mean in plain English?
+
What is the main performance bottleneck of the Apriori algorithm?
+
+
+
Answers
+
1. The Apriori Principle states that all subsets of a frequent itemset must also be frequent. It's important because it allows the algorithm to prune a massive number of candidate itemsets early on, making the process much more efficient.
2. Confidence({A} => {B}) = Support({A, B}) / Support({A}) = 20% / 30% ≈ 66.7%. This means that about two-thirds of the customers who bought A also bought B.
+
3. A Lift of 3.0 means that customers who buy diapers are 3 times more likely to buy beer than a randomly chosen customer. This indicates a strong positive association.
+
4. The main bottleneck is the candidate generation step. In each pass, it can create a very large number of potential itemsets that need to be checked against the entire database, which is slow and memory-intensive.
+
+
+
+
🔹 Key Terminology Explained (Apriori)
+
+
The Story: Decoding the Supermarket Detective's Notebook
+
+
+
+ Itemset:
+
+ What it is: A collection of one or more items purchased in a transaction.
+
+ Story Example: {Bread, Butter} is a 2-itemset. {Milk} is a 1-itemset. A single shopping receipt can contain many different itemsets.
+
+
+ Association Rule:
+
+ What it is: An "if-then" statement showing the relationship between two itemsets.
+
+ Story Example: {Bread} => {Butter} is an association rule. The "if" part ({Bread}) is called the antecedent, and the "then" part ({Butter}) is called the consequent.
+
+
+ Pruning:
+
+ What it is: The process of discarding candidate itemsets that are guaranteed to be infrequent without actually counting their occurrences in the database.
+
+ Story Example: This is the detective's efficiency trick. By knowing that {Caviar} is rare (infrequent), they immediately prune and throw away the need to check for {Caviar, Bread} or {Caviar, Milk}, saving a huge amount of time.
+
+
+ One-Hot Encoding:
+
+ What it is: A way of preparing transactional data for the algorithm. It creates a table where each row is a transaction, each column is an item, and the cells are True/False (or 1/0) indicating if the item was in that transaction.
+
+ Story Example: It's like turning each receipt into a checklist. For a receipt with bread and milk, the "Bread" and "Milk" columns would be checked (True), while all other columns (Butter, Eggs, etc.) would be unchecked (False).
+
Imagine you want to guess the number of jellybeans in a giant jar. If you ask one person, their guess might be way off. They might be an expert, or they might be terrible at guessing. Their prediction has high variance. But what if you ask 100 different people and take the average of all their guesses? The final averaged guess is almost always much closer to the true number than any single individual's guess. This is the "wisdom of crowds" effect. Bagging applies this same logic to machine learning. Instead of trusting one complex model (one expert guesser), we train many models on slightly different perspectives of the data and combine their predictions to get a more stable and accurate result.
+
+
Bagging, short for Bootstrap Aggregating, is a powerful ensemble machine learning technique. Its primary goal is to reduce the variance of a model, thereby preventing overfitting and improving its stability. It works by training multiple instances of the same base model on different random subsets of the training data and then aggregating their predictions.
+
+
🔹 2. How Bagging Works
+
The process of Bagging is a straightforward three-step method.
+
+
+
Bootstrap Sampling: This is the "B" in Bagging. We create multiple new training datasets from our original dataset. Each new dataset is created by sampling with replacement.
+
Example: If our original dataset is `[A, B, C, D]`, a bootstrap sample might be `[B, A, D, B]`. Notice that 'B' was picked twice and 'C' was not picked at all. Each bootstrap sample is the same size as the original dataset.
+
+
Train Models in Parallel: We train a separate instance of the same base model (e.g., a Decision Tree) on each of the bootstrap samples. Since these models are independent of each other, they can all be trained at the same time (in parallel).
+
Aggregate Predictions: Once all models are trained, we use them to make predictions on new, unseen data. The final prediction is an aggregation of all the individual model predictions.
+
+
For Regression (predicting a number): We take the average of all predictions.
+
For Classification (predicting a category): We take a majority vote.
+
+
+
+
+
🔹 3. Mathematical Concept
+
The aggregation step is what combines the "wisdom" of the individual models. For a new data point \(x\), and \(m\) trained models:
+
+
Regression: The final prediction is the mean of the individual predictions.
+
$$ \hat{y} = \frac{1}{m} \sum_{i=1}^{m} f_i(x) $$
+
+
Classification: The final prediction is the class that receives the most votes.
+
🔹 4. Key Points
+
Reduces Variance: This is the primary benefit. By averaging the outputs, the random errors and quirks of individual models tend to cancel each other out, leading to a much more stable final prediction.
+
Best with Unstable Models: Bagging is most effective when used with high-variance, low-bias models. Decision Trees are the perfect example: a single deep decision tree is very prone to overfitting (high variance), but a bagged ensemble of them is very robust.
+
Parallelizable: Each model in the ensemble is trained independently, making Bagging very efficient on multi-core processors.
+
+
+
🔹 5. Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Significantly reduces overfitting and variance.
+
❌ Increased Computational Cost: You have to train multiple models instead of just one, which takes more time and resources.
+
+
+
✅ Often leads to a major improvement in accuracy and stability.
+
❌ Loss of Interpretability: It's easy to understand and visualize a single decision tree, but it's very difficult to interpret the combined logic of 100 different trees.
+
+
+
✅ Can be applied to almost any type of base model (e.g., trees, SVMs, neural networks).
+
❌ Less effective for models that are already stable and have low variance (like Linear Regression).
+
+
+
+
+
🔹 6. Python Implementation (Beginner Example)
+
+
In this example, we'll compare a single, complex Decision Tree to a Bagging ensemble of many Decision Trees. We expect the single tree to overfit and perform perfectly on the training data but poorly on the test data. The Bagging classifier should be more robust and perform well on both.
+
+
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score

# --- 1. Create a Sample Dataset ---
# 500 samples, 10 features (5 informative); fixed seed for reproducibility.
X, y = make_classification(n_samples=500, n_features=10, n_informative=5,
                           n_redundant=0, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# --- 2. Train a Single Decision Tree (High Variance Model) ---
# An unconstrained tree will overfit the training data.
single_tree = DecisionTreeClassifier(random_state=42)
single_tree.fit(X_train, y_train)
y_pred_tree = single_tree.predict(X_test)
print(f"Single Decision Tree Accuracy: {accuracy_score(y_test, y_pred_tree):.2%}")

# --- 3. Train a Bagging Ensemble of Decision Trees ---
# We create an ensemble of 100 decision trees.
# NOTE: the keyword is `estimator` — `base_estimator` was renamed in
# scikit-learn 1.2 and removed in 1.4, so the old name now raises an error.
bagging_clf = BaggingClassifier(
    estimator=DecisionTreeClassifier(random_state=42),
    n_estimators=100,
    random_state=42
)
bagging_clf.fit(X_train, y_train)
y_pred_bagging = bagging_clf.predict(X_test)
print(f"Bagging Classifier Accuracy: {accuracy_score(y_test, y_pred_bagging):.2%}")
+
+
+
🔹 7. Applications
+
+
Random Forest: The most famous application of Bagging. A Random Forest is an ensemble of decision trees that uses Bagging for data sampling and adds an extra layer of randomness by also selecting a random subset of features for each tree.
+
Medical Diagnosis: Combining the opinions of multiple diagnostic models to make a more reliable prediction about a patient's condition.
+
Fraud Detection: Training multiple models on different subsets of transaction data to create a more robust fraud detection system.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What does "Bootstrap Aggregating" mean?
+
What is the main goal of Bagging? Does it primarily reduce bias or variance?
+
If you were using Bagging for a regression problem to predict house prices, how would you calculate the final prediction from your ensemble of models?
+
Why is Bagging not very effective when used with a simple model like Linear Regression?
+
+
+
Answers
+
1. Bootstrap refers to creating random subsamples of the data with replacement. Aggregating refers to combining the predictions of the models trained on these subsamples (e.g., by averaging or voting).
+
2. The main goal of Bagging is to reduce variance. It helps to stabilize unstable models that are prone to overfitting.
+
3. You would take the average of the price predictions from all the individual models in the ensemble.
+
4. Linear Regression is a low-variance (stable) model. Its predictions don't change drastically even when the training data is slightly modified. Since Bagging's main strength is reducing variance, it provides little benefit to an already stable model.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Jellybean Guesser's Strategy
+
+
+
+ Ensemble Method:
+
+ What it is: A machine learning technique where multiple models (often called "weak learners") are trained and their predictions are combined to achieve better performance than any single model alone.
+
+ Story Example: Instead of relying on one expert jellybean guesser, you assemble a "committee" or ensemble of 100 guessers.
+
+
+ Bootstrap Sampling:
+
+ What it is: A resampling method that involves drawing random samples from a dataset *with replacement*.
+
+ Story Example: To give each of your 100 guessers a slightly different perspective, you show each one a different random handful of jellybeans from the jar (and you put the beans back each time). This is bootstrap sampling.
+
+
+ Variance (in Models):
+
+ What it is: A measure of how much a model's predictions would change if it were trained on a different subset of the data. High variance means the model is unstable and sensitive to the specific training data it sees (i.e., it overfits).
+
+ Story Example: A single, overconfident "expert" guesser has high variance; their guess might be very different if they saw a slightly different handful of jellybeans. The averaged guess of the crowd has low variance.
+
Imagine a group of students studying for a difficult exam. Instead of studying independently (like in Bagging), they study sequentially. The first student takes a practice test and gets some questions right and some wrong. The second student then focuses specifically on the questions the first student got wrong. Then, a third student comes in and focuses on the questions that the first two *still* struggled with. They continue this process, with each new student specializing in the mistakes of their predecessors. Finally, they take the exam as a team, with the opinions of the students who studied the hardest topics given more weight. This is Boosting. It's an ensemble technique that builds a strong model by sequentially training new models to correct the errors of the previous ones.
+
+
Boosting is a powerful ensemble technique that aims to convert a collection of "weak learners" (models that are only slightly better than random guessing) into a single "strong learner." Unlike Bagging, which trains models in parallel, Boosting is a sequential process where each new model is built to fix the errors made by the previous models.
+
+
🔹 2. How Boosting Works
+
The core idea of Boosting is to iteratively focus on the "hard" examples in the dataset.
+
+
+
Train a Weak Learner: Start by training a simple base model (often a very shallow decision tree called a "stump") on the original dataset.
+
Identify Errors: Use this model to make predictions on the training set and identify which samples it misclassified.
+
Increase Weights: Assign higher weights to the misclassified samples. This forces the next model in the sequence to pay more attention to these "hard" examples.
+
Train the Next Learner: Train a new weak learner on the re-weighted dataset. This new model will naturally focus on getting the previously incorrect samples right.
+
Repeat and Aggregate: Repeat steps 2-4 for a specified number of models. The final prediction is a weighted combination of all the individual models' predictions, where better-performing models are given a higher weight.
+
+
+
🔹 3. Mathematical Concept
+
The final prediction of a boosting model is a weighted sum (for regression) or a weighted majority vote (for classification) of all the weak learners.
+
$$ F(x) = \sum_{m=1}^{M} \alpha_m h_m(x) $$
+
+
\( h_m(x) \): The prediction of the m-th weak learner.
+
\( \alpha_m \): The weight assigned to the m-th learner. This weight is typically calculated based on the learner's accuracy—better models get a bigger say in the final prediction.
+
\( F(x) \): The final, combined prediction of the strong learner.
+
+
+
🔹 4. Popular Boosting Algorithms
+
There are several famous implementations of the boosting idea:
+
+
AdaBoost (Adaptive Boosting): The original boosting algorithm. It adjusts the weights of the training samples at each step.
+
Gradient Boosting: A more generalized approach. Instead of re-weighting samples, each new model is trained to predict the *residual errors* (the difference between the true values and the current ensemble's prediction) of the previous models.
+
XGBoost (Extreme Gradient Boosting): A highly optimized and regularized version of Gradient Boosting. It's known for its speed and performance and is a dominant algorithm in machine learning competitions.
+
LightGBM & CatBoost: Even more modern and efficient implementations of Gradient Boosting, designed for speed on large datasets and better handling of categorical features.
+
+
+
🔹 5. Key Points
+
+
Sequential vs. Parallel: Boosting is sequential (models are trained one after another). Bagging is parallel (models are trained independently).
+
Bias and Variance: Boosting is a powerful technique that can reduce both bias and variance, leading to very strong predictive models.
+
Weak Learners: The base models in boosting are typically very simple (e.g., decision trees with a depth of just 1 or 2). This prevents the individual models from overfitting.
+
Sensitive to Outliers: Because boosting focuses on hard-to-classify examples, it can be sensitive to outliers, as it will try very hard to correctly classify these noisy points.
+
+
+
🔹 6. Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Often achieves the highest predictive accuracy among all machine learning algorithms.
+
❌ Computationally Expensive: The sequential nature means it cannot be easily parallelized, which can make it slow to train.
+
+
+
✅ Can handle a variety of data types and complex relationships.
+
❌ Sensitive to Outliers and Noisy Data: It may over-emphasize noisy or outlier data points by trying too hard to classify them correctly.
+
+
+
✅ Many highly optimized implementations exist (XGBoost, LightGBM).
+
❌ Prone to Overfitting if the number of models is too large, without proper regularization.
+
+
+
+
+
🔹 7. Python Implementation (Sketches)
+
+
Here are simple examples of how to use two classic boosting algorithms in scikit-learn. The setup is very similar to other classifiers.
+
+
+
AdaBoost Example
+
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
# Assume X_train, y_train, X_test are defined

# AdaBoost often uses a "stump" (a tree with depth 1) as its weak learner.
weak_learner = DecisionTreeClassifier(max_depth=1)

# Create the AdaBoost model.
# NOTE: the keyword is `estimator` — `base_estimator` was renamed in
# scikit-learn 1.2 and removed in 1.4, so the old name now raises an error.
adaboost_clf = AdaBoostClassifier(
    estimator=weak_learner,
    n_estimators=50,  # The number of students in our study group
    learning_rate=1.0,
    random_state=42
)
adaboost_clf.fit(X_train, y_train)
y_pred = adaboost_clf.predict(X_test)
+
+
+
Gradient Boosting Example
+
from sklearn.ensemble import GradientBoostingClassifier
# Assume X_train, y_train, X_test are defined

# Configure a Gradient Boosting classifier: 100 sequential trees, each
# one fitted to the residual errors of the ensemble built so far.
gb_params = dict(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,  # trees are often slightly deeper than AdaBoost's stumps
    random_state=42,
)
gradient_boosting_clf = GradientBoostingClassifier(**gb_params)
gradient_boosting_clf.fit(X_train, y_train)
y_pred = gradient_boosting_clf.predict(X_test)
+
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the fundamental difference between how Bagging and Boosting train their models?
+
What is a "weak learner" in the context of boosting?
+
In Gradient Boosting, what does each new model try to predict?
+
Why is Boosting more sensitive to outliers than Bagging?
+
+
+
Answers
+
1. Bagging trains its models in parallel on different bootstrap samples of the data. Boosting trains its models sequentially, where each new model is trained to correct the errors of the previous ones.
+
2. A "weak learner" is a model that performs only slightly better than random guessing. In boosting, simple models like shallow decision trees (stumps) are used as weak learners.
+
3. Each new model in Gradient Boosting is trained to predict the residual errors of the current ensemble's predictions.
+
4. Boosting is more sensitive because its core mechanism involves increasing the weights of misclassified samples. An outlier is, by definition, a hard-to-classify point, so the algorithm will focus more and more on this single point, which can distort the decision boundary and harm generalization.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Study Group's Strategy
+
+
+
+ Weak Learner:
+
+ What it is: A simple model that has a predictive accuracy only slightly better than random chance.
+
+ Story Example: Each individual student in the study group is a weak learner. On their own, they might only get 55% on a true/false test, but by combining their specialized knowledge, they can ace the exam.
+
+
+ Sequential Training:
+
+ What it is: A training process where models are built one after another, and the creation of each new model depends on the results of the previous ones.
+
+ Story Example: The study group's process is sequential because the second student can't start studying until the first student has taken the practice test and identified their mistakes.
+
+
+ Residual Error (in Gradient Boosting):
+
+ What it is: The difference between the actual target value and the predicted value. It's what the model got wrong.
+
+ Story Example: If a student was supposed to predict a house price of $300k but their model predicted $280k, the residual error is +$20k. The next student's job is to build a model that predicts this +$20k error.
+
+ DBSCAN is a density-based clustering algorithm that groups data points that are closely packed together and marks outliers as noise based on their density in the feature space. It identifies clusters as dense regions in the data space separated by areas of lower density. Unlike K-Means or hierarchical clustering, which assume clusters are compact and spherical, DBSCAN performs well in handling real-world data irregularities such as:
+
+
+
Arbitrary-Shaped Clusters: Clusters can take any shape, not just circular or convex.
+
Noise and Outliers: It effectively identifies and handles noise points without assigning them to any cluster.
+
+
+ The figure below shows a dataset processed by different clustering algorithms: K-Means and hierarchical clustering handle compact, spherical clusters with varying noise tolerance, while DBSCAN manages arbitrary-shaped clusters and explicitly handles noise.
+
+
+
Key Parameters in DBSCAN
+
+ 1. eps ($\epsilon$): This defines the radius of the neighborhood around a data point. If the distance between two points is less than or equal to $\epsilon$, they are considered neighbors. A common method to determine $\epsilon$ is by analyzing the k-distance graph. Choosing the right $\epsilon$ is important:
+
+
+
If $\epsilon$ is too small, most points will be classified as noise.
+
If $\epsilon$ is too large, clusters may merge and the algorithm may fail to distinguish between them.
+
+
+ 2. MinPts: This is the minimum number of points required within the $\epsilon$ radius to form a dense region. A general rule of thumb is to set MinPts $\ge D+1$, where $D$ is the number of dimensions in the dataset.
+
+
+
How Does DBSCAN Work?
+
+ DBSCAN works by categorizing data points into three types:
+
+
+
Core points: which have a sufficient number of neighbors within a specified radius (epsilon).
+
Border points: which are near core points but lack enough neighbors to be core points themselves.
+
Noise points: which do not belong to any cluster.
+
+
+
Steps in the DBSCAN Algorithm
+
+
Identify Core Points: For each point in the dataset, count the number of points within its $\epsilon$ neighborhood. If the count meets or exceeds MinPts, mark the point as a core point.
+
Form Clusters: For each core point that is not already assigned to a cluster, create a new cluster. Recursively find all density-connected points (i.e., points within the $\epsilon$ radius of the core point) and add them to the cluster.
+
Density Connectivity: Two points $a$ and $b$ are density-connected if there exists a chain of points where each point is within the $\epsilon$ radius of the next, and at least one point in the chain is a core point. This chaining process ensures that all points in a cluster are connected through a series of dense regions.
+
Label Noise Points: After processing all points, any point that does not belong to a cluster is labeled as noise.
+
+
+
How this DBSCAN Visualization Handles User-Added Data
+
+ In this interactive visualization, when you click the "Add New Point & Cluster" button, the new point you specify is appended to the existing dataset. Importantly, the entire DBSCAN clustering algorithm is then re-run from scratch on this updated dataset. This means that:
+
+
+
The new point is treated as part of the original data, and its type (core, border, or noise) and cluster assignment are determined by the algorithm based on its density relative to all other points.
+
The cluster assignments and even the types (core/border/noise) of previously existing points might change, as the addition of a new point can alter the density landscape and connectivity.
+
The visualization dynamically updates to reflect these new cluster structures, showing the real-time effect of adding data on the DBSCAN clustering process.
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Deep-Reinforcement-Learning.html b/templates/Deep-Reinforcement-Learning.html
new file mode 100644
index 0000000000000000000000000000000000000000..4cb3795dfe5170f25d7e5e44814f826b2670b99a
--- /dev/null
+++ b/templates/Deep-Reinforcement-Learning.html
@@ -0,0 +1,322 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: Deep Reinforcement Learning
+
+
+
+
+
+
+
+
+
🚀 Study Guide: Deep Reinforcement Learning (DRL)
+
+
🔹 1. Introduction
+
+
Story-style intuition: Upgrading the Critic's Brain
+
Remember our food critic from the Q-Learning guide with their giant notebook (the Q-table)? That notebook worked fine for a small city with a few restaurants. But what if they move to a massive city with millions of restaurants, where the menu changes every night (a continuous state space)? Their notebook is useless! It's too big to create and too slow to look up.
+ To solve this, the critic replaces their notebook with a powerful, creative brain—a Deep Neural Network. Now, instead of looking up an exact restaurant and dish, they can just describe the situation ("a fancy French restaurant, feeling adventurous") and their brain can *predict* a good Q-value for any potential dish on the spot. Deep Reinforcement Learning (DRL) is this powerful combination of RL's trial-and-error learning with the pattern-recognition power of deep learning.
+
+
Deep Reinforcement Learning (DRL) is a subfield of machine learning that combines Reinforcement Learning (RL) with Deep Learning (DL). Instead of using tables to store values, DRL uses deep neural networks to approximate the optimal policy and/or value functions, allowing it to solve problems with vast, high-dimensional state and action spaces.
+
+
🔹 2. Why Deep RL?
+
Traditional RL methods like Q-Learning rely on tables (Q-tables) to store a value for every possible state-action pair. This approach fails spectacularly when the number of states or actions becomes very large or continuous.
+
+
Example: An Atari Game
+
+
The State: A single frame from the game is an image of, say, 84x84 pixels. Even with just 256 grayscale values, the number of possible states is \(256^{(84 \times 84)}\), a number larger than all the atoms in the universe. Creating a Q-table is impossible.
+
The DRL Solution: A deep neural network (specifically, a Convolutional Neural Network or CNN) can take the raw pixels of the game screen as input and directly output the Q-values for each possible action (e.g., {Up, Down, Left, Right}). It learns to recognize patterns like the position of the ball and the paddle, just like a human would.
+
+
+
+
🔹 3. Core Components
+
The core components are the same as in classic RL, but the implementation is powered by neural networks.
+
+
Agent: The decision-maker, whose "brain" is now a deep neural network.
+
Environment: The world the agent interacts with.
+
State Representation: Often high-dimensional raw data, like image pixels or sensor readings.
+
Action Space: Can be discrete or continuous.
+
Reward Signal: The feedback that guides the learning process.
+
+
+
🔹 4. Types of Deep RL Algorithms
+
+
DRL agents can learn in different ways, just like people. Some focus on judging the situation (value-based), some focus on learning a skill (policy-based), and the most advanced do both at the same time (Actor-Critic).
+
+
+
+ Value-Based Methods (e.g., DQN): The neural network learns to predict the Q-value for each action. The policy is simple: just choose the action with the highest predicted Q-value.
+
Analogy: This is a "critic" agent. It doesn't have an innate skill, but it's an expert at evaluating the potential of every possible move.
+
+
+ Policy-Based Methods (e.g., REINFORCE): The neural network learns the policy directly. It takes a state as input and outputs the probability of taking each action.
+
Analogy: This is an "actor" agent. It develops a direct instinct or muscle memory for what to do in a situation, without necessarily calculating the long-term value of its actions.
+
+
+ Actor-Critic Methods (e.g., A2C, PPO): This is the hybrid approach. Two neural networks are used: an Actor that controls the agent's behavior (the policy) and a Critic that evaluates how good those actions are (the value function). The Critic gives feedback to the Actor, helping it to improve.
+
Analogy: This is like an actor on stage with a director. The actor performs, and the director (critic) provides feedback ("That was a great delivery!") to help the actor refine their performance.
+
+
+
+
🔹 5. Deep Q-Networks (DQN)
+
DQN was a breakthrough algorithm that successfully used a deep neural network to play Atari games at a superhuman level. It introduced two key innovations to stabilize learning:
+
+
+
Experience Replay: The agent stores its past experiences `(state, action, reward, next_state)` in a large memory buffer. During training, it samples random mini-batches from this buffer to update its neural network. This breaks the correlation between consecutive experiences, making training more stable and efficient.
+
Target Network: DQN uses a second, separate neural network (the "target network") to generate the target Q-values in the update rule. This target network is a clone of the main network but is updated only periodically. This provides a stable target for the Q-value updates, preventing the learning process from spiraling out of control.
+
+
+
🔹 6. Policy Gradient Methods
+
+
The Archer's Analogy: An archer (the policy network) shoots an arrow. If the arrow hits close to the bullseye (high reward), they adjust their stance and aim (the network's weights) slightly in the same direction they just used. If the arrow misses badly (low reward), they adjust their aim in the opposite direction. Policy Gradient is this simple idea of "do more of what works and less of what doesn't," scaled up with calculus (gradient ascent).
+
+
These methods directly optimize the policy's parameters \( \theta \) to maximize the expected return \( J(\theta) \). The core idea is to update the policy in the direction that makes good actions more likely and bad actions less likely.

🔹 7. Actor-Critic Methods

Actor-Critic methods are the state-of-the-art for many DRL problems, especially those with continuous action spaces. They combine the best of both worlds:
+
+
The Actor (policy network) is responsible for taking actions.
+
The Critic (value network) provides feedback by evaluating the actions taken by the Actor.
+
+
This setup is more stable and sample-efficient because the Critic provides a low-variance "baseline" to judge the Actor's actions against, leading to better and faster learning.
+
Example Algorithms: PPO (Proximal Policy Optimization) and SAC (Soft Actor-Critic) are two of the most popular and robust DRL algorithms used today.
+
+
🔹 8. Challenges in DRL
+
+
High Sample Complexity: DRL agents often need millions or even billions of interactions with the environment to learn a good policy, making them very data-hungry.
+
Training Instability: The learning process can be highly sensitive to hyperparameters and random seeds, and can sometimes diverge or collapse.
+
Reward Design: Crafting a reward function that encourages the desired behavior without allowing for unintended "loopholes" or "reward hacking" is very difficult.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the primary problem with using a Q-table that led to the development of Deep RL?
+
What is "Experience Replay" in DQN, and why is it important?
+
What are the two main components of an Actor-Critic agent?
+
Which type of DRL algorithm would be most suitable for controlling a robot arm with precise, continuous movements?
+
+
+
Answers
+
1. Q-tables cannot handle very large or continuous state spaces. The number of states in problems like video games or robotics is often effectively infinite, making it impossible to create or store a table for them.
+
2. Experience Replay is the technique of storing past transitions `(s, a, r, s')` in a memory buffer and then training the network on random samples from this buffer. It is important because it breaks the temporal correlation between consecutive samples, leading to more stable and efficient training.
+
3. An Actor (which learns and executes the policy) and a Critic (which learns and provides feedback on the value of states or actions).
+
4. An Actor-Critic method (like DDPG, PPO, or SAC) would be most suitable. Policy-based and Actor-Critic methods are naturally able to handle continuous action spaces, whereas value-based methods like DQN are designed for discrete actions.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the DRL Agent's Brain
+
+
+
+ Function Approximator:
+
+ What it is: Any function that can generalize from a set of inputs to produce an output, used to estimate a target function. In DRL, a deep neural network is used as a function approximator.
+
+ Story Example: Instead of a giant phone book (a table) that lists every person's exact phone number, you have a smart assistant (a function approximator). You can just ask it for "John Smith's number," and it can predict the number even if it's not explicitly in its contact list.
+
+
+ Experience Replay:
+
+ What it is: A technique where the agent stores its past experiences and samples from them randomly to train.
+
+ Story Example: This is like a student who, instead of just studying the last problem they solved, keeps a stack of all their past homework problems. To study for a test, they randomly pull problems from this stack. This prevents them from only remembering how to solve the most recent type of problem and helps them remember everything they've learned.
+
+
+ Policy Gradient:
+
+ What it is: The mathematical gradient (or direction of steepest ascent) of the policy's performance. RL algorithms use this to "climb the hill" towards a better policy.
+
+ Story Example: This is the archer's learning process. The policy gradient is the exact direction they need to adjust their aim to get closer to the bullseye, based on where their last arrow landed.
+
Imagine our Supermarket Detective (from the Apriori guide) has a new colleague, an efficient librarian. The detective uses a "horizontal" approach: they go through each shopping receipt one by one to see what's inside. The librarian uses a "vertical" approach. Instead of looking at receipts, they create an index card for every single item in the store. On the card for "Milk," they simply list the ID number of every receipt that contains milk. To find out how many people bought {Milk, Bread} together, they just take the two cards and find the common receipt IDs. This is the core idea of Eclat. It's often much faster because finding common IDs between two lists is a very quick operation.
+
+
The Eclat Algorithm (Equivalence Class Clustering and bottom-up Lattice Traversal) is an efficient algorithm for frequent itemset mining. Unlike Apriori, which scans the database horizontally (transaction by transaction), Eclat uses a vertical data format and finds frequent itemsets by intersecting transaction ID lists. This approach can be significantly faster, especially for dense datasets.
+
+
🔹 Key Definitions
+
+
+ Itemset: A collection of one or more items (e.g., {Milk, Diapers}).
+
+
+ Support: The number of transactions an itemset appears in. Note: Eclat often uses the raw count, not the percentage.
+
+
+ Tidset (Transaction ID set): The set of all transaction IDs (TIDs) that contain a specific itemset. This is the heart of the vertical data format.
+
+ Vertical Data Format: The data is structured as a map from each item to its tidset, instead of the traditional list of transactions.
+
+
+
+
🔹 The Eclat Principle
+
+
The Librarian's Smart Trick: The librarian's method for finding the support of a combined itemset is incredibly fast. To find the support of {Milk, Bread}, they don't need to look at any receipts. They just take the two index cards and find the common numbers (the intersection).
+
+
The core principle of Eclat is that the support of a larger itemset can be computed directly by intersecting the tidsets of its smaller subsets. The size of the resulting intersection is the support count.

🔹 How the Algorithm Works

Eclat uses a depth-first search (DFS) approach to explore the search space of itemsets.
+
+
+
Convert to Vertical Format: Scan the database once to transform the horizontal list of transactions into a vertical map of item → tidset.
+
Find Frequent 1-Itemsets: Find all items whose tidset size is greater than or equal to `min_support`.
+
Recursive Search (DFS):
+
+
Start with a frequent 1-itemset (e.g., {Milk}).
+
Find all other frequent items that can be combined with it.
+
For each combination (e.g., {Milk, Bread}), calculate the new tidset by intersection.
+
If the new tidset is frequent (its size ≥ `min_support`), add it to the list of frequent itemsets and then use this new itemset as the base for the next level of recursion (e.g., find combinations like {Milk, Bread, Butter}).
+
+
+
Continue this recursive process until no more frequent itemsets can be generated from a branch.
+
+
+
🔹 Comparison with Apriori
+
+
+
+
Feature
+
Eclat
+
Apriori
+
+
+
+
+
Data Format
+
Vertical (Item → {TID1, TID2, ...})
+
Horizontal (TID → {Item1, Item2, ...})
+
+
+
Search Method
+
Depth-First Search (DFS)
+
Breadth-First Search (BFS)
+
+
+
Main Operation
+
Tidset intersection.
+
Candidate generation and database scanning.
+
+
+
Performance
+
Generally faster, especially on dense datasets.
+
Can be slow due to repeated database scans and large candidate sets.
+
+
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ Faster than Apriori: Avoids the expensive process of candidate generation and repeated database scans. Support counting via intersections is very fast.
+
✅ Efficient for Dense Data: Works particularly well when transactions are long and contain many items.
+
+
Disadvantages:
+
+
❌ Memory Intensive: The tidsets, especially for frequent items in a large dataset, can become very long and consume a lot of memory.
+
❌ Less Common: Not as widely implemented in standard machine learning libraries as Apriori.
+
+
+
🔹 Python Implementation (Conceptual Example)
+
+
Since Eclat is less common in libraries like `scikit-learn`, here's a conceptual Python example using a library called `pyECLAT`. The logic mirrors the algorithm steps: we prepare the data, create an Eclat object, and call `fit()` to get the frequent itemsets.
+
+
+# NOTE: You would need to install pyECLAT first: pip install pyECLAT
+import pandas as pd
+from pyECLAT import ECLAT
+
+# --- 1. Create a Sample Dataset in the right format ---
+# pyECLAT expects a DataFrame where each ROW is a transaction and the
+# items are spread across columns; shorter transactions are NaN-padded.
+transactions = [['Milk', 'Beer', 'Diapers'],
+                ['Bread', 'Butter', 'Milk'],
+                ['Beer', 'Diapers', 'Milk', 'Cola'],
+                ['Bread', 'Butter', 'Beer', 'Diapers'],
+                ['Bread', 'Butter']]
+df = pd.DataFrame(transactions)
+
+# --- 2. Initialize and Run the Eclat Algorithm ---
+# We create an ECLAT object from our transactions DataFrame.
+eclat_instance = ECLAT(data=df)
+
+# You can see the binary (one-hot encoded) format it uses internally
+# print(eclat_instance.df_bin)
+
+# --- 3. Find Frequent Itemsets ---
+# We set min_support to 0.4, meaning itemsets in at least 2 of the 5 transactions.
+# The 'fit' method does all the work of intersecting tidsets.
+min_support = 0.4
+rule_indices, rule_supports = eclat_instance.fit(min_support=min_support,
+ min_combination=1, # Min number of items in an itemset
+ max_combination=3) # Max number of items
+
+print("--- Frequent Itemsets (Support >= 40%) ---")
+print(rule_supports)
+
+
+
🔹 Best Practices
+
+
Choose the Right Algorithm: Use Eclat for dense datasets where the number of transactions is not excessively large. For sparse data with many transactions, FP-Growth is often the best choice.
+
Manage Memory: Be mindful that tidsets for very common items can be huge. If you run into memory issues, you may need to increase your `min_support` threshold.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the fundamental difference in how Apriori and Eclat scan data?
+
If TID({A}) = {1, 2, 5} and TID({B}) = {2, 4, 5, 6}, what is the support count of the itemset {A, B}?
+
What is the main disadvantage of using Eclat on a dataset with millions of transactions?
+
What search strategy does Eclat use to find frequent itemsets?
+
+
+
Answers
+
1. Apriori scans data horizontally (it reads each transaction to see what items it contains). Eclat uses a vertical format (it looks at each item to see which transactions it appeared in).
+
2. The support is the size of the intersection of the tidsets: |{1, 2, 5} ∩ {2, 4, 5, 6}| = |{2, 5}| = 2.
+
3. The main disadvantage is high memory consumption, as the tidsets for very frequent items can become extremely large lists containing millions of transaction IDs.
+
4. Eclat uses a Depth-First Search (DFS) strategy to traverse the lattice of potential itemsets.
+
+
+
+
🔹 Key Terminology Explained (Eclat)
+
+
The Story: Decoding the Efficient Librarian's Index
+
+
+
+ Vertical Data Format:
+
+ What it is: A way of storing transactional data where each item is a key, and its value is a list of all transaction IDs it appears in.
+
+ Story Example: Instead of a pile of receipts, the librarian has a card catalog. Each drawer is an item ("Milk," "Bread," etc.), and each card in that drawer is a receipt ID. This is a vertical format.
+
+
+ Tidset Intersection:
+
+ What it is: The core operation of Eclat. It's the process of finding the common elements between two or more transaction ID lists.
+
+ Story Example: When the librarian takes the list of receipt IDs for "Milk" and the list for "Bread" and finds all the numbers that appear on both lists, they are performing a tidset intersection.
+
+
+ Depth-First Search (DFS):
+
+ What it is: A strategy for exploring a tree or graph structure. It goes as deep as possible down one path before backtracking.
+
+ Story Example: To find all combinations, the librarian starts with {Milk}, then immediately finds all frequent pairs starting with Milk, like {Milk, Bread}. Then, they try to extend that to {Milk, Bread, Butter} before backtracking to try other pairs like {Milk, Diapers}. This is a DFS approach.
+
Imagine you have a pile of fruit containing two types that can be tricky to separate: lemons and limes. They look similar, and their sizes overlap. A simple sorter (like K-Means) might draw a hard line: anything yellow is a lemon. But what about a greenish lemon or a yellowish lime? GMM is an expert. It knows that limes are, *on average*, smaller and rounder, while lemons are *on average* larger and more oval. GMM models each fruit type as a flexible, oval-shaped "cloud of probability." For a fruit that's right on the border, GMM can say, "I'm 70% sure this is a lemon and 30% sure it's a lime." This is called soft clustering.
+
+
A Gaussian Mixture Model (GMM) is a probabilistic model that assumes all the data points are generated from a mixture of a finite number of Gaussian distributions (bell curves). In simple terms, it believes the data is a mix of several different groups, where each group has a sort of "center point" and a particular shape (which can be circular or oval).
+
+
Example: Analyzing customer data. You might have one group of customers who spend a lot but visit rarely (an oval cluster) and another group who spend a little but visit often (a different oval cluster). GMM is great at finding these non-circular groups.
+
+
+
🔹 Mathematical Foundation
+
+
Think of it like a recipe. The final probability of any data point is a "mixture" of probabilities from each group's individual recipe. Each group's recipe defines its center, its shape, and its overall importance in the mix.
+
+
+
Probability Density Function of a Gaussian: This is the complex-looking formula for a single bell curve (the recipe for one fruit type).
+
$$ \mathcal{N}(x|\mu, \Sigma) = \text{A formula defining a bell curve} $$
+
You don't need to memorize it! Just know it's the math for creating one of those oval "probability clouds."
+
+
Mixture of Gaussians: The total probability is a weighted sum of all the individual bell curves.
+
$$ p(x) = (\text{Weight}_A \times \text{Prob from A}) + (\text{Weight}_B \times \text{Prob from B}) + \dots $$
+ Where:
+
+
\( K \): The number of groups (e.g., 2 types of fruit).
+
\( \pi_k \): The "mixing weight" (e.g., maybe 60% of our pile is lemons).
+
\( \mu_k \): The "mean" (the center of the fruit group).
+
\( \Sigma_k \): The "covariance" (the shape and orientation of the fruit group—is it round or a tilted oval?).
+
+
+
+
+
🔹 Expectation-Maximization (EM) Algorithm
+
+
Story: The "Guess and Check" Method
+
Imagine you have the fruit pile but don't know the exact size and shape of lemons and limes. You use a two-step "guess and check" process:
+ 1. The "Guess" Step (Expectation): You make a starting guess for the oval shapes of the two fruit types. Then, for every single fruit in the pile, you calculate the probability it belongs to each shape. (e.g., "This one is 80% likely a lemon, 20% a lime").
+ 2. The "Check & Update" Step (Maximization): After guessing for all the fruit, you update your oval shapes. You calculate the average size and shape of all the fruits you labeled as "mostly lemon" to get a *better* lemon shape. You do the same for limes.
+ You repeat these "Guess" and "Check & Update" steps. Each time, your oval shape descriptions get more accurate, until they settle on the best possible fit for the data.
+
+
+
Initialize the parameters (the oval shapes) with a random guess.
+
E-step (Expectation): The "Guess" step. Calculate the probability that each data point belongs to each cluster.
+
M-step (Maximization): The "Check & Update" step. Update the oval shapes based on the probabilities from the E-step.
+
Repeat until the oval shapes stop changing.
+
+
+
🔹 Types of Covariance Structures
+
+
Example: The Cookie Cutter Analogy
+
The `covariance_type` parameter in the code controls the flexibility of your "oval shapes" or cookie cutters.
+
+
Spherical: Least flexible. Clusters must be circles. (Round cookie cutters of different sizes).
+
Diagonal: A bit more flexible. Clusters are ovals, but they must be aligned with the axes. (Oval cutters that can't be tilted).
+
Full: Most flexible. Clusters can be ovals of any shape and tilted in any direction. (The best, but also the most complex, type of cookie cutter).
+
Tied: A special rule where all clusters must have the exact same shape and size. (You must use the same cookie cutter for every group).
+
+
+
+
🔹 Comparison
+
+
+
+
Model
+
GMM vs. K-Means
+
GMM vs. Hierarchical
+
+
+
+
+
Cluster Assignment
+
GMM is soft (probabilistic). A point is 70% in Cluster A, 30% in B. K-Means is hard (100% in Cluster A).
+
GMM is probabilistic. Hierarchical is distance-based and deterministic.
+
+
+
Cluster Shape
+
GMM can model elliptical clusters. K-Means assumes spherical clusters.
+
GMM models clusters as distributions. Hierarchical can produce any shape depending on linkage.
+
+
+
Scalability
+
Both scale well, but GMM is more computationally intensive per iteration.
+
GMM scales much better to large datasets than hierarchical clustering.
+
+
+
+
+
🔹 Model Selection
+
GMM requires you to specify the number of clusters (K). Information criteria are used to help find the optimal K by balancing model fit with model complexity.
+
+
Story Example: Goldilocks and the Three Models
+
You test three GMMs: one with too few clusters (underfit), one with too many (overfit), and one that's just right.
+ • AIC (Akaike Information Criterion) and BIC (Bayesian Information Criterion) are like judges who score each model. They give points for fitting the data well but subtract points for being too complex. The model with the lowest score is the one that's "just right."
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ Flexible Cluster Shapes: Can find clusters that aren't simple circles. Example: Identifying a long, thin cluster of "commuter" customers on a map.
+
✅ Soft Clustering: Tells you the probability that a point belongs to each cluster, which is great for understanding uncertainty.
+
+
Disadvantages:
+
+
❌ Requires specifying K: You have to tell it how many clusters to look for.
+
❌ Sensitive to Initialization: A bad starting guess can sometimes lead to a bad final result.
+
❌ Can be slow: The "Guess and Check" process can take time, especially with a lot of data.
+
+
+
🔹 Real-World Applications
+
+
Image Segmentation: Grouping pixels of similar color to separate a person from the background in a photo.
+
Speaker Recognition: Identifying who is speaking by modeling the unique properties of their voice.
+
Anomaly Detection: Finding unusual credit card transactions by seeing which ones don't fit well into any normal spending clusters.
+
+
+
🔹 Python Implementation (Beginner Example)
+
+
This simple example shows the core steps: create data, create a GMM model, train it (`.fit`), and then use it to predict which cluster new data belongs to (`.predict`) and the probabilities for each cluster (`.predict_proba`).
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.mixture import GaussianMixture
+from sklearn.datasets import make_blobs
+
+# --- 1. Create Sample Data ---
+# We'll create 300 data points, grouped into 3 "blobs" or clusters.
+X, y_true = make_blobs(n_samples=300, centers=3, cluster_std=1.0, random_state=42)
+
+# --- 2. Create and Train the GMM ---
+# We tell the model to look for 3 clusters (n_components=3).
+# random_state ensures we get the same result every time we run the code.
+gmm = GaussianMixture(n_components=3, random_state=42)
+
+# Train the model on our data. This is where the EM algorithm runs.
+gmm.fit(X)
+
+# --- 3. Make Predictions ---
+# Predict the cluster for each data point in our original dataset.
+labels = gmm.predict(X)
+
+# Let's create a new, unseen data point to test our model.
+new_point = np.array([[-5, -5]])
+
+# Predict which cluster the new point belongs to.
+new_point_label = gmm.predict(new_point)
+print(f"The new point belongs to cluster: {new_point_label[0]}")
+
+# --- 4. Get Probabilities (The "Soft" Part) ---
+# This is the most powerful feature of GMM.
+# It tells us the probability of the new point belonging to EACH of the 3 clusters.
+probabilities = gmm.predict_proba(new_point)
+print(f"Probabilities for each cluster: {np.round(probabilities, 3)}") # e.g., [[0.95, 0.05, 0.0]]
+
+# --- 5. Visualize the Results ---
+# Let's plot our data points, colored by the cluster labels GMM assigned.
+plt.figure(figsize=(8, 6))
+plt.scatter(X[:, 0], X[:, 1], c=labels, s=40, cmap='viridis')
+# Let's also plot our new point as a big red star to see where it landed.
+plt.scatter(new_point[:, 0], new_point[:, 1], c='red', s=200, marker='*')
+plt.title('GMM Clustering Results')
+plt.xlabel('Feature 1')
+plt.ylabel('Feature 2')
+plt.grid(True)
+plt.show()
+
+
+
🔹 Best Practices
+
+
Scale Features: If your features are on different scales (e.g., age and income), scale them before fitting GMM so one doesn't unfairly dominate the other.
+
Use AIC/BIC: To choose the best number of clusters (K), run your model with several different values for `n_components` and pick the one with the lowest AIC or BIC score.
+
Use `n_init` Parameter: To prevent a bad random start from ruining your model, set `n_init` to a value like 10. This tells scikit-learn to run the whole process 10 times and keep the best result.
+
+
+
🔹 Key Terminology Explained (GMM)
+
+
The Story: Decoding the Fruit Sorter's Toolkit
+
Let's clarify the advanced tools our expert fruit sorter uses.
+
+
+
+ Probabilistic Model:
+
+ What it is: A model that uses probabilities to handle uncertainty. It gives you the "chance" of something happening, not a definite yes or no.
+
+ Story Example: A weather forecast saying "80% chance of rain" is a probabilistic model. GMM uses this same idea to assign a "chance of belonging" to each cluster.
+
+
+ Gaussian Distribution (Bell Curve):
+
+ What it is: The classic bell-shaped curve. It describes data where most values are clustered around an average.
+
+ Story Example: The heights of adults in a city follow a Gaussian distribution. Most people are near the average height, and very tall or very short people are rare.
+
+
+ Covariance:
+
+ What it is: A measure of how two variables are related. It defines the shape and tilt of the cluster.
+
+ Story Example: Ice cream sales and temperature have a positive covariance: when one goes up, the other tends to go up. This relationship creates an oval shape in the data, which the covariance matrix describes.
+
+
+ Likelihood:
+
+ What it is: A score of how well the model's "oval shapes" explain the actual data. The "Guess and Check" algorithm works to make this score as high as possible.
+
+ Story Example: If our fruit sorter's oval shape for "lemons" perfectly covers all the actual lemons in the pile, it has a high likelihood. If it's a bad fit, it has a low likelihood.
+
Story-style intuition: The Artist vs. The Art Critic
+
Imagine two types of AI that both study thousands of cat photos.
+ • The Discriminative Model is like an art critic. Its only job is to learn the difference between a cat photo and a dog photo. If you show it a new picture, it can tell you, "That's a cat," but it can't create a cat picture of its own. It learns a decision boundary.
+ • The Generative Model is like an artist. It studies the cat photos so deeply that it understands the "essence" of what makes a cat a cat—the patterns, the textures, the shapes. It learns the underlying distribution of "cat-ness." Because it has this deep understanding, it can then be asked to create a brand new, never-before-seen picture of a cat from scratch.
+
+
Generative Models are a class of statistical models that learn the underlying probability distribution of a dataset. Their primary goal is to understand the data so well that they can "generate" new data samples that are similar to the ones they were trained on.
+
+
🔹 Types of Generative Models
+
Generative models come in several powerful flavors, each with a different approach to learning and creating.
+
+
+ Probabilistic Models: These models explicitly learn a probability distribution P(X). Examples include Naïve Bayes and Gaussian Mixture Models (GMMs). They are often easy to interpret but less powerful for complex data like images.
+
+
+ Variational Autoencoders (VAEs):
+
Analogy: The Master Forger. A VAE is like a forger who learns to create masterpieces. It first "compresses" a real painting into a secret recipe (a condensed set of characteristics called the latent space). It then learns to "decompress" that recipe back into a painting. By learning this process, it can later create new recipes and generate new, unique paintings.
+
+
+ Generative Adversarial Networks (GANs):
+
Analogy: The Artist and Critic Game. A GAN consists of two competing neural networks: a Generator (the artist) that tries to create realistic images, and a Discriminator (the critic) that tries to tell the difference between real images and the artist's fakes. They train together in a game where the artist gets better at fooling the critic, and the critic gets better at catching fakes. This competition pushes the artist to create incredibly realistic images.
+
+
+
+ Diffusion Models:
+
Analogy: The Sculptor. A Diffusion Model is like a sculptor who starts with a random block of marble (pure noise) and slowly chisels away the noise, step by step, until a clear statue (a realistic image) emerges. It learns this "denoising" process by first practicing in reverse: taking a perfect statue and systematically adding noise to it until it becomes a random block.
+
+
+
+
🔹 Mathematical Foundations
+
+
+ Joint Probability P(X, Y): Generative models often learn the joint probability of features X and labels Y. This allows them to generate new pairs of (X, Y).
+
+
+ Maximum Likelihood Estimation (MLE): This is the principle most generative models use for training. They adjust their parameters to maximize the probability (likelihood) that the observed training data was generated by the model.
+
+
+ ELBO (for VAEs): VAEs optimize a lower bound on the data likelihood called the Evidence Lower Bound. It's a clever way to make an otherwise intractable optimization problem solvable.
+
+
+ Adversarial Loss (for GANs): This is the "minimax" game objective where the Generator tries to minimize the loss while the Discriminator tries to maximize it.
+
+
+
+
🔹 Workflow of Generative Models
+
+
Collect Data: Gather a large, high-quality dataset of the thing you want to generate (e.g., thousands of celebrity faces).
+
Choose a Model: Select the right type of generative model for your task (e.g., a GAN or Diffusion Model for realistic images).
+
Train the Model: This is the most computationally expensive step, where the model learns the underlying patterns and distribution of the training data.
+
Generate New Samples: After training, you can use the model to generate new, synthetic data by sampling from its learned distribution.
+
Evaluate Quality: Assess the quality of the generated samples using both quantitative metrics (like FID) and human evaluation.
+
+
+
🔹 Applications
+
+
Image Generation and Editing: Creating photorealistic faces, art, or modifying existing images (e.g., DALL-E, Midjourney, Stable Diffusion).
+
Text Generation: Powering chatbots, writing articles, and generating code (e.g., GPT-4).
+
Data Augmentation: Creating more training data for other machine learning models, which is especially useful for rare events or imbalanced datasets.
+
Drug Discovery and Design: Generating new molecular structures with desired properties to accelerate scientific research.
+
Music and Art Creation: Composing new melodies or creating novel artistic styles.
+
+
+
🔹 Advantages & Disadvantages
+
Advantages:
+
+
✅ Creative and Powerful: Can generate novel, high-quality data that has never been seen before.
+
✅ Unsupervised Learning: Can learn from vast amounts of unlabeled data.
+
✅ Data Augmentation: Solves the problem of limited training data by creating realistic synthetic samples.
+
+
Disadvantages:
+
+
❌ Computationally Expensive: Training large generative models requires significant GPU resources and time.
+
❌ Training Instability: GANs, in particular, can be notoriously difficult to train, suffering from problems like mode collapse.
+
❌ Difficult to Evaluate: How do you objectively measure "creativity" or "realism"? Evaluating the quality of generated content is often subjective.
+
+
+
🔹 Key Evaluation Metrics
+
+
Inception Score (IS): Measures how diverse and clear the generated images are. A higher score is better.
+
Fréchet Inception Distance (FID): Compares the statistical distribution of generated images to real images. It's considered a more reliable metric than IS. A lower score is better.
+
Perplexity (for text): Measures how well a language model predicts a sample of text. A lower perplexity indicates the model is less "surprised" by the text, meaning it's a better fit.
+
+
+
🔹 Python Implementation (Conceptual Sketches)
+
+
Training large generative models from scratch is a major undertaking. Here are conceptual sketches of what the code looks like using popular frameworks.
+
+
+
Simple GAN Generator in PyTorch
+
+import torch.nn as nn
+import numpy as np
+
+# GAN Generator: maps a random noise vector (a point in latent space) to a
+# synthetic image tensor of shape img_shape.
+class Generator(nn.Module):
+ # latent_dim: size of the input noise vector; img_shape: output image shape,
+ # presumably (channels, height, width) — confirm against the training loop.
+ def __init__(self, latent_dim, img_shape):
+ super(Generator, self).__init__()
+ self.img_shape = img_shape
+ self.model = nn.Sequential(
+ # Takes a random noise vector (latent_dim) and upsamples it
+ nn.Linear(latent_dim, 128),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(128, 256),
+ nn.BatchNorm1d(256), # normalizes activations to stabilize GAN training
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(256, 512),
+ nn.BatchNorm1d(512),
+ nn.LeakyReLU(0.2, inplace=True),
+ nn.Linear(512, int(np.prod(self.img_shape))), # one output per pixel
+ nn.Tanh() # Scales output to be between -1 and 1
+ )
+
+ # z: (batch, latent_dim) noise batch -> images of shape (batch, *img_shape).
+ def forward(self, z):
+ img = self.model(z)
+ img = img.view(img.size(0), *self.img_shape) # unflatten into image dims
+ return img
+
+# Easiest way to get started with powerful generative models!
+# Downloads pre-trained GPT-2 weights on first use (network access required).
+from transformers import pipeline
+
+# Initialize a text generation pipeline with a pre-trained model
+generator = pipeline('text-generation', model='gpt2')
+
+# Generate text
+prompt = "In a world where AI could dream,"
+# NOTE(review): max_length counts prompt tokens plus generated tokens; newer
+# transformers releases prefer max_new_tokens — confirm for the installed version.
+generated_text = generator(prompt, max_length=50, num_return_sequences=1)
+
+# The pipeline returns a list of dicts, one per returned sequence.
+print(generated_text[0]['generated_text'])
+
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the key difference between a generative model and a discriminative model?
+
In a GAN, what are the roles of the Generator and the Discriminator?
+
What is the core idea behind Diffusion Models?
+
You have trained a GAN to generate images of cats. You calculate the FID score and get a value of 5. Your colleague trains another model and gets an FID score of 45. Which model is better, and why?
+
+
+
Answers
+
1. A generative model (the artist) learns the underlying distribution of the data, P(X), and can create new samples. A discriminative model (the critic) learns the decision boundary between classes, P(Y|X), and can only classify existing data.
+
2. The Generator tries to create fake data that looks real. The Discriminator tries to distinguish between real data and the Generator's fake data.
+
3. The core idea is to learn to reverse a process of gradually adding noise to an image. By mastering this "denoising" process, the model can start with pure noise and denoise it step-by-step into a coherent new image.
+
4. Your model with an FID score of 5 is much better. For Fréchet Inception Distance (FID), a lower score is better, as it indicates that the statistical distribution of your generated images is closer to the distribution of the real images.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the AI Artist's Toolkit
+
+
+
+ Latent Space:
+
+ What it is: A lower-dimensional, compressed representation of the data. It's where the model captures the essential features or "essence" of the data.
+
+ Story Example: Imagine a "face space." In this latent space, one axis might represent "age," another "smile intensity," and another "hair color." By picking a point in this space, the model can generate a face with those specific attributes.
+
+
+ Minimax Game:
+
+ What it is: A concept from game theory used to describe the GAN training process. It's a two-player game where one player's gain is the other player's loss.
+
+ Story Example: The Generator wants to minimize the probability that the Discriminator catches its fakes. The Discriminator wants to maximize its ability to correctly identify fakes. This push-and-pull is the minimax game that forces both to improve.
+
+
+ Mode Collapse (in GANs):
+
+ What it is: A common failure case in GAN training where the Generator finds a single "safe" output that can fool the Discriminator and only produces that one output, instead of a diverse range of samples.
+
+ Story Example: The artist discovers that drawing one specific, very realistic-looking cat is enough to always fool the critic. So, it stops learning and only ever produces that single cat image. It has "collapsed" to a single mode.
+
Imagine you are trying to predict the price of houses. Your first guess is just the average price of all houses—not very accurate. So, you look at your mistakes (residuals). You build a second, simple model that's an expert at fixing those specific mistakes. Then, you look at the remaining mistakes and build a third expert to fix those. You repeat this, adding a new expert each time to patch the leftover errors, until your predictions are very accurate.
+
+
Definition:
+
+ Gradient Boosting Regression (GBR) is an ensemble machine learning technique that builds a strong predictive model by sequentially combining multiple weak learners, usually decision trees. Each new tree focuses on correcting the errors (residuals) of the previous trees.
+
+
+
Difference from Random Forest (Bagging vs. Boosting):
+
+
Random Forest: Builds many trees in parallel. Each tree sees a random subset of data, and their predictions are averaged. It's like asking many independent experts for their opinion and taking the average.
+
Gradient Boosting: Builds trees sequentially. Each tree learns from the errors of the previous ones. It's like a team of experts where each new member is trained to fix the mistakes of the one before them.
+
+
+
🔹 Mathematical Foundation
+
+
Story example: The Improving Chef
+
A chef is trying to create the perfect recipe (the model). Their first dish (initial prediction) is just a basic soup. They taste it and note the errors (residuals)—it's not salty enough. They don't throw it out; instead, they add a pinch of salt (the weak learner). Then they taste again. Now it's a bit bland. They add some herbs. This step-by-step correction, guided by tasting (calculating the gradient), is how GBR refines its predictions.
+
+
Step-by-step algorithm:
+
+
Initialize model with a constant prediction: \( F_0(x) = \text{mean}(y) \)
At each boosting stage \( m \), compute the residuals of the current model: \( r_i = y_i - F_{m-1}(x_i) \)
Train a weak learner (a small decision tree \(h_m(x)\)) to predict these residuals.
+
Update the model by adding the new tree, scaled by a learning rate \( \nu \):
+ \( F_m(x) = F_{m-1}(x) + \nu \cdot h_m(x) \)
+
+
+
+
🔹 Key Parameters
+
+
+
+
Parameter
+
Explanation & Story
+
+
+
+
+
n_estimators
+
The number of boosting stages, or the number of "mini-experts" (trees) to add in the sequence. Story: How many times the chef is allowed to taste and correct the recipe.
+
+
+
learning_rate
+
Scales the contribution of each tree. Small values mean smaller, more careful correction steps. Story: How much salt or herbs the chef adds at each step. A small pinch is safer than a whole handful.
+
+
+
max_depth
+
The maximum depth of each decision tree. Controls complexity. Story: A shallow tree is an expert on one simple rule (e.g., "add salt"). A deep tree is a complex expert who considers many factors.
+
+
+
subsample
+
The fraction of data used to train each tree. Introduces randomness to prevent overfitting. Story: The chef tastes only a random spoonful of the soup each time, not the whole pot, to avoid over-correcting for one odd flavor.
+
+
+
+
+
🔹 Strengths & Weaknesses
+
+
GBR is like a master craftsman who builds something beautiful piece by piece. The final product is incredibly accurate (high predictive power), but the process is slow (slower training) and requires careful attention to detail (sensitive to hyperparameters). If not careful, the craftsman might over-engineer the product (overfitting).
+
+
Advantages:
+
+
✅ High predictive accuracy, often state-of-the-art.
+
✅ Works well with non-linear and complex relationships.
+
✅ Handles mixed data types (categorical + numeric).
+
+
Disadvantages:
+
+
❌ Slower training than bagging methods (like Random Forest).
+
❌ Sensitive to hyperparameters (requires careful tuning).
+
❌ Can overfit if not tuned properly.
+
+
+
🔹 Python Implementation
+
+
Here, we are programming our "chef" (the `GradientBoostingRegressor`). We give it the recipe book (`X`, `y` data) and set the rules (`n_estimators`, `learning_rate`). The chef then `fit`s the recipe by training on the data. Finally, we `predict` how a new dish will taste and `evaluate` how good our final recipe is.
A bank uses GBR to predict credit risk. The first model makes a simple guess based on average income. The next model corrects for age, the next for loan amount, and so on. By chaining these simple experts, the bank builds a highly accurate system to identify customers who are likely to default, saving millions.
+
+
+
Credit risk scoring → predict if someone will default on a loan.
+
Customer churn prediction → identify customers likely to leave a service.
+
Energy demand forecasting → predict daily energy consumption for a city.
+
Medical predictions → predict patient outcomes or disease risk based on their data.
+
+
+
🔹 Best Practices
+
+
Treat tuning GBR like a skilled surgeon: be careful and precise. Use cross-validation to find the best settings. Always keep an eye on the patient's vitals (validation error) to make sure the procedure is going well and stop if things get worse (early stopping). Always confirm if such a complex surgery is needed by checking if a simpler method works first (compare to baseline models).
+
+
+
Use cross-validation and grid search to find the optimal hyperparameters.
+
Balance learning_rate and n_estimators: a smaller learning rate usually requires more trees.
+
Monitor training vs. validation error to detect overfitting early and use early stopping.
+
Compare GBR's performance against simpler models (like Linear Regression or Random Forest) to justify its complexity.
+
+
+
🔹 Key Terminology Explained
+
+
The Story: The Student, The Chef, and The Tailor
+
These terms might sound complex, but they relate to everyday ideas. Think of them as tools and checks to ensure our model isn't just "memorizing" answers but is actually learning concepts it can apply to new, unseen problems.
+
+
Cross-Validation
+
+ What it is: A technique to assess how a model will generalize to an independent dataset. It involves splitting the data into 'folds' and training/testing the model on different combinations of these folds.
+
+
+ Story Example: Imagine a student has 5 practice exams. Instead of studying from all 5 and then taking a final, they use one exam to test themselves and study from the other four. They repeat this process five times, using a different practice exam for the test each time. This gives them a much better idea of their true knowledge and how they'll perform on the real final exam, rather than just memorizing answers. This rotation is cross-validation.
+
+
+
Validation Error
+
+ What it is: The error of the model calculated on a set of data that it was not trained on (the validation set). It's a measure of how well the model can predict new, unseen data.
+
+
+ Story Example: A chef develops a new recipe in their kitchen (the training data). The "training error" is how good the recipe tastes to them. But the true test is when a customer tries it (the validation data). The customer's feedback represents the "validation error". A low validation error means the recipe is a hit with new people, not just the chef who created it.
+
+
+
Overfitting
+
+ What it is: A modeling error that occurs when a model learns the training data's noise and details so well that it negatively impacts its performance on new, unseen data.
+
+
+ Story Example: A tailor is making a suit. If they make it exactly to the client's current posture, including a slight slouch and the phone in their pocket (the "noise"), it's a perfect fit for that one moment. This is overfitting. The training error is zero! But the moment the client stands up straight, the suit looks terrible. A good model, like a good tailor, creates a fit that works well in general, ignoring temporary noise.
+
+
+
Hyperparameter Tuning
+
+ What it is: The process of finding the optimal combination of settings (hyperparameters like `learning_rate` or `max_depth`) that maximizes the model's performance.
+
+
+ Story Example: Think of a race car driver. The car's engine is the model, but the driver can adjust the tire pressure, suspension, and wing angle. These settings are the hyperparameters. The driver runs several practice laps (like cross-validation), trying different combinations to find the setup that results in the fastest lap time. This process of tweaking the car's settings is hyperparameter tuning.
+
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Gradient-Descen.html b/templates/Gradient-Descen.html
new file mode 100644
index 0000000000000000000000000000000000000000..41d72005ed82e32df681bb2e75cdb2983bd222cd
--- /dev/null
+++ b/templates/Gradient-Descen.html
@@ -0,0 +1,455 @@
+
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Gradient Descent Study Guide
+
+
+
+
+
+
+
+
Imagine a hiker is lost in a thick fog on a mountain and wants to get to the lowest valley. They can't see far, but they can feel the slope of the ground under their feet. Their strategy is simple: feel the steepest downward slope and take a step in that direction. By repeating this process, they slowly but surely make their way downhill, hoping to find the bottom. In this story, the hiker is Gradient Descent, their position is the model's parameters, and the mountain's altitude is the error (or loss). The goal is to find the lowest point.
+
+
Definition:
+
+ Gradient Descent (GD) is an optimization algorithm used to minimize a loss/cost function by iteratively updating model parameters (weights, biases).
+
+
+
Why optimization in ML?
+
+
In ML, models (like Linear Regression, Neural Networks) try to learn parameters that best fit the training data.
+
We measure how well the model fits using a loss function (e.g., Mean Squared Error).
+
Optimization finds the minimum loss, meaning the best model parameters.
+
+
+
Basic Idea:
+
Think of standing on a hill (the cost function surface). To reach the lowest valley (minimum loss):
+
+
Look at the slope (gradient).
+
Take a small step in the opposite direction of the slope.
+
Repeat until you reach the bottom.
+
+
👉 Example: Linear Regression
+
If the predicted line is too high, the gradient tells us to decrease slope/intercept; if too low, increase them.
+
+
+
🔹 Mathematical Foundation
+
+
The Story: The Hiker's Rulebook
+
The hiker needs a precise set of instructions for each step. This formula is their rulebook. It says: "Your new position is your old position, minus a small step (the learning rate) in the direction of the steepest slope (the gradient)." This rule ensures every step they take is a calculated move towards the valley floor, preventing them from walking in circles.
Imagine a U-shaped curve (parabola). Starting at the top, you repeatedly step downward along the slope until you reach the bottom.
+
+
+
🔹 Types of Gradient Descent
+
+
The Story: Three Different Hikers
+
Our lost hikers can have different personalities:
+
+
The Deliberate Hiker (Batch GD): Before taking a single step, this hiker scans the *entire* visible landscape around them, averages out the slope, and then takes one perfect, confident step. It's a very slow but very stable process.
+
The Impulsive Hiker (Stochastic GD): This hiker is in a hurry. They just feel the slope under one foot and immediately take a step. Their path is erratic and zig-zaggy, but they move very fast and their chaotic nature can help them jump out of small ditches (local minima).
+
The Pragmatic Hiker (Mini-Batch GD): This hiker finds a middle ground. They scan a small patch of ground around them (not everything, but more than one spot), decide on a direction, and take a step. This is the most popular strategy—a good balance of speed and stability.
+
+
+
Batch Gradient Descent
+
+
Uses the whole dataset to compute gradient.
+
Accurate but slow for large datasets.
+
👉 Example: 1 million rows → 1 update per pass.
+
+
+
Stochastic Gradient Descent (SGD)
+
+
Uses 1 data point at a time.
+
Faster, introduces noise → helps escape local minima.
+
👉 Example: updates model after each single sample.
+
+
+
Mini-Batch Gradient Descent
+
+
Uses small random batches (e.g., 32, 64 samples).
+
Balance between speed & stability.
+
Standard in deep learning.
+
👉 Example: For 1000 samples, batch size = 100 → 10 updates per pass.
+
+
+
🔹 Variants / Improvements
+
+
The Story: Upgrading the Hiker's Gear
+
A basic hiker is good, but a hiker with advanced gear is better. These variants are like giving our hiker special equipment to navigate the mountain more effectively.
+
+
Momentum Boots: These boots are heavy. Once they start moving in a direction, they build momentum, helping the hiker roll over small bumps (local minima) and speed up on long, straight downhill paths.
+
Adam's All-Terrain Boots: This is the ultimate hiking gear. It combines the momentum boots with adaptive traction. The boots automatically adjust their grip (the learning rate), taking smaller, careful steps on slippery, steep slopes and longer strides on flat, easy terrain. This is why it's the most popular and reliable choice.
+
+
+
+
Momentum – remembers previous update direction to speed up learning.
+
Nesterov Accelerated Gradient (NAG) – looks ahead before updating, more precise.
+
Adagrad – adapts learning rate for each parameter, useful for sparse data.
+
RMSProp – smooths updates using moving average of squared gradients.
+
Adam – combines Momentum + RMSProp (most popular optimizer in deep learning).
+
+
👉 Example: Training a CNN, Adam often converges much faster than plain GD.
+
+
+
🔹 Key Parameters
+
+
The Story: Calibrating the Hiker's Tools
+
Before starting, the hiker must calibrate their approach. These are the settings they control:
+
+
Step Size (Learning Rate): How big of a step should they take? If it's too large, they might leap right over the valley. If it's too small, it could take them forever to get down the mountain.
+
Journey Duration (Number of Iterations): How many steps should they plan to take? They need to walk long enough to reach the valley, but not so long that they waste energy wandering around the bottom.
+
Scan Area (Batch Size): For the pragmatic hiker, how large of a patch of ground should they look at for each step? A small patch gives a quick decision, while a larger patch gives more stability.
+
+
+
+
Learning Rate (\(\alpha\))
+
+
Too high → overshoot, divergence.
+
Too low → very slow convergence.
+
+
+
Number of Iterations (epochs) – how many times to repeat updates.
+
Batch size – affects speed & stability.
+
Initialization of parameters
+
+
Random initialization prevents symmetry.
+
Xavier/He initialization used in deep networks.
+
+
+
+
👉 Example: If learning rate = 1, model may oscillate; if 0.0001, may take forever.
+
+
+
🔹 Strengths & Weaknesses
+
+
The Story: Reviewing the Hiking Strategy
+
The "take a step downhill" strategy is simple and effective, but not foolproof.
+
+
Strengths: It’s a universal strategy that works on almost any mountain (problem), and it's the foundation for more advanced hiking techniques.
+
Weaknesses: The hiker might find a small ditch and think they've reached the main valley (a local minimum). The whole journey also depends heavily on picking the right step size, and if the mountain has many different types of terrain (unscaled features), the hiker's path can become very inefficient.
+
+
+
Advantages:
+
+
✅ Simple and effective.
+
✅ Works well for large problems.
+
✅ Forms the base of advanced optimizers.
+
+
Disadvantages:
+
+
❌ Can get stuck in local minima/saddle points.
+
❌ Very sensitive to learning rate.
+
❌ Requires normalized data for efficiency.
+
+
+
+
🔹 When to Use
+
+
The Story: Choosing the Right Mountain
+
This downhill hiking strategy is perfect for any "mountain" that has a smooth, continuous surface where you can always calculate a slope. This applies to countless problems in the real world, from predicting house prices (a gentle hill) to training massive AI for image recognition (a complex, high-dimensional mountain range).
+
+
+
Regression & Classification models.
+
Neural networks & deep learning.
+
Any optimization with differentiable cost functions.
+
+
👉 Example: Logistic Regression, CNNs, RNNs all trained with GD.
+
+
+
🔹 Python Implementation
+
+
The Story: Writing Down the Hiking Plan
+
This code is the hiker's plan written down before they start their journey. It defines the map of the mountain (`X` and `y` data), sets the calibration (learning rate, epochs), and contains the step-by-step instructions for the hike (the loop). Finally, it plots a chart of their altitude (loss) over time to confirm they successfully made it downhill.
+
+
Example: Gradient Descent for Linear Regression
+
+import numpy as np
+import matplotlib.pyplot as plt
+
+# Data: a perfectly linear relationship, so GD should recover w ~= 2, b ~= 0.
+X = np.array([1, 2, 3, 4, 5])
+y = np.array([2, 4, 6, 8, 10]) # y = 2x
+
+# Parameters
+w, b = 0, 0 # start at the origin (the hiker's starting position)
+alpha = 0.01 # learning rate (step size)
+epochs = 1000 # number of gradient-descent iterations
+m = len(X) # number of training samples
+losses = [] # loss history for the convergence plot
+
+# Gradient Descent
+for i in range(epochs):
+ y_pred = w*X + b
+ # Partial derivatives of MSE = mean((y - y_pred)**2) w.r.t. w and b.
+ dw = -(2/m) * np.sum(X * (y - y_pred))
+ db = -(2/m) * np.sum(y - y_pred)
+
+ # Step in the direction opposite the gradient, scaled by the learning rate.
+ w -= alpha * dw
+ b -= alpha * db
+
+ # Loss is evaluated at this iteration's pre-update parameters.
+ loss = np.mean((y - y_pred)**2)
+ losses.append(loss)
+
+print("Final w:", w, "Final b:", b)
+
+# Plot convergence: the curve should decrease monotonically toward zero.
+plt.plot(losses)
+plt.xlabel("Iterations")
+plt.ylabel("Loss")
+plt.title("Convergence Curve")
+plt.show()
+
+
👉 Try changing learning rate (\(\alpha\)) and see convergence behavior.
+
+
+
🔹 Real-World Applications
+
+
The Story: Famous Mountains Conquered
+
This hiking method isn't just a theory; it's used to conquer real-world challenges every day. It's the strategy used to train the models that power Netflix's recommendations, Google's speech recognition, and the AI that can diagnose diseases from medical scans. Each of these is a complex "mountain" that Gradient Descent learns to navigate.
Experienced hikers have a checklist for a successful journey:
+
+
Prepare the Terrain (Normalize Data): Before you start, smooth out the mountain path. Rescaling the terrain makes the slope more consistent and the hike much faster.
+
Use the Best Gear (Adaptive Optimizers): Don't just walk. Use the Adam all-terrain boots to automatically adapt your steps to the terrain.
+
Check Your GPS (Monitor Loss): Regularly check your altitude chart. If you're going up instead of down, you know something is wrong (like your step size is too big).
+
Know When to Camp (Early Stopping): Once you've reached a good-enough spot in the valley, set up camp. Continuing to wander around risks getting lost again (overfitting).
+
+
+
+
Normalize/Standardize Data: Scale your features to have a similar range. This helps GD converge much faster.
+
Use Adaptive Optimizers: Start with Adam instead of standard SGD. It often works well with little tuning.
+
Monitor the Loss Curve: Always plot the loss vs. iterations. If it's flat, learning has stalled. If it's increasing, your learning rate is likely too high.
+
Use Early Stopping: To prevent overfitting, monitor performance on a validation set and stop training when the validation loss starts to increase.
+
+
👉 Example: A standard, robust approach for a deep learning project is to standardize the input data, use the Adam optimizer, and apply early stopping.
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Graph-Based-Method.html b/templates/Graph-Based-Method.html
new file mode 100644
index 0000000000000000000000000000000000000000..487035a40642db81a90ff87e7ac4ebf1b59ebf6b
--- /dev/null
+++ b/templates/Graph-Based-Method.html
@@ -0,0 +1,414 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: Graph-Based Semi-Supervised Learning
+
+
+
+
+
+
+
+
+
🕸️ Study Guide: Graph-Based Semi-Supervised Learning
+
+
🔹 Introduction
+
+
Story-style intuition: The Social Network Detective
+
Imagine you're a detective trying to identify the members of two rival clubs, "The Eagles" and "The Sharks," in a large social network. You only know the affiliation of a few people (labeled data), but you have the entire network map showing who is friends with whom (unlabeled data). You make a simple but powerful assumption: "People are probably in the same club as their friends." So, you start with the known members and let their club affiliation "spread" to their direct friends, and then to their friends' friends, like a rumor. Eventually, this process reveals two distinct communities in the network. This is the core idea of Graph-Based Semi-Supervised Learning (SSL). It uses the connections between data points to propagate information from the labeled few to the unlabeled many.
+
+
Graph-Based Semi-Supervised Learning (SSL) is a family of algorithms that represents a dataset as a graph, where data points are nodes and the relationships between them are edges. It leverages the structure of this graph to infer the labels of unlabeled data points based on their proximity to labeled ones.
+
+
🔹 Core Concepts
+
The foundation of this method is the graph representation itself, which is built on a key assumption about the data.
+
+
Graph Representation:
+
+
Nodes: Every single data point in your dataset, both labeled and unlabeled, becomes a node in the graph.
+
Edges: An edge is drawn between two nodes if they are "similar." The similarity is calculated using a distance metric (like Euclidean distance). The weight of the edge indicates the strength of the similarity (closer points get a stronger edge weight).
+
+
+
Example: In an image dataset of handwritten digits, each image is a node. An edge between two images of the digit '7' would be very strong (high weight) because their pixel values are similar. An edge between an image of a '7' and an image of a '1' would be very weak (low weight).
+
+
+
The Graph Smoothness Assumption: This is the guiding principle. It assumes that if two data points are connected by a strong edge (meaning they are very similar), then they are very likely to have the same label. The goal of the algorithm is to find a labeling for all nodes that is "smooth" across the graph, meaning there are very few strong edges connecting nodes with different labels.
+
+
+
+
Example: Imagine we have 10 data points. Two are labeled 'Class A' (blue) and one is labeled 'Class B' (red). The other seven are unlabeled (gray). After building the graph, we see the two blue points are strongly connected to a cluster of four gray points on the left. The red point is strongly connected to a cluster of three gray points on the right. The algorithm will naturally 'propagate' the blue label to the left cluster and the red label to the right one, as this is the smoothest fit for the graph's structure.
+
+
+
🔹 Mathematical Foundations
+
+
The "Social Network Detective" needs a way to measure how "chaotic" a potential labeling of the network is. A chaotic labeling would have lots of friends from rival clubs, while a smooth labeling would have very few. This measure of chaos is captured by the Graph Laplacian.
+
+
+
Graph Laplacian (L): This is a special matrix derived from the graph that mathematically represents its structure. It's calculated as:
+
$$ L = D - W $$
+ Where:
+
+
\( W \) is the Adjacency Matrix, which stores the weight of the edge between every pair of nodes.
+
\( D \) is the Degree Matrix, a diagonal matrix where each diagonal element contains the sum of all edge weights connected to a single node.
+
+
+
Example: A Simple Graph's Laplacian
+
Imagine a 3-node graph: Node 1 is connected to Node 2 (weight=1) and Node 2 is connected to Node 3 (weight=1).
+
+
The Adjacency Matrix W would be: [[0, 1, 0], [1, 0, 1], [0, 1, 0]]
+
The Degree Matrix D (sum of weights for each node) would be: [[1, 0, 0], [0, 2, 0], [0, 0, 1]]
+
The Graph Laplacian L = D - W would be: [[1, -1, 0], [-1, 2, -1], [0, -1, 1]]
+
+
This L matrix now contains all the information about the graph's connectivity.
+
+
+
Smoothness Assumption (in Math): The goal is to assign a label function \(f\) to all nodes that minimizes the following formula. This formula gives a high penalty if you assign different labels (\(f(x_i) \neq f(x_j)\)) to two nodes connected by a strong edge (\(W_{ij}\) is large).
+
$$ \sum_{i,j} W_{ij} (f(x_i) - f(x_j))^2 $$
+
+
+
+
🔹 Workflow and Key Algorithms
+
The general process follows a logical flow of building the graph and then spreading the information.
+
+
Build the Similarity Graph: Connect all data points based on a similarity measure. This is the most critical step. Common methods include connecting each point to its 'k' nearest neighbors (a k-NN graph).
+
Assign Initial Labels: The few labeled nodes are given their true labels with 100% confidence. Unlabeled nodes can start with no label or an equal probability for all labels.
+
Propagate Labels: The labels "flow" from the labeled nodes to their neighbors through the weighted edges. This process is repeated in iterations. In each step, a node's label is updated based on the labels of its neighbors.
+
Stop when Converged: The process continues until the labels on the unlabeled nodes stop changing significantly.
+
+
+
Example: A Single Step of Label Propagation
+
Consider an unlabeled node C, which has three neighbors: Node A (labeled Blue), Node B (labeled Blue), and Node D (labeled Red). All edge weights are equal. In the next iteration, node C looks at its neighbors. It sees two "Blue" votes and one "Red" vote. Therefore, Node C will update its own label to "Blue" because that's the majority label among its direct neighbors.
+
+
Key Algorithms:
+
+
Label Propagation (LP): The simplest algorithm. In each step, every node adopts the label that the majority of its neighbors have.
+
Label Spreading (LS): A more robust version that is less sensitive to noise. It considers the initial labeling and allows the influence of the original labeled "seed" nodes to be clamped, preventing them from being completely overwritten by their neighbors.
+
+
+
🔹 Key Assumptions
+
These methods work best when the data follows certain geometric patterns.
+
+
Cluster Assumption: Data points that are close to each other (i.e., in the same cluster) are likely to share the same label. This is a restatement of the smoothness assumption.
+
Example: In a dataset of animal photos, we assume that photos of cats will naturally form a tight cluster based on pixel similarity, separate from the cluster of dog photos.
+
+
Manifold Assumption: The data, which might exist in a very high-dimensional space, actually lies on a much simpler, lower-dimensional surface or shape called a manifold. The graph is an attempt to approximate this underlying shape.
+
Example: A set of photos of a person's face taken from different angles exists in a very high-dimensional pixel space (e.g., 10,000+ dimensions). However, the 'true' structure of this data is a simple 2D or 3D manifold representing the rotation of the head. The graph helps capture the "neighbor" relationships along this curved surface.
+
+
+
+
🔹 Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Naturally incorporates the structure of both labeled and unlabeled data.
+
❌ Graph Construction is Expensive: Building the similarity matrix for a large dataset with N samples requires calculating N² distances. Example: For a dataset of just 50,000 images, this would require 50,000 * 50,000 = 2.5 billion distance calculations, which is very slow.
+
+
+
✅ Very effective when you have very few labeled samples but a large amount of unlabeled data.
+
❌ Highly sensitive to how the graph is built. A poor choice of similarity metric or neighborhood size (k in k-NN) can lead to a bad result. Example: If you choose k=3 in a k-NN graph, you might miss a connection to an important fourth neighbor. If you choose k=20, you might introduce noisy connections to dissimilar points.
+
+
+
✅ Works well on complex, high-dimensional data where the underlying structure is important.
+
❌ Does not scale well to datasets with millions of nodes due to the computational and memory costs.
+
+
+
+
+
🔹 Applications
+
+
Image Classification:
+
Example: Given a massive database of web images, you can manually label a few as 'cat', 'dog', and 'car'. The algorithm can then use visual similarity (e.g., comparing color histograms or deep learning features) to propagate these labels to the rest of the database.
+
+
Document Categorization:
+
Example: Label a few news articles as 'Sports' or 'Politics'. The algorithm can use word similarity (e.g., TF-IDF vectors) to build a graph and categorize millions of other articles on a news feed.
+
+
Social Network Analysis:
+
Example: Start with a few known 'pro-A' and 'pro-B' Twitter users. The algorithm can propagate these labels through the follower/friend network to estimate the political stance of millions of other users.
+
+
+
+
🔹 Python Implementation (Beginner Sketch with Scikit-learn)
+
+
Scikit-learn provides easy-to-use implementations of `LabelPropagation` and `LabelSpreading`. In this example, we'll create a dataset that forms two clear "moon" shapes. This is a classic problem where linear classifiers fail. We'll label only two points (one in each moon) and let the algorithm infer the labels for all the others based on the graph structure.
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.datasets import make_moons
+from sklearn.semi_supervised import LabelPropagation, LabelSpreading
+
+# --- 1. Create a Sample Dataset ---
+# make_moons is perfect for testing graph methods because its structure is nonlinear
+X, y_true = make_moons(n_samples=200, noise=0.08, random_state=42)
+
+# --- 2. Create a small labeled set and a large unlabeled set ---
+# We simulate a real-world scenario by providing labels for only 2 points!
+y_labeled = np.full_like(y_true, -1) # -1 is the scikit-learn code for "unlabeled"
+y_labeled[0] = y_true[0] # Label the first point
+y_labeled[-1] = y_true[-1] # Label the last point
+# NOTE(review): make_moons shuffles its samples, so indices 0 and -1 are not
+# guaranteed to lie in different moons as the surrounding text claims — consider
+# selecting one seed per class instead (e.g. np.where(y_true == c)[0][0]). TODO confirm.
+
+# --- 3. Train a Label Propagation Model ---
+# The model will build a k-NN graph behind the scenes
+lp_model = LabelPropagation(kernel='knn', n_neighbors=10)
+# The .fit() method does all the work: builds the graph and propagates labels
+# from the two seed points to every unlabeled node until convergence.
+lp_model.fit(X, y_labeled)
+y_pred_lp = lp_model.predict(X)
+
+# --- 4. Train a Label Spreading Model ---
+# Same graph idea, but with "clamping" that keeps seed labels influential.
+ls_model = LabelSpreading(kernel='knn', n_neighbors=10)
+ls_model.fit(X, y_labeled)
+y_pred_ls = ls_model.predict(X)
+
+# --- 5. Visualize the Results ---
+fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 6))
+
+# Plot Label Propagation Results
+# Small dots are the inferred labels, large circles are the original labeled "seeds"
+ax1.scatter(X[y_labeled == -1, 0], X[y_labeled == -1, 1], c=y_pred_lp[y_labeled == -1], cmap='viridis', marker='.')
+ax1.scatter(X[y_labeled != -1, 0], X[y_labeled != -1, 1], c=y_labeled[y_labeled != -1], cmap='viridis', marker='o', s=100, edgecolor='k')
+ax1.set_title('Label Propagation Results')
+
+# Plot Label Spreading Results
+ax2.scatter(X[y_labeled == -1, 0], X[y_labeled == -1, 1], c=y_pred_ls[y_labeled == -1], cmap='viridis', marker='.')
+ax2.scatter(X[y_labeled != -1, 0], X[y_labeled != -1, 1], c=y_labeled[y_labeled != -1], cmap='viridis', marker='o', s=100, edgecolor='k')
+ax2.set_title('Label Spreading Results')
+
+plt.show()
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What does the "smoothness assumption" mean in the context of graph-based SSL?
+
What do the nodes and edge weights in the graph represent?
+
What is the main performance bottleneck for graph-based methods on very large datasets?
+
Why is Label Spreading sometimes preferred over Label Propagation?
+
+
+
Answers
+
1. The smoothness assumption means that if two data points are very similar (connected by a strong edge), they are very likely to have the same label. The algorithm tries to find a labeling that minimizes conflicts across strong edges.
+
2. Nodes represent the data points (both labeled and unlabeled). Edge weights represent the similarity between two connected data points; a higher weight means greater similarity.
+
3. The main bottleneck is the graph construction phase, which can require computing a similarity score (like distance) between all pairs of N nodes, an operation that scales with O(N²).
+
4. Label Spreading is generally more robust to noisy data because it includes a "clamping" factor that ensures the original labeled nodes retain some of their initial influence, preventing them from being completely swayed by their neighbors.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Social Network Detective's Dossier
+
+
+
+ Graph Laplacian:
+
+ What it is: A matrix that is a mathematical representation of a graph's structure. It's used to measure properties of the graph, like its "smoothness."
+
+ Story Example: Think of the Graph Laplacian as the detective's "chaos meter." They can use it to calculate a single number that tells them how "mixed up" the clubs are in the current network labeling. Their goal is to find a labeling that makes this chaos meter's reading as low as possible.
+
+
+ Adjacency Matrix (W):
+
+ What it is: A matrix that stores the edge weights between nodes. The entry at W(i, j) is the weight of the edge connecting node i and node j.
+
+ Story Example: This is the detective's master list of friendships. It's a giant table where every person has a row and a column, and the cell where two people intersect contains a "friendship score" indicating how close they are.
+
+
+ Manifold Assumption:
+
+ What it is: The idea that high-dimensional data often lies on a much simpler, lower-dimensional, smoothly curved surface.
+
+ Story Example: Imagine the globe. The location of cities can be described by 3D coordinates (x, y, z), but we know they all lie on the 2D curved surface of the Earth. This surface is the manifold. Graph-based methods assume your data has a similar underlying structure, and the graph is an attempt to map it out.
+
Story-style intuition: Organizing a Family Reunion
+
Imagine you are organizing a big family reunion. You start by grouping the closest relatives: siblings form small groups. Then, you merge those groups with their cousins. Next, you merge those larger groups with their aunts and uncles. You keep doing this until the entire extended family is in one giant group. Hierarchical clustering works just like this: it builds a family tree, or a dendrogram, showing how everyone is related, from the closest individuals to the entire family.
+
+
+
What is Clustering?
+
In machine learning, clustering is an unsupervised learning technique. This means you have data, but you don't have pre-defined labels for it. The goal of clustering is to find natural groupings (or "clusters") in the data, where points within the same group are more similar to each other than to those in other groups.
+
+
Example: You have a list of customers and their purchasing habits (e.g., spending amount, frequency of visits). Clustering would help you automatically identify groups like "high-spending loyal customers," "occasional bargain hunters," and "new visitors" without you having to define these groups first.
+
+
+
Definition of Hierarchical Clustering
+
Hierarchical Clustering builds a hierarchy of clusters, either from the bottom up or the top down. The result is a tree-like structure called a dendrogram, which shows the entire "family tree" of how the groups were formed.
+
+
+
Agglomerative (Bottom-Up): Starts with each data point as its own cluster, then iteratively merges the closest pairs of clusters until only one cluster remains.
+
+
Example: We start with four friends, {A}, {B}, {C}, {D}. The algorithm first merges the two most similar, say A and B, to get {A, B}, {C}, {D}. Then it might merge C and D to get {A, B}, {C, D}. Finally, it merges these two groups into {A, B, C, D}.
+
+
+
Divisive (Top-Down): Starts with all data points in one giant cluster, then recursively splits them into smaller clusters.
+
+
Example: We start with one big group {A, B, C, D}. The algorithm first splits it into the two most different subgroups, for instance {A, B} and {C, D}. Then it might split {A, B} into {A} and {B}, completing the process.
Euclidean Distance: The straight-line distance between two points. d(p, q) = sqrt((p1 - q1)^2 + (p2 - q2)^2)
+
+
Example: If Friend A is at point (1, 2) and Friend B is at (4, 6), their Euclidean distance is sqrt((4-1)^2 + (6-2)^2) = sqrt(3^2 + 4^2) = sqrt(9 + 16) = sqrt(25) = 5.
+
+
+
Manhattan Distance: The distance as if traveling on a city grid (sum of absolute differences). d(p, q) = |p1 - q1| + |p2 - q2|
+
+
Example: For Friend A (1, 2) and Friend B (4, 6), the Manhattan distance is |4-1| + |6-2| = 3 + 4 = 7.
+
+
+
Cosine Similarity: Measures the angle between two vectors. It's great for text analysis where direction matters more than magnitude. Example: Two articles that use the words "team," "game," and "score" in similar proportions point in nearly the same direction (cosine similarity close to 1), even if one article is ten times longer than the other.
Once you have family groups, how do you decide which two groups should merge next? Do you connect them based on their two closest members (single linkage), their two most distant members (complete linkage), or the average distance between all their members (average linkage)?
+
+
+
Single Linkage (The Optimist): The distance is the minimum distance between any two points in the different clusters.
+
Example: Distance between {A, B} and {C} is the smaller of Distance(A, C) and Distance(B, C).
+
Complete Linkage (The Pessimist): The distance is the maximum distance between any two points.
+
Example: Distance between {A, B} and {C} is the larger of Distance(A, C) and Distance(B, C).
+
Average Linkage (The Diplomat): The distance is the average of all pairwise distances between points in the two clusters.
+
Example: Distance between {A, B} and {C} is (Distance(A, C) + Distance(B, C)) / 2.
+
Ward’s Method (The Team Builder): Merges clusters that lead to the minimum increase in within-cluster variance. A popular and effective default that creates compact, spherical clusters.
+
+
+
🔹 Algorithm Steps (Agglomerative)
+
+
Let's walk through a simple example with four points A, B, C, D. Their initial distance matrix (Euclidean) is:
+
A B C D
+ A 0 2 6 10
+ B 2 0 5 9
+ C 6 5 0 4
+ D 10 9 4 0
+
+
+
Initialization: We start with four clusters: {A}, {B}, {C}, {D}.
+
Merge 1: The smallest distance is 2 (between A and B). We merge them into a new cluster {A, B}. Our clusters are now {A, B}, {C}, {D}.
+
Update 1: We update the distance matrix using, for example, single linkage:
+
+
dist({A,B}, C) = min(dist(A,C), dist(B,C)) = min(6, 5) = 5, and dist({A,B}, D) = min(dist(A,D), dist(B,D)) = min(10, 9) = 9.
Merge 2: The smallest remaining distance is now 4 (between C and D), so we merge them into {C, D}. Our clusters are now {A, B}, {C, D}.
Final Merge: We merge the last two clusters {A, B} and {C, D} at a distance of 5 (the closest cross-cluster pair, B and C, under single linkage). The process is complete.
+
+
+
🔹 The Dendrogram
+
The dendrogram is the tree diagram that visualizes the entire merging process. The y-axis represents the distance at which clusters were merged. By "cutting" the dendrogram with a horizontal line, you can choose the final number of clusters.
+
+
Example: For the algorithm steps above, the dendrogram would show A and B merging at a low height (distance 2). C and D would merge at a slightly higher height (distance 4). Finally, the {A, B} group and the {C, D} group would merge at an even higher height (distance 5).
+
+
If you draw a horizontal "cut" line at a distance of 3, you would cross three vertical lines, giving you three clusters: {A, B}, {C}, and {D}. If you cut at a distance of 6, you would get one cluster: {A, B, C, D}.
+
+
+
+
🔹 Comparison: Hierarchical vs. K-Means
+
+
+
+
Feature
+
Hierarchical Clustering
+
K-Means Clustering
+
+
+
+
+
Number of Clusters
+
Not needed upfront. Chosen by cutting the dendrogram.
+
Must be pre-specified (K).
+
+
+
Speed & Scalability
+
Slow (O(n^2) to O(n^3)). Not for large datasets.
+
Fast and scales well to large data.
+
+
+
Output
+
A full hierarchy (dendrogram) showing relationships.
+
A single set of K clusters.
+
+
+
Determinism
+
Deterministic (always the same result).
+
Can vary based on initial random centroids.
+
+
+
Use-Case Example
+
Understanding evolutionary relationships between species (phylogenetic trees). The hierarchy is the main goal.
+
Segmenting 1 million customers into 3 pricing tiers (Gold, Silver, Bronze). Speed and scalability are key.
+
+
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ Easy to understand and visualize with the dendrogram. Example: Showing a business manager how different customer segments are related to each other.
+
✅ No need to pre-specify the number of clusters. Example: Exploring a new dataset of patient symptoms to see how many natural groupings of diseases emerge.
+
✅ The hierarchy can provide meaningful insights. Example: In text analysis, finding broad topics (like "Sports") that contain sub-topics ("Football," "Basketball").
+
+
Disadvantages:
+
+
❌ Computationally expensive and slow for large datasets. Example: Trying to cluster millions of social media users would be too slow.
+
❌ Sensitive to noisy data and outliers. Example: A single customer with extremely unusual buying habits could distort the entire cluster structure.
+
❌ Merges are final and cannot be undone (greedy approach). Example: If two customers are incorrectly merged early on, the algorithm can never separate them again.
+
+
+
🔹 Python Implementation
+
+
Let's use Python's `scipy` library to build our "family tree" (the dendrogram) and `scikit-learn` to perform the final clustering once we decide how many groups we want.
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.datasets import make_blobs
+from sklearn.preprocessing import StandardScaler
+from sklearn.cluster import AgglomerativeClustering
+from scipy.cluster.hierarchy import dendrogram, linkage
+
+# 1. Generate and Prepare Data
+# Scaling first so no single feature dominates the distance calculations.
+X, y = make_blobs(n_samples=50, centers=4, cluster_std=1.2, random_state=42)
+X_scaled = StandardScaler().fit_transform(X)
+
+# 2. Build the Linkage Matrix using Scipy for the dendrogram
+# Each row of the linkage matrix records one merge: which two clusters were
+# joined and at what distance. 'ward' minimizes the increase in within-cluster
+# variance and assumes Euclidean distances.
+linkage_matrix = linkage(X_scaled, method='ward')
+
+# 3. Plot the Dendrogram
+plt.figure(figsize=(15, 7))
+plt.title('Hierarchical Clustering Dendrogram (Ward Linkage)')
+plt.xlabel('Sample Index')
+plt.ylabel('Distance')
+dendrogram(linkage_matrix)
+plt.show()
+
+
+# 4. Perform Clustering with Scikit-learn
+# Let's say the dendrogram suggests 4 clusters is a good choice.
+agg_cluster = AgglomerativeClustering(n_clusters=4, linkage='ward')
+labels = agg_cluster.fit_predict(X_scaled)
+
+# 5. Visualize the final clusters
+plt.figure(figsize=(10, 7))
+plt.scatter(X_scaled[:, 0], X_scaled[:, 1], c=labels, cmap='viridis', s=50)
+plt.title('Final Clusters (n=4)')
+plt.xlabel('Feature 1')
+plt.ylabel('Feature 2')
+plt.grid(True)
+plt.show()
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Family Reunion Plan
+
Let's break down some of the technical terms used in our family reunion plan to make sure everything is crystal clear.
+
+
+
+ Unsupervised Learning:
+
+ What it is: A type of machine learning where the algorithm learns patterns from data that has not been labeled or categorized. The algorithm finds the structure on its own.
+
+ Story Example: You're given a box of mixed fruits with no labels. Unsupervised learning is the process of sorting them into piles (apples, bananas, oranges) based only on their appearance, size, and texture, without anyone telling you what each fruit is called.
+
+
+ Dendrogram:
+
+ What it is: The tree-like diagram that hierarchical clustering produces. It visually represents the nested grouping of data points and the distances at which merges occurred.
+
+ Story Example: This is the actual family tree chart you draw for the reunion. It shows exactly which siblings were grouped first, how their groups were joined with cousins, and so on, all the way up to the entire family. The height of the branches shows how "distantly related" the merged groups are.
+
+
+ Variance (Within-Cluster):
+
+ What it is: A measure of how spread out the data points are within a single cluster. Low variance means the points are tightly packed and very similar. High variance means they are spread out.
+
+ Story Example: A group of siblings who are all very close in age has low variance. A group that includes a toddler, a teenager, and a grandparent has high variance. Ward's method tries to create groups with the lowest possible age spread (variance).
+
+
+ Greedy Approach:
+
+ What it is: An algorithmic strategy that makes the best possible choice at each step, without considering the overall, long-term outcome. Once a decision is made, it is never reconsidered.
+
+ Story Example: At the reunion, you first merge the two closest siblings. The greedy approach means this decision is final. Even if merging one of those siblings with a cousin first might have created a better overall grouping later, the algorithm can't go back and change its initial decision.
+
+
+ PCA (Principal Component Analysis):
+
+ What it is: A technique for dimensionality reduction. It transforms a large set of variables into a smaller set of "principal components" while preserving most of the original information.
+
+ Story Example: You're judging a baking contest based on 50 different criteria (sweetness, texture, color, aroma, etc.). This is too complex. Using PCA is like creating two new, summary criteria: "Overall Taste" and "Visual Appeal". These two components capture the essence of the original 50, making it much easier to compare the cakes fairly.
+
+
+
+
🔹 Best Practices
+
+
Scale Data: Always normalize or scale your data before clustering.
+
Example: If you're clustering customers by income (e.g., 50,000) and number of purchases (e.g., 5), the income will dominate the distance calculation. Scaling brings both to a similar range so they are weighted fairly.
+
Experiment: Try different distance metrics and linkage methods to see what works best for your data.
+
Example: If your clusters look like long chains, single linkage might be appropriate. If they are tight and spherical, Ward's or complete linkage is better.
+
Use the Dendrogram: Look for the largest vertical lines that don't cross any horizontal merges. Cutting there is often a good strategy for choosing K.
+
Example: If there's a huge jump in merge distance from 3 clusters to 2, it suggests that merging those last two groups creates a very dissimilar, unnatural cluster. Therefore, 3 is likely a good number of clusters.
+
Combine with PCA: For high-dimensional data, use PCA to reduce dimensions first.
+
Example: Clustering genetic data with thousands of genes is noisy. PCA can reduce this to a few key components that represent the most important genetic variations, leading to better clusters.
Imagine you're at a crowded party. Two people, Alice and Bob, are speaking at the same time. You place two microphones in the room. Each microphone records a mixture of Alice's voice, Bob's voice, and some background noise. Your goal is to take these two messy, mixed recordings and perfectly isolate Alice's original voice into one audio file and Bob's original voice into another. This is called Blind Source Separation, and it's exactly what ICA is designed to do. ICA is a computational method that "unmixes" a set of signals to reveal the hidden, underlying sources that created them.
+
+
+
Independent Component Analysis (ICA) is a statistical technique used to separate a multivariate signal into its underlying, additive, and statistically independent components. Unlike PCA which seeks to maximize variance and finds uncorrelated components, ICA's goal is to find components that are truly independent, which is a much stronger condition.
+
+
🔹 Intuition Behind ICA
+
ICA operates on the assumption that your observed data is a linear mixture of some unknown, independent sources. The whole problem can be stated with a simple formula:
+
$$ X = A S $$
+
+
\( X \): The observed signals (e.g., the recordings from your two microphones).
+
\( S \): The original, independent source signals (e.g., the clean voices of Alice and Bob). These are the latent variables we want to find.
+
\( A \): The unknown "mixing matrix" that describes how the sources were combined (e.g., how the room's acoustics mixed the voices).
+
+
The goal of ICA is to find an "unmixing matrix" W that can reverse the process:
+
$$ S \approx W X $$
+
To do this, ICA relies on a key insight: most real-world signals of interest (like speech or music) are non-Gaussian (they don't follow a perfect bell curve). The Central Limit Theorem states that a mixture of independent signals will tend to be "more Gaussian" than the original sources. Therefore, ICA works by finding an unmixing matrix W that makes the resulting signals as non-Gaussian as possible, thereby recovering the original independent sources.
+
+
🔹 Mathematical Foundation
+
+
Story: The Signal Purifier's Three-Step Process
+
To unmix the signals, the ICA algorithm follows a systematic process:
+
+
Step 1: Center the Data. First, it removes the average "hum" or DC offset from each microphone recording so they are all centered around zero.
+
Step 2: Whiten the Data. This is a preprocessing step (often done with PCA) that removes correlations and ensures each dimension has equal variance. It's like equalizing the volume levels and removing echoes, making the unmixing job easier.
+
Step 3: Maximize "Interestingness." The algorithm then iteratively adjusts the unmixing matrix W to make the output signals as "interesting" (i.e., structured and non-random) as possible. It measures this "interestingness" using metrics for non-Gaussianity, such as Kurtosis or Negentropy.
+
+
+
The core of the ICA algorithm is an optimization problem. After preprocessing, it tries to find the components that maximize a measure of non-Gaussianity. The two most common measures are:
+
+
Kurtosis: A measure of the "tailedness" or "peakiness" of a distribution. A high kurtosis (positive) means the signal is "spiky," which is a strong sign of non-Gaussianity.
+
Negentropy: A more robust measure based on information theory. It measures the difference between a signal's entropy and the entropy of a Gaussian signal with the same variance. In simple terms, it's a measure of "how far from random" a signal is.
+
+
+
🔹 Comparison with PCA
+
+
+
+
Feature
+
ICA (Independent Component Analysis)
+
PCA (Principal Component Analysis)
+
+
+
+
+
Goal
+
Finds components that are statistically independent.
+
Finds components that are uncorrelated and maximize variance.
+
+
+
Supervision
+
Both are Unsupervised.
+
+
+
Component Property
+
Components are not necessarily orthogonal (at right angles).
+
Components are always orthogonal.
+
+
+
Use Case
+
Best for separating mixed signals (e.g., audio, EEG).
+
Best for dimensionality reduction and data compression.
+
+
+
Output Example
+
+
+
+
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ **Powerful for Signal Separation:** It is one of the best methods for blind source separation when the underlying sources are independent.
+
✅ **Feature Extraction:** Can find meaningful underlying features or sources that are not immediately obvious in the mixed data.
+
+
Disadvantages:
+
+
❌ **Ambiguity in Output:** ICA cannot determine the original order, scale (volume), or sign (polarity) of the source signals. The recovered components are correct in shape but may be in a random order and flipped upside-down.
+
❌ **Assumes Non-Gaussianity:** It will fail if the underlying independent sources are themselves Gaussian.
+
❌ **Computationally Intensive:** Can be slower than PCA, especially on data with a very large number of features.
+
+
+
🔹 When to Use ICA
+
+
Audio Signal Processing: The classic "cocktail party problem" of separating voices from mixed recordings.
+
Biomedical Signal Analysis: Separating useful brain signals (EEG) or heart signals (ECG) from artifacts like eye blinks, muscle noise, or power line interference.
+
Financial Data Analysis: Attempting to identify underlying independent economic factors that drive stock price movements.
+
Image Denoising: Separating the "true" image signal from random noise patterns.
In this example, we will create our own "cocktail party." We'll generate two clean, independent source signals (a sine wave and a square wave). Then, we'll mathematically "mix" them together. Finally, we'll use `FastICA` to see if it can recover the original two signals from the mixed recordings.
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import FastICA
+
+# --- 1. Create the Original "Source" Signals ---
+np.random.seed(0)  # fixed seed so the demo is reproducible run-to-run
+n_samples = 2000
+time = np.linspace(0, 8, n_samples)
+
+# Source 1: A sine wave (smooth and periodic)
+s1 = np.sin(2 * time)
+# Source 2: A square wave (sharp and structured); np.sign of a sine alternates between -1 and +1
+s2 = np.sign(np.sin(3 * time))
+# Combine them column-wise into a single (n_samples, 2) array
+S_original = np.c_[s1, s2]
+
+# --- 2. Create a "Mixing Matrix" and Mix the Signals ---
+# This simulates how the signals get mixed in the real world
+A = np.array([[1, 1], [0.5, 2]])  # The mixing matrix: each row mixes both sources
+X_mixed = np.dot(S_original, A.T)  # observed data: X = S @ A.T
+
+# --- 3. Apply ICA to "Unmix" the Signals ---
+# We tell ICA that we are looking for 2 independent components
+ica = FastICA(n_components=2, random_state=42)
+# Recovered components may come back in a different order, sign, or scale
+# than the originals -- that ambiguity is inherent to ICA.
+S_recovered = ica.fit_transform(X_mixed)
+
+# --- 4. Visualize the Results ---
+plt.figure(figsize=(12, 8))
+
+# Plot Original Sources
+plt.subplot(3, 1, 1)
+plt.title("Original Independent Sources")
+plt.plot(S_original)
+
+# Plot Mixed Signals
+plt.subplot(3, 1, 2)
+plt.title("Mixed Signals (Observed Data)")
+plt.plot(X_mixed)
+
+# Plot Recovered Signals
+plt.subplot(3, 1, 3)
+plt.title("Recovered Signals using ICA")
+plt.plot(S_recovered)
+
+plt.tight_layout()
+plt.show()
+
+
+
🔹 Best Practices
+
+
Preprocess Your Data: Always center and whiten your data before applying ICA. Whitening can often be done using PCA (note that scikit-learn's `FastICA` performs whitening for you by default).
+
Choose `n_components` carefully: The number of components must be less than or equal to the number of original features. You should have a good reason (based on domain knowledge) for the number of sources you expect to find.
+
Be Aware of Ambiguities: Remember that the output components won't be in any particular order and their scale might not match the original. You often need to inspect the results manually to identify which recovered signal corresponds to which source.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the primary goal of ICA, and how does it differ from PCA's goal?
+
Why is the assumption of "non-Gaussianity" so important for ICA to work?
+
You apply ICA to a mixed audio recording and get two signals back. One looks like a perfect sine wave, but it's upside-down compared to the original. Did ICA fail? Why or why not?
+
You have a dataset with 10 features. What is the maximum number of independent components you can extract using ICA?
+
+
+
Answers
+
1. ICA's primary goal is to find components that are statistically independent. PCA's goal is to find components that are uncorrelated and maximize variance. Independence is a much stronger condition than uncorrelation.
+
2. The Central Limit Theorem suggests that mixing signals makes them "more Gaussian." ICA works by reversing this, finding a projection that makes the resulting signals as non-Gaussian as possible, which are assumed to be the original, independent sources.
+
3. No, ICA did not fail. It successfully recovered the shape of the signal. ICA cannot determine the original sign (polarity) or scale (amplitude) of the sources. An upside-down signal is a perfectly valid result.
+
4. You can extract a maximum of 10 components. The number of components must be less than or equal to the number of original features (observed signals).
+
+
+
+
🔹 Key Terminology Explained (ICA)
+
+
The Story: Decoding the Signal Purifier's Toolkit
+
+
+
+ Latent Variables:
+
+ What they are: Hidden or unobserved variables that are inferred from other variables that are directly observed.
+
+ Story Example: In the cocktail party, the clean voices of Alice and Bob are latent variables. You can't record them directly, but you can infer what they must have sounded like from the mixed microphone recordings.
+
+
+ Non-Gaussianity:
+
+ What it is: A property of a probability distribution that indicates it does not follow a perfect bell-curve (Gaussian) shape.
+
+ Story Example: A random, hissing static noise might be Gaussian. But a human voice, with its pauses, peaks, and structured patterns, is highly structured and therefore non-Gaussian. ICA looks for this structure.
+
+
+ Kurtosis:
+
+ What it is: A statistical measure of the "peakiness" or "tailedness" of a distribution.
+
+ Story Example: A signal with high positive kurtosis is very "spiky," with sharp peaks and heavy tails (more extreme values than a bell curve). A signal with negative kurtosis is very "flat-topped." ICA often looks for high kurtosis as a sign of an interesting, non-Gaussian signal.
+
+
+ Whitening:
+
+ What it is: A preprocessing step that transforms data so that its features are uncorrelated and have a variance of 1.
+
+ Story Example: Imagine your microphone recordings have different volume levels and some echo. Whitening is like running them through an audio equalizer that balances the volumes and removes the echo, creating a "cleaner" starting point for the unmixing algorithm.
+
Imagine two library builders. The XGBoost builder constructs one entire floor (level) at a time, ensuring all rooms are built before moving to the next floor. The LightGBM builder is an efficiency expert. They identify the most critical room in the entire library—the one that will provide the most value—and build that room first, even if it's on the 10th floor. They always focus on the single most impactful part of the project next, leading to a functional library much faster.
+
+
What is LightGBM?
+
+ LightGBM (Light Gradient Boosting Machine) is a gradient boosting framework developed by Microsoft that is designed for speed and efficiency. Its key innovation is using a leaf-wise tree growth strategy instead of the conventional level-wise strategy.
+
+
+
Comparison with XGBoost:
+
+
Speed: LightGBM is generally much faster due to its histogram-based algorithm and optimized sampling techniques.
+
Memory Usage: LightGBM uses significantly less memory.
+
Tree Growth: LightGBM grows trees leaf-wise (vertically), while XGBoost grows them level-wise (horizontally).
+
+
+
🔹 Key Innovations
+
+
Story example: The Smart Survey Taker
+
LightGBM is like a very smart survey taker. Instead of asking for everyone's exact age (a continuous value), they group people into age brackets like 20-30, 30-40, etc. (Histogram-based splitting). They focus their energy on people whose opinions are most likely to change the survey's outcome (GOSS) and bundle redundant questions together (EFB) to save time.
+
+
+
Histogram-based Splitting: Instead of checking every single unique value for a feature, LightGBM buckets continuous values into discrete bins (a histogram). This drastically speeds up finding the best split.
+
Leaf-wise Tree Growth: It grows the tree by always splitting the leaf that will cause the largest reduction in loss. This leads to faster convergence but can sometimes overfit if not constrained.
+
Gradient-based One-Side Sampling (GOSS): An intelligent sampling method. It keeps all the data points with large gradients (the ones the model is most wrong about) and takes a random sample of the points with small gradients.
+
Exclusive Feature Bundling (EFB): A technique for sparse data. It bundles mutually exclusive features (e.g., features that are rarely non-zero at the same time) into a single feature to reduce dimensionality.
+
+
+
🔹 Mathematical Foundation
+
+
Story example: The Aggressive Problem-Solver
+
The mathematical goal is the same as other boosting models: minimize a combined objective of loss and complexity. However, LightGBM's strategy is different. While a level-wise builder ensures a balanced structure at all times, LightGBM's leaf-wise strategy is like an aggressive problem-solver who ignores balanced development to go straight for the part of the problem that will yield the biggest reward.
+
+
Objective Function:
+
LightGBM minimizes the same objective function as XGBoost, which includes a loss term and a regularization term:
The key difference is not in the *what* (the objective) but in the *how* (the strategy). The leaf-wise split strategy finds the most promising leaf and splits it, which converges on the minimum loss much faster than building out a full level of the tree.
+
+
🔹 Key Parameters
+
+
+
+
Parameter
+
Explanation & Story
+
+
+
+
+
num_leaves
+
The maximum number of leaves in one tree. This is the main parameter to control complexity. Story: How many specific, final conclusions an expert is allowed to have. This is more direct than `max_depth`.
+
+
+
max_depth
+
Limits the maximum depth of the tree. Used to prevent overfitting. Story: A hard limit on how many "follow-up questions" an expert can ask before reaching a conclusion.
+
+
+
learning_rate
+
The shrinkage rate. Story: How cautiously you apply the new expert's advice.
+
+
+
n_estimators
+
The number of boosting iterations. Story: How many experts you add to the team sequentially.
+
+
+
min_data_in_leaf
+
Minimum number of data points required in a leaf. Prevents creating leaves for single, noisy data points. Story: An expert isn't allowed to make a final conclusion based on just one person's opinion.
+
+
+
boosting
+
Can be `gbdt` (traditional), `dart` (adds dropout), or `goss`. Story: The overall strategy the team of experts will use. `goss` is the efficient sampling strategy unique to LightGBM.
+
+
+
+
+
🔹 Strengths & Weaknesses
+
+
LightGBM is like a high-speed bullet train. It's incredibly fast and efficient, capable of handling huge amounts of cargo (large datasets) with ease. However, it's built for long, straight tracks. On smaller, twistier routes (small datasets), its aggressive speed might cause it to fly off the rails (overfit) if the driver isn't careful with the controls (hyperparameters).
+
+
Advantages:
+
+
✅ Very fast training speed and high efficiency.
+
✅ Lower memory usage compared to other boosting models.
+
✅ Excellent performance on large datasets.
+
✅ Supports parallel, distributed, and GPU learning.
+
+
Disadvantages:
+
+
❌ Can easily overfit on small datasets if parameters are not tuned.
+
❌ More sensitive to hyperparameters like `num_leaves`.
+
+
+
🔹 Python Implementation
+
+
Here, we call our "efficiency expert" from the `lightgbm` library. We create a regressor and train it on our data. We use `eval_set` to monitor performance on a validation set and stop training early if performance doesn't improve, preventing our expert from over-studying and memorizing the answers.
The Story: The Efficiency Expert's Secret Techniques
+
Let's uncover the clever tricks LightGBM uses to be so fast and efficient.
+
+
Histogram-based Splitting
+
+ What it is: A technique that groups continuous feature values into a fixed number of discrete bins (a histogram) before training. The algorithm then finds the best split among the bins instead of among all the unique data points.
+
+
+ Story Example: Imagine sorting a million marbles of slightly different shades of red. It would take forever. A histogram-based approach is like creating just 10 buckets: "Bright Red," "Medium Red," "Dark Red," etc. You quickly throw each marble into a bucket. Now, finding the best dividing line between shades is incredibly fast because you only have to compare 10 buckets, not a million individual marbles.
+
+
+
Leaf-wise vs. Level-wise Growth
+
+ What it is: Two different strategies for building decision trees.
+
+
+
Level-wise (XGBoost): Builds the tree out one full level at a time. It's balanced but can do a lot of unnecessary work splitting leaves that have low loss.
+
Leaf-wise (LightGBM): Scans all the current leaves and splits the one that promises the biggest reduction in error. It's faster and more focused but can lead to unbalanced, deep trees if not constrained.
+
+
+ Story Example: Two players are playing a strategy game. The level-wise player upgrades all their buildings to Level 2 before starting on Level 3. They are balanced but slow. The leaf-wise player finds the single most powerful upgrade in the entire game and rushes to get it, ignoring everything else. They become powerful much faster but might have weaknesses if their strategy is countered.
+
+
+
Gradient-based One-Side Sampling (GOSS)
+
+ What it is: A sampling method that focuses on the data points that the model is most wrong about. It keeps all instances with large gradients (high error) and randomly samples from instances with small gradients (low error).
+
+
+ Story Example: A teacher wants to improve the class's test scores efficiently. Instead of re-teaching the entire curriculum to everyone, they use GOSS. They give mandatory tutoring to all students who failed the test (large gradients). For the students who passed, they only pick a random handful to attend a review session (sampling small gradients). This focuses their teaching effort where it's needed most.
+
+
+
Exclusive Feature Bundling (EFB)
+
+ What it is: A technique for handling sparse data (data with many zeros). It identifies features that are mutually exclusive (i.e., they are rarely non-zero at the same time) and bundles them into a single, denser feature.
+
+
+ Story Example: You have a survey with many "Yes/No" questions that are rarely answered "Yes" at the same time, like "Do you own a cat?", "Do you own a dog?", "Do you own a bird?". EFB is like creating a single new question: "Which pet do you own?" and combining the sparse answers into one feature. This reduces the number of questions the model has to consider, speeding up the process without losing information.
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Linear-Discriminant-Analysis.html b/templates/Linear-Discriminant-Analysis.html
new file mode 100644
index 0000000000000000000000000000000000000000..ea5a5a7da639c3879f5b9e718cba76f7f7a54ba2
--- /dev/null
+++ b/templates/Linear-Discriminant-Analysis.html
@@ -0,0 +1,446 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: Linear Discriminant Analysis (LDA)
+
+
+
+
+
+
+
+
Imagine you have to take a single photo of two different groups of people, say a basketball team (tall, lean) and a group of sumo wrestlers (shorter, heavy). A regular photographer (like PCA) doesn't know who is in which group, so they might take the photo from an angle that just shows the biggest spread of people, perhaps from the side. But you are a smart photographer (using LDA). You already have the guest list and know who is a basketball player and who is a sumo wrestler. So, you find the one perfect camera angle that makes the two groups look as distinct as possible. This angle will likely be one that contrasts height against weight, making the two groups form separate, tight clusters in your photo. LDA is a supervised technique that uses these known labels to find the best "camera angles" (projections) to maximize the separation between groups.
+
+
Linear Discriminant Analysis (LDA) is a powerful technique used for both supervised classification and dimensionality reduction. Its primary goal is to find a new, lower-dimensional space to project the data onto, such that the separation (or discrimination) between the different classes is maximized. The new axes it finds are called linear discriminants.
+
+
🔹 Intuition Behind LDA
+
While PCA is unsupervised and only cares about finding axes that maximize the total variance (the spread of the entire dataset), LDA is supervised and has a much more specific goal. It uses the class labels to find a projection that simultaneously accomplishes two things:
+
+
Maximize the distance between the means (centers) of the different classes. (In the photo, push the center of the basketball player group and the center of the sumo wrestler group as far apart as possible).
+
Minimize the variation (or "scatter") within each class. (In the photo, make the players within the basketball team appear as tightly clustered as possible, and do the same for the sumo wrestlers).
+
+
+
+
This image illustrates the core idea. Projecting onto the horizontal axis (like PCA might) causes the classes to overlap. LDA finds a new, tilted axis that perfectly separates the centers of the blue and red clusters while keeping each cluster's projection tight.
+
+
+
🔹 Mathematical Foundation
+
+
To achieve its goals, LDA mathematically defines the two objectives and finds a projection that optimizes them. It calculates two key statistical measures:
+
+
Within-Class Scatter Matrix ($$S_W$$): A matrix that summarizes the total scatter of data points around their respective class centers. Think of this as the "compactness" of all the individual groups added together. LDA wants this to be as small as possible.
+
Between-Class Scatter Matrix ($$S_B$$): A matrix that summarizes the scatter of the class centers around the overall dataset's center. Think of this as how "spread out" the groups are from one another. LDA wants this to be as large as possible.
+
+
The perfect "camera angle" (projection matrix W) is the one that maximizes the ratio of $$S_B$$ to $$S_W$$. This is a classic optimization problem that is solved using a technique called the generalized eigenvalue problem.
Here, $$\mu_c$$ is the mean vector (center) of a single class c. This formula essentially calculates the spread of points around their own group's center and adds it all up.
Here, $$\mu$$ is the mean of the entire dataset, $$\mu_c$$ is the mean of class c, and $$N_c$$ is the number of samples in class c. This formula measures how far each class center is from the overall center, giving more weight to larger classes.
+
+
Optimization Goal: The objective is to find the projection matrix W that maximizes the following ratio. This is often called Fisher's criterion.
+
$$ J(W) = \frac{|W^T S_B W|}{|W^T S_W W|} $$
+
+
+
+
🔹 Geometric Interpretation
+
Geometrically, LDA rotates and projects the data to find the best view for class separation. The number of new dimensions (linear discriminants) it can create is limited by the number of classes. Specifically, for a problem with **k** classes, LDA can find at most **k-1** new axes.
+
+
Example:
+
+
For a 2-class problem (e.g., "Pass" vs. "Fail"), LDA can only find one new axis (a 1D line) that best separates the two groups.
+
For the 3-class Iris dataset ("Setosa", "Versicolor", "Virginica"), LDA can find a maximum of two new axes, allowing us to visualize the separation on a 2D plane.
+
+ This makes LDA an excellent tool for visualizing the separability of multi-class datasets.
+
+
+
🔹 Assumptions of LDA
+
LDA is a powerful tool, but it relies on a few key assumptions about the data. The model performs best when these are met:
+
+
Normality: The data within each class is assumed to follow a Gaussian (bell-curve) distribution. If the data is heavily skewed, LDA might not find the best boundary.
+
Equal Covariance (Homoscedasticity): This is a crucial assumption. LDA assumes that all classes have the same covariance matrix, meaning their "spread" or "shape" is roughly the same. If one class is very spread out and another is very compact, LDA's performance will suffer.
+
Linearity: LDA fundamentally creates linear boundaries between classes. If the true decision boundary is highly curved or nonlinear, LDA will fail to capture it.
+
+
+
🔹 Comparison with PCA
+
+
+
+
Feature
+
LDA (Linear Discriminant Analysis)
+
PCA (Principal Component Analysis)
+
+
+
+
+
Supervision
+
Supervised (it requires class labels to compute class separability).
+
Unsupervised (it only looks at the data's features, not the labels).
+
+
+
Goal
+
To find a projection that maximizes class separability.
+
To find a projection that maximizes total variance.
+
+
+
Application
+
Primarily used for classification or as a preprocessing step for classification.
+
Primarily used for general data representation, visualization, and compression.
+
+
+
Example Visualization
+
+
+
+
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ **Simplicity and Speed:** It's computationally efficient and faster than more complex methods.
+
✅ **Effective for Classification:** By focusing on separability, it often creates a feature space where classes are easier to distinguish, which can improve the accuracy of a subsequent classifier.
+
✅ **Reduces Overfitting:** In situations with many features but few samples (the "curse of dimensionality"), reducing features with LDA can lead to more robust models.
+
+
Disadvantages:
+
+
❌ **Linearity Limitation:** It cannot separate classes with nonlinear boundaries. For example, it would fail on a dataset where one class forms a circle inside another.
+
❌ **Sensitivity to Assumptions:** Its performance degrades significantly if the assumptions of normality and equal covariance are badly violated.
+
❌ **Limited Components:** It can only find a maximum of k-1 discriminants, which might not be enough to capture the full structure if the data is very complex.
+
+
+
🔹 When to Use LDA
+
+
As a Preprocessing Step for Classification: This is the most common use case. Reduce 100 features to 2 with LDA, then train a simple classifier like Logistic Regression or a Naive Bayes model on those 2 features.
+
For Visualization of Labeled Data: When you have a dataset with many features and 3+ classes, using LDA to project it onto a 2D plane is an excellent way to see how well-separated your classes are.
+
Face Recognition: The Fisherfaces algorithm, a famous technique in face recognition, is a direct application of LDA.
+
+
+
🔹 Python Implementation (Beginner Example with Iris Dataset)
+
+
Here, we use the Iris dataset, which has 3 classes of flowers and 4 features. Since there are 3 classes, LDA can reduce the data to a maximum of 2 components (3-1=2). We will use it first for dimensionality reduction and visualization, and then show how it can be used directly as a classifier.
+
+
+import numpy as np
+# NOTE(review): numpy is imported but never used in this example -- consider removing.
+import matplotlib.pyplot as plt
+from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import load_iris
+from sklearn.model_selection import train_test_split
+from sklearn.metrics import accuracy_score
+
+# --- 1. Load and Scale the Data ---
+iris = load_iris()
+X, y = iris.data, iris.target  # 150 samples, 4 features, 3 classes
+
+# Split data for later classification test
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+# Scaling is a good practice for LDA.
+# Fit the scaler on the training split only, then reuse it on the test split,
+# so no test-set statistics leak into preprocessing.
+scaler = StandardScaler()
+X_train_scaled = scaler.fit_transform(X_train)
+X_test_scaled = scaler.transform(X_test)
+
+
+# --- PART A: LDA for Dimensionality Reduction ---
+
+# --- 2. Create and Apply LDA ---
+# Since there are 3 classes, we can reduce to at most 2 components (k - 1).
+lda_dr = LinearDiscriminantAnalysis(n_components=2)
+
+# Fit LDA and transform the training data. Note: .fit() needs both X and y,
+# because LDA is supervised and uses the labels to compute class separability.
+X_train_lda = lda_dr.fit_transform(X_train_scaled, y_train)
+
+# --- 3. Visualize the Results ---
+plt.figure(figsize=(8, 6))
+plt.scatter(X_train_lda[:, 0], X_train_lda[:, 1], c=y_train, cmap='viridis', edgecolor='k')
+plt.title('LDA of Iris Dataset (4D -> 2D)')
+plt.xlabel('Linear Discriminant 1')
+plt.ylabel('Linear Discriminant 2')
+plt.grid(True)
+plt.show()
+
+
+# --- PART B: LDA as a Classifier ---
+
+# --- 4. Train LDA as a Classifier ---
+# We don't set n_components, so it uses the components for classification.
+lda_clf = LinearDiscriminantAnalysis()
+lda_clf.fit(X_train_scaled, y_train)
+
+# --- 5. Make Predictions and Evaluate ---
+y_pred = lda_clf.predict(X_test_scaled)
+accuracy = accuracy_score(y_test, y_pred)
+print(f"Accuracy of LDA as a classifier: {accuracy:.2%}")
+
+
+
+
🔹 Best Practices
+
+
Standardize Features: Always scale your data before applying LDA to ensure all features are treated equally.
+
Check Assumptions: Before relying heavily on LDA, it's wise to visualize your data to see if the classes are roughly Gaussian and have similar spreads. If not, consider alternatives.
+
Address Violated Assumptions: If the equal covariance assumption is violated, a variation called Quadratic Discriminant Analysis (QDA) might be a better choice. If the boundary is nonlinear, kernel-based methods might be needed.
+
+
+
🔹 Key Terminology Explained (LDA)
+
+
The Story: Decoding the Smart Photographer's Toolkit
+
+
+
+ Supervised Technique:
+
+ What it is: An algorithm that learns from data that has been labeled with the correct answers. It needs a "supervisor" to provide the ground truth.
+
+ Story Example: Teaching a child to identify animals by showing them pictures labeled "cat," "dog," etc., is supervised learning. LDA is supervised because it uses the pre-existing class labels (sumo wrestlers vs. basketball players) to find the best projection.
+
+
+ Class Separability:
+
+ What it is: A measure of how distinct and easy to distinguish the different classes in a dataset are from one another.
+
+ Story Example: The separability between apples and oranges is high. The separability between different types of apples (e.g., Gala vs. Fuji) is low. LDA's entire goal is to maximize this separability in the projected space.
+
+
+ Scatter Matrix:
+
+ What it is: A mathematical way to measure the "spread" or "scatter" of data points, generalizing the concept of variance to multiple dimensions.
+
+ Story Example: Imagine throwing a handful of sand on the floor. The scatter matrix is a numerical description of the shape and size of the sand pile. LDA uses two such matrices: one for the spread within each class, and one for the spread between the class centers.
+
+
+ Eigenvalue Problem:
+
+ What it is: A standard problem in linear algebra used to find the fundamental directions (eigenvectors) in which a linear transformation acts by just stretching/compressing, without rotation.
+
+ Story Example: Think of it as finding the "skeleton" or principal axes of a transformation. Solving the eigenvalue problem for the scatter matrices gives LDA the exact directions it needs to point its "camera" to get the best class separation.
+
Once training is finished, you cannot retrain the model or change its output. So, if you want to include custom data alongside the predefined data, add it before training begins.
+
+
+
+
+
+ 4 hidden neurons
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Data Class
+
+
+
+
+
+
+
+
+ Hidden Neurons: 4
+
+
+
+
+
+
+
+
+
+
+ Learning Rate: 0.50
+
+
+
+
+
+
+
+
+
+
+ Accuracy:
+ 0.0%
+
+
+
+
+
+
Presets
+
+
+
+
+
+
+
+
+
+
+
+ Training Log
+ Epoch 0
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Network Architecture
+
+
+
+
+
+
+
+
+
+
+
Data & Decision Boundary
+ Points: 0
+
+
+
+
+
X Coordinate
+
Y Coordinate
+
+
+
+
Class A
+
Class B
+
Prediction A
+
Prediction B
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Live Prediction (Hover)
+
+
+
+ 1 Input
+
+
+ X: 0.00
+ Y: 0.00
+
+
+
+
+ 2 Hidden Layer
+
+
+
+
+
+
+
+ 3 Output
+
+
+
+ Raw: 0.0000
+ -
+
+
+
+
+
+
+
+
+
+
Training Process
+
+
+ The network learns by "Backpropagation". It compares its guess to the real label, finds the error, and adjusts the weights backwards from output to input.
+
+
+ 💡 Tip: If the network gets stuck, try increasing neurons or clicking "Reset" to randomize weights.
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/templates/Neural-Networks-for-Classification.html b/templates/Neural-Networks-for-Classification.html
new file mode 100644
index 0000000000000000000000000000000000000000..ec0acfc5401f0501e7c5ffcb83c5607cece5bfe5
--- /dev/null
+++ b/templates/Neural-Networks-for-Classification.html
@@ -0,0 +1,416 @@
+{% extends "layout.html" %}
+{% block content %}
+
+
+
+
+
+ Study Guide: Neural Networks for Classification
+
+
+
+
+
+
+
+
Think of a large company trying to decide if a new project proposal is a "Go" or "No-Go". The raw data (market research, costs) goes to the junior analysts (input layer). Each analyst specializes in one piece of data. They pass their summaries to mid-level managers (hidden layers), who combine these summaries to spot higher-level patterns. Finally, the CEO (output layer) takes the managers' final reports and makes the single classification decision: Go or No-Go. A Deep Neural Network is just a company with many layers of management, allowing it to understand extremely complex problems.
+
+
What is a Neural Network?
+
+ A Neural Network (NN) is a computational model inspired by the structure and function of the human brain. It's composed of interconnected nodes, called artificial neurons, organized in layers. They are excellent at finding complex patterns in data.
+
+
+
Shallow vs. Deep Neural Networks
+
+
Shallow NN: A network with only one hidden layer. It's like a small company with just one layer of management. Good for simpler problems.
+
Deep NN (DNN): A network with two or more hidden layers. The "depth" allows it to learn hierarchical features, making it powerful for complex tasks like image and speech recognition.
+
+
+
🔹 Neural Network Architecture
+
+
Story example: The Assembly Line of Information
+
An NN works like an assembly line. Raw materials (input data) enter at one end. Each station (neuron) performs a specific task: it takes materials from previous stations, weighs their importance (weights), adds a standard adjustment (bias), and decides whether to pass its result along (activation function). The process of the product moving from start to finish is Forward Propagation. If the final product is faulty, a manager goes back down the line (Backpropagation), telling each station exactly how to adjust its process to fix the error.
+
+
+
+[Image of a simple neural network architecture]
+
+
+
Input Layer: Receives the initial data or features (e.g., the pixels of an image).
+
Hidden Layers: One or more layers between the input and output. This is where the network learns to transform the data to find patterns.
+
Output Layer: Produces the final result. For classification, this is typically the probability for each class.
+
+
+
🔹 Mathematical Foundation
+
+
Story example: The Neuron's Decision
+
Each neuron is a tiny decision-maker. It listens to several colleagues (inputs). It trusts some colleagues more than others (their inputs have higher weights). It also has its own personal opinion (a bias). It adds up all the weighted opinions and its own bias to get a final score. Based on this score, it decides how strongly to "shout" its conclusion to the next layer of neurons. This "shout" is governed by its activation function.
+
+
Weighted Sum & Activation
+
+ $$ z = (w_1x_1 + w_2x_2 + \dots + w_nx_n) + b $$
+
$$ a = f(z) $$
+
+
Activation Functions:
+
+
+ Sigmoid:
+
The Sigmoid function takes any real value and squashes it to a range between 0 and 1. This is perfect for the output layer in a binary classification task, where the output can be interpreted as a probability.
+
Example: In an email spam detector, a Sigmoid output of 0.95 means there is a 95% probability that the email is spam.
+
Story Analogy: The Dimmer Switch. Think of a Sigmoid function as a dimmer switch for a light. It's not just on or off; it can be 0% bright (output 0), 100% bright (output 1), or any percentage in between. This makes it ideal for representing the probability of a single outcome.
+
+
+ Softmax:
+
The Softmax function is used in the output layer for multi-class classification. It takes a vector of raw scores (logits) and transforms them into a probability distribution, where each value is between 0 and 1, and all values sum up to 1.
+
Example: An image classifier for animals might output raw scores of `[cat: 2.5, dog: 1.8, bird: 0.5]`. After applying Softmax, this becomes a probability distribution like `[cat: 0.65, dog: 0.29, bird: 0.06]`, indicating a 65% chance the image is a cat.
+
Story Analogy: The Voting Poll. Imagine an election with multiple candidates (classes). Each candidate gets a certain number of raw votes (the logits). The Softmax function is the pollster that converts those raw vote counts into a final percentage for each candidate, ensuring the total percentage adds up to 100%. This tells you the relative likelihood of each candidate winning.
+
+
+ ReLU (Rectified Linear Unit):
+
ReLU is the most popular activation function for hidden layers. It's a very simple function: if the input is positive, it passes it through unchanged; if it's negative, it outputs zero. This simplicity makes it very fast and helps prevent the vanishing gradient problem.
+
Example: If a neuron calculates a weighted sum of `z = -0.8`, the ReLU activation will be `a = 0`. If it calculates `z = 1.2`, the activation will be `a = 1.2`.
+
Story Analogy: The One-Way Gate. Think of ReLU as a one-way gate that only opens for positive signals. If a positive signal arrives, the gate lets it pass through at full strength. If a negative signal arrives, the gate stays shut, blocking it completely. This simple but effective "go/no-go" mechanism is incredibly efficient for the internal workings of the network.
+
+
+
+
Loss Functions: The "report card" that tells the network how wrong its predictions are.
+
+
Binary Cross-Entropy: Used for two-class problems.
+
Categorical Cross-Entropy: Used for multi-class problems.
+
+
+
+
+
🔹 Key Concepts in Training
+
+
Story: The Student Studying for an Exam
+
A student (the model) is studying a textbook (the dataset). One full read-through of the book is an Epoch. If they study in chunks, say 32 pages at a time, that's the Batch Size. Each time they review a chunk of pages is an Iteration. How much they adjust their notes after finding a mistake is the Learning Rate. Memorizing the book word-for-word is Overfitting, while not studying enough is Underfitting.
+
+
+
Epoch: One complete pass through the entire training dataset.
+
Batch Size: The number of training examples used in one iteration.
+
Learning Rate: A hyperparameter that controls how much to change the model in response to the estimated error each time the weights are updated.
+
Regularization: Techniques to prevent overfitting.
+
+
Dropout: Randomly "turning off" a fraction of neurons during training to prevent over-reliance on any single neuron.
+
L2 Penalty: Adds a cost to having large weights, encouraging the model to use smaller, simpler weights.
+
Early Stopping: Monitoring the performance on a validation set and stopping training when performance stops improving.
+
+
+
+
+
🔹 Variants of Neural Networks
+
+
+
+
Network Type
+
Story & Analogy
+
+
+
+
+
Deep Neural Network (DNN)
+
A large corporation with many layers of management, capable of solving very complex business problems.
+
+
+
Convolutional Neural Network (CNN)
+
A team of image specialists. They use special scanning tools (filters) to find simple patterns (edges, corners) and then combine them to recognize complex objects (faces, cars).
+
+
+
Recurrent Neural Network (RNN)
+
A team that has a short-term memory. When processing a sentence, they remember the previous words to understand the context of the current word. Ideal for sequences like text or speech.
+
+
+
+
+
🔹 Strengths & Weaknesses
+
+
A Neural Network is like a powerful but mysterious alien artifact. It can perform incredible feats (learn complex patterns) that no other tool can. However, it requires a huge amount of energy to run (data and computation), it's a "black box" because its inner workings are hard to understand, and you need to press its buttons (hyperparameters) in exactly the right way to get it to work.
+
+
Advantages:
+
+
✅ Can learn highly complex, non-linear decision boundaries.
+
✅ State-of-the-art performance on unstructured data like images, text, and audio.
+
✅ Can scale with massive datasets.
+
+
Disadvantages:
+
+
❌ Requires large amounts of data to train effectively.
+
❌ Computationally expensive and slow to train.
+
❌ Acts as a "black box," making it difficult to interpret its decisions.
+
+
+
🔹 Python Implementation (Keras/TensorFlow)
+
+
Here, we use the `keras` library to build our "corporate hierarchy". We create a `Sequential` model, which is like setting up a new company. We `add` layers (departments) one by one. Then, we `compile` the company's rulebook: its goal (loss), its method for improving (optimizer), and how it will be graded (metrics). Finally, we `fit` the model, which is the process of training our new company on historical data.
Let's demystify the core processes and rules that govern how our neural network company learns and improves.
+
+
Backpropagation
+
+ What it is: The algorithm used to train neural networks. It calculates the error at the output and propagates it backward through the network layers, determining how much each weight and bias contributed to the error. This information is then used by the optimizer (like Gradient Descent) to update the weights.
+
+
+ Story Example: In our corporate hierarchy, the final project fails (an error). Backpropagation is the process where the CEO blames the senior managers, who in turn figure out which mid-level managers gave them bad information, who then blame the junior analysts. This chain of blame assignment precisely identifies how much each employee at every level needs to adjust their work to fix the overall process.
+
+
Activation Function
+
+ What it is: A function applied to the output of a neuron that determines whether it should be activated ("fire") or not. It introduces non-linearity into the network, allowing it to learn complex patterns.
+
+
+ Story Example: An activation function is like a neuron's "excitement" level. A neuron listens to all the evidence, and if the total evidence exceeds a certain threshold, it gets excited and fires a strong signal. If not, it stays quiet. This on/off or graded response is what allows the network to make complex, non-linear decisions, rather than just calculating simple averages.
+
+
+
Dropout
+
+ What it is: A regularization technique where, during each training iteration, a random fraction of neurons are temporarily "dropped out" or ignored.
+
+
+ Story Example: Imagine a team of employees working on a project. To ensure no single employee becomes a single point of failure, the manager uses Dropout. Each day, they tell a few random employees to take the day off. This forces the remaining team members to become more versatile and robust, unable to rely on any one superstar. The result is a more resilient team that performs better overall.
+
+
+
Epoch
+
+ What it is: One complete forward and backward pass of all the training examples through the neural network.
+
+
+ Story Example: An epoch is like one full school year for our neural network student. During the year, they study every chapter in the textbook (all the training data) at least once. For a model to become truly proficient, it often needs to go through multiple school years (epochs) to master the material.
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Optimization.html b/templates/Optimization.html
new file mode 100644
index 0000000000000000000000000000000000000000..fd6253aae85673dc2ec0ffcb26c8873ff4e31156
--- /dev/null
+++ b/templates/Optimization.html
@@ -0,0 +1,12 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
Optimization Method
+
Optimization (Gradient Descent): Gradient descent is an optimization algorithm used to minimize the error (loss function) of a model during the training process. It is the underlying mechanism that powers the "gradient" aspect of gradient boosting, allowing the models to iteratively improve their performance.
Imagine you have a complex 3D object, like a toy airplane. If you shine a light on it, you create a 2D shadow. From one angle, the shadow might look like a simple line. But if you rotate the airplane and find the perfect angle, the shadow will capture its main shape—the wings and body. PCA is like a mathematical shadow puppet master for your data. It takes high-dimensional data (the 3D airplane) and finds the best "angles" to project it onto a lower-dimensional surface (the 2D shadow), making sure the shadow preserves as much of the original shape (the variance) as possible.
+
+
Principal Component Analysis (PCA) is a dimensionality reduction technique. Its main goal is to reduce the number of features in a dataset while keeping as much important information as possible. It doesn't just pick features; it creates new, powerful features called principal components, which are combinations of the original ones.
+
+
Example: A dataset about houses has 10 features: square footage, number of rooms, number of bathrooms, lot size, etc. Many of these features are correlated and essentially measure the same thing: the "size" of the house. PCA can combine them into a single new feature like "Overall House Size," reducing 10 features to 1 without losing much information.
+
+
+
🔹 Mathematical Foundation
+
+
Story: The "Data Squishing" Machine
+
PCA is a five-step machine that intelligently squishes your data:
+
+
Step 1: Put everything on the same scale. (Standardize Data).
+
Step 2: Figure out which features move together. (Compute Covariance Matrix).
+
Step 3: Find the main directions of "stretch" in the data. (Find Eigenvectors and Eigenvalues).
+
Step 4: Rank these directions from most to least important. (Sort Eigenvalues).
+
Step 5: Keep the top few important directions and discard the rest. (Select top k components).
+
+
+
The core of PCA relies on linear algebra to find the principal components. The process is:
+
+
Standardize the data: Rescale features to have a mean of 0 and a variance of 1. This is crucial!
+
Compute the Covariance Matrix: This matrix shows how every feature relates to every other feature.
+
Find Eigenvectors and Eigenvalues: These are calculated from the covariance matrix. The eigenvectors are the new axes (the principal components), and the eigenvalues tell you how much information (variance) each eigenvector holds.
+
Sort Eigenvalues: Rank them from highest to lowest. The eigenvector with the highest eigenvalue is the first principal component (PC1).
+
Select Top k Components: Choose the top `k` eigenvectors to form your new, smaller feature set.
+
+
+
🔹 Geometric Interpretation
+
+
Story: Finding the Best Camera Angle
+
Imagine your data is a cloud of points in 3D space. PCA is like finding the best camera angle to take a 2D picture of this cloud.
+ • The First Principal Component (PC1) is the direction (or camera angle) that shows the biggest spread of data. It's the longest axis of the data cloud.
+ • The Second Principal Component (PC2) is the direction that shows the next biggest spread, but it must be at a 90-degree angle (orthogonal) to PC1.
+ By projecting the 3D cloud onto a 2D plane defined by these two new axes, you get the most informative and representative 2D picture of your data.
+
+
+
🔹 Variance Explained
+
Each principal component captures a certain amount of the total variance (information) from the original dataset. The "explained variance ratio" tells you the percentage of the total information that each component holds.
+
+
Example: After running PCA, you might find:
+
+
PC1 explains 75% of the variance.
+
PC2 explains 20% of the variance.
+
PC3 explains 3% of the variance.
+
...and so on.
+
+
In this case, the first two components alone capture 95% of the total information. This means you can likely discard all other components and just use PC1 and PC2, reducing your data's complexity while retaining almost all of its structure. This is often visualized using a scree plot.
+
+
+
+
🔹 Comparison
+
+
+
+
Comparison
+
PCA (Principal Component Analysis)
+
Alternative Method
+
+
+
+
+
vs. Feature Selection
+
Creates new features by combining old ones. (Making a smoothie from different fruits).
+
Selects a subset of the original features. (Picking the best fruits for a fruit basket).
+
+
+
vs. Autoencoders
+
A linear method. Can't capture complex, curved patterns in data. (Taking a simple photo).
+
Can learn complex, nonlinear patterns. (Drawing a detailed, artistic sketch).
+
+
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ Reduces Dimensionality: Makes models train faster and require less memory. Example: A model might train in 1 minute on 5 principal components vs. 10 minutes on 100 original features.
+
✅ Removes Multicollinearity: It gets rid of redundant, correlated features, which can improve the performance of some models like Linear Regression.
+
✅ Helps with Visualization: Allows you to plot high-dimensional data in 2D or 3D to see patterns.
+
+
Disadvantages:
+
+
❌ Features are Hard to Interpret: The new principal components are mathematical combinations (e.g., `0.7*age - 0.3*income + 0.1*education`). It's hard to explain what "PC1" means in a business context.
+
❌ It's a Linear Method: PCA might miss important patterns in data that aren't linear (e.g., a spiral or circular pattern).
+
❌ Sensitive to Scaling: If you don't scale your data first, features with large values (like income) will dominate the PCA process, leading to poor results.
+
+
+
🔹 When to Use PCA
+
+
High-Dimensional Data: When you have datasets with dozens or hundreds of features, especially if many are correlated. Example: Analyzing gene expression data with thousands of genes.
+
Visualization: When you need to plot and explore a dataset with more than 3 features.
+
Preprocessing: As a step before feeding data into another machine learning model to improve its speed and sometimes its performance.
+
Noise Reduction: By keeping only the components with the most variance, you can sometimes filter out noise in your data.
+
+
+
🔹 Python Implementation (Beginner Example with Iris Dataset)
+
+
In this example, we take the famous Iris dataset, which has 4 features, and use PCA to squish it down to just 2 features (principal components). This allows us to create a 2D scatter plot that effectively visualizes the separation between the different flower species.
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.decomposition import PCA
+from sklearn.preprocessing import StandardScaler
+from sklearn.datasets import load_iris
+
+# --- 1. Load and Scale the Data ---
+# The Iris dataset has 4 features for 3 species of iris flowers.
+iris = load_iris()
+X = iris.data
+
+# Scaling is CRITICAL for PCA!
+scaler = StandardScaler()
+X_scaled = scaler.fit_transform(X)
+
+# --- 2. Create and Apply PCA ---
+# We'll reduce the 4 features down to 2 principal components.
+pca = PCA(n_components=2)
+
+# Fit PCA to the scaled data and transform it.
+X_pca = pca.fit_transform(X_scaled)
+
+# --- 3. Check the Explained Variance ---
+# Let's see how much information our 2 new components hold.
+explained_variance = pca.explained_variance_ratio_
+print(f"Explained variance by component 1: {explained_variance[0]:.2%}")
+print(f"Explained variance by component 2: {explained_variance[1]:.2%}")
+print(f"Total variance explained by 2 components: {np.sum(explained_variance):.2%}")
+
+# --- 4. Visualize the Results ---
+# We can now plot our 4D dataset in 2D.
+plt.figure(figsize=(8, 6))
+plt.scatter(X_pca[:, 0], X_pca[:, 1], c=iris.target, cmap='viridis')
+plt.title('PCA of Iris Dataset (4D -> 2D)')
+plt.xlabel('First Principal Component')
+plt.ylabel('Second Principal Component')
+plt.grid(True)
+plt.show()
+
+
+
🔹 Best Practices
+
+
Always Scale Your Data: This is the most important rule. Use `StandardScaler` before applying PCA.
+
Choose `n_components` Wisely: Use a scree plot or the explained variance ratio to decide how many components to keep. A common rule of thumb is to keep enough components to explain 90-99% of the variance.
+
Consider Interpretability: If you absolutely must be able to explain what each feature means, PCA might not be the right choice. Simple feature selection could be better.
+
+
+
🔹 Key Terminology Explained (PCA)
+
+
The Story: Decoding the Shadow Master's Toolkit
+
Let's clarify the key terms the PCA shadow master uses.
+
+
+
+ Dimensionality Reduction:
+
+ What it is: The process of reducing the number of features (dimensions) in a dataset.
+
+ Story Example: This is like summarizing a 500-page book into a 1-page summary. You lose some detail, but you keep the main plot points. Dimensionality reduction creates a simpler version of your data.
+
+
+ Covariance Matrix:
+
+ What it is: A square table that shows how each pair of features in your data moves together.
+
+ Story Example: Imagine you're tracking a group of dancers. The covariance matrix is your notebook where you write down which pairs of dancers tend to move in the same direction at the same time.
+
+
+ Eigenvectors & Eigenvalues:
+
+ What they are: A pair of mathematical concepts. The eigenvector is a direction, and the eigenvalue is a number telling you how important that direction is.
+
+ Story Example: Imagine stretching a rubber sheet with a picture on it. The eigenvectors are the directions of stretch where the picture only gets scaled, not rotated. The eigenvalues tell you *how much* it stretched in those directions. PCA finds the directions of greatest "stretch" in your data.
+
+
+ Orthogonal:
+
+ What it is: A mathematical term that simply means "at a right angle (90°) to each other."
+
+ Story Example: The corner of a square or the intersection of the x-axis and y-axis on a graph are orthogonal. The principal components PCA finds are all orthogonal to each other.
+
🧭 Study Guide: Q-Learning in Reinforcement Learning
+
+
🔹 1. Introduction
+
+
Story-style intuition: The Restaurant Critic's Notebook
+
Imagine a food critic exploring a new city. Their goal is to find the best possible multi-course meal. The critic creates a huge notebook with a page for every restaurant (state) in the city. On each page, they list every dish (action) available. They then go out and, through trial-and-error, start assigning a score to each dish. This score, the Q-value, isn't just about how good that one dish tastes (the immediate reward). It's a prediction of the total "dining satisfaction" for the entire evening if they eat that dish and then continue to choose the best dishes at all subsequent restaurants. After visiting many restaurants over many nights, their notebook becomes the ultimate guide to the perfect dining experience. Q-Learning is this process of an agent filling out its "notebook" (the Q-table) to learn the value of every action in every state.
+
+
Q-Learning is a classic model-free, off-policy Reinforcement Learning algorithm. Its primary goal is to learn the optimal policy for making decisions by estimating the quality of state-action pairs, known as the Q-function.
+
+
🔹 2. The Q-Function
+
The Q-function, denoted \( Q(s, a) \), represents the total expected future reward (the Return) an agent can get by taking a specific action \(a\) in a specific state \(s\) and then following the optimal policy thereafter. It's a measure of how good a particular move is in a particular situation.
+
Q-Learning's objective is to find the optimal Q-function, \( Q^*(s, a) \):
+ $$ Q^*(s, a) = \max_{\pi} \mathbb{E}\left[ G_t \mid s_t = s,\; a_t = a \right] $$
In simple terms, this means \( Q^*(s, a) \) is the maximum possible return you can get if you start by taking action \(a\) in state \(s\).
+
+
Example: In a maze, \( Q(\text{crossroads}, \text{go left}) \) would be the predicted total reward if the agent chooses to go left at the crossroads and then plays perfectly for the rest of the maze. This value would be high if "left" is on the optimal path to the exit, and low if it leads to a dead end.
+
+
+
🔹 3. The Q-Learning Update Rule
+
+
The Critic's Update Rule: After trying a dish (action a) at a restaurant (state s), the critic gets an immediate taste score (Reward R). They then look at their notebook for the *next* restaurant (state s') and find the score of the *best possible* dish they could order there. They combine this information to create an updated "learned value." The final updated score in their notebook is a small step from the old score towards this new learned value. The size of that step is the learning rate (α).
+
+
The core of Q-Learning is its update rule, which is applied after every step the agent takes. It updates the Q-value for the state-action pair it just experienced.
+
$$ Q(s, a) \leftarrow Q(s, a) + \alpha \Big[ \underbrace{R + \gamma \max_{a'} Q(s', a')}_{\text{New Learned Value}} - Q(s, a) \Big] $$
+
+
\( Q(s, a) \): The current, old Q-value.
+
\( \alpha \) (Alpha): The Learning Rate. How much we update our Q-value based on the new information. A high value means we learn fast, a low value means we are more conservative.
+
\( R \): The immediate Reward received.
+
\( \gamma \) (Gamma): The Discount Factor. How much we value future rewards.
+
\( \max_{a'} Q(s', a') \): The agent's estimate of the best possible future value it can get from the next state \( s' \). This is the key to learning: it looks one step into the future to inform its current estimate.
+
+
+
🔹 4. Step-by-Step Flow of Q-Learning
+
The algorithm iteratively refines its Q-table until the values converge to the optimal ones.
+
+
+
Initialize Q-Table: Create a table with a row for every state and a column for every action. Fill it with zeros or small random values.
+
Loop for many episodes:
+
+
Start in an initial state \( s \).
+
Loop for each step of the episode:
+
+
Choose an action \( a \) from state \( s \) using an exploration strategy (like epsilon-greedy).
+
Perform the action \( a \).
+
Observe the immediate reward \( R \) and the next state \( s' \).
+
Update the Q-value for the original state and action, \( Q(s, a) \), using the update rule.
+
Set the new state: \( s \leftarrow s' \).
+
+
+
The episode ends when a terminal state is reached.
+
+
+
After thousands of episodes, the Q-table will contain good approximations of the optimal action-values. The agent's optimal policy is then simply to choose the action with the highest Q-value in any given state.
+
+
+
🔹 5. Exploration vs. Exploitation
+
+
The Critic's Dilemma: On any given night, should the critic go to the restaurant and order the dish they already know has the highest score in their notebook (Exploitation)? Or should they try a random, new dish they've never had before to see if it's even better (Exploration)? If they only exploit, they might miss out on a hidden gem. If they only explore, they'll have a lot of bad meals.
+
+
This is a fundamental challenge in RL. The most common solution is the epsilon-greedy (\(\epsilon\)-greedy) strategy:
+
+
With a small probability \( \epsilon \) (e.g., 10%), the agent takes a random action (explores).
+
With a large probability \( 1 - \epsilon \), the agent takes the action with the highest known Q-value (exploits).
+
Typically, \( \epsilon \) starts high and slowly decreases as the agent becomes more confident in its Q-values.
+
+
+
🔹 6. Example: Gridworld
+
+
Imagine a simple 3x3 grid world:
+
+
States: 9 cells, identified by their coordinates.
+
Actions: {Up, Down, Left, Right}.
+
Rewards: +10 for reaching the Goal cell, -100 for falling into a Lava cell, -1 for every other move (to encourage speed).
+
+
Initially, the Q-table is all zeros. As the agent explores, it will eventually stumble into the Goal. When it does, the Q-value for the state-action pair that led to the goal gets a positive update. In the next episode, if the agent reaches a state next to that one, the `max Q(s', a')` term in the update rule will now be positive, causing the "goodness" of the goal to propagate backwards through the grid, one step at a time, until the agent has a complete map of the best path from any square.
+
+
+
🔹 Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Simple and Intuitive: The core concept of updating a value table is very easy to understand and implement.
+
❌ The Curse of Dimensionality: It is only feasible for problems with small, discrete state and action spaces. The size of the Q-table explodes as states/actions increase. Example: Chess has on the order of 10⁴⁷ reachable board positions (and roughly 10¹²⁰ possible games). A Q-table is impossible.
+
+
+
✅ Model-Free: The agent doesn't need to know the rules of the environment (the transition probabilities P). It learns just by observing outcomes.
+
❌ Slow Convergence: It can take a very large number of episodes for the Q-values to propagate through the entire state space and converge.
+
+
+
✅ Guaranteed Convergence: Under the right conditions (enough exploration, a learning rate that decays appropriately), Q-Learning is proven to converge to the optimal Q-values.
+
❌ Cannot handle continuous spaces without modification. To handle continuous states or actions, you need to combine it with a function approximator, which leads to Deep Q-Learning (DQN).
+
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Critic's Jargon
+
+
+
+ Model-Free:
+
+ What it is: An algorithm that learns a policy without building an explicit model of the environment's dynamics (the P and R functions).
+
+ Story Example: The critic doesn't need the restaurant's recipes or know how the kitchen works (the model). They learn which dish is best just by tasting them (trial-and-error).
+
+
+ Off-Policy:
+
+ What it is: An algorithm that can learn about the optimal policy even while it is following a different, exploratory policy.
+
+ Story Example: Even when the critic is exploring by trying a random dish, the Q-learning update rule still uses the `max Q(s', a')` term, which assumes they will take the *best* action in the next state. It learns about the perfect "exploitation" path while it is still on an "exploration" path.
+
+
+ Q-Table:
+
+ What it is: The data structure used to store and update the Q-values. It's a large matrix where rows represent states and columns represent actions.
+
+ Story Example: This is the critic's master notebook, with a page for every restaurant and a rating for every dish on the menu.
+
+ An ensemble learning method for classification that operates by constructing a multitude of decision trees.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
How Random Forest Classifies Your Data
+
+
+ 🌳
+
Many Decision Trees
+
Each trained on a random subset
+
+
→
+
+ 📍
+
New Data Point
+
Sent to ALL trees
+
+
→
+
+ 📊
+
Individual Predictions
+
Each tree "votes" on the class
+
+
↓
+
→
+
+ 🗳️
+
Majority Vote
+
Most common prediction wins
+
+
→
+
+ ✅
+
Final Classification
+
Robust and accurate
+
+
+
+
+
+ Random Forest combines the power of many individual decision trees to make a more robust and accurate classification, leveraging collective intelligence.
+
+ Random Forest is an ensemble learning method that builds a "forest" of decision trees. For classification tasks, it outputs the class that is the mode of the classes (majority vote) of the individual trees. It's known for its high accuracy and ability to handle complex datasets.
+
+
+
Key Concepts:
+
+
+ Ensemble Learning: Instead of relying on a single model, Random Forest combines predictions from multiple models (decision trees) to improve overall accuracy and robustness.
+
+
+ Decision Trees: Each tree in the forest makes a prediction independently. A single decision tree creates axis-parallel splits, leading to rectangular decision regions.
+
+
+ Randomness: Random Forest introduces randomness in two ways:
+
+
Bagging (Bootstrap Aggregating): Each tree is trained on a random subset of the training data (with replacement).
+
Feature Randomness: When splitting a node, each tree considers only a random subset of the available features. This decorrelates the trees.
+
+
+
+ Decision Boundary: Unlike a single decision tree's sharp, rectangular boundaries, the Random Forest's decision boundary is the aggregated result of many trees. This often results in a smoother, more complex, and often non-linear boundary, as seen in the plot.
+
+
+
+
How this Visualization Works:
+
+
+ Class 1 (Red Circles): These are your labeled data points belonging to Class 1.
+
+
+ Class 0 (Blue Circles): These are your labeled data points belonging to Class 0.
+
+
+ Test Point (Green 'x'): This is the new, unlabeled data point you want to classify. You can adjust its X1 and X2 coordinates.
+
+
+ Colored Background: This represents the decision boundary of the trained Random Forest model.
+
+
Red regions indicate areas where the Random Forest predicts Class 1.
+
Blue regions indicate areas where the Random Forest predicts Class 0.
+
+ The smoothness and complexity of this boundary are a result of the ensemble nature of Random Forest.
+
+
+
+
+ *The plot will show the decision boundary by predicting the class for a grid of points covering the entire plot area. The color of each grid point reflects the predicted class, creating the background regions.*
+
+
+
+
+
+
+
+
🌳 Single Tree vs Random Forest
+
+
+
+
+
+
+
🔍 Working of Random Forest Algorithm
+
+
Create Many Decision Trees: The algorithm makes many decision trees using different random parts of the data.
+
Pick Random Features: Each tree picks a random subset of features to make splits. This keeps trees diverse.
+
Each Tree Makes a Prediction: Every tree gives its own output.
+
Combine the Predictions:
+
+
Classification: Uses majority voting across trees.
+
Regression: Averages the outputs of all trees.
+
+
+
Why It Works: Randomness prevents overfitting and improves overall prediction accuracy.
+
+
+
🌟 Key Features of Random Forest
+
+
Handles Missing Data: Works even with some missing values.
+
Shows Feature Importance: Identifies most important features for prediction.
+
Handles Complex Data: Efficient with large datasets and many features.
+
Versatile: Works for both classification and regression tasks.
+
+
+
📌 Assumptions of Random Forest
+
+
Each tree is independent and makes its own prediction.
+
Each tree is trained on random samples and features.
+
A large enough dataset is required for diverse learning.
+
Combining different trees improves accuracy.
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Reward-ValueFunction.html b/templates/Reward-ValueFunction.html
new file mode 100644
index 0000000000000000000000000000000000000000..0f33eb30849cbc500fb70681d7016bb7e7ef8f24
--- /dev/null
+++ b/templates/Reward-ValueFunction.html
@@ -0,0 +1,290 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: RL Reward & Value Function
+
+
+
+
+
+
+
+
+
💰 Study Guide: Reward & Value Function in Reinforcement Learning
+
+
🔹 1. Reward (R)
+
+
Story-style intuition: The Immediate Feedback
+
Imagine a mouse in a maze. The Reward is the immediate, tangible feedback it gets for its actions. If it takes a step and finds a tiny crumb of cheese, it gets an immediate `+1` reward. If it touches an electric wire, it gets an immediate `-10` reward. If it just moves to an empty square, it gets a small `-0.1` reward (to encourage it to hurry). The reward signal is the fundamental way the environment tells the agent, "What you just did was good/bad."
+
+
The Reward (R) is a scalar feedback signal that the environment provides to the agent after each action. It is the primary driver of learning, as the agent's ultimate goal is to maximize the total reward it accumulates over time.
+
Types of Rewards:
+
+
Positive Reward: Encourages the agent to repeat the action that led to it.
+
Example: In a video game, picking up a health pack gives a `+25` reward.
+
+
Negative Reward (Penalty): Discourages the agent from repeating an action.
+
Example: A self-driving car receiving a `-100` reward for a collision.
+
+
Zero Reward: A neutral signal, common for actions that don't have an immediate, obvious consequence.
+
Example: In chess, most moves don't immediately win or lose the game, so they receive a reward of `0`.
+
+
+
+
🔹 2. Return (G)
+
+
Story-style intuition: The Long-Term Goal
+
The mouse in the maze isn't just trying to get the next crumb of cheese; its real goal is to get the big block of cheese at the end. The Return (G) is the total sum of all the rewards the mouse expects to get from its current position until the end of the maze. A smart mouse will choose a path of small negative rewards (empty steps) if it knows that path leads to the huge `+1000` reward of the final cheese block. It learns to prioritize the path with the highest Return, not just the highest immediate reward.
+
+
+ The Return (G) is the cumulative sum of future rewards. Because the future is uncertain and rewards that are far away are often less valuable than immediate ones, we use a discount factor (γ):
+ $$ G_t = R_{t+1} + \gamma R_{t+2} + \gamma^2 R_{t+3} + \dots = \sum_{k=0}^{\infty} \gamma^k R_{t+k+1} $$
 The discount factor \( \gamma \) (a number between 0 and 1) determines the present value of future rewards. A \( \gamma \) of 0.9 means a reward received in the next step is worth 90% of its value now, a reward in two steps is worth 81%, and so on.
+
+
🔹 3. Value Function (V)
+
+
Story-style intuition: The Chess Master's Insight
+
A novice chess player only sees the immediate rewards (e.g., "I can capture their pawn!"). A chess master, however, understands the Value of a board position. A certain position might not offer any immediate captures, but the master knows it has a high value because it provides strong control over the center of the board and is highly likely to lead to a win (a large future return) later on. The Value Function is this deep, predictive understanding of "how good" a situation is in the long run.
+
+
A Value Function is a prediction of the expected future return. It is the core of many RL algorithms, as it allows the agent to make decisions based on the long-term consequences of its actions.
+
3.1 State-Value Function (V)
+
Answers the question: "How good is it to be in this state?"
This is the expected return an agent can get if it starts in state \(s\) and follows its policy \( \pi \) thereafter.
+
+
Example: In Pac-Man, the state-value \( V(s) \) of a position surrounded by pellets is high. The value of a position where Pac-Man is cornered by a ghost is very low.
+
+
3.2 Action-Value Function (Q-Function)
+
Answers the question: "How good is it to take this specific action in this state?"
This is the expected return if the agent starts in state \(s\), takes action \(a\), and then follows its policy \( \pi \) from that point on. The Q-function is often more useful for decision-making because for any state, the agent can simply choose the action with the highest Q-value.
+
+
Example: You are Pac-Man at an intersection (state s). The Q-function would give you values for each action: \( Q(s, \text{move left}) = +50 \), \( Q(s, \text{move right}) = -200 \) (because a ghost is there). You would obviously choose to move left.
+
+
+
🔹 4. Reward vs. Value Function
+
+
+
+
Aspect
+
Reward (R)
+
Value Function (V or Q)
+
+
+
+
+
Timing
+
Immediate and short-term.
+
Long-term prediction of future rewards.
+
+
+
Source
+
Provided directly by the environment.
+
Estimated by the agent based on its experience.
+
+
+
Purpose
+
Defines the fundamental goal of the task.
+
Used to guide the agent's policy toward that goal.
+
+
+
Analogy
+
The `+1` point you get for eating a pellet in Pac-Man.
+
Your internal estimate of the final high score you are likely to get from your current position.
+
+
+
+
+
🔹 5. Examples
+
+
Example 1: Chess
+
+
Reward: Sparse. +1 for a win, -1 for a loss, 0 for all other moves.
+
Value Function: A high-value state is a board position where you have a strategic advantage (e.g., controlling the center, having more valuable pieces). The agent learns that these states, while not immediately rewarding, are valuable because they lead to a higher probability of winning.
+
+
+
+
Example 2: Self-driving Car
+
+
Reward: A carefully shaped function: +1 for moving forward, -0.1 for jerky movements, -100 for a collision.
+
Value Function: A high-value state is one that is "safe" and making progress (e.g., driving in the center of the lane with no obstacles nearby). A low-value state is one that is dangerous (e.g., being too close to the car in front), even if no negative reward has been received yet.
+
+
+
+
🔹 6. Challenges
+
+
Reward Shaping: Designing a good reward function is one of the hardest parts of applied RL. A poorly designed reward can lead to unintended "reward hacking."
+
Example: An AI agent rewarded for winning a boat race discovered a bug where it could go in circles and collect turbo boosts infinitely, never finishing the race but accumulating a huge score. It maximized the reward signal, but not in the way the designers intended.
+
+
Sparse Rewards: In many real-world problems, rewards are infrequent (like winning a long game). This makes it very difficult for the agent to figure out which of its thousands of actions were actually responsible for the final outcome.
Imagine a student learning to identify animals. Their teacher gives them a small, labeled set of 10 flashcards (labeled data). The student studies these cards and learns the basic differences between cats and dogs. The teacher then gives the student a huge stack of 1,000 unlabeled photos (unlabeled data). The student goes through the stack and labels the photos they are most confident about (e.g., "I'm 99% sure this is a cat"). They add these self-labeled photos, called pseudo-labels, to their original small set of flashcards. Now, with a much larger study set, they retrain their brain to become an even better animal identifier. This process of using your own knowledge to learn more is the essence of Self-Training.
+
+
Self-Training is a simple yet powerful semi-supervised learning technique. It is used when you have a small amount of labeled data and a large amount of unlabeled data. The model is first trained on the small labeled set, and then it iteratively "bootstraps" itself by using its own predictions on the unlabeled data to improve its performance.
+
+
Supervised vs. Unsupervised vs. Semi-Supervised
+
+
Supervised: All data is labeled (e.g., thousands of flashcards with answers).
+
Unsupervised: No data is labeled (e.g., a pile of photos with no answers).
+
Semi-Supervised: A small amount of labeled data and a large amount of unlabeled data (the self-training scenario).
+
+
+
🔹 Workflow of Self-Training
+
The self-training process is an iterative loop that aims to leverage the unlabeled data effectively.
+
+
+
Train Initial Model: Train a base classifier (like an SVM or Random Forest) on the small, human-labeled dataset (L).
+
Predict on Unlabeled Data: Use this initial model to make predictions on the large unlabeled dataset (U).
+
Select High-Confidence Predictions: From the predictions, select the ones where the model is most confident (e.g., prediction probability > 95%). These are your "pseudo-labels."
+
Add to Training Set: Move these pseudo-labeled data points from the unlabeled set U to the labeled set L.
+
Retrain the Model: Train the model again on the newly expanded labeled set.
+
Repeat: Continue this loop until no more unlabeled data points meet the confidence threshold or a set number of iterations is reached.
+
+
+
🔹 Mathematical Formulation
+
+
Think of the model's learning process as minimizing an "error" or "loss" score. Initially, it only cares about the error on the teacher's flashcards. In self-training, it also starts caring about the error on its self-marked homework, but maybe gives it a little less weight so it doesn't get misled by a mistake.
+
+
The learning process is guided by a combined loss function:
+
$$ L = L_{sup} + \lambda L_{pseudo} $$
+
+
\( L_{sup} \): The supervised loss, calculated on the original, ground-truth labeled data. This is the primary error signal.
+
\( L_{pseudo} \): The loss calculated on the high-confidence pseudo-labeled data.
+
\( \lambda \): A weighting parameter that controls how much the model trusts its own pseudo-labels. A smaller \( \lambda \) means the model relies more on the original labeled data.
+
+
+
🔹 Key Assumptions of Self-Training
+
Self-training can be very effective, but it relies on a few important assumptions. If these aren't true, the model can actually get worse!
+
+
High-Confidence Predictions are Correct: This is the most critical assumption. The model must be accurate when it is highly confident. If its confident predictions are wrong, it will start teaching itself incorrect information.
+
Low-Density Separation: The classes should be separated by a low-density region in the feature space. This means the decision boundary should fall in an area where there are not many data points, making the confident predictions safer.
+
+
+
🔹 Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Simple to implement and understand.
+
❌ Error Propagation: The biggest risk. If the model makes a confident mistake, that incorrect pseudo-label is added to the training set, potentially making the model even more wrong in the next iteration.
+
+
+
✅ Can significantly improve model performance when labeled data is scarce.
+
❌ Confirmation Bias: The model tends to reinforce its own initial biases. If it has a slight bias at the start, self-training can amplify it.
+
+
+
✅ Leverages vast amounts of cheap, unlabeled data.
+
❌ Highly sensitive to the choice of the confidence threshold.
+
+
+
+
+
🔹 Applications
+
Self-training is most useful in domains where labeling is a bottleneck:
+
+
Text Classification: Labeling a few hundred emails as "Spam" or "Not Spam" is easy. Self-training can then use a million unlabeled emails to improve the spam filter.
+
Medical Image Analysis: A radiologist can label a small number of X-rays. The model can then use a vast hospital archive of unlabeled X-rays to improve its diagnostic accuracy.
+
Speech Recognition: Using a small amount of transcribed audio to help label a much larger corpus of untranscribed speech.
+
+
+
🔹 Python Implementation (Beginner Sketch with Scikit-learn)
+
+
Scikit-learn makes implementing self-training incredibly easy with its `SelfTrainingClassifier`. You simply take a standard classifier (like a Random Forest) and wrap it inside `SelfTrainingClassifier`. It handles the iterative prediction and retraining loop for you automatically.
+
+
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.svm import SVC
+from sklearn.semi_supervised import SelfTrainingClassifier
+from sklearn.metrics import accuracy_score
+
+# --- 1. Create a Sample Dataset ---
+# 1000 samples, 10 features (5 informative); random_state fixed for reproducibility.
+X, y = make_classification(n_samples=1000, n_features=10, n_informative=5, random_state=42)
+
+# Keep only 5% of the data labeled; the remaining 95% will act as the unlabeled pool.
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.95, random_state=42)
+
+# "Hide" the labels of the large split to simulate an unlabeled pool.
+# scikit-learn's convention: the label -1 marks an unlabeled sample.
+y_unlabeled = np.full_like(y_test, -1)
+
+# Stack the small labeled set on top of the large unlabeled pool.
+X_combined = np.concatenate((X_train, X_test))
+y_combined = np.concatenate((y_train, y_unlabeled))
+
+# --- 2. Train a Standard Supervised Model (Baseline, labeled data only) ---
+base_classifier_baseline = SVC(probability=True, random_state=42)
+base_classifier_baseline.fit(X_train, y_train)
+y_pred_baseline = base_classifier_baseline.predict(X_test)
+print(f"Baseline Accuracy (trained on only {len(X_train)} labeled samples): {accuracy_score(y_test, y_pred_baseline):.2%}")
+
+# --- 3. Train a Self-Training Model ---
+# Same base classifier, wrapped so predictions with probability > 0.95 become pseudo-labels.
+base_classifier_st = SVC(probability=True, random_state=42)
+self_training_model = SelfTrainingClassifier(base_classifier_st, threshold=0.95)
+
+# Fit on labeled + unlabeled data; the wrapper runs the iterative pseudo-labeling loop.
+self_training_model.fit(X_combined, y_combined)
+
+# --- 4. Evaluate (NOTE: X_test also served as the unlabeled pool, so this is a transductive score, not a held-out test) ---
+y_pred_st = self_training_model.predict(X_test)
+print(f"Self-Training Accuracy (trained on labeled + pseudo-labeled data): {accuracy_score(y_test, y_pred_st):.2%}")
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the primary motivation for using semi-supervised learning techniques like self-training?
+
What is the biggest risk associated with self-training, and how does it happen?
+
What is a "pseudo-label"?
+
If you lower the confidence threshold for pseudo-labeling (e.g., from 0.95 to 0.75), what is the likely trade-off?
+
+
+
Answers
+
1. The primary motivation is to leverage large amounts of cheap, unlabeled data to improve a model's performance when labeled data is scarce or expensive to obtain.
+
2. The biggest risk is error propagation. It happens when the model makes a confident but incorrect prediction, and that incorrect "pseudo-label" is added to the training set, which can corrupt the model and make it worse in subsequent iterations.
+
3. A "pseudo-label" is a label for an unlabeled data point that is generated by the machine learning model itself, not by a human.
+
4. The trade-off is between the quantity and quality of pseudo-labels. Lowering the threshold will add more data to the training set in each iteration (increasing quantity), but these labels will be less reliable, increasing the risk of error propagation (decreasing quality).
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Ambitious Student's Study Guide
+
+
+
+ Semi-Supervised Learning:
+
+ What it is: A learning paradigm that falls between supervised and unsupervised learning, using a mix of labeled and unlabeled data for training.
+
+ Story Example: The student's learning process, using both the teacher's few flashcards (labeled) and their own large stack of photos (unlabeled), is a perfect example of semi-supervised learning.
+
+
+ Pseudo-Label:
+
+ What it is: A label assigned by a model to an unlabeled data point. It's treated as a "real" label for the purpose of retraining, even though it might be incorrect.
+
+ Story Example: When the student confidently writes "Cat" on the back of an unlabeled photo, that "Cat" label is a pseudo-label. It's the student's best guess.
+
+
+ Confidence Threshold:
+
+ What it is: A predefined cutoff (e.g., 95% probability) that a model's prediction must meet to be considered a pseudo-label.
+
+ Story Example: The student decides they will only label photos if they are "at least 95% sure" of their answer. This 95% cutoff is their confidence threshold.
+
+
+ Error Propagation:
+
+ What it is: The process where an error made in an early stage of an iterative process is carried forward and potentially amplified in later stages.
+
+ Story Example: If the student confidently mislabels a photo of a fox as a "dog," they will add this incorrect flashcard to their study pile. In the next round of studying, this wrong example might confuse them further, causing them to mislabel even more photos. This is error propagation.
+
Imagine you need to solve a complex problem, like predicting next month's sales. Instead of hiring one expert, you hire a diverse team of specialists (the base learners): a statistician with a linear model, a data scientist with a decision tree, and a machine learning engineer with an SVM. Each specialist analyzes the data and gives you their prediction.
+ Instead of just averaging their opinions (like in Bagging), you hire a wise, experienced Project Manager (the meta-learner). The manager's job isn't to look at the original data, but to look at the *predictions* from the specialists. Over time, the manager learns which specialists are trustworthy in which situations (e.g., "The statistician is great at predicting stable trends, but the data scientist is better when there's a holiday sale"). The manager then learns to combine their advice intelligently to produce a final forecast that is better than any single specialist's prediction. This is Stacking.
+
+
Stacking (or Stacked Generalization) is a sophisticated ensemble learning technique that combines multiple machine learning models. It uses a "meta-learner" to learn the best way to combine the predictions from several "base learner" models to improve predictive accuracy.
+
+
🔹 2. How Stacking Works
+
Stacking is a multi-level process that learns from the output of other models.
+
+
+
Train First-Level Models (Base Learners): First, train several different models on the same training dataset. It's important that these models are diverse (e.g., a mix of linear models, tree-based models, and instance-based models).
+
Generate Predictions for the Meta-Learner: This is a crucial step. To avoid data leakage, you don't train the meta-learner on predictions made on the training data. Instead, you typically use cross-validation. For each fold, the base learners make predictions on the validation part, and these "out-of-fold" predictions become the training features for the meta-learner.
+
Train the Meta-Learner (Second-Level Model): The meta-learner is trained on a new dataset where the features are the predictions from the base learners, and the target is the original target variable. Its job is to learn the relationship between the base models' predictions and the correct answer.
+
Make Final Predictions: To predict on new, unseen data, you first get predictions from all the base learners. Then, you feed these predictions as input to the trained meta-learner to get the final, combined prediction.
+
+
+
🔹 3. Mathematical Concept
+
If you have \(n\) base learners \(h_1, h_2, ..., h_n\), the meta-learner \(H\) learns a function \(f\) that combines their outputs:
+
$$ H(x) = f(h_1(x), h_2(x), ..., h_n(x)) $$
+
Unlike a simple average, the function \(f\) learned by the meta-learner can be a complex, non-linear combination. It might learn, for example, to trust model \(h_1\) more when its prediction is high, but trust \(h_2\) more when \(h_1\)'s prediction is low.
+
+
🔹 4. Key Points
+
+
Diversity is Key: The power of stacking comes from combining diverse models. If all your base learners are very similar and make the same mistakes, the meta-learner has nothing to learn.
+
Preventing Data Leakage: Using out-of-fold predictions from cross-validation to train the meta-learner is essential to prevent it from simply learning to trust the base model that overfit the training data the most.
+
Flexibility: You can use almost any machine learning model as either a base learner or a meta-learner. A common choice for the meta-learner is a simple model like Logistic/Linear Regression, which can learn a simple weighted combination of the base models' outputs.
+
+
+
🔹 5. Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Can achieve higher predictive performance than any single model in the ensemble.
+
❌ Computationally Expensive: It requires training multiple base models plus a meta-learner, often with cross-validation, making it very time-consuming.
+
+
+
✅ Highly flexible and can combine any type of model (heterogeneous ensembles).
+
❌ Complex to Implement: The setup, especially the cross-validation process for the meta-learner's training data, is more complex than Bagging or Boosting.
+
+
+
✅ Can effectively learn to balance the strengths and weaknesses of different models.
+
❌ Loss of Interpretability: It's extremely difficult to explain *why* a stacking ensemble made a particular prediction, as it involves multiple layers of models.
+
+
+
+
+
🔹 6. Python Implementation (Sketch with `StackingClassifier`)
+
+
Scikit-learn makes it easy to build a stacking model. You define a list of your "specialist" base learners and then specify the final "project manager" meta-learner.
+
+
+
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.ensemble import StackingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import SVC
+from sklearn.neighbors import KNeighborsClassifier
+
+# --- 1. Get Data: 500 samples, 10 features (5 informative), 70/30 train/test split ---
+X, y = make_classification(n_samples=500, n_features=10, n_informative=5, random_state=42)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
+
+# --- 2. Define the Diverse Base Learners ("The Specialists") ---
+base_learners = [
+ ('decision_tree', DecisionTreeClassifier(max_depth=5, random_state=42)),
+ ('svm', SVC(probability=True, random_state=42)),
+ ('knn', KNeighborsClassifier(n_neighbors=7))
+]
+
+# --- 3. Define and Train the Stacking Ensemble ---
+# The meta-learner (final_estimator) is a simple Logistic Regression model
+stacking_clf = StackingClassifier(
+ estimators=base_learners,
+ final_estimator=LogisticRegression(),
+ cv=5 # 5-fold cross-validation produces out-of-fold predictions as meta-features
+)
+
+# fit() trains every base learner, then trains the meta-learner on their out-of-fold predictions
+stacking_clf.fit(X_train, y_train)
+
+# --- 4. Make Predictions on the held-out test set ---
+y_pred = stacking_clf.predict(X_test)
+
+from sklearn.metrics import accuracy_score
+print(f"Stacking Classifier Accuracy: {accuracy_score(y_test, y_pred):.2%}")
+
+
+
+
🔹 7. Applications
+
+
Kaggle Competitions: Stacking is an extremely popular technique among top competitors on platforms like Kaggle, as it can squeeze out the last few percentage points of accuracy needed to win.
+
Critical Systems: In fields like finance (credit scoring) and healthcare (disease diagnosis), combining the outputs of multiple models can lead to more robust and reliable decisions than trusting a single model.
+
Customer Churn Prediction: Combining models that capture different aspects of customer behavior (e.g., a model for usage patterns, a model for support ticket history) to get a more accurate prediction of which customers are likely to leave.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the role of the "meta-learner" in stacking?
+
Why is it important for the base learners in a stacking ensemble to be diverse?
+
What is the main technique used to prevent the meta-learner from overfitting, and why is it necessary?
+
What is the key difference between how Stacking and Bagging combine model predictions?
+
+
+
Answers
+
1. The meta-learner's role is to learn how to best combine the predictions from the base learners. It takes their predictions as input and makes the final prediction.
+
2. Diversity is crucial because if all base learners are similar and make the same mistakes, the meta-learner has no new information to learn from. Diverse models make different kinds of errors, and the meta-learner can learn to correct for them.
+
+3. Cross-validation is used to generate the training data for the meta-learner. It's necessary to prevent data leakage. If the meta-learner was trained on predictions made on the same data the base models were trained on, it would simply learn to trust the base model that overfit the most.
+
4. Bagging uses a simple aggregation method like averaging or majority voting. Stacking uses another machine learning model (the meta-learner) to learn a potentially complex way to combine the predictions.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Project Manager's Playbook
+
+
+
+ Meta-Learner:
+
+ What it is: The second-level model in a stacking ensemble that learns from the predictions of the first-level base learners.
+
+ Story Example: The project manager is the meta-learner. They don't analyze the raw sales data; they analyze the reports (predictions) from their specialist team.
+
+
+ Out-of-Fold Predictions:
+
+ What it is: In cross-validation, these are the predictions made on the validation fold (the part of the data the model was *not* trained on in that iteration).
+
+ Story Example: This is like the manager testing each specialist on a small, unique set of problems they haven't seen before to get an honest assessment of their skills. These "test results" are the out-of-fold predictions used to train the manager.
+
+
+ Heterogeneous Ensemble:
+
+ What it is: An ensemble that is made up of different types of models.
+
+ Story Example: The project manager's team is a heterogeneous ensemble because it includes a diverse group of specialists (statistician, data scientist, etc.), not just a team of 10 identical statisticians.
+
+
+
+
+
\ No newline at end of file
diff --git a/templates/Test/Quiz-test.html b/templates/Test/Quiz-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..248f7b6ae1bb4f4304653b6bda039a4bce1e0d81
--- /dev/null
+++ b/templates/Test/Quiz-test.html
@@ -0,0 +1,277 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+ Dynamic General Knowledge Quiz
+
+
+
+
+
+
+
+
+
+
+
General Knowledge Challenge
+
Test your knowledge with these quick-fire questions!
+
+
📜 Instructions
+
+
There are 0 questions in total.
+
You will have 15 seconds to answer each question.
+
You have a total of 3 attempts to take this quiz.
+
Once the quiz starts, do not switch tabs or windows.
+
Emergency exit: Esc, Space, A, M.
+
+
+
+
+
+
+
+
+
+ Question 1 of 0
+
+
+ Time: 15s
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Quiz Complete!
+
You scored 0 out of 0
+
+
+
+
+
+
+
+
+
+
+
+
🚫 Attempt Limit Reached
+
You have already taken the quiz 3 times.
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/Test-ensemble.html b/templates/Test/Test-ensemble.html
new file mode 100644
index 0000000000000000000000000000000000000000..338d14d0467c8e8ee83b0e44afb4c7fe2e3c1cb3
--- /dev/null
+++ b/templates/Test/Test-ensemble.html
@@ -0,0 +1,18 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
Ensemble Learning
+
Ensemble learning is a technique that combines multiple machine learning models to get a more robust and accurate prediction than a single model alone.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Test/Test-reinforcement.html b/templates/Test/Test-reinforcement.html
new file mode 100644
index 0000000000000000000000000000000000000000..44d7815c04f3acf114f50ca5694d4ce87bc5c248
--- /dev/null
+++ b/templates/Test/Test-reinforcement.html
@@ -0,0 +1,21 @@
+
+{% extends "Test-layout.html" %}
+
+
+
+{% block content %}
+
Reinforcement Learning
+
Reinforcement learning (RL) is a subfield of machine learning where an agent learns to make decisions by taking actions in an environment to maximize some cumulative reward.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Test/Test-semi_supervised.html b/templates/Test/Test-semi_supervised.html
new file mode 100644
index 0000000000000000000000000000000000000000..cbfe4f6c7be9cb66eeb81d58d8916e98bae88bfd
--- /dev/null
+++ b/templates/Test/Test-semi_supervised.html
@@ -0,0 +1,18 @@
+
+{% extends "Test-layout.html" %}
+
+{% block content %}
+
Semi-Supervised Learning
+
Semi-supervised learning is a combination of supervised and unsupervised learning. It uses a small amount of labeled data and a large amount of unlabeled data to train a model. This is particularly useful when it is expensive to label data.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Test/Test-supervise.html b/templates/Test/Test-supervise.html
new file mode 100644
index 0000000000000000000000000000000000000000..67d0ae4046865ed8949de3d4f98247f6a78752b4
--- /dev/null
+++ b/templates/Test/Test-supervise.html
@@ -0,0 +1,42 @@
+{% extends "Test-layout.html" %}
+
+
+
+{% block content %}
+
Supervised Learning
+
Supervised learning is a type of machine learning where the model is trained on labeled data. This means the training dataset includes both the input and the correct output, and the model learns to map the input to the output.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Test/Test-unsupervised.html b/templates/Test/Test-unsupervised.html
new file mode 100644
index 0000000000000000000000000000000000000000..002ca785205f48fe1f105680823ea1c653e97575
--- /dev/null
+++ b/templates/Test/Test-unsupervised.html
@@ -0,0 +1,39 @@
+{% extends "Test-layout.html" %}
+
+
+
+
+{% block content %}
+
Unsupervised Learning
+
Unsupervised learning is a type of machine learning that looks for previously undetected patterns in a dataset with no pre-existing labels and with a minimum of human supervision.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/Test/action-policy-test.html b/templates/Test/action-policy-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..a458b9a74ac7abf91a4bf8ebd20dcf2872055d63
--- /dev/null
+++ b/templates/Test/action-policy-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/agent-environment-state-test.html b/templates/Test/agent-environment-state-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..3c44088a99b8c9b793eebbce8a390d822185b252
--- /dev/null
+++ b/templates/Test/agent-environment-state-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/apriori-test.html b/templates/Test/apriori-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..b56c3e3792186e8b494820105d1e1322f1cf94bf
--- /dev/null
+++ b/templates/Test/apriori-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/bagging-test.html b/templates/Test/bagging-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..52a612a4a1c7282737e41b552c2d0ca0d2ae62ec
--- /dev/null
+++ b/templates/Test/bagging-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/boosting-test.html b/templates/Test/boosting-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..f7ec522997f4af391c88e027d0f854e2bb49dd1d
--- /dev/null
+++ b/templates/Test/boosting-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/dbscan-test.html b/templates/Test/dbscan-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..277c9ca5b6cb4ef410a295ecb8ac3de19ea63b73
--- /dev/null
+++ b/templates/Test/dbscan-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/decision-tree-regression-test.html b/templates/Test/decision-tree-regression-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..9ef6fe6c42c8595d4e869e50abea7d8418994fce
--- /dev/null
+++ b/templates/Test/decision-tree-regression-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/decision-trees-c-test.html b/templates/Test/decision-trees-c-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..160b3a5d1d88bc5771af52f346f18ad3009f4850
--- /dev/null
+++ b/templates/Test/decision-trees-c-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/deep-reinforcement-learning-test.html b/templates/Test/deep-reinforcement-learning-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..b19a163c7ef4773b6857ecc97d7f21afe29ac662
--- /dev/null
+++ b/templates/Test/deep-reinforcement-learning-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/eclat-test.html b/templates/Test/eclat-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..59d5bfdcfb2978ffe8c7e6cfa164858099d0dc8d
--- /dev/null
+++ b/templates/Test/eclat-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/generative-models-test.html b/templates/Test/generative-models-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..b2a131a9ea736e46a3b03078bd12a5a9147cd4c9
--- /dev/null
+++ b/templates/Test/generative-models-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/gmm-test.html b/templates/Test/gmm-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..28400f2dcdfdb6f3a6eac34e8d941b6f28a0f104
--- /dev/null
+++ b/templates/Test/gmm-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/gradient-boosting-test.html b/templates/Test/gradient-boosting-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..95aa1b82c34fc848af47ac1c1686c3baaebabd13
--- /dev/null
+++ b/templates/Test/gradient-boosting-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/gradient-descent-test.html b/templates/Test/gradient-descent-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..91b58a7ab2d87483db398dc9057ac32dc1141439
--- /dev/null
+++ b/templates/Test/gradient-descent-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/graph-based-methods-test.html b/templates/Test/graph-based-methods-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..ef907d5bc94dd18a9f758069c382d0ba9012d8d1
--- /dev/null
+++ b/templates/Test/graph-based-methods-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/hierarchical-clustering-test.html b/templates/Test/hierarchical-clustering-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..cd9a0dd2c55f351d6604e41a146371f2ac619957
--- /dev/null
+++ b/templates/Test/hierarchical-clustering-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/ica-test.html b/templates/Test/ica-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..f20557d26c274b3be3513ede277391c6d7383a2b
--- /dev/null
+++ b/templates/Test/ica-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/k-means-test.html b/templates/Test/k-means-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..9267bd34d9edc4524c5c4524132b6a7f28df2c22
--- /dev/null
+++ b/templates/Test/k-means-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/knn-test.html b/templates/Test/knn-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..72d184aac01b2fef49dbd4087db343d5b1c37897
--- /dev/null
+++ b/templates/Test/knn-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/lasso-regression-test.html b/templates/Test/lasso-regression-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..c9f6fca29cc6b94ac075ecd8acd5a2e7c3b4b040
--- /dev/null
+++ b/templates/Test/lasso-regression-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/lda-test.html b/templates/Test/lda-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..03ce0ae810b40cce7a68f3219c5de3a83fae27cf
--- /dev/null
+++ b/templates/Test/lda-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/lightgbm-test.html b/templates/Test/lightgbm-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..429486d04d5dcd1ef18ea654456ba935dc5a1085
--- /dev/null
+++ b/templates/Test/lightgbm-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/linear-Quiz-Overview-Page.html b/templates/Test/linear-Quiz-Overview-Page.html
new file mode 100644
index 0000000000000000000000000000000000000000..b22ecd75086457521dbb53b37016bff4d6cbd805
--- /dev/null
+++ b/templates/Test/linear-Quiz-Overview-Page.html
@@ -0,0 +1,113 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/logistic-regression-test.html b/templates/Test/logistic-regression-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..5dfb0c146fa9309345e5fa07898d080020394767
--- /dev/null
+++ b/templates/Test/logistic-regression-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/naive-bayes-test.html b/templates/Test/naive-bayes-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..f5b71cc82ef656eb8d326eecb8c26959be54705b
--- /dev/null
+++ b/templates/Test/naive-bayes-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/neural-networks-test.html b/templates/Test/neural-networks-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..b59b820ab18d675c3ba3c03257c3ddf717356bfd
--- /dev/null
+++ b/templates/Test/neural-networks-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/pca-test.html b/templates/Test/pca-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..892e3e07bdbc8a82e3aa357fac6d63f94023170a
--- /dev/null
+++ b/templates/Test/pca-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/polynomial-Quiz.html b/templates/Test/polynomial-Quiz.html
new file mode 100644
index 0000000000000000000000000000000000000000..5361707af9038195a01f3855fb24ab13cdf0ded3
--- /dev/null
+++ b/templates/Test/polynomial-Quiz.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/q-learning-test.html b/templates/Test/q-learning-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..7e124eb90e62e4ab2b4a3330a8fb230e29c528ae
--- /dev/null
+++ b/templates/Test/q-learning-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/random-forest-c-test.html b/templates/Test/random-forest-c-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..95a94687333011b333001f295d2882eaceecfc37
--- /dev/null
+++ b/templates/Test/random-forest-c-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/random-forest-regression-test.html b/templates/Test/random-forest-regression-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..d43d72b0464f2d6b8765935794b990baffef399b
--- /dev/null
+++ b/templates/Test/random-forest-regression-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/reward-value-function-test.html b/templates/Test/reward-value-function-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..bdac613f1226906b6866df2a2b4543b3ead63d52
--- /dev/null
+++ b/templates/Test/reward-value-function-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/ridge-regression-test.html b/templates/Test/ridge-regression-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..a193a4933659f4485979991f5e9c21741ff22790
--- /dev/null
+++ b/templates/Test/ridge-regression-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/self-training-test.html b/templates/Test/self-training-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..fa12708879b6270cda5c94f1806af841467f6a46
--- /dev/null
+++ b/templates/Test/self-training-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/stacking-test.html b/templates/Test/stacking-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..50c262a602373d629ad223183a0c0a16609acfc6
--- /dev/null
+++ b/templates/Test/stacking-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/svm-c-test.html b/templates/Test/svm-c-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..26afaa747d147ef10b9db8486a857e4d8f638515
--- /dev/null
+++ b/templates/Test/svm-c-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/svr-r-test.html b/templates/Test/svr-r-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..f8489162c02fb03f93e414fd7127aa5db2fe9621
--- /dev/null
+++ b/templates/Test/svr-r-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/transductive-svm-test.html b/templates/Test/transductive-svm-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..9baaede7570b82f72ce156e5940bcd6f57c3c868
--- /dev/null
+++ b/templates/Test/transductive-svm-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/tsne-test.html b/templates/Test/tsne-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..9e9706e1b86a7097025dc84982692fc600f7fcdb
--- /dev/null
+++ b/templates/Test/tsne-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/voting-test.html b/templates/Test/voting-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..cb8d227798209df32220e00e141eea4777007869
--- /dev/null
+++ b/templates/Test/voting-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Test/xgboost-regression-test.html b/templates/Test/xgboost-regression-test.html
new file mode 100644
index 0000000000000000000000000000000000000000..25089944c32c9342249a461818c8df7331b92cbd
--- /dev/null
+++ b/templates/Test/xgboost-regression-test.html
@@ -0,0 +1,114 @@
+{% extends "Test-layout.html" %}
+
+
+{% block content %}
+
+
+
+
+
+ Quiz Selection
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Choose Your Quiz
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/Transductive-SVM.HTML b/templates/Transductive-SVM.HTML
new file mode 100644
index 0000000000000000000000000000000000000000..c83e29990f5933e463ad8e7cef990af8dad15ee0
--- /dev/null
+++ b/templates/Transductive-SVM.HTML
@@ -0,0 +1,384 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: Transductive SVM (TSVM)
+
+
+
+
+
+
+
+
+
🛣️ Study Guide: Transductive Support Vector Machines (TSVM)
+
+
🔹 Introduction
+
+
Story-style intuition: The Expert Path-Finder
+
Imagine a standard Support Vector Machine (SVM) is a novice pathfinder. To learn a general rule for navigating any forest, they are given a training manual with a few examples of "safe" plants (blue flowers) and "dangerous" plants (red thorns) (labeled data). From this, they create a simple rule: draw a straight line halfway between the known blue and red plants. This is Inductive Learning—creating a general rule for all future forests.
+
Now, imagine an expert pathfinder using a Transductive SVM (TSVM). They are given a map of a *specific* forest they must navigate. This map has the same few labeled blue and red plants, but it also shows the location of thousands of other unlabeled plants. The expert notices that these unlabeled plants form two distinct groves with a large, empty clearing between them. Instead of just drawing a line based on the two labeled plants, they adjust their path to go straight through the middle of the empty clearing. They are using the structure of the unlabeled landscape to find the safest, most confident path for *this specific forest*. This is Transductive Learning.
+
+
Transductive Support Vector Machine (TSVM) is a semi-supervised learning algorithm that extends the standard SVM. It is designed for situations where you have a small amount of labeled data and a large amount of unlabeled data. Instead of learning a general function for unseen data, it tries to find the best possible labels for the specific unlabeled data it was given during training.
+
+
🔹 Core Concepts
+
The motivation behind TSVM is simple: why ignore a mountain of free information? A standard SVM trained on two labeled points has no idea about the true underlying structure of the data. TSVM operates on the powerful assumption that the unlabeled points are not random; they provide crucial clues about where the real decision boundary should lie.
+
+
Example: The Power of Unlabeled Data
+ [Image showing SVM vs. TSVM decision boundary]
+
1. The SVM Scenario (Inductive): Imagine you have one labeled blue point at (-2, 0) and one labeled red point at (2, 0). A standard SVM would draw a vertical line at x=0 right between them. This seems reasonable.
+
2. The TSVM Scenario (Transductive): Now, imagine you add 100 unlabeled points. You notice they form a tight cluster centered at (-4, 0) and another tight cluster at (4, 0). The original SVM line at x=0 now seems less optimal. The TSVM sees these two unlabeled clusters and adjusts its boundary to pass through the large empty space between them. The new boundary is still at x=0, but the model is now far more confident in this boundary because it is supported by the structure of the unlabeled data.
+
+
The core idea is to find a hyperplane that not only separates the labeled data but also maximizes the margin with respect to the unlabeled data, fundamentally trying to avoid cutting through dense clusters of points.
+
+
🔹 Mathematical Formulation
+
+
The pathfinder's rulebook has two parts. The first part is for the known spots, and the second is a new, complex chapter for the unknown terrain.
+
+
Standard SVM's Rule: "Find a path (hyperplane) that is as simple as possible ($$\min \frac{1}{2} ||w||^2$$) while correctly classifying all known safe/dangerous spots, with a penalty for any mistakes ($$C \sum \xi_i$$)."
+
TSVM's Added Rule: "While following the first rule, also try to assign 'safe' or 'dangerous' labels to all the unknown spots in a way that makes your final path have the widest possible safe zone (margin) overall."
+
+
This second rule makes the problem much harder, because the pathfinder has to guess the labels and find the best path at the same time.
+
+
+
Standard SVM Optimization: The goal is to find a hyperplane (defined by w and b) that minimizes the model's complexity while correctly classifying the labeled points.
+
$$\min \frac{1}{2} ||w||^2 + C \sum \xi_i$$
+
+
TSVM Optimization: The TSVM adds the unlabeled points to this problem. It tries to assign a temporary label (\( y^* \)) to each unlabeled point and then solve the SVM problem. The challenge is to find the set of pseudo-labels \( y^* \) and the hyperplane (w, b) that together result in the maximum possible margin. This turns the problem into a difficult non-convex optimization problem, which is hard to solve perfectly.
+
+
+
🔹 Workflow
+
Because the exact TSVM optimization is hard, in practice, it's often solved with an iterative algorithm that looks very similar to self-training:
+
+
Train an initial SVM on the small labeled dataset. This gives a starting "guess" for the path.
+
Label the unlabeled data using this initial model. These are the first pseudo-labels.
+
Iterative Refinement:
+
+
Add all the pseudo-labeled data to the training set.
+
Retrain the SVM on this much larger combined dataset. The path is now influenced by the unlabeled points.
+
(Advanced Step) The algorithm might check if swapping the labels of two opposing pseudo-labeled points near the boundary could lead to an even better margin. It keeps swapping until no more improvements can be found.
+
+
+
Repeat until the labels on the unlabeled data stop changing or a stopping criterion is met. The path has now settled into its optimal position based on all available information.
+
+
+
🔹 Key Assumptions of TSVM
+
TSVM's success hinges on the same core assumptions as most semi-supervised learning methods:
+
+
Low-Density Separation: The best decision boundary is likely to pass through a region with few data points. In practice: This means TSVM works best when your data naturally forms clusters with some empty space between them.
+
Data Distribution Match: The unlabeled data should come from the same underlying distribution as the labeled data. In practice: The unlabeled plants on the map must be the same types of plants as the labeled ones; you can't have unlabeled palm trees if your labeled data is only pines and oaks.
+
+
+
🔹 Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Can significantly improve the decision boundary and performance when labeled data is scarce. Example: A spam filter trained on only 50 labeled emails might be 70% accurate. By using 5,000 unlabeled emails, a TSVM could potentially boost accuracy to 95%.
+
❌ The optimization problem is non-convex, meaning it's hard to find the globally optimal solution and can be computationally very expensive. Example: Finding the best path might take hours or days for a very large, complex forest map, and you might still end up in a "good" valley instead of the "best" one.
+
+
+
✅ Effectively leverages the structure of unlabeled data to find a better margin. Example: It doesn't just separate two patients; it draws the diagnostic line in the empty space between the entire "healthy" and "sick" populations shown in the unlabeled data.
+
❌ Error Propagation: If the model confidently assigns wrong pseudo-labels early on, these errors can corrupt the training process. Example: If the pathfinder initially mislabels a patch of dangerous thorns as "safe," it will actively try to draw its path closer to them, making the final path more dangerous.
+
+
+
+
+
🔹 Applications
+
TSVM is most useful in fields where unlabeled data is plentiful but getting labels is a bottleneck:
+
+
Text Classification: Imagine you want to classify legal documents. Having lawyers label thousands of documents is extremely expensive. With TSVM, you can have them label a few hundred, and then use a database of millions of unlabeled documents to build a highly accurate classifier.
+
Bioinformatics: Classifying protein functions or gene expression data where lab experiments (labels) are expensive and time-consuming.
+
+
+
🔹 Python Implementation (Conceptual Sketch)
+
+
True TSVMs are not included in `scikit-learn` because they are computationally complex. However, we can approximate the behavior of a TSVM using the `SelfTrainingClassifier` with an SVM as its base. This wrapper effectively performs the iterative self-labeling workflow described above, which is a common and practical way to implement the core idea of transductive learning.
+
+
+import numpy as np
+from sklearn.datasets import make_classification
+from sklearn.model_selection import train_test_split
+from sklearn.svm import SVC
+from sklearn.semi_supervised import SelfTrainingClassifier
+from sklearn.metrics import accuracy_score
+
+# --- 1. Create a Sample Dataset ---
+# We simulate a scenario with 500 total data points.
+X, y = make_classification(n_samples=500, n_features=10, n_informative=5, random_state=42)
+
+# --- 2. Create a small labeled set and a large unlabeled set ---
+# This is a realistic scenario: we only have 50 labeled samples (10%).
+# The other 450 samples are our "unlabeled" pool.
+X_train, X_unlabeled, y_train, y_true_unlabeled = train_test_split(X, y, test_size=0.9, random_state=42)
+
+# To simulate the semi-supervised setting, we "hide" the labels of the unlabeled pool.
+# scikit-learn uses -1 to denote an unlabeled sample.
+y_unlabeled_masked = np.full_like(y_true_unlabeled, -1)
+X_combined = np.concatenate((X_train, X_unlabeled))
+y_combined = np.concatenate((y_train, y_unlabeled_masked))
+
+# --- 3. Train a Standard Inductive SVM (Baseline) ---
+# This model only learns from the 50 labeled samples.
+inductive_svm = SVC(probability=True, random_state=42)
+inductive_svm.fit(X_train, y_train)
+y_pred_inductive = inductive_svm.predict(X_unlabeled)
+print(f"Baseline Inductive SVM Accuracy (trained on only {len(X_train)} samples): {accuracy_score(y_true_unlabeled, y_pred_inductive):.2%}")
+
+# --- 4. Train a TSVM approximation using SelfTrainingClassifier ---
+# This wrapper will take our base SVM and perform the iterative self-labeling process.
+base_svm = SVC(probability=True, random_state=42)
+# The threshold determines how confident the model must be to create a "pseudo-label".
+tsvm_approx = SelfTrainingClassifier(base_svm, threshold=0.9)
+
+# We train the model on the combined set of labeled and unlabeled data.
+tsvm_approx.fit(X_combined, y_combined)
+
+# --- 5. Evaluate the Transductive Model ---
+# We test its performance on the same set of unlabeled data.
+y_pred_transductive = tsvm_approx.predict(X_unlabeled)
+print(f"TSVM (Approximation) Accuracy (trained with unlabeled data): {accuracy_score(y_true_unlabeled, y_pred_transductive):.2%}")
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the main difference between Inductive and Transductive learning?
+
What information does a TSVM use that a standard SVM does not?
+
Why is the TSVM optimization problem considered "non-convex"?
+
What is the biggest risk when using a TSVM or any self-training based method?
+
+
+
Answers
+
1.Inductive learning aims to learn a general rule from training data that can be applied to any future unseen data. Transductive learning aims to find the optimal labels for the specific unlabeled data points it is given during training; it doesn't create a general rule.
+
2. A TSVM uses the feature information from the large set of unlabeled data to help find a better decision boundary. A standard SVM ignores this and only uses the labeled data.
+
3. It is non-convex because it involves assigning discrete labels to the unlabeled points. The process of searching for the best combination of labels and the best hyperplane at the same time creates a complex optimization landscape with many local minima, making it hard to find the single best solution.
+
4. The biggest risk is error propagation. If the model confidently assigns incorrect pseudo-labels to the unlabeled data, these errors are baked into the next training iteration, potentially corrupting the model and making the final decision boundary worse.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Expert Path-Finder's Map
+
+
+
+ Inductive Learning:
+
+ What it is: The most common form of machine learning, where the goal is to generalize from a training set to make predictions on future, unseen data.
+
+ Story Example: The pathfinder learns general "rules of thumb" for any forest (e.g., "avoid swampy areas," "stay on high ground"). This is an inductive approach.
+
+
+ Transductive Learning:
+
+ What it is: A learning setting where the model has access to the test data (without labels) during training and tries to optimize its predictions for that specific test set.
+
+ Story Example: The pathfinder is given the exact map of the specific forest they need to cross. They use all the features of this map to find the best path. This is a transductive approach.
+
+
+ Margin (in SVMs):
+
+ What it is: The "buffer zone" or empty space between the decision boundary (the hyperplane) and the closest data points from either class. SVMs aim to maximize this margin.
+
+ Story Example: This is the width of the clear path the pathfinder creates. A wider path (larger margin) is safer and represents a more confident decision boundary.
+
+
+ Non-Convex Optimization:
+
+ What it is: A type of optimization problem where the solution landscape can have multiple "valleys" (local optima). Finding the absolute lowest valley (the global optimum) is not guaranteed.
+
+ Story Example: Imagine trying to find the lowest point in a hilly mountain range. It's easy to walk downhill into a valley, but it's hard to know if you're in the lowest valley in the entire range or just a smaller, local one. This is a non-convex problem.
+
Imagine a talent show with a panel of three judges. Each judge (a base model) has a different background: one is an expert in singing, one in dancing, and one in comedy. After a performance, each judge gives their vote for whether the contestant should pass.
+ • Hard Voting: The final decision is based on a simple majority. If two out of three judges vote "Pass," the contestant passes. This is a democratic vote where every judge has an equal say.
+ • Soft Voting: Instead of a simple "yes" or "no," each judge provides a confidence score (e.g., "I'm 90% confident they should pass"). The final decision is based on the average confidence score across all judges. This method is often better because it accounts for the *certainty* of each judge's vote.
+ A Voting Ensemble is this panel of judges, combining their diverse opinions to make a final decision that is often more robust and accurate than any single judge's opinion.
+
+
A Voting Ensemble is one of the simplest and most effective ensemble learning techniques. It works by training multiple different models on the same data and combining their predictions to generate a final output. Unlike Stacking, it does not use a meta-learner; instead, it relies on simple statistical methods like majority vote or averaging.
+
+
🔹 2. How Voting Works
+
The process is straightforward and can be run in parallel.
+
+
+
Train Diverse Base Models: Train several different machine learning models (e.g., a Logistic Regression, a Decision Tree, and a KNN) independently on the entire training dataset.
+
Make Predictions: For a new data point, get a prediction from each of the trained models.
+
Aggregate the Predictions: Combine the predictions using a voting rule.
+
+
Hard Voting (for Classification): The final prediction is the class label that was predicted most frequently by the base models.
+
Soft Voting (for Classification): The final prediction is the class label with the highest average predicted probability. This requires that the base models can output class probabilities.
+
Averaging (for Regression): The final prediction is simply the average of the predictions from all the base models.
+
+
+
+
+
🔹 3. Key Points
+
+
Simplicity: It's one of the easiest ensemble methods to implement and understand.
+
Model Diversity is Crucial: Voting works best when the base models are diverse and make different types of errors. Combining three identical models provides no benefit.
+
Parallelizable: Since all base models are trained independently, the training process can be fully parallelized, making it computationally efficient.
+
+
+
🔹 4. Advantages & Disadvantages
+
+
+
+
Advantages
+
Disadvantages
+
+
+
+
+
✅ Very easy to implement and interpret.
+
❌ Often less powerful than more advanced ensembles like Boosting or Stacking.
+
+
+
✅ Can improve predictive accuracy and create a more robust model.
+
❌ It doesn't have a mechanism to explicitly correct the errors of its base models.
+
+
+
✅ Allows for the combination of different types of models (heterogeneous ensemble).
+
❌ The performance is highly dependent on the quality and diversity of the base models.
+
+
+
+
+
🔹 5. Python Implementation (Sketches)
+
+
Scikit-learn provides convenient `VotingClassifier` and `VotingRegressor` classes that make building a voting ensemble very simple. You just need to provide a list of the models you want to include in the panel.
+
+
+
Voting Classifier Example
+
+from sklearn.ensemble import VotingClassifier
+from sklearn.linear_model import LogisticRegression
+from sklearn.tree import DecisionTreeClassifier
+from sklearn.svm import SVC
+# Assume X_train, y_train, X_test are defined
+
+# 1. Define the panel of judges (base models)
+estimators = [
+ ('lr', LogisticRegression(random_state=42)),
+ ('dt', DecisionTreeClassifier(random_state=42)),
+ ('svc', SVC(probability=True, random_state=42)) # probability=True is needed for soft voting
+]
+
+# 2. Create the Voting Ensemble
+# 'soft' voting uses predicted probabilities and is often better
+voting_clf = VotingClassifier(estimators=estimators, voting='soft')
+
+# 3. Train and predict
+voting_clf.fit(X_train, y_train)
+y_pred = voting_clf.predict(X_test)
+
+
+
Voting Regressor Example
+
+from sklearn.ensemble import VotingRegressor
+from sklearn.linear_model import LinearRegression
+from sklearn.ensemble import RandomForestRegressor
+from sklearn.svm import SVR
+# Assume X_train, y_train, X_test are defined
+
+# 1. Define the panel of regressors
+regressors = [
+ ('lr', LinearRegression()),
+ ('rf', RandomForestRegressor(random_state=42)),
+ ('svr', SVR())
+]
+
+# 2. Create the Voting Ensemble (averages the predictions)
+voting_reg = VotingRegressor(estimators=regressors)
+
+# 3. Train and predict
+voting_reg.fit(X_train, y_train)
+y_pred_reg = voting_reg.predict(X_test)
+
+
+
+
🔹 6. Applications
+
+
Quick Ensemble Baseline: It's an excellent way to quickly build a baseline ensemble model to see if combining models is likely to improve performance on a given problem.
+
Production Models: Due to its simplicity and robustness, a voting ensemble of a few strong, diverse models is often a good candidate for a reliable production system.
+
Used across many domains, including fraud detection, medical diagnosis, and customer churn prediction, just like other ensemble methods.
+
+
+
+
📝 Quick Quiz: Test Your Knowledge
+
+
What is the difference between Hard Voting and Soft Voting? Which one is usually preferred and why?
+
Does a Voting Ensemble learn from the mistakes of its base models?
+
You create a Voting Classifier with three identical, perfectly-trained Decision Tree models. Will this ensemble perform better than a single Decision Tree?
+
+
+
Answers
+
1. Hard Voting uses a simple majority vote of the predicted class labels. Soft Voting averages the predicted probabilities for each class and chooses the class with the highest average probability. Soft voting is usually preferred because it accounts for how confident each model is in its prediction.
+
2. No, it does not. A Voting Ensemble trains its models independently and combines their outputs with a fixed rule (voting/averaging). It does not have a mechanism to sequentially correct errors like Boosting does.
+
3. No, it will perform exactly the same. Since all three models are identical, they will always produce the same output, and the majority vote will always be the same as the single model's prediction. Diversity is essential for a voting ensemble to be effective.
+
+
+
+
🔹 Key Terminology Explained
+
+
The Story: Decoding the Judge's Scorecard
+
+
+
+ Hard Voting:
+
+ What it is: A simple majority vote. The class with the most votes wins.
+
+ Story Example: Three judges vote. Judge 1: "Pass". Judge 2: "Fail". Judge 3: "Pass". The majority is "Pass" (2 out of 3), so the contestant passes.
+
+
+ Soft Voting:
+
+ What it is: A weighted vote based on predicted probabilities.
+
+ Story Example: Three judges give confidence scores. Judge 1: "90% Pass". Judge 2: "70% Fail". Judge 3: "60% Pass". The average probability for "Pass" is (0.9 + (1-0.7) + 0.6) / 3 = 0.6. The average for "Fail" is 0.4. Since 0.6 > 0.4, the contestant passes. This method captured the uncertainty of Judge 2.
+
If Gradient Boosting (GBR) is a careful craftsman, XGBoost is that same craftsman given a supercharged toolkit, a blueprint for efficiency, and a strict rulebook to prevent mistakes. The toolkit lets them work faster (parallelization), and the rulebook forces them to build simpler, more robust creations (regularization), making them a champion in their field.
+
+
What is XGBoost?
+
+ XGBoost (Extreme Gradient Boosting) is an advanced, optimized implementation of the gradient boosting framework. It's designed for speed, performance, and accuracy.
+
+
+
Difference between XGBoost vs. GBR vs. Random Forest:
+
+
Random Forest: A team of independent experts working in parallel and averaging their results. Democratic and robust.
+
Gradient Boosting: A line of experts, each one correcting the mistakes of the one before. A sequential learning process.
+
XGBoost: Like GBR, but each expert uses more advanced math (2nd-order gradients) to find mistakes and is penalized for being too complex (regularization). The whole process is also highly optimized to run faster.
+
+
+
🔹 Mathematical Foundation
+
+
Story example: The Chef's Two Goals
+
A standard chef wants to make a tasty dish (minimize loss). An XGBoost chef has two goals: make the dish tasty AND keep the recipe simple and clean (minimize complexity via regularization). Their objective is to find the perfect balance. They use advanced taste-testing techniques (2nd-order Taylor expansion) to figure out not just if the dish is too salty, but how quickly it's becoming too salty, allowing for more precise corrections.
+
+
The Objective Function:
+
XGBoost aims to minimize a more advanced objective function, \( \text{Obj} = \sum_{i} l(y_i, \hat{y}_i) + \sum_{k} \Omega(f_k) \), which combines two parts:
\( \sum l(y_i, \hat{y}_i) \): The Loss Function. Measures how far the model's predictions (\(\hat{y}_i\)) are from the actual values (\(y_i\)). This is the "tasty" part.
+
\( \sum \Omega(f_k) \): The Regularization Term. A penalty for the complexity of the trees (\(f_k\)). This is the "simple recipe" part.
+
+
XGBoost uses a 2nd-order Taylor expansion to approximate the loss, which allows it to use both first-order (gradient) and second-order (Hessian) derivatives for more accurate optimization.
+
+
🔹 Algorithm Steps
+
+
Story example: The Hiker with a Curvature Sensor
+
A standard Gradient Boosting hiker only knows the slope (gradient) under their feet. The XGBoost hiker has an advanced sensor that also measures the curvature of the ground (the Hessian). This tells them not just which way is down, but whether the path is flattening out or getting steeper. This extra information allows them to take much smarter, more direct steps towards the valley floor.
+
+
+
Start with an initial prediction (usually the mean of the target values).
+
For each boosting round:
+
+
Compute the gradients (1st derivative) and Hessians (2nd derivative) of the loss function.
+
Build a new tree that best predicts these gradients, using the Hessians as weights.
+
Apply regularization (L1/L2) to the tree to penalize complexity and prune weak branches.
+
Update the overall model by adding this new, regularized tree, scaled by a learning rate.
+
+
+
+
🔹 Key Features of XGBoost
+
+
Story: The Ultimate Multi-Tool
+
XGBoost isn't just one tool; it's a complete toolkit. It has a special attachment for dealing with missing pieces (handles missing values), it can be used by a whole team at once (parallel computing), and it has an automatic shut-off switch to prevent overheating (early stopping).
Parallel Processing: Optimizes computations to run in parallel, making it much faster than standard GBR.
+
Handles Missing Values: Can automatically learn the best way to handle missing data points.
+
Built-in Cross-Validation: Has a `cv` function to easily perform cross-validation.
+
Tree Pruning: Prunes trees based on a `gamma` parameter, not just `max_depth`.
+
+
+
🔹 Key Parameters
+
+
+
+
Parameter
+
Explanation & Story
+
+
+
+
+
n_estimators
+
The number of boosting rounds or trees. Story: How many corrective experts you hire for your team.
+
+
+
learning_rate (eta)
+
Shrinks the contribution of each tree. Story: How much you trust each new expert's advice. A small value means you only make small adjustments based on their feedback.
+
+
+
max_depth
+
Maximum depth of a tree. Story: The maximum complexity allowed for any single expert's reasoning.
+
+
+
subsample & colsample_bytree
+
Fraction of data (rows) and features (columns) used per tree. Story: Giving each expert a slightly different, random piece of the problem to work on so they don't all have the same blind spots.
+
+
+
reg_alpha (L1) & reg_lambda (L2)
+
Regularization terms. Story: A penalty system. `reg_alpha` penalizes using too many features, while `reg_lambda` penalizes using any single feature too heavily.
+
+
+
+
+
🔹 Strengths & Weaknesses
+
+
XGBoost is like a Formula 1 car. It's incredibly fast and powerful (high accuracy, fast), and it's packed with safety features (regularization). However, it requires a skilled driver to tune all the settings (many hyperparameters) and might be too much car for a simple trip to the grocery store (overkill for small datasets).
+
+
Advantages:
+
+
✅ Extremely accurate and often wins machine learning competitions.
+
✅ Fast due to parallel and distributed computing.
+
✅ Prevents overfitting effectively with built-in regularization.
+
✅ Handles sparse and missing data automatically.
+
+
Disadvantages:
+
+
❌ Can be more complex to tune due to a large number of hyperparameters.
+
❌ May not be the fastest option for smaller, simpler datasets.
+
+
+
🔹 Python Implementation
+
+
Here, we import our "master craftsman" from the `xgboost` library. We give it data and define the rules for its toolkit (`n_estimators`, `learning_rate`). We train it with an important safety check: `early_stopping_rounds`, which tells it to stop working if its performance on a test set doesn't improve after a certain number of rounds.
Let's demystify the advanced tools and rules our XGBoost craftsman uses to build superior models.
+
+
Hessian (Second-Order Derivative)
+
+ What it is: In optimization, the Hessian measures the curvature of the loss function. While the gradient (1st derivative) points downhill, the Hessian tells you if that downhill path is a gentle slope or a steep drop-off.
+
+
+ Story Example: A self-driving car uses the gradient to know which way the road is turning. It uses the Hessian to know how sharp that turn is. This allows it to slow down more for a hairpin bend than for a gentle curve, leading to a much smoother and more direct path.
+
+
+
Regularization
+
+ What it is: A technique used to prevent overfitting by adding a penalty for model complexity to the loss function. It forces the model to be simpler and generalize better.
+
+
+ Story Example: A writer is paid to write an article. Without regularization, they might write a 50-page, overly detailed article that is hard to read. With regularization, there's a penalty for every extra word they use. This forces them to be concise and clear, producing a better article that is useful to more people.
+
+
+
L1 (Lasso) and L2 (Ridge) Regularization
+
+ What they are: Two different types of regularization penalties.
+
+
+
L1 (Lasso): Penalizes the absolute value of the weights. It can shrink unimportant feature weights all the way to zero, effectively performing feature selection.
+
L2 (Ridge): Penalizes the squared value of the weights. It makes weights smaller but doesn't usually shrink them to zero.
+
+
+ Story Example: Imagine packing for a trip. L1 Regularization is like a rule that says "for every item you pack, you pay a fee." This forces you to be ruthless and leave behind anything non-essential (shrinking weights to zero). L2 Regularization is a rule that says "the fee is based on the total weight of your suitcase." This encourages you to pack lighter items but doesn't force you to leave entire categories of items behind.
+
+
+
Tree Pruning
+
+ What it is: The process of removing sections of a decision tree (branches) that provide little predictive power, in order to reduce complexity and prevent overfitting.
+
+
+ Story Example: A gardener grows a rose bush to get the best flowers. After it grows, they perform pruning by cutting off small, weak branches that aren't producing good buds. This directs the plant's energy to the stronger branches, resulting in bigger, healthier roses. XGBoost prunes its trees in a similar way to focus on the most important predictive rules.
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/api.html b/templates/api.html
new file mode 100644
index 0000000000000000000000000000000000000000..7baaaa44a7622532f36b961d7b2599a74744148e
--- /dev/null
+++ b/templates/api.html
@@ -0,0 +1,21 @@
+
diff --git a/templates/auth/forgot_password.html b/templates/auth/forgot_password.html
new file mode 100644
index 0000000000000000000000000000000000000000..c011a7b008090ca3b132871b53fa561f5c73be77
--- /dev/null
+++ b/templates/auth/forgot_password.html
@@ -0,0 +1,32 @@
+
+
+
+
+ Forgot Password
+
+
+
+
+
+ Explore K-Means, an unsupervised learning algorithm that partitions data into K distinct clusters. For example, an online store uses K-Means to group customers based on purchase frequency and spending, creating segments like Budget Shoppers, Frequent Buyers, and Big Spenders for personalized marketing.
+
+ We are given a data set of items with certain features and values for these features like a vector. The task is to categorize those items into groups. To achieve this we will use the K-means algorithm. 'K' in the name of the algorithm represents the number of groups/clusters we want to classify our items into.
+
+
+ The algorithm works by first randomly picking some central points called centroids and each data point is then assigned to the closest centroid forming a cluster. After all the points are assigned to a cluster, the centroids are updated by finding the average position of the points in each cluster. This process repeats until the centroids stop changing, forming clusters. The goal of clustering is to divide the data points into clusters so that similar data points belong to the same group.
+
+
+
+
+
+
Flow of Data Treated by K-Means Algorithm
+
+
+ ✨
1. Initialize Centroids
+
Randomly pick K points
+
+
→
+
+ 📍
2. Assign Points to Clusters
+
To closest centroid
+
+
→
+
+ 🔄
3. Update Centroids
+
Calculate new means
+
+
↓
+
→
+
+ ✅
4. Convergence Check
+
Centroids stabilized?
+
+
+
+ The K-Means algorithm iteratively refines clusters. It starts by randomly selecting initial centroids. Then, each data point is assigned to the closest centroid, forming preliminary clusters. Next, the centroids are updated to the mean position of all points within their assigned clusters. This assignment and update process repeats until the centroids no longer change significantly, indicating that the clusters have converged.
+
+
+
+
+
+
Understanding K-Means Clustering
+
+ K-Means is an unsupervised learning algorithm that aims to partition $$n$$ observations into $$k$$ clusters in which each observation belongs to the cluster with the nearest mean (centroid), serving as a prototype of the cluster.
+
+
How K-Means Algorithm Works:
+
+
+ 1. Initialization: Randomly select $$K$$ data points from the dataset as initial centroids.
+
+
+ 2. Assignment Step: Assign each data point to the cluster whose centroid is closest to it (based on Euclidean distance).
+
+
+ 3. Update Step: Recalculate the centroids by taking the mean of all data points assigned to that cluster.
+
+
+ 4. Iteration: Repeat steps 2 and 3 until the centroids no longer move significantly or a maximum number of iterations is reached.
+
+
+
+ The objective of K-Means is to minimize the sum of squared distances between data points and their assigned cluster's centroid, also known as the within-cluster sum of squares (WCSS).
+
+ Centroid: The mean position of all data points in a cluster.
+
+
+ K: The number of clusters to form. This is a hyperparameter that must be chosen before running the algorithm.
+
+
+ Voronoi Diagram: The partitioning of the plane into regions based on distance to points in a specific subset of the plane. In K-Means, these regions represent the areas where new points would be assigned to a particular cluster.
+
+ A Decision Tree learns a series of if-then-else rules from your data, forming a tree structure. When new data comes in, it simply follows these rules down the tree to arrive at a classification.
+
+ A Decision Tree is a powerful, intuitive, and versatile supervised learning algorithm that models decisions in a tree-like structure. It provides a clear, flowchart-like representation of the choices and their potential outcomes, making it highly interpretable. By traversing its "branches," one can easily compare different paths and understand the reasoning behind a particular classification or prediction.
+
+
+
Types of Decision Trees:
+
+
+ Classification Trees: These are used when the target variable is categorical. For instance, classifying an email as 'spam' or 'not spam', or predicting if a customer will 'churn' or 'stay'. The tree partitions the data into regions, and each region is assigned a class label based on the majority class of data points falling into it.
+
+
+ Regression Trees: Employed when the target variable is continuous. Examples include predicting house prices, stock values, or a patient's recovery time. Instead of assigning categories, leaf nodes in regression trees hold a numerical value (e.g., the average of the target variable for data points in that region).
+
+
+
+
Key Components of a Decision Tree:
+
+
+ Root Node: The starting point of the tree, representing the entire dataset. It's the first decision node from which all other branches originate.
+
+
+ Decision Node (Internal Node): A node that represents a test on a specific feature (attribute). Based on the outcome of this test, the data is split into subsets, leading to new branches.
+
+
+ Branch: Represents the outcome of a decision node's test. It connects a parent node to a child node (either another decision node or a leaf node).
+
+
+ Leaf Node (Terminal Node): A node that does not split further. It represents the final decision or prediction (a class label for classification or a numerical value for regression).
+
+
+ Max Depth: A crucial hyperparameter that limits the maximum number of levels or splits from the root to the deepest leaf. It's a primary control for preventing overfitting.
+
+
+
+
+ Figure 1: A simplified representation of a Decision Tree's basic structure, showing a root node, branches, and leaf nodes.
+
+
+
+
+
How Decision Trees Work (The Learning Process):
+
+ The Decision Tree algorithm builds its structure by recursively partitioning the feature space into distinct, often rectangular, regions.
+
+
+
+ 1. Start at the Root: The entire training dataset begins at the root node. The tree considers all features to find the optimal initial split.
+
+
+ 2. Find the Best Split: At each node, the algorithm evaluates various possible splits for all available features. The goal is to find the split that best separates the data into purer subsets (meaning subsets where data points predominantly belong to a single class). This evaluation is based on a specific "splitting criterion." For 2D data, these splits result in axis-aligned (horizontal or vertical) lines.
+
+
+ 3. Branching: Based on the chosen best split, the data is divided into two (or more) subsets, and corresponding branches are created, leading to new child nodes.
+
+
+ 4. Continue Partitioning: Steps 2 and 3 are recursively applied to each new child node. This process continues until a stopping condition is met, such as:
+
+
All data points in a node belong to the same class.
+
The predefined `max_depth` limit is reached.
+
The number of data points in a node falls below a minimum threshold.
+
No further informative splits can be made.
+
+
+
+ 5. Form Leaf Nodes: Once a stopping condition is met for a particular branch, that node becomes a leaf node. It's then assigned the class label (for classification) or numerical value (for regression) that is most representative of the data points within that final region.
+
+
+
+ When a new, unlabeled data point needs classification, it traverses the tree from the root. At each decision node, it follows the path corresponding to its feature values, finally arriving at a leaf node which provides the model's prediction.
+
+
+
Splitting Criteria in Decision Trees:
+
+ The effectiveness of a Decision Tree heavily relies on its ability to find the best feature and split point at each node. This is determined by mathematical metrics called splitting criteria:
+
+
+
+ Gini Impurity: This criterion measures the probability of incorrectly classifying a randomly chosen element from the dataset if it were randomly labeled according to the distribution of labels in the subset. A Gini Impurity of 0 means the node is "pure" (all elements belong to the same class). Decision Trees aim to minimize Gini Impurity at each split.
+
+ $$ G = 1 - \sum_{i=1}^{C} (p_i)^2 $$
+ Where $p_i$ is the probability of an element belonging to class $i$, and $C$ is the total number of classes.
+
+
+
+ Information Gain (Entropy): Based on the concept of entropy from information theory, Information Gain measures the reduction in uncertainty or "randomness" after a split. The algorithm seeks splits that provide the maximum information gain.
+
+ $$ \text{Entropy}(S) = - \sum_{i=1}^{C} p_i \log_2(p_i) $$
+ $$ \text{Information Gain}(S, A) = \text{Entropy}(S) - \sum_{v \in Values(A)} \frac{|S_v|}{|S|} \text{Entropy}(S_v) $$
+ Where $S$ is the set of examples, $A$ is an attribute (feature), $Values(A)$ are the possible values for attribute $A$, $S_v$ is the subset of $S$ for which attribute $A$ has value $v$, and $p_i$ is the proportion of class $i$ in $S$.
+
+
+
+
+
Advantages of Decision Trees:
+
+
Interpretability: Easy to understand and visualize, often referred to as "white box" models.
+
Minimal Data Preprocessing: Can handle both numerical and categorical data, and often don't require feature scaling or normalization.
+
Versatility: Can be used for both classification and regression tasks.
+
Non-linear Relationships: Capable of capturing non-linear relationships between features and target.
+
+
+
Disadvantages and Challenges:
+
+
Overfitting: Can easily overfit noisy data, leading to trees that are too complex and don't generalize well. Overfitting occurs when a model learns the training data too well, including its noise and outliers, making it perform poorly on new, unseen data.
+
Instability: Small variations in the data can lead to a completely different tree structure.
+
Bias with Imbalanced Data: Can be biased towards dominant classes if the dataset is imbalanced.
+
Local Optima: The greedy approach of finding the best split at each step doesn't guarantee a globally optimal tree.
+
+
+
Mitigating Overfitting (Pruning):
+
+ To combat overfitting, various techniques are employed, most notably "pruning." Pruning involves removing branches that have little predictive power, simplifying the tree.
+
+
+
Pre-pruning (Early Stopping): Stopping the tree construction early based on thresholds like `max_depth`, `min_samples_leaf` (minimum number of samples required to be at a leaf node), or `min_impurity_decrease`.
+
Post-pruning: Growing the full tree first, then removing branches that provide little value using metrics like cross-validation error or statistical tests.
+
+
+
Ensemble Methods (Beyond Single Trees):
+
+ Despite their challenges, Decision Trees form the building blocks for more powerful algorithms, especially ensemble methods:
+
+
+
Random Forests: Builds multiple Decision Trees during training and outputs the class that is the mode of the classes (classification) or mean prediction (regression) of the individual trees. This reduces overfitting and improves accuracy.
+
Gradient Boosting (e.g., XGBoost, LightGBM): Builds trees sequentially, where each new tree tries to correct the errors of the previous ones. Highly powerful and widely used.
+
+
+ By understanding the fundamentals of Decision Trees, you gain a solid foundation for comprehending these more advanced and robust machine learning models.
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/decision_tree_game.html b/templates/decision_tree_game.html
new file mode 100644
index 0000000000000000000000000000000000000000..4f27401beb0faf9dd9c63d074ef2c7d8fcfca3a7
--- /dev/null
+++ b/templates/decision_tree_game.html
@@ -0,0 +1,768 @@
+{% extends "layout.html" %}
+{% block content %}
+
+
+
+
+ Clever Critter Classifier
+
+
+
+
+
+
+
🐾 Clever Critter Classifier! 🌿
+
+ Think of your favorite animal, answer wisely, and watch the tree reveal its identity!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Understanding Decision Trees
+
+ This game is a simple, fun way to explore how a **Decision Tree** algorithm works in Machine Learning. Imagine a flowchart that helps make decisions!
+
+
+
+ 🌳 Root Node (Top): This is your starting point, like the first big question everyone asks. In real Decision Trees, this node considers all possible data and picks the best question to split it.
+
+
+ 🌿 Internal Nodes (Middle Questions): These are where you make choices based on features (like "Does it have fur?"). Each answer leads you down a specific branch, getting you closer to a final decision.
+
+
+ 🌸 Leaf Nodes (Bottom - Classifications): You've reached the end of a path! These are the final answers or predictions. In machine learning, this could be classifying an email as "Spam" or predicting if a customer will "Churn."
+
+
+ 🔗 Paths (Decision Rules): The sequence of choices you make from the root to a leaf node creates a unique "rule." This rule can then be applied to new, unseen data to make a prediction!
+
+
+
+ Just like your choices guide you through this game, Decision Trees learn these question-and-answer paths from large datasets to classify new information.
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/desiciongame.html b/templates/desiciongame.html
new file mode 100644
index 0000000000000000000000000000000000000000..16ac17ea778b7f44f1fb79e92fa1cc5b1bb44b85
--- /dev/null
+++ b/templates/desiciongame.html
@@ -0,0 +1,702 @@
+
+
+
+
+
+ Decision Tree Regression Game
+
+
+
+
+
+
🎯 Decision Tree Regression Game
+
Visualize and interact with machine learning regression in real-time!
+
+
+
+
+
+
+
+
Data Points
+
0
+
+
+
Tree Depth
+
1
+
+
+
+
+
+
+
🎮 Game Controls
+
+
+
+
+
+
+
🌳 Tree Settings
+
+
+
3
+
+
+
+
📊 Data Points
+
+
No data points yet...
+
+
+
+
+
🔮 Make a Prediction
+
+
+
+
+
+
+
+
+
+
You can see that increasing the depth makes the model more non-linear, while decreasing the depth makes it more linear.
+
📖 How to Play
+
+
🎯 Click on the canvas to add data points
+
🔄 Use the slider to adjust tree depth (complexity)
+
🚀 Click "Train Model" to build the decision tree
+
🔮 Enter an X value to predict the corresponding Y
+
📊 Watch how the tree partitions the feature space
+
🎨 Different colors represent different tree nodes
+
+
+
+
+
+
+
+
+ Go back to decision tree regression
+
+
+
+
+
+
+
diff --git a/templates/dtr.html b/templates/dtr.html
new file mode 100644
index 0000000000000000000000000000000000000000..e73cae570c64db3953450d15e2ab04e5a39d730a
--- /dev/null
+++ b/templates/dtr.html
@@ -0,0 +1,601 @@
+{% extends "layout.html" %}
+{% block content %}
+
+
+
+
+
+Decision Tree Regression - Interactive Flow Visualization
+
+
+
+
+
+
+
Decision Tree Regression (DTR) Visualization
+
Explore how DTR predicts continuous values with interactive examples.
+
+
+
+
+
+
What is Decision Tree Regression?
+
It’s a smart algorithm that predicts numbers — like house prices or temperatures — by splitting data into smaller groups based on features. Imagine a tree where each branch asks a question, and leaves give the final prediction.
+
+
+
+
How Does It Work?
+
The tree splits data recursively, choosing the best points to divide so that each group is as similar as possible. It stops splitting when the groups are small or deep enough.
+
+
Variance Reduction: Splits aim to reduce differences within groups.
+
Mean Squared Error (MSE): The algorithm picks splits that minimize prediction errors.
+
+
+
+
+
Making Predictions
+
To predict a new value, the data point travels down the tree following the split rules until it reaches a leaf. The prediction is the average of all training points in that leaf.
+
+
+
+
Key Hyperparameters
+
+
max_depth: Limits tree height to avoid overfitting.
+
min_samples_split: Minimum data points to split a node.
+
min_samples_leaf: Minimum data points in a leaf.
+
+
+
+
+
Comparison with Other Models
+
+
Decision Tree vs. Linear Regression:
+ DTR can model non-linear relationships, whereas Linear Regression assumes a linear relationship.
+ DTR is generally more flexible but also more prone to overfitting.
+
+
Decision Tree vs. SVR (Support Vector Regression):
+ SVR is a powerful model that finds the best fit line (or hyperplane) while tolerating some error.
+ SVR can be very effective but is often more complex to tune than DTR.
+
+
Decision Tree vs. Random Forest:
+ Random Forest is an ensemble of Decision Trees. It builds multiple trees and averages their predictions.
+ This significantly reduces variance and improves stability, making it a much better and more common choice in practice than a single DTR.
+
+
+
+
+
+
Key Hyperparameters (Detailed)
+
+
+ max_depth: How deep the tree can grow.
+ 👉 Bigger depth = tree keeps splitting → very detailed → more overfitting.
+ 👉 Smaller depth = tree stops early → simpler → less overfitting.
+
+
+
+ min_samples_split: Minimum samples needed to split a node.
+ 👉 Smaller value (like 2) = splits happen easily → more overfitting.
+ 👉 Larger value (like 10) = splits happen only with many samples → less overfitting.
+ Example: If min_samples_split=2, even 2 points can split → tree memorizes tiny patterns.
+ If min_samples_split=10, need 10+ points to split → tree generalizes.
+
+
+
+ min_samples_leaf: Minimum samples in a leaf node.
+ 👉 Smaller value (like 1) = tiny leaves → more overfitting.
+ 👉 Larger value (like 5 or 10) = bigger leaves → less overfitting.
+ Example: If min_samples_leaf=1, each data point might get its own leaf.
+ If min_samples_leaf=10, each leaf must cover at least 10 points → tree generalizes.
+
+
+
+ max_features: Number of features considered at each split.
+ 👉 Smaller value = fewer features per split → adds randomness → can reduce overfitting (especially in Random Forests).
+ 👉 Larger value = considers all features → risk of overfitting.
+
+
+
+
✅ Summary Memory Trick:
+ Big numbers (min_samples_split ↑, min_samples_leaf ↑) + small max_depth ↓ → simpler tree → less overfitting.
+ Small numbers (min_samples_split ↓, min_samples_leaf ↓) + big max_depth ↑ → complex tree → more overfitting.
+
+
+
+
+
+
Why Use DTR?
+
+
Easy to understand and visualize.
+
Captures complex, non-linear relationships.
+
No need to scale features.
+
+
+
+
+
Limitations
+
+
Can overfit if not controlled.
+
Small data changes can cause big model changes.
+
Less stable than ensemble methods like Random Forest.
+
+
+
+
+
Evaluation Metrics
+
+
MSE: Average squared error.
+
RMSE: Square root of MSE, same units as target.
+
R² Score: How well predictions fit actual data (1 = perfect).
Ensemble learning is a technique that combines multiple machine learning models to get a more robust and accurate prediction than a single model alone.
+ Learn how machines find hidden groups in data. Watch points get assigned and clusters adapt in real-time!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
GMM = Soft Clustering
+
Points can partially belong to multiple clusters.
+
+
+
+
EM is Iterative
+
Alternates between E-step and M-step.
+
+
+
+
Watch the Ellipses
+
Ellipses show the statistical shape of clusters.
+
+
+
+
Convergence
+
Algorithm stops when centers stabilize.
+
+
+
+
+
+
+
+
+
+
EM Algorithm Flow
+
+
+
+
+
+ Initialize
+
+
+
+
+
+
+ E-Step
+
+
+
+
+
+
+ M-Step
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ Speed:
+
+ Medium
+
+
+
+
+ Iteration:
+ 0 / 20
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
Ready to Start! 🚀
+
+
+
+ Press 'Next Step' to see how the EM algorithm works.
+
+
+
+ Watch points get assigned to the most likely cluster center.
+
+
+
+
+
+
+
+
+
+
+
+
Log-Likelihood
+
+
+ Converged!
+
+
+
+ Measures model fit. Higher values = better grouping.
+
+
+
+
+ Start simulation to see progress...
+
+
+
+
+
+
+
Key Terms 📚
+
+
+ E-Step:
+
Calculating the "responsibility" (probability) that each point belongs to each cluster center.
+
+
+ M-Step:
+
Moving and stretching clusters to better fit the points assigned to them.
+
+
+ Convergence:
+
The point where the clusters stop moving because they've found the optimal mathematical fit.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
What is the EM Algorithm?
+
+
+
+ EM (Expectation-Maximization) is a powerful iterative method used to find maximum likelihood estimates of parameters in statistical models, where the model depends on unobserved latent variables.
+
+
+
🎯 Real-Life Analogy
+
+ Imagine you have a bag of colored marbles, but you're colorblind! You can feel their sizes and weights, but you can't see the colors. EM helps you figure out which marbles are likely the same color based on their shared physical properties.
+
+
+
The algorithm works by alternating between two main steps until it finds the best mathematical grouping.
+
+
+
+
+
+
+
+
+
+
+
+
+
E-Step (Expectation)
+
Assignment Phase
+
+
+
+
Goal: Calculate the probability (responsibility) of each cluster for each data point.
+
+
Process:
+
+
• Each point "looks" at all current cluster curves.
+
• It asks: "Given my location, which cluster would likely have produced me?"
+
• Points get colored by their most likely cluster.
+
+
+
+
responsibility = (fit to cluster) / (total fit to all clusters)
+
+
+
+
+
+
+
+
+
+
+
+
M-Step (Maximization)
+
Update Phase
+
+
+
+
Goal: Update cluster parameters (center, shape, weight) based on assigned points.
+
+
Process:
+
+
• Cluster centers move to the weighted average of points.
+
• The ellipse stretches/shrinks to cover its assigned points better.
+ We have a classic loop: To find the centers, we need to know point assignments. To find point assignments, we need to know the centers.
+ EM solves this by starting with a "best guess" and iteratively refining it.
+
+
+
+
Guaranteed Improvement 📈
+
+ Mathematically, each iteration of EM is guaranteed to increase the Log-Likelihood of the model (or leave it unchanged). This means the model always gets better at explaining the data until it hits a maximum.
+
+
+
+
+
+
+
+
GMM vs K-Means: The Difference
+
+
+
+
+
Feature
+
K-Means
+
GMM (EM)
+
+
+
+
+
Assignment
+
Hard (0 or 1)
+
Soft (Probabilities)
+
+
+
Cluster Shape
+
Always circular/spherical
+
Flexible ellipses (any orientation)
+
+
+
Model Type
+
Distance-based
+
Distribution-based
+
+
+
Use Case
+
Simple, distinct groups
+
Overlapping, varied group shapes
+
+
+
+
+
+
+
+
+
🌍 Real-World Applications
+
+
+
🖼️
+
Image Segmentation
+
Grouping pixels by color/texture to separate objects in photos.
+
+
+
🔊
+
Speech Recognition
+
Identifying different speakers in an audio stream using voice patterns.
+
+
+
📊
+
Customer Segmentation
+
Finding groups of customers with similar shopping behaviors.
+
+
+
🧬
+
Genetics
+
Clustering gene expression data to find functional biological groups.
+
+
+
🌤️
+
Meteorology
+
Classifying climate zones based on temperature and humidity data.
+
+
+
📧
+
Spam Detection
+
Clustering emails into 'Ham' and 'Spam' based on content features.
+
+
+
+
+
+
+
+
+
+
+
+
+
Welcome to the EM Simulator
+
Discover how AI learns hidden patterns in data
+
+
+
+
+
What is GMM?
A statistical model that groups data points into distinct probability curves (clusters).
+
+
+
+
How it learns
It "cycles" between assigning points (E-Step) and updating groups (M-Step) until stable.
+ Optimize parameters by descending the loss landscape. Now with Adaptive Learning Rate logic!
+
+
+
+
+
+
+
+
+
+
+
Status
+
Ready
+
+
+
+
+
+
Position
+
X: 3.00 Y: 3.00
+
+
+
+
+ GOAL AT 0, 0
+
+
+
+
+
+
+
+
+
Momentum (β): Acts like a physical ball with weight. It keeps moving in the same direction, helping it "roll" through flat valleys and over small local pits.
+
+
+
The Challenge: Rosenbrock and Rastrigin are "non-convex" or have "vanishing gradients." Vanilla GD is often too weak to reach the center without help!
+
+
+
+
+ 💡Pro Tip: If the ball gets stuck or vibrates wildly, you must adjust the learning rate or momentum. Or turn on Adaptive Rate to let the algorithm handle it!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ 🏆
+
+
Convergence!
+
Level Complete
+
+
+
0
+
Total Steps
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/templates/home.html b/templates/home.html
new file mode 100644
index 0000000000000000000000000000000000000000..820a92ff3c2bdb656f5cb56ce4dba3a2d60abd95
--- /dev/null
+++ b/templates/home.html
@@ -0,0 +1,18 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+ A simple, non-parametric, and lazy learning algorithm for classification. Use this interactive tool to understand how it works with two distinct categories!
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
How KNN Classifies Your Data
+
+
+ 📍
+
New Data Point
+
You provide X, Y
+
+
→
+
+ 📏
+
Calculate Distances
+
To ALL labeled points
+
+
→
+
+ 🔍
+
Find K-Neighbors
+
Based on 'k' value
+
+
↓
→
+ 🗳️
+
Majority Vote
+
Neighbors decide category
+
+
→
+
+ ✅
+
Predicted Category
+
Final Classification
+
+
+
+ KNN works by finding the 'k' closest existing data points to a new point, then classifying the new point based on the most common category among those 'k' neighbors.
+
+ K-Nearest Neighbors (KNN) is a simple, non-parametric, and lazy learning algorithm. It's primarily used for classification and regression tasks. In this visualization, we focus on its classification ability in 2 dimensions (X and Y coordinates) with two distinct categories of data.
+
+
+
Key Components on the Chart:
+
+
+ Category 1 (Red Diamonds): These are existing data points that are known to belong to "Category 1". Think of them as your training data with a confirmed label.
+
+
+ Category 2 (Blue Diamonds): These are existing data points that are known to belong to "Category 2". Together, Category 1 and Category 2 points form your labeled dataset.
+
+
+ Test Point (Green 'x'): This is the new, unlabeled data point whose category we want to predict using the KNN algorithm. You can change its X and Y coordinates using the input fields above.
+
+
+ Neighbors (Orange Stars): These are the `k` data points from your labeled dataset (red or blue diamonds) that are closest to your Test Point. These are the points that will "vote" on the Test Point's category.
+
+
+
+
The KNN Algorithm - Step-by-Step Flow:
+
+
+ 1. Define 'k': First, you choose the value of `k` (the number of neighbors). This is done using the "Choose k (Neighbors)" input field.
+
+
+ 2. Provide a Test Point: You input the coordinates (X and Y) of the new, unlabeled data point you want to classify (the green 'x').
+
+
+ 3. Calculate Distances: The algorithm calculates the distance from the Test Point ($P_{test}(x_t, y_t)$) to every single Labeled Point ($P_{labeled}(x_l, y_l)$) in your dataset. The most common distance metric used for this is the Euclidean Distance, which is given by the formula:
+
+ $$ d(P_{test}, P_{labeled}) = \sqrt{(x_t - x_l)^2 + (y_t - y_l)^2} $$
+
+
+ 4. Find K-Nearest Neighbors: After calculating all distances, the algorithm sorts them in ascending order and identifies the `k` data points that have the smallest distances to the Test Point. These are your Neighbors, and they will be highlighted with an orange star on the chart.
+
+
+ 5. Majority Vote & Prediction: The Test Point is then assigned the category (either Category 1 or Category 2) that is most frequent among its `k` Neighbors. If there's a tie (e.g., an equal number of votes for both categories), tie-breaking rules (like choosing the category of the single closest neighbor, or random selection) are applied.
+
+
+
+
Example Walkthrough: k=3, Test Point X=4, Y=2
+
+ Let's trace a prediction flow with a test point somewhat central to the two categories.
+
+
+
+ You set k = 3, Test Point X = 4, and Test Point Y = 2.
+
+
+ The Test Point appears at coordinates $(4, 2)$ on the chart.
+
+
+ The algorithm calculates distances from $(4, 2)$ to all labeled points. Based on the updated data:
+
+
Example Distances: (You can verify these manually or let the tool do it!)
+ Now, we find the 3 smallest distances. For a test point at $(4,2)$, these are likely to be a mix of Category 1 and Category 2 points. Let's assume (based on typical clustering):
+
+
A Category 1 point (e.g., $(3.0, 1.5)$)
+
Another Category 1 point (e.g., $(3.5, 1.0)$)
+
A Category 2 point (e.g., $(4.5, 3.0)$)
+
+ These 3 points will be highlighted as orange stars on the chart.
+
+
+ Majority Vote:
+ Among these 3 nearest neighbors:
+
+
Category 1: 2 points
+
Category 2: 1 point
+
+ Since Category 1 (2 votes) has more votes than Category 2 (1 vote), the Test Point at $(4,2)$ would be predicted as Category 1.
+
+ *(Try moving the Test Point closer to the blue diamonds and see how the prediction changes!)*
+
+
+
+ *The diamond shapes from your image are illustrative. In our plot, they are rendered as circular markers for simplicity, but the underlying principle remains the same.*
+
+
+
+
+
+
+
+ {% endblock %}
\ No newline at end of file
diff --git a/templates/knn_image.html b/templates/knn_image.html
new file mode 100644
index 0000000000000000000000000000000000000000..fadffd8fdc4958abd3135042e5b4b20029c404ca
--- /dev/null
+++ b/templates/knn_image.html
@@ -0,0 +1,120 @@
+
+
+
+
+
+ KNN Image Classifier
+
+
+
+
+
+
🖼️ KNN Image Classifier
+
+
+
 Expected: a single digit (0–9) drawn in white on a black background.
+ Unlock the power of Lasso Regression (L1 Regularization) to make accurate predictions.
+ This technique helps prevent overfitting and performs automatic feature selection, leading to simpler and more robust models.
+
+
+
+
+
+
+
+ 🏠
+
+
Predicted Price
+
238,500.00
+
+
+
USD
+
+
+
+
+
+
+ 📚 Deep Dive into Lasso Regression
+
+
+
+ Lasso Regression, or Least Absolute Shrinkage and Selection Operator, is a powerful extension of linear regression. It introduces L1 Regularization to penalize large coefficients, making models more interpretable and robust.
+
+
+
+
🎯 Objective Function:
+
+ The cost function that Lasso tries to minimize is:
+
Penalizes model complexity. Encourages sparsity by setting some coefficients exactly to zero.
+
+
+
+
+
+
+
🌱 Benefits of Lasso
+
+
Feature Selection: Sets irrelevant feature coefficients to 0
+
Prevents Overfitting: Penalizes large weights
+
Simplifies Models: Creates more interpretable solutions
+
+
+
+
⚖️ Lasso vs Ridge
+
+
Ridge (L2): Shrinks coefficients but rarely to zero
+
Lasso (L1): Shrinks some coefficients to exact zero
+
Elastic Net: Combines L1 + L2 regularization
+
+
+
+
+
+ Lasso shines in high-dimensional spaces with many features, especially when many of them are irrelevant. It's widely used in finance, bioinformatics, real estate, and more.
+
+
+
+
+
+
+ 🔄 How Lasso Treats Your Data
+
+
+ Lasso applies L1 regularization to shrink some feature coefficients all the way to 0.
+ This means those features are completely ignored in the final prediction! Here's how it flows:
+
+
+
+
+
📥
+
Input Features
+
+
➡️
+
+
⚙️
+
Cost Function
+
+
➡️
+
+
📉
+
Shrinks Weights
+
+
➡️
+
+
🧹
+
Sparse Output
+
+
+
+
+ This process leads to a simpler model by automatically performing feature selection.
+
+
+
+
+
+ ✂️ Lasso Regression Made Easy (Story Style)
+
+
+ Imagine again you're predicting house price, and you have 5 friends giving opinions:
+ Quality, Living Area, Garage, Basement, and Year Built.
+ But this time, you're tired of too much noise. You want only the really useful voices.
+
+
+
+
+
🗣️
+
Without Lasso All friends talk — some help, some just add noise.
+
+
+
+
✂️
+
With Lasso Lasso listens carefully, then mutes unhelpful friends.
+
+
+
+
✅
+
Result You get a simpler model using only the most useful features.
Understanding Naive Bayes Classifier: A Probabilistic Approach
+
+
+ The Naive Bayes classifier is a simple yet powerful probabilistic machine learning algorithm used for classification tasks. It's based on Bayes' theorem with a crucial "naive" assumption: it assumes that all features are independent of each other, given the class label. Despite this simplifying assumption, Naive Bayes often performs surprisingly well, especially in text classification and spam filtering.
+
+
+
Bayes' Theorem: The Foundation
+
+ Naive Bayes is built upon Bayes' theorem, which describes the probability of an event, based on prior knowledge of conditions that might be related to the event. For classification, it helps calculate the probability of a hypothesis (a class) given evidence (features).
+
+
+ $$P(A|B) = \frac{P(B|A) \cdot P(A)}{P(B)}$$
+ Where:
+ • $P(A|B)$: Posterior probability of class A given predictor B.
+ • $P(B|A)$: Likelihood of predictor B given class A.
+ • $P(A)$: Prior probability of class A.
+ • $P(B)$: Prior probability of predictor B.
+
+
+ In the context of classification, we want to find the probability of a data point belonging to a certain class ($P(\text{Class}|\text{Features})$). Bayes' theorem allows us to calculate this using probabilities that are easier to estimate from the training data.
+
+
+
The "Naive" Assumption:
+
+ The "naive" part comes from the assumption that all features are conditionally independent of each other, given the class label. This means, for example, that the presence of one word in an email (e.g., "money") is independent of the presence of another word (e.g., "Viagra") given that the email is spam. While rarely true in reality, this simplification makes the calculations tractable and surprisingly effective.
+
+
+ $$P(\text{Class} | \text{Feature}_1, ..., \text{Feature}_n) \propto P(\text{Class}) \cdot \prod_{i=1}^{n} P(\text{Feature}_i | \text{Class})$$
+ The classifier selects the class that maximizes this posterior probability.
+
+
+
Types of Naive Bayes Models:
+
+
+ 1. Gaussian Naive Bayes: In Gaussian Naive Bayes, continuous values associated with each feature are assumed to be distributed according to a Gaussian (Normal) distribution. When plotted, it gives a bell-shaped curve which is symmetric about the mean of the feature values.
+
+
+ 2. Multinomial Naive Bayes: Multinomial Naive Bayes is used when features represent the frequency of terms (such as word counts) in a document. It is commonly applied in text classification, where term frequencies are important.
+
+
+ 3. Bernoulli Naive Bayes: Bernoulli Naive Bayes deals with binary features, where each feature indicates whether a word appears or not in a document. It is suited for scenarios where the presence or absence of terms is more relevant than their frequency. Both Multinomial and Bernoulli models are widely used in document classification tasks.
+
+
+
+
How Gaussian Naive Bayes Works (for continuous data):
+
+ For continuous features (like the X and Y coordinates in our visualization), the algorithm assumes that the values associated with each class follow a Gaussian (Normal) distribution.
+
+
+
+ 1. Calculate Priors: It first calculates the prior probability of each class ($P(\text{Class})$) based on their frequency in the training data.
+
+
+ 2. Estimate Likelihoods (Mean & Variance): For each feature and each class, it calculates the mean ($\mu$) and variance ($\sigma^2$) of the feature values. These are used to model the Gaussian distribution $P(\text{Feature}_i | \text{Class})$.
+
+
+ 3. Classify New Data: When a new data point arrives, for each class:
+
+
It calculates the probability of each feature value given that class, using the estimated Gaussian distributions.
+
It multiplies these probabilities by the prior probability of that class (based on the naive assumption).
+
+
+
+ 4. Assign Class: The data point is assigned to the class that yields the highest overall posterior probability.
+
+
+
+ The decision boundary generated by Gaussian Naive Bayes is often curvilinear or non-linear because it's based on the intersection of Gaussian probability distributions, which are circular or elliptical in 2D space.
+
+
+
How Naive Bayes Treats User Data:
+
+ When you interact with this visualization and add new data points or modify existing ones, the Naive Bayes model processes this "user data" in a specific way:
+
+
+
+ Training Data Update: Every time you add a new point or reset the data, the entire dataset (including your newly added points) is used to *retrain* the Naive Bayes model. This means the model recalculates the class priors, and the mean and variance for each feature within each class, based on all the data currently visible on the graph.
+
+
+ Feature Independence: Regardless of how many features your data has (2D or 3D in this case), the model continues to treat each feature (X, Y, and Z coordinates) as independent of the others, given the class. For example, when classifying a new point, the probability of its X-coordinate is considered separately from its Y-coordinate (and Z-coordinate in 3D), and these individual probabilities are multiplied together along with the class prior.
+
+
+ Probabilistic Classification: When you add a new point to be classified, the model calculates the probability of that point belonging to each of the existing classes. It then assigns the point to the class for which it calculates the highest probability. This is why the decision boundaries dynamically shift as you add more data, reflecting the model's updated understanding of the underlying probability distributions.
+
+
+ Dynamic Decision Boundaries: The visualization actively shows how the decision boundaries adapt based on the distribution of the data you provide. As you add points, the model's estimates of the class means and variances change, which in turn alters the shape and position of the separating lines (in 2D) or surfaces (in 3D). This demonstrates the adaptive nature of the Naive Bayes algorithm to new data.
+
+
+
+
+
How this Visualization Works:
+
+ This interactive graph demonstrates the decision boundary of a Gaussian Naive Bayes classifier in 2D space:
+
+
+
+ Add Data Points: Click anywhere on the plot to add new training data points. These points are automatically assigned to "Class 0" (Red) or "Class 1" (Blue) based on the current mouse coordinates.
+
+
+ Test Point (Green X): The green 'X' represents the point you want to classify. You can drag it around or enter its coordinates manually.
+
+
+ Decision Boundary: The colored background represents the decision regions. After clicking "Run Naive Bayes Prediction", the model trains on your added points. The background then colors the regions based on which class the Naive Bayes model predicts for every point in that area. Notice the smooth, often curved, nature of the boundary which arises from the underlying Gaussian probability distributions.
+
+
+ Prediction Result: The text below the button will show the predicted class for your test point (Green X).
+
+
+
+
Advantages of Naive Bayes Classifier:
+
+
Easy to implement and computationally efficient.
+
Effective in cases with a large number of features.
+
Performs well even with limited training data.
+
It performs well in the presence of categorical features.
+
For numerical features, data is assumed to come from normal distributions.
+
+
+
Disadvantages of Naive Bayes Classifier:
+
+
Assumes that features are independent, which may not always hold in real-world data.
+
Can be influenced by irrelevant attributes.
+
May assign zero probability to unseen events, leading to poor generalization. (This is typically addressed by smoothing techniques like Laplace smoothing).
+
+
+
Why it is Called Naive Bayes?
+
+ It is named "Naive" because it assumes that the presence of one feature does not affect the other features. The "Bayes" part of the name refers to its basis in Bayes’ Theorem.
+
+
+
Key Features of Naive Bayes Classifiers:
+
+
The main idea behind the Naive Bayes classifier is to use Bayes' Theorem to classify data based on the probabilities of different classes given the features of the data.
+
It is used mostly in high-dimensional text classification.
+
The Naive Bayes Classifier is a simple probabilistic classifier and it has a very few number of parameters which are used to build the ML models that can predict at a faster speed than other classification algorithms.
+
It is a probabilistic classifier because it assumes that one feature in the model is independent of the existence of another feature. In other words, each feature contributes to the predictions with no relation between each other.
+
+
+
Applications of Naive Bayes Classifier:
+
+
Spam Email Filtering: Classifies emails as spam or non-spam based on features.
+
Text Classification: Used in sentiment analysis, document categorization, and topic classification.
+
Medical Diagnosis: Helps in predicting the likelihood of a disease based on symptoms.
+
Credit Scoring: Evaluates creditworthiness of individuals for loan approval.
+
Weather Prediction: Classifies weather conditions based on various factors.
+
+
+
+
+
+
+{% endblock %}
\ No newline at end of file
diff --git a/templates/pca-threejs.html b/templates/pca-threejs.html
new file mode 100644
index 0000000000000000000000000000000000000000..cb2215d53fb3dd67aa933205dc74f5c9ca3538dd
--- /dev/null
+++ b/templates/pca-threejs.html
@@ -0,0 +1,524 @@
+
+
+
+
+
+ PCA Playground - Interactive Tool
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
\ No newline at end of file
diff --git a/templates/poly.html b/templates/poly.html
new file mode 100644
index 0000000000000000000000000000000000000000..b0840632228997bd006dd0620836513b12e65547
--- /dev/null
+++ b/templates/poly.html
@@ -0,0 +1,539 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+
+
Polynomial Regression Visualizer
+
+
+
What is Polynomial Regression?
+
+ Polynomial Regression is a type of regression analysis that models the relationship between the independent variable x
+ and the dependent variable y as an n-th degree polynomial. It extends simple linear regression by considering higher-degree terms.
+
+
+
Why Polynomial Regression?
+
+
Linear Regression only fits straight lines. But real-world data is often curved or non-linear.
+
Polynomial Regression allows us to capture these curves while still being simple and interpretable.
+ Here, theta_0, theta_1, ..., theta_n are the coefficients learned by the model.
+
+
+
+
Theoretical Examples: y = x^2 + 2x
+
+ To illustrate how the equation y = x^2 + 2x creates a curve, let's substitute a few values for x:
+
+
+
+ When x = 0:
+ y = (0)^2 + 2(0) = 0 + 0 = 0
+
+ An input of 0 results in an output of 0.
+
+
+
+ When x = 1:
+ y = (1)^2 + 2(1) = 1 + 2 = 3
+
+ An input of 1 yields an output of 3.
+
+
+
+ When x = 3:
+ y = (3)^2 + 2(3) = 9 + 6 = 15
+
+ With an input of 3, the output becomes 15.
+
+
+
+ When x = 5:
+ y = (5)^2 + 2(5) = 25 + 10 = 35
+
+ For an input of 5, the predicted output is 35.
+
+
+
+
+ These examples show how the relationship between x and y is curved, not linear.
+
+
+
+
How this App Works:
+
+
The model is trained using Python's PolynomialFeatures from sklearn.preprocessing.
+
You enter a value (e.g., hours studied), and the trained model predicts the output (e.g., expected score).
+
A dynamic graph shows the curve and predicted point in real-time.
+
+
+
+
+
+
Part 1: Comprehensive Study Guide on Linear Regression
+
1. Introduction to Linear Regression
+
1.1 What is Linear Regression?
+
+ Linear Regression is a fundamental statistical and machine learning technique used to model the relationship between a **dependent variable** (target) and one or more **independent variables** (features) by fitting a linear equation to the observed data. The goal is to predict the value of the dependent variable based on the values of the independent variables. It assumes a linear relationship between the input features and the output variable.
+
+
+
1.2 Types of Linear Regression
+
+
Simple Linear Regression (SLR): Involves one independent variable to predict a single dependent variable.
+
+ y = beta_0 + beta_1x + epsilon
+ y: Dependent variable
+ x: Independent variable
+ beta_0: Y-intercept (the value of y when x=0)
+ beta_1: Slope of the regression line (the change in y for a one-unit change in x)
+ epsilon: Error term (the difference between the actual and predicted values)
+
+
+
Multiple Linear Regression (MLR): Involves two or more independent variables to predict a single dependent variable.
+
Polynomial Regression: A form of linear regression in which the relationship between the independent variable x and the dependent variable y is modeled as an nth-degree polynomial. While it models a non-linear relationship, it's considered a linear model because it's linear in its coefficients.
+
Ridge Regression: A regularization technique used when multicollinearity is present. It adds a penalty equivalent to the square of the magnitude of the coefficients, shrinking them towards zero.
+
+ Cost Function: MSE + lambda * sum_{j=1}^{p} beta_j^2
+
+
+
Lasso Regression: Another regularization technique that adds a penalty equivalent to the absolute value of the magnitude of the coefficients. It can shrink coefficients exactly to zero, performing feature selection.
+
+ Cost Function: MSE + lambda * sum_{j=1}^{p} |beta_j|
+
+
+
Elastic Net Regression: Combines the penalties of both Ridge and Lasso Regression.
+
The goal is to find the coefficients beta that minimize the sum of squared errors. This can be done using the Ordinary Least Squares (OLS) method, which provides the closed-form solution:
Violations of these assumptions can lead to unreliable or inefficient regression models.
+
+
Linearity: The relationship between the independent variables and the mean of the dependent variable is linear.
+
Homoscedasticity: The variance of the errors (residuals) is constant across all levels of the independent variables.
+
No Multicollinearity: Independent variables are not highly correlated with each other.
+
Independence of Errors: Errors (residuals) are independent of each other. There is no correlation between consecutive error terms.
+
Normally Distributed Errors: The errors (residuals) are normally distributed.
+
+
+
4. Cost Function (Mean Squared Error - MSE)
+
4.1 Definition of MSE:
+
+ Mean Squared Error (MSE) measures the average of the squares of the errors, where the error is the difference between the actual value and the predicted value.
+
+
+ MSE = (1/n) * sum(i=1 to n) (y_i - ŷ_i)^2
+
+
4.2 Why Squared Errors are Used:
+
+
Penalizes Large Errors More: Squaring errors gives larger errors a disproportionately higher penalty.
+
Differentiability: The squared error function is continuously differentiable, which is crucial for optimization algorithms.
+
Convexity: The MSE cost function for linear regression is convex, ensuring optimization algorithms will converge to the optimal solution.
+
+
+
5. Gradient Descent Algorithm
+
+ Gradient Descent is an iterative optimization algorithm used to find the minimum of a function. It works by repeatedly moving in the direction of the steepest descent, which is the negative of the gradient.
+
+
5.1 How it Works:
+
+
Initialize Parameters: Start with random initial values for the model parameters.
+
Calculate Gradient: Compute the gradient of the cost function.
+
Update Parameters: Move the parameters in the opposite direction of the gradient by a step size (learning rate).
+
Repeat: Repeat until the parameters converge.
+
+
5.2 Equations (for Simple Linear Regression):
+
+ The MSE cost function is: J(beta_0, beta_1) = (1/n) * sum(i=1 to n) (y_i - (beta_0 + beta_1x_i))^2.
+ The update rules are:
+
7. Advantages and Disadvantages of Linear Regression
+
+
+
7.1 Advantages
+
+
Simplicity and Interpretability
+
Fast Computation
+
Foundation for Other Models
+
+
+
+
7.2 Disadvantages
+
+
Assumes Linearity
+
Sensitive to Outliers
+
Assumptions Must Hold
+
Prone to Underfitting
+
+
+
+
+
8. Handling Challenges in Linear Regression
+
8.1 How to Handle Outliers
+
+
Removal or Transformation
+
Use Robust Regression Methods
+
Use MAE as Cost Function
+
+
8.2 How to Handle Multicollinearity
+
+
Feature Removal or Combination
+
Dimensionality Reduction (PCA)
+
Regularization (Ridge, Lasso)
+
+
8.3 How to Handle Feature Scaling
+
+ Feature scaling is crucial for **Gradient Descent** and **Regularization** to ensure all features contribute equally to the model. Methods include Standardization and Min-Max Scaling.
+
+
+
9. Regularized Regression Methods
+
Regularization techniques add a penalty to the cost function to prevent overfitting.
+
+
Ridge Regression (L2): Penalizes the sum of squared coefficients.
+
See how your input travels through each decision tree
+
+
+
+
+ 1
+ User Input
+
+
+
+
+
+
+
+
+
+
+ 2
+ Data Preparation (Train/Test Split)
+
+
+
+
Training Data (80%)
+
+
+
+
Test Data (20%)
+
+
+
+
+
+
+
+
+
+
+
+ 3
+ Tree Predictions (3 Example Trees)
+
+
+
+
+
Tree 1 Prediction: ?
+
+
+
+
+
Tree 2 Prediction: ?
+
+
+
+
+
Tree 3 Prediction: ?
+
+
+
+
+
+
+
+
+ 4
+ Final Prediction
+
+
+
Based on X hours studied:
+
?
+
(Average of all tree predictions)
+
+
+
+
+
+
+
+
+
+
+
+
+ ← Back to Random Forest Regression
+
+
+
diff --git a/templates/reinforcement.html b/templates/reinforcement.html
new file mode 100644
index 0000000000000000000000000000000000000000..267316e67af96640b0c3cc976f1fd2f95e22b821
--- /dev/null
+++ b/templates/reinforcement.html
@@ -0,0 +1,18 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
Reinforcement Learning
+
Reinforcement learning (RL) is a subfield of machine learning where an agent learns to make decisions by taking actions in an environment to maximize some cumulative reward.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/rf.html b/templates/rf.html
new file mode 100644
index 0000000000000000000000000000000000000000..c1638feb9a353c96b79a7f3a122ec5cec5417f81
--- /dev/null
+++ b/templates/rf.html
@@ -0,0 +1,153 @@
+{% extends "layout.html" %}
+{% block content %}
+
+
+
🌲 Random Forest Regressor 🌲
+
+
+ The Random Forest Regressor is a powerful ensemble learning method. Imagine a forest of decision trees working together.
+ Instead of relying on a single, potentially biased tree, it combines the wisdom of many. This helps it capture complex, non-linear patterns
+ in your data far better than simpler models.
+
+
+
+
✨ Try It Yourself! ✨
+
+ Ready to see the Random Forest Regressor in action? Click the button below to predict an exam score!
+
+ Here's a simple dataset we could use to train our Random Forest Regressor, predicting exam scores based on hours studied:
+
+
+
+
+
+
+ Hours Studied (X)
+
+
+ Exam Score (y)
+
+
+
+
+
1
35
+
2
45
+
3
55
+
4
65
+
5
75
+
6
80
+
7
82
+
8
88
+
9
92
+
10
95
+
+
+
+
+
+
+
+ 🧠 How Random Forest Regression Works
+
+
+ A Random Forest is built from many individual decision trees. For regression, it predicts by averaging the outputs of these trees.
+ This significantly reduces overfitting and makes the model robust.
+
+
+
+ Building Trees with Bagging (Bootstrap Aggregating):
+ Each tree is trained on a different random subset of your original data, sampled with replacement. This creates varied training sets for each tree.
+
+
+ Feature Randomness (Random Subspace):
+ When a tree makes a split, it only considers a random subset of the available features. This ensures no single feature dominates all trees.
+
+
+
+
+
+
+ ✂️ The Splitting Process in Each Tree
+
+
+ Each decision tree in the forest grows by repeatedly splitting its data. The goal is to create "pure" child nodes. For regression, "purity" means minimizing the Mean Squared Error (MSE) within a node.
+
+
+
+ Finding the Best Split:
+ At each node, the tree evaluates all possible split points for its random subset of features. It picks the split that results in the greatest reduction in Mean Squared Error (MSE).
+ The MSE formula is: MSE = (1/n) × Σ(𝑦i − ŷi)², where 𝑦i is the actual value and ŷi is the predicted value (the average of y in that node).
+
+
+
+ Recursive Partitioning:
+ This splitting process repeats for the newly created child nodes. It continues until stopping conditions are met (e.g., maximum depth, minimum samples in a leaf).
+
+
+ Leaf Node Prediction:
+ Once a tree is fully grown, its final prediction at any leaf node is simply the average of all the training data's target values (y values) that ended up in that leaf.
+
+
+
+
+
+
+ 🚀 Predicting with New Input (e.g., 5.5 Hours)
+
+
+ When you enter a new value, like 5.5 hours studied, here's the journey it takes through the Random Forest:
+
+
+
+ Sent to Every Tree:
+ Your input 5.5 hours goes to each and every decision tree in the Random Forest.
+
+
+ Tree by Tree Journey:
+ In each tree, 5.5 travels down, following the branches based on the split conditions (e.g., if "Hours Studied <= 5?", then 5.5 goes to the "No" branch).
+
+
+ Individual Tree Predictions:
+ Eventually, 5.5 lands in a leaf node for each tree. That tree's prediction is the average of the exam scores (y values) of the training data points that settled in that same leaf node during training.
+
+ Example: If one tree's leaf node for "Hours Studied > 5.0 and <= 7.0" contained training points (6 hours, 80 score) and (7 hours, 82 score), that tree would predict (80+82)/2 = 81 for 5.5 hours.
+
+
+ Averaging for the Final Answer:
+ Once all individual trees have made their predictions, the Random Forest Regressor simply calculates the average of all these individual tree predictions.
+
+ For instance, if Tree 1 predicted 81, Tree 2 predicted 78, Tree 3 predicted 80.5, etc., the final output would be the average of these numbers.
+
+
+ Your Predicted Score:
+ This final averaged value is what you see as the predicted exam score, offering a robust and well-rounded estimate!
+
+ Explore the power of Ridge Regression (L2 Regularization) for robust predictions.
+ This technique is excellent for handling multicollinearity and preventing overfitting by shrinking coefficients, while still retaining all features.
+
+ 📚 Deep Dive into Ridge Regression (L2 Regularization)
+
+
+ Ridge Regression helps make our prediction models more stable and less prone to errors on new data. It does this by adding a special "penalty" to its calculations. This penalty is called L2 Regularization.
+
+
+
+
🎯 How Ridge Calculates (Objective Function):
+
+ Ridge tries to find the best prediction rule by minimizing this formula:
+
+ This is the standard part: How well did our model predict? We want our predictions ($$h_{\theta}(x^{(i)})$$) to be very close to the actual values ($$y^{(i)}$$).
+
+
Ridge's "Penalty" (Second Part):
+
+
+ $$ \lambda \sum_{j=1}^{p} \theta_j^2 $$
+
+
+ This is the L2 penalty. It adds a cost for making any feature's "importance" (called a coefficient, $$\theta_j$$) too big. If a coefficient is large, this term gets big, increasing the total cost. To keep the cost low, Ridge *forces* coefficients to be smaller.
+
+
+
+ The symbol $$\lambda$$ (lambda) is like a "Volume Knob" for this penalty.
+ Big $$\lambda$$: Turn up the volume knob, so coefficients are shrunk *a lot*.
+ Small $$\lambda$$: Turn down the volume knob, so coefficients are shrunk *less*.
+
+
+
+
+
+
+ ✨ Why Ridge is Useful
+
+
+
Prevents Over-Learning: Stops the model from memorizing the training data too much, so it performs better on new data.
+
Handles Similar Features: If you have features that tell a similar story (e.g., house size and number of rooms), Ridge manages them well without favoring one too much.
+
Keeps All Information: All features still contribute to the prediction, even if just a little bit.
+
+
+
+
+ 🆚 Ridge vs. Lasso (The Main Difference)
+
+
+
Ridge (L2): It *shrinks* the importance of features (their coefficients) towards zero. Think of it like a dimmer switch for a light: the light gets very dim, but it's never completely OFF.
+
Lasso (L1): It can *force* the importance of some features (their coefficients) to become *exactly zero*. This is like turning some lights completely OFF, effectively ignoring those features.
+
+
+
+
+
+
+ 🔄 How Ridge Treats Your Data
+
+
+ Ridge uses L2 regularization to shrink all feature weights, but never drops any feature completely. This allows it to preserve all information while still controlling model complexity.
+
+
+
+
+
📥
+
Input Features
+
All provided
+
+
➡️
+
+
⚙️
+
Cost Function
+
MSE + λ × ∑θ²
+
+
➡️
+
+
📉
+
Shrinks All Coeffs
+
Closer to 0, but ≠ 0
+
+
➡️
+
+
🎯
+
Prediction Uses All
+
None are ignored
+
+
+
+ Unlike Lasso, Ridge Regression keeps all your features—just with smaller, regularized weights. It’s perfect when all features have some predictive power.
+
+
+
+
+
+ Ridge is a great choice when you believe all your features are somewhat useful and you want to **control how much influence they have** without completely throwing any away.
+
+
+
+
+ 🧠 Ridge Regression Made Easy (Story Style)
+
+
+ Imagine your house price is being predicted by a team of 5 friends (features):
+ Quality, Living Area, Garage, Basement, and Year Built.
+ Each friend gives their opinion (a number), and we add them up to predict the price.
+
+
+
+
+
🔊
+
Without Ridge Some friends shout too loudly! Their numbers dominate and cause mistakes.
+
+
➡️
+
+
🔇
+
With Ridge Everyone still speaks, but Ridge adds rules so no one can shout. Voices are balanced.
+
+
➡️
+
+
📊
+
Result The prediction is more stable, fair, and doesn’t overreact to any one person.
+
+
+
+
+ ✅ Ridge doesn’t remove anyone (like Lasso does).
+ ✅ It just turns down their microphone if they’re being too loud.
+ ✅ That’s how Ridge keeps all your data and prevents wild, unstable predictions.
+
+
+ (This is especially helpful when features are similar or slightly redundant.)
+
+
+
+
+ 📊 Suggested Visualizations (to see it in action!):
+
+
+ To really get a feel for how Ridge works, visual examples are best:
+
+
+
Coefficient Shrinkage Graph: See a graph where all feature "voices" (coefficients) start big and then get smaller as you increase $\lambda$ (the "Volume Knob"). Notice they don't hit zero!
+
Prediction Quality Chart: Compare Ridge's predictions against actual values.
+
Model Stability Over Time: How the model's performance holds up on different sets of data.
+
+
+ (These dynamic visualizations can be powered by Chart.js, D3.js, or by integrating server-side plots from libraries like Matplotlib/Seaborn in your Flask application.)
+
Semi-supervised learning is a combination of supervised and unsupervised learning. It uses a small amount of labeled data and a large amount of unlabeled data to train a model. This is particularly useful when it is expensive to label data.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/supervise.html b/templates/supervise.html
new file mode 100644
index 0000000000000000000000000000000000000000..384ee783ac5741b34753d7247cf76b939e5cb004
--- /dev/null
+++ b/templates/supervise.html
@@ -0,0 +1,40 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
Supervised Learning
+
Supervised learning is a type of machine learning where the model is trained on labeled data. This means the training dataset includes both the input and the correct output, and the model learns to map the input to the output.
+{% endblock %}
\ No newline at end of file
diff --git a/templates/supervised.html b/templates/supervised.html
new file mode 100644
index 0000000000000000000000000000000000000000..a27e8a9a24ad7dc07385e4129d97f4e833007c4f
--- /dev/null
+++ b/templates/supervised.html
@@ -0,0 +1,1248 @@
+{% extends "layout.html" %}
+{% block content %}
+
+
+
+
+
+ Linear Regression Predictor - Interactive Educational Tool
+
+
+
+
+
+
+
+
+
+
+
+
+ Predict Your Exam Score 📈
+
+
+ A comprehensive demonstration of Linear Regression to predict exam scores based on study hours.
+ Explore the theory, mathematics, and practical implementation.
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📊 Linear Regression Visualization
+
+
+
+
+
Current Model:
+
+ Score = 20 × Hours + 15
+
+
+ This simple linear model assumes each additional hour of study increases your score by 20 points
+
+
+
+
+
+
+
+
+
Make a Prediction
+
+
+
+
+
+
+ Predicted Score:
+
+
85.00
+
+ Based on the linear regression equation: Score = 20 × Hours + 15
+
+
+
+
+
Regression Equation:
+
+ Score = 20 × Hours + 15
+
+
+
• Red Line: Regression line showing the linear relationship
+
• Blue Points: Training data points
+
• Green Point: Your prediction
+
+
+
+
+
+
+
+
+
📈 Model Performance Metrics
+
+
+
+
+
0.98
+
R² Score
+
+
+
3.2
+
RMSE
+
+
+
2.8
+
MAE
+
+
+
10.2
+
MSE
+
+
+
+ High R² indicates the model explains 98% of the variance in the data
+
+
+
+
+
+
+
+
+
+
+
Linear Regression Theory & Implementation
+
+
+
+
+
+
+
+
+
+
+
What is Linear Regression?
+
+ Linear Regression is a fundamental supervised learning algorithm used for predicting a continuous outcome variable (dependent variable) based on one or more input features (independent variables). It models the relationship between the variables by fitting a linear equation to the observed data.
+
+
+ For a simple linear regression with one input feature, the equation used by our model is:
+ Predicted Score = (20 × Hours Studied) + 15
+
+
+
Slope (m=20): Represents how much the predicted outcome changes for every one-unit increase in the input feature. It indicates the strength and direction of the relationship.
+
Intercept (b=15): Represents the predicted outcome when all input features are zero. It's the baseline value.
+
+
+
Why Slope (m) is 20
+
+ The slope of 20 means each hour of studying contributes 20 points to your exam score. For example, if you study one more hour, your predicted score increases by 20 points.
+
+
+
Why Intercept (b) is 15
+
+ The intercept of 15 represents points earned regardless of study time. This could account for:
+
+
+
Class attendance and participation
+
Homework assignments
+
Quizzes and in-class activities
+
Base marks for attempting the exam
+
+
+
+
+
+
+
+
+
1. Mathematical Formulation
+
+
+
+
Simple Linear Regression
+
+ In statistical modeling, the simple linear regression equation is:
+
+
+ y = β₀ + β₁x + ε
+
+
+ The goal is to find the coefficients β₀ and β₁ that minimize the cost function, typically the Mean Squared Error (MSE).
+
+
+
+
+
+
+
+
2. Cost Function (Mean Squared Error)
+
+
+
+
Definition of MSE
+
+ Mean Squared Error measures the average of the squares of the errors:
+
+
+ MSE = (1/n)∑(yᵢ - ŷᵢ)²
+
+
+
+
+
Why Squared Errors?
+
+
Penalizes Large Errors: Larger errors get disproportionately higher penalty
+
Differentiability: Crucial for optimization algorithms
+
Convexity: Ensures global minimum exists
+
+
+
+
+
+
+
+
3. Evaluation Metrics
+
+
+
+
+
Mean Squared Error (MSE)
+
+ MSE = (1/n)∑(yᵢ - ŷᵢ)²
+
+
+ Average of squared differences between actual and predicted values.
+
+
+
+
+
R-squared (R²)
+
+ R² = 1 - (SS_res/SS_tot)
+
+
+ Proportion of variance explained by the model.
+
Part 1: Comprehensive Study Guide on Linear Regression
+
1. Introduction to Linear Regression
+
1.1 What is Linear Regression?
+
+ Linear Regression is a fundamental statistical and machine learning technique used to model the relationship between a **dependent variable** (target) and one or more **independent variables** (features) by fitting a linear equation to the observed data. The goal is to predict the value of the dependent variable based on the values of the independent variables. It assumes a linear relationship between the input features and the output variable.
+
+
+
1.2 Types of Linear Regression
+
+
Simple Linear Regression (SLR): Involves one independent variable to predict a single dependent variable.
+
+ y = beta_0 + beta_1x + epsilon
+ y: Dependent variable
+ x: Independent variable
+ beta_0: Y-intercept (the value of y when x=0)
+ beta_1: Slope of the regression line (the change in y for a one-unit change in x)
+ epsilon: Error term (the difference between the actual and predicted values)
+
+
+
Multiple Linear Regression (MLR): Involves two or more independent variables to predict a single dependent variable.
+
Polynomial Regression: A form of linear regression in which the relationship between the independent variable x and the dependent variable y is modeled as an nth-degree polynomial. While it models a non-linear relationship, it's considered a linear model because it's linear in its coefficients.
+
Ridge Regression: A regularization technique used when multicollinearity is present. It adds a penalty equivalent to the square of the magnitude of the coefficients, shrinking them towards zero.
+
+ Cost Function: MSE + lambda * sum_{j=1}^{p} beta_j^2
+
+
+
Lasso Regression: Another regularization technique that adds a penalty equivalent to the absolute value of the magnitude of the coefficients. It can shrink coefficients exactly to zero, performing feature selection.
+
+ Cost Function: MSE + lambda * sum_{j=1}^{p} |beta_j|
+
+
+
Elastic Net Regression: Combines the penalties of both Ridge and Lasso Regression.
+
The goal is to find the coefficients beta that minimize the sum of squared errors. This can be done using the Ordinary Least Squares (OLS) method, which provides the closed-form solution:
Violations of these assumptions can lead to unreliable or inefficient regression models.
+
+
Linearity: The relationship between the independent variables and the mean of the dependent variable is linear.
+
Homoscedasticity: The variance of the errors (residuals) is constant across all levels of the independent variables.
+
No Multicollinearity: Independent variables are not highly correlated with each other.
+
Independence of Errors: Errors (residuals) are independent of each other. There is no correlation between consecutive error terms.
+
Normally Distributed Errors: The errors (residuals) are normally distributed.
+
+
+
4. Cost Function (Mean Squared Error - MSE)
+
4.1 Definition of MSE:
+
+ Mean Squared Error (MSE) measures the average of the squares of the errors, where the error is the difference between the actual value and the predicted value.
+
+
+ MSE = (1/n) * sum(i=1 to n) (y_i - ŷ_i)^2
+
+
4.2 Why Squared Errors are Used:
+
+
Penalizes Large Errors More: Squaring errors gives larger errors a disproportionately higher penalty.
+
Differentiability: The squared error function is continuously differentiable, which is crucial for optimization algorithms.
+
Convexity: The MSE cost function for linear regression is convex, ensuring optimization algorithms will converge to the optimal solution.
+
+
+
5. Gradient Descent Algorithm
+
+ Gradient Descent is an iterative optimization algorithm used to find the minimum of a function. It works by repeatedly moving in the direction of the steepest descent, which is the negative of the gradient.
+
+
5.1 How it Works:
+
+
Initialize Parameters: Start with random initial values for the model parameters.
+
Calculate Gradient: Compute the gradient of the cost function.
+
Update Parameters: Move the parameters in the opposite direction of the gradient by a step size (learning rate).
+
Repeat: Repeat until the parameters converge.
+
+
5.2 Equations (for Simple Linear Regression):
+
+ The MSE cost function is: J(beta_0, beta_1) = (1/n) * sum(i=1 to n) (y_i - (beta_0 + beta_1x_i))^2.
+ The update rules are:
+
7. Advantages and Disadvantages of Linear Regression
+
+
+
7.1 Advantages
+
+
Simplicity and Interpretability
+
Fast Computation
+
Foundation for Other Models
+
+
+
+
7.2 Disadvantages
+
+
Assumes Linearity
+
Sensitive to Outliers
+
Assumptions Must Hold
+
Prone to Underfitting
+
+
+
+
+
8. Handling Challenges in Linear Regression
+
8.1 How to Handle Outliers
+
+
Removal or Transformation
+
Use Robust Regression Methods
+
Use MAE as Cost Function
+
+
8.2 How to Handle Multicollinearity
+
+
Feature Removal or Combination
+
Dimensionality Reduction (PCA)
+
Regularization (Ridge, Lasso)
+
+
8.3 How to Handle Feature Scaling
+
+ Feature scaling is crucial for **Gradient Descent** and **Regularization** to ensure all features contribute equally to the model. Methods include Standardization and Min-Max Scaling.
+
+
+
9. Regularized Regression Methods
+
Regularization techniques add a penalty to the cost function to prevent overfitting.
+
+
Ridge Regression (L2): Penalizes the sum of squared coefficients.
+
Adjust the slider to change the displayed regularization weight (\(\lambda\)). This does not change model training logic — it only updates the displayed formula for clarity on mobile.
+ The SVM algorithm processes your input data through several key steps: it first ingests your labeled data, then (optionally, using the kernel trick for non-linear cases) transforms it into a higher-dimensional space. Next, it determines the best possible decision boundary (hyperplane) that maximally separates the classes while minimizing errors. The crucial data points closest to this boundary are identified as $$'Support Vectors'$$ as they fundamentally define the separation. Finally, any new, unlabeled data point is classified based on which side of this optimized hyperplane it falls.
+
+
+
+
+
Understanding Support Vector Machines (SVM)
+
+ Support Vector Machines (SVMs) are powerful supervised learning models used for classification and regression. The core idea behind SVM is to find the $$optimal hyperplane$$ that best separates data points of different classes in a high-dimensional space.
+
+
+
Key Concepts of Support Vector Machine
+
+
Hyperplane: A decision boundary separating different classes in feature space and is represented by the equation $$wx + b = 0$$ in linear classification.
+
Support Vectors: The closest data points to the hyperplane, crucial for determining the hyperplane and margin in SVM. In our plot, they are highlighted with a purple ring.
+
Margin: The distance between the hyperplane and the support vectors. SVM aims to maximize this margin for better classification performance.
+
Kernel: A function that maps data to a higher-dimensional space enabling SVM to handle non-linearly separable data.
+
Hard Margin: A maximum-margin hyperplane that perfectly separates the data without misclassifications.
+
Soft Margin: Allows some misclassifications by introducing slack variables, balancing margin maximization and misclassification penalties when data is not perfectly separable.
+
C: A regularization parameter that controls the trade-off between margin maximization and misclassification penalties. A higher C value forces stricter penalty for misclassifications.
+
Hinge Loss: A loss function penalizing misclassified points or margin violations and is combined with regularization in SVM. If a data point is correctly classified and within the margin there is no penalty (loss = 0). If a point is incorrectly classified or violates the margin the hinge loss increases proportionally to the distance of the violation.
+
Dual Problem: Involves solving for Lagrange multipliers associated with support vectors, facilitating the kernel trick and efficient computation.
+
+
+
How does Support Vector Machine Algorithm Work?
+
+ The key idea behind the SVM algorithm is to find the hyperplane that best separates two classes by maximizing the margin between them. This margin is the distance from the hyperplane to the nearest data points (support vectors) on each side.
+
+
+
+ The best hyperplane also known as the "hard margin" is the one that maximizes the distance between the hyperplane and the nearest data points from both classes. This ensures a clear separation between the classes. Let's consider a scenario like shown below where we have one blue ball in the boundary of the red ball (referring to a conceptual image similar to the one you provided earlier).
+
+
+
How does SVM classify the data?
+
+ The blue ball in the boundary of red ones is an outlier of blue balls. The SVM algorithm is able to ignore such outliers and still find the hyperplane that maximizes the margin, which makes SVM robust to outliers. A soft margin allows for some misclassifications or violations of the margin to improve generalization. The SVM optimizes the following equation to balance margin maximization and penalty minimization:
+
The penalty used for violations is often hinge loss which has the following behavior:
+
+
If a data point is correctly classified and within the margin there is no penalty (loss = 0).
+
If a point is incorrectly classified or violates the margin the hinge loss increases proportionally to the distance of the violation.
+
+
+
Till now we were talking about linearly separable data that separates groups of blue balls and red balls by a straight line/linear line.
+
+
What to do if data are not linearly separable?
+
When data is not linearly separable i.e it can't be divided by a straight line, SVM uses a technique called kernels to map the data into a higher-dimensional space where it becomes separable. This transformation helps SVM find a decision boundary even for non-linear data.
+
+
A kernel is a function that maps data points into a higher-dimensional space without explicitly computing the coordinates in that space. This allows SVM to work efficiently with non-linear data by implicitly performing the mapping. For example consider data points that are not linearly separable. By applying a kernel function SVM transforms the data points into a higher-dimensional space where they become linearly separable.
+
+
+
Linear Kernel: For linear separability.
+
Polynomial Kernel: Maps data into a polynomial space.
+
Radial Basis Function (RBF) Kernel: Transforms data into a space based on distances between data points.
+
+
+
In this case (referring to your 1D to 2D mapping image) the new variable $$y$$ is created as a function of distance from the origin.
+
+
Mathematical Computation of SVM
+
Consider a binary classification problem with two classes, labeled as $$+1$$ and $$-1$$. We have a training dataset consisting of input feature vectors $$X$$ and their corresponding class labels $$Y$$. The equation for the linear hyperplane can be written as:
+
+
$$ w^T x + b = 0 $$
+
+
Where:
+
+
$$w$$ is the normal vector to the hyperplane (the direction perpendicular to it).
+
$$b$$ is the offset or bias term representing the distance of the hyperplane from the origin along the normal vector $$w$$.
+
+
+
Distance from a Data Point to the Hyperplane
+
The distance between a data point $$x_i$$ and the decision boundary can be calculated as:
+
$$ d_i = \frac{|w^T x_i + b|}{\|w\|} $$
+
where $$\|w\|$$ represents the Euclidean norm of the weight vector $$w$$.
+
+
Linear SVM Classifier
+
The predicted label $$\hat{y}$$ of a data point is given by:
+
$$ \hat{y} = \begin{cases} +1 & : w^T x + b \geq 0 \\ -1 & : w^T x + b < 0 \end{cases} $$
+
+
Optimization Problem for SVM (Hard Margin)
+
For a linearly separable dataset the goal is to find the hyperplane that maximizes the margin between the two classes while ensuring that all data points are correctly classified. This leads to the following optimization problem:

$$ \underset{w,b}{\text{minimize}} \quad \frac{1}{2} \|w\|^2 $$

Subject to the constraint:

$$ y_i (w^T x_i + b) \geq 1 \quad \text{for } i = 1, 2, 3, \ldots, m $$
+
+
Where:
+
+
$$y_i$$ is the class label ($$+1$$ or $$-1$$) for each training instance.
+
$$x_i$$ is the feature vector for the $$i$$-th training instance.
+
$$m$$ is the total number of training instances.
+
+
+
The condition $$y_i (w^T x_i + b) \geq 1$$ ensures that each data point is correctly classified and lies outside the margin.
+
+
Soft Margin in Linear SVM Classifier
+
In the presence of outliers or non-separable data the SVM allows some misclassification by introducing slack variables $$\zeta_i$$. The optimization problem is modified as:
+
$$ \underset{w,b}{\text{minimize}} \quad \frac{1}{2} \|w\|^2 + C \sum_{i=1}^{m} \zeta_i $$
+
+
Subject to the constraints:
+
$$ y_i (w^T x_i + b) \geq 1 - \zeta_i \quad \text{and} \quad \zeta_i \geq 0 \quad \text{for } i = 1, 2, \ldots, m $$
+
+
Where:
+
+
$$C$$ is a regularization parameter that controls the trade-off between margin maximization and penalty for misclassifications.
+
$$\zeta_i$$ are slack variables that represent the degree of violation of the margin by each data point.
+
+
+
Dual Problem for SVM
+
The dual problem involves maximizing the Lagrange multipliers associated with the support vectors. This transformation allows solving the SVM optimization using kernel functions for non-linear classification. The dual objective function is:

$$ \underset{\alpha}{\text{maximize}} \quad \sum_{i=1}^{m} \alpha_i - \frac{1}{2} \sum_{i=1}^{m} \sum_{j=1}^{m} \alpha_i \alpha_j t_i t_j K(x_i, x_j) $$

Where:

$$\alpha_i$$ are the Lagrange multipliers associated with the $$i$$-th training sample.
+
$$t_i$$ is the class label for the $$i$$-th training sample.
+
$$K(x_i, x_j)$$ is the kernel function that computes the similarity between data points $$x_i$$ and $$x_j$$. The kernel allows SVM to handle non-linear classification problems by mapping data into a higher-dimensional space.
+
+
+
The dual formulation optimizes the Lagrange multipliers $$\alpha_i$$ and the support vectors are those training samples where $$\alpha_i > 0$$.
+
+
SVM Decision Boundary
+
Once the dual problem is solved, the decision boundary is given by:
+
$$ f(x) = \sum_{i=1}^{m} \alpha_i t_i K(x_i, x) + b $$
+
+
Where $$x$$ is the test data point, $$\alpha_i$$ are the learned Lagrange multipliers and $$b$$ is the bias term. Finally, the bias term $$b$$ is determined by the support vectors, which satisfy:
+
$$ t_i (w^T x_i + b) = 1 \implies b = t_i - w^T x_i $$
+
+
Where $$x_i$$ is any support vector.
+
+
This completes the mathematical framework of the Support Vector Machine algorithm which allows for both linear and non-linear classification using the dual problem and kernel trick.
+
+
Types of Support Vector Machine
+
+
Linear SVM: Linear SVMs use a linear decision boundary to separate the data points of different classes. When the data can be precisely linearly separated, linear SVMs are very suitable. This means that a single straight line (in 2D) or a hyperplane (in higher dimensions) can entirely divide the data points into their respective classes. A hyperplane that maximizes the margin between the classes is the decision boundary.
+
Non-Linear SVM: Non-Linear SVM can be used to classify data when it cannot be separated into two classes by a straight line (in the case of 2D). By using kernel functions, nonlinear SVMs can handle nonlinearly separable data. The original input data is transformed by these kernel functions into a higher-dimensional feature space where the data points can be linearly separated. A linear SVM is used to locate a nonlinear decision boundary in this modified space.
Support Vector Regression is an extension of Support Vector Machines (SVMs) for predicting continuous values. It focuses on fitting within an ε margin.
+
+
+
+
+
+
🧠 How SVR Thinks
+
Think of SVR like a wise teacher who says:
+
“I don’t care if a student (data point) is a little wrong — as long as the error is within a certain range, I’ll allow it.”
+
That range is the epsilon margin (ε). SVR fits a function that stays close to data but only penalizes points outside ε.
+
+
+
+
+
+
+
+
+
📉 Objective Function (with Penalty)
+
SVR minimizes this:
+
+
+
+
Objective Function
+
Scale formula
+
+
+
+
100%
+
+
+
+ $$ \frac{1}{2} \lVert w \rVert^2 + C \sum_{i=1}^{n} (\xi_i + \xi^*_i) $$
+
+
+
+
||w||²: Keeps the model simple (flat function).
+
C: Penalty constant for errors larger than ε.
+
ξ, ξ*: Slack variables for points outside the margin.
+
+
+
+
+
+
+
+
+
+
+
+
+
📊 SVR Visualization
+
Interactively change kernel, C, gamma, epsilon and dataset. Plotly chart below is mobile responsive.
+
+
+
MSE: N/A
+
R²: N/A
+
SVs: N/A
+
+
+
+
+
+
+
+
+
+
+
Upload Your Data
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
MSE: N/A
+
R²: N/A
+
SVs: N/A
+
+
+
+
+
+
+
+
+ Plot Controls
+
Tap to open a larger interactive plot for a better mobile experience.
+
+
+
+
+
+
+
+
+
+
+
+
+
Expanded SVR Visualization
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
📘 SVR Story: The Elastic Ribbon
+
Imagine stretching an elastic ribbon (your prediction line) between your data points. This ribbon is not too loose (which means overfitting) and not too tight (which would miss patterns). Instead, it hugs the data with a gentle margin of error (epsilon), and only stretches beyond that when it absolutely has to.
+
The wider the ribbon (large epsilon), the more forgiving SVR is. The stiffer the ribbon (small epsilon, large C), the stricter it becomes.
+
+
+
+
+
+
🔍 When to Use SVR?
+
+
When you want non-linear regression with smooth boundaries.
+
If you have outliers but don’t want them to dominate your model.
+
When you're okay with small prediction errors but want to penalize big ones.
+
+
+
+
+
+
+
🧩 Key Concepts Behind SVR
+
+
+
+
📌 Support Vector Machines (SVMs)
+
SVR is a type of Support Vector Machine (SVM), a powerful supervised learning algorithm that can be used for both classification and regression. In classification, SVM finds the best boundary (hyperplane) that separates different classes. In regression (SVR), it tries to find a line or surface that fits the data within a certain tolerance (ε-insensitive zone).
+
+
+
+
⚙️ Kernels
+
Kernels are functions that help SVR deal with non-linear data by transforming input data into higher dimensions:
+
+
Linear Kernel: For simple, straight-line relationships
+
Polynomial Kernel: For data with curved trends
+
RBF (Gaussian) Kernel: For more complex, wavy patterns
+
+
+
+
+
🎛️ Hyperparameters
+
SVR has a few important knobs (hyperparameters) you can tune:
+
+
C (Penalty): Controls how much error is tolerated. A high C forces the model to fit as closely as possible, possibly overfitting. A small C allows for more slack.
+
ε (Epsilon): Defines the “no penalty zone.” Errors within this range are ignored, helping generalize better.
+
Kernel: Decides the shape of the function used to fit the data.
+
+
+
+
+
📈 Model Evaluation
+
After training, we must evaluate how well the SVR model performs. Here’s a common process:
+
+
Split your data into training and test sets
+
Train your SVR model on the training set
+
Use the test set to check how well the model generalizes
+
Use metrics like: MSE (Mean Squared Error) and MAE (Mean Absolute Error)
+
+
+
+
+
Mastering these concepts helps you unlock the real power of SVR and make smarter, more flexible predictions!
+
+
+
+
+
+
Want to visualize SVR predictions? Try plotting the ε margin band and the support vectors — they're the true heroes behind the scenes.
+
+
+
+
+
+
+{% endblock %}
diff --git a/templates/svr_linear.png b/templates/svr_linear.png
new file mode 100644
index 0000000000000000000000000000000000000000..fc45bd001e812fd36710618f14aaf9bb8564893f
Binary files /dev/null and b/templates/svr_linear.png differ
diff --git a/templates/svr_poly.png b/templates/svr_poly.png
new file mode 100644
index 0000000000000000000000000000000000000000..e6f8e5140ce1740498889e4f78a0c5f0e2ed6f2f
Binary files /dev/null and b/templates/svr_poly.png differ
diff --git a/templates/svr_rbf.png b/templates/svr_rbf.png
new file mode 100644
index 0000000000000000000000000000000000000000..4e8c10970323503afff5961485f8947bc178d006
Binary files /dev/null and b/templates/svr_rbf.png differ
diff --git a/templates/t-SNE.html b/templates/t-SNE.html
new file mode 100644
index 0000000000000000000000000000000000000000..504d8919023d86f52919e67811e9555e45e57d81
--- /dev/null
+++ b/templates/t-SNE.html
@@ -0,0 +1,388 @@
+{% extends "layout.html" %}
+
+{% block content %}
+
+
+
+
+
+ Study Guide: t-SNE
+
+
+
+
+
+
+
+
+
🎨 Study Guide: t-SNE (t-Distributed Stochastic Neighbor Embedding)
Imagine you have a list of 1,000 guests for a party, and you know how well each guest gets along with every other guest (based on a complex 100-dimensional personality profile). Your job is to create a 2D seating chart for the party. A simple approach (like PCA) might place guests based on one or two general traits, but you want something better. You want to make sure that "close friends" from the original list end up sitting close together at the party. t-SNE is your expert party planning assistant. It meticulously arranges the seating chart so that the local "friendship circles" from your high-dimensional list are beautifully preserved in the 2D layout. Its main goal isn't to preserve the exact distances, but to keep the neighbors together.
+
+
t-SNE is a powerful, nonlinear dimensionality reduction technique primarily used for visualizing high-dimensional data in 2D or 3D. Unlike PCA, which focuses on preserving the overall "spread" (global variance) of the data, t-SNE focuses on preserving the local structure—it tries to keep points that are close neighbors in the original high-dimensional space as close neighbors in the low-dimensional map.
+
+
+
🔹 Intuition Behind t-SNE
+
The core idea of t-SNE is to convert the high-dimensional distances between data points into probabilities representing similarities. If two points are close, there's a high probability they are "neighbors." It then tries to create a low-dimensional map where these probabilities are as similar as possible.
+
+
Example: In a high-dimensional space of animal features, a "cat" and a "lynx" are very close. A "cat" and a "whale" are very far apart. t-SNE converts this to:
+ • High probability that a "cat" would pick a "lynx" as its neighbor.
+ • Very low probability that a "cat" would pick a "whale" as its neighbor.
+ It then arranges the points on a 2D map to reflect these same probabilities.
+
+
A key innovation is its use of a Student-t distribution in the low-dimensional map. This distribution has "heavier tails" than a normal (Gaussian) distribution, which allows it to place dissimilar points further apart, helping to prevent overcrowding in the center of the map and forming clearer, more distinct clusters.
+
+
+
🔹 Mathematical Foundation
+
+
The math behind t-SNE is like a two-part translation process:
+ 1. Describe Friendships in High Dimensions: First, it creates a "friendship score" ($$p_{ij}$$) for every pair of guests on the original list.
+ 2. Describe Friendships on the 2D Map: Then, it creates a similar friendship score ($$q_{ij}$$) for every pair on the 2D seating chart.
+ The Goal: It then shuffles the 2D seating chart around until the two sets of friendship scores are as similar as possible. The difference between the scores is measured by something called KL Divergence.
+
+
+
High-Dimensional Similarity ($$p_{j|i}$$): The conditional probability that point \(x_i\) would pick point \(x_j\) as its neighbor, based on a Gaussian distribution centered on \(x_i\).
+
This is then made symmetric: $$p_{ij} = \frac{p_{j|i} + p_{i|j}}{2n}$$
+
+
Low-Dimensional Similarity ($$q_{ij}$$): The probability of similarity between two points \(y_i\) and \(y_j\) on the new map, using a Student-t distribution.
+
Optimization Goal (KL Divergence): The algorithm's job is to minimize the difference between the two probability distributions, P (high-dim) and Q (low-dim).
+
Think of these as the settings for your party planner assistant:
+
+
Perplexity: This is the most important setting. It's related to the number of close neighbors each point considers. Analogy: It's like telling your planner, "For each guest, focus on their 20 closest friends when making the seating chart." Typical values are between 5 and 50.
+
+
Learning Rate: Controls how big of a step the algorithm takes when rearranging the seating chart in each iteration. Analogy: A high learning rate is like making drastic changes to the chart, while a low one is like making tiny, careful adjustments.
+
Number of Iterations: How many times the algorithm will try to improve the seating chart. Usually needs at least 500-1000 rounds.
+
+
+
+
🔹 Comparison with PCA
+
+
+
+
Feature
+
t-SNE
+
PCA
+
+
+
+
+
Method
+
Nonlinear
+
Linear
+
+
+
Goal
+
Preserves local structure (neighbors).
+
Preserves global variance (overall spread).
+
+
+
Use Case
+
Visualization and identifying clusters.
+
Preprocessing, noise reduction, and data compression.
+
+
+
Speed
+
Slow (O(n²)), not for huge datasets.
+
Very fast and scalable.
+
+
+
Output Meaning
+
The distances between clusters are often not meaningful. Focus on the clusters themselves.
+
The components have a clear mathematical meaning (directions of max variance).
+
+
+
+
+
🔹 Strengths & Weaknesses
+
Advantages:
+
+
✅ Excellent at Visualization: Creates beautiful, well-separated visualizations that make it easy to see clusters.
+
✅ Captures Nonlinear Structure: Can reveal complex patterns like spirals or nested groups that PCA would miss completely.
+
+
Disadvantages:
+
+
❌ Only for Visualization: You should not use the 2D output of t-SNE as input for another model. It's a visualization tool, not a preprocessing tool.
+
❌ Computationally Expensive: Can be very slow on datasets with more than a few thousand samples.
+
❌ Non-Deterministic: Running it twice might produce slightly different-looking (but usually structurally similar) plots. Always set a `random_state` for reproducible results.
+
❌ Cluster Sizes/Distances are Misleading: The size of a t-SNE cluster and the distance between two clusters doesn't have a direct, reliable meaning. Don't over-interpret them!
+
+
+
🔹 When to Use t-SNE
+
+
Exploratory Data Analysis (EDA): As a first step to "see" what your high-dimensional data looks like.
+
Visualizing Embeddings: Perfect for plotting high-dimensional outputs from models like Word2Vec, BERT, or Autoencoders to see if similar items are being grouped together.
+
Checking Class Separability: Visualizing your labeled data to see if the classes naturally form distinct clusters.
+
+
+
🔹 Python Implementation (Beginner Example with Digits Dataset)
+
+
In this example, we'll use the famous handwritten digits dataset. Each digit is a 64-dimensional vector (an 8x8 image). We'll use t-SNE to visualize all these digits in a 2D plot. We expect to see 10 distinct clusters, one for each digit from 0 to 9.
+
+
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.manifold import TSNE
+from sklearn.datasets import load_digits
+
+# --- 1. Load the Data ---
+# The digits dataset contains 8x8 pixel images of handwritten digits (0-9).
+# Each image is a "data point" with 64 features (8*8=64).
+digits = load_digits()
+X = digits.data
+y = digits.target # The true labels (0, 1, 2, ...)
+
+# --- 2. Create and Apply t-SNE ---
+# n_components=2: We want to create a 2D map.
+# perplexity=30: A good default value to start with.
+# random_state=42: Ensures we get the same plot every time we run the code.
+tsne = TSNE(n_components=2, perplexity=30, random_state=42)
+
+# Fit t-SNE to the data and transform it. This can take a moment.
+X_tsne = tsne.fit_transform(X)
+
+# --- 3. Visualize the Results ---
+# We can now plot our 64D dataset in 2D.
+# We'll color each point on the plot according to its true digit label.
+plt.figure(figsize=(10, 8))
+plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=y, cmap="jet")  # plt.cm.get_cmap was removed in Matplotlib 3.9
+plt.title('t-SNE Visualization of Handwritten Digits')
+plt.xlabel('t-SNE Component 1')
+plt.ylabel('t-SNE Component 2')
+plt.colorbar(label='Digit Label', ticks=range(10))
+plt.grid(True)
+plt.show()
+
+
+
+
🔹 Best Practices
+
+
Use PCA First: On very high-dimensional data (e.g., >100 features), it's a good practice to run PCA first to reduce the dimensions to a manageable number (like 30-50) before feeding it into t-SNE. This speeds up the process and can reduce noise.
+
Experiment with Perplexity: Try a few different values for perplexity (e.g., 5, 30, 50) to see if the cluster structures remain stable. If they do, it's a good sign that the clusters are real.
+
Fix `random_state`: For your results to be reproducible for papers or presentations, always set the `random_state` parameter.
+
Don't Over-Interpret: Remember that t-SNE is for visualization. The relative sizes of clusters and the distances between them are not reliable metrics. Focus on the grouping of neighbors.
Unsupervised learning is a type of machine learning that looks for previously undetected patterns in a dataset with no pre-existing labels and with a minimum of human supervision.