Upload 3 files

Browse files

Update 3 Files: Linear Regression , Logistic Regression, Support Vector Machine

Files changed (3) hide show

Linear_Regression.py +163 -0
Logistic_Regression.py +197 -0
Support_Vector_Machine.py +145 -0

Linear_Regression.py ADDED Viewed

	@@ -0,0 +1,163 @@

+# Simple linear regression
+import numpy as np
+# Data following y = 2x + 1
+x = np.array([1,2,3,4,5])
+y = np.array([3,5,7,9,11])
+# Slope w = sum of products of deviations / sum of squared x-deviations;
+# intercept b makes the line pass through the point of means
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
+b = y.mean() - w * x.mean()
+# Predict with the fitted line
+y_pred = w * x + b
+print(f"Phương trình: y= {w: .1f}x + {b: .1f}")
+print(f"Dự đoán:", y_pred)
+# Plot the linear regression to visualize it
+ # Simple linear regression + chart
+import numpy as np
+import matplotlib.pyplot as plt
+# Data following y = 2x + 1
+x = np.array([1,2,3,4,5])
+y = np.array([3,5,7,9,11])
+# Compute the regression coefficients (slope w, intercept b)
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean()) ** 2)
+b = y.mean() - w * x.mean()
+# Predict with the fitted line
+y_pred = w * x + b
+# Print the result
+print(f"Phương trình hồi quy: y={w: .1f}x + {b: .1f}")
+# Draw the chart: raw points as a scatter, fitted line on top
+plt.scatter(x,y, color='blue', label='Dữ liệu thật')
+plt.plot(x,y_pred, color='red', label='Đường hồi quy')
+plt.xlabel('x axis')
+plt.ylabel('y axis')
+plt.title('Hồi quy tuyến tính đơn giản + Trực quan hóa')
+plt.legend()
+plt.show()
+# Practice
+# 1. Linear data with different coefficients
+import numpy as np
+# Data following y = 4x - 3
+x = np.array([1,2,3,4,5,6,7])
+y = np.array([1,5,9,13,17,21,25]) # y = 4x - 3
+# Compute slope w and intercept b from the deviations about the means
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
+b = y.mean() - w * x.mean()
+# Predict with the fitted line
+y_predict = w * x + b
+print(f"Phương trình y= {w: .0f}x + {b: .0f}")
+print(f"Dự đoán:", y_predict)
+# Plot for visualization
+import matplotlib.pyplot as plt
+plt.scatter(x,y, color='black', label='Real Data')
+plt.plot(x, y_predict, color='red', label='Predicted Data')
+plt.xlabel('X - Axis')
+plt.ylabel('Y - Axis')
+plt.title('Linear Regression (4x - 3)')
+plt.legend()
+plt.show()
+# 2. Linear data with noise (more realistic)
+ # Formula: y = 3x + 2 + noise
+import numpy as np
+import matplotlib.pyplot as plt
+# Data following 3x + 2 + noise; the seed is fixed so the noise is repeatable
+np.random.seed(0)
+x = np.arange(1,11)
+y = 3 * x + 2 + np.random.randn(10) * 2 # add random noise (standard normal scaled by 2)
+# Compute slope w and intercept b from the deviations about the means
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
+b = y.mean() - w * x.mean()
+# Predict with the fitted line
+y_predict = w * x + b
+print(f"Phương trình + nhiễu: y = {w: .1f}x + {b: .1f}")
+print(f"Dự đoán chính xác:", y_predict)
+# Plot the noisy data together with the fitted line
+plt.scatter(x,y, color='blue', label='Real Data + Noise')
+plt.plot(x,y_predict, color='red', label='Predicted Data + Noise')
+plt.xlabel('X - Axis')
+plt.ylabel('Y - Axis')
+plt.title('Linear Regression AND Noise')
+plt.legend()
+plt.show()
+# 3. Decreasing linear relationship (negative slope)
+ # Formula: y = -2x + 10
+import numpy as np
+import matplotlib.pyplot as plt
+# Data with a negative slope
+x = np.array([1,2,3,4,5,6])
+y = np.array([8,6,4,2,0,-2])
+# Compute slope w and intercept b from the deviations about the means
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
+b = y.mean() - w * x.mean()
+# Predict with the fitted line
+y_pred = w * x + b
+print(f"Phương trình tuyến tính nghịch: y = {w: .0f}x + {b: .0f}")
+print(f"Dự đoán chính xác:", y_pred)
+# Plot the decreasing data together with the fitted line
+plt.scatter(x,y, color='red', label='Real Data')
+plt.plot(x,y_pred, color='blue', label='Predicted Data')
+plt.xlabel('X - Axis')
+plt.ylabel('Y - Axis')
+plt.title('Linear Regression AND Reversed')
+plt.legend()
+plt.show()
+# 4. Không tuyến tính (để xem hồi quy tuyến tính kém thế nào)
+ # Công thức: y =  x**2 + noise
+import numpy as np
+import matplotlib.pyplot as plt
+# Dữ liệu không tuyến tính
+np.random.seed(1)
+x = np.linspace(-5,5,20) # Giải thích: .linspace(start, stop, num) với num=number là số giá trị cách đều nhau
+y = x**2 + np.random.randn(20)*3  # Sinh ra 20 số ngẫu nhiên tuân theo phân phối chuẩn (Gaussian) → trung bình = 0, độ lệch chuẩn = 1; Nhân với 3 để phóng đại độ nhiễu (noise) → sai số ngẫu nhiên có độ lệch chuẩn ≈ 3 → càng nhân lớn, dữ liệu càng "rải" quanh đường parabol. y là giá trị theo hàm bậc hai, có thêm nhiễu ngẫu nhiên để giống dữ liệu thật. Đây là mối quan hệ phi tuyến tính giữa x và y.
+# Tính hệ số w và b theo công thức thống kê
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
+b = y.mean() - w * x.mean()
+# Dự đoán chính xác
+y_pred = w * x + b
+print(f"Phương trình không tuyến tính: y={w: 1f}x + {b: .1f}")
+print(f"Dự đoán chính xác:", y_pred)
+# Vẽ đồ thị để trực quan hóa không tuyến tính
+plt.scatter(x,y, color='red', label='Real Data with no linear')
+plt.plot(x,y_pred, color='blue', label='Predicted Data with no linear')
+plt.xlabel('X - Axis')
+plt.ylabel('Y - Axis')
+plt.title(' No Linear Regression')
+plt.legend()
+plt.show()
+# 5. Dữ liệu ngẫu nhiên để luyện hiểu độ phù hợp (low R**2)
+import numpy as np
+import matplotlib.pyplot as plt
+# Dữ liệu ngẫu nhiên
+np.random.seed(2)
+x = np.random.randint(1,100,20) # Giải thích: np.random.randint(low,high,size) với size là tổng số ptử muốn tạo ra
+y = np.random.randint(1,100,20)
+# Tính hệ số w và b theo công thức thống kê
+w = np.sum((x - x.mean()) * (y - y.mean())) / np.sum((x - x.mean())**2)
+b = y.mean() - w * x.mean()
+# Dự đoán chính xác
+y_pred = w * x + b
+print(f"Phương trình tuyến tính + Random Data: y={w: 1f}x + {b: .1f}")
+print(f"Dự đoán chính xác:", y_pred)
+# Vẽ đồ thị để trực quan hóa không tuyến tính
+plt.scatter(x,y, color='red', label='Real Data + Random')
+plt.plot(x,y_pred, color='blue', label='Predicted Data + Random')
+plt.xlabel('X - Axis')
+plt.ylabel('Y - Axis')
+plt.title('Linear Regression With Random Data')
+plt.legend()
+plt.show()

Logistic_Regression.py ADDED Viewed

	@@ -0,0 +1,197 @@

+'''
+Logistic Regression là một mô hình học có giám sát (supervised learning) dùng cho bài toán phân loại (classification).
+Nó dựa trên Linear Regression, nhưng thay vì dự đoán giá trị liên tục → nó biến đầu ra thành xác suất (0–1) bằng hàm sigmoid.
+🧠 2. CÔNG THỨC TOÁN HỌC
+2.1. Linear part
+Trước hết, ta tính tổng có trọng số:
+z = w0 + w1x1 + w2x2 + w3x3 + ... + wnxn
+2.2 Áp dụng hàm Sigmoid
+y^ = σ(z) = 1 / (1 + e^(-z))
+=> Kết quả y^ nằm trong khoảng (0,1), được hiểu là xác suất đối tượng thuộc lớp 1.
+📊 3. QUYẾT ĐỊNH PHÂN LOẠI
+Nếu:
+y^ >= 0.5 ==> dự đoán 1
+y^ < 0.5 ==> dự đoán 0
+⚙️ 4. HÀM MẤT MÁT (LOSS FUNCTION)
+Ta dùng Binary Cross-Entropy Loss (Log Loss):
+J(w) = -(1/m) * ∑_{i=1..m} [ y_i * log(y^_i) + (1 - y_i) * log(1 - y^_i) ]   (y_i: nhãn thật, y^_i: xác suất dự đoán của mẫu thứ i)
+Mục tiêu: tối thiểu hóa J(w) → tìm bộ trọng số w tốt nhất.
+5. QUÁ TRÌNH HỌC (TRAINING)
+Dùng Gradient Descent để cập nhật trọng số:
+w := w - α * ∂J/∂w  # Trong đó: α là learning rate.
+'''
+# 💻 6. SAMPLE PYTHON CODE
+  # Libraries used for Logistic Regression
+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+from sklearn.model_selection import train_test_split
+from sklearn.linear_model import LogisticRegression
+from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
+# Example data: hours studied vs. whether the exam was passed (1) or not (0)
+data = pd.DataFrame({
+    'hours_studied': [1,2,3,4,5,6,7,8,9,10],
+    'pass_exam': [0,0,0,0,0,1,1,1,1,1]
+})
+# X stays a one-column DataFrame (double brackets); y is the label Series
+X = data[['hours_studied']]
+y = data['pass_exam']
+# Split the data: 20% of the 10 rows are held out for testing
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+# Train the model
+model = LogisticRegression()
+model.fit(X_train, y_train)
+# Predict on the held-out rows and report the metrics
+y_pred = model.predict(X_test)
+print("Accuracy:", accuracy_score(y_test, y_pred))
+print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
+print("Classification Report:\n", classification_report(y_test, y_pred))
+# 📈 Ứng dụng thực tế của Logistic Regression
+'''
+Lĩnh vực              Ứng dụng
+ y học                Dự đoán bệnh(có/không)
+ Email                Phân loại spam/không spam
+ Marketing            Khách hàng mua/không mua
+ Tài chính            Dự đoán vỡ nợ/không vỡ nợ
+ Nhận dạng            Có khuôn mặt/không có khuôn mặt
+'''
+# 🧠 Visualize the sigmoid function on 100 evenly spaced points in [-10, 10]
+z = np.linspace(-10, 10, 100)
+sigmoid = 1 / (1 + np.exp(-z))
+plt.plot(z, sigmoid)
+plt.title('Sigmoid Function')
+plt.xlabel('z')
+plt.ylabel('σ(z)')
+plt.grid()
+plt.show()
+''' → Đồ thị cong từ 0 → 1, giúp chuyển hóa giá trị tuyến tính thành xác suất. '''
+'''
+🧩 9. ƯU ĐIỂM & NHƯỢC ĐIỂM
+✅ Ưu điểm
+Đơn giản, dễ huấn luyện
+Dễ hiểu, giải thích rõ ràng
+Hiệu quả cho dữ liệu tuyến tính
+⚠️ Nhược điểm
+Kém hiệu quả với dữ liệu phi tuyến
+Không xử lý tốt nhiều lớp phức tạp (multi-class → phải dùng One-vs-Rest)
+Giả định quan hệ tuyến tính giữa biến độc lập và log-odds
+🧪 10. MỞ RỘNG
+Multinomial Logistic Regression → cho phân loại nhiều lớp
+Regularization (L1, L2) → chống overfitting
+Feature scaling → nên chuẩn hóa dữ liệu trước khi huấn luyện
+'''
+# 🚀 Ứng dụng Nhỏ- Dự Đoán xác xuất Thi Đậu
+hours = np.array([[7]])
+pred = model.predict_proba(hours)
+print(f"Xác suất đậu: {pred[0][1]*100:.2f}%")
+# 🧩 12. SO SÁNH VỚI LINEAR REGRESSION
+'''
+Đặc điểm	   Linear Regression	    Logistic Regression
+ Đầu ra	        Giá trị liên tục	        Xác suất (0–1)
+Bài toán	    Dự đoán (Regression)	Phân loại (Classification)
+Hàm kích hoạt	   Không có	                    Sigmoid
+Hàm mất mát	         MSE	                    Log Loss
+'''
+# LOGISTIC REGRESSION (using only basic libraries)
+import numpy as np
+import matplotlib.pyplot as plt
+# Set up styling for Matplotlib
+plt.style.use('ggplot')
+# Create dataset: each row is [feature value, class label]
+dataset = np.array([
+    [-10, 0],
+    [-5, 0],
+    [-7, 0],
+    [0, 0],
+    [-2, 0],
+    [5, 1],
+    [7, 1],
+    [6, 1],
+    [10, 1],
+    [15, 1],
+    [9, 1]
+])
+# Split by position: the first five rows are class 0, the remaining rows class 1
+negative_class = dataset[:5]
+positive_class = dataset[5:]
+# Draw the negative class first, then the positive class
+plt.scatter(negative_class[:, 0], negative_class[:, 1], c='y', label='Class 0')
+plt.scatter(positive_class[:, 0], positive_class[:, 1], c='g', label='Class 1')
+plt.legend()
+plt.show()
+# -------------------- Logistic Regression Functions --------------------
+def get_prediction(m, b, x):
+    """
+    Get the predictions: y_hat using the input
+    """
+    return 1 / (1 + np.exp(-(m * x + b)))  # ✅ Sửa công thức sigmoid
+def get_cost(y, y_hat):
+    """
+    Get the value of the cost function
+    """
+    k = y.shape[0]
+    return (-1 / k) * np.sum(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))
+def get_gradient(m, b, x, y, y_hat):
+    """
+    Return the gradient of the loss function w.r.t m and b
+    """
+    k = y.shape[0]
+    dm = (1 / k) * np.sum((y_hat - y) * x)
+    db = (1 / k) * np.sum(y_hat - y)
+    return dm, db
+def get_accuracy(y, y_hat):
+    return ((y_hat >= 0.5).astype(int) == y).sum() / y.shape[0]  # ✅ Sửa chia đúng mẫu
+# -------------------- Gradient Descent --------------------
+# Starting slope and intercept, number of update steps, and learning rate
+m = 1.0
+b = 10.0
+iterations = 200
+lr = 0.03
+# Column 0 of the dataset is the feature, column 1 the class label
+x = dataset[:, 0]
+y = dataset[:, 1]
+# Cost recorded at every iteration, plotted after the loop
+costs = []
+for it in range(iterations):
+    # Evaluate the current parameters before updating them
+    y_hat = get_prediction(m, b, x)
+    cost = get_cost(y, y_hat)
+    accuracy = get_accuracy(y, y_hat)
+    print(f"Iteration {it} - Cost: {cost:.4f}, Accuracy: {accuracy:.4f}")
+    dm, db = get_gradient(m, b, x, y, y_hat)  # ✅ fixed: dn → db
+    # Step against the gradient
+    m -= lr * dm
+    b -= lr * db
+    costs.append(cost)
+# Plot cost over iterations
+plt.plot(costs)
+plt.xlabel("Iteration")
+plt.ylabel("Cost")
+plt.title("Cost Function over Time")
+plt.show()

Support_Vector_Machine.py ADDED Viewed

	@@ -0,0 +1,145 @@

+'''
+🧠 1. SVM là gì?
+SVM là một thuật toán phân loại (classification) trong Machine Learning.
+Nó tìm ra một đường ranh giới (hyperplane) để phân tách các nhóm dữ liệu khác nhau.
+Ví dụ:
+Nếu dữ liệu có 2 lớp (A và B), SVM tìm đường thẳng (2D) hoặc mặt phẳng (3D) sao cho:
+Hai nhóm tách nhau rõ nhất.
+Khoảng cách (margin) từ ranh giới đến điểm gần nhất của mỗi nhóm là lớn nhất.
+'''
+'''
+🧩 2. Ý tưởng trực quan
+Hãy tưởng tượng:
+Bạn có hai nhóm điểm đỏ 🔴 và xanh 🔵.
+Có nhiều cách vẽ đường thẳng chia 2 nhóm này.
+SVM chọn đường chia có khoảng cách xa nhất với cả hai nhóm.
+👉 Vì khoảng cách lớn → mô hình tổng quát tốt → ít lỗi khi gặp dữ liệu mới.
+'''
+'''
+⚖️ 3. Khi dữ liệu bị lẫn (không tách được thẳng)
+Nếu các điểm bị xen kẽ (không thể chia bằng đường thẳng),
+SVM dùng kernel trick để:
+“Nâng dữ liệu lên không gian cao hơn”
+Ở đó, có thể tìm được đường chia tuyến tính dễ hơn.
+'''
+'''
+🧮 4. Tham số quan trọng
+| Tham số    | Ý nghĩa                               | Cách chọn                                      |
+| ---------- | ------------------------------------- | ---------------------------------------------- |
+|   C        | Cân bằng giữa “ít lỗi” và “biên rộng” | Bắt đầu với 1, rồi thử 0.1 → 10                |
+|   kernel   | Kiểu ranh giới                        | `'linear'` (đường thẳng) hoặc `'rbf'` ( đường cong)   |
+|   gamma    | Mức “uốn cong” khi dùng RBF kernel    | Giá trị nhỏ = biên mềm, lớn = biên sát dữ liệu |
+'''
+# 🧰 5. Cách dùng trong Python
+from sklearn.svm import SVC
+from sklearn.preprocessing import StandardScaler
+from sklearn.pipeline import make_pipeline
+from sklearn.model_selection import train_test_split
+from sklearn.datasets import load_iris
+# Dữ liệu mẫu
+X, y = load_iris(return_X_y=True)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
+# Tạo mô hình SVM
+model = make_pipeline(StandardScaler(), SVC(kernel='rbf', C=1, gamma='scale')) # thử đổi kernel bằng linear, poly, sigmoid và Tinh chỉnh C và gamma: SVC(C=10, gamma=0.1)
+# Huấn luyện
+model.fit(X_train, y_train)
+# Đánh giá
+print("Độ chính xác:", model.score(X_test, y_test))
+'''
+✅ Giải thích:
+StandardScaler() → chuẩn hóa dữ liệu (rất quan trọng)
+SVC() → SVM classifier
+kernel='rbf' → chọn ranh giới cong
+C và gamma → điều chỉnh độ khớp của mô hình
+'''
+'''
+📈 6. Khi nào nên dùng SVM?
+✅ Khi:
+Dữ liệu vừa phải (không quá lớn)
+Rõ ràng giữa các lớp
+Muốn mô hình mạnh, chính xác
+❌ Tránh:
+Dữ liệu cực lớn (hàng trăm ngàn mẫu)
+Cần mô hình dễ giải thích
+'''
+'''
+7. Ứng dụng thực tế
+    Email spam / không spam
+    Ảnh có mèo / không mèo
+    Dự đoán bệnh / không bệnh
+    Nhận diện khuôn mặt (Face Recognition)
+    Nhận diện vật thể (Object Classification)
+    Nhận dạng chữ viết tay (Handwritten Digit Recognition)
+    Phân loại văn bản (Text Classification)
+    Nhận dạng cảm xúc (Sentiment Analysis)
+    Phân loại tế bào ung thư (Cancer Cell Classification)
+    Dự đoán hoạt động gen hoặc protein
+    Phát hiện gian lận thẻ tín dụng (Fraud Detection)
+    Dự đoán rủi ro tín dụng (Credit Risk Scoring)
+    Phân loại tín hiệu thị trường chứng khoán (Up / Down)
+    Nhận dạng vật cản / đường đi cho robot tự hành
+    Dự đoán sự cố thiết bị (predictive maintenance)
+    Kiểm tra chất lượng sản phẩm (Quality Control)
+    Phân tích quy trình (Process Optimization)
+'''
+# Mini project on a real dataset stored in data.csv
+import pandas as pd
+import numpy as np
+import matplotlib.pyplot as plt
+from sklearn.svm import SVC
+from sklearn.preprocessing import StandardScaler
+from sklearn.model_selection import train_test_split
+from sklearn.pipeline import make_pipeline
+# 1. Read the data
+# NOTE(review): absolute, machine-specific Windows path - the script only runs
+# where this exact file exists; consider a path relative to the project.
+data = pd.read_csv(r"C:\Users\Microsoft\Machine-Learning-Tutorial\data.csv")
+# 2. Separate features and label
+X = data.iloc[:, :-1].values  # every column except the last
+y = data.iloc[:, -1].values   # the last column is the label
+# (The 2D plot below is only possible with exactly 2 feature columns)
+if X.shape[1] != 2:
+    raise ValueError("⚠️ Dữ liệu phải có đúng 2 cột đặc trưng để vẽ biểu đồ 2D!")
+# 3. Train/test split (20% held out, fixed seed)
+X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
+# 4. Build the SVM model (RBF kernel) behind a standardization step
+model = make_pipeline(StandardScaler(), SVC(kernel="rbf", C=1, gamma=0.5))
+model.fit(X_train, y_train)
+# 5. Accuracy on the test split
+acc = model.score(X_test, y_test)
+print(f"✅ Độ chính xác trên tập test: {acc:.2f}")
+# 6. Build a 300 x 300 grid covering the data range (padded by 1) to draw the boundary
+x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
+y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
+xx, yy = np.meshgrid(np.linspace(x_min, x_max, 300),
+                     np.linspace(y_min, y_max, 300))
+# Predict the class of every grid point, then restore the grid shape
+Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
+Z = Z.reshape(xx.shape)
+# 7. Draw the chart: filled decision regions with all data points on top
+# NOTE(review): contourf and c=y presumably need numeric labels - confirm data.csv's label column
+plt.figure(figsize=(8,6))
+plt.contourf(xx, yy, Z, alpha=0.3, cmap=plt.cm.coolwarm)
+plt.scatter(X[:, 0], X[:, 1], c=y, s=40, cmap=plt.cm.coolwarm, edgecolors='k')
+plt.title(f"SVM Decision Boundary (accuracy = {acc:.2f})")
+plt.xlabel("Feature 1")
+plt.ylabel("Feature 2")
+plt.show()