# Hồi quy tuyến tính đơn giản
import numpy as np

# Sample points that lie exactly on the line y = 2x + 1
x = np.array([1, 2, 3, 4, 5])
y = np.array([3, 5, 7, 9, 11])

# Closed-form least-squares fit:
#   slope w     = sum((x - mean(x)) * (y - mean(y))) / sum((x - mean(x)) ** 2)
#   intercept b = mean(y) - w * mean(x)
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
y_dev = y - y_mean
w = (x_dev * y_dev).sum() / (x_dev ** 2).sum()
b = y_mean - w * x_mean

# Values of the fitted line at the sample points
y_pred = w * x + b

print(f"Phương trình: y= {w: .1f}x + {b: .1f}")
print(f"Dự đoán:", y_pred)


# Plot the linear regression to visualize it
# Simple linear regression + chart
import numpy as np
import matplotlib.pyplot as plt

# Sample points that lie exactly on the line y = 2x + 1
x = np.array([1, 2, 3, 4, 5])
y = np.array([3, 5, 7, 9, 11])

# Regression coefficients from the closed-form least-squares formulas
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
y_dev = y - y_mean
w = (x_dev * y_dev).sum() / (x_dev ** 2).sum()
b = y_mean - w * x_mean

# Values of the fitted line at the sample points
y_pred = w * x + b

# Report the fitted equation
print(f"Phương trình hồi quy: y={w: .1f}x + {b: .1f}")

# Draw the observed points and the fitted line on the current axes
ax = plt.gca()
ax.scatter(x, y, color='blue', label='Dữ liệu thật')
ax.plot(x, y_pred, color='red', label='Đường hồi quy')
ax.set_xlabel('x axis')
ax.set_ylabel('y axis')
ax.set_title('Hồi quy tuyến tính đơn giản + Trực quan hóa')
ax.legend()
plt.show()

# Luyện tập
# 1. Tuyến tính nhưng đổi hệ số khác
import numpy as np
# Sample points that lie exactly on the line y = 4x - 3
x = np.array([1, 2, 3, 4, 5, 6, 7])
y = np.array([1, 5, 9, 13, 17, 21, 25])

# Slope w and intercept b from the closed-form least-squares formulas
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
y_dev = y - y_mean
w = (x_dev * y_dev).sum() / (x_dev ** 2).sum()
b = y_mean - w * x_mean

# Values of the fitted line at the sample points
y_predict = w * x + b

print(f"Phương trình y= {w: .0f}x + {b: .0f}")
print(f"Dự đoán:", y_predict)
# Vẽ đồ thị để trực quan hóa
import matplotlib.pyplot as plt
# Draw the observed points and the fitted line on the current axes
ax = plt.gca()
ax.scatter(x, y, color='black', label='Real Data')
ax.plot(x, y_predict, color='red', label='Predicted Data')
ax.set_xlabel('X - Axis')
ax.set_ylabel('Y - Axis')
ax.set_title('Linear Regression (4x - 3)')
ax.legend()
plt.show()

# 2. Linear with noise (more realistic)
# Formula: y = 3x + 2 + noise
import numpy as np
import matplotlib.pyplot as plt
# Data on the line y = 3x + 2, perturbed by random noise
np.random.seed(0)  # fixed seed so the noise is the same on every run
x = np.arange(1, 11)
noise = np.random.randn(10) * 2  # random noise, scaled by 2
y = 3 * x + 2 + noise

# Slope w and intercept b from the closed-form least-squares formulas
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
y_dev = y - y_mean
w = (x_dev * y_dev).sum() / (x_dev ** 2).sum()
b = y_mean - w * x_mean

# Values of the fitted line at the sample points
y_predict = w * x + b

print(f"Phương trình + nhiễu: y = {w: .1f}x + {b: .1f}")
print(f"Dự đoán chính xác:", y_predict)
# Draw the noisy observations and the fitted line on the current axes
ax = plt.gca()
ax.scatter(x, y, color='blue', label='Real Data + Noise')
ax.plot(x, y_predict, color='red', label='Predicted Data + Noise')
ax.set_xlabel('X - Axis')
ax.set_ylabel('Y - Axis')
ax.set_title('Linear Regression AND Noise')
ax.legend()
plt.show()

# 3. Inverse linear relationship (negative coefficient)
# Formula: y = -2x + 10
import numpy as np
import matplotlib.pyplot as plt
# Decreasing linear data: points that lie exactly on y = -2x + 10
x = np.array([1, 2, 3, 4, 5, 6])
y = np.array([8, 6, 4, 2, 0, -2])

# Slope w and intercept b from the closed-form least-squares formulas
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
y_dev = y - y_mean
w = (x_dev * y_dev).sum() / (x_dev ** 2).sum()
b = y_mean - w * x_mean

# Values of the fitted line at the sample points
y_pred = w * x + b

print(f"Phương trình tuyến tính nghịch: y = {w: .0f}x + {b: .0f}")
print(f"Dự đoán chính xác:", y_pred)
# Draw the decreasing data and the fitted line on the current axes
ax = plt.gca()
ax.scatter(x, y, color='red', label='Real Data')
ax.plot(x, y_pred, color='blue', label='Predicted Data')
ax.set_xlabel('X - Axis')
ax.set_ylabel('Y - Axis')
ax.set_title('Linear Regression AND Reversed')
ax.legend()
plt.show()

# 4. Non-linear data (to see how poorly linear regression fits)
# Formula: y = x**2 + noise
import numpy as np
import matplotlib.pyplot as plt
# Non-linear data: a parabola y = x**2 plus random noise
np.random.seed(1)  # fixed seed so the noise is the same on every run

# np.linspace(start, stop, num) returns `num` evenly spaced values
x = np.linspace(-5, 5, 20)

# np.random.randn(20) draws 20 samples from the standard normal distribution
# (mean 0, standard deviation 1). Multiplying by 3 amplifies the noise to a
# standard deviation of about 3; the larger the factor, the more the points
# scatter around the parabola. The relationship between x and y is therefore
# non-linear, with noise added to resemble real data.
y = x**2 + np.random.randn(20) * 3

# Slope w and intercept b from the closed-form least-squares formulas
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
w = np.sum(x_dev * (y - y_mean)) / np.sum(x_dev ** 2)
b = y_mean - w * x_mean

# Values of the straight-line fit at the sample points
y_pred = w * x + b

# Both coefficients are shown with one decimal place. The spec must be ".1f":
# " 1f" would mean "minimum width 1" and fall back to six decimals.
equation = f"Phương trình không tuyến tính: y={w: .1f}x + {b: .1f}"
print(equation)
print("Dự đoán chính xác:", y_pred)
# Draw the non-linear data and the straight-line fit on the current axes
ax = plt.gca()
ax.scatter(x, y, color='red', label='Real Data with no linear')
ax.plot(x, y_pred, color='blue', label='Predicted Data with no linear')
ax.set_xlabel('X - Axis')
ax.set_ylabel('Y - Axis')
ax.set_title(' No Linear Regression')
ax.legend()
plt.show()

# 5. Dữ liệu ngẫu nhiên để luyện hiểu độ phù hợp (low R**2)
import numpy as np
import matplotlib.pyplot as plt
# Random data: x and y are drawn independently of each other
np.random.seed(2)  # fixed seed so the data is the same on every run

# np.random.randint(low, high, size): `size` is the number of values to create
x = np.random.randint(1, 100, 20)
y = np.random.randint(1, 100, 20)

# Slope w and intercept b from the closed-form least-squares formulas
x_mean, y_mean = x.mean(), y.mean()
x_dev = x - x_mean
w = np.sum(x_dev * (y - y_mean)) / np.sum(x_dev ** 2)
b = y_mean - w * x_mean

# Values of the fitted line at the sample points
y_pred = w * x + b

# Both coefficients are shown with one decimal place. The spec must be ".1f":
# " 1f" would mean "minimum width 1" and fall back to six decimals.
equation = f"Phương trình tuyến tính + Random Data: y={w: .1f}x + {b: .1f}"
print(equation)
print("Dự đoán chính xác:", y_pred)
# Draw the random data and the fitted line on the current axes
ax = plt.gca()
ax.scatter(x, y, color='red', label='Real Data + Random')
ax.plot(x, y_pred, color='blue', label='Predicted Data + Random')
ax.set_xlabel('X - Axis')
ax.set_ylabel('Y - Axis')
ax.set_title('Linear Regression With Random Data')
ax.legend()
plt.show()