ml_courses / code /ps2_submission.py
johnnydevriese's picture
Upload 80 files
8938d1b verified
import numpy as np
import util
# Character to replace with sub-problem letter in plot_path/save_path
WILDCARD = 'X'
def main_LogReg(train_path, valid_path, save_path):
"""Problem (1b): Logistic regression with Newton's Method.
Args:
train_path: Path to CSV file containing dataset for training.
valid_path: Path to CSV file containing dataset for validation.
save_path: Path to save predicted probabilities using np.savetxt().
"""
# Load dataset
x_train, y_train = util.load_dataset(train_path, add_intercept=True)
# Train a logistic regression classifier
clf = LogisticRegression()
clf.fit(x_train, y_train)
# Plot decision boundary on top of validation set set
x_eval, y_eval = util.load_dataset(valid_path, add_intercept=True)
plot_path = save_path.replace('.txt', '.png')
util.plot(x_eval, y_eval, clf.theta, plot_path)
# Use np.savetxt to save predictions on eval set to save_path
p_eval = clf.predict(x_eval)
yhat = p_eval > 0.5
print('LR Accuracy: %.2f' % np.mean( (yhat == 1) == (y_eval == 1)))
np.savetxt(save_path, p_eval)
class LogisticRegression:
"""Logistic regression with Newton's Method as the solver.
Example usage:
> clf = LogisticRegression()
> clf.fit(x_train, y_train)
> clf.predict(x_eval)
"""
def __init__(self, step_size=0.01, max_iter=1000000, eps=1e-5,
theta_0=None, verbose=True):
"""
Args:
step_size: Step size for iterative solvers only.
max_iter: Maximum number of iterations for the solver.
eps: Threshold for determining convergence.
theta_0: Initial guess for theta. If None, use the zero vector.
verbose: Print loss values during training.
"""
self.theta = theta_0
self.step_size = step_size
self.max_iter = max_iter
self.eps = eps
self.verbose = verbose
def gradient(self,x, y):
n_examples, dim = x.shape
logits = self.sigmoid(x)
# grad of logit function
gradient = 1 / n_examples * x.T @ (logits - y)
return gradient
def hessian(self, x, y):
n_examples, dim = x.shape
# sigmoid = lambda x: 1 / 1 + np.exp(- x @ self.theta)
logits = self.sigmoid(x)
# probs = self._sigmoid(x.dot(self.theta))
# diag = np.diag(logits * (1. - logits))
# hess = 1 / n_examples * x.T.dot(diag).dot(x)
# return hess
# main diag is just second derivative wrt to itself. e.g. f_xx and f_yy
main_diagonal = np.diag(logits * (1 - logits))
hessian = 1 / n_examples * x.T @ main_diagonal @ x
return hessian
def loss(self, x, y):
# https://developers.google.com/machine-learning/crash-course/logistic-regression/model-training
# also in p.16 in Supervised Learning notes
n_examples, dim = x.shape
# sigmoid = lambda x: 1 / 1 + np.exp(- x @ self.theta)
logits = self.sigmoid(x)
loss = -np.mean(y * np.log(logits) + (1 + y) * np.log(1 - logits))
return loss
def sigmoid(self, x):
# return 1 / (1 + np.exp(-x.dot(self.theta)))
return 1 / (1 + np.exp(- x @ self.theta))
def fit(self, x, y):
"""Run Newton's Method to minimize J(theta) for logistic regression.
Args:
x: Training example inputs. Shape (n_examples, dim).
y: Training example labels. Shape (n_examples,).
"""
# *** START CODE HERE ***
# NOTE: look at p.18 in notes
# we need to calculate theta with Newton and then maximize the
# logistic regression log likelihood function l(theta)
# prev_theta = theta # store for comparison
# m = rows = number of examples
# n = columns = number of features
# breakpoint()
# NOTE: it looks like they prepend the '1' at the beginning of the x array!
n_examples, dim = x.shape
if self.theta is None:
self.theta = np.zeros(dim)
# just need to init for first time.
# theta_prev = np.ones(dim)
# # print(np.sum(np.abs(theta_prev - self.theta)) < self.eps)
# current_iteration = 0
# theta_difference = np.sum(np.abs(theta_prev - self.theta))
# while theta_difference > self.eps and current_iteration < self.max_iter:
for i in range(self.max_iter):
# current_iteration += 1
gradient = self.gradient(x, y)
hessian = self.hessian(x, y)
# theta_prev = self.step(gradient, hessian)
theta_prev = np.copy(self.theta)
# theta_prev = self.step()
# theta_prev = self.theta
# self.theta = self.theta - self.step_size * np.linalg.inv(hessian) @ gradient
self.theta -= self.step_size * np.linalg.inv(hessian).dot(gradient)
if np.sum(np.abs(theta_prev - self.theta)) < self.eps:
break
# *** END CODE HERE ***
def predict(self, x):
"""Return predicted probabilities given new inputs x.
Args:
x: Inputs of shape (n_examples, dim).
Returns:
Outputs of shape (n_examples,).
"""
# *** START CODE HERE ***
# breakpoint()
# sigmoid = lambda x: 1 / 1 + np.exp(- x @ self.theta)
prediction = self.sigmoid(x)
return prediction
# *** END CODE HERE ***
def main_GDA(train_path, valid_path, save_path):
"""Problem (1e): Gaussian discriminant analysis (GDA)
Args:
train_path: Path to CSV file containing dataset for training.
valid_path: Path to CSV file containing dataset for validation.
save_path: Path to save predicted probabilities using np.savetxt().
"""
# Load dataset
x_train, y_train = util.load_dataset(train_path, add_intercept=False)
# Train a GDA classifier
clf = GDA()
clf.fit(x_train, y_train)
# Plot decision boundary on validation set
x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)
plot_path = save_path.replace('.txt', '.png')
util.plot(x_eval, y_eval, clf.theta, plot_path)
x_eval = util.add_intercept(x_eval)
# Use np.savetxt to save outputs from validation set to save_path
p_eval = clf.predict(x_eval)
yhat = p_eval > 0.5
print('GDA Accuracy: %.2f' % np.mean( (yhat == 1) == (y_eval == 1)))
np.savetxt(save_path, p_eval)
class GDA:
"""Gaussian Discriminant Analysis.
Example usage:
> clf = GDA()
> clf.fit(x_train, y_train)
> clf.predict(x_eval)
"""
def __init__(self, step_size=0.01, max_iter=10000, eps=1e-5,
theta_0=None, verbose=True):
"""
Args:
step_size: Step size for iterative solvers only.
max_iter: Maximum number of iterations for the solver.
eps: Threshold for determining convergence.
theta_0: Initial guess for theta. If None, use the zero vector.
verbose: Print loss values during training.
"""
self.theta = theta_0
self.step_size = step_size
self.max_iter = max_iter
self.eps = eps
self.verbose = verbose
def sigmoid(self, x):
# return 1 / (1 + np.exp(-x.dot(self.theta)))
return 1 / (1 + np.exp(- x @ self.theta))
def fit(self, x, y):
"""Fit a GDA model to training set given by x and y by updating
self.theta.
Args:
x: Training example inputs. Shape (n_examples, dim).
y: Training example labels. Shape (n_examples,).
"""
# *** START CODE HERE ***
n_examples, dim = x.shape
# Find phi, mu_0, mu_1, and sigma
phi = 1 / n_examples * np.sum(y == 1)
mu_0 = (y == 0).dot(x) / np.sum(y == 0)
mu_1 = (y == 1).dot(x) / np.sum(y == 1)
mu_yi = np.where(np.expand_dims(y == 0, -1),
np.expand_dims(mu_0, 0),
np.expand_dims(mu_1, 0))
sigma = 1 / n_examples * (x - mu_yi).T.dot(x - mu_yi)
# Write theta in terms of the parameters
self.theta = np.zeros(dim + 1)
sigma_inv = np.linalg.inv(sigma)
mu_diff = mu_0.T.dot(sigma_inv).dot(mu_0) - mu_1.T.dot(sigma_inv).dot(mu_1)
self.theta[0] = 1 / 2 * mu_diff - np.log((1 - phi) / phi)
self.theta[1:] = -sigma_inv.dot(mu_0 - mu_1)
# *** END CODE HERE ***
def predict(self, x):
"""Make a prediction given new inputs x.
Args:
x: Inputs of shape (n_examples, dim).
Returns:
Outputs of shape (n_examples,).
"""
# *** START CODE HERE ***
prediction = self.sigmoid(x)
return prediction
# *** END CODE HERE
def main_posonly(train_path, valid_path, test_path, save_path):
"""Problem 2: Logistic regression for incomplete, positive-only labels.
Run under the following conditions:
1. on t-labels,
2. on y-labels,
3. on y-labels with correction factor alpha.
Args:
train_path: Path to CSV file containing training set.
valid_path: Path to CSV file containing validation set.
test_path: Path to CSV file containing test set.
save_path: Path to save predictions.
NOTE: You need to complete logreg implementation first (see class above)!!!
"""
output_path_true = save_path.replace(WILDCARD, 'true')
output_path_naive = save_path.replace(WILDCARD, 'naive')
output_path_adjusted = save_path.replace(WILDCARD, 'adjusted')
plot_path = save_path.replace('.txt', '.png')
plot_path_true = plot_path.replace(WILDCARD, 'true')
plot_path_naive = plot_path.replace(WILDCARD, 'naive')
plot_path_adjusted = plot_path.replace(WILDCARD, 'adjusted')
# Problem (2a): Train and test on true labels (t)
full_predictions = fully_observed_predictions(train_path, test_path, output_path_true, plot_path_true)
# Problem (2b): Train on y-labels and test on true labels
naive_predictions, clf = naive_partial_labels_predictions(train_path, test_path, output_path_naive, plot_path_naive)
# Problem (2f): Apply correction factor using validation set and test on true labels
alpha = find_alpha_and_plot_correction(clf, valid_path, test_path, output_path_adjusted, plot_path_adjusted, naive_predictions)
return
def fully_observed_predictions(train_path, test_path, output_path_true, plot_path_true):
"""
Problem (2a): Fully Observable Binary Classification Helper Function
Args:
train_path: Path to CSV file containing dataset for training.
test_path: Path to CSV file containing dataset for testing.
output_path_true: Path to save observed predictions
plot_path_true: Path to save the plot using plot_posonly util function
Return:
full_predictions: tensor of predictions returned from applied LogReg classifier prediction
"""
full_predictions = None
# Problem (2a): Train and test on true labels (t)
# Make sure to save predicted probabilities to output_path_true using np.savetxt()
# *** START CODE HERE ***
x_train, t_train = util.load_dataset(train_path, label_col='t',
add_intercept=True)
clf = LogisticRegression()
clf.fit(x_train, t_train)
x_test, t_test = util.load_dataset(test_path, label_col='t',
add_intercept=True)
full_predictions = clf.predict(x_test)
np.savetxt(output_path_true, full_predictions)
util.plot(x_test, t_test, clf.theta, plot_path_true)
# *** END CODE HERE ***
return full_predictions
def naive_partial_labels_predictions(train_path, test_path, output_path_naive, plot_path_naive):
"""
Problem (2b): Naive Partial Labels Binary Classification Helper Function
Args:
train_path: Path to CSV file containing dataset for training.
test_path: Path to CSV file containing dataset for testing.
output_path_naive: Path to save observed predictions
plot_path_naive: Path to save the plot using plot_posonly util function
Return:
naive_predictions: tensor of predictions returned from applied LogReg prediction
clf: Logistic Regression classifier (will be reused for 2f)
"""
naive_predictions = None
clf = None
# Problem (2b): Train on y-labels and test on true labels
# Make sure to save predicted probabilities to output_path_naive using np.savetxt()
# *** START CODE HERE ***
x_train, y_train = util.load_dataset(train_path, label_col='y',
add_intercept=True)
clf = LogisticRegression()
clf.fit(x_train, y_train)
x_test, t_test = util.load_dataset(test_path, label_col='t',
add_intercept=True)
naive_predictions = clf.predict(x_test)
np.savetxt(output_path_naive, naive_predictions)
util.plot(x_test, t_test, clf.theta, plot_path_naive)
# *** END CODE HERE ***
return naive_predictions, clf
def find_alpha_and_plot_correction(clf, valid_path, test_path, output_path_adjusted, plot_path_adjusted, naive_predictions):
"""
Problem (2f): Alpha Correction Binary Classification Helper Function
Args:
clf: Logistic regression classifier from part 2b
valid_path: Path to CSV file containing dataset for validation.
test_path: Path to CSV file containing dataset for testing.
output_path_adjusted: Path to save observed predictions
plot_path_adjusted: Path to save the plot using plot_posonly util function
naive_predictions: tensor of predictions returned from applied LogReg prediction from 2b
Return:
alpha: corrected alpha value
"""
alpha = None
# Problem (2f): Apply correction factor using validation set and test on true labels
# Plot and use np.savetxt to save outputs to output_path_adjusted
# *** START CODE HERE ***
x_valid, y_valid = util.load_dataset(valid_path, label_col='y')
x_valid = x_valid[y_valid == 1, :] # Restrict to just the labeled examples
x_valid = util.add_intercept(x_valid)
y_pred = clf.predict(x_valid)
alpha = np.mean(y_pred)
print('Found alpha = {}'.format(alpha))
x_test, t_test = util.load_dataset(test_path, label_col='t',
add_intercept=True)
# Plot and use np.savetxt to save outputs to output_path_adjusted
np.savetxt(output_path_adjusted, naive_predictions / alpha)
util.plot(x_test, t_test, clf.theta, plot_path_adjusted, correction=alpha)
# *** END CODE HERE ***
return alpha
if __name__ == '__main__':
'''
Start of Problem 1: Linear Classifiers
'''
# 1b
main_LogReg(train_path='ds1_train.csv',
valid_path='ds1_valid.csv',
save_path='logreg_pred_1.txt')
main_LogReg(train_path='ds2_train.csv',
valid_path='ds2_valid.csv',
save_path='logreg_pred_2.txt')
# 1e
main_GDA(train_path='ds1_train.csv',
valid_path='ds1_valid.csv',
save_path='gda_pred_1.txt')
main_GDA(train_path='ds2_train.csv',
valid_path='ds2_valid.csv',
save_path='gda_pred_2.txt')
'''
Start of Problem 2: Incomplete, Positive-Only Labels
'''
main_posonly(train_path='train.csv',
valid_path='valid.csv',
test_path='test.csv',
save_path='posonly_X_pred.txt')