|
|
import numpy as np |
|
|
import util |
|
|
|
|
|
|
|
|
WILDCARD = 'X' |
|
|
|
|
|
def main_LogReg(train_path, valid_path, save_path):
    """Problem (1b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Train the classifier on the training split (intercept column included).
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Plot the learned decision boundary over the validation data.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=True)
    util.plot(x_eval, y_eval, model.theta, save_path.replace('.txt', '.png'))

    # Report thresholded accuracy and persist the raw probabilities.
    probs = model.predict(x_eval)
    correct = (probs > 0.5) == (y_eval == 1)
    print('LR Accuracy: %.2f' % np.mean(correct))
    np.savetxt(save_path, probs)
|
|
|
|
|
class LogisticRegression:
    """Logistic regression with Newton's Method as the solver.

    Example usage:
        > clf = LogisticRegression()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """

    def __init__(self, step_size=0.01, max_iter=1000000, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def gradient(self, x, y):
        """Return the gradient of the average log-loss w.r.t. theta.

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.
            y: Binary labels of shape (n_examples,).

        Returns:
            Gradient vector of shape (dim,).
        """
        n_examples, _ = x.shape
        probs = self.sigmoid(x)
        # grad J = (1/n) * X^T (h_theta(X) - y)
        return x.T @ (probs - y) / n_examples

    def hessian(self, x, y):
        """Return the Hessian of the average log-loss w.r.t. theta.

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.
            y: Unused; kept for signature symmetry with gradient().

        Returns:
            Hessian matrix of shape (dim, dim).
        """
        n_examples, _ = x.shape
        probs = self.sigmoid(x)
        # H = (1/n) * X^T diag(h(1-h)) X
        weights = np.diag(probs * (1 - probs))
        return x.T @ weights @ x / n_examples

    def loss(self, x, y):
        """Return the average negative log-likelihood (binary cross-entropy).

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.
            y: Binary labels of shape (n_examples,).

        Returns:
            Scalar loss value.
        """
        probs = self.sigmoid(x)
        # FIX: original weighted the negative-class term by (1 + y); the
        # correct cross-entropy coefficient is (1 - y).
        return -np.mean(y * np.log(probs) + (1 - y) * np.log(1 - probs))

    def sigmoid(self, x):
        """Return sigmoid(x @ theta), the predicted P(y=1 | x)."""
        return 1 / (1 + np.exp(- x @ self.theta))

    def fit(self, x, y):
        """Run Newton's Method to minimize J(theta) for logistic regression.

        Args:
            x: Training example inputs. Shape (n_examples, dim).
            y: Training example labels. Shape (n_examples,).
        """
        n_examples, dim = x.shape
        if self.theta is None:
            self.theta = np.zeros(dim)

        for _ in range(self.max_iter):
            gradient = self.gradient(x, y)
            hessian = self.hessian(x, y)

            theta_prev = np.copy(self.theta)
            # Damped Newton step; solve(H, g) is the stable equivalent of
            # inv(H) @ g (avoids explicitly forming the inverse).
            self.theta = self.theta - self.step_size * np.linalg.solve(hessian, gradient)

            # Converged when the L1 change in theta falls below eps.
            if np.sum(np.abs(theta_prev - self.theta)) < self.eps:
                break

    def predict(self, x):
        """Return predicted probabilities given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        return self.sigmoid(x)
|
|
|
|
|
|
|
|
def main_GDA(train_path, valid_path, save_path):
    """Problem (1e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # GDA estimates Gaussians in the raw feature space, so no intercept here.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    model = GDA()
    model.fit(x_train, y_train)

    # Plot the decision boundary over the validation data.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)
    util.plot(x_eval, y_eval, model.theta, save_path.replace('.txt', '.png'))

    # theta learned by GDA includes an intercept term, so add the column
    # before computing probabilities.
    x_eval = util.add_intercept(x_eval)
    probs = model.predict(x_eval)
    correct = (probs > 0.5) == (y_eval == 1)
    print('GDA Accuracy: %.2f' % np.mean(correct))
    np.savetxt(save_path, probs)
|
|
|
|
|
class GDA:
    """Gaussian Discriminant Analysis.

    Example usage:
        > clf = GDA()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """

    def __init__(self, step_size=0.01, max_iter=10000, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def sigmoid(self, x):
        """Return sigmoid(x @ theta); x must include the intercept column."""
        return 1 / (1 + np.exp(-(x @ self.theta)))

    def fit(self, x, y):
        """Fit a GDA model to training set given by x and y by updating
        self.theta.

        Args:
            x: Training example inputs. Shape (n_examples, dim).
            y: Training example labels. Shape (n_examples,).
        """
        n_examples, dim = x.shape

        # Closed-form maximum-likelihood estimates of the GDA parameters.
        is_neg = (y == 0)
        is_pos = (y == 1)
        phi = np.sum(is_pos) / n_examples
        mu_0 = is_neg.dot(x) / np.sum(is_neg)
        mu_1 = is_pos.dot(x) / np.sum(is_pos)

        # Shared covariance: average outer product of each example's
        # deviation from its own class mean.
        centered = x - np.where(is_neg[:, None], mu_0[None, :], mu_1[None, :])
        sigma = centered.T.dot(centered) / n_examples

        # Map (phi, mu_0, mu_1, sigma) to the logistic-form parameters theta,
        # where p(y=1|x) = sigmoid(theta_0 + theta^T x).
        sigma_inv = np.linalg.inv(sigma)
        quad_0 = mu_0.T.dot(sigma_inv).dot(mu_0)
        quad_1 = mu_1.T.dot(sigma_inv).dot(mu_1)

        theta = np.zeros(dim + 1)
        theta[0] = 0.5 * (quad_0 - quad_1) - np.log((1 - phi) / phi)
        theta[1:] = sigma_inv.dot(mu_1 - mu_0)
        self.theta = theta

    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        return self.sigmoid(x)
|
|
|
|
|
|
|
|
def main_posonly(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.

    NOTE: You need to complete logreg implementation first (see class above)!!!
    """
    tags = ('true', 'naive', 'adjusted')
    # One output file and one plot file per experiment condition.
    out_paths = {tag: save_path.replace(WILDCARD, tag) for tag in tags}
    plot_template = save_path.replace('.txt', '.png')
    plot_paths = {tag: plot_template.replace(WILDCARD, tag) for tag in tags}

    # (2a) Fully observed t-labels.
    full_predictions = fully_observed_predictions(
        train_path, test_path, out_paths['true'], plot_paths['true'])

    # (2b) Naive training on partial y-labels.
    naive_predictions, clf = naive_partial_labels_predictions(
        train_path, test_path, out_paths['naive'], plot_paths['naive'])

    # (2f) Alpha correction estimated on validation positives.
    alpha = find_alpha_and_plot_correction(
        clf, valid_path, test_path, out_paths['adjusted'],
        plot_paths['adjusted'], naive_predictions)

    return
|
|
|
|
|
def fully_observed_predictions(train_path, test_path, output_path_true, plot_path_true):
    """
    Problem (2a): Fully Observable Binary Classification Helper Function

    Args:
        train_path: Path to CSV file containing dataset for training.
        test_path: Path to CSV file containing dataset for testing.
        output_path_true: Path to save observed predictions
        plot_path_true: Path to save the plot using plot_posonly util function
    Return:
        full_predictions: tensor of predictions returned from applied LogReg classifier prediction
    """
    # Train on the true t-labels (the fully observed setting).
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)

    # Predict on the test split, also labeled with t.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    full_predictions = clf.predict(x_test)

    np.savetxt(output_path_true, full_predictions)
    util.plot(x_test, t_test, clf.theta, plot_path_true)

    return full_predictions
|
|
|
|
|
def naive_partial_labels_predictions(train_path, test_path, output_path_naive, plot_path_naive):
    """
    Problem (2b): Naive Partial Labels Binary Classification Helper Function

    Args:
        train_path: Path to CSV file containing dataset for training.
        test_path: Path to CSV file containing dataset for testing.
        output_path_naive: Path to save observed predictions
        plot_path_naive: Path to save the plot using plot_posonly util function
    Return:
        naive_predictions: tensor of predictions returned from applied LogReg prediction
        clf: Logistic Regression classifier (will be reused for 2f)
    """
    # Train naively on the partial y-labels, treating them as ground truth.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # Evaluate against the true t-labels on the test split.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    naive_predictions = clf.predict(x_test)

    np.savetxt(output_path_naive, naive_predictions)
    util.plot(x_test, t_test, clf.theta, plot_path_naive)

    return naive_predictions, clf
|
|
|
|
|
def find_alpha_and_plot_correction(clf, valid_path, test_path, output_path_adjusted, plot_path_adjusted, naive_predictions):
    """
    Problem (2f): Alpha Correction Binary Classification Helper Function

    Args:
        clf: Logistic regression classifier from part 2b
        valid_path: Path to CSV file containing dataset for validation.
        test_path: Path to CSV file containing dataset for testing.
        output_path_adjusted: Path to save observed predictions
        plot_path_adjusted: Path to save the plot using plot_posonly util function
        naive_predictions: tensor of predictions returned from applied LogReg prediction from 2b
    Return:
        alpha: corrected alpha value
    """
    # alpha = E[h(x) | y = 1], estimated over the labeled validation positives.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y')
    positives = util.add_intercept(x_valid[y_valid == 1, :])
    alpha = np.mean(clf.predict(positives))
    print('Found alpha = {}'.format(alpha))

    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Rescale the naive predictions by 1/alpha and re-plot with the
    # corrected decision boundary.
    np.savetxt(output_path_adjusted, naive_predictions / alpha)
    util.plot(x_test, t_test, clf.theta, plot_path_adjusted, correction=alpha)

    return alpha
|
|
|
|
|
if __name__ == '__main__':
    # ----- Problem 1: Linear Classifiers -----
    main_LogReg(train_path='ds1_train.csv',
                valid_path='ds1_valid.csv',
                save_path='logreg_pred_1.txt')
    main_LogReg(train_path='ds2_train.csv',
                valid_path='ds2_valid.csv',
                save_path='logreg_pred_2.txt')

    main_GDA(train_path='ds1_train.csv',
             valid_path='ds1_valid.csv',
             save_path='gda_pred_1.txt')
    main_GDA(train_path='ds2_train.csv',
             valid_path='ds2_valid.csv',
             save_path='gda_pred_2.txt')

    # ----- Problem 2: Incomplete, Positive-Only Labels -----
    main_posonly(train_path='train.csv',
                 valid_path='valid.csv',
                 test_path='test.csv',
                 save_path='posonly_X_pred.txt')
|
|
|