|
|
import numpy as np |
|
|
import util |
|
|
|
|
|
|
|
|
WILDCARD = 'X' |
|
|
|
|
|
def main_LogReg(train_path, valid_path, save_path):
    """Problem (1b): Logistic regression with Newton's Method.

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # Train the classifier on the training split (intercept column included).
    x_train, y_train = util.load_dataset(train_path, add_intercept=True)
    model = LogisticRegression()
    model.fit(x_train, y_train)

    # Plot the learned decision boundary over the validation data.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=True)
    util.plot(x_eval, y_eval, model.theta, save_path.replace('.txt', '.png'))

    # Report thresholded accuracy and persist the raw probabilities.
    probs = model.predict(x_eval)
    correct = (probs > 0.5) == (y_eval == 1)
    print('LR Accuracy: %.2f' % np.mean(correct))
    np.savetxt(save_path, probs)
|
|
|
|
|
class LogisticRegression:
    """Logistic regression with Newton's Method as the solver.

    Example usage:
        > clf = LogisticRegression()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """

    def __init__(self, step_size=0.01, max_iter=1000000, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def gradient(self, x, y):
        """Return the gradient of the average log-loss w.r.t. theta.

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.
            y: Binary labels of shape (n_examples,).

        Returns:
            Gradient vector of shape (dim,).
        """
        n_examples, _ = x.shape
        probs = self.sigmoid(x)
        # grad J = (1/n) * X^T (h_theta(X) - y)
        return x.T @ (probs - y) / n_examples

    def hessian(self, x, y):
        """Return the Hessian of the average log-loss w.r.t. theta.

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.
            y: Unused; kept for signature symmetry with gradient().

        Returns:
            Hessian matrix of shape (dim, dim).
        """
        n_examples, _ = x.shape
        probs = self.sigmoid(x)
        # H = (1/n) * X^T diag(h(1-h)) X
        weights = np.diag(probs * (1 - probs))
        return x.T @ weights @ x / n_examples

    def loss(self, x, y):
        """Return the average negative log-likelihood (binary cross-entropy).

        Args:
            x: Inputs of shape (n_examples, dim), intercept column included.
            y: Binary labels of shape (n_examples,).

        Returns:
            Scalar loss value.
        """
        probs = self.sigmoid(x)
        # FIX: original weighted the negative-class term by (1 + y); the
        # correct cross-entropy coefficient is (1 - y).
        return -np.mean(y * np.log(probs) + (1 - y) * np.log(1 - probs))

    def sigmoid(self, x):
        """Return sigmoid(x @ theta), the predicted P(y=1 | x)."""
        return 1 / (1 + np.exp(- x @ self.theta))

    def fit(self, x, y):
        """Run Newton's Method to minimize J(theta) for logistic regression.

        Args:
            x: Training example inputs. Shape (n_examples, dim).
            y: Training example labels. Shape (n_examples,).
        """
        n_examples, dim = x.shape
        if self.theta is None:
            self.theta = np.zeros(dim)

        for _ in range(self.max_iter):
            gradient = self.gradient(x, y)
            hessian = self.hessian(x, y)

            theta_prev = np.copy(self.theta)
            # Damped Newton step; solve(H, g) is the stable equivalent of
            # inv(H) @ g (avoids explicitly forming the inverse).
            self.theta = self.theta - self.step_size * np.linalg.solve(hessian, gradient)

            # Converged when the L1 change in theta falls below eps.
            if np.sum(np.abs(theta_prev - self.theta)) < self.eps:
                break

    def predict(self, x):
        """Return predicted probabilities given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        return self.sigmoid(x)
|
|
|
|
|
|
|
|
def main_GDA(train_path, valid_path, save_path):
    """Problem (1e): Gaussian discriminant analysis (GDA)

    Args:
        train_path: Path to CSV file containing dataset for training.
        valid_path: Path to CSV file containing dataset for validation.
        save_path: Path to save predicted probabilities using np.savetxt().
    """
    # GDA estimates Gaussians in the raw feature space, so no intercept here.
    x_train, y_train = util.load_dataset(train_path, add_intercept=False)
    model = GDA()
    model.fit(x_train, y_train)

    # Plot the decision boundary over the validation data.
    x_eval, y_eval = util.load_dataset(valid_path, add_intercept=False)
    util.plot(x_eval, y_eval, model.theta, save_path.replace('.txt', '.png'))

    # theta learned by GDA includes an intercept term, so add the column
    # before computing probabilities.
    x_eval = util.add_intercept(x_eval)
    probs = model.predict(x_eval)
    correct = (probs > 0.5) == (y_eval == 1)
    print('GDA Accuracy: %.2f' % np.mean(correct))
    np.savetxt(save_path, probs)
|
|
|
|
|
class GDA:
    """Gaussian Discriminant Analysis.

    Example usage:
        > clf = GDA()
        > clf.fit(x_train, y_train)
        > clf.predict(x_eval)
    """

    def __init__(self, step_size=0.01, max_iter=10000, eps=1e-5,
                 theta_0=None, verbose=True):
        """
        Args:
            step_size: Step size for iterative solvers only.
            max_iter: Maximum number of iterations for the solver.
            eps: Threshold for determining convergence.
            theta_0: Initial guess for theta. If None, use the zero vector.
            verbose: Print loss values during training.
        """
        self.theta = theta_0
        self.step_size = step_size
        self.max_iter = max_iter
        self.eps = eps
        self.verbose = verbose

    def sigmoid(self, x):
        """Return sigmoid(x @ theta); x must include the intercept column."""
        return 1 / (1 + np.exp(-(x @ self.theta)))

    def fit(self, x, y):
        """Fit a GDA model to training set given by x and y by updating
        self.theta.

        Args:
            x: Training example inputs. Shape (n_examples, dim).
            y: Training example labels. Shape (n_examples,).
        """
        n_examples, dim = x.shape

        # Closed-form maximum-likelihood estimates of the GDA parameters.
        is_neg = (y == 0)
        is_pos = (y == 1)
        phi = np.sum(is_pos) / n_examples
        mu_0 = is_neg.dot(x) / np.sum(is_neg)
        mu_1 = is_pos.dot(x) / np.sum(is_pos)

        # Shared covariance: average outer product of each example's
        # deviation from its own class mean.
        centered = x - np.where(is_neg[:, None], mu_0[None, :], mu_1[None, :])
        sigma = centered.T.dot(centered) / n_examples

        # Map (phi, mu_0, mu_1, sigma) to the logistic-form parameters theta,
        # where p(y=1|x) = sigmoid(theta_0 + theta^T x).
        sigma_inv = np.linalg.inv(sigma)
        quad_0 = mu_0.T.dot(sigma_inv).dot(mu_0)
        quad_1 = mu_1.T.dot(sigma_inv).dot(mu_1)

        theta = np.zeros(dim + 1)
        theta[0] = 0.5 * (quad_0 - quad_1) - np.log((1 - phi) / phi)
        theta[1:] = sigma_inv.dot(mu_1 - mu_0)
        self.theta = theta

    def predict(self, x):
        """Make a prediction given new inputs x.

        Args:
            x: Inputs of shape (n_examples, dim).

        Returns:
            Outputs of shape (n_examples,).
        """
        return self.sigmoid(x)
|
|
|
|
|
|
|
|
def main_posonly(train_path, valid_path, test_path, save_path):
    """Problem 2: Logistic regression for incomplete, positive-only labels.

    Run under the following conditions:
        1. on t-labels,
        2. on y-labels,
        3. on y-labels with correction factor alpha.

    Args:
        train_path: Path to CSV file containing training set.
        valid_path: Path to CSV file containing validation set.
        test_path: Path to CSV file containing test set.
        save_path: Path to save predictions.

    NOTE: You need to complete logreg implementation first (see class above)!!!
    """
    tags = ('true', 'naive', 'adjusted')
    # One output file and one plot file per experiment condition.
    out_paths = {tag: save_path.replace(WILDCARD, tag) for tag in tags}
    plot_template = save_path.replace('.txt', '.png')
    plot_paths = {tag: plot_template.replace(WILDCARD, tag) for tag in tags}

    # (2a) Fully observed t-labels.
    full_predictions = fully_observed_predictions(
        train_path, test_path, out_paths['true'], plot_paths['true'])

    # (2b) Naive training on partial y-labels.
    naive_predictions, clf = naive_partial_labels_predictions(
        train_path, test_path, out_paths['naive'], plot_paths['naive'])

    # (2f) Alpha correction estimated on validation positives.
    alpha = find_alpha_and_plot_correction(
        clf, valid_path, test_path, out_paths['adjusted'],
        plot_paths['adjusted'], naive_predictions)

    return
|
|
|
|
|
def fully_observed_predictions(train_path, test_path, output_path_true, plot_path_true):
    """
    Problem (2a): Fully Observable Binary Classification Helper Function

    Args:
        train_path: Path to CSV file containing dataset for training.
        test_path: Path to CSV file containing dataset for testing.
        output_path_true: Path to save observed predictions
        plot_path_true: Path to save the plot using plot_posonly util function
    Return:
        full_predictions: tensor of predictions returned from applied LogReg classifier prediction
    """
    # Train on the true t-labels (the fully observed setting).
    x_train, t_train = util.load_dataset(train_path, label_col='t',
                                         add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, t_train)

    # Predict on the test split, also labeled with t.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    full_predictions = clf.predict(x_test)

    np.savetxt(output_path_true, full_predictions)
    util.plot(x_test, t_test, clf.theta, plot_path_true)

    return full_predictions
|
|
|
|
|
def naive_partial_labels_predictions(train_path, test_path, output_path_naive, plot_path_naive):
    """
    Problem (2b): Naive Partial Labels Binary Classification Helper Function

    Args:
        train_path: Path to CSV file containing dataset for training.
        test_path: Path to CSV file containing dataset for testing.
        output_path_naive: Path to save observed predictions
        plot_path_naive: Path to save the plot using plot_posonly util function
    Return:
        naive_predictions: tensor of predictions returned from applied LogReg prediction
        clf: Logistic Regression classifier (will be reused for 2f)
    """
    # Train naively on the partial y-labels, treating them as ground truth.
    x_train, y_train = util.load_dataset(train_path, label_col='y',
                                         add_intercept=True)
    clf = LogisticRegression()
    clf.fit(x_train, y_train)

    # Evaluate against the true t-labels on the test split.
    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)
    naive_predictions = clf.predict(x_test)

    np.savetxt(output_path_naive, naive_predictions)
    util.plot(x_test, t_test, clf.theta, plot_path_naive)

    return naive_predictions, clf
|
|
|
|
|
def find_alpha_and_plot_correction(clf, valid_path, test_path, output_path_adjusted, plot_path_adjusted, naive_predictions):
    """
    Problem (2f): Alpha Correction Binary Classification Helper Function

    Args:
        clf: Logistic regression classifier from part 2b
        valid_path: Path to CSV file containing dataset for validation.
        test_path: Path to CSV file containing dataset for testing.
        output_path_adjusted: Path to save observed predictions
        plot_path_adjusted: Path to save the plot using plot_posonly util function
        naive_predictions: tensor of predictions returned from applied LogReg prediction from 2b
    Return:
        alpha: corrected alpha value
    """
    # alpha = E[h(x) | y = 1], estimated over the labeled validation positives.
    x_valid, y_valid = util.load_dataset(valid_path, label_col='y')
    positives = util.add_intercept(x_valid[y_valid == 1, :])
    alpha = np.mean(clf.predict(positives))
    print('Found alpha = {}'.format(alpha))

    x_test, t_test = util.load_dataset(test_path, label_col='t',
                                       add_intercept=True)

    # Rescale the naive predictions by 1/alpha and re-plot with the
    # corrected decision boundary.
    np.savetxt(output_path_adjusted, naive_predictions / alpha)
    util.plot(x_test, t_test, clf.theta, plot_path_adjusted, correction=alpha)

    return alpha
|
|
|
|
|
if __name__ == '__main__':
    # ----- Problem 1: Linear Classifiers -----
    main_LogReg(train_path='ds1_train.csv',
                valid_path='ds1_valid.csv',
                save_path='logreg_pred_1.txt')
    main_LogReg(train_path='ds2_train.csv',
                valid_path='ds2_valid.csv',
                save_path='logreg_pred_2.txt')

    main_GDA(train_path='ds1_train.csv',
             valid_path='ds1_valid.csv',
             save_path='gda_pred_1.txt')
    main_GDA(train_path='ds2_train.csv',
             valid_path='ds2_valid.csv',
             save_path='gda_pred_2.txt')

    # ----- Problem 2: Incomplete, Positive-Only Labels -----
    main_posonly(train_path='train.csv',
                 valid_path='valid.csv',
                 test_path='test.csv',
                 save_path='posonly_X_pred.txt')
|
|
|