Spaces:
No application file
No application file
| # Copyright 2002 by Jeffrey Chang. | |
| # All rights reserved. | |
| # | |
| # This file is part of the Biopython distribution and governed by your | |
| # choice of the "Biopython License Agreement" or the "BSD 3-Clause License". | |
| # Please see the LICENSE file that should have been included as part of this | |
| # package. | |
| """Code for doing logistic regressions. | |
| Classes: | |
| - LogisticRegression Holds information for a LogisticRegression classifier. | |
| Functions: | |
| - train Train a new classifier. | |
| - calculate Calculate the probabilities of each class, given an observation. | |
| - classify Classify an observation into a class. | |
| """ | |
| import numpy | |
| import numpy.linalg | |
| class LogisticRegression: | |
| """Holds information necessary to do logistic regression classification. | |
| Attributes: | |
| - beta - List of the weights for each dimension. | |
| """ | |
| def __init__(self): | |
| """Initialize the class.""" | |
| self.beta = [] | |
| def train(xs, ys, update_fn=None, typecode=None): | |
| """Train a logistic regression classifier on a training set. | |
| Argument xs is a list of observations and ys is a list of the class | |
| assignments, which should be 0 or 1. xs and ys should contain the | |
| same number of elements. update_fn is an optional callback function | |
| that takes as parameters that iteration number and log likelihood. | |
| """ | |
| if len(xs) != len(ys): | |
| raise ValueError("xs and ys should be the same length.") | |
| classes = set(ys) | |
| if classes != {0, 1}: | |
| raise ValueError("Classes should be 0's and 1's") | |
| if typecode is None: | |
| typecode = "d" | |
| # Dimensionality of the data is the dimensionality of the | |
| # observations plus a constant dimension. | |
| N, ndims = len(xs), len(xs[0]) + 1 | |
| if N == 0 or ndims == 1: | |
| raise ValueError("No observations or observation of 0 dimension.") | |
| # Make an X array, with a constant first dimension. | |
| X = numpy.ones((N, ndims), typecode) | |
| X[:, 1:] = xs | |
| Xt = numpy.transpose(X) | |
| y = numpy.asarray(ys, typecode) | |
| # Initialize the beta parameter to 0. | |
| beta = numpy.zeros(ndims, typecode) | |
| MAX_ITERATIONS = 500 | |
| CONVERGE_THRESHOLD = 0.01 | |
| stepsize = 1.0 | |
| # Now iterate using Newton-Raphson until the log-likelihoods | |
| # converge. | |
| i = 0 | |
| old_beta = old_llik = None | |
| while i < MAX_ITERATIONS: | |
| # Calculate the probabilities. p = e^(beta X) / (1+e^(beta X)) | |
| ebetaX = numpy.exp(numpy.dot(beta, Xt)) | |
| p = ebetaX / (1 + ebetaX) | |
| # Find the log likelihood score and see if I've converged. | |
| logp = y * numpy.log(p) + (1 - y) * numpy.log(1 - p) | |
| llik = sum(logp) | |
| if update_fn is not None: | |
| update_fn(iter, llik) | |
| if old_llik is not None: | |
| # Check to see if the likelihood decreased. If it did, then | |
| # restore the old beta parameters and half the step size. | |
| if llik < old_llik: | |
| stepsize /= 2.0 | |
| beta = old_beta | |
| # If I've converged, then stop. | |
| if numpy.fabs(llik - old_llik) <= CONVERGE_THRESHOLD: | |
| break | |
| old_llik, old_beta = llik, beta | |
| i += 1 | |
| W = numpy.identity(N) * p | |
| Xtyp = numpy.dot(Xt, y - p) # Calculate the first derivative. | |
| XtWX = numpy.dot(numpy.dot(Xt, W), X) # Calculate the second derivative. | |
| delta = numpy.linalg.solve(XtWX, Xtyp) | |
| if numpy.fabs(stepsize - 1.0) > 0.001: | |
| delta *= stepsize | |
| beta += delta # Update beta. | |
| else: | |
| raise RuntimeError("Didn't converge.") | |
| lr = LogisticRegression() | |
| lr.beta = list(beta) | |
| return lr | |
| def calculate(lr, x): | |
| """Calculate the probability for each class. | |
| Arguments: | |
| - lr is a LogisticRegression object. | |
| - x is the observed data. | |
| Returns a list of the probability that it fits each class. | |
| """ | |
| # Insert a constant term for x. | |
| x = numpy.asarray([1.0] + x) | |
| # Calculate the probability. p = e^(beta X) / (1+e^(beta X)) | |
| ebetaX = numpy.exp(numpy.dot(lr.beta, x)) | |
| p = ebetaX / (1 + ebetaX) | |
| return [1 - p, p] | |
| def classify(lr, x): | |
| """Classify an observation into a class.""" | |
| probs = calculate(lr, x) | |
| if probs[0] > probs[1]: | |
| return 0 | |
| return 1 | |