| | |
| |
|
| | |
| | |
| | |
| |
|
| | from math import log,exp |
| | import sys |
| | import numpy as np |
| | from scipy.optimize.optimize import fmin_cg, fmin_bfgs, fmin |
| | from scipy.optimize.lbfgsb import fmin_l_bfgs_b |
| |
|
| | import nbest |
| | from util import safelog |
| |
|
def sigmoid(x):
    """Numerically stable logistic function 1/(1 + exp(-x)).

    Works on scalars and numpy arrays.  The naive form np.exp(-x)
    overflows (with a RuntimeWarning) for large negative x; computing
    exp(-|x|) keeps the exponent non-positive, so it never overflows,
    while returning the same values.
    """
    z = np.exp(-np.abs(x))
    return np.where(x >= 0, 1.0 / (1.0 + z), z / (1.0 + z))
| |
|
class OptimisationException(Exception):
    """Raised when the scipy optimiser reports failure (non-zero warnflag)."""
| |
|
class ParabaloidOptimiser:
    """Optimises a very simple function, to test scipy.

    The objective is f(x) = sum_k (a_k * x_k)^2 for fixed scale factors a,
    a paraboloid with its unique minimum at the origin.  Used as a sanity
    check that the scipy optimisers are wired up correctly.
    """
    def __init__(self, params):
        # params: array-like of per-dimension scale factors a_k.
        self.params = params

    def objective(self,x):
        # f(x) = sum_k a_k^2 * x_k^2
        return np.sum(x*x*self.params*self.params)

    def grad_k(self,x,k):
        # Partial derivative df/dx_k = 2 * a_k^2 * x_k.
        return 2 * self.params[k]**2 * x[k]

    def grad(self,x):
        # Full gradient vector, one component per dimension.
        return np.array([self.grad_k(x,k) for k in range(len(x))])

    def debug(self,x):
        # Per-iteration callback: trace the current point.
        print "x = ",x

    def optimise_bfgs(self,start):
        # Minimise with BFGS from the given start point, printing the
        # current point at every iteration via the debug callback.
        print
        print "***** BFGS OPTIMISATION *****"
        return fmin_bfgs(self.objective, start, fprime=self.grad, callback=self.debug)

    def optimise_lbfgs(self,start):
        # Minimise with L-BFGS-B; returns only the optimal point
        # (objective value and diagnostics dict are discarded).
        print
        print "***** LBFGS OPTIMISATION *****"
        x,f,d = fmin_l_bfgs_b(self.objective, start, fprime=self.grad, pgtol=1e-09, iprint=0)
        return x
| |
|
class LRDataException(Exception):
    """Raised for malformed training data (empty set, or x/y length mismatch)."""
| | |
class LogisticRegressionOptimiser:
    """Optimise logistic regression weights.

    Minimises the (optionally L2-regularised) negative log-likelihood of
    the training data with L-BFGS-B.  Labels are -1/+1, so the per-sample
    loss is -log(sigmoid(y_i * w.x_i)).
    """
    def __init__(self,x,y, alpha = 0):
        """Training data (x) should be vector of feature vectors, and
        corresponding vector of outputs (with values -1,1).
        alpha controls the L2-normalisation"""
        self.x = x
        self.y = y
        self.alpha = alpha
        if len(x) != len(y): raise LRDataException("Lengths of input and response don't match")
        if len(x) == 0: raise LRDataException("Data set is empty")
        # Precomputed rows y_i * x_i, used to vectorise the gradient.
        self.xy = x*y[:,None]

    @staticmethod
    def _margins(w, x, y):
        """Signed margins z_i = y_i * (w . x_i) for every sample."""
        return y * np.dot(x, w)

    def objective(self,w):
        """Calculate the value of the negative log-likelihood for a given weight set.

        Vectorised and overflow-safe: -log(sigmoid(z)) == log(1 + exp(-z)),
        computed with np.logaddexp so extreme margins neither overflow nor
        make log() raise on an underflowed sigmoid.
        """
        z = self._margins(w, self.x, self.y)
        l = np.sum(np.logaddexp(0.0, -z))
        # L2 penalty: (alpha/2) * ||w||^2.
        l += 0.5 * self.alpha * np.dot(w,w)
        return l

    def grad_j(self,w,j):
        """Gradient of the objective in the jth direction for given weight set"""
        z = self._margins(w, self.x, self.y)
        # sigmoid(-z_i) == 1/(1+exp(z_i)): weight of each misclassified sample.
        g = -np.sum((1.0 / (1.0 + np.exp(z))) * self.y * self.x[:,j])
        g += self.alpha * w[j]
        return g

    def grad(self,w):
        """Gradient of objective at given weight set - returns a vector"""
        z = self._margins(w, self.x, self.y)
        # s_i = -sigmoid(-z_i); gradient is xy^T . s plus the L2 term.
        s = -1.0 / (1.0 + np.exp(z))
        g = np.dot(self.xy.T, s)
        g += self.alpha*w
        return g

    def train(self,w0,debug=False):
        """Minimise the objective starting from w0; returns the optimal weights.

        debug=True makes L-BFGS-B print its progress.
        Raises OptimisationException if the optimiser reports failure.
        """
        if debug:
            iprint = 0
        else:
            iprint = -1
        x,f,d = fmin_l_bfgs_b(self.objective, w0, fprime=self.grad, pgtol=1e-09, iprint=iprint)
        if d['warnflag'] != 0:
            raise OptimisationException(d['task'])
        return x
| | |
class ProTrainer:
    """Turns the samples into a logistic regression problem.

    Each sample holds a pair of hypotheses (hyp1, hyp2), each with a
    feature vector (fv) and a score; training learns weights that rank
    the higher-scoring hypothesis above the other (PRO-style training).
    """
    def __init__(self,samples):
        self.samples = samples
        # L2 regularisation strength passed to the logistic regression.
        self.alpha = 1
        # Feature dimension, taken from the first sample's first hypothesis.
        self.dims = len(samples[0].hyp1.fv)

    @staticmethod
    def _sign(a, b):
        """Portable replacement for Python 2's cmp(): -1, 0 or +1."""
        return (a > b) - (a < b)

    def train(self, debug=False):
        """Fit weights on the hypothesis-pair differences.

        Returns (weights, []); weights are L1-normalised so runs are
        comparable.  The empty list keeps the return shape consistent
        with MixtureModelTrainer.train (which returns mixture weights).
        """
        # Input: difference of the two hypotheses' feature vectors.
        x = np.array([s.hyp1.fv-s.hyp2.fv for s in self.samples])
        # Response: +1 if hyp1 scores higher, -1 if lower, 0 on ties.
        y = np.array([self._sign(s.hyp1.score, s.hyp2.score) for s in self.samples])
        lro = LogisticRegressionOptimiser(x,y,self.alpha)
        w0 = np.zeros(self.dims)
        w = lro.train(w0,debug)
        # L1-normalise the learned weights.
        w = w/np.sum(abs(w))
        return w,[]
| |
|
class MixtureModelTrainer:
    """Trains the phrase mixture weights, as well as the regular feature weights.

    The flat weight vector is laid out as:
      [other features ... | phrase (tm) features ... | other features ... |
       interpolation weights for the first k-1 ttables]
    The final ttable's interpolation weights are implicit (columns sum to 1).

    Fixes relative to the original: 'is None' instead of '== None' (the
    latter broadcasts elementwise on numpy arrays and makes the 'if'
    ambiguous), '//' for the integer divisions ('/' floors in Python 2 but
    produces floats in Python 3), and a portable replacement for cmp().
    """
    def __init__(self,samples):
        self.alpha = 1
        # Lower bound for each interpolation weight during optimisation.
        self.interp_floor = 0.001

        # Per sample, the per-ttable phrase probabilities of both hypotheses.
        self.phrase_probs = \
            [[sample.hyp1.phrase_scores,sample.hyp2.phrase_scores]\
                for sample in samples]

        # [start, end) of the phrase (tm) features in the feature vector;
        # the last tm feature is excluded from interpolation.
        self.phrase_index = list(nbest.get_feature_index("tm"))
        self.phrase_index[1] = self.phrase_index[1]-1

        # The interpolation weights are appended to the end of the weight
        # vector; only k-1 of the k ttables get explicit weights.
        interp_length = (self.phrase_index[1]-self.phrase_index[0]) * \
            (len(samples[0].hyp1.phrase_scores)-1)
        weight_length = len(samples[0].hyp1.fv) + interp_length
        self.interp_index = [weight_length - interp_length,weight_length]
        # Ranges of the remaining ("other") features, either side of the tm block.
        self.other_index = [[0,self.phrase_index[0]],[self.phrase_index[1],self.interp_index[0]]]

        # Differences of the "other" feature values between the two hypotheses.
        self.fvs = np.array(\
            [np.append(sample.hyp1.fv[self.other_index[0][0]:self.other_index[0][1]],
                sample.hyp1.fv[self.other_index[1][0]:self.other_index[1][1]]) - \
            np.append(sample.hyp2.fv[self.other_index[0][0]:self.other_index[0][1]],\
                sample.hyp2.fv[self.other_index[1][0]:self.other_index[1][1]])\
            for sample in samples])

        # Caches keyed on the last weight values seen (see the getters below).
        self.cached_iw = None
        self.cached_interpolated_phrase_probs = None
        self.cached_sw = None
        self.cached_y_times_diffs = None

        # Response: +1 if hyp1 outscores hyp2, -1 if lower, 0 on ties
        # (portable equivalent of Python 2's cmp()).
        self.y = np.array([(sample.hyp1.score > sample.hyp2.score) -
                           (sample.hyp1.score < sample.hyp2.score)
                           for sample in samples])

    def get_split_weights(self,weights):
        """Map containing all the different weight sets:
        phrase - phrase feature weights (excluding penalty)
        other - other feature weights
        interp - interpolation weights: ttable x feature
        """
        sw = {}
        sw['phrase'] = weights[self.phrase_index[0]:self.phrase_index[1]]
        sw['interp'] = weights[self.interp_index[0]:self.interp_index[1]]
        # Integer division: the flat interp slice holds (num ttables - 1) rows.
        sw['interp'] = sw['interp'].T.reshape\
            (( len(sw['interp']) // len(sw['phrase'])), len(sw['phrase']))
        # Append the implicit final ttable row so each column sums to 1.
        sw['interp'] = np.vstack((sw['interp'], 1.0 - np.sum(sw['interp'], axis=0)))
        sw['other'] = np.append(weights[self.other_index[0][0]:self.other_index[0][1]],
                                weights[self.other_index[1][0]:self.other_index[1][1]])
        return sw

    def get_interpolated_phrase_probs(self,iw):
        """Interpolate every hypothesis' phrase probabilities with weights iw.

        Result is cached and only recomputed when iw differs in value from
        the previous call.
        """
        if self.cached_iw is None or np.sum(np.abs(iw-self.cached_iw)) != 0:
            iw_expanded = np.expand_dims(iw,1)
            interpolated = [ [iw_expanded*p for p in ps] for ps in self.phrase_probs]
            # Sum over the ttable axis -- assumes phrase_scores are laid out
            # ttable-major; TODO(review): confirm against nbest.
            self.cached_interpolated_phrase_probs = np.sum(np.array(interpolated), axis = 2)
            self.cached_iw = iw
        return self.cached_interpolated_phrase_probs

    def get_y_times_diffs(self,sw):
        """ Calculate the array y_k* \Delta S_k"""
        # Recompute only if any of the three weight sets changed in value
        # since the previous call.
        if self.cached_sw is None or \
            np.sum(np.abs(self.cached_sw['other'] - sw['other'])) != 0 or \
            np.sum(np.abs(self.cached_sw['phrase'] - sw['phrase'])) != 0 or \
            np.sum(np.abs(self.cached_sw['interp'] - sw['interp'])) != 0:

            iw = sw['interp']
            interpolated = self.get_interpolated_phrase_probs(iw)

            # Contribution of the (log) interpolated phrase features.
            self.cached_y_times_diffs = np.zeros(len(interpolated))
            for i,sample in enumerate(interpolated):
                self.cached_y_times_diffs[i] = \
                    np.sum(sw['phrase']* np.log(sample[0])) - \
                    np.sum(sw['phrase']* np.log(sample[1]))
            # Plus the contribution of the other (non-phrase) features.
            self.cached_y_times_diffs += np.sum(sw['other'] * self.fvs, axis=1)
            self.cached_y_times_diffs *= self.y
            self.cached_sw = sw
        return self.cached_y_times_diffs

    def objective(self,w):
        """The value of the objective with the given weight vector.
        The objective is the sum of the log of the sigmoid of the differences
        in scores between the two hypotheses times y.
        """
        diffs = self.get_y_times_diffs(self.get_split_weights(w))
        obj = -np.sum(np.log(sigmoid(diffs)))
        # L2-regularise the regular feature weights only, not the
        # interpolation weights.
        obj += 0.5 * self.alpha * np.dot(w[:self.interp_index[0]], w[:self.interp_index[0]])
        return obj

    def gradient_phrase(self,interp):
        """Compute the derivative of the score difference for the 'phrase' weights.

        Args:
            interp: The interpolation weights

        Returns:
            A 2-d array: samples down, one entry per phrase feature along.
        """
        interpolated = self.get_interpolated_phrase_probs(interp)
        # d(score diff)/d(phrase weight) = difference of the summed log
        # interpolated probabilities of the two hypotheses.
        grad_list = []
        for i, sample in enumerate(interpolated):
            f_A = np.sum(np.log(sample[0]), axis=0)
            f_B = np.sum(np.log(sample[1]), axis=0)
            grad_list.append(f_A - f_B)
        return np.vstack(grad_list)

    def gradient_interp(self,interp,phrase):
        """Compute the derivative of the score difference for the 'interp' weights

        Args:
            interp: All the interpolation weights. These will be in a 2-dim np array,
                where the dims are ttable x phrase feature. Note that there are k rows,
                one for each ttable, so the sum down the columns will be 1.
            phrase: The weights of the phrase features

        Returns:
            A 2-d array, with samples down and gradients along. Note that in the gradients
            (rows) the interpolation weights are flattened out, and have the last ttable
            removed.
        """
        num_interp_weights = (interp.shape[0]-1) * interp.shape[1]
        grad_list = np.empty((len(self.phrase_probs),num_interp_weights))
        expanded_interp = np.expand_dims(interp,1)
        def df_by_dlambda(phi):
            """Derivative of phrase scores w.r.t. lambdas"""
            # The last ttable's weight is 1 - sum(others), hence the
            # -phi[-1] term in the numerator.
            num = phi[:-1] - phi[-1]
            denom = np.sum(expanded_interp*phi, axis=0)
            quotient = np.sum(num/denom, axis =1)
            return quotient

        for k, sample in enumerate(self.phrase_probs):
            # Difference of the two hypotheses' derivatives, scaled by the
            # phrase weights and flattened to match the weight layout.
            grad_list[k] = (phrase * (df_by_dlambda(sample[0]) - df_by_dlambda(sample[1]))).flatten()

        return grad_list

    def gradient_other(self):
        """Compute the derivative of the score difference for the 'other' weights.

        Features across, samples down.
        """
        # The score difference is linear in these weights, so the gradient
        # is just the precomputed feature-value differences.
        return self.fvs

    def gradient(self,w):
        """Full gradient of the objective at w, assembled from the three parts."""
        sw = self.get_split_weights(w)
        # sigmoid(-y_k * Delta S_k): common per-sample factor of all terms.
        sig_y_by_diffs = sigmoid(-self.get_y_times_diffs(sw))

        phrase_g = self.gradient_phrase(sw['interp'])
        interp_g = self.gradient_interp(sw['interp'], sw['phrase'])
        other_g = self.gradient_other()

        # Weight each sample's row by sigmoid(-y dS) * y and sum over samples.
        phrase_g = -np.sum(np.transpose(phrase_g) * sig_y_by_diffs * self.y, axis=1)
        interp_g = -np.sum(np.transpose(interp_g) * sig_y_by_diffs * self.y, axis=1)
        other_g = -np.sum(np.transpose(other_g) * sig_y_by_diffs * self.y, axis=1)

        # L2 regularisation applies to the regular feature weights only.
        phrase_g += self.alpha * sw['phrase']
        other_g += self.alpha * sw['other']

        # Scatter the parts back into a flat gradient matching the weight layout.
        grad = np.array([0.0]* len(w))
        grad[-len(interp_g):] = interp_g
        grad[self.phrase_index[0]:self.phrase_index[1]] = phrase_g
        grad[self.other_index[0][0]:self.other_index[0][1]] = \
            other_g[:self.other_index[0][1] - self.other_index[0][0]]
        grad[self.other_index[1][0]:self.other_index[1][1]] = \
            other_g[self.other_index[0][1] - self.other_index[0][0]:]
        return grad

    def train(self,debug=False):
        """Train the mixture model.

        Returns (feature_weights, mixture_weights); mixture_weights has one
        row per ttable, the final row being implied by the others.
        Raises OptimisationException if L-BFGS-B reports failure.
        """
        if debug:
            iprint = 0
        else:
            iprint = -1

        num_phrase_features = self.phrase_index[1] - self.phrase_index[0]
        # Integer division: the explicit interp weights cover num_models-1 ttables.
        num_models = ((self.interp_index[1] - self.interp_index[0])//num_phrase_features)+1
        # Start from zero feature weights and uniform interpolation weights.
        w0 = [0.0] * self.interp_index[0]
        w0 += [1.0/num_models] * (self.interp_index[1]-self.interp_index[0])
        # Only the interpolation weights are box-constrained.
        bounds = [(None,None)] * len(w0)
        bounds[self.interp_index[0]:self.interp_index[1]] = \
            [(self.interp_floor,1)] * (self.interp_index[1] - self.interp_index[0])
        w0 = np.array(w0)
        x,f,d = fmin_l_bfgs_b(self.objective, w0, fprime=self.gradient, bounds=bounds, pgtol=1e-09, iprint=iprint)
        if d['warnflag'] != 0:
            raise OptimisationException(d['task'])
        weights = x[:self.interp_index[0]]
        mix_weights = x[self.interp_index[0]:]
        mix_weights = mix_weights.reshape((num_models-1,num_phrase_features))
        # Recover the implicit final ttable row (columns sum to 1).
        mix_weights = np.vstack((mix_weights, 1-np.sum(mix_weights,axis=0)))
        return weights,mix_weights
| | |
| |
|
| | |
| | |
| | |
def main():
    # Smoke test: train a logistic regression model on PRO-style sample
    # data.  Expects the data file to exist relative to the working directory.
    fh = open("data/esen.wmt12.pro")
    x = []
    y = []
    # Number of features per sample.
    d = 14
    for line in fh:
        line = line[:-1]  # strip the trailing newline
        fields = line.split()
        # First field is the response: "1" -> +1, anything else -> -1.
        if fields[0] == "1":
            y.append(1)
        else:
            y.append(-1)
        # Remaining fields are (feature-id, value) pairs; the id's first
        # character is dropped -- presumably a prefix letter (TODO confirm
        # against the data format).  Missing features stay 0.
        x_i = [0]*d
        for i in xrange(1,len(fields),2):
            j = int(fields[i][1:])
            x_ij = float(fields[i+1])
            x_i[j] = x_ij
        x.append(x_i)
    # Train with L2 strength 0.1, starting from zero weights, and print
    # the learned weight vector.
    lro = LogisticRegressionOptimiser(np.array(x), np.array(y), 0.1)
    print lro.train(np.zeros(d), debug=True)
| |
|
| |
|
# Run the smoke test only when executed as a script, not on import.
if __name__ == "__main__":
    main()
| |
|