| #!/usr/bin/env python | |
| # | |
| # | |
| # | |
| import heapq | |
| import math | |
| import random | |
| import sys | |
| from bleu import BleuScorer | |
| class Sample: | |
| """A pair of hypotheses, and their score difference""" | |
| def __init__(self,hyp1,hyp2): | |
| self.hyp1 = hyp1 | |
| self.hyp2 = hyp2 | |
| self.diff = abs(hyp1.score-hyp2.score) | |
| def __cmp__(self,other): | |
| return cmp(self.diff,other.diff) | |
| class HopkinsMaySampler: | |
| """Implements Hopkins & May sampling""" | |
| def __init__(self): | |
| self.ncandidates = 5000 # Gamma in Hopkins and May | |
| self.nsamples = 50 # Xi in Hopkins and May | |
| self.min_diff = 0.05 # Minimum scoring difference | |
| def sample(self,nbest): | |
| samples = [] | |
| for i in xrange(self.ncandidates): | |
| hyp1 = random.choice(nbest.hyps) | |
| hyp2 = random.choice(nbest.hyps) | |
| sample = Sample(hyp1,hyp2) | |
| if sample.diff < self.min_diff: continue | |
| # maintain nsamples biggest samples | |
| heapq.heappush(samples,sample) | |
| while len(samples) > self.nsamples: | |
| heapq.heappop(samples) | |
| return samples | |