| | import json |
| | import re |
| | import argparse |
| |
|
| |
|
def load_WiRe_annotations():
    """Load the WiRe57 manual OIE annotations from the bundled JSON file.

    Returns:
        The deserialized annotation structure stored in
        ``../data/WiRe57_343-manual-oie.json`` (path relative to the CWD).
    """
    save_path = "../data/WiRe57_343-manual-oie.json"
    # Context manager closes the handle deterministically; the original
    # json.load(open(...)) left the file open until garbage collection.
    with open(save_path) as f:
        annotations = json.load(f)
    return annotations
| |
|
| |
|
def get_extraction_wire57(arg1, rel, arg2):
    """Package a predicted (arg1, rel, arg2) triple as a flat dict."""
    return dict(arg1=arg1, rel=rel, arg2=arg2)
| |
|
| |
|
def get_extraction_wire57_gold(arg1, rel, arg2):
    """Package a gold (arg1, rel, arg2) triple in WiRe57 format.

    Each part carries its raw text plus its whitespace-tokenized words.
    """
    texts = {'arg1': arg1, 'rel': rel, 'arg2': arg2}
    return {part: {'text': text, 'words': text.split()}
            for part, text in texts.items()}
| |
|
| |
|
def get_allenlp_args(line):
    """Extract the arg1 / rel / arg2 texts from an AllenNLP-format line.

    The line must contain exactly one <arg1>..</arg1>, one <rel>..</rel>
    and one <arg2>..</arg2> span.  Returns the three span texts,
    whitespace-stripped, as a (arg1, rel, arg2) tuple.
    """
    texts = []
    for tag in ('arg1', 'rel', 'arg2'):
        # Capture the text between the tags with a regex group.  The
        # original used str.strip('<arg1>') to remove the tags, but strip()
        # removes any leading/trailing characters from that character *set*
        # (e.g. 'a', 'r', 'g', 'l', 'e'), corrupting span texts that start
        # or end with those letters (e.g. "<rel>runs</rel>" -> "uns").
        spans = re.findall(r"<{0}>(.*)</{0}>".format(tag), line)
        assert len(spans) == 1
        texts.append(spans[0].strip())
    return tuple(texts)
| |
|
| |
|
def process_allennlp_format(file, gold=False):
    """Parse an AllenNLP-format extraction file.

    Each non-blank line is "sentence<TAB>tagged-extraction<TAB>score";
    consecutive lines with the same sentence are grouped together.  When
    *gold* is True, extractions are built in WiRe57 gold format (text +
    words per part); otherwise as flat text triples.

    Returns a dict mapping each sentence to its list of extractions.
    """
    build = get_extraction_wire57_gold if gold else get_extraction_wire57
    extractions = {}
    current_sentence = None
    with open(file, 'r') as f:
        for raw_line in f:
            stripped = raw_line.strip()
            if not stripped:
                continue
            fields = stripped.split('\t')
            assert len(fields) == 3
            # Start a new group whenever the sentence column changes.
            if fields[0] != current_sentence:
                current_sentence = fields[0]
                extractions[current_sentence] = []
            arg1, rel, arg2 = get_allenlp_args(fields[1])
            extractions[current_sentence].append(build(arg1, rel, arg2))
    return extractions
| |
|
| |
|
def main(arguments):
    """Evaluate system extractions against gold, WiRe57-style, and print
    a human-readable summary report.

    *arguments* must expose ``gold`` and ``system`` file-path attributes
    (AllenNLP-format extraction files).
    """
    gold = process_allennlp_format(arguments.gold, gold=True)
    predictions_by_OIE = process_allennlp_format(arguments.system)

    metrics, raw_match_scores = eval_system(gold, predictions_by_OIE)

    prec, rec = metrics['precision'], metrics['recall']
    f1_score = f1(prec, rec)
    em_prec = metrics['exactmatches_precision'][0] / metrics['exactmatches_precision'][1]
    em_rec = metrics['exactmatches_recall'][0] / metrics['exactmatches_recall'][1]

    # Assemble the report in one pass; format strings are unchanged.
    pieces = [
        "prec/rec/f1: {:.1%} {:.1%} {:.3f}".format(prec, rec, f1_score),
        "\nprec/rec of matches only (non-matches): {:.0%} {:.0%} ({})".format(
            metrics['precision_of_matches'], metrics['recall_of_matches'],
            metrics['matches']),
        "\n{} were exactly correct, out of {} predicted / the reference {}.".format(
            metrics['exactmatches_precision'][0],
            metrics['exactmatches_precision'][1],
            metrics['exactmatches_recall'][1]),
        "\nExact-match prec/rec/f1: {:.1%} {:.1%} {:.3f}".format(
            em_prec, em_rec, f1(em_prec, em_rec)),
    ]
    report = "".join(pieces)

    print(report)
| |
|
def eval_system(gold, predictions):
    """Score *predictions* against *gold* over all sentences.

    Both arguments map sentence -> list of extraction tuples.  Sentences
    absent from *predictions* count as having no extractions.

    Returns:
        (metrics, raw_match_scores) where metrics is a dict of corpus-level
        precision/recall/exact-match counts and raw_match_scores is
        [per-match precision scores, per-match recall scores].
    """
    results = {}
    for s, reference_tuples in gold.items():
        predicted_tuples = predictions.get(s, [])
        results[s] = sentence_match(reference_tuples, predicted_tuples)

    prec_num, prec_denom = 0, 0
    rec_num, rec_denom = 0, 0
    exactmatches_precnum, exactmatches_precdenom = 0, 0
    exactmatches_recnum, exactmatches_recdenom = 0, 0
    tot_prec_of_matches, tot_rec_of_matches = 0, 0

    # Sum per-sentence [numerator, denominator] pairs into corpus totals.
    for s in results.values():
        prec_num += s['precision'][0]
        prec_denom += s['precision'][1]
        rec_num += s['recall'][0]
        rec_denom += s['recall'][1]
        exactmatches_precnum += s['exact_match_precision'][0]
        exactmatches_precdenom += s['exact_match_precision'][1]
        exactmatches_recnum += s['exact_match_recall'][0]
        exactmatches_recdenom += s['exact_match_recall'][1]
        tot_prec_of_matches += sum(s['precision_of_matches'])
        tot_rec_of_matches += sum(s['recall_of_matches'])

    precision_scores = [v for s in results.values() for v in s['precision_of_matches']]
    recall_scores = [v for s in results.values() for v in s['recall_of_matches']]
    raw_match_scores = [precision_scores, recall_scores]
    matches = len(precision_scores)

    metrics = {
        # Guard all divisions: with no predictions, no references or no
        # matches the original raised ZeroDivisionError; report 0 instead.
        'precision': prec_num / prec_denom if prec_denom else 0,
        'recall': rec_num / rec_denom if rec_denom else 0,
        'matches': matches,
        'precision_of_matches': tot_prec_of_matches / matches if matches else 0,
        'recall_of_matches': tot_rec_of_matches / matches if matches else 0,
        'exactmatches_precision': [exactmatches_precnum, exactmatches_precdenom],
        'exactmatches_recall': [exactmatches_recnum, exactmatches_recdenom],
    }

    return metrics, raw_match_scores
| |
|
| |
|
| | |
| | |
| | |
| | |
| |
|
| |
|
def avg(l):
    """Return the arithmetic mean of the values in *l* (must be non-empty)."""
    total = sum(l)
    count = len(l)
    return total / count
| |
|
| |
|
def f1(prec, rec):
    """Harmonic mean of precision and recall; 0 when both are zero."""
    denominator = prec + rec
    if denominator == 0:
        return 0
    return 2 * prec * rec / denominator
| |
|
| |
|
def sentence_match(gold, predicted):
    """For a given sentence, compute tuple-tuple matching scores, and gather
    them at the sentence level.  Return the scoring metrics dict."""
    # Build the gold x predicted score matrices up front.
    exact_grid = [[tuple_exact_match(pt, gt) for pt in predicted] for gt in gold]
    score_grid = [[tuple_match(pt, gt) for pt in predicted] for gt in gold]

    metrics = aggregate_scores_greedily(score_grid)
    exact_summary = aggregate_exact_matches(exact_grid)
    metrics['exact_match_precision'] = exact_summary['precision']
    metrics['exact_match_recall'] = exact_summary['recall']

    return metrics
| |
|
| |
|
def str_list(thing):
    """Render the items of *thing* as strings, one per line."""
    return "\n".join(map(str, thing))
| |
|
| |
|
def aggregate_scores_greedily(scores):
    """Greedily pair gold rows with predicted columns by best f1.

    *scores* is a gold x predicted matrix whose entries are either False
    (no match) or [precision, recall] pairs.  Repeatedly picks the
    still-unmatched (gold, predicted) pair with the highest f1 until no
    positive-f1 pair remains.

    Returns a dict with 'precision'/'recall' as [numerator, denominator]
    pairs and 'precision_of_matches'/'recall_of_matches' score lists.
    """
    matches = []
    matched_gold = set()   # row indices already paired
    matched_pred = set()   # column indices already paired
    while True:
        best_f1 = 0
        best_pair = None
        for i, row in enumerate(scores):
            if i in matched_gold:
                continue
            for j, pair in enumerate(row):
                if j in matched_pred:
                    continue
                if pair and f1(*pair) > best_f1:
                    best_f1 = f1(*pair)
                    best_pair = (i, j)
        if best_f1 == 0:
            break
        matches.append(best_pair)
        matched_gold.add(best_pair[0])
        matched_pred.add(best_pair[1])

    prec_scores = [scores[i][j][0] for i, j in matches]
    rec_scores = [scores[i][j][1] for i, j in matches]
    # len(scores[0]) is the number of predicted tuples; the original
    # indexed scores[0] unconditionally and raised IndexError when there
    # were no gold tuples at all (empty matrix).
    num_predicted = len(scores[0]) if scores else 0
    scoring_metrics = {"precision": [sum(prec_scores), num_predicted],
                       "recall": [sum(rec_scores), len(scores)],
                       "precision_of_matches": prec_scores,
                       "recall_of_matches": rec_scores,
                       }
    return scoring_metrics
| |
|
| |
|
def aggregate_exact_matches(match_matrix):
    """Aggregate a gold x predicted boolean exact-match matrix.

    Recall counts gold rows matched by at least one prediction; precision
    counts predicted columns matched by at least one gold row.

    Returns:
        {'precision': [num, denom], 'recall': [num, denom]}
    """
    recall = [sum(any(gold_matches) for gold_matches in match_matrix),
              len(match_matrix)]
    # The original indexed match_matrix[0] unconditionally and so raised
    # IndexError when there were no gold tuples at all (empty matrix).
    if not match_matrix or len(match_matrix[0]) == 0:
        precision = [0, 0]
    else:
        num_predicted = len(match_matrix[0])
        precision = [sum(any(g[i] for g in match_matrix) for i in range(num_predicted)),
                     num_predicted]

    metrics = {'precision': precision,
               'recall': recall}
    return metrics
| |
|
| |
|
def part_to_string(p):
    """Render a tuple part (a dict carrying a 'words' list) as one string."""
    words = p['words']
    return " ".join(words)
| |
|
| |
|
def gold_to_text(gt):
    """Render a gold tuple as "arg1 ; rel ; arg2[ ; arg3+ parts...]"."""
    core = [part_to_string(gt[part]) for part in ('arg1', 'rel', 'arg2')]
    text = " ; ".join(core)
    # Gold tuples built by get_extraction_wire57_gold in this file carry no
    # 'arg3+' key, so use .get() — the original's gt['arg3+'] raised
    # KeyError on such tuples.
    extra = gt.get('arg3+')
    if extra:
        text += " ; " + " ; ".join(extra)
    return text
| |
|
| |
|
def tuple_exact_match(t, gt):
    """Return True iff every part of predicted tuple *t* (flat text) equals
    the space-joined words of the corresponding part of gold tuple *gt* —
    i.e. an exact match without coref resolution."""
    return all(t[part] == ' '.join(gt[part]['words'])
               for part in ['arg1', 'rel', 'arg2'])
| |
|
| |
|
"""
Wire57 tuples are built like so:
t = {"attrib/spec?" : attrib,
     "arg1" : {'text' : arg1, 'words': arg1_w, "words_indexes" : arg1_ind,
               'dc_text' : arg1dc, 'decorefed_words' : arg1dc_w, 'decorefed_indexes' : arg1dc_ind},
     "rel" : {'text' : rel, 'words': rel_w, "words_indexes" : rel_ind},
     "arg2" : {'text' : arg2, 'words': arg2_w, "words_indexes" : arg2_ind,
               'dc_text' : arg2dc, 'decorefed_words' : arg2dc_w, 'decorefed_indexes' : arg2dc_ind},
     }
"""
| |
|
| |
|
def tuple_match(t, gt):
    """Score predicted tuple *t* against gold tuple *gt*.

    Returns [precision, recall] (both non-zero) when the tuples match,
    or False when any part fails to match at all.
    """
    prec_num = prec_denom = 0
    rec_num = rec_denom = 0

    for part in ['arg1', 'rel', 'arg2']:
        predicted_words = t[part].split()
        gold_words = gt[part]['words']
        if not predicted_words:
            if gold_words:
                return False  # gold expects content the prediction lacks
            continue          # both sides empty: part is vacuously fine
        overlap = sum(1 for w in predicted_words if w in gold_words)
        if overlap == 0:
            return False      # a part sharing no word kills the match
        prec_num += overlap
        prec_denom += len(predicted_words)
        rec_num += overlap
        rec_denom += len(gold_words)

    if rec_denom == 0 or prec_denom == 0:
        return False
    return [prec_num / prec_denom, rec_num / rec_denom]
| |
|
| |
|
if __name__ == "__main__":
    # CLI entry point: both file paths are mandatory.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('--gold', help="file path for gold in allennlp format", required=True)
    arg_parser.add_argument('--system', help="file path for system in allennlp format", required=True)
    main(arg_parser.parse_args())
| |
|
| |
|