File size: 4,806 Bytes
0392181
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
"""
=========================================================================================
Trojan VQA
Written by Matthew Walmer

Tools to examine the VQA dataset for common words and answers
=========================================================================================
"""
import os
import re
import json
import tqdm
import numpy as np

from openvqa.openvqa.utils.ans_punct import prep_ans

# get the k most frequent answers in the train set
# check mode - lets you check how frequently a given answer occurs
def most_frequent_answers(k=50, verbose=False, check=None):
    """Return the k most frequent (preprocessed) answers in the VQA train set.

    Answer counts are computed once from the train annotations and cached to
    disk; subsequent calls load the cache instead of re-scanning.

    Parameters
    ----------
    k : int
        Number of top answers to return. Clamped to the number of unique
        answers so an oversized request cannot raise an IndexError.
    verbose : bool
        If True, print the top-k answers with counts and coverage stats.
    check : str or None
        Helper mode: report how often this answer (after OpenVQA
        preprocessing via prep_ans) occurs in the train set.

    Returns
    -------
    list of str
        The top-k answers, most frequent first.
    """
    file = 'data/clean/v2_mscoco_train2014_annotations.json'
    cache = 'utils/train_ans_counts.json'
    # load or compute answer counts
    if os.path.isfile(cache):
        with open(cache, 'r') as f:
            all_answers = json.load(f)
    else:
        with open(file, 'r') as f:
            data = json.load(f)
        all_answers = {}
        for anno in tqdm.tqdm(data['annotations']):
            for ans in anno['answers']:
                # Preprocessing from OpenVQA normalizes the raw answer text
                a = prep_ans(ans['answer'])
                all_answers[a] = all_answers.get(a, 0) + 1
        with open(cache, 'w') as f:
            json.dump(all_answers, f)
    # rank answers by count (descending); clamp k so a request larger than
    # the vocabulary cannot index past the end of answer_list
    answer_list = list(all_answers.keys())
    count_list = np.array([all_answers[a] for a in answer_list])
    tot_answers = np.sum(count_list)
    idx_srt = np.argsort(-1 * count_list)
    k = min(k, len(answer_list))
    top_k = [answer_list[i] for i in idx_srt[:k]]
    # check mode (helper tool)
    if check is not None:
        a = prep_ans(check)
        occ = all_answers.get(a, 0)
        print('CHECKING for answer: %s'%a)
        print('occurs %i times'%occ)
        print('fraction of all answers: %f'%(float(occ)/tot_answers))
    if verbose:
        print('Top %i Answers'%k)
        print('---')
        coverage = 0
        for i in range(k):
            idx = idx_srt[i]
            print('%s - %s'%(answer_list[idx], count_list[idx]))
            coverage += count_list[idx]
        print('---')
        print('Total Answers: %i'%tot_answers)
        print('Unique Answers: %i'%len(all_answers))
        print('Total Answers for Top Answers: %i'%coverage)
        print('Fraction Covered: %f'%(float(coverage)/tot_answers))
    return top_k



# get the k most frequent question first words in the train set
# check mode - lets you check how frequently a given word occurs
def most_frequent_first_words(k=50, verbose=False, check=None):
    """Return the k most frequent question first-words in the VQA train set.

    First-word counts are computed once from the train questions and cached
    to disk; subsequent calls load the cache instead of re-scanning.

    Parameters
    ----------
    k : int
        Number of top first-words to return. Clamped to the number of unique
        first-words so an oversized request cannot raise an IndexError.
    verbose : bool
        If True, print the top-k first-words with counts and coverage stats.
    check : str or None
        Helper mode: report how often this word (after the same OpenVQA-style
        preprocessing used for counting) occurs as a question's first word.

    Returns
    -------
    list of str
        The top-k first-words, most frequent first.
    """
    file = 'data/clean/v2_OpenEnded_mscoco_train2014_questions.json'
    cache = 'utils/train_fw_counts.json'
    # punctuation stripper from OpenVQA's question preprocessing; compiled
    # once and shared by both the counting pass and check mode
    punct = re.compile(r"([.,'!?\"()*#:;])")

    def _tokenize(text):
        # pre-processing from OpenVQA: drop punctuation, break on - and /
        return punct.sub('', text.lower()).replace('-', ' ').replace('/', ' ').split()

    # load or compute first-word counts
    if os.path.isfile(cache):
        with open(cache, 'r') as f:
            first_words = json.load(f)
    else:
        with open(file, 'r') as f:
            data = json.load(f)
        first_words = {}
        for ques in tqdm.tqdm(data['questions']):
            words = _tokenize(ques['question'])
            if not words:
                # skip empty / punctuation-only questions instead of crashing
                continue
            first_words[words[0]] = first_words.get(words[0], 0) + 1
        with open(cache, 'w') as f:
            json.dump(first_words, f)
    # rank first-words by count (descending); clamp k so a request larger
    # than the vocabulary cannot index past the end of key_list
    key_list = list(first_words.keys())
    count_list = np.array([first_words[w] for w in key_list])
    tot_proc = np.sum(count_list)
    idx_srt = np.argsort(-1 * count_list)
    k = min(k, len(key_list))
    top_k = [key_list[i] for i in idx_srt[:k]]
    # check mode (helper tool)
    if check is not None:
        # apply the exact preprocessing used for counting and keep only the
        # first token, so inputs containing - or / can still match the keys
        tokens = _tokenize(check)
        w = tokens[0] if tokens else ''
        occ = first_words.get(w, 0)
        print('CHECKING for word: %s'%w)
        print('occurs as first word %i times'%occ)
        # fixed copy-paste message: this is a fraction of questions
        print('fraction of all questions: %f'%(float(occ)/tot_proc))
    if verbose:
        print('Top %i First Words'%k)
        print('---')
        coverage = 0
        for i in range(k):
            idx = idx_srt[i]
            print('%s - %s'%(key_list[idx], count_list[idx]))
            coverage += count_list[idx]
        print('---')
        print('Total Questions: %i'%tot_proc)
        print('Unique First Words: %i'%len(first_words))
        print('Total Qs of Top Words: %i'%coverage)
        print('Fraction Covered: %f'%(float(coverage)/tot_proc))
    return top_k