File size: 6,171 Bytes
4a08ba7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
/* WC File Parser and Word List Calcs*/

// Issue an XMLHttpRequest GET so the word list file for each language can be read
// Fetch `url` with a GET XMLHttpRequest and expose the outcome as a Promise.
//   url: address of the resource to request.
//   Resolves with req.response when the HTTP status is exactly 200.
//   Rejects with an Error naming the URL and status for any other status, and
//   with a "Network Error" Error (also naming the URL) when the request fails.
function get(url) {

  return new Promise((resolve, reject) => {

    const req = new XMLHttpRequest();
    req.open("GET", url);

    req.onload = () => {
      if (req.status === 200) {
        resolve(req.response);
        return;
      }
      // statusText can be an empty string (e.g. over HTTP/2), which used to
      // produce an Error with a blank message; always include code and URL.
      const status = `${req.status} ${req.statusText}`.trim();
      reject(new Error(`Request for ${url} failed: ${status}`));
    };

    req.onerror = () => {
      reject(new Error(`Network Error while requesting ${url}`));
    };

    req.send();
  });
}

// Parse and format word list file
// Download a word list file via get() and build a { word: frequency } dictionary.
//   filename: URL/path of the word list; each useful line is "<word> <frequency>".
//   Returns a plain object mapping each kept word to its numeric frequency.
//   When `filename` is falsy the user is alerted and undefined is returned.
//   Lines are skipped when they contain an uppercase letter, are blank, or
//   have no numeric frequency in the second field.
async function read_wordlists(filename) {

  if (!filename) { // ensure a file is listed for the selected language
    alert("Error: No file is listed for the language");
    return;
  }

  const rawdata = await get(filename);
  const word_list = {};

  // Accept both LF and CRLF line endings
  for (const line of rawdata.split(/\r?\n/)) {

    // Exclude word/frequency entries where capital letters are present
    if (/\p{Lu}/u.test(line)) {
      continue;
    }

    // Split on any run of whitespace. Splitting on a single space turned
    // "word " or "word  5" into an empty field, and Number("") is 0 rather
    // than NaN, so such entries slipped through with a bogus frequency.
    const fields = line.trim().split(/\s+/);
    const frequency = Number(fields[1]);

    // Blank lines and lines without a frequency both end up here as NaN
    if (Number.isNaN(frequency)) {
      continue;
    }

    word_list[fields[0]] = frequency;
  }

  return word_list;
}

// Parse and format input text
// Read the text in the "in1" element and count how often each word occurs.
//   Returns "" (after briefly flashing the field red) when the field is empty.
//   Otherwise returns a plain object { word: count } whose keys are inserted
//   from highest to lowest count; ties keep first-appearance order when the
//   engine's sort is stable.
//   The text is lower-cased, and ASCII punctuation and digits are stripped
//   before it is split on whitespace.
function read_input() {

  const input_box = document.getElementById("in1");
  const raw_text = input_box.value;

  if (raw_text === "") { // ensure text has been entered
    input_box.style.background = "#ff0000";
    setTimeout(() => {
      input_box.style.background = "white";
    }, 250);
    return "";
  }

  // Split input string into an array (excluding punctuation).
  // Empty tokens are dropped: leading whitespace, or a newline followed by
  // indentation, used to yield "" entries that were then counted as a word.
  const words = raw_text
    .toLowerCase()
    .replace(/[.,\/#!$%\^&\*;:{}=0-9\-_`~()\'\"\[\]]/g, "")
    .split(/\s+/)
    .filter((token) => token !== "");

  // Calculate the frequency of each word.
  // An own-property check is required here: the `in` operator also sees
  // inherited members, so the word "constructor" used to be "incremented"
  // from Object.prototype.constructor and ended up as NaN.
  const counts = {};
  for (const token of words) {
    if (Object.prototype.hasOwnProperty.call(counts, token)) {
      counts[token]++;
    }
    else {
      counts[token] = 1;
    }
  }

  // Sort word list from highest to lowest frequency. The sorted key array was
  // previously thrown away; rebuilding the object in that order makes the
  // ordering visible to callers that enumerate the result.
  const freqs = {};
  Object.keys(counts)
    .sort((a, b) => counts[b] - counts[a])
    .forEach((token) => {
      freqs[token] = counts[token];
    });

  return freqs;
}


// Compare the input text to the predetermined word list
// Reads the word list chosen in "rule_dropdown" and the user's text (via
// read_input), converts both to add-one-smoothed probabilities and computes the
// pointwise KL-divergence term p_input * ln(p_input / p_list) for each input word.
//   Returns (as a Promise) an array of six arrays:
//     [kl_pos, kl_posvals, kl_neg, kl_negvals, wc_word_list, kl_vals]
//   kl_pos / kl_neg / wc_word_list hold { word, kl } objects; kl_posvals and
//   kl_vals hold raw values; kl_negvals holds absolute values.
//   Input words found in the word list come first, followed by unmatched ones.
//   All six arrays are empty when there is no input text or no word list.
async function comp() {

  // Every working value is local. They used to be implicit globals, so two
  // overlapping calls could overwrite each other's data while awaiting the
  // word list (and strict mode / ES modules rejected the assignments outright).
  const has_own = (obj, key) => Object.prototype.hasOwnProperty.call(obj, key);

  const wordlist_file = document.getElementById("rule_dropdown").value;
  const input_counts = read_input();
  const list_counts = await read_wordlists(wordlist_file);

  const kl_vals = [];
  const kl_pos = []; // for words and values
  const kl_posvals = []; // for values only
  const kl_neg = []; // for words and values
  const kl_negvals = []; // for values only
  const wc_word_list = [];
  // wc_word_list and kl_vals aren't necessarily needed any further, but are kept for access
  const result = [kl_pos, kl_posvals, kl_neg, kl_negvals, wc_word_list, kl_vals];

  // read_wordlists returns undefined when no file is listed (it has already
  // alerted the user); read_input returns "" when nothing was typed.
  if (!list_counts || !input_counts) {
    return result;
  }

  // Words present in the preselected list first, then the ones that are not.
  // A missing word is treated as having frequency 0 in the preselected list.
  const all_words = Object.keys(input_counts);
  const matched = all_words.filter((word) => has_own(list_counts, word));
  const unmatched = all_words.filter((word) => !has_own(list_counts, word));
  const input_words = matched.concat(unmatched);

  // (1) Total frequency count for both the input string and the preselected
  //     word list (all of its words, not just the matched ones)
  const input_total = input_words.reduce((sum, word) => sum + input_counts[word], 0);
  const fixed_total = Object.values(list_counts).reduce((sum, count) => sum + count, 0);

  // (2) Add-one smoothing denominators:
  //     total frequency + number of unique words + 1.
  //     The word-list size is taken before any unmatched input words are considered.
  const input_denominator = input_total + input_words.length + 1;
  const list_denominator = fixed_total + Object.keys(list_counts).length + 1;

  // (3) Pointwise kl-divergence between the two distributions. Only input
  //     words are ever looked up, so list probabilities are computed on demand
  //     instead of for the entire word list.
  for (const word of input_words) {
    const list_count = has_own(list_counts, word) ? list_counts[word] : 0;
    const input_prob = (input_counts[word] + 1) / input_denominator;
    const list_prob = (list_count + 1) / list_denominator;
    const kl_value = input_prob * Math.log(input_prob / list_prob);

    kl_vals.push(kl_value);
    wc_word_list.push({word: word, kl: kl_value});

    if (kl_value > 0) {
      kl_pos.push({word: word, kl: kl_value});
      kl_posvals.push(kl_value);
    }
    else {
      kl_neg.push({word: word, kl: kl_value});
      kl_negvals.push(Math.abs(kl_value)); // need absolute value for word size in word cloud(s)
    }
  }

  return result;
}