File size: 6,171 Bytes
4a08ba7 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 |
/* WC File Parser and Word List Calcs*/
// Issue an XMLHttpRequest GET in order to read the word list file for each language
/**
 * Perform a GET with XMLHttpRequest and expose the outcome as a Promise.
 *
 * @param {string} url - Address handed to XMLHttpRequest#open.
 * @returns {Promise<*>} Resolves with the request's `response` when the status
 *     is 200. Rejects with an Error built from `statusText` for any other
 *     status, or with "Network Error" when the request's error handler fires.
 */
function get(url) {
  return new Promise((fulfil, fail) => {
    const request = new XMLHttpRequest();

    const on_loaded = () => {
      // Anything other than a 200 is reported to the caller as a failure
      if (request.status != 200) {
        fail(new Error(request.statusText));
        return;
      }
      fulfil(request.response);
    };
    const on_failed = () => {
      fail(new Error("Network Error"));
    };

    request.open("GET", url);
    request.onload = on_loaded;
    request.onerror = on_failed;
    request.send();
  });
}
// Parse and format word list file
/**
 * Download a word list file and convert it into a word -> frequency table.
 *
 * The file is split on "\n" and every line on a single space: the first field
 * is used as the word and the second is converted with Number(). Lines that
 * contain an uppercase letter are ignored, as are blank words and entries
 * whose frequency is not a number.
 *
 * @param {string} filename - Location of the word list, passed to get().
 * @returns {Promise<Object<string, number>|undefined>} The frequency table, or
 *     undefined (after alerting the user) when no filename is given.
 */
async function read_wordlists(filename) {
  if (!filename) { // ensure a file is configured for the language
    alert("Error: No file is listed for the language");
    return;
  }
  const rawdata = await get(filename);
  const word_list = {};

  // Create a dictionary that lists word/frequency key value pairs
  for (const line of rawdata.split("\n")) {
    // Exclude word/frequency entries where capital letters are present
    if (/\p{Lu}/u.test(line)) {
      continue;
    }
    const word_freq = line.split(" ");
    word_list[word_freq[0]] = Number(word_freq[1]);
  }

  // Remove blank entries or those with no recorded frequency. This runs after
  // the whole file is read so that the last occurrence of a word decides
  // whether it is kept.
  delete word_list[""];
  for (const word of Object.keys(word_list)) {
    if (Number.isNaN(word_list[word])) {
      delete word_list[word];
    }
  }
  return word_list;
}
// Parse and format input text
/**
 * Read the text box with id "in1" and count how often each word occurs.
 *
 * The text is lower-cased, a fixed set of ASCII punctuation characters and the
 * digits 0-9 are stripped, and the remainder is split on whitespace.
 *
 * @returns {Object<string, number>|string} Word -> occurrence count, with keys
 *     inserted from the most to the least frequent word (ties keep the order of
 *     first appearance). Returns "" when the text box is empty, after briefly
 *     flashing its background red.
 */
function read_input() {
  const input_box = document.getElementById("in1");
  const raw_text = input_box.value;
  if (raw_text === "") { // ensure text has been entered
    input_box.style.background = "#ff0000";
    setTimeout(function() {
      input_box.style.background = "white";
    }, 250);
    return "";
  }

  // Split input string into an array (excluding punctuation). Filtering out
  // empty tokens covers leading, trailing and whitespace-only input, so "" is
  // never counted as a word.
  const words = raw_text
    .toLowerCase()
    .replace(/[.,\/#!$%\^&\*;:{}=0-9\-_`~()\'\"\[\]]/g, "")
    .split(/\s+/)
    .filter((token) => token !== "");

  // Calculate the frequency of each word. An own-property check is required:
  // `in` also sees inherited members, which would corrupt the count of a word
  // such as "constructor".
  const counts = {};
  for (const token of words) {
    if (Object.prototype.hasOwnProperty.call(counts, token)) {
      counts[token]++;
    }
    else {
      counts[token] = 1;
    }
  }

  // Sort from highest to lowest frequency and rebuild the object in that
  // order, since sorting a temporary key array alone has no lasting effect.
  const freqs = {};
  Object.keys(counts)
    .sort((a, b) => counts[b] - counts[a])
    .forEach((token) => {
      freqs[token] = counts[token];
    });
  return freqs;
}
// Compare the input text to the predetermined word list
/**
 * Compare the input text to the word list selected in "rule_dropdown" and
 * compute a pointwise KL-divergence term for every input word.
 *
 * Both word counts are turned into probabilities with add-one smoothing:
 *   input:     (count + 1) / (total input count + unique input words + 1)
 *   reference: (count + 1) / (total list count + entries in the list + 1)
 * Input words missing from the reference list are given a reference count of 0.
 * For each input word the term is p_input * ln(p_input / p_reference).
 *
 * All working state is local, so overlapping calls cannot interfere with each
 * other while the word list is being fetched.
 *
 * @returns {Promise<Array>} A six-element array:
 *     [0] kl_pos      - {word, kl} entries whose term is > 0
 *     [1] kl_posvals  - the terms of kl_pos
 *     [2] kl_neg      - {word, kl} entries for all remaining words
 *     [3] kl_negvals  - absolute values of the kl_neg terms (word cloud sizes)
 *     [4] wc_word_list - {word, kl} entries for every input word
 *     [5] kl_vals     - the terms of wc_word_list
 *     Words found in the reference list come first, followed by the others.
 *     All six arrays are empty when there is no input or no word list.
 */
async function comp() {
  const wordlist_file = document.getElementById("rule_dropdown").value;
  // read_input() yields "" for an empty text box; treat that as "no words"
  const input_counts = read_input() || {};
  const reference_counts = await read_wordlists(wordlist_file);
  if (!reference_counts) {
    // No word list could be loaded, so there is nothing to compare against
    return [[], [], [], [], [], []];
  }

  const in_reference = (word) =>
    Object.prototype.hasOwnProperty.call(reference_counts, word);

  // Matched words first, then the input words absent from the reference list
  const input_words = Object.keys(input_counts);
  const ordered_words = input_words
    .filter((word) => in_reference(word))
    .concat(input_words.filter((word) => !in_reference(word)));

  // (1) Total frequency counts. The reference total covers the whole list,
  // not just the matched words.
  const sum = (values) => values.reduce((total, value) => total + value, 0);
  const input_total = sum(ordered_words.map((word) => input_counts[word]));
  const fixed_total = sum(Object.values(reference_counts));

  // (2) Add-one smoothing denominators, computed once instead of per word
  const input_denominator = input_total + ordered_words.length + 1;
  const reference_denominator = fixed_total + Object.keys(reference_counts).length + 1;

  // (3) Pointwise KL-divergence between the two distributions
  const kl_vals = [];
  const kl_pos = []; // for words and values
  const kl_posvals = []; // for values only
  const kl_neg = []; // for words and values
  const kl_negvals = []; // for values only
  const wc_word_list = [];
  for (const word of ordered_words) {
    const reference_count = in_reference(word) ? reference_counts[word] : 0;
    const input_prob = (input_counts[word] + 1) / input_denominator;
    const reference_prob = (reference_count + 1) / reference_denominator;
    const kl_value = input_prob * Math.log(input_prob / reference_prob);

    kl_vals.push(kl_value);
    wc_word_list.push({word: word, kl: kl_value});
    if (kl_value > 0) {
      kl_pos.push({word: word, kl: kl_value});
      kl_posvals.push(kl_value);
    }
    else {
      kl_neg.push({word: word, kl: kl_value});
      kl_negvals.push(Math.abs(kl_value)); // need absolute value for word size in word cloud(s)
    }
  }

  // wc_word_list and kl_vals are not needed by the clouds themselves but are returned for access
  return [kl_pos, kl_posvals, kl_neg, kl_negvals, wc_word_list, kl_vals];
}
|