niobures
/

XPF

Model card Files Files and versions

XPF / docs /js /wc_language_files.js

niobures's picture

XPF

4a08ba7 verified 3 months ago

history blame contribute delete

6.17 kB

	/* WC File Parser and Word List Calcs*/

	/**
	 * Issue an XMLHttpRequest GET and expose the outcome as a Promise.
	 * Used to read the word list file for each language.
	 *
	 * @param {string} url - Location of the resource to request.
	 * @returns {Promise<*>} Resolves with `req.response` when the HTTP status is 200.
	 *   Rejects with an Error when the request loads with any other status, or with
	 *   an Error("Network Error") when the request's `onerror` handler fires.
	 */
	function get(url) {
		return new Promise(function(resolve, reject) {
			const req = new XMLHttpRequest();
			req.open("GET", url);

			req.onload = function() {
				if (req.status === 200) {
					resolve(req.response);
					return;
				}
				// statusText can be an empty string (servers answering over HTTP/2 send
				// no reason phrase), which used to produce an Error with no message.
				// Fall back to a message that names the URL and the numeric status.
				const fallback = "Request for " + url + " failed with status " + req.status;
				reject(new Error(req.statusText || fallback));
			};

			req.onerror = function() {
				reject(new Error("Network Error"));
			};

			req.send();
		});
	}

	/**
	 * Download a word list file and turn it into a word -> frequency dictionary.
	 *
	 * Each line of the file is split on a single space; the first field is the word
	 * and the second is converted with Number() to give its frequency.
	 *
	 * @param {string} filename - URL of the word list file, passed to get().
	 * @returns {Promise<Object<string, number>|undefined>} The dictionary, or undefined
	 *   (after an alert) when no filename was supplied. A rejection from get()
	 *   propagates to the caller.
	 */
	async function read_wordlists(filename) {
		if (!filename) { // no file is configured for the selected language
			alert("Error: No file is listed for the language");
			return;
		}

		const rawdata = await get(filename);

		// Lines containing an uppercase letter that has a character on both sides are skipped.
		// NOTE(review): because the pattern needs a neighbour on each side, a capital at
		// the very start or end of a line is NOT excluded - confirm this is intended.
		const interior_uppercase = /.\p{Lu}./u;

		// All loop variables are declared locally: the previous undeclared names leaked
		// onto the global object and threw a ReferenceError in strict mode / ES modules.
		const word_list = {};
		for (const entry of rawdata.split("\n")) {
			if (interior_uppercase.test(entry)) {
				continue;
			}
			const fields = entry.split(" ");
			// A later line for the same word overwrites an earlier one.
			word_list[fields[0]] = Number(fields[1]);
		}

		// Remove blank entries and those whose frequency did not parse as a number.
		delete word_list[""];
		for (const word of Object.keys(word_list)) {
			if (Number.isNaN(word_list[word])) {
				delete word_list[word];
			}
		}

		return word_list;
	}

	/**
	 * Read the text typed into the element with id "in1" and count its words.
	 *
	 * The text is lower-cased, a fixed set of ASCII punctuation characters and the
	 * digits 0-9 are removed, and the remainder is split on runs of whitespace.
	 *
	 * @returns {Object<string, number>|string} A word -> occurrence-count dictionary
	 *   whose keys are inserted from the most to the least frequent word (ties keep
	 *   first-occurrence order). Returns "" when the input is empty; in that case the
	 *   input element's background is set to red and reset to white after 250 ms.
	 */
	function read_input() {
		const input_box = document.getElementById("in1");
		const raw_text = input_box.value;

		if (raw_text === "") { // ensure text has been entered
			input_box.style.background = "#ff0000";
			setTimeout(function() {
				input_box.style.background = "white";
			}, 250);
			return "";
		}

		// Split on any whitespace run. The previous pattern /\n+\|\s+/ escaped the pipe,
		// so it only matched a literal "|" and never collapsed newlines or repeated
		// spaces, which left multi-line tokens and empty-string "words" in the result.
		const words = raw_text
			.toLowerCase()
			.replace(/[.,\/#!$%\^&\*;:{}=0-9\-_`~()\'\"\[\]]/g, "")
			.split(/\s+/)
			.filter(function(token) {
				return token !== "";
			});

		// Count in a Map so that words which are also Object.prototype members
		// (e.g. "constructor") start from 0 instead of an inherited value.
		const counts = new Map();
		for (const token of words) {
			counts.set(token, (counts.get(token) || 0) + 1);
		}

		// Order from highest to lowest frequency. The earlier code sorted a throwaway
		// copy of the keys and discarded it, so the ordering never took effect.
		const ranked = Array.from(counts).sort(function(a, b) {
			return b[1] - a[1];
		});

		const freqs = {};
		for (const [token, count] of ranked) {
			freqs[token] = count;
		}
		return freqs;
	}


	/**
	 * Compare the user's input text with the word list selected in the
	 * "rule_dropdown" element and score every input word with a pointwise
	 * KL-divergence term: p_input * ln(p_input / p_list).
	 *
	 * Both distributions use add-one smoothing:
	 *   p_input(w) = (count_in_input(w) + 1) / (total input count + unique input words + 1)
	 *   p_list(w)  = (count_in_list(w)  + 1) / (total list count  + unique list words  + 1)
	 * An input word that is missing from the list is treated as having list count 0;
	 * it does not enlarge the list's unique-word count.
	 *
	 * @returns {Promise<Array>} A six-element array:
	 *   [0] kl_pos       - {word, kl} objects whose kl is > 0
	 *   [1] kl_posvals   - the kl values of [0]
	 *   [2] kl_neg       - {word, kl} objects whose kl is <= 0
	 *   [3] kl_negvals   - absolute kl values of [2] (used for word size in the word cloud)
	 *   [4] wc_word_list - {word, kl} objects for every input word
	 *   [5] kl_vals      - the kl values of [4]
	 *   All six arrays are empty when there is no input text or when
	 *   read_wordlists() yields no word list.
	 */
	async function comp() {
		// Everything is declared locally: the previous undeclared names became globals,
		// and `word` was shared with read_wordlists() across an await.
		const wordlist_file = document.getElementById("rule_dropdown").value;
		const input_counts = read_input() || {}; // read_input() returns "" for empty input
		const reference_counts = await read_wordlists(wordlist_file);

		const kl_vals = [];
		const kl_pos = []; // for words and values
		const kl_posvals = []; // for values only
		const kl_neg = []; // for words and values
		const kl_negvals = []; // for values only
		const wc_word_list = [];

		// read_wordlists() returns undefined (after alerting) when no file is selected;
		// return empty results instead of failing with a TypeError further down.
		if (!reference_counts) {
			return [kl_pos, kl_posvals, kl_neg, kl_negvals, wc_word_list, kl_vals];
		}

		const has_own = Object.prototype.hasOwnProperty;

		// Input words found in the reference list come first, then the unmatched ones,
		// each group in the order read_input() produced them.
		const word_list_input = {};
		const unmatched = [];
		for (const word of Object.keys(input_counts)) {
			if (has_own.call(reference_counts, word)) {
				word_list_input[word] = input_counts[word];
			}
			else {
				unmatched.push(word);
			}
		}
		for (const word of unmatched) {
			word_list_input[word] = input_counts[word];
		}

		// (1) Total frequency counts. Unmatched words have reference count 0, so they
		// change neither the reference total nor its unique-word count.
		const sum = function(values) {
			return values.reduce(function(total, value) {
				return total + value;
			}, 0);
		};
		const input_words = Object.keys(word_list_input);
		const input_total = sum(Object.values(word_list_input));
		const fixed_total = sum(Object.values(reference_counts));

		// (2) Add-one smoothing denominators, computed once instead of once per word.
		const input_denominator = input_total + input_words.length + 1;
		const fixed_denominator = fixed_total + Object.keys(reference_counts).length + 1;

		// (3) Pointwise KL-divergence. Only input words are ever scored, so the
		// reference probability is computed on demand rather than for the whole list.
		for (const word of input_words) {
			const input_prob = (word_list_input[word] + 1) / input_denominator;
			const reference_count = has_own.call(reference_counts, word) ? reference_counts[word] : 0;
			const reference_prob = (reference_count + 1) / fixed_denominator;
			const kl_value = input_prob * Math.log(input_prob / reference_prob);

			kl_vals.push(kl_value);
			wc_word_list.push({word: word, kl: kl_value});

			if (kl_value > 0) {
				kl_pos.push({word: word, kl: kl_value});
				kl_posvals.push(kl_value);
			}
			else {
				kl_neg.push({word: word, kl: kl_value});
				kl_negvals.push(Math.abs(kl_value)); // need absolute value for word size in word cloud(s)
			}
		}

		// wc_word_list and kl_vals aren't necessarily needed any further, but are kept for access
		return [kl_pos, kl_posvals, kl_neg, kl_negvals, wc_word_list, kl_vals];
	}