| import * as d3 from "d3"; |
| import { TokenWithOffset } from "../api/generatedSchemas"; |
|
|
| |
| |
| |
/**
 * Module-level counter behind `Util.simpleUId`; it is incremented once per generated id.
 */
let the_unique_id_counter = 0;


export class Util {
    /**
     * Produces the next id: `prefix` followed by the freshly incremented counter value.
     *
     * The options object is optional, so `Util.simpleUId()` yields a bare counter
     * value such as "1" instead of failing while destructuring `undefined`.
     *
     * @param options - optional settings; `prefix` is the text placed in front of
     *   the counter value and defaults to the empty string.
     * @returns the id as a string.
     */
    static simpleUId({prefix = ''}: {prefix?: string} = {}): string {
        the_unique_id_counter += 1;
        return prefix + the_unique_id_counter;
    }
}
|
|
/**
 * Shorthand for a d3 selection whose four type parameters are all left as `any`.
 */
export type D3Sel = d3.Selection<any, any, any, any>;
|
|
/**
 * Returns the indices of `array` in the order that sorts its elements.
 *
 * The input array itself is not reordered.
 *
 * @param array - the values to rank.
 * @param sortFct - comparator called with two elements of `array`; same contract
 *   as the callback of `Array.prototype.sort`.
 * @returns the original positions, ordered according to `sortFct`.
 */
export function argsort(array, sortFct):number[] {
    // Start from the identity permutation and sort it by looking up the values.
    const positions = array.map((_value, position) => position);
    positions.sort((left, right) => sortFct(array[left], array[right]));
    return positions;
}
|
|
/**
 * Builds the list of integers `0, 1, ..., end - 1`.
 *
 * @param end - number of entries to generate (exclusive upper bound).
 * @returns an array holding the indices of an `end`-long array.
 */
export function range(end) {
    const indices = Array(end).keys();
    return Array.from(indices);
}
|
|
| |
/**
 * Type guard: true only for values of type `number` that are neither `NaN`
 * nor an infinity.
 *
 * @param x - any value.
 * @returns whether `x` is a finite number.
 */
export function isFiniteNumber(x: unknown): x is number {
    if (typeof x !== 'number') {
        return false;
    }
    return Number.isFinite(x);
}
|
|
/**
 * Serialises an object into a flat `[key, value, key, value, ...]` array.
 *
 * Keys are the object's own enumerable string keys, visited in the order
 * produced by the default `Array.prototype.sort`.
 *
 * @param obj - the object to serialise.
 * @returns the alternating key/value list.
 */
export function obj_to_arr(obj:object){
    const flatPairs = [];
    for (const key of Object.keys(obj).sort()) {
        flatPairs.push(key, obj[key]);
    }
    return flatPairs;
}
|
|
/**
 * Rebuilds an object from a flat `[key, value, key, value, ...]` array.
 *
 * Only complete pairs are used: a trailing element without a partner is
 * ignored. When a key occurs more than once, the last value wins.
 *
 * @param arr - alternating keys and values.
 * @returns an object mapping each key to the value that follows it.
 */
export function arr_to_obj(arr:any){
    const rebuilt = {};
    const pairCount = Math.floor(arr.length / 2);
    let pair = 0;
    while (pair < pairCount) {
        const keySlot = 2 * pair;
        rebuilt[arr[keySlot]] = arr[keySlot + 1];
        pair += 1;
    }
    return rebuilt;
}
|
|
/**
 * Splits `string` by each entry of `splitters` in turn and returns all
 * resulting pieces as one flat list.
 *
 * The work is delegated to `traverseList` (in-place splitting into nested
 * arrays) and `flatten` (collapsing the nesting).
 *
 * @param string - the text to split.
 * @param splitters - indexable collection of separators, applied in order.
 * @returns the flattened result of all splits.
 */
export function splitString(string, splitters) {
    const pieces = [string];
    const splitterCount = splitters.length;
    let current = 0;
    while (current < splitterCount) {
        traverseList(pieces, splitters[current], 0);
        current += 1;
    }
    return flatten(pieces);
}
|
|
/**
 * Walks `list` from position `index` to its end and splits every string it
 * finds by `splitter`, modifying `list` in place.
 *
 * - A string that splits into more than one piece is replaced by the array of
 *   its pieces.
 * - An element that is already an array is walked recursively from its start.
 * - Every other element is left as it is and does not end the walk. This
 *   includes empty strings, which adjacent separators produce, so elements
 *   located after an empty string are still split.
 *
 * Nothing is returned; the result is the mutated `list`. The call does nothing
 * when `list` is not an array or when `index` is not a non-negative number.
 *
 * @param list - array of strings and nested arrays; mutated in place.
 * @param splitter - separator handed to `String.prototype.split`.
 * @param index - position in `list` at which to start.
 */
export function traverseList(list, splitter, index) {
    if (!Array.isArray(list) || typeof index !== 'number' || index < 0) {
        return;
    }
    // A loop (instead of one recursive call per element) keeps the call depth
    // tied to the nesting depth, so long lists cannot exhaust the stack.
    for (let position = index; position < list.length; position++) {
        const item = list[position];
        if (typeof item === 'string' || item instanceof String) {
            const pieces = item.split(splitter);
            if (pieces.length > 1) {
                // These pieces were just produced by this very splitter, so they
                // are not walked again; re-splitting could also recurse without
                // end for RegExp splitters that contain capture groups.
                list[position] = pieces;
            }
        } else if (Array.isArray(item)) {
            traverseList(item, splitter, 0);
        }
    }
}
|
|
/**
 * Collapses an arbitrarily nested array into a single flat array, keeping the
 * elements in their original left-to-right order.
 *
 * Non-array elements are copied as they are; this includes `null` and
 * `undefined`, which no longer cause a `TypeError`.
 *
 * @param arr - array whose elements may themselves be arrays, to any depth.
 * @returns a new flat array; `arr` is not modified.
 */
export function flatten(arr) {
    const flat: any[] = [];
    // Every leaf is pushed into one shared result, so the cost stays linear in
    // the number of elements instead of copying the accumulator at each step.
    const collect = (items) => {
        items.forEach((item) => {
            if (Array.isArray(item)) {
                collect(item);
            } else {
                flat.push(item);
            }
        });
    };
    collect(arr);
    return flat;
}
|
|
|
|
| |
| |
/**
 * Pass-through: hands `input` back unchanged.
 */
export const cleanSpecials = (input) => {
    return input;
};
| |
| |
| |
|
|
| |
| |
| |
| |
| |
/**
 * Converts a probability into its surprisal, `-log2(probability)`.
 *
 * The probability is raised to at least `Number.EPSILON` before the logarithm
 * is taken, so zero or negative inputs give a large finite value rather than
 * `Infinity` or `NaN`.
 *
 * @param probability - the probability to convert.
 * @returns the negated base-2 logarithm of the floored probability.
 */
export function calculateSurprisal(probability: number): number {
    const floored = Math.max(probability, Number.EPSILON);
    const log2Probability = Math.log2(floored);
    return -log2Probability;
}
|
|
| |
| |
| |
| |
| |
| |
/**
 * Counts the entries produced by iterating `tokenText`, i.e. its Unicode code
 * points: a surrogate pair counts once, while combining marks count separately.
 *
 * @param tokenText - the text to measure.
 * @returns the number of code points in `tokenText`.
 */
export function countTokenCharacters(tokenText: string): number {
    let codePoints = 0;
    for (const _symbol of tokenText) {
        codePoints += 1;
    }
    return codePoints;
}
|
|
| |
/** Single encoder instance reused by every `getByteLength` call. */
const textEncoder = new TextEncoder();

/**
 * Measures how many bytes `value` occupies once encoded by `TextEncoder`
 * (UTF-8).
 *
 * @param value - the string to measure.
 * @returns the size of the encoded string in bytes.
 */
export const getByteLength = (value: string): number => {
    const encoded = textEncoder.encode(value);
    return encoded.byteLength;
};
|
|
| |
| |
| |
| |
| |
| |
/**
 * Divides `surprisal` by the byte length of `tokenText`, as measured by
 * `getByteLength`.
 *
 * @param surprisal - the surprisal value to normalise.
 * @param tokenText - the text whose byte length is the divisor.
 * @returns `surprisal` per byte, or 0 when the text has no bytes.
 */
function calculateSurprisalPerByte(surprisal: number, tokenText: string): number {
    const byteCount = getByteLength(tokenText);
    if (byteCount > 0) {
        return surprisal / byteCount;
    }
    // Empty text: report 0 rather than dividing by zero.
    return 0;
}
|
|
| |
| |
| |
| |
| |
/**
 * Computes the surprisal of a token divided by the byte length of its text.
 *
 * The second entry of `token.real_topk` is passed to `calculateSurprisal` as
 * the probability, and `token.raw` supplies the text whose bytes are counted.
 *
 * @param token - the token to evaluate; `real_topk` and `raw` are read.
 * @returns the result of `calculateSurprisalPerByte` for this token.
 */
export function calculateSurprisalDensity(token: TokenWithOffset): number {
    // Only the second entry is needed, so the first one is skipped instead of
    // being bound to a local that is never read.
    const [, prob] = token.real_topk;
    return calculateSurprisalPerByte(calculateSurprisal(prob), token.raw);
}
|
|
| |
| |
| |
| |
| |
/**
 * Builds a lookup table from string indices (UTF-16 code units) to UTF-8 byte
 * offsets.
 *
 * Entry `i` is the byte offset at which the character containing code unit `i`
 * starts; the extra last entry, `map[text.length]`, is the total byte length
 * of `text`. Both code units of a surrogate pair map to the start of their
 * shared 4-byte character, so offsets after emoji and other astral characters
 * stay aligned with the real UTF-8 encoding.
 *
 * @param text - the string to index.
 * @returns an array with `text.length + 1` byte offsets.
 */
export function buildCharToByteIndexMap(text: string): number[] {
    const map: number[] = [];
    let byteOffset = 0;
    let charIndex = 0;

    while (charIndex < text.length) {
        const unit = text.charCodeAt(charIndex);
        // Past the end this is NaN, which fails the range test below.
        const following = text.charCodeAt(charIndex + 1);
        const isSurrogatePair =
            unit >= 0xd800 && unit <= 0xdbff && following >= 0xdc00 && following <= 0xdfff;

        map.push(byteOffset);
        if (isSurrogatePair) {
            // One code point spread over two code units: 4 bytes in UTF-8.
            map.push(byteOffset);
            byteOffset += 4;
            charIndex += 2;
        } else {
            // UTF-8 width by code point range. An unpaired surrogate lands in the
            // 3-byte branch, the size of the U+FFFD replacement an encoder emits.
            byteOffset += unit < 0x80 ? 1 : unit < 0x800 ? 2 : 3;
            charIndex += 1;
        }
    }

    // Sentinel entry so that the end of the text can be looked up as well.
    map.push(byteOffset);

    return map;
}
|
|