// InfoRadar/client/src/ts/utils/Util.ts
// Revision ce2c2f8 (dqy08): "语义分析功能v0.3" (semantic analysis feature v0.3)
import * as d3 from "d3";
import { TokenWithOffset } from "../api/generatedSchemas";
/**
* Created by hen on 5/15/17.
*/
// Module-wide counter backing Util.simpleUId; incremented before every id is returned.
let the_unique_id_counter = 0;

export class Util {
    /**
     * Returns a new identifier made of `prefix` followed by the next value of a
     * module-wide counter (first call yields 1).
     *
     * @param options - Optional settings; may be omitted entirely.
     * @param options.prefix - Text placed in front of the counter value (default: empty string).
     * @returns The prefix concatenated with the incremented counter.
     */
    static simpleUId({ prefix = '' }: { prefix?: string } = {}): string {
        the_unique_id_counter += 1;
        return prefix + the_unique_id_counter;
    }
}
/** Shorthand for a d3 selection whose four type parameters are all left as `any`. */
export type D3Sel = d3.Selection<any, any, any, any>
/**
 * Returns the indices that would sort `array` according to `sortFct`.
 * The input array is not modified.
 *
 * @param array - Values to rank.
 * @param sortFct - Comparator applied to the values (same contract as `Array.prototype.sort`).
 * @returns Original indices, ordered so that `array[result[0]]` sorts first.
 */
export function argsort<T>(array: readonly T[], sortFct: (a: T, b: T) => number): number[] {
    return array
        // Pair every value with its original position so the position survives sorting.
        .map((value, position): [T, number] => [value, position])
        .sort(([left], [right]) => sortFct(left, right))
        .map(([, position]) => position);
}
/**
 * Builds the integer sequence `[0, 1, ..., end - 1]`.
 *
 * @param end - Exclusive upper bound. Zero or a negative value yields an empty array.
 * @returns A new array of consecutive integers starting at 0.
 * @throws RangeError when `end` is positive but not a valid array length
 *         (fractional, NaN or Infinity), as raised by the `Array` constructor.
 */
export function range(end: number): number[] {
    // Clamp negatives to zero so an empty range is returned instead of a RangeError.
    const count = Math.max(0, end);
    return Array.from(Array(count).keys());
}
/** Type guard: true only for finite values of type `number` (rejects NaN, Infinity and every non-number type). */
export function isFiniteNumber(x: unknown): x is number {
    if (typeof x !== 'number') {
        return false;
    }
    return Number.isFinite(x);
}
/**
 * Flattens an object into `[key1, value1, key2, value2, ...]`,
 * with the object's own enumerable keys visited in sorted order.
 */
export function obj_to_arr(obj:object){
    const flat = [];
    for (const key of Object.keys(obj).sort()) {
        flat.push(key, obj[key]);
    }
    return flat;
}
/**
 * Builds an object from a flat `[key1, value1, key2, value2, ...]` sequence.
 * A trailing key without a matching value is ignored; later duplicates overwrite earlier ones.
 */
export function arr_to_obj(arr:any){
    const result = {};
    // Walk the sequence two entries at a time; stop once no complete pair is left.
    for (let keyPos = 0; keyPos + 1 < arr.length; keyPos += 2) {
        result[arr[keyPos]] = arr[keyPos + 1];
    }
    return result;
}
/**
 * Splits `string` by every entry of `splitters` in turn and returns the pieces as one flat list.
 * Each splitter is applied through `traverseList` to a working list that starts as `[string]`;
 * the nested result is then collapsed with `flatten`.
 */
export function splitString(string, splitters) {
    const nested = [string];
    const splitterCount = splitters.length;
    let position = 0;
    while (position < splitterCount) {
        traverseList(nested, splitters[position], 0);
        position += 1;
    }
    return flatten(nested);
}
/**
 * Splits, in place, every string found in `list` from position `index` onwards.
 *
 * - A string element is replaced by the array of its pieces when `split(splitter)`
 *   yields more than one piece; otherwise it is left untouched.
 * - An element that is already an array is processed recursively from its start.
 * - Empty strings and other falsy elements are skipped rather than ending the walk,
 *   so elements after them are still processed.
 * - Freshly produced pieces are not split again by the same splitter, which keeps
 *   the walk finite even for regular expressions with capturing groups.
 *
 * Elements are visited with a loop, so recursion depth depends only on nesting depth,
 * not on list length.
 *
 * @param list - Nested list of strings to modify. When it is not an array, only the
 *               element at `index` is examined; a string `list` is never modified.
 * @param splitter - Separator handed to `String.prototype.split`.
 * @param index - Position in `list` at which to start.
 */
export function traverseList(list, splitter, index) {
    const end = Array.isArray(list) ? list.length : index + 1;
    for (let position = index; position < end; position++) {
        const item = list[position];
        if (Array.isArray(item)) {
            traverseList(item, splitter, 0);
        } else if (typeof item === 'string' && typeof list !== 'string') {
            const pieces = item.split(splitter);
            if (pieces.length > 1) {
                list[position] = pieces;
            }
        }
    }
}
/**
 * Recursively flattens arbitrarily nested arrays into a single new array.
 * Non-array elements (including `null` and `undefined`) are copied through unchanged;
 * the input is not modified.
 *
 * @param arr - Array that may contain further arrays at any depth.
 * @returns A new flat array with the leaf elements in their original order.
 */
export function flatten(arr) {
    const flat: any[] = [];
    // Append into one shared accumulator instead of concatenating per element.
    const collect = (items) => {
        items.forEach((item) => {
            if (Array.isArray(item)) {
                collect(item);
            } else {
                flat.push(item);
            }
        });
    };
    collect(arr);
    return flat;
}
// Kudos: https://stackoverflow.com/questions/9401312/how-to-replace-curly-quotation-marks-in-a-string-using-javascript#answer-9401374
/**
 * Currently returns `input` unchanged: every character replacement is disabled.
 * The em dash (\u2014) replacement was removed to preserve the Chinese em dash "——".
 */
export const cleanSpecials = input => {
    // Disabled replacements, kept for reference:
    //   .replace(/[‘’]/g, "'")   curly single quotes
    //   .replace(/[“”]/g, '"')   curly double quotes
    //   .replace(/[–]/g, "-")    en dash
    return input;
};
/**
 * Calculate surprisal (information content) from a probability.
 *
 * The probability is clamped from below to `Number.EPSILON`, so zero or negative
 * input produces a large finite value instead of Infinity. NaN input yields NaN.
 *
 * @param probability - The probability value (0 < p <= 1)
 * @returns Surprisal in bits (base-2 logarithm); exactly `0` (never `-0`) when the probability is 1
 */
export function calculateSurprisal(probability: number): number {
    const clamped = Math.max(probability, Number.EPSILON);
    const bits = -Math.log2(clamped);
    // Negating log2(1) gives -0; return +0 so the result compares and formats as plain zero.
    return bits === 0 ? 0 : bits;
}
/**
 * Counts the characters of a token as Unicode code points, so a CJK character,
 * a Latin letter and a single-code-point emoji each count as one.
 * @param tokenText token text
 * @returns number of code points in the text
 */
export function countTokenCharacters(tokenText: string): number {
    // Array.from iterates a string by code point, so a surrogate pair becomes one entry.
    const codePoints = Array.from(tokenText);
    return codePoints.length;
}
// Shared TextEncoder instance, reused so that no new encoder is created on every call.
const textEncoder = new TextEncoder();
/**
 * Returns the length of a string once encoded as UTF-8.
 * @param value string to measure
 * @returns number of bytes in the UTF-8 encoding
 */
export const getByteLength = (value: string): number => textEncoder.encode(value).byteLength;
/**
 * Divides a token's surprisal by the UTF-8 byte length of its text.
 * @param surprisal total surprisal of the token
 * @param tokenText token text
 * @returns surprisal per byte (bits/Byte), or 0 when the text encodes to zero bytes
 */
function calculateSurprisalPerByte(surprisal: number, tokenText: string): number {
    const bytes = getByteLength(tokenText);
    if (bytes > 0) {
        return surprisal / bytes;
    }
    // Zero-byte text: avoid dividing by zero.
    return 0;
}
/**
 * Computes the information density of a token (single entry point, kept for future extension).
 * @param token token object; its `real_topk` and `raw` fields are read
 * @returns information density (bits/Byte)
 */
export function calculateSurprisalDensity(token: TokenWithOffset): number {
    // Only the second entry of real_topk (the probability) is needed here.
    const [, prob] = token.real_topk;
    return calculateSurprisalPerByte(calculateSurprisal(prob), token.raw);
}
/**
 * Builds a lookup table from string index (UTF-16 code unit) to UTF-8 byte offset.
 *
 * Byte lengths are computed per code point, not per code unit: a surrogate pair counts
 * as one 4-byte character. Both halves of a pair map to the byte offset where that
 * character starts. A lone surrogate counts as 3 bytes, which is what `TextEncoder`
 * emits for it (U+FFFD), so the final entry always equals the encoded length of `text`.
 *
 * @param text original text
 * @returns array of length `text.length + 1` where `map[charIndex]` is the byte offset of
 *          that position; the last entry is the total UTF-8 byte length
 */
export function buildCharToByteIndexMap(text: string): number[] {
    const map: number[] = [];
    let byteOffset = 0;
    let charIndex = 0;
    while (charIndex < text.length) {
        const unit = text.charCodeAt(charIndex);
        map.push(byteOffset);
        if (unit < 0x80) {
            byteOffset += 1;
        } else if (unit < 0x800) {
            byteOffset += 2;
        } else if (unit >= 0xd800 && unit <= 0xdbff && charIndex + 1 < text.length) {
            const next = text.charCodeAt(charIndex + 1);
            if (next >= 0xdc00 && next <= 0xdfff) {
                // Valid surrogate pair: one 4-byte character spanning two string indices.
                map.push(byteOffset);
                byteOffset += 4;
                charIndex += 1;
            } else {
                // High surrogate without a low surrogate: encoded as U+FFFD.
                byteOffset += 3;
            }
        } else {
            // Remaining BMP characters and lone surrogates take 3 bytes.
            byteOffset += 3;
        }
        charIndex += 1;
    }
    // Final entry: position just past the end, i.e. the total byte length.
    map.push(byteOffset);
    return map;
}