| | import type { FrontendAnalyzeResult } from '../api/GLTR_API'; |
| | import { calculateSurprisal, calculateSurprisalDensity, countTokenCharacters, getByteLength } from './Util'; |
| | import { extractRealTopkFromTokens } from './tokenUtils'; |
| |
|
/**
 * Aggregate surprisal statistics for one analysed text, as produced by
 * `calculateTextStats`.
 */
export type TextStats = {
  /** Byte length of the analysed (token-covered) portion of the text. */
  byteCount: number;
  /** Character count of the analysed (token-covered) portion of the text. */
  charCount: number;
  /** Number of original tokens. */
  tokenCount: number;
  /** One surprisal value per original token, in token order. */
  tokenSurprisals: number[];
  /** One entry per byte: each merged token's surprisal density repeated for every byte of its raw text. */
  byteSurprisals: number[];
  /** Mean of the finite token surprisals, or `null` when there are none. */
  tokenAverage: number | null;
  /** 90th percentile of the finite token surprisals, or `null` when there are none. */
  tokenP90: number | null;
  /** Mean of the finite per-byte surprisals, or `null` when there are none. */
  byteAverage: number | null;
  /** Sum of the finite token surprisals, or `null` when no token had a finite surprisal. */
  totalSurprisal: number | null;
};
| |
|
| | |
| | |
| | |
/**
 * Statistics describing one text relative to a baseline, as produced by
 * `calculateDiffStats`.
 */
export type DiffStats = {
  // Fields carried over unchanged from the compared text's own statistics.
  /** Byte count of the compared text. */
  byteCount: number;
  /** Character count of the compared text. */
  charCount: number;
  /** Token count of the compared text. */
  tokenCount: number;
  /** Per-token surprisals of the compared text. */
  tokenSurprisals: number[];
  /** Mean token surprisal of the compared text, or `null` when unavailable. */
  tokenAverage: number | null;

  // Fields computed against the baseline.
  /** Compared total surprisal minus baseline total; `null` when either total is `null`. */
  deltaTotalSurprisal: number | null;
  /** Per-byte surprisal differences (compared minus baseline) over the shared leading byte positions. */
  deltaByteSurprisals: number[];
};
| |
|
| | |
| | |
| | |
/**
 * Arithmetic mean of the finite entries of `values`.
 *
 * Non-finite entries (`NaN`, `Infinity`, `-Infinity`) are ignored rather than
 * poisoning the result.
 *
 * @param values - Numbers to average; may be `null` or `undefined`.
 * @returns The mean of the finite entries, or `null` when the input is
 *   missing, empty, or contains no finite number.
 */
export const computeAverage = (values: number[] | null | undefined): number | null => {
  if (!values) {
    return null;
  }

  // Single pass: accumulate the running total and how many entries contributed.
  let total = 0;
  let finiteCount = 0;
  for (const entry of values) {
    if (Number.isFinite(entry)) {
      total += entry;
      finiteCount += 1;
    }
  }

  return finiteCount === 0 ? null : total / finiteCount;
};
| |
|
| | |
| | |
| | |
/**
 * 90th percentile of the finite entries of `values`, using linear
 * interpolation between the two nearest ranks.
 *
 * The input array is not modified; sorting happens on a filtered copy.
 *
 * @param values - Numbers to summarise; may be `null` or `undefined`.
 * @returns The interpolated 90th percentile, or `null` when the input is
 *   missing, empty, or contains no finite number.
 */
export const computeP90 = (values: number[] | null | undefined): number | null => {
  if (!values || values.length === 0) {
    return null;
  }

  // `filter` already returns a fresh array, so sorting it leaves `values` intact.
  const ascending = values.filter((entry) => Number.isFinite(entry));
  if (ascending.length === 0) {
    return null;
  }
  ascending.sort((left, right) => left - right);

  // Fractional rank of the 90th percentile within the sorted sample.
  const rank = (ascending.length - 1) * 0.9;
  const below = Math.floor(rank);
  const above = Math.ceil(rank);

  // The rank landed exactly on an element: no interpolation needed.
  if (below === above) {
    return ascending[below];
  }

  const fraction = rank - below;
  return ascending[below] * (1 - fraction) + ascending[above] * fraction;
};
| |
|
| | |
| | |
| | |
/**
 * Builds the surprisal statistics for an analysed text.
 *
 * Size figures (`byteCount`, `charCount`) are measured on the prefix of
 * `originalText` that ends at the last original token's `offset[1]`, so text
 * beyond the tokenised range is excluded. Token-level figures come from
 * `result.originalTokens`; byte-level figures come from `result.mergedTokens`.
 *
 * @param result - Analysis result providing `originalTokens` and `mergedTokens`.
 * @param originalText - The full text that was submitted for analysis.
 * @returns The collected counts, per-token and per-byte surprisal series, and
 *   their summary values.
 */
export const calculateTextStats = (
  result: FrontendAnalyzeResult,
  originalText: string
): TextStats => {
  const { originalTokens, mergedTokens } = result;

  const originalTopk = extractRealTopkFromTokens(originalTokens);
  // NOTE(review): this value is never read below; the call is retained so the
  // helper is still invoked exactly as before. Confirm whether it can be dropped.
  const mergedTopk = extractRealTopkFromTokens(mergedTokens);

  // Only the part of the text covered by tokens is measured; with no tokens
  // the covered prefix is empty.
  const coveredLength =
    originalTokens.length > 0 ? originalTokens[originalTokens.length - 1].offset[1] : 0;
  const coveredText = originalText.slice(0, coveredLength);

  const byteCount = getByteLength(coveredText);
  const charCount = countTokenCharacters(coveredText);
  const tokenCount = originalTokens.length;

  // One surprisal per original token, derived from the second element of the
  // matching top-k entry.
  const tokenSurprisals: number[] = [];
  originalTokens.forEach((_token, position) => {
    tokenSurprisals.push(calculateSurprisal(originalTopk[position][1]));
  });

  // The total only counts finite surprisals and is null when there are none.
  const finiteSurprisals = tokenSurprisals.filter((value) => Number.isFinite(value));
  const totalSurprisal =
    finiteSurprisals.length > 0
      ? finiteSurprisals.reduce((running, value) => running + value, 0)
      : null;

  // Spread each merged token's density over every byte of its raw text so the
  // series has one entry per byte.
  const byteSurprisals: number[] = [];
  for (const merged of mergedTokens) {
    const width = getByteLength(merged.raw);
    const density = calculateSurprisalDensity(merged);
    for (let filled = 0; filled < width; filled++) {
      byteSurprisals.push(density);
    }
  }

  return {
    byteCount,
    charCount,
    tokenCount,
    tokenSurprisals,
    byteSurprisals,
    tokenAverage: computeAverage(tokenSurprisals),
    tokenP90: computeP90(tokenSurprisals),
    byteAverage: computeAverage(byteSurprisals),
    totalSurprisal
  };
};
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Derives comparison statistics for a text relative to a baseline.
 *
 * Counts and token-level values are copied from `diffStats` as-is. The deltas
 * are `diffStats` minus `baseStats`.
 *
 * @param diffStats - Statistics of the text being compared.
 * @param baseStats - Statistics of the baseline text.
 * @returns Copied counts plus `deltaTotalSurprisal` (`null` when either total
 *   is `null`) and `deltaByteSurprisals`, which covers only the byte positions
 *   present in both series.
 */
export const calculateDiffStats = (
  diffStats: TextStats,
  baseStats: TextStats
): DiffStats => {
  const { totalSurprisal: comparedTotal, byteSurprisals: comparedBytes } = diffStats;
  const { totalSurprisal: baselineTotal, byteSurprisals: baselineBytes } = baseStats;

  // A total delta only makes sense when both sides actually have a total.
  let deltaTotalSurprisal: number | null = null;
  if (comparedTotal !== null && baselineTotal !== null) {
    deltaTotalSurprisal = comparedTotal - baselineTotal;
  }

  // Compare byte-by-byte over the shared prefix; trailing bytes of the longer
  // series have no counterpart and are left out.
  const sharedLength = Math.min(comparedBytes.length, baselineBytes.length);
  const deltaByteSurprisals: number[] = Array.from(
    { length: sharedLength },
    (_unused, position) => comparedBytes[position] - baselineBytes[position]
  );

  return {
    byteCount: diffStats.byteCount,
    charCount: diffStats.charCount,
    tokenCount: diffStats.tokenCount,
    tokenSurprisals: diffStats.tokenSurprisals,
    tokenAverage: diffStats.tokenAverage,
    deltaTotalSurprisal,
    deltaByteSurprisals
  };
};
| |
|
| |
|