| import type { AnalyzeResponse, FrontendToken } from '../api/GLTR_API'; |
| import { |
| type DigitMergePipelineOptions, |
| digitMergeIndexGroupsByText, |
| dropEmptyZeroWidthTokens, |
| flattenMergePartsForDigitGroup, |
| mergeSequentialOverlap, |
| mergeSourcePartsForOverlapPair, |
| sliceTextByCodePointOffsets, |
| } from './mergeTokenSpans'; |
|
|
/**
 * Result of the digit-merge pass (see `mergeBpeDigitTokens`).
 */
export type DigitMergeResult = {
  /** Output tokens, one entry per group in `mergeGroups` (same order). */
  digitMergedTokens: FrontendToken[];

  /**
   * For every output token, the indices into the input token array that were
   * folded into it. A group of length 1 means the input token was passed
   * through unchanged.
   */
  mergeGroups: number[][];
};
|
|
/**
 * Options accepted by `cloneFrontendToken`.
 */
export type CloneTokenOptions = {
  /**
   * Only an explicit `false` makes the clone drop `bpe_merged` and
   * `bpe_merge_parts`; omitting the option (or passing `true`) keeps them.
   */
  keepMergedFlag?: boolean;
};
|
|
| |
| |
| |
/**
 * Returns a fresh copy of a `real_topk` pair.
 *
 * @param tuple - Candidate pair; may be `null`/`undefined` or malformed at runtime.
 * @returns A new two-element array holding the same numbers, or `undefined`
 *   when the input is not an array of exactly two `number` values.
 */
export const cloneRealTopk = (tuple: [number, number] | null | undefined): [number, number] | undefined => {
  if (!Array.isArray(tuple) || tuple.length !== 2) {
    return undefined;
  }
  // Only the runtime type is checked, so NaN / Infinity are passed through as-is.
  const hasNonNumber = tuple.some((entry) => typeof entry !== 'number');
  return hasNonNumber ? undefined : [tuple[0], tuple[1]];
};
|
|
| |
| |
| |
/**
 * Copies a `pred_topk` list, normalising every entry along the way.
 *
 * @param list - List of `[tokenText, probability]` pairs; may be `null`/`undefined`.
 * @returns A new array of new pairs. A non-array input yields `[]`. Within an
 *   entry, a non-string text becomes `''` and a non-finite or non-number
 *   probability becomes `0`.
 */
export const clonePredTopk = (list: [string, number][] | null | undefined): [string, number][] => {
  if (!Array.isArray(list)) {
    return [];
  }
  return list.map((entry): [string, number] => {
    // Optional chaining tolerates null/undefined entries arriving at runtime.
    const rawText = entry?.[0];
    const rawProb = entry?.[1];
    const text = typeof rawText === 'string' ? rawText : '';
    const probability = typeof rawProb === 'number' && Number.isFinite(rawProb) ? rawProb : 0;
    return [text, probability];
  });
};
|
|
| |
| |
| |
/**
 * Builds a copy of a token whose array fields are new arrays.
 *
 * `offset`, `real_topk` and `pred_topk` are rebuilt (the latter two through
 * `cloneRealTopk` / `clonePredTopk`, so malformed values are normalised);
 * `raw` is copied as-is. `bpe_merge_parts` is copied one level deep only.
 *
 * @param token - Token to copy; it is not modified.
 * @param options - Pass `{ keepMergedFlag: false }` to omit `bpe_merged` and
 *   `bpe_merge_parts` from the copy. Any other value keeps them.
 * @returns The new token object.
 */
export const cloneFrontendToken = (token: FrontendToken, options: CloneTokenOptions = {}): FrontendToken => {
  const copy: FrontendToken = {
    offset: [token.offset[0], token.offset[1]],
    raw: token.raw,
    real_topk: cloneRealTopk(token.real_topk),
    pred_topk: clonePredTopk(token.pred_topk),
  };

  // Only an explicit `false` strips the merge metadata.
  const keepMergeInfo = options.keepMergedFlag !== false;
  if (keepMergeInfo) {
    if (typeof token.bpe_merged === 'string') {
      copy.bpe_merged = token.bpe_merged;
    }
    if (Array.isArray(token.bpe_merge_parts)) {
      copy.bpe_merge_parts = [...token.bpe_merge_parts];
    }
  }
  return copy;
};
|
|
| |
| |
| |
/**
 * Reads the probability stored in a token's `real_topk` pair.
 *
 * @param token - Token to inspect.
 * @returns The second element of `real_topk` when `real_topk` is a
 *   two-element array and that element is a `number`; otherwise `0`.
 */
export const getTokenProbability = (token: FrontendToken): number => {
  const realTopk = token.real_topk;
  if (!Array.isArray(realTopk) || realTopk.length !== 2) {
    return 0;
  }
  const probability = realTopk[1];
  return typeof probability === 'number' ? probability : 0;
};
|
|
| |
| |
| |
| |
| |
| |
| |
/**
 * Merges tokens whose offsets overlap into single tokens.
 *
 * The input is first passed through `dropEmptyZeroWidthTokens`; the actual
 * sequencing is delegated to `mergeSequentialOverlap`, which is driven by the
 * callbacks supplied here.
 *
 * A merged token gets the merged offset and raw text, `bpe_merged: 'overlap'`,
 * an empty `pred_topk`, and `real_topk` set to `[0, p(current) * p(next)]`.
 *
 * @param tokens - Tokens to merge.
 * @param originalText - Text the token offsets refer to; sliced via
 *   `sliceTextByCodePointOffsets`.
 * @returns Whatever `mergeSequentialOverlap` produces for the prepared tokens.
 */
export const mergeBpeOverlapTokens = (tokens: FrontendToken[], originalText: string): FrontendToken[] => {
  const candidates = dropEmptyZeroWidthTokens(tokens);

  return mergeSequentialOverlap(candidates, {
    getOffset: (token) => token.offset,
    // NOTE(review): presumably the helper hands these clones to
    // mergeOverlappingPair so the caller's tokens are never mutated — confirm
    // in mergeTokenSpans.
    cloneForStep: (token) => cloneFrontendToken(token),
    sliceMergedRaw: (from, to) => sliceTextByCodePointOffsets(originalText, from, to),
    mergeOverlappingPair: (left, right, span, spanText) => {
      // Both values are derived while `left` still carries its pre-merge state.
      const sourceParts = mergeSourcePartsForOverlapPair(originalText, left, right);
      const jointProbability = getTokenProbability(left) * getTokenProbability(right);

      // The existing offset array is updated in place rather than replaced.
      left.offset[0] = span[0];
      left.offset[1] = span[1];
      left.raw = spanText;
      left.bpe_merge_parts = sourceParts;
      left.real_topk = [0, jointProbability];
      left.pred_topk = [];
      left.bpe_merged = 'overlap';
      return left;
    },
  });
};
|
|
| |
| |
| |
| |
/**
 * Collapses groups of tokens into single tokens, using the index groups
 * returned by `digitMergeIndexGroupsByText`.
 *
 * - A group with one index passes the input token through by reference (no copy).
 * - A larger group becomes a new token spanning from the first member's start
 *   offset to the last member's end offset, with `bpe_merged: 'digit'`, an
 *   empty `pred_topk`, and `real_topk` set to `[0, product of member probabilities]`.
 *
 * @param tokens - Input tokens; not modified.
 * @param originalText - Text the token offsets refer to.
 * @returns The merged tokens together with the index groups they came from.
 */
export const mergeBpeDigitTokens = (tokens: FrontendToken[], originalText: string): DigitMergeResult => {
  const mergeGroups = digitMergeIndexGroupsByText(originalText, tokens);
  const digitMergedTokens: FrontendToken[] = [];

  for (const group of mergeGroups) {
    const head = tokens[group[0]!]!;
    if (group.length === 1) {
      digitMergedTokens.push(head);
      continue;
    }

    const tail = tokens[group[group.length - 1]!]!;
    const spanStart = head.offset[0];
    const spanEnd = tail.offset[1];
    const raw = sliceTextByCodePointOffsets(originalText, spanStart, spanEnd);

    // Probabilities are multiplied in group order, starting from 1.
    let probability = 1;
    for (const index of group) {
      probability = probability * getTokenProbability(tokens[index]!);
    }

    digitMergedTokens.push({
      offset: [spanStart, spanEnd],
      raw,
      real_topk: [0, probability],
      pred_topk: [],
      bpe_merged: 'digit',
      bpe_merge_parts: flattenMergePartsForDigitGroup(group, tokens),
    });
  }

  return { digitMergedTokens, mergeGroups };
};
|
|
| |
| |
| |
/**
 * Runs `mergeBpeDigitTokens` and folds per-token score arrays the same way.
 *
 * @param tokens - Input tokens.
 * @param scoreArrays - Score arrays indexed like `tokens`.
 * @param originalText - Text the token offsets refer to.
 * @returns The merged tokens plus, for each score array, one value per merge
 *   group: the sum of the group's scores, with `undefined`/`null` entries
 *   counted as `0`.
 */
export const digitMergeWithScores = (
  tokens: FrontendToken[],
  scoreArrays: (number | undefined)[][],
  originalText: string
): { digitMergedTokens: FrontendToken[]; mergedScoreArrays: (number | undefined)[][] } => {
  const { digitMergedTokens, mergeGroups } = mergeBpeDigitTokens(tokens, originalText);

  const sumGroup = (scores: (number | undefined)[], group: number[]): number => {
    let total = 0;
    for (const index of group) {
      total = total + (scores[index] ?? 0);
    }
    return total;
  };

  const mergedScoreArrays = scoreArrays.map((scores) => mergeGroups.map((group) => sumGroup(scores, group)));
  return { digitMergedTokens, mergedScoreArrays };
};
|
|
| |
| |
| |
/**
 * Full merge pipeline: overlap merge first, then (optionally) digit merge.
 *
 * @param tokens - Input tokens.
 * @param originalText - Text the token offsets refer to.
 * @param options - The digit merge is skipped only when `digitMerge` is
 *   exactly `false`.
 * @returns The overlap-merged tokens, further digit-merged unless disabled.
 */
export const mergeTokensForRendering = (
  tokens: FrontendToken[],
  originalText: string,
  options: DigitMergePipelineOptions = {}
): FrontendToken[] => {
  const overlapMerged = mergeBpeOverlapTokens(tokens, originalText);
  return options.digitMerge === false
    ? overlapMerged
    : mergeBpeDigitTokens(overlapMerged, originalText).digitMergedTokens;
};
|
|
| |
| |
| |
/**
 * Collects a copy of every token's `real_topk` pair, index-aligned with `tokens`.
 *
 * `real_topk` can legitimately be missing (`cloneRealTopk` returns `undefined`
 * for malformed input), so each pair is validated the same way
 * `cloneRealTopk` does instead of being dereferenced blindly. Tokens without
 * a valid pair contribute `[0, 0]`, matching the `0` that
 * `getTokenProbability` reports for them, so the output keeps one entry per
 * token.
 *
 * @param tokens - Tokens to read; a non-array value yields `[]`.
 * @returns One new `[number, number]` pair per input token.
 */
export const extractRealTopkFromTokens = (tokens: FrontendToken[] | null | undefined): [number, number][] => {
  if (!Array.isArray(tokens)) {
    return [];
  }
  return tokens.map((token): [number, number] => {
    const tuple = token?.real_topk;
    if (
      Array.isArray(tuple) &&
      tuple.length === 2 &&
      typeof tuple[0] === 'number' &&
      typeof tuple[1] === 'number'
    ) {
      return [tuple[0], tuple[1]];
    }
    // Missing or malformed pair: keep index alignment with a neutral entry.
    return [0, 0];
  });
};
|
|
| |
| |
| |
/**
 * Produces a copy of an analyze response with merge metadata stripped from
 * its tokens.
 *
 * - `request` is reduced to its `text` field only.
 * - `result` is shallow-spread, with `bpe_strings` replaced by clones made via
 *   `cloneFrontendToken(..., { keepMergedFlag: false })`, i.e. without
 *   `bpe_merged` / `bpe_merge_parts`.
 *
 * @param response - Response to snapshot; it is not modified.
 * @returns The new response object.
 */
export const createRawSnapshot = (response: AnalyzeResponse): AnalyzeResponse => {
  const request: AnalyzeResponse['request'] = { text: response.request.text };

  const source = response.result;
  const strippedTokens = source.bpe_strings.map((token) =>
    cloneFrontendToken(token as FrontendToken, { keepMergedFlag: false })
  );

  return {
    request,
    result: {
      // Listed ahead of the spread so `model` is the first key of the result;
      // the spread then supplies the same value.
      model: source.model,
      ...source,
      bpe_strings: strippedTokens,
    },
  };
};
|
|
|
|