|
|
import type { FrontendToken } from '../api/GLTR_API'; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
export const isValidRealTopkTuple = (value: unknown): value is [number, number] => { |
|
|
return Array.isArray(value) |
|
|
&& value.length === 2 |
|
|
&& value.every((item) => typeof item === 'number' && Number.isFinite(item)); |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
export const validateTokenProbabilities = ( |
|
|
tokens: Array<{ real_topk?: [number, number] }> |
|
|
): string | null => { |
|
|
if (!Array.isArray(tokens) || tokens.length === 0) { |
|
|
return null; |
|
|
} |
|
|
for (let i = 0; i < tokens.length; i++) { |
|
|
const tuple = tokens[i]?.real_topk; |
|
|
if (!isValidRealTopkTuple(tuple)) { |
|
|
return `Token #${i} 缺少合法 real_topk 数据,已取消本次处理。`; |
|
|
} |
|
|
} |
|
|
return null; |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
export const isValidPredTopkEntry = (value: unknown): value is [string, number] => { |
|
|
return Array.isArray(value) |
|
|
&& value.length === 2 |
|
|
&& typeof value[0] === 'string' |
|
|
&& typeof value[1] === 'number' |
|
|
&& Number.isFinite(value[1]); |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
export const validateTokenPredictions = ( |
|
|
tokens: Array<{ pred_topk?: [string, number][] }> |
|
|
): string | null => { |
|
|
if (!Array.isArray(tokens) || tokens.length === 0) { |
|
|
return '返回数据缺少 token 序列,已取消本次处理。'; |
|
|
} |
|
|
for (let i = 0; i < tokens.length; i++) { |
|
|
const list = tokens[i]?.pred_topk; |
|
|
|
|
|
if (!Array.isArray(list)) { |
|
|
return `Token #${i} 缺少合法 pred_topk 数组,已取消本次处理。`; |
|
|
} |
|
|
|
|
|
if (list.length > 0) { |
|
|
for (let j = 0; j < list.length; j++) { |
|
|
if (!isValidPredTopkEntry(list[j])) { |
|
|
return `Token #${i} 的 pred_topk 项 #${j} 格式非法,已取消本次处理。`; |
|
|
} |
|
|
} |
|
|
} |
|
|
} |
|
|
return null; |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
export const formatTokenPreview = (text: string): string => { |
|
|
if (!text) { |
|
|
return '[空]'; |
|
|
} |
|
|
|
|
|
const chars = Array.from(text); |
|
|
if (chars.length <= 12) { |
|
|
return text; |
|
|
} |
|
|
|
|
|
const head = chars.slice(0, 6).join(''); |
|
|
const tail = chars.slice(-3).join(''); |
|
|
return `${head}…${tail}`; |
|
|
}; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
export const validateTokenConsistency = ( |
|
|
bpeStrings: Array<{ offset?: [number, number]; raw?: string }>, |
|
|
originalText: string, |
|
|
options: { allowOverlap?: boolean } = {} |
|
|
): string | null => { |
|
|
const { allowOverlap = false } = options; |
|
|
if (!Array.isArray(bpeStrings) || bpeStrings.length === 0) { |
|
|
return null; |
|
|
} |
|
|
|
|
|
if (typeof originalText !== 'string') { |
|
|
return '响应缺少原始文本,无法校验 token 数据,已取消本次 demo。'; |
|
|
} |
|
|
|
|
|
const charArray = Array.from(originalText); |
|
|
const totalChars = charArray.length; |
|
|
|
|
|
for (let i = 0; i < bpeStrings.length; i++) { |
|
|
const token = bpeStrings[i]; |
|
|
if (!token) { |
|
|
return `Token #${i} 数据缺失,已取消本次 demo。`; |
|
|
} |
|
|
|
|
|
const offset = token.offset; |
|
|
if (!Array.isArray(offset) || offset.length !== 2) { |
|
|
return `Token #${i} 缺少合法 offset,已取消本次 demo。`; |
|
|
} |
|
|
|
|
|
const [start, end] = offset; |
|
|
if (!Number.isInteger(start) || !Number.isInteger(end)) { |
|
|
return `Token #${i} 的 offset (${start}, ${end}) 不是整数,已取消本次 demo。`; |
|
|
} |
|
|
|
|
|
if (start < 0 || end < start || end > totalChars) { |
|
|
return `Token #${i} 的 offset (${start}, ${end}) 超出原文范围,已取消本次 demo。`; |
|
|
} |
|
|
|
|
|
if (!allowOverlap && i > 0) { |
|
|
const prevOffset = bpeStrings[i - 1]?.offset; |
|
|
if (Array.isArray(prevOffset) && prevOffset.length === 2) { |
|
|
const prevEnd = prevOffset[1]; |
|
|
if (Number.isInteger(prevEnd) && start < prevEnd) { |
|
|
return `Token #${i} 的 offset (${start}, ${end}) 与 Token #${i - 1} 重叠,已取消本次 demo。`; |
|
|
} |
|
|
} |
|
|
} |
|
|
|
|
|
const expected = charArray.slice(start, end).join(''); |
|
|
const raw = token.raw ?? ''; |
|
|
if (expected !== raw) { |
|
|
const previewExpected = formatTokenPreview(expected); |
|
|
const previewRaw = formatTokenPreview(raw); |
|
|
return `Token #${i} 数据异常:offset(${start}, ${end}) 对应原文 "${previewExpected}",但 raw 为 "${previewRaw}"。请重新分析或修复数据。`; |
|
|
} |
|
|
} |
|
|
|
|
|
return null; |
|
|
}; |
|
|
|
|
|
|