统一可视化配置,并将信息密度单位从字符改为字节
Browse files
- client/src/index.html +1 -1
- client/src/ts/appInitializer.ts +5 -5
- client/src/ts/compare.ts +84 -73
- client/src/ts/controllers/highlightController.ts +3 -3
- client/src/ts/controllers/textInputController.ts +5 -6
- client/src/ts/start.ts +2 -4
- client/src/ts/ui/dialog.ts +2 -1
- client/src/ts/utils/SurprisalColorConfig.ts +50 -27
- client/src/ts/utils/Util.ts +53 -6
- client/src/ts/utils/demoPathUtils.ts +1 -1
- client/src/ts/utils/highlightUtils.ts +53 -20
- client/src/ts/utils/localFileUtils.ts +1 -1
- client/src/ts/utils/textStatistics.ts +28 -42
- client/src/ts/utils/tokenUtils.ts +5 -8
- client/src/ts/utils/visualizationConfigs.ts +52 -0
- client/src/ts/utils/visualizationUpdater.ts +14 -11
- client/src/ts/vis/GLTR_Text_Box.ts +32 -37
- client/src/ts/vis/Histogram.ts +65 -66
- client/src/ts/vis/ScrollbarMinimap.ts +13 -30
- client/src/ts/vis/SvgOverlayManager.ts +37 -27
- client/src/ts/vis/ToolTip.ts +13 -10
client/src/index.html
CHANGED
|
@@ -85,7 +85,7 @@
|
|
| 85 |
<section id="all_result" class="results-section">
|
| 86 |
<div id="stats" class="stats-container">
|
| 87 |
<div class="histogram-item">
|
| 88 |
-
<div
|
| 89 |
<svg id="stats_frac"></svg>
|
| 90 |
</div>
|
| 91 |
<div class="histogram-item">
|
|
|
|
| 85 |
<section id="all_result" class="results-section">
|
| 86 |
<div id="stats" class="stats-container">
|
| 87 |
<div class="histogram-item">
|
| 88 |
+
<div id="token_histogram_title"></div>
|
| 89 |
<svg id="stats_frac"></svg>
|
| 90 |
</div>
|
| 91 |
<div class="histogram-item">
|
client/src/ts/appInitializer.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
| 6 |
import * as d3 from 'd3';
|
| 7 |
import { SimpleEventHandler } from './utils/SimpleEventHandler';
|
| 8 |
import { TextAnalysisAPI } from './api/GLTR_API';
|
| 9 |
-
import {
|
| 10 |
|
| 11 |
/**
|
| 12 |
* 公共初始化返回对象
|
|
@@ -14,8 +14,8 @@ import { getSurprisalColor } from './utils/SurprisalColorConfig';
|
|
| 14 |
export interface CommonAppContext {
|
| 15 |
eventHandler: SimpleEventHandler;
|
| 16 |
api: TextAnalysisAPI;
|
| 17 |
-
|
| 18 |
-
|
| 19 |
totalSurprisalFormat: (n: number | null) => string;
|
| 20 |
}
|
| 21 |
|
|
@@ -33,8 +33,8 @@ export function initializeCommonApp(apiPrefix: string = '', element?: Element):
|
|
| 33 |
return {
|
| 34 |
eventHandler: new SimpleEventHandler(targetElement),
|
| 35 |
api: new TextAnalysisAPI(apiPrefix),
|
| 36 |
-
|
| 37 |
-
|
| 38 |
totalSurprisalFormat: (n: number | null) => n !== null && Number.isFinite(n) ? format(n) : String(n)
|
| 39 |
};
|
| 40 |
}
|
|
|
|
| 6 |
import * as d3 from 'd3';
|
| 7 |
import { SimpleEventHandler } from './utils/SimpleEventHandler';
|
| 8 |
import { TextAnalysisAPI } from './api/GLTR_API';
|
| 9 |
+
import { getTokenSurprisalColor, getByteSurprisalColor } from './utils/SurprisalColorConfig';
|
| 10 |
|
| 11 |
/**
|
| 12 |
* 公共初始化返回对象
|
|
|
|
| 14 |
export interface CommonAppContext {
|
| 15 |
eventHandler: SimpleEventHandler;
|
| 16 |
api: TextAnalysisAPI;
|
| 17 |
+
tokenSurprisalColorScale: (value: number) => string;
|
| 18 |
+
byteSurprisalColorScale: (value: number) => string;
|
| 19 |
totalSurprisalFormat: (n: number | null) => string;
|
| 20 |
}
|
| 21 |
|
|
|
|
| 33 |
return {
|
| 34 |
eventHandler: new SimpleEventHandler(targetElement),
|
| 35 |
api: new TextAnalysisAPI(apiPrefix),
|
| 36 |
+
tokenSurprisalColorScale: getTokenSurprisalColor,
|
| 37 |
+
byteSurprisalColorScale: getByteSurprisalColor,
|
| 38 |
totalSurprisalFormat: (n: number | null) => n !== null && Number.isFinite(n) ? format(n) : String(n)
|
| 39 |
};
|
| 40 |
}
|
client/src/ts/compare.ts
CHANGED
|
@@ -40,6 +40,12 @@ import {ToolTip} from './vis/ToolTip';
|
|
| 40 |
import { calculateHighlights } from './utils/highlightUtils';
|
| 41 |
// 公共初始化模块
|
| 42 |
import {initializeCommonApp} from './appInitializer';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
|
| 44 |
// 使用从 demoManager 导出的验证函数
|
| 45 |
|
|
@@ -114,7 +120,7 @@ type DemoColumnData = {
|
|
| 114 |
lmfInstance?: GLTR_Text_Box; // LMF实例引用(对比模式下使用)
|
| 115 |
histograms: {
|
| 116 |
stats_frac: Histogram | null;
|
| 117 |
-
|
| 118 |
stats_surprisal_progress: ScatterPlot | null;
|
| 119 |
};
|
| 120 |
};
|
|
@@ -123,7 +129,7 @@ window.onload = () => {
|
|
| 123 |
// 初始化公共应用组件
|
| 124 |
const api_prefix = URLHandler.parameters['api'] || '';
|
| 125 |
const bodyElement = <Element>d3.select('body').node();
|
| 126 |
-
const { eventHandler, api,
|
| 127 |
|
| 128 |
const container = d3.select('#compare-container');
|
| 129 |
const mainFrame = d3.select('.main_frame');
|
|
@@ -231,7 +237,7 @@ window.onload = () => {
|
|
| 231 |
const metricsId = `text_metrics_${safeId}`;
|
| 232 |
const errorId = `error_${safeId}`;
|
| 233 |
const statsFracId = `stats_frac_${safeId}`;
|
| 234 |
-
const
|
| 235 |
const statsProgressId = `stats_surprisal_progress_${safeId}`;
|
| 236 |
const textRenderId = `text_render_${safeId}`;
|
| 237 |
|
|
@@ -262,12 +268,12 @@ window.onload = () => {
|
|
| 262 |
</div>
|
| 263 |
<div id="${statsId}" class="stats" style="text-align:center;">
|
| 264 |
<div style="display:block;text-align: center;margin-bottom: 20px;">
|
| 265 |
-
<div
|
| 266 |
<svg id="${statsFracId}"></svg>
|
| 267 |
</div>
|
| 268 |
<div style="display:block;text-align: center;margin-bottom: 20px;">
|
| 269 |
-
<div id="
|
| 270 |
-
<svg id="${
|
| 271 |
</div>
|
| 272 |
<div style="display:block;text-align: center;margin-bottom: 20px;">
|
| 273 |
<div>surprisal vs token progress</div>
|
|
@@ -282,7 +288,7 @@ window.onload = () => {
|
|
| 282 |
// 处理单个 demo 的数据
|
| 283 |
const processDemoData = (data: AnalysisData): FrontendAnalyzeResult => {
|
| 284 |
const result = data.result;
|
| 285 |
-
const safeText = data.request
|
| 286 |
|
| 287 |
// 验证数据
|
| 288 |
if (!Array.isArray(result.bpe_strings) || result.bpe_strings.length === 0) {
|
|
@@ -327,7 +333,7 @@ window.onload = () => {
|
|
| 327 |
|
| 328 |
// 为单个列渲染统计图表(使用ID)
|
| 329 |
const renderStatsForColumn = (id: string, columnData: DemoColumnData) => {
|
| 330 |
-
if (!columnData.stats || !columnData.histograms.stats_frac || !columnData.histograms.
|
| 331 |
return;
|
| 332 |
}
|
| 333 |
|
|
@@ -336,68 +342,64 @@ window.onload = () => {
|
|
| 336 |
const safeId = toSafeId(id);
|
| 337 |
|
| 338 |
// 更新 token surprisal histogram(保持不变,不显示差分)
|
|
|
|
| 339 |
columnData.histograms.stats_frac.update({
|
|
|
|
| 340 |
data: stats.tokenSurprisals,
|
| 341 |
-
|
| 342 |
-
no_bins: 20,
|
| 343 |
-
extent: [0, 20],
|
| 344 |
-
colorScale: surprisalColorScale,
|
| 345 |
averageValue: stats.tokenAverage ?? undefined,
|
| 346 |
-
averageLabel: 'bits/token'
|
| 347 |
});
|
| 348 |
|
| 349 |
-
// 更新
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 350 |
if (isDiffColumn && columnData.diffStats) {
|
| 351 |
-
// Diff列:显示Δ
|
| 352 |
-
const
|
| 353 |
|
| 354 |
// 计算平均差分
|
| 355 |
-
const deltaAverage =
|
| 356 |
-
?
|
| 357 |
: 0;
|
| 358 |
|
| 359 |
-
|
| 360 |
-
|
| 361 |
-
data:
|
| 362 |
-
label: label,
|
| 363 |
-
no_bins: 20,
|
| 364 |
-
extent: [-10, 10],
|
| 365 |
colorScale: getDiffColor,
|
| 366 |
averageValue: deltaAverage,
|
| 367 |
-
averageLabel: 'Δ bits/char'
|
| 368 |
});
|
| 369 |
|
| 370 |
// 更新标题文本
|
| 371 |
-
const titleElement = document.getElementById(`
|
| 372 |
if (titleElement) {
|
| 373 |
-
titleElement.textContent = label;
|
| 374 |
}
|
| 375 |
} else {
|
| 376 |
-
// Base列或非模型差分模式:显示原始
|
| 377 |
-
|
| 378 |
-
columnData.histograms.
|
| 379 |
-
|
| 380 |
-
|
| 381 |
-
|
| 382 |
-
|
| 383 |
-
colorScale: surprisalColorScale,
|
| 384 |
-
averageValue: stats.charAverage ?? undefined,
|
| 385 |
-
averageLabel: 'bits/char'
|
| 386 |
});
|
| 387 |
|
| 388 |
// 更新标题文本
|
| 389 |
-
const titleElement = document.getElementById(`
|
| 390 |
if (titleElement) {
|
| 391 |
-
titleElement.textContent = label;
|
| 392 |
}
|
| 393 |
}
|
| 394 |
|
| 395 |
// 更新 surprisal progress scatter plot(保持不变)
|
| 396 |
if (stats.tokenSurprisals && stats.tokenSurprisals.length > 0) {
|
| 397 |
columnData.histograms.stats_surprisal_progress.update({
|
|
|
|
| 398 |
data: stats.tokenSurprisals,
|
| 399 |
-
xLabel: 'token index',
|
| 400 |
-
yLabel: 'surprisal (bits)'
|
| 401 |
});
|
| 402 |
}
|
| 403 |
};
|
|
@@ -537,8 +539,8 @@ window.onload = () => {
|
|
| 537 |
response = result.data;
|
| 538 |
}
|
| 539 |
const enhancedResult = processDemoData(response);
|
| 540 |
-
const safeText = response.request
|
| 541 |
-
const textStats = calculateTextStats(enhancedResult, safeText
|
| 542 |
|
| 543 |
columnData.data = response;
|
| 544 |
columnData.enhancedResult = enhancedResult;
|
|
@@ -551,7 +553,7 @@ window.onload = () => {
|
|
| 551 |
showErrorForColumn(id, null);
|
| 552 |
|
| 553 |
// 更新统计信息显示(从分析结果中获取实际使用的模型)
|
| 554 |
-
const resultModel = response.result
|
| 555 |
updateMetricsForColumn(id, textStats, resultModel);
|
| 556 |
|
| 557 |
// 渲染统计图表
|
|
@@ -565,7 +567,7 @@ window.onload = () => {
|
|
| 565 |
// 重新渲染所有列的统计图表和指标(因为差分数据可能变化)
|
| 566 |
columnsData.forEach((colData, colId) => {
|
| 567 |
if (colData.stats) {
|
| 568 |
-
const resultModel = colData.data
|
| 569 |
updateMetricsForColumn(colId, colData.stats, resultModel);
|
| 570 |
renderStatsForColumn(colId, colData);
|
| 571 |
}
|
|
@@ -577,7 +579,7 @@ window.onload = () => {
|
|
| 577 |
// 更新差分模式(因为差分数据可能变化)
|
| 578 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 579 |
if (isDiffColumn && columnData.diffStats) {
|
| 580 |
-
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.
|
| 581 |
} else {
|
| 582 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 583 |
}
|
|
@@ -606,7 +608,7 @@ window.onload = () => {
|
|
| 606 |
const initializeColumnVisualizations = (id: string, columnData: DemoColumnData): void => {
|
| 607 |
const safeId = toSafeId(id);
|
| 608 |
const statsFracId = `#stats_frac_${safeId}`;
|
| 609 |
-
const
|
| 610 |
const statsProgressId = `#stats_surprisal_progress_${safeId}`;
|
| 611 |
|
| 612 |
// 创建 Histogram 实例
|
|
@@ -616,8 +618,8 @@ window.onload = () => {
|
|
| 616 |
{ width: 400, height: 200 }
|
| 617 |
);
|
| 618 |
|
| 619 |
-
columnData.histograms.
|
| 620 |
-
d3.select(
|
| 621 |
eventHandler,
|
| 622 |
{ width: 400, height: 200 }
|
| 623 |
);
|
|
@@ -669,7 +671,7 @@ window.onload = () => {
|
|
| 669 |
// 设置差分模式(如果是Diff列)
|
| 670 |
const isDiffColumn = modelDiffMode && columnData.diffStats && !isBaseColumn(id);
|
| 671 |
if (isDiffColumn && columnData.diffStats) {
|
| 672 |
-
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.
|
| 673 |
} else {
|
| 674 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 675 |
}
|
|
@@ -686,17 +688,17 @@ window.onload = () => {
|
|
| 686 |
};
|
| 687 |
|
| 688 |
// 根据 histogram source 解析出列的 safeId 和直方图类型
|
| 689 |
-
const parseHistogramSource = (source?: string): { safeId: string; histogramType: 'token' | '
|
| 690 |
if (!source) {
|
| 691 |
return null;
|
| 692 |
}
|
| 693 |
|
| 694 |
-
const
|
| 695 |
const tokenPrefix = 'stats_frac';
|
| 696 |
|
| 697 |
-
if (source.startsWith(
|
| 698 |
-
const safeId = source.substring(
|
| 699 |
-
return safeId ? { safeId, histogramType: '
|
| 700 |
}
|
| 701 |
|
| 702 |
if (source.startsWith(tokenPrefix)) {
|
|
@@ -736,12 +738,21 @@ window.onload = () => {
|
|
| 736 |
|
| 737 |
const { columnData } = columnEntry;
|
| 738 |
|
| 739 |
-
//
|
| 740 |
-
|
| 741 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 742 |
}
|
| 743 |
|
| 744 |
-
const { stats_frac,
|
| 745 |
|
| 746 |
let enhancedResult = columnData.enhancedResult;
|
| 747 |
if (!enhancedResult && columnData.data) {
|
|
@@ -756,21 +767,21 @@ window.onload = () => {
|
|
| 756 |
// binIndex 为 -1 表示取消高亮
|
| 757 |
if (ev.binIndex === -1) {
|
| 758 |
stats_frac?.clearSelection();
|
| 759 |
-
|
| 760 |
columnData.lmfInstance.clearHighlight();
|
| 761 |
return;
|
| 762 |
}
|
| 763 |
|
| 764 |
// 同一列内仅保持一个直方图的选中状态
|
| 765 |
-
if (parsed.histogramType === '
|
| 766 |
stats_frac?.clearSelection();
|
| 767 |
} else {
|
| 768 |
-
|
| 769 |
}
|
| 770 |
|
| 771 |
// 使用通用的高亮计算函数
|
| 772 |
const { x0, x1 } = ev;
|
| 773 |
-
const { indices, style } = calculateHighlights(parsed.histogramType, x0, x1, enhancedResult);
|
| 774 |
|
| 775 |
// 高亮这些 token
|
| 776 |
columnData.lmfInstance.setHighlightedIndices(indices, style);
|
|
@@ -818,7 +829,7 @@ window.onload = () => {
|
|
| 818 |
// 优先使用缓存的原文
|
| 819 |
if (columnData.originalText !== undefined) {
|
| 820 |
text = columnData.originalText;
|
| 821 |
-
} else if (columnData.data
|
| 822 |
text = columnData.data.request.text;
|
| 823 |
}
|
| 824 |
|
|
@@ -894,7 +905,7 @@ window.onload = () => {
|
|
| 894 |
columnsData.forEach((columnData, id) => {
|
| 895 |
if (columnData.stats) {
|
| 896 |
// 更新统计信息显示
|
| 897 |
-
const resultModel = columnData.data
|
| 898 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 899 |
|
| 900 |
// 重新渲染图表
|
|
@@ -918,7 +929,7 @@ window.onload = () => {
|
|
| 918 |
// 更新差分模式
|
| 919 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 920 |
if (isDiffColumn && columnData.diffStats) {
|
| 921 |
-
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.
|
| 922 |
} else {
|
| 923 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 924 |
}
|
|
@@ -946,7 +957,7 @@ window.onload = () => {
|
|
| 946 |
columnsData.forEach((columnData, id) => {
|
| 947 |
if (columnData.stats) {
|
| 948 |
// 更新统计信息显示
|
| 949 |
-
const resultModel = columnData.data
|
| 950 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 951 |
|
| 952 |
// 重新渲染图表
|
|
@@ -1125,7 +1136,7 @@ window.onload = () => {
|
|
| 1125 |
return;
|
| 1126 |
}
|
| 1127 |
|
| 1128 |
-
const preloadText = result.data.request
|
| 1129 |
|
| 1130 |
// 与已有 demo 的原文对比
|
| 1131 |
const consistency = checkTextConsistency();
|
|
@@ -1161,7 +1172,7 @@ window.onload = () => {
|
|
| 1161 |
lmfInstance: undefined,
|
| 1162 |
histograms: {
|
| 1163 |
stats_frac: null,
|
| 1164 |
-
|
| 1165 |
stats_surprisal_progress: null
|
| 1166 |
}
|
| 1167 |
};
|
|
@@ -1537,7 +1548,7 @@ window.onload = () => {
|
|
| 1537 |
// 重新渲染所有列的统计图表和指标,并更新 LMF 实例的差分模式
|
| 1538 |
columnsData.forEach((columnData, id) => {
|
| 1539 |
if (columnData.stats) {
|
| 1540 |
-
const resultModel = columnData.data
|
| 1541 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 1542 |
renderStatsForColumn(id, columnData);
|
| 1543 |
}
|
|
@@ -1546,7 +1557,7 @@ window.onload = () => {
|
|
| 1546 |
if (columnData.lmfInstance) {
|
| 1547 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 1548 |
if (isDiffColumn && columnData.diffStats) {
|
| 1549 |
-
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.
|
| 1550 |
} else {
|
| 1551 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 1552 |
}
|
|
@@ -1694,7 +1705,7 @@ window.onload = () => {
|
|
| 1694 |
// 重新渲染所有列的统计图表和指标,并更新 LMF 实例的差分模式
|
| 1695 |
columnsData.forEach((columnData, id) => {
|
| 1696 |
if (columnData.stats) {
|
| 1697 |
-
const resultModel = columnData.data
|
| 1698 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 1699 |
renderStatsForColumn(id, columnData);
|
| 1700 |
}
|
|
@@ -1703,7 +1714,7 @@ window.onload = () => {
|
|
| 1703 |
if (columnData.lmfInstance) {
|
| 1704 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 1705 |
if (isDiffColumn && columnData.diffStats) {
|
| 1706 |
-
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.
|
| 1707 |
} else {
|
| 1708 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 1709 |
}
|
|
|
|
| 40 |
import { calculateHighlights } from './utils/highlightUtils';
|
| 41 |
// 公共初始化模块
|
| 42 |
import {initializeCommonApp} from './appInitializer';
|
| 43 |
+
import {
|
| 44 |
+
TOKEN_SURPRISAL_HISTOGRAM_BASE,
|
| 45 |
+
BYTE_SURPRISAL_HISTOGRAM_BASE,
|
| 46 |
+
DELTA_BYTE_SURPRISAL_HISTOGRAM_BASE,
|
| 47 |
+
SURPRISAL_PROGRESS_BASE
|
| 48 |
+
} from "./utils/visualizationConfigs";
|
| 49 |
|
| 50 |
// 使用从 demoManager 导出的验证函数
|
| 51 |
|
|
|
|
| 120 |
lmfInstance?: GLTR_Text_Box; // LMF实例引用(对比模式下使用)
|
| 121 |
histograms: {
|
| 122 |
stats_frac: Histogram | null;
|
| 123 |
+
stats_byte_frac: Histogram | null;
|
| 124 |
stats_surprisal_progress: ScatterPlot | null;
|
| 125 |
};
|
| 126 |
};
|
|
|
|
| 129 |
// 初始化公共应用组件
|
| 130 |
const api_prefix = URLHandler.parameters['api'] || '';
|
| 131 |
const bodyElement = <Element>d3.select('body').node();
|
| 132 |
+
const { eventHandler, api, tokenSurprisalColorScale, byteSurprisalColorScale, totalSurprisalFormat } = initializeCommonApp(api_prefix, bodyElement);
|
| 133 |
|
| 134 |
const container = d3.select('#compare-container');
|
| 135 |
const mainFrame = d3.select('.main_frame');
|
|
|
|
| 237 |
const metricsId = `text_metrics_${safeId}`;
|
| 238 |
const errorId = `error_${safeId}`;
|
| 239 |
const statsFracId = `stats_frac_${safeId}`;
|
| 240 |
+
const statsByteFracId = `stats_byte_frac_${safeId}`;
|
| 241 |
const statsProgressId = `stats_surprisal_progress_${safeId}`;
|
| 242 |
const textRenderId = `text_render_${safeId}`;
|
| 243 |
|
|
|
|
| 268 |
</div>
|
| 269 |
<div id="${statsId}" class="stats" style="text-align:center;">
|
| 270 |
<div style="display:block;text-align: center;margin-bottom: 20px;">
|
| 271 |
+
<div id="token_histogram_title_${safeId}"></div>
|
| 272 |
<svg id="${statsFracId}"></svg>
|
| 273 |
</div>
|
| 274 |
<div style="display:block;text-align: center;margin-bottom: 20px;">
|
| 275 |
+
<div id="byte_histogram_title_${safeId}"></div>
|
| 276 |
+
<svg id="${statsByteFracId}"></svg>
|
| 277 |
</div>
|
| 278 |
<div style="display:block;text-align: center;margin-bottom: 20px;">
|
| 279 |
<div>surprisal vs token progress</div>
|
|
|
|
| 288 |
// 处理单个 demo 的数据
|
| 289 |
const processDemoData = (data: AnalysisData): FrontendAnalyzeResult => {
|
| 290 |
const result = data.result;
|
| 291 |
+
const safeText = data.request.text;
|
| 292 |
|
| 293 |
// 验证数据
|
| 294 |
if (!Array.isArray(result.bpe_strings) || result.bpe_strings.length === 0) {
|
|
|
|
| 333 |
|
| 334 |
// 为单个列渲染统计图表(使用ID)
|
| 335 |
const renderStatsForColumn = (id: string, columnData: DemoColumnData) => {
|
| 336 |
+
if (!columnData.stats || !columnData.histograms.stats_frac || !columnData.histograms.stats_byte_frac || !columnData.histograms.stats_surprisal_progress) {
|
| 337 |
return;
|
| 338 |
}
|
| 339 |
|
|
|
|
| 342 |
const safeId = toSafeId(id);
|
| 343 |
|
| 344 |
// 更新 token surprisal histogram(保持不变,不显示差分)
|
| 345 |
+
// 使用 19 个台阶,对应区间:[0,1), [1,2), ..., [17,18), [18,∞)
|
| 346 |
columnData.histograms.stats_frac.update({
|
| 347 |
+
...TOKEN_SURPRISAL_HISTOGRAM_BASE,
|
| 348 |
data: stats.tokenSurprisals,
|
| 349 |
+
colorScale: tokenSurprisalColorScale,
|
|
|
|
|
|
|
|
|
|
| 350 |
averageValue: stats.tokenAverage ?? undefined,
|
|
|
|
| 351 |
});
|
| 352 |
|
| 353 |
+
// 更新列视图中 token surprisal histogram 的标题文本
|
| 354 |
+
const tokenTitleElement = document.getElementById(`token_histogram_title_${safeId}`);
|
| 355 |
+
if (tokenTitleElement) {
|
| 356 |
+
tokenTitleElement.textContent = TOKEN_SURPRISAL_HISTOGRAM_BASE.label;
|
| 357 |
+
}
|
| 358 |
+
|
| 359 |
+
// 更新信息密度histogram(Diff列显示差分)
|
| 360 |
if (isDiffColumn && columnData.diffStats) {
|
| 361 |
+
// Diff列:显示Δ信息密度 histogram
|
| 362 |
+
const deltaByteSurprisals = columnData.diffStats.deltaByteSurprisals;
|
| 363 |
|
| 364 |
// 计算平均差分
|
| 365 |
+
const deltaAverage = deltaByteSurprisals.length > 0
|
| 366 |
+
? deltaByteSurprisals.reduce((sum, val) => sum + val, 0) / deltaByteSurprisals.length
|
| 367 |
: 0;
|
| 368 |
|
| 369 |
+
columnData.histograms.stats_byte_frac.update({
|
| 370 |
+
...DELTA_BYTE_SURPRISAL_HISTOGRAM_BASE,
|
| 371 |
+
data: deltaByteSurprisals,
|
|
|
|
|
|
|
|
|
|
| 372 |
colorScale: getDiffColor,
|
| 373 |
averageValue: deltaAverage,
|
|
|
|
| 374 |
});
|
| 375 |
|
| 376 |
// 更新标题文本
|
| 377 |
+
const titleElement = document.getElementById(`byte_histogram_title_${safeId}`);
|
| 378 |
if (titleElement) {
|
| 379 |
+
titleElement.textContent = DELTA_BYTE_SURPRISAL_HISTOGRAM_BASE.label;
|
| 380 |
}
|
| 381 |
} else {
|
| 382 |
+
// Base列或非模型差分模式:显示原始信息密度 histogram
|
| 383 |
+
// 使用 13 个台阶,对应区间:[0,0.5), [0.5,1), [1,1.5), ..., [5.5,6), [6,∞)
|
| 384 |
+
columnData.histograms.stats_byte_frac.update({
|
| 385 |
+
...BYTE_SURPRISAL_HISTOGRAM_BASE,
|
| 386 |
+
data: stats.byteSurprisals,
|
| 387 |
+
colorScale: byteSurprisalColorScale,
|
| 388 |
+
averageValue: stats.byteAverage ?? undefined,
|
|
|
|
|
|
|
|
|
|
| 389 |
});
|
| 390 |
|
| 391 |
// 更新标题文本
|
| 392 |
+
const titleElement = document.getElementById(`byte_histogram_title_${safeId}`);
|
| 393 |
if (titleElement) {
|
| 394 |
+
titleElement.textContent = BYTE_SURPRISAL_HISTOGRAM_BASE.label;
|
| 395 |
}
|
| 396 |
}
|
| 397 |
|
| 398 |
// 更新 surprisal progress scatter plot(保持不变)
|
| 399 |
if (stats.tokenSurprisals && stats.tokenSurprisals.length > 0) {
|
| 400 |
columnData.histograms.stats_surprisal_progress.update({
|
| 401 |
+
...SURPRISAL_PROGRESS_BASE,
|
| 402 |
data: stats.tokenSurprisals,
|
|
|
|
|
|
|
| 403 |
});
|
| 404 |
}
|
| 405 |
};
|
|
|
|
| 539 |
response = result.data;
|
| 540 |
}
|
| 541 |
const enhancedResult = processDemoData(response);
|
| 542 |
+
const safeText = response.request.text;
|
| 543 |
+
const textStats = calculateTextStats(enhancedResult, safeText);
|
| 544 |
|
| 545 |
columnData.data = response;
|
| 546 |
columnData.enhancedResult = enhancedResult;
|
|
|
|
| 553 |
showErrorForColumn(id, null);
|
| 554 |
|
| 555 |
// 更新统计信息显示(从分析结果中获取实际使用的模型)
|
| 556 |
+
const resultModel = response.result.model;
|
| 557 |
updateMetricsForColumn(id, textStats, resultModel);
|
| 558 |
|
| 559 |
// 渲染统计图表
|
|
|
|
| 567 |
// 重新渲染所有列的统计图表和指标(因为差分数据可能变化)
|
| 568 |
columnsData.forEach((colData, colId) => {
|
| 569 |
if (colData.stats) {
|
| 570 |
+
const resultModel = colData.data.result.model;
|
| 571 |
updateMetricsForColumn(colId, colData.stats, resultModel);
|
| 572 |
renderStatsForColumn(colId, colData);
|
| 573 |
}
|
|
|
|
| 579 |
// 更新差分模式(因为差分数据可能变化)
|
| 580 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 581 |
if (isDiffColumn && columnData.diffStats) {
|
| 582 |
+
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.deltaByteSurprisals);
|
| 583 |
} else {
|
| 584 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 585 |
}
|
|
|
|
| 608 |
const initializeColumnVisualizations = (id: string, columnData: DemoColumnData): void => {
|
| 609 |
const safeId = toSafeId(id);
|
| 610 |
const statsFracId = `#stats_frac_${safeId}`;
|
| 611 |
+
const statsByteFracId = `#stats_byte_frac_${safeId}`;
|
| 612 |
const statsProgressId = `#stats_surprisal_progress_${safeId}`;
|
| 613 |
|
| 614 |
// 创建 Histogram 实例
|
|
|
|
| 618 |
{ width: 400, height: 200 }
|
| 619 |
);
|
| 620 |
|
| 621 |
+
columnData.histograms.stats_byte_frac = new Histogram(
|
| 622 |
+
d3.select(statsByteFracId),
|
| 623 |
eventHandler,
|
| 624 |
{ width: 400, height: 200 }
|
| 625 |
);
|
|
|
|
| 671 |
// 设置差分模式(如果是Diff列)
|
| 672 |
const isDiffColumn = modelDiffMode && columnData.diffStats && !isBaseColumn(id);
|
| 673 |
if (isDiffColumn && columnData.diffStats) {
|
| 674 |
+
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.deltaByteSurprisals);
|
| 675 |
} else {
|
| 676 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 677 |
}
|
|
|
|
| 688 |
};
|
| 689 |
|
| 690 |
// 根据 histogram source 解析出列的 safeId 和直方图类型
|
| 691 |
+
const parseHistogramSource = (source?: string): { safeId: string; histogramType: 'token' | 'byte' } | null => {
|
| 692 |
if (!source) {
|
| 693 |
return null;
|
| 694 |
}
|
| 695 |
|
| 696 |
+
const bytePrefix = 'stats_byte_frac';
|
| 697 |
const tokenPrefix = 'stats_frac';
|
| 698 |
|
| 699 |
+
if (source.startsWith(bytePrefix)) {
|
| 700 |
+
const safeId = source.substring(bytePrefix.length).replace(/^_/, '');
|
| 701 |
+
return safeId ? { safeId, histogramType: 'byte' } : null;
|
| 702 |
}
|
| 703 |
|
| 704 |
if (source.startsWith(tokenPrefix)) {
|
|
|
|
| 738 |
|
| 739 |
const { columnData } = columnEntry;
|
| 740 |
|
| 741 |
+
// 在模型差分模式下,只有base列支持点击高亮
|
| 742 |
+
// 非差分模式下,仅在文本渲染已初始化时处理高亮
|
| 743 |
+
if (modelDiffMode) {
|
| 744 |
+
// 模型差分模式:只有base列支持点击高亮
|
| 745 |
+
if (!isBaseColumn(columnData.id) || !columnData.lmfInstance) {
|
| 746 |
+
return;
|
| 747 |
+
}
|
| 748 |
+
} else {
|
| 749 |
+
// 非模型差分模式:需要文本渲染已初始化
|
| 750 |
+
if (!columnData.lmfInstance) {
|
| 751 |
+
return;
|
| 752 |
+
}
|
| 753 |
}
|
| 754 |
|
| 755 |
+
const { stats_frac, stats_byte_frac } = columnData.histograms;
|
| 756 |
|
| 757 |
let enhancedResult = columnData.enhancedResult;
|
| 758 |
if (!enhancedResult && columnData.data) {
|
|
|
|
| 767 |
// binIndex 为 -1 表示取消高亮
|
| 768 |
if (ev.binIndex === -1) {
|
| 769 |
stats_frac?.clearSelection();
|
| 770 |
+
stats_byte_frac?.clearSelection();
|
| 771 |
columnData.lmfInstance.clearHighlight();
|
| 772 |
return;
|
| 773 |
}
|
| 774 |
|
| 775 |
// 同一列内仅保持一个直方图的选中状态
|
| 776 |
+
if (parsed.histogramType === 'byte') {
|
| 777 |
stats_frac?.clearSelection();
|
| 778 |
} else {
|
| 779 |
+
stats_byte_frac?.clearSelection();
|
| 780 |
}
|
| 781 |
|
| 782 |
// 使用通用的高亮计算函数
|
| 783 |
const { x0, x1 } = ev;
|
| 784 |
+
const { indices, style } = calculateHighlights(parsed.histogramType, x0, x1, ev.binIndex, ev.no_bins, enhancedResult);
|
| 785 |
|
| 786 |
// 高亮这些 token
|
| 787 |
columnData.lmfInstance.setHighlightedIndices(indices, style);
|
|
|
|
| 829 |
// 优先使用缓存的原文
|
| 830 |
if (columnData.originalText !== undefined) {
|
| 831 |
text = columnData.originalText;
|
| 832 |
+
} else if (columnData.data) {
|
| 833 |
text = columnData.data.request.text;
|
| 834 |
}
|
| 835 |
|
|
|
|
| 905 |
columnsData.forEach((columnData, id) => {
|
| 906 |
if (columnData.stats) {
|
| 907 |
// 更新统计信息显示
|
| 908 |
+
const resultModel = columnData.data.result.model;
|
| 909 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 910 |
|
| 911 |
// 重新渲染图表
|
|
|
|
| 929 |
// 更新差分模式
|
| 930 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 931 |
if (isDiffColumn && columnData.diffStats) {
|
| 932 |
+
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.deltaByteSurprisals);
|
| 933 |
} else {
|
| 934 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 935 |
}
|
|
|
|
| 957 |
columnsData.forEach((columnData, id) => {
|
| 958 |
if (columnData.stats) {
|
| 959 |
// 更新统计信息显示
|
| 960 |
+
const resultModel = columnData.data.result.model;
|
| 961 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 962 |
|
| 963 |
// 重新渲染图表
|
|
|
|
| 1136 |
return;
|
| 1137 |
}
|
| 1138 |
|
| 1139 |
+
const preloadText = result.data.request.text;
|
| 1140 |
|
| 1141 |
// 与已有 demo 的原文对比
|
| 1142 |
const consistency = checkTextConsistency();
|
|
|
|
| 1172 |
lmfInstance: undefined,
|
| 1173 |
histograms: {
|
| 1174 |
stats_frac: null,
|
| 1175 |
+
stats_byte_frac: null,
|
| 1176 |
stats_surprisal_progress: null
|
| 1177 |
}
|
| 1178 |
};
|
|
|
|
| 1548 |
// 重新渲染所有列的统计图表和指标,并更新 LMF 实例的差分模式
|
| 1549 |
columnsData.forEach((columnData, id) => {
|
| 1550 |
if (columnData.stats) {
|
| 1551 |
+
const resultModel = columnData.data.result.model;
|
| 1552 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 1553 |
renderStatsForColumn(id, columnData);
|
| 1554 |
}
|
|
|
|
| 1557 |
if (columnData.lmfInstance) {
|
| 1558 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 1559 |
if (isDiffColumn && columnData.diffStats) {
|
| 1560 |
+
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.deltaByteSurprisals);
|
| 1561 |
} else {
|
| 1562 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 1563 |
}
|
|
|
|
| 1705 |
// 重新渲染所有列的统计图表和指标,并更新 LMF 实例的差分模式
|
| 1706 |
columnsData.forEach((columnData, id) => {
|
| 1707 |
if (columnData.stats) {
|
| 1708 |
+
const resultModel = columnData.data.result.model;
|
| 1709 |
updateMetricsForColumn(id, columnData.stats, resultModel);
|
| 1710 |
renderStatsForColumn(id, columnData);
|
| 1711 |
}
|
|
|
|
| 1714 |
if (columnData.lmfInstance) {
|
| 1715 |
const isDiffColumn = columnData.diffStats && !isBaseColumn(id);
|
| 1716 |
if (isDiffColumn && columnData.diffStats) {
|
| 1717 |
+
columnData.lmfInstance.setDiffMode(true, columnData.diffStats.deltaByteSurprisals);
|
| 1718 |
} else {
|
| 1719 |
columnData.lmfInstance.setDiffMode(false, []);
|
| 1720 |
}
|
client/src/ts/controllers/highlightController.ts
CHANGED
|
@@ -39,11 +39,11 @@ export class HighlightController {
|
|
| 39 |
return;
|
| 40 |
}
|
| 41 |
|
| 42 |
-
const { x0, x1, source } = ev;
|
| 43 |
const data = currentData.result;
|
| 44 |
-
|
| 45 |
// 仅处理 token 直方图
|
| 46 |
-
const { indices, style } = calculateHighlights('token', x0, x1, data);
|
| 47 |
|
| 48 |
// 高亮这些 token
|
| 49 |
this.options.lmf.setHighlightedIndices(indices, style);
|
|
|
|
| 39 |
return;
|
| 40 |
}
|
| 41 |
|
| 42 |
+
const { x0, x1, binIndex, no_bins, source } = ev;
|
| 43 |
const data = currentData.result;
|
| 44 |
+
|
| 45 |
// 仅处理 token 直方图
|
| 46 |
+
const { indices, style } = calculateHighlights('token', x0, x1, binIndex, no_bins, data);
|
| 47 |
|
| 48 |
// 高亮这些 token
|
| 49 |
this.options.lmf.setHighlightedIndices(indices, style);
|
client/src/ts/controllers/textInputController.ts
CHANGED
|
@@ -1,6 +1,7 @@
|
|
| 1 |
import * as d3 from 'd3';
|
| 2 |
import type { TextStats } from '../utils/textStatistics';
|
| 3 |
-
import { calculateTextStats
|
|
|
|
| 4 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 5 |
|
| 6 |
/**
|
|
@@ -24,7 +25,6 @@ export type TextInputControllerOptions = {
|
|
| 24 |
submitBtn: d3.Selection<any, unknown, any, any>;
|
| 25 |
saveBtn: d3.Selection<any, unknown, any, any>;
|
| 26 |
pasteBtn: d3.Selection<any, unknown, any, any>;
|
| 27 |
-
textEncoder: TextEncoder | null;
|
| 28 |
totalSurprisalFormat: (value: number | null) => string;
|
| 29 |
showAlertDialog: (title: string, message: string) => void;
|
| 30 |
};
|
|
@@ -77,7 +77,7 @@ export class TextInputController {
|
|
| 77 |
// 注意:submitBtn 的状态现在由外部状态系统统一管理,不再在这里设置
|
| 78 |
|
| 79 |
if (!this.options.textCountValue.empty()) {
|
| 80 |
-
const charCount =
|
| 81 |
this.options.textCountValue.text(charCount.toString());
|
| 82 |
}
|
| 83 |
}
|
|
@@ -205,9 +205,8 @@ export class TextInputController {
|
|
| 205 |
*/
|
| 206 |
export const calculateTextStatsForController = (
|
| 207 |
result: FrontendAnalyzeResult,
|
| 208 |
-
originalText: string
|
| 209 |
-
textEncoder: TextEncoder | null
|
| 210 |
): TextStats => {
|
| 211 |
-
return calculateTextStats(result, originalText
|
| 212 |
};
|
| 213 |
|
|
|
|
| 1 |
import * as d3 from 'd3';
|
| 2 |
import type { TextStats } from '../utils/textStatistics';
|
| 3 |
+
import { calculateTextStats } from '../utils/textStatistics';
|
| 4 |
+
import { countTokenCharacters } from '../utils/Util';
|
| 5 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 6 |
|
| 7 |
/**
|
|
|
|
| 25 |
submitBtn: d3.Selection<any, unknown, any, any>;
|
| 26 |
saveBtn: d3.Selection<any, unknown, any, any>;
|
| 27 |
pasteBtn: d3.Selection<any, unknown, any, any>;
|
|
|
|
| 28 |
totalSurprisalFormat: (value: number | null) => string;
|
| 29 |
showAlertDialog: (title: string, message: string) => void;
|
| 30 |
};
|
|
|
|
| 77 |
// 注意:submitBtn 的状态现在由外部状态系统统一管理,不再在这里设置
|
| 78 |
|
| 79 |
if (!this.options.textCountValue.empty()) {
|
| 80 |
+
const charCount = countTokenCharacters(textValue);
|
| 81 |
this.options.textCountValue.text(charCount.toString());
|
| 82 |
}
|
| 83 |
}
|
|
|
|
| 205 |
*/
|
| 206 |
export const calculateTextStatsForController = (
|
| 207 |
result: FrontendAnalyzeResult,
|
| 208 |
+
originalText: string
|
|
|
|
| 209 |
): TextStats => {
|
| 210 |
+
return calculateTextStats(result, originalText);
|
| 211 |
};
|
| 212 |
|
client/src/ts/start.ts
CHANGED
|
@@ -61,7 +61,7 @@ window.onload = () => {
|
|
| 61 |
// 初始化公共应用组件
|
| 62 |
const api_prefix = URLHandler.parameters['api'] || '';
|
| 63 |
const bodyElement = <Element>d3.select('body').node();
|
| 64 |
-
const { eventHandler, api,
|
| 65 |
|
| 66 |
// 页面初始化时确保 loading 状态被重置(防止刷新后仍显示转圈)
|
| 67 |
d3.selectAll(".loadersmall").style('display', 'none');
|
|
@@ -115,7 +115,6 @@ window.onload = () => {
|
|
| 115 |
submitBtn,
|
| 116 |
saveBtn,
|
| 117 |
pasteBtn,
|
| 118 |
-
textEncoder,
|
| 119 |
totalSurprisalFormat,
|
| 120 |
showAlertDialog
|
| 121 |
});
|
|
@@ -176,8 +175,7 @@ window.onload = () => {
|
|
| 176 |
stats_frac,
|
| 177 |
stats_surprisal_progress,
|
| 178 |
appStateManager,
|
| 179 |
-
|
| 180 |
-
surprisalColorScale: surprisalColorScale as d3.ScaleSequential<string>
|
| 181 |
});
|
| 182 |
|
| 183 |
// 初始化主题管理器
|
|
|
|
| 61 |
// 初始化公共应用组件
|
| 62 |
const api_prefix = URLHandler.parameters['api'] || '';
|
| 63 |
const bodyElement = <Element>d3.select('body').node();
|
| 64 |
+
const { eventHandler, api, tokenSurprisalColorScale, byteSurprisalColorScale, totalSurprisalFormat } = initializeCommonApp(api_prefix, bodyElement);
|
| 65 |
|
| 66 |
// 页面初始化时确保 loading 状态被重置(防止刷新后仍显示转圈)
|
| 67 |
d3.selectAll(".loadersmall").style('display', 'none');
|
|
|
|
| 115 |
submitBtn,
|
| 116 |
saveBtn,
|
| 117 |
pasteBtn,
|
|
|
|
| 118 |
totalSurprisalFormat,
|
| 119 |
showAlertDialog
|
| 120 |
});
|
|
|
|
| 175 |
stats_frac,
|
| 176 |
stats_surprisal_progress,
|
| 177 |
appStateManager,
|
| 178 |
+
surprisalColorScale: tokenSurprisalColorScale as d3.ScaleSequential<string>
|
|
|
|
| 179 |
});
|
| 180 |
|
| 181 |
// 初始化主题管理器
|
client/src/ts/ui/dialog.ts
CHANGED
|
@@ -3,6 +3,7 @@
|
|
| 3 |
* 提供统一的弹框样式和行为
|
| 4 |
*/
|
| 5 |
import * as d3 from 'd3';
|
|
|
|
| 6 |
|
| 7 |
export type DialogContentBuilder = (
|
| 8 |
dialog: d3.Selection<HTMLDivElement, unknown, any, any>,
|
|
@@ -515,7 +516,7 @@ export function createNamePathTextContent(
|
|
| 515 |
const updateTextCount = () => {
|
| 516 |
const textNode = textarea.node() as HTMLTextAreaElement;
|
| 517 |
const textValue = textNode?.value || '';
|
| 518 |
-
const charCount =
|
| 519 |
textCountDisplay.text(`${charCount} 字`);
|
| 520 |
};
|
| 521 |
|
|
|
|
| 3 |
* 提供统一的弹框样式和行为
|
| 4 |
*/
|
| 5 |
import * as d3 from 'd3';
|
| 6 |
+
import { countTokenCharacters } from '../utils/Util';
|
| 7 |
|
| 8 |
export type DialogContentBuilder = (
|
| 9 |
dialog: d3.Selection<HTMLDivElement, unknown, any, any>,
|
|
|
|
| 516 |
const updateTextCount = () => {
|
| 517 |
const textNode = textarea.node() as HTMLTextAreaElement;
|
| 518 |
const textValue = textNode?.value || '';
|
| 519 |
+
const charCount = countTokenCharacters(textValue);
|
| 520 |
textCountDisplay.text(`${charCount} 字`);
|
| 521 |
};
|
| 522 |
|
client/src/ts/utils/SurprisalColorConfig.ts
CHANGED
|
@@ -3,51 +3,74 @@ import * as d3 from "d3";
|
|
| 3 |
/**
|
| 4 |
* 惊讶度颜色配置模块
|
| 5 |
* 统一管理文本渲染和直方图的红色颜色配置
|
| 6 |
-
* 20个台阶对应区间:[0,1), [1,2), ..., [18,19), [19,∞)
|
| 7 |
*/
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
|
| 10 |
/**
|
| 11 |
* 根据归一化值获取对应的颜色(输入值应在[0,1]区间)
|
| 12 |
* @param normalizedValue 归一化后的值,范围[0,1]
|
| 13 |
-
* @param colorFactor 颜色因子,用于调整颜色强度,默认为0.7
|
| 14 |
* @returns 颜色字符串(rgba格式,从透明到红色)
|
| 15 |
*/
|
| 16 |
-
export function getSurprisalColorNormalized(normalizedValue: number
|
| 17 |
-
// 红色 #ff4740 对应的 RGB 值为 (255, 71, 64)
|
| 18 |
-
const startColor = "rgba(255, 71, 64, 0)"; // 完全透明的红色
|
| 19 |
-
const endColor = "rgba(255, 71, 64, 1)"; // 完全不透明的红色
|
| 20 |
-
|
| 21 |
-
// 使用d3的插值函数进行线性映射(支持rgba格式)
|
| 22 |
-
const colorInterpolator = d3.interpolate(startColor, endColor);
|
| 23 |
-
|
| 24 |
// 确保输入值在[0,1]范围内
|
| 25 |
const clampedValue = Math.max(0, Math.min(1, normalizedValue));
|
| 26 |
|
| 27 |
-
//
|
| 28 |
-
|
| 29 |
-
|
|
|
|
| 30 |
|
|
|
|
|
|
|
|
|
|
| 31 |
|
| 32 |
/**
|
| 33 |
-
*
|
| 34 |
-
* @param
|
| 35 |
-
* @param
|
| 36 |
-
* @returns
|
| 37 |
*/
|
| 38 |
-
|
| 39 |
-
|
| 40 |
-
|
| 41 |
-
if (
|
| 42 |
-
|
| 43 |
-
} else if (surprisal >= 19) {
|
| 44 |
-
normalizedValue = 1;
|
| 45 |
} else {
|
| 46 |
-
|
| 47 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 48 |
|
| 49 |
-
|
| 50 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
}
|
| 52 |
|
| 53 |
// ==========================================
|
|
|
|
| 3 |
/**
|
| 4 |
* 惊讶度颜色配置模块
|
| 5 |
* 统一管理文本渲染和直方图的红色颜色配置
|
|
|
|
| 6 |
*/
|
| 7 |
|
| 8 |
+
// ==========================================
|
| 9 |
+
// 常量定义
|
| 10 |
+
// ==========================================
|
| 11 |
+
|
| 12 |
+
/** Token surprisal 的最大值,用于颜色映射 */
|
| 13 |
+
const TOKEN_SURPRISAL_MAX = 18;
|
| 14 |
+
|
| 15 |
+
/** Byte surprisal 的最大值,用于颜色映射 */
|
| 16 |
+
const BYTE_SURPRISAL_MAX = 6;
|
| 17 |
+
|
| 18 |
+
/** Minimap 颜色因子:用于放大颜色强度,因为平均后的byte surprisal密度会过小,需要放大以在minimap中更明显 */
|
| 19 |
+
export const MINIMAP_COLOR_FACTOR = 1.3;
|
| 20 |
|
| 21 |
/**
 * Converts a normalized intensity into a translucent red (#ff4740) CSS colour.
 *
 * @param normalizedValue Intensity, nominally in [0, 1]. Values outside the range are
 *                        clamped; NaN is treated as 0 so the result is always valid CSS.
 * @returns An `rgba(255, 71, 64, alpha)` string whose alpha lies in [0, 0.7].
 */
export function getSurprisalColorNormalized(normalizedValue: number): string {
    // Upper bound of the alpha channel: controls how intense the strongest colour is.
    const maxAlpha = 0.7;

    // Math.max/Math.min propagate NaN, which would yield "rgba(..., NaN)"; map NaN to 0 first.
    const clampedValue = Number.isNaN(normalizedValue)
        ? 0
        : Math.max(0, Math.min(1, normalizedValue));

    const alpha = clampedValue * maxAlpha;
    return `rgba(255, 71, 64, ${alpha})`;
}
|
| 38 |
|
| 39 |
/**
 * Linearly maps a surprisal value onto [0, 1].
 *
 * @param value    Surprisal value to normalize.
 * @param maxValue Value that maps to 1; anything greater than or equal to it also maps to 1.
 * @returns `value / maxValue` for values in [0, maxValue), 0 for negative or NaN input,
 *          and 1 for values at or above `maxValue` (including Infinity).
 */
function normalizeSurprisalValue(value: number, maxValue: number): number {
    // NaN fails both range comparisons below and would otherwise be returned as NaN.
    if (Number.isNaN(value) || value < 0) {
        return 0;
    }
    if (value >= maxValue) {
        return 1;
    }
    return value / maxValue;
}
|
| 54 |
+
|
| 55 |
+
/**
 * Returns the colour for a token-level surprisal value (linear mapping, no rounding).
 *
 * @param surprisal Token surprisal; [0, TOKEN_SURPRISAL_MAX] is mapped linearly onto [0, 1]
 *                  and values outside that range are clamped by the normalizer.
 * @returns Colour string in rgba format.
 */
export function getTokenSurprisalColor(surprisal: number): string {
    return getSurprisalColorNormalized(
        normalizeSurprisalValue(surprisal, TOKEN_SURPRISAL_MAX)
    );
}
|
| 64 |
|
| 65 |
+
/**
 * Returns the colour for a per-byte surprisal density (linear mapping, no rounding).
 *
 * @param byteSurprisal Per-byte surprisal density; after scaling, [0, BYTE_SURPRISAL_MAX] is
 *                      mapped linearly onto [0, 1] and values outside that range are clamped.
 * @param colorFactor   Multiplier applied to `byteSurprisal` before normalization. Per the
 *                      original author's note it exists mainly so the minimap, whose averaged
 *                      densities are small, renders more visibly. Defaults to 1.
 * @returns Colour string in rgba format.
 */
export function getByteSurprisalColor(byteSurprisal: number, colorFactor: number = 1): string {
    const scaledSurprisal = byteSurprisal * colorFactor;
    const intensity = normalizeSurprisalValue(scaledSurprisal, BYTE_SURPRISAL_MAX);
    return getSurprisalColorNormalized(intensity);
}
|
| 75 |
|
| 76 |
// ==========================================
|
client/src/ts/utils/Util.ts
CHANGED
|
@@ -1,4 +1,5 @@
|
|
| 1 |
import * as d3 from "d3";
|
|
|
|
| 2 |
|
| 3 |
/**
|
| 4 |
* Created by hen on 5/15/17.
|
|
@@ -93,18 +94,64 @@ export function calculateSurprisal(probability: number): number {
|
|
| 93 |
* @returns 字符数
|
| 94 |
*/
|
| 95 |
export function countTokenCharacters(tokenText: string): number {
|
| 96 |
-
if (!tokenText) return 1; // 空token按1个字符处理,避免除零
|
| 97 |
// 使用Array.from正确处理Unicode字符(包括中文、emoji等)
|
| 98 |
return Array.from(tokenText).length;
|
| 99 |
}
|
| 100 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
/**
|
| 102 |
-
* 计算单位字
|
| 103 |
* @param surprisal token的总surprisal值
|
| 104 |
* @param tokenText token文本
|
| 105 |
-
* @returns 单位字
|
| 106 |
*/
|
| 107 |
-
|
| 108 |
-
|
| 109 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 110 |
}
|
|
|
|
| 1 |
import * as d3 from "d3";
|
| 2 |
+
import { TokenWithOffset } from "../api/generatedSchemas";
|
| 3 |
|
| 4 |
/**
|
| 5 |
* Created by hen on 5/15/17.
|
|
|
|
| 94 |
* @returns 字符数
|
| 95 |
*/
|
| 96 |
export function countTokenCharacters(tokenText: string): number {
    // String iteration walks code points, so CJK characters and emoji outside the BMP count once each.
    const codePoints = [...tokenText];
    return codePoints.length;
}
|
| 100 |
|
| 101 |
+
// One shared TextEncoder for the whole module, so calls do not allocate a new encoder each time.
const textEncoder = new TextEncoder();

/**
 * Measures how many bytes a string occupies once encoded as UTF-8.
 *
 * @param value String to measure.
 * @returns Number of bytes in the UTF-8 encoding of `value`.
 */
export const getByteLength = (value: string): number => {
    const encoded = textEncoder.encode(value);
    return encoded.length;
};
|
| 112 |
+
|
| 113 |
/**
 * Spreads a token's total surprisal evenly over its UTF-8 bytes.
 *
 * @param surprisal Total surprisal of the token.
 * @param tokenText Text of the token.
 * @returns Surprisal per byte (bits/Byte), or 0 when the token encodes to zero bytes.
 */
function calculateSurprisalPerByte(surprisal: number, tokenText: string): number {
    // The divisor is the UTF-8 encoded length, not the character count.
    const byteCount = getByteLength(tokenText);
    if (byteCount > 0) {
        return surprisal / byteCount;
    }
    // Empty token: avoid dividing by zero.
    return 0;
}
|
| 124 |
+
|
| 125 |
+
/**
 * Computes the information density of a token (single entry point, kept for future extension).
 *
 * @param token Token object; its `real_topk` tuple supplies the probability (second element)
 *              and its `raw` field supplies the token text.
 * @returns Information density in bits/Byte.
 */
export function calculateSurprisalDensity(token: TokenWithOffset): number {
    // Only the probability is needed; the first tuple element is intentionally skipped.
    const [, prob] = token.real_topk;
    const surprisal = calculateSurprisal(prob);
    return calculateSurprisalPerByte(surprisal, token.raw);
}
|
| 136 |
+
|
| 137 |
+
/**
 * Builds a lookup table from string (UTF-16 code unit) indices to UTF-8 byte offsets.
 *
 * @param text Source text.
 * @returns Array of length `text.length + 1`. Entry `i` is the number of UTF-8 bytes that
 *          precede string index `i`; the last entry is the total UTF-8 byte length. The
 *          low-surrogate index of a surrogate pair gets the same offset as its high
 *          surrogate, because both code units belong to one 4-byte character.
 */
export function buildCharToByteIndexMap(text: string): number[] {
    const map: number[] = [];
    let byteOffset = 0;
    let charIndex = 0;

    while (charIndex < text.length) {
        // codePointAt joins a valid surrogate pair into one code point. Encoding each half on
        // its own would turn both halves into U+FFFD (3 bytes each) and overcount to 6 bytes.
        const codePoint = text.codePointAt(charIndex) ?? 0;
        map[charIndex] = byteOffset;

        if (codePoint > 0xffff) {
            // Astral character: two UTF-16 code units, four UTF-8 bytes.
            map[charIndex + 1] = byteOffset;
            byteOffset += 4;
            charIndex += 2;
            continue;
        }

        // A lone surrogate falls in the 3-byte bucket, matching TextEncoder's U+FFFD replacement.
        byteOffset += codePoint <= 0x7f ? 1 : codePoint <= 0x7ff ? 2 : 3;
        charIndex += 1;
    }

    // Sentinel entry: total byte length of the text.
    map[text.length] = byteOffset;

    return map;
}
|
client/src/ts/utils/demoPathUtils.ts
CHANGED
|
@@ -118,7 +118,7 @@ export const getDefaultDemoName = (
|
|
| 118 |
}
|
| 119 |
|
| 120 |
// 否则,使用第一行逻辑
|
| 121 |
-
const rawText = (currentData?.request
|
| 122 |
if (!rawText) {
|
| 123 |
return '新Demo';
|
| 124 |
}
|
|
|
|
| 118 |
}
|
| 119 |
|
| 120 |
// 否则,使用第一行逻辑
|
| 121 |
+
const rawText = (currentData ? currentData.request.text : textFieldValue || '').trim();
|
| 122 |
if (!rawText) {
|
| 123 |
return '新Demo';
|
| 124 |
}
|
client/src/ts/utils/highlightUtils.ts
CHANGED
|
@@ -1,17 +1,21 @@
|
|
| 1 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 2 |
-
import { calculateSurprisal,
|
| 3 |
import { extractRealTopkFromTokens } from './tokenUtils';
|
| 4 |
|
| 5 |
/**
|
| 6 |
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 token surprisal)
|
| 7 |
* @param x0 bin 起始值
|
| 8 |
* @param x1 bin 结束值
|
|
|
|
|
|
|
| 9 |
* @param result 前端分析结果(包含 originalTokens、mergedTokens、originalToMergedMap)
|
| 10 |
* @returns 需要高亮的 merged token 索引集合
|
| 11 |
*/
|
| 12 |
export function calculateTokenSurprisalHighlights(
|
| 13 |
x0: number,
|
| 14 |
x1: number,
|
|
|
|
|
|
|
| 15 |
result: FrontendAnalyzeResult
|
| 16 |
): Set<number> {
|
| 17 |
const highlightedIndices = new Set<number>();
|
|
@@ -20,13 +24,25 @@ export function calculateTokenSurprisalHighlights(
|
|
| 20 |
const originalToMergedMap = result.originalToMergedMap;
|
| 21 |
const mergedTokens = result.mergedTokens;
|
| 22 |
|
| 23 |
-
// 判断是否
|
| 24 |
-
const
|
|
|
|
| 25 |
|
| 26 |
// 遍历原始 token,找到 surprisal 在范围内的 token
|
| 27 |
for (let i = 0; i < originalTokens.length; i++) {
|
| 28 |
-
const surprisal = calculateSurprisal(originalRealTopk[i]
|
| 29 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
|
| 31 |
if (inRange) {
|
| 32 |
// 映射到 merged token 索引
|
|
@@ -41,30 +57,43 @@ export function calculateTokenSurprisalHighlights(
|
|
| 41 |
}
|
| 42 |
|
| 43 |
/**
|
| 44 |
-
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于
|
| 45 |
* @param x0 bin 起始值
|
| 46 |
* @param x1 bin 结束值
|
|
|
|
|
|
|
| 47 |
* @param result 前端分析结果(包含 mergedTokens)
|
| 48 |
* @returns 需要高亮的 merged token 索引集合
|
| 49 |
*/
|
| 50 |
-
export function
|
| 51 |
x0: number,
|
| 52 |
x1: number,
|
|
|
|
|
|
|
| 53 |
result: FrontendAnalyzeResult
|
| 54 |
): Set<number> {
|
| 55 |
const highlightedIndices = new Set<number>();
|
| 56 |
const mergedTokens = result.mergedTokens;
|
| 57 |
-
const mergedRealTopk = extractRealTopkFromTokens(mergedTokens);
|
| 58 |
|
| 59 |
-
// 判断是否
|
| 60 |
-
const
|
|
|
|
| 61 |
|
| 62 |
-
// 遍历 merged token,找到
|
| 63 |
for (let i = 0; i < mergedTokens.length; i++) {
|
| 64 |
-
const
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
|
| 69 |
if (inRange) {
|
| 70 |
highlightedIndices.add(i);
|
|
@@ -77,13 +106,15 @@ export function calculateCharSurprisalHighlights(
|
|
| 77 |
/**
|
| 78 |
* 直方图类型
|
| 79 |
*/
|
| 80 |
-
export type HistogramType = 'token' | '
|
| 81 |
|
| 82 |
/**
|
| 83 |
* 根据直方图类型和 bin 范围计算需要高亮的 token 索引集合
|
| 84 |
-
* @param histogramType 直方图类型('token' 或 '
|
| 85 |
* @param x0 bin 起始值
|
| 86 |
* @param x1 bin 结束值
|
|
|
|
|
|
|
| 87 |
* @param result 前端分析结果
|
| 88 |
* @returns 需要高亮的 merged token 索引集合和对应的高亮样式
|
| 89 |
*/
|
|
@@ -91,16 +122,18 @@ export function calculateHighlights(
|
|
| 91 |
histogramType: HistogramType,
|
| 92 |
x0: number,
|
| 93 |
x1: number,
|
|
|
|
|
|
|
| 94 |
result: FrontendAnalyzeResult
|
| 95 |
): { indices: Set<number>; style: 'border' | 'underline' } {
|
| 96 |
-
if (histogramType === '
|
| 97 |
return {
|
| 98 |
-
indices:
|
| 99 |
style: 'underline'
|
| 100 |
};
|
| 101 |
} else {
|
| 102 |
return {
|
| 103 |
-
indices: calculateTokenSurprisalHighlights(x0, x1, result),
|
| 104 |
style: 'border'
|
| 105 |
};
|
| 106 |
}
|
|
|
|
| 1 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 2 |
+
import { calculateSurprisal, calculateSurprisalDensity } from './Util';
|
| 3 |
import { extractRealTopkFromTokens } from './tokenUtils';
|
| 4 |
|
| 5 |
/**
|
| 6 |
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 token surprisal)
|
| 7 |
* @param x0 bin 起始值
|
| 8 |
* @param x1 bin 结束值
|
| 9 |
+
* @param binIndex bin在bins数组中的索引
|
| 10 |
+
* @param no_bins 直方图的总bin数量
|
| 11 |
* @param result 前端分析结果(包含 originalTokens、mergedTokens、originalToMergedMap)
|
| 12 |
* @returns 需要高亮的 merged token 索引集合
|
| 13 |
*/
|
| 14 |
export function calculateTokenSurprisalHighlights(
|
| 15 |
x0: number,
|
| 16 |
x1: number,
|
| 17 |
+
binIndex: number,
|
| 18 |
+
no_bins: number,
|
| 19 |
result: FrontendAnalyzeResult
|
| 20 |
): Set<number> {
|
| 21 |
const highlightedIndices = new Set<number>();
|
|
|
|
| 24 |
const originalToMergedMap = result.originalToMergedMap;
|
| 25 |
const mergedTokens = result.mergedTokens;
|
| 26 |
|
| 27 |
+
// 使用binIndex判断是否是最两侧的bin
|
| 28 |
+
const isFirstBin = binIndex === 0; // 第一个bin:包含超出下界的值
|
| 29 |
+
const isLastBin = binIndex === no_bins - 1; // 最后一个bin:包含超出上界的值
|
| 30 |
|
| 31 |
// 遍历原始 token,找到 surprisal 在范围内的 token
|
| 32 |
for (let i = 0; i < originalTokens.length; i++) {
|
| 33 |
+
const surprisal = calculateSurprisal(originalRealTopk[i][1]);
|
| 34 |
+
let inRange = false;
|
| 35 |
+
|
| 36 |
+
if (isFirstBin) {
|
| 37 |
+
// 第一个bin:包含所有 < x1 的值(自身bin + 超出下界的数据)
|
| 38 |
+
inRange = surprisal < x1;
|
| 39 |
+
} else if (isLastBin) {
|
| 40 |
+
// 最后一个bin:包含所有 >= x0 的值(自身bin + 超出上界的数据)
|
| 41 |
+
inRange = surprisal >= x0;
|
| 42 |
+
} else {
|
| 43 |
+
// 中间bins:正常范围
|
| 44 |
+
inRange = surprisal >= x0 && surprisal < x1;
|
| 45 |
+
}
|
| 46 |
|
| 47 |
if (inRange) {
|
| 48 |
// 映射到 merged token 索引
|
|
|
|
| 57 |
}
|
| 58 |
|
| 59 |
/**
|
| 60 |
+
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于信息密度)
|
| 61 |
* @param x0 bin 起始值
|
| 62 |
* @param x1 bin 结束值
|
| 63 |
+
* @param binIndex bin在bins数组中的索引
|
| 64 |
+
* @param no_bins 直方图的总bin数量
|
| 65 |
* @param result 前端分析结果(包含 mergedTokens)
|
| 66 |
* @returns 需要高亮的 merged token 索引集合
|
| 67 |
*/
|
| 68 |
+
export function calculateByteSurprisalHighlights(
|
| 69 |
x0: number,
|
| 70 |
x1: number,
|
| 71 |
+
binIndex: number,
|
| 72 |
+
no_bins: number,
|
| 73 |
result: FrontendAnalyzeResult
|
| 74 |
): Set<number> {
|
| 75 |
const highlightedIndices = new Set<number>();
|
| 76 |
const mergedTokens = result.mergedTokens;
|
|
|
|
| 77 |
|
| 78 |
+
// 使用binIndex判断是否是最两侧的bin
|
| 79 |
+
const isFirstBin = binIndex === 0; // 第一个bin:包含超出下界的值
|
| 80 |
+
const isLastBin = binIndex === no_bins - 1; // 最后一个bin:包含超出上界的值
|
| 81 |
|
| 82 |
+
// 遍历 merged token,找到信息密度在范围内的 token
|
| 83 |
for (let i = 0; i < mergedTokens.length; i++) {
|
| 84 |
+
const informationDensity = calculateSurprisalDensity(mergedTokens[i]);
|
| 85 |
+
let inRange = false;
|
| 86 |
+
|
| 87 |
+
if (isFirstBin) {
|
| 88 |
+
// 第一个bin:包含所有 < x1 的值(自身bin + 超出下界的数据)
|
| 89 |
+
inRange = informationDensity < x1;
|
| 90 |
+
} else if (isLastBin) {
|
| 91 |
+
// 最后一个bin:包含所有 >= x0 的值(自身bin + 超出上界的数据)
|
| 92 |
+
inRange = informationDensity >= x0;
|
| 93 |
+
} else {
|
| 94 |
+
// 中间bins:正常范围
|
| 95 |
+
inRange = informationDensity >= x0 && informationDensity < x1;
|
| 96 |
+
}
|
| 97 |
|
| 98 |
if (inRange) {
|
| 99 |
highlightedIndices.add(i);
|
|
|
|
| 106 |
/**
|
| 107 |
* 直方图类型
|
| 108 |
*/
|
| 109 |
+
export type HistogramType = 'token' | 'byte';
|
| 110 |
|
| 111 |
/**
|
| 112 |
* 根据直方图类型和 bin 范围计算需要高亮的 token 索引集合
|
| 113 |
+
* @param histogramType 直方图类型('token' 或 'byte')
|
| 114 |
* @param x0 bin 起始值
|
| 115 |
* @param x1 bin 结束值
|
| 116 |
+
* @param binIndex bin在bins数组中的索引
|
| 117 |
+
* @param no_bins 直方图的总bin数量
|
| 118 |
* @param result 前端分析结果
|
| 119 |
* @returns 需要高亮的 merged token 索引集合和对应的高亮样式
|
| 120 |
*/
|
|
|
|
| 122 |
histogramType: HistogramType,
|
| 123 |
x0: number,
|
| 124 |
x1: number,
|
| 125 |
+
binIndex: number,
|
| 126 |
+
no_bins: number,
|
| 127 |
result: FrontendAnalyzeResult
|
| 128 |
): { indices: Set<number>; style: 'border' | 'underline' } {
|
| 129 |
+
if (histogramType === 'byte') {
|
| 130 |
return {
|
| 131 |
+
indices: calculateByteSurprisalHighlights(x0, x1, binIndex, no_bins, result),
|
| 132 |
style: 'underline'
|
| 133 |
};
|
| 134 |
} else {
|
| 135 |
return {
|
| 136 |
+
indices: calculateTokenSurprisalHighlights(x0, x1, binIndex, no_bins, result),
|
| 137 |
style: 'border'
|
| 138 |
};
|
| 139 |
}
|
client/src/ts/utils/localFileUtils.ts
CHANGED
|
@@ -76,7 +76,7 @@ export function validateDemoFormat(data: any): data is AnalysisData {
|
|
| 76 |
}
|
| 77 |
|
| 78 |
// 7. 验证token数据的一致性(offset和raw是否匹配)
|
| 79 |
-
const text = data.request
|
| 80 |
if (text) {
|
| 81 |
const consistencyError = validateTokenConsistency(
|
| 82 |
data.result.bpe_strings as Array<{ offset?: [number, number]; raw?: string }>,
|
|
|
|
| 76 |
}
|
| 77 |
|
| 78 |
// 7. 验证token数据的一致性(offset和raw是否匹配)
|
| 79 |
+
const text = data.request.text;
|
| 80 |
if (text) {
|
| 81 |
const consistencyError = validateTokenConsistency(
|
| 82 |
data.result.bpe_strings as Array<{ offset?: [number, number]; raw?: string }>,
|
client/src/ts/utils/textStatistics.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 2 |
-
import { calculateSurprisal,
|
| 3 |
import { extractRealTopkFromTokens } from './tokenUtils';
|
| 4 |
|
| 5 |
export type TextStats = {
|
|
@@ -7,9 +7,9 @@ export type TextStats = {
|
|
| 7 |
charCount: number;
|
| 8 |
tokenCount: number;
|
| 9 |
tokenSurprisals: number[];
|
| 10 |
-
|
| 11 |
tokenAverage: number | null;
|
| 12 |
-
|
| 13 |
totalSurprisal: number | null;
|
| 14 |
};
|
| 15 |
|
|
@@ -25,7 +25,7 @@ export type DiffStats = {
|
|
| 25 |
tokenAverage: number | null;
|
| 26 |
// 差分字段
|
| 27 |
deltaTotalSurprisal: number | null; // Δ总surprisal
|
| 28 |
-
|
| 29 |
};
|
| 30 |
|
| 31 |
/**
|
|
@@ -43,27 +43,12 @@ export const computeAverage = (values: number[] | null | undefined): number | nu
|
|
| 43 |
return sum / validValues.length;
|
| 44 |
};
|
| 45 |
|
| 46 |
-
/**
|
| 47 |
-
* 获取字节长度
|
| 48 |
-
*/
|
| 49 |
-
export const getByteLength = (value: string, textEncoder: TextEncoder | null): number => {
|
| 50 |
-
if (textEncoder) {
|
| 51 |
-
return textEncoder.encode(value).length;
|
| 52 |
-
}
|
| 53 |
-
try {
|
| 54 |
-
return new Blob([value]).size;
|
| 55 |
-
} catch {
|
| 56 |
-
return value.length;
|
| 57 |
-
}
|
| 58 |
-
};
|
| 59 |
-
|
| 60 |
/**
|
| 61 |
* 计算文本统计信息
|
| 62 |
*/
|
| 63 |
export const calculateTextStats = (
|
| 64 |
result: FrontendAnalyzeResult,
|
| 65 |
-
originalText: string
|
| 66 |
-
textEncoder: TextEncoder | null
|
| 67 |
): TextStats => {
|
| 68 |
const originalTokens = result.originalTokens;
|
| 69 |
const mergedTokens = result.mergedTokens;
|
|
@@ -75,24 +60,24 @@ export const calculateTextStats = (
|
|
| 75 |
let truncatedTextLength = 0;
|
| 76 |
if (originalTokens.length > 0) {
|
| 77 |
const lastToken = originalTokens[originalTokens.length - 1];
|
| 78 |
-
truncatedTextLength = lastToken
|
| 79 |
}
|
| 80 |
|
| 81 |
// 从原始文本中截取实际分析的文本部分
|
| 82 |
const truncatedText = originalText.slice(0, truncatedTextLength);
|
| 83 |
-
const safeText = truncatedText
|
| 84 |
|
| 85 |
-
const byteCount = getByteLength(safeText
|
| 86 |
-
const charCount =
|
| 87 |
const tokenCount = originalTokens.length;
|
| 88 |
|
| 89 |
const tokenSurprisals: number[] = [];
|
| 90 |
-
const
|
| 91 |
let totalSurprisal = 0;
|
| 92 |
let hasValidTotal = false;
|
| 93 |
|
| 94 |
originalTokens.forEach((token, index) => {
|
| 95 |
-
const prob = realTopkOriginal[index]
|
| 96 |
const surprisal = calculateSurprisal(prob);
|
| 97 |
tokenSurprisals.push(surprisal);
|
| 98 |
if (Number.isFinite(surprisal)) {
|
|
@@ -101,14 +86,15 @@ export const calculateTextStats = (
|
|
| 101 |
}
|
| 102 |
});
|
| 103 |
|
| 104 |
-
mergedTokens.forEach((token
|
| 105 |
-
const
|
| 106 |
-
const
|
| 107 |
-
const
|
| 108 |
-
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
|
|
|
| 112 |
}
|
| 113 |
});
|
| 114 |
|
|
@@ -117,9 +103,9 @@ export const calculateTextStats = (
|
|
| 117 |
charCount,
|
| 118 |
tokenCount,
|
| 119 |
tokenSurprisals,
|
| 120 |
-
|
| 121 |
tokenAverage: computeAverage(tokenSurprisals),
|
| 122 |
-
|
| 123 |
totalSurprisal: hasValidTotal ? totalSurprisal : null
|
| 124 |
};
|
| 125 |
};
|
|
@@ -139,13 +125,13 @@ export const calculateDiffStats = (
|
|
| 139 |
? diffStats.totalSurprisal - baseStats.totalSurprisal
|
| 140 |
: null;
|
| 141 |
|
| 142 |
-
// 计算逐字的Δ(
|
| 143 |
-
const
|
| 144 |
-
const minLength = Math.min(diffStats.
|
| 145 |
|
| 146 |
for (let i = 0; i < minLength; i++) {
|
| 147 |
-
const delta = diffStats.
|
| 148 |
-
|
| 149 |
}
|
| 150 |
|
| 151 |
return {
|
|
@@ -155,7 +141,7 @@ export const calculateDiffStats = (
|
|
| 155 |
tokenSurprisals: diffStats.tokenSurprisals,
|
| 156 |
tokenAverage: diffStats.tokenAverage,
|
| 157 |
deltaTotalSurprisal,
|
| 158 |
-
|
| 159 |
};
|
| 160 |
};
|
| 161 |
|
|
|
|
| 1 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 2 |
+
import { calculateSurprisal, calculateSurprisalDensity, countTokenCharacters, getByteLength } from './Util';
|
| 3 |
import { extractRealTopkFromTokens } from './tokenUtils';
|
| 4 |
|
| 5 |
export type TextStats = {
|
|
|
|
| 7 |
charCount: number;
|
| 8 |
tokenCount: number;
|
| 9 |
tokenSurprisals: number[];
|
| 10 |
+
byteSurprisals: number[];
|
| 11 |
tokenAverage: number | null;
|
| 12 |
+
byteAverage: number | null;
|
| 13 |
totalSurprisal: number | null;
|
| 14 |
};
|
| 15 |
|
|
|
|
| 25 |
tokenAverage: number | null;
|
| 26 |
// 差分字段
|
| 27 |
deltaTotalSurprisal: number | null; // Δ总surprisal
|
| 28 |
+
deltaByteSurprisals: number[]; // 逐字节的Δ信息密度(bits/Byte)
|
| 29 |
};
|
| 30 |
|
| 31 |
/**
|
|
|
|
| 43 |
return sum / validValues.length;
|
| 44 |
};
|
| 45 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
/**
|
| 47 |
* 计算文本统计信息
|
| 48 |
*/
|
| 49 |
export const calculateTextStats = (
|
| 50 |
result: FrontendAnalyzeResult,
|
| 51 |
+
originalText: string
|
|
|
|
| 52 |
): TextStats => {
|
| 53 |
const originalTokens = result.originalTokens;
|
| 54 |
const mergedTokens = result.mergedTokens;
|
|
|
|
| 60 |
let truncatedTextLength = 0;
|
| 61 |
if (originalTokens.length > 0) {
|
| 62 |
const lastToken = originalTokens[originalTokens.length - 1];
|
| 63 |
+
truncatedTextLength = lastToken.offset[1];
|
| 64 |
}
|
| 65 |
|
| 66 |
// 从原始文本中截取实际分析的文本部分
|
| 67 |
const truncatedText = originalText.slice(0, truncatedTextLength);
|
| 68 |
+
const safeText = truncatedText;
|
| 69 |
|
| 70 |
+
const byteCount = getByteLength(safeText);
|
| 71 |
+
const charCount = countTokenCharacters(safeText);
|
| 72 |
const tokenCount = originalTokens.length;
|
| 73 |
|
| 74 |
const tokenSurprisals: number[] = [];
|
| 75 |
+
const byteSurprisals: number[] = [];
|
| 76 |
let totalSurprisal = 0;
|
| 77 |
let hasValidTotal = false;
|
| 78 |
|
| 79 |
originalTokens.forEach((token, index) => {
|
| 80 |
+
const prob = realTopkOriginal[index][1];
|
| 81 |
const surprisal = calculateSurprisal(prob);
|
| 82 |
tokenSurprisals.push(surprisal);
|
| 83 |
if (Number.isFinite(surprisal)) {
|
|
|
|
| 86 |
}
|
| 87 |
});
|
| 88 |
|
| 89 |
+
mergedTokens.forEach((token) => {
|
| 90 |
+
const tokenText = token.raw;
|
| 91 |
+
const byteCountForToken = getByteLength(tokenText);
|
| 92 |
+
const byteSurprisal = calculateSurprisalDensity(token);
|
| 93 |
+
// 为token的每个字节添加相同的byteSurprisal值
|
| 94 |
+
// 注意:虽然可以使用Array.fill优化,但考虑到token的字节数通常很少(平均几个字节),
|
| 95 |
+
// 使用简单的循环更直观,性能差异可忽略不计
|
| 96 |
+
for (let i = 0; i < byteCountForToken; i++) {
|
| 97 |
+
byteSurprisals.push(byteSurprisal);
|
| 98 |
}
|
| 99 |
});
|
| 100 |
|
|
|
|
| 103 |
charCount,
|
| 104 |
tokenCount,
|
| 105 |
tokenSurprisals,
|
| 106 |
+
byteSurprisals,
|
| 107 |
tokenAverage: computeAverage(tokenSurprisals),
|
| 108 |
+
byteAverage: computeAverage(byteSurprisals),
|
| 109 |
totalSurprisal: hasValidTotal ? totalSurprisal : null
|
| 110 |
};
|
| 111 |
};
|
|
|
|
| 125 |
? diffStats.totalSurprisal - baseStats.totalSurprisal
|
| 126 |
: null;
|
| 127 |
|
| 128 |
+
// 计算逐字节的Δ信息密度(bits/Byte)
|
| 129 |
+
const deltaByteSurprisals: number[] = [];
|
| 130 |
+
const minLength = Math.min(diffStats.byteSurprisals.length, baseStats.byteSurprisals.length);
|
| 131 |
|
| 132 |
for (let i = 0; i < minLength; i++) {
|
| 133 |
+
const delta = diffStats.byteSurprisals[i] - baseStats.byteSurprisals[i];
|
| 134 |
+
deltaByteSurprisals.push(delta);
|
| 135 |
}
|
| 136 |
|
| 137 |
return {
|
|
|
|
| 141 |
tokenSurprisals: diffStats.tokenSurprisals,
|
| 142 |
tokenAverage: diffStats.tokenAverage,
|
| 143 |
deltaTotalSurprisal,
|
| 144 |
+
deltaByteSurprisals
|
| 145 |
};
|
| 146 |
};
|
| 147 |
|
client/src/ts/utils/tokenUtils.ts
CHANGED
|
@@ -39,7 +39,7 @@ export const clonePredTopk = (list: [string, number][] | null | undefined): [str
|
|
| 39 |
export const cloneFrontendToken = (token: FrontendToken, options: CloneTokenOptions = {}): FrontendToken => {
|
| 40 |
const cloned: FrontendToken = {
|
| 41 |
offset: [token.offset[0], token.offset[1]],
|
| 42 |
-
raw: token.raw
|
| 43 |
real_topk: cloneRealTopk(token.real_topk),
|
| 44 |
pred_topk: clonePredTopk(token.pred_topk)
|
| 45 |
};
|
|
@@ -129,11 +129,8 @@ export const extractRealTopkFromTokens = (tokens: FrontendToken[] | null | undef
|
|
| 129 |
return [];
|
| 130 |
}
|
| 131 |
return tokens.map((token) => {
|
| 132 |
-
const tuple = token
|
| 133 |
-
|
| 134 |
-
return [tuple[0], tuple[1]];
|
| 135 |
-
}
|
| 136 |
-
return [0, 0];
|
| 137 |
});
|
| 138 |
};
|
| 139 |
|
|
@@ -142,7 +139,7 @@ export const extractRealTopkFromTokens = (tokens: FrontendToken[] | null | undef
|
|
| 142 |
*/
|
| 143 |
export const createRawSnapshot = (response: AnalyzeResponse): AnalyzeResponse => {
|
| 144 |
const requestClone: AnalyzeResponse['request'] = {
|
| 145 |
-
text: response.request
|
| 146 |
};
|
| 147 |
const originalResult = response.result;
|
| 148 |
const tokensForSave = originalResult.bpe_strings.map((token) =>
|
|
@@ -150,7 +147,7 @@ export const createRawSnapshot = (response: AnalyzeResponse): AnalyzeResponse =>
|
|
| 150 |
);
|
| 151 |
// 确保 model 字段在最前面
|
| 152 |
const resultClone: AnalyzeResponse['result'] = {
|
| 153 |
-
model: originalResult.model
|
| 154 |
...originalResult,
|
| 155 |
bpe_strings: tokensForSave
|
| 156 |
};
|
|
|
|
| 39 |
export const cloneFrontendToken = (token: FrontendToken, options: CloneTokenOptions = {}): FrontendToken => {
|
| 40 |
const cloned: FrontendToken = {
|
| 41 |
offset: [token.offset[0], token.offset[1]],
|
| 42 |
+
raw: token.raw,
|
| 43 |
real_topk: cloneRealTopk(token.real_topk),
|
| 44 |
pred_topk: clonePredTopk(token.pred_topk)
|
| 45 |
};
|
|
|
|
| 129 |
return [];
|
| 130 |
}
|
| 131 |
return tokens.map((token) => {
|
| 132 |
+
const tuple = token.real_topk;
|
| 133 |
+
return [tuple[0], tuple[1]];
|
|
|
|
|
|
|
|
|
|
| 134 |
});
|
| 135 |
};
|
| 136 |
|
|
|
|
| 139 |
*/
|
| 140 |
export const createRawSnapshot = (response: AnalyzeResponse): AnalyzeResponse => {
|
| 141 |
const requestClone: AnalyzeResponse['request'] = {
|
| 142 |
+
text: response.request.text
|
| 143 |
};
|
| 144 |
const originalResult = response.result;
|
| 145 |
const tokensForSave = originalResult.bpe_strings.map((token) =>
|
|
|
|
| 147 |
);
|
| 148 |
// 确保 model 字段在最前面
|
| 149 |
const resultClone: AnalyzeResponse['result'] = {
|
| 150 |
+
model: originalResult.model,
|
| 151 |
...originalResult,
|
| 152 |
bpe_strings: tokensForSave
|
| 153 |
};
|
client/src/ts/utils/visualizationConfigs.ts
ADDED
|
@@ -0,0 +1,52 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
 * Base (appearance-only) configuration shared by histograms.
 */
export interface HistogramBaseConfig {
    label: string;
    no_bins: number;
    extent: [number, number];
    averageLabel: string;
    showLeftInfinity?: boolean;
    xAxisTickSkip?: number;
}

/**
 * Base (appearance-only) configuration shared by scatter plots.
 */
export interface ScatterPlotBaseConfig {
    xLabel: string;
    yLabel: string;
}

// Shared appearance of the token surprisal histogram (no data, no colours).
export const TOKEN_SURPRISAL_HISTOGRAM_BASE: HistogramBaseConfig = {
    label: "token surprisal histogram",
    no_bins: 19,
    extent: [0, 19],
    averageLabel: "bits/token",
};

// Shared appearance of the byte surprisal histogram (raw information density).
export const BYTE_SURPRISAL_HISTOGRAM_BASE: HistogramBaseConfig = {
    label: "byte surprisal histogram",
    no_bins: 13,
    extent: [0, 6.5],
    averageLabel: "bits/Byte",
};

// Shared appearance of the Δbyte surprisal histogram (differential information density).
export const DELTA_BYTE_SURPRISAL_HISTOGRAM_BASE: HistogramBaseConfig = {
    label: "Δbyte surprisal histogram",
    no_bins: 20,
    xAxisTickSkip: 1, // drawing interval of the x-axis tick labels
    extent: [-5, 5],
    averageLabel: "Δ bits/Byte",
    showLeftInfinity: true, // show a -∞ symbol on the left side
};

// Shared appearance of the surprisal progress scatter plot (no data).
// Declared like the histogram constants: with an explicit type annotation an `as const`
// assertion does not change the variable's type, so it is omitted.
export const SURPRISAL_PROGRESS_BASE: ScatterPlotBaseConfig = {
    xLabel: "token index",
    yLabel: "surprisal (bits)",
};
|
| 52 |
+
|
client/src/ts/utils/visualizationUpdater.ts
CHANGED
|
@@ -25,6 +25,7 @@ import {
|
|
| 25 |
calculateTextStats,
|
| 26 |
type TextStats
|
| 27 |
} from './textStatistics';
|
|
|
|
| 28 |
import { showAlertDialog } from '../ui/dialog';
|
| 29 |
|
| 30 |
/**
|
|
@@ -37,7 +38,6 @@ export interface VisualizationDependencies {
|
|
| 37 |
stats_frac: Histogram;
|
| 38 |
stats_surprisal_progress: ScatterPlot;
|
| 39 |
appStateManager: AppStateManager;
|
| 40 |
-
textEncoder: TextEncoder;
|
| 41 |
surprisalColorScale: d3.ScaleSequential<string>;
|
| 42 |
}
|
| 43 |
|
|
@@ -120,21 +120,24 @@ export class VisualizationUpdater {
|
|
| 120 |
const currentTokenAvg = this.currentState.currentTokenAvg;
|
| 121 |
|
| 122 |
if (currentSurprisals) {
|
|
|
|
| 123 |
this.deps.stats_frac.update({
|
|
|
|
| 124 |
data: currentSurprisals,
|
| 125 |
-
label: "surprisal",
|
| 126 |
-
no_bins: 20,
|
| 127 |
-
extent: [0, 20],
|
| 128 |
colorScale: this.deps.surprisalColorScale,
|
| 129 |
averageValue: currentTokenAvg ?? undefined,
|
| 130 |
-
averageLabel: 'bits/token'
|
| 131 |
});
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
}
|
| 133 |
if (currentSurprisals && currentSurprisals.length > 0) {
|
| 134 |
this.deps.stats_surprisal_progress.update({
|
|
|
|
| 135 |
data: currentSurprisals,
|
| 136 |
-
xLabel: 'token index',
|
| 137 |
-
yLabel: 'surprisal (bits)'
|
| 138 |
});
|
| 139 |
}
|
| 140 |
}
|
|
@@ -204,7 +207,7 @@ export class VisualizationUpdater {
|
|
| 204 |
return;
|
| 205 |
}
|
| 206 |
|
| 207 |
-
const safeText = data.request
|
| 208 |
const validationError = validateTokenConsistency(result.bpe_strings, safeText, { allowOverlap: true });
|
| 209 |
if (validationError) {
|
| 210 |
abortDueToInvalidResponse(validationError);
|
|
@@ -245,7 +248,7 @@ export class VisualizationUpdater {
|
|
| 245 |
// 只调用 lmf.update,不调用任何统计更新
|
| 246 |
this.deps.lmf.update(enhancedResult);
|
| 247 |
|
| 248 |
-
const textStats = calculateTextStats(enhancedResult, safeText
|
| 249 |
|
| 250 |
// 保存当前surprisal数据,用于主题切换时重新渲染
|
| 251 |
this.currentState.currentSurprisals = textStats.tokenSurprisals;
|
|
@@ -253,14 +256,14 @@ export class VisualizationUpdater {
|
|
| 253 |
this.currentState.currentTotalSurprisal = textStats.totalSurprisal;
|
| 254 |
|
| 255 |
// 更新文本指标和模型显示(从分析结果中获取实际使用的模型)
|
| 256 |
-
const resultModel = data.result
|
| 257 |
this.updateTextMetrics(textStats, resultModel);
|
| 258 |
|
| 259 |
// Analyze 渲染完成后关闭动画,避免拖拽等二次渲染再次播放
|
| 260 |
if (!disableAnimation) {
|
| 261 |
// 延迟关闭,确保动画有足够时间完成
|
| 262 |
// 动画时长估算:初始延迟100ms + 批次处理时间(根据token数量)
|
| 263 |
-
const tokenCount = enhancedResult.bpe_strings
|
| 264 |
const estimatedAnimationTime = 100 + Math.ceil(tokenCount / 50) * 100;
|
| 265 |
const delayTime = Math.max(2000, estimatedAnimationTime + 500);
|
| 266 |
|
|
|
|
| 25 |
calculateTextStats,
|
| 26 |
type TextStats
|
| 27 |
} from './textStatistics';
|
| 28 |
+
import { TOKEN_SURPRISAL_HISTOGRAM_BASE, SURPRISAL_PROGRESS_BASE } from "./visualizationConfigs";
|
| 29 |
import { showAlertDialog } from '../ui/dialog';
|
| 30 |
|
| 31 |
/**
|
|
|
|
| 38 |
stats_frac: Histogram;
|
| 39 |
stats_surprisal_progress: ScatterPlot;
|
| 40 |
appStateManager: AppStateManager;
|
|
|
|
| 41 |
surprisalColorScale: d3.ScaleSequential<string>;
|
| 42 |
}
|
| 43 |
|
|
|
|
| 120 |
const currentTokenAvg = this.currentState.currentTokenAvg;
|
| 121 |
|
| 122 |
if (currentSurprisals) {
|
| 123 |
+
// Token surprisal histogram: 使用 19 个台阶,对应区间:[0,1), [1,2), ..., [17,18), [18,∞)
|
| 124 |
this.deps.stats_frac.update({
|
| 125 |
+
...TOKEN_SURPRISAL_HISTOGRAM_BASE,
|
| 126 |
data: currentSurprisals,
|
|
|
|
|
|
|
|
|
|
| 127 |
colorScale: this.deps.surprisalColorScale,
|
| 128 |
averageValue: currentTokenAvg ?? undefined,
|
|
|
|
| 129 |
});
|
| 130 |
+
|
| 131 |
+
// 更新主视图中 token surprisal histogram 的标题文本
|
| 132 |
+
const titleElement = document.getElementById('token_histogram_title');
|
| 133 |
+
if (titleElement) {
|
| 134 |
+
titleElement.textContent = TOKEN_SURPRISAL_HISTOGRAM_BASE.label;
|
| 135 |
+
}
|
| 136 |
}
|
| 137 |
if (currentSurprisals && currentSurprisals.length > 0) {
|
| 138 |
this.deps.stats_surprisal_progress.update({
|
| 139 |
+
...SURPRISAL_PROGRESS_BASE,
|
| 140 |
data: currentSurprisals,
|
|
|
|
|
|
|
| 141 |
});
|
| 142 |
}
|
| 143 |
}
|
|
|
|
| 207 |
return;
|
| 208 |
}
|
| 209 |
|
| 210 |
+
const safeText = data.request.text;
|
| 211 |
const validationError = validateTokenConsistency(result.bpe_strings, safeText, { allowOverlap: true });
|
| 212 |
if (validationError) {
|
| 213 |
abortDueToInvalidResponse(validationError);
|
|
|
|
| 248 |
// 只调用 lmf.update,不调用任何统计更新
|
| 249 |
this.deps.lmf.update(enhancedResult);
|
| 250 |
|
| 251 |
+
const textStats = calculateTextStats(enhancedResult, safeText);
|
| 252 |
|
| 253 |
// 保存当前surprisal数据,用于主题切换时重新渲染
|
| 254 |
this.currentState.currentSurprisals = textStats.tokenSurprisals;
|
|
|
|
| 256 |
this.currentState.currentTotalSurprisal = textStats.totalSurprisal;
|
| 257 |
|
| 258 |
// 更新文本指标和模型显示(从分析结果中获取实际使用的模型)
|
| 259 |
+
const resultModel = data.result.model;
|
| 260 |
this.updateTextMetrics(textStats, resultModel);
|
| 261 |
|
| 262 |
// Analyze 渲染完成后关闭动画,避免拖拽等二次渲染再次播放
|
| 263 |
if (!disableAnimation) {
|
| 264 |
// 延迟关闭,确保动画有足够时间完成
|
| 265 |
// 动画时长估算:初始延迟100ms + 批次处理时间(根据token数量)
|
| 266 |
+
const tokenCount = enhancedResult.bpe_strings.length;
|
| 267 |
const estimatedAnimationTime = 100 + Math.ceil(tokenCount / 50) * 100;
|
| 268 |
const delayTime = Math.max(2000, estimatedAnimationTime + 500);
|
| 269 |
|
client/src/ts/vis/GLTR_Text_Box.ts
CHANGED
|
@@ -1,10 +1,9 @@
|
|
| 1 |
import {VComponent} from "./VisComponent";
|
| 2 |
import {FrontendAnalyzeResult} from "../api/GLTR_API";
|
| 3 |
-
import {D3Sel, calculateSurprisal,
|
| 4 |
import {SimpleEventHandler} from "../utils/SimpleEventHandler";
|
| 5 |
import * as d3 from "d3";
|
| 6 |
import {RenderAnimator, TokenRenderTask} from "./RenderAnimator";
|
| 7 |
-
import {getSurprisalColor} from "../utils/SurprisalColorConfig";
|
| 8 |
import {HighlightManager} from "./HighlightManager";
|
| 9 |
import {SvgOverlayManager} from "./SvgOverlayManager";
|
| 10 |
import {TokenPositionCalculator} from "./TokenPositionCalculator";
|
|
@@ -42,11 +41,7 @@ export enum GLTR_Mode {
|
|
| 42 |
}
|
| 43 |
|
| 44 |
export type GLTR_RenderItem = {
|
| 45 |
-
|
| 46 |
-
prop: number;
|
| 47 |
-
others: [string, number][];
|
| 48 |
-
token: string;
|
| 49 |
-
bpeMerged?: boolean;
|
| 50 |
};
|
| 51 |
export type GLTR_HoverEvent = { hovered: boolean, d: GLTR_RenderItem, event?: MouseEvent }
|
| 52 |
|
|
@@ -57,7 +52,8 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 57 |
highlightStyle: 'border' as 'border' | 'underline', // 当前高亮样式
|
| 58 |
// 差分渲染相关
|
| 59 |
diffMode: false, // 是否启用差分渲染模式
|
| 60 |
-
|
|
|
|
| 61 |
};
|
| 62 |
protected css_name = "LMF";
|
| 63 |
protected options = {
|
|
@@ -172,6 +168,12 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 172 |
// 保存当前渲染数据
|
| 173 |
this.currentRenderData = rd;
|
| 174 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 175 |
// 隐藏加载状态
|
| 176 |
this.hideLoading();
|
| 177 |
|
|
@@ -221,7 +223,8 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 221 |
getTokenRealTopk: (rd, tokenIndex) => this.getTokenRealTopk(rd, tokenIndex),
|
| 222 |
addTokenEventListeners: (element, tokenIndex, rd) => this.addTokenEventListeners(element, tokenIndex, rd),
|
| 223 |
diffMode: this._current.diffMode,
|
| 224 |
-
|
|
|
|
| 225 |
});
|
| 226 |
} else {
|
| 227 |
// 更新差分模式和数据
|
|
@@ -229,7 +232,8 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 229 |
getTokenRealTopk: (rd, tokenIndex) => this.getTokenRealTopk(rd, tokenIndex),
|
| 230 |
addTokenEventListeners: (element, tokenIndex, rd) => this.addTokenEventListeners(element, tokenIndex, rd),
|
| 231 |
diffMode: this._current.diffMode,
|
| 232 |
-
|
|
|
|
| 233 |
});
|
| 234 |
}
|
| 235 |
|
|
@@ -333,7 +337,7 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 333 |
}
|
| 334 |
|
| 335 |
// originalText 始终由前端注入,直接使用(更接近用户输入)
|
| 336 |
-
const fullText = rd.originalText
|
| 337 |
|
| 338 |
// 创建一个文本容器div,确保文本在SVG上方
|
| 339 |
if (fullText) {
|
|
@@ -598,8 +602,8 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 598 |
* 获取指定token的真实概率信息
|
| 599 |
*/
|
| 600 |
private getTokenRealTopk(rd: FrontendAnalyzeResult, tokenIndex: number): [number, number] | undefined {
|
| 601 |
-
const token = rd
|
| 602 |
-
return token
|
| 603 |
? token.real_topk as [number, number]
|
| 604 |
: undefined;
|
| 605 |
}
|
|
@@ -610,23 +614,13 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 610 |
*/
|
| 611 |
private addTokenEventListeners(element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult): void {
|
| 612 |
const tokenData = rd.bpe_strings[tokenIndex];
|
| 613 |
-
|
| 614 |
-
|
| 615 |
-
? tokenData.pred_topk as [string, number][]
|
| 616 |
-
: [];
|
| 617 |
-
const isMerged = tokenData?.bpe_merged === true;
|
| 618 |
-
const tokenText = tokenData?.raw || '';
|
| 619 |
-
|
| 620 |
-
// 创建事件处理函数
|
| 621 |
const handleMouseEnter = (event: MouseEvent) => {
|
| 622 |
this.eventHandler.trigger(GLTR_Text_Box.events.tokenHovered, <GLTR_HoverEvent>{
|
| 623 |
hovered: true,
|
| 624 |
d: {
|
| 625 |
-
|
| 626 |
-
top: topK?.[0] || 0,
|
| 627 |
-
prop: topK?.[1] || 0,
|
| 628 |
-
others: tokenPredTopK,
|
| 629 |
-
bpeMerged: isMerged
|
| 630 |
},
|
| 631 |
event: event
|
| 632 |
});
|
|
@@ -643,11 +637,7 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 643 |
this.eventHandler.trigger(GLTR_Text_Box.events.tokenHovered, <GLTR_HoverEvent>{
|
| 644 |
hovered: false,
|
| 645 |
d: {
|
| 646 |
-
|
| 647 |
-
top: topK?.[0] || 0,
|
| 648 |
-
prop: topK?.[1] || 0,
|
| 649 |
-
others: tokenPredTopK,
|
| 650 |
-
bpeMerged: isMerged
|
| 651 |
},
|
| 652 |
event: event
|
| 653 |
});
|
|
@@ -771,9 +761,9 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 771 |
}
|
| 772 |
|
| 773 |
protected _wrangle(data: FrontendAnalyzeResult) {
|
| 774 |
-
const tokens =
|
| 775 |
const allTop1 = tokens
|
| 776 |
-
.map(token =>
|
| 777 |
.filter((value): value is number => typeof value === 'number' && Number.isFinite(value));
|
| 778 |
|
| 779 |
if (allTop1.length === 0) {
|
|
@@ -862,14 +852,19 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 862 |
/**
|
| 863 |
* 设置差分渲染模式和数据
|
| 864 |
* @param enabled 是否启用差分模式
|
| 865 |
-
* @param
|
| 866 |
*/
|
| 867 |
-
setDiffMode(enabled: boolean,
|
| 868 |
this._current.diffMode = enabled;
|
| 869 |
-
this._current.
|
| 870 |
|
| 871 |
-
// 如果有当前渲染数据,重新渲染
|
| 872 |
if (this.currentRenderData) {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 873 |
// 差分模式切换时禁用动画
|
| 874 |
const originalAnimationSetting = this.options.enableRenderAnimation;
|
| 875 |
this.options.enableRenderAnimation = false;
|
|
|
|
| 1 |
import {VComponent} from "./VisComponent";
|
| 2 |
import {FrontendAnalyzeResult} from "../api/GLTR_API";
|
| 3 |
+
import {D3Sel, calculateSurprisal, calculateSurprisalDensity, buildCharToByteIndexMap} from "../utils/Util";
|
| 4 |
import {SimpleEventHandler} from "../utils/SimpleEventHandler";
|
| 5 |
import * as d3 from "d3";
|
| 6 |
import {RenderAnimator, TokenRenderTask} from "./RenderAnimator";
|
|
|
|
| 7 |
import {HighlightManager} from "./HighlightManager";
|
| 8 |
import {SvgOverlayManager} from "./SvgOverlayManager";
|
| 9 |
import {TokenPositionCalculator} from "./TokenPositionCalculator";
|
|
|
|
| 41 |
}
|
| 42 |
|
| 43 |
export type GLTR_RenderItem = {
|
| 44 |
+
tokenData: import('../api/GLTR_API').FrontendToken; // 完整的token对象,包含所有信息
|
|
|
|
|
|
|
|
|
|
|
|
|
| 45 |
};
|
| 46 |
export type GLTR_HoverEvent = { hovered: boolean, d: GLTR_RenderItem, event?: MouseEvent }
|
| 47 |
|
|
|
|
| 52 |
highlightStyle: 'border' as 'border' | 'underline', // 当前高亮样式
|
| 53 |
// 差分渲染相关
|
| 54 |
diffMode: false, // 是否启用差分渲染模式
|
| 55 |
+
deltaByteSurprisals: [] as number[], // 逐字节的Δ信息密度(bits/Byte)
|
| 56 |
+
charToByteIndexMap: [] as number[], // 字符索引到字节索引的映射表
|
| 57 |
};
|
| 58 |
protected css_name = "LMF";
|
| 59 |
protected options = {
|
|
|
|
| 168 |
// 保存当前渲染数据
|
| 169 |
this.currentRenderData = rd;
|
| 170 |
|
| 171 |
+
// 如果差分模式已启用,更新字符到字节的映射表(使用最新的原始文本)
|
| 172 |
+
if (this._current.diffMode && this._current.deltaByteSurprisals.length > 0) {
|
| 173 |
+
const originalText = rd.originalText;
|
| 174 |
+
this._current.charToByteIndexMap = buildCharToByteIndexMap(originalText);
|
| 175 |
+
}
|
| 176 |
+
|
| 177 |
// 隐藏加载状态
|
| 178 |
this.hideLoading();
|
| 179 |
|
|
|
|
| 223 |
getTokenRealTopk: (rd, tokenIndex) => this.getTokenRealTopk(rd, tokenIndex),
|
| 224 |
addTokenEventListeners: (element, tokenIndex, rd) => this.addTokenEventListeners(element, tokenIndex, rd),
|
| 225 |
diffMode: this._current.diffMode,
|
| 226 |
+
deltaByteSurprisals: this._current.deltaByteSurprisals,
|
| 227 |
+
charToByteIndexMap: this._current.charToByteIndexMap
|
| 228 |
});
|
| 229 |
} else {
|
| 230 |
// 更新差分模式和数据
|
|
|
|
| 232 |
getTokenRealTopk: (rd, tokenIndex) => this.getTokenRealTopk(rd, tokenIndex),
|
| 233 |
addTokenEventListeners: (element, tokenIndex, rd) => this.addTokenEventListeners(element, tokenIndex, rd),
|
| 234 |
diffMode: this._current.diffMode,
|
| 235 |
+
deltaByteSurprisals: this._current.deltaByteSurprisals,
|
| 236 |
+
charToByteIndexMap: this._current.charToByteIndexMap
|
| 237 |
});
|
| 238 |
}
|
| 239 |
|
|
|
|
| 337 |
}
|
| 338 |
|
| 339 |
// originalText 始终由前端注入,直接使用(更接近用户输入)
|
| 340 |
+
const fullText = rd.originalText;
|
| 341 |
|
| 342 |
// 创建一个文本容器div,确保文本在SVG上方
|
| 343 |
if (fullText) {
|
|
|
|
| 602 |
* 获取指定token的真实概率信息
|
| 603 |
*/
|
| 604 |
private getTokenRealTopk(rd: FrontendAnalyzeResult, tokenIndex: number): [number, number] | undefined {
|
| 605 |
+
const token = rd.bpe_strings[tokenIndex];
|
| 606 |
+
return token.real_topk
|
| 607 |
? token.real_topk as [number, number]
|
| 608 |
: undefined;
|
| 609 |
}
|
|
|
|
| 614 |
*/
|
| 615 |
private addTokenEventListeners(element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult): void {
|
| 616 |
const tokenData = rd.bpe_strings[tokenIndex];
|
| 617 |
+
|
| 618 |
+
// 创建事件处理函数
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 619 |
const handleMouseEnter = (event: MouseEvent) => {
|
| 620 |
this.eventHandler.trigger(GLTR_Text_Box.events.tokenHovered, <GLTR_HoverEvent>{
|
| 621 |
hovered: true,
|
| 622 |
d: {
|
| 623 |
+
tokenData: tokenData
|
|
|
|
|
|
|
|
|
|
|
|
|
| 624 |
},
|
| 625 |
event: event
|
| 626 |
});
|
|
|
|
| 637 |
this.eventHandler.trigger(GLTR_Text_Box.events.tokenHovered, <GLTR_HoverEvent>{
|
| 638 |
hovered: false,
|
| 639 |
d: {
|
| 640 |
+
tokenData: tokenData
|
|
|
|
|
|
|
|
|
|
|
|
|
| 641 |
},
|
| 642 |
event: event
|
| 643 |
});
|
|
|
|
| 761 |
}
|
| 762 |
|
| 763 |
protected _wrangle(data: FrontendAnalyzeResult) {
|
| 764 |
+
const tokens = data.bpe_strings;
|
| 765 |
const allTop1 = tokens
|
| 766 |
+
.map(token => token.pred_topk.length > 0 ? token.pred_topk[0][1] : null)
|
| 767 |
.filter((value): value is number => typeof value === 'number' && Number.isFinite(value));
|
| 768 |
|
| 769 |
if (allTop1.length === 0) {
|
|
|
|
| 852 |
/**
|
| 853 |
* 设置差分渲染模式和数据
|
| 854 |
* @param enabled 是否启用差分模式
|
| 855 |
+
* @param deltaByteSurprisals 逐字节的Δ信息密度(bits/Byte)
|
| 856 |
*/
|
| 857 |
+
setDiffMode(enabled: boolean, deltaByteSurprisals: number[] = []) {
|
| 858 |
this._current.diffMode = enabled;
|
| 859 |
+
this._current.deltaByteSurprisals = deltaByteSurprisals;
|
| 860 |
|
| 861 |
+
// 如果有当前渲染数据,构建字符索引到字节索引的映射表并重新渲染
|
| 862 |
if (this.currentRenderData) {
|
| 863 |
+
// 构建字符索引到字节索引的映射表
|
| 864 |
+
// 使用当前渲染数据的原始文本
|
| 865 |
+
const originalText = this.currentRenderData.originalText;
|
| 866 |
+
this._current.charToByteIndexMap = buildCharToByteIndexMap(originalText);
|
| 867 |
+
|
| 868 |
// 差分模式切换时禁用动画
|
| 869 |
const originalAnimationSetting = this.options.enableRenderAnimation;
|
| 870 |
this.options.enableRenderAnimation = false;
|
client/src/ts/vis/Histogram.ts
CHANGED
|
@@ -9,11 +9,13 @@ const averageNumberFormat = d3.format('.2f');
|
|
| 9 |
export type HistogramData = {
|
| 10 |
data: number[],
|
| 11 |
label?: string,
|
| 12 |
-
no_bins
|
| 13 |
-
extent
|
| 14 |
-
colorScale
|
| 15 |
averageValue?: number,
|
| 16 |
-
averageLabel?: string
|
|
|
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
|
|
@@ -22,6 +24,7 @@ export type HistogramBinClickEvent = {
|
|
| 22 |
x0: number;
|
| 23 |
x1: number;
|
| 24 |
data: number[];
|
|
|
|
| 25 |
source?: string; // 直方图标识,用于区分不同的直方图实例
|
| 26 |
}
|
| 27 |
|
|
@@ -81,8 +84,26 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 81 |
protected _render(rD: HistogramData): void {
|
| 82 |
const op = this.options;
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
// 如果数据为空,显示空图表
|
| 87 |
if (values.length === 0) {
|
| 88 |
this.layers.main.selectAll('.bar').remove();
|
|
@@ -91,31 +112,13 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 91 |
this.layers.fg.selectAll('.avg-label').remove();
|
| 92 |
return;
|
| 93 |
}
|
| 94 |
-
|
| 95 |
-
let extent = rD.extent || d3.extent(values);
|
| 96 |
-
|
| 97 |
-
// 安全检查:确保 extent 有效
|
| 98 |
-
// 检查 extent 是否为 null/undefined、长度不为2、包含非有限值、或范围无效(包括 [0, 0] 这种情况)
|
| 99 |
-
if (!extent || extent.length !== 2 || !isFinite(extent[0]) || !isFinite(extent[1]) || extent[0] >= extent[1]) {
|
| 100 |
-
// 如果 extent 无效,使用数据的实际范围,如果数据也为空则使用默认值 [0, 1]
|
| 101 |
-
if (values.length > 0) {
|
| 102 |
-
const dataExtent = d3.extent(values);
|
| 103 |
-
if (dataExtent && dataExtent.length === 2 && isFinite(dataExtent[0]) && isFinite(dataExtent[1]) && dataExtent[0] < dataExtent[1]) {
|
| 104 |
-
extent = dataExtent;
|
| 105 |
-
} else {
|
| 106 |
-
extent = [0, 1];
|
| 107 |
-
}
|
| 108 |
-
} else {
|
| 109 |
-
extent = [0, 1];
|
| 110 |
-
}
|
| 111 |
-
}
|
| 112 |
-
|
| 113 |
// 如果指定了 extent,确保使用 extent 作为 domain,而不是 nice() 调整后的 domain
|
| 114 |
// 这样可以保证 extent 的上限被正确使用,即使数据被截断了
|
| 115 |
// 使用 extent 作为 domain,确保范围正确
|
| 116 |
const padding = { left: 24, right: 35 };
|
| 117 |
let valueScale = d3.scaleLinear().domain([extent[0], extent[1]]).range([padding.left, op.width - padding.right]);
|
| 118 |
-
|
| 119 |
const hasAverageValue = typeof rD.averageValue === 'number' && Number.isFinite(rD.averageValue);
|
| 120 |
const clampedAverage = hasAverageValue
|
| 121 |
? Math.min(Math.max(rD.averageValue as number, extent[0]), extent[1])
|
|
@@ -123,26 +126,13 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 123 |
const averageX = hasAverageValue && clampedAverage !== null
|
| 124 |
? valueScale(clampedAverage)
|
| 125 |
: null;
|
| 126 |
-
|
| 127 |
-
// 如果指定了 no_bins=20 且 extent=[0, 20],使用固定的20个台阶阈值
|
| 128 |
-
// 阈值正好是 [0, 1, 2, ..., 19, 20],对应区间 [0,1), [1,2), ..., [18,19), [19,20]
|
| 129 |
-
// 如果指定了 no_bins=20 且 extent=[-10, 10](差分模式),使用固定的20个台阶阈值
|
| 130 |
-
// 阈值正好是 [-10, -9, -8, ..., 8, 9, 10],对应区间 [-10,-9), [-9,-8), ..., [8,9), [9,10]
|
| 131 |
-
let thresholds: number[];
|
| 132 |
-
if (rD.no_bins === 20 && extent[0] === 0 && extent[1] === 20) {
|
| 133 |
-
// 固定20个台阶:生成阈值 [0, 1, 2, ..., 19, 20]
|
| 134 |
-
thresholds = Array.from({ length: 21 }, (_, i) => i); // [0, 1, 2, ..., 20]
|
| 135 |
-
} else if (rD.no_bins === 20 && extent[0] === -10 && extent[1] === 10) {
|
| 136 |
-
// 差分模式:生成阈值 [-10, -9, -8, ..., 8, 9, 10]
|
| 137 |
-
thresholds = Array.from({ length: 21 }, (_, i) => i - 10); // [-10, -9, ..., 9, 10]
|
| 138 |
-
} else {
|
| 139 |
-
// 其他情况使用原来的逻辑
|
| 140 |
-
const idealNoBins = rD.no_bins || Math.min(d3.thresholdFreedmanDiaconis(values, extent[0], extent[1]), 20);
|
| 141 |
-
thresholds = d3.ticks(extent[0], extent[1], idealNoBins);
|
| 142 |
-
// 确保最后一个阈值小于 extent[1],这样最后一个 bin 可以包含所有 >= 最后一个阈值的数据
|
| 143 |
-
thresholds = thresholds.filter(t => t < extent[1]);
|
| 144 |
-
}
|
| 145 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 146 |
const histo = d3.bin()
|
| 147 |
.domain(<[number, number]>[extent[0], extent[1]])
|
| 148 |
.thresholds(thresholds)(values);
|
|
@@ -200,22 +190,9 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 200 |
},
|
| 201 |
})
|
| 202 |
.style('fill', d => {
|
| 203 |
-
|
| 204 |
-
|
| 205 |
-
|
| 206 |
-
|
| 207 |
-
if (isDiffMode) {
|
| 208 |
-
// 差分模式:负值区间使用左边界 x0,非负值区间使用右边界 x1
|
| 209 |
-
// 例如:bin [-1,0) 使用 x0=-1,bin [0,1) 使用 x1=1
|
| 210 |
-
const colorValue = d.x0 < 0 ? d.x0 : d.x1;
|
| 211 |
-
return rD.colorScale(colorValue);
|
| 212 |
-
} else {
|
| 213 |
-
// 正常模式:使用 bin 的起始值 x0 计算颜色,以匹配区间定义
|
| 214 |
-
// 例如:bin [0,1) 使用 x0=0,bin [1,2) 使用 x0=1,bin [19,20) 使用 x0=19
|
| 215 |
-
return rD.colorScale(d.x0);
|
| 216 |
-
}
|
| 217 |
-
}
|
| 218 |
-
return '#666'; // 默认颜色
|
| 219 |
})
|
| 220 |
.style('stroke', (d, i) => {
|
| 221 |
// 如果这个bin被选中,添加蓝色边框
|
|
@@ -319,6 +296,7 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 319 |
x0: d.x0,
|
| 320 |
x1: d.x1,
|
| 321 |
data: d,
|
|
|
|
| 322 |
source: sourceId
|
| 323 |
});
|
| 324 |
} else {
|
|
@@ -334,6 +312,7 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 334 |
x0: d.x0,
|
| 335 |
x1: d.x1,
|
| 336 |
data: d,
|
|
|
|
| 337 |
source: sourceId
|
| 338 |
});
|
| 339 |
}
|
|
@@ -342,21 +321,41 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 342 |
|
| 343 |
this.layers.bg.select('.y-axis').call(<any>d3.axisRight(countScale).tickFormat(op.numberFormat));
|
| 344 |
|
| 345 |
-
//
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 346 |
const xAxisTickFormat = (d: number) => {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 347 |
if (rD.extent) {
|
| 348 |
-
// 检查是否是最小值
|
| 349 |
-
if (Math.abs(d - rD.extent[0]) < 0.001) {
|
| 350 |
return '-∞';
|
| 351 |
}
|
| 352 |
-
// 检查是否是最大值
|
| 353 |
if (Math.abs(d - rD.extent[1]) < 0.001) {
|
| 354 |
return '∞';
|
| 355 |
}
|
| 356 |
}
|
| 357 |
return op.numberFormat(d);
|
| 358 |
};
|
| 359 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 360 |
|
| 361 |
|
| 362 |
}
|
|
|
|
| 9 |
export type HistogramData = {
|
| 10 |
data: number[],
|
| 11 |
label?: string,
|
| 12 |
+
no_bins: number,
|
| 13 |
+
extent: number[],
|
| 14 |
+
colorScale: (value: number) => string, // 添加颜色 scale
|
| 15 |
averageValue?: number,
|
| 16 |
+
averageLabel?: string,
|
| 17 |
+
showLeftInfinity?: boolean, // 是否在左侧显示 -∞ 符号
|
| 18 |
+
xAxisTickSkip?: number // x轴刻度数字绘制间隔,0表示不跳过,1表示隔一个绘制一个(0,2,4...)
|
| 19 |
}
|
| 20 |
|
| 21 |
|
|
|
|
| 24 |
x0: number;
|
| 25 |
x1: number;
|
| 26 |
data: number[];
|
| 27 |
+
no_bins: number; // 直方图的bin数量
|
| 28 |
source?: string; // 直方图标识,用于区分不同的直方图实例
|
| 29 |
}
|
| 30 |
|
|
|
|
| 84 |
protected _render(rD: HistogramData): void {
|
| 85 |
const op = this.options;
|
| 86 |
|
| 87 |
+
// extent 是必选参数,直接使用
|
| 88 |
+
const extent = rD.extent;
|
| 89 |
+
|
| 90 |
+
// 计算bin宽度
|
| 91 |
+
const binWidth = (extent[1] - extent[0]) / rD.no_bins;
|
| 92 |
+
|
| 93 |
+
// 超出上下界的按照对应bin的中心值处理
|
| 94 |
+
const values = rD.data.map(d => +d)
|
| 95 |
+
.filter(d => isFinite(d))
|
| 96 |
+
.map(d => {
|
| 97 |
+
if (d >= extent[1]) {
|
| 98 |
+
// 超出或等于上界:映射到最后一个bin的中心值,避免d3.bin()为等于extent[1]的值创建额外的[19,19]bin
|
| 99 |
+
return extent[1] - 0.5 * binWidth;
|
| 100 |
+
} else if (d <= extent[0]) {
|
| 101 |
+
// 超出或等于下界:映射到第一个bin的中心值,避免d3.bin()为等于extent[0]的值创建额外的[0,0]bin
|
| 102 |
+
return extent[0] + 0.5 * binWidth;
|
| 103 |
+
}
|
| 104 |
+
return d;
|
| 105 |
+
});
|
| 106 |
+
|
| 107 |
// 如果数据为空,显示空图表
|
| 108 |
if (values.length === 0) {
|
| 109 |
this.layers.main.selectAll('.bar').remove();
|
|
|
|
| 112 |
this.layers.fg.selectAll('.avg-label').remove();
|
| 113 |
return;
|
| 114 |
}
|
| 115 |
+
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 116 |
// 如果指定了 extent,确保使用 extent 作为 domain,而不是 nice() 调整后的 domain
|
| 117 |
// 这样可以保证 extent 的上限被正确使用,即使数据被截断了
|
| 118 |
// 使用 extent 作为 domain,确保范围正确
|
| 119 |
const padding = { left: 24, right: 35 };
|
| 120 |
let valueScale = d3.scaleLinear().domain([extent[0], extent[1]]).range([padding.left, op.width - padding.right]);
|
| 121 |
+
|
| 122 |
const hasAverageValue = typeof rD.averageValue === 'number' && Number.isFinite(rD.averageValue);
|
| 123 |
const clampedAverage = hasAverageValue
|
| 124 |
? Math.min(Math.max(rD.averageValue as number, extent[0]), extent[1])
|
|
|
|
| 126 |
const averageX = hasAverageValue && clampedAverage !== null
|
| 127 |
? valueScale(clampedAverage)
|
| 128 |
: null;
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 129 |
|
| 130 |
+
// 统一的阈值生成逻辑:生成有限数阈值,两侧bin自动包含超出范围的值
|
| 131 |
+
// no_bins 是必选参数,直接使用等宽bin
|
| 132 |
+
// thresholds 长度应该是 no_bins - 1,从 extent[0]+binWidth 开始,比如[0,10],10个bin,则thresholds长度为9,分别是1..9,不包括0和10
|
| 133 |
+
const thresholds = Array.from({ length: rD.no_bins - 1 }, (_, i) => extent[0] + (i + 1) * binWidth);
|
| 134 |
+
|
| 135 |
+
// 设置domain确保边界严格按照extent划分,而不是实际的数据最大值和最小值
|
| 136 |
const histo = d3.bin()
|
| 137 |
.domain(<[number, number]>[extent[0], extent[1]])
|
| 138 |
.thresholds(thresholds)(values);
|
|
|
|
| 190 |
},
|
| 191 |
})
|
| 192 |
.style('fill', d => {
|
| 193 |
+
// 统一使用bin的中间值计算颜色
|
| 194 |
+
const colorValue = (d.x0 + d.x1) / 2;
|
| 195 |
+
return rD.colorScale(colorValue);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 196 |
})
|
| 197 |
.style('stroke', (d, i) => {
|
| 198 |
// 如果这个bin被选中,添加蓝色边框
|
|
|
|
| 296 |
x0: d.x0,
|
| 297 |
x1: d.x1,
|
| 298 |
data: d,
|
| 299 |
+
no_bins: rD.no_bins,
|
| 300 |
source: sourceId
|
| 301 |
});
|
| 302 |
} else {
|
|
|
|
| 312 |
x0: d.x0,
|
| 313 |
x1: d.x1,
|
| 314 |
data: d,
|
| 315 |
+
no_bins: rD.no_bins,
|
| 316 |
source: sourceId
|
| 317 |
});
|
| 318 |
}
|
|
|
|
| 321 |
|
| 322 |
this.layers.bg.select('.y-axis').call(<any>d3.axisRight(countScale).tickFormat(op.numberFormat));
|
| 323 |
|
| 324 |
+
// 构建所有刻度值数组,用于确定索引位置
|
| 325 |
+
const allTickValues = [extent[0], ...thresholds, extent[1]]; // 包含边界和所有阈值
|
| 326 |
+
const tickSkip = rD.xAxisTickSkip ?? 0;
|
| 327 |
+
|
| 328 |
+
// Custom tick format: 左侧只有配置了 showLeftInfinity 的直方图显示 -∞,右侧所有直方图都显示 ∞
|
| 329 |
+
// 根据 xAxisTickSkip 参数决定是否显示数字标签(刻度线始终显示)
|
| 330 |
const xAxisTickFormat = (d: number) => {
|
| 331 |
+
// 查找当前刻度值在数组中的索引
|
| 332 |
+
const tickIndex = allTickValues.findIndex(tick => Math.abs(tick - d) < 0.001);
|
| 333 |
+
|
| 334 |
+
// 如果配置了 xAxisTickSkip,根据索引决定是否显示标签
|
| 335 |
+
if (tickSkip > 0 && tickIndex >= 0) {
|
| 336 |
+
// 如果索引不符合跳过规则,返回空字符串(不显示数字,但刻度线仍会显示)
|
| 337 |
+
if (tickIndex % (tickSkip + 1) !== 0) {
|
| 338 |
+
return '';
|
| 339 |
+
}
|
| 340 |
+
}
|
| 341 |
+
|
| 342 |
if (rD.extent) {
|
| 343 |
+
// 检查是否是最小值,且配置了显示左侧无穷大符号
|
| 344 |
+
if (rD.showLeftInfinity && Math.abs(d - rD.extent[0]) < 0.001) {
|
| 345 |
return '-∞';
|
| 346 |
}
|
| 347 |
+
// 检查是否是最大值,所有直方图右侧都显示 ∞
|
| 348 |
if (Math.abs(d - rD.extent[1]) < 0.001) {
|
| 349 |
return '∞';
|
| 350 |
}
|
| 351 |
}
|
| 352 |
return op.numberFormat(d);
|
| 353 |
};
|
| 354 |
+
|
| 355 |
+
const xAxis = d3.axisBottom(valueScale)
|
| 356 |
+
.tickFormat(xAxisTickFormat)
|
| 357 |
+
.tickValues(allTickValues);
|
| 358 |
+
this.layers.bg.select('.x-axis').call(<any>xAxis);
|
| 359 |
|
| 360 |
|
| 361 |
}
|
client/src/ts/vis/ScrollbarMinimap.ts
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
-
import {
|
| 2 |
import { isNarrowScreen } from '../utils/responsive';
|
|
|
|
| 3 |
import type { TokenFragmentRect } from './types';
|
| 4 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 5 |
|
|
@@ -22,7 +23,7 @@ interface AggregationResult {
|
|
| 22 |
*/
|
| 23 |
interface BucketData {
|
| 24 |
y: number;
|
| 25 |
-
|
| 26 |
TokenFragmentCount: number;
|
| 27 |
}
|
| 28 |
|
|
@@ -210,11 +211,11 @@ export class ScrollbarMinimap {
|
|
| 210 |
const { buckets } = aggregationResult;
|
| 211 |
|
| 212 |
buckets.forEach(bucket => {
|
| 213 |
-
// 计算平均surprisal:总surprisal(
|
| 214 |
-
const
|
| 215 |
-
? bucket.
|
| 216 |
: 0;
|
| 217 |
-
const color =
|
| 218 |
ctx.fillStyle = color;
|
| 219 |
|
| 220 |
const y = (textAreaTop + bucket.y) / worldUnitsPerMinimapPixel; // 映射到minimap的y坐标
|
|
@@ -223,26 +224,6 @@ export class ScrollbarMinimap {
|
|
| 223 |
});
|
| 224 |
}
|
| 225 |
|
| 226 |
-
|
| 227 |
-
/**
|
| 228 |
-
* 计算token每个字符的惊讶度
|
| 229 |
-
* @param tokenIndex token索引
|
| 230 |
-
* @param renderData 渲染数据
|
| 231 |
-
* @returns 每个字符的惊讶度值
|
| 232 |
-
*/
|
| 233 |
-
private getTokenSurprisalPerChar(tokenIndex: number, renderData: FrontendAnalyzeResult): number {
|
| 234 |
-
const token = renderData.bpe_strings?.[tokenIndex];
|
| 235 |
-
const realTopk = token?.real_topk;
|
| 236 |
-
if (Array.isArray(realTopk) && realTopk.length === 2) {
|
| 237 |
-
const [rank, prob] = realTopk;
|
| 238 |
-
const tokenSurprisal = -Math.log2(prob);
|
| 239 |
-
const tokenText = token?.raw || '';
|
| 240 |
-
const charCount = Array.from(tokenText).length;
|
| 241 |
-
return charCount > 0 ? tokenSurprisal / charCount : 0;
|
| 242 |
-
}
|
| 243 |
-
return 0;
|
| 244 |
-
}
|
| 245 |
-
|
| 246 |
/**
|
| 247 |
* 按Y坐标,把[y_min, y_max]范围内的token fragment聚合到bucketCount个桶
|
| 248 |
* @param positions token 位置数组
|
|
@@ -271,7 +252,7 @@ export class ScrollbarMinimap {
|
|
| 271 |
// 初始化桶数组,y坐标从y_min开始,依次递增bucketHeight
|
| 272 |
const buckets: BucketData[] = Array.from({ length: bucketCount }, (_, bucketIndex) => ({
|
| 273 |
y: y_min + bucketIndex * bucketHeight,
|
| 274 |
-
|
| 275 |
TokenFragmentCount: 0
|
| 276 |
}));
|
| 277 |
|
|
@@ -288,10 +269,12 @@ export class ScrollbarMinimap {
|
|
| 288 |
|
| 289 |
const bucket = buckets[bucketIndex];
|
| 290 |
|
| 291 |
-
// 计算该token
|
| 292 |
-
const
|
| 293 |
-
|
|
|
|
| 294 |
bucket.TokenFragmentCount += 1;
|
|
|
|
| 295 |
});
|
| 296 |
|
| 297 |
return {
|
|
|
|
| 1 |
+
import { getByteSurprisalColor, MINIMAP_COLOR_FACTOR } from '../utils/SurprisalColorConfig';
|
| 2 |
import { isNarrowScreen } from '../utils/responsive';
|
| 3 |
+
import { calculateSurprisalDensity } from '../utils/Util';
|
| 4 |
import type { TokenFragmentRect } from './types';
|
| 5 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 6 |
|
|
|
|
| 23 |
*/
|
| 24 |
interface BucketData {
|
| 25 |
y: number;
|
| 26 |
+
surprisalDensitySum: number;
|
| 27 |
TokenFragmentCount: number;
|
| 28 |
}
|
| 29 |
|
|
|
|
| 211 |
const { buckets } = aggregationResult;
|
| 212 |
|
| 213 |
buckets.forEach(bucket => {
|
| 214 |
+
// 计算平均surprisal密度:总surprisal(surprisalPerByte累加)除以token数
|
| 215 |
+
const averageSurprisalDensity = bucket.TokenFragmentCount > 0
|
| 216 |
+
? bucket.surprisalDensitySum / bucket.TokenFragmentCount
|
| 217 |
: 0;
|
| 218 |
+
const color = getByteSurprisalColor(averageSurprisalDensity, MINIMAP_COLOR_FACTOR);
|
| 219 |
ctx.fillStyle = color;
|
| 220 |
|
| 221 |
const y = (textAreaTop + bucket.y) / worldUnitsPerMinimapPixel; // 映射到minimap的y坐标
|
|
|
|
| 224 |
});
|
| 225 |
}
|
| 226 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 227 |
/**
|
| 228 |
* 按Y坐标,把[y_min, y_max]范围内的token fragment聚合到bucketCount个桶
|
| 229 |
* @param positions token 位置数组
|
|
|
|
| 252 |
// 初始化桶数组,y坐标从y_min开始,依次递增bucketHeight
|
| 253 |
const buckets: BucketData[] = Array.from({ length: bucketCount }, (_, bucketIndex) => ({
|
| 254 |
y: y_min + bucketIndex * bucketHeight,
|
| 255 |
+
surprisalDensitySum: 0,
|
| 256 |
TokenFragmentCount: 0
|
| 257 |
}));
|
| 258 |
|
|
|
|
| 269 |
|
| 270 |
const bucket = buckets[bucketIndex];
|
| 271 |
|
| 272 |
+
// 计算该token字节平均惊讶度并累加
|
| 273 |
+
const token = renderData.bpe_strings[pos.tokenIndex];
|
| 274 |
+
const surprisalDensity = calculateSurprisalDensity(token);
|
| 275 |
+
bucket.surprisalDensitySum += surprisalDensity;
|
| 276 |
bucket.TokenFragmentCount += 1;
|
| 277 |
+
// todo: 使用字节数加权计算bucket的平均信息密度,而不是按token平均计算
|
| 278 |
});
|
| 279 |
|
| 280 |
return {
|
client/src/ts/vis/SvgOverlayManager.ts
CHANGED
|
@@ -4,8 +4,8 @@
|
|
| 4 |
*/
|
| 5 |
|
| 6 |
import {FrontendAnalyzeResult} from "../api/GLTR_API";
|
| 7 |
-
import {
|
| 8 |
-
import {
|
| 9 |
import {TokenFragmentRect, RectCacheEntry} from "./types";
|
| 10 |
import * as d3 from "d3";
|
| 11 |
|
|
@@ -16,8 +16,10 @@ export interface SvgOverlayManagerOptions {
|
|
| 16 |
addTokenEventListeners: (element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult) => void;
|
| 17 |
/** 差分模式:是否启用差分渲染 */
|
| 18 |
diffMode?: boolean;
|
| 19 |
-
/** 差分数据:逐字的Δ(
|
| 20 |
-
|
|
|
|
|
|
|
| 21 |
}
|
| 22 |
|
| 23 |
export class SvgOverlayManager {
|
|
@@ -243,38 +245,46 @@ export class SvgOverlayManager {
|
|
| 243 |
// 计算颜色
|
| 244 |
let color: string;
|
| 245 |
|
| 246 |
-
if (this.options.diffMode && this.options.
|
| 247 |
-
// 差分模式:按字计算颜色
|
| 248 |
const tokenData = rd.bpe_strings[pos.tokenIndex];
|
| 249 |
-
const offset = tokenData
|
| 250 |
const charStart = offset[0];
|
| 251 |
const charEnd = offset[1];
|
| 252 |
|
| 253 |
-
// 获取该token对应的字
|
| 254 |
-
|
| 255 |
-
const
|
|
|
|
|
|
|
| 256 |
|
| 257 |
-
//
|
| 258 |
-
|
| 259 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 260 |
}
|
| 261 |
-
|
| 262 |
-
// 计算平均差分值
|
| 263 |
-
let avgDelta = 0;
|
| 264 |
-
if (tokenCharDeltas.length > 0) {
|
| 265 |
-
avgDelta = tokenCharDeltas.reduce((sum, val) => sum + val, 0) / tokenCharDeltas.length;
|
| 266 |
-
}
|
| 267 |
-
|
| 268 |
-
// 使用统一的差分颜色配置
|
| 269 |
-
color = getDiffColor(avgDelta);
|
| 270 |
} else {
|
| 271 |
// 正常模式:使用surprisal计算颜色
|
| 272 |
const tokenData = rd.bpe_strings[pos.tokenIndex];
|
| 273 |
-
const
|
| 274 |
-
|
| 275 |
-
const surprisal = tokenData ? calculateSurprisal(tokenTopK?.[1] || 0) : 0;
|
| 276 |
-
const surprisalPerChar = calculateSurprisalPerCharacter(surprisal, tokenText);
|
| 277 |
-
color = getSurprisalColor(surprisalPerChar);
|
| 278 |
}
|
| 279 |
|
| 280 |
// 设置填充颜色
|
|
|
|
| 4 |
*/
|
| 5 |
|
| 6 |
import {FrontendAnalyzeResult} from "../api/GLTR_API";
|
| 7 |
+
import {calculateSurprisalDensity} from "../utils/Util";
|
| 8 |
+
import {getByteSurprisalColor, getDiffColor} from "../utils/SurprisalColorConfig";
|
| 9 |
import {TokenFragmentRect, RectCacheEntry} from "./types";
|
| 10 |
import * as d3 from "d3";
|
| 11 |
|
|
|
|
| 16 |
addTokenEventListeners: (element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult) => void;
|
| 17 |
/** 差分模式:是否启用差分渲染 */
|
| 18 |
diffMode?: boolean;
|
| 19 |
+
/** 差分数据:逐字节的Δ信息密度(bits/Byte) */
|
| 20 |
+
deltaByteSurprisals?: number[];
|
| 21 |
+
/** 字符索引到字节索引的映射表 */
|
| 22 |
+
charToByteIndexMap?: number[];
|
| 23 |
}
|
| 24 |
|
| 25 |
export class SvgOverlayManager {
|
|
|
|
| 245 |
// 计算颜色
|
| 246 |
let color: string;
|
| 247 |
|
| 248 |
+
if (this.options.diffMode && this.options.deltaByteSurprisals) {
|
| 249 |
+
// 差分模式:按字节计算颜色
|
| 250 |
const tokenData = rd.bpe_strings[pos.tokenIndex];
|
| 251 |
+
const offset = tokenData.offset;
|
| 252 |
const charStart = offset[0];
|
| 253 |
const charEnd = offset[1];
|
| 254 |
|
| 255 |
+
// 获取该token对应的字节范围内的差分值
|
| 256 |
+
// token的offset是字符索引,需要通过映射表转换为字节索引
|
| 257 |
+
const deltaByteSurprisals = this.options.deltaByteSurprisals;
|
| 258 |
+
const charToByteIndexMap = this.options.charToByteIndexMap;
|
| 259 |
+
const tokenByteDeltas: number[] = [];
|
| 260 |
|
| 261 |
+
// 必须有映射表才能正确转换字符索引到字节索引
|
| 262 |
+
if (!charToByteIndexMap || charToByteIndexMap.length === 0) {
|
| 263 |
+
color = getDiffColor(0);
|
| 264 |
+
} else {
|
| 265 |
+
// 将字符索引范围转换为字节索引范围
|
| 266 |
+
const byteStart = charToByteIndexMap[charStart] ?? charStart;
|
| 267 |
+
const byteEnd = charToByteIndexMap[charEnd] ?? charEnd;
|
| 268 |
+
|
| 269 |
+
// 遍历token的字节范围,收集差分值
|
| 270 |
+
for (let byteIdx = byteStart; byteIdx < byteEnd && byteIdx < deltaByteSurprisals.length; byteIdx++) {
|
| 271 |
+
tokenByteDeltas.push(deltaByteSurprisals[byteIdx]);
|
| 272 |
+
}
|
| 273 |
+
|
| 274 |
+
// 计算平均差分值
|
| 275 |
+
let avgDelta = 0;
|
| 276 |
+
if (tokenByteDeltas.length > 0) {
|
| 277 |
+
avgDelta = tokenByteDeltas.reduce((sum, val) => sum + val, 0) / tokenByteDeltas.length;
|
| 278 |
+
}
|
| 279 |
+
|
| 280 |
+
// 使用统一的差分颜色配置
|
| 281 |
+
color = getDiffColor(avgDelta);
|
| 282 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 283 |
} else {
|
| 284 |
// Normal mode: colour the fragment by the token's surprisal density (bits/Byte)
|
| 285 |
const tokenData = rd.bpe_strings[pos.tokenIndex];
|
| 286 |
+
const informationDensity = calculateSurprisalDensity(tokenData);
|
| 287 |
+
color = getByteSurprisalColor(informationDensity);
|
|
|
|
|
|
|
|
|
|
| 288 |
}
|
| 289 |
|
| 290 |
// 设置填充颜色
|
client/src/ts/vis/ToolTip.ts
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
import { D3Sel, calculateSurprisal,
|
| 2 |
import { SimpleEventHandler } from "../utils/SimpleEventHandler";
|
| 3 |
import { GLTR_RenderItem } from "./GLTR_Text_Box";
|
| 4 |
import * as d3 from "d3";
|
|
@@ -397,7 +397,7 @@ export class ToolTip {
|
|
| 397 |
*/
|
| 398 |
private _doUpdate(ri: GLTR_RenderItem, event?: MouseEvent) {
|
| 399 |
const { normalColor, selectedColor, detailColor, valueColor } = this.themeColors;
|
| 400 |
-
const predictions =
|
| 401 |
const hasPredictions = predictions.length > 0;
|
| 402 |
const wScale = hasPredictions ? this._getScale(predictions[0][1]) : this._getScale(1);
|
| 403 |
|
|
@@ -406,12 +406,12 @@ export class ToolTip {
|
|
| 406 |
|
| 407 |
// 更新当前token显示(第一行)
|
| 408 |
this.currentToken.html(() => {
|
| 409 |
-
const visualizedToken = escapeHtml(visualizeSpecialChars(ri.
|
| 410 |
return `<span style="color: ${selectedColor};">${visualizedToken}</span>`;
|
| 411 |
});
|
| 412 |
|
| 413 |
// 先设置内容,以便获取tooltip的实际尺寸
|
| 414 |
-
if (ri.
|
| 415 |
this.predictions.selectAll('.row').data([{ label: 'bpe_merged' }])
|
| 416 |
.join('div')
|
| 417 |
.attr('class', 'row info-row')
|
|
@@ -429,7 +429,7 @@ export class ToolTip {
|
|
| 429 |
.attr('class', 'row')
|
| 430 |
.style('display', 'table-row')
|
| 431 |
.html(d => {
|
| 432 |
-
const color = ri.
|
| 433 |
const bar = '<div style="display: table-cell; width:110px;padding-left:5px;">' +
|
| 434 |
`<div style="display:inline-block;width: ${wScale(d[1])}px;background-color:${color};height: 10px;"></div>` +
|
| 435 |
` <div style="display:inline-block;color: ${color};">${this.numF(d[1])}</div>` + "</div>";
|
|
@@ -442,13 +442,16 @@ export class ToolTip {
|
|
| 442 |
}
|
| 443 |
|
| 444 |
this.myDetail.html(() => {
|
| 445 |
-
const
|
| 446 |
-
const
|
|
|
|
|
|
|
| 447 |
|
| 448 |
-
const
|
|
|
|
| 449 |
const surprisalText = `<span style="color: ${detailColor}">surprisal:</span> <span style="color: ${valueColor}">${this.significantF(surprisal)}</span> <span style="color: ${detailColor}">bits</span>`
|
| 450 |
-
const prop = `<span style="color: ${detailColor}">prob:</span> <span style="color: ${valueColor}">${this.significantF(
|
| 451 |
-
return `${
|
| 452 |
|
| 453 |
})
|
| 454 |
|
|
|
|
| 1 |
+
import { D3Sel, calculateSurprisal, calculateSurprisalDensity } from "../utils/Util";
|
| 2 |
import { SimpleEventHandler } from "../utils/SimpleEventHandler";
|
| 3 |
import { GLTR_RenderItem } from "./GLTR_Text_Box";
|
| 4 |
import * as d3 from "d3";
|
|
|
|
| 397 |
*/
|
| 398 |
private _doUpdate(ri: GLTR_RenderItem, event?: MouseEvent) {
|
| 399 |
const { normalColor, selectedColor, detailColor, valueColor } = this.themeColors;
|
| 400 |
+
const predictions = ri.tokenData.pred_topk;
|
| 401 |
const hasPredictions = predictions.length > 0;
|
| 402 |
const wScale = hasPredictions ? this._getScale(predictions[0][1]) : this._getScale(1);
|
| 403 |
|
|
|
|
| 406 |
|
| 407 |
// 更新当前token显示(第一行)
|
| 408 |
this.currentToken.html(() => {
|
| 409 |
+
const visualizedToken = escapeHtml(visualizeSpecialChars(ri.tokenData.raw));
|
| 410 |
return `<span style="color: ${selectedColor};">${visualizedToken}</span>`;
|
| 411 |
});
|
| 412 |
|
| 413 |
// 先设置内容,以便获取tooltip的实际尺寸
|
| 414 |
+
if (ri.tokenData.bpe_merged) {
|
| 415 |
this.predictions.selectAll('.row').data([{ label: 'bpe_merged' }])
|
| 416 |
.join('div')
|
| 417 |
.attr('class', 'row info-row')
|
|
|
|
| 429 |
.attr('class', 'row')
|
| 430 |
.style('display', 'table-row')
|
| 431 |
.html(d => {
|
| 432 |
+
const color = ri.tokenData.raw != d[0] ? normalColor : selectedColor;
|
| 433 |
const bar = '<div style="display: table-cell; width:110px;padding-left:5px;">' +
|
| 434 |
`<div style="display:inline-block;width: ${wScale(d[1])}px;background-color:${color};height: 10px;"></div>` +
|
| 435 |
` <div style="display:inline-block;color: ${color};">${this.numF(d[1])}</div>` + "</div>";
|
|
|
|
| 442 |
}
|
| 443 |
|
| 444 |
this.myDetail.html(() => {
|
| 445 |
+
const prob = ri.tokenData.real_topk[1];
|
| 446 |
+
const surprisal = calculateSurprisal(prob);
|
| 447 |
+
const informationDensity = calculateSurprisalDensity(ri.tokenData);
|
| 448 |
+
const utf8Size = new TextEncoder().encode(ri.tokenData.raw).length;
|
| 449 |
|
| 450 |
+
const informationDensityText = `<span style="color: ${detailColor}">surprisal density:</span> <span style="color: ${valueColor}">${this.significantF(informationDensity)}</span> <span style="color: ${detailColor}">bits/Byte</span>`
|
| 451 |
+
const utf8SizeText = `<span style="color: ${detailColor}">utf8 size:</span> <span style="color: ${valueColor}">${utf8Size}</span> <span style="color: ${detailColor}">bytes</span>`
|
| 452 |
const surprisalText = `<span style="color: ${detailColor}">surprisal:</span> <span style="color: ${valueColor}">${this.significantF(surprisal)}</span> <span style="color: ${detailColor}">bits</span>`
|
| 453 |
+
const prop = `<span style="color: ${detailColor}">prob:</span> <span style="color: ${valueColor}">${this.significantF(prob)}</span>`
|
| 454 |
+
return `${informationDensityText}<br/>${utf8SizeText}<br/>${surprisalText}<br/>${prop}`
|
| 455 |
|
| 456 |
})
|
| 457 |
|