dqy08 committed on
Commit
a51041c
·
1 Parent(s): baa7926

增强语义分析功能:噪声分布拟合截尾对数正态分布,扫描-拟合迭代查找尾部信号阈值点,增加后验加权pw score染色;语义结果缓存

Browse files
backend/api/analyze_semantic.py CHANGED
@@ -25,23 +25,21 @@ def _log_request(query, text, client_ip=None):
25
 
26
 
27
  def _build_success_response(result, debug_info: bool = False):
28
- """构建成功响应。debug_info=True 时包含 debug_abbrev、debug_top10"""
29
  resp = {
30
  "success": True,
31
  "model": result["model"],
32
  "token_attention": result["token_attention"],
33
  "full_match_degree": result["full_match_degree"],
34
  }
35
- if debug_info:
36
- if "debug_abbrev" in result:
37
- resp["debug_abbrev"] = result["debug_abbrev"]
38
- if "debug_top10" in result:
39
- resp["debug_top10"] = result["debug_top10"]
40
  return resp
41
 
42
 
43
  def _generate_semantic_events(
44
- query: str, text: str, submode: Optional[str] = None, debug_info: bool = False, client_ip: Optional[str] = None
 
45
  ):
46
  """
47
  流式语义分析核心:生成 SSE 事件流(progress + result/error)。
@@ -77,7 +75,7 @@ def _generate_semantic_events(
77
  try:
78
  from backend.access_log import log_analyze_semantic_start
79
  log_analyze_semantic_start(request_id, lock_wait_time, stream_mode=True)
80
- result = _analyze_semantic(query, text, submode_override=submode, progress_callback=progress_callback, debug_info=debug_info)
81
  analysis_result = result
82
  finally:
83
  _inference_lock.release()
@@ -139,16 +137,18 @@ def _generate_semantic_events(
139
 
140
 
141
  def _analyze_semantic_with_stream(
142
- query: str, text: str, submode: Optional[str] = None, debug_info: bool = False, client_ip: Optional[str] = None
 
143
  ):
144
  """流式语义分析,通过 SSE 返回阶段级进度"""
145
  return SSEProgressReporter(
146
- lambda: _generate_semantic_events(query, text, submode, debug_info, client_ip)
147
  ).create_response()
148
 
149
 
150
  def _analyze_semantic_plain(
151
- query: str, text: str, submode: Optional[str] = None, debug_info: bool = False, client_ip: Optional[str] = None
 
152
  ):
153
  """
154
  非流式语义分析:封装流式实现,消费事件流后返回 JSON。
@@ -158,7 +158,7 @@ def _analyze_semantic_plain(
158
  error_msg = None
159
  status_code = 500
160
  try:
161
- for event_str in _generate_semantic_events(query, text, submode, debug_info, client_ip):
162
  if not event_str.startswith('data: '):
163
  continue
164
  data = json.loads(event_str[6:].strip())
@@ -199,6 +199,7 @@ def analyze_semantic(semantic_request):
199
  stream = semantic_request.get("stream", False)
200
  submode = (semantic_request.get("submode") or "").strip() or None
201
  debug_info = bool(semantic_request.get("debug_info", False))
 
202
 
203
  if not query:
204
  return {"success": False, "message": "缺少 query 字段"}, 400
@@ -207,5 +208,5 @@ def analyze_semantic(semantic_request):
207
 
208
  client_ip = get_client_ip()
209
  if stream:
210
- return _analyze_semantic_with_stream(query, text, submode, debug_info, client_ip)
211
- return _analyze_semantic_plain(query, text, submode, debug_info, client_ip)
 
25
 
26
 
27
  def _build_success_response(result, debug_info: bool = False):
28
+ """构建成功响应。debug_info=True 时包含 debug_info 对象(abbrev、topk_tokens、topk_probs)"""
29
  resp = {
30
  "success": True,
31
  "model": result["model"],
32
  "token_attention": result["token_attention"],
33
  "full_match_degree": result["full_match_degree"],
34
  }
35
+ if debug_info and "debug_info" in result:
36
+ resp["debug_info"] = result["debug_info"]
 
 
 
37
  return resp
38
 
39
 
40
  def _generate_semantic_events(
41
+ query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
42
+ full_match_degree_only: bool = False, client_ip: Optional[str] = None
43
  ):
44
  """
45
  流式语义分析核心:生成 SSE 事件流(progress + result/error)。
 
75
  try:
76
  from backend.access_log import log_analyze_semantic_start
77
  log_analyze_semantic_start(request_id, lock_wait_time, stream_mode=True)
78
+ result = _analyze_semantic(query, text, submode_override=submode, progress_callback=progress_callback, debug_info=debug_info, full_match_degree_only=full_match_degree_only)
79
  analysis_result = result
80
  finally:
81
  _inference_lock.release()
 
137
 
138
 
139
  def _analyze_semantic_with_stream(
140
+ query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
141
+ full_match_degree_only: bool = False, client_ip: Optional[str] = None
142
  ):
143
  """流式语义分析,通过 SSE 返回阶段级进度"""
144
  return SSEProgressReporter(
145
+ lambda: _generate_semantic_events(query, text, submode, debug_info, full_match_degree_only, client_ip)
146
  ).create_response()
147
 
148
 
149
  def _analyze_semantic_plain(
150
+ query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
151
+ full_match_degree_only: bool = False, client_ip: Optional[str] = None
152
  ):
153
  """
154
  非流式语义分析:封装流式实现,消费事件流后返回 JSON。
 
158
  error_msg = None
159
  status_code = 500
160
  try:
161
+ for event_str in _generate_semantic_events(query, text, submode, debug_info, full_match_degree_only, client_ip):
162
  if not event_str.startswith('data: '):
163
  continue
164
  data = json.loads(event_str[6:].strip())
 
199
  stream = semantic_request.get("stream", False)
200
  submode = (semantic_request.get("submode") or "").strip() or None
201
  debug_info = bool(semantic_request.get("debug_info", False))
202
+ full_match_degree_only = bool(semantic_request.get("full_match_degree_only", False))
203
 
204
  if not query:
205
  return {"success": False, "message": "缺少 query 字段"}, 400
 
208
 
209
  client_ip = get_client_ip()
210
  if stream:
211
+ return _analyze_semantic_with_stream(query, text, submode, debug_info, full_match_degree_only, client_ip)
212
+ return _analyze_semantic_plain(query, text, submode, debug_info, full_match_degree_only, client_ip)
backend/semantic_analyzer.py CHANGED
@@ -12,6 +12,7 @@ count/fill_blank 按概率加权(Σ pᵢ·zᵢ)。
12
  """
13
 
14
  import gc
 
15
  from typing import Callable, Dict, List, Optional
16
 
17
  import torch
@@ -58,6 +59,7 @@ def _analyze_logits_gradient(
58
  submode_override: Optional[str] = None,
59
  progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
60
  debug_info: bool = False,
 
61
  ) -> Dict:
62
  """
63
  梯度归因:logits 对输入 embedding 的梯度。
@@ -147,13 +149,12 @@ def _analyze_logits_gradient(
147
  attention_mask=attention_mask,
148
  output_attentions=False,
149
  )
150
- # 显式同步,确保前向已完成,progress_callback 时机准确
151
  if device.type == "cuda":
152
  torch.cuda.synchronize(device)
153
  elif device.type == "mps":
154
  torch.mps.synchronize()
155
- if progress_callback:
156
- progress_callback(3, TOTAL_STEPS, "backward", None)
157
  logits = outputs.logits[:, -1, :]
158
  topk_vals, topk_ids = torch.topk(logits, LOGITS_GRADIENT_TOPK, dim=-1)
159
  probs = torch.softmax(logits, dim=-1)
@@ -163,6 +164,19 @@ def _analyze_logits_gradient(
163
 
164
  neg_token = "无" if submode == "fill_blank" else "0"
165
  neg_id = tokenizer.encode(neg_token, add_special_tokens=False)[0]
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  # 归因目标:raw logits(不经过 softmax backward),避免饱和与竞争污染。
167
  if submode == "count" or submode == "fill_blank":
168
  # count/fill_blank 均用 top-10、按概率加权 Σ pᵢ·zᵢ,并排除 neg_token(0/无)以保持梯度方向与「相关」一致。
@@ -180,39 +194,43 @@ def _analyze_logits_gradient(
180
  else:
181
  raise ValueError(f"未知 submode: {submode}")
182
  target_logit.backward()
183
-
184
  grad = embeds.grad
185
  if grad is None:
186
  raise RuntimeError("logits_gradient: 梯度未回传,可能模型不支持(如 int8 量化)")
187
 
 
 
 
 
 
188
  if progress_callback:
189
  progress_callback(4, TOTAL_STEPS, "processing", None)
 
190
  text_token_end = len(offset_mapping)
191
  # 在 GPU 上一次性计算所有 token 的 ‖∇f‖,避免循环内 .item() 导致 500 次 GPU→CPU 同步
192
  grad_slice = grad[0, prompt_end:text_token_end].float()
193
  norms = grad_slice.norm(dim=-1).cpu().tolist()
194
  token_attention: List[Dict] = []
 
195
  for i in range(prompt_end, text_token_end):
196
  s, e = offset_mapping[i]
197
  if s >= text_start_char and e <= text_end_char:
198
  s_rel, e_rel = s - text_start_char, e - text_start_char
199
- token_attention.append({"offset": [s_rel, e_rel], "raw": truncated_text[s_rel:e_rel], "score": norms[i - prompt_end]})
200
-
201
- # 全文匹配度:count/match_score 用 1-P("0"),fill_blank 用 1-P("无")
202
- p_neg = probs[0, neg_id].item()
203
- full_match_degree = round(1.0 - p_neg, 4)
 
 
204
 
205
- model_display = get_semantic_model_display_name()
206
  out = {
207
- "model": model_display,
208
  "token_attention": token_attention,
209
- "topk_tokens": topk_tokens,
210
- "topk_probs": topk_probs,
211
  "full_match_degree": full_match_degree,
212
  }
213
  if debug_info:
214
- out["debug_abbrev"] = abbrev
215
- out["debug_top10"] = [{"token": t, "prob": p} for t, p in zip(topk_tokens, topk_probs)]
216
  return out
217
  finally:
218
  if use_gc:
@@ -227,6 +245,7 @@ def analyze_semantic(
227
  submode_override: Optional[str] = None,
228
  progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
229
  debug_info: bool = False,
 
230
  ) -> Dict:
231
  """
232
  分析原文各 token 与 query 的相关度(使用 logits_gradient 梯度归因)。
@@ -236,10 +255,10 @@ def analyze_semantic(
236
  text: 原文
237
  submode_override: 评估时可选覆盖子模式(count/match_score/fill_blank)
238
  progress_callback: 可选进度回调 (step, total_steps, stage, percentage)
239
- debug_info: 为 True 时返回 debug_abbrev(推理原文缩写)和 debug_top10
240
 
241
  Returns:
242
- {"model", "token_attention", "topk_tokens", "topk_probs"}
243
  """
244
  tokenizer, model, device = ensure_semantic_loaded()
245
  return _analyze_logits_gradient(
@@ -247,4 +266,5 @@ def analyze_semantic(
247
  submode_override=submode_override,
248
  progress_callback=progress_callback,
249
  debug_info=debug_info,
 
250
  )
 
12
  """
13
 
14
  import gc
15
+ import math
16
  from typing import Callable, Dict, List, Optional
17
 
18
  import torch
 
59
  submode_override: Optional[str] = None,
60
  progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
61
  debug_info: bool = False,
62
+ full_match_degree_only: bool = False,
63
  ) -> Dict:
64
  """
65
  梯度归因:logits 对输入 embedding 的梯度。
 
149
  attention_mask=attention_mask,
150
  output_attentions=False,
151
  )
152
+ # 显式同步,确保已完成,progress_callback 时机准确
153
  if device.type == "cuda":
154
  torch.cuda.synchronize(device)
155
  elif device.type == "mps":
156
  torch.mps.synchronize()
157
+
 
158
  logits = outputs.logits[:, -1, :]
159
  topk_vals, topk_ids = torch.topk(logits, LOGITS_GRADIENT_TOPK, dim=-1)
160
  probs = torch.softmax(logits, dim=-1)
 
164
 
165
  neg_token = "无" if submode == "fill_blank" else "0"
166
  neg_id = tokenizer.encode(neg_token, add_special_tokens=False)[0]
167
+ # 全文匹配度:count/match_score 用 1-P("0"),fill_blank 用 1-P("无")
168
+ p_neg = probs[0, neg_id].item()
169
+ full_match_degree = round(1.0 - p_neg, 4)
170
+
171
+ if full_match_degree_only and submode == "count":
172
+ return {
173
+ "model": get_semantic_model_display_name(),
174
+ "token_attention": [],
175
+ "full_match_degree": full_match_degree,
176
+ }
177
+
178
+ if progress_callback:
179
+ progress_callback(3, TOTAL_STEPS, "backward", None)
180
  # 归因目标:raw logits(不经过 softmax backward),避免饱和与竞争污染。
181
  if submode == "count" or submode == "fill_blank":
182
  # count/fill_blank 均用 top-10、按概率加权 Σ pᵢ·zᵢ,并排除 neg_token(0/无)以保持梯度方向与「相关」一致。
 
194
  else:
195
  raise ValueError(f"未知 submode: {submode}")
196
  target_logit.backward()
 
197
  grad = embeds.grad
198
  if grad is None:
199
  raise RuntimeError("logits_gradient: 梯度未回传,可能模型不支持(如 int8 量化)")
200
 
201
+ # 显式同步,确保已完成,progress_callback 时机准确
202
+ if device.type == "cuda":
203
+ torch.cuda.synchronize(device)
204
+ elif device.type == "mps":
205
+ torch.mps.synchronize()
206
  if progress_callback:
207
  progress_callback(4, TOTAL_STEPS, "processing", None)
208
+
209
  text_token_end = len(offset_mapping)
210
  # 在 GPU 上一次性计算所有 token 的 ‖∇f‖,避免循环内 .item() 导致 500 次 GPU→CPU 同步
211
  grad_slice = grad[0, prompt_end:text_token_end].float()
212
  norms = grad_slice.norm(dim=-1).cpu().tolist()
213
  token_attention: List[Dict] = []
214
+ nan_count = 0
215
  for i in range(prompt_end, text_token_end):
216
  s, e = offset_mapping[i]
217
  if s >= text_start_char and e <= text_end_char:
218
  s_rel, e_rel = s - text_start_char, e - text_start_char
219
+ score = norms[i - prompt_end]
220
+ if not math.isfinite(score):
221
+ score = 0.0
222
+ nan_count += 1
223
+ token_attention.append({"offset": [s_rel, e_rel], "raw": truncated_text[s_rel:e_rel], "score": score})
224
+ if nan_count > 0:
225
+ print(f"⚠️ token_attention 中有 {nan_count} 个 score 为 NaN/Inf,已替换为 0。")
226
 
 
227
  out = {
228
+ "model": get_semantic_model_display_name(),
229
  "token_attention": token_attention,
 
 
230
  "full_match_degree": full_match_degree,
231
  }
232
  if debug_info:
233
+ out["debug_info"] = {"abbrev": abbrev, "topk_tokens": topk_tokens, "topk_probs": topk_probs}
 
234
  return out
235
  finally:
236
  if use_gc:
 
245
  submode_override: Optional[str] = None,
246
  progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
247
  debug_info: bool = False,
248
+ full_match_degree_only: bool = False,
249
  ) -> Dict:
250
  """
251
  分析原文各 token 与 query 的相关度(使用 logits_gradient 梯度归因)。
 
255
  text: 原文
256
  submode_override: 评估时可选覆盖子模式(count/match_score/fill_blank)
257
  progress_callback: 可选进度回调 (step, total_steps, stage, percentage)
258
+ debug_info: 为 True 时返回 debug_info 对象(abbrev 推理原文缩写、topk_tokens、topk_probs)
259
 
260
  Returns:
261
+ {"model", "token_attention", "full_match_degree"};debug_info=True 时包含 debug_info 对象
262
  """
263
  tokenizer, model, device = ensure_semantic_loaded()
264
  return _analyze_logits_gradient(
 
266
  submode_override=submode_override,
267
  progress_callback=progress_callback,
268
  debug_info=debug_info,
269
+ full_match_degree_only=full_match_degree_only,
270
  )
client/src/css/_responsive.scss CHANGED
@@ -246,9 +246,10 @@
246
  }
247
 
248
  // 调整浮动内容宽度,不使用自己的滚动条
 
249
  .floating_content {
250
  @include full-width-adaptive;
251
- @include mobile-scroll-container;
252
  }
253
 
254
  // 调整统计图容器
 
246
  }
247
 
248
  // 调整浮动内容宽度,不使用自己的滚动条
249
+ // 使用 overflow: visible 避免下拉框(如查询历史)被裁剪;overflow-x: hidden 会令 overflow-y 被计算为 auto 从而产生裁剪
250
  .floating_content {
251
  @include full-width-adaptive;
252
+ overflow: visible;
253
  }
254
 
255
  // 调整统计图容器
client/src/css/_semantic-analysis.scss CHANGED
@@ -48,6 +48,16 @@
48
  align-items: center;
49
  gap: 6px;
50
 
 
 
 
 
 
 
 
 
 
 
51
  .semantic-submode-label {
52
  font-size: 9pt;
53
  color: var(--text-muted);
@@ -199,9 +209,13 @@
199
  font-family: ui-monospace, "Cascadia Code", "Source Code Pro", Menlo, Consolas, "DejaVu Sans Mono", monospace;
200
  word-break: break-word;
201
  overflow-wrap: break-word;
202
- // 颜色加重
203
  color: var(--text-color, #333);
204
  }
 
 
 
 
 
205
  }
206
 
207
  // 语义分析阶段级进度(与 analyze-progress 同风格,绝对定位不占布局空间)
 
48
  align-items: center;
49
  gap: 6px;
50
 
51
+ .semantic-submode-group {
52
+ display: flex;
53
+ align-items: center;
54
+ gap: 6px;
55
+
56
+ &.semantic-submode-group-right {
57
+ margin-left: auto;
58
+ }
59
+ }
60
+
61
  .semantic-submode-label {
62
  font-size: 9pt;
63
  color: var(--text-muted);
 
209
  font-family: ui-monospace, "Cascadia Code", "Source Code Pro", Menlo, Consolas, "DejaVu Sans Mono", monospace;
210
  word-break: break-word;
211
  overflow-wrap: break-word;
 
212
  color: var(--text-color, #333);
213
  }
214
+
215
+ // TopK 图表:与 tooltip 一致,宽度更大,与上方打印区留间隔
216
+ .semantic-debug-topk-chart {
217
+ margin-top: 20px;
218
+ }
219
  }
220
 
221
  // 语义分析阶段级进度(与 analyze-progress 同风格,绝对定位不占布局空间)
client/src/css/start.scss CHANGED
@@ -565,7 +565,6 @@ select {
565
  // 白天模式使用默认字重(400),夜间模式使用 Light 字重(300)
566
  background-color: var(--text-area-bg); // 使用CSS变量控制背景色
567
  color: var(--text-color); // 使用CSS变量控制文字颜色
568
- transition: background-color 0.3s ease, color 0.3s ease; // 平滑过渡
569
  // 确保至少350px高度以容纳tooltip(内容不足时生效)
570
  min-height: 350px;
571
  // 不设置固定padding-bottom,让内容自然决定高度
 
565
  // 白天模式使用默认字重(400),夜间模式使用 Light 字重(300)
566
  background-color: var(--text-area-bg); // 使用CSS变量控制背景色
567
  color: var(--text-color); // 使用CSS变量控制文字颜色
 
568
  // 确保至少350px高度以容纳tooltip(内容不足时生效)
569
  min-height: 350px;
570
  // 不设置固定padding-bottom,让内容自然决定高度
client/src/index.html CHANGED
@@ -150,13 +150,23 @@
150
  </div>
151
  </div>
152
  <div class="semantic-submode-row">
153
- <label class="semantic-submode-label" for="semantic_submode_select">submode: </label>
154
- <select id="semantic_submode_select" class="semantic-submode-select">
155
- <option value="count">count</option>
156
- <option value="match_score">match_score</option>
157
- <option value="fill_blank">fill_blank</option>
158
- <option value="hybrid">hybrid</option>
159
- </select>
 
 
 
 
 
 
 
 
 
 
160
  </div>
161
  </div>
162
  </section>
@@ -164,9 +174,9 @@
164
 
165
  <section id="all_result" class="results-section">
166
  <div id="stats" class="stats-container">
167
- <div id="semantic_score_histogram_item" class="histogram-item" style="display: none;">
168
- <div id="semantic_score_histogram_title"></div>
169
- <svg id="stats_semantic_score"></svg>
170
  </div>
171
  <div id="token_histogram_item" class="histogram-item" style="display: none;">
172
  <div id="token_histogram_title"></div>
 
150
  </div>
151
  </div>
152
  <div class="semantic-submode-row">
153
+ <span class="semantic-submode-group">
154
+ <label class="semantic-submode-label" for="semantic_submode_select">submode: </label>
155
+ <select id="semantic_submode_select" class="semantic-submode-select">
156
+ <option value="count">count</option>
157
+ <option value="match_score">match_score</option>
158
+ <option value="fill_blank">fill_blank</option>
159
+ <option value="hybrid">hybrid</option>
160
+ </select>
161
+ </span>
162
+ <span class="semantic-submode-group semantic-submode-group-right">
163
+ <label class="semantic-submode-label" for="semantic_color_source_select">color source: </label>
164
+ <select id="semantic_color_source_select" class="semantic-submode-select">
165
+ <option value="raw_score_normed" selected>raw score normed</option>
166
+ <option value="signal_probability">signal probability</option>
167
+ <option value="pw_score">pw score</option>
168
+ </select>
169
+ </span>
170
  </div>
171
  </div>
172
  </section>
 
174
 
175
  <section id="all_result" class="results-section">
176
  <div id="stats" class="stats-container">
177
+ <div id="raw_score_normed_histogram_item" class="histogram-item" style="display: none;">
178
+ <div id="raw_score_normed_histogram_title"></div>
179
+ <svg id="stats_raw_score_normed"></svg>
180
  </div>
181
  <div id="token_histogram_item" class="histogram-item" style="display: none;">
182
  <div id="token_histogram_title"></div>
client/src/package.json CHANGED
@@ -5,6 +5,10 @@
5
  "main": "webpack.config.js",
6
  "scripts": {
7
  "test": "echo \"Error: no test specified\" && exit 1",
 
 
 
 
8
  "prebuild": "node scripts/updateIntroHTML.js",
9
  "prebuild:dev": "node scripts/updateIntroHTML.js",
10
  "wp": "npm run build:dev",
 
5
  "main": "webpack.config.js",
6
  "scripts": {
7
  "test": "echo \"Error: no test specified\" && exit 1",
8
+ "test:lognormal": "npx tsx ts/utils/visualizationUpdater.lognormal.test.ts",
9
+ "test:lognormal:tau": "npx tsx ts/utils/lognormalFit.tauBoundary.test.ts",
10
+ "test:signalThreshold": "npx tsx ts/utils/signalThresholdDetector.1log.test.ts",
11
+ "demo:histogramCdf": "npx tsx ts/utils/histogramCdfDemoData.ts",
12
  "prebuild": "node scripts/updateIntroHTML.js",
13
  "prebuild:dev": "node scripts/updateIntroHTML.js",
14
  "wp": "npm run build:dev",
client/src/ts/api/GLTR_API.ts CHANGED
@@ -5,6 +5,7 @@ Attn API and Types
5
  import * as d3 from "d3";
6
  import URLHandler from "../utils/URLHandler";
7
  import {cleanSpecials} from "../utils/Util";
 
8
  import {AnalyzeResponse, AnalyzeResult, TokenWithOffset} from "./generatedSchemas";
9
 
10
  export type FrontendToken = TokenWithOffset & { bpe_merged?: boolean };
@@ -254,28 +255,38 @@ export class TextAnalysisAPI {
254
  query: string,
255
  text: string,
256
  onProgress?: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
257
- submode?: string
258
- ): Promise<{ success: boolean; model?: string; token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>; topk_tokens?: string[]; topk_probs?: number[]; full_match_degree?: number; message?: string }> {
 
259
  if (submode === 'hybrid') {
260
- const r1 = await this.analyzeSemantic(query, text, onProgress, 'count');
261
  const r2 = await this.analyzeSemantic(query, text, onProgress, 'fill_blank');
262
- return { ...r2, full_match_degree: r1.full_match_degree };
 
263
  }
264
- if (onProgress) {
265
- return this.analyzeSemanticWithProgress(query, text, onProgress, submode);
266
- }
267
- const payload: Record<string, unknown> = { query, text, debug_info: true };
268
- if (submode) payload.submode = submode;
269
- return d3.json(this.baseURL + '/api/analyze-semantic', {
270
- method: 'POST',
271
- body: JSON.stringify(payload),
272
- headers: this.getHeaders()
273
- }).then((response: any) => {
 
 
 
 
 
274
  if (response && response.success === false) {
275
  throw new Error(response.message || 'Semantic analysis failed');
276
  }
277
  return response;
278
- });
 
 
 
279
  }
280
 
281
  /**
@@ -285,10 +296,12 @@ export class TextAnalysisAPI {
285
  query: string,
286
  text: string,
287
  onProgress: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
288
- submode?: string
289
- ): Promise<{ success: boolean; model?: string; token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>; topk_tokens?: string[]; topk_probs?: number[]; full_match_degree?: number; message?: string }> {
 
290
  const payload: Record<string, unknown> = { query, text, stream: true, debug_info: true };
291
  if (submode) payload.submode = submode;
 
292
  return this.fetchSSEStream(
293
  '/api/analyze-semantic',
294
  payload,
@@ -386,7 +399,10 @@ export class TextAnalysisAPI {
386
  reject(new Error(parsed.message || errorMessage));
387
  }
388
  } catch (e) {
389
- console.warn('Failed to parse SSE message:', e, data);
 
 
 
390
  }
391
  }
392
 
 
5
  import * as d3 from "d3";
6
  import URLHandler from "../utils/URLHandler";
7
  import {cleanSpecials} from "../utils/Util";
8
+ import * as semanticResultCache from "../utils/semanticResultCache";
9
  import {AnalyzeResponse, AnalyzeResult, TokenWithOffset} from "./generatedSchemas";
10
 
11
  export type FrontendToken = TokenWithOffset & { bpe_merged?: boolean };
 
255
  query: string,
256
  text: string,
257
  onProgress?: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
258
+ submode?: string,
259
+ fullMatchDegreeOnly?: boolean
260
+ ): Promise<{ success: boolean; model?: string; token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>; debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] }; full_match_degree?: number; message?: string }> {
261
  if (submode === 'hybrid') {
262
+ const r1 = await this.analyzeSemantic(query, text, onProgress, 'count', true);
263
  const r2 = await this.analyzeSemantic(query, text, onProgress, 'fill_blank');
264
+ const fromCache = (r1 as { __fromCache?: boolean }).__fromCache && (r2 as { __fromCache?: boolean }).__fromCache;
265
+ return { ...r2, full_match_degree: r1.full_match_degree, __fromCache: fromCache } as typeof r2 & { __fromCache?: boolean };
266
  }
267
+ const cacheSubmode = submode;
268
+ const cached = semanticResultCache.get(text, query, cacheSubmode);
269
+ if (cached && (fullMatchDegreeOnly || cached.token_attention)) return { ...cached, __fromCache: true } as typeof cached & { __fromCache?: boolean };
270
+ const doRequest = async (): Promise<typeof cached> => {
271
+ if (onProgress) {
272
+ return this.analyzeSemanticWithProgress(query, text, onProgress, submode, fullMatchDegreeOnly);
273
+ }
274
+ const payload: Record<string, unknown> = { query, text, debug_info: true };
275
+ if (submode) payload.submode = submode;
276
+ if (fullMatchDegreeOnly) payload.full_match_degree_only = true;
277
+ const response = await d3.json(this.baseURL + '/api/analyze-semantic', {
278
+ method: 'POST',
279
+ body: JSON.stringify(payload),
280
+ headers: this.getHeaders()
281
+ }) as any;
282
  if (response && response.success === false) {
283
  throw new Error(response.message || 'Semantic analysis failed');
284
  }
285
  return response;
286
+ };
287
+ const res = await doRequest();
288
+ if (res?.success) semanticResultCache.set(text, query, res, cacheSubmode);
289
+ return res;
290
  }
291
 
292
  /**
 
296
  query: string,
297
  text: string,
298
  onProgress: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
299
+ submode?: string,
300
+ fullMatchDegreeOnly?: boolean
301
+ ): Promise<{ success: boolean; model?: string; token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>; debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] }; full_match_degree?: number; message?: string }> {
302
  const payload: Record<string, unknown> = { query, text, stream: true, debug_info: true };
303
  if (submode) payload.submode = submode;
304
+ if (fullMatchDegreeOnly) payload.full_match_degree_only = true;
305
  return this.fetchSSEStream(
306
  '/api/analyze-semantic',
307
  payload,
 
399
  reject(new Error(parsed.message || errorMessage));
400
  }
401
  } catch (e) {
402
+ const msg = e instanceof SyntaxError
403
+ ? `SSE 数据解析失败:${e.message}(可能是后端返回了无效 JSON,如 NaN)`
404
+ : `SSE 消息处理失败:${e instanceof Error ? e.message : String(e)}`;
405
+ reject(new Error(msg));
406
  }
407
  }
408
 
client/src/ts/appInitializer.ts CHANGED
@@ -6,7 +6,7 @@
6
  import * as d3 from 'd3';
7
  import { SimpleEventHandler } from './utils/SimpleEventHandler';
8
  import { TextAnalysisAPI } from './api/GLTR_API';
9
- import { getTokenSurprisalColor, getByteSurprisalColor } from './utils/SurprisalColorConfig';
10
 
11
  /**
12
  * 公共初始化返回对象
@@ -33,8 +33,8 @@ export function initializeCommonApp(apiPrefix: string = '', element?: Element):
33
  return {
34
  eventHandler: new SimpleEventHandler(targetElement),
35
  api: new TextAnalysisAPI(apiPrefix),
36
- tokenSurprisalColorScale: getTokenSurprisalColor,
37
- byteSurprisalColorScale: getByteSurprisalColor,
38
  totalSurprisalFormat: (n: number | null) => n !== null && Number.isFinite(n) ? format(n) : String(n)
39
  };
40
  }
 
6
  import * as d3 from 'd3';
7
  import { SimpleEventHandler } from './utils/SimpleEventHandler';
8
  import { TextAnalysisAPI } from './api/GLTR_API';
9
+ import { getTokenSurprisalColor, getByteSurprisalColor, HISTOGRAM_MIN_ALPHA } from './utils/SurprisalColorConfig';
10
 
11
  /**
12
  * 公共初始化返回对象
 
33
  return {
34
  eventHandler: new SimpleEventHandler(targetElement),
35
  api: new TextAnalysisAPI(apiPrefix),
36
+ tokenSurprisalColorScale: (v) => getTokenSurprisalColor(v, HISTOGRAM_MIN_ALPHA),
37
+ byteSurprisalColorScale: (v) => getByteSurprisalColor(v, 1, HISTOGRAM_MIN_ALPHA),
38
  totalSurprisalFormat: (n: number | null) => n !== null && Number.isFinite(n) ? format(n) : String(n)
39
  };
40
  }
client/src/ts/compare.ts CHANGED
@@ -348,6 +348,7 @@ window.onload = () => {
348
  colorScale: tokenSurprisalColorScale,
349
  averageValue: stats.tokenAverage ?? undefined,
350
  p90Value: stats.tokenP90 ?? undefined,
 
351
  });
352
 
353
  // 更新列视图中 token surprisal histogram 的标题文本
@@ -1270,11 +1271,11 @@ window.onload = () => {
1270
  // 初始化主题管理器(在所有函数定义之后)
1271
  const themeManager = initThemeManager({
1272
  onThemeChange: () => {
1273
- // 主题切换时重新渲染所有图表
1274
  columnsData.forEach((col) => {
1275
  if (col.data && col.stats) {
1276
  renderStatsForColumn(col.id, col);
1277
  }
 
1278
  });
1279
  }
1280
  });
 
348
  colorScale: tokenSurprisalColorScale,
349
  averageValue: stats.tokenAverage ?? undefined,
350
  p90Value: stats.tokenP90 ?? undefined,
351
+ p90Label: tokenHistogramConfig.averageLabel,
352
  });
353
 
354
  // 更新列视图中 token surprisal histogram 的标题文本
 
1271
  // 初始化主题管理器(在所有函数定义之后)
1272
  const themeManager = initThemeManager({
1273
  onThemeChange: () => {
 
1274
  columnsData.forEach((col) => {
1275
  if (col.data && col.stats) {
1276
  renderStatsForColumn(col.id, col);
1277
  }
1278
+ requestAnimationFrame(() => col.lmfInstance?.reRenderCurrent());
1279
  });
1280
  }
1281
  });
client/src/ts/controllers/highlightController.ts CHANGED
@@ -3,14 +3,15 @@ import type { GLTR_Text_Box } from '../vis/GLTR_Text_Box';
3
  import type { Histogram } from '../vis/Histogram';
4
  import type { HistogramBinClickEvent } from '../vis/Histogram';
5
  import type { FrontendAnalyzeResult } from '../api/GLTR_API';
6
- import { calculateHighlights, type HistogramType } from '../utils/highlightUtils';
 
 
7
 
8
  export type HighlightControllerOptions = {
9
  stats_frac: Histogram;
10
- /** Semantic score histogram(可选) */
11
- stats_semantic_score?: Histogram;
12
  lmf: GLTR_Text_Box;
13
- currentData: { result: FrontendAnalyzeResult } | null;
14
  };
15
 
16
  export class HighlightController {
@@ -25,7 +26,7 @@ export class HighlightController {
25
  */
26
  public clearHighlights(): void {
27
  this.options.stats_frac.clearSelection();
28
- this.options.stats_semantic_score?.clearSelection();
29
  this.options.lmf.clearHighlight();
30
  }
31
 
@@ -43,32 +44,32 @@ export class HighlightController {
43
  }
44
 
45
  const { x0, x1, binIndex, no_bins, source } = ev;
46
- const data = currentData.result;
47
 
48
- // 首页:根据直方图 source 区分类型
49
  let histogramType: HistogramType = 'token';
50
- if (source === 'stats_semantic_score') {
51
- histogramType = 'semantic';
52
- }
53
 
54
- // 同一视图内仅保持一个直方图选中状态
55
- if (histogramType === 'semantic') {
56
  this.options.stats_frac.clearSelection();
57
  } else {
58
- this.options.stats_semantic_score?.clearSelection();
59
  }
60
 
61
- const { indices, style } = calculateHighlights(histogramType, x0, x1, binIndex, no_bins, data);
62
 
63
  this.options.lmf.setHighlightedIndices(indices, style);
64
  }
65
 
 
 
 
 
 
66
  /**
67
  * 更新当前数据(当数据变化时调用)
68
  */
69
- public updateCurrentData(currentData: { result: FrontendAnalyzeResult } | null): void {
70
- // 创建一个新对象来更新,保持 options 对象的引用不变
71
- (this.options as any).currentData = currentData;
72
  }
73
  }
74
 
 
3
  import type { Histogram } from '../vis/Histogram';
4
  import type { HistogramBinClickEvent } from '../vis/Histogram';
5
  import type { FrontendAnalyzeResult } from '../api/GLTR_API';
6
+ import { calculateHighlights, type HistogramType, type HighlightData } from '../utils/highlightUtils';
7
+
8
+ export type HighlightCurrentData = { result: FrontendAnalyzeResult; signalProbs?: number[]; pPwValues?: number[]; pwScores?: number[] } | null;
9
 
10
  export type HighlightControllerOptions = {
11
  stats_frac: Histogram;
12
+ stats_raw_score_normed?: Histogram;
 
13
  lmf: GLTR_Text_Box;
14
+ currentData: HighlightCurrentData;
15
  };
16
 
17
  export class HighlightController {
 
26
  */
27
  public clearHighlights(): void {
28
  this.options.stats_frac.clearSelection();
29
+ this.options.stats_raw_score_normed?.clearSelection();
30
  this.options.lmf.clearHighlight();
31
  }
32
 
 
44
  }
45
 
46
  const { x0, x1, binIndex, no_bins, source } = ev;
47
+ const highlightData: HighlightData = { ...currentData.result, signalProbs: currentData.signalProbs, pPwValues: currentData.pPwValues, pwScores: currentData.pwScores };
48
 
 
49
  let histogramType: HistogramType = 'token';
50
+ if (source === 'stats_raw_score_normed') histogramType = 'raw_score_normed';
 
 
51
 
52
+ if (histogramType === 'raw_score_normed') {
 
53
  this.options.stats_frac.clearSelection();
54
  } else {
55
+ this.options.stats_raw_score_normed?.clearSelection();
56
  }
57
 
58
+ const { indices, style } = calculateHighlights(histogramType, x0, x1, binIndex, no_bins, highlightData);
59
 
60
  this.options.lmf.setHighlightedIndices(indices, style);
61
  }
62
 
63
+ /** 获取当前高亮数据 */
64
+ public getCurrentData(): HighlightCurrentData {
65
+ return (this.options as { currentData: HighlightCurrentData }).currentData;
66
+ }
67
+
68
  /**
69
  * 更新当前数据(当数据变化时调用)
70
  */
71
+ public updateCurrentData(currentData: HighlightCurrentData): void {
72
+ (this.options as { currentData: HighlightCurrentData }).currentData = currentData;
 
73
  }
74
  }
75
 
client/src/ts/lang/translations.ts CHANGED
@@ -188,12 +188,18 @@ export const translations: Translations = {
188
  'information per token histogram': 'token信息量直方图',
189
  'information per token progress': 'token信息量进度图',
190
  'token index': 'token索引',
191
- 'semantic score histogram': '语义得分直方图',
192
- 'score': '得分',
 
 
 
193
 
194
  // ========== Tooltip 内容 ==========
195
  'information density:': '信息密度:',
196
- 'match score:': '匹配度:',
 
 
 
197
  'Match: {0}%': '匹配度: {0}%',
198
  'raw score:': '原始分数:',
199
  'prob:': '概率:',
 
188
  'information per token histogram': 'token信息量直方图',
189
  'information per token progress': 'token信息量进度图',
190
  'token index': 'token索引',
191
+ 'raw score normed histogram': '归一化原始直方图',
192
+ 'semantic signal prob histogram': '语义信号概率直方图',
193
+ 'signal prob': 'signal概率',
194
+ 'signal ratio': '信号比',
195
+ 'pw score': 'pw 分数',
196
 
197
  // ========== Tooltip 内容 ==========
198
  'information density:': '信息密度:',
199
+ 'pw score:': 'pw 分数:',
200
+ 'signal prob:': 'signal概率:',
201
+ 'signal probability:': '信号概率:',
202
+ 'raw score normed:': '归一化原始分数:',
203
  'Match: {0}%': '匹配度: {0}%',
204
  'raw score:': '原始分数:',
205
  'prob:': '概率:',
client/src/ts/start.ts CHANGED
@@ -47,6 +47,7 @@ import { isValidUrl, extractUrl, isPureUrl } from './utils/urlUtils';
47
  import { AdminManager } from './utils/adminManager';
48
  import { SettingsMenuManager } from './utils/settingsMenuManager';
49
  import { saveHistory, initQueryHistoryDropdown } from './utils/queryHistory';
 
50
  import { playAnalysisCompleteSound } from './utils/soundNotification';
51
 
52
  const current = {
@@ -170,7 +171,7 @@ window.onload = () => {
170
  width: 400, // 宽度
171
  height: 200 // 增加高度从默认150px到200px
172
  });
173
- const stats_semantic_score = new Histogram(d3.select('#stats_semantic_score'), eventHandler, {
174
  width: 400,
175
  height: 200
176
  });
@@ -202,7 +203,7 @@ window.onload = () => {
202
  // 创建高亮控制器
203
  const highlightController = new HighlightController({
204
  stats_frac,
205
- stats_semantic_score,
206
  lmf,
207
  currentData: null
208
  });
@@ -217,7 +218,7 @@ window.onload = () => {
217
  highlightController,
218
  textInputController,
219
  stats_frac,
220
- stats_semantic_score,
221
  stats_surprisal_progress,
222
  appStateManager,
223
  surprisalColorScale: tokenSurprisalColorScale as d3.ScaleSequential<string>
@@ -225,7 +226,9 @@ window.onload = () => {
225
 
226
  // 初始化主题管理器(在设置菜单中)
227
  const themeManager = initThemeManager({
228
- onThemeChange: () => visualizationUpdater.rerenderHistograms()
 
 
229
  }, '#theme_dropdown');
230
 
231
  // 初始化语言管理器(在设置菜单中)
@@ -276,6 +279,47 @@ window.onload = () => {
276
  // Semantic analysis UI 完全由配置决定,初始化时同步
277
  visualizationUpdater.syncSemanticUiFromConfig();
278
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
279
  // *****************************
280
  // ***** demo stuff *****
281
  // *****************************
@@ -805,7 +849,8 @@ window.onload = () => {
805
  visualizationUpdater.handleSemanticResponse(res, text);
806
  appStateManager.setLastSearchedQuery(query);
807
  saveHistory(query);
808
- playAnalysisCompleteSound();
 
809
  const md = res?.full_match_degree;
810
  const mdEl = d3.select('#semantic_match_degree');
811
  if (md != null && typeof md === 'number') {
@@ -838,7 +883,14 @@ window.onload = () => {
838
  input: semanticSearchInput,
839
  dropdownId: 'semantic_search_history_dropdown',
840
  onSelect: () => appStateManager.updateButtonStates(),
841
- onHistorySelect: runSemanticSearch
 
 
 
 
 
 
 
842
  });
843
 
844
  // Save按钮点击事件(使用 serverDemoController)
 
47
  import { AdminManager } from './utils/adminManager';
48
  import { SettingsMenuManager } from './utils/settingsMenuManager';
49
  import { saveHistory, initQueryHistoryDropdown } from './utils/queryHistory';
50
+ import { removeByQuery as removeSemanticCacheByQuery } from './utils/semanticResultCache';
51
  import { playAnalysisCompleteSound } from './utils/soundNotification';
52
 
53
  const current = {
 
171
  width: 400, // 宽度
172
  height: 200 // 增加高度从默认150px到200px
173
  });
174
+ const stats_raw_score_normed = new Histogram(d3.select('#stats_raw_score_normed'), eventHandler, {
175
  width: 400,
176
  height: 200
177
  });
 
203
  // 创建高亮控制器
204
  const highlightController = new HighlightController({
205
  stats_frac,
206
+ stats_raw_score_normed,
207
  lmf,
208
  currentData: null
209
  });
 
218
  highlightController,
219
  textInputController,
220
  stats_frac,
221
+ stats_raw_score_normed,
222
  stats_surprisal_progress,
223
  appStateManager,
224
  surprisalColorScale: tokenSurprisalColorScale as d3.ScaleSequential<string>
 
226
 
227
  // 初始化主题管理器(在设置菜单中)
228
  const themeManager = initThemeManager({
229
+ onThemeChange: () => {
230
+ visualizationUpdater.rerenderOnThemeChange();
231
+ }
232
  }, '#theme_dropdown');
233
 
234
  // 初始化语言管理器(在设置菜单中)
 
279
  // Semantic analysis UI 完全由配置决定,初始化时同步
280
  visualizationUpdater.syncSemanticUiFromConfig();
281
 
282
// Semantic analysis: restore the query input and the option selects from URL parameters
// (so they survive a page reload).
const initSemanticFromUrl = () => {
    const urlParams = URLHandler.parameters;
    // Each binding: [URL parameter name, target element id, allowed values (null = any string)].
    const bindings: Array<[string, string, string[] | null]> = [
        ['semantic_query', 'semantic_search_input', null],
        ['semantic_submode', 'semantic_submode_select', ['count', 'match_score', 'fill_blank', 'hybrid']],
        ['semantic_color_source', 'semantic_color_source_select', ['raw_score_normed', 'signal_probability', 'pw_score']],
    ];
    for (const [paramName, elementId, allowedValues] of bindings) {
        const value = urlParams[paramName];
        if (typeof value !== 'string') continue;
        // Select values are only applied when they are one of the known options.
        if (allowedValues !== null && !allowedValues.includes(value)) continue;
        const target = document.getElementById(elementId) as HTMLInputElement | HTMLSelectElement | null;
        if (target) target.value = value;
    }
};
303
+ initSemanticFromUrl();
304
+
305
// Semantic analysis: mirror the query and the selected options into the URL
// (so they survive a page reload).
const syncSemanticToUrl = () => {
    // Trimmed value of a form control, or '' when the element is missing.
    const readTrimmed = (elementId: string): string => {
        const control = document.getElementById(elementId) as HTMLInputElement | HTMLSelectElement | null;
        return control?.value?.trim() ?? '';
    };
    const fields: Array<[string, string]> = [
        ['semantic_query', readTrimmed('semantic_search_input')],
        ['semantic_submode', readTrimmed('semantic_submode_select')],
        ['semantic_color_source', readTrimmed('semantic_color_source_select')],
    ];
    const currentParams = URLHandler.parameters;
    for (const [paramName, value] of fields) {
        // Empty values are removed from the URL rather than written as blank parameters.
        if (value) currentParams[paramName] = value;
        else delete currentParams[paramName];
    }
    URLHandler.updateUrl(currentParams, false);
};
322
+
323
  // *****************************
324
  // ***** demo stuff *****
325
  // *****************************
 
849
  visualizationUpdater.handleSemanticResponse(res, text);
850
  appStateManager.setLastSearchedQuery(query);
851
  saveHistory(query);
852
+ syncSemanticToUrl();
853
+ if (!(res as { __fromCache?: boolean }).__fromCache) playAnalysisCompleteSound();
854
  const md = res?.full_match_degree;
855
  const mdEl = d3.select('#semantic_match_degree');
856
  if (md != null && typeof md === 'number') {
 
883
  input: semanticSearchInput,
884
  dropdownId: 'semantic_search_history_dropdown',
885
  onSelect: () => appStateManager.updateButtonStates(),
886
+ onHistorySelect: runSemanticSearch,
887
+ onRemove: removeSemanticCacheByQuery
888
+ });
889
+ semanticSearchInput?.addEventListener('blur', syncSemanticToUrl);
890
+ document.getElementById('semantic_submode_select')?.addEventListener('change', syncSemanticToUrl);
891
+ document.getElementById('semantic_color_source_select')?.addEventListener('change', () => {
892
+ visualizationUpdater.updateSemanticColorSource();
893
+ syncSemanticToUrl();
894
  });
895
 
896
  // Save按钮点击事件(使用 serverDemoController)
client/src/ts/utils/SurprisalColorConfig.ts CHANGED
@@ -26,15 +26,18 @@ export const MINIMAP_COLOR_FACTOR = 1.3;
26
  const SURPRISAL_RED_RGB = "255, 71, 64";
27
  const SURPRISAL_MAX_ALPHA = 0.7;
28
 
 
 
 
29
  /**
30
  * 根据归一化值获取对应的颜色(输入值应在[0,1]区间)
31
  * @param normalizedValue 归一化后的值,范围[0,1]
32
- * @returns 颜色字符串(rgba格式从透明到红色)
33
  */
34
-
35
- export function getSurprisalColorNormalized(normalizedValue: number): string {
36
  const clampedValue = Math.max(0, Math.min(1, normalizedValue));
37
- const alpha = clampedValue * SURPRISAL_MAX_ALPHA;
 
38
  return `rgba(${SURPRISAL_RED_RGB}, ${alpha})`;
39
  }
40
 
@@ -57,32 +60,33 @@ function normalizeTo_01(value: number, maxValue: number): number {
57
  /**
58
  * 根据token惊讶度值获取对应的颜色(线性映射,不取整)
59
  * @param surprisal token惊讶度值,范围[0, TOKEN_SURPRISAL_MAX]会被映射到[0, 1],超出范围会被截断
60
- * @returns 颜色字符串(rgba格式)
61
  */
62
- export function getTokenSurprisalColor(surprisal: number): string {
63
  const normalizedValue = normalizeTo_01(surprisal, TOKEN_SURPRISAL_MAX);
64
- return getSurprisalColorNormalized(normalizedValue);
65
  }
66
 
67
  /**
68
  * 根据byte密度惊讶度值获取对应的颜色(线性映射,不取整)
69
  * @param byteSurprisal byte密度惊讶度值,范围[0, BYTE_SURPRISAL_MAX]会被映射到[0, 1],超出范围会被截断
70
  * @param colorFactor 颜色因子,用于调整颜色强度,目前主要为了minimap显示更明显(平均后byte surprisal密度会过小,所以需要放大)。默认为1
71
- * @returns 颜色字符串(rgba格式)
72
  */
73
- export function getByteSurprisalColor(byteSurprisal: number, colorFactor: number = 1): string {
74
  const normalizedValue = normalizeTo_01(byteSurprisal * colorFactor, BYTE_SURPRISAL_MAX);
75
- return getSurprisalColorNormalized(normalizedValue);
76
  }
77
 
78
  /**
79
- * 根据 matchScore 获取颜色(用于语义匹配度染色)
80
- * @param matchScore 归一化分数,范围 [0, 1]
 
81
  */
82
- export function getSemanticSimilarityColor(matchScore: number): string {
83
- if (!isFiniteNumber(matchScore)) return 'transparent';
84
- const normalizedValue = normalizeTo_01(matchScore, SEMANTIC_SIMILARITY_MAX);
85
- return getSurprisalColorNormalized(normalizedValue);
86
  }
87
 
88
  // ==========================================
 
26
  const SURPRISAL_RED_RGB = "255, 71, 64";
27
  const SURPRISAL_MAX_ALPHA = 0.7;
28
 
29
+ /** 直方图渐变最浅色 alpha 下限(10% 区间),供直方图使用方配置 */
30
+ export const HISTOGRAM_MIN_ALPHA = 0.1 * SURPRISAL_MAX_ALPHA;
31
+
32
  /**
33
  * 根据归一化值获取对应的颜色(输入值应在[0,1]区间)
34
  * @param normalizedValue 归一化后的值,范围[0,1]
35
+ * @param minAlpha alpha 下限默认不限制
36
  */
37
+ export function getSurprisalColorNormalized(normalizedValue: number, minAlpha?: number): string {
 
38
  const clampedValue = Math.max(0, Math.min(1, normalizedValue));
39
+ let alpha = clampedValue * SURPRISAL_MAX_ALPHA;
40
+ if (minAlpha != null) alpha = Math.max(minAlpha, alpha);
41
  return `rgba(${SURPRISAL_RED_RGB}, ${alpha})`;
42
  }
43
 
 
60
  /**
61
  * 根据token惊讶度值获取对应的颜色(线性映射,不取整)
62
  * @param surprisal token惊讶度值,范围[0, TOKEN_SURPRISAL_MAX]会被映射到[0, 1],超出范围会被截断
63
+ * @param minAlpha alpha 下限,默认不限制
64
  */
65
+ export function getTokenSurprisalColor(surprisal: number, minAlpha?: number): string {
66
  const normalizedValue = normalizeTo_01(surprisal, TOKEN_SURPRISAL_MAX);
67
+ return getSurprisalColorNormalized(normalizedValue, minAlpha);
68
  }
69
 
70
  /**
71
  * 根据byte密度惊讶度值获取对应的颜色(线性映射,不取整)
72
  * @param byteSurprisal byte密度惊讶度值,范围[0, BYTE_SURPRISAL_MAX]会被映射到[0, 1],超出范围会被截断
73
  * @param colorFactor 颜色因子,用于调整颜色强度,目前主要为了minimap显示更明显(平均后byte surprisal密度会过小,所以需要放大)。默认为1
74
+ * @param minAlpha alpha 下限,默认不限制
75
  */
76
+ export function getByteSurprisalColor(byteSurprisal: number, colorFactor: number = 1, minAlpha?: number): string {
77
  const normalizedValue = normalizeTo_01(byteSurprisal * colorFactor, BYTE_SURPRISAL_MAX);
78
+ return getSurprisalColorNormalized(normalizedValue, minAlpha);
79
  }
80
 
81
  /**
82
+ * 根据 rawScoreNormed 获取颜色(用于语义匹配度染色)
83
+ * @param rawScoreNormed 归一化分数,范围 [0, 1]
84
+ * @param minAlpha alpha 下限,默认不限制
85
  */
86
+ export function getSemanticSimilarityColor(rawScoreNormed: number, minAlpha?: number): string {
87
+ if (!isFiniteNumber(rawScoreNormed)) return 'transparent';
88
+ const normalizedValue = normalizeTo_01(rawScoreNormed, SEMANTIC_SIMILARITY_MAX);
89
+ return getSurprisalColorNormalized(normalizedValue, minAlpha);
90
  }
91
 
92
  // ==========================================
client/src/ts/utils/fitQuality.ts ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * 拟合质量计算(纯数学,无 Node 依赖)
3
+ */
4
+
5
+ import { logNormalCdf } from './lognormalFit';
6
+
7
/**
 * Fit quality of a right-truncated log-normal over the fitted interval (uses the fitted data only).
 *
 * Sample i is compared against the empirical CDF value (i + 1) / n, so `noise` is assumed
 * to be sorted in ascending order — TODO confirm that callers sort before calling.
 *
 * @param noise samples the distribution was fitted to
 * @param tau right truncation point
 * @param mu log-normal location parameter
 * @param sigma log-normal scale parameter
 * @returns maxDiff = max|CDF_trunc - ECDF|, rmse = sqrt(mean(diff²)) and maxDiffIdx, the sample
 *          index where maxDiff occurs. Returns { NaN, NaN, -1 } when `noise` is empty or when
 *          the untruncated CDF at tau is not positive (the truncated CDF is then undefined).
 */
export function computeFitQuality(
    noise: number[],
    tau: number,
    mu: number,
    sigma: number
): { maxDiff: number; rmse: number; maxDiffIdx: number } {
    const nNoise = noise.length;
    if (nNoise < 1) return { maxDiff: NaN, rmse: NaN, maxDiffIdx: -1 };
    const F_tau = logNormalCdf(tau, mu, sigma);
    // Without this guard the division below is 0/0: every diff becomes NaN, maxDiff stays
    // at its initial 0 (looking like a perfect fit) while rmse is NaN.
    if (!(F_tau > 0)) return { maxDiff: NaN, rmse: NaN, maxDiffIdx: -1 };
    // CDF renormalized to the truncated support (0, tau).
    const cdfTrunc = (x: number) =>
        x <= 0 ? 0 : x >= tau ? 1 : logNormalCdf(x, mu, sigma) / F_tau;

    let maxDiff = 0;
    let maxDiffIdx = 0;
    let sumSqDiff = 0;
    for (let i = 0; i < nNoise; i++) {
        const x = noise[i]!;
        const ecdf = (i + 1) / nNoise;
        const diff = cdfTrunc(x) - ecdf;
        const absDiff = Math.abs(diff);
        if (absDiff > maxDiff) {
            maxDiff = absDiff;
            maxDiffIdx = i;
        }
        sumSqDiff += diff * diff;
    }
    const rmse = Math.sqrt(sumSqDiff / nNoise);
    return { maxDiff, rmse, maxDiffIdx };
}
client/src/ts/utils/highlightUtils.ts CHANGED
@@ -2,13 +2,25 @@ import type { FrontendAnalyzeResult } from '../api/GLTR_API';
2
  import { calculateSurprisal, calculateSurprisalDensity } from './Util';
3
  import { extractRealTopkFromTokens } from './tokenUtils';
4
 
 
 
 
 
 
 
 
 
 
 
 
 
5
  /**
6
  * 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 token surprisal)
7
  * @param x0 bin 起始值
8
  * @param x1 bin 结束值
9
  * @param binIndex bin在bins数组中的索引
10
  * @param no_bins 直方图的总bin数量
11
- * @param result 前端分析结果(包含 originalTokens、mergedTokens、originalToMergedMap)
12
  * @returns 需要高亮的 merged token 索引集合
13
  */
14
  export function calculateTokenSurprisalHighlights(
@@ -16,43 +28,23 @@ export function calculateTokenSurprisalHighlights(
16
  x1: number,
17
  binIndex: number,
18
  no_bins: number,
19
- result: FrontendAnalyzeResult
20
  ): Set<number> {
21
  const highlightedIndices = new Set<number>();
22
- const originalTokens = result.originalTokens;
23
  const originalRealTopk = extractRealTopkFromTokens(originalTokens);
24
- const originalToMergedMap = result.originalToMergedMap;
25
- const mergedTokens = result.mergedTokens;
26
-
27
- // 使用binIndex判断是否是最两侧的bin
28
- const isFirstBin = binIndex === 0; // 第一个bin:包含超出下界的值
29
- const isLastBin = binIndex === no_bins - 1; // 最后一个bin:包含超出上界的值
30
 
31
- // 遍历原始 token,找到 surprisal 在范围内的 token
32
  for (let i = 0; i < originalTokens.length; i++) {
33
  const surprisal = calculateSurprisal(originalRealTopk[i][1]);
34
- let inRange = false;
35
-
36
- if (isFirstBin) {
37
- // 第一个bin:包含所有 < x1 的值(自身bin + 超出下界的数据)
38
- inRange = surprisal < x1;
39
- } else if (isLastBin) {
40
- // 最后一个bin:包含所有 >= x0 的值(自身bin + 超出上界的数据)
41
- inRange = surprisal >= x0;
42
- } else {
43
- // 中间bins:正常范围
44
- inRange = surprisal >= x0 && surprisal < x1;
45
- }
46
-
47
- if (inRange) {
48
- // 映射到 merged token 索引
49
- const mappedIndex = originalToMergedMap[i];
50
- if (Number.isInteger(mappedIndex) && mappedIndex >= 0 && mappedIndex < mergedTokens.length) {
51
- highlightedIndices.add(mappedIndex);
52
- }
53
  }
54
  }
55
-
56
  return highlightedIndices;
57
  }
58
 
@@ -62,7 +54,7 @@ export function calculateTokenSurprisalHighlights(
62
  * @param x1 bin 结束值
63
  * @param binIndex bin在bins数组中的索引
64
  * @param no_bins 直方图的总bin数量
65
- * @param result 前端分析结果(包含 mergedTokens)
66
  * @returns 需要高亮的 merged token 索引集合
67
  */
68
  export function calculateByteSurprisalHighlights(
@@ -70,95 +62,53 @@ export function calculateByteSurprisalHighlights(
70
  x1: number,
71
  binIndex: number,
72
  no_bins: number,
73
- result: FrontendAnalyzeResult
74
  ): Set<number> {
75
  const highlightedIndices = new Set<number>();
76
- const mergedTokens = result.mergedTokens;
77
-
78
- // 使用binIndex判断是否是最两侧的bin
79
- const isFirstBin = binIndex === 0; // 第一个bin:包含超出下界的值
80
- const isLastBin = binIndex === no_bins - 1; // 最后一个bin:包含超出上界的值
81
 
82
- // 遍历 merged token,找到信息密度在范围内的 token
83
  for (let i = 0; i < mergedTokens.length; i++) {
84
  const informationDensity = calculateSurprisalDensity(mergedTokens[i]);
85
- let inRange = false;
86
-
87
- if (isFirstBin) {
88
- // 第一个bin:包含所有 < x1 的值(自身bin + 超出下界的数据)
89
- inRange = informationDensity < x1;
90
- } else if (isLastBin) {
91
- // 最后一个bin:包含所有 >= x0 的值(自身bin + 超出上界的数据)
92
- inRange = informationDensity >= x0;
93
- } else {
94
- // 中间bins:正常范围
95
- inRange = informationDensity >= x0 && informationDensity < x1;
96
- }
97
-
98
- if (inRange) {
99
- highlightedIndices.add(i);
100
- }
101
  }
102
-
103
  return highlightedIndices;
104
  }
105
 
106
  /**
107
- * 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 match_score
108
- * 使用 attentionScores(与 mergedTokens 对齐),按 bin 范围筛选
109
  */
110
- export function calculateSemanticScoreHighlights(
111
  x0: number,
112
  x1: number,
113
  binIndex: number,
114
  no_bins: number,
115
- result: FrontendAnalyzeResult & { attentionScores?: number[] }
116
  ): Set<number> {
117
  const highlightedIndices = new Set<number>();
118
- const scores = result.attentionScores;
119
- if (!scores || scores.length === 0) {
120
- return highlightedIndices;
121
- }
122
-
123
- const isFirstBin = binIndex === 0;
124
- const isLastBin = binIndex === no_bins - 1;
125
 
126
  for (let i = 0; i < scores.length; i++) {
127
  const score = scores[i];
128
- if (!Number.isFinite(score)) {
129
- continue;
130
- }
131
-
132
- let inRange = false;
133
- if (isFirstBin) {
134
- inRange = score < x1;
135
- } else if (isLastBin) {
136
- inRange = score >= x0;
137
- } else {
138
- inRange = score >= x0 && score < x1;
139
- }
140
-
141
- if (inRange) {
142
- highlightedIndices.add(i);
143
- }
144
  }
145
-
146
  return highlightedIndices;
147
  }
148
 
149
- /**
150
- * 直方图类型
151
- */
152
- export type HistogramType = 'token' | 'byte' | 'semantic';
153
-
154
  /**
155
  * 根据直方图类型和 bin 范围计算需要高亮的 token 索引集合
156
- * @param histogramType 直方图类型('token' 或 'byte')
157
  * @param x0 bin 起始值
158
  * @param x1 bin 结束值
159
  * @param binIndex bin在bins数组中的索引
160
  * @param no_bins 直方图的总bin数量
161
- * @param result 前端分析结果
162
  * @returns 需要高亮的 merged token 索引集合和对应的高亮样式
163
  */
164
  export function calculateHighlights(
@@ -167,22 +117,22 @@ export function calculateHighlights(
167
  x1: number,
168
  binIndex: number,
169
  no_bins: number,
170
- result: FrontendAnalyzeResult
171
  ): { indices: Set<number>; style: 'border' | 'underline' } {
172
  if (histogramType === 'byte') {
173
  return {
174
- indices: calculateByteSurprisalHighlights(x0, x1, binIndex, no_bins, result),
175
  style: 'underline'
176
  };
177
  }
178
- if (histogramType === 'semantic') {
179
  return {
180
- indices: calculateSemanticScoreHighlights(x0, x1, binIndex, no_bins, result as FrontendAnalyzeResult & { attentionScores?: number[] }),
181
  style: 'underline'
182
  };
183
  }
184
  return {
185
- indices: calculateTokenSurprisalHighlights(x0, x1, binIndex, no_bins, result),
186
  style: 'border'
187
  };
188
  }
 
2
  import { calculateSurprisal, calculateSurprisalDensity } from './Util';
3
  import { extractRealTopkFromTokens } from './tokenUtils';
4
 
5
/**
 * Decide whether a value falls into a histogram bin.
 *
 * Interior bins cover [x0, x1). The first bin additionally absorbs every value below its
 * lower edge and the last bin every value at or above its upper edge. A histogram with a
 * single bin is both first and last, so that bin accepts every non-NaN value.
 *
 * @param value value to classify; NaN never matches
 * @param x0 lower edge of the bin
 * @param x1 upper edge of the bin
 * @param binIndex index of the bin in the bins array
 * @param no_bins total number of bins
 */
function valueInBinRange(value: number, x0: number, x1: number, binIndex: number, no_bins: number): boolean {
    if (Number.isNaN(value)) return false;
    // An edge bin has no bound on its outer side.
    const aboveLower = binIndex === 0 || value >= x0;
    const belowUpper = binIndex === no_bins - 1 || value < x1;
    return aboveLower && belowUpper;
}
13
+
14
+ export type HistogramType = 'token' | 'byte' | 'raw_score_normed';
15
+ export type HighlightData = FrontendAnalyzeResult & { rawScoresNormed?: number[]; attentionRawScores?: number[]; signalProbs?: number[]; pPwValues?: number[]; pwScores?: number[] };
16
+
17
  /**
18
  * 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 token surprisal)
19
  * @param x0 bin 起始值
20
  * @param x1 bin 结束值
21
  * @param binIndex bin在bins数组中的索引
22
  * @param no_bins 直方图的总bin数量
23
+ * @param data 前端分析结果(包含 originalTokens、mergedTokens、originalToMergedMap)
24
  * @returns 需要高亮的 merged token 索引集合
25
  */
26
  export function calculateTokenSurprisalHighlights(
 
28
  x1: number,
29
  binIndex: number,
30
  no_bins: number,
31
+ data: HighlightData
32
  ): Set<number> {
33
  const highlightedIndices = new Set<number>();
34
+ const originalTokens = data.originalTokens;
35
  const originalRealTopk = extractRealTopkFromTokens(originalTokens);
36
+ const originalToMergedMap = data.originalToMergedMap;
37
+ const mergedTokens = data.mergedTokens;
 
 
 
 
38
 
 
39
  for (let i = 0; i < originalTokens.length; i++) {
40
  const surprisal = calculateSurprisal(originalRealTopk[i][1]);
41
+ if (!Number.isFinite(surprisal)) continue;
42
+ if (!valueInBinRange(surprisal, x0, x1, binIndex, no_bins)) continue;
43
+ const mappedIndex = originalToMergedMap[i];
44
+ if (Number.isInteger(mappedIndex) && mappedIndex >= 0 && mappedIndex < mergedTokens.length) {
45
+ highlightedIndices.add(mappedIndex);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
46
  }
47
  }
 
48
  return highlightedIndices;
49
  }
50
 
 
54
  * @param x1 bin 结束值
55
  * @param binIndex bin在bins数组中的索引
56
  * @param no_bins 直方图的总bin数量
57
+ * @param data 前端分析结果(包含 mergedTokens)
58
  * @returns 需要高亮的 merged token 索引集合
59
  */
60
  export function calculateByteSurprisalHighlights(
 
62
  x1: number,
63
  binIndex: number,
64
  no_bins: number,
65
+ data: HighlightData
66
  ): Set<number> {
67
  const highlightedIndices = new Set<number>();
68
+ const mergedTokens = data.mergedTokens;
69
+ if (!mergedTokens?.length) return highlightedIndices;
 
 
 
70
 
 
71
  for (let i = 0; i < mergedTokens.length; i++) {
72
  const informationDensity = calculateSurprisalDensity(mergedTokens[i]);
73
+ if (!Number.isFinite(informationDensity)) continue;
74
+ if (!valueInBinRange(informationDensity, x0, x1, binIndex, no_bins)) continue;
75
+ highlightedIndices.add(i);
 
 
 
 
 
 
 
 
 
 
 
 
 
76
  }
 
77
  return highlightedIndices;
78
  }
79
 
80
  /**
81
+ * 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 raw_score_normed
82
+ * 使用 rawScoresNormed(与 mergedTokens 对齐),按 bin 范围筛选
83
  */
84
+ export function calculateRawScoreNormedHighlights(
85
  x0: number,
86
  x1: number,
87
  binIndex: number,
88
  no_bins: number,
89
+ data: HighlightData
90
  ): Set<number> {
91
  const highlightedIndices = new Set<number>();
92
+ const scores = data.rawScoresNormed;
93
+ if (!scores?.length) return highlightedIndices;
 
 
 
 
 
94
 
95
  for (let i = 0; i < scores.length; i++) {
96
  const score = scores[i];
97
+ if (!Number.isFinite(score)) continue;
98
+ if (!valueInBinRange(score, x0, x1, binIndex, no_bins)) continue;
99
+ highlightedIndices.add(i);
 
 
 
 
 
 
 
 
 
 
 
 
 
100
  }
 
101
  return highlightedIndices;
102
  }
103
 
 
 
 
 
 
104
  /**
105
  * 根据直方图类型和 bin 范围计算需要高亮的 token 索引集合
106
+ * @param histogramType 直方图类型
107
  * @param x0 bin 起始值
108
  * @param x1 bin 结束值
109
  * @param binIndex bin在bins数组中的索引
110
  * @param no_bins 直方图的总bin数量
111
+ * @param data 前端分析结果
112
  * @returns 需要高亮的 merged token 索引集合和对应的高亮样式
113
  */
114
  export function calculateHighlights(
 
117
  x1: number,
118
  binIndex: number,
119
  no_bins: number,
120
+ data: HighlightData
121
  ): { indices: Set<number>; style: 'border' | 'underline' } {
122
  if (histogramType === 'byte') {
123
  return {
124
+ indices: calculateByteSurprisalHighlights(x0, x1, binIndex, no_bins, data),
125
  style: 'underline'
126
  };
127
  }
128
+ if (histogramType === 'raw_score_normed') {
129
  return {
130
+ indices: calculateRawScoreNormedHighlights(x0, x1, binIndex, no_bins, data),
131
  style: 'underline'
132
  };
133
  }
134
  return {
135
+ indices: calculateTokenSurprisalHighlights(x0, x1, binIndex, no_bins, data),
136
  style: 'border'
137
  };
138
  }
client/src/ts/utils/lognormalFit.ts ADDED
@@ -0,0 +1,144 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * 对数正态噪声拟合(纯数学,无依赖)
3
+ * 供 visualizationUpdater 使用,可独立在 Node 中测试
4
+ */
5
+
6
+ export const LN_EPS = 1e-10;
7
+
8
/**
 * Standard normal CDF Φ(x), evaluated as 0.5 · (1 + erf(x / √2)) using the
 * Abramowitz & Stegun 7.1.26 polynomial approximation of erf.
 * Inputs at or beyond ±6 short-circuit to exactly 0 or 1.
 */
export function normCdf(x: number): number {
    if (x <= -6) return 0;
    if (x >= 6) return 1;
    const p = 0.3275911;
    // Polynomial coefficients a5..a1, highest order first for Horner evaluation.
    const coefficients = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];
    const t = 1 / (1 + p * Math.abs(x) / Math.SQRT2);
    let horner = coefficients[0]!;
    for (let k = 1; k < coefficients.length; k++) {
        horner = horner * t + coefficients[k]!;
    }
    // erfOfAbs approximates erf(|x| / √2); the sign of x is restored below.
    const erfOfAbs = 1 - (horner * t) * Math.exp(-x * x / 2);
    return x < 0 ? 0.5 * (1 - erfOfAbs) : 0.5 * (1 + erfOfAbs);
}
18
+
19
/** Log-normal CDF: F(x) = Φ((ln x − μ) / σ) for x > 0, and 0 for x ≤ 0. */
export function logNormalCdf(x: number, mu: number, sigma: number): number {
    return x <= 0 ? 0 : normCdf((Math.log(x) - mu) / sigma);
}
24
+
25
/** Expected count in the interval [a, b) under log-normal(μ, σ): n × (CDF(b) − CDF(a)). */
export function logNormalExpectedCountInInterval(
    a: number, b: number, n: number, mu: number, sigma: number
): number {
    const upper = logNormalCdf(b, mu, sigma);
    const lower = logNormalCdf(a, mu, sigma);
    return n * (upper - lower);
}
31
+
32
/** Log-normal PDF: f(x) = φ((ln x − μ) / σ) / (x σ) for x > 0; returns 0 when x ≤ 0 or σ ≤ 0. */
export function logNormalPdf(x: number, mu: number, sigma: number): number {
    if (x <= 0) return 0;
    if (sigma <= 0) return 0;
    const standardized = (Math.log(x) - mu) / sigma;
    return normPdf(standardized) / (x * sigma);
}
38
+
39
/** Standard normal PDF φ(x) = exp(−x² / 2) / √(2π). */
function normPdf(x: number): number {
    const kernel = Math.exp(-x * x / 2);
    const normalizer = Math.sqrt(2 * Math.PI);
    return kernel / normalizer;
}
43
+
44
/**
 * Inverse Mills ratio λ(α) = φ(α) / Φ(α).
 * When Φ(α) is below 1e-300 the quotient is replaced by |α|, the approximation used for α → −∞.
 */
function millsRatio(alpha: number): number {
    const cdf = normCdf(alpha);
    return cdf < 1e-300 ? Math.abs(alpha) : normPdf(alpha) / cdf;
}
50
+
51
/**
 * Maximum-likelihood fit of a log-normal right-truncated at τ.
 * Exported so tests can compare tau = max(samples) against a fixed tau.
 *
 * Works on the log samples: with ȳ and s their mean and (population) standard deviation,
 * it solves for the standardized truncation point α = (ln τ − μ) / σ by bisection, then
 * recovers σ = s / √h and μ = ȳ + σ·λ(α), where λ is the inverse Mills ratio and
 * h = 1 − λ(α + λ).
 *
 * @param noiseScores samples to fit
 * @param tau right truncation point
 * @returns { mu, sigma }, or null when there are fewer than 2 samples, tau ≤ LN_EPS, the log
 *          samples have zero or non-finite spread, the root is not bracketed, or the
 *          resulting parameters are not valid
 */
export function fitLogNormalTruncatedMLE(
    noiseScores: number[],
    tau: number
): { mu: number; sigma: number } | null {
    const sampleCount = noiseScores.length;
    if (sampleCount < 2 || tau <= LN_EPS) return null;

    const logTau = Math.log(tau);
    const logSamples = noiseScores.map(x => Math.log(x));

    let logSum = 0;
    for (const y of logSamples) logSum += y;
    const ybar = logSum / sampleCount;

    let squaredDeviationSum = 0;
    for (const y of logSamples) squaredDeviationSum += (y - ybar) ** 2;
    const s = Math.sqrt(squaredDeviationSum / sampleCount);
    if (s <= 0 || !isFinite(s)) return null;

    const delta = logTau - ybar;
    const deltaOverS = delta / s;

    // Root function in α: zero where the truncated-normal moments match ȳ and s.
    const scoreAt = (alpha: number): number => {
        const lam = millsRatio(alpha);
        if (!isFinite(lam)) return delta > 0 ? -1 : 1;
        const g = alpha + lam;
        const h = 1 - lam * g;
        return h <= 0 ? NaN : g - deltaOverS * Math.sqrt(h);
    };

    // Initial bracket; give up unless the function changes sign across it.
    let lo = -8;
    let hi = deltaOverS + 8;
    let scoreLo = scoreAt(lo);
    const scoreHi = scoreAt(hi);
    if (!isFinite(scoreLo) || !isFinite(scoreHi) || scoreLo * scoreHi > 0) return null;

    // Bisection: at most 60 halvings, stopping early on a non-finite value or a tiny bracket.
    for (let remaining = 60; remaining > 0; remaining--) {
        const mid = (lo + hi) / 2;
        const scoreMid = scoreAt(mid);
        if (!isFinite(scoreMid) || (hi - lo) < 1e-12) break;
        if (scoreLo * scoreMid <= 0) {
            hi = mid;
        } else {
            lo = mid;
            scoreLo = scoreMid;
        }
    }

    const alphaHat = (lo + hi) / 2;
    const lamHat = millsRatio(alphaHat);
    if (!isFinite(lamHat)) return null;
    const hHat = 1 - lamHat * (alphaHat + lamHat);
    if (hHat <= 0) return null;

    const sigma = s / Math.sqrt(hHat);
    const mu = ybar + sigma * lamHat;
    const valid = isFinite(sigma) && sigma > 0 && isFinite(mu);
    return valid ? { mu, sigma } : null;
}
104
+
105
+ /*
106
+ * todo: 未知原因的偏差现象:
107
+ * Monte Carlo 下 E[μ̂] 随 n 减小单调增大(系统性正偏),而非围绕真值的随机波动。
108
+ > inforadar@0.1.0 test:lognormal:tau
109
+ > npx tsx ts/utils/lognormalFit.tauBoundary.test.ts
110
+ === 截尾对数正态拟合硬指标测试 ===
111
+
112
+ 真实参数: μ=-2, σ=0.8, τ=1
113
+ Monte Carlo 500 次,fitLogNormalNoiseExpectedCounts percentile=0.9
114
+
115
+ n | E[μ̂] E[σ̂] Δμ Δσ
116
+ -------|------------------------------
117
+ 1600 | -1.9977 0.8013 0.0023 0.0013
118
+ 800 | -1.9950 0.8023 0.0050 0.0023
119
+ 400 | -1.9910 0.8054 0.0090 0.0054
120
+ 200 | -1.9851 0.8059 0.0149 0.0059
121
+ 100 | -1.9722 0.8096 0.0278 0.0096
122
+ 50 | -1.9541 0.8056 0.0459 0.0056
123
+ */
124
+
125
/**
 * Expected count per histogram bin under log-normal(μ, σ).
 * The extent is split into noBins equal-width bins; each bin's expectation is
 * n × (CDF(upper edge) − CDF(lower edge)).
 * @param mu log-normal location parameter
 * @param sigma log-normal scale parameter
 * @param extent [lower, upper] range covered by the histogram
 * @param noBins number of bins
 * @param n total sample count the probabilities are scaled by
 * @returns one expected count per bin, in bin order
 */
export function computeExpectedCounts(
    mu: number,
    sigma: number,
    extent: [number, number],
    noBins: number,
    n: number
): number[] {
    const [rangeStart, rangeEnd] = extent;
    const binWidth = (rangeEnd - rangeStart) / noBins;
    const counts: number[] = [];
    let binIdx = 0;
    while (binIdx < noBins) {
        const lowerEdge = rangeStart + binIdx * binWidth;
        const upperEdge = rangeStart + (binIdx + 1) * binWidth;
        const probability = logNormalCdf(upperEdge, mu, sigma) - logNormalCdf(lowerEdge, mu, sigma);
        counts.push(n * probability);
        binIdx++;
    }
    return counts;
}
client/src/ts/utils/queryHistory.ts CHANGED
@@ -41,10 +41,12 @@ export interface InitQueryHistoryDropdownOptions {
41
  dropdownId: string;
42
  onSelect: () => void;
43
  onHistorySelect?: () => void;
 
 
44
  }
45
 
46
  export function initQueryHistoryDropdown(options: InitQueryHistoryDropdownOptions): void {
47
- const { input, dropdownId, onSelect, onHistorySelect } = options;
48
  if (!input) return;
49
 
50
  const wrapper = input.closest('.semantic-search-input-wrapper');
@@ -82,6 +84,7 @@ export function initQueryHistoryDropdown(options: InitQueryHistoryDropdownOption
82
  btn.onclick = (e) => {
83
  e.stopPropagation();
84
  remove(q);
 
85
  render();
86
  };
87
  li.appendChild(span);
 
41
  dropdownId: string;
42
  onSelect: () => void;
43
  onHistorySelect?: () => void;
44
+ /** 删除某条历史时回调,用于同步清理相关缓存 */
45
+ onRemove?: (query: string) => void;
46
  }
47
 
48
  export function initQueryHistoryDropdown(options: InitQueryHistoryDropdownOptions): void {
49
+ const { input, dropdownId, onSelect, onHistorySelect, onRemove } = options;
50
  if (!input) return;
51
 
52
  const wrapper = input.closest('.semantic-search-input-wrapper');
 
84
  btn.onclick = (e) => {
85
  e.stopPropagation();
86
  remove(q);
87
+ onRemove?.(q);
88
  render();
89
  };
90
  li.appendChild(span);
client/src/ts/utils/semanticResultCache.ts ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * 语义分析结果缓存:以 text + query + submode 的 hash 为索引,最大 100 条。
3
+ * 持久化到 localStorage,刷新后保留。删除查询历史时需调用 removeByQuery 清理对应缓存。
4
+ */
5
+
6
+ const MAX_SIZE = 100;
7
+ const STORAGE_KEY = 'info_radar_semantic_result_cache';
8
+
9
+ export type SemanticCacheResult = {
10
+ success: boolean;
11
+ model?: string;
12
+ token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>;
13
+ debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] };
14
+ full_match_degree?: number;
15
+ message?: string;
16
+ };
17
+
18
+ type StoredEntry = SemanticCacheResult & { _query?: string };
19
+
20
/**
 * 32-bit rolling hash over the UTF-16 code units of `s` (h = 31 * h + unit, wrapped to int32),
 * returned as an unsigned base-36 string.
 */
function simpleHash(s: string): string {
  let acc = 0;
  let pos = 0;
  while (pos < s.length) {
    // Math.imul keeps the multiplication in int32, same result as (acc << 5) - acc.
    acc = (Math.imul(acc, 31) + s.charCodeAt(pos)) | 0;
    pos += 1;
  }
  return (acc >>> 0).toString(36);
}
27
+
28
/**
 * Build the cache key for a (text, query, submode) triple: the three strings are joined
 * with a NUL separator (missing submode counts as '') and hashed with `simpleHash`.
 */
function buildKey(text: string, query: string, submode?: string): string {
  const separator = '\0';
  const mode = submode ?? '';
  return simpleHash(text + separator + query + separator + mode);
}
32
+
33
+ const cache = new Map<string, StoredEntry>();
34
+ let keyOrder: string[] = [];
35
+
36
/**
 * Restore the in-memory cache and its eviction order from localStorage.
 *
 * Does nothing when no payload is stored or the payload has no usable `entries` object.
 * Any exception (unavailable storage, invalid JSON, ...) resets the cache to empty.
 *
 * After loading, the cache and `keyOrder` are reconciled so that they always describe the
 * same set of keys: stored order entries are de-duplicated, entries missing from the stored
 * order are treated as the oldest ones, and everything beyond MAX_SIZE is dropped from both
 * structures. Without this, untracked entries would stay in the cache (and in every later
 * persisted payload) forever because eviction only walks `keyOrder`.
 */
function load(): void {
  try {
    const raw = localStorage.getItem(STORAGE_KEY);
    if (!raw) return;
    const parsed = JSON.parse(raw) as { entries?: Record<string, StoredEntry>; keyOrder?: string[] };
    if (!parsed?.entries || typeof parsed.entries !== 'object') return;

    cache.clear();
    for (const [k, v] of Object.entries(parsed.entries)) {
      if (v && typeof v === 'object') cache.set(k, v);
    }

    // Keep only stored order entries that still point at a loaded entry, once each.
    const tracked: string[] = [];
    const seen = new Set<string>();
    if (Array.isArray(parsed.keyOrder)) {
      for (const k of parsed.keyOrder) {
        if (typeof k === 'string' && cache.has(k) && !seen.has(k)) {
          seen.add(k);
          tracked.push(k);
        }
      }
    }

    // Entries with no recorded position go first so they are evicted before tracked ones.
    const untracked = [...cache.keys()].filter((k) => !seen.has(k));
    keyOrder = [...untracked, ...tracked].slice(-MAX_SIZE);

    // Drop whatever fell outside the capacity window so cache and keyOrder stay in sync.
    const kept = new Set(keyOrder);
    for (const k of [...cache.keys()]) {
      if (!kept.has(k)) cache.delete(k);
    }
  } catch {
    cache.clear();
    keyOrder = [];
  }
}
54
+
55
+ load();
56
+
57
/**
 * Write the whole cache (entries plus eviction order) to localStorage as one JSON payload.
 * Storage failures such as QuotaExceededError are deliberately ignored: persistence is
 * best-effort and the in-memory cache keeps working.
 */
function persist(): void {
  try {
    const payload = JSON.stringify({ entries: Object.fromEntries(cache), keyOrder });
    localStorage.setItem(STORAGE_KEY, payload);
  } catch {
    // QuotaExceededError and similar: ignore.
  }
}
66
+
67
/**
 * Make room for one new entry: drop the oldest keys until fewer than MAX_SIZE remain.
 *
 * In normal operation this removes at most one entry. Looping (instead of removing exactly
 * one) lets the cache shrink back to capacity if the order list was ever restored with more
 * than MAX_SIZE keys.
 */
function evictOne(): void {
  while (keyOrder.length >= MAX_SIZE) {
    const oldest = keyOrder.shift();
    if (oldest === undefined) break; // nothing left to evict
    cache.delete(oldest);
  }
}
73
/**
 * Look up a cached semantic result for (text, query, submode).
 *
 * The key is a 32-bit hash, so two different inputs can share a key. When the stored entry
 * records the query it was created for and that query differs from the requested one, the
 * entry is treated as a miss instead of returning another query's result. Entries without
 * a recorded query are returned as before.
 *
 * @returns the cached result without the internal `_query` field, or undefined on a miss
 */
export function get(text: string, query: string, submode?: string): SemanticCacheResult | undefined {
  const entry = cache.get(buildKey(text, query, submode));
  if (!entry) return undefined;
  const { _query, ...rest } = entry;
  if (_query !== undefined && _query !== query) return undefined; // hash collision
  return rest;
}
80
+
81
/**
 * Store `result` for (text, query, submode) and persist the cache.
 *
 * An existing key is moved to the newest position; otherwise room is made via `evictOne`
 * first. The query is saved alongside the result so `removeByQuery` can find it later.
 */
export function set(text: string, query: string, result: SemanticCacheResult, submode?: string): void {
  const key = buildKey(text, query, submode);
  const previousPos = cache.has(key) ? keyOrder.indexOf(key) : -1;
  if (previousPos >= 0) {
    keyOrder.splice(previousPos, 1);
  }
  evictOne();
  const stored: StoredEntry = { ...result, _query: query };
  cache.set(key, stored);
  keyOrder.push(key);
  persist();
}
92
+
93
/**
 * Remove every cached entry that was stored for `query` (any text / submode) and persist
 * the cache if anything was removed.
 */
export function removeByQuery(query: string): void {
  const doomed = [...cache]
    .filter(([, entry]) => entry._query === query)
    .map(([key]) => key);
  if (doomed.length === 0) return;

  doomed.forEach((key) => {
    cache.delete(key);
    const pos = keyOrder.indexOf(key);
    if (pos >= 0) keyOrder.splice(pos, 1);
  });
  persist();
}
client/src/ts/utils/signalThresholdDetector.ts ADDED
@@ -0,0 +1,330 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * 信号阈值检测:自动找到「噪声/信号」边界
3
+ *
4
+ * 输入:raw score normed [0,1]
5
+ * 输出:{ threshold, confidence, mu, sigma };无命中时返回 null
6
+ *
7
+ * 算法概要:
8
+ * 1. 迭代 0:用全部样本(P0=1)拟合截尾对数正态 (μ, σ),从 startPercentile 分位 bin 起逐 bin 扫描
9
+ * - 每个 bin [τ_left, τ_right) 左闭右开:obsInBin = 该 bin 内观测计数,expInBin = n × (CDF(τ_right) - CDF(τ_left))
10
+ * - 纯噪声区:信号样本不在 bin 内 → excess ≈ 0
11
+ * - 到信号边界:bin 内出现超额样本 → excess 跃升
12
+ * - 不重叠扫描:bin 边界取相邻点中点,τ_right >= τ_left + MIN_BIN_WIDTH,obsInBin >= MIN_OBSERVED
13
+ * - 误报概率:cumulativeFalsePositiveProbability = ∏(1-Φ(excess_i)),excess>excessMin 时累积,否则重置
14
+ * - 当 cumulativeFalsePositiveProbability <= 1-CONFIDENCE_THRESHOLD 时,取首次命中 bin 的左边界 sorted[j] 为阈值(保守)
15
+ * - 若全程无命中,返回 null
16
+ * 2. 迭代 1..N:用 threshold 以下样本重拟合,再扫描;阈值变化不大则提前结束;任一迭代无命中则返回 null
17
+ *
18
+ * 与现有 lognormalFit 逻辑独立,未来可能替换现有拟合代码
19
+ */
20
+
21
+ import { fitLogNormalTruncatedMLE, logNormalExpectedCountInInterval, normCdf, LN_EPS } from './lognormalFit';
22
+ import { computeFitQuality } from './fitQuality';
23
+
24
+ /** 置信度阈值,达到此值即判定「确定找到」信号边界;默认 0.9999 */
25
+ const CONFIDENCE_THRESHOLD = 0.9999;
26
+ /** excess 最小阈值,排除无意义随机波动;需 excess > 此值才计为命中 */
27
+ const EXCESS_MIN = 0.1;
28
+ const MIN_OBSERVED = 1; // 每个 bin 至少 N 个观测
29
+ const MIN_BIN_WIDTH = 0.01; // bin 最小宽度;边界取相邻点中点
30
+ const MIN_SAMPLE_SIZE = 20;
31
+ const P0 = 1; // 迭代初始的样本拟合比例
32
+ const MAX_REFINE_ITER = 10;
33
+ const THRESHOLD_CONVERGE_EPS = 0.01; //迭代收敛阈值
34
+ /** 扫描起始分位,默认 0.5(从 50% 分位所在 bin 开始) */
35
+ const START_PERCENTILE_DEFAULT = 0.5;
36
+ /** 输出过滤:最终 confidence 低于此值则返回 null;与 CONFIDENCE_THRESHOLD 不同,后者用于内部扫描判定 */
37
+ const MIN_OUTPUT_CONFIDENCE = 0.9;
38
+ /** expInBin 最小有效值,避免除零或数值不稳定 */
39
+ const EXP_IN_BIN_EPS = 1e-10;
40
+
41
+ /** 内部:evaluateBins 的中间结果,仅 threshold + confidence */
42
+ interface SignalThresholdScanResult {
43
+ threshold: number;
44
+ confidence: number;
45
+ }
46
+
47
+ /** 对外:findSignalThreshold 成功时的完整结果,mu/sigma/bins 必存在 */
48
+ export interface SignalThresholdResult {
49
+ threshold: number;
50
+ /** 0~1,统计置信度:有命中时 1-误报概率 */
51
+ confidence: number;
52
+ /** 最终拟合的 μ(供 histogram 使用) */
53
+ mu: number;
54
+ /** 最终拟合的 σ(供 histogram 使用) */
55
+ sigma: number;
56
+ /** 全范围 bins(供 signal prob 等使用) */
57
+ bins: SignalThresholdBin[];
58
+ }
59
+
60
+ export interface SignalThresholdBin {
61
+ tauLeft: number;
62
+ tauRight: number;
63
+ obsInBin: number;
64
+ expInBin: number;
65
+ }
66
+
67
+ /** 内部:bin 结构(tauLeft/tauRight/obsInBin)仅依赖 sorted,迭代间不变 */
68
+ interface BinStructure {
69
+ tauLeft: number;
70
+ tauRight: number;
71
+ obsInBin: number;
72
+ }
73
+
74
+ const TAU_RIGHT_EPSILON = 1e-6;
75
+
76
+ const PERCENTILE_DIAGNOSTICS = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1] as const;
77
+
78
/**
 * Standardised excess of a bin: (obs - exp) / sqrt(exp).
 * When the expected count is at or below EXP_IN_BIN_EPS the division is skipped:
 * any observation counts as infinite excess, no observation as zero.
 */
function computeExcess(obsInBin: number, expInBin: number): number {
  const expectedTooSmall = expInBin <= EXP_IN_BIN_EPS;
  if (!expectedTooSmall) {
    return (obsInBin - expInBin) / Math.sqrt(expInBin);
  }
  return obsInBin > 0 ? Infinity : 0;
}
83
+
84
/**
 * Diagnostic output: fit the truncated log-normal on the lowest p-fraction of the scores
 * for each p in PERCENTILE_DIAGNOSTICS and print the resulting (mu, sigma), so the
 * stability of the fit across percentiles can be inspected.
 */
function logPercentileDiagnostics(scores: number[]): void {
  const ascending = scores.slice().sort((a, b) => a - b);
  const total = ascending.length;
  if (total < 2) return;

  const fitted: Array<{ p: number; n: number; mu: number; sigma: number }> = [];
  PERCENTILE_DIAGNOSTICS.forEach((p) => {
    const count = Math.max(1, Math.min(total, Math.round(total * p)));
    const head = ascending.slice(0, count);
    const lastInHead = ascending[count - 1]!;
    // Truncation point: midpoint to the next sample, or the last sample when all are used.
    const tau = count < total ? (lastInHead + ascending[count]!) / 2 : lastInHead;
    const fit = fitLogNormalTruncatedMLE(head, tau);
    if (fit) fitted.push({ p, n: count, mu: fit.mu, sigma: fit.sigma });
  });
  if (fitted.length === 0) return;

  console.log('[signalThreshold] 渐近一致性诊断 (percentile → μ, σ)');
  for (const row of fitted) {
    console.log(` p=${row.p} n=${row.n}: μ=${row.mu.toFixed(4)}, σ=${row.sigma.toFixed(4)}`);
  }
}
103
/**
 * Verbose helper: print one row per bin for the full range of bins.
 *
 * For each bin the excess and its normal-CDF confidence are printed. Consecutive bins whose
 * excess is above `excessMin` accumulate a false-positive probability (product of
 * 1 - binConfidence); a bin at or below `excessMin` resets the accumulation.
 * This only writes to the console and is independent of evaluateBins.
 */
function printBinScanLogs(bins: SignalThresholdBin[], excessMin: number): void {
  console.log('[signalThreshold] 完整扫描明细 τ_left | τ_right | obsInBin | expInBin | excess | binConf | hit | confidence');
  let cumulativeFalsePositiveProbability = 1;
  for (const bin of bins) {
    const excess = computeExcess(bin.obsInBin, bin.expInBin);
    const binConfidence = normCdf(excess);
    // Columns shared by hit and non-hit rows.
    const prefix = `[signalThreshold] ${bin.tauLeft.toFixed(4)} | ${bin.tauRight.toFixed(4)} | ${String(bin.obsInBin).padStart(7)} | ${bin.expInBin.toFixed(1).padStart(8)} | ${excess.toFixed(2).padStart(6)} | ${binConfidence.toFixed(4)} | `;
    if (excess > excessMin) {
      cumulativeFalsePositiveProbability *= 1 - binConfidence;
      const confidence = 1 - cumulativeFalsePositiveProbability;
      console.log(`${prefix}✓ | ${confidence.toFixed(4)}`);
    } else {
      cumulativeFalsePositiveProbability = 1;
      console.log(`${prefix}| -`);
    }
  }
}
124
+
125
/**
 * Partition the ascending `sorted` samples into non-overlapping bins.
 *
 * Bin edges are midpoints between neighbouring samples. Each bin is at least MIN_BIN_WIDTH
 * wide and holds at least MIN_OBSERVED samples; when that cannot be satisfied with a
 * midpoint edge the bin is extended to just past the largest sample. The result depends
 * only on `sorted`, so it can be reused across refinement iterations.
 */
function formBinStructures(sorted: number[]): BinStructure[] {
  const total = sorted.length;
  const lastValue = sorted[total - 1]!;
  const upperEdge = lastValue + TAU_RIGHT_EPSILON;
  const midpoints: number[] = sorted.slice(0, -1).map((v, i) => (v + sorted[i + 1]!) / 2);

  // Index of the first sample >= bound, or -1 when there is none.
  const firstAtOrAbove = (bound: number): number => sorted.findIndex((v) => v >= bound);
  // Samples in [leftIdx, rightIdx); a negative rightIdx means "through the last sample".
  const countBetween = (leftIdx: number, rightIdx: number): number => {
    if (leftIdx < 0) return 0;
    return rightIdx < 0 ? total - leftIdx : rightIdx - leftIdx;
  };

  const bins: BinStructure[] = [];
  let tauLeft = sorted[0]! - TAU_RIGHT_EPSILON;

  while (tauLeft < lastValue) {
    const minRight = tauLeft + MIN_BIN_WIDTH;
    let midIdx = midpoints.findIndex((m) => m >= minRight);
    let tauRight = midIdx >= 0 ? midpoints[midIdx]! : upperEdge;

    const leftIdx = firstAtOrAbove(tauLeft);
    let obsInBin = countBetween(leftIdx, midIdx >= 0 ? firstAtOrAbove(tauRight) : -1);

    // Widen midpoint by midpoint until the bin holds enough samples.
    while (obsInBin < MIN_OBSERVED && midIdx >= 0 && midIdx < midpoints.length - 1) {
      midIdx += 1;
      tauRight = midpoints[midIdx]!;
      obsInBin = countBetween(leftIdx, firstAtOrAbove(tauRight));
    }
    // Still too few: take everything up to the end, or give up if that is not enough either.
    if (obsInBin < MIN_OBSERVED) {
      tauRight = upperEdge;
      obsInBin = countBetween(leftIdx, -1);
      if (obsInBin < MIN_OBSERVED) break;
    }

    bins.push({ tauLeft, tauRight, obsInBin });
    tauLeft = tauRight;
    if (tauRight >= upperEdge) break;
  }
  return bins;
}
160
+
161
/**
 * Scan the bins from the one containing the `startPercentile` sample onwards and look for a
 * run of bins with excess observations.
 *
 * For every scanned bin the expected count is computed from (mu, sigma) and turned into an
 * excess. A bin with excess > excessMin extends the current run and multiplies the running
 * false-positive probability by (1 - normCdf(excess)); any other bin resets the run.
 * As soon as 1 - falsePositiveProbability reaches `confidenceThreshold`, the left edge of
 * the run's first bin is returned as threshold. If the scan ends inside a run, that run is
 * returned with whatever confidence it reached; otherwise null.
 */
function evaluateBins(
  structures: BinStructure[],
  n: number,
  mu: number,
  sigma: number,
  excessMin: number,
  confidenceThreshold: number,
  verbose: boolean,
  startPercentile: number
): SignalThresholdScanResult | null {
  // Find the bin holding the sample of rank floor((n - 1) * startPercentile).
  const targetRank = Math.min(Math.floor((n - 1) * startPercentile), n - 1);
  let samplesBefore = 0;
  let startIdx = 0;
  for (let i = 0; i < structures.length; i++) {
    const samplesThrough = samplesBefore + structures[i]!.obsInBin;
    if (targetRank < samplesThrough) {
      startIdx = i;
      break;
    }
    samplesBefore = samplesThrough;
  }

  if (verbose) {
    console.log('[signalThreshold] 扫描明细 τ_left | τ_right | obsInBin | expInBin | excess | binConf | hit | confidence');
  }

  let falsePositiveProbability = 1;
  let runStart: number | null = null;

  for (let i = startIdx; i < structures.length; i++) {
    const s = structures[i]!;
    const expInBin = logNormalExpectedCountInInterval(s.tauLeft, s.tauRight, n, mu, sigma);
    const excess = computeExcess(s.obsInBin, expInBin);
    const binConfidence = normCdf(excess);
    // Row formatter, only invoked in verbose mode.
    const logRow = (tail: string): void => {
      console.log(`[signalThreshold] ${s.tauLeft.toFixed(4)} | ${s.tauRight.toFixed(4)} | ${String(s.obsInBin).padStart(7)} | ${expInBin.toFixed(1).padStart(8)} | ${excess.toFixed(2).padStart(6)} | ${binConfidence.toFixed(4)} | ${tail}`);
    };

    if (!(excess > excessMin)) {
      // Not a hit: the run (if any) is broken.
      falsePositiveProbability = 1;
      runStart = null;
      if (verbose) logRow('| -');
      continue;
    }

    if (runStart === null) runStart = s.tauLeft;
    falsePositiveProbability *= 1 - binConfidence;
    const confidence = 1 - falsePositiveProbability;
    if (verbose) logRow(`✓ | ${confidence.toFixed(4)}`);
    if (confidence >= confidenceThreshold) {
      return { threshold: runStart, confidence };
    }
  }

  if (runStart === null) return null;
  return { threshold: runStart, confidence: 1 - falsePositiveProbability };
}
221
+
222
/**
 * Detect the noise/signal threshold in an array of normalised raw scores.
 *
 * Iteration 0 fits the truncated log-normal on the lowest P0 fraction of the samples and
 * scans the bins; each later iteration refits on the samples at or below the previous
 * threshold and scans again, stopping once the threshold moves by less than
 * THRESHOLD_CONVERGE_EPS or MAX_REFINE_ITER is reached.
 *
 * @param rawScoresNormed normalised scores; non-finite values and values <= LN_EPS are dropped
 * @param verbose whether to print detailed logs, default false
 * @returns { threshold, confidence, mu, sigma, bins } on success; null when there are too
 *          few samples, a fit fails, a scan finds no hit, or the final confidence is below
 *          MIN_OUTPUT_CONFIDENCE
 */
export function findSignalThreshold(
  rawScoresNormed: number[],
  verbose = false
): SignalThresholdResult | null {
  const values = rawScoresNormed.filter(
    (s) => typeof s === 'number' && isFinite(s) && s > LN_EPS
  );
  const sorted = values.slice().sort((a, b) => a - b);
  const n = sorted.length;

  if (n < MIN_SAMPLE_SIZE) {
    if (verbose) console.log('[signalThreshold] 样本不足 n<', MIN_SAMPLE_SIZE, ',跳过');
    return null;
  }

  const splitIdx = Math.max(1, Math.min(n, Math.round(n * P0)));
  if (verbose) console.log('[signalThreshold] n=', n, 'splitIdx=', splitIdx);

  let result: SignalThresholdScanResult | null = null;
  let lastFit = { mu: 0, sigma: 0 };
  const binStructures = formBinStructures(sorted);

  for (let iter = 0; iter <= MAX_REFINE_ITER; iter++) {
    const isFirst = iter === 0;
    if (!isFirst && result === null) return null;

    // Choose the noise sample set and its truncation point for this iteration.
    const previousThreshold = result?.threshold ?? 0;
    let noiseSamples: number[];
    let tauBoundary: number;
    if (isFirst) {
      noiseSamples = sorted.slice(0, splitIdx);
      const lastNoise = sorted[splitIdx - 1]!;
      tauBoundary = splitIdx < n ? (lastNoise + sorted[splitIdx]!) / 2 : lastNoise;
    } else {
      noiseSamples = sorted.filter((x) => x <= previousThreshold);
      tauBoundary = previousThreshold;
    }

    if (!isFirst && noiseSamples.length < MIN_SAMPLE_SIZE) {
      if (verbose) console.log('[signalThreshold] 迭代', iter, '提前结束:噪声样本数<', MIN_SAMPLE_SIZE);
      return null;
    }

    if (verbose && isFirst) {
      const nInit = noiseSamples.length;
      const minN = noiseSamples[0]!;
      const maxN = noiseSamples[nInit - 1]!;
      const midN = noiseSamples[Math.floor(nInit / 2)]!;
      console.log('[signalThreshold] 迭代 0 噪声样本 n=', nInit, 'min=', minN.toFixed(4), 'max=', maxN.toFixed(4), 'median=', midN.toFixed(4));
    }

    const fit = fitLogNormalTruncatedMLE(noiseSamples, tauBoundary);
    if (fit === null) {
      if (verbose) console.log('[signalThreshold] 迭代', iter, '拟合失败');
      return null;
    }
    lastFit = { mu: fit.mu, sigma: fit.sigma };

    const q = computeFitQuality(noiseSamples, tauBoundary, fit.mu, fit.sigma);
    if (verbose) {
      console.log('[signalThreshold] 迭代', iter, '拟合 μ=', fit.mu.toFixed(4), 'σ=', fit.sigma.toFixed(4), '| maxDiff=', q.maxDiff.toFixed(4), 'RMSE=', q.rmse.toFixed(4));
      if (isFirst) {
        console.log('[signalThreshold] 迭代', iter, '从', (START_PERCENTILE_DEFAULT * 100).toFixed(0), '% 分位 bin 开始扫描 (excess>', EXCESS_MIN, ', confidence>=', CONFIDENCE_THRESHOLD, ')');
      }
    }

    const scanResult = evaluateBins(binStructures, n, fit.mu, fit.sigma, EXCESS_MIN, CONFIDENCE_THRESHOLD, verbose, START_PERCENTILE_DEFAULT);
    if (scanResult === null) {
      if (verbose) console.log('[signalThreshold] 迭代', iter, '未检测到阈值');
      return null;
    }

    const priorThreshold = result?.threshold;
    result = scanResult;

    if (iter > 0 && priorThreshold !== undefined) {
      // Refinement step: stop once the threshold has settled.
      const delta = Math.abs(scanResult.threshold - priorThreshold);
      if (verbose) {
        console.log('[signalThreshold] 迭代', iter, '新阈值=', scanResult.threshold.toFixed(4), 'confidence=', scanResult.confidence.toFixed(2), 'delta=', delta.toFixed(6));
      }
      if (delta < THRESHOLD_CONVERGE_EPS) {
        if (verbose) console.log('[signalThreshold] 迭代', iter, '收敛,最终阈值=', scanResult.threshold.toFixed(4));
        break;
      }
      if (iter === MAX_REFINE_ITER && verbose) {
        console.log('[signalThreshold] 达到最大迭代次数,最终阈值=', scanResult.threshold.toFixed(4));
      }
    } else if (verbose) {
      console.log('[signalThreshold] 迭代 0 检测到阈值', scanResult.threshold.toFixed(4), 'confidence=', scanResult.confidence.toFixed(2));
    }
  }

  // Expected counts for every bin under the final fit (full range, not only scanned bins).
  const bins: SignalThresholdBin[] = binStructures.map((s) => ({
    ...s,
    expInBin: logNormalExpectedCountInInterval(s.tauLeft, s.tauRight, n, lastFit.mu, lastFit.sigma),
  }));
  if (verbose && bins.length > 0) {
    printBinScanLogs(bins, EXCESS_MIN);
    logPercentileDiagnostics(values);
  }

  if (result === null) return null;
  if (result.confidence < MIN_OUTPUT_CONFIDENCE) {
    if (verbose) {
      console.warn('[signalThreshold] confidence <', (MIN_OUTPUT_CONFIDENCE * 100).toFixed(0), '%,返回 null。当前 confidence=', result.confidence.toFixed(2));
    }
    return null;
  }
  return { ...result, mu: lastFit.mu, sigma: lastFit.sigma, bins };
}
client/src/ts/utils/tokenDisplayUtils.ts ADDED
@@ -0,0 +1,97 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Token 文本显示工具:特殊字符可视化、HTML 转义
3
+ * 与 Tooltip、TopK 图表等共享
4
+ */
5
+
6
/** HTML-escape `text` by letting the DOM serialise it as a text node. */
function escapeHtmlImpl(text: string): string {
  const holder = document.createElement('div');
  holder.appendChild(document.createTextNode(text));
  return holder.innerHTML;
}
11
+
12
/** True when `char` contains a character with the Unicode White_Space property. */
function isWhitespaceChar(char: string): boolean {
  return char.search(/\p{White_Space}/u) !== -1;
}
15
+
16
/**
 * Decide whether `char` can be shown as-is. Whitespace is never printable; otherwise the
 * first code point must fall into printable ASCII or one of the listed Unicode ranges.
 */
function isPrintableChar(char: string): boolean {
  if (isWhitespaceChar(char)) return false;
  const cp = char.codePointAt(0);
  if (cp === undefined) return false;

  const printableRanges: ReadonlyArray<readonly [number, number]> = [
    [32, 126],         // printable ASCII
    [0x00a0, 0x00ff],  // Latin-1 Supplement
    [0x0100, 0x017f],  // Latin Extended-A
    [0x0180, 0x024f],  // Latin Extended-B
    [0x2000, 0x206f],  // General Punctuation
    [0x2070, 0x209f],  // Superscripts and Subscripts
    [0x20a0, 0x20cf],  // Currency Symbols
    [0x2100, 0x214f],  // Letterlike Symbols
    [0x2190, 0x21ff],  // Arrows
    [0x2200, 0x22ff],  // Mathematical Operators
    [0x2300, 0x23ff],  // Miscellaneous Technical
    [0x2400, 0x243f],  // Control Pictures
    [0x2e00, 0x2e7f],  // Supplemental Punctuation
    [0x3000, 0x303f],  // CJK Symbols and Punctuation
    [0x3040, 0x309f],  // Hiragana
    [0x30a0, 0x30ff],  // Katakana
    [0x4e00, 0x9fff],  // CJK Unified Ideographs
    [0xac00, 0xd7af],  // Hangul Syllables
    [0xf900, 0xfaff],  // CJK Compatibility Ideographs
    [0xff00, 0xffef],  // Halfwidth and Fullwidth Forms
  ];
  return printableRanges.some(([lo, hi]) => cp >= lo && cp <= hi);
}
44
+
45
/**
 * Make invisible or unusual characters in a token visible.
 *
 * - CR LF, LF, CR, TAB and the full-width space (U+3000) become `[CRLF]`, `[LF]`, `[CR]`,
 *   `[TAB]`, `[FS]`; an ordinary space becomes `·`.
 * - Characters accepted by `isPrintableChar` are kept.
 * - Everything else becomes `[U+XXXX]` (upper-case hex, at least four digits).
 *
 * The text is processed in a single pass over Unicode code points. This matters in two ways:
 * a character outside the BMP yields one placeholder for its real code point rather than
 * one per UTF-16 surrogate, and a literal `[` in the input is treated like any other
 * character instead of switching visualisation off until the next `]`.
 */
function visualizeSpecialCharsImpl(text: string): string {
  const placeholders = new Map<string, string>([
    ['\n', '[LF]'],
    ['\r', '[CR]'],
    ['\t', '[TAB]'],
    ['\u3000', '[FS]'],
    [' ', '·'],
  ]);

  // Array.from splits by code point, so surrogate pairs stay together.
  const chars = Array.from(text);
  const out: string[] = [];

  for (let i = 0; i < chars.length; i++) {
    const ch = chars[i]!;

    // CR immediately followed by LF is a single line break.
    if (ch === '\r' && chars[i + 1] === '\n') {
      out.push('[CRLF]');
      i += 1;
      continue;
    }

    const placeholder = placeholders.get(ch);
    if (placeholder !== undefined) {
      out.push(placeholder);
      continue;
    }

    if (isPrintableChar(ch)) {
      out.push(ch);
      continue;
    }

    const codePoint = ch.codePointAt(0);
    if (codePoint === undefined) {
      out.push(ch);
    } else {
      out.push(`[U+${codePoint.toString(16).toUpperCase().padStart(4, '0')}]`);
    }
  }
  return out.join('');
}
83
+
84
/**
 * Prepare candidate-token text for display, consistent with the main token:
 * special characters are visualised first, then the result is HTML-escaped.
 */
export function processCandidateText(text: string): string {
  const visualized = visualizeSpecialCharsImpl(text);
  return escapeHtmlImpl(visualized);
}

/** HTML-escape `text`. */
export function escapeHtml(text: string): string {
  const escaped = escapeHtmlImpl(text);
  return escaped;
}

/** Visualise special characters in `text` (no HTML escaping). */
export function visualizeSpecialChars(text: string): string {
  const visualized = visualizeSpecialCharsImpl(text);
  return visualized;
}
client/src/ts/utils/topkChartUtils.ts ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * TopK 图表渲染工具:与 Tooltip 中的 topk 图表完全一致
3
+ * 供 Tooltip 和语义分析 debug info 复用
4
+ */
5
+
6
+ import * as d3 from 'd3';
7
+ import { processCandidateText } from './tokenDisplayUtils';
8
+
9
+ const DISPLAY_TOPK = 10;
10
+ /** Tooltip 默认条形宽度 */
11
+ const MAX_BAR_WIDTH = 60;
12
+ /** Semantic debug 专用:更大条形与列宽,tooltip 不受影响 */
13
+ const SEMANTIC_DEBUG_MAX_BAR = 100;
14
+ const SEMANTIC_DEBUG_BAR_CELL = 180;
15
+
16
+ export interface TopkChartOptions {
17
+ /** 高亮的 token(与当前 token 一致时用 selectedColor) */
18
+ selectedToken?: string;
19
+ normalColor?: string;
20
+ selectedColor?: string;
21
+ /** 条形最大宽度 px */
22
+ maxBarWidth?: number;
23
+ /** 条形列单元格宽度 px */
24
+ barCellWidth?: number;
25
+ numFormat?: (n: number) => string;
26
+ }
27
+
28
/** Pick the normal / selected colours according to the `data-theme` attribute on <html>. */
function getThemeColors(): { normalColor: string; selectedColor: string } {
  const theme = document.documentElement.getAttribute('data-theme');
  if (theme === 'dark') {
    return { normalColor: '#ccc', selectedColor: '#ff6666' };
  }
  return { normalColor: '#333', selectedColor: '#933' };
}
35
+
36
/**
 * Build the TopK chart rows as an HTML string (same markup as the Tooltip chart).
 *
 * At most DISPLAY_TOPK rows are rendered. Bar widths are scaled linearly so that the
 * probability of the first item maps to the maximum bar width. The row whose token equals
 * `options.selectedToken` uses the selected colour. Token text goes through
 * `processCandidateText`.
 *
 * @returns the concatenated rows, or '' when `data` is empty
 */
export function renderTopkChartHtml(
  data: Array<{ token: string; prob: number }>,
  options?: TopkChartOptions
): string {
  if (!data.length) return '';

  const theme = getThemeColors();
  const plainColor = options?.normalColor ?? theme.normalColor;
  const highlightColor = options?.selectedColor ?? theme.selectedColor;
  const barLimit = options?.maxBarWidth ?? MAX_BAR_WIDTH;
  const formatProb = options?.numFormat ?? d3.format('.3f');

  const topProb = data[0]?.prob ?? 1;
  const widthOf = d3.scaleLinear().domain([0, topProb]).range([0, barLimit]);
  const cellWidth = options?.barCellWidth ?? 110;

  const htmlRows: string[] = [];
  for (const item of data.slice(0, DISPLAY_TOPK)) {
    const isSelected = options?.selectedToken !== undefined && item.token === options.selectedToken;
    const color = isSelected ? highlightColor : plainColor;
    const barCell = [
      `<div style="display: table-cell; width:${cellWidth}px;padding-left:5px;">`,
      `<div style="display:inline-block;width: ${widthOf(item.prob)}px;background-color:${color};height: 10px;"></div>`,
      ` <div style="display:inline-block;color: ${color};">${formatProb(item.prob)}</div></div>`,
    ].join('');
    const textCell = `<div style="display: table-cell;color: ${color};padding-right:5px;">${processCandidateText(item.token)}</div>`;
    htmlRows.push(`<div class="row" style="display: table-row;">${barCell} ${textCell}</div>`);
  }

  return htmlRows.join('');
}
64
+
65
/**
 * Build a complete TopK chart (rows wrapped in a container) for standalone display such as
 * the semantic debug panel. Bar width and bar-cell width default to the larger semantic-debug
 * sizes unless the caller supplies them. Returns '' when there are no rows.
 */
export function renderTopkChartFullHtml(data: Array<{ token: string; prob: number }>, options?: TopkChartOptions): string {
  const base = options ?? {};
  const inner = renderTopkChartHtml(data, {
    ...base,
    maxBarWidth: base.maxBarWidth ?? SEMANTIC_DEBUG_MAX_BAR,
    barCellWidth: base.barCellWidth ?? SEMANTIC_DEBUG_BAR_CELL,
  });
  if (!inner) return '';
  return `<div class="semantic-debug-topk-chart predictions predictions-table">${inner}</div>`;
}
client/src/ts/utils/visualizationConfigs.ts CHANGED
@@ -1,4 +1,7 @@
1
  import { tr } from '../lang/i18n-lite';
 
 
 
2
 
3
  /**
4
  * 直方图基础配置类型
@@ -6,12 +9,14 @@ import { tr } from '../lang/i18n-lite';
6
  export interface HistogramBaseConfig {
7
  label: string;
8
  no_bins: number;
9
- extent: [number, number];
10
- averageLabel: string;
11
  showLeftInfinity?: boolean;
12
  showRightInfinity?: boolean;
13
  xAxisTickSkip?: number;
14
- yScaleType?: 'linear' | 'sqrt';
 
 
15
  }
16
 
17
  /**
@@ -51,7 +56,8 @@ export const getByteSurprisalHistogramConfig = (): HistogramBaseConfig => ({
51
  export const getDeltaByteSurprisalHistogramConfig = (): HistogramBaseConfig => ({
52
  label: tr("Δinformation per byte histogram"),
53
  no_bins: 20,
54
- xAxisTickSkip: 1, // x轴刻度数字绘制间隔
 
55
  extent: [-5, 5],
56
  averageLabel: tr("Δ bits/Byte"),
57
  showLeftInfinity: true,
@@ -68,14 +74,14 @@ export const getSurprisalProgressConfig = (): ScatterPlotBaseConfig => ({
68
  });
69
 
70
  /**
71
- * 获取 Semantic score 直方图配置(基于 match_score,归一化 0-1)
72
  */
73
- export const getSemanticScoreHistogramConfig = (): HistogramBaseConfig => ({
74
- label: tr("semantic score histogram"),
75
  no_bins: 20,
76
  xAxisTickSkip: 1,
 
77
  extent: [0, 1],
78
- averageLabel: tr("score"),
79
  yScaleType: 'sqrt',
80
  });
81
 
 
1
  import { tr } from '../lang/i18n-lite';
2
+ import type { HistogramExtent, HistogramExtentBound } from '../vis/Histogram';
3
+
4
+ export type { HistogramExtent, HistogramExtentBound };
5
 
6
  /**
7
  * 直方图基础配置类型
 
9
  export interface HistogramBaseConfig {
10
  label: string;
11
  no_bins: number;
12
+ extent: HistogramExtent;
13
+ averageLabel?: string;
14
  showLeftInfinity?: boolean;
15
  showRightInfinity?: boolean;
16
  xAxisTickSkip?: number;
17
+ /** x轴刻度凑整:true=仅显示 step 整数倍处的标签,false/undefined=显示全部 */
18
+ xAxisTickRound?: boolean;
19
+ yScaleType?: 'linear' | 'sqrt' | 'log';
20
  }
21
 
22
  /**
 
56
  export const getDeltaByteSurprisalHistogramConfig = (): HistogramBaseConfig => ({
57
  label: tr("Δinformation per byte histogram"),
58
  no_bins: 20,
59
+ xAxisTickSkip: 1,
60
+ xAxisTickRound: true,
61
  extent: [-5, 5],
62
  averageLabel: tr("Δ bits/Byte"),
63
  showLeftInfinity: true,
 
74
  });
75
 
76
  /**
77
+ * 获取 Raw score normed 直方图配置(归一化 0-1)
78
  */
79
+ export const getRawScoreNormedHistogramConfig = (): HistogramBaseConfig => ({
80
+ label: tr("semantic raw score histogram"),
81
  no_bins: 20,
82
  xAxisTickSkip: 1,
83
+ xAxisTickRound: true,
84
  extent: [0, 1],
 
85
  yScaleType: 'sqrt',
86
  });
87
 
client/src/ts/utils/visualizationUpdater.ts CHANGED
@@ -23,18 +23,38 @@ import {
23
  } from './dataValidation';
24
  import {
25
  calculateTextStats,
 
26
  type TextStats
27
  } from './textStatistics';
28
  import {
29
  getTokenSurprisalHistogramConfig,
30
  getSurprisalProgressConfig,
31
- getSemanticScoreHistogramConfig
32
  } from "./visualizationConfigs";
33
  import { isFiniteNumber } from './Util';
34
- import { getSemanticSimilarityColor } from './SurprisalColorConfig';
35
  import { showAlertDialog } from '../ui/dialog';
36
  import { tr } from '../lang/i18n-lite';
 
 
37
  import { getSemanticAnalysisEnabled } from './semanticAnalysisManager';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
  /**
40
  * 可视化更新依赖
@@ -44,7 +64,7 @@ export interface VisualizationDependencies {
44
  highlightController: HighlightController;
45
  textInputController: TextInputController;
46
  stats_frac: Histogram;
47
- stats_semantic_score: Histogram;
48
  stats_surprisal_progress: ScatterPlot;
49
  appStateManager: AppStateManager;
50
  surprisalColorScale: d3.ScaleSequential<string>;
@@ -140,7 +160,7 @@ export class VisualizationUpdater {
140
  /**
141
  * 计算展示结果:仅信息密度 / 仅语义 / 联合(两者一致时)
142
  */
143
- private computeDisplayResult(): (FrontendAnalyzeResult & { attentionScores?: number[]; attentionRawScores?: number[] }) | null {
144
  const info = this.currentState.infoDensityData;
145
  const sem = this.currentState.semanticData;
146
  const infoResult = info?.result as FrontendAnalyzeResult | undefined;
@@ -175,7 +195,7 @@ export class VisualizationUpdater {
175
  mergedTokens: filteredMerged,
176
  bpe_strings: filteredMerged,
177
  originalToMergedMap: filteredOriginalToMergedMap,
178
- attentionScores: this.normalizeScoresForColor(scores),
179
  attentionRawScores: scores,
180
  };
181
  }
@@ -194,7 +214,7 @@ export class VisualizationUpdater {
194
  const trimmed = text.trim();
195
  const tokenHistogramItem = document.getElementById('token_histogram_item');
196
  const surprisalProgressItem = document.getElementById('surprisal_progress_item');
197
- const semanticScoreItem = document.getElementById('semantic_score_histogram_item');
198
 
199
  const infoText = (this.currentState.infoDensityData?.request?.text ?? '').trim();
200
  const semText = (this.currentState.semanticData?.text ?? '').trim();
@@ -212,7 +232,7 @@ export class VisualizationUpdater {
212
 
213
  if (tokenHistogramItem) tokenHistogramItem.style.display = showInfoDensity ? '' : 'none';
214
  if (surprisalProgressItem) surprisalProgressItem.style.display = showInfoDensity ? '' : 'none';
215
- if (semanticScoreItem) semanticScoreItem.style.display = showSemantic ? '' : 'none';
216
 
217
  // pending 时渲染空统计图(坐标轴 + 空柱体/散点),避免空白
218
  if (showInfoDensity && mode === 'infoDensity') {
@@ -226,24 +246,25 @@ export class VisualizationUpdater {
226
  if (progressTitle && progressConfig.label) progressTitle.textContent = progressConfig.label;
227
  }
228
  if (showSemantic && mode === 'semantic') {
229
- const semanticConfig = getSemanticScoreHistogramConfig();
230
- this.deps.stats_semantic_score.update({ ...semanticConfig, data: [], colorScale: () => 'transparent' });
231
- const titleEl = document.getElementById('semantic_score_histogram_title');
232
- if (titleEl) titleEl.textContent = semanticConfig.label;
233
  }
234
  }
235
 
236
  /**
237
  * 重新渲染直方图(内部方法)
238
- * 仅信息密度:只显示 token/surprisal progress;仅语义:只显示 semantic score;联合:全部显示
 
239
  */
240
- private rerenderHistogramsInternal(): void {
241
  const hasInfoDensity = !!this.currentState.infoDensityData;
242
  const displayResult = this.computeDisplayResult();
243
 
244
  const tokenHistogramItem = document.getElementById('token_histogram_item');
245
  const surprisalProgressItem = document.getElementById('surprisal_progress_item');
246
- const semanticScoreItem = document.getElementById('semantic_score_histogram_item');
247
 
248
  if (hasInfoDensity) {
249
  const currentSurprisals = this.currentState.currentSurprisals;
@@ -257,6 +278,7 @@ export class VisualizationUpdater {
257
  colorScale: this.deps.surprisalColorScale,
258
  averageValue: currentTokenAvg ?? undefined,
259
  p90Value: currentTokenP90 ?? undefined,
 
260
  });
261
  const titleElement = document.getElementById('token_histogram_title');
262
  if (titleElement) titleElement.textContent = tokenHistogramConfig.label;
@@ -279,31 +301,125 @@ export class VisualizationUpdater {
279
  if (surprisalProgressItem) surprisalProgressItem.style.display = 'none';
280
  }
281
 
282
- const matchScores = displayResult?.attentionScores;
283
- const validMatchScores = matchScores?.filter((s) => typeof s === 'number' && isFinite(s));
284
- if (validMatchScores && validMatchScores.length > 0) {
285
- const semanticConfig = getSemanticScoreHistogramConfig();
286
- const avgScore = validMatchScores.reduce((a, b) => a + b, 0) / validMatchScores.length;
287
- const colorScale = (v: number) => getSemanticSimilarityColor(v);
288
- this.deps.stats_semantic_score.update({
289
- ...semanticConfig,
290
- data: validMatchScores,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
291
  colorScale,
292
- averageValue: avgScore,
 
 
 
 
293
  });
294
- const titleEl = document.getElementById('semantic_score_histogram_title');
295
- if (titleEl) titleEl.textContent = semanticConfig.label;
296
- if (semanticScoreItem) semanticScoreItem.style.display = '';
297
- } else if (semanticScoreItem) {
298
- semanticScoreItem.style.display = 'none';
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
299
  }
300
  }
301
 
302
- /**
303
- * 重新渲染直方图(公开方法,供外部调用,如主题切换时)
304
- */
305
  public rerenderHistograms(): void {
306
- this.rerenderHistogramsInternal();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
307
  }
308
 
309
  /**
@@ -486,8 +602,7 @@ export class VisualizationUpdater {
486
  res: {
487
  model?: string;
488
  token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>;
489
- debug_abbrev?: string;
490
- debug_top10?: Array<{ token: string; prob: number }>;
491
  },
492
  text?: string
493
  ): void {
@@ -523,12 +638,16 @@ export class VisualizationUpdater {
523
  enableRenderAnimation: false,
524
  semanticAnalysisMode: getSemanticAnalysisEnabled(),
525
  }, false);
526
- this.deps.lmf.update(displayResult);
527
  this.clearHighlights();
 
528
  this.rerenderHistogramsInternal();
529
  this.syncSemanticUiFromConfig();
530
 
531
- this.updateSemanticDebugInfo(res.debug_abbrev, res.debug_top10);
 
 
 
 
532
  }
533
 
534
  /** 更新文本渲染区下方的 debug 信息(abbrev + top10) */
@@ -553,6 +672,7 @@ export class VisualizationUpdater {
553
  if (top10?.length) {
554
  const items = top10.map((t) => `'${esc(t.token)}(${(t.prob * 100).toFixed(1)}%)'`);
555
  parts.push(`<div class="semantic-debug-top10">[${items.join(', ')}]</div>`);
 
556
  }
557
  el.innerHTML = parts.join('');
558
  }
@@ -561,7 +681,7 @@ export class VisualizationUpdater {
561
  res: { model?: string },
562
  tokenAttention: Array<{ offset: [number, number]; raw: string; score: number }>,
563
  text: string
564
- ): (FrontendAnalyzeResult & { attentionScores: number[]; attentionRawScores: number[] }) | null {
565
  const safeText = text.trim();
566
  if (!safeText) return null;
567
  const syntheticTokens = tokenAttention.map((t) => ({
@@ -583,7 +703,7 @@ export class VisualizationUpdater {
583
  mergedTokens,
584
  originalToMergedMap,
585
  originalText: safeText,
586
- attentionScores: this.normalizeScoresForColor(scores),
587
  attentionRawScores: scores,
588
  };
589
  }
@@ -637,7 +757,7 @@ export class VisualizationUpdater {
637
  }
638
 
639
  /**
640
- * 将 score 归一化到 [0,1] 用于染色(0-max 归一化:norm = score / max)
641
  * NaN/Inf 不参与 max 计算,映射为 0
642
  */
643
  private normalizeScoresForColor(scores: number[]): number[] {
 
23
  } from './dataValidation';
24
  import {
25
  calculateTextStats,
26
+ computeP90,
27
  type TextStats
28
  } from './textStatistics';
29
  import {
30
  getTokenSurprisalHistogramConfig,
31
  getSurprisalProgressConfig,
32
+ getRawScoreNormedHistogramConfig
33
  } from "./visualizationConfigs";
34
  import { isFiniteNumber } from './Util';
35
+ import { getSemanticSimilarityColor, HISTOGRAM_MIN_ALPHA } from './SurprisalColorConfig';
36
  import { showAlertDialog } from '../ui/dialog';
37
  import { tr } from '../lang/i18n-lite';
38
+ import { computeExpectedCounts } from './lognormalFit';
39
+ import { findSignalThreshold, type SignalThresholdBin } from './signalThresholdDetector';
40
  import { getSemanticAnalysisEnabled } from './semanticAnalysisManager';
41
+ import { renderTopkChartFullHtml } from './topkChartUtils';
42
+
43
+
44
+ /**
45
+ * P(signal | raw_score_normed = s) 复用 findSignalThreshold 的 bins
46
+ * 每个样本 s 落入对应 bin,P(signal) = (obsInBin - expInBin) / obsInBin
47
+ */
48
+ function signalProbFromBins(scores: number[], bins: SignalThresholdBin[]): number[] {
49
+ if (scores.length === 0 || bins.length === 0) return [];
50
+ const tauLefts = bins.map((b) => b.tauLeft);
51
+ return scores.map((s) => {
52
+ const i = Math.max(0, Math.min(bins.length - 1, d3.bisectRight(tauLefts, s) - 1));
53
+ const b = bins[i]!;
54
+ if (s < b.tauLeft || s >= b.tauRight) return 0;
55
+ return b.obsInBin > 0 ? Math.max(0, Math.min(1, (b.obsInBin - b.expInBin) / b.obsInBin)) : 0;
56
+ });
57
+ }
58
 
59
  /**
60
  * 可视化更新依赖
 
64
  highlightController: HighlightController;
65
  textInputController: TextInputController;
66
  stats_frac: Histogram;
67
+ stats_raw_score_normed: Histogram;
68
  stats_surprisal_progress: ScatterPlot;
69
  appStateManager: AppStateManager;
70
  surprisalColorScale: d3.ScaleSequential<string>;
 
160
  /**
161
  * 计算展示结果:仅信息密度 / 仅语义 / 联合(两者一致时)
162
  */
163
+ private computeDisplayResult(): (FrontendAnalyzeResult & { rawScoresNormed?: number[]; attentionRawScores?: number[] }) | null {
164
  const info = this.currentState.infoDensityData;
165
  const sem = this.currentState.semanticData;
166
  const infoResult = info?.result as FrontendAnalyzeResult | undefined;
 
195
  mergedTokens: filteredMerged,
196
  bpe_strings: filteredMerged,
197
  originalToMergedMap: filteredOriginalToMergedMap,
198
+ rawScoresNormed: this.normalizeScoresForColor(scores),
199
  attentionRawScores: scores,
200
  };
201
  }
 
214
  const trimmed = text.trim();
215
  const tokenHistogramItem = document.getElementById('token_histogram_item');
216
  const surprisalProgressItem = document.getElementById('surprisal_progress_item');
217
+ const rawScoreNormedItem = document.getElementById('raw_score_normed_histogram_item');
218
 
219
  const infoText = (this.currentState.infoDensityData?.request?.text ?? '').trim();
220
  const semText = (this.currentState.semanticData?.text ?? '').trim();
 
232
 
233
  if (tokenHistogramItem) tokenHistogramItem.style.display = showInfoDensity ? '' : 'none';
234
  if (surprisalProgressItem) surprisalProgressItem.style.display = showInfoDensity ? '' : 'none';
235
+ if (rawScoreNormedItem) rawScoreNormedItem.style.display = showSemantic ? '' : 'none';
236
 
237
  // pending 时渲染空统计图(坐标轴 + 空柱体/散点),避免空白
238
  if (showInfoDensity && mode === 'infoDensity') {
 
246
  if (progressTitle && progressConfig.label) progressTitle.textContent = progressConfig.label;
247
  }
248
  if (showSemantic && mode === 'semantic') {
249
+ const rawScoreNormedConfig = getRawScoreNormedHistogramConfig();
250
+ this.deps.stats_raw_score_normed.update({ ...rawScoreNormedConfig, data: [], colorScale: () => 'transparent' });
251
+ const titleEl = document.getElementById('raw_score_normed_histogram_title');
252
+ if (titleEl) titleEl.textContent = rawScoreNormedConfig.label;
253
  }
254
  }
255
 
256
  /**
257
  * 重新渲染直方图(内部方法)
258
+ * 仅信息密度:只显示 token/surprisal progress;仅语义:只显示 raw score normed;联合:全部显示
259
+ * @param skipLmfUpdate 为 true 时跳过 lmf.update(主题切换时由 rerenderOnThemeChange 统一重绘,避免竞态)
260
  */
261
+ private rerenderHistogramsInternal(skipLmfUpdate = false): void {
262
  const hasInfoDensity = !!this.currentState.infoDensityData;
263
  const displayResult = this.computeDisplayResult();
264
 
265
  const tokenHistogramItem = document.getElementById('token_histogram_item');
266
  const surprisalProgressItem = document.getElementById('surprisal_progress_item');
267
+ const rawScoreNormedItem = document.getElementById('raw_score_normed_histogram_item');
268
 
269
  if (hasInfoDensity) {
270
  const currentSurprisals = this.currentState.currentSurprisals;
 
278
  colorScale: this.deps.surprisalColorScale,
279
  averageValue: currentTokenAvg ?? undefined,
280
  p90Value: currentTokenP90 ?? undefined,
281
+ p90Label: tokenHistogramConfig.averageLabel,
282
  });
283
  const titleElement = document.getElementById('token_histogram_title');
284
  if (titleElement) titleElement.textContent = tokenHistogramConfig.label;
 
301
  if (surprisalProgressItem) surprisalProgressItem.style.display = 'none';
302
  }
303
 
304
+ const rawScoresNormed = displayResult?.rawScoresNormed;
305
+ const validRawScoresNormed = rawScoresNormed?.filter((s) => typeof s === 'number' && isFinite(s));
306
+ if (validRawScoresNormed && validRawScoresNormed.length > 0) {
307
+ const rawScoreNormedConfig = getRawScoreNormedHistogramConfig();
308
+ const colorScale = (v: number) => getSemanticSimilarityColor(v, HISTOGRAM_MIN_ALPHA);
309
+ // 默认关闭 verbose;浏览器控制台执行 window.signalThresholdVerbose = true 后重新搜索可开启
310
+ const verbose = !!(window as Window & { signalThresholdVerbose?: boolean }).signalThresholdVerbose;
311
+ const signalThresholdResult = findSignalThreshold(validRawScoresNormed, verbose);
312
+ console.log('[signalThreshold] 最终结果:', signalThresholdResult !== null
313
+ ? (() => {
314
+ const t = signalThresholdResult!.threshold;
315
+ const below = validRawScoresNormed.filter((s) => s < t).length;
316
+ const quantile = validRawScoresNormed.length > 0 ? (below / validRawScoresNormed.length) : 0;
317
+ return `threshold=${t.toFixed(4)} confidence=${signalThresholdResult!.confidence.toFixed(2)} quantile=${quantile.toFixed(4)} (${below}/${validRawScoresNormed.length} below)`;
318
+ })()
319
+ : 'null(无信号)');
320
+ if (!verbose) {
321
+ console.log('[signalThreshold] 提示:控制台执行 window.signalThresholdVerbose = true 后重新搜索可查看完整 bin 扫描日志');
322
+ }
323
+ const fitResult = validRawScoresNormed.length >= 2 && signalThresholdResult != null
324
+ ? {
325
+ mu: signalThresholdResult.mu,
326
+ sigma: signalThresholdResult.sigma,
327
+ expectedCounts: computeExpectedCounts(
328
+ signalThresholdResult.mu,
329
+ signalThresholdResult.sigma,
330
+ rawScoreNormedConfig.extent as [number, number],
331
+ rawScoreNormedConfig.no_bins,
332
+ validRawScoresNormed.length
333
+ ),
334
+ }
335
+ : null;
336
+ console.log('[raw score normed histogram] fitted log-normal μ, σ:', fitResult ? [fitResult.mu, fitResult.sigma] : 'failed');
337
+ const signalProbs = signalThresholdResult != null
338
+ ? signalProbFromBins(validRawScoresNormed, signalThresholdResult.bins)
339
+ : [];
340
+ /**
341
+ * P_pw:后验信号概率的简化映射,x <= threshold 时为 0,x > threshold 时为 1
342
+ * pw_score = score × P_pw
343
+ * 基于 rawScoresNormed 全数组计算,保证与 token 索引对齐
344
+ */
345
+ const rawScoresNormedFull = displayResult!.rawScoresNormed ?? [];
346
+ const t = signalThresholdResult?.threshold ?? 0;
347
+ const pPwValues = signalThresholdResult != null
348
+ ? rawScoresNormedFull.map((s) => (typeof s === 'number' && isFinite(s) && s > t ? 1 : 0))
349
+ : [];
350
+ const pwScores = signalThresholdResult != null
351
+ ? rawScoresNormedFull.map((s) => (typeof s === 'number' && isFinite(s) && s > t ? s : 0))
352
+ : [];
353
+ const probCurveData = signalProbs.length > 0
354
+ ? (() => {
355
+ const pairs = validRawScoresNormed.map((x, i) => ({ x, y: signalProbs[i]! })).sort((a, b) => a.x - b.x);
356
+ return { x: pairs.map(p => p.x), y: pairs.map(p => p.y) };
357
+ })()
358
+ : undefined;
359
+ const signalThresholdPercentile = signalThresholdResult != null && validRawScoresNormed.length > 0
360
+ ? Math.round((validRawScoresNormed.filter((s) => s < signalThresholdResult.threshold).length / validRawScoresNormed.length) * 100)
361
+ : undefined;
362
+ this.deps.stats_raw_score_normed.update({
363
+ ...rawScoreNormedConfig,
364
+ data: validRawScoresNormed,
365
  colorScale,
366
+ fitExpectedCounts: fitResult?.expectedCounts,
367
+ showProbCurve: true,
368
+ probCurveData: probCurveData?.x.length ? probCurveData : undefined,
369
+ signalThreshold: signalThresholdResult?.threshold ?? undefined,
370
+ signalThresholdPercentile: signalThresholdPercentile ?? undefined,
371
  });
372
+ const titleEl = document.getElementById('raw_score_normed_histogram_title');
373
+ if (titleEl) titleEl.textContent = rawScoreNormedConfig.label;
374
+ if (rawScoreNormedItem) rawScoreNormedItem.style.display = '';
375
+
376
+ const colorSourceEl = document.getElementById('semantic_color_source_select') as HTMLSelectElement | null;
377
+ const scoresForColor = colorSourceEl?.value === 'signal_probability' ? pPwValues
378
+ : colorSourceEl?.value === 'pw_score' ? pwScores
379
+ : (displayResult!.rawScoresNormed ?? []);
380
+
381
+ if (fitResult != null) {
382
+ const resultWithExt = { ...displayResult, signalProbs, pPwValues, pwScores };
383
+ this.deps.highlightController.updateCurrentData({ result: resultWithExt, signalProbs, pPwValues, pwScores });
384
+ if (!skipLmfUpdate) {
385
+ this.deps.lmf.update({ ...resultWithExt, pwScores, colorScores: scoresForColor } as FrontendAnalyzeResult & { pPwValues?: number[]; pwScores?: number[]; colorScores?: number[] });
386
+ }
387
+ } else {
388
+ this.deps.highlightController.updateCurrentData({ result: displayResult });
389
+ if (!skipLmfUpdate) {
390
+ this.deps.lmf.update({ ...displayResult, colorScores: scoresForColor } as FrontendAnalyzeResult & { colorScores?: number[] });
391
+ }
392
+ }
393
+ } else {
394
+ if (rawScoreNormedItem) rawScoreNormedItem.style.display = 'none';
395
+ if (displayResult) this.deps.highlightController.updateCurrentData({ result: displayResult });
396
  }
397
  }
398
 
399
+ /** 重新渲染直方图(供外部调用) */
 
 
400
  public rerenderHistograms(): void {
401
+ this.rerenderHistogramsInternal(false);
402
+ }
403
+
404
+ /** 仅更新语义着色源(color source 切换时调用,不重新拟合) */
405
+ public updateSemanticColorSource(): void {
406
+ const cd = this.deps.highlightController.getCurrentData();
407
+ const r = cd?.result as (FrontendAnalyzeResult & { rawScoresNormed?: number[] }) | undefined;
408
+ if (!r?.rawScoresNormed?.length) return;
409
+ const el = document.getElementById('semantic_color_source_select') as HTMLSelectElement | null;
410
+ const v = el?.value;
411
+ const scoresForColor = v === 'signal_probability' ? (cd!.pPwValues ?? [])
412
+ : v === 'pw_score' ? (cd!.pwScores ?? [])
413
+ : r.rawScoresNormed;
414
+ this.deps.lmf.update({ ...r, pPwValues: cd!.pPwValues, pwScores: cd!.pwScores, colorScores: scoresForColor } as FrontendAnalyzeResult & { pPwValues?: number[]; pwScores?: number[]; colorScores?: number[] });
415
+ }
416
+
417
+ /** 主题切换时调用:在样式生效后统一重绘直方图与文本(rgba 透出背景,需等新主题生效) */
418
+ public rerenderOnThemeChange(): void {
419
+ requestAnimationFrame(() => requestAnimationFrame(() => {
420
+ this.rerenderHistogramsInternal(true);
421
+ this.deps.lmf.reRenderCurrent();
422
+ }));
423
  }
424
 
425
  /**
 
602
  res: {
603
  model?: string;
604
  token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>;
605
+ debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] };
 
606
  },
607
  text?: string
608
  ): void {
 
638
  enableRenderAnimation: false,
639
  semanticAnalysisMode: getSemanticAnalysisEnabled(),
640
  }, false);
 
641
  this.clearHighlights();
642
+ // 仅由 rerenderHistogramsInternal 调用 lmf.update,避免与 handleSemanticResponse 的重复调用导致语义渲染双重叠加
643
  this.rerenderHistogramsInternal();
644
  this.syncSemanticUiFromConfig();
645
 
646
+ const di = res.debug_info;
647
+ const top10 = (di?.topk_tokens && di?.topk_probs)
648
+ ? di.topk_tokens.map((token, i) => ({ token, prob: di.topk_probs![i] ?? 0 }))
649
+ : undefined;
650
+ this.updateSemanticDebugInfo(di?.abbrev, top10);
651
  }
652
 
653
  /** 更新文本渲染区下方的 debug 信息(abbrev + top10) */
 
672
  if (top10?.length) {
673
  const items = top10.map((t) => `'${esc(t.token)}(${(t.prob * 100).toFixed(1)}%)'`);
674
  parts.push(`<div class="semantic-debug-top10">[${items.join(', ')}]</div>`);
675
+ parts.push(renderTopkChartFullHtml(top10));
676
  }
677
  el.innerHTML = parts.join('');
678
  }
 
681
  res: { model?: string },
682
  tokenAttention: Array<{ offset: [number, number]; raw: string; score: number }>,
683
  text: string
684
+ ): (FrontendAnalyzeResult & { rawScoresNormed: number[]; attentionRawScores: number[] }) | null {
685
  const safeText = text.trim();
686
  if (!safeText) return null;
687
  const syntheticTokens = tokenAttention.map((t) => ({
 
703
  mergedTokens,
704
  originalToMergedMap,
705
  originalText: safeText,
706
+ rawScoresNormed: this.normalizeScoresForColor(scores),
707
  attentionRawScores: scores,
708
  };
709
  }
 
757
  }
758
 
759
  /**
760
+ * 将 raw score 归一化到 [0,1] 用于染色(0-max 归一化:norm = raw_score / max)
761
  * NaN/Inf 不参与 max 计算,映射为 0
762
  */
763
  private normalizeScoresForColor(scores: number[]): number[] {
client/src/ts/vis/GLTR_Text_Box.ts CHANGED
@@ -43,12 +43,15 @@ export enum GLTR_Mode {
43
  fract_p
44
  }
45
 
46
- /** tokenData:信息密度模式为 FrontendToken,Semantic analysis 模式下附加 matchScore */
47
- export type TokenDataForRender = FrontendToken & { matchScore?: number };
48
 
49
  /** 语义模式下的 Tooltip 展示字段 */
50
  export type SemanticRenderFields = {
51
- matchScore?: number;
 
 
 
52
  /** Attention 分析时的原始 score(未归一化) */
53
  rawScore?: number;
54
  };
@@ -62,9 +65,9 @@ export type GLTR_HoverEvent = { hovered: boolean, d: GLTR_RenderItem, event?: Mo
62
 
63
  /** 从 token 中安全提取语义展示字段,无需类型断言 */
64
  function extractSemanticFields(token: TokenDataForRender): SemanticRenderFields | undefined {
65
- const matchScore = "matchScore" in token && typeof token.matchScore === "number" ? token.matchScore : undefined;
66
- if (matchScore === undefined) return undefined;
67
- return { matchScore };
68
  }
69
 
70
  export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
@@ -87,7 +90,7 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
87
  // Minimap 配置
88
  enableMinimap: false, // 是否启用 minimap(默认关闭)
89
  minimapWidth: getMinimapWidthFromCSS(), // minimap 宽度(像素),从 CSS 变量读取
90
- // Semantic analysis 模式:为 true 时按 attention score 染色
91
  semanticAnalysisMode: false,
92
  };
93
 
@@ -237,9 +240,10 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
237
  this.positionCalculator = new TokenPositionCalculator(baseNode);
238
  }
239
 
240
- const rdExt = rd as FrontendAnalyzeResult & { attentionScores?: number[] };
241
- const attentionScores = rdExt.attentionScores;
242
- const isSemantic = this.options.semanticAnalysisMode && attentionScores?.length;
 
243
 
244
  // Semantic analysis 模式:按 BPE(merged tokens);否则按 BPE
245
  const rdForPositions: FrontendAnalyzeResult = rd;
@@ -262,7 +266,7 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
262
  charToByteIndexMap: this._current.charToByteIndexMap,
263
  }
264
  : undefined,
265
- semantic: isSemantic ? { analysisMode: true, matchScores: attentionScores } : undefined,
266
  };
267
  this.svgOverlayManager = new SvgOverlayManager(baseNode, overlayOptions);
268
 
@@ -616,30 +620,16 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
616
 
617
  /**
618
  * 设置主题变化监听器
 
619
  */
620
  private setupThemeListener(): void {
621
- // 使用MutationObserver监听data-theme属性的变化
622
  const observer = new MutationObserver((mutations) => {
623
  mutations.forEach((mutation) => {
624
  if (mutation.type === 'attributes' && mutation.attributeName === 'data-theme') {
625
- // 主题变化时,更新颜色scale并重新渲染
626
  this.updateColorScales();
627
- if (this.currentRenderData) {
628
- // 主题切换时禁用动画,立即重新渲染
629
- const originalAnimationSetting = this.options.enableRenderAnimation;
630
- this.options.enableRenderAnimation = false;
631
- // 重新渲染当前数据
632
- this._render(this.currentRenderData);
633
- // 恢复动画设置
634
- setTimeout(() => {
635
- this.options.enableRenderAnimation = originalAnimationSetting;
636
- }, 100);
637
- }
638
  }
639
  });
640
  });
641
-
642
- // 开始观察document.documentElement的data-theme属性
643
  observer.observe(document.documentElement, {
644
  attributes: true,
645
  attributeFilter: ['data-theme']
@@ -662,18 +652,23 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
662
  */
663
  private addTokenEventListeners(element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult): void {
664
  const rdExt = rd as FrontendAnalyzeResult & {
665
- attentionScores?: number[];
666
  attentionRawScores?: number[];
 
 
667
  };
668
- const hasAttentionScores = rdExt.attentionScores?.length && tokenIndex < rdExt.attentionScores.length;
669
- const showTooltip = true; // 始终显示 tooltip,semantic 部分在 hasAttentionScores 时填充
670
 
671
  const tokenData = rd.bpe_strings[tokenIndex] as TokenDataForRender;
672
  let semantic = showTooltip ? extractSemanticFields(tokenData) : undefined;
673
- if (showTooltip && hasAttentionScores && rdExt.attentionScores) {
674
- const attnScore = rdExt.attentionScores[tokenIndex];
 
675
  const rawScore = rdExt.attentionRawScores?.[tokenIndex];
676
- semantic = { ...semantic, matchScore: attnScore, rawScore } as SemanticRenderFields;
 
 
677
  }
678
 
679
  const handleMouseEnter = (event: MouseEvent) => {
 
43
  fract_p
44
  }
45
 
46
+ /** tokenData:信息密度模式为 FrontendToken,Semantic analysis 模式下附加 rawScoreNormed */
47
+ export type TokenDataForRender = FrontendToken & { rawScoreNormed?: number };
48
 
49
  /** 语义模式下的 Tooltip 展示字段 */
50
  export type SemanticRenderFields = {
51
+ pwScore?: number;
52
+ /** 信号概率 P_pw:x<=threshold 为 0,x>threshold 为 1 */
53
+ signalProb?: number;
54
+ rawScoreNormed?: number;
55
  /** Attention 分析时的原始 score(未归一化) */
56
  rawScore?: number;
57
  };
 
65
 
66
  /** 从 token 中安全提取语义展示字段,无需类型断言 */
67
  function extractSemanticFields(token: TokenDataForRender): SemanticRenderFields | undefined {
68
+ const rawScoreNormed = "rawScoreNormed" in token && typeof token.rawScoreNormed === "number" ? token.rawScoreNormed : undefined;
69
+ if (rawScoreNormed === undefined) return undefined;
70
+ return { rawScoreNormed };
71
  }
72
 
73
  export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
 
90
  // Minimap 配置
91
  enableMinimap: false, // 是否启用 minimap(默认关闭)
92
  minimapWidth: getMinimapWidthFromCSS(), // minimap 宽度(像素),从 CSS 变量读取
93
+ // Semantic analysis 模式:为 true 时按 raw score normed 染色
94
  semanticAnalysisMode: false,
95
  };
96
 
 
240
  this.positionCalculator = new TokenPositionCalculator(baseNode);
241
  }
242
 
243
+ const rdExt = rd as FrontendAnalyzeResult & { rawScoresNormed?: number[]; colorScores?: number[] };
244
+ const rawScoresNormed = rdExt.rawScoresNormed;
245
+ const colorScores = (rdExt.colorScores?.length ? rdExt.colorScores : undefined) ?? rawScoresNormed;
246
+ const isSemantic = this.options.semanticAnalysisMode && colorScores?.length;
247
 
248
  // Semantic analysis 模式:按 BPE(merged tokens);否则按 BPE
249
  const rdForPositions: FrontendAnalyzeResult = rd;
 
266
  charToByteIndexMap: this._current.charToByteIndexMap,
267
  }
268
  : undefined,
269
+ semantic: isSemantic ? { analysisMode: true, rawScoresNormed: colorScores } : undefined,
270
  };
271
  this.svgOverlayManager = new SvgOverlayManager(baseNode, overlayOptions);
272
 
 
620
 
621
  /**
622
  * 设置主题变化监听器
623
+ * 仅更新 fracScale/diffScale;重渲染由 initThemeManager 的 onThemeChange -> rerenderOnThemeChange 统一触发
624
  */
625
  private setupThemeListener(): void {
 
626
  const observer = new MutationObserver((mutations) => {
627
  mutations.forEach((mutation) => {
628
  if (mutation.type === 'attributes' && mutation.attributeName === 'data-theme') {
 
629
  this.updateColorScales();
 
 
 
 
 
 
 
 
 
 
 
630
  }
631
  });
632
  });
 
 
633
  observer.observe(document.documentElement, {
634
  attributes: true,
635
  attributeFilter: ['data-theme']
 
652
  */
653
  private addTokenEventListeners(element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult): void {
654
  const rdExt = rd as FrontendAnalyzeResult & {
655
+ rawScoresNormed?: number[];
656
  attentionRawScores?: number[];
657
+ pPwValues?: number[];
658
+ pwScores?: number[];
659
  };
660
+ const hasRawScoresNormed = rdExt.rawScoresNormed?.length && tokenIndex < rdExt.rawScoresNormed.length;
661
+ const showTooltip = true; // 始终显示 tooltip,semantic 部分在 hasRawScoresNormed 时填充
662
 
663
  const tokenData = rd.bpe_strings[tokenIndex] as TokenDataForRender;
664
  let semantic = showTooltip ? extractSemanticFields(tokenData) : undefined;
665
+ if (showTooltip && hasRawScoresNormed && rdExt.rawScoresNormed) {
666
+ // rawScoreNormed 始终用 rawScoresNormed,与 color source 无关
667
+ const attnScore = rdExt.rawScoresNormed[tokenIndex];
668
  const rawScore = rdExt.attentionRawScores?.[tokenIndex];
669
+ const signalProb = rdExt.pPwValues?.[tokenIndex]; // P_pw:x<=threshold 0,x>threshold 1
670
+ const pwScore = rdExt.pwScores?.[tokenIndex];
671
+ semantic = { ...semantic, rawScoreNormed: attnScore, rawScore, signalProb, pwScore } as SemanticRenderFields;
672
  }
673
 
674
  const handleMouseEnter = (event: MouseEvent) => {
client/src/ts/vis/Histogram.ts CHANGED
@@ -1,16 +1,46 @@
1
  import { VComponent } from "./VisComponent";
2
  import { D3Sel } from "../utils/Util";
3
  import { SimpleEventHandler } from "../utils/SimpleEventHandler";
 
4
  import * as d3 from "d3";
5
  import { schemeDark2 } from "d3";
6
 
7
  const averageNumberFormat = d3.format('.2f');
8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  export type HistogramData = {
10
  data: number[],
11
  label?: string,
12
  no_bins: number,
13
- extent: number[],
14
  colorScale: (value: number) => string, // 添加颜色 scale
15
  averageValue?: number,
16
  p90Value?: number,
@@ -18,8 +48,21 @@ export type HistogramData = {
18
  p90Label?: string,
19
  showLeftInfinity?: boolean,
20
  showRightInfinity?: boolean,
21
- xAxisTickSkip?: number, // x轴刻度数字绘制间隔,0表示不跳过,1表示隔一个绘制一个(0,2,4...)
22
- yScaleType?: 'linear' | 'sqrt' // y轴尺度:linear 线性,sqrt 平方根(压缩高值、放大低值,适合分布悬殊的直方图)
 
 
 
 
 
 
 
 
 
 
 
 
 
23
  }
24
 
25
 
@@ -68,6 +111,9 @@ export class Histogram extends VComponent<HistogramData> {
68
  .attr('class', 'y-axis')
69
  .attr('transform', `translate(${op.width - 33},0)`)
70
 
 
 
 
71
  // 背景面板:避免柱体与整体页面纯白背景混淆
72
  this.layers.bg.insert('rect', ':first-child')
73
  .attr('class', 'panel-bg')
@@ -88,8 +134,18 @@ export class Histogram extends VComponent<HistogramData> {
88
  protected _render(rD: HistogramData): void {
89
  const op = this.options;
90
 
91
- // extent 是必选参数直接使用
92
- const extent = rD.extent;
 
 
 
 
 
 
 
 
 
 
93
 
94
  // 计算bin宽度
95
  const binWidth = (extent[1] - extent[0]) / rD.no_bins;
@@ -111,7 +167,7 @@ export class Histogram extends VComponent<HistogramData> {
111
  // 如果指定了 extent,确保使用 extent 作为 domain,而不是 nice() 调整后的 domain
112
  // 这样可以保证 extent 的上限被正确使用,即使数据被截断了
113
  // 使用 extent 作为 domain,确保范围正确
114
- const padding = { left: 10, right: 35 };
115
  let valueScale = d3.scaleLinear().domain([extent[0], extent[1]]).range([padding.left, op.width - padding.right]);
116
 
117
  const hasAverageValue = typeof rD.averageValue === 'number' && Number.isFinite(rD.averageValue);
@@ -146,15 +202,25 @@ export class Histogram extends VComponent<HistogramData> {
146
  console.warn('Invalid maxCount for histogram:', maxCount);
147
  maxCount = 1;
148
  }
 
 
 
 
149
 
150
  const useSqrt = rD.yScaleType === 'sqrt';
151
- const countScale = useSqrt
152
- ? d3.scaleSqrt().domain([0, maxCount]).range([op.height - op.margin_bottom, op.margin_top])
153
- : d3.scaleLinear().domain([0, maxCount]).nice().range([op.height - op.margin_bottom, op.margin_top]);
154
-
155
- const adjustWidth = (bandH: number) => {
156
- if (!isFinite(bandH) || bandH <= 0) return 0;
157
- return (bandH > 5) ? Math.max(0, bandH - 2) : (0.8 * bandH);
 
 
 
 
 
 
158
  };
159
 
160
  const getBandWidth = (d: d3.Bin<number, number>) => valueScale(d.x1) - valueScale(d.x0);
@@ -187,9 +253,7 @@ export class Histogram extends VComponent<HistogramData> {
187
  return isFinite(w) && w > 0 ? w : 1;
188
  },
189
  height: d => {
190
- if (d.length === 0) {
191
- return 0;
192
- }
193
  const h = op.height - op.margin_bottom - countScale(d.length);
194
  return isFinite(h) && h > 0 ? h : 1;
195
  },
@@ -211,6 +275,29 @@ export class Histogram extends VComponent<HistogramData> {
211
  return this._current.selectedBinIndex === i ? 'drop-shadow(0 0 6px rgba(42, 158, 255, 0.8))' : 'none';
212
  });
213
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  const avgMarkerData = averageX !== null && Number.isFinite(averageX)
215
  ? [{ x: averageX, value: rD.averageValue as number }]
216
  : [];
@@ -273,15 +360,50 @@ export class Histogram extends VComponent<HistogramData> {
273
  .attr('y', op.margin_top)
274
  .text('p90');
275
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
276
  const p90LabelData = (typeof rD.p90Value === 'number' && Number.isFinite(rD.p90Value)) ? [rD.p90Value] : [];
 
277
  this.layers.fg.selectAll('.p90-label').data(p90LabelData)
278
  .join('text')
279
  .attr('class', 'p90-label sizeLabel')
280
  .attr('text-anchor', 'end')
281
  .attr('x', op.width * 0.75)
282
- .attr('y', Math.max(24, op.margin_top + 10))
283
  .text(value => {
284
- return `p90 = ${averageNumberFormat(value)} bits`;
 
285
  });
286
 
287
  const labelData = histo.filter(bin => bin.length > 0);
@@ -408,27 +530,28 @@ export class Histogram extends VComponent<HistogramData> {
408
  });
409
 
410
 
411
- const yAxis = d3.axisRight(countScale).tickFormat(op.numberFormat);
412
- if (useSqrt) yAxis.ticks(5); // sqrt 尺度下减少刻度,避免拥挤
 
413
  this.layers.bg.select('.y-axis').call(<any>yAxis);
414
 
415
  const tickValues = [extent[0], ...thresholds, extent[1]];
416
  const tickSkip = rD.xAxisTickSkip ?? 0;
417
 
418
  // Custom tick format: 根据 showLeftInfinity/showRightInfinity 决定是否显示 ±∞
419
- // 根据 xAxisTickSkip 参数决定是否显示数字标签(刻线始终显示)
420
  const xAxisTickFormat = (d: number) => {
421
- // 边界值优先处理,避免被 tickSkip 逻辑意外隐藏
422
- if (rD.extent) {
423
- if (rD.showLeftInfinity && Math.abs(d - rD.extent[0]) < 0.001) return '-∞';
424
- if (rD.showRightInfinity && Math.abs(d - rD.extent[1]) < 0.001) return '∞';
425
- }
426
 
427
- // 基于值而非索引决定是否显示:检查 d 是否为 step 的整数倍
428
- // 这样可以自动对齐到整的刻度,避免起点偏移导致显示非整数刻度
429
  if (tickSkip > 0) {
430
- const step = (tickSkip + 1) * binWidth;
431
- if (Math.abs(d / step - Math.round(d / step)) > 1e-9) return '';
 
 
 
 
 
432
  }
433
 
434
  return op.numberFormat(d);
@@ -439,6 +562,46 @@ export class Histogram extends VComponent<HistogramData> {
439
  .tickValues(tickValues);
440
  this.layers.bg.select('.x-axis').call(<any>xAxis);
441
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
442
 
443
  }
444
 
 
1
  import { VComponent } from "./VisComponent";
2
  import { D3Sel } from "../utils/Util";
3
  import { SimpleEventHandler } from "../utils/SimpleEventHandler";
4
+ import { tr } from "../lang/i18n-lite";
5
  import * as d3 from "d3";
6
  import { schemeDark2 } from "d3";
7
 
8
  const averageNumberFormat = d3.format('.2f');
9
 
10
/**
 * Build non-linear y-axis tick values following the 1-2-5 decade pattern
 * (0, 1, 2, 5, 10, 20, 50, ...), always ending at `maxCount`.
 *
 * @param maxCount Largest value the axis must cover. Non-positive or
 *                 non-finite values (NaN, ±Infinity) yield `[0]`.
 * @param maxTicks Upper bound on the number of ticks returned. Values below 2
 *                 are treated as 2 (the two end points), fractional values are
 *                 floored, and NaN falls back to the default of 10.
 * @returns Ascending, duplicate-free tick values starting at 0.
 */
function getNonLinearTickValues(maxCount: number, maxTicks = 10): number[] {
    // An infinite maxCount would keep `decade <= maxCount` true forever, and
    // NaN would leak into the tick list, so reject both up front.
    if (!Number.isFinite(maxCount) || maxCount <= 0) return [0];

    // At least two ticks are needed for the evenly spaced sampling below:
    // with a single tick the `(limit - 1)` divisor would be zero.
    const limit = Number.isNaN(maxTicks) ? 10 : Math.max(2, Math.floor(maxTicks));

    const ticks: number[] = [0];
    const base = [1, 2, 5];
    for (let decade = 1; decade <= maxCount; decade *= 10) {
        for (const b of base) {
            const v = b * decade;
            if (v <= maxCount) ticks.push(v);
        }
    }
    // Make sure the axis maximum itself is labelled.
    if (ticks[ticks.length - 1] !== maxCount) ticks.push(maxCount);
    if (ticks.length <= limit) return ticks;

    // Too many candidates: sample `limit` of them at evenly spaced indices,
    // always keeping the first (0) and the last (maxCount).
    const result: number[] = [];
    for (let i = 0; i < limit; i++) {
        const idx = Math.round((i / (limit - 1)) * (ticks.length - 1));
        result.push(ticks[idx]);
    }
    return [...new Set(result)].sort((a, b) => a - b);
}
32
+
33
+ /** 单边:固定值或 'auto'(从 data 解析) */
34
+ export type HistogramExtentBound = number | 'auto';
35
+
36
+ /** extent:'auto' 等价于 ['auto','auto'],支持双边独立配置 */
37
+ export type HistogramExtent = [HistogramExtentBound, HistogramExtentBound] | 'auto';
38
+
39
  export type HistogramData = {
40
  data: number[],
41
  label?: string,
42
  no_bins: number,
43
+ extent: HistogramExtent,
44
  colorScale: (value: number) => string, // 添加颜色 scale
45
  averageValue?: number,
46
  p90Value?: number,
 
48
  p90Label?: string,
49
  showLeftInfinity?: boolean,
50
  showRightInfinity?: boolean,
51
+ /** x轴刻度数字绘制间隔,0表示不跳过,1表示隔一个绘制一个(0,2,4...) */
52
+ xAxisTickSkip?: number,
53
+ /** x轴刻度凑整:true=仅显示 step 整数倍处的标签(与 tickSkip 配合),false/undefined=显示全部 */
54
+ xAxisTickRound?: boolean;
55
+ yScaleType?: 'linear' | 'sqrt' | 'log' // y轴尺度:linear 线性,sqrt 平方根,log 对数(指数刻度,从 0 开始)
56
+ /** 拟合分布的每个 bin 期望计数,用于绘制横虚线标识(如指数噪声拟合) */
57
+ fitExpectedCounts?: number[];
58
+ /** 是否叠加 prob 曲线(共用 x 轴,左侧新 y 轴 0–1) */
59
+ showProbCurve?: boolean;
60
+ /** 曲线数据:x=raw_score_normed,y=prob(0–1),P(signal) 按 findSignalThreshold 的 bin 分块估计,(obs-exp)/obs */
61
+ probCurveData?: { x: number[]; y: number[] };
62
+ /** 信号阈值竖线:归一化分数,用于 raw_score_normed 直方图 */
63
+ signalThreshold?: number | null;
64
+ /** 信号阈值对应的分位数(0–100),用于 label 显示 τ = pXX */
65
+ signalThresholdPercentile?: number | null;
66
  }
67
 
68
 
 
111
  .attr('class', 'y-axis')
112
  .attr('transform', `translate(${op.width - 33},0)`)
113
 
114
+ this.layers.bg.append('g')
115
+ .attr('class', 'y-axis-prob')
116
+
117
  // 背景面板:避免柱体与整体页面纯白背景混淆
118
  this.layers.bg.insert('rect', ':first-child')
119
  .attr('class', 'panel-bg')
 
134
  protected _render(rD: HistogramData): void {
135
  const op = this.options;
136
 
137
+ // extent 解析:'auto' 等价于 ['auto','auto']支持双边独立配置
138
+ const [loSpec, hiSpec]: [HistogramExtentBound, HistogramExtentBound] =
139
+ rD.extent === 'auto' ? ['auto', 'auto'] : rD.extent;
140
+ const finite = rD.data.filter((d) => typeof d === 'number' && isFinite(d));
141
+ const [dataLo, dataHi] = finite.length > 0
142
+ ? (d3.extent(finite) as [number, number])
143
+ : [0, 1];
144
+ const fallbackLo = finite.length <= 1 ? dataLo - 0.5 : dataLo;
145
+ const fallbackHi = finite.length <= 1 ? dataHi + 0.5 : dataHi;
146
+ const lo = loSpec === 'auto' ? fallbackLo : loSpec;
147
+ const hi = hiSpec === 'auto' ? fallbackHi : hiSpec;
148
+ const extent: [number, number] = lo > hi ? [lo, lo] : [lo, hi];
149
 
150
  // 计算bin宽度
151
  const binWidth = (extent[1] - extent[0]) / rD.no_bins;
 
167
  // 如果指定了 extent,确保使用 extent 作为 domain,而不是 nice() 调整后的 domain
168
  // 这样可以保证 extent 的上限被正确使用,即使数据被截断了
169
  // 使用 extent 作为 domain,确保范围正确
170
+ const padding = { left: rD.showProbCurve ? 35 : 10, right: 35 };
171
  let valueScale = d3.scaleLinear().domain([extent[0], extent[1]]).range([padding.left, op.width - padding.right]);
172
 
173
  const hasAverageValue = typeof rD.averageValue === 'number' && Number.isFinite(rD.averageValue);
 
202
  console.warn('Invalid maxCount for histogram:', maxCount);
203
  maxCount = 1;
204
  }
205
+ if (rD.fitExpectedCounts && rD.fitExpectedCounts.length > 0) {
206
+ const fitMax = d3.max(rD.fitExpectedCounts) ?? 0;
207
+ if (isFinite(fitMax) && fitMax > maxCount) maxCount = fitMax;
208
+ }
209
 
210
  const useSqrt = rD.yScaleType === 'sqrt';
211
+ const useLog = rD.yScaleType === 'log';
212
+ const countScale = useLog
213
+ ? d3.scaleSymlog().domain([0, Math.max(1, maxCount)]).range([op.height - op.margin_bottom, op.margin_top])
214
+ : useSqrt
215
+ ? d3.scaleSqrt().domain([0, maxCount]).range([op.height - op.margin_bottom, op.margin_top])
216
+ : d3.scaleLinear().domain([0, maxCount]).nice().range([op.height - op.margin_bottom, op.margin_top]);
217
+
218
+ // 与 d3 scaleBand 一致:bandwidth = step * (1 - paddingInner),gap = step * paddingInner
219
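+ // NOTE(review): the ratio measured at no_bins=20 (barWidth:gap ≈ 2.875:1) implies paddingInner ≈ 0.258, but PADDING_INNER below is 0.15 — confirm which value is intended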
220
+ const PADDING_INNER = 0.15;
221
+ const adjustWidth = (step: number) => {
222
+ if (!isFinite(step) || step <= 0) return 0;
223
+ return step * (1 - PADDING_INNER);
224
  };
225
 
226
  const getBandWidth = (d: d3.Bin<number, number>) => valueScale(d.x1) - valueScale(d.x0);
 
253
  return isFinite(w) && w > 0 ? w : 1;
254
  },
255
  height: d => {
256
+ if (d.length === 0) return 0;
 
 
257
  const h = op.height - op.margin_bottom - countScale(d.length);
258
  return isFinite(h) && h > 0 ? h : 1;
259
  },
 
275
  return this._current.selectedBinIndex === i ? 'drop-shadow(0 0 6px rgba(42, 158, 255, 0.8))' : 'none';
276
  });
277
 
278
+ // 拟合分布横虚线:每个 bin 上标识期望计数,宽度与柱体对齐
279
+ const fitData = rD.fitExpectedCounts && rD.fitExpectedCounts.length === histo.length
280
+ ? histo.map((d, i) => {
281
+ const bandWidth = getBandWidth(d);
282
+ const barWidth = adjustWidth(bandWidth);
283
+ const base = valueScale(d.x0);
284
+ const x1 = base + 0.5 * (bandWidth - barWidth);
285
+ return { x1, x2: x1 + barWidth, y: countScale(Math.max(0, rD.fitExpectedCounts![i])) };
286
+ })
287
+ : [];
288
+ this.layers.main.selectAll('.fit-overlay-line').data(fitData)
289
+ .join('line')
290
+ .attr('class', 'fit-overlay-line')
291
+ .attrs({
292
+ x1: d => d.x1,
293
+ x2: d => d.x2,
294
+ y1: d => d.y,
295
+ y2: d => d.y,
296
+ })
297
+ .style('stroke', 'var(--fit-line-color, #999)')
298
+ .style('stroke-width', 1)
299
+ .style('stroke-dasharray', '1,1');
300
+
301
  const avgMarkerData = averageX !== null && Number.isFinite(averageX)
302
  ? [{ x: averageX, value: rD.averageValue as number }]
303
  : [];
 
360
  .attr('y', op.margin_top)
361
  .text('p90');
362
 
363
+ const hasSignalThreshold = typeof rD.signalThreshold === 'number' && Number.isFinite(rD.signalThreshold);
364
+ const clampedSignalThreshold = hasSignalThreshold
365
+ ? Math.min(Math.max(rD.signalThreshold as number, extent[0]), extent[1])
366
+ : null;
367
+ const signalThresholdX = hasSignalThreshold && clampedSignalThreshold !== null
368
+ ? valueScale(clampedSignalThreshold)
369
+ : null;
370
+
371
+ const signalThresholdMarkerData = signalThresholdX !== null && Number.isFinite(signalThresholdX)
372
+ ? [{ x: signalThresholdX, value: rD.signalThreshold as number, percentile: rD.signalThresholdPercentile }]
373
+ : [];
374
+
375
+ this.layers.fg.selectAll('.signal-threshold-line').data(signalThresholdMarkerData)
376
+ .join('line')
377
+ .attr('class', 'signal-threshold-line')
378
+ .attrs({
379
+ x1: d => d.x,
380
+ x2: d => d.x,
381
+ y1: op.margin_top + 4,
382
+ y2: op.height - op.margin_bottom
383
+ })
384
+ .style('stroke', 'var(--signal-threshold-line-color, #e74c3c)')
385
+ .style('stroke-width', 1.5)
386
+ .style('stroke-dasharray', '3,2');
387
+
388
+ this.layers.fg.selectAll('.signal-threshold-marker-label').data(signalThresholdMarkerData)
389
+ .join('text')
390
+ .attr('class', 'signal-threshold-marker-label sizeLabel')
391
+ .attr('text-anchor', 'middle')
392
+ .attr('x', d => d.x)
393
+ .attr('y', op.margin_top)
394
+ .text(d => typeof d.percentile === 'number' ? `τ = p${d.percentile}` : 'τ');
395
+
396
  const p90LabelData = (typeof rD.p90Value === 'number' && Number.isFinite(rD.p90Value)) ? [rD.p90Value] : [];
397
+ const p90LabelY = avgLabelData.length > 0 ? Math.max(24, op.margin_top + 10) : Math.max(12, op.margin_top - 2);
398
  this.layers.fg.selectAll('.p90-label').data(p90LabelData)
399
  .join('text')
400
  .attr('class', 'p90-label sizeLabel')
401
  .attr('text-anchor', 'end')
402
  .attr('x', op.width * 0.75)
403
+ .attr('y', p90LabelY)
404
  .text(value => {
405
+ const suffix = rD.p90Label ? ` ${rD.p90Label}` : '';
406
+ return `p90 = ${averageNumberFormat(value)}${suffix}`;
407
  });
408
 
409
  const labelData = histo.filter(bin => bin.length > 0);
 
530
  });
531
 
532
 
533
+ const yAxis = d3.axisRight(countScale)
534
+ .tickFormat(useLog ? d3.format('.0f') : op.numberFormat);
535
+ if (useSqrt || useLog) yAxis.tickValues(getNonLinearTickValues(maxCount, 10));
536
  this.layers.bg.select('.y-axis').call(<any>yAxis);
537
 
538
  const tickValues = [extent[0], ...thresholds, extent[1]];
539
  const tickSkip = rD.xAxisTickSkip ?? 0;
540
 
541
  // Custom tick format: 根据 showLeftInfinity/showRightInfinity 决定是否显示 ±∞
542
+ // xAxisTickSkip:减少刻度标签;xAxisTickRound:true 时按 step 对齐过滤,false 时按索引跳过
543
  const xAxisTickFormat = (d: number) => {
544
+ if (rD.showLeftInfinity && Math.abs(d - extent[0]) < 0.001) return '-∞';
545
+ if (rD.showRightInfinity && Math.abs(d - extent[1]) < 0.001) return '∞';
 
 
 
546
 
 
 
547
  if (tickSkip > 0) {
548
+ if (rD.xAxisTickRound) {
549
+ const step = (tickSkip + 1) * binWidth;
550
+ if (Math.abs(d / step - Math.round(d / step)) > 1e-9) return '';
551
+ } else {
552
+ const idx = tickValues.findIndex((t) => Math.abs(t - d) < 1e-9 * (Math.abs(d) + 1));
553
+ if (idx >= 0 && idx % (tickSkip + 1) !== 0) return '';
554
+ }
555
  }
556
 
557
  return op.numberFormat(d);
 
562
  .tickValues(tickValues);
563
  this.layers.bg.select('.x-axis').call(<any>xAxis);
564
 
565
+ const hasProbCurve = rD.showProbCurve && rD.probCurveData && rD.probCurveData.x.length > 0;
566
+ if (hasProbCurve) {
567
+ const probYScale = d3.scaleLinear()
568
+ .domain([0, 1])
569
+ .range([op.height - op.margin_bottom, op.margin_top]);
570
+
571
+ const probPoints: { x: number; y: number }[] = rD.probCurveData!.x.map((x, i) => ({ x, y: rD.probCurveData!.y[i] ?? 0 }));
572
+ const probLine = d3.line<{ x: number; y: number }>()
573
+ .x(d => valueScale(d.x))
574
+ .y(d => probYScale(d.y))
575
+ .curve(d3.curveLinear);
576
+
577
+ this.layers.fg.selectAll('.prob-curve').data([probPoints])
578
+ .join('path')
579
+ .attr('class', 'prob-curve')
580
+ .attr('d', probLine)
581
+ .style('fill', 'none')
582
+ .style('stroke', 'var(--prob-curve-color, rgba(160,200,255,0.85))')
583
+ .style('stroke-width', 1.5)
584
+ .style('pointer-events', 'none');
585
+
586
+ const probAxis = d3.axisLeft(probYScale)
587
+ .ticks(5)
588
+ .tickFormat(d3.format('.1f'));
589
+ this.layers.bg.select('.y-axis-prob')
590
+ .attr('transform', `translate(${padding.left},0)`)
591
+ .call(<any>probAxis);
592
+
593
+ this.layers.bg.selectAll('.prob-curve-axis-label').data([1])
594
+ .join('text')
595
+ .attr('class', 'prob-curve-axis-label sizeLabel')
596
+ .attr('text-anchor', 'middle')
597
+ .attr('transform', `translate(8,${(op.height - op.margin_bottom) / 2 + op.margin_top}) rotate(-90)`)
598
+ .text(tr('signal ratio'));
599
+
600
+ } else {
601
+ this.layers.fg.selectAll('.prob-curve').remove();
602
+ this.layers.bg.select('.y-axis-prob').selectAll('*').remove();
603
+ this.layers.bg.selectAll('.prob-curve-axis-label').remove();
604
+ }
605
 
606
  }
607
 
client/src/ts/vis/SvgOverlayManager.ts CHANGED
@@ -18,8 +18,8 @@ export interface DiffOverlayOptions {
18
  /** 语义分析模式配置 */
19
  export interface SemanticOverlayOptions {
20
  analysisMode: boolean;
21
- /** 查询匹配时每 token 的匹配度 [0,1] */
22
- matchScores?: number[];
23
  }
24
 
25
  export interface SvgOverlayManagerOptions {
@@ -226,7 +226,7 @@ export class SvgOverlayManager {
226
  group.appendChild(rect);
227
  // 语义分析模式:在信息密度之上叠加语义高亮(黄色渐变)
228
  const sem = this.options.semantic;
229
- if (sem?.analysisMode && sem.matchScores) {
230
  const overlayRect = this.createSemanticOverlayRect(pos, tokenIndex, rd);
231
  group.appendChild(overlayRect);
232
  }
@@ -326,7 +326,7 @@ export class SvgOverlayManager {
326
  ): SVGRectElement {
327
  const rect = document.createElementNS('http://www.w3.org/2000/svg', 'rect');
328
  const sem = this.options.semantic!;
329
- const score = sem.matchScores![tokenIndex];
330
  const color = score !== undefined ? getSemanticSimilarityColor(score) : 'transparent';
331
 
332
  this.setRectGeometry(rect, pos);
 
18
  /** 语义分析模式配置 */
19
  export interface SemanticOverlayOptions {
20
  analysisMode: boolean;
21
+ /** 查询匹配时每 token 的 raw score normed [0,1] */
22
+ rawScoresNormed?: number[];
23
  }
24
 
25
  export interface SvgOverlayManagerOptions {
 
226
  group.appendChild(rect);
227
  // 语义分析模式:在信息密度之上叠加语义高亮(黄色渐变)
228
  const sem = this.options.semantic;
229
+ if (sem?.analysisMode && sem.rawScoresNormed) {
230
  const overlayRect = this.createSemanticOverlayRect(pos, tokenIndex, rd);
231
  group.appendChild(overlayRect);
232
  }
 
326
  ): SVGRectElement {
327
  const rect = document.createElementNS('http://www.w3.org/2000/svg', 'rect');
328
  const sem = this.options.semantic!;
329
+ const score = sem.rawScoresNormed![tokenIndex];
330
  const color = score !== undefined ? getSemanticSimilarityColor(score) : 'transparent';
331
 
332
  this.setRectGeometry(rect, pos);
client/src/ts/vis/ToolTip.ts CHANGED
@@ -1,6 +1,3 @@
1
- /** Tooltip 显示的 pred_topk 候选数量,与后端 runtime_config.DEFAULT_TOPK 保持一致 */
2
- const DISPLAY_TOPK = 10;
3
-
4
  import { D3Sel, calculateSurprisal, calculateSurprisalDensity } from "../utils/Util";
5
  import { SimpleEventHandler } from "../utils/SimpleEventHandler";
6
  import { GLTR_RenderItem } from "./GLTR_Text_Box";
@@ -8,6 +5,8 @@ import type { FrontendToken } from "../api/GLTR_API";
8
  import * as d3 from "d3";
9
  import { tr } from "../lang/i18n-lite";
10
  import { getTokenRenderStyle } from "../utils/tokenRenderStyle";
 
 
11
 
12
  const SEPARATOR = '─────────────';
13
 
@@ -18,147 +17,6 @@ function renderField(f: DetailField, dc: string, vc: string): string {
18
  return `<span style="color: ${dc}">${f.label}</span> <span style="color: ${valColor}">${f.value}</span>`;
19
  }
20
 
21
/**
 * Prepare a candidate-token string for display, using the same pipeline as
 * the main token: special characters are made visible first, and the result
 * is HTML-escaped afterwards.
 *
 * @param text Raw decoded candidate string.
 * @returns Display-ready text (special characters visualised, HTML-escaped).
 */
function processCandidateText(text: string): string {
    // Order matters: visualise first so the inserted markers are escaped too.
    const visualized = visualizeSpecialChars(text);
    const escaped = escapeHtml(visualized);
    return escaped;
}
31
-
32
/**
 * Escape a string for safe insertion into HTML by letting the DOM serialise
 * it: the text is assigned as the text content of a detached <div> and read
 * back as markup.
 *
 * @param text Raw text.
 * @returns The HTML-escaped text.
 */
function escapeHtml(text: string): string {
    const scratch = document.createElement('div');
    scratch.textContent = text;
    const { innerHTML: escaped } = scratch;
    return escaped;
}
42
-
43
/**
 * Report whether the given string contains a Unicode White_Space character.
 * Intended to be called with a single character.
 *
 * @param char Character to test.
 * @returns true when a White_Space character is present.
 */
function isWhitespaceChar(char: string): boolean {
    const whitespacePattern = /\p{White_Space}/u;
    return char.search(whitespacePattern) !== -1;
}
51
/**
 * Decide whether a character can be shown as-is, based on a fixed list of
 * commonly used code-point ranges. Whitespace is never considered printable.
 *
 * @param char Character to test (only its first code point is inspected).
 * @returns true when the character is non-whitespace and falls in a listed range.
 */
function isPrintableChar(char: string): boolean {
    // Whitespace is rejected before any range check, so the whitespace code
    // points that fall inside the ranges below (e.g. U+0020, U+00A0) never match.
    if (isWhitespaceChar(char)) return false;

    const cp = char.codePointAt(0);
    if (cp === undefined) return false;

    // Inclusive [first, last] code-point ranges treated as printable.
    const printableRanges: ReadonlyArray<readonly [number, number]> = [
        [32, 126],          // ASCII printable characters
        [0x00A0, 0x00FF],   // Latin-1 Supplement
        [0x0100, 0x017F],   // Latin Extended-A
        [0x0180, 0x024F],   // Latin Extended-B
        [0x2000, 0x206F],   // General Punctuation
        [0x2070, 0x209F],   // Superscripts and Subscripts
        [0x20A0, 0x20CF],   // Currency Symbols
        [0x2100, 0x214F],   // Letterlike Symbols
        [0x2190, 0x21FF],   // Arrows
        [0x2200, 0x22FF],   // Mathematical Operators
        [0x2300, 0x23FF],   // Miscellaneous Technical
        [0x2400, 0x243F],   // Control Pictures
        [0x2E00, 0x2E7F],   // Supplemental Punctuation
        [0x3000, 0x303F],   // CJK Symbols and Punctuation
        [0x3040, 0x309F],   // Hiragana
        [0x30A0, 0x30FF],   // Katakana
        [0x4E00, 0x9FFF],   // CJK Unified Ideographs
        [0xAC00, 0xD7AF],   // Hangul Syllables
        [0xF900, 0xFAFF],   // CJK Compatibility Ideographs
        [0xFF00, 0xFFEF],   // Halfwidth and Fullwidth Forms
    ];
    return printableRanges.some(([first, last]) => cp >= first && cp <= last);
}
99
-
100
/**
 * Replace special characters with a visible textual representation.
 *
 * Mapping (applied in a single left-to-right pass over the input):
 *   CR LF -> [CRLF], LF -> [LF], CR -> [CR], TAB -> [TAB],
 *   U+3000 (full-width space) -> [FS], space -> ·
 * Any other character that `isPrintableChar` rejects is rendered as its
 * code point, e.g. [U+200B]; printable characters are kept unchanged.
 *
 * The pass works on whole code points, so a character outside the BMP
 * (such as an emoji) is reported once as [U+1F600] rather than as two
 * surrogate halves. Because markers are produced during the pass instead of
 * being detected afterwards, a literal '[' in the input no longer disables
 * escaping for the text that follows it.
 *
 * @param text Raw text.
 * @returns Text with special characters replaced by markers or code points.
 */
function visualizeSpecialChars(text: string): string {
    // `\r\n` is listed first so a Windows line break is consumed as one unit;
    // with the `u` flag `[\s\S]` matches one full code point.
    return text.replace(/\r\n|[\s\S]/gu, (unit: string): string => {
        switch (unit) {
            case '\r\n': return '[CRLF]';
            case '\n': return '[LF]';
            case '\r': return '[CR]';
            case '\t': return '[TAB]';
            case '\u3000': return '[FS]';
            case ' ': return '·';
        }

        if (isPrintableChar(unit)) return unit;

        const codePoint = unit.codePointAt(0);
        // Keep the character untouched if its code point cannot be read.
        if (codePoint === undefined) return unit;
        const hexCode = codePoint.toString(16).toUpperCase().padStart(4, '0');
        return `[U+${hexCode}]`;
    });
}
160
-
161
-
162
  export class ToolTip {
163
  private predictions: D3Sel;
164
  private myDetail: D3Sel;
@@ -168,9 +26,6 @@ export class ToolTip {
168
  private readonly numF = d3.format('.3f');
169
  private readonly significantF = d3.format('.3g');
170
 
171
- // 缓存:d3 scale(按 maxW 缓存)
172
- private scaleCache = new Map<number, d3.ScaleLinear<number, number>>();
173
-
174
  // 缓存:主题颜色
175
  private themeColors = {
176
  normalColor: '#333',
@@ -245,16 +100,6 @@ export class ToolTip {
245
  };
246
  }
247
 
248
- /**
249
- * 获取或创建 scale(带缓存)
250
- */
251
- private _getScale(maxW: number): d3.ScaleLinear<number, number> {
252
- if (!this.scaleCache.has(maxW)) {
253
- this.scaleCache.set(maxW, d3.scaleLinear().domain([0, maxW]).range([0, 60]));
254
- }
255
- return this.scaleCache.get(maxW)!;
256
- }
257
-
258
  /**
259
  * 获取真实的可见视口尺寸和偏移量
260
  * 优先使用 visualViewport API(解决 iOS Safari 地址栏动态显示/隐藏问题)
@@ -434,15 +279,17 @@ export class ToolTip {
434
 
435
  const tokenData = ri.tokenData as FrontendToken;
436
  const s = ri.semantic;
437
- const hasSemantic = s && (s.matchScore !== undefined || s.rawScore !== undefined);
438
  const hasRealTopk = tokenData?.real_topk != null && Array.isArray(tokenData.real_topk);
439
  const predTopk = tokenData?.pred_topk ?? [];
440
  const hasPredictions = predTopk.length > 0;
441
 
442
- // 1. 构建语义区块
443
  const semanticRows: string[] = [];
444
  if (hasSemantic && s) {
445
- if (s.matchScore !== undefined) semanticRows.push(renderField({ label: tr('match score:'), value: this.numF(s.matchScore) }, detailColor, valueColor));
 
 
446
  if (s.rawScore !== undefined) semanticRows.push(renderField({ label: tr('raw score:'), value: d3.format('.6f')(s.rawScore), valueColor: false }, detailColor, valueColor));
447
  }
448
 
@@ -485,20 +332,11 @@ export class ToolTip {
485
  .style('display', 'block')
486
  .html(() => `<div style="color:${detailColor};padding-left:5px;">${tr('Top-k data not available.')}</div>`);
487
  } else {
488
- const wScale = this._getScale(predTopk[0][1]);
489
- this.predictions.selectAll('.row').data(predTopk.slice(0, DISPLAY_TOPK))
490
- .join('div')
491
- .attr('class', 'row')
492
- .style('display', 'table-row')
493
- .html(d => {
494
- const color = tokenData.raw != d[0] ? normalColor : selectedColor;
495
- const bar = '<div style="display: table-cell; width:110px;padding-left:5px;">' +
496
- `<div style="display:inline-block;width: ${wScale(d[1])}px;background-color:${color};height: 10px;"></div>` +
497
- ` <div style="display:inline-block;color: ${color};">${this.numF(d[1])}</div>` + "</div>";
498
- const processedText = processCandidateText(d[0]);
499
- const text = `<div style="display: table-cell;color: ${color};padding-right:5px;">${processedText}</div>`;
500
- return `${bar} ${text}`;
501
- });
502
  }
503
  }
504
 
 
 
 
 
1
  import { D3Sel, calculateSurprisal, calculateSurprisalDensity } from "../utils/Util";
2
  import { SimpleEventHandler } from "../utils/SimpleEventHandler";
3
  import { GLTR_RenderItem } from "./GLTR_Text_Box";
 
5
  import * as d3 from "d3";
6
  import { tr } from "../lang/i18n-lite";
7
  import { getTokenRenderStyle } from "../utils/tokenRenderStyle";
8
+ import { escapeHtml, visualizeSpecialChars } from "../utils/tokenDisplayUtils";
9
+ import { renderTopkChartHtml } from "../utils/topkChartUtils";
10
 
11
  const SEPARATOR = '─────────────';
12
 
 
17
  return `<span style="color: ${dc}">${f.label}</span> <span style="color: ${valColor}">${f.value}</span>`;
18
  }
19
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
20
  export class ToolTip {
21
  private predictions: D3Sel;
22
  private myDetail: D3Sel;
 
26
  private readonly numF = d3.format('.3f');
27
  private readonly significantF = d3.format('.3g');
28
 
 
 
 
29
  // 缓存:主题颜色
30
  private themeColors = {
31
  normalColor: '#333',
 
100
  };
101
  }
102
 
 
 
 
 
 
 
 
 
 
 
103
  /**
104
  * 获取真实的可见视口尺寸和偏移量
105
  * 优先使用 visualViewport API(解决 iOS Safari 地址栏动态显示/隐藏问题)
 
279
 
280
  const tokenData = ri.tokenData as FrontendToken;
281
  const s = ri.semantic;
282
+ const hasSemantic = s && (s.pwScore !== undefined || s.signalProb !== undefined || s.rawScoreNormed !== undefined || s.rawScore !== undefined);
283
  const hasRealTopk = tokenData?.real_topk != null && Array.isArray(tokenData.real_topk);
284
  const predTopk = tokenData?.pred_topk ?? [];
285
  const hasPredictions = predTopk.length > 0;
286
 
287
+ // 1. 构建语义区块(pw score = raw_score_normed × P_pw,P_pw: x≤threshold 为 0,x>threshold 为 1)
288
  const semanticRows: string[] = [];
289
  if (hasSemantic && s) {
290
+ if (s.pwScore !== undefined) semanticRows.push(renderField({ label: tr('pw score:'), value: this.numF(s.pwScore) }, detailColor, valueColor));
291
+ if (s.signalProb !== undefined) semanticRows.push(renderField({ label: tr('signal probability:'), value: this.numF(s.signalProb) }, detailColor, valueColor));
292
+ if (s.rawScoreNormed !== undefined) semanticRows.push(renderField({ label: tr('raw score normed:'), value: this.numF(s.rawScoreNormed) }, detailColor, valueColor));
293
  if (s.rawScore !== undefined) semanticRows.push(renderField({ label: tr('raw score:'), value: d3.format('.6f')(s.rawScore), valueColor: false }, detailColor, valueColor));
294
  }
295
 
 
332
  .style('display', 'block')
333
  .html(() => `<div style="color:${detailColor};padding-left:5px;">${tr('Top-k data not available.')}</div>`);
334
  } else {
335
+ const topkData = predTopk.slice(0, 10).map(([token, prob]) => ({ token, prob }));
336
+ this.predictions.html(renderTopkChartHtml(topkData, {
337
+ selectedToken: tokenData.raw,
338
+ numFormat: this.numF,
339
+ }));
 
 
 
 
 
 
 
 
 
340
  }
341
  }
342
 
math_demo/.streamlit/config.toml ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ [server]
2
+ headless = true
math_demo/requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ streamlit>=1.28.0
2
+ matplotlib>=3.7.0
3
+ numpy>=1.24.0
4
+ scipy>=1.10.0
model_paths.py CHANGED
@@ -8,12 +8,12 @@ DEFAULT_MODEL = "qwen3.0-0.6b"
8
  DEFAULT_SEMANTIC_MODEL = "qwen3-0.6b-instruct"
9
 
10
  # Semantic analysis 模型(instruct 版本,用于 chat template 与指令理解)
11
- # 与 qwen3.0-14b 同级:0.6B → 1.7B → 4B → 8B → 14B
12
  SEMANTIC_MODEL_PATHS = {
13
  "qwen3-0.6b-instruct": "Qwen/Qwen3-0.6B",
14
  "qwen3-1.7b-instruct": "Qwen/Qwen3-1.7B",
15
  # "qwen3-4b-instruct": "Qwen/Qwen3-4B",
16
  "qwen3-4b-instruct": "Qwen/Qwen3-4B-Instruct-2507",
 
17
  }
18
 
19
  # 所有可用模型的 HuggingFace 路径映射
@@ -26,6 +26,7 @@ MODEL_PATHS = {
26
  'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
27
  'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
28
  'qwen3.0-30b-a3b': 'Qwen/Qwen3-30B-A3B-Base',
 
29
  'qwen2.5-32b': 'Qwen/Qwen2.5-32B',
30
  'qwen2.5-72b': 'Qwen/Qwen2.5-72B',
31
 
 
8
  DEFAULT_SEMANTIC_MODEL = "qwen3-0.6b-instruct"
9
 
10
  # Semantic analysis 模型(instruct 版本,用于 chat template 与指令理解)
 
11
  SEMANTIC_MODEL_PATHS = {
12
  "qwen3-0.6b-instruct": "Qwen/Qwen3-0.6B",
13
  "qwen3-1.7b-instruct": "Qwen/Qwen3-1.7B",
14
  # "qwen3-4b-instruct": "Qwen/Qwen3-4B",
15
  "qwen3-4b-instruct": "Qwen/Qwen3-4B-Instruct-2507",
16
+ "qwen3.5-0.8b-instruct": "Qwen/Qwen3.5-0.8B",
17
  }
18
 
19
  # 所有可用模型的 HuggingFace 路径映射
 
26
  'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
27
  'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
28
  'qwen3.0-30b-a3b': 'Qwen/Qwen3-30B-A3B-Base',
29
+ 'qwen3.5-0.8b': 'Qwen/Qwen3.5-0.8B-Base',
30
  'qwen2.5-32b': 'Qwen/Qwen2.5-32B',
31
  'qwen2.5-72b': 'Qwen/Qwen2.5-72B',
32
 
scripts/eval_semantic_submodes.py CHANGED
@@ -52,7 +52,7 @@ DEFAULT_API_BASE = "http://localhost:5001"
52
  def analyze_semantic_http(api_base: str, query: str, text: str, submode: str, token: Optional[str] = None, prob_weighted: Optional[bool] = None, timeout: int = 300) -> dict:
53
  """通过 HTTP 调用 analyze-semantic 接口"""
54
  url = f"{api_base.rstrip('/')}/api/analyze-semantic"
55
- payload: dict = {"query": query, "text": text, "submode": submode}
56
  if prob_weighted is not None:
57
  payload["prob_weighted"] = prob_weighted
58
  headers = {"Content-Type": "application/json"}
@@ -87,8 +87,9 @@ def run_eval(api_base: str, submode: str, test_cases: list, token: Optional[str]
87
  })
88
  continue
89
 
90
- topk_tokens = res.get("topk_tokens", [])
91
- topk_probs = res.get("topk_probs", [])
 
92
  token_attention = res.get("token_attention", [])
93
 
94
  # 0-max 归一化: score / max ∈ [0, 1],最大值归一为 1
 
52
  def analyze_semantic_http(api_base: str, query: str, text: str, submode: str, token: Optional[str] = None, prob_weighted: Optional[bool] = None, timeout: int = 300) -> dict:
53
  """通过 HTTP 调用 analyze-semantic 接口"""
54
  url = f"{api_base.rstrip('/')}/api/analyze-semantic"
55
+ payload: dict = {"query": query, "text": text, "submode": submode, "debug_info": True}
56
  if prob_weighted is not None:
57
  payload["prob_weighted"] = prob_weighted
58
  headers = {"Content-Type": "application/json"}
 
87
  })
88
  continue
89
 
90
+ di = res.get("debug_info", {})
91
+ topk_tokens = di.get("topk_tokens", [])
92
+ topk_probs = di.get("topk_probs", [])
93
  token_attention = res.get("token_attention", [])
94
 
95
  # 0-max 归一化: score / max ∈ [0, 1],最大值归一为 1
server.yaml CHANGED
@@ -503,16 +503,23 @@ paths:
503
  score:
504
  type: number
505
  description: 对 prompt 区域的平均关注度
506
- topk_tokens:
507
- type: array
508
- items:
509
- type: string
510
- description: top10 预测 token 列表
511
- topk_probs:
512
- type: array
513
- items:
514
- type: number
515
- description: top10 对应的概率
 
 
 
 
 
 
 
516
  message:
517
  type: string
518
  400:
 
503
  score:
504
  type: number
505
  description: 对 prompt 区域的平均关注度
506
+ debug_info:
507
+ type: object
508
+ description: debug_info=true 时返回
509
+ properties:
510
+ abbrev:
511
+ type: string
512
+ description: 推理原文缩写
513
+ topk_tokens:
514
+ type: array
515
+ items:
516
+ type: string
517
+ description: top10 预测 token 列表
518
+ topk_probs:
519
+ type: array
520
+ items:
521
+ type: number
522
+ description: top10 对应的概率
523
  message:
524
  type: string
525
  400: