增强语义分析功能:用截尾对数正态分布拟合噪声分布,通过"扫描-拟合"迭代查找尾部信号阈值点,新增后验加权(pw score)染色;新增语义结果缓存
Browse files
- backend/api/analyze_semantic.py +15 -14
- backend/semantic_analyzer.py +37 -17
- client/src/css/_responsive.scss +2 -1
- client/src/css/_semantic-analysis.scss +15 -1
- client/src/css/start.scss +0 -1
- client/src/index.html +20 -10
- client/src/package.json +4 -0
- client/src/ts/api/GLTR_API.ts +34 -18
- client/src/ts/appInitializer.ts +3 -3
- client/src/ts/compare.ts +2 -1
- client/src/ts/controllers/highlightController.ts +18 -17
- client/src/ts/lang/translations.ts +9 -3
- client/src/ts/start.ts +58 -6
- client/src/ts/utils/SurprisalColorConfig.ts +20 -16
- client/src/ts/utils/fitQuality.ts +39 -0
- client/src/ts/utils/highlightUtils.ts +45 -95
- client/src/ts/utils/lognormalFit.ts +144 -0
- client/src/ts/utils/queryHistory.ts +4 -1
- client/src/ts/utils/semanticResultCache.ts +104 -0
- client/src/ts/utils/signalThresholdDetector.ts +330 -0
- client/src/ts/utils/tokenDisplayUtils.ts +97 -0
- client/src/ts/utils/topkChartUtils.ts +71 -0
- client/src/ts/utils/visualizationConfigs.ts +14 -8
- client/src/ts/utils/visualizationUpdater.ts +160 -40
- client/src/ts/vis/GLTR_Text_Box.ts +27 -32
- client/src/ts/vis/Histogram.ts +193 -30
- client/src/ts/vis/SvgOverlayManager.ts +4 -4
- client/src/ts/vis/ToolTip.ts +12 -174
- math_demo/.streamlit/config.toml +2 -0
- math_demo/requirements.txt +4 -0
- model_paths.py +2 -1
- scripts/eval_semantic_submodes.py +4 -3
- server.yaml +17 -10
backend/api/analyze_semantic.py
CHANGED
|
@@ -25,23 +25,21 @@ def _log_request(query, text, client_ip=None):
|
|
| 25 |
|
| 26 |
|
| 27 |
def _build_success_response(result, debug_info: bool = False):
|
| 28 |
-
"""构建成功响应。debug_info=True 时包含
|
| 29 |
resp = {
|
| 30 |
"success": True,
|
| 31 |
"model": result["model"],
|
| 32 |
"token_attention": result["token_attention"],
|
| 33 |
"full_match_degree": result["full_match_degree"],
|
| 34 |
}
|
| 35 |
-
if debug_info:
|
| 36 |
-
|
| 37 |
-
resp["debug_abbrev"] = result["debug_abbrev"]
|
| 38 |
-
if "debug_top10" in result:
|
| 39 |
-
resp["debug_top10"] = result["debug_top10"]
|
| 40 |
return resp
|
| 41 |
|
| 42 |
|
| 43 |
def _generate_semantic_events(
|
| 44 |
-
query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
|
|
|
|
| 45 |
):
|
| 46 |
"""
|
| 47 |
流式语义分析核心:生成 SSE 事件流(progress + result/error)。
|
|
@@ -77,7 +75,7 @@ def _generate_semantic_events(
|
|
| 77 |
try:
|
| 78 |
from backend.access_log import log_analyze_semantic_start
|
| 79 |
log_analyze_semantic_start(request_id, lock_wait_time, stream_mode=True)
|
| 80 |
-
result = _analyze_semantic(query, text, submode_override=submode, progress_callback=progress_callback, debug_info=debug_info)
|
| 81 |
analysis_result = result
|
| 82 |
finally:
|
| 83 |
_inference_lock.release()
|
|
@@ -139,16 +137,18 @@ def _generate_semantic_events(
|
|
| 139 |
|
| 140 |
|
| 141 |
def _analyze_semantic_with_stream(
|
| 142 |
-
query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
|
|
|
|
| 143 |
):
|
| 144 |
"""流式语义分析,通过 SSE 返回阶段级进度"""
|
| 145 |
return SSEProgressReporter(
|
| 146 |
-
lambda: _generate_semantic_events(query, text, submode, debug_info, client_ip)
|
| 147 |
).create_response()
|
| 148 |
|
| 149 |
|
| 150 |
def _analyze_semantic_plain(
|
| 151 |
-
query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
|
|
|
|
| 152 |
):
|
| 153 |
"""
|
| 154 |
非流式语义分析:封装流式实现,消费事件流后返回 JSON。
|
|
@@ -158,7 +158,7 @@ def _analyze_semantic_plain(
|
|
| 158 |
error_msg = None
|
| 159 |
status_code = 500
|
| 160 |
try:
|
| 161 |
-
for event_str in _generate_semantic_events(query, text, submode, debug_info, client_ip):
|
| 162 |
if not event_str.startswith('data: '):
|
| 163 |
continue
|
| 164 |
data = json.loads(event_str[6:].strip())
|
|
@@ -199,6 +199,7 @@ def analyze_semantic(semantic_request):
|
|
| 199 |
stream = semantic_request.get("stream", False)
|
| 200 |
submode = (semantic_request.get("submode") or "").strip() or None
|
| 201 |
debug_info = bool(semantic_request.get("debug_info", False))
|
|
|
|
| 202 |
|
| 203 |
if not query:
|
| 204 |
return {"success": False, "message": "缺少 query 字段"}, 400
|
|
@@ -207,5 +208,5 @@ def analyze_semantic(semantic_request):
|
|
| 207 |
|
| 208 |
client_ip = get_client_ip()
|
| 209 |
if stream:
|
| 210 |
-
return _analyze_semantic_with_stream(query, text, submode, debug_info, client_ip)
|
| 211 |
-
return _analyze_semantic_plain(query, text, submode, debug_info, client_ip)
|
|
|
|
| 25 |
|
| 26 |
|
| 27 |
def _build_success_response(result, debug_info: bool = False):
|
| 28 |
+
"""构建成功响应。debug_info=True 时包含 debug_info 对象(abbrev、topk_tokens、topk_probs)"""
|
| 29 |
resp = {
|
| 30 |
"success": True,
|
| 31 |
"model": result["model"],
|
| 32 |
"token_attention": result["token_attention"],
|
| 33 |
"full_match_degree": result["full_match_degree"],
|
| 34 |
}
|
| 35 |
+
if debug_info and "debug_info" in result:
|
| 36 |
+
resp["debug_info"] = result["debug_info"]
|
|
|
|
|
|
|
|
|
|
| 37 |
return resp
|
| 38 |
|
| 39 |
|
| 40 |
def _generate_semantic_events(
|
| 41 |
+
query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
|
| 42 |
+
full_match_degree_only: bool = False, client_ip: Optional[str] = None
|
| 43 |
):
|
| 44 |
"""
|
| 45 |
流式语义分析核心:生成 SSE 事件流(progress + result/error)。
|
|
|
|
| 75 |
try:
|
| 76 |
from backend.access_log import log_analyze_semantic_start
|
| 77 |
log_analyze_semantic_start(request_id, lock_wait_time, stream_mode=True)
|
| 78 |
+
result = _analyze_semantic(query, text, submode_override=submode, progress_callback=progress_callback, debug_info=debug_info, full_match_degree_only=full_match_degree_only)
|
| 79 |
analysis_result = result
|
| 80 |
finally:
|
| 81 |
_inference_lock.release()
|
|
|
|
| 137 |
|
| 138 |
|
| 139 |
def _analyze_semantic_with_stream(
|
| 140 |
+
query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
|
| 141 |
+
full_match_degree_only: bool = False, client_ip: Optional[str] = None
|
| 142 |
):
|
| 143 |
"""流式语义分析,通过 SSE 返回阶段级进度"""
|
| 144 |
return SSEProgressReporter(
|
| 145 |
+
lambda: _generate_semantic_events(query, text, submode, debug_info, full_match_degree_only, client_ip)
|
| 146 |
).create_response()
|
| 147 |
|
| 148 |
|
| 149 |
def _analyze_semantic_plain(
|
| 150 |
+
query: str, text: str, submode: Optional[str] = None, debug_info: bool = False,
|
| 151 |
+
full_match_degree_only: bool = False, client_ip: Optional[str] = None
|
| 152 |
):
|
| 153 |
"""
|
| 154 |
非流式语义分析:封装流式实现,消费事件流后返回 JSON。
|
|
|
|
| 158 |
error_msg = None
|
| 159 |
status_code = 500
|
| 160 |
try:
|
| 161 |
+
for event_str in _generate_semantic_events(query, text, submode, debug_info, full_match_degree_only, client_ip):
|
| 162 |
if not event_str.startswith('data: '):
|
| 163 |
continue
|
| 164 |
data = json.loads(event_str[6:].strip())
|
|
|
|
| 199 |
stream = semantic_request.get("stream", False)
|
| 200 |
submode = (semantic_request.get("submode") or "").strip() or None
|
| 201 |
debug_info = bool(semantic_request.get("debug_info", False))
|
| 202 |
+
full_match_degree_only = bool(semantic_request.get("full_match_degree_only", False))
|
| 203 |
|
| 204 |
if not query:
|
| 205 |
return {"success": False, "message": "缺少 query 字段"}, 400
|
|
|
|
| 208 |
|
| 209 |
client_ip = get_client_ip()
|
| 210 |
if stream:
|
| 211 |
+
return _analyze_semantic_with_stream(query, text, submode, debug_info, full_match_degree_only, client_ip)
|
| 212 |
+
return _analyze_semantic_plain(query, text, submode, debug_info, full_match_degree_only, client_ip)
|
backend/semantic_analyzer.py
CHANGED
|
@@ -12,6 +12,7 @@ count/fill_blank 按概率加权(Σ pᵢ·zᵢ)。
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
import gc
|
|
|
|
| 15 |
from typing import Callable, Dict, List, Optional
|
| 16 |
|
| 17 |
import torch
|
|
@@ -58,6 +59,7 @@ def _analyze_logits_gradient(
|
|
| 58 |
submode_override: Optional[str] = None,
|
| 59 |
progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
|
| 60 |
debug_info: bool = False,
|
|
|
|
| 61 |
) -> Dict:
|
| 62 |
"""
|
| 63 |
梯度归因:logits 对输入 embedding 的梯度。
|
|
@@ -147,13 +149,12 @@ def _analyze_logits_gradient(
|
|
| 147 |
attention_mask=attention_mask,
|
| 148 |
output_attentions=False,
|
| 149 |
)
|
| 150 |
-
# 显式同步,确保
|
| 151 |
if device.type == "cuda":
|
| 152 |
torch.cuda.synchronize(device)
|
| 153 |
elif device.type == "mps":
|
| 154 |
torch.mps.synchronize()
|
| 155 |
-
|
| 156 |
-
progress_callback(3, TOTAL_STEPS, "backward", None)
|
| 157 |
logits = outputs.logits[:, -1, :]
|
| 158 |
topk_vals, topk_ids = torch.topk(logits, LOGITS_GRADIENT_TOPK, dim=-1)
|
| 159 |
probs = torch.softmax(logits, dim=-1)
|
|
@@ -163,6 +164,19 @@ def _analyze_logits_gradient(
|
|
| 163 |
|
| 164 |
neg_token = "无" if submode == "fill_blank" else "0"
|
| 165 |
neg_id = tokenizer.encode(neg_token, add_special_tokens=False)[0]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 166 |
# 归因目标:raw logits(不经过 softmax backward),避免饱和与竞争污染。
|
| 167 |
if submode == "count" or submode == "fill_blank":
|
| 168 |
# count/fill_blank 均用 top-10、按概率加权 Σ pᵢ·zᵢ,并排除 neg_token(0/无)以保持梯度方向与「相关」一致。
|
|
@@ -180,39 +194,43 @@ def _analyze_logits_gradient(
|
|
| 180 |
else:
|
| 181 |
raise ValueError(f"未知 submode: {submode}")
|
| 182 |
target_logit.backward()
|
| 183 |
-
|
| 184 |
grad = embeds.grad
|
| 185 |
if grad is None:
|
| 186 |
raise RuntimeError("logits_gradient: 梯度未回传,可能模型不支持(如 int8 量化)")
|
| 187 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 188 |
if progress_callback:
|
| 189 |
progress_callback(4, TOTAL_STEPS, "processing", None)
|
|
|
|
| 190 |
text_token_end = len(offset_mapping)
|
| 191 |
# 在 GPU 上一次性计算所有 token 的 ‖∇f‖,避免循环内 .item() 导致 500 次 GPU→CPU 同步
|
| 192 |
grad_slice = grad[0, prompt_end:text_token_end].float()
|
| 193 |
norms = grad_slice.norm(dim=-1).cpu().tolist()
|
| 194 |
token_attention: List[Dict] = []
|
|
|
|
| 195 |
for i in range(prompt_end, text_token_end):
|
| 196 |
s, e = offset_mapping[i]
|
| 197 |
if s >= text_start_char and e <= text_end_char:
|
| 198 |
s_rel, e_rel = s - text_start_char, e - text_start_char
|
| 199 |
-
|
| 200 |
-
|
| 201 |
-
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
| 204 |
|
| 205 |
-
model_display = get_semantic_model_display_name()
|
| 206 |
out = {
|
| 207 |
-
"model":
|
| 208 |
"token_attention": token_attention,
|
| 209 |
-
"topk_tokens": topk_tokens,
|
| 210 |
-
"topk_probs": topk_probs,
|
| 211 |
"full_match_degree": full_match_degree,
|
| 212 |
}
|
| 213 |
if debug_info:
|
| 214 |
-
out["
|
| 215 |
-
out["debug_top10"] = [{"token": t, "prob": p} for t, p in zip(topk_tokens, topk_probs)]
|
| 216 |
return out
|
| 217 |
finally:
|
| 218 |
if use_gc:
|
|
@@ -227,6 +245,7 @@ def analyze_semantic(
|
|
| 227 |
submode_override: Optional[str] = None,
|
| 228 |
progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
|
| 229 |
debug_info: bool = False,
|
|
|
|
| 230 |
) -> Dict:
|
| 231 |
"""
|
| 232 |
分析原文各 token 与 query 的相关度(使用 logits_gradient 梯度归因)。
|
|
@@ -236,10 +255,10 @@ def analyze_semantic(
|
|
| 236 |
text: 原文
|
| 237 |
submode_override: 评估时可选覆盖子模式(count/match_score/fill_blank)
|
| 238 |
progress_callback: 可选进度回调 (step, total_steps, stage, percentage)
|
| 239 |
-
debug_info: 为 True 时返回 debug_abbrev(推理原文缩写)
|
| 240 |
|
| 241 |
Returns:
|
| 242 |
-
{"model", "token_attention", "
|
| 243 |
"""
|
| 244 |
tokenizer, model, device = ensure_semantic_loaded()
|
| 245 |
return _analyze_logits_gradient(
|
|
@@ -247,4 +266,5 @@ def analyze_semantic(
|
|
| 247 |
submode_override=submode_override,
|
| 248 |
progress_callback=progress_callback,
|
| 249 |
debug_info=debug_info,
|
|
|
|
| 250 |
)
|
|
|
|
| 12 |
"""
|
| 13 |
|
| 14 |
import gc
|
| 15 |
+
import math
|
| 16 |
from typing import Callable, Dict, List, Optional
|
| 17 |
|
| 18 |
import torch
|
|
|
|
| 59 |
submode_override: Optional[str] = None,
|
| 60 |
progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
|
| 61 |
debug_info: bool = False,
|
| 62 |
+
full_match_degree_only: bool = False,
|
| 63 |
) -> Dict:
|
| 64 |
"""
|
| 65 |
梯度归因:logits 对输入 embedding 的梯度。
|
|
|
|
| 149 |
attention_mask=attention_mask,
|
| 150 |
output_attentions=False,
|
| 151 |
)
|
| 152 |
+
# 显式同步,确保前向计算已完成,progress_callback 时机准确
|
| 153 |
if device.type == "cuda":
|
| 154 |
torch.cuda.synchronize(device)
|
| 155 |
elif device.type == "mps":
|
| 156 |
torch.mps.synchronize()
|
| 157 |
+
|
|
|
|
| 158 |
logits = outputs.logits[:, -1, :]
|
| 159 |
topk_vals, topk_ids = torch.topk(logits, LOGITS_GRADIENT_TOPK, dim=-1)
|
| 160 |
probs = torch.softmax(logits, dim=-1)
|
|
|
|
| 164 |
|
| 165 |
neg_token = "无" if submode == "fill_blank" else "0"
|
| 166 |
neg_id = tokenizer.encode(neg_token, add_special_tokens=False)[0]
|
| 167 |
+
# 全文匹配度:count/match_score 用 1-P("0"),fill_blank 用 1-P("无")
|
| 168 |
+
p_neg = probs[0, neg_id].item()
|
| 169 |
+
full_match_degree = round(1.0 - p_neg, 4)
|
| 170 |
+
|
| 171 |
+
if full_match_degree_only and submode == "count":
|
| 172 |
+
return {
|
| 173 |
+
"model": get_semantic_model_display_name(),
|
| 174 |
+
"token_attention": [],
|
| 175 |
+
"full_match_degree": full_match_degree,
|
| 176 |
+
}
|
| 177 |
+
|
| 178 |
+
if progress_callback:
|
| 179 |
+
progress_callback(3, TOTAL_STEPS, "backward", None)
|
| 180 |
# 归因目标:raw logits(不经过 softmax backward),避免饱和与竞争污染。
|
| 181 |
if submode == "count" or submode == "fill_blank":
|
| 182 |
# count/fill_blank 均用 top-10、按概率加权 Σ pᵢ·zᵢ,并排除 neg_token(0/无)以保持梯度方向与「相关」一致。
|
|
|
|
| 194 |
else:
|
| 195 |
raise ValueError(f"未知 submode: {submode}")
|
| 196 |
target_logit.backward()
|
|
|
|
| 197 |
grad = embeds.grad
|
| 198 |
if grad is None:
|
| 199 |
raise RuntimeError("logits_gradient: 梯度未回传,可能模型不支持(如 int8 量化)")
|
| 200 |
|
| 201 |
+
# 显式同步,确保反向传播已完成,progress_callback 时机准确
|
| 202 |
+
if device.type == "cuda":
|
| 203 |
+
torch.cuda.synchronize(device)
|
| 204 |
+
elif device.type == "mps":
|
| 205 |
+
torch.mps.synchronize()
|
| 206 |
if progress_callback:
|
| 207 |
progress_callback(4, TOTAL_STEPS, "processing", None)
|
| 208 |
+
|
| 209 |
text_token_end = len(offset_mapping)
|
| 210 |
# 在 GPU 上一次性计算所有 token 的 ‖∇f‖,避免循环内 .item() 导致 500 次 GPU→CPU 同步
|
| 211 |
grad_slice = grad[0, prompt_end:text_token_end].float()
|
| 212 |
norms = grad_slice.norm(dim=-1).cpu().tolist()
|
| 213 |
token_attention: List[Dict] = []
|
| 214 |
+
nan_count = 0
|
| 215 |
for i in range(prompt_end, text_token_end):
|
| 216 |
s, e = offset_mapping[i]
|
| 217 |
if s >= text_start_char and e <= text_end_char:
|
| 218 |
s_rel, e_rel = s - text_start_char, e - text_start_char
|
| 219 |
+
score = norms[i - prompt_end]
|
| 220 |
+
if not math.isfinite(score):
|
| 221 |
+
score = 0.0
|
| 222 |
+
nan_count += 1
|
| 223 |
+
token_attention.append({"offset": [s_rel, e_rel], "raw": truncated_text[s_rel:e_rel], "score": score})
|
| 224 |
+
if nan_count > 0:
|
| 225 |
+
print(f"⚠️ token_attention 中有 {nan_count} 个 score 为 NaN/Inf,已替换为 0。")
|
| 226 |
|
|
|
|
| 227 |
out = {
|
| 228 |
+
"model": get_semantic_model_display_name(),
|
| 229 |
"token_attention": token_attention,
|
|
|
|
|
|
|
| 230 |
"full_match_degree": full_match_degree,
|
| 231 |
}
|
| 232 |
if debug_info:
|
| 233 |
+
out["debug_info"] = {"abbrev": abbrev, "topk_tokens": topk_tokens, "topk_probs": topk_probs}
|
|
|
|
| 234 |
return out
|
| 235 |
finally:
|
| 236 |
if use_gc:
|
|
|
|
| 245 |
submode_override: Optional[str] = None,
|
| 246 |
progress_callback: Optional[Callable[[int, int, str, Optional[int]], None]] = None,
|
| 247 |
debug_info: bool = False,
|
| 248 |
+
full_match_degree_only: bool = False,
|
| 249 |
) -> Dict:
|
| 250 |
"""
|
| 251 |
分析原文各 token 与 query 的相关度(使用 logits_gradient 梯度归因)。
|
|
|
|
| 255 |
text: 原文
|
| 256 |
submode_override: 评估时可选覆盖子模式(count/match_score/fill_blank)
|
| 257 |
progress_callback: 可选进度回调 (step, total_steps, stage, percentage)
|
| 258 |
+
debug_info: 为 True 时结果中包含 debug_info 对象(abbrev 推理原文缩写、topk_tokens、topk_probs);为 False 时不返回这些字段
|
| 259 |
|
| 260 |
Returns:
|
| 261 |
+
{"model", "token_attention", "full_match_degree"};debug_info=True 时包含 debug_info 对象
|
| 262 |
"""
|
| 263 |
tokenizer, model, device = ensure_semantic_loaded()
|
| 264 |
return _analyze_logits_gradient(
|
|
|
|
| 266 |
submode_override=submode_override,
|
| 267 |
progress_callback=progress_callback,
|
| 268 |
debug_info=debug_info,
|
| 269 |
+
full_match_degree_only=full_match_degree_only,
|
| 270 |
)
|
client/src/css/_responsive.scss
CHANGED
|
@@ -246,9 +246,10 @@
|
|
| 246 |
}
|
| 247 |
|
| 248 |
// 调整浮动内容宽度,不使用自己的滚动条
|
|
|
|
| 249 |
.floating_content {
|
| 250 |
@include full-width-adaptive;
|
| 251 |
-
|
| 252 |
}
|
| 253 |
|
| 254 |
// 调整统计图容器
|
|
|
|
| 246 |
}
|
| 247 |
|
| 248 |
// 调整浮动内容宽度,不使用自己的滚动条
|
| 249 |
+
// 使用 overflow: visible 避免下拉框(如查询历史)被裁剪;overflow-x: hidden 会令 overflow-y 被计算为 auto 从而产生裁剪
|
| 250 |
.floating_content {
|
| 251 |
@include full-width-adaptive;
|
| 252 |
+
overflow: visible;
|
| 253 |
}
|
| 254 |
|
| 255 |
// 调整统计图容器
|
client/src/css/_semantic-analysis.scss
CHANGED
|
@@ -48,6 +48,16 @@
|
|
| 48 |
align-items: center;
|
| 49 |
gap: 6px;
|
| 50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
.semantic-submode-label {
|
| 52 |
font-size: 9pt;
|
| 53 |
color: var(--text-muted);
|
|
@@ -199,9 +209,13 @@
|
|
| 199 |
font-family: ui-monospace, "Cascadia Code", "Source Code Pro", Menlo, Consolas, "DejaVu Sans Mono", monospace;
|
| 200 |
word-break: break-word;
|
| 201 |
overflow-wrap: break-word;
|
| 202 |
-
// 颜色加重
|
| 203 |
color: var(--text-color, #333);
|
| 204 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 205 |
}
|
| 206 |
|
| 207 |
// 语义分析阶段级进度(与 analyze-progress 同风格,绝对定位不占布局空间)
|
|
|
|
| 48 |
align-items: center;
|
| 49 |
gap: 6px;
|
| 50 |
|
| 51 |
+
.semantic-submode-group {
|
| 52 |
+
display: flex;
|
| 53 |
+
align-items: center;
|
| 54 |
+
gap: 6px;
|
| 55 |
+
|
| 56 |
+
&.semantic-submode-group-right {
|
| 57 |
+
margin-left: auto;
|
| 58 |
+
}
|
| 59 |
+
}
|
| 60 |
+
|
| 61 |
.semantic-submode-label {
|
| 62 |
font-size: 9pt;
|
| 63 |
color: var(--text-muted);
|
|
|
|
| 209 |
font-family: ui-monospace, "Cascadia Code", "Source Code Pro", Menlo, Consolas, "DejaVu Sans Mono", monospace;
|
| 210 |
word-break: break-word;
|
| 211 |
overflow-wrap: break-word;
|
|
|
|
| 212 |
color: var(--text-color, #333);
|
| 213 |
}
|
| 214 |
+
|
| 215 |
+
// TopK 图表:与 tooltip 一致,宽度更大,与上方打印区留间隔
|
| 216 |
+
.semantic-debug-topk-chart {
|
| 217 |
+
margin-top: 20px;
|
| 218 |
+
}
|
| 219 |
}
|
| 220 |
|
| 221 |
// 语义分析阶段级进度(与 analyze-progress 同风格,绝对定位不占布局空间)
|
client/src/css/start.scss
CHANGED
|
@@ -565,7 +565,6 @@ select {
|
|
| 565 |
// 白天模式使用默认字重(400),夜间模式使用 Light 字重(300)
|
| 566 |
background-color: var(--text-area-bg); // 使用CSS变量控制背景色
|
| 567 |
color: var(--text-color); // 使用CSS变量控制文字颜色
|
| 568 |
-
transition: background-color 0.3s ease, color 0.3s ease; // 平滑过渡
|
| 569 |
// 确保至少350px高度以容纳tooltip(内容不足时生效)
|
| 570 |
min-height: 350px;
|
| 571 |
// 不设置固定padding-bottom,让内容自然决定高度
|
|
|
|
| 565 |
// 白天模式使用默认字重(400),夜间模式使用 Light 字重(300)
|
| 566 |
background-color: var(--text-area-bg); // 使用CSS变量控制背景色
|
| 567 |
color: var(--text-color); // 使用CSS变量控制文字颜色
|
|
|
|
| 568 |
// 确保至少350px高度以容纳tooltip(内容不足时生效)
|
| 569 |
min-height: 350px;
|
| 570 |
// 不设置固定padding-bottom,让内容自然决定高度
|
client/src/index.html
CHANGED
|
@@ -150,13 +150,23 @@
|
|
| 150 |
</div>
|
| 151 |
</div>
|
| 152 |
<div class="semantic-submode-row">
|
| 153 |
-
<
|
| 154 |
-
|
| 155 |
-
<
|
| 156 |
-
|
| 157 |
-
|
| 158 |
-
|
| 159 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 160 |
</div>
|
| 161 |
</div>
|
| 162 |
</section>
|
|
@@ -164,9 +174,9 @@
|
|
| 164 |
|
| 165 |
<section id="all_result" class="results-section">
|
| 166 |
<div id="stats" class="stats-container">
|
| 167 |
-
<div id="
|
| 168 |
-
<div id="
|
| 169 |
-
<svg id="
|
| 170 |
</div>
|
| 171 |
<div id="token_histogram_item" class="histogram-item" style="display: none;">
|
| 172 |
<div id="token_histogram_title"></div>
|
|
|
|
| 150 |
</div>
|
| 151 |
</div>
|
| 152 |
<div class="semantic-submode-row">
|
| 153 |
+
<span class="semantic-submode-group">
|
| 154 |
+
<label class="semantic-submode-label" for="semantic_submode_select">submode: </label>
|
| 155 |
+
<select id="semantic_submode_select" class="semantic-submode-select">
|
| 156 |
+
<option value="count">count</option>
|
| 157 |
+
<option value="match_score">match_score</option>
|
| 158 |
+
<option value="fill_blank">fill_blank</option>
|
| 159 |
+
<option value="hybrid">hybrid</option>
|
| 160 |
+
</select>
|
| 161 |
+
</span>
|
| 162 |
+
<span class="semantic-submode-group semantic-submode-group-right">
|
| 163 |
+
<label class="semantic-submode-label" for="semantic_color_source_select">color source: </label>
|
| 164 |
+
<select id="semantic_color_source_select" class="semantic-submode-select">
|
| 165 |
+
<option value="raw_score_normed" selected>raw score normed</option>
|
| 166 |
+
<option value="signal_probability">signal probability</option>
|
| 167 |
+
<option value="pw_score">pw score</option>
|
| 168 |
+
</select>
|
| 169 |
+
</span>
|
| 170 |
</div>
|
| 171 |
</div>
|
| 172 |
</section>
|
|
|
|
| 174 |
|
| 175 |
<section id="all_result" class="results-section">
|
| 176 |
<div id="stats" class="stats-container">
|
| 177 |
+
<div id="raw_score_normed_histogram_item" class="histogram-item" style="display: none;">
|
| 178 |
+
<div id="raw_score_normed_histogram_title"></div>
|
| 179 |
+
<svg id="stats_raw_score_normed"></svg>
|
| 180 |
</div>
|
| 181 |
<div id="token_histogram_item" class="histogram-item" style="display: none;">
|
| 182 |
<div id="token_histogram_title"></div>
|
client/src/package.json
CHANGED
|
@@ -5,6 +5,10 @@
|
|
| 5 |
"main": "webpack.config.js",
|
| 6 |
"scripts": {
|
| 7 |
"test": "echo \"Error: no test specified\" && exit 1",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 8 |
"prebuild": "node scripts/updateIntroHTML.js",
|
| 9 |
"prebuild:dev": "node scripts/updateIntroHTML.js",
|
| 10 |
"wp": "npm run build:dev",
|
|
|
|
| 5 |
"main": "webpack.config.js",
|
| 6 |
"scripts": {
|
| 7 |
"test": "echo \"Error: no test specified\" && exit 1",
|
| 8 |
+
"test:lognormal": "npx tsx ts/utils/visualizationUpdater.lognormal.test.ts",
|
| 9 |
+
"test:lognormal:tau": "npx tsx ts/utils/lognormalFit.tauBoundary.test.ts",
|
| 10 |
+
"test:signalThreshold": "npx tsx ts/utils/signalThresholdDetector.1log.test.ts",
|
| 11 |
+
"demo:histogramCdf": "npx tsx ts/utils/histogramCdfDemoData.ts",
|
| 12 |
"prebuild": "node scripts/updateIntroHTML.js",
|
| 13 |
"prebuild:dev": "node scripts/updateIntroHTML.js",
|
| 14 |
"wp": "npm run build:dev",
|
client/src/ts/api/GLTR_API.ts
CHANGED
|
@@ -5,6 +5,7 @@ Attn API and Types
|
|
| 5 |
import * as d3 from "d3";
|
| 6 |
import URLHandler from "../utils/URLHandler";
|
| 7 |
import {cleanSpecials} from "../utils/Util";
|
|
|
|
| 8 |
import {AnalyzeResponse, AnalyzeResult, TokenWithOffset} from "./generatedSchemas";
|
| 9 |
|
| 10 |
export type FrontendToken = TokenWithOffset & { bpe_merged?: boolean };
|
|
@@ -254,28 +255,38 @@ export class TextAnalysisAPI {
|
|
| 254 |
query: string,
|
| 255 |
text: string,
|
| 256 |
onProgress?: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
|
| 257 |
-
submode?: string
|
| 258 |
-
|
|
|
|
| 259 |
if (submode === 'hybrid') {
|
| 260 |
-
const r1 = await this.analyzeSemantic(query, text, onProgress, 'count');
|
| 261 |
const r2 = await this.analyzeSemantic(query, text, onProgress, 'fill_blank');
|
| 262 |
-
|
|
|
|
| 263 |
}
|
| 264 |
-
|
| 265 |
-
|
| 266 |
-
}
|
| 267 |
-
const
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
if (response && response.success === false) {
|
| 275 |
throw new Error(response.message || 'Semantic analysis failed');
|
| 276 |
}
|
| 277 |
return response;
|
| 278 |
-
}
|
|
|
|
|
|
|
|
|
|
| 279 |
}
|
| 280 |
|
| 281 |
/**
|
|
@@ -285,10 +296,12 @@ export class TextAnalysisAPI {
|
|
| 285 |
query: string,
|
| 286 |
text: string,
|
| 287 |
onProgress: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
|
| 288 |
-
submode?: string
|
| 289 |
-
|
|
|
|
| 290 |
const payload: Record<string, unknown> = { query, text, stream: true, debug_info: true };
|
| 291 |
if (submode) payload.submode = submode;
|
|
|
|
| 292 |
return this.fetchSSEStream(
|
| 293 |
'/api/analyze-semantic',
|
| 294 |
payload,
|
|
@@ -386,7 +399,10 @@ export class TextAnalysisAPI {
|
|
| 386 |
reject(new Error(parsed.message || errorMessage));
|
| 387 |
}
|
| 388 |
} catch (e) {
|
| 389 |
-
|
|
|
|
|
|
|
|
|
|
| 390 |
}
|
| 391 |
}
|
| 392 |
|
|
|
|
| 5 |
import * as d3 from "d3";
|
| 6 |
import URLHandler from "../utils/URLHandler";
|
| 7 |
import {cleanSpecials} from "../utils/Util";
|
| 8 |
+
import * as semanticResultCache from "../utils/semanticResultCache";
|
| 9 |
import {AnalyzeResponse, AnalyzeResult, TokenWithOffset} from "./generatedSchemas";
|
| 10 |
|
| 11 |
export type FrontendToken = TokenWithOffset & { bpe_merged?: boolean };
|
|
|
|
| 255 |
query: string,
|
| 256 |
text: string,
|
| 257 |
onProgress?: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
|
| 258 |
+
submode?: string,
|
| 259 |
+
fullMatchDegreeOnly?: boolean
|
| 260 |
+
): Promise<{ success: boolean; model?: string; token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>; debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] }; full_match_degree?: number; message?: string }> {
|
| 261 |
if (submode === 'hybrid') {
|
| 262 |
+
const r1 = await this.analyzeSemantic(query, text, onProgress, 'count', true);
|
| 263 |
const r2 = await this.analyzeSemantic(query, text, onProgress, 'fill_blank');
|
| 264 |
+
const fromCache = (r1 as { __fromCache?: boolean }).__fromCache && (r2 as { __fromCache?: boolean }).__fromCache;
|
| 265 |
+
return { ...r2, full_match_degree: r1.full_match_degree, __fromCache: fromCache } as typeof r2 & { __fromCache?: boolean };
|
| 266 |
}
|
| 267 |
+
const cacheSubmode = submode;
|
| 268 |
+
const cached = semanticResultCache.get(text, query, cacheSubmode);
|
| 269 |
+
if (cached && (fullMatchDegreeOnly || cached.token_attention)) return { ...cached, __fromCache: true } as typeof cached & { __fromCache?: boolean };
|
| 270 |
+
const doRequest = async (): Promise<typeof cached> => {
|
| 271 |
+
if (onProgress) {
|
| 272 |
+
return this.analyzeSemanticWithProgress(query, text, onProgress, submode, fullMatchDegreeOnly);
|
| 273 |
+
}
|
| 274 |
+
const payload: Record<string, unknown> = { query, text, debug_info: true };
|
| 275 |
+
if (submode) payload.submode = submode;
|
| 276 |
+
if (fullMatchDegreeOnly) payload.full_match_degree_only = true;
|
| 277 |
+
const response = await d3.json(this.baseURL + '/api/analyze-semantic', {
|
| 278 |
+
method: 'POST',
|
| 279 |
+
body: JSON.stringify(payload),
|
| 280 |
+
headers: this.getHeaders()
|
| 281 |
+
}) as any;
|
| 282 |
if (response && response.success === false) {
|
| 283 |
throw new Error(response.message || 'Semantic analysis failed');
|
| 284 |
}
|
| 285 |
return response;
|
| 286 |
+
};
|
| 287 |
+
const res = await doRequest();
|
| 288 |
+
if (res?.success) semanticResultCache.set(text, query, res, cacheSubmode);
|
| 289 |
+
return res;
|
| 290 |
}
|
| 291 |
|
| 292 |
/**
|
|
|
|
| 296 |
query: string,
|
| 297 |
text: string,
|
| 298 |
onProgress: (step: number, totalSteps: number, stage: string, percentage?: number) => void,
|
| 299 |
+
submode?: string,
|
| 300 |
+
fullMatchDegreeOnly?: boolean
|
| 301 |
+
): Promise<{ success: boolean; model?: string; token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>; debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] }; full_match_degree?: number; message?: string }> {
|
| 302 |
const payload: Record<string, unknown> = { query, text, stream: true, debug_info: true };
|
| 303 |
if (submode) payload.submode = submode;
|
| 304 |
+
if (fullMatchDegreeOnly) payload.full_match_degree_only = true;
|
| 305 |
return this.fetchSSEStream(
|
| 306 |
'/api/analyze-semantic',
|
| 307 |
payload,
|
|
|
|
| 399 |
reject(new Error(parsed.message || errorMessage));
|
| 400 |
}
|
| 401 |
} catch (e) {
|
| 402 |
+
const msg = e instanceof SyntaxError
|
| 403 |
+
? `SSE 数据解析失败:${e.message}(可能是后端返回了无效 JSON,如 NaN)`
|
| 404 |
+
: `SSE 消息处理失败:${e instanceof Error ? e.message : String(e)}`;
|
| 405 |
+
reject(new Error(msg));
|
| 406 |
}
|
| 407 |
}
|
| 408 |
|
client/src/ts/appInitializer.ts
CHANGED
|
@@ -6,7 +6,7 @@
|
|
| 6 |
import * as d3 from 'd3';
|
| 7 |
import { SimpleEventHandler } from './utils/SimpleEventHandler';
|
| 8 |
import { TextAnalysisAPI } from './api/GLTR_API';
|
| 9 |
-
import { getTokenSurprisalColor, getByteSurprisalColor } from './utils/SurprisalColorConfig';
|
| 10 |
|
| 11 |
/**
|
| 12 |
* 公共初始化返回对象
|
|
@@ -33,8 +33,8 @@ export function initializeCommonApp(apiPrefix: string = '', element?: Element):
|
|
| 33 |
return {
|
| 34 |
eventHandler: new SimpleEventHandler(targetElement),
|
| 35 |
api: new TextAnalysisAPI(apiPrefix),
|
| 36 |
-
tokenSurprisalColorScale: getTokenSurprisalColor,
|
| 37 |
-
byteSurprisalColorScale: getByteSurprisalColor,
|
| 38 |
totalSurprisalFormat: (n: number | null) => n !== null && Number.isFinite(n) ? format(n) : String(n)
|
| 39 |
};
|
| 40 |
}
|
|
|
|
| 6 |
import * as d3 from 'd3';
|
| 7 |
import { SimpleEventHandler } from './utils/SimpleEventHandler';
|
| 8 |
import { TextAnalysisAPI } from './api/GLTR_API';
|
| 9 |
+
import { getTokenSurprisalColor, getByteSurprisalColor, HISTOGRAM_MIN_ALPHA } from './utils/SurprisalColorConfig';
|
| 10 |
|
| 11 |
/**
|
| 12 |
* 公共初始化返回对象
|
|
|
|
| 33 |
return {
|
| 34 |
eventHandler: new SimpleEventHandler(targetElement),
|
| 35 |
api: new TextAnalysisAPI(apiPrefix),
|
| 36 |
+
tokenSurprisalColorScale: (v) => getTokenSurprisalColor(v, HISTOGRAM_MIN_ALPHA),
|
| 37 |
+
byteSurprisalColorScale: (v) => getByteSurprisalColor(v, 1, HISTOGRAM_MIN_ALPHA),
|
| 38 |
totalSurprisalFormat: (n: number | null) => n !== null && Number.isFinite(n) ? format(n) : String(n)
|
| 39 |
};
|
| 40 |
}
|
client/src/ts/compare.ts
CHANGED
|
@@ -348,6 +348,7 @@ window.onload = () => {
|
|
| 348 |
colorScale: tokenSurprisalColorScale,
|
| 349 |
averageValue: stats.tokenAverage ?? undefined,
|
| 350 |
p90Value: stats.tokenP90 ?? undefined,
|
|
|
|
| 351 |
});
|
| 352 |
|
| 353 |
// 更新列视图中 token surprisal histogram 的标题文本
|
|
@@ -1270,11 +1271,11 @@ window.onload = () => {
|
|
| 1270 |
// 初始化主题管理器(在所有函数定义之后)
|
| 1271 |
const themeManager = initThemeManager({
|
| 1272 |
onThemeChange: () => {
|
| 1273 |
-
// 主题切换时重新渲染所有图表
|
| 1274 |
columnsData.forEach((col) => {
|
| 1275 |
if (col.data && col.stats) {
|
| 1276 |
renderStatsForColumn(col.id, col);
|
| 1277 |
}
|
|
|
|
| 1278 |
});
|
| 1279 |
}
|
| 1280 |
});
|
|
|
|
| 348 |
colorScale: tokenSurprisalColorScale,
|
| 349 |
averageValue: stats.tokenAverage ?? undefined,
|
| 350 |
p90Value: stats.tokenP90 ?? undefined,
|
| 351 |
+
p90Label: tokenHistogramConfig.averageLabel,
|
| 352 |
});
|
| 353 |
|
| 354 |
// 更新列视图中 token surprisal histogram 的标题文本
|
|
|
|
| 1271 |
// 初始化主题管理器(在所有函数定义之后)
|
| 1272 |
const themeManager = initThemeManager({
|
| 1273 |
onThemeChange: () => {
|
|
|
|
| 1274 |
columnsData.forEach((col) => {
|
| 1275 |
if (col.data && col.stats) {
|
| 1276 |
renderStatsForColumn(col.id, col);
|
| 1277 |
}
|
| 1278 |
+
requestAnimationFrame(() => col.lmfInstance?.reRenderCurrent());
|
| 1279 |
});
|
| 1280 |
}
|
| 1281 |
});
|
client/src/ts/controllers/highlightController.ts
CHANGED
|
@@ -3,14 +3,15 @@ import type { GLTR_Text_Box } from '../vis/GLTR_Text_Box';
|
|
| 3 |
import type { Histogram } from '../vis/Histogram';
|
| 4 |
import type { HistogramBinClickEvent } from '../vis/Histogram';
|
| 5 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 6 |
-
import { calculateHighlights, type HistogramType } from '../utils/highlightUtils';
|
|
|
|
|
|
|
| 7 |
|
| 8 |
export type HighlightControllerOptions = {
|
| 9 |
stats_frac: Histogram;
|
| 10 |
-
|
| 11 |
-
stats_semantic_score?: Histogram;
|
| 12 |
lmf: GLTR_Text_Box;
|
| 13 |
-
currentData:
|
| 14 |
};
|
| 15 |
|
| 16 |
export class HighlightController {
|
|
@@ -25,7 +26,7 @@ export class HighlightController {
|
|
| 25 |
*/
|
| 26 |
public clearHighlights(): void {
|
| 27 |
this.options.stats_frac.clearSelection();
|
| 28 |
-
this.options.
|
| 29 |
this.options.lmf.clearHighlight();
|
| 30 |
}
|
| 31 |
|
|
@@ -43,32 +44,32 @@ export class HighlightController {
|
|
| 43 |
}
|
| 44 |
|
| 45 |
const { x0, x1, binIndex, no_bins, source } = ev;
|
| 46 |
-
const
|
| 47 |
|
| 48 |
-
// 首页:根据直方图 source 区分类型
|
| 49 |
let histogramType: HistogramType = 'token';
|
| 50 |
-
if (source === '
|
| 51 |
-
histogramType = 'semantic';
|
| 52 |
-
}
|
| 53 |
|
| 54 |
-
|
| 55 |
-
if (histogramType === 'semantic') {
|
| 56 |
this.options.stats_frac.clearSelection();
|
| 57 |
} else {
|
| 58 |
-
this.options.
|
| 59 |
}
|
| 60 |
|
| 61 |
-
const { indices, style } = calculateHighlights(histogramType, x0, x1, binIndex, no_bins,
|
| 62 |
|
| 63 |
this.options.lmf.setHighlightedIndices(indices, style);
|
| 64 |
}
|
| 65 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 66 |
/**
|
| 67 |
* 更新当前数据(当数据变化时调用)
|
| 68 |
*/
|
| 69 |
-
public updateCurrentData(currentData:
|
| 70 |
-
|
| 71 |
-
(this.options as any).currentData = currentData;
|
| 72 |
}
|
| 73 |
}
|
| 74 |
|
|
|
|
| 3 |
import type { Histogram } from '../vis/Histogram';
|
| 4 |
import type { HistogramBinClickEvent } from '../vis/Histogram';
|
| 5 |
import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
| 6 |
+
import { calculateHighlights, type HistogramType, type HighlightData } from '../utils/highlightUtils';
|
| 7 |
+
|
| 8 |
+
export type HighlightCurrentData = { result: FrontendAnalyzeResult; signalProbs?: number[]; pPwValues?: number[]; pwScores?: number[] } | null;
|
| 9 |
|
| 10 |
export type HighlightControllerOptions = {
|
| 11 |
stats_frac: Histogram;
|
| 12 |
+
stats_raw_score_normed?: Histogram;
|
|
|
|
| 13 |
lmf: GLTR_Text_Box;
|
| 14 |
+
currentData: HighlightCurrentData;
|
| 15 |
};
|
| 16 |
|
| 17 |
export class HighlightController {
|
|
|
|
| 26 |
*/
|
| 27 |
public clearHighlights(): void {
    // Reset both histograms' selections (the raw-score histogram is optional), then the text box highlight.
    const { stats_frac, stats_raw_score_normed, lmf } = this.options;
    stats_frac.clearSelection();
    stats_raw_score_normed?.clearSelection();
    lmf.clearHighlight();
}
|
| 32 |
|
|
|
|
| 44 |
}
|
| 45 |
|
| 46 |
const { x0, x1, binIndex, no_bins, source } = ev;
|
| 47 |
+
const highlightData: HighlightData = { ...currentData.result, signalProbs: currentData.signalProbs, pPwValues: currentData.pPwValues, pwScores: currentData.pwScores };
|
| 48 |
|
|
|
|
| 49 |
let histogramType: HistogramType = 'token';
|
| 50 |
+
if (source === 'stats_raw_score_normed') histogramType = 'raw_score_normed';
|
|
|
|
|
|
|
| 51 |
|
| 52 |
+
if (histogramType === 'raw_score_normed') {
|
|
|
|
| 53 |
this.options.stats_frac.clearSelection();
|
| 54 |
} else {
|
| 55 |
+
this.options.stats_raw_score_normed?.clearSelection();
|
| 56 |
}
|
| 57 |
|
| 58 |
+
const { indices, style } = calculateHighlights(histogramType, x0, x1, binIndex, no_bins, highlightData);
|
| 59 |
|
| 60 |
this.options.lmf.setHighlightedIndices(indices, style);
|
| 61 |
}
|
| 62 |
|
| 63 |
+
/** Return the data currently stored on the controller options for highlighting. */
public getCurrentData(): HighlightCurrentData {
    const holder = this.options as { currentData: HighlightCurrentData };
    return holder.currentData;
}
|
| 67 |
+
|
| 68 |
/**
 * Replace the current data (call this whenever the underlying data changes).
 */
public updateCurrentData(currentData: HighlightCurrentData): void {
    const holder = this.options as { currentData: HighlightCurrentData };
    holder.currentData = currentData;
}
|
| 74 |
}
|
| 75 |
|
client/src/ts/lang/translations.ts
CHANGED
|
@@ -188,12 +188,18 @@ export const translations: Translations = {
|
|
| 188 |
'information per token histogram': 'token信息量直方图',
|
| 189 |
'information per token progress': 'token信息量进度图',
|
| 190 |
'token index': 'token索引',
|
| 191 |
-
'
|
| 192 |
-
'
|
|
|
|
|
|
|
|
|
|
| 193 |
|
| 194 |
// ========== Tooltip 内容 ==========
|
| 195 |
'information density:': '信息密度:',
|
| 196 |
-
'
|
|
|
|
|
|
|
|
|
|
| 197 |
'Match: {0}%': '匹配度: {0}%',
|
| 198 |
'raw score:': '原始分数:',
|
| 199 |
'prob:': '概率:',
|
|
|
|
| 188 |
'information per token histogram': 'token信息量直方图',
|
| 189 |
'information per token progress': 'token信息量进度图',
|
| 190 |
'token index': 'token索引',
|
| 191 |
+
'raw score normed histogram': '归一化原始分数直方图',
|
| 192 |
+
'semantic signal prob histogram': '语义信号概率直方图',
|
| 193 |
+
'signal prob': 'signal概率',
|
| 194 |
+
'signal ratio': '信号比',
|
| 195 |
+
'pw score': 'pw 分数',
|
| 196 |
|
| 197 |
// ========== Tooltip 内容 ==========
|
| 198 |
'information density:': '信息密度:',
|
| 199 |
+
'pw score:': 'pw 分数:',
|
| 200 |
+
'signal prob:': 'signal概率:',
|
| 201 |
+
'signal probability:': '信号概率:',
|
| 202 |
+
'raw score normed:': '归一化原始分数:',
|
| 203 |
'Match: {0}%': '匹配度: {0}%',
|
| 204 |
'raw score:': '原始分数:',
|
| 205 |
'prob:': '概率:',
|
client/src/ts/start.ts
CHANGED
|
@@ -47,6 +47,7 @@ import { isValidUrl, extractUrl, isPureUrl } from './utils/urlUtils';
|
|
| 47 |
import { AdminManager } from './utils/adminManager';
|
| 48 |
import { SettingsMenuManager } from './utils/settingsMenuManager';
|
| 49 |
import { saveHistory, initQueryHistoryDropdown } from './utils/queryHistory';
|
|
|
|
| 50 |
import { playAnalysisCompleteSound } from './utils/soundNotification';
|
| 51 |
|
| 52 |
const current = {
|
|
@@ -170,7 +171,7 @@ window.onload = () => {
|
|
| 170 |
width: 400, // 宽度
|
| 171 |
height: 200 // 增加高度从默认150px到200px
|
| 172 |
});
|
| 173 |
-
const
|
| 174 |
width: 400,
|
| 175 |
height: 200
|
| 176 |
});
|
|
@@ -202,7 +203,7 @@ window.onload = () => {
|
|
| 202 |
// 创建高亮控制器
|
| 203 |
const highlightController = new HighlightController({
|
| 204 |
stats_frac,
|
| 205 |
-
|
| 206 |
lmf,
|
| 207 |
currentData: null
|
| 208 |
});
|
|
@@ -217,7 +218,7 @@ window.onload = () => {
|
|
| 217 |
highlightController,
|
| 218 |
textInputController,
|
| 219 |
stats_frac,
|
| 220 |
-
|
| 221 |
stats_surprisal_progress,
|
| 222 |
appStateManager,
|
| 223 |
surprisalColorScale: tokenSurprisalColorScale as d3.ScaleSequential<string>
|
|
@@ -225,7 +226,9 @@ window.onload = () => {
|
|
| 225 |
|
| 226 |
// 初始化主题管理器(在设置菜单中)
|
| 227 |
const themeManager = initThemeManager({
|
| 228 |
-
onThemeChange: () =>
|
|
|
|
|
|
|
| 229 |
}, '#theme_dropdown');
|
| 230 |
|
| 231 |
// 初始化语言管理器(在设置菜单中)
|
|
@@ -276,6 +279,47 @@ window.onload = () => {
|
|
| 276 |
// Semantic analysis UI 完全由配置决定,初始化时同步
|
| 277 |
visualizationUpdater.syncSemanticUiFromConfig();
|
| 278 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 279 |
// *****************************
|
| 280 |
// ***** demo stuff *****
|
| 281 |
// *****************************
|
|
@@ -805,7 +849,8 @@ window.onload = () => {
|
|
| 805 |
visualizationUpdater.handleSemanticResponse(res, text);
|
| 806 |
appStateManager.setLastSearchedQuery(query);
|
| 807 |
saveHistory(query);
|
| 808 |
-
|
|
|
|
| 809 |
const md = res?.full_match_degree;
|
| 810 |
const mdEl = d3.select('#semantic_match_degree');
|
| 811 |
if (md != null && typeof md === 'number') {
|
|
@@ -838,7 +883,14 @@ window.onload = () => {
|
|
| 838 |
input: semanticSearchInput,
|
| 839 |
dropdownId: 'semantic_search_history_dropdown',
|
| 840 |
onSelect: () => appStateManager.updateButtonStates(),
|
| 841 |
-
onHistorySelect: runSemanticSearch
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 842 |
});
|
| 843 |
|
| 844 |
// Save按钮点击事件(使用 serverDemoController)
|
|
|
|
| 47 |
import { AdminManager } from './utils/adminManager';
|
| 48 |
import { SettingsMenuManager } from './utils/settingsMenuManager';
|
| 49 |
import { saveHistory, initQueryHistoryDropdown } from './utils/queryHistory';
|
| 50 |
+
import { removeByQuery as removeSemanticCacheByQuery } from './utils/semanticResultCache';
|
| 51 |
import { playAnalysisCompleteSound } from './utils/soundNotification';
|
| 52 |
|
| 53 |
const current = {
|
|
|
|
| 171 |
width: 400, // 宽度
|
| 172 |
height: 200 // 增加高度从默认150px到200px
|
| 173 |
});
|
| 174 |
+
const stats_raw_score_normed = new Histogram(d3.select('#stats_raw_score_normed'), eventHandler, {
|
| 175 |
width: 400,
|
| 176 |
height: 200
|
| 177 |
});
|
|
|
|
| 203 |
// 创建高亮控制器
|
| 204 |
const highlightController = new HighlightController({
|
| 205 |
stats_frac,
|
| 206 |
+
stats_raw_score_normed,
|
| 207 |
lmf,
|
| 208 |
currentData: null
|
| 209 |
});
|
|
|
|
| 218 |
highlightController,
|
| 219 |
textInputController,
|
| 220 |
stats_frac,
|
| 221 |
+
stats_raw_score_normed,
|
| 222 |
stats_surprisal_progress,
|
| 223 |
appStateManager,
|
| 224 |
surprisalColorScale: tokenSurprisalColorScale as d3.ScaleSequential<string>
|
|
|
|
| 226 |
|
| 227 |
// 初始化主题管理器(在设置菜单中)
|
| 228 |
const themeManager = initThemeManager({
|
| 229 |
+
onThemeChange: () => {
|
| 230 |
+
visualizationUpdater.rerenderOnThemeChange();
|
| 231 |
+
}
|
| 232 |
}, '#theme_dropdown');
|
| 233 |
|
| 234 |
// 初始化语言管理器(在设置菜单中)
|
|
|
|
| 279 |
// Semantic analysis UI 完全由配置决定,初始化时同步
|
| 280 |
visualizationUpdater.syncSemanticUiFromConfig();
|
| 281 |
|
| 282 |
+
// Semantic analysis: restore the query input and the option selects from URL parameters (so they survive a refresh)
const initSemanticFromUrl = () => {
    const params = URLHandler.parameters;

    // Copy one URL parameter into a form control; `allowed`, when given, whitelists the accepted values.
    const restore = (paramName: string, elementId: string, allowed?: string[]) => {
        const raw = params[paramName];
        if (typeof raw !== 'string') return;
        if (allowed && !allowed.includes(raw)) return;
        const el = document.getElementById(elementId) as HTMLInputElement | HTMLSelectElement | null;
        if (el) el.value = raw;
    };

    restore('semantic_query', 'semantic_search_input');
    restore('semantic_submode', 'semantic_submode_select', ['count', 'match_score', 'fill_blank', 'hybrid']);
    restore('semantic_color_source', 'semantic_color_source_select', ['raw_score_normed', 'signal_probability', 'pw_score']);
};
|
| 303 |
+
initSemanticFromUrl();
|
| 304 |
+
|
| 305 |
+
// Semantic analysis: mirror the query and the selected options into the URL (so they survive a refresh)
const syncSemanticToUrl = () => {
    // Trimmed value of a form control, or '' when the element (or its value) is missing.
    const trimmedValueOf = (elementId: string): string => {
        const el = document.getElementById(elementId) as HTMLInputElement | HTMLSelectElement | null;
        return el?.value?.trim() ?? '';
    };

    const desired: Array<[string, string]> = [
        ['semantic_query', trimmedValueOf('semantic_search_input')],
        ['semantic_submode', trimmedValueOf('semantic_submode_select')],
        ['semantic_color_source', trimmedValueOf('semantic_color_source_select')]
    ];

    const currentParams = URLHandler.parameters;
    desired.forEach(([key, value]) => {
        // Empty values are removed from the URL rather than written as blank parameters.
        if (value) currentParams[key] = value;
        else delete currentParams[key];
    });
    URLHandler.updateUrl(currentParams, false);
};
|
| 322 |
+
|
| 323 |
// *****************************
|
| 324 |
// ***** demo stuff *****
|
| 325 |
// *****************************
|
|
|
|
| 849 |
visualizationUpdater.handleSemanticResponse(res, text);
|
| 850 |
appStateManager.setLastSearchedQuery(query);
|
| 851 |
saveHistory(query);
|
| 852 |
+
syncSemanticToUrl();
|
| 853 |
+
if (!(res as { __fromCache?: boolean }).__fromCache) playAnalysisCompleteSound();
|
| 854 |
const md = res?.full_match_degree;
|
| 855 |
const mdEl = d3.select('#semantic_match_degree');
|
| 856 |
if (md != null && typeof md === 'number') {
|
|
|
|
| 883 |
input: semanticSearchInput,
|
| 884 |
dropdownId: 'semantic_search_history_dropdown',
|
| 885 |
onSelect: () => appStateManager.updateButtonStates(),
|
| 886 |
+
onHistorySelect: runSemanticSearch,
|
| 887 |
+
onRemove: removeSemanticCacheByQuery
|
| 888 |
+
});
|
| 889 |
+
semanticSearchInput?.addEventListener('blur', syncSemanticToUrl);
|
| 890 |
+
document.getElementById('semantic_submode_select')?.addEventListener('change', syncSemanticToUrl);
|
| 891 |
+
document.getElementById('semantic_color_source_select')?.addEventListener('change', () => {
|
| 892 |
+
visualizationUpdater.updateSemanticColorSource();
|
| 893 |
+
syncSemanticToUrl();
|
| 894 |
});
|
| 895 |
|
| 896 |
// Save按钮点击事件(使用 serverDemoController)
|
client/src/ts/utils/SurprisalColorConfig.ts
CHANGED
|
@@ -26,15 +26,18 @@ export const MINIMAP_COLOR_FACTOR = 1.3;
|
|
| 26 |
const SURPRISAL_RED_RGB = "255, 71, 64";
|
| 27 |
const SURPRISAL_MAX_ALPHA = 0.7;
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
/**
|
| 30 |
* 根据归一化值获取对应的颜色(输入值应在[0,1]区间)
|
| 31 |
* @param normalizedValue 归一化后的值,范围[0,1]
|
| 32 |
-
* @
|
| 33 |
*/
|
| 34 |
-
|
| 35 |
-
export function getSurprisalColorNormalized(normalizedValue: number): string {
|
| 36 |
const clampedValue = Math.max(0, Math.min(1, normalizedValue));
|
| 37 |
-
|
|
|
|
| 38 |
return `rgba(${SURPRISAL_RED_RGB}, ${alpha})`;
|
| 39 |
}
|
| 40 |
|
|
@@ -57,32 +60,33 @@ function normalizeTo_01(value: number, maxValue: number): number {
|
|
| 57 |
/**
|
| 58 |
* 根据token惊讶度值获取对应的颜色(线性映射,不取整)
|
| 59 |
* @param surprisal token惊讶度值,范围[0, TOKEN_SURPRISAL_MAX]会被映射到[0, 1],超出范围会被截断
|
| 60 |
-
* @
|
| 61 |
*/
|
| 62 |
-
export function getTokenSurprisalColor(surprisal: number): string {
|
| 63 |
const normalizedValue = normalizeTo_01(surprisal, TOKEN_SURPRISAL_MAX);
|
| 64 |
-
return getSurprisalColorNormalized(normalizedValue);
|
| 65 |
}
|
| 66 |
|
| 67 |
/**
|
| 68 |
* 根据byte密度惊讶度值获取对应的颜色(线性映射,不取整)
|
| 69 |
* @param byteSurprisal byte密度惊讶度值,范围[0, BYTE_SURPRISAL_MAX]会被映射到[0, 1],超出范围会被截断
|
| 70 |
* @param colorFactor 颜色因子,用于调整颜色强度,目前主要为了minimap显示更明显(平均后byte surprisal密度会过小,所以需要放大)。默认为1
|
| 71 |
-
* @
|
| 72 |
*/
|
| 73 |
-
export function getByteSurprisalColor(byteSurprisal: number, colorFactor: number = 1): string {
|
| 74 |
const normalizedValue = normalizeTo_01(byteSurprisal * colorFactor, BYTE_SURPRISAL_MAX);
|
| 75 |
-
return getSurprisalColorNormalized(normalizedValue);
|
| 76 |
}
|
| 77 |
|
| 78 |
/**
|
| 79 |
-
* 根据
|
| 80 |
-
* @param
|
|
|
|
| 81 |
*/
|
| 82 |
-
export function getSemanticSimilarityColor(
|
| 83 |
-
if (!isFiniteNumber(
|
| 84 |
-
const normalizedValue = normalizeTo_01(
|
| 85 |
-
return getSurprisalColorNormalized(normalizedValue);
|
| 86 |
}
|
| 87 |
|
| 88 |
// ==========================================
|
|
|
|
| 26 |
const SURPRISAL_RED_RGB = "255, 71, 64";
|
| 27 |
const SURPRISAL_MAX_ALPHA = 0.7;
|
| 28 |
|
| 29 |
+
/** 直方图渐变最浅色 alpha 下限(10% 区间),供直方图使用方配置 */
|
| 30 |
+
export const HISTOGRAM_MIN_ALPHA = 0.1 * SURPRISAL_MAX_ALPHA;
|
| 31 |
+
|
| 32 |
/**
 * Map a normalized value to an rgba colour string whose alpha scales with the value.
 * @param normalizedValue value expected in [0, 1]; anything outside is clamped
 * @param minAlpha optional lower bound for the resulting alpha (no bound when omitted)
 */
export function getSurprisalColorNormalized(normalizedValue: number, minAlpha?: number): string {
    const unit = Math.max(0, Math.min(1, normalizedValue));
    const scaled = unit * SURPRISAL_MAX_ALPHA;
    const alpha = minAlpha != null ? Math.max(minAlpha, scaled) : scaled;
    return `rgba(${SURPRISAL_RED_RGB}, ${alpha})`;
}
|
| 43 |
|
|
|
|
| 60 |
/**
 * Colour for a token surprisal value (linear mapping, no rounding).
 * @param surprisal token surprisal; normalized against TOKEN_SURPRISAL_MAX before colouring
 * @param minAlpha optional lower bound for the resulting alpha (no bound when omitted)
 */
export function getTokenSurprisalColor(surprisal: number, minAlpha?: number): string {
    return getSurprisalColorNormalized(normalizeTo_01(surprisal, TOKEN_SURPRISAL_MAX), minAlpha);
}
|
| 69 |
|
| 70 |
/**
 * Colour for a per-byte surprisal density value (linear mapping, no rounding).
 * @param byteSurprisal byte surprisal density; scaled by colorFactor, then normalized against BYTE_SURPRISAL_MAX
 * @param colorFactor intensity multiplier applied before normalization (defaults to 1); per the original note it
 *                    exists mainly so the minimap, whose averaged densities are small, stays visible
 * @param minAlpha optional lower bound for the resulting alpha (no bound when omitted)
 */
export function getByteSurprisalColor(byteSurprisal: number, colorFactor: number = 1, minAlpha?: number): string {
    const boosted = byteSurprisal * colorFactor;
    return getSurprisalColorNormalized(normalizeTo_01(boosted, BYTE_SURPRISAL_MAX), minAlpha);
}
|
| 80 |
|
| 81 |
/**
 * Colour for a rawScoreNormed value (used for semantic-match colouring).
 * @param rawScoreNormed normalized score; normalized against SEMANTIC_SIMILARITY_MAX before colouring
 * @param minAlpha optional lower bound for the resulting alpha (no bound when omitted)
 * @returns 'transparent' when the score fails the isFiniteNumber check, otherwise an rgba colour string
 */
export function getSemanticSimilarityColor(rawScoreNormed: number, minAlpha?: number): string {
    return isFiniteNumber(rawScoreNormed)
        ? getSurprisalColorNormalized(normalizeTo_01(rawScoreNormed, SEMANTIC_SIMILARITY_MAX), minAlpha)
        : 'transparent';
}
|
| 91 |
|
| 92 |
// ==========================================
|
client/src/ts/utils/fitQuality.ts
ADDED
|
@@ -0,0 +1,39 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 拟合质量计算(纯数学,无 Node 依赖)
|
| 3 |
+
*/
|
| 4 |
+
|
| 5 |
+
import { logNormalCdf } from './lognormalFit';
|
| 6 |
+
|
| 7 |
+
/**
 * Goodness of fit of a right-truncated log-normal against the empirical CDF of the fitted sample.
 *
 * The truncated CDF used here is 0 for x <= 0, 1 for x >= tau and F(x) / F(tau) in between,
 * where F is the untruncated log-normal CDF. The empirical CDF at index i is (i + 1) / n.
 * NOTE(review): that ECDF only makes sense if `noise` is sorted ascending; sorting is not
 * enforced here — confirm against the caller.
 *
 * @param noise sample values the fit was computed from
 * @param tau right truncation point
 * @param mu log-normal location parameter
 * @param sigma log-normal scale parameter
 * @returns maxDiff = max|CDF_trunc - ECDF|, rmse = sqrt(mean(diff^2)), maxDiffIdx = index where maxDiff occurs.
 *          Returns { NaN, NaN, -1 } when the sample is empty or F(tau) is not a positive number.
 */
export function computeFitQuality(
    noise: number[],
    tau: number,
    mu: number,
    sigma: number
): { maxDiff: number; rmse: number; maxDiffIdx: number } {
    const nNoise = noise.length;
    if (nNoise < 1) return { maxDiff: NaN, rmse: NaN, maxDiffIdx: -1 };

    const F_tau = logNormalCdf(tau, mu, sigma);
    // F(tau) is the normalizer of the truncated CDF. If it is 0 or NaN the ratio below is NaN,
    // which the `>` comparison would silently ignore, reporting maxDiff = 0 next to rmse = NaN.
    // Report the fit as undefined instead.
    if (!(F_tau > 0)) return { maxDiff: NaN, rmse: NaN, maxDiffIdx: -1 };

    const cdfTrunc = (x: number) =>
        x <= 0 ? 0 : x >= tau ? 1 : logNormalCdf(x, mu, sigma) / F_tau;

    let maxDiff = 0;
    let maxDiffIdx = 0;
    let sumSqDiff = 0;
    for (let i = 0; i < nNoise; i++) {
        const x = noise[i]!;
        const ecdf = (i + 1) / nNoise;
        const diff = cdfTrunc(x) - ecdf;
        const absDiff = Math.abs(diff);
        if (absDiff > maxDiff) {
            maxDiff = absDiff;
            maxDiffIdx = i;
        }
        sumSqDiff += diff * diff;
    }
    const rmse = Math.sqrt(sumSqDiff / nNoise);
    return { maxDiff, rmse, maxDiffIdx };
}
|
client/src/ts/utils/highlightUtils.ts
CHANGED
|
@@ -2,13 +2,25 @@ import type { FrontendAnalyzeResult } from '../api/GLTR_API';
|
|
| 2 |
import { calculateSurprisal, calculateSurprisalDensity } from './Util';
|
| 3 |
import { extractRealTopkFromTokens } from './tokenUtils';
|
| 4 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
/**
|
| 6 |
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 token surprisal)
|
| 7 |
* @param x0 bin 起始值
|
| 8 |
* @param x1 bin 结束值
|
| 9 |
* @param binIndex bin在bins数组中的索引
|
| 10 |
* @param no_bins 直方图的总bin数量
|
| 11 |
-
* @param
|
| 12 |
* @returns 需要高亮的 merged token 索引集合
|
| 13 |
*/
|
| 14 |
export function calculateTokenSurprisalHighlights(
|
|
@@ -16,43 +28,23 @@ export function calculateTokenSurprisalHighlights(
|
|
| 16 |
x1: number,
|
| 17 |
binIndex: number,
|
| 18 |
no_bins: number,
|
| 19 |
-
|
| 20 |
): Set<number> {
|
| 21 |
const highlightedIndices = new Set<number>();
|
| 22 |
-
const originalTokens =
|
| 23 |
const originalRealTopk = extractRealTopkFromTokens(originalTokens);
|
| 24 |
-
const originalToMergedMap =
|
| 25 |
-
const mergedTokens =
|
| 26 |
-
|
| 27 |
-
// 使用binIndex判断是否是最两侧的bin
|
| 28 |
-
const isFirstBin = binIndex === 0; // 第一个bin:包含超出下界的值
|
| 29 |
-
const isLastBin = binIndex === no_bins - 1; // 最后一个bin:包含超出上界的值
|
| 30 |
|
| 31 |
-
// 遍历原始 token,找到 surprisal 在范围内的 token
|
| 32 |
for (let i = 0; i < originalTokens.length; i++) {
|
| 33 |
const surprisal = calculateSurprisal(originalRealTopk[i][1]);
|
| 34 |
-
|
| 35 |
-
|
| 36 |
-
|
| 37 |
-
|
| 38 |
-
|
| 39 |
-
} else if (isLastBin) {
|
| 40 |
-
// 最后一个bin:包含所有 >= x0 的值(自身bin + 超出上界的数据)
|
| 41 |
-
inRange = surprisal >= x0;
|
| 42 |
-
} else {
|
| 43 |
-
// 中间bins:正常范围
|
| 44 |
-
inRange = surprisal >= x0 && surprisal < x1;
|
| 45 |
-
}
|
| 46 |
-
|
| 47 |
-
if (inRange) {
|
| 48 |
-
// 映射到 merged token 索引
|
| 49 |
-
const mappedIndex = originalToMergedMap[i];
|
| 50 |
-
if (Number.isInteger(mappedIndex) && mappedIndex >= 0 && mappedIndex < mergedTokens.length) {
|
| 51 |
-
highlightedIndices.add(mappedIndex);
|
| 52 |
-
}
|
| 53 |
}
|
| 54 |
}
|
| 55 |
-
|
| 56 |
return highlightedIndices;
|
| 57 |
}
|
| 58 |
|
|
@@ -62,7 +54,7 @@ export function calculateTokenSurprisalHighlights(
|
|
| 62 |
* @param x1 bin 结束值
|
| 63 |
* @param binIndex bin在bins数组中的索引
|
| 64 |
* @param no_bins 直方图的总bin数量
|
| 65 |
-
* @param
|
| 66 |
* @returns 需要高亮的 merged token 索引集合
|
| 67 |
*/
|
| 68 |
export function calculateByteSurprisalHighlights(
|
|
@@ -70,95 +62,53 @@ export function calculateByteSurprisalHighlights(
|
|
| 70 |
x1: number,
|
| 71 |
binIndex: number,
|
| 72 |
no_bins: number,
|
| 73 |
-
|
| 74 |
): Set<number> {
|
| 75 |
const highlightedIndices = new Set<number>();
|
| 76 |
-
const mergedTokens =
|
| 77 |
-
|
| 78 |
-
// 使用binIndex判断是否是最两侧的bin
|
| 79 |
-
const isFirstBin = binIndex === 0; // 第一个bin:包含超出下界的值
|
| 80 |
-
const isLastBin = binIndex === no_bins - 1; // 最后一个bin:包含超出上界的值
|
| 81 |
|
| 82 |
-
// 遍历 merged token,找到信息密度在范围内的 token
|
| 83 |
for (let i = 0; i < mergedTokens.length; i++) {
|
| 84 |
const informationDensity = calculateSurprisalDensity(mergedTokens[i]);
|
| 85 |
-
|
| 86 |
-
|
| 87 |
-
|
| 88 |
-
// 第一个bin:包含所有 < x1 的值(自身bin + 超出下界的数据)
|
| 89 |
-
inRange = informationDensity < x1;
|
| 90 |
-
} else if (isLastBin) {
|
| 91 |
-
// 最后一个bin:包含所有 >= x0 的值(自身bin + 超出上界的数据)
|
| 92 |
-
inRange = informationDensity >= x0;
|
| 93 |
-
} else {
|
| 94 |
-
// 中间bins:正常范围
|
| 95 |
-
inRange = informationDensity >= x0 && informationDensity < x1;
|
| 96 |
-
}
|
| 97 |
-
|
| 98 |
-
if (inRange) {
|
| 99 |
-
highlightedIndices.add(i);
|
| 100 |
-
}
|
| 101 |
}
|
| 102 |
-
|
| 103 |
return highlightedIndices;
|
| 104 |
}
|
| 105 |
|
| 106 |
/**
|
| 107 |
-
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于
|
| 108 |
-
* 使用
|
| 109 |
*/
|
| 110 |
-
export function
|
| 111 |
x0: number,
|
| 112 |
x1: number,
|
| 113 |
binIndex: number,
|
| 114 |
no_bins: number,
|
| 115 |
-
|
| 116 |
): Set<number> {
|
| 117 |
const highlightedIndices = new Set<number>();
|
| 118 |
-
const scores =
|
| 119 |
-
if (!scores
|
| 120 |
-
return highlightedIndices;
|
| 121 |
-
}
|
| 122 |
-
|
| 123 |
-
const isFirstBin = binIndex === 0;
|
| 124 |
-
const isLastBin = binIndex === no_bins - 1;
|
| 125 |
|
| 126 |
for (let i = 0; i < scores.length; i++) {
|
| 127 |
const score = scores[i];
|
| 128 |
-
if (!Number.isFinite(score))
|
| 129 |
-
|
| 130 |
-
|
| 131 |
-
|
| 132 |
-
let inRange = false;
|
| 133 |
-
if (isFirstBin) {
|
| 134 |
-
inRange = score < x1;
|
| 135 |
-
} else if (isLastBin) {
|
| 136 |
-
inRange = score >= x0;
|
| 137 |
-
} else {
|
| 138 |
-
inRange = score >= x0 && score < x1;
|
| 139 |
-
}
|
| 140 |
-
|
| 141 |
-
if (inRange) {
|
| 142 |
-
highlightedIndices.add(i);
|
| 143 |
-
}
|
| 144 |
}
|
| 145 |
-
|
| 146 |
return highlightedIndices;
|
| 147 |
}
|
| 148 |
|
| 149 |
-
/**
|
| 150 |
-
* 直方图类型
|
| 151 |
-
*/
|
| 152 |
-
export type HistogramType = 'token' | 'byte' | 'semantic';
|
| 153 |
-
|
| 154 |
/**
|
| 155 |
* 根据直方图类型和 bin 范围计算需要高亮的 token 索引集合
|
| 156 |
-
* @param histogramType 直方图类型
|
| 157 |
* @param x0 bin 起始值
|
| 158 |
* @param x1 bin 结束值
|
| 159 |
* @param binIndex bin在bins数组中的索引
|
| 160 |
* @param no_bins 直方图的总bin数量
|
| 161 |
-
* @param
|
| 162 |
* @returns 需要高亮的 merged token 索引集合和对应的高亮样式
|
| 163 |
*/
|
| 164 |
export function calculateHighlights(
|
|
@@ -167,22 +117,22 @@ export function calculateHighlights(
|
|
| 167 |
x1: number,
|
| 168 |
binIndex: number,
|
| 169 |
no_bins: number,
|
| 170 |
-
|
| 171 |
): { indices: Set<number>; style: 'border' | 'underline' } {
|
| 172 |
if (histogramType === 'byte') {
|
| 173 |
return {
|
| 174 |
-
indices: calculateByteSurprisalHighlights(x0, x1, binIndex, no_bins,
|
| 175 |
style: 'underline'
|
| 176 |
};
|
| 177 |
}
|
| 178 |
-
if (histogramType === '
|
| 179 |
return {
|
| 180 |
-
indices:
|
| 181 |
style: 'underline'
|
| 182 |
};
|
| 183 |
}
|
| 184 |
return {
|
| 185 |
-
indices: calculateTokenSurprisalHighlights(x0, x1, binIndex, no_bins,
|
| 186 |
style: 'border'
|
| 187 |
};
|
| 188 |
}
|
|
|
|
| 2 |
import { calculateSurprisal, calculateSurprisalDensity } from './Util';
|
| 3 |
import { extractRealTopkFromTokens } from './tokenUtils';
|
| 4 |
|
| 5 |
+
/**
 * Test whether `value` belongs to a histogram bin.
 * The first bin is open below (value < x1), the last bin is open above (value >= x0),
 * and every other bin is the half-open interval [x0, x1). When a bin is both first and last,
 * the first-bin rule wins.
 */
function valueInBinRange(value: number, x0: number, x1: number, binIndex: number, no_bins: number): boolean {
    const openBelow = binIndex === 0;
    const openAbove = !openBelow && binIndex === no_bins - 1;
    const passesLower = openBelow || value >= x0;
    const passesUpper = openAbove || value < x1;
    return passesLower && passesUpper;
}
|
| 13 |
+
|
| 14 |
+
export type HistogramType = 'token' | 'byte' | 'raw_score_normed';
|
| 15 |
+
export type HighlightData = FrontendAnalyzeResult & { rawScoresNormed?: number[]; attentionRawScores?: number[]; signalProbs?: number[]; pPwValues?: number[]; pwScores?: number[] };
|
| 16 |
+
|
| 17 |
/**
|
| 18 |
* 根据直方图 bin 的范围计算需要高亮的 token 索引集合(基于 token surprisal)
|
| 19 |
* @param x0 bin 起始值
|
| 20 |
* @param x1 bin 结束值
|
| 21 |
* @param binIndex bin在bins数组中的索引
|
| 22 |
* @param no_bins 直方图的总bin数量
|
| 23 |
+
* @param data 前端分析结果(包含 originalTokens、mergedTokens、originalToMergedMap)
|
| 24 |
* @returns 需要高亮的 merged token 索引集合
|
| 25 |
*/
|
| 26 |
export function calculateTokenSurprisalHighlights(
|
|
|
|
| 28 |
x1: number,
|
| 29 |
binIndex: number,
|
| 30 |
no_bins: number,
|
| 31 |
+
data: HighlightData
|
| 32 |
): Set<number> {
|
| 33 |
const highlightedIndices = new Set<number>();
|
| 34 |
+
const originalTokens = data.originalTokens;
|
| 35 |
const originalRealTopk = extractRealTopkFromTokens(originalTokens);
|
| 36 |
+
const originalToMergedMap = data.originalToMergedMap;
|
| 37 |
+
const mergedTokens = data.mergedTokens;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
|
|
|
| 39 |
for (let i = 0; i < originalTokens.length; i++) {
|
| 40 |
const surprisal = calculateSurprisal(originalRealTopk[i][1]);
|
| 41 |
+
if (!Number.isFinite(surprisal)) continue;
|
| 42 |
+
if (!valueInBinRange(surprisal, x0, x1, binIndex, no_bins)) continue;
|
| 43 |
+
const mappedIndex = originalToMergedMap[i];
|
| 44 |
+
if (Number.isInteger(mappedIndex) && mappedIndex >= 0 && mappedIndex < mergedTokens.length) {
|
| 45 |
+
highlightedIndices.add(mappedIndex);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 46 |
}
|
| 47 |
}
|
|
|
|
| 48 |
return highlightedIndices;
|
| 49 |
}
|
| 50 |
|
|
|
|
| 54 |
* @param x1 bin 结束值
|
| 55 |
* @param binIndex bin在bins数组中的索引
|
| 56 |
* @param no_bins 直方图的总bin数量
|
| 57 |
+
* @param data 前端分析结果(包含 mergedTokens)
|
| 58 |
* @returns 需要高亮的 merged token 索引集合
|
| 59 |
*/
|
| 60 |
export function calculateByteSurprisalHighlights(
|
|
|
|
| 62 |
x1: number,
|
| 63 |
binIndex: number,
|
| 64 |
no_bins: number,
|
| 65 |
+
data: HighlightData
|
| 66 |
): Set<number> {
|
| 67 |
const highlightedIndices = new Set<number>();
|
| 68 |
+
const mergedTokens = data.mergedTokens;
|
| 69 |
+
if (!mergedTokens?.length) return highlightedIndices;
|
|
|
|
|
|
|
|
|
|
| 70 |
|
|
|
|
| 71 |
for (let i = 0; i < mergedTokens.length; i++) {
|
| 72 |
const informationDensity = calculateSurprisalDensity(mergedTokens[i]);
|
| 73 |
+
if (!Number.isFinite(informationDensity)) continue;
|
| 74 |
+
if (!valueInBinRange(informationDensity, x0, x1, binIndex, no_bins)) continue;
|
| 75 |
+
highlightedIndices.add(i);
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 76 |
}
|
|
|
|
| 77 |
return highlightedIndices;
|
| 78 |
}
|
| 79 |
|
| 80 |
/**
 * Collect the token indices whose raw_score_normed falls inside the clicked histogram bin.
 * Reads `data.rawScoresNormed` (described by the original author as aligned with mergedTokens),
 * skips non-finite scores, and applies the same bin rule as the other histograms.
 * @returns the set of matching indices; empty when no scores are available
 */
export function calculateRawScoreNormedHighlights(
    x0: number,
    x1: number,
    binIndex: number,
    no_bins: number,
    data: HighlightData
): Set<number> {
    const selected = new Set<number>();
    (data.rawScoresNormed ?? []).forEach((score, idx) => {
        if (Number.isFinite(score) && valueInBinRange(score, x0, x1, binIndex, no_bins)) {
            selected.add(idx);
        }
    });
    return selected;
}
|
| 103 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 104 |
/**
|
| 105 |
* 根据直方图类型和 bin 范围计算需要高亮的 token 索引集合
|
| 106 |
+
* @param histogramType 直方图类型
|
| 107 |
* @param x0 bin 起始值
|
| 108 |
* @param x1 bin 结束值
|
| 109 |
* @param binIndex bin在bins数组中的索引
|
| 110 |
* @param no_bins 直方图的总bin数量
|
| 111 |
+
* @param data 前端分析结果
|
| 112 |
* @returns 需要高亮的 merged token 索引集合和对应的高亮样式
|
| 113 |
*/
|
| 114 |
export function calculateHighlights(
|
|
|
|
| 117 |
x1: number,
|
| 118 |
binIndex: number,
|
| 119 |
no_bins: number,
|
| 120 |
+
data: HighlightData
|
| 121 |
): { indices: Set<number>; style: 'border' | 'underline' } {
|
| 122 |
if (histogramType === 'byte') {
|
| 123 |
return {
|
| 124 |
+
indices: calculateByteSurprisalHighlights(x0, x1, binIndex, no_bins, data),
|
| 125 |
style: 'underline'
|
| 126 |
};
|
| 127 |
}
|
| 128 |
+
if (histogramType === 'raw_score_normed') {
|
| 129 |
return {
|
| 130 |
+
indices: calculateRawScoreNormedHighlights(x0, x1, binIndex, no_bins, data),
|
| 131 |
style: 'underline'
|
| 132 |
};
|
| 133 |
}
|
| 134 |
return {
|
| 135 |
+
indices: calculateTokenSurprisalHighlights(x0, x1, binIndex, no_bins, data),
|
| 136 |
style: 'border'
|
| 137 |
};
|
| 138 |
}
|
client/src/ts/utils/lognormalFit.ts
ADDED
|
@@ -0,0 +1,144 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 对数正态噪声拟合(纯数学,无依赖)
|
| 3 |
+
* 供 visualizationUpdater 使用,可独立在 Node 中测试
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
export const LN_EPS = 1e-10;
|
| 7 |
+
|
| 8 |
+
/**
 * Standard normal CDF Φ(x).
 *
 * Computed as ½·(1 ± y), where y is a five-term polynomial approximation of
 * erf(|x|/√2) evaluated in Horner form. Arguments at or beyond ±6 are clamped
 * to exactly 0 / 1.
 *
 * NOTE(review): the coefficients look like the erf approximation usually cited
 * as Abramowitz & Stegun 7.1.26 (absolute error about 1.5e-7), not 26.2.17 —
 * confirm the citation. The limited absolute accuracy matters in the far left tail.
 */
export function normCdf(x: number): number {
    if (x <= -6) return 0;
    if (x >= 6) return 1;

    const P = 0.3275911;
    // Polynomial coefficients, highest order first (Horner evaluation).
    const COEFFS = [1.061405429, -1.453152027, 1.421413741, -0.284496736, 0.254829592];

    const t = 1 / (1 + P * Math.abs(x) / Math.SQRT2);
    let poly = 0;
    for (const c of COEFFS) poly = poly * t + c;

    const y = 1 - (poly * t) * Math.exp(-x * x / 2);
    return x < 0 ? 0.5 * (1 - y) : 0.5 * (1 + y);
}
|
| 18 |
+
|
| 19 |
+
/**
 * Log-normal CDF: F(x) = Φ((ln x − μ) / σ) for x > 0, and 0 for x ≤ 0.
 * σ is not validated here; callers are expected to pass σ > 0.
 */
export function logNormalCdf(x: number, mu: number, sigma: number): number {
    if (x <= 0) {
        return 0;
    }
    const z = (Math.log(x) - mu) / sigma;
    return normCdf(z);
}
|
| 24 |
+
|
| 25 |
+
/**
 * Expected number of samples falling in [a, b) when n samples are drawn from
 * log-normal(μ, σ): n × (CDF(b) − CDF(a)).
 */
export function logNormalExpectedCountInInterval(
    a: number, b: number, n: number, mu: number, sigma: number
): number {
    const upper = logNormalCdf(b, mu, sigma);
    const lower = logNormalCdf(a, mu, sigma);
    return n * (upper - lower);
}
|
| 31 |
+
|
| 32 |
+
/**
 * Log-normal PDF: f(x) = φ((ln x − μ) / σ) / (x·σ) for x > 0.
 * Returns 0 when x ≤ 0 or σ ≤ 0.
 */
export function logNormalPdf(x: number, mu: number, sigma: number): number {
    const outsideSupport = x <= 0 || sigma <= 0;
    if (outsideSupport) {
        return 0;
    }
    const standardized = (Math.log(x) - mu) / sigma;
    const density = normPdf(standardized);
    return density / (x * sigma);
}
|
| 38 |
+
|
| 39 |
+
/** Standard normal PDF: φ(x) = exp(−x²/2) / √(2π). */
function normPdf(x: number): number {
    const exponent = -x * x / 2;
    const normalizer = Math.sqrt(2 * Math.PI);
    return Math.exp(exponent) / normalizer;
}
|
| 43 |
+
|
| 44 |
+
/**
 * Inverse Mills ratio λ(α) = φ(α) / Φ(α).
 * When Φ(α) is numerically zero (below 1e-300) the division is skipped and the
 * asymptotic value |α| (valid for α → −∞) is returned instead.
 */
function millsRatio(alpha: number): number {
    const cdf = normCdf(alpha);
    return cdf < 1e-300 ? Math.abs(alpha) : normPdf(alpha) / cdf;
}
|
| 50 |
+
|
| 51 |
+
/**
 * Maximum-likelihood fit of a log-normal distribution right-truncated at `tau`.
 *
 * Works on the log data: with ȳ and s the mean and (1/n) standard deviation of
 * ln(x), and T = ln(tau), the standardized truncation point α = (T − μ)/σ is
 * found by bisection on
 *     (α + λ) − ((T − ȳ)/s)·sqrt(1 − λ(α + λ)) = 0,   λ = millsRatio(α),
 * over the bracket [−8, (T − ȳ)/s + 8]. Then σ = s / sqrt(1 − λ(α + λ)) and
 * μ = ȳ + σ·λ.
 *
 * Exported so tests can compare tau = max(samples) against a fixed tau.
 *
 * @param noiseScores samples assumed to lie below `tau`; this is not checked,
 *                    and non-positive values make the fit fail (null)
 * @param tau         right truncation point on the original (non-log) scale
 * @returns `{ mu, sigma }`, or null when there are fewer than 2 samples,
 *          `tau <= LN_EPS`, the log data has no finite positive spread, the
 *          bracket does not contain a sign change, or the result is not finite
 */
export function fitLogNormalTruncatedMLE(
    noiseScores: number[],
    tau: number
): { mu: number; sigma: number } | null {
    const n = noiseScores.length;
    if (n < 2 || tau <= LN_EPS) return null;

    // Mean and population standard deviation of the log-transformed samples.
    const logs = noiseScores.map((v) => Math.log(v));
    let total = 0;
    for (const y of logs) total += y;
    const ybar = total / n;
    let squares = 0;
    for (const y of logs) squares += (y - ybar) ** 2;
    const s = Math.sqrt(squares / n);
    if (!(s > 0 && isFinite(s))) return null;

    const delta = Math.log(tau) - ybar;
    const ratio = delta / s;

    // Residual of the moment equation; NaN marks an invalid variance factor.
    const residual = (alpha: number): number => {
        const lam = millsRatio(alpha);
        if (!isFinite(lam)) return delta > 0 ? -1 : 1;
        const g = alpha + lam;
        const h = 1 - lam * g;
        return h <= 0 ? NaN : g - ratio * Math.sqrt(h);
    };

    let lo = -8;
    let hi = ratio + 8;
    let fLo = residual(lo);
    const fHi = residual(hi);
    if (!isFinite(fLo) || !isFinite(fHi) || fLo * fHi > 0) return null;

    // Plain bisection: at most 60 halvings, stopping early on a non-finite
    // residual or once the bracket is narrower than 1e-12.
    for (let step = 0; step < 60; step++) {
        const mid = (lo + hi) / 2;
        const fMid = residual(mid);
        if (!isFinite(fMid) || hi - lo < 1e-12) break;
        if (fLo * fMid <= 0) {
            hi = mid;
        } else {
            lo = mid;
            fLo = fMid;
        }
    }

    const alpha = (lo + hi) / 2;
    const lam = millsRatio(alpha);
    if (!isFinite(lam)) return null;
    const h = 1 - lam * (alpha + lam);
    if (h <= 0) return null;

    const sigma = s / Math.sqrt(h);
    const mu = ybar + sigma * lam;
    return isFinite(sigma) && sigma > 0 && isFinite(mu) ? { mu, sigma } : null;
}
|
| 104 |
+
|
| 105 |
+
/*
|
| 106 |
+
* todo: 未知原因的偏差现象:
|
| 107 |
+
* Monte Carlo 下 E[μ̂] 随 n 减小单调增大(系统性正偏),而非围绕真值的随机波动。
|
| 108 |
+
> inforadar@0.1.0 test:lognormal:tau
|
| 109 |
+
> npx tsx ts/utils/lognormalFit.tauBoundary.test.ts
|
| 110 |
+
=== 截尾对数正态拟合硬指标测试 ===
|
| 111 |
+
|
| 112 |
+
真实参数: μ=-2, σ=0.8, τ=1
|
| 113 |
+
Monte Carlo 500 次,fitLogNormalNoiseExpectedCounts percentile=0.9
|
| 114 |
+
|
| 115 |
+
n | E[μ̂] E[σ̂] Δμ Δσ
|
| 116 |
+
-------|------------------------------
|
| 117 |
+
1600 | -1.9977 0.8013 0.0023 0.0013
|
| 118 |
+
800 | -1.9950 0.8023 0.0050 0.0023
|
| 119 |
+
400 | -1.9910 0.8054 0.0090 0.0054
|
| 120 |
+
200 | -1.9851 0.8059 0.0149 0.0059
|
| 121 |
+
100 | -1.9722 0.8096 0.0278 0.0096
|
| 122 |
+
50 | -1.9541 0.8056 0.0459 0.0056
|
| 123 |
+
*/
|
| 124 |
+
|
| 125 |
+
/**
 * Expected per-bin counts of a histogram under log-normal(μ, σ).
 *
 * The range `extent` is divided into `noBins` equal-width bins; bin i covers
 * [extent[0] + i·w, extent[0] + (i+1)·w) and receives n × (CDF(right) − CDF(left)).
 *
 * @returns an array of `noBins` expected counts, in bin order
 */
export function computeExpectedCounts(
    mu: number,
    sigma: number,
    extent: [number, number],
    noBins: number,
    n: number
): number[] {
    const [rangeStart, rangeEnd] = extent;
    const width = (rangeEnd - rangeStart) / noBins;
    const edge = (k: number): number => rangeStart + k * width;

    const counts: number[] = [];
    let bin = 0;
    while (bin < noBins) {
        const mass = logNormalCdf(edge(bin + 1), mu, sigma) - logNormalCdf(edge(bin), mu, sigma);
        counts.push(n * mass);
        bin += 1;
    }
    return counts;
}
|
client/src/ts/utils/queryHistory.ts
CHANGED
|
@@ -41,10 +41,12 @@ export interface InitQueryHistoryDropdownOptions {
|
|
| 41 |
dropdownId: string;
|
| 42 |
onSelect: () => void;
|
| 43 |
onHistorySelect?: () => void;
|
|
|
|
|
|
|
| 44 |
}
|
| 45 |
|
| 46 |
export function initQueryHistoryDropdown(options: InitQueryHistoryDropdownOptions): void {
|
| 47 |
-
const { input, dropdownId, onSelect, onHistorySelect } = options;
|
| 48 |
if (!input) return;
|
| 49 |
|
| 50 |
const wrapper = input.closest('.semantic-search-input-wrapper');
|
|
@@ -82,6 +84,7 @@ export function initQueryHistoryDropdown(options: InitQueryHistoryDropdownOption
|
|
| 82 |
btn.onclick = (e) => {
|
| 83 |
e.stopPropagation();
|
| 84 |
remove(q);
|
|
|
|
| 85 |
render();
|
| 86 |
};
|
| 87 |
li.appendChild(span);
|
|
|
|
| 41 |
dropdownId: string;
|
| 42 |
onSelect: () => void;
|
| 43 |
onHistorySelect?: () => void;
|
| 44 |
+
/** 删除某条历史时回调,用于同步清理相关缓存 */
|
| 45 |
+
onRemove?: (query: string) => void;
|
| 46 |
}
|
| 47 |
|
| 48 |
export function initQueryHistoryDropdown(options: InitQueryHistoryDropdownOptions): void {
|
| 49 |
+
const { input, dropdownId, onSelect, onHistorySelect, onRemove } = options;
|
| 50 |
if (!input) return;
|
| 51 |
|
| 52 |
const wrapper = input.closest('.semantic-search-input-wrapper');
|
|
|
|
| 84 |
btn.onclick = (e) => {
|
| 85 |
e.stopPropagation();
|
| 86 |
remove(q);
|
| 87 |
+
onRemove?.(q);
|
| 88 |
render();
|
| 89 |
};
|
| 90 |
li.appendChild(span);
|
client/src/ts/utils/semanticResultCache.ts
ADDED
|
@@ -0,0 +1,104 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 语义分析结果缓存:以 text + query + submode 的 hash 为索引,最大 100 条。
|
| 3 |
+
* 持久化到 localStorage,刷新后保留。删除查询历史时需调用 removeByQuery 清理对应缓存。
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
const MAX_SIZE = 100;
|
| 7 |
+
const STORAGE_KEY = 'info_radar_semantic_result_cache';
|
| 8 |
+
|
| 9 |
+
export type SemanticCacheResult = {
|
| 10 |
+
success: boolean;
|
| 11 |
+
model?: string;
|
| 12 |
+
token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>;
|
| 13 |
+
debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] };
|
| 14 |
+
full_match_degree?: number;
|
| 15 |
+
message?: string;
|
| 16 |
+
};
|
| 17 |
+
|
| 18 |
+
type StoredEntry = SemanticCacheResult & { _query?: string };
|
| 19 |
+
|
| 20 |
+
/**
 * 32-bit multiplicative string hash (h = 31·h + code unit, wrapped to int32),
 * returned as an unsigned base-36 string. Not collision-free.
 */
function simpleHash(s: string): string {
    let acc = 0;
    for (let pos = 0; pos < s.length; pos += 1) {
        // Math.imul keeps the product in int32 range, same as (acc << 5) - acc.
        acc = (Math.imul(acc, 31) + s.charCodeAt(pos)) | 0;
    }
    return (acc >>> 0).toString(36);
}
|
| 27 |
+
|
| 28 |
+
/**
 * Cache key for a (text, query, submode) triple: the hash of the three parts
 * joined with NUL separators; a missing submode is treated as ''.
 */
function buildKey(text: string, query: string, submode?: string): string {
    const composite = `${text}\0${query}\0${submode ?? ''}`;
    return simpleHash(composite);
}
|
| 32 |
+
|
| 33 |
+
const cache = new Map<string, StoredEntry>();
|
| 34 |
+
let keyOrder: string[] = [];
|
| 35 |
+
|
| 36 |
+
/**
 * Restore the in-memory cache and its eviction order from localStorage.
 *
 * Missing or malformed storage leaves the cache untouched; any exception
 * (unavailable storage, invalid JSON) resets the cache to empty.
 *
 * The eviction order is rebuilt so that it always matches the cache exactly:
 * - stored keys that have no entry are dropped;
 * - duplicate keys keep only their most recent position;
 * - entries missing from the stored order are kept but treated as oldest, so
 *   they are evicted first instead of lingering forever;
 * - anything beyond MAX_SIZE is evicted, oldest first.
 */
function load(): void {
    try {
        const raw = localStorage.getItem(STORAGE_KEY);
        if (!raw) return;
        const parsed = JSON.parse(raw) as { entries?: Record<string, StoredEntry>; keyOrder?: string[] };
        if (!parsed?.entries || typeof parsed.entries !== 'object') return;

        cache.clear();
        for (const [k, v] of Object.entries(parsed.entries)) {
            if (v && typeof v === 'object') cache.set(k, v);
        }

        // Walk the stored order newest-to-oldest so a duplicated key keeps its latest slot.
        const stored: unknown[] = Array.isArray(parsed.keyOrder) ? parsed.keyOrder : [];
        const seen = new Set<string>();
        const ordered: string[] = [];
        for (let i = stored.length - 1; i >= 0; i--) {
            const k = stored[i];
            if (typeof k === 'string' && cache.has(k) && !seen.has(k)) {
                seen.add(k);
                ordered.push(k);
            }
        }
        ordered.reverse();

        // Entries of unknown age go to the front of the queue (evicted first).
        const untracked = [...cache.keys()].filter((k) => !seen.has(k));
        keyOrder = [...untracked, ...ordered];

        while (keyOrder.length > MAX_SIZE) {
            cache.delete(keyOrder.shift()!);
        }
    } catch {
        cache.clear();
        keyOrder = [];
    }
}
|
| 54 |
+
|
| 55 |
+
load();
|
| 56 |
+
|
| 57 |
+
/**
 * Write the whole cache (entries plus eviction order) to localStorage.
 * Best-effort: storage failures such as QuotaExceededError are ignored.
 */
function persist(): void {
    try {
        const entries: Record<string, StoredEntry> = {};
        cache.forEach((value, key) => {
            entries[key] = value;
        });
        const payload = JSON.stringify({ entries, keyOrder });
        localStorage.setItem(STORAGE_KEY, payload);
    } catch {
        // Deliberately ignored (e.g. QuotaExceededError): the in-memory cache still works.
    }
}
|
| 66 |
+
|
| 67 |
+
/** Drop the oldest entry when the cache already holds MAX_SIZE (or more) keys. */
function evictOne(): void {
    if (keyOrder.length >= MAX_SIZE) {
        const oldest = keyOrder.shift()!;
        cache.delete(oldest);
    }
}
|
| 72 |
+
|
| 73 |
+
/**
 * Look up a cached semantic-analysis result.
 *
 * @returns the stored result without the internal `_query` bookkeeping field,
 *          or undefined on a miss
 */
export function get(text: string, query: string, submode?: string): SemanticCacheResult | undefined {
    const entry = cache.get(buildKey(text, query, submode));
    if (!entry) return undefined;

    const { _query: storedQuery, ...result } = entry;
    // The key is only a 32-bit hash, so two different requests can collide.
    // When the entry records which query produced it, reject a mismatch rather
    // than returning another query's result. Entries without `_query` are
    // still served, as before.
    if (storedQuery !== undefined && storedQuery !== query) return undefined;
    return result;
}
|
| 80 |
+
|
| 81 |
+
/**
 * Store a result for (text, query, submode) and persist the cache.
 * Re-storing an existing key moves it to the newest position; otherwise the
 * oldest entry is evicted first when the cache is full. The originating query
 * is recorded in `_query` so removeByQuery can find the entry later.
 */
export function set(text: string, query: string, result: SemanticCacheResult, submode?: string): void {
    const key = buildKey(text, query, submode);

    const previousPos = cache.has(key) ? keyOrder.indexOf(key) : -1;
    if (previousPos >= 0) {
        keyOrder.splice(previousPos, 1);
    }

    evictOne();
    cache.set(key, { ...result, _query: query });
    keyOrder.push(key);
    persist();
}
|
| 92 |
+
|
| 93 |
+
/**
 * Remove every cached entry that was stored for `query` (matched on the
 * recorded `_query`), then persist if anything was removed.
 */
export function removeByQuery(query: string): void {
    const doomed = Array.from(cache)
        .filter(([, entry]) => entry._query === query)
        .map(([key]) => key);
    if (doomed.length === 0) return;

    doomed.forEach((key) => {
        cache.delete(key);
        const pos = keyOrder.indexOf(key);
        if (pos >= 0) keyOrder.splice(pos, 1);
    });
    persist();
}
|
client/src/ts/utils/signalThresholdDetector.ts
ADDED
|
@@ -0,0 +1,330 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* 信号阈值检测:自动找到「噪声/信号」边界
|
| 3 |
+
*
|
| 4 |
+
* 输入:raw score normed [0,1]
|
| 5 |
+
* 输出:{ threshold, confidence, mu, sigma };无命中时返回 null
|
| 6 |
+
*
|
| 7 |
+
* 算法概要:
|
| 8 |
+
* 1. 迭代 0:用全部样本(P0=1)拟合截尾对数正态 (μ, σ),从 startPercentile 分位 bin 起逐 bin 扫描
|
| 9 |
+
* - 每个 bin [τ_left, τ_right) 左闭右开:obsInBin = 该 bin 内观测计数,expInBin = n × (CDF(τ_right) - CDF(τ_left))
|
| 10 |
+
* - 纯噪声区:信号样本不在 bin 内 → excess ≈ 0
|
| 11 |
+
* - 到信号边界:bin 内出现超额样本 → excess 跃升
|
| 12 |
+
* - 不重叠扫描:bin 边界取相邻点中点,τ_right >= τ_left + MIN_BIN_WIDTH,obsInBin >= MIN_OBSERVED
|
| 13 |
+
* - 误报概率:cumulativeFalsePositiveProbability = ∏(1-Φ(excess_i)),excess>excessMin 时累积,否则重置
|
| 14 |
+
* - 当 cumulativeFalsePositiveProbability <= 1-CONFIDENCE_THRESHOLD 时,取首次命中 bin 的左边界 sorted[j] 为阈值(保守)
|
| 15 |
+
* - 若全程无命中,返回 null
|
| 16 |
+
* 2. 迭代 1..N:用 threshold 以下样本重拟合,再扫描;阈值变化不大则提前结束;任一迭代无命中则返回 null
|
| 17 |
+
*
|
| 18 |
+
* 与现有 lognormalFit 逻辑独立,未来可能替换现有拟合代码
|
| 19 |
+
*/
|
| 20 |
+
|
| 21 |
+
import { fitLogNormalTruncatedMLE, logNormalExpectedCountInInterval, normCdf, LN_EPS } from './lognormalFit';
|
| 22 |
+
import { computeFitQuality } from './fitQuality';
|
| 23 |
+
|
| 24 |
+
/** 置信度阈值,达到此值即判定「确定找到」信号边界;默认 0.9999 */
|
| 25 |
+
const CONFIDENCE_THRESHOLD = 0.9999;
|
| 26 |
+
/** excess 最小阈值,排除无意义随机波动;需 excess > 此值才计为命中 */
|
| 27 |
+
const EXCESS_MIN = 0.1;
|
| 28 |
+
const MIN_OBSERVED = 1; // 每个 bin 至少 N 个观测
|
| 29 |
+
const MIN_BIN_WIDTH = 0.01; // bin 最小宽度;边界取相邻点中点
|
| 30 |
+
const MIN_SAMPLE_SIZE = 20;
|
| 31 |
+
const P0 = 1; // 迭代初始的样本拟合比例
|
| 32 |
+
const MAX_REFINE_ITER = 10;
|
| 33 |
+
const THRESHOLD_CONVERGE_EPS = 0.01; //迭代收敛阈值
|
| 34 |
+
/** 扫描起始分位,默认 0.5(从 50% 分位所在 bin 开始) */
|
| 35 |
+
const START_PERCENTILE_DEFAULT = 0.5;
|
| 36 |
+
/** 输出过滤:最终 confidence 低于此值则返回 null;与 CONFIDENCE_THRESHOLD 不同,后者用于内部扫描判定 */
|
| 37 |
+
const MIN_OUTPUT_CONFIDENCE = 0.9;
|
| 38 |
+
/** expInBin 最小有效值,避免除零或数值不稳定 */
|
| 39 |
+
const EXP_IN_BIN_EPS = 1e-10;
|
| 40 |
+
|
| 41 |
+
/** 内部:evaluateBins 的中间结果,仅 threshold + confidence */
|
| 42 |
+
interface SignalThresholdScanResult {
|
| 43 |
+
threshold: number;
|
| 44 |
+
confidence: number;
|
| 45 |
+
}
|
| 46 |
+
|
| 47 |
+
/** 对外:findSignalThreshold 成功时的完整结果,mu/sigma/bins 必存在 */
|
| 48 |
+
export interface SignalThresholdResult {
|
| 49 |
+
threshold: number;
|
| 50 |
+
/** 0~1,统计置信度:有命中时 1-误报概率 */
|
| 51 |
+
confidence: number;
|
| 52 |
+
/** 最终拟合的 μ(供 histogram 使用) */
|
| 53 |
+
mu: number;
|
| 54 |
+
/** 最终拟合的 σ(供 histogram 使用) */
|
| 55 |
+
sigma: number;
|
| 56 |
+
/** 全范围 bins(供 signal prob 等使用) */
|
| 57 |
+
bins: SignalThresholdBin[];
|
| 58 |
+
}
|
| 59 |
+
|
| 60 |
+
export interface SignalThresholdBin {
|
| 61 |
+
tauLeft: number;
|
| 62 |
+
tauRight: number;
|
| 63 |
+
obsInBin: number;
|
| 64 |
+
expInBin: number;
|
| 65 |
+
}
|
| 66 |
+
|
| 67 |
+
/** 内部:bin 结构(tauLeft/tauRight/obsInBin)仅依赖 sorted,迭代间不变 */
|
| 68 |
+
interface BinStructure {
|
| 69 |
+
tauLeft: number;
|
| 70 |
+
tauRight: number;
|
| 71 |
+
obsInBin: number;
|
| 72 |
+
}
|
| 73 |
+
|
| 74 |
+
const TAU_RIGHT_EPSILON = 1e-6;
|
| 75 |
+
|
| 76 |
+
const PERCENTILE_DIAGNOSTICS = [0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 0.99, 1] as const;
|
| 77 |
+
|
| 78 |
+
/**
 * Standardized excess of a bin: (obs − exp) / sqrt(exp).
 * When the expected count is at or below EXP_IN_BIN_EPS the division is
 * skipped: the result is Infinity if anything was observed, otherwise 0.
 */
function computeExcess(obsInBin: number, expInBin: number): number {
    const expectationTooSmall = expInBin <= EXP_IN_BIN_EPS;
    if (expectationTooSmall) {
        return obsInBin > 0 ? Infinity : 0;
    }
    const deviation = obsInBin - expInBin;
    return deviation / Math.sqrt(expInBin);
}
|
| 83 |
+
|
| 84 |
+
/**
 * Diagnostic output: refit the truncated log-normal on the lowest p-fraction of
 * the scores for each p in PERCENTILE_DIAGNOSTICS and print the resulting
 * (μ, σ), to check that the fit is stable across truncation points.
 * Prints nothing when there are fewer than 2 scores or no fit succeeds.
 */
function logPercentileDiagnostics(scores: number[]): void {
    const ascending = scores.slice().sort((a, b) => a - b);
    const total = ascending.length;
    if (total < 2) return;

    const lines: string[] = [];
    for (const p of PERCENTILE_DIAGNOSTICS) {
        const count = Math.max(1, Math.min(total, Math.round(total * p)));
        // Truncation point: midpoint between the last kept and first dropped
        // sample, or the maximum itself when every sample is kept.
        const tau = count < total
            ? (ascending[count - 1]! + ascending[count]!) / 2
            : ascending[count - 1]!;
        const fit = fitLogNormalTruncatedMLE(ascending.slice(0, count), tau);
        if (fit) {
            lines.push(` p=${p} n=${count}: μ=${fit.mu.toFixed(4)}, σ=${fit.sigma.toFixed(4)}`);
        }
    }
    if (lines.length === 0) return;

    console.log('[signalThreshold] 渐近一致性诊断 (percentile → μ, σ)');
    lines.forEach((line) => console.log(line));
}
|
| 103 |
+
/**
 * Verbose-only helper: print one log row per bin for the complete bin list.
 * Independent of evaluateBins; it only produces console output.
 *
 * For each bin the row shows its edges, observed and expected counts, the
 * excess and normCdf(excess). A bin is a "hit" when excess > excessMin; for
 * consecutive hits the running confidence 1 − ∏(1 − normCdf(excess)) is shown,
 * and any non-hit bin resets that running product.
 *
 * @param bins      bins with observed and expected counts
 * @param excessMin minimum excess for a bin to count as a hit
 */
function printBinScanLogs(bins: SignalThresholdBin[], excessMin: number): void {
    console.log('[signalThreshold] 完整扫描明细 τ_left | τ_right | obsInBin | expInBin | excess | binConf | hit | confidence');
    let cumulativeFalsePositiveProbability = 1;
    for (const bin of bins) {
        const excess = computeExcess(bin.obsInBin, bin.expInBin);
        const hit = excess > excessMin;
        const binConfidence = normCdf(excess);
        // Columns shared by hit and non-hit rows.
        const row = `[signalThreshold] ${bin.tauLeft.toFixed(4)} | ${bin.tauRight.toFixed(4)} | ${String(bin.obsInBin).padStart(7)} | ${bin.expInBin.toFixed(1).padStart(8)} | ${excess.toFixed(2).padStart(6)} | ${binConfidence.toFixed(4)}`;
        if (hit) {
            cumulativeFalsePositiveProbability *= 1 - binConfidence;
            const confidence = 1 - cumulativeFalsePositiveProbability;
            console.log(`${row} | ✓ | ${confidence.toFixed(4)}`);
        } else {
            cumulativeFalsePositiveProbability = 1;
            console.log(`${row} | | -`);
        }
    }
}
|
| 124 |
+
|
| 125 |
+
/**
 * Partition the sorted samples into consecutive, non-overlapping bins
 * [tauLeft, tauRight).
 *
 * The first bin starts just below the smallest sample. Each right edge is a
 * midpoint between two adjacent samples: the first midpoint at least
 * MIN_BIN_WIDTH to the right of the left edge, pushed further right until the
 * bin holds at least MIN_OBSERVED samples. When no such midpoint exists the bin
 * extends to just above the largest sample and is the last one.
 *
 * The result depends only on `sorted`, so it can be reused across refit iterations.
 *
 * @param sorted samples in ascending order
 */
function formBinStructures(sorted: number[]): BinStructure[] {
    const n = sorted.length;
    const largest = sorted[n - 1]!;
    const finalEdge = largest + TAU_RIGHT_EPSILON;

    // mids[i] is the midpoint between sorted[i] and sorted[i + 1].
    const mids = sorted.slice(0, -1).map((v, i) => (v + sorted[i + 1]!) / 2);

    // Index of the first sample >= bound, or -1 when there is none.
    const firstAtLeast = (bound: number): number => sorted.findIndex((v) => v >= bound);
    // Number of samples in [from, to); to < 0 means "through the end".
    const countBetween = (from: number, to: number): number =>
        from < 0 ? 0 : to < 0 ? n - from : to - from;

    const structures: BinStructure[] = [];
    let tauLeft = sorted[0]! - TAU_RIGHT_EPSILON;

    while (tauLeft < largest) {
        let midIdx = mids.findIndex((m) => m >= tauLeft + MIN_BIN_WIDTH);
        let tauRight = midIdx >= 0 ? mids[midIdx]! : finalEdge;

        const leftIdx = firstAtLeast(tauLeft);
        let rightIdx = midIdx >= 0 ? firstAtLeast(tauRight) : -1;
        let obsInBin = countBetween(leftIdx, rightIdx);

        // Widen the bin one midpoint at a time until it holds enough samples.
        while (obsInBin < MIN_OBSERVED && midIdx >= 0 && midIdx < mids.length - 1) {
            midIdx++;
            tauRight = mids[midIdx]!;
            rightIdx = firstAtLeast(tauRight);
            obsInBin = countBetween(leftIdx, rightIdx);
        }

        // Still too few: absorb everything up to the end, or give up.
        if (obsInBin < MIN_OBSERVED) {
            tauRight = finalEdge;
            obsInBin = countBetween(leftIdx, -1);
            if (obsInBin < MIN_OBSERVED) break;
        }

        structures.push({ tauLeft, tauRight, obsInBin });
        tauLeft = tauRight;
        if (tauRight >= finalEdge) break;
    }
    return structures;
}
|
| 160 |
+
|
| 161 |
+
/**
 * Scan the bins for a run of consecutive "hit" bins and derive a threshold.
 *
 * Scanning starts at the bin containing the sample of rank
 * floor((n − 1) · startPercentile), found by accumulating obsInBin. For each
 * scanned bin the expected count under log-normal(μ, σ) is computed, and the
 * bin is a hit when its excess is greater than `excessMin`. Consecutive hits
 * multiply a running false-positive probability by (1 − normCdf(excess)); a
 * non-hit bin resets the run.
 *
 * @returns the left edge of the first bin of the current run together with
 *          confidence = 1 − running probability, returned as soon as the
 *          confidence reaches `confidenceThreshold`; if the scan ends inside a
 *          run, that run is returned with whatever confidence it reached;
 *          null when the scan ends outside a run
 */
function evaluateBins(
    structures: BinStructure[],
    n: number,
    mu: number,
    sigma: number,
    excessMin: number,
    confidenceThreshold: number,
    verbose: boolean,
    startPercentile: number
): SignalThresholdScanResult | null {
    // Find the bin that contains the start-percentile sample.
    const targetRank = Math.min(Math.floor((n - 1) * startPercentile), n - 1);
    let samplesBefore = 0;
    let startIdx = 0;
    for (let i = 0; i < structures.length; i++) {
        const samplesThrough = samplesBefore + structures[i]!.obsInBin;
        if (targetRank < samplesThrough) {
            startIdx = i;
            break;
        }
        samplesBefore = samplesThrough;
    }

    if (verbose) {
        console.log('[signalThreshold] 扫描明细 τ_left | τ_right | obsInBin | expInBin | excess | binConf | hit | confidence');
    }

    let falsePositiveProbability = 1;
    let runStart: number | null = null;

    for (const s of structures.slice(startIdx)) {
        const expInBin = logNormalExpectedCountInInterval(s.tauLeft, s.tauRight, n, mu, sigma);
        const excess = computeExcess(s.obsInBin, expInBin);
        const binConfidence = normCdf(excess);
        // Columns shared by hit and non-hit rows; only built when logging.
        const rowPrefix = (): string =>
            `[signalThreshold] ${s.tauLeft.toFixed(4)} | ${s.tauRight.toFixed(4)} | ${String(s.obsInBin).padStart(7)} | ${expInBin.toFixed(1).padStart(8)} | ${excess.toFixed(2).padStart(6)} | ${binConfidence.toFixed(4)}`;

        if (!(excess > excessMin)) {
            // Not a hit: any run in progress is discarded.
            falsePositiveProbability = 1;
            runStart = null;
            if (verbose) console.log(`${rowPrefix()} | | -`);
            continue;
        }

        if (runStart === null) runStart = s.tauLeft;
        falsePositiveProbability *= 1 - binConfidence;
        const confidence = 1 - falsePositiveProbability;
        if (verbose) console.log(`${rowPrefix()} | ✓ | ${confidence.toFixed(4)}`);
        if (confidence >= confidenceThreshold) {
            return { threshold: runStart, confidence };
        }
    }

    return runStart === null
        ? null
        : { threshold: runStart, confidence: 1 - falsePositiveProbability };
}
|
| 221 |
+
|
| 222 |
+
/**
 * Detect the noise/signal threshold in an array of normalized raw scores.
 *
 * Scores that are not finite numbers above LN_EPS are discarded. Iteration 0
 * fits the truncated log-normal on the lowest P0 fraction of the samples and
 * scans the bins; each later iteration refits on the samples at or below the
 * previous threshold and scans again, stopping once the threshold moves by less
 * than THRESHOLD_CONVERGE_EPS or after MAX_REFINE_ITER refinements.
 *
 * @param rawScoresNormed normalized scores (expected in [0, 1])
 * @param verbose         print detailed logs to the console; default false
 * @returns `{ threshold, confidence, mu, sigma, bins }` on success; null when
 *          there are too few samples, a fit fails, a scan finds no threshold,
 *          or the final confidence is below MIN_OUTPUT_CONFIDENCE
 */
export function findSignalThreshold(
    rawScoresNormed: number[],
    verbose = false
): SignalThresholdResult | null {
    const values = rawScoresNormed.filter(
        (s) => typeof s === 'number' && isFinite(s) && s > LN_EPS
    );
    const sorted = values.slice().sort((a, b) => a - b);
    const n = sorted.length;

    if (n < MIN_SAMPLE_SIZE) {
        if (verbose) console.log('[signalThreshold] 样本不足 n<', MIN_SAMPLE_SIZE, ',跳过');
        return null;
    }

    const splitIdx = Math.max(1, Math.min(n, Math.round(n * P0)));
    if (verbose) console.log('[signalThreshold] n=', n, 'splitIdx=', splitIdx);

    const binStructures = formBinStructures(sorted);
    let result: SignalThresholdScanResult | null = null;
    let lastFit = { mu: 0, sigma: 0 };

    for (let iter = 0; iter <= MAX_REFINE_ITER; iter++) {
        const firstPass = iter === 0;
        if (!firstPass && result === null) return null;
        const previousThreshold = result?.threshold;
        const cutoff = previousThreshold ?? 0;

        // Choose the noise sample and its truncation point for this pass.
        let noiseSamples: number[];
        let tauBoundary: number;
        if (firstPass) {
            noiseSamples = sorted.slice(0, splitIdx);
            tauBoundary = splitIdx < n
                ? (sorted[splitIdx - 1]! + sorted[splitIdx]!) / 2
                : sorted[splitIdx - 1]!;
            if (verbose) {
                const nInit = noiseSamples.length;
                const minN = noiseSamples[0]!, maxN = noiseSamples[nInit - 1]!;
                const midN = noiseSamples[Math.floor(nInit / 2)]!;
                console.log('[signalThreshold] 迭代 0 噪声样本 n=', nInit, 'min=', minN.toFixed(4), 'max=', maxN.toFixed(4), 'median=', midN.toFixed(4));
            }
        } else {
            noiseSamples = sorted.filter((x) => x <= cutoff);
            tauBoundary = cutoff;
            if (noiseSamples.length < MIN_SAMPLE_SIZE) {
                if (verbose) console.log('[signalThreshold] 迭代', iter, '提前结束:噪声样本数<', MIN_SAMPLE_SIZE);
                return null;
            }
        }

        const fit = fitLogNormalTruncatedMLE(noiseSamples, tauBoundary);
        if (fit === null) {
            if (verbose) console.log('[signalThreshold] 迭代', iter, '拟合失败');
            return null;
        }
        lastFit = { mu: fit.mu, sigma: fit.sigma };

        const q = computeFitQuality(noiseSamples, tauBoundary, fit.mu, fit.sigma);
        if (verbose) {
            console.log('[signalThreshold] 迭代', iter, '拟合 μ=', fit.mu.toFixed(4), 'σ=', fit.sigma.toFixed(4), '| maxDiff=', q.maxDiff.toFixed(4), 'RMSE=', q.rmse.toFixed(4));
            if (firstPass) {
                console.log('[signalThreshold] 迭代', iter, '从', (START_PERCENTILE_DEFAULT * 100).toFixed(0), '% 分位 bin 开始扫描 (excess>', EXCESS_MIN, ', confidence>=', CONFIDENCE_THRESHOLD, ')');
            }
        }

        const scanResult = evaluateBins(binStructures, n, fit.mu, fit.sigma, EXCESS_MIN, CONFIDENCE_THRESHOLD, verbose, START_PERCENTILE_DEFAULT);
        if (scanResult === null) {
            if (verbose) console.log('[signalThreshold] 迭代', iter, '未检测到阈值');
            return null;
        }
        result = scanResult;

        if (firstPass || previousThreshold === undefined) {
            if (verbose) {
                console.log('[signalThreshold] 迭代 0 检测到阈值', result.threshold.toFixed(4), 'confidence=', result.confidence.toFixed(2));
            }
            continue;
        }

        // Refinement pass: stop once the threshold has stabilized.
        const delta = Math.abs(result.threshold - previousThreshold);
        if (verbose) {
            console.log('[signalThreshold] 迭代', iter, '新阈值=', result.threshold.toFixed(4), 'confidence=', result.confidence.toFixed(2), 'delta=', delta.toFixed(6));
        }
        if (delta < THRESHOLD_CONVERGE_EPS) {
            if (verbose) console.log('[signalThreshold] 迭代', iter, '收敛,最终阈值=', result.threshold.toFixed(4));
            break;
        }
        if (iter === MAX_REFINE_ITER && verbose) {
            console.log('[signalThreshold] 达到最大迭代次数,最终阈值=', result.threshold.toFixed(4));
        }
    }

    // Expected counts for every bin under the final fit.
    const bins: SignalThresholdBin[] = binStructures.map((s) => ({
        ...s,
        expInBin: logNormalExpectedCountInInterval(s.tauLeft, s.tauRight, n, lastFit.mu, lastFit.sigma),
    }));
    if (verbose && bins.length > 0) {
        printBinScanLogs(bins, EXCESS_MIN);
        logPercentileDiagnostics(values);
    }

    if (result === null) return null;
    if (result.confidence < MIN_OUTPUT_CONFIDENCE) {
        if (verbose) {
            console.warn('[signalThreshold] confidence <', (MIN_OUTPUT_CONFIDENCE * 100).toFixed(0), '%,返回 null。当前 confidence=', result.confidence.toFixed(2));
        }
        return null;
    }
    return { ...result, mu: lastFit.mu, sigma: lastFit.sigma, bins };
}
|
client/src/ts/utils/tokenDisplayUtils.ts
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* Token 文本显示工具:特殊字符可视化、HTML 转义
|
| 3 |
+
* 与 Tooltip、TopK 图表等共享
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
/**
 * HTML-escape `text` by assigning it as the text content of a detached <div>
 * and reading back the element's serialized innerHTML. Requires a DOM.
 */
function escapeHtmlImpl(text: string): string {
    const scratch = document.createElement('div');
    scratch.textContent = text;
    const escaped = scratch.innerHTML;
    return escaped;
}
|
| 11 |
+
|
| 12 |
+
/** True when `char` contains a character with the Unicode White_Space property. */
function isWhitespaceChar(char: string): boolean {
    const whiteSpace = /\p{White_Space}/u;
    return whiteSpace.test(char);
}
|
| 15 |
+
|
| 16 |
+
/**
 * Decide whether a character can be shown as-is.
 *
 * Whitespace is never printable. Otherwise the first code point of `char` must
 * fall into one of the allow-listed ranges below; everything else (control
 * characters, unlisted scripts, astral code points) is reported as not printable.
 */
function isPrintableChar(char: string): boolean {
    if (isWhitespaceChar(char)) return false;

    const codePoint = char.codePointAt(0);
    if (codePoint === undefined) return false;

    // Inclusive [first, last] code point ranges treated as printable.
    const printableRanges: ReadonlyArray<readonly [number, number]> = [
        [32, 126],          // printable ASCII
        [0x00A0, 0x00FF],   // Latin-1 Supplement
        [0x0100, 0x017F],   // Latin Extended-A
        [0x0180, 0x024F],   // Latin Extended-B
        [0x2000, 0x206F],   // General Punctuation
        [0x2070, 0x209F],   // Superscripts and Subscripts
        [0x20A0, 0x20CF],   // Currency Symbols
        [0x2100, 0x214F],   // Letterlike Symbols
        [0x2190, 0x21FF],   // Arrows
        [0x2200, 0x22FF],   // Mathematical Operators
        [0x2300, 0x23FF],   // Miscellaneous Technical
        [0x2400, 0x243F],   // Control Pictures
        [0x2E00, 0x2E7F],   // Supplemental Punctuation
        [0x3000, 0x303F],   // CJK Symbols and Punctuation
        [0x3040, 0x309F],   // Hiragana
        [0x30A0, 0x30FF],   // Katakana
        [0x4E00, 0x9FFF],   // CJK Unified Ideographs
        [0xAC00, 0xD7AF],   // Hangul Syllables
        [0xF900, 0xFAFF],   // CJK Compatibility Ideographs
        [0xFF00, 0xFFEF],   // Halfwidth and Fullwidth Forms
    ];
    return printableRanges.some(([first, last]) => codePoint >= first && codePoint <= last);
}
|
| 44 |
+
|
| 45 |
+
/**
 * Make whitespace and non-printable characters of a token visible.
 *
 * Replacements: CRLF -> `[CRLF]`, LF -> `[LF]`, CR -> `[CR]`, TAB -> `[TAB]`,
 * ideographic space U+3000 -> `[FS]`, ASCII space -> `·`. Any other character
 * rejected by `isPrintableChar` becomes `[U+XXXX]` (upper-case hex, at least
 * four digits).
 *
 * The input is processed in a single pass over whole code points, so:
 *  - a character outside the BMP yields one escape (e.g. `[U+1F600]`) instead
 *    of two surrogate escapes;
 *  - a literal `[` in the input no longer switches visualization off until the
 *    next `]`, because placeholders are emitted directly and never re-scanned.
 *
 * @param text raw token text
 * @returns the text with special characters replaced by visible markers
 */
function visualizeSpecialCharsImpl(text: string): string {
  const placeholders = new Map<string, string>([
    ['\r\n', '[CRLF]'],
    ['\n', '[LF]'],
    ['\r', '[CR]'],
    ['\t', '[TAB]'],
    ['\u3000', '[FS]'],
    [' ', '·'],
  ]);

  // Split into CRLF pairs and single code points. With the `u` flag `[\s\S]`
  // consumes a full surrogate pair; a lone surrogate is matched on its own.
  const units = text.match(/\r\n|[\s\S]/gu) ?? [];

  return units
    .map((unit) => {
      const placeholder = placeholders.get(unit);
      if (placeholder !== undefined) return placeholder;
      if (isPrintableChar(unit)) return unit;
      const codePoint = unit.codePointAt(0);
      if (codePoint === undefined) return unit;
      const hexCode = codePoint.toString(16).toUpperCase().padStart(4, '0');
      return `[U+${hexCode}]`;
    })
    .join('');
}
|
| 83 |
+
|
| 84 |
+
/**
 * Prepare candidate-token text for display, using the same pipeline as the
 * main token: visualize special characters first, then HTML-escape the result.
 */
export function processCandidateText(text: string): string {
  const visualized = visualizeSpecialCharsImpl(text);
  return escapeHtmlImpl(visualized);
}
|
| 88 |
+
|
| 89 |
+
/** HTML-escape `text` (public wrapper around the module-private implementation). */
export function escapeHtml(text: string): string {
  const escaped = escapeHtmlImpl(text);
  return escaped;
}
|
| 93 |
+
|
| 94 |
+
/** Replace special characters in `text` with visible markers (public wrapper around the module-private implementation). */
export function visualizeSpecialChars(text: string): string {
  const visualized = visualizeSpecialCharsImpl(text);
  return visualized;
}
|
client/src/ts/utils/topkChartUtils.ts
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
/**
|
| 2 |
+
* TopK 图表渲染工具:与 Tooltip 中的 topk 图表完全一致
|
| 3 |
+
* 供 Tooltip 和语义分析 debug info 复用
|
| 4 |
+
*/
|
| 5 |
+
|
| 6 |
+
import * as d3 from 'd3';
|
| 7 |
+
import { processCandidateText } from './tokenDisplayUtils';
|
| 8 |
+
|
| 9 |
+
/** Maximum number of candidate rows rendered in one chart. */
const DISPLAY_TOPK = 10;
/** Default maximum bar width in px (the tooltip case, where no size option is passed). */
const MAX_BAR_WIDTH = 60;
/** Semantic-debug only: maximum bar width in px; tooltip rendering is unaffected. */
const SEMANTIC_DEBUG_MAX_BAR = 100;
/** Semantic-debug only: width in px of the table cell that holds the bar. */
const SEMANTIC_DEBUG_BAR_CELL = 180;
|
| 15 |
+
|
| 16 |
+
/** Optional rendering overrides for the TopK chart helpers; every field falls back to a default when omitted. */
export interface TopkChartOptions {
  /** Token to highlight: a row whose token equals this value is drawn in `selectedColor`. */
  selectedToken?: string;
  /** Color of ordinary rows (defaults to the theme color). */
  normalColor?: string;
  /** Color of the highlighted row (defaults to the theme color). */
  selectedColor?: string;
  /** Maximum bar width in px. */
  maxBarWidth?: number;
  /** Width in px of the table cell that contains the bar. */
  barCellWidth?: number;
  /** Formatter for the probability printed next to each bar. */
  numFormat?: (n: number) => string;
}
|
| 27 |
+
|
| 28 |
+
/**
 * Default chart colors for the active theme.
 * The theme is read from the `data-theme` attribute of the document root;
 * any value other than `dark` selects the light palette.
 */
function getThemeColors(): { normalColor: string; selectedColor: string } {
  const theme = document.documentElement.getAttribute('data-theme');
  if (theme === 'dark') {
    return { normalColor: '#ccc', selectedColor: '#ff6666' };
  }
  return { normalColor: '#333', selectedColor: '#933' };
}
|
| 35 |
+
|
| 36 |
+
/**
 * Build the TopK chart rows as HTML, using the same markup as the tooltip chart.
 *
 * Each row holds a bar whose width is proportional to the probability, the
 * formatted probability, and the candidate token text (special characters
 * visualized, HTML-escaped). At most `DISPLAY_TOPK` rows are produced.
 *
 * @param data candidates; the first `DISPLAY_TOPK` entries are rendered in the given order
 * @param options optional color / size / number-format overrides
 * @returns concatenated row markup without a container, or '' when `data` is empty
 */
export function renderTopkChartHtml(
  data: Array<{ token: string; prob: number }>,
  options?: TopkChartOptions
): string {
  if (!data.length) return '';

  const themeColors = getThemeColors();
  const norm = options?.normalColor ?? themeColors.normalColor;
  const sel = options?.selectedColor ?? themeColors.selectedColor;
  const maxBar = options?.maxBarWidth ?? MAX_BAR_WIDTH;
  const numF = options?.numFormat ?? d3.format('.3f');
  const barCellW = options?.barCellWidth ?? 110;

  const shown = data.slice(0, DISPLAY_TOPK);

  // Scale against the largest finite, positive probability actually displayed
  // rather than trusting data[0]: a zero head would give a degenerate [0, 0]
  // domain (d3 then maps every value to the middle of the range) and unsorted
  // input would produce bars wider than the cell. For input sorted in
  // descending order with a positive head the result is unchanged.
  const positiveProbs = shown.map((d) => d.prob).filter((p) => Number.isFinite(p) && p > 0);
  const maxProb = positiveProbs.length > 0 ? Math.max(...positiveProbs) : 1;
  const scale = d3.scaleLinear().domain([0, maxProb]).range([0, maxBar]).clamp(true);

  const rows = shown.map((d) => {
    const isSelected = options?.selectedToken !== undefined && d.token === options.selectedToken;
    const color = isSelected ? sel : norm;
    // A non-finite probability would otherwise emit an invalid `width: NaNpx`.
    const barWidth = Number.isFinite(d.prob) ? scale(d.prob) : 0;
    const bar = `<div style="display: table-cell; width:${barCellW}px;padding-left:5px;">` +
      `<div style="display:inline-block;width: ${barWidth}px;background-color:${color};height: 10px;"></div>` +
      ` <div style="display:inline-block;color: ${color};">${numF(d.prob)}</div></div>`;
    const text = `<div style="display: table-cell;color: ${color};padding-right:5px;">${processCandidateText(d.token)}</div>`;
    return `<div class="row" style="display: table-row;">${bar} ${text}</div>`;
  });

  return rows.join('');
}
|
| 64 |
+
|
| 65 |
+
/**
 * Build a complete TopK chart (rows plus container) for stand-alone display,
 * e.g. the semantic debug panel. Bar and cell widths default to the larger
 * semantic-debug sizes unless the caller supplies them; the tooltip calls
 * `renderTopkChartHtml` directly and passes no size options.
 *
 * @returns the wrapped chart markup, or '' when there are no rows
 */
export function renderTopkChartFullHtml(data: Array<{ token: string; prob: number }>, options?: TopkChartOptions): string {
  const base: TopkChartOptions = options ?? {};
  const maxBarWidth = base.maxBarWidth ?? SEMANTIC_DEBUG_MAX_BAR;
  const barCellWidth = base.barCellWidth ?? SEMANTIC_DEBUG_BAR_CELL;
  const rowsHtml = renderTopkChartHtml(data, { ...base, maxBarWidth, barCellWidth });
  if (!rowsHtml) return '';
  return `<div class="semantic-debug-topk-chart predictions predictions-table">${rowsHtml}</div>`;
}
|
client/src/ts/utils/visualizationConfigs.ts
CHANGED
|
@@ -1,4 +1,7 @@
|
|
| 1 |
import { tr } from '../lang/i18n-lite';
|
|
|
|
|
|
|
|
|
|
| 2 |
|
| 3 |
/**
|
| 4 |
* 直方图基础配置类型
|
|
@@ -6,12 +9,14 @@ import { tr } from '../lang/i18n-lite';
|
|
| 6 |
export interface HistogramBaseConfig {
|
| 7 |
label: string;
|
| 8 |
no_bins: number;
|
| 9 |
-
extent:
|
| 10 |
-
averageLabel: string;
|
| 11 |
showLeftInfinity?: boolean;
|
| 12 |
showRightInfinity?: boolean;
|
| 13 |
xAxisTickSkip?: number;
|
| 14 |
-
|
|
|
|
|
|
|
| 15 |
}
|
| 16 |
|
| 17 |
/**
|
|
@@ -51,7 +56,8 @@ export const getByteSurprisalHistogramConfig = (): HistogramBaseConfig => ({
|
|
| 51 |
export const getDeltaByteSurprisalHistogramConfig = (): HistogramBaseConfig => ({
|
| 52 |
label: tr("Δinformation per byte histogram"),
|
| 53 |
no_bins: 20,
|
| 54 |
-
xAxisTickSkip: 1,
|
|
|
|
| 55 |
extent: [-5, 5],
|
| 56 |
averageLabel: tr("Δ bits/Byte"),
|
| 57 |
showLeftInfinity: true,
|
|
@@ -68,14 +74,14 @@ export const getSurprisalProgressConfig = (): ScatterPlotBaseConfig => ({
|
|
| 68 |
});
|
| 69 |
|
| 70 |
/**
|
| 71 |
-
* 获取
|
| 72 |
*/
|
| 73 |
-
export const
|
| 74 |
-
label: tr("semantic score histogram"),
|
| 75 |
no_bins: 20,
|
| 76 |
xAxisTickSkip: 1,
|
|
|
|
| 77 |
extent: [0, 1],
|
| 78 |
-
averageLabel: tr("score"),
|
| 79 |
yScaleType: 'sqrt',
|
| 80 |
});
|
| 81 |
|
|
|
|
| 1 |
import { tr } from '../lang/i18n-lite';
|
| 2 |
+
import type { HistogramExtent, HistogramExtentBound } from '../vis/Histogram';
|
| 3 |
+
|
| 4 |
+
export type { HistogramExtent, HistogramExtentBound };
|
| 5 |
|
| 6 |
/**
|
| 7 |
* 直方图基础配置类型
|
|
|
|
| 9 |
export interface HistogramBaseConfig {
|
| 10 |
label: string;
|
| 11 |
no_bins: number;
|
| 12 |
+
extent: HistogramExtent;
|
| 13 |
+
averageLabel?: string;
|
| 14 |
showLeftInfinity?: boolean;
|
| 15 |
showRightInfinity?: boolean;
|
| 16 |
xAxisTickSkip?: number;
|
| 17 |
+
/** x轴刻度凑整:true=仅显示 step 整数倍处的标签,false/undefined=显示全部 */
|
| 18 |
+
xAxisTickRound?: boolean;
|
| 19 |
+
yScaleType?: 'linear' | 'sqrt' | 'log';
|
| 20 |
}
|
| 21 |
|
| 22 |
/**
|
|
|
|
| 56 |
export const getDeltaByteSurprisalHistogramConfig = (): HistogramBaseConfig => ({
|
| 57 |
label: tr("Δinformation per byte histogram"),
|
| 58 |
no_bins: 20,
|
| 59 |
+
xAxisTickSkip: 1,
|
| 60 |
+
xAxisTickRound: true,
|
| 61 |
extent: [-5, 5],
|
| 62 |
averageLabel: tr("Δ bits/Byte"),
|
| 63 |
showLeftInfinity: true,
|
|
|
|
| 74 |
});
|
| 75 |
|
| 76 |
/**
|
| 77 |
+
* 获取 Raw score normed 直方图配置(归一化 0-1)
|
| 78 |
*/
|
| 79 |
+
export const getRawScoreNormedHistogramConfig = (): HistogramBaseConfig => ({
|
| 80 |
+
label: tr("semantic raw score histogram"),
|
| 81 |
no_bins: 20,
|
| 82 |
xAxisTickSkip: 1,
|
| 83 |
+
xAxisTickRound: true,
|
| 84 |
extent: [0, 1],
|
|
|
|
| 85 |
yScaleType: 'sqrt',
|
| 86 |
});
|
| 87 |
|
client/src/ts/utils/visualizationUpdater.ts
CHANGED
|
@@ -23,18 +23,38 @@ import {
|
|
| 23 |
} from './dataValidation';
|
| 24 |
import {
|
| 25 |
calculateTextStats,
|
|
|
|
| 26 |
type TextStats
|
| 27 |
} from './textStatistics';
|
| 28 |
import {
|
| 29 |
getTokenSurprisalHistogramConfig,
|
| 30 |
getSurprisalProgressConfig,
|
| 31 |
-
|
| 32 |
} from "./visualizationConfigs";
|
| 33 |
import { isFiniteNumber } from './Util';
|
| 34 |
-
import { getSemanticSimilarityColor } from './SurprisalColorConfig';
|
| 35 |
import { showAlertDialog } from '../ui/dialog';
|
| 36 |
import { tr } from '../lang/i18n-lite';
|
|
|
|
|
|
|
| 37 |
import { getSemanticAnalysisEnabled } from './semanticAnalysisManager';
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
/**
|
| 40 |
* 可视化更新依赖
|
|
@@ -44,7 +64,7 @@ export interface VisualizationDependencies {
|
|
| 44 |
highlightController: HighlightController;
|
| 45 |
textInputController: TextInputController;
|
| 46 |
stats_frac: Histogram;
|
| 47 |
-
|
| 48 |
stats_surprisal_progress: ScatterPlot;
|
| 49 |
appStateManager: AppStateManager;
|
| 50 |
surprisalColorScale: d3.ScaleSequential<string>;
|
|
@@ -140,7 +160,7 @@ export class VisualizationUpdater {
|
|
| 140 |
/**
|
| 141 |
* 计算展示结果:仅信息密度 / 仅语义 / 联合(两者一致时)
|
| 142 |
*/
|
| 143 |
-
private computeDisplayResult(): (FrontendAnalyzeResult & {
|
| 144 |
const info = this.currentState.infoDensityData;
|
| 145 |
const sem = this.currentState.semanticData;
|
| 146 |
const infoResult = info?.result as FrontendAnalyzeResult | undefined;
|
|
@@ -175,7 +195,7 @@ export class VisualizationUpdater {
|
|
| 175 |
mergedTokens: filteredMerged,
|
| 176 |
bpe_strings: filteredMerged,
|
| 177 |
originalToMergedMap: filteredOriginalToMergedMap,
|
| 178 |
-
|
| 179 |
attentionRawScores: scores,
|
| 180 |
};
|
| 181 |
}
|
|
@@ -194,7 +214,7 @@ export class VisualizationUpdater {
|
|
| 194 |
const trimmed = text.trim();
|
| 195 |
const tokenHistogramItem = document.getElementById('token_histogram_item');
|
| 196 |
const surprisalProgressItem = document.getElementById('surprisal_progress_item');
|
| 197 |
-
const
|
| 198 |
|
| 199 |
const infoText = (this.currentState.infoDensityData?.request?.text ?? '').trim();
|
| 200 |
const semText = (this.currentState.semanticData?.text ?? '').trim();
|
|
@@ -212,7 +232,7 @@ export class VisualizationUpdater {
|
|
| 212 |
|
| 213 |
if (tokenHistogramItem) tokenHistogramItem.style.display = showInfoDensity ? '' : 'none';
|
| 214 |
if (surprisalProgressItem) surprisalProgressItem.style.display = showInfoDensity ? '' : 'none';
|
| 215 |
-
if (
|
| 216 |
|
| 217 |
// pending 时渲染空统计图(坐标轴 + 空柱体/散点),避免空白
|
| 218 |
if (showInfoDensity && mode === 'infoDensity') {
|
|
@@ -226,24 +246,25 @@ export class VisualizationUpdater {
|
|
| 226 |
if (progressTitle && progressConfig.label) progressTitle.textContent = progressConfig.label;
|
| 227 |
}
|
| 228 |
if (showSemantic && mode === 'semantic') {
|
| 229 |
-
const
|
| 230 |
-
this.deps.
|
| 231 |
-
const titleEl = document.getElementById('
|
| 232 |
-
if (titleEl) titleEl.textContent =
|
| 233 |
}
|
| 234 |
}
|
| 235 |
|
| 236 |
/**
|
| 237 |
* 重新渲染直方图(内部方法)
|
| 238 |
-
* 仅信息密度:只显示 token/surprisal progress;仅语义:只显示
|
|
|
|
| 239 |
*/
|
| 240 |
-
private rerenderHistogramsInternal(): void {
|
| 241 |
const hasInfoDensity = !!this.currentState.infoDensityData;
|
| 242 |
const displayResult = this.computeDisplayResult();
|
| 243 |
|
| 244 |
const tokenHistogramItem = document.getElementById('token_histogram_item');
|
| 245 |
const surprisalProgressItem = document.getElementById('surprisal_progress_item');
|
| 246 |
-
const
|
| 247 |
|
| 248 |
if (hasInfoDensity) {
|
| 249 |
const currentSurprisals = this.currentState.currentSurprisals;
|
|
@@ -257,6 +278,7 @@ export class VisualizationUpdater {
|
|
| 257 |
colorScale: this.deps.surprisalColorScale,
|
| 258 |
averageValue: currentTokenAvg ?? undefined,
|
| 259 |
p90Value: currentTokenP90 ?? undefined,
|
|
|
|
| 260 |
});
|
| 261 |
const titleElement = document.getElementById('token_histogram_title');
|
| 262 |
if (titleElement) titleElement.textContent = tokenHistogramConfig.label;
|
|
@@ -279,31 +301,125 @@ export class VisualizationUpdater {
|
|
| 279 |
if (surprisalProgressItem) surprisalProgressItem.style.display = 'none';
|
| 280 |
}
|
| 281 |
|
| 282 |
-
const
|
| 283 |
-
const
|
| 284 |
-
if (
|
| 285 |
-
const
|
| 286 |
-
const
|
| 287 |
-
|
| 288 |
-
|
| 289 |
-
|
| 290 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 291 |
colorScale,
|
| 292 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 293 |
});
|
| 294 |
-
const titleEl = document.getElementById('
|
| 295 |
-
if (titleEl) titleEl.textContent =
|
| 296 |
-
if (
|
| 297 |
-
|
| 298 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 299 |
}
|
| 300 |
}
|
| 301 |
|
| 302 |
-
/**
|
| 303 |
-
* 重新渲染直方图(公开方法,供外部调用,如主题切换时)
|
| 304 |
-
*/
|
| 305 |
public rerenderHistograms(): void {
|
| 306 |
-
this.rerenderHistogramsInternal();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 307 |
}
|
| 308 |
|
| 309 |
/**
|
|
@@ -486,8 +602,7 @@ export class VisualizationUpdater {
|
|
| 486 |
res: {
|
| 487 |
model?: string;
|
| 488 |
token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>;
|
| 489 |
-
|
| 490 |
-
debug_top10?: Array<{ token: string; prob: number }>;
|
| 491 |
},
|
| 492 |
text?: string
|
| 493 |
): void {
|
|
@@ -523,12 +638,16 @@ export class VisualizationUpdater {
|
|
| 523 |
enableRenderAnimation: false,
|
| 524 |
semanticAnalysisMode: getSemanticAnalysisEnabled(),
|
| 525 |
}, false);
|
| 526 |
-
this.deps.lmf.update(displayResult);
|
| 527 |
this.clearHighlights();
|
|
|
|
| 528 |
this.rerenderHistogramsInternal();
|
| 529 |
this.syncSemanticUiFromConfig();
|
| 530 |
|
| 531 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 532 |
}
|
| 533 |
|
| 534 |
/** 更新文本渲染区下方的 debug 信息(abbrev + top10) */
|
|
@@ -553,6 +672,7 @@ export class VisualizationUpdater {
|
|
| 553 |
if (top10?.length) {
|
| 554 |
const items = top10.map((t) => `'${esc(t.token)}(${(t.prob * 100).toFixed(1)}%)'`);
|
| 555 |
parts.push(`<div class="semantic-debug-top10">[${items.join(', ')}]</div>`);
|
|
|
|
| 556 |
}
|
| 557 |
el.innerHTML = parts.join('');
|
| 558 |
}
|
|
@@ -561,7 +681,7 @@ export class VisualizationUpdater {
|
|
| 561 |
res: { model?: string },
|
| 562 |
tokenAttention: Array<{ offset: [number, number]; raw: string; score: number }>,
|
| 563 |
text: string
|
| 564 |
-
): (FrontendAnalyzeResult & {
|
| 565 |
const safeText = text.trim();
|
| 566 |
if (!safeText) return null;
|
| 567 |
const syntheticTokens = tokenAttention.map((t) => ({
|
|
@@ -583,7 +703,7 @@ export class VisualizationUpdater {
|
|
| 583 |
mergedTokens,
|
| 584 |
originalToMergedMap,
|
| 585 |
originalText: safeText,
|
| 586 |
-
|
| 587 |
attentionRawScores: scores,
|
| 588 |
};
|
| 589 |
}
|
|
@@ -637,7 +757,7 @@ export class VisualizationUpdater {
|
|
| 637 |
}
|
| 638 |
|
| 639 |
/**
|
| 640 |
-
* 将 score 归一化到 [0,1] 用于染色(0-max 归一化:norm =
|
| 641 |
* NaN/Inf 不参与 max 计算,映射为 0
|
| 642 |
*/
|
| 643 |
private normalizeScoresForColor(scores: number[]): number[] {
|
|
|
|
| 23 |
} from './dataValidation';
|
| 24 |
import {
|
| 25 |
calculateTextStats,
|
| 26 |
+
computeP90,
|
| 27 |
type TextStats
|
| 28 |
} from './textStatistics';
|
| 29 |
import {
|
| 30 |
getTokenSurprisalHistogramConfig,
|
| 31 |
getSurprisalProgressConfig,
|
| 32 |
+
getRawScoreNormedHistogramConfig
|
| 33 |
} from "./visualizationConfigs";
|
| 34 |
import { isFiniteNumber } from './Util';
|
| 35 |
+
import { getSemanticSimilarityColor, HISTOGRAM_MIN_ALPHA } from './SurprisalColorConfig';
|
| 36 |
import { showAlertDialog } from '../ui/dialog';
|
| 37 |
import { tr } from '../lang/i18n-lite';
|
| 38 |
+
import { computeExpectedCounts } from './lognormalFit';
|
| 39 |
+
import { findSignalThreshold, type SignalThresholdBin } from './signalThresholdDetector';
|
| 40 |
import { getSemanticAnalysisEnabled } from './semanticAnalysisManager';
|
| 41 |
+
import { renderTopkChartFullHtml } from './topkChartUtils';
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
/**
 * Per-sample signal probability P(signal | raw_score_normed = s), reusing the
 * bins produced by findSignalThreshold.
 *
 * Each score is assigned to a bin by its left edge; the probability is the
 * bin's excess of observed over expected counts,
 * (obsInBin - expInBin) / obsInBin, clamped to [0, 1].
 * A score that falls outside its bin's half-open interval [tauLeft, tauRight),
 * or a bin without observations, yields 0.
 *
 * Assumes `bins` is sorted by ascending `tauLeft` (required by the bisect
 * lookup) - TODO confirm against findSignalThreshold.
 * NOTE(review): a score exactly equal to the last bin's `tauRight` maps to 0
 * because the interval is half-open; verify this is intended for the maximum score.
 *
 * @returns one probability per input score, or [] when either input is empty
 */
function signalProbFromBins(scores: number[], bins: SignalThresholdBin[]): number[] {
  if (scores.length === 0 || bins.length === 0) return [];
  const leftEdges = bins.map((bin) => bin.tauLeft);
  const lastIndex = bins.length - 1;
  return scores.map((score) => {
    const located = d3.bisectRight(leftEdges, score) - 1;
    const index = Math.max(0, Math.min(lastIndex, located));
    const bin = bins[index]!;
    const outsideBin = score < bin.tauLeft || score >= bin.tauRight;
    if (outsideBin) return 0;
    if (!(bin.obsInBin > 0)) return 0;
    const excessFraction = (bin.obsInBin - bin.expInBin) / bin.obsInBin;
    return Math.max(0, Math.min(1, excessFraction));
  });
}
|
| 58 |
|
| 59 |
/**
|
| 60 |
* 可视化更新依赖
|
|
|
|
| 64 |
highlightController: HighlightController;
|
| 65 |
textInputController: TextInputController;
|
| 66 |
stats_frac: Histogram;
|
| 67 |
+
stats_raw_score_normed: Histogram;
|
| 68 |
stats_surprisal_progress: ScatterPlot;
|
| 69 |
appStateManager: AppStateManager;
|
| 70 |
surprisalColorScale: d3.ScaleSequential<string>;
|
|
|
|
| 160 |
/**
|
| 161 |
* 计算展示结果:仅信息密度 / 仅语义 / 联合(两者一致时)
|
| 162 |
*/
|
| 163 |
+
private computeDisplayResult(): (FrontendAnalyzeResult & { rawScoresNormed?: number[]; attentionRawScores?: number[] }) | null {
|
| 164 |
const info = this.currentState.infoDensityData;
|
| 165 |
const sem = this.currentState.semanticData;
|
| 166 |
const infoResult = info?.result as FrontendAnalyzeResult | undefined;
|
|
|
|
| 195 |
mergedTokens: filteredMerged,
|
| 196 |
bpe_strings: filteredMerged,
|
| 197 |
originalToMergedMap: filteredOriginalToMergedMap,
|
| 198 |
+
rawScoresNormed: this.normalizeScoresForColor(scores),
|
| 199 |
attentionRawScores: scores,
|
| 200 |
};
|
| 201 |
}
|
|
|
|
| 214 |
const trimmed = text.trim();
|
| 215 |
const tokenHistogramItem = document.getElementById('token_histogram_item');
|
| 216 |
const surprisalProgressItem = document.getElementById('surprisal_progress_item');
|
| 217 |
+
const rawScoreNormedItem = document.getElementById('raw_score_normed_histogram_item');
|
| 218 |
|
| 219 |
const infoText = (this.currentState.infoDensityData?.request?.text ?? '').trim();
|
| 220 |
const semText = (this.currentState.semanticData?.text ?? '').trim();
|
|
|
|
| 232 |
|
| 233 |
if (tokenHistogramItem) tokenHistogramItem.style.display = showInfoDensity ? '' : 'none';
|
| 234 |
if (surprisalProgressItem) surprisalProgressItem.style.display = showInfoDensity ? '' : 'none';
|
| 235 |
+
if (rawScoreNormedItem) rawScoreNormedItem.style.display = showSemantic ? '' : 'none';
|
| 236 |
|
| 237 |
// pending 时渲染空统计图(坐标轴 + 空柱体/散点),避免空白
|
| 238 |
if (showInfoDensity && mode === 'infoDensity') {
|
|
|
|
| 246 |
if (progressTitle && progressConfig.label) progressTitle.textContent = progressConfig.label;
|
| 247 |
}
|
| 248 |
if (showSemantic && mode === 'semantic') {
|
| 249 |
+
const rawScoreNormedConfig = getRawScoreNormedHistogramConfig();
|
| 250 |
+
this.deps.stats_raw_score_normed.update({ ...rawScoreNormedConfig, data: [], colorScale: () => 'transparent' });
|
| 251 |
+
const titleEl = document.getElementById('raw_score_normed_histogram_title');
|
| 252 |
+
if (titleEl) titleEl.textContent = rawScoreNormedConfig.label;
|
| 253 |
}
|
| 254 |
}
|
| 255 |
|
| 256 |
/**
|
| 257 |
* 重新渲染直方图(内部方法)
|
| 258 |
+
* 仅信息密度:只显示 token/surprisal progress;仅语义:只显示 raw score normed;联合:全部显示
|
| 259 |
+
* @param skipLmfUpdate 为 true 时跳过 lmf.update(主题切换时由 rerenderOnThemeChange 统一重绘,避免竞态)
|
| 260 |
*/
|
| 261 |
+
private rerenderHistogramsInternal(skipLmfUpdate = false): void {
|
| 262 |
const hasInfoDensity = !!this.currentState.infoDensityData;
|
| 263 |
const displayResult = this.computeDisplayResult();
|
| 264 |
|
| 265 |
const tokenHistogramItem = document.getElementById('token_histogram_item');
|
| 266 |
const surprisalProgressItem = document.getElementById('surprisal_progress_item');
|
| 267 |
+
const rawScoreNormedItem = document.getElementById('raw_score_normed_histogram_item');
|
| 268 |
|
| 269 |
if (hasInfoDensity) {
|
| 270 |
const currentSurprisals = this.currentState.currentSurprisals;
|
|
|
|
| 278 |
colorScale: this.deps.surprisalColorScale,
|
| 279 |
averageValue: currentTokenAvg ?? undefined,
|
| 280 |
p90Value: currentTokenP90 ?? undefined,
|
| 281 |
+
p90Label: tokenHistogramConfig.averageLabel,
|
| 282 |
});
|
| 283 |
const titleElement = document.getElementById('token_histogram_title');
|
| 284 |
if (titleElement) titleElement.textContent = tokenHistogramConfig.label;
|
|
|
|
| 301 |
if (surprisalProgressItem) surprisalProgressItem.style.display = 'none';
|
| 302 |
}
|
| 303 |
|
| 304 |
+
const rawScoresNormed = displayResult?.rawScoresNormed;
|
| 305 |
+
const validRawScoresNormed = rawScoresNormed?.filter((s) => typeof s === 'number' && isFinite(s));
|
| 306 |
+
if (validRawScoresNormed && validRawScoresNormed.length > 0) {
|
| 307 |
+
const rawScoreNormedConfig = getRawScoreNormedHistogramConfig();
|
| 308 |
+
const colorScale = (v: number) => getSemanticSimilarityColor(v, HISTOGRAM_MIN_ALPHA);
|
| 309 |
+
// 默认关闭 verbose;浏览器控制台执行 window.signalThresholdVerbose = true 后重新搜索可开启
|
| 310 |
+
const verbose = !!(window as Window & { signalThresholdVerbose?: boolean }).signalThresholdVerbose;
|
| 311 |
+
const signalThresholdResult = findSignalThreshold(validRawScoresNormed, verbose);
|
| 312 |
+
console.log('[signalThreshold] 最终结果:', signalThresholdResult !== null
|
| 313 |
+
? (() => {
|
| 314 |
+
const t = signalThresholdResult!.threshold;
|
| 315 |
+
const below = validRawScoresNormed.filter((s) => s < t).length;
|
| 316 |
+
const quantile = validRawScoresNormed.length > 0 ? (below / validRawScoresNormed.length) : 0;
|
| 317 |
+
return `threshold=${t.toFixed(4)} confidence=${signalThresholdResult!.confidence.toFixed(2)} quantile=${quantile.toFixed(4)} (${below}/${validRawScoresNormed.length} below)`;
|
| 318 |
+
})()
|
| 319 |
+
: 'null(无信号)');
|
| 320 |
+
if (!verbose) {
|
| 321 |
+
console.log('[signalThreshold] 提示:控制台执行 window.signalThresholdVerbose = true 后重新搜索可查看完整 bin 扫描日志');
|
| 322 |
+
}
|
| 323 |
+
const fitResult = validRawScoresNormed.length >= 2 && signalThresholdResult != null
|
| 324 |
+
? {
|
| 325 |
+
mu: signalThresholdResult.mu,
|
| 326 |
+
sigma: signalThresholdResult.sigma,
|
| 327 |
+
expectedCounts: computeExpectedCounts(
|
| 328 |
+
signalThresholdResult.mu,
|
| 329 |
+
signalThresholdResult.sigma,
|
| 330 |
+
rawScoreNormedConfig.extent as [number, number],
|
| 331 |
+
rawScoreNormedConfig.no_bins,
|
| 332 |
+
validRawScoresNormed.length
|
| 333 |
+
),
|
| 334 |
+
}
|
| 335 |
+
: null;
|
| 336 |
+
console.log('[raw score normed histogram] fitted log-normal μ, σ:', fitResult ? [fitResult.mu, fitResult.sigma] : 'failed');
|
| 337 |
+
const signalProbs = signalThresholdResult != null
|
| 338 |
+
? signalProbFromBins(validRawScoresNormed, signalThresholdResult.bins)
|
| 339 |
+
: [];
|
| 340 |
+
/**
|
| 341 |
+
* P_pw:后验信号概率的简化映射,x <= threshold 时为 0,x > threshold 时为 1
|
| 342 |
+
* pw_score = score × P_pw
|
| 343 |
+
* 基于 rawScoresNormed 全数组计算,保证与 token 索引对齐
|
| 344 |
+
*/
|
| 345 |
+
const rawScoresNormedFull = displayResult!.rawScoresNormed ?? [];
|
| 346 |
+
const t = signalThresholdResult?.threshold ?? 0;
|
| 347 |
+
const pPwValues = signalThresholdResult != null
|
| 348 |
+
? rawScoresNormedFull.map((s) => (typeof s === 'number' && isFinite(s) && s > t ? 1 : 0))
|
| 349 |
+
: [];
|
| 350 |
+
const pwScores = signalThresholdResult != null
|
| 351 |
+
? rawScoresNormedFull.map((s) => (typeof s === 'number' && isFinite(s) && s > t ? s : 0))
|
| 352 |
+
: [];
|
| 353 |
+
const probCurveData = signalProbs.length > 0
|
| 354 |
+
? (() => {
|
| 355 |
+
const pairs = validRawScoresNormed.map((x, i) => ({ x, y: signalProbs[i]! })).sort((a, b) => a.x - b.x);
|
| 356 |
+
return { x: pairs.map(p => p.x), y: pairs.map(p => p.y) };
|
| 357 |
+
})()
|
| 358 |
+
: undefined;
|
| 359 |
+
const signalThresholdPercentile = signalThresholdResult != null && validRawScoresNormed.length > 0
|
| 360 |
+
? Math.round((validRawScoresNormed.filter((s) => s < signalThresholdResult.threshold).length / validRawScoresNormed.length) * 100)
|
| 361 |
+
: undefined;
|
| 362 |
+
this.deps.stats_raw_score_normed.update({
|
| 363 |
+
...rawScoreNormedConfig,
|
| 364 |
+
data: validRawScoresNormed,
|
| 365 |
colorScale,
|
| 366 |
+
fitExpectedCounts: fitResult?.expectedCounts,
|
| 367 |
+
showProbCurve: true,
|
| 368 |
+
probCurveData: probCurveData?.x.length ? probCurveData : undefined,
|
| 369 |
+
signalThreshold: signalThresholdResult?.threshold ?? undefined,
|
| 370 |
+
signalThresholdPercentile: signalThresholdPercentile ?? undefined,
|
| 371 |
});
|
| 372 |
+
const titleEl = document.getElementById('raw_score_normed_histogram_title');
|
| 373 |
+
if (titleEl) titleEl.textContent = rawScoreNormedConfig.label;
|
| 374 |
+
if (rawScoreNormedItem) rawScoreNormedItem.style.display = '';
|
| 375 |
+
|
| 376 |
+
const colorSourceEl = document.getElementById('semantic_color_source_select') as HTMLSelectElement | null;
|
| 377 |
+
const scoresForColor = colorSourceEl?.value === 'signal_probability' ? pPwValues
|
| 378 |
+
: colorSourceEl?.value === 'pw_score' ? pwScores
|
| 379 |
+
: (displayResult!.rawScoresNormed ?? []);
|
| 380 |
+
|
| 381 |
+
if (fitResult != null) {
|
| 382 |
+
const resultWithExt = { ...displayResult, signalProbs, pPwValues, pwScores };
|
| 383 |
+
this.deps.highlightController.updateCurrentData({ result: resultWithExt, signalProbs, pPwValues, pwScores });
|
| 384 |
+
if (!skipLmfUpdate) {
|
| 385 |
+
this.deps.lmf.update({ ...resultWithExt, pwScores, colorScores: scoresForColor } as FrontendAnalyzeResult & { pPwValues?: number[]; pwScores?: number[]; colorScores?: number[] });
|
| 386 |
+
}
|
| 387 |
+
} else {
|
| 388 |
+
this.deps.highlightController.updateCurrentData({ result: displayResult });
|
| 389 |
+
if (!skipLmfUpdate) {
|
| 390 |
+
this.deps.lmf.update({ ...displayResult, colorScores: scoresForColor } as FrontendAnalyzeResult & { colorScores?: number[] });
|
| 391 |
+
}
|
| 392 |
+
}
|
| 393 |
+
} else {
|
| 394 |
+
if (rawScoreNormedItem) rawScoreNormedItem.style.display = 'none';
|
| 395 |
+
if (displayResult) this.deps.highlightController.updateCurrentData({ result: displayResult });
|
| 396 |
}
|
| 397 |
}
|
| 398 |
|
| 399 |
+
/** 重新渲染直方图(供外部调用) */
|
|
|
|
|
|
|
| 400 |
public rerenderHistograms(): void {
|
| 401 |
+
this.rerenderHistogramsInternal(false);
|
| 402 |
+
}
|
| 403 |
+
|
| 404 |
+
/**
 * Re-color the text from the currently selected color source without
 * re-running the fit (called when the color-source select changes).
 *
 * Source values: `signal_probability` -> stored pPwValues, `pw_score` ->
 * stored pwScores, anything else -> rawScoresNormed.
 */
public updateSemanticColorSource(): void {
  const current = this.deps.highlightController.getCurrentData();
  const result = current?.result as (FrontendAnalyzeResult & { rawScoresNormed?: number[] }) | undefined;
  const normed = result?.rawScoresNormed;
  // Nothing to recolor until a result with normalized scores has been stored.
  if (!result || !normed?.length) return;

  const select = document.getElementById('semantic_color_source_select') as HTMLSelectElement | null;
  const source = select?.value;
  const useSignalProbability = source === 'signal_probability';
  const usePwScore = source === 'pw_score';
  const colorScores = useSignalProbability
    ? (current!.pPwValues ?? [])
    : usePwScore
      ? (current!.pwScores ?? [])
      : normed;

  this.deps.lmf.update({
    ...result,
    pPwValues: current!.pPwValues,
    pwScores: current!.pwScores,
    colorScores,
  } as FrontendAnalyzeResult & { pPwValues?: number[]; pwScores?: number[]; colorScores?: number[] });
}
|
| 416 |
+
|
| 417 |
+
/**
 * Called on theme change: redraw the histograms and the text once the new
 * styles are in effect (rgba colors let the background show through, so the
 * redraw has to wait for the new theme). The work is deferred by two animation
 * frames; the histogram pass skips its own lmf.update and the text is
 * re-rendered once afterwards.
 */
public rerenderOnThemeChange(): void {
  const redraw = (): void => {
    this.rerenderHistogramsInternal(true);
    this.deps.lmf.reRenderCurrent();
  };
  requestAnimationFrame(() => {
    requestAnimationFrame(redraw);
  });
}
|
| 424 |
|
| 425 |
/**
|
|
|
|
| 602 |
res: {
|
| 603 |
model?: string;
|
| 604 |
token_attention?: Array<{ offset: [number, number]; raw: string; score: number }>;
|
| 605 |
+
debug_info?: { abbrev?: string; topk_tokens?: string[]; topk_probs?: number[] };
|
|
|
|
| 606 |
},
|
| 607 |
text?: string
|
| 608 |
): void {
|
|
|
|
| 638 |
enableRenderAnimation: false,
|
| 639 |
semanticAnalysisMode: getSemanticAnalysisEnabled(),
|
| 640 |
}, false);
|
|
|
|
| 641 |
this.clearHighlights();
|
| 642 |
+
// 仅由 rerenderHistogramsInternal 调用 lmf.update,避免与 handleSemanticResponse 的重复调用导致语义渲染双重叠加
|
| 643 |
this.rerenderHistogramsInternal();
|
| 644 |
this.syncSemanticUiFromConfig();
|
| 645 |
|
| 646 |
+
const di = res.debug_info;
|
| 647 |
+
const top10 = (di?.topk_tokens && di?.topk_probs)
|
| 648 |
+
? di.topk_tokens.map((token, i) => ({ token, prob: di.topk_probs![i] ?? 0 }))
|
| 649 |
+
: undefined;
|
| 650 |
+
this.updateSemanticDebugInfo(di?.abbrev, top10);
|
| 651 |
}
|
| 652 |
|
| 653 |
/** 更新文本渲染区下方的 debug 信息(abbrev + top10) */
|
|
|
|
| 672 |
if (top10?.length) {
|
| 673 |
const items = top10.map((t) => `'${esc(t.token)}(${(t.prob * 100).toFixed(1)}%)'`);
|
| 674 |
parts.push(`<div class="semantic-debug-top10">[${items.join(', ')}]</div>`);
|
| 675 |
+
parts.push(renderTopkChartFullHtml(top10));
|
| 676 |
}
|
| 677 |
el.innerHTML = parts.join('');
|
| 678 |
}
|
|
|
|
| 681 |
res: { model?: string },
|
| 682 |
tokenAttention: Array<{ offset: [number, number]; raw: string; score: number }>,
|
| 683 |
text: string
|
| 684 |
+
): (FrontendAnalyzeResult & { rawScoresNormed: number[]; attentionRawScores: number[] }) | null {
|
| 685 |
const safeText = text.trim();
|
| 686 |
if (!safeText) return null;
|
| 687 |
const syntheticTokens = tokenAttention.map((t) => ({
|
|
|
|
| 703 |
mergedTokens,
|
| 704 |
originalToMergedMap,
|
| 705 |
originalText: safeText,
|
| 706 |
+
rawScoresNormed: this.normalizeScoresForColor(scores),
|
| 707 |
attentionRawScores: scores,
|
| 708 |
};
|
| 709 |
}
|
|
|
|
| 757 |
}
|
| 758 |
|
| 759 |
/**
|
| 760 |
+
* 将 raw score 归一化到 [0,1] 用于染色(0-max 归一化:norm = raw_score / max)
|
| 761 |
* NaN/Inf 不参与 max 计算,映射为 0
|
| 762 |
*/
|
| 763 |
private normalizeScoresForColor(scores: number[]): number[] {
|
client/src/ts/vis/GLTR_Text_Box.ts
CHANGED
|
@@ -43,12 +43,15 @@ export enum GLTR_Mode {
|
|
| 43 |
fract_p
|
| 44 |
}
|
| 45 |
|
| 46 |
-
/** tokenData:信息密度模式为 FrontendToken,Semantic analysis 模式下附加
|
| 47 |
-
export type TokenDataForRender = FrontendToken & {
|
| 48 |
|
| 49 |
/** 语义模式下的 Tooltip 展示字段 */
|
| 50 |
export type SemanticRenderFields = {
|
| 51 |
-
|
|
|
|
|
|
|
|
|
|
| 52 |
/** Attention 分析时的原始 score(未归一化) */
|
| 53 |
rawScore?: number;
|
| 54 |
};
|
|
@@ -62,9 +65,9 @@ export type GLTR_HoverEvent = { hovered: boolean, d: GLTR_RenderItem, event?: Mo
|
|
| 62 |
|
| 63 |
/** 从 token 中安全提取语义展示字段,无需类型断言 */
|
| 64 |
function extractSemanticFields(token: TokenDataForRender): SemanticRenderFields | undefined {
|
| 65 |
-
const
|
| 66 |
-
if (
|
| 67 |
-
return {
|
| 68 |
}
|
| 69 |
|
| 70 |
export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
@@ -87,7 +90,7 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 87 |
// Minimap 配置
|
| 88 |
enableMinimap: false, // 是否启用 minimap(默认关闭)
|
| 89 |
minimapWidth: getMinimapWidthFromCSS(), // minimap 宽度(像素),从 CSS 变量读取
|
| 90 |
-
// Semantic analysis 模式:为 true 时按
|
| 91 |
semanticAnalysisMode: false,
|
| 92 |
};
|
| 93 |
|
|
@@ -237,9 +240,10 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 237 |
this.positionCalculator = new TokenPositionCalculator(baseNode);
|
| 238 |
}
|
| 239 |
|
| 240 |
-
const rdExt = rd as FrontendAnalyzeResult & {
|
| 241 |
-
const
|
| 242 |
-
const
|
|
|
|
| 243 |
|
| 244 |
// Semantic analysis 模式:按 BPE(merged tokens);否则按 BPE
|
| 245 |
const rdForPositions: FrontendAnalyzeResult = rd;
|
|
@@ -262,7 +266,7 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 262 |
charToByteIndexMap: this._current.charToByteIndexMap,
|
| 263 |
}
|
| 264 |
: undefined,
|
| 265 |
-
semantic: isSemantic ? { analysisMode: true,
|
| 266 |
};
|
| 267 |
this.svgOverlayManager = new SvgOverlayManager(baseNode, overlayOptions);
|
| 268 |
|
|
@@ -616,30 +620,16 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 616 |
|
| 617 |
/**
|
| 618 |
* 设置主题变化监听器
|
|
|
|
| 619 |
*/
|
| 620 |
private setupThemeListener(): void {
|
| 621 |
-
// 使用MutationObserver监听data-theme属性的变化
|
| 622 |
const observer = new MutationObserver((mutations) => {
|
| 623 |
mutations.forEach((mutation) => {
|
| 624 |
if (mutation.type === 'attributes' && mutation.attributeName === 'data-theme') {
|
| 625 |
-
// 主题变化时,更新颜色scale并重新渲染
|
| 626 |
this.updateColorScales();
|
| 627 |
-
if (this.currentRenderData) {
|
| 628 |
-
// 主题切换时禁用动画,立即重新渲染
|
| 629 |
-
const originalAnimationSetting = this.options.enableRenderAnimation;
|
| 630 |
-
this.options.enableRenderAnimation = false;
|
| 631 |
-
// 重新渲染当前数据
|
| 632 |
-
this._render(this.currentRenderData);
|
| 633 |
-
// 恢复动画设置
|
| 634 |
-
setTimeout(() => {
|
| 635 |
-
this.options.enableRenderAnimation = originalAnimationSetting;
|
| 636 |
-
}, 100);
|
| 637 |
-
}
|
| 638 |
}
|
| 639 |
});
|
| 640 |
});
|
| 641 |
-
|
| 642 |
-
// 开始观察document.documentElement的data-theme属性
|
| 643 |
observer.observe(document.documentElement, {
|
| 644 |
attributes: true,
|
| 645 |
attributeFilter: ['data-theme']
|
|
@@ -662,18 +652,23 @@ export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
| 662 |
*/
|
| 663 |
private addTokenEventListeners(element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult): void {
|
| 664 |
const rdExt = rd as FrontendAnalyzeResult & {
|
| 665 |
-
|
| 666 |
attentionRawScores?: number[];
|
|
|
|
|
|
|
| 667 |
};
|
| 668 |
-
const
|
| 669 |
-
const showTooltip = true; // 始终显示 tooltip,semantic 部分在
|
| 670 |
|
| 671 |
const tokenData = rd.bpe_strings[tokenIndex] as TokenDataForRender;
|
| 672 |
let semantic = showTooltip ? extractSemanticFields(tokenData) : undefined;
|
| 673 |
-
if (showTooltip &&
|
| 674 |
-
|
|
|
|
| 675 |
const rawScore = rdExt.attentionRawScores?.[tokenIndex];
|
| 676 |
-
|
|
|
|
|
|
|
| 677 |
}
|
| 678 |
|
| 679 |
const handleMouseEnter = (event: MouseEvent) => {
|
|
|
|
| 43 |
fract_p
|
| 44 |
}
|
| 45 |
|
| 46 |
+
/** Token data used for rendering: a plain FrontendToken in information-density mode; in semantic-analysis mode it additionally carries rawScoreNormed. */
|
| 47 |
+
export type TokenDataForRender = FrontendToken & { rawScoreNormed?: number };
|
| 48 |
|
| 49 |
/** Fields shown in the tooltip while in semantic mode */
|
| 50 |
export type SemanticRenderFields = {
|
| 51 |
+
pwScore?: number;
|
| 52 |
+
/** Signal probability P_pw: 0 when x <= threshold, 1 when x > threshold */
|
| 53 |
+
signalProb?: number;
|
| 54 |
+
rawScoreNormed?: number;
|
| 55 |
/** Original (un-normalized) score from the attention analysis */
|
| 56 |
rawScore?: number;
|
| 57 |
};
|
|
|
|
| 65 |
|
| 66 |
/**
 * Pull the semantic tooltip fields off a render token without a type assertion.
 *
 * @param token Token data for the item being rendered.
 * @returns `{ rawScoreNormed }` when the token carries a numeric
 *          `rawScoreNormed`; otherwise `undefined`.
 */
function extractSemanticFields(token: TokenDataForRender): SemanticRenderFields | undefined {
    // Guard 1: the property must be present on the token at all.
    if (!("rawScoreNormed" in token)) return undefined;

    // Guard 2: it must hold a number (an explicit `undefined` is rejected here).
    const normed = token.rawScoreNormed;
    if (typeof normed !== "number") return undefined;

    return { rawScoreNormed: normed };
}
|
| 72 |
|
| 73 |
export class GLTR_Text_Box extends VComponent<FrontendAnalyzeResult> {
|
|
|
|
| 90 |
// Minimap 配置
|
| 91 |
enableMinimap: false, // 是否启用 minimap(默认关闭)
|
| 92 |
minimapWidth: getMinimapWidthFromCSS(), // minimap 宽度(像素),从 CSS 变量读取
|
| 93 |
+
// Semantic analysis 模式:为 true 时按 raw score normed 染色
|
| 94 |
semanticAnalysisMode: false,
|
| 95 |
};
|
| 96 |
|
|
|
|
| 240 |
this.positionCalculator = new TokenPositionCalculator(baseNode);
|
| 241 |
}
|
| 242 |
|
| 243 |
+
const rdExt = rd as FrontendAnalyzeResult & { rawScoresNormed?: number[]; colorScores?: number[] };
|
| 244 |
+
const rawScoresNormed = rdExt.rawScoresNormed;
|
| 245 |
+
const colorScores = (rdExt.colorScores?.length ? rdExt.colorScores : undefined) ?? rawScoresNormed;
|
| 246 |
+
const isSemantic = this.options.semanticAnalysisMode && colorScores?.length;
|
| 247 |
|
| 248 |
// Semantic analysis 模式:按 BPE(merged tokens);否则按 BPE
|
| 249 |
const rdForPositions: FrontendAnalyzeResult = rd;
|
|
|
|
| 266 |
charToByteIndexMap: this._current.charToByteIndexMap,
|
| 267 |
}
|
| 268 |
: undefined,
|
| 269 |
+
semantic: isSemantic ? { analysisMode: true, rawScoresNormed: colorScores } : undefined,
|
| 270 |
};
|
| 271 |
this.svgOverlayManager = new SvgOverlayManager(baseNode, overlayOptions);
|
| 272 |
|
|
|
|
| 620 |
|
| 621 |
/**
|
| 622 |
* 设置主题变化监听器
|
| 623 |
+
* 仅更新 fracScale/diffScale;重渲染由 initThemeManager 的 onThemeChange -> rerenderOnThemeChange 统一触发
|
| 624 |
*/
|
| 625 |
private setupThemeListener(): void {
|
|
|
|
| 626 |
const observer = new MutationObserver((mutations) => {
|
| 627 |
mutations.forEach((mutation) => {
|
| 628 |
if (mutation.type === 'attributes' && mutation.attributeName === 'data-theme') {
|
|
|
|
| 629 |
this.updateColorScales();
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 630 |
}
|
| 631 |
});
|
| 632 |
});
|
|
|
|
|
|
|
| 633 |
observer.observe(document.documentElement, {
|
| 634 |
attributes: true,
|
| 635 |
attributeFilter: ['data-theme']
|
|
|
|
| 652 |
*/
|
| 653 |
private addTokenEventListeners(element: SVGGElement, tokenIndex: number, rd: FrontendAnalyzeResult): void {
|
| 654 |
const rdExt = rd as FrontendAnalyzeResult & {
|
| 655 |
+
rawScoresNormed?: number[];
|
| 656 |
attentionRawScores?: number[];
|
| 657 |
+
pPwValues?: number[];
|
| 658 |
+
pwScores?: number[];
|
| 659 |
};
|
| 660 |
+
const hasRawScoresNormed = rdExt.rawScoresNormed?.length && tokenIndex < rdExt.rawScoresNormed.length;
|
| 661 |
+
const showTooltip = true; // 始终显示 tooltip,semantic 部分在 hasRawScoresNormed 时填充
|
| 662 |
|
| 663 |
const tokenData = rd.bpe_strings[tokenIndex] as TokenDataForRender;
|
| 664 |
let semantic = showTooltip ? extractSemanticFields(tokenData) : undefined;
|
| 665 |
+
if (showTooltip && hasRawScoresNormed && rdExt.rawScoresNormed) {
|
| 666 |
+
// rawScoreNormed 始终用 rawScoresNormed,与 color source 无关
|
| 667 |
+
const attnScore = rdExt.rawScoresNormed[tokenIndex];
|
| 668 |
const rawScore = rdExt.attentionRawScores?.[tokenIndex];
|
| 669 |
+
const signalProb = rdExt.pPwValues?.[tokenIndex]; // P_pw:x<=threshold 为 0,x>threshold 为 1
|
| 670 |
+
const pwScore = rdExt.pwScores?.[tokenIndex];
|
| 671 |
+
semantic = { ...semantic, rawScoreNormed: attnScore, rawScore, signalProb, pwScore } as SemanticRenderFields;
|
| 672 |
}
|
| 673 |
|
| 674 |
const handleMouseEnter = (event: MouseEvent) => {
|
client/src/ts/vis/Histogram.ts
CHANGED
|
@@ -1,16 +1,46 @@
|
|
| 1 |
import { VComponent } from "./VisComponent";
|
| 2 |
import { D3Sel } from "../utils/Util";
|
| 3 |
import { SimpleEventHandler } from "../utils/SimpleEventHandler";
|
|
|
|
| 4 |
import * as d3 from "d3";
|
| 5 |
import { schemeDark2 } from "d3";
|
| 6 |
|
| 7 |
const averageNumberFormat = d3.format('.2f');
|
| 8 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
export type HistogramData = {
|
| 10 |
data: number[],
|
| 11 |
label?: string,
|
| 12 |
no_bins: number,
|
| 13 |
-
extent:
|
| 14 |
colorScale: (value: number) => string, // 添加颜色 scale
|
| 15 |
averageValue?: number,
|
| 16 |
p90Value?: number,
|
|
@@ -18,8 +48,21 @@ export type HistogramData = {
|
|
| 18 |
p90Label?: string,
|
| 19 |
showLeftInfinity?: boolean,
|
| 20 |
showRightInfinity?: boolean,
|
| 21 |
-
|
| 22 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 23 |
}
|
| 24 |
|
| 25 |
|
|
@@ -68,6 +111,9 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 68 |
.attr('class', 'y-axis')
|
| 69 |
.attr('transform', `translate(${op.width - 33},0)`)
|
| 70 |
|
|
|
|
|
|
|
|
|
|
| 71 |
// 背景面板:避免柱体与整体页面纯白背景混淆
|
| 72 |
this.layers.bg.insert('rect', ':first-child')
|
| 73 |
.attr('class', 'panel-bg')
|
|
@@ -88,8 +134,18 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 88 |
protected _render(rD: HistogramData): void {
|
| 89 |
const op = this.options;
|
| 90 |
|
| 91 |
-
// extent
|
| 92 |
-
const
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 93 |
|
| 94 |
// 计算bin宽度
|
| 95 |
const binWidth = (extent[1] - extent[0]) / rD.no_bins;
|
|
@@ -111,7 +167,7 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 111 |
// 如果指定了 extent,确保使用 extent 作为 domain,而不是 nice() 调整后的 domain
|
| 112 |
// 这样可以保证 extent 的上限被正确使用,即使数据被截断了
|
| 113 |
// 使用 extent 作为 domain,确保范围正确
|
| 114 |
-
const padding = { left: 10, right: 35 };
|
| 115 |
let valueScale = d3.scaleLinear().domain([extent[0], extent[1]]).range([padding.left, op.width - padding.right]);
|
| 116 |
|
| 117 |
const hasAverageValue = typeof rD.averageValue === 'number' && Number.isFinite(rD.averageValue);
|
|
@@ -146,15 +202,25 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 146 |
console.warn('Invalid maxCount for histogram:', maxCount);
|
| 147 |
maxCount = 1;
|
| 148 |
}
|
|
|
|
|
|
|
|
|
|
|
|
|
| 149 |
|
| 150 |
const useSqrt = rD.yScaleType === 'sqrt';
|
| 151 |
-
const
|
| 152 |
-
|
| 153 |
-
|
| 154 |
-
|
| 155 |
-
|
| 156 |
-
|
| 157 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 158 |
};
|
| 159 |
|
| 160 |
const getBandWidth = (d: d3.Bin<number, number>) => valueScale(d.x1) - valueScale(d.x0);
|
|
@@ -187,9 +253,7 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 187 |
return isFinite(w) && w > 0 ? w : 1;
|
| 188 |
},
|
| 189 |
height: d => {
|
| 190 |
-
if (d.length === 0)
|
| 191 |
-
return 0;
|
| 192 |
-
}
|
| 193 |
const h = op.height - op.margin_bottom - countScale(d.length);
|
| 194 |
return isFinite(h) && h > 0 ? h : 1;
|
| 195 |
},
|
|
@@ -211,6 +275,29 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 211 |
return this._current.selectedBinIndex === i ? 'drop-shadow(0 0 6px rgba(42, 158, 255, 0.8))' : 'none';
|
| 212 |
});
|
| 213 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 214 |
const avgMarkerData = averageX !== null && Number.isFinite(averageX)
|
| 215 |
? [{ x: averageX, value: rD.averageValue as number }]
|
| 216 |
: [];
|
|
@@ -273,15 +360,50 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 273 |
.attr('y', op.margin_top)
|
| 274 |
.text('p90');
|
| 275 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 276 |
const p90LabelData = (typeof rD.p90Value === 'number' && Number.isFinite(rD.p90Value)) ? [rD.p90Value] : [];
|
|
|
|
| 277 |
this.layers.fg.selectAll('.p90-label').data(p90LabelData)
|
| 278 |
.join('text')
|
| 279 |
.attr('class', 'p90-label sizeLabel')
|
| 280 |
.attr('text-anchor', 'end')
|
| 281 |
.attr('x', op.width * 0.75)
|
| 282 |
-
.attr('y',
|
| 283 |
.text(value => {
|
| 284 |
-
|
|
|
|
| 285 |
});
|
| 286 |
|
| 287 |
const labelData = histo.filter(bin => bin.length > 0);
|
|
@@ -408,27 +530,28 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 408 |
});
|
| 409 |
|
| 410 |
|
| 411 |
-
const yAxis = d3.axisRight(countScale)
|
| 412 |
-
|
|
|
|
| 413 |
this.layers.bg.select('.y-axis').call(<any>yAxis);
|
| 414 |
|
| 415 |
const tickValues = [extent[0], ...thresholds, extent[1]];
|
| 416 |
const tickSkip = rD.xAxisTickSkip ?? 0;
|
| 417 |
|
| 418 |
// Custom tick format: 根据 showLeftInfinity/showRightInfinity 决定是否显示 ±∞
|
| 419 |
-
//
|
| 420 |
const xAxisTickFormat = (d: number) => {
|
| 421 |
-
|
| 422 |
-
if (rD.extent)
|
| 423 |
-
if (rD.showLeftInfinity && Math.abs(d - rD.extent[0]) < 0.001) return '-∞';
|
| 424 |
-
if (rD.showRightInfinity && Math.abs(d - rD.extent[1]) < 0.001) return '∞';
|
| 425 |
-
}
|
| 426 |
|
| 427 |
-
// 基于值而非索引决定是否显示:检查 d 是否为 step 的整数倍
|
| 428 |
-
// 这样可以自动对齐到整的刻度,避免起点偏移导致显示非整数刻度
|
| 429 |
if (tickSkip > 0) {
|
| 430 |
-
|
| 431 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 432 |
}
|
| 433 |
|
| 434 |
return op.numberFormat(d);
|
|
@@ -439,6 +562,46 @@ export class Histogram extends VComponent<HistogramData> {
|
|
| 439 |
.tickValues(tickValues);
|
| 440 |
this.layers.bg.select('.x-axis').call(<any>xAxis);
|
| 441 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 442 |
|
| 443 |
}
|
| 444 |
|
|
|
|
| 1 |
import { VComponent } from "./VisComponent";
|
| 2 |
import { D3Sel } from "../utils/Util";
|
| 3 |
import { SimpleEventHandler } from "../utils/SimpleEventHandler";
|
| 4 |
+
import { tr } from "../lang/i18n-lite";
|
| 5 |
import * as d3 from "d3";
|
| 6 |
import { schemeDark2 } from "d3";
|
| 7 |
|
| 8 |
const averageNumberFormat = d3.format('.2f');
|
| 9 |
|
| 10 |
+
/**
 * Build tick values for a non-linear (sqrt / log) count axis using the
 * 1-2-5 per-decade pattern (1, 2, 5, 10, 20, 50, ...).
 *
 * The result always starts at 0 and, for a valid maxCount, ends at maxCount.
 * When the 1-2-5 sequence has more than maxTicks entries it is down-sampled at
 * evenly spaced indices (first and last entries are always kept) and
 * de-duplicated.
 *
 * @param maxCount Upper end of the axis. Non-finite or non-positive values
 *                 yield [0]; an infinite value would otherwise never terminate
 *                 the decade loop, and NaN would leak into the tick list.
 * @param maxTicks Maximum number of ticks to return (default 10). Fractions
 *                 are floored, values below 2 are treated as 2 so both axis
 *                 ends survive, and NaN falls back to the default of 10.
 * @returns Tick values in ascending order.
 */
function getNonLinearTickValues(maxCount: number, maxTicks = 10): number[] {
    if (!Number.isFinite(maxCount) || maxCount <= 0) return [0];

    // Sanitize the tick budget: the sampling step below divides by (limit - 1)
    // and indexes with Math.round, so it needs an integer >= 2.
    const requested = Math.floor(maxTicks);
    const limit = Number.isNaN(requested) ? 10 : Math.max(2, requested);

    const ticks: number[] = [0];
    // maxCount is finite here, so `decade` eventually exceeds it (or overflows
    // to Infinity) and the loop terminates.
    for (let decade = 1; decade <= maxCount; decade *= 10) {
        for (const multiplier of [1, 2, 5]) {
            const value = multiplier * decade;
            if (value <= maxCount) ticks.push(value);
        }
    }

    // Make sure the axis maximum itself is labelled.
    if (ticks[ticks.length - 1] !== maxCount) ticks.push(maxCount);
    if (ticks.length <= limit) return ticks;

    // Too many candidates: pick `limit` evenly spaced indices across the list.
    const lastIndex = ticks.length - 1;
    const sampled = Array.from(
        { length: limit },
        (_, i) => ticks[Math.round((i / (limit - 1)) * lastIndex)]
    );
    return [...new Set(sampled)].sort((a, b) => a - b);
}
|
| 32 |
+
|
| 33 |
+
/** One side of the extent: a fixed number, or 'auto' to resolve it from the data */
|
| 34 |
+
export type HistogramExtentBound = number | 'auto';
|
| 35 |
+
|
| 36 |
+
/** Extent: 'auto' is equivalent to ['auto','auto']; each side can be configured independently */
|
| 37 |
+
export type HistogramExtent = [HistogramExtentBound, HistogramExtentBound] | 'auto';
|
| 38 |
+
|
| 39 |
export type HistogramData = {
|
| 40 |
data: number[],
|
| 41 |
label?: string,
|
| 42 |
no_bins: number,
|
| 43 |
+
extent: HistogramExtent,
|
| 44 |
colorScale: (value: number) => string, // 添加颜色 scale
|
| 45 |
averageValue?: number,
|
| 46 |
p90Value?: number,
|
|
|
|
| 48 |
p90Label?: string,
|
| 49 |
showLeftInfinity?: boolean,
|
| 50 |
showRightInfinity?: boolean,
|
| 51 |
+
/** x轴刻度数字绘制间隔,0表示不跳过,1表示隔一个绘制一个(0,2,4...) */
|
| 52 |
+
xAxisTickSkip?: number,
|
| 53 |
+
/** x轴刻度凑整:true=仅显示 step 整数倍处的标签(与 tickSkip 配合),false/undefined=显示全部 */
|
| 54 |
+
xAxisTickRound?: boolean;
|
| 55 |
+
yScaleType?: 'linear' | 'sqrt' | 'log' // y轴尺度:linear 线性,sqrt 平方根,log 对数(指数刻度,从 0 开始)
|
| 56 |
+
/** 拟合分布的每个 bin 期望计数,用于绘制横虚线标识(如指数噪声拟合) */
|
| 57 |
+
fitExpectedCounts?: number[];
|
| 58 |
+
/** 是否叠加 prob 曲线(共用 x 轴,左侧新 y 轴 0–1) */
|
| 59 |
+
showProbCurve?: boolean;
|
| 60 |
+
/** 曲线数据:x=raw_score_normed,y=prob(0–1),P(signal) 按 findSignalThreshold 的 bin 分块估计,(obs-exp)/obs */
|
| 61 |
+
probCurveData?: { x: number[]; y: number[] };
|
| 62 |
+
/** 信号阈值竖线:归一化分数,用于 raw_score_normed 直方图 */
|
| 63 |
+
signalThreshold?: number | null;
|
| 64 |
+
/** 信号阈值对应的分位数(0–100),用于 label 显示 τ = pXX */
|
| 65 |
+
signalThresholdPercentile?: number | null;
|
| 66 |
}
|
| 67 |
|
| 68 |
|
|
|
|
| 111 |
.attr('class', 'y-axis')
|
| 112 |
.attr('transform', `translate(${op.width - 33},0)`)
|
| 113 |
|
| 114 |
+
this.layers.bg.append('g')
|
| 115 |
+
.attr('class', 'y-axis-prob')
|
| 116 |
+
|
| 117 |
// 背景面板:避免柱体与整体页面纯白背景混淆
|
| 118 |
this.layers.bg.insert('rect', ':first-child')
|
| 119 |
.attr('class', 'panel-bg')
|
|
|
|
| 134 |
protected _render(rD: HistogramData): void {
|
| 135 |
const op = this.options;
|
| 136 |
|
| 137 |
+
// extent 解析:'auto' 等价于 ['auto','auto'],支持双边独立配置
|
| 138 |
+
const [loSpec, hiSpec]: [HistogramExtentBound, HistogramExtentBound] =
|
| 139 |
+
rD.extent === 'auto' ? ['auto', 'auto'] : rD.extent;
|
| 140 |
+
const finite = rD.data.filter((d) => typeof d === 'number' && isFinite(d));
|
| 141 |
+
const [dataLo, dataHi] = finite.length > 0
|
| 142 |
+
? (d3.extent(finite) as [number, number])
|
| 143 |
+
: [0, 1];
|
| 144 |
+
const fallbackLo = finite.length <= 1 ? dataLo - 0.5 : dataLo;
|
| 145 |
+
const fallbackHi = finite.length <= 1 ? dataHi + 0.5 : dataHi;
|
| 146 |
+
const lo = loSpec === 'auto' ? fallbackLo : loSpec;
|
| 147 |
+
const hi = hiSpec === 'auto' ? fallbackHi : hiSpec;
|
| 148 |
+
const extent: [number, number] = lo > hi ? [lo, lo] : [lo, hi];
|
| 149 |
|
| 150 |
// 计算bin宽度
|
| 151 |
const binWidth = (extent[1] - extent[0]) / rD.no_bins;
|
|
|
|
| 167 |
// 如果指定了 extent,确保使用 extent 作为 domain,而不是 nice() 调整后的 domain
|
| 168 |
// 这样可以保证 extent 的上限被正确使用,即使数据被截断了
|
| 169 |
// 使用 extent 作为 domain,确保范围正确
|
| 170 |
+
const padding = { left: rD.showProbCurve ? 35 : 10, right: 35 };
|
| 171 |
let valueScale = d3.scaleLinear().domain([extent[0], extent[1]]).range([padding.left, op.width - padding.right]);
|
| 172 |
|
| 173 |
const hasAverageValue = typeof rD.averageValue === 'number' && Number.isFinite(rD.averageValue);
|
|
|
|
| 202 |
console.warn('Invalid maxCount for histogram:', maxCount);
|
| 203 |
maxCount = 1;
|
| 204 |
}
|
| 205 |
+
if (rD.fitExpectedCounts && rD.fitExpectedCounts.length > 0) {
|
| 206 |
+
const fitMax = d3.max(rD.fitExpectedCounts) ?? 0;
|
| 207 |
+
if (isFinite(fitMax) && fitMax > maxCount) maxCount = fitMax;
|
| 208 |
+
}
|
| 209 |
|
| 210 |
const useSqrt = rD.yScaleType === 'sqrt';
|
| 211 |
+
const useLog = rD.yScaleType === 'log';
|
| 212 |
+
const countScale = useLog
|
| 213 |
+
? d3.scaleSymlog().domain([0, Math.max(1, maxCount)]).range([op.height - op.margin_bottom, op.margin_top])
|
| 214 |
+
: useSqrt
|
| 215 |
+
? d3.scaleSqrt().domain([0, maxCount]).range([op.height - op.margin_bottom, op.margin_top])
|
| 216 |
+
: d3.scaleLinear().domain([0, maxCount]).nice().range([op.height - op.margin_bottom, op.margin_top]);
|
| 217 |
+
|
| 218 |
+
// 与 d3 scaleBand 一致:bandwidth = step * (1 - paddingInner),gap = step * paddingInner
|
| 219 |
+
// NOTE: a barWidth:gap ratio of ≈ 2.875:1 (at no_bins=20) corresponds to paddingInner ≈ 0.258; the value used below is 0.15, i.e. barWidth:gap ≈ 5.67:1
|
| 220 |
+
const PADDING_INNER = 0.15;
|
| 221 |
+
const adjustWidth = (step: number) => {
|
| 222 |
+
if (!isFinite(step) || step <= 0) return 0;
|
| 223 |
+
return step * (1 - PADDING_INNER);
|
| 224 |
};
|
| 225 |
|
| 226 |
const getBandWidth = (d: d3.Bin<number, number>) => valueScale(d.x1) - valueScale(d.x0);
|
|
|
|
| 253 |
return isFinite(w) && w > 0 ? w : 1;
|
| 254 |
},
|
| 255 |
height: d => {
|
| 256 |
+
if (d.length === 0) return 0;
|
|
|
|
|
|
|
| 257 |
const h = op.height - op.margin_bottom - countScale(d.length);
|
| 258 |
return isFinite(h) && h > 0 ? h : 1;
|
| 259 |
},
|
|
|
|
| 275 |
return this._current.selectedBinIndex === i ? 'drop-shadow(0 0 6px rgba(42, 158, 255, 0.8))' : 'none';
|
| 276 |
});
|
| 277 |
|
| 278 |
+
// 拟合分布横虚线:每个 bin 上标识期望计数,宽度与柱体对齐
|
| 279 |
+
const fitData = rD.fitExpectedCounts && rD.fitExpectedCounts.length === histo.length
|
| 280 |
+
? histo.map((d, i) => {
|
| 281 |
+
const bandWidth = getBandWidth(d);
|
| 282 |
+
const barWidth = adjustWidth(bandWidth);
|
| 283 |
+
const base = valueScale(d.x0);
|
| 284 |
+
const x1 = base + 0.5 * (bandWidth - barWidth);
|
| 285 |
+
return { x1, x2: x1 + barWidth, y: countScale(Math.max(0, rD.fitExpectedCounts![i])) };
|
| 286 |
+
})
|
| 287 |
+
: [];
|
| 288 |
+
this.layers.main.selectAll('.fit-overlay-line').data(fitData)
|
| 289 |
+
.join('line')
|
| 290 |
+
.attr('class', 'fit-overlay-line')
|
| 291 |
+
.attrs({
|
| 292 |
+
x1: d => d.x1,
|
| 293 |
+
x2: d => d.x2,
|
| 294 |
+
y1: d => d.y,
|
| 295 |
+
y2: d => d.y,
|
| 296 |
+
})
|
| 297 |
+
.style('stroke', 'var(--fit-line-color, #999)')
|
| 298 |
+
.style('stroke-width', 1)
|
| 299 |
+
.style('stroke-dasharray', '1,1');
|
| 300 |
+
|
| 301 |
const avgMarkerData = averageX !== null && Number.isFinite(averageX)
|
| 302 |
? [{ x: averageX, value: rD.averageValue as number }]
|
| 303 |
: [];
|
|
|
|
| 360 |
.attr('y', op.margin_top)
|
| 361 |
.text('p90');
|
| 362 |
|
| 363 |
+
const hasSignalThreshold = typeof rD.signalThreshold === 'number' && Number.isFinite(rD.signalThreshold);
|
| 364 |
+
const clampedSignalThreshold = hasSignalThreshold
|
| 365 |
+
? Math.min(Math.max(rD.signalThreshold as number, extent[0]), extent[1])
|
| 366 |
+
: null;
|
| 367 |
+
const signalThresholdX = hasSignalThreshold && clampedSignalThreshold !== null
|
| 368 |
+
? valueScale(clampedSignalThreshold)
|
| 369 |
+
: null;
|
| 370 |
+
|
| 371 |
+
const signalThresholdMarkerData = signalThresholdX !== null && Number.isFinite(signalThresholdX)
|
| 372 |
+
? [{ x: signalThresholdX, value: rD.signalThreshold as number, percentile: rD.signalThresholdPercentile }]
|
| 373 |
+
: [];
|
| 374 |
+
|
| 375 |
+
this.layers.fg.selectAll('.signal-threshold-line').data(signalThresholdMarkerData)
|
| 376 |
+
.join('line')
|
| 377 |
+
.attr('class', 'signal-threshold-line')
|
| 378 |
+
.attrs({
|
| 379 |
+
x1: d => d.x,
|
| 380 |
+
x2: d => d.x,
|
| 381 |
+
y1: op.margin_top + 4,
|
| 382 |
+
y2: op.height - op.margin_bottom
|
| 383 |
+
})
|
| 384 |
+
.style('stroke', 'var(--signal-threshold-line-color, #e74c3c)')
|
| 385 |
+
.style('stroke-width', 1.5)
|
| 386 |
+
.style('stroke-dasharray', '3,2');
|
| 387 |
+
|
| 388 |
+
this.layers.fg.selectAll('.signal-threshold-marker-label').data(signalThresholdMarkerData)
|
| 389 |
+
.join('text')
|
| 390 |
+
.attr('class', 'signal-threshold-marker-label sizeLabel')
|
| 391 |
+
.attr('text-anchor', 'middle')
|
| 392 |
+
.attr('x', d => d.x)
|
| 393 |
+
.attr('y', op.margin_top)
|
| 394 |
+
.text(d => typeof d.percentile === 'number' ? `τ = p${d.percentile}` : 'τ');
|
| 395 |
+
|
| 396 |
const p90LabelData = (typeof rD.p90Value === 'number' && Number.isFinite(rD.p90Value)) ? [rD.p90Value] : [];
|
| 397 |
+
const p90LabelY = avgLabelData.length > 0 ? Math.max(24, op.margin_top + 10) : Math.max(12, op.margin_top - 2);
|
| 398 |
this.layers.fg.selectAll('.p90-label').data(p90LabelData)
|
| 399 |
.join('text')
|
| 400 |
.attr('class', 'p90-label sizeLabel')
|
| 401 |
.attr('text-anchor', 'end')
|
| 402 |
.attr('x', op.width * 0.75)
|
| 403 |
+
.attr('y', p90LabelY)
|
| 404 |
.text(value => {
|
| 405 |
+
const suffix = rD.p90Label ? ` ${rD.p90Label}` : '';
|
| 406 |
+
return `p90 = ${averageNumberFormat(value)}${suffix}`;
|
| 407 |
});
|
| 408 |
|
| 409 |
const labelData = histo.filter(bin => bin.length > 0);
|
|
|
|
| 530 |
});
|
| 531 |
|
| 532 |
|
| 533 |
+
const yAxis = d3.axisRight(countScale)
|
| 534 |
+
.tickFormat(useLog ? d3.format('.0f') : op.numberFormat);
|
| 535 |
+
if (useSqrt || useLog) yAxis.tickValues(getNonLinearTickValues(maxCount, 10));
|
| 536 |
this.layers.bg.select('.y-axis').call(<any>yAxis);
|
| 537 |
|
| 538 |
const tickValues = [extent[0], ...thresholds, extent[1]];
|
| 539 |
const tickSkip = rD.xAxisTickSkip ?? 0;
|
| 540 |
|
| 541 |
// Custom tick format: 根据 showLeftInfinity/showRightInfinity 决定是否显示 ±∞
|
| 542 |
+
// xAxisTickSkip:减少刻度标签密度;xAxisTickRound:true 时按 step 对齐过滤,false 时按索引跳过
|
| 543 |
const xAxisTickFormat = (d: number) => {
|
| 544 |
+
if (rD.showLeftInfinity && Math.abs(d - extent[0]) < 0.001) return '-∞';
|
| 545 |
+
if (rD.showRightInfinity && Math.abs(d - extent[1]) < 0.001) return '∞';
|
|
|
|
|
|
|
|
|
|
| 546 |
|
|
|
|
|
|
|
| 547 |
if (tickSkip > 0) {
|
| 548 |
+
if (rD.xAxisTickRound) {
|
| 549 |
+
const step = (tickSkip + 1) * binWidth;
|
| 550 |
+
if (Math.abs(d / step - Math.round(d / step)) > 1e-9) return '';
|
| 551 |
+
} else {
|
| 552 |
+
const idx = tickValues.findIndex((t) => Math.abs(t - d) < 1e-9 * (Math.abs(d) + 1));
|
| 553 |
+
if (idx >= 0 && idx % (tickSkip + 1) !== 0) return '';
|
| 554 |
+
}
|
| 555 |
}
|
| 556 |
|
| 557 |
return op.numberFormat(d);
|
|
|
|
| 562 |
.tickValues(tickValues);
|
| 563 |
this.layers.bg.select('.x-axis').call(<any>xAxis);
|
| 564 |
|
| 565 |
+
const hasProbCurve = rD.showProbCurve && rD.probCurveData && rD.probCurveData.x.length > 0;
|
| 566 |
+
if (hasProbCurve) {
|
| 567 |
+
const probYScale = d3.scaleLinear()
|
| 568 |
+
.domain([0, 1])
|
| 569 |
+
.range([op.height - op.margin_bottom, op.margin_top]);
|
| 570 |
+
|
| 571 |
+
const probPoints: { x: number; y: number }[] = rD.probCurveData!.x.map((x, i) => ({ x, y: rD.probCurveData!.y[i] ?? 0 }));
|
| 572 |
+
const probLine = d3.line<{ x: number; y: number }>()
|
| 573 |
+
.x(d => valueScale(d.x))
|
| 574 |
+
.y(d => probYScale(d.y))
|
| 575 |
+
.curve(d3.curveLinear);
|
| 576 |
+
|
| 577 |
+
this.layers.fg.selectAll('.prob-curve').data([probPoints])
|
| 578 |
+
.join('path')
|
| 579 |
+
.attr('class', 'prob-curve')
|
| 580 |
+
.attr('d', probLine)
|
| 581 |
+
.style('fill', 'none')
|
| 582 |
+
.style('stroke', 'var(--prob-curve-color, rgba(160,200,255,0.85))')
|
| 583 |
+
.style('stroke-width', 1.5)
|
| 584 |
+
.style('pointer-events', 'none');
|
| 585 |
+
|
| 586 |
+
const probAxis = d3.axisLeft(probYScale)
|
| 587 |
+
.ticks(5)
|
| 588 |
+
.tickFormat(d3.format('.1f'));
|
| 589 |
+
this.layers.bg.select('.y-axis-prob')
|
| 590 |
+
.attr('transform', `translate(${padding.left},0)`)
|
| 591 |
+
.call(<any>probAxis);
|
| 592 |
+
|
| 593 |
+
this.layers.bg.selectAll('.prob-curve-axis-label').data([1])
|
| 594 |
+
.join('text')
|
| 595 |
+
.attr('class', 'prob-curve-axis-label sizeLabel')
|
| 596 |
+
.attr('text-anchor', 'middle')
|
| 597 |
+
.attr('transform', `translate(8,${(op.height - op.margin_bottom) / 2 + op.margin_top}) rotate(-90)`)
|
| 598 |
+
.text(tr('signal ratio'));
|
| 599 |
+
|
| 600 |
+
} else {
|
| 601 |
+
this.layers.fg.selectAll('.prob-curve').remove();
|
| 602 |
+
this.layers.bg.select('.y-axis-prob').selectAll('*').remove();
|
| 603 |
+
this.layers.bg.selectAll('.prob-curve-axis-label').remove();
|
| 604 |
+
}
|
| 605 |
|
| 606 |
}
|
| 607 |
|
client/src/ts/vis/SvgOverlayManager.ts
CHANGED
|
@@ -18,8 +18,8 @@ export interface DiffOverlayOptions {
|
|
| 18 |
/** 语义分析模式配置 */
|
| 19 |
export interface SemanticOverlayOptions {
|
| 20 |
analysisMode: boolean;
|
| 21 |
-
/** 查询匹配时每 token 的
|
| 22 |
-
|
| 23 |
}
|
| 24 |
|
| 25 |
export interface SvgOverlayManagerOptions {
|
|
@@ -226,7 +226,7 @@ export class SvgOverlayManager {
|
|
| 226 |
group.appendChild(rect);
|
| 227 |
// 语义分析模式:在信息密度之上叠加语义高亮(黄色渐变)
|
| 228 |
const sem = this.options.semantic;
|
| 229 |
-
if (sem?.analysisMode && sem.
|
| 230 |
const overlayRect = this.createSemanticOverlayRect(pos, tokenIndex, rd);
|
| 231 |
group.appendChild(overlayRect);
|
| 232 |
}
|
|
@@ -326,7 +326,7 @@ export class SvgOverlayManager {
|
|
| 326 |
): SVGRectElement {
|
| 327 |
const rect = document.createElementNS('http://www.w3.org/2000/svg', 'rect');
|
| 328 |
const sem = this.options.semantic!;
|
| 329 |
-
const score = sem.
|
| 330 |
const color = score !== undefined ? getSemanticSimilarityColor(score) : 'transparent';
|
| 331 |
|
| 332 |
this.setRectGeometry(rect, pos);
|
|
|
|
| 18 |
/** Configuration for semantic-analysis mode */
|
| 19 |
export interface SemanticOverlayOptions {
|
| 20 |
analysisMode: boolean;
|
| 21 |
+
/** Per-token normalized raw score in [0,1], used when matching a query */
|
| 22 |
+
rawScoresNormed?: number[];
|
| 23 |
}
|
| 24 |
|
| 25 |
export interface SvgOverlayManagerOptions {
|
|
|
|
| 226 |
group.appendChild(rect);
|
| 227 |
// 语义分析模式:在信息密度之上叠加语义高亮(黄色渐变)
|
| 228 |
const sem = this.options.semantic;
|
| 229 |
+
if (sem?.analysisMode && sem.rawScoresNormed) {
|
| 230 |
const overlayRect = this.createSemanticOverlayRect(pos, tokenIndex, rd);
|
| 231 |
group.appendChild(overlayRect);
|
| 232 |
}
|
|
|
|
| 326 |
): SVGRectElement {
|
| 327 |
const rect = document.createElementNS('http://www.w3.org/2000/svg', 'rect');
|
| 328 |
const sem = this.options.semantic!;
|
| 329 |
+
const score = sem.rawScoresNormed![tokenIndex];
|
| 330 |
const color = score !== undefined ? getSemanticSimilarityColor(score) : 'transparent';
|
| 331 |
|
| 332 |
this.setRectGeometry(rect, pos);
|
client/src/ts/vis/ToolTip.ts
CHANGED
|
@@ -1,6 +1,3 @@
|
|
| 1 |
-
/** Tooltip 显示的 pred_topk 候选数量,与后端 runtime_config.DEFAULT_TOPK 保持一致 */
|
| 2 |
-
const DISPLAY_TOPK = 10;
|
| 3 |
-
|
| 4 |
import { D3Sel, calculateSurprisal, calculateSurprisalDensity } from "../utils/Util";
|
| 5 |
import { SimpleEventHandler } from "../utils/SimpleEventHandler";
|
| 6 |
import { GLTR_RenderItem } from "./GLTR_Text_Box";
|
|
@@ -8,6 +5,8 @@ import type { FrontendToken } from "../api/GLTR_API";
|
|
| 8 |
import * as d3 from "d3";
|
| 9 |
import { tr } from "../lang/i18n-lite";
|
| 10 |
import { getTokenRenderStyle } from "../utils/tokenRenderStyle";
|
|
|
|
|
|
|
| 11 |
|
| 12 |
const SEPARATOR = '─────────────';
|
| 13 |
|
|
@@ -18,147 +17,6 @@ function renderField(f: DetailField, dc: string, vc: string): string {
|
|
| 18 |
return `<span style="color: ${dc}">${f.label}</span> <span style="color: ${valColor}">${f.value}</span>`;
|
| 19 |
}
|
| 20 |
|
| 21 |
-
/**
|
| 22 |
-
* 处理候选词文本,与主token保持一致的处理方式
|
| 23 |
-
* 后端不再处理候选词,直接返回原始解码字符串,前端统一处理
|
| 24 |
-
* @param text 原始文本
|
| 25 |
-
* @returns 处理后的文本(特殊字符可视化 + HTML转义)
|
| 26 |
-
*/
|
| 27 |
-
function processCandidateText(text: string): string {
|
| 28 |
-
// 与主token保持一致:先可视化特殊字符,再HTML转义
|
| 29 |
-
return escapeHtml(visualizeSpecialChars(text));
|
| 30 |
-
}
|
| 31 |
-
|
| 32 |
-
/**
|
| 33 |
-
* HTML转义,防止XSS和HTML结构破坏
|
| 34 |
-
* @param text 原始文本
|
| 35 |
-
* @returns 转义后的文本
|
| 36 |
-
*/
|
| 37 |
-
function escapeHtml(text: string): string {
|
| 38 |
-
const div = document.createElement('div');
|
| 39 |
-
div.textContent = text;
|
| 40 |
-
return div.innerHTML;
|
| 41 |
-
}
|
| 42 |
-
|
| 43 |
-
/**
|
| 44 |
-
* 检查字符是否是空白字符
|
| 45 |
-
* @param char 单个字符
|
| 46 |
-
* @returns 是否是空白字符
|
| 47 |
-
*/
|
| 48 |
-
function isWhitespaceChar(char: string): boolean {
|
| 49 |
-
return /\p{White_Space}/u.test(char);
|
| 50 |
-
}
|
| 51 |
-
/**
|
| 52 |
-
* 检查字符是否可打印(常见字符范围)
|
| 53 |
-
* @param char 单个字符
|
| 54 |
-
* @returns 是否可打印
|
| 55 |
-
*/
|
| 56 |
-
function isPrintableChar(char: string): boolean {
|
| 57 |
-
|
| 58 |
-
// 首先排除所有空白字符
|
| 59 |
-
if (isWhitespaceChar(char)) {
|
| 60 |
-
return false;
|
| 61 |
-
}
|
| 62 |
-
|
| 63 |
-
const codePoint = char.codePointAt(0);
|
| 64 |
-
if (codePoint === undefined) return false;
|
| 65 |
-
|
| 66 |
-
// ASCII 可打印字符范围:32-126(空格到波浪号)
|
| 67 |
-
if (codePoint >= 32 && codePoint <= 126) {
|
| 68 |
-
return true;
|
| 69 |
-
}
|
| 70 |
-
|
| 71 |
-
// 常见 Unicode 范围(中文、日文、韩文、常用符号等)
|
| 72 |
-
// 基本多文种平面(BMP)中的常见字符范围
|
| 73 |
-
if (
|
| 74 |
-
(codePoint >= 0x00A0 && codePoint <= 0x00FF) || // 拉丁文补充
|
| 75 |
-
(codePoint >= 0x0100 && codePoint <= 0x017F) || // 拉丁文扩展-A
|
| 76 |
-
(codePoint >= 0x0180 && codePoint <= 0x024F) || // 拉丁文扩展-B
|
| 77 |
-
(codePoint >= 0x2000 && codePoint <= 0x206F) || // 常用标点
|
| 78 |
-
(codePoint >= 0x2070 && codePoint <= 0x209F) || // 上标和下标
|
| 79 |
-
(codePoint >= 0x20A0 && codePoint <= 0x20CF) || // 货币符号
|
| 80 |
-
(codePoint >= 0x2100 && codePoint <= 0x214F) || // 字母式符号
|
| 81 |
-
(codePoint >= 0x2190 && codePoint <= 0x21FF) || // 箭头
|
| 82 |
-
(codePoint >= 0x2200 && codePoint <= 0x22FF) || // 数学运算符
|
| 83 |
-
(codePoint >= 0x2300 && codePoint <= 0x23FF) || // 杂项技术符号
|
| 84 |
-
(codePoint >= 0x2400 && codePoint <= 0x243F) || // 控制图片
|
| 85 |
-
(codePoint >= 0x2E00 && codePoint <= 0x2E7F) || // 补充标点
|
| 86 |
-
(codePoint >= 0x3000 && codePoint <= 0x303F) || // CJK符号和标点
|
| 87 |
-
(codePoint >= 0x3040 && codePoint <= 0x309F) || // 平假名
|
| 88 |
-
(codePoint >= 0x30A0 && codePoint <= 0x30FF) || // 片假名
|
| 89 |
-
(codePoint >= 0x4E00 && codePoint <= 0x9FFF) || // CJK统一汉字
|
| 90 |
-
(codePoint >= 0xAC00 && codePoint <= 0xD7AF) || // 韩文音节
|
| 91 |
-
(codePoint >= 0xF900 && codePoint <= 0xFAFF) || // CJK兼容汉字
|
| 92 |
-
(codePoint >= 0xFF00 && codePoint <= 0xFFEF) // 全角字符
|
| 93 |
-
) {
|
| 94 |
-
return true;
|
| 95 |
-
}
|
| 96 |
-
|
| 97 |
-
return false;
|
| 98 |
-
}
|
| 99 |
-
|
| 100 |
-
/**
|
| 101 |
-
* 将特殊字符转换为可见的文本表示形式(方案3:文本形式,空格也转义)
|
| 102 |
-
* 对于无法显示的特殊字符,显示其 Unicode 编码
|
| 103 |
-
* @param text 原始文本
|
| 104 |
-
* @returns 转换后的文本,特殊字符已替换为文本标记或 Unicode 编码
|
| 105 |
-
*/
|
| 106 |
-
function visualizeSpecialChars(text: string): string {
|
| 107 |
-
// 先处理常见的特殊字符
|
| 108 |
-
let result = text
|
| 109 |
-
.replace(/\r\n/g, '[CRLF]') // Windows换行 -> [CRLF]
|
| 110 |
-
.replace(/\n/g, '[LF]') // 换行 -> [LF]
|
| 111 |
-
.replace(/\r/g, '[CR]') // 回车 -> [CR]
|
| 112 |
-
.replace(/\t/g, '[TAB]') // Tab -> [TAB]
|
| 113 |
-
.replace(/\u3000/g, '[FS]') // 全角空格 -> [FS]
|
| 114 |
-
.replace(/ /g, '·'); // 空格 -> ·
|
| 115 |
-
// .replace(/ /g, '␣'); // 空格 -> ␣
|
| 116 |
-
|
| 117 |
-
// 处理其他不可打印或特殊字符,显示 Unicode 编码
|
| 118 |
-
// 需要跳过已经转换的标记([...] 内的内容)
|
| 119 |
-
const processed: string[] = [];
|
| 120 |
-
let inBracket = false;
|
| 121 |
-
let bracketContent = '';
|
| 122 |
-
|
| 123 |
-
for (let i = 0; i < result.length; i++) {
|
| 124 |
-
const char = result[i];
|
| 125 |
-
|
| 126 |
-
if (char === '[') {
|
| 127 |
-
// 开始标记
|
| 128 |
-
inBracket = true;
|
| 129 |
-
bracketContent = '[';
|
| 130 |
-
processed.push(char);
|
| 131 |
-
} else if (char === ']' && inBracket) {
|
| 132 |
-
// 结束标记
|
| 133 |
-
bracketContent += ']';
|
| 134 |
-
processed.push(char);
|
| 135 |
-
inBracket = false;
|
| 136 |
-
bracketContent = '';
|
| 137 |
-
} else if (inBracket) {
|
| 138 |
-
// 在标记内,直接保留
|
| 139 |
-
bracketContent += char;
|
| 140 |
-
processed.push(char);
|
| 141 |
-
} else {
|
| 142 |
-
// 不在标记内,检查是否可打印
|
| 143 |
-
if (isPrintableChar(char)) {
|
| 144 |
-
processed.push(char);
|
| 145 |
-
} else {
|
| 146 |
-
// 显示 Unicode 编码
|
| 147 |
-
const codePoint = char.codePointAt(0);
|
| 148 |
-
if (codePoint !== undefined) {
|
| 149 |
-
const hexCode = codePoint.toString(16).toUpperCase().padStart(4, '0');
|
| 150 |
-
processed.push(`[U+${hexCode}]`);
|
| 151 |
-
} else {
|
| 152 |
-
processed.push(char); // 如果无法获取编码,保持原样
|
| 153 |
-
}
|
| 154 |
-
}
|
| 155 |
-
}
|
| 156 |
-
}
|
| 157 |
-
|
| 158 |
-
return processed.join('');
|
| 159 |
-
}
|
| 160 |
-
|
| 161 |
-
|
| 162 |
export class ToolTip {
|
| 163 |
private predictions: D3Sel;
|
| 164 |
private myDetail: D3Sel;
|
|
@@ -168,9 +26,6 @@ export class ToolTip {
|
|
| 168 |
private readonly numF = d3.format('.3f');
|
| 169 |
private readonly significantF = d3.format('.3g');
|
| 170 |
|
| 171 |
-
// 缓存:d3 scale(按 maxW 缓存)
|
| 172 |
-
private scaleCache = new Map<number, d3.ScaleLinear<number, number>>();
|
| 173 |
-
|
| 174 |
// 缓存:主题颜色
|
| 175 |
private themeColors = {
|
| 176 |
normalColor: '#333',
|
|
@@ -245,16 +100,6 @@ export class ToolTip {
|
|
| 245 |
};
|
| 246 |
}
|
| 247 |
|
| 248 |
-
/**
|
| 249 |
-
* 获取或创建 scale(带缓存)
|
| 250 |
-
*/
|
| 251 |
-
private _getScale(maxW: number): d3.ScaleLinear<number, number> {
|
| 252 |
-
if (!this.scaleCache.has(maxW)) {
|
| 253 |
-
this.scaleCache.set(maxW, d3.scaleLinear().domain([0, maxW]).range([0, 60]));
|
| 254 |
-
}
|
| 255 |
-
return this.scaleCache.get(maxW)!;
|
| 256 |
-
}
|
| 257 |
-
|
| 258 |
/**
|
| 259 |
* 获取真实的可见视口尺寸和偏移量
|
| 260 |
* 优先使用 visualViewport API(解决 iOS Safari 地址栏动态显示/隐藏问题)
|
|
@@ -434,15 +279,17 @@ export class ToolTip {
|
|
| 434 |
|
| 435 |
const tokenData = ri.tokenData as FrontendToken;
|
| 436 |
const s = ri.semantic;
|
| 437 |
-
const hasSemantic = s && (s.
|
| 438 |
const hasRealTopk = tokenData?.real_topk != null && Array.isArray(tokenData.real_topk);
|
| 439 |
const predTopk = tokenData?.pred_topk ?? [];
|
| 440 |
const hasPredictions = predTopk.length > 0;
|
| 441 |
|
| 442 |
-
// 1. 构建语义区块
|
| 443 |
const semanticRows: string[] = [];
|
| 444 |
if (hasSemantic && s) {
|
| 445 |
-
if (s.
|
|
|
|
|
|
|
| 446 |
if (s.rawScore !== undefined) semanticRows.push(renderField({ label: tr('raw score:'), value: d3.format('.6f')(s.rawScore), valueColor: false }, detailColor, valueColor));
|
| 447 |
}
|
| 448 |
|
|
@@ -485,20 +332,11 @@ export class ToolTip {
|
|
| 485 |
.style('display', 'block')
|
| 486 |
.html(() => `<div style="color:${detailColor};padding-left:5px;">${tr('Top-k data not available.')}</div>`);
|
| 487 |
} else {
|
| 488 |
-
const
|
| 489 |
-
this.predictions.
|
| 490 |
-
.
|
| 491 |
-
.
|
| 492 |
-
|
| 493 |
-
.html(d => {
|
| 494 |
-
const color = tokenData.raw != d[0] ? normalColor : selectedColor;
|
| 495 |
-
const bar = '<div style="display: table-cell; width:110px;padding-left:5px;">' +
|
| 496 |
-
`<div style="display:inline-block;width: ${wScale(d[1])}px;background-color:${color};height: 10px;"></div>` +
|
| 497 |
-
` <div style="display:inline-block;color: ${color};">${this.numF(d[1])}</div>` + "</div>";
|
| 498 |
-
const processedText = processCandidateText(d[0]);
|
| 499 |
-
const text = `<div style="display: table-cell;color: ${color};padding-right:5px;">${processedText}</div>`;
|
| 500 |
-
return `${bar} ${text}`;
|
| 501 |
-
});
|
| 502 |
}
|
| 503 |
}
|
| 504 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
import { D3Sel, calculateSurprisal, calculateSurprisalDensity } from "../utils/Util";
|
| 2 |
import { SimpleEventHandler } from "../utils/SimpleEventHandler";
|
| 3 |
import { GLTR_RenderItem } from "./GLTR_Text_Box";
|
|
|
|
| 5 |
import * as d3 from "d3";
|
| 6 |
import { tr } from "../lang/i18n-lite";
|
| 7 |
import { getTokenRenderStyle } from "../utils/tokenRenderStyle";
|
| 8 |
+
import { escapeHtml, visualizeSpecialChars } from "../utils/tokenDisplayUtils";
|
| 9 |
+
import { renderTopkChartHtml } from "../utils/topkChartUtils";
|
| 10 |
|
| 11 |
const SEPARATOR = '─────────────';
|
| 12 |
|
|
|
|
| 17 |
return `<span style="color: ${dc}">${f.label}</span> <span style="color: ${valColor}">${f.value}</span>`;
|
| 18 |
}
|
| 19 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 20 |
export class ToolTip {
|
| 21 |
private predictions: D3Sel;
|
| 22 |
private myDetail: D3Sel;
|
|
|
|
| 26 |
private readonly numF = d3.format('.3f');
|
| 27 |
private readonly significantF = d3.format('.3g');
|
| 28 |
|
|
|
|
|
|
|
|
|
|
| 29 |
// 缓存:主题颜色
|
| 30 |
private themeColors = {
|
| 31 |
normalColor: '#333',
|
|
|
|
| 100 |
};
|
| 101 |
}
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
/**
|
| 104 |
* 获取真实的可见视口尺寸和偏移量
|
| 105 |
* 优先使用 visualViewport API(解决 iOS Safari 地址栏动态显示/隐藏问题)
|
|
|
|
| 279 |
|
| 280 |
const tokenData = ri.tokenData as FrontendToken;
|
| 281 |
const s = ri.semantic;
|
| 282 |
+
const hasSemantic = s && (s.pwScore !== undefined || s.signalProb !== undefined || s.rawScoreNormed !== undefined || s.rawScore !== undefined);
|
| 283 |
const hasRealTopk = tokenData?.real_topk != null && Array.isArray(tokenData.real_topk);
|
| 284 |
const predTopk = tokenData?.pred_topk ?? [];
|
| 285 |
const hasPredictions = predTopk.length > 0;
|
| 286 |
|
| 287 |
+
// 1. 构建语义区块(pw score = raw_score_normed × P_pw,P_pw: x≤threshold 为 0,x>threshold 为 1)
|
| 288 |
const semanticRows: string[] = [];
|
| 289 |
if (hasSemantic && s) {
|
| 290 |
+
if (s.pwScore !== undefined) semanticRows.push(renderField({ label: tr('pw score:'), value: this.numF(s.pwScore) }, detailColor, valueColor));
|
| 291 |
+
if (s.signalProb !== undefined) semanticRows.push(renderField({ label: tr('signal probability:'), value: this.numF(s.signalProb) }, detailColor, valueColor));
|
| 292 |
+
if (s.rawScoreNormed !== undefined) semanticRows.push(renderField({ label: tr('raw score normed:'), value: this.numF(s.rawScoreNormed) }, detailColor, valueColor));
|
| 293 |
if (s.rawScore !== undefined) semanticRows.push(renderField({ label: tr('raw score:'), value: d3.format('.6f')(s.rawScore), valueColor: false }, detailColor, valueColor));
|
| 294 |
}
|
| 295 |
|
|
|
|
| 332 |
.style('display', 'block')
|
| 333 |
.html(() => `<div style="color:${detailColor};padding-left:5px;">${tr('Top-k data not available.')}</div>`);
|
| 334 |
} else {
|
| 335 |
+
const topkData = predTopk.slice(0, 10).map(([token, prob]) => ({ token, prob }));
|
| 336 |
+
this.predictions.html(renderTopkChartHtml(topkData, {
|
| 337 |
+
selectedToken: tokenData.raw,
|
| 338 |
+
numFormat: this.numF,
|
| 339 |
+
}));
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
}
|
| 341 |
}
|
| 342 |
|
math_demo/.streamlit/config.toml
ADDED
|
@@ -0,0 +1,2 @@
|
|
|
|
|
|
|
|
|
|
| 1 |
+
[server]
|
| 2 |
+
headless = true
|
math_demo/requirements.txt
ADDED
|
@@ -0,0 +1,4 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
streamlit>=1.28.0
|
| 2 |
+
matplotlib>=3.7.0
|
| 3 |
+
numpy>=1.24.0
|
| 4 |
+
scipy>=1.10.0
|
model_paths.py
CHANGED
|
@@ -8,12 +8,12 @@ DEFAULT_MODEL = "qwen3.0-0.6b"
|
|
| 8 |
DEFAULT_SEMANTIC_MODEL = "qwen3-0.6b-instruct"
|
| 9 |
|
| 10 |
# Semantic analysis 模型(instruct 版本,用于 chat template 与指令理解)
|
| 11 |
-
# 与 qwen3.0-14b 同级:0.6B → 1.7B → 4B → 8B → 14B
|
| 12 |
SEMANTIC_MODEL_PATHS = {
|
| 13 |
"qwen3-0.6b-instruct": "Qwen/Qwen3-0.6B",
|
| 14 |
"qwen3-1.7b-instruct": "Qwen/Qwen3-1.7B",
|
| 15 |
# "qwen3-4b-instruct": "Qwen/Qwen3-4B",
|
| 16 |
"qwen3-4b-instruct": "Qwen/Qwen3-4B-Instruct-2507",
|
|
|
|
| 17 |
}
|
| 18 |
|
| 19 |
# 所有可用模型的 HuggingFace 路径映射
|
|
@@ -26,6 +26,7 @@ MODEL_PATHS = {
|
|
| 26 |
'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
|
| 27 |
'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
|
| 28 |
'qwen3.0-30b-a3b': 'Qwen/Qwen3-30B-A3B-Base',
|
|
|
|
| 29 |
'qwen2.5-32b': 'Qwen/Qwen2.5-32B',
|
| 30 |
'qwen2.5-72b': 'Qwen/Qwen2.5-72B',
|
| 31 |
|
|
|
|
| 8 |
DEFAULT_SEMANTIC_MODEL = "qwen3-0.6b-instruct"
|
| 9 |
|
| 10 |
# Semantic analysis 模型(instruct 版本,用于 chat template 与指令理解)
|
|
|
|
| 11 |
SEMANTIC_MODEL_PATHS = {
|
| 12 |
"qwen3-0.6b-instruct": "Qwen/Qwen3-0.6B",
|
| 13 |
"qwen3-1.7b-instruct": "Qwen/Qwen3-1.7B",
|
| 14 |
# "qwen3-4b-instruct": "Qwen/Qwen3-4B",
|
| 15 |
"qwen3-4b-instruct": "Qwen/Qwen3-4B-Instruct-2507",
|
| 16 |
+
"qwen3.5-0.8b-instruct": "Qwen/Qwen3.5-0.8B",
|
| 17 |
}
|
| 18 |
|
| 19 |
# 所有可用模型的 HuggingFace 路径映射
|
|
|
|
| 26 |
'qwen3.0-8b': 'Qwen/Qwen3-8B-Base',
|
| 27 |
'qwen3.0-14b': 'Qwen/Qwen3-14B-Base',
|
| 28 |
'qwen3.0-30b-a3b': 'Qwen/Qwen3-30B-A3B-Base',
|
| 29 |
+
'qwen3.5-0.8b': 'Qwen/Qwen3.5-0.8B-Base',
|
| 30 |
'qwen2.5-32b': 'Qwen/Qwen2.5-32B',
|
| 31 |
'qwen2.5-72b': 'Qwen/Qwen2.5-72B',
|
| 32 |
|
scripts/eval_semantic_submodes.py
CHANGED
|
@@ -52,7 +52,7 @@ DEFAULT_API_BASE = "http://localhost:5001"
|
|
| 52 |
def analyze_semantic_http(api_base: str, query: str, text: str, submode: str, token: Optional[str] = None, prob_weighted: Optional[bool] = None, timeout: int = 300) -> dict:
|
| 53 |
"""通过 HTTP 调用 analyze-semantic 接口"""
|
| 54 |
url = f"{api_base.rstrip('/')}/api/analyze-semantic"
|
| 55 |
-
payload: dict = {"query": query, "text": text, "submode": submode}
|
| 56 |
if prob_weighted is not None:
|
| 57 |
payload["prob_weighted"] = prob_weighted
|
| 58 |
headers = {"Content-Type": "application/json"}
|
|
@@ -87,8 +87,9 @@ def run_eval(api_base: str, submode: str, test_cases: list, token: Optional[str]
|
|
| 87 |
})
|
| 88 |
continue
|
| 89 |
|
| 90 |
-
|
| 91 |
-
|
|
|
|
| 92 |
token_attention = res.get("token_attention", [])
|
| 93 |
|
| 94 |
# 0-max 归一化: score / max ∈ [0, 1],最大值归一为 1
|
|
|
|
| 52 |
def analyze_semantic_http(api_base: str, query: str, text: str, submode: str, token: Optional[str] = None, prob_weighted: Optional[bool] = None, timeout: int = 300) -> dict:
|
| 53 |
"""通过 HTTP 调用 analyze-semantic 接口"""
|
| 54 |
url = f"{api_base.rstrip('/')}/api/analyze-semantic"
|
| 55 |
+
payload: dict = {"query": query, "text": text, "submode": submode, "debug_info": True}
|
| 56 |
if prob_weighted is not None:
|
| 57 |
payload["prob_weighted"] = prob_weighted
|
| 58 |
headers = {"Content-Type": "application/json"}
|
|
|
|
| 87 |
})
|
| 88 |
continue
|
| 89 |
|
| 90 |
+
di = res.get("debug_info", {})
|
| 91 |
+
topk_tokens = di.get("topk_tokens", [])
|
| 92 |
+
topk_probs = di.get("topk_probs", [])
|
| 93 |
token_attention = res.get("token_attention", [])
|
| 94 |
|
| 95 |
# 0-max 归一化: score / max ∈ [0, 1],最大值归一为 1
|
server.yaml
CHANGED
|
@@ -503,16 +503,23 @@ paths:
|
|
| 503 |
score:
|
| 504 |
type: number
|
| 505 |
description: 对 prompt 区域的平均关注度
|
| 506 |
-
|
| 507 |
-
type:
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 516 |
message:
|
| 517 |
type: string
|
| 518 |
400:
|
|
|
|
| 503 |
score:
|
| 504 |
type: number
|
| 505 |
description: 对 prompt 区域的平均关注度
|
| 506 |
+
debug_info:
|
| 507 |
+
type: object
|
| 508 |
+
description: debug_info=true 时返回
|
| 509 |
+
properties:
|
| 510 |
+
abbrev:
|
| 511 |
+
type: string
|
| 512 |
+
description: 推理原文缩写
|
| 513 |
+
topk_tokens:
|
| 514 |
+
type: array
|
| 515 |
+
items:
|
| 516 |
+
type: string
|
| 517 |
+
description: top10 预测 token 列表
|
| 518 |
+
topk_probs:
|
| 519 |
+
type: array
|
| 520 |
+
items:
|
| 521 |
+
type: number
|
| 522 |
+
description: top10 对应的概率
|
| 523 |
message:
|
| 524 |
type: string
|
| 525 |
400:
|