Spaces:
Running
Running
Commit
·
350392a
1
Parent(s):
25a9607
Checkpoint before tooltip palette update
Browse files- .claude/settings.local.json +3 -1
- README.md +21 -0
- app.py +13 -3
- core/escaping.py +46 -0
- core/render_model.py +59 -0
- core/segmentation.py +24 -0
- tests/_out/stress.output.html +0 -0
- tests/_out/stress.render_model.json +0 -0
- tests/compare_html.py +46 -0
- tests/compare_snapshots.py +97 -0
- tests/generate_snapshots.py +63 -0
- tests/golden/README.md +21 -0
- tests/golden/stress.output.html +0 -0
- tests/golden/stress.render_model.json +0 -0
- tests/samples/stress_inputs.txt +27 -0
- tests/visual_smoke.py +25 -0
- visualization/assets/main.css +250 -0
- visualization/assets/main.js +615 -0
- visualization/html_generator.py +220 -731
- visualization/render.py +16 -0
- visualization/templates/page.html.tmpl +20 -0
.claude/settings.local.json
CHANGED
|
@@ -9,7 +9,9 @@
|
|
| 9 |
"Bash(git commit -m \"$\\(cat <<''EOF''\nFix Gradio compatibility for HuggingFace Spaces\n\n- Upgrade gradio to >=5.0.0 to fix API schema bug\n- Add server_name and server_port to demo.launch\\(\\)\n\nCo-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
|
| 10 |
"Bash(git commit:*)",
|
| 11 |
"Bash(git reset:*)",
|
| 12 |
-
"Bash(and top-10 predictions\" to better reflect what users see in the tooltip.\nAlso updated color legend to match the swapped model positions.\n\nCo-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>\nEOF\n\\)\")"
|
|
|
|
|
|
|
| 13 |
]
|
| 14 |
}
|
| 15 |
}
|
|
|
|
| 9 |
"Bash(git commit -m \"$\\(cat <<''EOF''\nFix Gradio compatibility for HuggingFace Spaces\n\n- Upgrade gradio to >=5.0.0 to fix API schema bug\n- Add server_name and server_port to demo.launch\\(\\)\n\nCo-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
|
| 10 |
"Bash(git commit:*)",
|
| 11 |
"Bash(git reset:*)",
|
| 12 |
+
"Bash(and top-10 predictions\" to better reflect what users see in the tooltip.\nAlso updated color legend to match the swapped model positions.\n\nCo-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
|
| 13 |
+
"Bash(git fetch:*)",
|
| 14 |
+
"Bash(git pull:*)"
|
| 15 |
]
|
| 16 |
}
|
| 17 |
}
|
README.md
CHANGED
|
@@ -60,6 +60,27 @@ pip install -r requirements.txt
|
|
| 60 |
python app.py
|
| 61 |
```
|
| 62 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
## Requirements
|
| 64 |
|
| 65 |
- CUDA-capable GPU (16GB+ VRAM recommended)
|
|
|
|
| 60 |
python app.py
|
| 61 |
```
|
| 62 |
|
| 63 |
+
## Regression Checks (Recommended)
|
| 64 |
+
|
| 65 |
+
Run these after UI or rendering changes:
|
| 66 |
+
|
| 67 |
+
```bash
|
| 68 |
+
# Generate baseline snapshots
|
| 69 |
+
conda run -n torch2 python tests/generate_snapshots.py --out tests/golden
|
| 70 |
+
|
| 71 |
+
# Generate candidate snapshots
|
| 72 |
+
conda run -n torch2 python tests/generate_snapshots.py --out tests/_out
|
| 73 |
+
|
| 74 |
+
# Compare render-model JSON
|
| 75 |
+
conda run -n torch2 python tests/compare_snapshots.py --baseline tests/golden/stress.render_model.json --candidate tests/_out/stress.render_model.json
|
| 76 |
+
|
| 77 |
+
# Compare HTML output
|
| 78 |
+
conda run -n torch2 python tests/compare_html.py --baseline tests/golden/stress.output.html --candidate tests/_out/stress.output.html
|
| 79 |
+
|
| 80 |
+
# Optional: visual smoke placeholder
|
| 81 |
+
conda run -n torch2 python tests/visual_smoke.py --html tests/_out/stress.output.html
|
| 82 |
+
```
|
| 83 |
+
|
| 84 |
## Requirements
|
| 85 |
|
| 86 |
- CUDA-capable GPU (16GB+ VRAM recommended)
|
app.py
CHANGED
|
@@ -7,6 +7,7 @@ Compare byte-level prediction performance between Qwen3-1.7B-Base and RWKV7-G1C-
|
|
| 7 |
import gc
|
| 8 |
import os
|
| 9 |
from pathlib import Path
|
|
|
|
| 10 |
|
| 11 |
import gradio as gr
|
| 12 |
import torch
|
|
@@ -27,7 +28,7 @@ MODELS_DIR = SCRIPT_DIR / "models"
|
|
| 27 |
SUPPORT_DIR = SCRIPT_DIR / "support"
|
| 28 |
|
| 29 |
# Text length limits
|
| 30 |
-
MAX_TEXT_LENGTH =
|
| 31 |
MIN_TEXT_LENGTH = 1
|
| 32 |
|
| 33 |
# Global model cache
|
|
@@ -120,7 +121,7 @@ def validate_input(text: str) -> tuple[bool, str]:
|
|
| 120 |
if not text or not text.strip():
|
| 121 |
return False, "Please enter some text to analyze."
|
| 122 |
|
| 123 |
-
text = text.strip()
|
| 124 |
|
| 125 |
if len(text) < MIN_TEXT_LENGTH:
|
| 126 |
return False, f"Text is too short. Minimum {MIN_TEXT_LENGTH} characters required."
|
|
@@ -299,7 +300,15 @@ def get_default_example():
|
|
| 299 |
|
| 300 |
|
| 301 |
# Build Gradio UI
|
| 302 |
-
with gr.Blocks(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 303 |
gr.HTML(
|
| 304 |
"""
|
| 305 |
<div style="text-align: center; margin-bottom: 20px;">
|
|
@@ -324,6 +333,7 @@ with gr.Blocks(title="Compression-Lens: RWKV-7 vs Qwen3", theme=gr.themes.Soft()
|
|
| 324 |
placeholder=f"Enter text to analyze (max {MAX_TEXT_LENGTH} characters)...",
|
| 325 |
lines=10,
|
| 326 |
max_lines=20,
|
|
|
|
| 327 |
)
|
| 328 |
|
| 329 |
with gr.Row():
|
|
|
|
| 7 |
import gc
|
| 8 |
import os
|
| 9 |
from pathlib import Path
|
| 10 |
+
import unicodedata
|
| 11 |
|
| 12 |
import gradio as gr
|
| 13 |
import torch
|
|
|
|
| 28 |
SUPPORT_DIR = SCRIPT_DIR / "support"
|
| 29 |
|
| 30 |
# Text length limits
|
| 31 |
+
MAX_TEXT_LENGTH = 16384
|
| 32 |
MIN_TEXT_LENGTH = 1
|
| 33 |
|
| 34 |
# Global model cache
|
|
|
|
| 121 |
if not text or not text.strip():
|
| 122 |
return False, "Please enter some text to analyze."
|
| 123 |
|
| 124 |
+
text = unicodedata.normalize("NFC", text).strip()
|
| 125 |
|
| 126 |
if len(text) < MIN_TEXT_LENGTH:
|
| 127 |
return False, f"Text is too short. Minimum {MIN_TEXT_LENGTH} characters required."
|
|
|
|
| 300 |
|
| 301 |
|
| 302 |
# Build Gradio UI
|
| 303 |
+
with gr.Blocks(
|
| 304 |
+
title="Compression-Lens: RWKV-7 vs Qwen3",
|
| 305 |
+
theme=gr.themes.Soft(),
|
| 306 |
+
css="""
|
| 307 |
+
#input-text textarea {
|
| 308 |
+
font-family: Consolas, 'Courier New', monospace;
|
| 309 |
+
}
|
| 310 |
+
""",
|
| 311 |
+
) as demo:
|
| 312 |
gr.HTML(
|
| 313 |
"""
|
| 314 |
<div style="text-align: center; margin-bottom: 20px;">
|
|
|
|
| 333 |
placeholder=f"Enter text to analyze (max {MAX_TEXT_LENGTH} characters)...",
|
| 334 |
lines=10,
|
| 335 |
max_lines=20,
|
| 336 |
+
elem_id="input-text",
|
| 337 |
)
|
| 338 |
|
| 339 |
with gr.Row():
|
core/escaping.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Unified escaping helpers for HTML rendering.
|
| 3 |
+
|
| 4 |
+
Keep all HTML/attribute/script escaping logic in one place to avoid divergence.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import json
|
| 8 |
+
from typing import Any
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def escape_text_node(text: str) -> str:
    """Escape text for insertion as an HTML text node.

    Replaces the three characters that are significant in text-node
    context (``&``, ``<``, ``>``) with their named entities.  Quotes are
    left untouched because they carry no meaning outside attribute values.

    Args:
        text: Raw text to escape; ``None`` is tolerated.

    Returns:
        The escaped text, or ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    return (
        # "&" must be handled first so the entities produced by the later
        # replacements are not escaped a second time.
        text.replace("&", "&amp;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
    )
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def escape_attr(text: str) -> str:
    """Escape text for safe placement in HTML attribute values.

    In addition to the markup characters (``&``, ``<``, ``>``) this escapes
    both quote styles, so the result is safe inside single- or
    double-quoted attributes.  Newline, carriage return and tab are encoded
    as numeric character references so they stay visible in the markup and
    keep the attribute on one line.

    Args:
        text: Raw attribute value; ``None`` is tolerated.

    Returns:
        The escaped value, or ``""`` when ``text`` is ``None``.
    """
    if text is None:
        return ""
    return (
        # "&" must be handled first so the entities produced by the later
        # replacements are not escaped a second time.
        text.replace("&", "&amp;")
        .replace('"', "&quot;")
        .replace("'", "&#39;")
        .replace("<", "&lt;")
        .replace(">", "&gt;")
        .replace("\n", "&#10;")
        .replace("\r", "&#13;")
        .replace("\t", "&#9;")
    )
|
| 36 |
+
|
| 37 |
+
|
| 38 |
+
def escape_json_for_script(value: Any) -> str:
    """Serialize ``value`` as JSON that is safe to embed inside a ``<script>`` tag.

    The output is still valid JSON: every replacement swaps a raw character
    for its ``\\uXXXX`` escape, and these characters can only occur inside
    JSON string literals, where that escape decodes back to the same
    character.

    Args:
        value: Any JSON-serializable object.

    Returns:
        JSON text containing no ``<``, ``>``, ``&``, U+2028 or U+2029.
    """
    serialized = json.dumps(value, ensure_ascii=False)
    return (
        # Prevent closing tags or HTML entities from breaking script context.
        serialized.replace("<", "\\u003c")
        .replace(">", "\\u003e")
        .replace("&", "\\u0026")
        # ensure_ascii=False leaves these line/paragraph separators raw;
        # pre-ES2019 JavaScript engines treat them as line terminators,
        # which breaks a string literal when the JSON is inlined as script.
        .replace("\u2028", "\\u2028")
        .replace("\u2029", "\\u2029")
    )
|
core/render_model.py
ADDED
|
@@ -0,0 +1,59 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Render model definitions for visualization.
|
| 3 |
+
|
| 4 |
+
This module defines a stable, serializable intermediate representation
|
| 5 |
+
between model outputs and HTML rendering.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from dataclasses import dataclass, field, asdict
|
| 9 |
+
from typing import Any, Dict, List, Optional
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
def _has_control_chars(text: str) -> bool:
|
| 13 |
+
if not text:
|
| 14 |
+
return False
|
| 15 |
+
for ch in text:
|
| 16 |
+
code = ord(ch)
|
| 17 |
+
if code < 32 or code == 127:
|
| 18 |
+
return True
|
| 19 |
+
return False
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
@dataclass
class TokenDisplay:
    """Text shown for a token plus a tag describing how it was classified."""

    # The token text as passed to build_display (stored unmodified).
    text: str
    kind: str  # "normal" | "control" | "raw"
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
@dataclass
class TokenInfo:
    """Per-token record of the render model.

    NOTE(review): field meanings below are inferred from names and types
    only; the producer (visualization/html_generator.py) is not visible
    here, so confirm against it before relying on them.
    """

    # presumably offsets into the UTF-8 encoding of the text — TODO confirm
    # whether byte_end is exclusive
    byte_start: int
    byte_end: int
    display: TokenDisplay
    is_word: bool
    word_id: Optional[int] = None
    word_key: Optional[str] = None
    bytes_hex: str = ""
    # The mapping fields use default_factory so each instance gets its own dict.
    compression: Dict[str, str] = field(default_factory=dict)
    model_tokens: Dict[str, List[List[Any]]] = field(default_factory=dict)
    loss: Dict[str, float] = field(default_factory=dict)
    topk: Dict[str, Any] = field(default_factory=dict)
    tuned_delta: float = 0.0
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
@dataclass
class RenderModel:
    """Top-level render model: the text, its tokens and free-form metadata."""

    text: str
    tokens: List[TokenInfo]
    # default_factory gives every instance its own dict.
    meta: Dict[str, Any] = field(default_factory=dict)

    def to_dict(self) -> Dict[str, Any]:
        """Return this model as plain dicts/lists via ``dataclasses.asdict``.

        ``asdict`` recurses into nested dataclasses, so the contained
        ``TokenInfo`` / ``TokenDisplay`` objects are converted as well.
        """
        return asdict(self)
|
| 52 |
+
|
| 53 |
+
|
| 54 |
+
def build_display(text: str, is_raw: bool = False) -> TokenDisplay:
    """Wrap ``text`` in a TokenDisplay tagged "raw", "control" or "normal".

    ``is_raw`` takes precedence; otherwise the tag is "control" when
    ``_has_control_chars(text)`` is true and "normal" when it is not.
    """
    if is_raw:
        tag = "raw"
    elif _has_control_chars(text):
        tag = "control"
    else:
        tag = "normal"
    return TokenDisplay(text=text, kind=tag)
|
core/segmentation.py
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Fallback segmentation utilities.
|
| 3 |
+
|
| 4 |
+
Used for offline tests or snapshot generation when model tokenizers
|
| 5 |
+
are unavailable.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
from typing import Dict, List
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def fallback_token_info(text: str) -> Dict[str, List]:
    """Build minimal token info with one boundary per code point.

    ``common_boundaries`` holds the cumulative UTF-8 byte offsets of every
    character in ``text``, starting at 0 and ending at the total encoded
    length.  The per-model token lists and byte maps are returned empty.
    """
    offsets = [0]
    for char in text:
        # Each boundary is the previous one advanced by this character's
        # encoded width.
        offsets.append(offsets[-1] + len(char.encode("utf-8")))

    info = {
        "common_boundaries": offsets,
        "qwen_tokens": [],
        "rwkv_tokens": [],
        "byte_to_qwen": {},
        "byte_to_rwkv": {},
    }
    return info
|
tests/_out/stress.output.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/_out/stress.render_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/compare_html.py
ADDED
|
@@ -0,0 +1,46 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Compare two HTML files with a unified diff.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import difflib
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
def normalize(text: str, ignore_whitespace: bool) -> str:
    """Canonicalize line endings and optionally collapse all whitespace.

    CRLF and bare CR become LF.  With ``ignore_whitespace`` set, every run
    of whitespace (newlines included) collapses to a single space and
    leading/trailing whitespace is dropped.
    """
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    return " ".join(unified.split()) if ignore_whitespace else unified
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
def main() -> int:
    """Diff two HTML files and print a (possibly truncated) unified diff.

    Returns:
        0 when the normalized files are identical, 1 otherwise.
    """
    cli = argparse.ArgumentParser(description="Compare HTML files.")
    cli.add_argument("--baseline", type=Path, required=True, help="Baseline HTML path")
    cli.add_argument("--candidate", type=Path, required=True, help="Candidate HTML path")
    cli.add_argument("--ignore-whitespace", action="store_true", help="Normalize whitespace before diff")
    cli.add_argument("--max-lines", type=int, default=200, help="Max diff lines to print")
    opts = cli.parse_args()

    def load_lines(path: Path):
        # Read, normalize, then split keeping line endings for unified_diff.
        content = normalize(path.read_text(encoding="utf-8"), opts.ignore_whitespace)
        return content.splitlines(keepends=True)

    baseline_lines = load_lines(opts.baseline)
    candidate_lines = load_lines(opts.candidate)

    delta = list(
        difflib.unified_diff(
            baseline_lines,
            candidate_lines,
            fromfile=str(opts.baseline),
            tofile=str(opts.candidate),
        )
    )
    if not delta:
        print("No differences found.")
        return 0

    print("Differences found:")
    for entry in delta[: opts.max_lines]:
        print(entry, end="")
    if len(delta) > opts.max_lines:
        print(f"\n... truncated ({len(delta)} total diff lines).")
    return 1
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
if __name__ == "__main__":
|
| 46 |
+
raise SystemExit(main())
|
tests/compare_snapshots.py
ADDED
|
@@ -0,0 +1,97 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Compare two render-model JSON snapshots and report differences.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
import argparse
|
| 6 |
+
import json
|
| 7 |
+
import math
|
| 8 |
+
import sys
|
| 9 |
+
from pathlib import Path
|
| 10 |
+
from typing import Any, List
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def load_json(path: Path) -> Any:
    """Read the UTF-8 file at ``path`` and return its parsed JSON content."""
    return json.loads(path.read_text(encoding="utf-8"))
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
def is_number(value: Any) -> bool:
    """Return True for int/float values, excluding bool."""
    # bool is a subclass of int, so it has to be ruled out explicitly.
    if isinstance(value, bool):
        return False
    return isinstance(value, (int, float))
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def nearly_equal(a: float, b: float, tol: float) -> bool:
    """Return True when ``a`` and ``b`` are equal within absolute tolerance ``tol``.

    Two NaNs are treated as equal, so a snapshot that legitimately
    contains NaN does not report a diff against itself.  Equal infinities
    are also equal; opposite-signed infinities are not.

    Args:
        a: Baseline value.
        b: Candidate value.
        tol: Maximum allowed absolute difference.
    """
    # Exact match first: inf - inf is NaN, so without this shortcut two
    # identical infinities would fail the tolerance comparison below.
    if a == b:
        return True
    if math.isnan(a) and math.isnan(b):
        return True
    return abs(a - b) <= tol
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
def compare(a: Any, b: Any, path: str, diffs: List[str], tol: float, max_diffs: int) -> None:
    """Recursively diff two JSON-like values, appending messages to ``diffs``.

    Args:
        a: Baseline value.
        b: Candidate value.
        path: JSONPath-like location of the values, used as message prefix.
        diffs: Output list; mutated in place.
        tol: Absolute tolerance for numeric leaves; only applied when > 0.
        max_diffs: Stop collecting once ``diffs`` reaches this length.
    """

    def limit_reached() -> bool:
        return len(diffs) >= max_diffs

    if limit_reached():
        return

    # Values of different types are reported once and not descended into.
    if type(a) is not type(b):
        diffs.append(f"{path}: type {type(a).__name__} != {type(b).__name__}")
        return

    if isinstance(a, dict):
        baseline_keys = set(a.keys())
        candidate_keys = set(b.keys())

        for name in sorted(baseline_keys - candidate_keys):
            diffs.append(f"{path}.{name}: missing in candidate")
            if limit_reached():
                return
        for name in sorted(candidate_keys - baseline_keys):
            diffs.append(f"{path}.{name}: extra in candidate")
            if limit_reached():
                return
        for name in sorted(baseline_keys & candidate_keys):
            compare(a[name], b[name], f"{path}.{name}", diffs, tol, max_diffs)
            if limit_reached():
                return
        return

    if isinstance(a, list):
        if len(a) != len(b):
            diffs.append(f"{path}: list length {len(a)} != {len(b)}")
        # zip stops at the shorter list, so only the common prefix is compared.
        for position, (left, right) in enumerate(zip(a, b)):
            compare(left, right, f"{path}[{position}]", diffs, tol, max_diffs)
            if limit_reached():
                return
        return

    tolerant = tol > 0 and is_number(a) and is_number(b)
    if tolerant:
        if not nearly_equal(float(a), float(b), tol):
            diffs.append(f"{path}: {a} != {b} (tol={tol})")
    elif a != b:
        diffs.append(f"{path}: {a!r} != {b!r}")
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def main() -> int:
    """Compare two render-model JSON snapshots given on the command line.

    Returns:
        1 when differences were found (they are printed), 0 otherwise.
    """
    cli = argparse.ArgumentParser(description="Compare render-model JSON snapshots.")
    cli.add_argument("--baseline", type=Path, required=True, help="Baseline JSON path")
    cli.add_argument("--candidate", type=Path, required=True, help="Candidate JSON path")
    cli.add_argument("--float-tol", type=float, default=0.0, help="Float comparison tolerance")
    cli.add_argument("--max-diffs", type=int, default=200, help="Max diffs to display")
    opts = cli.parse_args()

    expected = load_json(opts.baseline)
    actual = load_json(opts.candidate)

    found: List[str] = []
    compare(expected, actual, "$", found, opts.float_tol, opts.max_diffs)

    if not found:
        print("No differences found.")
        return 0

    print(f"Differences found: {len(found)}")
    for entry in found:
        print(entry)
    return 1
|
| 94 |
+
|
| 95 |
+
|
| 96 |
+
if __name__ == "__main__":
|
| 97 |
+
sys.exit(main())
|
tests/generate_snapshots.py
ADDED
|
@@ -0,0 +1,63 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Generate offline HTML + render-model snapshots for stress inputs.
|
| 3 |
+
|
| 4 |
+
Uses fallback segmentation (no model downloads).
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
import json
|
| 9 |
+
import sys
|
| 10 |
+
from pathlib import Path
|
| 11 |
+
|
| 12 |
+
ROOT = Path(__file__).resolve().parents[1]
|
| 13 |
+
sys.path.insert(0, str(ROOT))
|
| 14 |
+
|
| 15 |
+
from core.segmentation import fallback_token_info
|
| 16 |
+
from visualization.html_generator import generate_comparison_html
|
| 17 |
+
SAMPLES_DIR = ROOT / "tests" / "samples"
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
def main() -> None:
    """Render the stress sample with dummy losses and write HTML + JSON snapshots.

    Uses ``fallback_token_info`` in place of tokenizer output and passes no
    tokenizers or top-k predictions.  Both outputs are written to the
    ``--out`` directory, which is created if needed.
    """
    cli = argparse.ArgumentParser(description="Generate offline snapshot HTML/JSON.")
    cli.add_argument("--out", type=Path, default=ROOT / "tests" / "golden", help="Output directory")
    opts = cli.parse_args()

    sample_text = (SAMPLES_DIR / "stress_inputs.txt").read_text(encoding="utf-8")

    # One constant dummy loss per UTF-8 byte for each model.
    n_bytes = len(sample_text.encode("utf-8"))

    html, render_model = generate_comparison_html(
        text=sample_text,
        byte_losses_a=[0.5] * n_bytes,
        byte_losses_b=[0.6] * n_bytes,
        model_a_name="RWKV7 (dummy)",
        model_b_name="Qwen3 (dummy)",
        topk_predictions_a=None,
        topk_predictions_b=None,
        tokenizer_a=None,
        tokenizer_b=None,
        model_type_a="rwkv7",
        model_type_b="hf",
        token_info_override=fallback_token_info(sample_text),
        return_render_model=True,
    )

    target_dir = opts.out
    target_dir.mkdir(parents=True, exist_ok=True)
    html_path = target_dir / "stress.output.html"
    json_path = target_dir / "stress.render_model.json"

    html_path.write_text(html, encoding="utf-8")
    with json_path.open("w", encoding="utf-8") as handle:
        json.dump(render_model, handle, ensure_ascii=False, indent=2)

    for written in (html_path, json_path):
        print(f"Wrote {written}")
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
if __name__ == "__main__":
|
| 63 |
+
main()
|
tests/golden/README.md
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Golden Snapshots
|
| 2 |
+
|
| 3 |
+
This folder stores render-model JSON and HTML snapshots for regression checks.
|
| 4 |
+
|
| 5 |
+
Suggested naming:
|
| 6 |
+
- sample_01.render_model.json
|
| 7 |
+
- sample_01.output.html
|
| 8 |
+
|
| 9 |
+
Keep these files in sync with `tests/samples/`.
|
| 10 |
+
|
| 11 |
+
Generate snapshots with:
|
| 12 |
+
`conda run -n torch2 python tests/generate_snapshots.py --out tests/golden`
|
| 13 |
+
|
| 14 |
+
Generate a candidate snapshot:
|
| 15 |
+
`conda run -n torch2 python tests/generate_snapshots.py --out tests/_out`
|
| 16 |
+
|
| 17 |
+
Compare snapshots:
|
| 18 |
+
`conda run -n torch2 python tests/compare_snapshots.py --baseline tests/golden/stress.render_model.json --candidate tests/_out/stress.render_model.json`
|
| 19 |
+
|
| 20 |
+
Compare HTML output:
|
| 21 |
+
`conda run -n torch2 python tests/compare_html.py --baseline tests/golden/stress.output.html --candidate tests/_out/stress.output.html`
|
tests/golden/stress.output.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/golden/stress.render_model.json
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
tests/samples/stress_inputs.txt
ADDED
|
@@ -0,0 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BEGIN TEST
|
| 2 |
+
Leading spaces (2) + trailing spaces (2)··
|
| 3 |
+
TAB_LITERAL: [START] [END] (there is a real TAB between)
|
| 4 |
+
|
| 5 |
+
Raw escape-like text: \n \r \t \\n \\r \\t \\x00 \\x1f \\x7f \\xff \u0000 \u202E \u200F \u200E
|
| 6 |
+
Bytes-ish hex: e5 bd 93 e7 84 b6 | 00 1f 7f ff | 0x00 0x1F 0x7F 0xFF | b"\x00\x1f\x7f\xff"
|
| 7 |
+
|
| 8 |
+
HTML tags (should render as text, not tags):
|
| 9 |
+
<think></think> <think>inner</think> <script>alert('x')</script> <style>body{color:red}</style>
|
| 10 |
+
<div class="x" data-x="1 & 2">Hello</div> <span>Span</span> <a href="https://example.com?q=1&x=<tag>">link</a>
|
| 11 |
+
<img src=x onerror=alert(1)> <br> <hr> <p>para</p> <table><tr><td>cell</td></tr></table>
|
| 12 |
+
Nested-ish: </span><span data-x="</span>">confuse</span>
|
| 13 |
+
|
| 14 |
+
HTML entities:
|
| 15 |
+
<think> </think> & " ' < > &
|
| 16 |
+
|
| 17 |
+
Markdown-ish:
|
| 18 |
+
# H1
|
| 19 |
+
## H2
|
| 20 |
+
- list item 1
|
| 21 |
+
- list item 2
|
| 22 |
+
> blockquote
|
| 23 |
+
--- (three hyphens)
|
| 24 |
+
|
| 25 |
+
Languages:
|
| 26 |
+
中文 简体/繁體 日本語 かな カタカナ 한국어 العربية עברית हिन्दी ไทย Русский Ελληνικά Español Français Português Türkçe Việt
|
| 27 |
+
RTL mix: العربية ABC עברית 123 (mixed direction)
|
tests/visual_smoke.py
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Placeholder for visual smoke checks.
|
| 3 |
+
|
| 4 |
+
This script is intentionally minimal; hook Playwright or similar later.
|
| 5 |
+
"""
|
| 6 |
+
|
| 7 |
+
import argparse
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def main() -> None:
    """Placeholder visual check: verify the HTML file exists and print manual steps.

    Raises:
        SystemExit: when the ``--html`` path does not exist.
    """
    cli = argparse.ArgumentParser(description="Visual smoke check placeholder.")
    cli.add_argument("--html", type=Path, required=True, help="HTML file to inspect")
    opts = cli.parse_args()

    target = opts.html
    if not target.exists():
        raise SystemExit(f"Missing HTML file: {target}")

    instructions = (
        "Visual smoke placeholder:",
        f"- Open in browser and visually verify: {target}",
        "- TODO: integrate Playwright for screenshot diffs.",
    )
    for message in instructions:
        print(message)
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
if __name__ == "__main__":
|
| 25 |
+
main()
|
visualization/assets/main.css
ADDED
|
@@ -0,0 +1,250 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
body {
|
| 2 |
+
font-family: Consolas, 'Courier New', monospace;
|
| 3 |
+
margin: 0;
|
| 4 |
+
padding: 0;
|
| 5 |
+
background-color: #f5f5f5;
|
| 6 |
+
}
|
| 7 |
+
.header {
|
| 8 |
+
background-color: #333;
|
| 9 |
+
color: white;
|
| 10 |
+
padding: 20px;
|
| 11 |
+
position: sticky;
|
| 12 |
+
top: 0;
|
| 13 |
+
z-index: 100;
|
| 14 |
+
}
|
| 15 |
+
.header h1 {
|
| 16 |
+
margin: 0 0 15px 0;
|
| 17 |
+
font-size: 18px;
|
| 18 |
+
}
|
| 19 |
+
.meta {
|
| 20 |
+
display: flex;
|
| 21 |
+
flex-wrap: wrap;
|
| 22 |
+
gap: 20px;
|
| 23 |
+
font-size: 12px;
|
| 24 |
+
color: #c8c8c8;
|
| 25 |
+
}
|
| 26 |
+
.legend {
|
| 27 |
+
display: flex;
|
| 28 |
+
gap: 15px;
|
| 29 |
+
margin-top: 10px;
|
| 30 |
+
}
|
| 31 |
+
.legend-item {
|
| 32 |
+
display: flex;
|
| 33 |
+
align-items: center;
|
| 34 |
+
gap: 5px;
|
| 35 |
+
}
|
| 36 |
+
.legend-box {
|
| 37 |
+
width: 20px;
|
| 38 |
+
height: 12px;
|
| 39 |
+
border: 1px solid #666;
|
| 40 |
+
}
|
| 41 |
+
.content {
|
| 42 |
+
background-color: white;
|
| 43 |
+
margin: 10px;
|
| 44 |
+
padding: 15px;
|
| 45 |
+
border: 1px solid #ccc;
|
| 46 |
+
font-size: 14px;
|
| 47 |
+
line-height: 1.8;
|
| 48 |
+
white-space: pre-wrap;
|
| 49 |
+
overflow-wrap: anywhere;
|
| 50 |
+
word-wrap: break-word;
|
| 51 |
+
position: relative;
|
| 52 |
+
}
|
| 53 |
+
.content span {
|
| 54 |
+
padding: 1px 0;
|
| 55 |
+
}
|
| 56 |
+
.word {
|
| 57 |
+
cursor: pointer;
|
| 58 |
+
position: relative;
|
| 59 |
+
}
|
| 60 |
+
.word:hover {
|
| 61 |
+
outline: 2px solid #007bff;
|
| 62 |
+
outline-offset: 1px;
|
| 63 |
+
}
|
| 64 |
+
.word.highlighted {
|
| 65 |
+
outline: 2px solid #ff6b6b;
|
| 66 |
+
outline-offset: 1px;
|
| 67 |
+
}
|
| 68 |
+
#svg-overlay {
|
| 69 |
+
position: fixed;
|
| 70 |
+
top: 0;
|
| 71 |
+
left: 0;
|
| 72 |
+
width: 100%;
|
| 73 |
+
height: 100%;
|
| 74 |
+
pointer-events: none;
|
| 75 |
+
z-index: 1000;
|
| 76 |
+
}
|
| 77 |
+
.link-line {
|
| 78 |
+
stroke: #007bff;
|
| 79 |
+
stroke-width: 2;
|
| 80 |
+
fill: none;
|
| 81 |
+
opacity: 0.7;
|
| 82 |
+
}
|
| 83 |
+
.link-dot {
|
| 84 |
+
fill: #007bff;
|
| 85 |
+
opacity: 0.8;
|
| 86 |
+
}
|
| 87 |
+
.token {
|
| 88 |
+
position: relative;
|
| 89 |
+
cursor: help;
|
| 90 |
+
}
|
| 91 |
+
.token:hover {
|
| 92 |
+
outline: 1px dashed #666;
|
| 93 |
+
}
|
| 94 |
+
.token-kind-control {
|
| 95 |
+
color: #f59e0b;
|
| 96 |
+
}
|
| 97 |
+
.token-kind-raw {
|
| 98 |
+
color: #fb7185;
|
| 99 |
+
}
|
| 100 |
+
#tooltip {
|
| 101 |
+
position: fixed;
|
| 102 |
+
background-color: rgba(0, 0, 0, 0.9);
|
| 103 |
+
color: white;
|
| 104 |
+
padding: 10px 14px;
|
| 105 |
+
border-radius: 6px;
|
| 106 |
+
font-size: 12px;
|
| 107 |
+
max-width: none;
|
| 108 |
+
width: max-content;
|
| 109 |
+
z-index: 2000;
|
| 110 |
+
pointer-events: none;
|
| 111 |
+
display: none;
|
| 112 |
+
line-height: 1.6;
|
| 113 |
+
box-shadow: 0 2px 10px rgba(0,0,0,0.3);
|
| 114 |
+
}
|
| 115 |
+
#tooltip .label {
|
| 116 |
+
color: #aaa;
|
| 117 |
+
font-weight: bold;
|
| 118 |
+
}
|
| 119 |
+
#tooltip .bytes {
|
| 120 |
+
color: #a5f3fc;
|
| 121 |
+
font-family: monospace;
|
| 122 |
+
}
|
| 123 |
+
#tooltip .loss-a {
|
| 124 |
+
color: #86efac;
|
| 125 |
+
font-family: monospace;
|
| 126 |
+
}
|
| 127 |
+
#tooltip .loss-b {
|
| 128 |
+
color: #fca5a5;
|
| 129 |
+
font-family: monospace;
|
| 130 |
+
}
|
| 131 |
+
#tooltip .model-a {
|
| 132 |
+
color: #fcd34d;
|
| 133 |
+
}
|
| 134 |
+
#tooltip .model-b {
|
| 135 |
+
color: #7dd3fc;
|
| 136 |
+
}
|
| 137 |
+
#tooltip .topk-section {
|
| 138 |
+
margin-top: 8px;
|
| 139 |
+
padding-top: 8px;
|
| 140 |
+
border-top: 1px solid #555;
|
| 141 |
+
}
|
| 142 |
+
#tooltip .topk-container {
|
| 143 |
+
display: flex;
|
| 144 |
+
gap: 16px;
|
| 145 |
+
}
|
| 146 |
+
#tooltip .topk-column {
|
| 147 |
+
flex: 1;
|
| 148 |
+
min-width: 180px;
|
| 149 |
+
}
|
| 150 |
+
#tooltip .topk-title {
|
| 151 |
+
color: #aaa;
|
| 152 |
+
font-weight: bold;
|
| 153 |
+
margin-bottom: 4px;
|
| 154 |
+
font-size: 11px;
|
| 155 |
+
}
|
| 156 |
+
#tooltip .topk-title.model-a {
|
| 157 |
+
color: #86efac;
|
| 158 |
+
}
|
| 159 |
+
#tooltip .topk-title.model-b {
|
| 160 |
+
color: #fca5a5;
|
| 161 |
+
}
|
| 162 |
+
#tooltip .topk-list {
|
| 163 |
+
font-size: 11px;
|
| 164 |
+
}
|
| 165 |
+
#tooltip .topk-item {
|
| 166 |
+
display: flex;
|
| 167 |
+
gap: 4px;
|
| 168 |
+
padding: 1px 0;
|
| 169 |
+
align-items: flex-start;
|
| 170 |
+
}
|
| 171 |
+
#tooltip .token-block {
|
| 172 |
+
margin-top: 6px;
|
| 173 |
+
display: flex;
|
| 174 |
+
align-items: center;
|
| 175 |
+
gap: 6px;
|
| 176 |
+
white-space: nowrap;
|
| 177 |
+
flex-wrap: nowrap;
|
| 178 |
+
overflow-x: visible;
|
| 179 |
+
}
|
| 180 |
+
#tooltip .token-chips {
|
| 181 |
+
display: flex;
|
| 182 |
+
flex-wrap: nowrap;
|
| 183 |
+
gap: 4px;
|
| 184 |
+
align-items: center;
|
| 185 |
+
flex: 0 0 auto;
|
| 186 |
+
}
|
| 187 |
+
#tooltip .token-chip-group {
|
| 188 |
+
display: inline-flex;
|
| 189 |
+
align-items: center;
|
| 190 |
+
gap: 4px;
|
| 191 |
+
flex: 0 0 auto;
|
| 192 |
+
white-space: nowrap;
|
| 193 |
+
}
|
| 194 |
+
#tooltip .token-prob {
|
| 195 |
+
color: #86efac;
|
| 196 |
+
font-family: monospace;
|
| 197 |
+
font-size: 11px;
|
| 198 |
+
white-space: nowrap;
|
| 199 |
+
}
|
| 200 |
+
#tooltip .token-id {
|
| 201 |
+
color: #888;
|
| 202 |
+
font-family: monospace;
|
| 203 |
+
white-space: nowrap;
|
| 204 |
+
}
|
| 205 |
+
#tooltip .token-chip {
|
| 206 |
+
max-width: 100%;
|
| 207 |
+
}
|
| 208 |
+
#tooltip .token-chip-group .topk-token {
|
| 209 |
+
white-space: pre;
|
| 210 |
+
overflow-wrap: normal;
|
| 211 |
+
word-break: normal;
|
| 212 |
+
}
|
| 213 |
+
#tooltip .topk-rank {
|
| 214 |
+
color: #888;
|
| 215 |
+
min-width: 18px;
|
| 216 |
+
}
|
| 217 |
+
#tooltip .topk-rank.hit {
|
| 218 |
+
color: #ffd700;
|
| 219 |
+
}
|
| 220 |
+
#tooltip .topk-token {
|
| 221 |
+
color: #a5f3fc;
|
| 222 |
+
white-space: pre-wrap;
|
| 223 |
+
overflow-wrap: anywhere;
|
| 224 |
+
word-break: break-word;
|
| 225 |
+
font-family: monospace;
|
| 226 |
+
background-color: rgba(255, 255, 255, 0.08);
|
| 227 |
+
padding: 0 4px;
|
| 228 |
+
border-radius: 3px;
|
| 229 |
+
display: inline-block;
|
| 230 |
+
max-width: 100%;
|
| 231 |
+
}
|
| 232 |
+
#tooltip .esc-control {
|
| 233 |
+
color: #fbbf24;
|
| 234 |
+
}
|
| 235 |
+
#tooltip .esc-raw {
|
| 236 |
+
color: #fb7185;
|
| 237 |
+
}
|
| 238 |
+
#tooltip .topk-prob {
|
| 239 |
+
color: #86efac;
|
| 240 |
+
min-width: 45px;
|
| 241 |
+
text-align: right;
|
| 242 |
+
}
|
| 243 |
+
#tooltip .topk-hit {
|
| 244 |
+
color: #22c55e;
|
| 245 |
+
}
|
| 246 |
+
#tooltip .topk-miss {
|
| 247 |
+
color: #ef4444;
|
| 248 |
+
font-style: italic;
|
| 249 |
+
}
|
| 250 |
+
|
visualization/assets/main.js
ADDED
|
@@ -0,0 +1,615 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
// DOM anchors shared by the rest of the script.
const svgOverlay = document.getElementById('svg-overlay');
const content = document.querySelector('.content');
const renderModelEl = document.getElementById('render-model');

// The render model is parsed from the text of the #render-model element.
// A missing element is treated as an empty object; any parse failure is
// logged and both the model and its token list fall back to empty values.
let renderModel = {};
let renderTokens = [];
try {
    const serialized = renderModelEl ? renderModelEl.textContent : '{}';
    const parsed = JSON.parse(serialized);
    renderModel = parsed || {};
    if (Array.isArray(renderModel.tokens)) {
        renderTokens = renderModel.tokens;
    } else {
        renderTokens = [];
    }
} catch (e) {
    console.warn('Failed to parse render model JSON:', e);
    renderModel = {};
    renderTokens = [];
}
|
| 14 |
+
|
| 15 |
+
/**
 * Return `text` with backslashes doubled and ASCII control characters made
 * visible.
 *
 * Newline, carriage return and tab become `\n`, `\r` and `\t`; every other
 * code unit below 32, and DEL (127), becomes a two-digit `\xNN` escape.
 * Falsy input (empty string, null, undefined) is returned unchanged.
 *
 * @param {string} text - Text to escape.
 * @returns {string} The escaped text, or the original falsy value.
 */
function escapeControlChars(text) {
    if (!text) return text;

    const namedEscapes = new Map([
        ['\\', '\\\\'],
        ['\n', '\\n'],
        ['\r', '\\r'],
        ['\t', '\\t'],
    ]);

    const pieces = [];
    for (let pos = 0; pos < text.length; pos += 1) {
        const unit = text[pos];
        const code = text.charCodeAt(pos);
        const named = namedEscapes.get(unit);
        if (named !== undefined) {
            pieces.push(named);
        } else if (code < 32 || code === 127) {
            pieces.push('\\x' + code.toString(16).padStart(2, '0'));
        } else {
            pieces.push(unit);
        }
    }
    return pieces.join('');
}
|
| 37 |
+
|
| 38 |
+
// Rebuild the visible token spans inside `.content` from the render model.
if (content) {
    content.replaceChildren();

    // Tally how often each word key occurs. The tally uses a null-prototype
    // object because the keys come from the analysed text: on a plain `{}`,
    // keys such as "constructor" or "toString" resolve to inherited
    // Object.prototype members and corrupt the count.
    const wordCounts = Object.create(null);
    renderTokens.forEach((token) => {
        if (token && token.is_word && token.word_key) {
            wordCounts[token.word_key] = (wordCounts[token.word_key] || 0) + 1;
        }
    });

    // Matches any code unit that is neither an ASCII control character
    // (below 32) nor DEL (127).
    const printableChar = /[^\x00-\x1f\x7f]/;

    renderTokens.forEach((token, idx) => {
        const span = document.createElement('span');
        span.className = 'token';
        span.dataset.tokenIdx = String(idx);
        span.dataset.tunedDelta = (token && typeof token.tuned_delta === 'number')
            ? String(token.tuned_delta)
            : '0';

        const display = (token && token.display) ? token.display : null;
        const kind = (display && display.kind) ? display.kind : 'normal';
        const text = (display && typeof display.text === 'string') ? display.text : '';

        // A "control" token that still contains printable characters is
        // rendered without the control styling.
        const mainKind = (kind === 'control' && printableChar.test(text)) ? 'normal' : kind;

        // Tokens containing a line break are flagged so the colouring pass
        // can keep them white.
        if (text.includes('\n') || text.includes('\r')) {
            span.dataset.hasLinebreak = '1';
        }

        if (mainKind === 'control') {
            span.classList.add('token-kind-control');
        } else if (mainKind === 'raw') {
            span.classList.add('token-kind-raw');
        }
        span.textContent = text;

        // Only words that occur more than once take part in link drawing.
        if (token && token.is_word && token.word_key && wordCounts[token.word_key] > 1) {
            span.classList.add('word');
            span.dataset.word = token.word_key;
            if (token.word_id !== undefined && token.word_id !== null) {
                span.dataset.wordId = String(token.word_id);
            }
        }

        content.appendChild(span);
    });
}

// Group every `.word` element by its data-word key. The group table is also a
// null-prototype object so that a key like "constructor" maps to a real array
// instead of an inherited member (on which `.push` would throw).
const words = document.querySelectorAll('.word');
const wordGroups = Object.create(null);
words.forEach((word) => {
    const wordText = word.getAttribute('data-word');
    if (!wordGroups[wordText]) {
        wordGroups[wordText] = [];
    }
    wordGroups[wordText].push(word);
});
|
| 101 |
+
|
| 102 |
+
/**
 * Remove every element drawn into the SVG overlay and drop the `highlighted`
 * class from all linked words.
 */
function clearLines() {
    svgOverlay.replaceChildren();
    for (const linked of words) {
        linked.classList.remove('highlighted');
    }
}
|
| 108 |
+
|
| 109 |
+
/**
 * Pick the rectangle whose vertical centre is closest to `targetY`.
 *
 * Ties go to the earliest rectangle. When no rectangle yields a finite
 * comparison, the first one is returned.
 *
 * @param {Array<{top: number, height: number}>} rects - Candidate rectangles.
 * @param {number} targetY - Vertical coordinate to match.
 * @returns {?{top: number, height: number}} The closest rectangle, or null
 *     when `rects` is missing or empty.
 */
function pickRectByY(rects, targetY) {
    if (!rects || rects.length === 0) return null;

    let closest = rects[0];
    let smallestGap = Infinity;
    for (const rect of rects) {
        const centreY = rect.top + rect.height / 2;
        const gap = Math.abs(centreY - targetY);
        if (gap < smallestGap) {
            smallestGap = gap;
            closest = rect;
        }
    }
    return closest;
}
|
| 123 |
+
|
| 124 |
+
/**
 * Choose the client rectangle of `element` to use as a link anchor.
 *
 * An element with no client rects falls back to its bounding rectangle; an
 * element with exactly one uses it; otherwise the rect whose vertical centre
 * is nearest `targetY` is chosen via pickRectByY.
 *
 * @param {Element} element - Element to anchor on.
 * @param {number} targetY - Preferred vertical coordinate.
 * @returns {DOMRect} The selected rectangle.
 */
function getAnchorRect(element, targetY) {
    const fragments = Array.from(element.getClientRects());
    switch (fragments.length) {
        case 0:
            return element.getBoundingClientRect();
        case 1:
            return fragments[0];
        default:
            return pickRectByY(fragments, targetY) || fragments[0];
    }
}
|
| 131 |
+
|
| 132 |
+
/**
 * Draw curved links from every earlier occurrence of the hovered word to the
 * hovered occurrence, and highlight all occurrences of that word.
 *
 * "Earlier" means a smaller numeric data-word-id than the hovered element.
 * Nothing is drawn or highlighted when there is no earlier occurrence.
 * Coordinates come straight from client rects, so the overlay is presumably
 * positioned in viewport coordinates — TODO confirm against the page CSS.
 *
 * @param {Element} hoveredWord - The `.word` element under the pointer.
 * @param {MouseEvent} [evt] - Triggering event; its clientY selects which
 *     line fragment of a wrapped word is used as the anchor.
 */
function drawLines(hoveredWord, evt) {
    clearLines();

    const SVG_NS = 'http://www.w3.org/2000/svg';
    const wordText = hoveredWord.getAttribute('data-word');
    // Explicit radix: ids are written as decimal strings.
    const wordId = parseInt(hoveredWord.getAttribute('data-word-id'), 10);
    const sameWords = wordGroups[wordText] || [];

    const previousWords = sameWords.filter(
        (w) => parseInt(w.getAttribute('data-word-id'), 10) < wordId
    );
    if (previousWords.length === 0) return;

    sameWords.forEach((w) => w.classList.add('highlighted'));

    // Without an event, aim at the vertical centre of the word's bounding box
    // (measured once rather than once per coordinate).
    let targetY;
    if (evt) {
        targetY = evt.clientY;
    } else {
        const box = hoveredWord.getBoundingClientRect();
        targetY = box.top + box.height / 2;
    }

    const hoveredRect = getAnchorRect(hoveredWord, targetY);
    const hoveredX = hoveredRect.left + hoveredRect.width / 2;
    const hoveredY = hoveredRect.top + hoveredRect.height / 2;

    const addDot = (cx, cy) => {
        const dot = document.createElementNS(SVG_NS, 'circle');
        dot.setAttribute('class', 'link-dot');
        dot.setAttribute('cx', cx);
        dot.setAttribute('cy', cy);
        dot.setAttribute('r', 4);
        svgOverlay.appendChild(dot);
    };

    previousWords.forEach((prevWord) => {
        const prevRect = getAnchorRect(prevWord, hoveredY);
        const prevX = prevRect.left + prevRect.width / 2;
        const prevY = prevRect.top + prevRect.height / 2;

        // Control point sits 30px above the higher endpoint so the curve arcs upward.
        const midX = (hoveredX + prevX) / 2;
        const midY = Math.min(hoveredY, prevY) - 30;

        const path = document.createElementNS(SVG_NS, 'path');
        path.setAttribute('class', 'link-line');
        path.setAttribute('d', `M ${prevX} ${prevY} Q ${midX} ${midY} ${hoveredX} ${hoveredY}`);
        svgOverlay.appendChild(path);

        addDot(prevX, prevY);
    });

    // All links share the hovered endpoint, so its dot is drawn once instead
    // of being stacked once per earlier occurrence.
    addDot(hoveredX, hoveredY);
}
|
| 181 |
+
|
| 182 |
+
// Hovering a linked word draws its links; leaving it clears them.
for (const word of words) {
    word.addEventListener('mouseenter', (event) => drawLines(word, event));
    word.addEventListener('mouseleave', clearLines);
}

// Drawn coordinates are taken at hover time, so any scroll discards the links.
window.addEventListener('scroll', clearLines);
|
| 188 |
+
|
| 189 |
+
const tooltip = document.getElementById('tooltip');
|
| 190 |
+
const tokenSpans = document.querySelectorAll('.token');
|
| 191 |
+
|
| 192 |
+
tokenSpans.forEach(token => {
|
| 193 |
+
token.addEventListener('mouseenter', (e) => {
|
| 194 |
+
const tokenIdx = parseInt(token.dataset.tokenIdx);
|
| 195 |
+
const tokenInfo = (!isNaN(tokenIdx) && renderTokens[tokenIdx]) ? renderTokens[tokenIdx] : null;
|
| 196 |
+
const bytes = (tokenInfo && tokenInfo.bytes_hex) ? tokenInfo.bytes_hex : '';
|
| 197 |
+
const compressionA = (tokenInfo && tokenInfo.compression && tokenInfo.compression.rwkv) ? tokenInfo.compression.rwkv : '';
|
| 198 |
+
const compressionB = (tokenInfo && tokenInfo.compression && tokenInfo.compression.qwen) ? tokenInfo.compression.qwen : '';
|
| 199 |
+
const avgCompressionA = (tokenInfo && tokenInfo.loss && typeof tokenInfo.loss.rwkv === 'number') ? tokenInfo.loss.rwkv.toFixed(2) : '';
|
| 200 |
+
const avgCompressionB = (tokenInfo && tokenInfo.loss && typeof tokenInfo.loss.qwen === 'number') ? tokenInfo.loss.qwen.toFixed(2) : '';
|
| 201 |
+
const modelA = (tokenInfo && tokenInfo.model_tokens && tokenInfo.model_tokens.rwkv) ? tokenInfo.model_tokens.rwkv : null;
|
| 202 |
+
const modelB = (tokenInfo && tokenInfo.model_tokens && tokenInfo.model_tokens.qwen) ? tokenInfo.model_tokens.qwen : null;
|
| 203 |
+
const top5A = (tokenInfo && tokenInfo.topk && tokenInfo.topk.rwkv) ? tokenInfo.topk.rwkv : null;
|
| 204 |
+
const top5B = (tokenInfo && tokenInfo.topk && tokenInfo.topk.qwen) ? tokenInfo.topk.qwen : null;
|
| 205 |
+
|
| 206 |
+
/**
 * Report whether `text` contains an ASCII control character: any code unit
 * below 32, or DEL (127). Falsy input yields false.
 *
 * @param {string} text - Text to inspect.
 * @returns {boolean} True when at least one control character is present.
 */
function hasControlChars(text) {
    if (!text) return false;
    let pos = 0;
    while (pos < text.length) {
        const code = text.charCodeAt(pos);
        if (code === 127 || code < 32) {
            return true;
        }
        pos += 1;
    }
    return false;
}
|
| 216 |
+
|
| 217 |
+
/**
 * Decide how a token's text should be rendered.
 *
 * An explicit hint of 'raw', 'control' or 'normal' is returned as-is, and the
 * boolean hint `true` means 'raw'. Any other hint is ignored and the kind is
 * inferred from the text: 'control' when it contains control characters,
 * otherwise 'normal'.
 *
 * @param {string} text - Token text used for inference.
 * @param {(string|boolean)} kindHint - Kind hint supplied with the token.
 * @returns {string} One of 'raw', 'control' or 'normal'.
 */
function resolveKind(text, kindHint) {
    switch (kindHint) {
        case 'raw':
        case 'control':
        case 'normal':
            return kindHint;
        case true:
            return 'raw';
        default:
            return hasControlChars(text) ? 'control' : 'normal';
    }
}
|
| 229 |
+
|
| 230 |
+
/**
 * Append `text` to `container`, rendering each control character as a
 * visible escape inside its own `esc-control` span.
 *
 * Backslashes are doubled and stay in the surrounding plain text. Newline,
 * carriage return and tab become `\n`, `\r`, `\t`; other code units below 32
 * and DEL (127) become `\xNN`. Null or undefined text appends nothing.
 *
 * @param {Node} container - Node that receives the text nodes and spans.
 * @param {string} text - Text to render.
 */
function appendEscapedWithControlColor(container, text) {
    if (text === undefined || text === null) return;

    const shortEscapes = new Map([
        ['\n', '\\n'],
        ['\r', '\\r'],
        ['\t', '\\t'],
    ]);

    // Plain characters accumulate here and are emitted as one text node
    // just before each escape span and once more at the end.
    let pending = '';
    const emitPending = () => {
        if (!pending) return;
        container.appendChild(document.createTextNode(pending));
        pending = '';
    };

    for (let pos = 0; pos < text.length; pos += 1) {
        const ch = text[pos];
        const code = text.charCodeAt(pos);

        if (ch === '\\') {
            pending += '\\\\';
        } else if (shortEscapes.has(ch) || code < 32 || code === 127) {
            emitPending();
            const marker = document.createElement('span');
            marker.className = 'esc-control';
            marker.textContent = shortEscapes.has(ch)
                ? shortEscapes.get(ch)
                : '\\x' + code.toString(16).padStart(2, '0');
            container.appendChild(marker);
        } else {
            pending += ch;
        }
    }
    emitPending();
}
|
| 266 |
+
|
| 267 |
+
/**
 * Replace the contents of `container` with a rendering of a token's text.
 *
 * The kind comes from resolveKind: 'raw' text goes into a single `esc-raw`
 * span, 'control' text is rendered with highlighted escapes, and anything
 * else is set as plain text. Null or undefined text is treated as ''.
 *
 * @param {Element} container - Element whose children are replaced.
 * @param {string} text - Token text to show.
 * @param {(string|boolean)} kindHint - Kind hint forwarded to resolveKind.
 */
function appendTokenText(container, text, kindHint) {
    const display = text ?? '';
    const kind = resolveKind(display, kindHint);

    container.replaceChildren();

    switch (kind) {
        case 'raw': {
            const rawSpan = document.createElement('span');
            rawSpan.className = 'esc-raw';
            rawSpan.textContent = display;
            container.appendChild(rawSpan);
            break;
        }
        case 'control':
            appendEscapedWithControlColor(container, display);
            break;
        default:
            container.textContent = display;
    }
}
|
| 286 |
+
|
| 287 |
+
function formatTopkColumn(topkData, modelName, titleClass) {
|
| 288 |
+
const column = document.createElement('div');
|
| 289 |
+
column.className = 'topk-column';
|
| 290 |
+
const title = document.createElement('div');
|
| 291 |
+
title.className = 'topk-title ' + titleClass;
|
| 292 |
+
title.textContent = modelName;
|
| 293 |
+
column.appendChild(title);
|
| 294 |
+
const list = document.createElement('div');
|
| 295 |
+
list.className = 'topk-list';
|
| 296 |
+
column.appendChild(list);
|
| 297 |
+
|
| 298 |
+
if (!topkData) {
|
| 299 |
+
list.textContent = 'N/A';
|
| 300 |
+
return column;
|
| 301 |
+
}
|
| 302 |
+
try {
|
| 303 |
+
const data = topkData;
|
| 304 |
+
let actualId = null;
|
| 305 |
+
let rank = null;
|
| 306 |
+
let actualProb = null;
|
| 307 |
+
let topkList = [];
|
| 308 |
+
if (data.length >= 4) {
|
| 309 |
+
[actualId, rank, actualProb, topkList] = data;
|
| 310 |
+
} else {
|
| 311 |
+
[actualId, rank, topkList] = data;
|
| 312 |
+
}
|
| 313 |
+
topkList.forEach((item, idx) => {
|
| 314 |
+
const tokenId = item[0];
|
| 315 |
+
const prob = item[1];
|
| 316 |
+
const tokenText = item[2];
|
| 317 |
+
const isRaw = item.length > 3 ? item[3] : false;
|
| 318 |
+
const isHit = tokenId === actualId;
|
| 319 |
+
const rankClass = isHit ? 'topk-rank hit' : 'topk-rank';
|
| 320 |
+
const rawText = (tokenText !== undefined && tokenText !== null) ? tokenText : '';
|
| 321 |
+
const displayText = (rawText !== '') ? rawText : ('[' + tokenId + ']');
|
| 322 |
+
|
| 323 |
+
const row = document.createElement('div');
|
| 324 |
+
row.className = 'topk-item';
|
| 325 |
+
|
| 326 |
+
const rankSpan = document.createElement('span');
|
| 327 |
+
rankSpan.className = rankClass;
|
| 328 |
+
rankSpan.textContent = (idx + 1) + '.';
|
| 329 |
+
row.appendChild(rankSpan);
|
| 330 |
+
|
| 331 |
+
const tokenSpan = document.createElement('span');
|
| 332 |
+
tokenSpan.className = 'topk-token';
|
| 333 |
+
tokenSpan.title = 'ID: ' + tokenId;
|
| 334 |
+
appendTokenText(tokenSpan, displayText, isRaw);
|
| 335 |
+
row.appendChild(tokenSpan);
|
| 336 |
+
|
| 337 |
+
const probSpan = document.createElement('span');
|
| 338 |
+
probSpan.className = 'topk-prob';
|
| 339 |
+
probSpan.textContent = (prob * 100).toFixed(2) + '%';
|
| 340 |
+
row.appendChild(probSpan);
|
| 341 |
+
|
| 342 |
+
if (isHit) {
|
| 343 |
+
const hit = document.createElement('span');
|
| 344 |
+
hit.className = 'topk-hit';
|
| 345 |
+
hit.textContent = '✓';
|
| 346 |
+
row.appendChild(hit);
|
| 347 |
+
}
|
| 348 |
+
|
| 349 |
+
list.appendChild(row);
|
| 350 |
+
});
|
| 351 |
+
|
| 352 |
+
if (rank > 10) {
|
| 353 |
+
let probSuffix = '';
|
| 354 |
+
const probVal = parseFloat(actualProb);
|
| 355 |
+
if (!isNaN(probVal)) {
|
| 356 |
+
probSuffix = ' (' + (probVal * 100).toFixed(4) + '%)';
|
| 357 |
+
}
|
| 358 |
+
const miss = document.createElement('div');
|
| 359 |
+
miss.className = 'topk-item topk-miss';
|
| 360 |
+
miss.textContent = 'Actual rank: ' + rank + probSuffix;
|
| 361 |
+
list.appendChild(miss);
|
| 362 |
+
}
|
| 363 |
+
return column;
|
| 364 |
+
} catch (e) {
|
| 365 |
+
console.error('Error in formatTopkColumn for ' + modelName + ':', e);
|
| 366 |
+
console.error('topkData:', topkData);
|
| 367 |
+
list.textContent = 'Error: ' + e.message;
|
| 368 |
+
return column;
|
| 369 |
+
}
|
| 370 |
+
}
|
| 371 |
+
|
| 372 |
+
function formatTokenChips(modelData, label, labelClass) {
|
| 373 |
+
const block = document.createElement('div');
|
| 374 |
+
block.className = 'token-block';
|
| 375 |
+
const labelSpan = document.createElement('span');
|
| 376 |
+
labelSpan.className = 'label ' + labelClass;
|
| 377 |
+
labelSpan.textContent = label + ':';
|
| 378 |
+
block.appendChild(labelSpan);
|
| 379 |
+
|
| 380 |
+
const chips = document.createElement('div');
|
| 381 |
+
chips.className = 'token-chips';
|
| 382 |
+
block.appendChild(chips);
|
| 383 |
+
|
| 384 |
+
if (!modelData) {
|
| 385 |
+
const na = document.createElement('span');
|
| 386 |
+
na.className = 'topk-token token-chip';
|
| 387 |
+
na.textContent = 'N/A';
|
| 388 |
+
chips.appendChild(na);
|
| 389 |
+
return block;
|
| 390 |
+
}
|
| 391 |
+
try {
|
| 392 |
+
const tokenList = modelData;
|
| 393 |
+
tokenList.forEach((item) => {
|
| 394 |
+
const tokenId = item[0];
|
| 395 |
+
const tokenText = item[1];
|
| 396 |
+
const kindHint = item.length > 2 ? item[2] : false;
|
| 397 |
+
const probVal = item.length > 3 ? item[3] : null;
|
| 398 |
+
const displayText = (tokenText !== undefined && tokenText !== null) ? tokenText : '';
|
| 399 |
+
|
| 400 |
+
const group = document.createElement('span');
|
| 401 |
+
group.className = 'token-chip-group';
|
| 402 |
+
group.title = 'ID: ' + tokenId;
|
| 403 |
+
|
| 404 |
+
const idSpan = document.createElement('span');
|
| 405 |
+
idSpan.className = 'token-id';
|
| 406 |
+
idSpan.textContent = '[' + tokenId + ']';
|
| 407 |
+
group.appendChild(idSpan);
|
| 408 |
+
|
| 409 |
+
const chipSpan = document.createElement('span');
|
| 410 |
+
chipSpan.className = 'topk-token token-chip';
|
| 411 |
+
appendTokenText(chipSpan, displayText, kindHint);
|
| 412 |
+
group.appendChild(chipSpan);
|
| 413 |
+
|
| 414 |
+
if (probVal !== null && probVal !== undefined) {
|
| 415 |
+
const probSpan = document.createElement('span');
|
| 416 |
+
probSpan.className = 'token-prob';
|
| 417 |
+
const numericProb = typeof probVal === 'number' ? probVal : parseFloat(probVal);
|
| 418 |
+
if (!isNaN(numericProb)) {
|
| 419 |
+
probSpan.textContent = (numericProb * 100).toFixed(2) + '%';
|
| 420 |
+
} else {
|
| 421 |
+
probSpan.textContent = String(probVal);
|
| 422 |
+
}
|
| 423 |
+
group.appendChild(probSpan);
|
| 424 |
+
}
|
| 425 |
+
|
| 426 |
+
chips.appendChild(group);
|
| 427 |
+
});
|
| 428 |
+
return block;
|
| 429 |
+
} catch (e) {
|
| 430 |
+
console.error('Error in formatTokenChips for ' + label + ':', e);
|
| 431 |
+
console.error('modelData:', modelData);
|
| 432 |
+
const err = document.createElement('span');
|
| 433 |
+
err.className = 'topk-token token-chip';
|
| 434 |
+
err.textContent = 'Error: ' + e.message;
|
| 435 |
+
chips.appendChild(err);
|
| 436 |
+
return block;
|
| 437 |
+
}
|
| 438 |
+
}
|
| 439 |
+
|
| 440 |
+
tooltip.replaceChildren();
|
| 441 |
+
|
| 442 |
+
const bytesRow = document.createElement('div');
|
| 443 |
+
const bytesLabel = document.createElement('span');
|
| 444 |
+
bytesLabel.className = 'label';
|
| 445 |
+
bytesLabel.textContent = 'Bytes:';
|
| 446 |
+
const bytesValue = document.createElement('span');
|
| 447 |
+
bytesValue.className = 'bytes';
|
| 448 |
+
bytesValue.textContent = bytes || '(empty)';
|
| 449 |
+
bytesRow.appendChild(bytesLabel);
|
| 450 |
+
bytesRow.appendChild(document.createTextNode(' '));
|
| 451 |
+
bytesRow.appendChild(bytesValue);
|
| 452 |
+
tooltip.appendChild(bytesRow);
|
| 453 |
+
|
| 454 |
+
const rwkvRow = document.createElement('div');
|
| 455 |
+
const rwkvLabel = document.createElement('span');
|
| 456 |
+
rwkvLabel.className = 'label';
|
| 457 |
+
rwkvLabel.textContent = 'RWKV Compression Rate:';
|
| 458 |
+
const rwkvValue = document.createElement('span');
|
| 459 |
+
rwkvValue.className = 'loss-a';
|
| 460 |
+
rwkvValue.textContent = compressionA || '(empty)';
|
| 461 |
+
if (avgCompressionA) {
|
| 462 |
+
rwkvValue.textContent += ' (avg: ' + avgCompressionA + '%)';
|
| 463 |
+
}
|
| 464 |
+
rwkvRow.appendChild(rwkvLabel);
|
| 465 |
+
rwkvRow.appendChild(document.createTextNode(' '));
|
| 466 |
+
rwkvRow.appendChild(rwkvValue);
|
| 467 |
+
tooltip.appendChild(rwkvRow);
|
| 468 |
+
|
| 469 |
+
const qwenRow = document.createElement('div');
|
| 470 |
+
const qwenLabel = document.createElement('span');
|
| 471 |
+
qwenLabel.className = 'label';
|
| 472 |
+
qwenLabel.textContent = 'Qwen Compression Rate:';
|
| 473 |
+
const qwenValue = document.createElement('span');
|
| 474 |
+
qwenValue.className = 'loss-b';
|
| 475 |
+
qwenValue.textContent = compressionB || '(empty)';
|
| 476 |
+
if (avgCompressionB) {
|
| 477 |
+
qwenValue.textContent += ' (avg: ' + avgCompressionB + '%)';
|
| 478 |
+
}
|
| 479 |
+
qwenRow.appendChild(qwenLabel);
|
| 480 |
+
qwenRow.appendChild(document.createTextNode(' '));
|
| 481 |
+
qwenRow.appendChild(qwenValue);
|
| 482 |
+
tooltip.appendChild(qwenRow);
|
| 483 |
+
|
| 484 |
+
const hr = document.createElement('hr');
|
| 485 |
+
hr.style.borderColor = '#555';
|
| 486 |
+
hr.style.margin = '6px 0';
|
| 487 |
+
tooltip.appendChild(hr);
|
| 488 |
+
|
| 489 |
+
tooltip.appendChild(formatTokenChips(modelA, 'RWKV', 'model-a'));
|
| 490 |
+
tooltip.appendChild(formatTokenChips(modelB, 'Qwen', 'model-b'));
|
| 491 |
+
|
| 492 |
+
if (top5A || top5B) {
|
| 493 |
+
const topkSection = document.createElement('div');
|
| 494 |
+
topkSection.className = 'topk-section';
|
| 495 |
+
const topkContainer = document.createElement('div');
|
| 496 |
+
topkContainer.className = 'topk-container';
|
| 497 |
+
topkContainer.appendChild(formatTopkColumn(top5A, 'RWKV Top10', 'model-a'));
|
| 498 |
+
topkContainer.appendChild(formatTopkColumn(top5B, 'Qwen Top10', 'model-b'));
|
| 499 |
+
topkSection.appendChild(topkContainer);
|
| 500 |
+
tooltip.appendChild(topkSection);
|
| 501 |
+
}
|
| 502 |
+
|
| 503 |
+
tooltip.style.display = 'block';
|
| 504 |
+
});
|
| 505 |
+
|
| 506 |
+
token.addEventListener('mousemove', (e) => {
|
| 507 |
+
const tooltipRect = tooltip.getBoundingClientRect();
|
| 508 |
+
const viewportWidth = window.innerWidth;
|
| 509 |
+
const viewportHeight = window.innerHeight;
|
| 510 |
+
|
| 511 |
+
let x = e.clientX + 15;
|
| 512 |
+
let y = e.clientY + 15;
|
| 513 |
+
|
| 514 |
+
if (x + tooltipRect.width > viewportWidth - 10) {
|
| 515 |
+
x = e.clientX - tooltipRect.width - 15;
|
| 516 |
+
}
|
| 517 |
+
if (y + tooltipRect.height > viewportHeight - 10) {
|
| 518 |
+
y = e.clientY - tooltipRect.height - 15;
|
| 519 |
+
}
|
| 520 |
+
if (x < 10) x = 10;
|
| 521 |
+
if (y < 10) y = 10;
|
| 522 |
+
|
| 523 |
+
tooltip.style.left = x + 'px';
|
| 524 |
+
tooltip.style.top = y + 'px';
|
| 525 |
+
});
|
| 526 |
+
|
| 527 |
+
token.addEventListener('mouseleave', () => {
|
| 528 |
+
tooltip.style.display = 'none';
|
| 529 |
+
});
|
| 530 |
+
});
|
| 531 |
+
|
| 532 |
+
const slider = document.getElementById('color-range-slider');
const rangeValue = document.getElementById('color-range-value');

// Split the token spans into two sets: line-break tokens, which are never
// coloured, and tokens carrying a numeric tuned delta, which are ranked below.
const tokenData = [];
const linebreakTokens = [];
tokenSpans.forEach((token) => {
    if (token.dataset.hasLinebreak === '1') {
        linebreakTokens.push(token);
        return;
    }
    const tunedDelta = parseFloat(token.dataset.tunedDelta);
    if (!isNaN(tunedDelta)) {
        tokenData.push({ token, tunedDelta, absDelta: Math.abs(tunedDelta) });
    }
});

// Largest |tuned_delta| over all tokens, floored at 1e-9. Computed with a
// reduce rather than Math.max(...values): spreading one argument per token
// can exceed the engine's argument limit on very large documents and throw
// a RangeError, which would abort the rest of this script.
const maxAbsDelta = tokenData.reduce(
    (largest, entry) => Math.max(largest, entry.absDelta),
    1e-9
);

// Rank tokens by |tuned_delta|, largest first; rank 0 is the largest deviation.
// The rank is stored on the shared entry objects, so tokenData sees it too.
const sortedByAbs = [...tokenData].sort((a, b) => b.absDelta - a.absDelta);
sortedByAbs.forEach((item, rank) => {
    item.rank = rank;
});
|
| 557 |
+
|
| 558 |
+
/**
 * Map a tuned delta to a CSS `rgb(...)` background colour.
 *
 * The delta is divided by `maxAbsDelta` and clamped to [-1, 1]. Negative
 * values shade towards green (RWKV better) and non-negative values towards
 * red (RWKV worse). The shade strength is |normalized| raised to `exponent`,
 * and the two faded channels drop by at most 85% from 255.
 *
 * @param {number} tunedDelta - Signed delta for the token.
 * @param {number} maxAbsDelta - Normalisation scale.
 * @param {number} exponent - Power applied to the normalised magnitude.
 * @returns {string} Colour string of the form `rgb(r, g, b)`.
 */
function tunedDeltaToColor(tunedDelta, maxAbsDelta, exponent) {
    const ratio = tunedDelta / maxAbsDelta;
    const normalized = Math.max(-1, Math.min(1, ratio));

    const intensity = Math.pow(Math.abs(normalized), exponent);
    const faded = Math.round(255 * (1 - intensity * 0.85));

    if (normalized < 0) {
        // Green (RWKV better)
        return `rgb(${faded}, ${255}, ${faded})`;
    }
    // Red (RWKV worse)
    return `rgb(${255}, ${faded}, ${faded})`;
}
|
| 577 |
+
|
| 578 |
+
/**
 * Recolour every token for the chosen colour range.
 *
 * The top `colorRangePercent` percent of tokens by |tuned_delta| rank are
 * coloured, normalised against the largest deviation inside that range.
 * All other tokens, and every line-break token, are set to white.
 *
 * @param {number} colorRangePercent - Share of tokens to colour, 0-100.
 */
function updateColors(colorRangePercent) {
    const WHITE = 'rgb(255, 255, 255)';
    const colorCount = Math.round(tokenData.length * colorRangePercent / 100);

    // Exponent falls linearly from 1.0 at 0% to 0.5 at 100%.
    const exponent = 1 - (colorRangePercent / 100) * 0.5;

    const inRange = (entry) => entry.rank < colorCount;

    // Largest deviation among the tokens that will be coloured (floored at 1e-9).
    const maxAbsDeltaInRange = tokenData
        .filter(inRange)
        .reduce((peak, entry) => Math.max(peak, entry.absDelta), 1e-9);

    for (const entry of tokenData) {
        entry.token.style.backgroundColor = inRange(entry)
            ? tunedDeltaToColor(entry.tunedDelta, maxAbsDeltaInRange, exponent)
            : WHITE;
    }

    for (const token of linebreakTokens) {
        token.style.backgroundColor = WHITE;
    }
}
|
| 606 |
+
|
| 607 |
+
// Moving the slider updates the percentage label and recolours the tokens.
slider.addEventListener('input', (event) => {
    const percent = parseFloat(event.target.value);
    rangeValue.textContent = `${percent.toFixed(1)}%`;
    updateColors(percent);
});

// Apply the default 10% colour range on page load.
updateColors(10);
|
| 615 |
+
|
visualization/html_generator.py
CHANGED
|
@@ -4,16 +4,22 @@ HTML visualization generator for UncheatableEval.
|
|
| 4 |
Generates interactive HTML visualizations comparing byte-level losses between two models.
|
| 5 |
"""
|
| 6 |
|
| 7 |
-
import
|
| 8 |
import json
|
| 9 |
import math
|
| 10 |
import re
|
|
|
|
| 11 |
from typing import List, Tuple, Optional, Set
|
| 12 |
|
| 13 |
import numpy as np
|
| 14 |
|
|
|
|
|
|
|
|
|
|
| 15 |
from core.helpers import TokenizerBytesConverter
|
| 16 |
|
|
|
|
|
|
|
| 17 |
|
| 18 |
# Compression rate conversion factor
|
| 19 |
COMPRESSION_RATE_FACTOR = (1.0 / math.log(2.0)) * 0.125 * 100.0
|
|
@@ -113,10 +119,17 @@ def get_token_info_for_text(text: str) -> dict:
|
|
| 113 |
qwen_boundaries = set([0] + [t[1] for t in qwen_tokens])
|
| 114 |
rwkv_boundaries = set([0] + [t[1] for t in rwkv_tokens])
|
| 115 |
utf8_boundaries = set([0])
|
|
|
|
|
|
|
| 116 |
byte_pos = 0
|
| 117 |
for ch in text:
|
| 118 |
-
|
|
|
|
| 119 |
utf8_boundaries.add(byte_pos)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 120 |
common_boundaries = sorted(qwen_boundaries & rwkv_boundaries & utf8_boundaries)
|
| 121 |
# Ensure we always include the end boundary
|
| 122 |
text_end = len(text.encode("utf-8"))
|
|
@@ -124,6 +137,75 @@ def get_token_info_for_text(text: str) -> dict:
|
|
| 124 |
common_boundaries.append(text_end)
|
| 125 |
common_boundaries = sorted(common_boundaries)
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
return {
|
| 128 |
"common_boundaries": common_boundaries,
|
| 129 |
"qwen_tokens": qwen_tokens,
|
|
@@ -145,6 +227,8 @@ def generate_comparison_html(
|
|
| 145 |
tokenizer_b=None,
|
| 146 |
model_type_a: str = "hf",
|
| 147 |
model_type_b: str = "rwkv7",
|
|
|
|
|
|
|
| 148 |
) -> str:
|
| 149 |
"""
|
| 150 |
Generate an interactive HTML visualization comparing two models.
|
|
@@ -161,9 +245,11 @@ def generate_comparison_html(
|
|
| 161 |
tokenizer_b: Tokenizer for model B
|
| 162 |
model_type_a: Type of model A ("hf" or "rwkv7")
|
| 163 |
model_type_b: Type of model B ("hf" or "rwkv7")
|
|
|
|
|
|
|
| 164 |
|
| 165 |
Returns:
|
| 166 |
-
HTML string with interactive visualization
|
| 167 |
"""
|
| 168 |
|
| 169 |
def decode_token(token_id: int, tokenizer, model_type: str) -> Tuple[str, bool]:
|
|
@@ -197,7 +283,12 @@ def generate_comparison_html(
|
|
| 197 |
try:
|
| 198 |
if model_type in ["rwkv", "rwkv7"]:
|
| 199 |
# RWKV tokenizer provides raw bytes
|
| 200 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 201 |
if token_bytes:
|
| 202 |
try:
|
| 203 |
decoded = token_bytes.decode("utf-8")
|
|
@@ -290,7 +381,7 @@ def generate_comparison_html(
|
|
| 290 |
|
| 291 |
# Get token info
|
| 292 |
text_bytes = text.encode("utf-8")
|
| 293 |
-
token_info = get_token_info_for_text(text)
|
| 294 |
common_boundaries = token_info["common_boundaries"]
|
| 295 |
qwen_tokens = token_info["qwen_tokens"]
|
| 296 |
rwkv_tokens = token_info["rwkv_tokens"]
|
|
@@ -301,14 +392,13 @@ def generate_comparison_html(
|
|
| 301 |
|
| 302 |
def get_tokens_for_range(byte_start, byte_end, token_list):
|
| 303 |
result = []
|
| 304 |
-
for t_start, t_end, token_id, t_bytes in token_list:
|
| 305 |
if t_start < byte_end and t_end > byte_start:
|
| 306 |
-
result.append((token_id, t_bytes))
|
| 307 |
return result
|
| 308 |
|
| 309 |
# Build tokens based on common boundaries
|
| 310 |
tokens = []
|
| 311 |
-
token_count = 0
|
| 312 |
for i in range(len(common_boundaries) - 1):
|
| 313 |
start_byte = common_boundaries[i]
|
| 314 |
end_byte = common_boundaries[i + 1]
|
|
@@ -361,22 +451,8 @@ def generate_comparison_html(
|
|
| 361 |
token["word_id"] = word_id_counter
|
| 362 |
word_id_counter += 1
|
| 363 |
|
| 364 |
-
# Build HTML content
|
| 365 |
-
|
| 366 |
-
|
| 367 |
-
def escape_for_attr(s):
|
| 368 |
-
# Escape all characters that could break HTML attributes
|
| 369 |
-
# Order matters: & must be escaped first
|
| 370 |
-
return (
|
| 371 |
-
s.replace("&", "&")
|
| 372 |
-
.replace('"', """)
|
| 373 |
-
.replace("'", "'")
|
| 374 |
-
.replace("<", "<")
|
| 375 |
-
.replace(">", ">")
|
| 376 |
-
.replace("\n", " ")
|
| 377 |
-
.replace("\r", " ")
|
| 378 |
-
.replace("\t", "	")
|
| 379 |
-
)
|
| 380 |
|
| 381 |
for token in tokens:
|
| 382 |
token_text = token["text"]
|
|
@@ -402,18 +478,6 @@ def generate_comparison_html(
|
|
| 402 |
except UnicodeDecodeError:
|
| 403 |
return "".join([f"\\x{b:02x}" for b in token_bytes]), True
|
| 404 |
|
| 405 |
-
# Model A (RWKV7) - tokens overlapping this byte range
|
| 406 |
-
model_a_info = ""
|
| 407 |
-
if token["rwkv_tokens"]:
|
| 408 |
-
model_a_list = [[tid, *token_bytes_to_display_text(tb)] for tid, tb in token["rwkv_tokens"]]
|
| 409 |
-
model_a_info = base64.b64encode(json.dumps(model_a_list, ensure_ascii=False).encode("utf-8")).decode("ascii")
|
| 410 |
-
|
| 411 |
-
# Model B (Qwen3) - tokens overlapping this byte range
|
| 412 |
-
model_b_info = ""
|
| 413 |
-
if token["qwen_tokens"]:
|
| 414 |
-
model_b_list = [[tid, *token_bytes_to_display_text(tb)] for tid, tb in token["qwen_tokens"]]
|
| 415 |
-
model_b_info = base64.b64encode(json.dumps(model_b_list, ensure_ascii=False).encode("utf-8")).decode("ascii")
|
| 416 |
-
|
| 417 |
raw_bytes = list(text_bytes[byte_start:byte_end])
|
| 418 |
losses_a = byte_losses_a[byte_start:byte_end]
|
| 419 |
losses_b = byte_losses_b[byte_start:byte_end]
|
|
@@ -426,8 +490,8 @@ def generate_comparison_html(
|
|
| 426 |
avg_compression_a_token = sum(losses_a) / len(losses_a) * COMPRESSION_RATE_FACTOR if losses_a else 0
|
| 427 |
avg_compression_b_token = sum(losses_b) / len(losses_b) * COMPRESSION_RATE_FACTOR if losses_b else 0
|
| 428 |
|
| 429 |
-
|
| 430 |
-
|
| 431 |
if topk_predictions_a is not None and model_a_token_ranges:
|
| 432 |
model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
|
| 433 |
if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
|
|
@@ -435,19 +499,18 @@ def generate_comparison_html(
|
|
| 435 |
try:
|
| 436 |
if len(pred) >= 4:
|
| 437 |
actual_id, rank, actual_prob, topk_list = pred[0], pred[1], pred[2], pred[3]
|
| 438 |
-
|
| 439 |
actual_id,
|
| 440 |
rank,
|
| 441 |
actual_prob,
|
| 442 |
[[tid, prob, *decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in topk_list],
|
| 443 |
]
|
| 444 |
else:
|
| 445 |
-
|
| 446 |
pred[0],
|
| 447 |
pred[1],
|
| 448 |
[[tid, prob, *decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
|
| 449 |
]
|
| 450 |
-
topk_a_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
|
| 451 |
except Exception as e:
|
| 452 |
pass
|
| 453 |
if topk_predictions_b is not None and model_b_token_ranges:
|
|
@@ -457,20 +520,17 @@ def generate_comparison_html(
|
|
| 457 |
try:
|
| 458 |
if len(pred) >= 4:
|
| 459 |
actual_id, rank, actual_prob, topk_list = pred[0], pred[1], pred[2], pred[3]
|
| 460 |
-
|
| 461 |
actual_id,
|
| 462 |
rank,
|
| 463 |
actual_prob,
|
| 464 |
[[tid, prob, *decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in topk_list],
|
| 465 |
]
|
| 466 |
else:
|
| 467 |
-
|
| 468 |
-
topk_b_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
|
| 469 |
except Exception as e:
|
| 470 |
pass
|
| 471 |
|
| 472 |
-
token_count += 1
|
| 473 |
-
|
| 474 |
token_deltas = deltas[byte_start:byte_end]
|
| 475 |
avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
|
| 476 |
tuned_delta = avg_token_delta - avg_delta
|
|
@@ -478,695 +538,124 @@ def generate_comparison_html(
|
|
| 478 |
# Initial rendering uses white color, JavaScript will apply colors based on slider
|
| 479 |
r, g, b = 255, 255, 255
|
| 480 |
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
if char == "<":
|
| 484 |
-
escaped_char = "<"
|
| 485 |
-
elif char == ">":
|
| 486 |
-
escaped_char = ">"
|
| 487 |
-
elif char == "&":
|
| 488 |
-
escaped_char = "&"
|
| 489 |
-
elif char == "\t":
|
| 490 |
-
escaped_char = " "
|
| 491 |
-
else:
|
| 492 |
-
escaped_char = char
|
| 493 |
-
token_html_parts.append(escaped_char)
|
| 494 |
-
|
| 495 |
-
token_span_content = "".join(token_html_parts)
|
| 496 |
-
data_attrs = (
|
| 497 |
-
f'data-model-a="{escape_for_attr(model_a_info)}" '
|
| 498 |
-
f'data-model-b="{escape_for_attr(model_b_info)}" '
|
| 499 |
-
f'data-bytes="{escape_for_attr(bytes_str)}" '
|
| 500 |
-
f'data-compression-a="{escape_for_attr(compression_a_str)}" '
|
| 501 |
-
f'data-compression-b="{escape_for_attr(compression_b_str)}" '
|
| 502 |
-
f'data-avg-compression-a="{avg_compression_a_token:.2f}" '
|
| 503 |
-
f'data-avg-compression-b="{avg_compression_b_token:.2f}" '
|
| 504 |
-
f'data-tuned-delta="{tuned_delta:.6f}" '
|
| 505 |
-
f'data-topk-a="{escape_for_attr(topk_a_json)}" '
|
| 506 |
-
f'data-topk-b="{escape_for_attr(topk_b_json)}"'
|
| 507 |
-
)
|
| 508 |
-
style_attr = f'style="background-color: rgb({r},{g},{b})"'
|
| 509 |
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
|
| 521 |
-
|
| 522 |
-
|
| 523 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 524 |
|
| 525 |
delta_color = "#64ff64" if avg_delta < 0 else "#ff6464"
|
| 526 |
|
| 527 |
-
|
| 528 |
-
|
| 529 |
-
|
| 530 |
-
|
| 531 |
-
|
| 532 |
-
|
| 533 |
-
|
| 534 |
-
|
| 535 |
-
|
| 536 |
-
|
| 537 |
-
|
| 538 |
-
|
| 539 |
-
|
| 540 |
-
|
| 541 |
-
|
| 542 |
-
|
| 543 |
-
|
| 544 |
-
|
| 545 |
-
|
| 546 |
-
|
| 547 |
-
|
| 548 |
-
|
| 549 |
-
|
| 550 |
-
|
| 551 |
-
|
| 552 |
-
|
| 553 |
-
flex-wrap: wrap;
|
| 554 |
-
gap: 20px;
|
| 555 |
-
font-size: 12px;
|
| 556 |
-
color: #c8c8c8;
|
| 557 |
-
}}
|
| 558 |
-
.legend {{
|
| 559 |
-
display: flex;
|
| 560 |
-
gap: 15px;
|
| 561 |
-
margin-top: 10px;
|
| 562 |
-
}}
|
| 563 |
-
.legend-item {{
|
| 564 |
-
display: flex;
|
| 565 |
-
align-items: center;
|
| 566 |
-
gap: 5px;
|
| 567 |
-
}}
|
| 568 |
-
.legend-box {{
|
| 569 |
-
width: 20px;
|
| 570 |
-
height: 12px;
|
| 571 |
-
border: 1px solid #666;
|
| 572 |
-
}}
|
| 573 |
-
.content {{
|
| 574 |
-
background-color: white;
|
| 575 |
-
margin: 10px;
|
| 576 |
-
padding: 15px;
|
| 577 |
-
border: 1px solid #ccc;
|
| 578 |
-
font-size: 14px;
|
| 579 |
-
line-height: 1.8;
|
| 580 |
-
white-space: pre-wrap;
|
| 581 |
-
overflow-wrap: anywhere;
|
| 582 |
-
word-wrap: break-word;
|
| 583 |
-
position: relative;
|
| 584 |
-
}}
|
| 585 |
-
.content span {{
|
| 586 |
-
padding: 1px 0;
|
| 587 |
-
}}
|
| 588 |
-
.word {{
|
| 589 |
-
cursor: pointer;
|
| 590 |
-
position: relative;
|
| 591 |
-
}}
|
| 592 |
-
.word:hover {{
|
| 593 |
-
outline: 2px solid #007bff;
|
| 594 |
-
outline-offset: 1px;
|
| 595 |
-
}}
|
| 596 |
-
.word.highlighted {{
|
| 597 |
-
outline: 2px solid #ff6b6b;
|
| 598 |
-
outline-offset: 1px;
|
| 599 |
-
}}
|
| 600 |
-
#svg-overlay {{
|
| 601 |
-
position: fixed;
|
| 602 |
-
top: 0;
|
| 603 |
-
left: 0;
|
| 604 |
-
width: 100%;
|
| 605 |
-
height: 100%;
|
| 606 |
-
pointer-events: none;
|
| 607 |
-
z-index: 1000;
|
| 608 |
-
}}
|
| 609 |
-
.link-line {{
|
| 610 |
-
stroke: #007bff;
|
| 611 |
-
stroke-width: 2;
|
| 612 |
-
fill: none;
|
| 613 |
-
opacity: 0.7;
|
| 614 |
-
}}
|
| 615 |
-
.link-dot {{
|
| 616 |
-
fill: #007bff;
|
| 617 |
-
opacity: 0.8;
|
| 618 |
-
}}
|
| 619 |
-
.token {{
|
| 620 |
-
position: relative;
|
| 621 |
-
cursor: help;
|
| 622 |
-
}}
|
| 623 |
-
.token:hover {{
|
| 624 |
-
outline: 1px dashed #666;
|
| 625 |
-
}}
|
| 626 |
-
#tooltip {{
|
| 627 |
-
position: fixed;
|
| 628 |
-
background-color: rgba(0, 0, 0, 0.9);
|
| 629 |
-
color: white;
|
| 630 |
-
padding: 10px 14px;
|
| 631 |
-
border-radius: 6px;
|
| 632 |
-
font-size: 12px;
|
| 633 |
-
max-width: 500px;
|
| 634 |
-
z-index: 2000;
|
| 635 |
-
pointer-events: none;
|
| 636 |
-
display: none;
|
| 637 |
-
line-height: 1.6;
|
| 638 |
-
box-shadow: 0 2px 10px rgba(0,0,0,0.3);
|
| 639 |
-
}}
|
| 640 |
-
#tooltip .label {{
|
| 641 |
-
color: #aaa;
|
| 642 |
-
font-weight: bold;
|
| 643 |
-
}}
|
| 644 |
-
#tooltip .bytes {{
|
| 645 |
-
color: #a5f3fc;
|
| 646 |
-
font-family: monospace;
|
| 647 |
-
}}
|
| 648 |
-
#tooltip .loss-a {{
|
| 649 |
-
color: #86efac;
|
| 650 |
-
font-family: monospace;
|
| 651 |
-
}}
|
| 652 |
-
#tooltip .loss-b {{
|
| 653 |
-
color: #fca5a5;
|
| 654 |
-
font-family: monospace;
|
| 655 |
-
}}
|
| 656 |
-
#tooltip .model-a {{
|
| 657 |
-
color: #fcd34d;
|
| 658 |
-
}}
|
| 659 |
-
#tooltip .model-b {{
|
| 660 |
-
color: #7dd3fc;
|
| 661 |
-
}}
|
| 662 |
-
#tooltip .topk-section {{
|
| 663 |
-
margin-top: 8px;
|
| 664 |
-
padding-top: 8px;
|
| 665 |
-
border-top: 1px solid #555;
|
| 666 |
-
}}
|
| 667 |
-
#tooltip .topk-container {{
|
| 668 |
-
display: flex;
|
| 669 |
-
gap: 16px;
|
| 670 |
-
}}
|
| 671 |
-
#tooltip .topk-column {{
|
| 672 |
-
flex: 1;
|
| 673 |
-
min-width: 180px;
|
| 674 |
-
}}
|
| 675 |
-
#tooltip .topk-title {{
|
| 676 |
-
color: #aaa;
|
| 677 |
-
font-weight: bold;
|
| 678 |
-
margin-bottom: 4px;
|
| 679 |
-
font-size: 11px;
|
| 680 |
-
}}
|
| 681 |
-
#tooltip .topk-title.model-a {{
|
| 682 |
-
color: #86efac;
|
| 683 |
-
}}
|
| 684 |
-
#tooltip .topk-title.model-b {{
|
| 685 |
-
color: #fca5a5;
|
| 686 |
-
}}
|
| 687 |
-
#tooltip .topk-list {{
|
| 688 |
-
font-size: 11px;
|
| 689 |
-
}}
|
| 690 |
-
#tooltip .topk-item {{
|
| 691 |
-
display: flex;
|
| 692 |
-
gap: 4px;
|
| 693 |
-
padding: 1px 0;
|
| 694 |
-
align-items: flex-start;
|
| 695 |
-
}}
|
| 696 |
-
#tooltip .token-block {{
|
| 697 |
-
margin-top: 6px;
|
| 698 |
-
display: flex;
|
| 699 |
-
align-items: center;
|
| 700 |
-
gap: 6px;
|
| 701 |
-
white-space: nowrap;
|
| 702 |
-
}}
|
| 703 |
-
#tooltip .token-chips {{
|
| 704 |
-
display: flex;
|
| 705 |
-
flex-wrap: nowrap;
|
| 706 |
-
gap: 4px;
|
| 707 |
-
}}
|
| 708 |
-
#tooltip .token-chip-group {{
|
| 709 |
-
display: inline-flex;
|
| 710 |
-
align-items: center;
|
| 711 |
-
gap: 4px;
|
| 712 |
-
}}
|
| 713 |
-
#tooltip .token-id {{
|
| 714 |
-
color: #888;
|
| 715 |
-
font-family: monospace;
|
| 716 |
-
}}
|
| 717 |
-
#tooltip .token-chip {{
|
| 718 |
-
max-width: 100%;
|
| 719 |
-
}}
|
| 720 |
-
#tooltip .topk-rank {{
|
| 721 |
-
color: #888;
|
| 722 |
-
min-width: 18px;
|
| 723 |
-
}}
|
| 724 |
-
#tooltip .topk-rank.hit {{
|
| 725 |
-
color: #ffd700;
|
| 726 |
-
}}
|
| 727 |
-
#tooltip .topk-token {{
|
| 728 |
-
color: #a5f3fc;
|
| 729 |
-
white-space: pre-wrap;
|
| 730 |
-
overflow-wrap: anywhere;
|
| 731 |
-
word-break: break-word;
|
| 732 |
-
font-family: monospace;
|
| 733 |
-
background-color: rgba(255, 255, 255, 0.08);
|
| 734 |
-
padding: 0 4px;
|
| 735 |
-
border-radius: 3px;
|
| 736 |
-
display: inline-block;
|
| 737 |
-
max-width: 100%;
|
| 738 |
-
}}
|
| 739 |
-
#tooltip .esc-control {{
|
| 740 |
-
color: #fbbf24;
|
| 741 |
-
}}
|
| 742 |
-
#tooltip .esc-raw {{
|
| 743 |
-
color: #fb7185;
|
| 744 |
-
}}
|
| 745 |
-
#tooltip .topk-prob {{
|
| 746 |
-
color: #86efac;
|
| 747 |
-
min-width: 45px;
|
| 748 |
-
text-align: right;
|
| 749 |
-
}}
|
| 750 |
-
#tooltip .topk-hit {{
|
| 751 |
-
color: #22c55e;
|
| 752 |
-
}}
|
| 753 |
-
#tooltip .topk-miss {{
|
| 754 |
-
color: #ef4444;
|
| 755 |
-
font-style: italic;
|
| 756 |
-
}}
|
| 757 |
-
</style>
|
| 758 |
-
</head>
|
| 759 |
-
<body>
|
| 760 |
-
<svg id="svg-overlay"></svg>
|
| 761 |
-
<div id="tooltip"></div>
|
| 762 |
-
<div class="header">
|
| 763 |
-
<div class="meta">
|
| 764 |
-
<div>Model A: {model_a_name}</div>
|
| 765 |
-
<div>Model B: {model_b_name}</div>
|
| 766 |
-
<div>RWKV Compression: {avg_compression_a:.2f}%</div>
|
| 767 |
-
<div>Qwen Compression: {avg_compression_b:.2f}%</div>
|
| 768 |
-
<div style="color: {delta_color}">Avg Delta: {avg_delta_compression:+.2f}%</div>
|
| 769 |
-
</div>
|
| 770 |
-
<div class="legend">
|
| 771 |
-
<div class="legend-item">
|
| 772 |
-
<div class="legend-box" style="background-color: rgb(77, 255, 77)"></div>
|
| 773 |
-
<span>RWKV better than avg</span>
|
| 774 |
-
</div>
|
| 775 |
-
<div class="legend-item">
|
| 776 |
-
<div class="legend-box" style="background-color: rgb(255, 255, 255)"></div>
|
| 777 |
-
<span>Equal to avg</span>
|
| 778 |
</div>
|
| 779 |
-
<div class="legend
|
| 780 |
-
<div class="legend-
|
| 781 |
-
|
| 782 |
-
|
| 783 |
-
|
| 784 |
-
<
|
| 785 |
-
|
| 786 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 787 |
</div>
|
| 788 |
</div>
|
| 789 |
-
|
| 790 |
-
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
-
|
| 796 |
-
|
| 797 |
-
|
| 798 |
-
|
| 799 |
-
|
| 800 |
-
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
}}
|
| 808 |
-
|
| 809 |
-
function pickRectByY(rects, targetY) {{
|
| 810 |
-
if (!rects || rects.length === 0) return null;
|
| 811 |
-
let best = rects[0];
|
| 812 |
-
let bestDist = Infinity;
|
| 813 |
-
rects.forEach(r => {{
|
| 814 |
-
const cy = r.top + r.height / 2;
|
| 815 |
-
const dist = Math.abs(cy - targetY);
|
| 816 |
-
if (dist < bestDist) {{
|
| 817 |
-
best = r;
|
| 818 |
-
bestDist = dist;
|
| 819 |
-
}}
|
| 820 |
-
}});
|
| 821 |
-
return best;
|
| 822 |
-
}}
|
| 823 |
-
|
| 824 |
-
function getAnchorRect(element, targetY) {{
|
| 825 |
-
const rects = Array.from(element.getClientRects());
|
| 826 |
-
if (rects.length === 0) return element.getBoundingClientRect();
|
| 827 |
-
if (rects.length === 1) return rects[0];
|
| 828 |
-
const picked = pickRectByY(rects, targetY);
|
| 829 |
-
return picked || rects[0];
|
| 830 |
-
}}
|
| 831 |
-
|
| 832 |
-
function drawLines(hoveredWord, evt) {{
|
| 833 |
-
clearLines();
|
| 834 |
-
|
| 835 |
-
const wordText = hoveredWord.getAttribute('data-word');
|
| 836 |
-
const wordId = parseInt(hoveredWord.getAttribute('data-word-id'));
|
| 837 |
-
const sameWords = wordGroups[wordText] || [];
|
| 838 |
-
|
| 839 |
-
const previousWords = sameWords.filter(w => {{
|
| 840 |
-
const id = parseInt(w.getAttribute('data-word-id'));
|
| 841 |
-
return id < wordId;
|
| 842 |
-
}});
|
| 843 |
-
|
| 844 |
-
if (previousWords.length === 0) return;
|
| 845 |
-
|
| 846 |
-
sameWords.forEach(w => w.classList.add('highlighted'));
|
| 847 |
-
|
| 848 |
-
const targetY = evt ? evt.clientY : (hoveredWord.getBoundingClientRect().top + hoveredWord.getBoundingClientRect().height / 2);
|
| 849 |
-
const hoveredRect = getAnchorRect(hoveredWord, targetY);
|
| 850 |
-
const hoveredX = hoveredRect.left + hoveredRect.width / 2;
|
| 851 |
-
const hoveredY = hoveredRect.top + hoveredRect.height / 2;
|
| 852 |
-
|
| 853 |
-
previousWords.forEach(prevWord => {{
|
| 854 |
-
const prevRect = getAnchorRect(prevWord, hoveredY);
|
| 855 |
-
const prevX = prevRect.left + prevRect.width / 2;
|
| 856 |
-
const prevY = prevRect.top + prevRect.height / 2;
|
| 857 |
-
|
| 858 |
-
const midX = (hoveredX + prevX) / 2;
|
| 859 |
-
const midY = Math.min(hoveredY, prevY) - 30;
|
| 860 |
-
|
| 861 |
-
const path = document.createElementNS('http://www.w3.org/2000/svg', 'path');
|
| 862 |
-
path.setAttribute('class', 'link-line');
|
| 863 |
-
path.setAttribute('d', `M ${{prevX}} ${{prevY}} Q ${{midX}} ${{midY}} ${{hoveredX}} ${{hoveredY}}`);
|
| 864 |
-
svgOverlay.appendChild(path);
|
| 865 |
-
|
| 866 |
-
const dot1 = document.createElementNS('http://www.w3.org/2000/svg', 'circle');
|
| 867 |
-
dot1.setAttribute('class', 'link-dot');
|
| 868 |
-
dot1.setAttribute('cx', prevX);
|
| 869 |
-
dot1.setAttribute('cy', prevY);
|
| 870 |
-
dot1.setAttribute('r', 4);
|
| 871 |
-
svgOverlay.appendChild(dot1);
|
| 872 |
-
|
| 873 |
-
const dot2 = document.createElementNS('http://www.w3.org/2000/svg', 'circle');
|
| 874 |
-
dot2.setAttribute('class', 'link-dot');
|
| 875 |
-
dot2.setAttribute('cx', hoveredX);
|
| 876 |
-
dot2.setAttribute('cy', hoveredY);
|
| 877 |
-
dot2.setAttribute('r', 4);
|
| 878 |
-
svgOverlay.appendChild(dot2);
|
| 879 |
-
}});
|
| 880 |
-
}}
|
| 881 |
-
|
| 882 |
-
words.forEach(word => {{
|
| 883 |
-
word.addEventListener('mouseenter', (e) => drawLines(word, e));
|
| 884 |
-
word.addEventListener('mouseleave', clearLines);
|
| 885 |
-
}});
|
| 886 |
-
|
| 887 |
-
window.addEventListener('scroll', clearLines);
|
| 888 |
-
|
| 889 |
-
const tooltip = document.getElementById('tooltip');
|
| 890 |
-
const tokenSpans = document.querySelectorAll('.token');
|
| 891 |
-
|
| 892 |
-
tokenSpans.forEach(token => {{
|
| 893 |
-
token.addEventListener('mouseenter', (e) => {{
|
| 894 |
-
const modelA = token.getAttribute('data-model-a') || '';
|
| 895 |
-
const modelB = token.getAttribute('data-model-b') || '';
|
| 896 |
-
const bytes = token.getAttribute('data-bytes') || '';
|
| 897 |
-
const compressionA = token.getAttribute('data-compression-a') || '';
|
| 898 |
-
const compressionB = token.getAttribute('data-compression-b') || '';
|
| 899 |
-
const avgCompressionA = token.getAttribute('data-avg-compression-a') || '';
|
| 900 |
-
const avgCompressionB = token.getAttribute('data-avg-compression-b') || '';
|
| 901 |
-
const top5A = token.getAttribute('data-topk-a') || '';
|
| 902 |
-
const top5B = token.getAttribute('data-topk-b') || '';
|
| 903 |
-
|
| 904 |
-
function decodeBase64Json(base64Str) {{
|
| 905 |
-
const binaryString = atob(base64Str);
|
| 906 |
-
const bytes = new Uint8Array(binaryString.length);
|
| 907 |
-
for (let i = 0; i < binaryString.length; i++) {{
|
| 908 |
-
bytes[i] = binaryString.charCodeAt(i);
|
| 909 |
-
}}
|
| 910 |
-
const jsonStr = new TextDecoder('utf-8').decode(bytes);
|
| 911 |
-
return JSON.parse(jsonStr);
|
| 912 |
-
}}
|
| 913 |
-
|
| 914 |
-
function escapeControlChars(text) {{
|
| 915 |
-
if (!text) return text;
|
| 916 |
-
let out = '';
|
| 917 |
-
for (let i = 0; i < text.length; i++) {{
|
| 918 |
-
const ch = text[i];
|
| 919 |
-
const code = text.charCodeAt(i);
|
| 920 |
-
if (ch === '\\\\') {{
|
| 921 |
-
out += '\\\\\\\\';
|
| 922 |
-
}} else if (ch === '\\n') {{
|
| 923 |
-
out += '\\\\n';
|
| 924 |
-
}} else if (ch === '\\r') {{
|
| 925 |
-
out += '\\\\r';
|
| 926 |
-
}} else if (ch === '\\t') {{
|
| 927 |
-
out += '\\\\t';
|
| 928 |
-
}} else if (code < 32 || code === 127) {{
|
| 929 |
-
out += '\\\\x' + code.toString(16).padStart(2, '0');
|
| 930 |
-
}} else {{
|
| 931 |
-
out += ch;
|
| 932 |
-
}}
|
| 933 |
-
}}
|
| 934 |
-
return out;
|
| 935 |
-
}}
|
| 936 |
-
|
| 937 |
-
function renderEscapedWithControlColor(text) {{
|
| 938 |
-
const escaped = (text || '')
|
| 939 |
-
.replace(/&/g, '&')
|
| 940 |
-
.replace(/</g, '<')
|
| 941 |
-
.replace(/>/g, '>');
|
| 942 |
-
return escaped.replace(/\\\\(x[0-9a-fA-F]{2}|[nrt])/g, '<span class="esc-control">\\\\$1</span>');
|
| 943 |
-
}}
|
| 944 |
-
|
| 945 |
-
function formatTopkColumn(topkBase64, modelName, titleClass) {{
|
| 946 |
-
if (!topkBase64) return '<div class="topk-column"><div class="topk-title ' + titleClass + '">' + modelName + '</div><div class="topk-list">N/A</div></div>';
|
| 947 |
-
try {{
|
| 948 |
-
const data = decodeBase64Json(topkBase64);
|
| 949 |
-
let actualId = null;
|
| 950 |
-
let rank = null;
|
| 951 |
-
let actualProb = null;
|
| 952 |
-
let topkList = [];
|
| 953 |
-
if (data.length >= 4) {{
|
| 954 |
-
[actualId, rank, actualProb, topkList] = data;
|
| 955 |
-
}} else {{
|
| 956 |
-
[actualId, rank, topkList] = data;
|
| 957 |
-
}}
|
| 958 |
-
let html = '<div class="topk-column">';
|
| 959 |
-
html += '<div class="topk-title ' + titleClass + '">' + modelName + '</div>';
|
| 960 |
-
html += '<div class="topk-list">';
|
| 961 |
-
topkList.forEach((item, idx) => {{
|
| 962 |
-
const tokenId = item[0];
|
| 963 |
-
const prob = item[1];
|
| 964 |
-
const tokenText = item[2];
|
| 965 |
-
const isRaw = item.length > 3 ? item[3] : false;
|
| 966 |
-
const isHit = tokenId === actualId;
|
| 967 |
-
const rankClass = isHit ? 'topk-rank hit' : 'topk-rank';
|
| 968 |
-
const rawText = (tokenText !== undefined && tokenText !== null) ? tokenText : '';
|
| 969 |
-
let displayText = '';
|
| 970 |
-
let htmlText = '';
|
| 971 |
-
if (isRaw) {{
|
| 972 |
-
displayText = (rawText !== '') ? rawText : ('[' + tokenId + ']');
|
| 973 |
-
const escapedText = displayText
|
| 974 |
-
.replace(/&/g, '&')
|
| 975 |
-
.replace(/</g, '<')
|
| 976 |
-
.replace(/>/g, '>');
|
| 977 |
-
htmlText = '<span class="esc-raw">' + escapedText + '</span>';
|
| 978 |
-
}} else {{
|
| 979 |
-
const visibleText = escapeControlChars(rawText);
|
| 980 |
-
displayText = (visibleText !== '') ? visibleText : ('[' + tokenId + ']');
|
| 981 |
-
htmlText = renderEscapedWithControlColor(displayText);
|
| 982 |
-
}}
|
| 983 |
-
html += '<div class="topk-item">';
|
| 984 |
-
html += '<span class="' + rankClass + '">' + (idx + 1) + '.</span>';
|
| 985 |
-
html += '<span class="topk-token" title="ID: ' + tokenId + '">' + htmlText + '</span>';
|
| 986 |
-
html += '<span class="topk-prob">' + (prob * 100).toFixed(1) + '%</span>';
|
| 987 |
-
if (isHit) html += '<span class="topk-hit">✓</span>';
|
| 988 |
-
html += '</div>';
|
| 989 |
-
}});
|
| 990 |
-
if (rank > 10) {{
|
| 991 |
-
let probSuffix = '';
|
| 992 |
-
const probVal = parseFloat(actualProb);
|
| 993 |
-
if (!isNaN(probVal)) {{
|
| 994 |
-
probSuffix = ' (' + (probVal * 100).toFixed(4) + '%)';
|
| 995 |
-
}}
|
| 996 |
-
html += '<div class="topk-item topk-miss">Actual rank: ' + rank + probSuffix + '</div>';
|
| 997 |
-
}}
|
| 998 |
-
html += '</div></div>';
|
| 999 |
-
return html;
|
| 1000 |
-
}} catch (e) {{
|
| 1001 |
-
console.error('Error in formatTopkColumn for ' + modelName + ':', e);
|
| 1002 |
-
console.error('topkBase64:', topkBase64);
|
| 1003 |
-
return '<div class="topk-column"><div class="topk-title ' + titleClass + '">' + modelName + '</div><div class="topk-list">Error: ' + e.message + '</div></div>';
|
| 1004 |
-
}}
|
| 1005 |
-
}}
|
| 1006 |
-
|
| 1007 |
-
function formatTokenChips(modelBase64, label, labelClass) {{
|
| 1008 |
-
if (!modelBase64) {{
|
| 1009 |
-
return '<div class="token-block"><span class="label ' + labelClass + '">' + label + ':</span> <span class="topk-token token-chip">N/A</span></div>';
|
| 1010 |
-
}}
|
| 1011 |
-
try {{
|
| 1012 |
-
const tokenList = decodeBase64Json(modelBase64);
|
| 1013 |
-
let html = '<div class="token-block">';
|
| 1014 |
-
html += '<span class="label ' + labelClass + '">' + label + ':</span>';
|
| 1015 |
-
html += '<div class="token-chips">';
|
| 1016 |
-
tokenList.forEach((item) => {{
|
| 1017 |
-
const tokenId = item[0];
|
| 1018 |
-
const tokenText = item[1];
|
| 1019 |
-
const isRaw = item.length > 2 ? item[2] : false;
|
| 1020 |
-
let displayText = '';
|
| 1021 |
-
let htmlText = '';
|
| 1022 |
-
if (isRaw) {{
|
| 1023 |
-
displayText = tokenText || '';
|
| 1024 |
-
const escapedText = displayText
|
| 1025 |
-
.replace(/&/g, '&')
|
| 1026 |
-
.replace(/</g, '<')
|
| 1027 |
-
.replace(/>/g, '>');
|
| 1028 |
-
htmlText = '<span class="esc-raw">' + escapedText + '</span>';
|
| 1029 |
-
}} else {{
|
| 1030 |
-
const visible = escapeControlChars(tokenText || '');
|
| 1031 |
-
displayText = (visible !== '') ? visible : '';
|
| 1032 |
-
htmlText = renderEscapedWithControlColor(displayText);
|
| 1033 |
-
}}
|
| 1034 |
-
html += '<span class="token-chip-group" title="ID: ' + tokenId + '">';
|
| 1035 |
-
html += '<span class="token-id">[' + tokenId + ']</span>';
|
| 1036 |
-
html += '<span class="topk-token token-chip">' + htmlText + '</span>';
|
| 1037 |
-
html += '</span>';
|
| 1038 |
-
}});
|
| 1039 |
-
html += '</div></div>';
|
| 1040 |
-
return html;
|
| 1041 |
-
}} catch (e) {{
|
| 1042 |
-
console.error('Error in formatTokenChips for ' + label + ':', e);
|
| 1043 |
-
console.error('modelBase64:', modelBase64);
|
| 1044 |
-
return '<div class="token-block"><span class="label ' + labelClass + '">' + label + ':</span> <span class="topk-token token-chip">Error: ' + e.message + '</span></div>';
|
| 1045 |
-
}}
|
| 1046 |
-
}}
|
| 1047 |
-
|
| 1048 |
-
let tooltipHtml = `
|
| 1049 |
-
<div><span class="label">Bytes:</span> <span class="bytes">${{bytes || '(empty)'}}</span></div>
|
| 1050 |
-
<div><span class="label">RWKV Compression Rate:</span> <span class="loss-a">${{compressionA || '(empty)'}}${{avgCompressionA ? ' (avg: ' + avgCompressionA + '%)' : ''}}</span></div>
|
| 1051 |
-
<div><span class="label">Qwen Compression Rate:</span> <span class="loss-b">${{compressionB || '(empty)'}}${{avgCompressionB ? ' (avg: ' + avgCompressionB + '%)' : ''}}</span></div>
|
| 1052 |
-
<hr style="border-color: #555; margin: 6px 0;">
|
| 1053 |
-
${{formatTokenChips(modelA, 'RWKV', 'model-a')}}
|
| 1054 |
-
${{formatTokenChips(modelB, 'Qwen', 'model-b')}}
|
| 1055 |
-
`;
|
| 1056 |
-
if (top5A || top5B) {{
|
| 1057 |
-
tooltipHtml += '<div class="topk-section"><div class="topk-container">';
|
| 1058 |
-
tooltipHtml += formatTopkColumn(top5A, 'RWKV Top10', 'model-a');
|
| 1059 |
-
tooltipHtml += formatTopkColumn(top5B, 'Qwen Top10', 'model-b');
|
| 1060 |
-
tooltipHtml += '</div></div>';
|
| 1061 |
-
}}
|
| 1062 |
-
tooltip.innerHTML = tooltipHtml;
|
| 1063 |
-
tooltip.style.display = 'block';
|
| 1064 |
-
}});
|
| 1065 |
-
|
| 1066 |
-
token.addEventListener('mousemove', (e) => {{
|
| 1067 |
-
const tooltipRect = tooltip.getBoundingClientRect();
|
| 1068 |
-
const viewportWidth = window.innerWidth;
|
| 1069 |
-
const viewportHeight = window.innerHeight;
|
| 1070 |
-
|
| 1071 |
-
let x = e.clientX + 15;
|
| 1072 |
-
let y = e.clientY + 15;
|
| 1073 |
-
|
| 1074 |
-
if (x + tooltipRect.width > viewportWidth - 10) {{
|
| 1075 |
-
x = e.clientX - tooltipRect.width - 15;
|
| 1076 |
-
}}
|
| 1077 |
-
if (y + tooltipRect.height > viewportHeight - 10) {{
|
| 1078 |
-
y = e.clientY - tooltipRect.height - 15;
|
| 1079 |
-
}}
|
| 1080 |
-
if (x < 10) x = 10;
|
| 1081 |
-
if (y < 10) y = 10;
|
| 1082 |
-
|
| 1083 |
-
tooltip.style.left = x + 'px';
|
| 1084 |
-
tooltip.style.top = y + 'px';
|
| 1085 |
-
}});
|
| 1086 |
-
|
| 1087 |
-
token.addEventListener('mouseleave', () => {{
|
| 1088 |
-
tooltip.style.display = 'none';
|
| 1089 |
-
}});
|
| 1090 |
-
}});
|
| 1091 |
-
|
| 1092 |
-
const slider = document.getElementById('color-range-slider');
|
| 1093 |
-
const rangeValue = document.getElementById('color-range-value');
|
| 1094 |
-
|
| 1095 |
-
// Collect all tuned_delta values
|
| 1096 |
-
const tokenData = [];
|
| 1097 |
-
tokenSpans.forEach((token, idx) => {{
|
| 1098 |
-
const tunedDelta = parseFloat(token.getAttribute('data-tuned-delta'));
|
| 1099 |
-
if (!isNaN(tunedDelta)) {{
|
| 1100 |
-
tokenData.push({{ token, tunedDelta, absDelta: Math.abs(tunedDelta) }});
|
| 1101 |
-
}}
|
| 1102 |
-
}});
|
| 1103 |
-
|
| 1104 |
-
// Calculate max_abs_tuned_delta for normalization
|
| 1105 |
-
const maxAbsDelta = Math.max(...tokenData.map(d => d.absDelta), 1e-9);
|
| 1106 |
-
|
| 1107 |
-
// Sort by |tuned_delta| to get rankings
|
| 1108 |
-
const sortedByAbs = [...tokenData].sort((a, b) => b.absDelta - a.absDelta);
|
| 1109 |
-
sortedByAbs.forEach((item, rank) => {{
|
| 1110 |
-
item.rank = rank; // rank 0 = largest deviation
|
| 1111 |
-
}});
|
| 1112 |
-
|
| 1113 |
-
function tunedDeltaToColor(tunedDelta, maxAbsDelta, exponent) {{
|
| 1114 |
-
// Normalize to [-1, 1]
|
| 1115 |
-
const normalized = Math.max(-1, Math.min(1, tunedDelta / maxAbsDelta));
|
| 1116 |
-
let r, g, b;
|
| 1117 |
-
if (normalized < 0) {{
|
| 1118 |
-
// Green (RWKV better)
|
| 1119 |
-
const intensity = Math.pow(-normalized, exponent);
|
| 1120 |
-
r = Math.round(255 * (1 - intensity * 0.85));
|
| 1121 |
-
g = 255;
|
| 1122 |
-
b = Math.round(255 * (1 - intensity * 0.85));
|
| 1123 |
-
}} else {{
|
| 1124 |
-
// Red (RWKV worse)
|
| 1125 |
-
const intensity = Math.pow(normalized, exponent);
|
| 1126 |
-
r = 255;
|
| 1127 |
-
g = Math.round(255 * (1 - intensity * 0.85));
|
| 1128 |
-
b = Math.round(255 * (1 - intensity * 0.85));
|
| 1129 |
-
}}
|
| 1130 |
-
return `rgb(${{r}}, ${{g}}, ${{b}})`;
|
| 1131 |
-
}}
|
| 1132 |
-
|
| 1133 |
-
function updateColors(colorRangePercent) {{
|
| 1134 |
-
// colorRangePercent: 0-100, represents the proportion of tokens to color
|
| 1135 |
-
const colorCount = Math.round(tokenData.length * colorRangePercent / 100);
|
| 1136 |
-
|
| 1137 |
-
// Calculate exponent: 100% -> 0.5, 0% -> 1.0
|
| 1138 |
-
const exponent = 1 - (colorRangePercent / 100) * 0.5;
|
| 1139 |
-
|
| 1140 |
-
// Calculate max deviation within the colored range
|
| 1141 |
-
let maxAbsDeltaInRange = 1e-9;
|
| 1142 |
-
tokenData.forEach(item => {{
|
| 1143 |
-
if (item.rank < colorCount) {{
|
| 1144 |
-
maxAbsDeltaInRange = Math.max(maxAbsDeltaInRange, item.absDelta);
|
| 1145 |
-
}}
|
| 1146 |
-
}});
|
| 1147 |
-
|
| 1148 |
-
tokenData.forEach(item => {{
|
| 1149 |
-
if (item.rank < colorCount) {{
|
| 1150 |
-
// Use dynamic normalization based on colored range
|
| 1151 |
-
item.token.style.backgroundColor = tunedDeltaToColor(item.tunedDelta, maxAbsDeltaInRange, exponent);
|
| 1152 |
-
}} else {{
|
| 1153 |
-
// Outside color range, white
|
| 1154 |
-
item.token.style.backgroundColor = 'rgb(255, 255, 255)';
|
| 1155 |
-
}}
|
| 1156 |
-
}});
|
| 1157 |
-
}}
|
| 1158 |
-
|
| 1159 |
-
slider.addEventListener('input', (e) => {{
|
| 1160 |
-
const val = parseFloat(e.target.value);
|
| 1161 |
-
rangeValue.textContent = val.toFixed(1) + '%';
|
| 1162 |
-
updateColors(val);
|
| 1163 |
-
}});
|
| 1164 |
-
|
| 1165 |
-
// Apply default color range on page load
|
| 1166 |
-
updateColors(10);
|
| 1167 |
-
</script>
|
| 1168 |
-
</body>
|
| 1169 |
-
</html>
|
| 1170 |
-
"""
|
| 1171 |
-
|
| 1172 |
-
return html
|
|
|
|
| 4 |
Generates interactive HTML visualizations comparing byte-level losses between two models.
|
| 5 |
"""
|
| 6 |
|
| 7 |
+
import bisect
|
| 8 |
import json
|
| 9 |
import math
|
| 10 |
import re
|
| 11 |
+
from pathlib import Path
|
| 12 |
from typing import List, Tuple, Optional, Set
|
| 13 |
|
| 14 |
import numpy as np
|
| 15 |
|
| 16 |
+
from core.escaping import escape_json_for_script
|
| 17 |
+
from core.render_model import RenderModel, TokenInfo, build_display
|
| 18 |
+
from visualization.render import render_page
|
| 19 |
from core.helpers import TokenizerBytesConverter
|
| 20 |
|
| 21 |
+
ASSETS_DIR = Path(__file__).resolve().parent / "assets"
|
| 22 |
+
|
| 23 |
|
| 24 |
# Compression rate conversion factor
|
| 25 |
COMPRESSION_RATE_FACTOR = (1.0 / math.log(2.0)) * 0.125 * 100.0
|
|
|
|
| 119 |
qwen_boundaries = set([0] + [t[1] for t in qwen_tokens])
|
| 120 |
rwkv_boundaries = set([0] + [t[1] for t in rwkv_tokens])
|
| 121 |
utf8_boundaries = set([0])
|
| 122 |
+
whitespace_boundaries = set()
|
| 123 |
+
linebreak_boundaries = set()
|
| 124 |
byte_pos = 0
|
| 125 |
for ch in text:
|
| 126 |
+
ch_bytes = ch.encode("utf-8")
|
| 127 |
+
byte_pos += len(ch_bytes)
|
| 128 |
utf8_boundaries.add(byte_pos)
|
| 129 |
+
if ch.isspace():
|
| 130 |
+
whitespace_boundaries.add(byte_pos)
|
| 131 |
+
if ch in ("\n", "\r"):
|
| 132 |
+
linebreak_boundaries.add(byte_pos)
|
| 133 |
common_boundaries = sorted(qwen_boundaries & rwkv_boundaries & utf8_boundaries)
|
| 134 |
# Ensure we always include the end boundary
|
| 135 |
text_end = len(text.encode("utf-8"))
|
|
|
|
| 137 |
common_boundaries.append(text_end)
|
| 138 |
common_boundaries = sorted(common_boundaries)
|
| 139 |
|
| 140 |
+
# Refine overly large segments to avoid giant spans in the UI.
|
| 141 |
+
max_segment_bytes = 24
|
| 142 |
+
utf8_sorted = sorted(utf8_boundaries)
|
| 143 |
+
linebreak_sorted = sorted(linebreak_boundaries)
|
| 144 |
+
|
| 145 |
+
def split_by_max(start: int, end: int) -> List[int]:
    """Cut the byte range (start, end] into smaller pieces and return their end offsets.

    A range no longer than ``max_segment_bytes`` is returned unchanged as
    ``[end]``.  Otherwise cut points are taken from ``utf8_sorted`` (only
    values strictly between ``start`` and ``end``), looking at most
    ``max_segment_bytes`` ahead of the previous cut and preferring the
    furthest such point that is also in ``whitespace_boundaries``.

    Args:
        start: Byte offset where the range begins (never emitted itself).
        end: Byte offset where the range ends; always the last value returned.

    Returns:
        Strictly increasing list of cut offsets, terminated by ``end``.
    """
    if end - start <= max_segment_bytes:
        return [end]

    # Candidate cut points: entries of utf8_sorted strictly inside (start, end).
    left = bisect.bisect_right(utf8_sorted, start)
    right = bisect.bisect_left(utf8_sorted, end)
    candidates = utf8_sorted[left:right]
    if not candidates:
        return [end]

    out = []
    pos = start
    idx = 0  # first candidate not yet consumed; candidates[idx] > pos always holds
    while idx < len(candidates):
        limit = min(end, pos + max_segment_bytes)
        # Last candidate that does not exceed the current window.
        j = bisect.bisect_right(candidates, limit) - 1
        if j < idx:
            # No candidate inside the window: emit the remainder as one piece.
            break
        # Walk backwards through the window looking for a whitespace boundary;
        # if none is found, j stays at the furthest candidate in the window.
        for k in range(j, idx - 1, -1):
            if candidates[k] in whitespace_boundaries:
                j = k
                break
        pos = candidates[j]
        out.append(pos)
        idx = j + 1

    # Every candidate is < end, so the closing offset is always still missing.
    out.append(end)
    return out
|
| 185 |
+
|
| 186 |
+
def split_segment(start: int, end: int) -> List[int]:
    """Split (start, end] first at line breaks, then by size via ``split_by_max``.

    Line-break offsets are taken from ``linebreak_sorted`` and only those
    strictly between ``start`` and ``end`` are used.  Each stretch between
    consecutive cuts is handed to ``split_by_max`` and the results are
    concatenated in order.

    Returns:
        List of cut offsets for the range, or an empty list when
        ``start >= end``.
    """
    if start >= end:
        return []

    first = bisect.bisect_right(linebreak_sorted, start)
    last = bisect.bisect_left(linebreak_sorted, end)
    # Interior line breaks followed by the closing offset of the range.
    cut_points = linebreak_sorted[first:last] + [end]

    pieces = []
    previous = start
    for cut in cut_points:
        pieces.extend(split_by_max(previous, cut))
        previous = cut
    return pieces
|
| 201 |
+
|
| 202 |
+
refined_boundaries = [common_boundaries[0]] if common_boundaries else [0]
|
| 203 |
+
for i in range(len(common_boundaries) - 1):
|
| 204 |
+
start = common_boundaries[i]
|
| 205 |
+
end = common_boundaries[i + 1]
|
| 206 |
+
refined_boundaries.extend(split_segment(start, end))
|
| 207 |
+
common_boundaries = sorted(set(refined_boundaries))
|
| 208 |
+
|
| 209 |
return {
|
| 210 |
"common_boundaries": common_boundaries,
|
| 211 |
"qwen_tokens": qwen_tokens,
|
|
|
|
| 227 |
tokenizer_b=None,
|
| 228 |
model_type_a: str = "hf",
|
| 229 |
model_type_b: str = "rwkv7",
|
| 230 |
+
token_info_override: Optional[dict] = None,
|
| 231 |
+
return_render_model: bool = False,
|
| 232 |
) -> str:
|
| 233 |
"""
|
| 234 |
Generate an interactive HTML visualization comparing two models.
|
|
|
|
| 245 |
tokenizer_b: Tokenizer for model B
|
| 246 |
model_type_a: Type of model A ("hf" or "rwkv7")
|
| 247 |
model_type_b: Type of model B ("hf" or "rwkv7")
|
| 248 |
+
token_info_override: Optional precomputed token info (for offline tests).
|
| 249 |
+
return_render_model: If True, return (html, render_model_dict)
|
| 250 |
|
| 251 |
Returns:
|
| 252 |
+
HTML string with interactive visualization, or (html, render_model_dict) if return_render_model=True
|
| 253 |
"""
|
| 254 |
|
| 255 |
def decode_token(token_id: int, tokenizer, model_type: str) -> Tuple[str, bool]:
|
|
|
|
| 283 |
try:
|
| 284 |
if model_type in ["rwkv", "rwkv7"]:
|
| 285 |
# RWKV tokenizer provides raw bytes
|
| 286 |
+
try:
|
| 287 |
+
token_bytes = tokenizer.decodeBytes([token_id])
|
| 288 |
+
except Exception as e:
|
| 289 |
+
if token_id == 0:
|
| 290 |
+
return f"[{token_id}]", False
|
| 291 |
+
raise e
|
| 292 |
if token_bytes:
|
| 293 |
try:
|
| 294 |
decoded = token_bytes.decode("utf-8")
|
|
|
|
| 381 |
|
| 382 |
# Get token info
|
| 383 |
text_bytes = text.encode("utf-8")
|
| 384 |
+
token_info = token_info_override if token_info_override is not None else get_token_info_for_text(text)
|
| 385 |
common_boundaries = token_info["common_boundaries"]
|
| 386 |
qwen_tokens = token_info["qwen_tokens"]
|
| 387 |
rwkv_tokens = token_info["rwkv_tokens"]
|
|
|
|
| 392 |
|
| 393 |
def get_tokens_for_range(byte_start, byte_end, token_list):
    """Collect the tokens whose byte span overlaps [byte_start, byte_end).

    Each entry of ``token_list`` is unpacked as
    ``(t_start, t_end, token_id, t_bytes)``.

    Returns:
        ``(index_in_token_list, token_id, t_bytes)`` tuples, in list order,
        for every token with ``t_start < byte_end`` and ``t_end > byte_start``.
    """
    return [
        (position, tok_id, tok_bytes)
        for position, (tok_start, tok_end, tok_id, tok_bytes) in enumerate(token_list)
        if tok_end > byte_start and tok_start < byte_end
    ]
|
| 399 |
|
| 400 |
# Build tokens based on common boundaries
|
| 401 |
tokens = []
|
|
|
|
| 402 |
for i in range(len(common_boundaries) - 1):
|
| 403 |
start_byte = common_boundaries[i]
|
| 404 |
end_byte = common_boundaries[i + 1]
|
|
|
|
| 451 |
token["word_id"] = word_id_counter
|
| 452 |
word_id_counter += 1
|
| 453 |
|
| 454 |
+
# Build render model (HTML content built in JS)
|
| 455 |
+
render_tokens = []
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 456 |
|
| 457 |
for token in tokens:
|
| 458 |
token_text = token["text"]
|
|
|
|
| 478 |
except UnicodeDecodeError:
|
| 479 |
return "".join([f"\\x{b:02x}" for b in token_bytes]), True
|
| 480 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 481 |
raw_bytes = list(text_bytes[byte_start:byte_end])
|
| 482 |
losses_a = byte_losses_a[byte_start:byte_end]
|
| 483 |
losses_b = byte_losses_b[byte_start:byte_end]
|
|
|
|
| 490 |
avg_compression_a_token = sum(losses_a) / len(losses_a) * COMPRESSION_RATE_FACTOR if losses_a else 0
|
| 491 |
avg_compression_b_token = sum(losses_b) / len(losses_b) * COMPRESSION_RATE_FACTOR if losses_b else 0
|
| 492 |
|
| 493 |
+
topk_a_data = None
|
| 494 |
+
topk_b_data = None
|
| 495 |
if topk_predictions_a is not None and model_a_token_ranges:
|
| 496 |
model_a_token_idx = find_token_for_byte(byte_start, model_a_token_ranges)
|
| 497 |
if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
|
|
|
|
| 499 |
try:
|
| 500 |
if len(pred) >= 4:
|
| 501 |
actual_id, rank, actual_prob, topk_list = pred[0], pred[1], pred[2], pred[3]
|
| 502 |
+
topk_a_data = [
|
| 503 |
actual_id,
|
| 504 |
rank,
|
| 505 |
actual_prob,
|
| 506 |
[[tid, prob, *decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in topk_list],
|
| 507 |
]
|
| 508 |
else:
|
| 509 |
+
topk_a_data = [
|
| 510 |
pred[0],
|
| 511 |
pred[1],
|
| 512 |
[[tid, prob, *decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
|
| 513 |
]
|
|
|
|
| 514 |
except Exception as e:
|
| 515 |
pass
|
| 516 |
if topk_predictions_b is not None and model_b_token_ranges:
|
|
|
|
| 520 |
try:
|
| 521 |
if len(pred) >= 4:
|
| 522 |
actual_id, rank, actual_prob, topk_list = pred[0], pred[1], pred[2], pred[3]
|
| 523 |
+
topk_b_data = [
|
| 524 |
actual_id,
|
| 525 |
rank,
|
| 526 |
actual_prob,
|
| 527 |
[[tid, prob, *decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in topk_list],
|
| 528 |
]
|
| 529 |
else:
|
| 530 |
+
topk_b_data = [pred[0], pred[1], [[tid, prob, *decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
|
|
|
|
| 531 |
except Exception as e:
|
| 532 |
pass
|
| 533 |
|
|
|
|
|
|
|
| 534 |
token_deltas = deltas[byte_start:byte_end]
|
| 535 |
avg_token_delta = sum(token_deltas) / len(token_deltas) if token_deltas else 0
|
| 536 |
tuned_delta = avg_token_delta - avg_delta
|
|
|
|
| 538 |
# Initial rendering uses white color, JavaScript will apply colors based on slider
|
| 539 |
r, g, b = 255, 255, 255
|
| 540 |
|
| 541 |
+
raw_display_text = token_text
|
| 542 |
+
display_text = token_text.replace("\t", " ")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 543 |
|
| 544 |
+
def classify_kind(text_value: str, is_raw_value: bool) -> str:
    """Return the ``kind`` of the display object ``build_display`` creates for the text."""
    display = build_display(text_value, is_raw=is_raw_value)
    return display.kind
|
| 546 |
+
|
| 547 |
+
def get_actual_prob(topk_predictions, token_idx: Optional[int]):
    """Look up the actual-token probability stored for one model token.

    Prediction entries come in two layouts elsewhere in this function:
    ``(actual_id, rank, actual_prob, topk_list)`` and the shorter
    ``(actual_id, rank, topk_list)``.  Only the four-field layout carries a
    probability at index 2; in the three-field layout index 2 is the top-k
    list, so it must not be returned as a probability.

    Args:
        topk_predictions: Sequence of prediction entries, or None/empty.
        token_idx: Index of the model token, or None when unknown.

    Returns:
        ``pred[2]`` of the selected four-field entry, or None when the
        predictions are missing, the index is out of range, or the entry
        does not carry a probability.
    """
    if not topk_predictions or token_idx is None:
        return None
    if token_idx < 0 or token_idx >= len(topk_predictions):
        return None
    pred = topk_predictions[token_idx]
    # Require the four-field layout: with three fields pred[2] is the top-k list.
    if isinstance(pred, (list, tuple)) and len(pred) >= 4:
        return pred[2]
    return None
|
| 556 |
+
|
| 557 |
+
model_tokens_render = {}
|
| 558 |
+
if token["rwkv_tokens"]:
|
| 559 |
+
rwkv_items = []
|
| 560 |
+
for tok_idx, tid, tb in token["rwkv_tokens"]:
|
| 561 |
+
txt, is_raw = token_bytes_to_display_text(tb)
|
| 562 |
+
rwkv_items.append([tid, txt, classify_kind(txt, is_raw), get_actual_prob(topk_predictions_a, tok_idx)])
|
| 563 |
+
model_tokens_render["rwkv"] = rwkv_items
|
| 564 |
+
if token["qwen_tokens"]:
|
| 565 |
+
qwen_items = []
|
| 566 |
+
for tok_idx, tid, tb in token["qwen_tokens"]:
|
| 567 |
+
txt, is_raw = token_bytes_to_display_text(tb)
|
| 568 |
+
qwen_items.append([tid, txt, classify_kind(txt, is_raw), get_actual_prob(topk_predictions_b, tok_idx)])
|
| 569 |
+
model_tokens_render["qwen"] = qwen_items
|
| 570 |
+
|
| 571 |
+
display_info = build_display(raw_display_text, is_raw=not decoded_ok)
|
| 572 |
+
if display_info.kind == "control":
|
| 573 |
+
display_text = raw_display_text
|
| 574 |
+
display_info.text = display_text
|
| 575 |
+
render_tokens.append(
|
| 576 |
+
TokenInfo(
|
| 577 |
+
byte_start=byte_start,
|
| 578 |
+
byte_end=byte_end,
|
| 579 |
+
display=display_info,
|
| 580 |
+
is_word=token["type"] == "word",
|
| 581 |
+
word_id=token.get("word_id"),
|
| 582 |
+
word_key=token.get("word_lower"),
|
| 583 |
+
bytes_hex=bytes_str,
|
| 584 |
+
compression={"rwkv": compression_a_str, "qwen": compression_b_str},
|
| 585 |
+
model_tokens=model_tokens_render,
|
| 586 |
+
loss={"rwkv": avg_compression_a_token, "qwen": avg_compression_b_token},
|
| 587 |
+
topk={
|
| 588 |
+
"rwkv": topk_a_data,
|
| 589 |
+
"qwen": topk_b_data,
|
| 590 |
+
},
|
| 591 |
+
tuned_delta=tuned_delta,
|
| 592 |
+
)
|
| 593 |
+
)
|
| 594 |
|
| 595 |
delta_color = "#64ff64" if avg_delta < 0 else "#ff6464"
|
| 596 |
|
| 597 |
+
render_model = RenderModel(
|
| 598 |
+
text=text,
|
| 599 |
+
tokens=render_tokens,
|
| 600 |
+
meta={
|
| 601 |
+
"model_a": model_a_name,
|
| 602 |
+
"model_b": model_b_name,
|
| 603 |
+
"avg_compression": {
|
| 604 |
+
"rwkv": avg_compression_a,
|
| 605 |
+
"qwen": avg_compression_b,
|
| 606 |
+
},
|
| 607 |
+
"avg_delta": avg_delta,
|
| 608 |
+
"avg_delta_compression": avg_delta_compression,
|
| 609 |
+
},
|
| 610 |
+
)
|
| 611 |
+
render_model_json = escape_json_for_script(render_model.to_dict())
|
| 612 |
+
|
| 613 |
+
style_block = (ASSETS_DIR / "main.css").read_text(encoding="utf-8")
|
| 614 |
+
|
| 615 |
+
header_html = f"""
|
| 616 |
+
<div class="header">
|
| 617 |
+
<div class="meta">
|
| 618 |
+
<div>Model A: {model_a_name}</div>
|
| 619 |
+
<div>Model B: {model_b_name}</div>
|
| 620 |
+
<div>RWKV Compression: {avg_compression_a:.2f}%</div>
|
| 621 |
+
<div>Qwen Compression: {avg_compression_b:.2f}%</div>
|
| 622 |
+
<div style="color: {delta_color}">Avg Delta: {avg_delta_compression:+.2f}%</div>
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
</div>
|
| 624 |
+
<div class="legend">
|
| 625 |
+
<div class="legend-item">
|
| 626 |
+
<div class="legend-box" style="background-color: rgb(77, 255, 77)"></div>
|
| 627 |
+
<span>RWKV better than avg</span>
|
| 628 |
+
</div>
|
| 629 |
+
<div class="legend-item">
|
| 630 |
+
<div class="legend-box" style="background-color: rgb(255, 255, 255)"></div>
|
| 631 |
+
<span>Equal to avg</span>
|
| 632 |
+
</div>
|
| 633 |
+
<div class="legend-item">
|
| 634 |
+
<div class="legend-box" style="background-color: rgb(255, 77, 77)"></div>
|
| 635 |
+
<span>RWKV worse than avg</span>
|
| 636 |
+
</div>
|
| 637 |
+
<div class="legend-item" style="margin-left: 20px;">
|
| 638 |
+
<span style="color: #aaa;">Color Range:</span>
|
| 639 |
+
<input type="range" id="color-range-slider" min="0" max="100" value="10" step="0.1" style="width: 200px; vertical-align: middle;">
|
| 640 |
+
<span id="color-range-value" style="color: #fff; min-width: 45px; display: inline-block;">10%</span>
|
| 641 |
+
</div>
|
| 642 |
</div>
|
| 643 |
</div>
|
| 644 |
+
""".strip("\n")
|
| 645 |
+
|
| 646 |
+
script_body = (ASSETS_DIR / "main.js").read_text(encoding="utf-8")
|
| 647 |
+
|
| 648 |
+
html_doc = render_page(
|
| 649 |
+
{
|
| 650 |
+
"page_title": "Model Comparison",
|
| 651 |
+
"style_block": style_block.strip("\n"),
|
| 652 |
+
"header_html": header_html,
|
| 653 |
+
"content_html": "",
|
| 654 |
+
"render_model_json": render_model_json,
|
| 655 |
+
"script_body": script_body.strip("\n"),
|
| 656 |
+
}
|
| 657 |
+
)
|
| 658 |
+
|
| 659 |
+
if return_render_model:
|
| 660 |
+
return html_doc, render_model.to_dict()
|
| 661 |
+
return html_doc
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
visualization/render.py
ADDED
|
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Page rendering using a lightweight template.
|
| 3 |
+
"""
|
| 4 |
+
|
| 5 |
+
from pathlib import Path
|
| 6 |
+
from string import Template
|
| 7 |
+
from typing import Dict
|
| 8 |
+
|
| 9 |
+
|
| 10 |
+
_TEMPLATE_PATH = Path(__file__).resolve().parent / "templates" / "page.html.tmpl"
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
def render_page(context: Dict[str, str]) -> str:
    """Fill the page template with ``context`` and return the resulting text.

    The template file at ``_TEMPLATE_PATH`` is read as UTF-8 on every call and
    expanded with ``string.Template.safe_substitute``, so placeholders that
    have no entry in ``context`` are left in the output unchanged.
    """
    page_template = Template(_TEMPLATE_PATH.read_text(encoding="utf-8"))
    return page_template.safe_substitute(context)
|
visualization/templates/page.html.tmpl
ADDED
|
@@ -0,0 +1,20 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<!DOCTYPE html>
|
| 2 |
+
<html>
|
| 3 |
+
<head>
|
| 4 |
+
<meta charset="UTF-8">
|
| 5 |
+
<title>$page_title</title>
|
| 6 |
+
<style>
|
| 7 |
+
$style_block
|
| 8 |
+
</style>
|
| 9 |
+
</head>
|
| 10 |
+
<body>
|
| 11 |
+
<svg id="svg-overlay"></svg>
|
| 12 |
+
<div id="tooltip"></div>
|
| 13 |
+
$header_html
|
| 14 |
+
<div class="content">$content_html</div>
|
| 15 |
+
<script id="render-model" type="application/json">$render_model_json</script>
|
| 16 |
+
<script>
|
| 17 |
+
$script_body
|
| 18 |
+
</script>
|
| 19 |
+
</body>
|
| 20 |
+
</html>
|