Jellyfish042 committed on
Commit
44c2c6d
·
1 Parent(s): 257183f

bug fix and improvements

Browse files
app.py CHANGED
@@ -12,7 +12,8 @@ import gradio as gr
12
  import torch
13
 
14
  # Detect device
15
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
16
  IS_CPU = DEVICE == "cpu"
17
 
18
  # Model configuration
 
12
  import torch
13
 
14
  # Detect device
15
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
16
+ DEVICE = "cpu"
17
  IS_CPU = DEVICE == "cpu"
18
 
19
  # Model configuration
core/evaluator.py CHANGED
@@ -47,7 +47,7 @@ def extract_topk_predictions(logit: torch.Tensor, target_ids: torch.Tensor, k: i
47
  k: number of top predictions to extract (default: 10)
48
 
49
  Returns:
50
- list: [[actual_id, rank, [[id1, prob1], [id2, prob2], ...]], ...]
51
  """
52
  probs = F.softmax(logit, dim=-1)
53
  top_probs, top_ids = torch.topk(probs, k, dim=-1)
@@ -59,7 +59,7 @@ def extract_topk_predictions(logit: torch.Tensor, target_ids: torch.Tensor, k: i
59
  rank = (probs[pos] > actual_prob).sum().item() + 1
60
 
61
  topk_list = [[top_ids[pos, i].item(), round(top_probs[pos, i].item(), 6)] for i in range(k)]
62
- results.append([target_id, rank, topk_list])
63
 
64
  return results
65
 
 
47
  k: number of top predictions to extract (default: 10)
48
 
49
  Returns:
50
+ list: [[actual_id, rank, actual_prob, [[id1, prob1], [id2, prob2], ...]], ...]
51
  """
52
  probs = F.softmax(logit, dim=-1)
53
  top_probs, top_ids = torch.topk(probs, k, dim=-1)
 
59
  rank = (probs[pos] > actual_prob).sum().item() + 1
60
 
61
  topk_list = [[top_ids[pos, i].item(), round(top_probs[pos, i].item(), 6)] for i in range(k)]
62
+ results.append([target_id, rank, actual_prob, topk_list])
63
 
64
  return results
65
 
precompute_example.py CHANGED
@@ -26,7 +26,8 @@ QWEN_MODEL_ID = "Qwen/Qwen3-1.7B-Base"
26
  RWKV_MODEL_FILENAME = "rwkv7-g1c-1.5b-20260110-ctx8192.pth"
27
 
28
  # Detect device
29
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
30
  IS_CPU = DEVICE == "cpu"
31
 
32
 
 
26
  RWKV_MODEL_FILENAME = "rwkv7-g1c-1.5b-20260110-ctx8192.pth"
27
 
28
  # Detect device
29
+ # DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
30
+ DEVICE = "cpu"
31
  IS_CPU = DEVICE == "cpu"
32
 
33
 
precomputed/example_metadata.json CHANGED
@@ -1,7 +1,7 @@
1
  {
2
  "example_text": "The Bitter Lesson\nRich Sutton\nMarch 13, 2019\nThe biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.\n\nIn computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. 
They said that ``brute force\" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.\n\nA similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.\n\nIn speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. 
The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.\n\nIn computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.\n\nThis is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. 
The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.\n\nOne thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.\n\nThe second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.\n",
3
- "qwen_inference_time": 21.767822980880737,
4
- "rwkv_inference_time": 33.561607122421265,
5
  "qwen_compression_rate": 48.14428559434192,
6
- "rwkv_compression_rate": 47.624574152536056
7
  }
 
1
  {
2
  "example_text": "The Bitter Lesson\nRich Sutton\nMarch 13, 2019\nThe biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.\n\nIn computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. 
They said that ``brute force\" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.\n\nA similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.\n\nIn speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. 
The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.\n\nIn computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.\n\nThis is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. 
The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.\n\nOne thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.\n\nThe second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.\n",
3
+ "qwen_inference_time": 20.516680479049683,
4
+ "rwkv_inference_time": 31.14354944229126,
5
  "qwen_compression_rate": 48.14428559434192,
6
+ "rwkv_compression_rate": 47.62502588510778
7
  }
precomputed/example_visualization.html CHANGED
The diff for this file is too large to render. See raw diff
 
visualization/html_generator.py CHANGED
@@ -4,6 +4,7 @@ HTML visualization generator for UncheatableEval.
4
  Generates interactive HTML visualizations comparing byte-level losses between two models.
5
  """
6
 
 
7
  import json
8
  import math
9
  import re
@@ -20,6 +21,7 @@ COMPRESSION_RATE_FACTOR = (1.0 / math.log(2.0)) * 0.125 * 100.0
20
  # Global tokenizers (lazy loaded)
21
  _qwen_tokenizer = None
22
  _rwkv_tokenizer = None
 
23
 
24
 
25
  def get_qwen_tokenizer():
@@ -83,12 +85,9 @@ def get_token_info_for_text(text: str) -> dict:
83
  byte_pos = 0
84
  for idx, (token_id, token_bytes) in enumerate(qwen_id_and_bytes):
85
  start = byte_pos
86
- end = byte_pos + len(token_bytes)
87
- try:
88
- token_str = bytes(token_bytes).decode("utf-8")
89
- except UnicodeDecodeError:
90
- token_str = repr(bytes(token_bytes))
91
- qwen_tokens.append((start, end, token_id, token_str))
92
  byte_to_qwen[start] = idx
93
  byte_pos = end
94
 
@@ -106,18 +105,24 @@ def get_token_info_for_text(text: str) -> dict:
106
  token_bytes = rwkv_tokenizer.decodeBytes([token_id])
107
  start = byte_pos
108
  end = byte_pos + len(token_bytes)
109
- try:
110
- token_str = token_bytes.decode("utf-8")
111
- except UnicodeDecodeError:
112
- token_str = repr(token_bytes)
113
- rwkv_tokens.append((start, end, token_id, token_str))
114
  byte_to_rwkv[start] = idx
115
  byte_pos = end
116
 
117
- # Get common boundaries
118
  qwen_boundaries = set([0] + [t[1] for t in qwen_tokens])
119
  rwkv_boundaries = set([0] + [t[1] for t in rwkv_tokens])
120
- common_boundaries = sorted(qwen_boundaries & rwkv_boundaries)
 
 
 
 
 
 
 
 
 
 
121
 
122
  return {
123
  "common_boundaries": common_boundaries,
@@ -163,16 +168,58 @@ def generate_comparison_html(
163
 
164
  def decode_token(token_id: int, tokenizer, model_type: str) -> str:
165
  """Decode a single token ID to text using the appropriate tokenizer."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
  if tokenizer is None:
167
  return f"[{token_id}]"
168
  try:
169
  if model_type in ["rwkv", "rwkv7"]:
170
- # RWKV tokenizer uses decode method
171
- decoded = tokenizer.decode([token_id])
172
- return decoded if decoded else f"[{token_id}]"
 
 
 
 
 
 
173
  else:
174
- # HuggingFace tokenizer
 
 
 
 
 
 
 
 
 
 
 
 
 
 
175
  decoded = tokenizer.decode([token_id])
 
 
176
  return decoded if decoded else f"[{token_id}]"
177
  except Exception as e:
178
  print(f"Warning: Failed to decode token {token_id} ({model_type}): {e}")
@@ -250,9 +297,9 @@ def generate_comparison_html(
250
 
251
  def get_tokens_for_range(byte_start, byte_end, token_list):
252
  result = []
253
- for t_start, t_end, token_id, t_str in token_list:
254
  if t_start < byte_end and t_end > byte_start:
255
- result.append((token_id, t_str))
256
  return result
257
 
258
  # Build tokens based on common boundaries
@@ -262,15 +309,18 @@ def generate_comparison_html(
262
  start_byte = common_boundaries[i]
263
  end_byte = common_boundaries[i + 1]
264
  token_bytes = text_bytes[start_byte:end_byte]
 
265
  try:
266
  token_text = token_bytes.decode("utf-8")
267
  except UnicodeDecodeError:
268
- continue
 
 
269
 
270
  qwen_toks = get_tokens_for_range(start_byte, end_byte, qwen_tokens)
271
  rwkv_toks = get_tokens_for_range(start_byte, end_byte, rwkv_tokens)
272
 
273
- if re.search(r"\w", token_text, re.UNICODE):
274
  tokens.append(
275
  {
276
  "type": "word",
@@ -334,11 +384,31 @@ def generate_comparison_html(
334
  model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
335
 
336
  # Build token info strings showing all tokens in this byte range
337
- # Model A (RWKV7) - show all tokens that overlap with this byte range
338
- model_a_info = ", ".join([f"[{idx}] {repr(s)}" for idx, s in token["rwkv_tokens"]])
339
-
340
- # Model B (Qwen3) - show all tokens that overlap with this byte range
341
- model_b_info = ", ".join([f"[{idx}] {repr(s)}" for idx, s in token["qwen_tokens"]])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
342
 
343
  raw_bytes = list(text_bytes[byte_start:byte_end])
344
  losses_a = byte_losses_a[byte_start:byte_end]
@@ -359,14 +429,20 @@ def generate_comparison_html(
359
  if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
360
  pred = topk_predictions_a[model_a_token_idx]
361
  try:
362
- decoded_pred = [
363
- pred[0],
364
- pred[1],
365
- [[tid, prob, decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
366
- ]
367
- # Use base64 encoding to avoid escaping issues
368
- import base64
369
-
 
 
 
 
 
 
370
  topk_a_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
371
  except Exception as e:
372
  pass
@@ -375,10 +451,16 @@ def generate_comparison_html(
375
  if model_b_token_idx is not None and model_b_token_idx < len(topk_predictions_b):
376
  pred = topk_predictions_b[model_b_token_idx]
377
  try:
378
- decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
379
- # Use base64 encoding to avoid escaping issues
380
- import base64
381
-
 
 
 
 
 
 
382
  topk_b_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
383
  except Exception as e:
384
  pass
@@ -607,7 +689,31 @@ def generate_comparison_html(
607
  display: flex;
608
  gap: 4px;
609
  padding: 1px 0;
 
 
 
 
 
610
  align-items: center;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
611
  }}
612
  #tooltip .topk-rank {{
613
  color: #888;
@@ -618,11 +724,15 @@ def generate_comparison_html(
618
  }}
619
  #tooltip .topk-token {{
620
  color: #a5f3fc;
621
- max-width: 100px;
622
- overflow: hidden;
623
- text-overflow: ellipsis;
624
- white-space: nowrap;
625
  font-family: monospace;
 
 
 
 
 
626
  }}
627
  #tooltip .topk-prob {{
628
  color: #86efac;
@@ -751,8 +861,8 @@ def generate_comparison_html(
751
 
752
  tokenSpans.forEach(token => {{
753
  token.addEventListener('mouseenter', (e) => {{
754
- const modelA = token.getAttribute('data-model-a') || 'N/A';
755
- const modelB = token.getAttribute('data-model-b') || 'N/A';
756
  const bytes = token.getAttribute('data-bytes') || '';
757
  const compressionA = token.getAttribute('data-compression-a') || '';
758
  const compressionB = token.getAttribute('data-compression-b') || '';
@@ -761,18 +871,52 @@ def generate_comparison_html(
761
  const top5A = token.getAttribute('data-topk-a') || '';
762
  const top5B = token.getAttribute('data-topk-b') || '';
763
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
764
  function formatTopkColumn(topkBase64, modelName, titleClass) {{
765
  if (!topkBase64) return '<div class="topk-column"><div class="topk-title ' + titleClass + '">' + modelName + '</div><div class="topk-list">N/A</div></div>';
766
  try {{
767
- // Decode base64 to UTF-8 string (atob() doesn't support UTF-8, need proper decoding)
768
- const binaryString = atob(topkBase64);
769
- const bytes = new Uint8Array(binaryString.length);
770
- for (let i = 0; i < binaryString.length; i++) {{
771
- bytes[i] = binaryString.charCodeAt(i);
 
 
 
 
772
  }}
773
- const topkJson = new TextDecoder('utf-8').decode(bytes);
774
- const data = JSON.parse(topkJson);
775
- const [actualId, rank, topkList] = data;
776
  let html = '<div class="topk-column">';
777
  html += '<div class="topk-title ' + titleClass + '">' + modelName + '</div>';
778
  html += '<div class="topk-list">';
@@ -780,8 +924,13 @@ def generate_comparison_html(
780
  const [tokenId, prob, tokenText] = item;
781
  const isHit = tokenId === actualId;
782
  const rankClass = isHit ? 'topk-rank hit' : 'topk-rank';
783
- const displayText = tokenText || '[' + tokenId + ']';
784
- const escapedText = displayText.replace(/</g, '&lt;').replace(/>/g, '&gt;');
 
 
 
 
 
785
  html += '<div class="topk-item">';
786
  html += '<span class="' + rankClass + '">' + (idx + 1) + '.</span>';
787
  html += '<span class="topk-token" title="ID: ' + tokenId + '">' + escapedText + '</span>';
@@ -790,7 +939,12 @@ def generate_comparison_html(
790
  html += '</div>';
791
  }});
792
  if (rank > 10) {{
793
- html += '<div class="topk-item topk-miss">Actual rank: ' + rank + '</div>';
 
 
 
 
 
794
  }}
795
  html += '</div></div>';
796
  return html;
@@ -801,13 +955,45 @@ def generate_comparison_html(
801
  }}
802
  }}
803
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804
  let tooltipHtml = `
805
  <div><span class="label">Bytes:</span> <span class="bytes">${{bytes || '(empty)'}}</span></div>
806
  <div><span class="label">RWKV Compression Rate:</span> <span class="loss-a">${{compressionA || '(empty)'}}${{avgCompressionA ? ' (avg: ' + avgCompressionA + '%)' : ''}}</span></div>
807
  <div><span class="label">Qwen Compression Rate:</span> <span class="loss-b">${{compressionB || '(empty)'}}${{avgCompressionB ? ' (avg: ' + avgCompressionB + '%)' : ''}}</span></div>
808
  <hr style="border-color: #555; margin: 6px 0;">
809
- <div><span class="label">RWKV:</span> <span class="model-a">${{modelA || '(empty)'}}</span></div>
810
- <div><span class="label">Qwen:</span> <span class="model-b">${{modelB || '(empty)'}}</span></div>
811
  `;
812
  if (top5A || top5B) {{
813
  tooltipHtml += '<div class="topk-section"><div class="topk-container">';
 
4
  Generates interactive HTML visualizations comparing byte-level losses between two models.
5
  """
6
 
7
+ import base64
8
  import json
9
  import math
10
  import re
 
21
  # Global tokenizers (lazy loaded)
22
  _qwen_tokenizer = None
23
  _rwkv_tokenizer = None
24
+ _token_bytes_converter_cache = {}
25
 
26
 
27
  def get_qwen_tokenizer():
 
85
  byte_pos = 0
86
  for idx, (token_id, token_bytes) in enumerate(qwen_id_and_bytes):
87
  start = byte_pos
88
+ token_bytes_blob = bytes(token_bytes)
89
+ end = byte_pos + len(token_bytes_blob)
90
+ qwen_tokens.append((start, end, token_id, token_bytes_blob))
 
 
 
91
  byte_to_qwen[start] = idx
92
  byte_pos = end
93
 
 
105
  token_bytes = rwkv_tokenizer.decodeBytes([token_id])
106
  start = byte_pos
107
  end = byte_pos + len(token_bytes)
108
+ rwkv_tokens.append((start, end, token_id, token_bytes))
 
 
 
 
109
  byte_to_rwkv[start] = idx
110
  byte_pos = end
111
 
112
+ # Get common boundaries, but keep only UTF-8 codepoint boundaries
113
  qwen_boundaries = set([0] + [t[1] for t in qwen_tokens])
114
  rwkv_boundaries = set([0] + [t[1] for t in rwkv_tokens])
115
+ utf8_boundaries = set([0])
116
+ byte_pos = 0
117
+ for ch in text:
118
+ byte_pos += len(ch.encode("utf-8"))
119
+ utf8_boundaries.add(byte_pos)
120
+ common_boundaries = sorted(qwen_boundaries & rwkv_boundaries & utf8_boundaries)
121
+ # Ensure we always include the end boundary
122
+ text_end = len(text.encode("utf-8"))
123
+ if text_end not in common_boundaries:
124
+ common_boundaries.append(text_end)
125
+ common_boundaries = sorted(common_boundaries)
126
 
127
  return {
128
  "common_boundaries": common_boundaries,
 
168
 
169
  def decode_token(token_id: int, tokenizer, model_type: str) -> str:
170
  """Decode a single token ID to text using the appropriate tokenizer."""
171
+ def bytes_to_hex_str(byte_values) -> str:
172
+ return "".join([f"\\x{b:02x}" for b in byte_values])
173
+
174
+ def get_bytes_converter(tokenizer):
175
+ if tokenizer is None:
176
+ return None
177
+ key = getattr(tokenizer, "name_or_path", None)
178
+ if not key:
179
+ key = str(id(tokenizer))
180
+ if key not in _token_bytes_converter_cache:
181
+ try:
182
+ _token_bytes_converter_cache[key] = TokenizerBytesConverter(
183
+ model_name_or_path=getattr(tokenizer, "name_or_path", None),
184
+ tokenizer=tokenizer,
185
+ trust_remote_code=True,
186
+ )
187
+ except Exception:
188
+ _token_bytes_converter_cache[key] = None
189
+ return _token_bytes_converter_cache.get(key)
190
+
191
  if tokenizer is None:
192
  return f"[{token_id}]"
193
  try:
194
  if model_type in ["rwkv", "rwkv7"]:
195
+ # RWKV tokenizer provides raw bytes
196
+ token_bytes = tokenizer.decodeBytes([token_id])
197
+ if token_bytes:
198
+ try:
199
+ decoded = token_bytes.decode("utf-8")
200
+ return decoded if decoded else f"[{token_id}]"
201
+ except UnicodeDecodeError:
202
+ return bytes_to_hex_str(token_bytes)
203
+ return f"[{token_id}]"
204
  else:
205
+ # HuggingFace tokenizer: prefer raw bytes when possible
206
+ converter = get_bytes_converter(tokenizer)
207
+ token_bytes = None
208
+ if converter is not None:
209
+ try:
210
+ token_bytes = converter.token_to_bytes(token_id)
211
+ except Exception:
212
+ token_bytes = None
213
+ if token_bytes:
214
+ try:
215
+ decoded = bytes(token_bytes).decode("utf-8")
216
+ return decoded if decoded else f"[{token_id}]"
217
+ except UnicodeDecodeError:
218
+ return bytes_to_hex_str(token_bytes)
219
+
220
  decoded = tokenizer.decode([token_id])
221
+ if decoded and "�" not in decoded:
222
+ return decoded
223
  return decoded if decoded else f"[{token_id}]"
224
  except Exception as e:
225
  print(f"Warning: Failed to decode token {token_id} ({model_type}): {e}")
 
297
 
298
  def get_tokens_for_range(byte_start, byte_end, token_list):
299
  result = []
300
+ for t_start, t_end, token_id, t_bytes in token_list:
301
  if t_start < byte_end and t_end > byte_start:
302
+ result.append((token_id, t_bytes))
303
  return result
304
 
305
  # Build tokens based on common boundaries
 
309
  start_byte = common_boundaries[i]
310
  end_byte = common_boundaries[i + 1]
311
  token_bytes = text_bytes[start_byte:end_byte]
312
+ decoded_ok = True
313
  try:
314
  token_text = token_bytes.decode("utf-8")
315
  except UnicodeDecodeError:
316
+ # Show raw bytes when UTF-8 decoding fails
317
+ token_text = "".join([f"\\x{b:02x}" for b in token_bytes])
318
+ decoded_ok = False
319
 
320
  qwen_toks = get_tokens_for_range(start_byte, end_byte, qwen_tokens)
321
  rwkv_toks = get_tokens_for_range(start_byte, end_byte, rwkv_tokens)
322
 
323
+ if decoded_ok and re.search(r"\w", token_text, re.UNICODE):
324
  tokens.append(
325
  {
326
  "type": "word",
 
384
  model_b_token_idx = find_token_for_byte(byte_start, model_b_token_ranges)
385
 
386
  # Build token info strings showing all tokens in this byte range
387
+ def token_bytes_to_display_text(token_bytes: bytes) -> str:
388
+ if token_bytes is None:
389
+ return ""
390
+ if isinstance(token_bytes, list):
391
+ token_bytes = bytes(token_bytes)
392
+ if isinstance(token_bytes, str):
393
+ return token_bytes
394
+ if len(token_bytes) == 0:
395
+ return ""
396
+ try:
397
+ return token_bytes.decode("utf-8")
398
+ except UnicodeDecodeError:
399
+ return "".join([f"\\x{b:02x}" for b in token_bytes])
400
+
401
+ # Model A (RWKV7) - tokens overlapping this byte range
402
+ model_a_info = ""
403
+ if token["rwkv_tokens"]:
404
+ model_a_list = [[tid, token_bytes_to_display_text(tb)] for tid, tb in token["rwkv_tokens"]]
405
+ model_a_info = base64.b64encode(json.dumps(model_a_list, ensure_ascii=False).encode("utf-8")).decode("ascii")
406
+
407
+ # Model B (Qwen3) - tokens overlapping this byte range
408
+ model_b_info = ""
409
+ if token["qwen_tokens"]:
410
+ model_b_list = [[tid, token_bytes_to_display_text(tb)] for tid, tb in token["qwen_tokens"]]
411
+ model_b_info = base64.b64encode(json.dumps(model_b_list, ensure_ascii=False).encode("utf-8")).decode("ascii")
412
 
413
  raw_bytes = list(text_bytes[byte_start:byte_end])
414
  losses_a = byte_losses_a[byte_start:byte_end]
 
429
  if model_a_token_idx is not None and model_a_token_idx < len(topk_predictions_a):
430
  pred = topk_predictions_a[model_a_token_idx]
431
  try:
432
+ if len(pred) >= 4:
433
+ actual_id, rank, actual_prob, topk_list = pred[0], pred[1], pred[2], pred[3]
434
+ decoded_pred = [
435
+ actual_id,
436
+ rank,
437
+ actual_prob,
438
+ [[tid, prob, decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in topk_list],
439
+ ]
440
+ else:
441
+ decoded_pred = [
442
+ pred[0],
443
+ pred[1],
444
+ [[tid, prob, decode_token(tid, tokenizer_a, model_type_a)] for tid, prob in pred[2]],
445
+ ]
446
  topk_a_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
447
  except Exception as e:
448
  pass
 
451
  if model_b_token_idx is not None and model_b_token_idx < len(topk_predictions_b):
452
  pred = topk_predictions_b[model_b_token_idx]
453
  try:
454
+ if len(pred) >= 4:
455
+ actual_id, rank, actual_prob, topk_list = pred[0], pred[1], pred[2], pred[3]
456
+ decoded_pred = [
457
+ actual_id,
458
+ rank,
459
+ actual_prob,
460
+ [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in topk_list],
461
+ ]
462
+ else:
463
+ decoded_pred = [pred[0], pred[1], [[tid, prob, decode_token(tid, tokenizer_b, model_type_b)] for tid, prob in pred[2]]]
464
  topk_b_json = base64.b64encode(json.dumps(decoded_pred, ensure_ascii=False).encode("utf-8")).decode("ascii")
465
  except Exception as e:
466
  pass
 
689
  display: flex;
690
  gap: 4px;
691
  padding: 1px 0;
692
+ align-items: flex-start;
693
+ }}
694
+ #tooltip .token-block {{
695
+ margin-top: 6px;
696
+ display: flex;
697
  align-items: center;
698
+ gap: 6px;
699
+ white-space: nowrap;
700
+ }}
701
+ #tooltip .token-chips {{
702
+ display: flex;
703
+ flex-wrap: nowrap;
704
+ gap: 4px;
705
+ }}
706
+ #tooltip .token-chip-group {{
707
+ display: inline-flex;
708
+ align-items: center;
709
+ gap: 4px;
710
+ }}
711
+ #tooltip .token-id {{
712
+ color: #888;
713
+ font-family: monospace;
714
+ }}
715
+ #tooltip .token-chip {{
716
+ max-width: 100%;
717
  }}
718
  #tooltip .topk-rank {{
719
  color: #888;
 
724
  }}
725
  #tooltip .topk-token {{
726
  color: #a5f3fc;
727
+ white-space: pre-wrap;
728
+ overflow-wrap: anywhere;
729
+ word-break: break-word;
 
730
  font-family: monospace;
731
+ background-color: rgba(255, 255, 255, 0.08);
732
+ padding: 0 4px;
733
+ border-radius: 3px;
734
+ display: inline-block;
735
+ max-width: 100%;
736
  }}
737
  #tooltip .topk-prob {{
738
  color: #86efac;
 
861
 
862
  tokenSpans.forEach(token => {{
863
  token.addEventListener('mouseenter', (e) => {{
864
+ const modelA = token.getAttribute('data-model-a') || '';
865
+ const modelB = token.getAttribute('data-model-b') || '';
866
  const bytes = token.getAttribute('data-bytes') || '';
867
  const compressionA = token.getAttribute('data-compression-a') || '';
868
  const compressionB = token.getAttribute('data-compression-b') || '';
 
871
  const top5A = token.getAttribute('data-topk-a') || '';
872
  const top5B = token.getAttribute('data-topk-b') || '';
873
 
874
+ function decodeBase64Json(base64Str) {{
875
+ const binaryString = atob(base64Str);
876
+ const bytes = new Uint8Array(binaryString.length);
877
+ for (let i = 0; i < binaryString.length; i++) {{
878
+ bytes[i] = binaryString.charCodeAt(i);
879
+ }}
880
+ const jsonStr = new TextDecoder('utf-8').decode(bytes);
881
+ return JSON.parse(jsonStr);
882
+ }}
883
+
884
+ function escapeControlChars(text) {{
885
+ if (!text) return text;
886
+ let out = '';
887
+ for (let i = 0; i < text.length; i++) {{
888
+ const ch = text[i];
889
+ const code = text.charCodeAt(i);
890
+ if (ch === '\\\\') {{
891
+ out += '\\\\\\\\';
892
+ }} else if (ch === '\\n') {{
893
+ out += '\\\\n';
894
+ }} else if (ch === '\\r') {{
895
+ out += '\\\\r';
896
+ }} else if (ch === '\\t') {{
897
+ out += '\\\\t';
898
+ }} else if (code < 32 || code === 127) {{
899
+ out += '\\\\x' + code.toString(16).padStart(2, '0');
900
+ }} else {{
901
+ out += ch;
902
+ }}
903
+ }}
904
+ return out;
905
+ }}
906
+
907
  function formatTopkColumn(topkBase64, modelName, titleClass) {{
908
  if (!topkBase64) return '<div class="topk-column"><div class="topk-title ' + titleClass + '">' + modelName + '</div><div class="topk-list">N/A</div></div>';
909
  try {{
910
+ const data = decodeBase64Json(topkBase64);
911
+ let actualId = null;
912
+ let rank = null;
913
+ let actualProb = null;
914
+ let topkList = [];
915
+ if (data.length >= 4) {{
916
+ [actualId, rank, actualProb, topkList] = data;
917
+ }} else {{
918
+ [actualId, rank, topkList] = data;
919
  }}
 
 
 
920
  let html = '<div class="topk-column">';
921
  html += '<div class="topk-title ' + titleClass + '">' + modelName + '</div>';
922
  html += '<div class="topk-list">';
 
924
  const [tokenId, prob, tokenText] = item;
925
  const isHit = tokenId === actualId;
926
  const rankClass = isHit ? 'topk-rank hit' : 'topk-rank';
927
+ const rawText = (tokenText !== undefined && tokenText !== null) ? tokenText : '';
928
+ const visibleText = escapeControlChars(rawText);
929
+ const displayText = (visibleText !== '') ? visibleText : ('[' + tokenId + ']');
930
+ const escapedText = displayText
931
+ .replace(/&/g, '&amp;')
932
+ .replace(/</g, '&lt;')
933
+ .replace(/>/g, '&gt;');
934
  html += '<div class="topk-item">';
935
  html += '<span class="' + rankClass + '">' + (idx + 1) + '.</span>';
936
  html += '<span class="topk-token" title="ID: ' + tokenId + '">' + escapedText + '</span>';
 
939
  html += '</div>';
940
  }});
941
  if (rank > 10) {{
942
+ let probSuffix = '';
943
+ const probVal = parseFloat(actualProb);
944
+ if (!isNaN(probVal)) {{
945
+ probSuffix = ' (' + (probVal * 100).toFixed(4) + '%)';
946
+ }}
947
+ html += '<div class="topk-item topk-miss">Actual rank: ' + rank + probSuffix + '</div>';
948
  }}
949
  html += '</div></div>';
950
  return html;
 
955
  }}
956
  }}
957
 
958
+ function formatTokenChips(modelBase64, label, labelClass) {{
959
+ if (!modelBase64) {{
960
+ return '<div class="token-block"><span class="label ' + labelClass + '">' + label + ':</span> <span class="topk-token token-chip">N/A</span></div>';
961
+ }}
962
+ try {{
963
+ const tokenList = decodeBase64Json(modelBase64);
964
+ let html = '<div class="token-block">';
965
+ html += '<span class="label ' + labelClass + '">' + label + ':</span>';
966
+ html += '<div class="token-chips">';
967
+ tokenList.forEach((item) => {{
968
+ const tokenId = item[0];
969
+ const tokenText = item[1];
970
+ const visible = escapeControlChars(tokenText || '');
971
+ const displayText = (visible !== '') ? visible : '';
972
+ const escapedText = displayText
973
+ .replace(/&/g, '&amp;')
974
+ .replace(/</g, '&lt;')
975
+ .replace(/>/g, '&gt;');
976
+ html += '<span class="token-chip-group" title="ID: ' + tokenId + '">';
977
+ html += '<span class="token-id">[' + tokenId + ']</span>';
978
+ html += '<span class="topk-token token-chip">' + escapedText + '</span>';
979
+ html += '</span>';
980
+ }});
981
+ html += '</div></div>';
982
+ return html;
983
+ }} catch (e) {{
984
+ console.error('Error in formatTokenChips for ' + label + ':', e);
985
+ console.error('modelBase64:', modelBase64);
986
+ return '<div class="token-block"><span class="label ' + labelClass + '">' + label + ':</span> <span class="topk-token token-chip">Error: ' + e.message + '</span></div>';
987
+ }}
988
+ }}
989
+
990
  let tooltipHtml = `
991
  <div><span class="label">Bytes:</span> <span class="bytes">${{bytes || '(empty)'}}</span></div>
992
  <div><span class="label">RWKV Compression Rate:</span> <span class="loss-a">${{compressionA || '(empty)'}}${{avgCompressionA ? ' (avg: ' + avgCompressionA + '%)' : ''}}</span></div>
993
  <div><span class="label">Qwen Compression Rate:</span> <span class="loss-b">${{compressionB || '(empty)'}}${{avgCompressionB ? ' (avg: ' + avgCompressionB + '%)' : ''}}</span></div>
994
  <hr style="border-color: #555; margin: 6px 0;">
995
+ ${{formatTokenChips(modelA, 'RWKV', 'model-a')}}
996
+ ${{formatTokenChips(modelB, 'Qwen', 'model-b')}}
997
  `;
998
  if (top5A || top5B) {{
999
  tooltipHtml += '<div class="topk-section"><div class="topk-container">';