Jellyfish042 commited on
Commit
15b2f1f
·
1 Parent(s): 3ef3c58
.claude/settings.local.json CHANGED
@@ -8,7 +8,8 @@
8
  "Bash(git branch:*)",
9
  "Bash(git commit -m \"$\\(cat <<''EOF''\nFix Gradio compatibility for HuggingFace Spaces\n\n- Upgrade gradio to >=5.0.0 to fix API schema bug\n- Add server_name and server_port to demo.launch\\(\\)\n\nCo-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
10
  "Bash(git commit:*)",
11
- "Bash(git reset:*)"
 
12
  ]
13
  }
14
  }
 
8
  "Bash(git branch:*)",
9
  "Bash(git commit -m \"$\\(cat <<''EOF''\nFix Gradio compatibility for HuggingFace Spaces\n\n- Upgrade gradio to >=5.0.0 to fix API schema bug\n- Add server_name and server_port to demo.launch\\(\\)\n\nCo-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
10
  "Bash(git commit:*)",
11
+ "Bash(git reset:*)",
12
+ "Bash(and top-10 predictions\" to better reflect what users see in the tooltip.\nAlso updated color legend to match the swapped model positions.\n\nCo-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>\nEOF\n\\)\")"
13
  ]
14
  }
15
  }
app.py CHANGED
@@ -35,6 +35,12 @@ _qwen_tokenizer = None
35
  _rwkv_model = None
36
  _rwkv_tokenizer = None
37
  _rwkv_model_path = None
 
 
 
 
 
 
38
 
39
 
40
  def download_rwkv_model(progress=None):
@@ -124,12 +130,36 @@ def validate_input(text: str) -> tuple[bool, str]:
124
  return True, text
125
 
126
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  def initialize_models():
128
  """Initialize and cache both models at startup."""
129
- global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _rwkv_model_path
130
 
131
  print("Initializing models...")
132
 
 
 
 
133
  # Download RWKV model if needed
134
  print("Checking RWKV7 model...")
135
  _rwkv_model_path = download_rwkv_model()
@@ -142,6 +172,10 @@ def initialize_models():
142
  print("Loading RWKV7-G1C-1.5B...")
143
  _rwkv_model, _rwkv_tokenizer = load_rwkv7_model(_rwkv_model_path)
144
 
 
 
 
 
145
  print("Models loaded successfully!")
146
 
147
 
@@ -165,7 +199,7 @@ def run_evaluation(text: str, progress=gr.Progress()):
165
  from visualization.html_generator import generate_comparison_html
166
 
167
  # Use cached models
168
- global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer
169
 
170
  # Validate input
171
  valid, result = validate_input(text)
@@ -177,12 +211,38 @@ def run_evaluation(text: str, progress=gr.Progress()):
177
  try:
178
  # Step 1: Evaluate Qwen (using cached model)
179
  progress(0, desc="Evaluating with Qwen3...")
 
 
 
 
 
 
180
  result_qwen = evaluate_hf_single_sample(_qwen_model, _qwen_tokenizer, text, bos_mode="add_newline_token")
181
 
 
 
 
 
 
 
 
182
  # Step 2: Evaluate RWKV7 (using cached model)
183
  progress(0, desc="Evaluating with RWKV7...")
 
 
 
 
 
 
184
  result_rwkv = evaluate_rwkv7_single_sample(_rwkv_model, _rwkv_tokenizer, text)
185
 
 
 
 
 
 
 
 
186
  # Step 3: Generate visualization
187
  progress(0, desc="Generating visualization...")
188
  html = generate_comparison_html(
@@ -202,7 +262,24 @@ def run_evaluation(text: str, progress=gr.Progress()):
202
  # Wrap HTML for iframe display
203
  wrapped_html = wrap_html_in_iframe(html)
204
 
205
- return wrapped_html
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  except torch.cuda.OutOfMemoryError:
208
  if torch.cuda.is_available():
@@ -218,7 +295,18 @@ def run_evaluation(text: str, progress=gr.Progress()):
218
 
219
  def clear_inputs():
220
  """Clear all inputs and outputs."""
221
- return "", None
 
 
 
 
 
 
 
 
 
 
 
222
 
223
 
224
  # Build Gradio UI
@@ -245,14 +333,22 @@ with gr.Blocks(title="Compression-Lens: RWKV-7 vs Qwen3", theme=gr.themes.Soft()
245
 
246
  gr.Markdown("---")
247
 
 
 
 
 
 
248
  with gr.Row():
249
  with gr.Column():
250
  output_html = gr.HTML(label="Visualization")
251
 
252
  # Event handlers
253
- clear_btn.click(fn=clear_inputs, outputs=[text_input, output_html])
 
 
254
 
255
- run_btn.click(fn=run_evaluation, inputs=[text_input], outputs=[output_html])
 
256
 
257
 
258
  if __name__ == "__main__":
 
35
# Lazily-populated model handles; filled in once by initialize_models()
# at startup so run_evaluation() can reuse them across requests.
_rwkv_model = None
_rwkv_tokenizer = None
_rwkv_model_path = None
_stats_manager = None  # InferenceStatsManager for inference-time prediction

# Precomputed example cache
# Populated by load_precomputed_example() from files generated by
# precompute_example.py; shown instantly on page load.
_precomputed_html = None
_precomputed_text = None
PRECOMPUTED_DIR = SCRIPT_DIR / "precomputed"
44
 
45
 
46
  def download_rwkv_model(progress=None):
 
130
  return True, text
131
 
132
 
133
def load_precomputed_example():
    """Load the precomputed example visualization and its source text into the module cache.

    Reads the HTML and metadata files produced by precompute_example.py from
    PRECOMPUTED_DIR and stores them in the module-level cache globals.

    Returns:
        True when both files were found and loaded, False otherwise.
    """
    global _precomputed_html, _precomputed_text

    html_path = PRECOMPUTED_DIR / "example_visualization.html"
    metadata_path = PRECOMPUTED_DIR / "example_metadata.json"

    # Both artifacts are required; bail out early if either is missing.
    if not (html_path.exists() and metadata_path.exists()):
        print("No precomputed example found. Run precompute_example.py first.")
        return False

    import json

    _precomputed_html = html_path.read_text(encoding="utf-8")
    metadata = json.loads(metadata_path.read_text(encoding="utf-8"))
    _precomputed_text = metadata.get("example_text", "")
    print(f"Loaded precomputed example ({len(_precomputed_text)} chars)")
    return True
152
+
153
+
154
  def initialize_models():
155
  """Initialize and cache both models at startup."""
156
+ global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _rwkv_model_path, _stats_manager
157
 
158
  print("Initializing models...")
159
 
160
+ # Load precomputed example first
161
+ load_precomputed_example()
162
+
163
  # Download RWKV model if needed
164
  print("Checking RWKV7 model...")
165
  _rwkv_model_path = download_rwkv_model()
 
172
  print("Loading RWKV7-G1C-1.5B...")
173
  _rwkv_model, _rwkv_tokenizer = load_rwkv7_model(_rwkv_model_path)
174
 
175
+ # Initialize stats manager
176
+ from core.inference_stats import InferenceStatsManager
177
+ _stats_manager = InferenceStatsManager()
178
+
179
  print("Models loaded successfully!")
180
 
181
 
 
199
  from visualization.html_generator import generate_comparison_html
200
 
201
  # Use cached models
202
+ global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _stats_manager
203
 
204
  # Validate input
205
  valid, result = validate_input(text)
 
211
  try:
212
  # Step 1: Evaluate Qwen (using cached model)
213
  progress(0, desc="Evaluating with Qwen3...")
214
+
215
+ # Get token count for prediction
216
+ qwen_inputs = _qwen_tokenizer(text, return_tensors="pt", add_special_tokens=False)
217
+ qwen_token_count = qwen_inputs["input_ids"].shape[-1]
218
+ qwen_predicted_time = _stats_manager.predict_time("qwen", qwen_token_count)
219
+
220
  result_qwen = evaluate_hf_single_sample(_qwen_model, _qwen_tokenizer, text, bos_mode="add_newline_token")
221
 
222
+ # Save stats and print comparison
223
+ _stats_manager.add_record("qwen", qwen_token_count, result_qwen["inference_time"])
224
+ if qwen_predicted_time is not None:
225
+ print(f"Qwen3 completed in {result_qwen['inference_time']:.2f}s (predicted: {qwen_predicted_time:.2f}s)")
226
+ else:
227
+ print(f"Qwen3 completed in {result_qwen['inference_time']:.2f}s")
228
+
229
  # Step 2: Evaluate RWKV7 (using cached model)
230
  progress(0, desc="Evaluating with RWKV7...")
231
+
232
+ # Get token count for prediction
233
+ rwkv_tokenized = _rwkv_tokenizer.encode(text)
234
+ rwkv_token_count = len(rwkv_tokenized.ids if hasattr(rwkv_tokenized, "ids") else rwkv_tokenized)
235
+ rwkv_predicted_time = _stats_manager.predict_time("rwkv", rwkv_token_count)
236
+
237
  result_rwkv = evaluate_rwkv7_single_sample(_rwkv_model, _rwkv_tokenizer, text)
238
 
239
+ # Save stats and print comparison
240
+ _stats_manager.add_record("rwkv", rwkv_token_count, result_rwkv["inference_time"])
241
+ if rwkv_predicted_time is not None:
242
+ print(f"RWKV7 completed in {result_rwkv['inference_time']:.2f}s (predicted: {rwkv_predicted_time:.2f}s)")
243
+ else:
244
+ print(f"RWKV7 completed in {result_rwkv['inference_time']:.2f}s")
245
+
246
  # Step 3: Generate visualization
247
  progress(0, desc="Generating visualization...")
248
  html = generate_comparison_html(
 
262
  # Wrap HTML for iframe display
263
  wrapped_html = wrap_html_in_iframe(html)
264
 
265
+ # Generate timing information for UI display
266
+ timing_lines = ["## ⏱️ Inference Timing\n"]
267
+
268
+ # Qwen timing
269
+ if qwen_predicted_time is not None:
270
+ timing_lines.append(f"**Qwen3-1.7B-Base**: {result_qwen['inference_time']:.2f}s (predicted: {qwen_predicted_time:.2f}s)")
271
+ else:
272
+ timing_lines.append(f"**Qwen3-1.7B-Base**: {result_qwen['inference_time']:.2f}s")
273
+
274
+ # RWKV timing
275
+ if rwkv_predicted_time is not None:
276
+ timing_lines.append(f"**RWKV7-G1C-1.5B**: {result_rwkv['inference_time']:.2f}s (predicted: {rwkv_predicted_time:.2f}s)")
277
+ else:
278
+ timing_lines.append(f"**RWKV7-G1C-1.5B**: {result_rwkv['inference_time']:.2f}s")
279
+
280
+ timing_text = "\n\n".join(timing_lines)
281
+
282
+ return wrapped_html, timing_text
283
 
284
  except torch.cuda.OutOfMemoryError:
285
  if torch.cuda.is_available():
 
295
 
296
def clear_inputs():
    """Reset the UI: empty text box, cleared visualization pane, empty timing text."""
    cleared_text = ""
    cleared_visualization = None
    cleared_timing = ""
    return cleared_text, cleared_visualization, cleared_timing
299
+
300
+
301
def get_default_example():
    """Return (example_text, wrapped_html, timing_text) for display on page load.

    Serves the cached precomputed example when available; otherwise yields
    empty outputs so the page simply starts blank.
    """
    global _precomputed_html, _precomputed_text

    # No cached example -> blank initial state.
    if not (_precomputed_html and _precomputed_text):
        return "", None, ""

    return _precomputed_text, wrap_html_in_iframe(_precomputed_html), ""
310
 
311
 
312
  # Build Gradio UI
 
333
 
334
  gr.Markdown("---")
335
 
336
+ # Timing information display
337
+ with gr.Row():
338
+ with gr.Column():
339
+ timing_info = gr.Markdown(label="Inference Timing")
340
+
341
  with gr.Row():
342
  with gr.Column():
343
  output_html = gr.HTML(label="Visualization")
344
 
345
  # Event handlers
346
+ clear_btn.click(fn=clear_inputs, outputs=[text_input, output_html, timing_info])
347
+
348
+ run_btn.click(fn=run_evaluation, inputs=[text_input], outputs=[output_html, timing_info])
349
 
350
+ # Load default example on page load
351
+ demo.load(fn=get_default_example, outputs=[text_input, output_html, timing_info])
352
 
353
 
354
  if __name__ == "__main__":
core/evaluator.py CHANGED
@@ -7,6 +7,7 @@ Provides single-sample evaluation functions for Qwen3 and RWKV7 models.
7
  import gc
8
  import math
9
  import os
 
10
  from typing import List, Dict, Any, Optional
11
 
12
  import torch
@@ -95,6 +96,8 @@ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_
95
  Returns:
96
  dict with byte_wise_losses, top5_predictions, compression_rate, etc.
97
  """
 
 
98
  # Create token-to-bytes converter
99
  token2bytes_converter = TokenizerBytesConverter(model_name_or_path=tokenizer.name_or_path, tokenizer=tokenizer)
100
 
@@ -162,6 +165,7 @@ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_
162
  num_bytes = len(text.encode("utf-8"))
163
  avg_loss = total_loss / seq_length
164
  compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
 
165
 
166
  return {
167
  "byte_wise_losses": byte_wise_losses,
@@ -172,6 +176,7 @@ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_
172
  "num_bytes": num_bytes,
173
  "model_name": getattr(model.config, "_name_or_path", "unknown"),
174
  "tokenizer": tokenizer,
 
175
  }
176
 
177
 
@@ -188,6 +193,8 @@ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
188
  Returns:
189
  dict with byte_wise_losses, top5_predictions, compression_rate, etc.
190
  """
 
 
191
  # Tokenize
192
  tokenized = tokenizer.encode(text)
193
  if hasattr(tokenized, "ids"):
@@ -239,6 +246,7 @@ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
239
  num_bytes = len(text.encode("utf-8"))
240
  avg_loss = total_loss / input_length
241
  compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
 
242
 
243
  return {
244
  "byte_wise_losses": byte_wise_losses,
@@ -249,4 +257,5 @@ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
249
  "num_bytes": num_bytes,
250
  "model_name": "RWKV7-G1C-1.5B",
251
  "tokenizer": tokenizer,
 
252
  }
 
7
  import gc
8
  import math
9
  import os
10
+ import time
11
  from typing import List, Dict, Any, Optional
12
 
13
  import torch
 
96
  Returns:
97
  dict with byte_wise_losses, top5_predictions, compression_rate, etc.
98
  """
99
+ start_time = time.time()
100
+
101
  # Create token-to-bytes converter
102
  token2bytes_converter = TokenizerBytesConverter(model_name_or_path=tokenizer.name_or_path, tokenizer=tokenizer)
103
 
 
165
  num_bytes = len(text.encode("utf-8"))
166
  avg_loss = total_loss / seq_length
167
  compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
168
+ inference_time = time.time() - start_time
169
 
170
  return {
171
  "byte_wise_losses": byte_wise_losses,
 
176
  "num_bytes": num_bytes,
177
  "model_name": getattr(model.config, "_name_or_path", "unknown"),
178
  "tokenizer": tokenizer,
179
+ "inference_time": inference_time,
180
  }
181
 
182
 
 
193
  Returns:
194
  dict with byte_wise_losses, top5_predictions, compression_rate, etc.
195
  """
196
+ start_time = time.time()
197
+
198
  # Tokenize
199
  tokenized = tokenizer.encode(text)
200
  if hasattr(tokenized, "ids"):
 
246
  num_bytes = len(text.encode("utf-8"))
247
  avg_loss = total_loss / input_length
248
  compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
249
+ inference_time = time.time() - start_time
250
 
251
  return {
252
  "byte_wise_losses": byte_wise_losses,
 
257
  "num_bytes": num_bytes,
258
  "model_name": "RWKV7-G1C-1.5B",
259
  "tokenizer": tokenizer,
260
+ "inference_time": inference_time,
261
  }
core/inference_stats.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Inference statistics manager for tracking and predicting model inference times.
3
+
4
+ This module provides functionality to:
5
+ - Record historical inference statistics (token count, inference time)
6
+ - Predict inference time using k-nearest neighbors algorithm
7
+ - Persist statistics to disk for cross-session usage
8
+ """
9
+
10
+ import json
11
+ import os
12
+ from datetime import datetime
13
+ from pathlib import Path
14
+ from typing import Dict, List, Optional, Tuple
15
+
16
+
17
class InferenceStatsManager:
    """Manages inference statistics for time prediction.

    Records (model_name, input_tokens, inference_time) triples in a JSON file
    under a per-user cache directory, and predicts the inference time of a new
    request with distance-weighted k-nearest neighbors over recorded token
    counts.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        """
        Initialize the statistics manager.

        Args:
            cache_dir: Optional custom cache directory. If None, uses a
                per-user default (LOCALAPPDATA on Windows, ~/.cache elsewhere)
                with an 'uncheatableeval_lens' subdirectory.
        """
        if cache_dir is None:
            # Use user's cache directory
            if os.name == 'nt':  # Windows
                # BUGFIX: os.path.expandvars(r'%LOCALAPPDATA%') returns the
                # literal string '%LOCALAPPDATA%' when the variable is unset,
                # which would create a directory literally named that. Read
                # the env var and fall back to the conventional location.
                base_cache = os.environ.get('LOCALAPPDATA') or os.path.expanduser(r'~\AppData\Local')
            else:  # Unix-like
                base_cache = os.path.expanduser('~/.cache')

            cache_dir = os.path.join(base_cache, 'uncheatableeval_lens')

        self.cache_dir = Path(cache_dir)
        self.stats_file = self.cache_dir / 'inference_stats.json'

        # Create cache directory if it doesn't exist
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _load_stats(self) -> List[Dict]:
        """
        Load statistics from the JSON file.

        Returns:
            List of statistics records; empty list if the file doesn't exist,
            is unreadable, or does not contain a JSON list.
        """
        if not self.stats_file.exists():
            return []

        try:
            with open(self.stats_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (json.JSONDecodeError, IOError) as e:
            print(f"Warning: Failed to load statistics file: {e}")
            return []

        # Guard against a hand-edited/corrupt file holding a non-list value,
        # which would break append() in add_record and filtering in predict_time.
        return data if isinstance(data, list) else []

    def _save_stats(self, stats: List[Dict]) -> None:
        """
        Save statistics to the JSON file.

        Args:
            stats: List of statistics records to save.
        """
        try:
            with open(self.stats_file, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=2, ensure_ascii=False)
        except IOError as e:
            # Best-effort persistence: a failed save must not break inference.
            print(f"Warning: Failed to save statistics file: {e}")

    def add_record(self, model_name: str, input_tokens: int, inference_time: float) -> None:
        """
        Add a new inference record to the statistics.

        Args:
            model_name: Name of the model ("qwen" or "rwkv")
            input_tokens: Number of input tokens
            inference_time: Inference time in seconds
        """
        stats = self._load_stats()
        stats.append({
            "model_name": model_name,
            "input_tokens": input_tokens,
            "inference_time": inference_time,
            "timestamp": datetime.now().isoformat(),
        })
        self._save_stats(stats)

    def _find_k_nearest(self, records: List[Dict], target_tokens: int, k: int) -> List[Tuple[Dict, float]]:
        """
        Find the k records closest to target_tokens by absolute token-count distance.

        Args:
            records: List of historical records
            target_tokens: Target token count
            k: Number of nearest neighbors to find

        Returns:
            List of (record, distance) tuples, sorted by ascending distance.
        """
        distances = [(record, abs(record["input_tokens"] - target_tokens)) for record in records]
        distances.sort(key=lambda pair: pair[1])
        return distances[:k]

    def predict_time(self, model_name: str, input_tokens: int, k: int = 5) -> Optional[float]:
        """
        Predict inference time using k-nearest neighbors.

        Uses inverse-distance weighting (weight = 1 / (1 + distance)), so
        records with token counts close to the query dominate the estimate.

        Args:
            model_name: Name of the model ("qwen" or "rwkv")
            input_tokens: Number of input tokens
            k: Number of nearest neighbors to use (default: 5)

        Returns:
            Predicted inference time in seconds, or None if no historical data
            exists for this model.
        """
        # Filter records for the specific model
        model_records = [r for r in self._load_stats() if r["model_name"] == model_name]
        if not model_records:
            return None

        nearest = self._find_k_nearest(model_records, input_tokens, k)
        if not nearest:
            return None

        # Weighted average using inverse distance weighting.
        total_weight = 0.0
        weighted_sum = 0.0
        for record, distance in nearest:
            weight = 1.0 / (1.0 + distance)
            weighted_sum += weight * record["inference_time"]
            total_weight += weight

        # total_weight is strictly positive whenever `nearest` is non-empty;
        # keep the guard purely as defensive programming.
        if total_weight == 0:
            return None

        return weighted_sum / total_weight
precompute_example.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Precompute example evaluation results for the default demo.
3
+
4
+ This script runs the evaluation on the example text and saves the results
5
+ so they can be loaded instantly when users visit the page.
6
+ """
7
+
8
+ import json
9
+ import os
10
+ import sys
11
+ from pathlib import Path
12
+
13
+ # Add parent directory to path
14
+ sys.path.insert(0, str(Path(__file__).parent))
15
+
16
+ import torch
17
+
18
# Get the directory where this script is located
SCRIPT_DIR = Path(__file__).parent.absolute()
MODELS_DIR = SCRIPT_DIR / "models"            # local cache for downloaded RWKV weights
SUPPORT_DIR = SCRIPT_DIR / "support"          # tokenizer vocab files (rwkv_vocab_v20230424.txt)
PRECOMPUTED_DIR = SCRIPT_DIR / "precomputed"  # output dir read by app.py at startup

# Model configuration
QWEN_MODEL_ID = "Qwen/Qwen3-1.7B-Base"
RWKV_MODEL_FILENAME = "rwkv7-g1c-1.5b-20260110-ctx8192.pth"

# Detect device once at import time; IS_CPU drives dtype/strategy choices below.
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
IS_CPU = DEVICE == "cpu"
31
+
32
+
33
def download_rwkv_model():
    """Return the local path of the RWKV7 checkpoint, downloading it from the Hub on first use."""
    from huggingface_hub import hf_hub_download

    local_checkpoint = MODELS_DIR / RWKV_MODEL_FILENAME
    if local_checkpoint.exists():
        return str(local_checkpoint)

    # First run: fetch the weights into the local models directory.
    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    return hf_hub_download(
        repo_id="BlinkDL/rwkv7-g1",
        filename=RWKV_MODEL_FILENAME,
        local_dir=str(MODELS_DIR),
        local_dir_use_symlinks=False,
    )
49
+
50
+
51
def load_qwen_model():
    """Load Qwen3-1.7B-Base and its tokenizer; prefers flash-attention 2 on GPU."""
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_ID, trust_remote_code=True)

    if IS_CPU:
        # CPU path: fp32 weights, no device map, reduced peak RAM during load.
        model_kwargs = {"torch_dtype": torch.float32, "device_map": None, "trust_remote_code": True, "low_cpu_mem_usage": True}
        model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, **model_kwargs).eval()
        return model, tokenizer

    # GPU path: bf16 with automatic device placement.
    model_kwargs = {"torch_dtype": torch.bfloat16, "device_map": "auto", "trust_remote_code": True}
    try:
        # flash_attention_2 may be unavailable (not installed / unsupported GPU);
        # fall back to the default attention implementation in that case.
        model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, attn_implementation="flash_attention_2", **model_kwargs).eval()
    except Exception:
        model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, **model_kwargs).eval()
    return model, tokenizer
+
69
+
70
def load_rwkv7_model(model_path: str):
    """Load RWKV7-G1C-1.5B model.

    Args:
        model_path: Path to the checkpoint (with or without the ".pth" suffix).

    Returns:
        (model, tokenizer) tuple: an rwkv RWKV instance and a TRIE_TOKENIZER
        built from the bundled vocab file.
    """
    # These env flags are set BEFORE importing the rwkv package —
    # presumably they are read at import time; keep this ordering. TODO confirm.
    os.environ["RWKV_JIT_ON"] = "1"
    os.environ["RWKV_V7_ON"] = "1"

    if IS_CPU:
        os.environ["RWKV_CUDA_ON"] = "0"
    else:
        os.environ["RWKV_CUDA_ON"] = "1"

    from rwkv.model import RWKV
    from rwkv.rwkv_tokenizer import TRIE_TOKENIZER

    # Strategy string selects device and dtype for the rwkv runtime.
    if IS_CPU:
        strategy = "cpu fp32"
    else:
        strategy = "cuda fp16"

    # RWKV expects the checkpoint path without the ".pth" extension.
    if model_path.endswith(".pth"):
        model_path = model_path[:-4]

    model = RWKV(model=model_path, strategy=strategy)

    vocab_path = str(SUPPORT_DIR / "rwkv_vocab_v20230424.txt")
    tokenizer = TRIE_TOKENIZER(vocab_path)

    return model, tokenizer
97
+
98
+
99
def precompute_example():
    """Precompute the example and save results.

    Pipeline: read the bundled example text, load both models, evaluate the
    text with each, render the comparison HTML, and write the HTML plus a
    metadata JSON (example text, timings, compression rates) into
    PRECOMPUTED_DIR so the app can serve the example instantly on page load.
    """
    from core.evaluator import evaluate_hf_single_sample, evaluate_rwkv7_single_sample
    from visualization.html_generator import generate_comparison_html

    # Read example text
    example_file = SCRIPT_DIR / "the_bitter_lesson.txt"
    with open(example_file, "r", encoding="utf-8") as f:
        example_text = f.read()

    print(f"Example text length: {len(example_text)} characters")

    # Download and load models
    print("Downloading RWKV model if needed...")
    rwkv_model_path = download_rwkv_model()

    print("Loading Qwen3-1.7B-Base...")
    qwen_model, qwen_tokenizer = load_qwen_model()

    print("Loading RWKV7-G1C-1.5B...")
    rwkv_model, rwkv_tokenizer = load_rwkv7_model(rwkv_model_path)

    # Run evaluations
    print("Evaluating with Qwen3...")
    result_qwen = evaluate_hf_single_sample(qwen_model, qwen_tokenizer, example_text, bos_mode="add_newline_token")
    print(f"Qwen3 completed in {result_qwen['inference_time']:.2f}s")

    print("Evaluating with RWKV7...")
    result_rwkv = evaluate_rwkv7_single_sample(rwkv_model, rwkv_tokenizer, example_text)
    print(f"RWKV7 completed in {result_rwkv['inference_time']:.2f}s")

    # Generate HTML visualization.
    # NOTE(review): model A = RWKV, model B = Qwen here; this appears to match
    # the argument order the app uses for live runs — verify against app.py so
    # the precomputed page renders identically.
    print("Generating visualization...")
    html = generate_comparison_html(
        text=example_text,
        byte_losses_a=result_rwkv["byte_wise_losses"],
        byte_losses_b=result_qwen["byte_wise_losses"],
        model_a_name="RWKV7-G1C-1.5B",
        model_b_name="Qwen3-1.7B-Base",
        topk_predictions_a=result_rwkv["top5_predictions"],
        topk_predictions_b=result_qwen["top5_predictions"],
        tokenizer_a=result_rwkv["tokenizer"],
        tokenizer_b=result_qwen["tokenizer"],
        model_type_a="rwkv7",
        model_type_b="hf",
    )

    # Save precomputed results
    PRECOMPUTED_DIR.mkdir(parents=True, exist_ok=True)

    # Save HTML
    html_path = PRECOMPUTED_DIR / "example_visualization.html"
    with open(html_path, "w", encoding="utf-8") as f:
        f.write(html)
    print(f"Saved HTML to {html_path}")

    # Save metadata (consumed by app.py's load_precomputed_example)
    metadata = {
        "example_text": example_text,
        "qwen_inference_time": result_qwen["inference_time"],
        "rwkv_inference_time": result_rwkv["inference_time"],
        "qwen_compression_rate": result_qwen["compression_rate"],
        "rwkv_compression_rate": result_rwkv["compression_rate"],
    }
    metadata_path = PRECOMPUTED_DIR / "example_metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, ensure_ascii=False, indent=2)
    print(f"Saved metadata to {metadata_path}")

    print("Done! Precomputed example is ready.")
169
+
170
+
171
+ if __name__ == "__main__":
172
+ precompute_example()
precomputed/example_metadata.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "example_text": "The Bitter Lesson\nRich Sutton\nMarch 13, 2019\nThe biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.\n\nIn computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. 
They said that ``brute force\" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.\n\nA similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.\n\nIn speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. 
The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.\n\nIn computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.\n\nThis is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. 
The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.\n\nOne thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.\n\nThe second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.\n",
3
+ "qwen_inference_time": 23.581797122955322,
4
+ "rwkv_inference_time": 34.39247703552246,
5
+ "qwen_compression_rate": 48.14428559434192,
6
+ "rwkv_compression_rate": 47.624574152536056
7
+ }
precomputed/example_visualization.html ADDED
The diff for this file is too large to render. See raw diff
 
the_bitter_lesson.txt ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ The Bitter Lesson
2
+ Rich Sutton
3
+ March 13, 2019
4
+ The biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.
5
+
6
+ In computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. They said that "brute force" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.
7
+
8
+ A similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.
9
+
10
+ In speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researchers' time, when, through Moore's law, massive computation became available and a means was found to put it to good use.
11
+
12
+ In computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.
13
+
14
+ This is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.
15
+
16
+ One thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.
17
+
18
+ The second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.