Spaces:
Running
Running
Commit
·
15b2f1f
1
Parent(s):
3ef3c58
update
Browse files- .claude/settings.local.json +2 -1
- app.py +102 -6
- core/evaluator.py +9 -0
- core/inference_stats.py +155 -0
- precompute_example.py +172 -0
- precomputed/example_metadata.json +7 -0
- precomputed/example_visualization.html +0 -0
- the_bitter_lesson.txt +18 -0
.claude/settings.local.json
CHANGED
|
@@ -8,7 +8,8 @@
|
|
| 8 |
"Bash(git branch:*)",
|
| 9 |
"Bash(git commit -m \"$\\(cat <<''EOF''\nFix Gradio compatibility for HuggingFace Spaces\n\n- Upgrade gradio to >=5.0.0 to fix API schema bug\n- Add server_name and server_port to demo.launch\\(\\)\n\nCo-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
|
| 10 |
"Bash(git commit:*)",
|
| 11 |
-
"Bash(git reset:*)"
|
|
|
|
| 12 |
]
|
| 13 |
}
|
| 14 |
}
|
|
|
|
| 8 |
"Bash(git branch:*)",
|
| 9 |
"Bash(git commit -m \"$\\(cat <<''EOF''\nFix Gradio compatibility for HuggingFace Spaces\n\n- Upgrade gradio to >=5.0.0 to fix API schema bug\n- Add server_name and server_port to demo.launch\\(\\)\n\nCo-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>\nEOF\n\\)\")",
|
| 10 |
"Bash(git commit:*)",
|
| 11 |
+
"Bash(git reset:*)",
|
| 12 |
+
"Bash(and top-10 predictions\" to better reflect what users see in the tooltip.\nAlso updated color legend to match the swapped model positions.\n\nCo-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>\nEOF\n\\)\")"
|
| 13 |
]
|
| 14 |
}
|
| 15 |
}
|
app.py
CHANGED
|
@@ -35,6 +35,12 @@ _qwen_tokenizer = None
|
|
| 35 |
_rwkv_model = None
|
| 36 |
_rwkv_tokenizer = None
|
| 37 |
_rwkv_model_path = None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 38 |
|
| 39 |
|
| 40 |
def download_rwkv_model(progress=None):
|
|
@@ -124,12 +130,36 @@ def validate_input(text: str) -> tuple[bool, str]:
|
|
| 124 |
return True, text
|
| 125 |
|
| 126 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
def initialize_models():
|
| 128 |
"""Initialize and cache both models at startup."""
|
| 129 |
-
global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _rwkv_model_path
|
| 130 |
|
| 131 |
print("Initializing models...")
|
| 132 |
|
|
|
|
|
|
|
|
|
|
| 133 |
# Download RWKV model if needed
|
| 134 |
print("Checking RWKV7 model...")
|
| 135 |
_rwkv_model_path = download_rwkv_model()
|
|
@@ -142,6 +172,10 @@ def initialize_models():
|
|
| 142 |
print("Loading RWKV7-G1C-1.5B...")
|
| 143 |
_rwkv_model, _rwkv_tokenizer = load_rwkv7_model(_rwkv_model_path)
|
| 144 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 145 |
print("Models loaded successfully!")
|
| 146 |
|
| 147 |
|
|
@@ -165,7 +199,7 @@ def run_evaluation(text: str, progress=gr.Progress()):
|
|
| 165 |
from visualization.html_generator import generate_comparison_html
|
| 166 |
|
| 167 |
# Use cached models
|
| 168 |
-
global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer
|
| 169 |
|
| 170 |
# Validate input
|
| 171 |
valid, result = validate_input(text)
|
|
@@ -177,12 +211,38 @@ def run_evaluation(text: str, progress=gr.Progress()):
|
|
| 177 |
try:
|
| 178 |
# Step 1: Evaluate Qwen (using cached model)
|
| 179 |
progress(0, desc="Evaluating with Qwen3...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
result_qwen = evaluate_hf_single_sample(_qwen_model, _qwen_tokenizer, text, bos_mode="add_newline_token")
|
| 181 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
# Step 2: Evaluate RWKV7 (using cached model)
|
| 183 |
progress(0, desc="Evaluating with RWKV7...")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 184 |
result_rwkv = evaluate_rwkv7_single_sample(_rwkv_model, _rwkv_tokenizer, text)
|
| 185 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 186 |
# Step 3: Generate visualization
|
| 187 |
progress(0, desc="Generating visualization...")
|
| 188 |
html = generate_comparison_html(
|
|
@@ -202,7 +262,24 @@ def run_evaluation(text: str, progress=gr.Progress()):
|
|
| 202 |
# Wrap HTML for iframe display
|
| 203 |
wrapped_html = wrap_html_in_iframe(html)
|
| 204 |
|
| 205 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 206 |
|
| 207 |
except torch.cuda.OutOfMemoryError:
|
| 208 |
if torch.cuda.is_available():
|
|
@@ -218,7 +295,18 @@ def run_evaluation(text: str, progress=gr.Progress()):
|
|
| 218 |
|
| 219 |
def clear_inputs():
    """Return blank values for the text input and the HTML output.

    Returns:
        A ``(text, html)`` tuple: an empty string for the text box and
        ``None`` for the visualization component.
    """
    blank_text, blank_html = "", None
    return blank_text, blank_html
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
|
| 224 |
# Build Gradio UI
|
|
@@ -245,14 +333,22 @@ with gr.Blocks(title="Compression-Lens: RWKV-7 vs Qwen3", theme=gr.themes.Soft()
|
|
| 245 |
|
| 246 |
gr.Markdown("---")
|
| 247 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 248 |
with gr.Row():
|
| 249 |
with gr.Column():
|
| 250 |
output_html = gr.HTML(label="Visualization")
|
| 251 |
|
| 252 |
# Event handlers
|
| 253 |
-
clear_btn.click(fn=clear_inputs, outputs=[text_input, output_html])
|
|
|
|
|
|
|
| 254 |
|
| 255 |
-
|
|
|
|
| 256 |
|
| 257 |
|
| 258 |
if __name__ == "__main__":
|
|
|
|
| 35 |
_rwkv_model = None
|
| 36 |
_rwkv_tokenizer = None
|
| 37 |
_rwkv_model_path = None
|
| 38 |
+
_stats_manager = None
|
| 39 |
+
|
| 40 |
+
# Precomputed example cache
|
| 41 |
+
_precomputed_html = None
|
| 42 |
+
_precomputed_text = None
|
| 43 |
+
PRECOMPUTED_DIR = SCRIPT_DIR / "precomputed"
|
| 44 |
|
| 45 |
|
| 46 |
def download_rwkv_model(progress=None):
|
|
|
|
| 130 |
return True, text
|
| 131 |
|
| 132 |
|
| 133 |
+
def load_precomputed_example():
    """Load the precomputed example visualization into the module-level cache.

    Reads ``example_visualization.html`` and ``example_metadata.json`` from
    ``PRECOMPUTED_DIR`` and stores the HTML and the metadata's
    ``"example_text"`` value in ``_precomputed_html`` / ``_precomputed_text``.

    Loading is best-effort: a missing, unreadable, or malformed file is
    reported on stdout and yields ``False`` instead of raising, so a broken
    precomputed example cannot abort the caller.

    Returns:
        True if both files were read and cached, False otherwise.
    """
    global _precomputed_html, _precomputed_text

    html_path = PRECOMPUTED_DIR / "example_visualization.html"
    metadata_path = PRECOMPUTED_DIR / "example_metadata.json"

    if not (html_path.exists() and metadata_path.exists()):
        print("No precomputed example found. Run precompute_example.py first.")
        return False

    import json

    # Read into locals first so a failure half-way through never leaves the
    # two cache globals out of sync with each other.
    try:
        with open(html_path, "r", encoding="utf-8") as f:
            html = f.read()
        with open(metadata_path, "r", encoding="utf-8") as f:
            metadata = json.load(f)
    except (OSError, ValueError) as e:
        # ValueError covers both json.JSONDecodeError and UnicodeDecodeError.
        print(f"Failed to load precomputed example: {e}")
        return False

    if not isinstance(metadata, dict):
        print("Failed to load precomputed example: metadata is not a JSON object")
        return False

    _precomputed_html = html
    _precomputed_text = metadata.get("example_text", "")
    print(f"Loaded precomputed example ({len(_precomputed_text)} chars)")
    return True
|
| 152 |
+
|
| 153 |
+
|
| 154 |
def initialize_models():
|
| 155 |
"""Initialize and cache both models at startup."""
|
| 156 |
+
global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _rwkv_model_path, _stats_manager
|
| 157 |
|
| 158 |
print("Initializing models...")
|
| 159 |
|
| 160 |
+
# Load precomputed example first
|
| 161 |
+
load_precomputed_example()
|
| 162 |
+
|
| 163 |
# Download RWKV model if needed
|
| 164 |
print("Checking RWKV7 model...")
|
| 165 |
_rwkv_model_path = download_rwkv_model()
|
|
|
|
| 172 |
print("Loading RWKV7-G1C-1.5B...")
|
| 173 |
_rwkv_model, _rwkv_tokenizer = load_rwkv7_model(_rwkv_model_path)
|
| 174 |
|
| 175 |
+
# Initialize stats manager
|
| 176 |
+
from core.inference_stats import InferenceStatsManager
|
| 177 |
+
_stats_manager = InferenceStatsManager()
|
| 178 |
+
|
| 179 |
print("Models loaded successfully!")
|
| 180 |
|
| 181 |
|
|
|
|
| 199 |
from visualization.html_generator import generate_comparison_html
|
| 200 |
|
| 201 |
# Use cached models
|
| 202 |
+
global _qwen_model, _qwen_tokenizer, _rwkv_model, _rwkv_tokenizer, _stats_manager
|
| 203 |
|
| 204 |
# Validate input
|
| 205 |
valid, result = validate_input(text)
|
|
|
|
| 211 |
try:
|
| 212 |
# Step 1: Evaluate Qwen (using cached model)
|
| 213 |
progress(0, desc="Evaluating with Qwen3...")
|
| 214 |
+
|
| 215 |
+
# Get token count for prediction
|
| 216 |
+
qwen_inputs = _qwen_tokenizer(text, return_tensors="pt", add_special_tokens=False)
|
| 217 |
+
qwen_token_count = qwen_inputs["input_ids"].shape[-1]
|
| 218 |
+
qwen_predicted_time = _stats_manager.predict_time("qwen", qwen_token_count)
|
| 219 |
+
|
| 220 |
result_qwen = evaluate_hf_single_sample(_qwen_model, _qwen_tokenizer, text, bos_mode="add_newline_token")
|
| 221 |
|
| 222 |
+
# Save stats and print comparison
|
| 223 |
+
_stats_manager.add_record("qwen", qwen_token_count, result_qwen["inference_time"])
|
| 224 |
+
if qwen_predicted_time is not None:
|
| 225 |
+
print(f"Qwen3 completed in {result_qwen['inference_time']:.2f}s (predicted: {qwen_predicted_time:.2f}s)")
|
| 226 |
+
else:
|
| 227 |
+
print(f"Qwen3 completed in {result_qwen['inference_time']:.2f}s")
|
| 228 |
+
|
| 229 |
# Step 2: Evaluate RWKV7 (using cached model)
|
| 230 |
progress(0, desc="Evaluating with RWKV7...")
|
| 231 |
+
|
| 232 |
+
# Get token count for prediction
|
| 233 |
+
rwkv_tokenized = _rwkv_tokenizer.encode(text)
|
| 234 |
+
rwkv_token_count = len(rwkv_tokenized.ids if hasattr(rwkv_tokenized, "ids") else rwkv_tokenized)
|
| 235 |
+
rwkv_predicted_time = _stats_manager.predict_time("rwkv", rwkv_token_count)
|
| 236 |
+
|
| 237 |
result_rwkv = evaluate_rwkv7_single_sample(_rwkv_model, _rwkv_tokenizer, text)
|
| 238 |
|
| 239 |
+
# Save stats and print comparison
|
| 240 |
+
_stats_manager.add_record("rwkv", rwkv_token_count, result_rwkv["inference_time"])
|
| 241 |
+
if rwkv_predicted_time is not None:
|
| 242 |
+
print(f"RWKV7 completed in {result_rwkv['inference_time']:.2f}s (predicted: {rwkv_predicted_time:.2f}s)")
|
| 243 |
+
else:
|
| 244 |
+
print(f"RWKV7 completed in {result_rwkv['inference_time']:.2f}s")
|
| 245 |
+
|
| 246 |
# Step 3: Generate visualization
|
| 247 |
progress(0, desc="Generating visualization...")
|
| 248 |
html = generate_comparison_html(
|
|
|
|
| 262 |
# Wrap HTML for iframe display
|
| 263 |
wrapped_html = wrap_html_in_iframe(html)
|
| 264 |
|
| 265 |
+
# Generate timing information for UI display
|
| 266 |
+
timing_lines = ["## ⏱️ Inference Timing\n"]
|
| 267 |
+
|
| 268 |
+
# Qwen timing
|
| 269 |
+
if qwen_predicted_time is not None:
|
| 270 |
+
timing_lines.append(f"**Qwen3-1.7B-Base**: {result_qwen['inference_time']:.2f}s (predicted: {qwen_predicted_time:.2f}s)")
|
| 271 |
+
else:
|
| 272 |
+
timing_lines.append(f"**Qwen3-1.7B-Base**: {result_qwen['inference_time']:.2f}s")
|
| 273 |
+
|
| 274 |
+
# RWKV timing
|
| 275 |
+
if rwkv_predicted_time is not None:
|
| 276 |
+
timing_lines.append(f"**RWKV7-G1C-1.5B**: {result_rwkv['inference_time']:.2f}s (predicted: {rwkv_predicted_time:.2f}s)")
|
| 277 |
+
else:
|
| 278 |
+
timing_lines.append(f"**RWKV7-G1C-1.5B**: {result_rwkv['inference_time']:.2f}s")
|
| 279 |
+
|
| 280 |
+
timing_text = "\n\n".join(timing_lines)
|
| 281 |
+
|
| 282 |
+
return wrapped_html, timing_text
|
| 283 |
|
| 284 |
except torch.cuda.OutOfMemoryError:
|
| 285 |
if torch.cuda.is_available():
|
|
|
|
| 295 |
|
| 296 |
def clear_inputs():
    """Return blank values for the text input, HTML output and timing panel.

    Returns:
        A ``(text, html, timing)`` tuple: empty string, ``None``, empty string.
    """
    blank_text, blank_html, blank_timing = "", None, ""
    return blank_text, blank_html, blank_timing
|
| 299 |
+
|
| 300 |
+
|
| 301 |
+
def get_default_example():
    """Return the values shown when the page first loads.

    Returns:
        ``(text, html, timing)``. When both the cached example HTML and the
        cached example text are non-empty, the text and the iframe-wrapped
        HTML are returned; otherwise the text is empty and the HTML is
        ``None``. The timing string is always empty.
    """
    # Guard clause: nothing cached (or cached values are empty).
    if not (_precomputed_html and _precomputed_text):
        return "", None, ""

    return _precomputed_text, wrap_html_in_iframe(_precomputed_html), ""
|
| 310 |
|
| 311 |
|
| 312 |
# Build Gradio UI
|
|
|
|
| 333 |
|
| 334 |
gr.Markdown("---")
|
| 335 |
|
| 336 |
+
# Timing information display
|
| 337 |
+
with gr.Row():
|
| 338 |
+
with gr.Column():
|
| 339 |
+
timing_info = gr.Markdown(label="Inference Timing")
|
| 340 |
+
|
| 341 |
with gr.Row():
|
| 342 |
with gr.Column():
|
| 343 |
output_html = gr.HTML(label="Visualization")
|
| 344 |
|
| 345 |
# Event handlers
|
| 346 |
+
clear_btn.click(fn=clear_inputs, outputs=[text_input, output_html, timing_info])
|
| 347 |
+
|
| 348 |
+
run_btn.click(fn=run_evaluation, inputs=[text_input], outputs=[output_html, timing_info])
|
| 349 |
|
| 350 |
+
# Load default example on page load
|
| 351 |
+
demo.load(fn=get_default_example, outputs=[text_input, output_html, timing_info])
|
| 352 |
|
| 353 |
|
| 354 |
if __name__ == "__main__":
|
core/evaluator.py
CHANGED
|
@@ -7,6 +7,7 @@ Provides single-sample evaluation functions for Qwen3 and RWKV7 models.
|
|
| 7 |
import gc
|
| 8 |
import math
|
| 9 |
import os
|
|
|
|
| 10 |
from typing import List, Dict, Any, Optional
|
| 11 |
|
| 12 |
import torch
|
|
@@ -95,6 +96,8 @@ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_
|
|
| 95 |
Returns:
|
| 96 |
dict with byte_wise_losses, top5_predictions, compression_rate, etc.
|
| 97 |
"""
|
|
|
|
|
|
|
| 98 |
# Create token-to-bytes converter
|
| 99 |
token2bytes_converter = TokenizerBytesConverter(model_name_or_path=tokenizer.name_or_path, tokenizer=tokenizer)
|
| 100 |
|
|
@@ -162,6 +165,7 @@ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_
|
|
| 162 |
num_bytes = len(text.encode("utf-8"))
|
| 163 |
avg_loss = total_loss / seq_length
|
| 164 |
compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
|
|
|
|
| 165 |
|
| 166 |
return {
|
| 167 |
"byte_wise_losses": byte_wise_losses,
|
|
@@ -172,6 +176,7 @@ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_
|
|
| 172 |
"num_bytes": num_bytes,
|
| 173 |
"model_name": getattr(model.config, "_name_or_path", "unknown"),
|
| 174 |
"tokenizer": tokenizer,
|
|
|
|
| 175 |
}
|
| 176 |
|
| 177 |
|
|
@@ -188,6 +193,8 @@ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
|
|
| 188 |
Returns:
|
| 189 |
dict with byte_wise_losses, top5_predictions, compression_rate, etc.
|
| 190 |
"""
|
|
|
|
|
|
|
| 191 |
# Tokenize
|
| 192 |
tokenized = tokenizer.encode(text)
|
| 193 |
if hasattr(tokenized, "ids"):
|
|
@@ -239,6 +246,7 @@ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
|
|
| 239 |
num_bytes = len(text.encode("utf-8"))
|
| 240 |
avg_loss = total_loss / input_length
|
| 241 |
compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
|
|
|
|
| 242 |
|
| 243 |
return {
|
| 244 |
"byte_wise_losses": byte_wise_losses,
|
|
@@ -249,4 +257,5 @@ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
|
|
| 249 |
"num_bytes": num_bytes,
|
| 250 |
"model_name": "RWKV7-G1C-1.5B",
|
| 251 |
"tokenizer": tokenizer,
|
|
|
|
| 252 |
}
|
|
|
|
| 7 |
import gc
|
| 8 |
import math
|
| 9 |
import os
|
| 10 |
+
import time
|
| 11 |
from typing import List, Dict, Any, Optional
|
| 12 |
|
| 13 |
import torch
|
|
|
|
| 96 |
Returns:
|
| 97 |
dict with byte_wise_losses, top5_predictions, compression_rate, etc.
|
| 98 |
"""
|
| 99 |
+
start_time = time.time()
|
| 100 |
+
|
| 101 |
# Create token-to-bytes converter
|
| 102 |
token2bytes_converter = TokenizerBytesConverter(model_name_or_path=tokenizer.name_or_path, tokenizer=tokenizer)
|
| 103 |
|
|
|
|
| 165 |
num_bytes = len(text.encode("utf-8"))
|
| 166 |
avg_loss = total_loss / seq_length
|
| 167 |
compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
|
| 168 |
+
inference_time = time.time() - start_time
|
| 169 |
|
| 170 |
return {
|
| 171 |
"byte_wise_losses": byte_wise_losses,
|
|
|
|
| 176 |
"num_bytes": num_bytes,
|
| 177 |
"model_name": getattr(model.config, "_name_or_path", "unknown"),
|
| 178 |
"tokenizer": tokenizer,
|
| 179 |
+
"inference_time": inference_time,
|
| 180 |
}
|
| 181 |
|
| 182 |
|
|
|
|
| 193 |
Returns:
|
| 194 |
dict with byte_wise_losses, top5_predictions, compression_rate, etc.
|
| 195 |
"""
|
| 196 |
+
start_time = time.time()
|
| 197 |
+
|
| 198 |
# Tokenize
|
| 199 |
tokenized = tokenizer.encode(text)
|
| 200 |
if hasattr(tokenized, "ids"):
|
|
|
|
| 246 |
num_bytes = len(text.encode("utf-8"))
|
| 247 |
avg_loss = total_loss / input_length
|
| 248 |
compression_rate = avg_loss * COMPRESSION_RATE_FACTOR
|
| 249 |
+
inference_time = time.time() - start_time
|
| 250 |
|
| 251 |
return {
|
| 252 |
"byte_wise_losses": byte_wise_losses,
|
|
|
|
| 257 |
"num_bytes": num_bytes,
|
| 258 |
"model_name": "RWKV7-G1C-1.5B",
|
| 259 |
"tokenizer": tokenizer,
|
| 260 |
+
"inference_time": inference_time,
|
| 261 |
}
|
core/inference_stats.py
ADDED
|
@@ -0,0 +1,155 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Inference statistics manager for tracking and predicting model inference times.
|
| 3 |
+
|
| 4 |
+
This module provides functionality to:
|
| 5 |
+
- Record historical inference statistics (token count, inference time)
|
| 6 |
+
- Predict inference time using k-nearest neighbors algorithm
|
| 7 |
+
- Persist statistics to disk for cross-session usage
|
| 8 |
+
"""
|
| 9 |
+
|
| 10 |
+
import json
|
| 11 |
+
import os
|
| 12 |
+
from datetime import datetime
|
| 13 |
+
from pathlib import Path
|
| 14 |
+
from typing import Dict, List, Optional, Tuple
|
| 15 |
+
|
| 16 |
+
|
| 17 |
+
class InferenceStatsManager:
    """Manages inference statistics for time prediction.

    History is persisted as a JSON list in ``<cache_dir>/inference_stats.json``.
    Each record is a dict with the keys ``"model_name"``, ``"input_tokens"``,
    ``"inference_time"`` and ``"timestamp"``. The file is re-read on every
    call; nothing is cached in memory.
    """

    def __init__(self, cache_dir: Optional[str] = None):
        """
        Initialize the statistics manager.

        Args:
            cache_dir: Optional custom cache directory. If None, uses
                ``%LOCALAPPDATA%/uncheatableeval_lens`` on Windows and
                ``~/.cache/uncheatableeval_lens`` elsewhere.
        """
        if cache_dir is None:
            if os.name == 'nt':  # Windows
                # expandvars() leaves "%LOCALAPPDATA%" untouched when the
                # variable is unset, which would create a directory with that
                # literal name; fall back to the home directory instead.
                base_cache = os.environ.get('LOCALAPPDATA') or os.path.expanduser('~')
            else:  # Unix-like
                base_cache = os.path.expanduser('~/.cache')

            cache_dir = os.path.join(base_cache, 'uncheatableeval_lens')

        self.cache_dir = Path(cache_dir)
        self.stats_file = self.cache_dir / 'inference_stats.json'

        # Create cache directory if it doesn't exist
        self.cache_dir.mkdir(parents=True, exist_ok=True)

    def _load_stats(self) -> List[Dict]:
        """
        Load statistics from JSON file.

        Returns:
            List of statistics records; empty list if the file doesn't exist,
            cannot be read/decoded, or does not contain a JSON list.
        """
        if not self.stats_file.exists():
            return []

        try:
            with open(self.stats_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except (ValueError, OSError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Warning: Failed to load statistics file: {e}")
            return []

        if not isinstance(data, list):
            print("Warning: Statistics file does not contain a list; ignoring it.")
            return []

        return data

    def _save_stats(self, stats: List[Dict]) -> None:
        """
        Save statistics to JSON file.

        The data is written to a temporary sibling file and then moved over
        the real file, so an interrupted write cannot truncate existing
        history. This is not a lock: concurrent writers can still overwrite
        each other's records.

        Args:
            stats: List of statistics records to save.
        """
        tmp_file = self.stats_file.with_name(f"{self.stats_file.name}.{os.getpid()}.tmp")
        try:
            with open(tmp_file, 'w', encoding='utf-8') as f:
                json.dump(stats, f, indent=2, ensure_ascii=False)
            os.replace(tmp_file, self.stats_file)
        except OSError as e:
            print(f"Warning: Failed to save statistics file: {e}")
            # Best-effort cleanup of the partial temp file; the warning above
            # already reported the real failure.
            try:
                tmp_file.unlink()
            except OSError:
                pass

    def add_record(self, model_name: str, input_tokens: int, inference_time: float) -> None:
        """
        Add a new inference record to the statistics.

        Args:
            model_name: Name of the model ("qwen" or "rwkv")
            input_tokens: Number of input tokens
            inference_time: Inference time in seconds
        """
        stats = self._load_stats()
        stats.append({
            "model_name": model_name,
            "input_tokens": input_tokens,
            "inference_time": inference_time,
            "timestamp": datetime.now().isoformat(),
        })
        self._save_stats(stats)

    def _find_k_nearest(self, records: List[Dict], target_tokens: int, k: int) -> List[Tuple[Dict, float]]:
        """
        Find k nearest records by token count.

        Args:
            records: List of historical records (each must have "input_tokens")
            target_tokens: Target token count
            k: Number of nearest neighbors to find

        Returns:
            List of (record, distance) tuples, sorted by distance. Empty when
            ``k`` is zero or negative.
        """
        # A negative k would otherwise slice as [:-k] and silently drop the
        # *farthest* records instead of selecting the nearest ones.
        if k <= 0:
            return []

        distances = [(record, abs(record["input_tokens"] - target_tokens)) for record in records]

        # list.sort is stable, so equally distant records keep file order.
        distances.sort(key=lambda pair: pair[1])
        return distances[:k]

    def predict_time(self, model_name: str, input_tokens: int, k: int = 5) -> Optional[float]:
        """
        Predict inference time using k-nearest neighbors algorithm.

        Args:
            model_name: Name of the model ("qwen" or "rwkv")
            input_tokens: Number of input tokens
            k: Number of nearest neighbors to use (default: 5)

        Returns:
            Predicted inference time in seconds, or None if there is no usable
            historical data for the model (or ``k`` is not positive).
        """
        stats = self._load_stats()

        # Keep only well-formed records for this model; the file lives in a
        # user cache directory, so tolerate hand-edited or stale entries.
        model_records = [
            r for r in stats
            if isinstance(r, dict)
            and r.get("model_name") == model_name
            and isinstance(r.get("input_tokens"), (int, float))
            and isinstance(r.get("inference_time"), (int, float))
        ]

        if not model_records:
            return None

        nearest = self._find_k_nearest(model_records, input_tokens, k)

        if not nearest:
            return None

        # Inverse distance weighting: weight = 1 / (1 + distance).
        # Every weight is strictly positive, so total_weight can never be 0.
        total_weight = 0.0
        weighted_sum = 0.0

        for record, distance in nearest:
            weight = 1.0 / (1.0 + distance)
            weighted_sum += weight * record["inference_time"]
            total_weight += weight

        return weighted_sum / total_weight
|
| 155 |
+
|
precompute_example.py
ADDED
|
@@ -0,0 +1,172 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Precompute example evaluation results for the default demo.
|
| 3 |
+
|
| 4 |
+
This script runs the evaluation on the example text and saves the results
|
| 5 |
+
so they can be loaded instantly when users visit the page.
|
| 6 |
+
"""
|
| 7 |
+
|
| 8 |
+
import json
|
| 9 |
+
import os
|
| 10 |
+
import sys
|
| 11 |
+
from pathlib import Path
|
| 12 |
+
|
| 13 |
+
# Add parent directory to path
|
| 14 |
+
sys.path.insert(0, str(Path(__file__).parent))
|
| 15 |
+
|
| 16 |
+
import torch
|
| 17 |
+
|
| 18 |
+
# Get the directory where this script is located
|
| 19 |
+
SCRIPT_DIR = Path(__file__).parent.absolute()

# Sub-directories used by this script, all relative to SCRIPT_DIR
MODELS_DIR = SCRIPT_DIR / "models"
SUPPORT_DIR = SCRIPT_DIR / "support"
PRECOMPUTED_DIR = SCRIPT_DIR / "precomputed"

# Identifiers of the two models being compared
QWEN_MODEL_ID = "Qwen/Qwen3-1.7B-Base"
RWKV_MODEL_FILENAME = "rwkv7-g1c-1.5b-20260110-ctx8192.pth"

# Use CUDA when torch reports it as available, otherwise fall back to CPU
IS_CPU = not torch.cuda.is_available()
DEVICE = "cpu" if IS_CPU else "cuda"
|
| 31 |
+
|
| 32 |
+
|
| 33 |
+
def download_rwkv_model():
    """Return the path to the RWKV7 weights, downloading them if absent.

    Returns:
        ``str(MODELS_DIR / RWKV_MODEL_FILENAME)`` when that file already
        exists; otherwise whatever ``hf_hub_download`` returns after fetching
        the file from the ``BlinkDL/rwkv7-g1`` repository into ``MODELS_DIR``.
    """
    from huggingface_hub import hf_hub_download

    local_copy = MODELS_DIR / RWKV_MODEL_FILENAME

    if not local_copy.exists():
        # First run: make sure the target directory exists, then fetch.
        MODELS_DIR.mkdir(parents=True, exist_ok=True)
        return hf_hub_download(
            repo_id="BlinkDL/rwkv7-g1",
            filename=RWKV_MODEL_FILENAME,
            local_dir=str(MODELS_DIR),
            local_dir_use_symlinks=False,
        )

    return str(local_copy)
|
| 49 |
+
|
| 50 |
+
|
| 51 |
+
def load_qwen_model():
    """Load the Qwen3-1.7B-Base model and tokenizer in eval mode.

    On CPU the model is loaded in float32 with no device map. Otherwise it is
    loaded in bfloat16 with ``device_map="auto"``, first requesting the
    ``flash_attention_2`` attention implementation and retrying without it if
    that attempt raises.

    Returns:
        ``(model, tokenizer)``.
    """
    from transformers import AutoTokenizer, AutoModelForCausalLM

    tokenizer = AutoTokenizer.from_pretrained(QWEN_MODEL_ID, trust_remote_code=True)

    if IS_CPU:
        cpu_kwargs = {
            "torch_dtype": torch.float32,
            "device_map": None,
            "trust_remote_code": True,
            "low_cpu_mem_usage": True,
        }
        return AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, **cpu_kwargs).eval(), tokenizer

    gpu_kwargs = {
        "torch_dtype": torch.bfloat16,
        "device_map": "auto",
        "trust_remote_code": True,
    }
    try:
        model = AutoModelForCausalLM.from_pretrained(
            QWEN_MODEL_ID, attn_implementation="flash_attention_2", **gpu_kwargs
        ).eval()
    except Exception:
        # Deliberate broad fallback: any failure of the flash-attention load
        # is retried with the default attention implementation.
        model = AutoModelForCausalLM.from_pretrained(QWEN_MODEL_ID, **gpu_kwargs).eval()

    return model, tokenizer
|
| 68 |
+
|
| 69 |
+
|
| 70 |
+
def load_rwkv7_model(model_path: str):
    """Load the RWKV7-G1C-1.5B model and its TRIE tokenizer.

    Args:
        model_path: Path to the weights file; a trailing ``.pth`` is removed
            before the path is handed to ``RWKV``.

    Returns:
        ``(model, tokenizer)``.
    """
    # These variables are set before the rwkv import below, as in the
    # original ordering — presumably the package reads them at import time
    # (TODO confirm against the rwkv package).
    os.environ["RWKV_JIT_ON"] = "1"
    os.environ["RWKV_V7_ON"] = "1"
    os.environ["RWKV_CUDA_ON"] = "0" if IS_CPU else "1"

    from rwkv.model import RWKV
    from rwkv.rwkv_tokenizer import TRIE_TOKENIZER

    strategy = "cpu fp32" if IS_CPU else "cuda fp16"

    # Drop the 4-character ".pth" extension when present.
    weights_stem = model_path[:-4] if model_path.endswith(".pth") else model_path

    model = RWKV(model=weights_stem, strategy=strategy)
    tokenizer = TRIE_TOKENIZER(str(SUPPORT_DIR / "rwkv_vocab_v20230424.txt"))

    return model, tokenizer
|
| 97 |
+
|
| 98 |
+
|
| 99 |
+
def precompute_example():
    """Evaluate the bundled example text with both models and save the results.

    Reads ``the_bitter_lesson.txt`` next to this script, evaluates it with
    Qwen3 and RWKV7, renders the comparison HTML, and writes
    ``example_visualization.html`` and ``example_metadata.json`` into
    ``PRECOMPUTED_DIR``. Progress is reported on stdout.
    """
    from core.evaluator import evaluate_hf_single_sample, evaluate_rwkv7_single_sample
    from visualization.html_generator import generate_comparison_html

    # --- Input -----------------------------------------------------------
    example_text = (SCRIPT_DIR / "the_bitter_lesson.txt").read_text(encoding="utf-8")
    print(f"Example text length: {len(example_text)} characters")

    # --- Models ----------------------------------------------------------
    print("Downloading RWKV model if needed...")
    rwkv_weights = download_rwkv_model()

    print("Loading Qwen3-1.7B-Base...")
    qwen_model, qwen_tokenizer = load_qwen_model()

    print("Loading RWKV7-G1C-1.5B...")
    rwkv_model, rwkv_tokenizer = load_rwkv7_model(rwkv_weights)

    # --- Evaluation ------------------------------------------------------
    print("Evaluating with Qwen3...")
    result_qwen = evaluate_hf_single_sample(
        qwen_model, qwen_tokenizer, example_text, bos_mode="add_newline_token"
    )
    print(f"Qwen3 completed in {result_qwen['inference_time']:.2f}s")

    print("Evaluating with RWKV7...")
    result_rwkv = evaluate_rwkv7_single_sample(rwkv_model, rwkv_tokenizer, example_text)
    print(f"RWKV7 completed in {result_rwkv['inference_time']:.2f}s")

    # --- Visualization (model A = RWKV7, model B = Qwen3) ----------------
    print("Generating visualization...")
    viz_args = {
        "text": example_text,
        "byte_losses_a": result_rwkv["byte_wise_losses"],
        "byte_losses_b": result_qwen["byte_wise_losses"],
        "model_a_name": "RWKV7-G1C-1.5B",
        "model_b_name": "Qwen3-1.7B-Base",
        "topk_predictions_a": result_rwkv["top5_predictions"],
        "topk_predictions_b": result_qwen["top5_predictions"],
        "tokenizer_a": result_rwkv["tokenizer"],
        "tokenizer_b": result_qwen["tokenizer"],
        "model_type_a": "rwkv7",
        "model_type_b": "hf",
    }
    html = generate_comparison_html(**viz_args)

    # --- Output ----------------------------------------------------------
    PRECOMPUTED_DIR.mkdir(parents=True, exist_ok=True)

    html_path = PRECOMPUTED_DIR / "example_visualization.html"
    with open(html_path, "w", encoding="utf-8") as out:
        out.write(html)
    print(f"Saved HTML to {html_path}")

    metadata = {"example_text": example_text}
    for prefix, result in (("qwen", result_qwen), ("rwkv", result_rwkv)):
        metadata[f"{prefix}_inference_time"] = result["inference_time"]
    for prefix, result in (("qwen", result_qwen), ("rwkv", result_rwkv)):
        metadata[f"{prefix}_compression_rate"] = result["compression_rate"]

    metadata_path = PRECOMPUTED_DIR / "example_metadata.json"
    with open(metadata_path, "w", encoding="utf-8") as out:
        json.dump(metadata, out, ensure_ascii=False, indent=2)
    print(f"Saved metadata to {metadata_path}")

    print("Done! Precomputed example is ready.")


if __name__ == "__main__":
    precompute_example()
|
precomputed/example_metadata.json
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"example_text": "The Bitter Lesson\nRich Sutton\nMarch 13, 2019\nThe biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.\n\nIn computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. 
They said that ``brute force\" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.\n\nA similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.\n\nIn speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. 
The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.\n\nIn computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.\n\nThis is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. 
The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.\n\nOne thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.\n\nThe second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.\n",
|
| 3 |
+
"qwen_inference_time": 23.581797122955322,
|
| 4 |
+
"rwkv_inference_time": 34.39247703552246,
|
| 5 |
+
"qwen_compression_rate": 48.14428559434192,
|
| 6 |
+
"rwkv_compression_rate": 47.624574152536056
|
| 7 |
+
}
|
precomputed/example_visualization.html
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
the_bitter_lesson.txt
ADDED
|
@@ -0,0 +1,18 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
The Bitter Lesson
|
| 2 |
+
Rich Sutton
|
| 3 |
+
March 13, 2019
|
| 4 |
+
The biggest lesson that can be read from 70 years of AI research is that general methods that leverage computation are ultimately the most effective, and by a large margin. The ultimate reason for this is Moore's law, or rather its generalization of continued exponentially falling cost per unit of computation. Most AI research has been conducted as if the computation available to the agent were constant (in which case leveraging human knowledge would be one of the only ways to improve performance) but, over a slightly longer time than a typical research project, massively more computation inevitably becomes available. Seeking an improvement that makes a difference in the shorter term, researchers seek to leverage their human knowledge of the domain, but the only thing that matters in the long run is the leveraging of computation. These two need not run counter to each other, but in practice they tend to. Time spent on one is time not spent on the other. There are psychological commitments to investment in one approach or the other. And the human-knowledge approach tends to complicate methods in ways that make them less suited to taking advantage of general methods leveraging computation. There were many examples of AI researchers' belated learning of this bitter lesson, and it is instructive to review some of the most prominent.
|
| 5 |
+
|
| 6 |
+
In computer chess, the methods that defeated the world champion, Kasparov, in 1997, were based on massive, deep search. At the time, this was looked upon with dismay by the majority of computer-chess researchers who had pursued methods that leveraged human understanding of the special structure of chess. When a simpler, search-based approach with special hardware and software proved vastly more effective, these human-knowledge-based chess researchers were not good losers. They said that ``brute force" search may have won this time, but it was not a general strategy, and anyway it was not how people played chess. These researchers wanted methods based on human input to win and were disappointed when they did not.
|
| 7 |
+
|
| 8 |
+
A similar pattern of research progress was seen in computer Go, only delayed by a further 20 years. Enormous initial efforts went into avoiding search by taking advantage of human knowledge, or of the special features of the game, but all those efforts proved irrelevant, or worse, once search was applied effectively at scale. Also important was the use of learning by self play to learn a value function (as it was in many other games and even in chess, although learning did not play a big role in the 1997 program that first beat a world champion). Learning by self play, and learning in general, is like search in that it enables massive computation to be brought to bear. Search and learning are the two most important classes of techniques for utilizing massive amounts of computation in AI research. In computer Go, as in computer chess, researchers' initial effort was directed towards utilizing human understanding (so that less search was needed) and only much later was much greater success had by embracing search and learning.
|
| 9 |
+
|
| 10 |
+
In speech recognition, there was an early competition, sponsored by DARPA, in the 1970s. Entrants included a host of special methods that took advantage of human knowledge---knowledge of words, of phonemes, of the human vocal tract, etc. On the other side were newer methods that were more statistical in nature and did much more computation, based on hidden Markov models (HMMs). Again, the statistical methods won out over the human-knowledge-based methods. This led to a major change in all of natural language processing, gradually over decades, where statistics and computation came to dominate the field. The recent rise of deep learning in speech recognition is the most recent step in this consistent direction. Deep learning methods rely even less on human knowledge, and use even more computation, together with learning on huge training sets, to produce dramatically better speech recognition systems. As in the games, researchers always tried to make systems that worked the way the researchers thought their own minds worked---they tried to put that knowledge in their systems---but it proved ultimately counterproductive, and a colossal waste of researcher's time, when, through Moore's law, massive computation became available and a means was found to put it to good use.
|
| 11 |
+
|
| 12 |
+
In computer vision, there has been a similar pattern. Early methods conceived of vision as searching for edges, or generalized cylinders, or in terms of SIFT features. But today all this is discarded. Modern deep-learning neural networks use only the notions of convolution and certain kinds of invariances, and perform much better.
|
| 13 |
+
|
| 14 |
+
This is a big lesson. As a field, we still have not thoroughly learned it, as we are continuing to make the same kind of mistakes. To see this, and to effectively resist it, we have to understand the appeal of these mistakes. We have to learn the bitter lesson that building in how we think we think does not work in the long run. The bitter lesson is based on the historical observations that 1) AI researchers have often tried to build knowledge into their agents, 2) this always helps in the short term, and is personally satisfying to the researcher, but 3) in the long run it plateaus and even inhibits further progress, and 4) breakthrough progress eventually arrives by an opposing approach based on scaling computation by search and learning. The eventual success is tinged with bitterness, and often incompletely digested, because it is success over a favored, human-centric approach.
|
| 15 |
+
|
| 16 |
+
One thing that should be learned from the bitter lesson is the great power of general purpose methods, of methods that continue to scale with increased computation even as the available computation becomes very great. The two methods that seem to scale arbitrarily in this way are search and learning.
|
| 17 |
+
|
| 18 |
+
The second general point to be learned from the bitter lesson is that the actual contents of minds are tremendously, irredeemably complex; we should stop trying to find simple ways to think about the contents of minds, such as simple ways to think about space, objects, multiple agents, or symmetries. All these are part of the arbitrary, intrinsically-complex, outside world. They are not what should be built in, as their complexity is endless; instead we should build in only the meta-methods that can find and capture this arbitrary complexity. Essential to these methods is that they can find good approximations, but the search for them should be by our methods, not by us. We want AI agents that can discover like we can, not which contain what we have discovered. Building in our discoveries only makes it harder to see how the discovering process can be done.
|