Spaces:
Running
Running
Commit
·
a2836e3
1
Parent(s):
cddd3a5
Update transformers version and simplify evaluator code
Browse files
- Upgrade transformers to 5.0.0rc0
- Simplify BOS token logic to always use newline token
- Apply code formatting improvements
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
- core/evaluator.py +17 -35
- requirements.txt +1 -1
core/evaluator.py
CHANGED
|
@@ -57,10 +57,7 @@ def extract_topk_predictions(logit: torch.Tensor, target_ids: torch.Tensor, k: i
|
|
| 57 |
actual_prob = probs[pos, target_id].item()
|
| 58 |
rank = (probs[pos] > actual_prob).sum().item() + 1
|
| 59 |
|
| 60 |
-
topk_list = [
|
| 61 |
-
[top_ids[pos, i].item(), round(top_probs[pos, i].item(), 6)]
|
| 62 |
-
for i in range(k)
|
| 63 |
-
]
|
| 64 |
results.append([target_id, rank, topk_list])
|
| 65 |
|
| 66 |
return results
|
|
@@ -85,12 +82,7 @@ def count_rwkv_parameters_in_billions(rwkv_model) -> float:
|
|
| 85 |
|
| 86 |
|
| 87 |
@torch.no_grad()
|
| 88 |
-
def evaluate_hf_single_sample(
|
| 89 |
-
model,
|
| 90 |
-
tokenizer,
|
| 91 |
-
text: str,
|
| 92 |
-
bos_mode: str = "add_newline_token"
|
| 93 |
-
) -> Dict[str, Any]:
|
| 94 |
"""
|
| 95 |
Evaluate a HuggingFace model on a single text sample.
|
| 96 |
|
|
@@ -104,20 +96,18 @@ def evaluate_hf_single_sample(
|
|
| 104 |
dict with byte_wise_losses, top5_predictions, compression_rate, etc.
|
| 105 |
"""
|
| 106 |
# Create token-to-bytes converter
|
| 107 |
-
token2bytes_converter = TokenizerBytesConverter(
|
| 108 |
-
model_name_or_path=tokenizer.name_or_path,
|
| 109 |
-
tokenizer=tokenizer
|
| 110 |
-
)
|
| 111 |
|
| 112 |
# Determine BOS token
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
| 117 |
-
|
| 118 |
-
|
| 119 |
-
|
| 120 |
-
|
|
|
|
| 121 |
|
| 122 |
bos_tensor = torch.tensor([bos_token], device=model.device).unsqueeze(0)
|
| 123 |
|
|
@@ -149,9 +139,7 @@ def evaluate_hf_single_sample(
|
|
| 149 |
raise ValueError("Token bytes don't match original text bytes")
|
| 150 |
|
| 151 |
# Extract top-k predictions
|
| 152 |
-
sample_topk = extract_topk_predictions(
|
| 153 |
-
logit[:-1], input_chunk.squeeze(0)[1:]
|
| 154 |
-
)
|
| 155 |
|
| 156 |
# Calculate byte-wise losses
|
| 157 |
byte_wise_losses = []
|
|
@@ -183,16 +171,12 @@ def evaluate_hf_single_sample(
|
|
| 183 |
"num_tokens": seq_length,
|
| 184 |
"num_bytes": num_bytes,
|
| 185 |
"model_name": getattr(model.config, "_name_or_path", "unknown"),
|
| 186 |
-
"tokenizer": tokenizer
|
| 187 |
}
|
| 188 |
|
| 189 |
|
| 190 |
@torch.no_grad()
|
| 191 |
-
def evaluate_rwkv7_single_sample(
|
| 192 |
-
model,
|
| 193 |
-
tokenizer,
|
| 194 |
-
text: str
|
| 195 |
-
) -> Dict[str, Any]:
|
| 196 |
"""
|
| 197 |
Evaluate a RWKV7 model on a single text sample.
|
| 198 |
|
|
@@ -241,9 +225,7 @@ def evaluate_rwkv7_single_sample(
|
|
| 241 |
token_bytes = [tokenizer.decodeBytes([token]) for token in input_chunk[1:]]
|
| 242 |
|
| 243 |
# Extract top-k predictions
|
| 244 |
-
sample_topk = extract_topk_predictions(
|
| 245 |
-
logit[:-1], torch.tensor(input_chunk[1:]).to(device)
|
| 246 |
-
)
|
| 247 |
|
| 248 |
# Calculate byte-wise losses
|
| 249 |
byte_wise_losses = []
|
|
@@ -266,5 +248,5 @@ def evaluate_rwkv7_single_sample(
|
|
| 266 |
"num_tokens": input_length,
|
| 267 |
"num_bytes": num_bytes,
|
| 268 |
"model_name": "RWKV7-G1C-1.5B",
|
| 269 |
-
"tokenizer": tokenizer
|
| 270 |
}
|
|
|
|
| 57 |
actual_prob = probs[pos, target_id].item()
|
| 58 |
rank = (probs[pos] > actual_prob).sum().item() + 1
|
| 59 |
|
| 60 |
+
topk_list = [[top_ids[pos, i].item(), round(top_probs[pos, i].item(), 6)] for i in range(k)]
|
|
|
|
|
|
|
|
|
|
| 61 |
results.append([target_id, rank, topk_list])
|
| 62 |
|
| 63 |
return results
|
|
|
|
| 82 |
|
| 83 |
|
| 84 |
@torch.no_grad()
|
| 85 |
+
def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_newline_token") -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 86 |
"""
|
| 87 |
Evaluate a HuggingFace model on a single text sample.
|
| 88 |
|
|
|
|
| 96 |
dict with byte_wise_losses, top5_predictions, compression_rate, etc.
|
| 97 |
"""
|
| 98 |
# Create token-to-bytes converter
|
| 99 |
+
token2bytes_converter = TokenizerBytesConverter(model_name_or_path=tokenizer.name_or_path, tokenizer=tokenizer)
|
|
|
|
|
|
|
|
|
|
| 100 |
|
| 101 |
# Determine BOS token
|
| 102 |
+
bos_token = tokenizer.encode("\n")[0]
|
| 103 |
+
# if bos_mode in ["add_default_bos", "replace_with_bos"]:
|
| 104 |
+
# bos_token = tokenizer.bos_token_id
|
| 105 |
+
# elif bos_mode in ["add_default_eos", "replace_with_eos"]:
|
| 106 |
+
# bos_token = tokenizer.eos_token_id
|
| 107 |
+
# elif bos_mode in ["add_newline_token", "replace_with_newline_token"]:
|
| 108 |
+
# bos_token = tokenizer.encode("\n")[0]
|
| 109 |
+
# else:
|
| 110 |
+
# bos_token = tokenizer.bos_token_id
|
| 111 |
|
| 112 |
bos_tensor = torch.tensor([bos_token], device=model.device).unsqueeze(0)
|
| 113 |
|
|
|
|
| 139 |
raise ValueError("Token bytes don't match original text bytes")
|
| 140 |
|
| 141 |
# Extract top-k predictions
|
| 142 |
+
sample_topk = extract_topk_predictions(logit[:-1], input_chunk.squeeze(0)[1:])
|
|
|
|
|
|
|
| 143 |
|
| 144 |
# Calculate byte-wise losses
|
| 145 |
byte_wise_losses = []
|
|
|
|
| 171 |
"num_tokens": seq_length,
|
| 172 |
"num_bytes": num_bytes,
|
| 173 |
"model_name": getattr(model.config, "_name_or_path", "unknown"),
|
| 174 |
+
"tokenizer": tokenizer,
|
| 175 |
}
|
| 176 |
|
| 177 |
|
| 178 |
@torch.no_grad()
|
| 179 |
+
def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
"""
|
| 181 |
Evaluate a RWKV7 model on a single text sample.
|
| 182 |
|
|
|
|
| 225 |
token_bytes = [tokenizer.decodeBytes([token]) for token in input_chunk[1:]]
|
| 226 |
|
| 227 |
# Extract top-k predictions
|
| 228 |
+
sample_topk = extract_topk_predictions(logit[:-1], torch.tensor(input_chunk[1:]).to(device))
|
|
|
|
|
|
|
| 229 |
|
| 230 |
# Calculate byte-wise losses
|
| 231 |
byte_wise_losses = []
|
|
|
|
| 248 |
"num_tokens": input_length,
|
| 249 |
"num_bytes": num_bytes,
|
| 250 |
"model_name": "RWKV7-G1C-1.5B",
|
| 251 |
+
"tokenizer": tokenizer,
|
| 252 |
}
|
requirements.txt
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
torch>=2.0.0
|
| 2 |
-
transformers
|
| 3 |
tokenizers>=0.15.0
|
| 4 |
gradio>=5.15.0
|
| 5 |
numpy>=1.24.0
|
|
|
|
| 1 |
torch>=2.0.0
|
| 2 |
+
transformers==5.0.0rc0
|
| 3 |
tokenizers>=0.15.0
|
| 4 |
gradio>=5.15.0
|
| 5 |
numpy>=1.24.0
|