Jellyfish042 Claude Sonnet 4.5 committed on
Commit
a2836e3
·
1 Parent(s): cddd3a5

Update transformers version and simplify evaluator code

Browse files

- Upgrade transformers to 5.0.0rc0
- Simplify BOS token logic to always use newline token
- Apply code formatting improvements

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>

Files changed (2) hide show
  1. core/evaluator.py +17 -35
  2. requirements.txt +1 -1
core/evaluator.py CHANGED
@@ -57,10 +57,7 @@ def extract_topk_predictions(logit: torch.Tensor, target_ids: torch.Tensor, k: i
57
  actual_prob = probs[pos, target_id].item()
58
  rank = (probs[pos] > actual_prob).sum().item() + 1
59
 
60
- topk_list = [
61
- [top_ids[pos, i].item(), round(top_probs[pos, i].item(), 6)]
62
- for i in range(k)
63
- ]
64
  results.append([target_id, rank, topk_list])
65
 
66
  return results
@@ -85,12 +82,7 @@ def count_rwkv_parameters_in_billions(rwkv_model) -> float:
85
 
86
 
87
  @torch.no_grad()
88
- def evaluate_hf_single_sample(
89
- model,
90
- tokenizer,
91
- text: str,
92
- bos_mode: str = "add_newline_token"
93
- ) -> Dict[str, Any]:
94
  """
95
  Evaluate a HuggingFace model on a single text sample.
96
 
@@ -104,20 +96,18 @@ def evaluate_hf_single_sample(
104
  dict with byte_wise_losses, top5_predictions, compression_rate, etc.
105
  """
106
  # Create token-to-bytes converter
107
- token2bytes_converter = TokenizerBytesConverter(
108
- model_name_or_path=tokenizer.name_or_path,
109
- tokenizer=tokenizer
110
- )
111
 
112
  # Determine BOS token
113
- if bos_mode in ["add_default_bos", "replace_with_bos"]:
114
- bos_token = tokenizer.bos_token_id
115
- elif bos_mode in ["add_default_eos", "replace_with_eos"]:
116
- bos_token = tokenizer.eos_token_id
117
- elif bos_mode in ["add_newline_token", "replace_with_newline_token"]:
118
- bos_token = tokenizer.encode("\n")[0]
119
- else:
120
- bos_token = tokenizer.bos_token_id
 
121
 
122
  bos_tensor = torch.tensor([bos_token], device=model.device).unsqueeze(0)
123
 
@@ -149,9 +139,7 @@ def evaluate_hf_single_sample(
149
  raise ValueError("Token bytes don't match original text bytes")
150
 
151
  # Extract top-k predictions
152
- sample_topk = extract_topk_predictions(
153
- logit[:-1], input_chunk.squeeze(0)[1:]
154
- )
155
 
156
  # Calculate byte-wise losses
157
  byte_wise_losses = []
@@ -183,16 +171,12 @@ def evaluate_hf_single_sample(
183
  "num_tokens": seq_length,
184
  "num_bytes": num_bytes,
185
  "model_name": getattr(model.config, "_name_or_path", "unknown"),
186
- "tokenizer": tokenizer
187
  }
188
 
189
 
190
  @torch.no_grad()
191
- def evaluate_rwkv7_single_sample(
192
- model,
193
- tokenizer,
194
- text: str
195
- ) -> Dict[str, Any]:
196
  """
197
  Evaluate a RWKV7 model on a single text sample.
198
 
@@ -241,9 +225,7 @@ def evaluate_rwkv7_single_sample(
241
  token_bytes = [tokenizer.decodeBytes([token]) for token in input_chunk[1:]]
242
 
243
  # Extract top-k predictions
244
- sample_topk = extract_topk_predictions(
245
- logit[:-1], torch.tensor(input_chunk[1:]).to(device)
246
- )
247
 
248
  # Calculate byte-wise losses
249
  byte_wise_losses = []
@@ -266,5 +248,5 @@ def evaluate_rwkv7_single_sample(
266
  "num_tokens": input_length,
267
  "num_bytes": num_bytes,
268
  "model_name": "RWKV7-G1C-1.5B",
269
- "tokenizer": tokenizer
270
  }
 
57
  actual_prob = probs[pos, target_id].item()
58
  rank = (probs[pos] > actual_prob).sum().item() + 1
59
 
60
+ topk_list = [[top_ids[pos, i].item(), round(top_probs[pos, i].item(), 6)] for i in range(k)]
 
 
 
61
  results.append([target_id, rank, topk_list])
62
 
63
  return results
 
82
 
83
 
84
  @torch.no_grad()
85
+ def evaluate_hf_single_sample(model, tokenizer, text: str, bos_mode: str = "add_newline_token") -> Dict[str, Any]:
 
 
 
 
 
86
  """
87
  Evaluate a HuggingFace model on a single text sample.
88
 
 
96
  dict with byte_wise_losses, top5_predictions, compression_rate, etc.
97
  """
98
  # Create token-to-bytes converter
99
+ token2bytes_converter = TokenizerBytesConverter(model_name_or_path=tokenizer.name_or_path, tokenizer=tokenizer)
 
 
 
100
 
101
  # Determine BOS token
102
+ bos_token = tokenizer.encode("\n")[0]
103
+ # if bos_mode in ["add_default_bos", "replace_with_bos"]:
104
+ # bos_token = tokenizer.bos_token_id
105
+ # elif bos_mode in ["add_default_eos", "replace_with_eos"]:
106
+ # bos_token = tokenizer.eos_token_id
107
+ # elif bos_mode in ["add_newline_token", "replace_with_newline_token"]:
108
+ # bos_token = tokenizer.encode("\n")[0]
109
+ # else:
110
+ # bos_token = tokenizer.bos_token_id
111
 
112
  bos_tensor = torch.tensor([bos_token], device=model.device).unsqueeze(0)
113
 
 
139
  raise ValueError("Token bytes don't match original text bytes")
140
 
141
  # Extract top-k predictions
142
+ sample_topk = extract_topk_predictions(logit[:-1], input_chunk.squeeze(0)[1:])
 
 
143
 
144
  # Calculate byte-wise losses
145
  byte_wise_losses = []
 
171
  "num_tokens": seq_length,
172
  "num_bytes": num_bytes,
173
  "model_name": getattr(model.config, "_name_or_path", "unknown"),
174
+ "tokenizer": tokenizer,
175
  }
176
 
177
 
178
  @torch.no_grad()
179
+ def evaluate_rwkv7_single_sample(model, tokenizer, text: str) -> Dict[str, Any]:
 
 
 
 
180
  """
181
  Evaluate a RWKV7 model on a single text sample.
182
 
 
225
  token_bytes = [tokenizer.decodeBytes([token]) for token in input_chunk[1:]]
226
 
227
  # Extract top-k predictions
228
+ sample_topk = extract_topk_predictions(logit[:-1], torch.tensor(input_chunk[1:]).to(device))
 
 
229
 
230
  # Calculate byte-wise losses
231
  byte_wise_losses = []
 
248
  "num_tokens": input_length,
249
  "num_bytes": num_bytes,
250
  "model_name": "RWKV7-G1C-1.5B",
251
+ "tokenizer": tokenizer,
252
  }
requirements.txt CHANGED
@@ -1,5 +1,5 @@
1
  torch>=2.0.0
2
- transformers>=4.35.0
3
  tokenizers>=0.15.0
4
  gradio>=5.15.0
5
  numpy>=1.24.0
 
1
  torch>=2.0.0
2
+ transformers==5.0.0rc0
3
  tokenizers>=0.15.0
4
  gradio>=5.15.0
5
  numpy>=1.24.0