tim1900 committed · verified
Commit aaed7e2 · 1 Parent(s): 7955bad

Update README.md

Files changed (1):
  1. README.md +29 -107
README.md CHANGED
@@ -205,29 +205,7 @@ for i, (c, t) in enumerate(zip(chunks, token_pos)):
## Experimental
The following script supports specifying a maximum number of tokens per chunk. If max_tokens_per_chunk is set, the text is forced to split at the best position seen so far whenever a chunk is about to exceed max_tokens_per_chunk and no token satisfies the prob_threshold. If max_tokens_per_chunk is None, it behaves the same as the scripts above. This script can be seen as a new, experimental version of the scripts above.
```python
- import torch
- from transformers import AutoTokenizer, BertForTokenClassification
- import math
-
- model_path = "tim1900/bert-chunker-3"
-
- tokenizer = AutoTokenizer.from_pretrained(
-     model_path,
-     padding_side="right",
-     model_max_length=255,
-     trust_remote_code=True,
- )
-
- device = "cpu"  # or "cuda"
-
- model = BertForTokenClassification.from_pretrained(
-     model_path,
- ).to(device)
-
- def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=None):
-     # Sliding-window chunking with a prob_threshold if max_tokens_per_chunk is None.
-     # If max_tokens_per_chunk is not None, sliding-window chunking with a prob_threshold that is sometimes forced to choose the best possible position to chunk when the chunk is about to exceed max_tokens_per_chunk and no token satisfies the prob_threshold.
-
+ def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400):
    with torch.no_grad():

        # slide context window chunking
@@ -251,16 +229,13 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
        best_logits = torch.finfo(torch.float32).min
        is_chunk_start = True

-
+         STEP = (MAX_TOKENS - 2) // 2
        print(f"Processing {input_ids.shape[1]} tokens...")
        while windows_end <= input_ids.shape[1]:

            windows_end = windows_start + MAX_TOKENS - 2
-
            ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
-
            ids = ids.to(model.device)
-
            output = model(
                input_ids=ids,
                attention_mask=torch.ones(1, ids.shape[1], device=model.device),
@@ -281,13 +256,14 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk


                unchunk_tokens_this_window = greater_rows_indices[0] if greater_rows_indices[0] != 0 else greater_rows_indices[1]  # exclude the first index
+
                # manually chunk
-                 if max_tokens_per_chunk is not None and unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
+                 if unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
                    big_windows_end = max_tokens_per_chunk - unchunk_tokens
                    if is_chunk_start:
                        max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                    else:
-                         max_value, max_index = logit_diff[:, :big_windows_end].max(), logit_diff[:, :big_windows_end].argmax()
+                         max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                    if best_logits < max_value:
                        backup_pos = windows_start + max_index

@@ -295,8 +271,8 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk


                    split_str_pos = [tokens.token_to_chars(backup_pos + 1).start]
-                     split_str_poses += split_str_pos
-                     token_pos += [backup_pos + 1]
+                     split_str_poses = split_str_poses + split_str_pos
+                     token_pos = token_pos + [backup_pos]
                    best_logits = torch.finfo(torch.float32).min
                    backup_pos = -1
                    unchunk_tokens = 0
@@ -305,51 +281,51 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
                # auto chunk
                else:
                    split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
-                     split_str_poses += split_str_pos
-                     token_pos += [sp + windows_start + 1 for sp in greater_rows_indices if sp > 0]
+                     split_str_poses = split_str_poses + split_str_pos
+                     token_pos = token_pos + [sp + windows_start for sp in greater_rows_indices if sp > 0]

                    windows_start = greater_rows_indices[-1] + windows_start
+                     best_logits = torch.finfo(torch.float32).min
+                     backup_pos = -1
+                     unchunk_tokens = 0
                    is_chunk_start = True

            else:

-                 unchunk_tokens_this_window = (windows_end - windows_start)
+                 unchunk_tokens_this_window = min(windows_end - windows_start, STEP)
                # manually chunk
-                 if max_tokens_per_chunk is not None and unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
+                 if unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
                    big_windows_end = max_tokens_per_chunk - unchunk_tokens
                    if is_chunk_start:
                        max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                    else:
-                         max_value, max_index = logit_diff[:, :big_windows_end].max(), logit_diff[:, :big_windows_end].argmax()
+                         max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                    if best_logits < max_value:
                        backup_pos = windows_start + max_index


                    windows_start = backup_pos
                    split_str_pos = [tokens.token_to_chars(backup_pos + 1).start]
-                     split_str_poses += split_str_pos
-                     token_pos += [backup_pos + 1]
+                     split_str_poses = split_str_poses + split_str_pos
+                     token_pos = token_pos + [backup_pos]
                    best_logits = torch.finfo(torch.float32).min
                    backup_pos = -1
                    unchunk_tokens = 0
                    is_chunk_start = True
                else:
                    # auto leave
-                     if max_tokens_per_chunk is not None:
-                         if is_chunk_start:
-                             # is chunk start, need to rule out first position
-                             max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1
+                     if is_chunk_start:
+                         max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1

-                         else:
-                             max_value, max_index = logit_diff[:, :].max(), logit_diff[:, :].argmax()
-                         if best_logits < max_value:
-                             best_logits = max_value
-                             backup_pos = windows_start + max_index
-
-                     unchunk_tokens += MAX_TOKENS - 2
-                     windows_start = windows_end
+                     else:
+                         max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1
+                     if best_logits < max_value:
+                         best_logits = max_value
+                         backup_pos = windows_start + max_index
+
+                     unchunk_tokens = unchunk_tokens + STEP
+                     windows_start = windows_start + STEP
                    is_chunk_start = False
-

        substrings = [
            text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
@@ -357,59 +333,6 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
        token_pos = [0] + token_pos
        return substrings, token_pos

-
-
- # chunking code docs
- print("\n>>>>>>>>> Chunking code docs...")
- doc = r"""
- Of course, as our first example shows, it is not always _necessary_ to declare an expression holder before it is created or used. But doing so provides an extra measure of clarity to models, so we strongly recommend it.
-
- ## Chapter 4 The Basics
-
- ## Chapter 5 The DCP Ruleset
-
- ### 5.1 A taxonomy of curvature
-
- In disciplined convex programming, a scalar expression is classified by its _curvature_. There are four categories of curvature: _constant_, _affine_, _convex_, and _concave_. For a function \(f:\mathbf{R}^{n}\rightarrow\mathbf{R}\) defined on all \(\mathbf{R}^{n}\), the categories have the following meanings:
-
- \[\begin{array}{llll}\text{constant}&f(\alpha x+(1-\alpha)y)=f(x)&\forall x,y\in \mathbf{R}^{n},\;\alpha\in\mathbf{R}\\ \text{affine}&f(\alpha x+(1-\alpha)y)=\alpha f(x)+(1-\alpha)f(y)&\forall x,y\in \mathbf{R}^{n},\;\alpha\in\mathbf{R}\\ \text{convex}&f(\alpha x+(1-\alpha)y)\leq\alpha f(x)+(1-\alpha)f(y)&\forall x,y \in\mathbf{R}^{n},\;\alpha\in[0,1]\\ \text{concave}&f(\alpha x+(1-\alpha)y)\geq\alpha f(x)+(1-\alpha)f(y)&\forall x,y \in\mathbf{R}^{n},\;\alpha\in[0,1]\end{array}\]
-
- Of course, there is significant overlap in these categories. For example, constant expressions are also affine, and (real) affine expressions are both convex and concave.
-
- Convex and concave expressions are real by definition. Complex constant and affine expressions can be constructed, but their usage is more limited; for example, they cannot appear as the left- or right-hand side of an inequality constraint.
-
- ### Top-level rules
-
- CVX supports three different types of disciplined convex programs:
-
- * A _minimization problem_, consisting of a convex objective function and zero or more constraints.
- * A _maximization problem_, consisting of a concave objective function and zero or more constraints.
- * A _feasibility problem_, consisting of one or more constraints and no objective.
-
- ### Constraints
-
- Three types of constraints may be specified in disciplined convex programs:
-
- * An _equality constraint_, constructed using \(==\), where both sides are affine.
- * A _less-than inequality constraint_, using \(<=\), where the left side is convex and the right side is concave.
- * A _greater-than inequality constraint_, using \(>=\), where the left side is concave and the right side is convex.
-
- _Non_-equality constraints, constructed using \(\sim=\), are never allowed. (Such constraints are not convex.)
-
- One or both sides of an equality constraint may be complex; inequality constraints, on the other hand, must be real. A complex equality constraint is equivalent to two real equality constraints, one for the real part and one for the imaginary part. An equality constraint with a real side and a complex side has the effect of constraining the imaginary part of the complex side to be zero."""
- # Chunk the text. The prob_threshold should be in (0, 1): the lower it is, the more chunks are generated.
- # Adjust it to your needs: when prob_threshold is very small, like 0.000001, each token becomes its own chunk;
- # when it is set to 1, the whole text becomes one chunk.
- # Sliding-window chunking with a prob_threshold if max_tokens_per_chunk is None.
- # If max_tokens_per_chunk is not None, sliding-window chunking with a prob_threshold that is sometimes forced to choose the best possible position to chunk when the chunk is about to exceed max_tokens_per_chunk and no token satisfies the prob_threshold.
- chunks, token_pos = chunk_text(model, doc, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=None)
-
- # print chunks
- for i, (c, t) in enumerate(zip(chunks, token_pos)):
-     print(f"-----chunk: {i}----token_idx: {t}--------")
-     print(c)
-
-
# chunking ads
print("\n>>>>>>>>> Chunking ads...")

@@ -457,9 +380,8 @@ Published on: 6 August 2024"
# Chunk the text. The prob_threshold should be in (0, 1): the lower it is, the more chunks are generated.
# Adjust it to your needs: when prob_threshold is very small, like 0.000001, each token becomes its own chunk;
# when it is set to 1, the whole text becomes one chunk.
- # Sliding-window chunking with a prob_threshold if max_tokens_per_chunk is None.
- # If max_tokens_per_chunk is not None, sliding-window chunking with a prob_threshold that is sometimes forced to choose the best possible position to chunk when the chunk is about to exceed max_tokens_per_chunk and no token satisfies the prob_threshold.
- chunks, token_pos = chunk_text(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)
+ # Sliding-window chunking with a prob_threshold, forced to choose the best possible position to chunk when the chunk is about to exceed max_tokens_per_chunk and no token satisfies the prob_threshold.
+ chunks, token_pos = chunk_text_with_max_chunk_size(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)

# print chunks
for i, (c, t) in enumerate(zip(chunks, token_pos)):
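Taken together, the hunks above replace the section's self-contained script with a single function, chunk_text_with_max_chunk_size, that always enforces a token budget. For reference, a minimal usage sketch: the setup block is the one this commit removes from the section (the README's main example still provides it), and `text` is a placeholder for any input document.

```python
import torch
from transformers import AutoTokenizer, BertForTokenClassification

# Setup mirrors the block removed in the first hunk above.
model_path = "tim1900/bert-chunker-3"
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    model_max_length=255,
    trust_remote_code=True,
)
device = "cpu"  # or "cuda"
model = BertForTokenClassification.from_pretrained(model_path).to(device)

text = "..."  # placeholder: any document to chunk

# Splits at tokens whose split probability exceeds prob_threshold, and is
# forced to split at the best candidate seen so far whenever a chunk
# would exceed max_tokens_per_chunk.
chunks, token_pos = chunk_text_with_max_chunk_size(
    model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400
)
for i, (c, t) in enumerate(zip(chunks, token_pos)):
    print(f"-----chunk: {i}----token_idx: {t}--------")
    print(c)
```

The call mirrors the ad-chunking example in the last hunk; only the input differs.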
 
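One behavioral change is easy to miss in the hunks: in the "auto leave" branch, the old script advanced the window by a full stride (windows_start = windows_end, charging MAX_TOKENS - 2 tokens to the budget), while the new script advances by STEP = (MAX_TOKENS - 2) // 2 and caps unchunk_tokens_this_window at STEP, so consecutive windows overlap by half. A small sketch of the arithmetic, assuming MAX_TOKENS = 255 to match model_max_length=255 in the removed setup (MAX_TOKENS itself is defined in a part of the function the diff does not show):

```python
MAX_TOKENS = 255              # assumption: matches model_max_length above
STEP = (MAX_TOKENS - 2) // 2  # 126, the half-window stride this commit adds

windows_start = 0
# Old "auto leave": jump a whole window and charge it all to the budget.
old_next_start = windows_start + (MAX_TOKENS - 2)  # 253
old_budget_step = MAX_TOKENS - 2                   # 253

# New "auto leave": advance half a window, so the next window re-scores
# the second half of this one and can still place a split there.
new_next_start = windows_start + STEP  # 126
new_budget_step = STEP                 # 126

print(old_next_start, old_budget_step)  # 253 253
print(new_next_start, new_budget_step)  # 126 126
```

The overlap gives the forced-split fallback a second look at every token before it leaves the window, at the cost of roughly twice as many forward passes.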