Update README.md
README.md
CHANGED
@@ -205,29 +205,7 @@ for i, (c, t) in enumerate(zip(chunks, token_pos)):
 ## Experimental
 The following script supports specifying a maximum number of tokens per chunk. If max_tokens_per_chunk is specified, the text is forcibly split at the best position seen so far whenever a chunk is about to exceed max_tokens_per_chunk and no token satisfies prob_threshold. If max_tokens_per_chunk is None, it behaves the same as the scripts above. This script can be seen as a new, experimental version of the scripts above.
 ```python
-
-from transformers import AutoTokenizer, BertForTokenClassification
-import math
-
-model_path = "tim1900/bert-chunker-3"
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_path,
-    padding_side="right",
-    model_max_length=255,
-    trust_remote_code=True,
-)
-
-device = "cpu" # or 'cuda'
-
-model = BertForTokenClassification.from_pretrained(
-    model_path,
-).to(device)
-
-def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=None):
-    # slide window chunking with a prob_threshold if max_tokens_per_chunk == None.
-    # If max_tokens_per_chunk is not None, slide window chunking with a prob_threshold, and, sometimes forced to choose a best possible position to chunk when it is about to exceed the max_tokens_per_chunk and no token satisfy the prob_threshold.
-
+def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400):
     with torch.no_grad():

         # slide context window chunking
@@ -251,16 +229,13 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
         best_logits = torch.finfo(torch.float32).min
         is_chunk_start = True

-
+        STEP = (MAX_TOKENS - 2) // 2
         print(f"Processing {input_ids.shape[1]} tokens...")
         while windows_end <= input_ids.shape[1]:

             windows_end = windows_start + MAX_TOKENS - 2
-
             ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
-
             ids = ids.to(model.device)
-
             output = model(
                 input_ids=ids,
                 attention_mask=torch.ones(1, ids.shape[1], device=model.device),
@@ -281,13 +256,14 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk


             unchunk_tokens_this_window = greater_rows_indices[0] if greater_rows_indices[0] != 0 else greater_rows_indices[1]  # exclude the first index
+
             # manually chunk
-            if ...
+            if unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
                 big_windows_end = max_tokens_per_chunk - unchunk_tokens
                 if is_chunk_start:
                     max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 else:
-                    max_value, max_index = logit_diff[...
+                    max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 if best_logits < max_value:
                     backup_pos = windows_start + max_index

@@ -295,8 +271,8 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk


                 split_str_pos = [tokens.token_to_chars(backup_pos + 1).start]
-                split_str_poses ...
-                token_pos ...
+                split_str_poses = split_str_poses + split_str_pos
+                token_pos = token_pos + [backup_pos]
                 best_logits = torch.finfo(torch.float32).min
                 backup_pos = -1
                 unchunk_tokens = 0
@@ -305,51 +281,51 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
             # auto chunk
             else:
                 split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
-                split_str_poses ...
-                token_pos ...
+                split_str_poses = split_str_poses + split_str_pos
+                token_pos = token_pos + [sp + windows_start for sp in greater_rows_indices if sp > 0]

                 windows_start = greater_rows_indices[-1] + windows_start
+                best_logits = torch.finfo(torch.float32).min
+                backup_pos = -1
+                unchunk_tokens = 0
                 is_chunk_start = True

         else:

-            unchunk_tokens_this_window = (windows_end - windows_start)
+            unchunk_tokens_this_window = min(windows_end - windows_start, STEP)
             # manually chunk
-            if ...
+            if unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
                 big_windows_end = max_tokens_per_chunk - unchunk_tokens
                 if is_chunk_start:
                     max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 else:
-                    max_value, max_index = logit_diff[...
+                    max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 if best_logits < max_value:
                     backup_pos = windows_start + max_index


                 windows_start = backup_pos
                 split_str_pos = [tokens.token_to_chars(backup_pos + 1).start]
-                split_str_poses ...
-                token_pos ...
+                split_str_poses = split_str_poses + split_str_pos
+                token_pos = token_pos + [backup_pos]
                 best_logits = torch.finfo(torch.float32).min
                 backup_pos = -1
                 unchunk_tokens = 0
                 is_chunk_start = True
             else:
                 # auto leave
-                if ...
-
-                    # is chunk start, need to rule out first position
-                    max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1
+                if is_chunk_start:
+                    max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1

-
-                    max_value, max_index = logit_diff[...
-
-
-
-
-                    unchunk_tokens ...
-                    windows_start = ...
+                else:
+                    max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1
+                if best_logits < max_value:
+                    best_logits = max_value
+                    backup_pos = windows_start + max_index
+
+                unchunk_tokens = unchunk_tokens + STEP
+                windows_start = windows_start + STEP
                 is_chunk_start = False
-

     substrings = [
         text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
@@ -357,59 +333,6 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
     token_pos = [0] + token_pos
     return substrings, token_pos

-
-
-# chunking code docs
-print("\n>>>>>>>>> Chunking code docs...")
-doc = r"""
-Of course, as our first example shows, it is not always _necessary_ to declare an expression holder before it is created or used. But doing so provides an extra measure of clarity to models, so we strongly recommend it.
-
-## Chapter 4 The Basics
-
-## Chapter 5 The DCP Ruleset
-
-### 5.1 A taxonomy of curvature
-
-In disciplined convex programming, a scalar expression is classified by its _curvature_. There are four categories of curvature: _constant_, _affine_, _convex_, and _concave_. For a function \(f:\mathbf{R}^{n}\rightarrow\mathbf{R}\) defined on all of \(\mathbf{R}^{n}\), the categories have the following meanings:
-
-\[\begin{array}{llll}\text{constant}&f(\alpha x+(1-\alpha)y)=f(x)&\forall x,y\in \mathbf{R}^{n},\;\alpha\in\mathbf{R}\\ \text{affine}&f(\alpha x+(1-\alpha)y)=\alpha f(x)+(1-\alpha)f(y)&\forall x,y\in \mathbf{R}^{n},\;\alpha\in\mathbf{R}\\ \text{convex}&f(\alpha x+(1-\alpha)y)\leq\alpha f(x)+(1-\alpha)f(y)&\forall x,y \in\mathbf{R}^{n},\;\alpha\in[0,1]\\ \text{concave}&f(\alpha x+(1-\alpha)y)\geq\alpha f(x)+(1-\alpha)f(y)&\forall x,y \in\mathbf{R}^{n},\;\alpha\in[0,1]\end{array}\]
-
-Of course, there is significant overlap in these categories. For example, constant expressions are also affine, and (real) affine expressions are both convex and concave.
-
-Convex and concave expressions are real by definition. Complex constant and affine expressions can be constructed, but their usage is more limited; for example, they cannot appear as the left- or right-hand side of an inequality constraint.
-
-### Top-level rules
-
-CVX supports three different types of disciplined convex programs:
-
-* A _minimization problem_, consisting of a convex objective function and zero or more constraints.
-* A _maximization problem_, consisting of a concave objective function and zero or more constraints.
-* A _feasibility problem_, consisting of one or more constraints and no objective.
-
-### Constraints
-
-Three types of constraints may be specified in disciplined convex programs:
-
-* An _equality constraint_, constructed using \(==\), where both sides are affine.
-* A _less-than inequality constraint_, using \(<=\), where the left side is convex and the right side is concave.
-* A _greater-than inequality constraint_, using \(>=\), where the left side is concave and the right side is convex.
-
-_Non_-equality constraints, constructed using \(\sim=\), are never allowed. (Such constraints are not convex.)
-
-One or both sides of an equality constraint may be complex; inequality constraints, on the other hand, must be real. A complex equality constraint is equivalent to two real equality constraints, one for the real part and one for the imaginary part. An equality constraint with a real side and a complex side has the effect of constraining the imaginary part of the complex side to be zero."""
-# Chunk the text. The prob_threshold should be between (0, 1). The lower it is, the more chunks will be generated.
-# Therefore adjust it to your need, when prob_threshold is small like 0.000001, each token is one chunk,
-# when it is set to 1, the whole text will be one chunk.
-# slide window chunking with a prob_threshold if max_tokens_per_chunk == None.
-# If max_tokens_per_chunk is not None, slide window chunking with a prob_threshold, and, sometimes forced to choose a best possible position to chunk when it is about to exceed the max_tokens_per_chunk and no token satisfy the prob_threshold.
-chunks, token_pos = chunk_text(model, doc, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=None)
-
-# print chunks
-for i, (c, t) in enumerate(zip(chunks, token_pos)):
-    print(f"-----chunk: {i}----token_idx: {t}--------")
-    print(c)
-
-
 # chunking ads
 print("\n>>>>>>>>> Chunking ads...")

@@ -457,9 +380,8 @@ Published on: 6 August 2024"
 # Chunk the text. The prob_threshold should be between (0, 1). The lower it is, the more chunks will be generated.
 # Therefore adjust it to your need, when prob_threshold is small like 0.000001, each token is one chunk,
 # when it is set to 1, the whole text will be one chunk.
-# ...
-
-chunks, token_pos = chunk_text(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)
+# Slide window chunking with a prob_threshold; when a chunk is about to exceed max_tokens_per_chunk and no token satisfies prob_threshold, the chunker is forced to split at the best available position.
+chunks, token_pos = chunk_text_with_max_chunk_size(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)

 # print chunks
 for i, (c, t) in enumerate(zip(chunks, token_pos)):
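
The heart of the experimental behaviour is the bookkeeping around best_logits, backup_pos, and unchunk_tokens: while scanning, the function remembers the best-scoring position seen since the last split, and when the accumulated tokens are about to exceed max_tokens_per_chunk with no token clearing prob_threshold, it splits at that remembered position. Below is a minimal self-contained sketch of the same idea, using made-up per-token scores in place of the model's logit differences; `scores`, `threshold`, and `max_tokens` are illustrative names, not part of the model's API.

```python
# Toy illustration of the forced-split bookkeeping: made-up per-token
# "split scores" stand in for the model's logit differences.
scores = [0.1, 0.3, 0.2, 0.45, 0.05, 0.2, 0.1, 0.4, 0.3, 0.2]
threshold = 0.5   # nothing clears it here, so every split must be forced
max_tokens = 4    # cap on tokens per chunk

splits = []
best_score, backup_pos, unchunk_tokens = float("-inf"), -1, 0
pos = 0
while pos < len(scores):
    if scores[pos] > threshold:
        # "auto chunk": a confident split point, reset the bookkeeping
        splits.append(pos)
        best_score, backup_pos, unchunk_tokens = float("-inf"), -1, 0
    else:
        if scores[pos] > best_score:
            # remember the best position seen since the last split
            best_score, backup_pos = scores[pos], pos
        unchunk_tokens += 1
        if unchunk_tokens >= max_tokens:
            # "manually chunk": about to exceed the cap with no confident
            # split, so fall back to the remembered best position
            splits.append(backup_pos)
            pos = backup_pos  # resume scanning right after the forced split
            best_score, backup_pos, unchunk_tokens = float("-inf"), -1, 0
    pos += 1

print(splits)  # [3, 7]: the highest-scoring position within each cap
```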
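
The substring assembly at the end of the function is unchanged: `text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])` pairs each split offset with the next one. A tiny standalone illustration of that zip pattern:

```python
# How split offsets become substrings: zip pairs each boundary with the next.
text = "aaa bbb ccc"
split_str_poses = [4, 8]  # character offsets chosen by the chunker
substrings = [
    text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
]
print(substrings)  # ['aaa ', 'bbb ', 'ccc']
```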
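
A minimal end-to-end sketch of calling the new function, assuming the diff above has been applied and reusing the setup shown earlier in the README (the experimental section no longer repeats it); `text` is a placeholder for any document you want to chunk:

```python
import torch  # used inside the chunking function via torch.no_grad()
from transformers import AutoTokenizer, BertForTokenClassification

model_path = "tim1900/bert-chunker-3"
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    model_max_length=255,
    trust_remote_code=True,
)
device = "cpu"  # or 'cuda'
model = BertForTokenClassification.from_pretrained(model_path).to(device)

text = "..."  # any long document to be chunked

chunks, token_pos = chunk_text_with_max_chunk_size(
    model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400
)
for i, (c, t) in enumerate(zip(chunks, token_pos)):
    print(f"-----chunk: {i}----token_idx: {t}--------")
    print(c)
```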