Update README.md
README.md
CHANGED
@@ -205,29 +205,7 @@ for i, (c, t) in enumerate(zip(chunks, token_pos)):
 ## Experimental
 The following script supports specifying a maximum number of tokens per chunk. If max_tokens_per_chunk is specified, the text is forcibly split at the best position seen so far whenever a chunk is about to exceed max_tokens_per_chunk and no token satisfies prob_threshold. If max_tokens_per_chunk is None, it behaves the same as the scripts above. This script can be seen as a new, experimental version of the scripts above.
 ```python
-
-from transformers import AutoTokenizer, BertForTokenClassification
-import math
-
-model_path = "tim1900/bert-chunker-3"
-
-tokenizer = AutoTokenizer.from_pretrained(
-    model_path,
-    padding_side="right",
-    model_max_length=255,
-    trust_remote_code=True,
-)
-
-device = "cpu" # or 'cuda'
-
-model = BertForTokenClassification.from_pretrained(
-    model_path,
-).to(device)
-
-def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=None):
-    # slide window chunking with a prob_threshold if max_tokens_per_chunk == None.
-    # If max_tokens_per_chunk is not None, slide window chunking with a prob_threshold, and, sometimes forced to choose a best possible position to chunk when it is about to exceed the max_tokens_per_chunk and no token satisfy the prob_threshold.
-
+def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400):
     with torch.no_grad():

         # slide context window chunking
@@ -251,16 +229,13 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
         best_logits = torch.finfo(torch.float32).min
         is_chunk_start = True

-
+        STEP = (MAX_TOKENS - 2) // 2
         print(f"Processing {input_ids.shape[1]} tokens...")
         while windows_end <= input_ids.shape[1]:

             windows_end = windows_start + MAX_TOKENS - 2
-
             ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
-
             ids = ids.to(model.device)
-
             output = model(
                 input_ids=ids,
                 attention_mask=torch.ones(1, ids.shape[1], device=model.device),
@@ -281,13 +256,14 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk


             unchunk_tokens_this_window = greater_rows_indices[0] if greater_rows_indices[0] != 0 else greater_rows_indices[1]  # exclude the first index
+
             # manually chunk
-            if ...
+            if unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
                 big_windows_end = max_tokens_per_chunk - unchunk_tokens
                 if is_chunk_start:
                     max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 else:
-                    max_value, max_index = logit_diff[...
+                    max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 if best_logits < max_value:
                     backup_pos = windows_start + max_index

@@ -295,8 +271,8 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk


                 split_str_pos = [tokens.token_to_chars(backup_pos + 1).start]
-                split_str_poses ...
-                token_pos ...
+                split_str_poses = split_str_poses + split_str_pos
+                token_pos = token_pos + [backup_pos]
                 best_logits = torch.finfo(torch.float32).min
                 backup_pos = -1
                 unchunk_tokens = 0
@@ -305,51 +281,51 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
             # auto chunk
             else:
                 split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
-                split_str_poses ...
-                token_pos ...
+                split_str_poses = split_str_poses + split_str_pos
+                token_pos = token_pos + [sp + windows_start for sp in greater_rows_indices if sp > 0]

                 windows_start = greater_rows_indices[-1] + windows_start
+                best_logits = torch.finfo(torch.float32).min
+                backup_pos = -1
+                unchunk_tokens = 0
                 is_chunk_start = True

         else:

-            unchunk_tokens_this_window = (windows_end - windows_start)
+            unchunk_tokens_this_window = min(windows_end - windows_start, STEP)
             # manually chunk
-            if ...
+            if unchunk_tokens + unchunk_tokens_this_window > max_tokens_per_chunk:
                 big_windows_end = max_tokens_per_chunk - unchunk_tokens
                 if is_chunk_start:
                     max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 else:
-                    max_value, max_index = logit_diff[...
+                    max_value, max_index = logit_diff[:, 1:big_windows_end].max(), logit_diff[:, 1:big_windows_end].argmax() + 1
                 if best_logits < max_value:
                     backup_pos = windows_start + max_index


                 windows_start = backup_pos
                 split_str_pos = [tokens.token_to_chars(backup_pos + 1).start]
-                split_str_poses ...
-                token_pos ...
+                split_str_poses = split_str_poses + split_str_pos
+                token_pos = token_pos + [backup_pos]
                 best_logits = torch.finfo(torch.float32).min
                 backup_pos = -1
                 unchunk_tokens = 0
                 is_chunk_start = True
             else:
                 # auto leave
-                if ...
-
-                    # is chunk start, need to rule out first position
-                    max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1
+                if is_chunk_start:
+                    max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1

-
-                    max_value, max_index = logit_diff[...
-
-
-
-
-                    unchunk_tokens ...
-                    windows_start = ...
+                else:
+                    max_value, max_index = logit_diff[:, 1:].max(), logit_diff[:, 1:].argmax() + 1
+                if best_logits < max_value:
+                    best_logits = max_value
+                    backup_pos = windows_start + max_index
+
+                unchunk_tokens = unchunk_tokens + STEP
+                windows_start = windows_start + STEP
                 is_chunk_start = False
-

     substrings = [
         text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
@@ -357,59 +333,6 @@ def chunk_text(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk
     token_pos = [0] + token_pos
     return substrings, token_pos

-
-
-# chunking code docs
-print("\n>>>>>>>>> Chunking code docs...")
-doc = r"""
-Of course, as our first example shows, it is not always _necessary_ to declare an expression holder before it is created or used. But doing so provides an extra measure of clarity to models, so we strongly recommend it.
-
-## Chapter 4 The Basics
-
-## Chapter 5 The DCP Ruleset
-
-### 5.1 A taxonomy of curvature
-
-In disciplined convex programming, a scalar expression is classified by its _curvature_. There are four categories of curvature: _constant_, _affine_, _convex_, and _concave_. For a function \(f:\mathbf{R}^{n}\rightarrow\mathbf{R}\) defined on all of \(\mathbf{R}^{n}\), the categories have the following meanings:
-
-\[\begin{array}{llll}\text{constant}&f(\alpha x+(1-\alpha)y)=f(x)&\forall x,y\in \mathbf{R}^{n},\;\alpha\in\mathbf{R}\\ \text{affine}&f(\alpha x+(1-\alpha)y)=\alpha f(x)+(1-\alpha)f(y)&\forall x,y\in \mathbf{R}^{n},\;\alpha\in\mathbf{R}\\ \text{convex}&f(\alpha x+(1-\alpha)y)\leq\alpha f(x)+(1-\alpha)f(y)&\forall x,y \in\mathbf{R}^{n},\;\alpha\in[0,1]\\ \text{concave}&f(\alpha x+(1-\alpha)y)\geq\alpha f(x)+(1-\alpha)f(y)&\forall x,y \in\mathbf{R}^{n},\;\alpha\in[0,1]\end{array}\]
-
-Of course, there is significant overlap in these categories. For example, constant expressions are also affine, and (real) affine expressions are both convex and concave.
-
-Convex and concave expressions are real by definition. Complex constant and affine expressions can be constructed, but their usage is more limited; for example, they cannot appear as the left- or right-hand side of an inequality constraint.
-
-### Top-level rules
-
-CVX supports three different types of disciplined convex programs:
-
-* A _minimization problem_, consisting of a convex objective function and zero or more constraints.
-* A _maximization problem_, consisting of a concave objective function and zero or more constraints.
-* A _feasibility problem_, consisting of one or more constraints and no objective.
-
-### Constraints
-
-Three types of constraints may be specified in disciplined convex programs:
-
-* An _equality constraint_, constructed using \(==\), where both sides are affine.
-* A _less-than inequality constraint_, using \(<=\), where the left side is convex and the right side is concave.
-* A _greater-than inequality constraint_, using \(>=\), where the left side is concave and the right side is convex.
-
-_Non_-equality constraints, constructed using \(\sim=\), are never allowed. (Such constraints are not convex.)
-
-One or both sides of an equality constraint may be complex; inequality constraints, on the other hand, must be real. A complex equality constraint is equivalent to two real equality constraints, one for the real part and one for the imaginary part. An equality constraint with a real side and a complex side has the effect of constraining the imaginary part of the complex side to be zero."""
-# Chunk the text. The prob_threshold should be between (0, 1). The lower it is, the more chunks will be generated.
-# Therefore adjust it to your need, when prob_threshold is small like 0.000001, each token is one chunk,
-# when it is set to 1, the whole text will be one chunk.
-# slide window chunking with a prob_threshold if max_tokens_per_chunk == None.
-# If max_tokens_per_chunk is not None, slide window chunking with a prob_threshold, and, sometimes forced to choose a best possible position to chunk when it is about to exceed the max_tokens_per_chunk and no token satisfy the prob_threshold.
-chunks, token_pos = chunk_text(model, doc, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=None)
-
-# print chunks
-for i, (c, t) in enumerate(zip(chunks, token_pos)):
-    print(f"-----chunk: {i}----token_idx: {t}--------")
-    print(c)
-
-
 # chunking ads
 print("\n>>>>>>>>> Chunking ads...")

@@ -457,9 +380,8 @@ Published on: 6 August 2024"
 # Chunk the text. The prob_threshold should be between (0, 1). The lower it is, the more chunks will be generated.
 # Therefore adjust it to your need, when prob_threshold is small like 0.000001, each token is one chunk,
 # when it is set to 1, the whole text will be one chunk.
-# ...
-
-chunks, token_pos = chunk_text(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)
+# Slide window chunking with a prob_threshold; when a chunk is about to exceed max_tokens_per_chunk and no token satisfies prob_threshold, the chunker is forced to split at the best available position.
+chunks, token_pos = chunk_text_with_max_chunk_size(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)

 # print chunks
 for i, (c, t) in enumerate(zip(chunks, token_pos)):
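
The heart of the experimental behaviour is the bookkeeping around best_logits, backup_pos, and unchunk_tokens: while scanning, the function remembers the best-scoring position seen since the last split, and when the accumulated tokens are about to exceed max_tokens_per_chunk with no token clearing prob_threshold, it splits at that remembered position. Below is a minimal self-contained sketch of the same idea, using made-up per-token scores in place of the model's logit differences; `scores`, `threshold`, and `max_tokens` are illustrative names, not part of the model's API.

```python
# Toy illustration of the forced-split bookkeeping: made-up per-token
# "split scores" stand in for the model's logit differences.
scores = [0.1, 0.3, 0.2, 0.45, 0.05, 0.2, 0.1, 0.4, 0.3, 0.2]
threshold = 0.5   # nothing clears it here, so every split must be forced
max_tokens = 4    # cap on tokens per chunk

splits = []
best_score, backup_pos, unchunk_tokens = float("-inf"), -1, 0
pos = 0
while pos < len(scores):
    if scores[pos] > threshold:
        # "auto chunk": a confident split point, reset the bookkeeping
        splits.append(pos)
        best_score, backup_pos, unchunk_tokens = float("-inf"), -1, 0
    else:
        if scores[pos] > best_score:
            # remember the best position seen since the last split
            best_score, backup_pos = scores[pos], pos
        unchunk_tokens += 1
        if unchunk_tokens >= max_tokens:
            # "manually chunk": about to exceed the cap with no confident
            # split, so fall back to the remembered best position
            splits.append(backup_pos)
            pos = backup_pos  # resume scanning right after the forced split
            best_score, backup_pos, unchunk_tokens = float("-inf"), -1, 0
    pos += 1

print(splits)  # [3, 7]: the highest-scoring position within each cap
```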
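
The substring assembly at the end of the function is unchanged: `text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])` pairs each split offset with the next one. A tiny standalone illustration of that zip pattern:

```python
# How split offsets become substrings: zip pairs each boundary with the next.
text = "aaa bbb ccc"
split_str_poses = [4, 8]  # character offsets chosen by the chunker
substrings = [
    text[i:j] for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
]
print(substrings)  # ['aaa ', 'bbb ', 'ccc']
```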
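
A minimal end-to-end sketch of calling the new function, assuming the diff above has been applied and reusing the setup shown earlier in the README (the experimental section no longer repeats it); `text` is a placeholder for any document you want to chunk:

```python
import torch  # used inside the chunking function via torch.no_grad()
from transformers import AutoTokenizer, BertForTokenClassification

model_path = "tim1900/bert-chunker-3"
tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    model_max_length=255,
    trust_remote_code=True,
)
device = "cpu"  # or 'cuda'
model = BertForTokenClassification.from_pretrained(model_path).to(device)

text = "..."  # any long document to be chunked

chunks, token_pos = chunk_text_with_max_chunk_size(
    model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400
)
for i, (c, t) in enumerate(zip(chunks, token_pos)):
    print(f"-----chunk: {i}----token_idx: {t}--------")
    print(c)
```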