Update README.md
README.md CHANGED
@@ -205,6 +205,25 @@ for i, (c, t) in enumerate(zip(chunks, token_pos)):
## Experimental

The following script supports specifying a maximum number of tokens per chunk. When a chunk is about to exceed `max_tokens_per_chunk` and no token satisfies the `prob_threshold`, the chunker is forced to split at the best candidate position seen so far. This script can be seen as a new, experimental version of the scripts above.

```python
import torch
from transformers import AutoTokenizer, BertForTokenClassification
import math

model_path = "tim1900/bert-chunker-3"

tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    padding_side="right",
    model_max_length=255,
    trust_remote_code=True,
)

device = "cpu"  # or "cuda"

model = BertForTokenClassification.from_pretrained(
    model_path,
).to(device)

def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400):
    with torch.no_grad():
        ...  # (function body truncated in this diff view)
```
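
For orientation, here is a minimal sketch of how such a forced fallback can work, assuming a token-classification head where the probability of label 1 marks a split point. Everything about it is illustrative: `sketch_chunk_with_fallback` is a hypothetical name, it skips the sliding-window handling a real implementation needs for inputs longer than the model's context, and it returns only the chunk strings rather than the `(chunks, token_pos)` pair the actual function produces.

```python
# A minimal sketch of threshold chunking with a max-size fallback -- an
# illustration, not the actual bert-chunker-3 implementation. Assumptions:
# label 1 of the token-classification head means "split here", and the text
# fits into a single forward pass (no sliding window over long inputs).
def sketch_chunk_with_fallback(model, text, tokenizer,
                               prob_threshold=0.5, max_tokens_per_chunk=400):
    with torch.no_grad():
        enc = tokenizer(text, return_offsets_mapping=True, add_special_tokens=False)
        ids = torch.tensor([enc["input_ids"]], device=model.device)
        probs = torch.softmax(model(ids).logits[0], dim=-1)[:, 1]  # P(split) per token

    offsets = enc["offset_mapping"]  # (char_start, char_end) for each token
    chunks, start = [], 0
    best_pos, best_prob = None, -1.0
    for i, p in enumerate(probs.tolist()):
        if p > best_prob:                        # remember best fallback position
            best_pos, best_prob = i, p
        if p > prob_threshold:                   # confident split at this token
            split = i
        elif i - start + 1 >= max_tokens_per_chunk:
            split = best_pos                     # forced: reuse best position seen
        else:
            continue
        chunks.append(text[offsets[start][0]:offsets[split][1]])
        start, best_pos, best_prob = split + 1, None, -1.0
    if start < len(offsets):                     # trailing remainder
        chunks.append(text[offsets[start][0]:])
    return chunks
```

The key idea is simply to keep track of the highest-probability position inside the current chunk, so there is always a sensible place to split when the size cap forces a cut.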

@@ -379,8 +398,7 @@ Published on: 6 August 2024"

```python
"""
# Chunk the text. The prob_threshold should be between (0, 1): the lower it is, the more chunks are generated.
# Adjust it to your needs: when prob_threshold is very small, like 0.000001, every token becomes its own chunk;
# when it is set to 1, the whole text becomes one chunk. The chunker is forced to split at the best candidate
# position seen so far whenever a chunk is about to exceed max_tokens_per_chunk and no token satisfies prob_threshold.
chunks, token_pos = chunk_text_with_max_chunk_size(model, ad, tokenizer, prob_threshold=0.5, max_tokens_per_chunk=400)
# print chunks
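# A simple way to print them; the loop header matches the context line shown
# in this diff, but the exact output format is an assumption.
for i, (c, t) in enumerate(zip(chunks, token_pos)):
    print(f"-----chunk: {i}----token_index: {t}--------")
    print(c)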
```