Update README.md
Browse files
README.md
CHANGED
|
@@ -299,6 +299,11 @@ def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5,ma
|
|
| 299 |
|
| 300 |
# auto chunk
|
| 301 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 302 |
split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
|
| 303 |
split_str_poses = split_str_poses + split_str_pos
|
| 304 |
token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
|
|
|
|
| 299 |
|
| 300 |
# auto chunk
|
| 301 |
else:
|
| 302 |
+
if len(greater_rows_indices) >= 2:
|
| 303 |
+
for gi, (gri0,gri1) in enumerate(zip(greater_rows_indices[:-1],greater_rows_indices[1:])):
|
| 304 |
+
if gri1 - gri0 > max_tokens_per_chunk:
|
| 305 |
+
greater_rows_indices=greater_rows_indices[:gi+1]
|
| 306 |
+
break
|
| 307 |
split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
|
| 308 |
split_str_poses = split_str_poses + split_str_pos
|
| 309 |
token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
|