Update README.md
Browse files
README.md
CHANGED
|
@@ -284,6 +284,11 @@ def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5,ma
|
|
| 284 |
|
| 285 |
# auto chunk
|
| 286 |
else:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 287 |
split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
|
| 288 |
split_str_poses = split_str_poses + split_str_pos
|
| 289 |
token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
|
|
|
|
| 284 |
|
| 285 |
# auto chunk
|
| 286 |
else:
|
| 287 |
+
if len(greater_rows_indices) >= 2:
|
| 288 |
+
for gi, (gri0,gri1) in enumerate(zip(greater_rows_indices[:-1],greater_rows_indices[1:])):
|
| 289 |
+
if gri1 - gri0 > max_tokens_per_chunk:
|
| 290 |
+
greater_rows_indices=greater_rows_indices[:gi+1]
|
| 291 |
+
break
|
| 292 |
split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
|
| 293 |
split_str_poses = split_str_poses + split_str_pos
|
| 294 |
token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
|