tim1900 committed on
Commit
20ec6bd
·
verified ·
1 Parent(s): 9e5107e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -0
README.md CHANGED
@@ -284,6 +284,11 @@ def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5,ma
284
 
285
  # auto chunk
286
  else:
 
 
 
 
 
287
  split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
288
  split_str_poses = split_str_poses + split_str_pos
289
  token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
 
284
 
285
  # auto chunk
286
  else:
287
+ if len(greater_rows_indices) >= 2:
288
+ for gi, (gri0,gri1) in enumerate(zip(greater_rows_indices[:-1],greater_rows_indices[1:])):
289
+ if gri1 - gri0 > max_tokens_per_chunk:
290
+ greater_rows_indices=greater_rows_indices[:gi+1]
291
+ break
292
  split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
293
  split_str_poses = split_str_poses + split_str_pos
294
  token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]