tim1900 committed on
Commit
53eb91e
·
verified ·
1 Parent(s): b9567d4

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -0
README.md CHANGED
@@ -299,6 +299,11 @@ def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5,ma
299
 
300
  # auto chunk
301
  else:
 
 
 
 
 
302
  split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
303
  split_str_poses = split_str_poses + split_str_pos
304
  token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
 
299
 
300
  # auto chunk
301
  else:
302
+ if len(greater_rows_indices) >= 2:
303
+ for gi, (gri0,gri1) in enumerate(zip(greater_rows_indices[:-1],greater_rows_indices[1:])):
304
+ if gri1 - gri0 > max_tokens_per_chunk:
305
+ greater_rows_indices=greater_rows_indices[:gi+1]
306
+ break
307
  split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
308
  split_str_poses = split_str_poses + split_str_pos
309
  token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]