tim1900 committed on
Commit
20ec6bd
·
verified ·
1 Parent(s): 9e5107e

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +5 -0
README.md CHANGED
@@ -284,6 +284,11 @@ def chunk_text_with_max_chunk_size(model, text, tokenizer, prob_threshold=0.5,ma
284
 
285
  # auto chunk
286
  else:
 
 
 
 
 
287
  split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
288
  split_str_poses = split_str_poses + split_str_pos
289
  token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]
 
284
 
285
  # auto chunk
286
  else:
287
+ if len(greater_rows_indices) >= 2:
288
+ for gi, (gri0,gri1) in enumerate(zip(greater_rows_indices[:-1],greater_rows_indices[1:])):
289
+ if gri1 - gri0 > max_tokens_per_chunk:
290
+ greater_rows_indices=greater_rows_indices[:gi+1]
291
+ break
292
  split_str_pos = [tokens.token_to_chars(sp + windows_start + 1).start for sp in greater_rows_indices if sp > 0]
293
  split_str_poses = split_str_poses + split_str_pos
294
  token_pos = token_pos+ [sp + windows_start for sp in greater_rows_indices if sp > 0]