tim1900 commited on
Commit
18d98e3
·
verified ·
1 Parent(s): 75f7e3e

Upload 7 files

Browse files
README.md CHANGED
@@ -1,3 +1,55 @@
1
  ---
2
- license: mit
 
 
 
3
  ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
+ language:
3
+ - en
4
+ - zh
5
+ pipeline_tag: token-classification
6
  ---
7
+ # bert-chunker-chinese
8
+
9
+ ## Introduction
10
+
11
+ bert-chunker-chinese is a Chinese text chunker based on BERT with a classifier head that predicts the start token of chunks (for use in RAG, etc.); using a sliding window, it cuts documents of any size into chunks. It was finetuned on top of [bge-small-zh-v1.5](https://huggingface.co/BAAI/bge-small-zh-v1.5).
12
+
13
+ This repo includes the model checkpoint, the BertChunker class definition file, and all other files needed.
14
+
15
+ ## Quickstart
16
+ Download this repository, change into its directory, and run the following:
17
+
18
+ ```python
19
+ # -*- coding: utf-8 -*-
20
+ import safetensors
21
+ from transformers import AutoConfig,AutoTokenizer
22
+ from modeling_bertchunke_zh import BertChunker
23
+
24
+ # load config and tokenizer
25
+ config = AutoConfig.from_pretrained(
26
+ "tim1900/bert-chunker-chinese",
27
+ trust_remote_code=True,
28
+ )
29
+ tokenizer = AutoTokenizer.from_pretrained(
30
+ "tim1900/bert-chunker-chinese",
31
+ padding_side="right",
32
+ model_max_length=config.max_position_embeddings,
33
+ trust_remote_code=True,
34
+ )
35
+
36
+ # initialize model
37
+ model = BertChunker(config)
38
+ device='cpu' # or 'cuda'
39
+ model.to(device)
40
+
41
+ # load tim1900/bert-chunker-chinese/model.safetensors
42
+ state_dict = safetensors.torch.load_file("./model.safetensors")
43
+ model.load_state_dict(state_dict)
44
+
45
+ # text to be chunked
46
+ text='''起点中文网(www.qidian.com)创立于2002年5月,是国内知名的原创文学网站,隶属于阅文集团旗下。起点中文网以推动中国原创文学事业为宗旨,长期致力于原创文学作者的挖掘与培养,并取得了巨大成果:2003年10月,起点中文网开启“在线收费阅读”服务,成为真正意义上的网络文学赢利模式的先锋之一,就此奠定了原创文学的行业基础。此后,起点又推出了作家福利、文学交互、内容发掘推广、版权管理等机制和体系,为原创文学的发展注入了巨大活力,有力推动了中国文学原创事业的发展。在清晨的微光中,一只孤独的猫头鹰在古老的橡树上低声吟唱,它的歌声如同夜色的回声,穿越了时间的迷雾。树叶在微风中轻轻摇曳,仿佛在诉说着古老的故事,每一个音符都带着森林的秘密。一位年轻的程序员正专注地敲打着键盘,代码的海洋在他眼前展开。他的手指在键盘上飞舞,如同钢琴家在演奏一曲复杂的交响乐。屏幕上的光标闪烁,仿佛在等待着下一个指令,引领他进入未知的数字世界。'''
47
+
48
+ # chunk the text. The lower threshold is, the more chunks will be generated. Can be negative or positive.
49
+ chunks=model.chunk_text(text, tokenizer, threshold=0.5)
50
+
51
+ # print chunks
52
+ for i, c in enumerate(chunks):
53
+ print(f'-----chunk: {i}------------')
54
+ print(c)
55
+ ```
config.json ADDED
@@ -0,0 +1,32 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/data/bge-small-zh-v1.5",
3
+ "architectures": [
4
+ "BertChunker"
5
+ ],
6
+ "attention_probs_dropout_prob": 0.1,
7
+ "classifier_dropout": null,
8
+ "gradient_checkpointing": false,
9
+ "hidden_act": "gelu",
10
+ "hidden_dropout_prob": 0.1,
11
+ "hidden_size": 512,
12
+ "id2label": {
13
+ "0": "LABEL_0"
14
+ },
15
+ "initializer_range": 0.02,
16
+ "intermediate_size": 2048,
17
+ "label2id": {
18
+ "LABEL_0": 0
19
+ },
20
+ "layer_norm_eps": 1e-12,
21
+ "max_position_embeddings": 512,
22
+ "model_type": "bert",
23
+ "num_attention_heads": 8,
24
+ "num_hidden_layers": 4,
25
+ "pad_token_id": 0,
26
+ "position_embedding_type": "absolute",
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.46.3",
29
+ "type_vocab_size": 2,
30
+ "use_cache": true,
31
+ "vocab_size": 21128
32
+ }
modeling_bertchunke_zh.py ADDED
@@ -0,0 +1,222 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers.modeling_utils import PreTrainedModel
2
+ from torch import nn
3
+ from transformers.models.bert.configuration_bert import BertConfig
4
+ from transformers.models.bert.modeling_bert import BertModel
5
+ import torch
6
+ import torch.nn.functional as F
7
class BertChunker(PreTrainedModel):
    """BERT-based text chunker.

    A ``BertModel`` backbone plus a linear head that classifies every token
    as chunk-start (class 1) or not (class 0).  The ``chunk_text*`` methods
    slide a context window over an arbitrarily long tokenized document and
    cut the text at predicted chunk-start tokens.
    """

    config_class = BertConfig

    def __init__(self, config):
        super().__init__(config)
        self.model = BertModel(config)
        # Binary classification head over token embeddings: [not-start, start].
        self.chunklayer = nn.Linear(config.hidden_size, 2)

    def forward(self, input_ids=None, attention_mask=None, labels=None, **kwargs):
        """Run the backbone and the chunk-start head.

        Returns the backbone output augmented with per-token ``"logits"``
        (batch x seq x 2) and, when ``labels`` is provided, ``"loss"``.
        """
        model_output = self.model(
            input_ids=input_ids, attention_mask=attention_mask, **kwargs
        )
        token_embeddings = model_output[0]
        logits = self.chunklayer(token_embeddings)
        model_output["logits"] = logits
        logits = logits.contiguous()
        # FIX: use identity test instead of `labels != None` (elementwise on tensors).
        if labels is not None:
            labels = labels.contiguous()
            # Flatten the tokens; CrossEntropyLoss ignores -100 labels by default.
            loss_fct = nn.CrossEntropyLoss()
            logits = logits.view(-1, logits.shape[-1])
            labels = labels.view(-1)
            # Enable model parallelism: move labels to the logits device.
            # (The original `labels.to(labels.device)` was a no-op.)
            labels = labels.to(logits.device)
            model_output["loss"] = loss_fct(logits, labels)

        return model_output

    @torch.no_grad()
    def chunk_text(self, text: str, tokenizer, threshold=0.5) -> list[str]:
        """Chunk ``text`` with a sliding context window.

        The lower ``threshold`` is, the more chunks are generated; it may be
        negative or positive.  Returns the list of chunk substrings.
        """
        MAX_TOKENS = self.model.config.max_position_embeddings
        tokens = tokenizer(text, return_tensors="pt", truncation=False)
        input_ids = tokens["input_ids"].to(self.device)
        # Keep the special tokens so each window can be framed as [CLS] ... [SEP].
        CLS = input_ids[:, 0].unsqueeze(0)
        SEP = input_ids[:, -1].unsqueeze(0)
        input_ids = input_ids[:, 1:-1]
        self.eval()
        split_str_poses = []

        windows_start = 0
        windows_end = 0

        while windows_end <= input_ids.shape[1]:
            windows_end = windows_start + MAX_TOKENS - 2

            ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
            ids = ids.to(self.device)

            output = self(
                input_ids=ids,
                attention_mask=torch.ones(1, ids.shape[1], device=self.device),
            )
            # Drop CLS/SEP positions; class-1 probability = chunk-start score.
            logits = output["logits"][:, 1:-1, :]
            chunk_probabilities = F.softmax(logits, dim=-1)[:, :, 1]
            chunk_decision = chunk_probabilities > threshold
            greater_rows_indices = torch.where(chunk_decision)[1].tolist()

            # If splits were found (and not only at window position 0), record
            # them and restart the window at the last split; otherwise jump a
            # full window forward.
            if greater_rows_indices and not (
                greater_rows_indices[0] == 0 and len(greater_rows_indices) == 1
            ):
                split_str_poses += [
                    tokens.token_to_chars(sp + windows_start + 1).start
                    for sp in greater_rows_indices
                ]
                windows_start = greater_rows_indices[-1] + windows_start
            else:
                windows_start = windows_end

        return [
            text[i:j]
            for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
        ]

    @torch.no_grad()
    def chunk_text_smooth(self, text: str, tokenizer, threshold=0) -> list[str]:
        """Chunk ``text`` by averaging chunk-start probabilities over
        overlapping windows (half-window stride) before thresholding."""
        MAX_TOKENS = self.model.config.max_position_embeddings
        tokens = tokenizer(text, return_tensors="pt", truncation=False)
        input_ids = tokens["input_ids"].to(self.device)
        CLS = input_ids[:, 0].unsqueeze(0)
        SEP = input_ids[:, -1].unsqueeze(0)
        input_ids = input_ids[:, 1:-1]
        self.eval()

        # One probability list per token; every overlapping window that covers
        # the token contributes one entry.
        prob_pair_list = [[] for _ in range(input_ids.shape[1])]

        windows_start = 0
        while windows_start <= input_ids.shape[1]:
            windows_end = windows_start + MAX_TOKENS - 2

            ids = torch.cat((CLS, input_ids[:, windows_start:windows_end], SEP), 1)
            ids = ids.to(self.device)

            output = self(
                input_ids=ids,
                attention_mask=torch.ones(1, ids.shape[1], device=self.device),
            )
            logits = output["logits"][:, 1:-1, :]
            chunk_probabilities = F.softmax(logits, dim=-1).tolist()

            for i in range(windows_start, windows_start + len(chunk_probabilities[0])):
                prob_pair_list[i].append(chunk_probabilities[0][i - windows_start][1])

            # Half-window stride so every token is scored by at least one window.
            windows_start = windows_start + MAX_TOKENS // 2 - 1

        split_str_poses = [
            tokens.token_to_chars(i + 1).start
            for i, probs in enumerate(prob_pair_list)
            if sum(probs) / len(probs) > threshold
        ]

        return [
            text[i:j]
            for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
        ]

    @torch.no_grad()
    def chunk_text_fast(
        self, text: str, tokenizer, batchsize=20, threshold=0
    ) -> list[str]:
        """Chunk ``text`` faster with fixed non-overlapping windows;
        ``batchsize`` is the number of windows run per forward pass."""
        self.eval()

        split_str_poses = []
        MAX_TOKENS = self.model.config.max_position_embeddings
        USEFUL_TOKENS = MAX_TOKENS - 2  # room left after [CLS] and [SEP]
        tokens = tokenizer(text, return_tensors="pt", truncation=False)
        input_ids = tokens["input_ids"]

        CLS = tokenizer.cls_token_id
        SEP = tokenizer.sep_token_id

        input_ids = input_ids[:, 1:-1].squeeze().contiguous()  # delete cls and sep

        token_num = input_ids.shape[0]
        seq_num = token_num // USEFUL_TOKENS
        left_token_num = token_num % USEFUL_TOKENS

        if seq_num > 0:
            reshaped_input_ids = input_ids[: seq_num * USEFUL_TOKENS].view(
                seq_num, USEFUL_TOKENS
            )

            # 1-based position of every window token in the full tokenization
            # (bias accounts for the leading [CLS] of the original encoding).
            i = torch.arange(seq_num).unsqueeze(1)
            j = torch.arange(USEFUL_TOKENS).repeat(seq_num, 1)
            bias = 1
            position_id = (i * USEFUL_TOKENS + j + bias).to(self.device)

            reshaped_input_ids = torch.cat(
                (
                    torch.full((reshaped_input_ids.shape[0], 1), CLS),
                    reshaped_input_ids,
                    torch.full((reshaped_input_ids.shape[0], 1), SEP),
                ),
                1,
            )

            batch_num = seq_num // batchsize
            left_seq_num = seq_num % batchsize
            for b in range(batch_num):
                # FIX: slice by batch offset.  The original used the loop index
                # directly (`[i : i + batchsize]`), re-scoring overlapping rows
                # and silently skipping most windows.
                rows = slice(b * batchsize, (b + 1) * batchsize)
                batch_input = reshaped_input_ids[rows, :].to(self.device)
                attention_mask = torch.ones(
                    batch_input.shape[0], batch_input.shape[1], device=self.device
                )
                output = self(input_ids=batch_input, attention_mask=attention_mask)
                logits = output["logits"][:, 1:-1, :]  # delete cls and sep
                is_left_greater = (logits[:, :, 0] + threshold) < logits[:, :, 1]
                pos = is_left_greater * position_id[rows, :]
                pos = pos[pos > 0].tolist()
                split_str_poses += [tokens.token_to_chars(p).start for p in pos]
            if left_seq_num > 0:
                batch_input = reshaped_input_ids[-left_seq_num:, :].to(self.device)
                attention_mask = torch.ones(
                    batch_input.shape[0], batch_input.shape[1], device=self.device
                )
                output = self(input_ids=batch_input, attention_mask=attention_mask)
                logits = output["logits"][:, 1:-1, :]  # delete cls and sep
                is_left_greater = (logits[:, :, 0] + threshold) < logits[:, :, 1]
                pos = is_left_greater * position_id[-left_seq_num:, :]
                pos = pos[pos > 0].tolist()
                split_str_poses += [tokens.token_to_chars(p).start for p in pos]

        if left_token_num > 0:
            # Trailing tokens that do not fill a whole window.
            left_input_ids = torch.cat(
                [torch.tensor([CLS]), input_ids[-left_token_num:], torch.tensor([SEP])]
            )
            left_input_ids = left_input_ids.unsqueeze(0).to(self.device)
            attention_mask = torch.ones(
                left_input_ids.shape[0], left_input_ids.shape[1], device=self.device
            )
            output = self(input_ids=left_input_ids, attention_mask=attention_mask)
            logits = output["logits"][:, 1:-1, :]  # delete cls and sep
            is_left_greater = (logits[:, :, 0] + threshold) < logits[:, :, 1]
            bias = token_num - (left_input_ids.shape[1] - 2) + 1
            pos = (torch.where(is_left_greater)[1] + bias).tolist()
            split_str_poses += [tokens.token_to_chars(p).start for p in pos]

        return [
            text[i:j]
            for i, j in zip([0] + split_str_poses, split_str_poses + [len(text)])
        ]
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": {
3
+ "content": "[CLS]",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "mask_token": {
10
+ "content": "[MASK]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "sep_token": {
24
+ "content": "[SEP]",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ },
30
+ "unk_token": {
31
+ "content": "[UNK]",
32
+ "lstrip": false,
33
+ "normalized": false,
34
+ "rstrip": false,
35
+ "single_word": false
36
+ }
37
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[PAD]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "100": {
12
+ "content": "[UNK]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "101": {
20
+ "content": "[CLS]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "102": {
28
+ "content": "[SEP]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "103": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": true,
45
+ "cls_token": "[CLS]",
46
+ "do_basic_tokenize": true,
47
+ "do_lower_case": false,
48
+ "mask_token": "[MASK]",
49
+ "model_max_length": 512,
50
+ "never_split": null,
51
+ "pad_token": "[PAD]",
52
+ "padding_side": "right",
53
+ "sep_token": "[SEP]",
54
+ "strip_accents": null,
55
+ "tokenize_chinese_chars": true,
56
+ "tokenizer_class": "BertTokenizer",
57
+ "unk_token": "[UNK]"
58
+ }
vocab.txt ADDED
The diff for this file is too large to render. See raw diff