Update README.md
Browse files
README.md
CHANGED
|
@@ -1,243 +1,77 @@
|
|
| 1 |
---
|
| 2 |
-
license: apache-2.0
|
| 3 |
-
language: zh
|
| 4 |
tags:
|
| 5 |
-
-
|
| 6 |
-
-
|
|
|
|
| 7 |
- chinese
|
| 8 |
-
-
|
| 9 |
-
|
| 10 |
-
widget:
|
| 11 |
-
- text: "新闻分类任务:【微软披露拓扑量子计算机计划!】这篇文章的类别是什么?故事/文化/娱乐/体育/财经/房产/汽车/教育/科技"
|
| 12 |
-
- type: "text-generation"
|
| 13 |
---
|
| 14 |
|
| 15 |
-
# Randeng-T5-784M-MultiTask-Chinese
|
| 16 |
|
| 17 |
-
|
| 18 |
-
- Github: [Fengshenbang-LM](https://github.com/IDEA-CCNL/Fengshenbang-LM)
|
| 19 |
|
| 20 |
-
##
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
-
|
| 25 |
|
| 26 |
-
|
| 27 |
|
| 28 |
-
|
| 29 |
|
| 30 |
-
|
| 31 |
|
| 32 |
-
|
|
|
|
| 33 |
|
| 34 |
-
|
| 35 |
-
| :----: | :----: | :----: | :----: | :----: | :----: |
|
| 36 |
-
| 通用 General | 自然语言转换 NLT | 燃灯 Randeng | MultiTask | 784M | 多任务-中文 MultiTask-Chinese |
|
| 37 |
|
| 38 |
-
|
| 39 |
-
## 模型信息 Model Information
|
| 40 |
-
|
| 41 |
-
参考论文:[Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](http://jmlr.org/papers/v21/20-074.html)
|
| 42 |
-
|
| 43 |
-
基于[Randeng-T5-784M](https://huggingface.co/IDEA-CCNL/Randeng-T5-784M),我们在收集的100+个中文领域的多任务数据集(从中采样了30w+个样本)上微调了它,得到了此多任务版本。这些多任务包括:情感分析,新闻分类,文本分类,意图识别,自然语言推理,多项选择,指代消解,抽取式阅读理解,实体识别,关键词抽取,生成式摘要。
|
| 44 |
-
|
| 45 |
-
Based on [Randeng-T5-784M](https://huggingface.co/IDEA-CCNL/Randeng-T5-784M), we fine-tuned it on a collection of 100+ multitasking datasets in Chinese domains (from which 30w+ samples were sampled) to obtain this multitasking version. These multitasks include: sentiment analysis, news classification, text classification, intention recognition, natural language inference, multiple choice, denotational disambiguation, extractive reading comprehension, entity recognition, keyword extraction, and generative summarization.
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
## 使用 Usage
|
| 49 |
|
| 50 |
```python
|
| 51 |
-
import torch
|
| 52 |
-
from transformers import T5Tokenizer, T5Config, T5ForConditionalGeneration
|
| 53 |
-
|
| 54 |
-
# load tokenizer and model
|
| 55 |
-
pretrained_model = "IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese"
|
| 56 |
-
|
| 57 |
-
special_tokens = ["<extra_id_{}>".format(i) for i in range(100)]
|
| 58 |
-
tokenizer = T5Tokenizer.from_pretrained(
|
| 59 |
-
pretrained_model,
|
| 60 |
-
do_lower_case=True,
|
| 61 |
-
max_length=512,
|
| 62 |
-
truncation=True,
|
| 63 |
-
additional_special_tokens=special_tokens,
|
| 64 |
-
)
|
| 65 |
-
config = T5Config.from_pretrained(pretrained_model)
|
| 66 |
-
model = T5ForConditionalGeneration.from_pretrained(pretrained_model, config=config)
|
| 67 |
-
model.resize_token_embeddings(len(tokenizer))
|
| 68 |
-
model.eval()
|
| 69 |
-
|
| 70 |
-
# tokenize
|
| 71 |
-
text = "新闻分类任务:【微软披露拓扑量子计算机计划!】这篇文章的类别是什么?故事/文化/娱乐/体育/财经/房产/汽车/教育/科技"
|
| 72 |
-
encode_dict = tokenizer(text, max_length=512, padding='max_length',truncation=True)
|
| 73 |
-
|
| 74 |
-
inputs = {
|
| 75 |
-
"input_ids": torch.tensor([encode_dict['input_ids']]).long(),
|
| 76 |
-
"attention_mask": torch.tensor([encode_dict['attention_mask']]).long(),
|
| 77 |
-
}
|
| 78 |
-
|
| 79 |
-
# generate answer
|
| 80 |
-
logits = model.generate(
|
| 81 |
-
input_ids = inputs['input_ids'],
|
| 82 |
-
max_length=100,
|
| 83 |
-
do_sample= True
|
| 84 |
-
# early_stopping=True,
|
| 85 |
-
)
|
| 86 |
-
|
| 87 |
-
logits=logits[:,1:]
|
| 88 |
-
predict_label = [tokenizer.decode(i,skip_special_tokens=True) for i in logits]
|
| 89 |
-
print(predict_label)
|
| 90 |
-
|
| 91 |
-
# model output: 科技
|
| 92 |
-
```
|
| 93 |
-
|
| 94 |
-
除了分类任务,其他任务的数据构造例子如下:
|
| 95 |
|
| 96 |
-
|
| 97 |
|
| 98 |
-
|
| 99 |
-
|
| 100 |
-
"文本分类":{"text_a":"钢琴块3别踩白块儿3钢琴块3是一款简洁的钢琴模拟软件,在Android平台上,类似的软件还是比较多的。","choices":["相机","影视娱乐","棋牌中心","新闻","财经","策略","休闲益智","教育"]},
|
| 101 |
-
'新闻分类':{"text_a":"微软披露拓扑量子计算机计划!","choices":["故事","文化","娱乐","体育","财经","房产","汽车","教育","科技"]},
|
| 102 |
-
'情感分析':{"text_a":"刚买iphone13 pro 还不到一个月,天天死机最差的一次购物体验","choices":["好评","差评"]},
|
| 103 |
-
'意图识别':{"text_a":"打电话给吴小军。","choices":["放音乐","播放下一首","打电话","退出导航","开始导航","其他","暂停音乐","导航","开导航"]},
|
| 104 |
|
| 105 |
-
|
| 106 |
-
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
|
| 109 |
-
'指代消解':{"text_a":"李鸣觉得董客这人,踏实得叫人难受。可因为孟野和森森太疯,他只好去找董客聊天,但在董客眼里,李鸣也是不正常,他竟然放着现成的大学不愿上。","question":"【他】指的是【李鸣】吗?","choices":["是","不是"]},
|
| 110 |
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
'关键词识别':{"text_a":"今儿在大众点评,找到了口碑不错的老茶故事私房菜。","question":"请问这篇文章的关键词是大众点评、老茶私房菜吗?","choices":["是","不是"]}
|
| 115 |
-
|
| 116 |
-
"生成式摘要":{"text_a":"针对传统的流量分类管理系统存在不稳定、结果反馈不及时、分类结果显示不直观等问题,设计一个基于web的在线的流量分类管理系统.该系统采用流中前5个包(排除3次握手包)所含信息作为特征值计算资源,集成一种或多种分类算法用于在线网络流量分类,应用数据可视化技术处理分类结果.实验表明:在采用适应在线分类的特征集和c4.5决策树算法做分类时,系统能快速做出分类,且精度达到94%以上;数据可视化有助于人机交互,改善分类指导."}
|
| 117 |
-
}
|
| 118 |
|
| 119 |
-
#
|
| 120 |
-
|
| 121 |
-
"
|
| 122 |
-
"prompt": "{}任务:【{}】这篇文章的情感态度是什么?{}",
|
| 123 |
-
"keys_order": ["subtask_type","text_a", "verbalizer"],
|
| 124 |
-
"data_type": "classification",
|
| 125 |
-
},
|
| 126 |
-
"文本分类": {
|
| 127 |
-
"prompt": "{}任务:【{}】这篇文章的类别是什么?{}",
|
| 128 |
-
"keys_order": ["subtask_type","text_a", "verbalizer"],
|
| 129 |
-
"data_type": "classification",
|
| 130 |
-
},
|
| 131 |
-
"新闻分类": {
|
| 132 |
-
"prompt": "{}任务:【{}】这篇文章的类别是什么?{}",
|
| 133 |
-
"keys_order": ["subtask_type","text_a", "verbalizer"],
|
| 134 |
-
"data_type": "classification",
|
| 135 |
-
},
|
| 136 |
-
"意图识别": {
|
| 137 |
-
"prompt": "{}任务:【{}】这句话的意图是什么?{}",
|
| 138 |
-
"keys_order": ["subtask_type","text_a", "verbalizer"],
|
| 139 |
-
"data_type": "classification",
|
| 140 |
-
},
|
| 141 |
-
# --------------------
|
| 142 |
-
"自然语言推理": {
|
| 143 |
-
"prompt": "{}任务:【{}】和【{}】,以上两句话的逻辑关系是什么?{}",
|
| 144 |
-
"keys_order": ["subtask_type","text_a", "text_b", "verbalizer"],
|
| 145 |
-
"data_type": "classification",
|
| 146 |
-
},
|
| 147 |
-
"语义匹配": {
|
| 148 |
-
"prompt": "{}任务:【{}】和【{}】,以上两句话的内容是否相似?{}",
|
| 149 |
-
"keys_order": ["subtask_type","text_a", "text_b", "verbalizer"],
|
| 150 |
-
"data_type": "classification",
|
| 151 |
-
},
|
| 152 |
-
# -----------------------
|
| 153 |
-
"指代消解": {
|
| 154 |
-
"prompt": "{}任务:文章【{}】中{}{}",
|
| 155 |
-
"keys_order": ["subtask_type","text_a", "question", "verbalizer"],
|
| 156 |
-
"data_type": "classification",
|
| 157 |
-
},
|
| 158 |
-
"多项选择": {
|
| 159 |
-
"prompt": "{}任务:阅读文章【{}】问题【{}】?{}",
|
| 160 |
-
"keys_order": ["subtask_type","text_a", "question", "verbalizer"],
|
| 161 |
-
"data_type": "classification",
|
| 162 |
-
},
|
| 163 |
-
# ------------------------
|
| 164 |
-
"抽取式阅读理解": {
|
| 165 |
-
"prompt": "{}任务:阅读文章【{}】问题【{}】的答案是什么?",
|
| 166 |
-
"keys_order": ["subtask_type","text_a", "question"],
|
| 167 |
-
"data_type": "mrc",
|
| 168 |
-
},
|
| 169 |
-
"实体识别": {
|
| 170 |
-
"prompt": "{}任务:找出【{}】这篇文章中所有【{}】类型的实体?",
|
| 171 |
-
"keys_order": ["subtask_type","text_a", "question"],
|
| 172 |
-
"data_type": "ner",
|
| 173 |
-
},
|
| 174 |
-
# ------------------------
|
| 175 |
-
"关键词抽取": {
|
| 176 |
-
"prompt": "{}任务:【{}】这篇文章的关键词是什么?",
|
| 177 |
-
"keys_order": ["subtask_type","text_a"],
|
| 178 |
-
"data_type": "keys",
|
| 179 |
-
},
|
| 180 |
-
"关键词识别":{
|
| 181 |
-
"prompt": "{}任务:阅读文章【{}】问题【{}】{}",
|
| 182 |
-
"keys_order": ["subtask_type","text_a","question","verbalizer"],
|
| 183 |
-
"data_type": "classification",
|
| 184 |
-
},
|
| 185 |
-
"生成式摘要": {
|
| 186 |
-
"prompt": "{}任务:【{}】这篇文章的摘要是什么?",
|
| 187 |
-
"keys_order": ["subtask_type","text_a"],
|
| 188 |
-
"data_type": "summ",
|
| 189 |
-
},
|
| 190 |
-
}
|
| 191 |
-
|
| 192 |
-
def get_instruction(sample):
|
| 193 |
-
|
| 194 |
-
template = dataset2instruction[sample["subtask_type"]]
|
| 195 |
-
# print(template)
|
| 196 |
-
# print(sample)
|
| 197 |
-
sample["instruction"] = template["prompt"].format(*[
|
| 198 |
-
sample[k] for k in template["keys_order"]
|
| 199 |
-
])
|
| 200 |
-
|
| 201 |
-
print(sample["instruction"])
|
| 202 |
|
| 203 |
-
|
| 204 |
-
|
| 205 |
|
| 206 |
-
|
| 207 |
-
|
| 208 |
-
- [预训练](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen/examples/pretrain_t5)
|
| 209 |
-
- [微调](https://github.com/IDEA-CCNL/Fengshenbang-LM/tree/main/fengshen/examples/mt5_summary)
|
| 210 |
|
| 211 |
-
|
|
|
|
|
|
|
| 212 |
|
| 213 |
-
|
| 214 |
-
|
| 215 |
|
| 216 |
-
|
| 217 |
-
|
| 218 |
-
如果您在您的工作中使用了我们的模型,可以引用我们的[论文](https://arxiv.org/abs/2209.02970):
|
| 219 |
-
|
| 220 |
-
If you are using the resource for your work, please cite our [paper](https://arxiv.org/abs/2209.02970):
|
| 221 |
-
|
| 222 |
-
```text
|
| 223 |
-
@article{fengshenbang,
|
| 224 |
-
author = {Jiaxing Zhang and Ruyi Gan and Junjie Wang and Yuxiang Zhang and Lin Zhang and Ping Yang and Xinyu Gao and Ziwei Wu and Xiaoqun Dong and Junqing He and Jianheng Zhuo and Qi Yang and Yongfeng Huang and Xiayu Li and Yanghan Wu and Junyu Lu and Xinyu Zhu and Weifeng Chen and Ting Han and Kunhao Pan and Rui Wang and Hao Wang and Xiaojun Wu and Zhongshen Zeng and Chongpei Chen},
|
| 225 |
-
title = {Fengshenbang 1.0: Being the Foundation of Chinese Cognitive Intelligence},
|
| 226 |
-
journal = {CoRR},
|
| 227 |
-
volume = {abs/2209.02970},
|
| 228 |
-
year = {2022}
|
| 229 |
}
|
| 230 |
```
|
| 231 |
|
| 232 |
-
|
| 233 |
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
```text
|
| 237 |
-
@misc{Fengshenbang-LM,
|
| 238 |
-
title={Fengshenbang-LM},
|
| 239 |
-
author={IDEA-CCNL},
|
| 240 |
-
year={2021},
|
| 241 |
-
howpublished={\url{https://github.com/IDEA-CCNL/Fengshenbang-LM}},
|
| 242 |
-
}
|
| 243 |
-
```
|
|
|
|
| 1 |
---
|
|
|
|
|
|
|
| 2 |
tags:
|
| 3 |
+
- transformer
|
| 4 |
+
- t5
|
| 5 |
+
- text2text-generation
|
| 6 |
- chinese
|
| 7 |
+
- multitask
|
| 8 |
+
- tokenizer
|
|
|
|
|
|
|
|
|
|
| 9 |
---
|
| 10 |
|
| 11 |
+
# Randeng-T5-784M-MultiTask-Chinese-with-Tokenizer-JSON
|
| 12 |
|
| 13 |
+
This repository hosts a modified version of the [IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese](https://huggingface.co/IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese) model. The primary purpose of this repository is to **include the `tokenizer.json` file**, which was missing in the original release.
|
|
|
|
| 14 |
|
| 15 |
+
## Motivation for this Repository
|
| 16 |
|
| 17 |
+
The original `IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese` model is an excellent T5-based model for various Chinese NLP tasks. However, it was released with only a `spiece.model` file for its tokenizer, lacking the `tokenizer.json` file.
|
| 18 |
|
| 19 |
+
While the Python `transformers` library can generally load the tokenizer from `spiece.model`, this absence caused issues for environments that strictly prefer or require `tokenizer.json` (e.g., certain versions or implementations of the Rust `tokenizers` library, or other frameworks that rely on this standardized format).
|
| 20 |
|
| 21 |
+
To enhance usability and compatibility across different platforms and libraries, this repository was created to provide the model with the commonly expected `tokenizer.json` file.
|
| 22 |
|
| 23 |
+
## Changes Made
|
| 24 |
|
| 25 |
+
The following modifications have been made to the original `IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese` model files:
|
| 26 |
|
| 27 |
+
* **Added `tokenizer.json`:** The primary change is the inclusion of the `tokenizer.json` file, generated from the original `spiece.model` using the Python `transformers` library's `save_pretrained()` method. This ensures broader compatibility and easier loading for various applications.
|
| 28 |
+
* **No Model Weights Changes:** **Crucially, the model weights (`pytorch_model.bin` or `model.safetensors`) themselves have not been altered in any way.** This repository provides the exact same powerful pre-trained model, just with an updated tokenizer serialization format.
|
| 29 |
|
| 30 |
+
## How to Use
|
|
|
|
|
|
|
| 31 |
|
| 32 |
+
You can load this model and its tokenizer using the Hugging Face `transformers` library:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
|
| 34 |
```python
|
| 35 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
|
| 37 |
+
model_name = "your-username/Randeng-T5-784M-MultiTask-Chinese-with-Tokenizer-JSON" # Replace with your actual repository name
|
| 38 |
|
| 39 |
+
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
| 40 |
+
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
|
|
|
|
|
|
|
|
|
|
|
|
|
| 41 |
|
| 42 |
+
text = "你好,这是一个测试。"
|
| 43 |
+
inputs = tokenizer(text, return_tensors="pt")
|
| 44 |
+
outputs = model.generate(**inputs)
|
| 45 |
+
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
|
| 46 |
+
```
|
| 47 |
|
| 48 |
+
For Rust users (and others requiring `tokenizer.json`):
|
|
|
|
| 49 |
|
| 50 |
+
```rust
|
| 51 |
+
use tokenizers::Tokenizer;
|
| 52 |
+
use std::error::Error;
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
+
#[tokio::main]
|
| 55 |
+
async fn main() -> Result<(), Box<dyn Error>> {
|
| 56 |
+
let model_id = "your-username/Randeng-T5-784M-MultiTask-Chinese-with-Tokenizer-JSON"; // Replace with your actual repository name
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 57 |
|
| 58 |
+
// The Tokenizer::from_pretrained will now find and use tokenizer.json
|
| 59 |
+
let tokenizer = Tokenizer::from_pretrained(model_id, None).await?;
|
| 60 |
|
| 61 |
+
let text = "你好,这是一个中文文本。";
|
| 62 |
+
let encoding = tokenizer.encode(text, true).unwrap();
|
|
|
|
|
|
|
| 63 |
|
| 64 |
+
println!("Original text: {}", text);
|
| 65 |
+
println!("Tokens: {:?}", encoding.get_tokens());
|
| 66 |
+
println!("IDs: {:?}", encoding.get_ids());
|
| 67 |
|
| 68 |
+
let decoded_text = tokenizer.decode(encoding.get_ids(), true).unwrap();
|
| 69 |
+
println!("Decoded text: {}", decoded_text);
|
| 70 |
|
| 71 |
+
Ok(())
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
}
|
| 73 |
```
|
| 74 |
|
| 75 |
+
## Original Model Information
|
| 76 |
|
| 77 |
+
For more details about the original `IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese` model, its training, capabilities, and benchmarks, please refer to its official repository: [IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese](https://huggingface.co/IDEA-CCNL/Randeng-T5-784M-MultiTask-Chinese).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|