Create README.md
Browse files
README.md
ADDED
|
@@ -0,0 +1,234 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language: zh
|
| 3 |
+
datasets: c2m
|
| 4 |
+
inference:
|
| 5 |
+
parameters:
|
| 6 |
+
max_length: 108
|
| 7 |
+
num_return_sequences: 1
|
| 8 |
+
do_sample: True
|
| 9 |
+
widget:
|
| 10 |
+
- text: "晋太元中,武陵人捕鱼为业。缘溪行,忘路之远近。忽逢桃花林,夹岸数百步,中无杂树,芳草鲜美,落英缤纷。渔人甚异之,复前行,欲穷其林。林尽水源,便得一山,山有小口,仿佛若有光。便舍船,从口入。初极狭,才通人。复行数十步,豁然开朗。土地平旷,屋舍俨然,有良田、美池、桑竹之属。阡陌交通,鸡犬相闻。其中往来种作,男女衣着,悉如外人。黄发垂髫,并怡然自乐。"
|
| 11 |
+
example_title: "桃花源记"
|
| 12 |
+
- text: "往者不可谏,来者犹可追。"
|
| 13 |
+
example_title: "来者犹可追"
|
| 14 |
+
- text: "逝者如斯夫!不舍昼夜。"
|
| 15 |
+
example_title: "逝者如斯夫"
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
---
|
| 19 |
+
|
| 20 |
+
|
| 21 |
+
# 文言文 to 现代文
|
| 22 |
+
|
| 23 |
+
## Model description
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
## How to use
|
| 27 |
+
使用 pipeline 调用模型:
|
| 28 |
+
|
| 29 |
+
```python
|
| 30 |
+
>>> from transformers import pipeline
|
| 31 |
+
>>> model_checkpoint = "supermy/c2m-mt5"
|
| 32 |
+
>>> translator = pipeline("translation",
|
| 33 |
+
model=model_checkpoint,
|
| 34 |
+
num_return_sequences=1,
|
| 35 |
+
max_length=52,
|
| 36 |
+
truncation=True,)
|
| 37 |
+
|
| 38 |
+
>>> translator("往者不可谏,来者犹可追。")
|
| 39 |
+
[{'translation_text': '过 去 的 事 情 不能 劝 谏 , 未来 的 事 情 还 可以 追 回 来 。 如 果 过 去 的 事 情 不能 劝 谏 , 那 么 , 未来 的 事 情 还 可以 追 回 来 。 如 果 过 去 的 事 情'}]
|
| 40 |
+
|
| 41 |
+
>>> translator("福兮祸所伏,祸兮福所倚。",do_sample=True)
|
| 42 |
+
[{'translation_text': '幸 福 是 祸 患 所 隐 藏 的 , 灾 祸 是 福 祸 所 依 托 的 。 这 些 都 是 幸 福 所 依 托 的 。 这 些 都 是 幸 福 所 带 来 的 。 幸 福 啊 , 也 是 幸 福'}]
|
| 43 |
+
|
| 44 |
+
>>> translator("成事不说,遂事不谏,既往不咎。", num_return_sequences=1,do_sample=True)
|
| 45 |
+
[{'translation_text': '事 情 不 高 兴 , 事 情 不 劝 谏 , 过 去 的 事 就 不 会 责 怪 。 事 情 没 有 多 久 了 , 事 情 没 有 多 久 , 事 情 没 有 多 久 了 , 事 情 没 有 多'}]
|
| 46 |
+
|
| 47 |
+
>>> translator("逝者如斯夫!不舍昼夜。",num_return_sequences=1,max_length=30)
|
| 48 |
+
[{'translation_text': '逝 去 的 人 就 像 这 样 啊 , 不分 昼夜 地 去 追 赶 它 们 。 这 样 的 人 就 不 会 忘 记'}]
|
| 49 |
+
|
| 50 |
+
```
|
| 51 |
+
Here is how to use this model to get the features of a given text in PyTorch:
|
| 52 |
+
|
| 53 |
+
```python
|
| 54 |
+
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
|
| 55 |
+
tokenizer = AutoTokenizer.from_pretrained("supermy/c2m-mt5")
|
| 56 |
+
model = AutoModelForSeq2SeqLM.from_pretrained("supermy/c2m-mt5")
|
| 57 |
+
text = "用你喜欢的任何文本替换我。"
|
| 58 |
+
encoded_input = tokenizer(text, return_tensors='pt')
|
| 59 |
+
output = model(**encoded_input)
|
| 60 |
+
```
|
| 61 |
+
|
| 62 |
+
|
| 63 |
+
|
| 64 |
+
## Training data
|
| 65 |
+
|
| 66 |
+
非常全的文言文(古文)-现代文平行语料,基本涵盖了大部分经典古籍著作。
|
| 67 |
+
|
| 68 |
+
原始爬取的数据是篇章级对齐,经过脚本分句(按照句号分号感叹号问号划分)以及人工校对,形成共计约96万句对。目录bitext下是文言文-现代文对齐的平行数据。此外,目录source下是文言文单语数据,target下是现代文单语数据,这两个目录下的文件内容按行对齐。
|
| 69 |
+
|
| 70 |
+
以下为数据统计信息。其中,短篇章中包括了《论语》、《孟子》、《左传》等篇幅较短的古籍,已和《资治通鉴》合并。
|
| 71 |
+
|
| 72 |
+
|书名|句数|
|
| 73 |
+
|:--|:--|
|
| 74 |
+
短篇章和资治通鉴|348727
|
| 75 |
+
元史|21182
|
| 76 |
+
北史|25823
|
| 77 |
+
北书|10947
|
| 78 |
+
南史|13838
|
| 79 |
+
南齐书|13137
|
| 80 |
+
史记|17701
|
| 81 |
+
后汉书|17753
|
| 82 |
+
周书|14930
|
| 83 |
+
太平广记|59358
|
| 84 |
+
宋书|23794
|
| 85 |
+
宋史|77853
|
| 86 |
+
徐霞客游记|22750
|
| 87 |
+
新五代史|10147
|
| 88 |
+
新唐书|12359
|
| 89 |
+
旧五代史|11377
|
| 90 |
+
旧唐书|29185
|
| 91 |
+
明史|85179
|
| 92 |
+
晋书|21133
|
| 93 |
+
梁书|14318
|
| 94 |
+
水经注全|11630
|
| 95 |
+
汉书|37622
|
| 96 |
+
辽史|9278
|
| 97 |
+
金史|13758
|
| 98 |
+
陈书|7096
|
| 99 |
+
隋书|8204
|
| 100 |
+
魏书|28178
|
| 101 |
+
**总计**|**967257**
|
| 102 |
+
|
| 103 |
+
《短篇章和资治通鉴》中各书籍统计如下(此部分数据量不完全准确):
|
| 104 |
+
|
| 105 |
+
|书名|句数|
|
| 106 |
+
|:--|:--|
|
| 107 |
+
资治通鉴|7.95w
|
| 108 |
+
左传|1.09w
|
| 109 |
+
大学章句集注| 86
|
| 110 |
+
反经| 4211
|
| 111 |
+
公孙龙子| 73
|
| 112 |
+
管子| 6266
|
| 113 |
+
鬼谷子| 385
|
| 114 |
+
韩非子| 4325
|
| 115 |
+
淮南子| 2669
|
| 116 |
+
黄帝内经| 6162
|
| 117 |
+
皇帝四经| 243
|
| 118 |
+
将苑| 100
|
| 119 |
+
金刚经| 193
|
| 120 |
+
孔子家语| 138
|
| 121 |
+
老子| 398
|
| 122 |
+
了凡四训| 31
|
| 123 |
+
礼记| 4917
|
| 124 |
+
列子| 1735
|
| 125 |
+
六韬| 693
|
| 126 |
+
六祖坛经| 949
|
| 127 |
+
论语| 988
|
| 128 |
+
吕氏春秋| 2473
|
| 129 |
+
孟子| 1654
|
| 130 |
+
梦溪笔谈| 1280
|
| 131 |
+
墨子| 2921
|
| 132 |
+
千字文| 82
|
| 133 |
+
清史稿| 1604
|
| 134 |
+
三字经| 234
|
| 135 |
+
山海经| 919
|
| 136 |
+
伤寒论| 712
|
| 137 |
+
商君书| 916
|
| 138 |
+
尚书| 1048
|
| 139 |
+
世说新语| 3044
|
| 140 |
+
司马法| 132
|
| 141 |
+
搜神记| 1963
|
| 142 |
+
搜神后记| 540
|
| 143 |
+
素书| 61
|
| 144 |
+
孙膑兵法| 230
|
| 145 |
+
孙子兵法| 338
|
| 146 |
+
天工开物| 807
|
| 147 |
+
尉缭子| 226
|
| 148 |
+
文昌孝经| 194
|
| 149 |
+
文心雕龙| 1388
|
| 150 |
+
吴子| 136
|
| 151 |
+
孝经| 102
|
| 152 |
+
笑林广记| 1496
|
| 153 |
+
荀子| 3131
|
| 154 |
+
颜氏家训| 510
|
| 155 |
+
仪礼| 2495
|
| 156 |
+
易传| 711
|
| 157 |
+
逸周书| 1505
|
| 158 |
+
战国策| 3318
|
| 159 |
+
贞观政要| 1291
|
| 160 |
+
中庸| 206
|
| 161 |
+
周礼| 2026
|
| 162 |
+
周易| 460
|
| 163 |
+
庄子| 1698
|
| 164 |
+
百战奇略| 800
|
| 165 |
+
论衡| 1.19w
|
| 166 |
+
智囊|2165
|
| 167 |
+
罗织经|188
|
| 168 |
+
朱子家训|31
|
| 169 |
+
抱朴子|217
|
| 170 |
+
地藏经|547
|
| 171 |
+
国语|3841
|
| 172 |
+
容斋随笔|2921
|
| 173 |
+
幼学琼林|1372
|
| 174 |
+
三略|268
|
| 175 |
+
围炉夜话|387
|
| 176 |
+
冰鉴|120
|
| 177 |
+
|
| 178 |
+
|
| 179 |
+
如果您使用该语料库,请注明出处:https://github.com/NiuTrans/Classical-Modern
|
| 180 |
+
|
| 181 |
+
感谢为该语料库做出贡献的成员:丁佳鹏、杨文权、刘晓晴、曹润柘、罗应峰。
|
| 182 |
+
```
|
| 183 |
+
```
|
| 184 |
+
|
| 185 |
+
## Training procedure
|
| 186 |
+
|
| 187 |
+
在英伟达 16G 显卡上共计训练了整整 4 天,累计训练 68 次。
|
| 188 |
+
|
| 189 |
+
[文言文数据集](https://huggingface.co/datasets/supermy/Classical-Modern) 训练数据。模型 [MT5](https://huggingface.co/google/mt5-small)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
```
|
| 193 |
+
|
| 194 |
+
[INFO|trainer.py:1628] 2022-12-15 16:08:36,696 >> ***** Running training *****
|
| 195 |
+
[INFO|trainer.py:1629] 2022-12-15 16:08:36,696 >> Num examples = 967255
|
| 196 |
+
[INFO|trainer.py:1630] 2022-12-15 16:08:36,697 >> Num Epochs = 6
|
| 197 |
+
[INFO|trainer.py:1631] 2022-12-15 16:08:36,697 >> Instantaneous batch size per device = 12
|
| 198 |
+
[INFO|trainer.py:1632] 2022-12-15 16:08:36,697 >> Total train batch size (w. parallel, distributed & accumulation) = 12
|
| 199 |
+
[INFO|trainer.py:1633] 2022-12-15 16:08:36,697 >> Gradient Accumulation steps = 1
|
| 200 |
+
[INFO|trainer.py:1634] 2022-12-15 16:08:36,697 >> Total optimization steps = 483630
|
| 201 |
+
[INFO|trainer.py:1654] 2022-12-15 16:08:36,698 >> Continuing training from checkpoint, will skip to saved global_step
|
| 202 |
+
[INFO|trainer.py:1655] 2022-12-15 16:08:36,698 >> Continuing training from epoch 5
|
| 203 |
+
[INFO|trainer.py:1656] 2022-12-15 16:08:36,698 >> Continuing training from global step 465000
|
| 204 |
+
|
| 205 |
+
{'loss': 5.2906, 'learning_rate': 1.8743667679837894e-06, 'epoch': 5.78}
|
| 206 |
+
{'loss': 5.3196, 'learning_rate': 1.8226743584971985e-06, 'epoch': 5.78}
|
| 207 |
+
|
| 208 |
+
{'loss': 5.3467, 'learning_rate': 6.513243595310464e-08, 'epoch': 5.99}
|
| 209 |
+
{'loss': 5.3363, 'learning_rate': 1.344002646651366e-08, 'epoch': 6.0}
|
| 210 |
+
{'train_runtime': 6277.5234, 'train_samples_per_second': 924.494, 'train_steps_per_second': 77.042, 'train_loss': 0.2044413571775476, 'epoch': 6.0}
|
| 211 |
+
***** train metrics *****
|
| 212 |
+
epoch = 6.0
|
| 213 |
+
train_loss = 0.2044
|
| 214 |
+
train_runtime = 1:44:37.52
|
| 215 |
+
train_samples = 967255
|
| 216 |
+
train_samples_per_second = 924.494
|
| 217 |
+
train_steps_per_second = 77.042
|
| 218 |
+
12/15/2022 17:53:23 - INFO - __main__ - *** Evaluate ***
|
| 219 |
+
[INFO|trainer.py:2920] 2022-12-15 17:53:23,729 >> ***** Running Evaluation *****
|
| 220 |
+
[INFO|trainer.py:2922] 2022-12-15 17:53:23,729 >> Num examples = 200
|
| 221 |
+
[INFO|trainer.py:2925] 2022-12-15 17:53:23,729 >> Batch size = 12
|
| 222 |
+
100%|██████████| 17/17 [00:07<00:00, 2.29it/s]
|
| 223 |
+
[INFO|modelcard.py:443] 2022-12-15 17:53:32,737 >> Dropping the following result as it does not have all the necessary fields:
|
| 224 |
+
{'task': {'name': 'Translation', 'type': 'translation'}, 'metrics': [{'name': 'Bleu', 'type': 'bleu', 'value': 0.7225}]}
|
| 225 |
+
***** eval metrics *****
|
| 226 |
+
epoch = 6.0
|
| 227 |
+
eval_bleu = 0.7225
|
| 228 |
+
eval_gen_len = 12.285
|
| 229 |
+
eval_loss = 6.6782
|
| 230 |
+
eval_runtime = 0:00:07.77
|
| 231 |
+
eval_samples = 200
|
| 232 |
+
eval_samples_per_second = 25.721
|
| 233 |
+
eval_steps_per_second = 2.186
|
| 234 |
+
```
|