Alic-Li commited on
Commit
13fe3ba
·
verified ·
1 Parent(s): 8294eaf

Upload 12 files

Browse files
.gitattributes CHANGED
@@ -33,3 +33,7 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ img/img_1.png filter=lfs diff=lfs merge=lfs -text
37
+ img/img_2.png filter=lfs diff=lfs merge=lfs -text
38
+ img/img_3.png filter=lfs diff=lfs merge=lfs -text
39
+ miniGoose.png filter=lfs diff=lfs merge=lfs -text
API_DEMO_CHAT.py ADDED
@@ -0,0 +1,140 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ########################################################################################################
2
+ # The RWKV Language Model - https://github.com/BlinkDL/RWKV-LM
3
+ ########################################################################################################
4
+
5
+ print("RWKV Chat Simple Demo")
6
+
7
+ import os, copy, types, gc, sys, re
8
+ import numpy as np
9
+ from prompt_toolkit import prompt
10
+ import torch
11
+ from transformers import AutoTokenizer
12
+
13
+ torch.backends.cudnn.benchmark = True
14
+ torch.backends.cudnn.allow_tf32 = True
15
+ torch.backends.cuda.matmul.allow_tf32 = True
16
+ os.environ["RWKV_V7_ON"] = "1" # enable this for rwkv-7 models
17
+ os.environ["RWKV_JIT_ON"] = "1"
18
+ os.environ["RWKV_CUDA_ON"] = "0" # !!! '1' to compile CUDA kernel (10x faster), requires c++ compiler & cuda libraries !!!
19
+
20
+ from rwkv.model import RWKV
21
+ from rwkv.utils import PIPELINE
22
+
23
+ ########################################################################################################
24
+
25
# Runtime configuration for the chat demo.
# `args` carries the model location and the RWKV execution strategy string.
args = types.SimpleNamespace()
args.strategy = "cuda fp16"  # use CUDA, fp16
args.MODEL_NAME = "./rwkv-final-sft-2048.pth"

########################################################################################################
# Optional pre-trained initial state.
# None -> start from the vanilla zero initial state.
# A custom state gives much better chat results (download from
# https://huggingface.co/BlinkDL/temp-latest-training-models/tree/main).
# Note: the example below is an English single-round QA state (it will forget what you previously said).
# Example: STATE_NAME = "E://RWKV-Runner//models//rwkv-x060-eng_single_round_qa-1B6-20240516-ctx2048"
STATE_NAME = None
########################################################################################################

# Sampling parameters used by the generation loop.
GEN_TEMP = 1.0
GEN_TOP_P = 0.3
GEN_alpha_presence = 0.5    # flat penalty applied to every token already generated
GEN_alpha_frequency = 0.5   # extra penalty scaled by how often the token was generated
GEN_penalty_decay = 0.996   # per-step decay applied to the occurrence counters

# A custom state gets tighter sampling and softer repetition penalties.
if STATE_NAME is not None:
    GEN_TOP_P = 0.2
    GEN_alpha_presence = 0.3
    GEN_alpha_frequency = 0.3

CHUNK_LEN = 16  # split input into chunks to save VRAM (shorter -> slower, but saves VRAM)
52
+
53
+ ########################################################################################################
54
+
55
# Load the RWKV weights, the logits sampler and the text tokenizer.
print(f"Loading model - {args.MODEL_NAME}")
model = RWKV(model=args.MODEL_NAME, strategy=args.strategy)
# In this script the PIPELINE object is only used for `sample_logits`;
# text <-> token conversion goes through the MiniMind tokenizer below.
pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
tokenizer = AutoTokenizer.from_pretrained("./MiniMind2_tokenizer")

model_tokens = []   # every token id fed to / produced by the model so far
model_state = None  # recurrent state; None lets the model start from its default state

if STATE_NAME is not None:  # load custom state
    # Use a dedicated name so the config namespace `args` is not overwritten.
    model_args = model.args
    # NOTE(review): torch.load may unpickle arbitrary objects depending on the
    # installed torch version -- only load state files from a trusted source.
    state_raw = torch.load(STATE_NAME + '.pth')
    # Three state slots per layer; slots 0 and 2 start as zeros, slot 1 comes from the file.
    state_init = [None] * (model_args.n_layer * 3)
    for i in range(model_args.n_layer):
        dd = model.strategy[i]
        dev = dd.device
        atype = dd.atype
        state_init[i*3+0] = torch.zeros(model_args.n_embd, dtype=atype, requires_grad=False, device=dev).contiguous()
        state_init[i*3+1] = state_raw[f'blocks.{i}.att.time_state'].transpose(1,2).to(dtype=torch.float, device=dev).requires_grad_(False).contiguous()
        state_init[i*3+2] = torch.zeros(model_args.n_embd, dtype=atype, requires_grad=False, device=dev).contiguous()
    # Keep `state_init` pristine; the running state is a separate copy.
    model_state = copy.deepcopy(state_init)
75
+
76
def run_rnn(ctx):
    """Feed ``ctx`` to the model and return the output of the final forward call.

    The text is normalised to ``\\n`` line endings, encoded with the module-level
    ``tokenizer`` and pushed through ``model.forward`` in slices of at most
    ``CHUNK_LEN`` tokens. The module-level ``model_tokens`` and ``model_state``
    are updated as a side effect.

    Args:
        ctx: Text to append to the conversation.

    Returns:
        The first value returned by the last ``model.forward`` call (the caller
        treats it as next-token logits).

    Raises:
        ValueError: If ``ctx`` encodes to zero tokens. Previously this case
            surfaced as an ``UnboundLocalError`` on the ``return`` statement.
    """
    global model_tokens, model_state

    ctx = ctx.replace("\r\n", "\n")

    tokens = [int(tok) for tok in tokenizer.encode(ctx)]
    if not tokens:
        # Fail before touching global state: there is no output to return.
        raise ValueError("run_rnn() requires text that encodes to at least one token")
    model_tokens += tokens

    # print(f"### model ###\n{model_tokens}\n[{tokenizer.decode(model_tokens)}]")  # debug

    out = None
    # Walk the prompt in CHUNK_LEN-sized windows instead of re-slicing the list each step.
    for start in range(0, len(tokens), CHUNK_LEN):
        out, model_state = model.forward(tokens[start:start + CHUNK_LEN], model_state)

    return out
92
+
93
if STATE_NAME is None:  # use initial prompt if we are not loading a state
    # The priming text is built but currently NOT fed to the model:
    # the two calls below are intentionally left disabled.
    init_ctx = "User: hi" + "\n\n"
    init_ctx += "Assistant: Hi. I am your assistant and I will provide expert full response in full details. Please feel free to ask any question and I will always answer it." + "\n\n"
    # run_rnn(init_ctx)
    # print(init_ctx, end="")
98
+
99
# Interactive chat loop: read a user message, wrap it in the ChatML-style
# template, then sample tokens one at a time until the tokenizer's EOS token.
eos_token_id = tokenizer.eos_token_id  # loop-invariant, looked up once

while True:
    msg = prompt("<|im_start|>user:")
    msg = re.sub(r"\n+", "\n", msg.strip())  # collapse runs of blank lines
    if not msg:
        print("!!! Error: please say something !!!")
        continue

    occurrence = {}   # token id -> decayed count, drives the repetition penalty
    out_tokens = []   # tokens generated for this reply (EOS excluded)
    out_last = 0      # index of the first token in out_tokens not yet printed

    out = run_rnn("<|im_start|>user\n" + msg + "<|im_end|>\n" + "<|im_start|>assistant\n")
    print("\nAssistant:", end="")

    for i in range(99999):
        for n in occurrence:
            out[n] -= GEN_alpha_presence + occurrence[n] * GEN_alpha_frequency  # repetition penalty
        out[0] -= 1e10  # disable END_OF_TEXT (token id 0)

        token = pipeline.sample_logits(out, temperature=GEN_TEMP, top_p=GEN_TOP_P)

        out, model_state = model.forward([token], model_state)
        model_tokens += [token]

        # Decide on end-of-reply by token id, before any decoding. The original
        # printed the pending text in the streaming branch and then again in the
        # EOS branch, duplicating the tail of the reply and the EOS marker.
        if token == eos_token_id:
            # Flush whatever is still buffered exactly once; the EOS marker
            # itself is a control token and is not echoed.
            print(tokenizer.decode(out_tokens[out_last:]), end="\n\n", flush=True)
            break

        out_tokens += [token]

        for seen in occurrence:
            occurrence[seen] *= GEN_penalty_decay
        occurrence[token] = 1 + occurrence.get(token, 0)

        tmp = tokenizer.decode(out_tokens[out_last:])
        # Only print and advance out_last when the text contains no U+FFFD
        # replacement character and does not end with a newline.
        if ("\ufffd" not in tmp) and (not tmp.endswith("\n")):
            print(tmp, end="", flush=True)
            out_last = len(out_tokens)
MiniMind2_tokenizer/special_tokens_map.json ADDED
@@ -0,0 +1,30 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<|im_start|>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "<|im_end|>",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "<|endoftext|>",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ },
23
+ "unk_token": {
24
+ "content": "<|endoftext|>",
25
+ "lstrip": false,
26
+ "normalized": false,
27
+ "rstrip": false,
28
+ "single_word": false
29
+ }
30
+ }
MiniMind2_tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
MiniMind2_tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "add_bos_token": false,
3
+ "add_eos_token": false,
4
+ "add_prefix_space": false,
5
+ "added_tokens_decoder": {
6
+ "0": {
7
+ "content": "<|endoftext|>",
8
+ "lstrip": false,
9
+ "normalized": false,
10
+ "rstrip": false,
11
+ "single_word": false,
12
+ "special": true
13
+ },
14
+ "1": {
15
+ "content": "<|im_start|>",
16
+ "lstrip": false,
17
+ "normalized": false,
18
+ "rstrip": false,
19
+ "single_word": false,
20
+ "special": true
21
+ },
22
+ "2": {
23
+ "content": "<|im_end|>",
24
+ "lstrip": false,
25
+ "normalized": false,
26
+ "rstrip": false,
27
+ "single_word": false,
28
+ "special": true
29
+ }
30
+ },
31
+ "additional_special_tokens": [],
32
+ "bos_token": "<|im_start|>",
33
+ "chat_template": "{% if messages[0]['role'] == 'system' %}{% set system_message = messages[0]['content'] %}{{ '<|im_start|>system\\n' + system_message + '<|im_end|>\\n' }}{% else %}{{ '<|im_start|>system\\nYou are a helpful assistant<|im_end|>\\n' }}{% endif %}{% for message in messages %}{% set content = message['content'] %}{% if message['role'] == 'user' %}{{ '<|im_start|>user\\n' + content + '<|im_end|>\\n<|im_start|>assistant\\n' }}{% elif message['role'] == 'assistant' %}{{ content + '<|im_end|>' + '\\n' }}{% endif %}{% endfor %}",
34
+ "clean_up_tokenization_spaces": false,
35
+ "eos_token": "<|im_end|>",
36
+ "extra_special_tokens": {},
37
+ "legacy": true,
38
+ "model_max_length": 32768,
39
+ "pad_token": "<|endoftext|>",
40
+ "sp_model_kwargs": {},
41
+ "spaces_between_special_tokens": false,
42
+ "tokenizer_class": "PreTrainedTokenizer",
43
+ "unk_token": "<|endoftext|>"
44
+ }
README.md CHANGED
@@ -1,3 +1,85 @@
1
- ---
2
- license: apache-2.0
3
- ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ datasets:
4
+ - jingyaogong/minimind_dataset
5
+ language:
6
+ - zh
7
+ - en
8
+ tags:
9
+ - 34.2M
10
+ ---
11
+
12
+
13
+ # 🪿 Mini-RWKV-V7-LM
14
+ 🚀 让我们来从头训练一个属于自己的Mini-RWKV-7吧~ 小小的鹅也能飞得很高喔~
15
+
16
+ <div align="center">
17
+ <img src="./miniGoose.png" width="200" height="200" style="display: block; margin: auto;">
18
+ </div>
19
+
20
+ ## 🌟 模型简介
21
+ 前往 [**Mini_RWKV_7**](https://github.com/Alic-Li/Mini_RWKV_7) 查看完整项目
22
+
23
+ 本模型是基于 **RWKV-V7 架构** 训练的一个 **34M 参数量** 的语言模型`Mini-RWKV-V7-LM-34M`。它在保持轻量的同时,具备良好的语言理解和生成能力,非常适合资源极其有限的设备部署和快速迭代开发。
24
+
25
+ ---
26
+
27
+ ## 📦 模型结构
28
+
29
+ | 参数 | 数值 |
30
+ |------|------|
31
+ | 参数量 | 34.2M 🎯 |
32
+ | 层数 | 8 🧱 |
33
+ | 隐藏维度 | 512 📐 |
34
+ | 上下文长度 | 512->1024->2048 📏 |
35
+ | 词表大小 | 6400 📚 |
36
+ - Vocab 和MiniMind的保持一致
37
+ ---
38
+
39
+ ## 🧪 训练信息
40
+
41
+ - 🪿 架构:[RWKV-V7](https://github.com/BlinkDL/RWKV-LM)
42
+ - 📚 数据源:[minimind_dataset](https://huggingface.co/datasets/jingyaogong/minimind_dataset) 特别感谢MiniMind的作者 [@jingyaogong](https://github.com/jingyaogong)开源了训练数据集 🤗
43
+ - 📈 学习率:动态调整
44
+ - 🖥️ 硬件:可以使用4060laptop等显卡进行训练,甚至Radeon 780M 核显也可以在轻薄本上进行训练 😜
45
+ - 👀我是在AMD Instinct MI300X 上快速复现的(十分感谢AMD公司对我个人以及RWKV的云算力赞助)😊
46
+ - 📦 模型大小:68.4MB 参数量 34.2M Params
47
+ - 📊 预训练损失曲线:预训练收敛稳定,loss 在 2.12 左右波动(因为预训练数据量比较少)
48
+ - 📊 SFT训练损失曲线 SFT训练最终loss=0.5左右波动
49
+
50
+ ---
51
+ ## 🎉 效果展示
52
+ ![火星旅行小说](./img/img_1.png)
53
+ ![冒泡排序代码](./img/img_2.png)
54
+ ![心理问答](./img/img_3.png)
55
+ ---
56
+
57
+ ## 🧰 推理方法
58
+
59
+ ### 🐍 安装依赖
60
+
61
+ ```bash
62
+ pip install torch numpy prompt_toolkit transformers rwkv
63
+ ```
64
+ - 如果你使用的是AMD显卡,请安装对应最新版本的torch
65
+ - 比如说```pip3 install torch --index-url https://download.pytorch.org/whl/rocm6.3```
66
+ - 具体安装指令可以参考[Pytorch官网下载链接](https://pytorch.org/get-started/locally/)
67
+
68
+
69
+ ### 🧪 加载模型 & 推理示例
70
+
71
+ ```bash
72
+ python3 ./API_DEMO_CHAT.py
73
+ ```
74
+ ## 📢 致谢
75
+
76
+ - 🖥️ 感谢AMD公司对我个人以及RWKV的云算力赞助
77
+ - 🙌 感谢 RWKV 社区提供的开源代码和训练框架!
78
+ - 🚀 感谢 [MiniMind](https://github.com/jingyaogong/minimind) 提供的 README 模板灵感!
79
+ - 如发现 bug 或有任何建议,欢迎提交 issue 或 PR 🛠️
80
+
81
+ ---
82
+
83
+ 🎉 感谢小伙伴们使用 **Mini_RWKV_7**!如果你喜欢这个项目,欢迎推给大家一起来玩!🌟
84
+
85
+ ---
img/img_1.png ADDED

Git LFS Details

  • SHA256: 342bec9c43999bc6ffdb3263398827b4cc6b3226a19312632db9eba5a1e9b716
  • Pointer size: 131 Bytes
  • Size of remote file: 260 kB
img/img_2.png ADDED

Git LFS Details

  • SHA256: 2bf560bd59577bfb5e3cf7182c8aaeefa983b19dd477fdd7e3245cb312b0e78b
  • Pointer size: 131 Bytes
  • Size of remote file: 272 kB
img/img_3.png ADDED

Git LFS Details

  • SHA256: d29f08a86171af93877b96597dd02a30f56956054a8fcf16291912c635e7a867
  • Pointer size: 131 Bytes
  • Size of remote file: 349 kB
miniGoose.png ADDED

Git LFS Details

  • SHA256: f3b3cdb84721cda5d1944473e2e1b37d4cfb7a078a05ef7211c5edcc17909a0c
  • Pointer size: 131 Bytes
  • Size of remote file: 862 kB
rwkv-final-sft-1024.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a03dd08fbbc44e93fda601a8db61e7018bfd10831c871c9b2c5beaed9dab4f28
3
+ size 68354364
rwkv-final-sft-2048.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09459cc9b8cf413e71ab867d7be5673f4d5b554d8fb87cf8669e4aa34599152f
3
+ size 68354364
rwkv-final-sft-512.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:da5384f647c2eb6cebe067acce030d0590e047c61b54dee21179083a6d42b672
3
+ size 68354364