copper-light committed on
Commit
e0447d0
·
1 Parent(s): 397ac9b

Update: first

Browse files
T5.ipynb ADDED
@@ -0,0 +1,158 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "2c4eea53",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "ename": "AttributeError",
11
+ "evalue": "'OutStream' object has no attribute 'reconfigure'",
12
+ "output_type": "error",
13
+ "traceback": [
14
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
15
+ "\u001b[31mAttributeError\u001b[39m Traceback (most recent call last)",
16
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[2]\u001b[39m\u001b[32m, line 2\u001b[39m\n\u001b[32m 1\u001b[39m \u001b[38;5;28;01mimport\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[34;01msys\u001b[39;00m\n\u001b[32m----> \u001b[39m\u001b[32m2\u001b[39m \u001b[43msys\u001b[49m\u001b[43m.\u001b[49m\u001b[43mstdout\u001b[49m\u001b[43m.\u001b[49m\u001b[43mreconfigure\u001b[49m(encoding=\u001b[33m'\u001b[39m\u001b[33mutf-8\u001b[39m\u001b[33m'\u001b[39m)\n",
17
+ "\u001b[31mAttributeError\u001b[39m: 'OutStream' object has no attribute 'reconfigure'"
18
+ ]
19
+ }
20
+ ],
21
+ "source": [
22
+ "import sys\n",
23
+ "sys.stdout.reconfigure(encoding='utf-8')"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 4,
29
+ "id": "c2185684",
30
+ "metadata": {},
31
+ "outputs": [
32
+ {
33
+ "name": "stdout",
34
+ "output_type": "stream",
35
+ "text": [
36
+ "Collecting simpleT5\n",
37
+ " Using cached simplet5-0.1.4.tar.gz (7.3 kB)\n",
38
+ " Installing build dependencies: started\n",
39
+ " Installing build dependencies: finished with status 'done'\n",
40
+ " Getting requirements to build wheel: started\n",
41
+ " Getting requirements to build wheel: finished with status 'done'\n",
42
+ " Preparing metadata (pyproject.toml): started\n",
43
+ " Preparing metadata (pyproject.toml): finished with status 'done'\n",
44
+ "Requirement already satisfied: numpy in d:\\programming\\workspace_python\\study_llm\\venv\\lib\\site-packages (from simpleT5) (2.1.2)\n",
45
+ "Requirement already satisfied: pandas in d:\\programming\\workspace_python\\study_llm\\venv\\lib\\site-packages (from simpleT5) (2.3.1)\n",
46
+ "Collecting sentencepiece (from simpleT5)\n",
47
+ " Using cached sentencepiece-0.2.0.tar.gz (2.6 MB)\n",
48
+ " Installing build dependencies: started\n",
49
+ " Installing build dependencies: finished with status 'done'\n",
50
+ " Getting requirements to build wheel: started\n",
51
+ " Getting requirements to build wheel: finished with status 'error'\n"
52
+ ]
53
+ },
54
+ {
55
+ "name": "stderr",
56
+ "output_type": "stream",
57
+ "text": [
58
+ " error: subprocess-exited-with-error\n",
59
+ " \n",
60
+ " × Getting requirements to build wheel did not run successfully.\n",
61
+ " │ exit code: 1\n",
62
+ " ╰─> [48 lines of output]\n",
63
+ " Traceback (most recent call last):\n",
64
+ " File \u001b[35m\"D:\\Programming\\workspace_python\\Study_LLM\\venv\\Lib\\site-packages\\pip\\_vendor\\pyproject_hooks\\_in_process\\_in_process.py\"\u001b[0m, line \u001b[35m389\u001b[0m, in \u001b[35m<module>\u001b[0m\n",
65
+ " \u001b[31mmain\u001b[0m\u001b[1;31m()\u001b[0m\n",
66
+ " \u001b[31m~~~~\u001b[0m\u001b[1;31m^^\u001b[0m\n",
67
+ " File \u001b[35m\"D:\\Programming\\workspace_python\\Study_LLM\\venv\\Lib\\site-packages\\pip\\_vendor\\pyproject_hooks\\_in_process\\_in_process.py\"\u001b[0m, line \u001b[35m373\u001b[0m, in \u001b[35mmain\u001b[0m\n",
68
+ " json_out[\"return_val\"] = \u001b[31mhook\u001b[0m\u001b[1;31m(**hook_input[\"kwargs\"])\u001b[0m\n",
69
+ " \u001b[31m~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
70
+ " File \u001b[35m\"D:\\Programming\\workspace_python\\Study_LLM\\venv\\Lib\\site-packages\\pip\\_vendor\\pyproject_hooks\\_in_process\\_in_process.py\"\u001b[0m, line \u001b[35m143\u001b[0m, in \u001b[35mget_requires_for_build_wheel\u001b[0m\n",
71
+ " return hook(config_settings)\n",
72
+ " File \u001b[35m\"C:\\Users\\han\\AppData\\Local\\Temp\\pip-build-env-72ht70wk\\overlay\\Lib\\site-packages\\setuptools\\build_meta.py\"\u001b[0m, line \u001b[35m331\u001b[0m, in \u001b[35mget_requires_for_build_wheel\u001b[0m\n",
73
+ " return \u001b[31mself._get_build_requires\u001b[0m\u001b[1;31m(config_settings, requirements=[])\u001b[0m\n",
74
+ " \u001b[31m~~~~~~~~~~~~~~~~~~~~~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
75
+ " File \u001b[35m\"C:\\Users\\han\\AppData\\Local\\Temp\\pip-build-env-72ht70wk\\overlay\\Lib\\site-packages\\setuptools\\build_meta.py\"\u001b[0m, line \u001b[35m301\u001b[0m, in \u001b[35m_get_build_requires\u001b[0m\n",
76
+ " \u001b[31mself.run_setup\u001b[0m\u001b[1;31m()\u001b[0m\n",
77
+ " \u001b[31m~~~~~~~~~~~~~~\u001b[0m\u001b[1;31m^^\u001b[0m\n",
78
+ " File \u001b[35m\"C:\\Users\\han\\AppData\\Local\\Temp\\pip-build-env-72ht70wk\\overlay\\Lib\\site-packages\\setuptools\\build_meta.py\"\u001b[0m, line \u001b[35m512\u001b[0m, in \u001b[35mrun_setup\u001b[0m\n",
79
+ " \u001b[31msuper().run_setup\u001b[0m\u001b[1;31m(setup_script=setup_script)\u001b[0m\n",
80
+ " \u001b[31m~~~~~~~~~~~~~~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
81
+ " File \u001b[35m\"C:\\Users\\han\\AppData\\Local\\Temp\\pip-build-env-72ht70wk\\overlay\\Lib\\site-packages\\setuptools\\build_meta.py\"\u001b[0m, line \u001b[35m317\u001b[0m, in \u001b[35mrun_setup\u001b[0m\n",
82
+ " \u001b[31mexec\u001b[0m\u001b[1;31m(code, locals())\u001b[0m\n",
83
+ " \u001b[31m~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^\u001b[0m\n",
84
+ " File \u001b[35m\"<string>\"\u001b[0m, line \u001b[35m128\u001b[0m, in \u001b[35m<module>\u001b[0m\n",
85
+ " File \u001b[35m\"C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.1520.0_x64__qbz5n2kfra8p0\\Lib\\subprocess.py\"\u001b[0m, line \u001b[35m414\u001b[0m, in \u001b[35mcheck_call\u001b[0m\n",
86
+ " retcode = call(*popenargs, **kwargs)\n",
87
+ " File \u001b[35m\"C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.1520.0_x64__qbz5n2kfra8p0\\Lib\\subprocess.py\"\u001b[0m, line \u001b[35m395\u001b[0m, in \u001b[35mcall\u001b[0m\n",
88
+ " with \u001b[31mPopen\u001b[0m\u001b[1;31m(*popenargs, **kwargs)\u001b[0m as p:\n",
89
+ " \u001b[31m~~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
90
+ " File \u001b[35m\"C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.1520.0_x64__qbz5n2kfra8p0\\Lib\\subprocess.py\"\u001b[0m, line \u001b[35m1039\u001b[0m, in \u001b[35m__init__\u001b[0m\n",
91
+ " \u001b[31mself._execute_child\u001b[0m\u001b[1;31m(args, executable, preexec_fn, close_fds,\u001b[0m\n",
92
+ " \u001b[31m~~~~~~~~~~~~~~~~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
93
+ " \u001b[1;31mpass_fds, cwd, env,\u001b[0m\n",
94
+ " \u001b[1;31m^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
95
+ " ...<5 lines>...\n",
96
+ " \u001b[1;31mgid, gids, uid, umask,\u001b[0m\n",
97
+ " \u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
98
+ " \u001b[1;31mstart_new_session, process_group)\u001b[0m\n",
99
+ " \u001b[1;31m^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
100
+ " File \u001b[35m\"C:\\Program Files\\WindowsApps\\PythonSoftwareFoundation.Python.3.13_3.13.1520.0_x64__qbz5n2kfra8p0\\Lib\\subprocess.py\"\u001b[0m, line \u001b[35m1554\u001b[0m, in \u001b[35m_execute_child\u001b[0m\n",
101
+ " hp, ht, pid, tid = \u001b[31m_winapi.CreateProcess\u001b[0m\u001b[1;31m(executable, args,\u001b[0m\n",
102
+ " \u001b[31m~~~~~~~~~~~~~~~~~~~~~\u001b[0m\u001b[1;31m^^^^^^^^^^^^^^^^^^\u001b[0m\n",
103
+ " \u001b[1;31m# no special security\u001b[0m\n",
104
+ " \u001b[1;31m^^^^^^^^^^^^^^^^^^^^^\u001b[0m\n",
105
+ " ...<4 lines>...\n",
106
+ " \u001b[1;31mcwd,\u001b[0m\n",
107
+ " \u001b[1;31m^^^^\u001b[0m\n",
108
+ " \u001b[1;31mstartupinfo)\u001b[0m\n",
109
+ " \u001b[1;31m^^^^^^^^^^^^\u001b[0m\n",
110
+ " \u001b[1;35mFileNotFoundError\u001b[0m: \u001b[35m[WinError 2] 지정된 파일을 찾을 수 없습니다\u001b[0m\n",
111
+ " [end of output]\n",
112
+ " \n",
113
+ " note: This error originates from a subprocess, and is likely not a problem with pip.\n",
114
+ "error: subprocess-exited-with-error\n",
115
+ "\n",
116
+ "× Getting requirements to build wheel did not run successfully.\n",
117
+ "│ exit code: 1\n",
118
+ "╰─> See above for output.\n",
119
+ "\n",
120
+ "note: This error originates from a subprocess, and is likely not a problem with pip.\n"
121
+ ]
122
+ }
123
+ ],
124
+ "source": [
125
+ "!pip install simpleT5 "
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": null,
131
+ "id": "566cf25f",
132
+ "metadata": {},
133
+ "outputs": [],
134
+ "source": []
135
+ }
136
+ ],
137
+ "metadata": {
138
+ "kernelspec": {
139
+ "display_name": "venv",
140
+ "language": "python",
141
+ "name": "python3"
142
+ },
143
+ "language_info": {
144
+ "codemirror_mode": {
145
+ "name": "ipython",
146
+ "version": 3
147
+ },
148
+ "file_extension": ".py",
149
+ "mimetype": "text/x-python",
150
+ "name": "python",
151
+ "nbconvert_exporter": "python",
152
+ "pygments_lexer": "ipython3",
153
+ "version": "3.13.5"
154
+ }
155
+ },
156
+ "nbformat": 4,
157
+ "nbformat_minor": 5
158
+ }
config.json/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 0,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 2,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 768,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.54.1",
29
+ "use_cache": true,
30
+ "vocab_size": 11954
31
+ }
glm.ipynb ADDED
@@ -0,0 +1,569 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 43,
6
+ "id": "03ab65cb",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from tokenizers.models import BPE\n",
11
+ "from tokenizers import Tokenizer\n",
12
+ "from tokenizers.decoders import ByteLevel as ByteLevelDecoder\n",
13
+ "from tokenizers.normalizers import Sequence, Lowercase\n",
14
+ "from tokenizers.pre_tokenizers import ByteLevel\n",
15
+ "from tokenizers.trainers import BpeTrainer"
16
+ ]
17
+ },
18
+ {
19
+ "cell_type": "code",
20
+ "execution_count": 44,
21
+ "id": "efe37f35",
22
+ "metadata": {},
23
+ "outputs": [],
24
+ "source": [
25
+ "tokenizer = Tokenizer(BPE())\n",
26
+ "tokenizer.normalizer = Sequence([Lowercase()])\n",
27
+ "tokenizer.pre_tokenizer = ByteLevel()\n",
28
+ "tokenizer.decoder = ByteLevelDecoder()"
29
+ ]
30
+ },
31
+ {
32
+ "cell_type": "code",
33
+ "execution_count": 45,
34
+ "id": "6a596c74",
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "trainer = BpeTrainer(vocab_size = 50000, initial_alphabet=ByteLevel.alphabet(), special_tokens=['<s>', '<pad>', '</s>','</unk>', '<mask>'])\n",
39
+ "tokenizer.train([\"../../datasets/austen-emma.txt\"], trainer)"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 46,
45
+ "id": "00138f04",
46
+ "metadata": {},
47
+ "outputs": [],
48
+ "source": [
49
+ "tokenizer.save(\"tokenizer_gpt/tokenizer.json\")"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 47,
55
+ "id": "ca9061de",
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "from transformers import (GPT2TokenizerFast, GPT2Config, GPT2LMHeadModel)\n",
60
+ "tokenizer_gpt = GPT2TokenizerFast.from_pretrained(\"tokenizer_gpt\")"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 48,
66
+ "id": "4e5f77ea",
67
+ "metadata": {},
68
+ "outputs": [
69
+ {
70
+ "data": {
71
+ "text/plain": [
72
+ "1"
73
+ ]
74
+ },
75
+ "execution_count": 48,
76
+ "metadata": {},
77
+ "output_type": "execute_result"
78
+ }
79
+ ],
80
+ "source": [
81
+ "tokenizer_gpt.add_special_tokens({\n",
82
+ " \"eos_token\": \"</s>\",\n",
83
+ " \"bos_token\": \"<s>\",\n",
84
+ " \"unk_token\": \"<unk>\",\n",
85
+ " \"pad_token\": \"<pad>\",\n",
86
+ " \"mask_token\": \"<mask>\"\n",
87
+ "})"
88
+ ]
89
+ },
90
+ {
91
+ "cell_type": "code",
92
+ "execution_count": 49,
93
+ "id": "ab84b4f2",
94
+ "metadata": {},
95
+ "outputs": [
96
+ {
97
+ "data": {
98
+ "text/plain": [
99
+ "[0, 469, 293, 225, 2]"
100
+ ]
101
+ },
102
+ "execution_count": 49,
103
+ "metadata": {},
104
+ "output_type": "execute_result"
105
+ }
106
+ ],
107
+ "source": [
108
+ "tokenizer_gpt.encode(\"<s> thisis </s>\")"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 50,
114
+ "id": "2cda86e9",
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "data": {
119
+ "text/plain": [
120
+ "[0, 469, 361, 225, 2]"
121
+ ]
122
+ },
123
+ "execution_count": 50,
124
+ "metadata": {},
125
+ "output_type": "execute_result"
126
+ }
127
+ ],
128
+ "source": [
129
+ "tokenizer_gpt.encode(\"<s> this is </s>\")"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 51,
135
+ "id": "5d7d2260",
136
+ "metadata": {},
137
+ "outputs": [],
138
+ "source": [
139
+ "config = GPT2Config(\n",
140
+ " vocab_size = tokenizer_gpt.vocab_size,\n",
141
+ " bos_token_id = tokenizer_gpt.bos_token_id,\n",
142
+ " eos_token_id = tokenizer_gpt.eos_token_id\n",
143
+ ")\n",
144
+ "\n",
145
+ "model = GPT2LMHeadModel(config)"
146
+ ]
147
+ },
148
+ {
149
+ "cell_type": "code",
150
+ "execution_count": 52,
151
+ "id": "45d2da31",
152
+ "metadata": {},
153
+ "outputs": [
154
+ {
155
+ "data": {
156
+ "text/plain": [
157
+ "GPT2Config {\n",
158
+ " \"activation_function\": \"gelu_new\",\n",
159
+ " \"attn_pdrop\": 0.1,\n",
160
+ " \"bos_token_id\": 0,\n",
161
+ " \"embd_pdrop\": 0.1,\n",
162
+ " \"eos_token_id\": 2,\n",
163
+ " \"initializer_range\": 0.02,\n",
164
+ " \"layer_norm_epsilon\": 1e-05,\n",
165
+ " \"model_type\": \"gpt2\",\n",
166
+ " \"n_embd\": 768,\n",
167
+ " \"n_head\": 12,\n",
168
+ " \"n_inner\": null,\n",
169
+ " \"n_layer\": 12,\n",
170
+ " \"n_positions\": 1024,\n",
171
+ " \"reorder_and_upcast_attn\": false,\n",
172
+ " \"resid_pdrop\": 0.1,\n",
173
+ " \"scale_attn_by_inverse_layer_idx\": false,\n",
174
+ " \"scale_attn_weights\": true,\n",
175
+ " \"summary_activation\": null,\n",
176
+ " \"summary_first_dropout\": 0.1,\n",
177
+ " \"summary_proj_to_labels\": true,\n",
178
+ " \"summary_type\": \"cls_index\",\n",
179
+ " \"summary_use_proj\": true,\n",
180
+ " \"transformers_version\": \"4.54.1\",\n",
181
+ " \"use_cache\": true,\n",
182
+ " \"vocab_size\": 11954\n",
183
+ "}"
184
+ ]
185
+ },
186
+ "execution_count": 52,
187
+ "metadata": {},
188
+ "output_type": "execute_result"
189
+ }
190
+ ],
191
+ "source": [
192
+ "config"
193
+ ]
194
+ },
195
+ {
196
+ "cell_type": "code",
197
+ "execution_count": 53,
198
+ "id": "692b12c5",
199
+ "metadata": {},
200
+ "outputs": [
201
+ {
202
+ "data": {
203
+ "text/plain": [
204
+ "GPT2LMHeadModel(\n",
205
+ " (transformer): GPT2Model(\n",
206
+ " (wte): Embedding(11954, 768)\n",
207
+ " (wpe): Embedding(1024, 768)\n",
208
+ " (drop): Dropout(p=0.1, inplace=False)\n",
209
+ " (h): ModuleList(\n",
210
+ " (0-11): 12 x GPT2Block(\n",
211
+ " (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
212
+ " (attn): GPT2Attention(\n",
213
+ " (c_attn): Conv1D(nf=2304, nx=768)\n",
214
+ " (c_proj): Conv1D(nf=768, nx=768)\n",
215
+ " (attn_dropout): Dropout(p=0.1, inplace=False)\n",
216
+ " (resid_dropout): Dropout(p=0.1, inplace=False)\n",
217
+ " )\n",
218
+ " (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
219
+ " (mlp): GPT2MLP(\n",
220
+ " (c_fc): Conv1D(nf=3072, nx=768)\n",
221
+ " (c_proj): Conv1D(nf=768, nx=3072)\n",
222
+ " (act): NewGELUActivation()\n",
223
+ " (dropout): Dropout(p=0.1, inplace=False)\n",
224
+ " )\n",
225
+ " )\n",
226
+ " )\n",
227
+ " (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)\n",
228
+ " )\n",
229
+ " (lm_head): Linear(in_features=768, out_features=11954, bias=False)\n",
230
+ ")"
231
+ ]
232
+ },
233
+ "execution_count": 53,
234
+ "metadata": {},
235
+ "output_type": "execute_result"
236
+ }
237
+ ],
238
+ "source": [
239
+ "model"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 54,
245
+ "id": "49b1f246",
246
+ "metadata": {},
247
+ "outputs": [],
248
+ "source": [
249
+ "with open(\"../../datasets/austen-emma.txt\", \"r\", encoding='utf-8') as f:\n",
250
+ " content = f.readlines()"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 55,
256
+ "id": "306508f7",
257
+ "metadata": {},
258
+ "outputs": [],
259
+ "source": [
260
+ "content_p = []\n",
261
+ "for c in content:\n",
262
+ " if len(c) > 10:\n",
263
+ " content_p.append(c.strip())\n",
264
+ "content_p = ' '.join(content_p) + tokenizer_gpt.eos_token"
265
+ ]
266
+ },
267
+ {
268
+ "cell_type": "code",
269
+ "execution_count": 56,
270
+ "id": "4536854a",
271
+ "metadata": {},
272
+ "outputs": [],
273
+ "source": [
274
+ "tokenized_content = tokenizer_gpt.encode(content_p)"
275
+ ]
276
+ },
277
+ {
278
+ "cell_type": "code",
279
+ "execution_count": 57,
280
+ "id": "a1b0aacc",
281
+ "metadata": {},
282
+ "outputs": [
283
+ {
284
+ "data": {
285
+ "text/plain": [
286
+ "195221"
287
+ ]
288
+ },
289
+ "execution_count": 57,
290
+ "metadata": {},
291
+ "output_type": "execute_result"
292
+ }
293
+ ],
294
+ "source": [
295
+ "len(tokenized_content)"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "code",
300
+ "execution_count": 58,
301
+ "id": "b475fdde",
302
+ "metadata": {},
303
+ "outputs": [],
304
+ "source": [
305
+ "sample_len = 100\n",
306
+ "examples = []\n",
307
+ "for i in range(0, len(tokenized_content) - sample_len + 1):\n",
308
+ " examples.append(\n",
309
+ " tokenized_content[i:i+ sample_len]\n",
310
+ " )\n",
311
+ "\n",
312
+ "train_data = []\n",
313
+ "labels = []\n",
314
+ "for example in examples:\n",
315
+ " train_data.append(example[:-1])\n",
316
+ " labels.append(example[1:])"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 59,
322
+ "id": "ed046662",
323
+ "metadata": {},
324
+ "outputs": [],
325
+ "source": [
326
+ "import torch\n",
327
+ "from torch.utils.data import TensorDataset, DataLoader\n",
328
+ "import torch.nn.functional as F\n",
329
+ "\n",
330
+ "buffer = 500\n",
331
+ "batch_size = 64\n",
332
+ "\n",
333
+ "train_data = torch.Tensor(train_data).to(dtype=torch.long).cuda()\n",
334
+ "labels = torch.Tensor(labels).to(dtype=torch.long).cuda()\n",
335
+ "dataset = TensorDataset(train_data, labels)\n",
336
+ "\n",
337
+ "loader = DataLoader(dataset, batch_size=batch_size, drop_last=True, shuffle=True)"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 60,
343
+ "id": "dc687478",
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "from torch.optim import Adam\n",
348
+ "import torch.nn as nn\n",
349
+ "from tqdm import tqdm\n",
350
+ "\n",
351
+ "model = model.cuda()\n",
352
+ "optimizer = Adam(model.parameters(), lr=3e-5, eps=1e-08)\n",
353
+ "criterion = nn.CrossEntropyLoss()"
354
+ ]
355
+ },
356
+ {
357
+ "cell_type": "code",
358
+ "execution_count": null,
359
+ "id": "99b192b4",
360
+ "metadata": {},
361
+ "outputs": [
362
+ {
363
+ "name": "stderr",
364
+ "output_type": "stream",
365
+ "text": [
366
+ " 0%| | 0/3048 [00:00<?, ?it/s]C:\\Users\\han\\AppData\\Local\\Temp\\ipykernel_32488\\611308887.py:17: UserWarning: Implicit dimension choice for softmax has been deprecated. Change the call to include dim=X as an argument.\n",
367
+ " pred = F.softmax(pred)\n",
368
+ "100%|██████████| 3048/3048 [07:58<00:00, 6.37it/s, loss: 0.0061, aucc: 0.4009]"
369
+ ]
370
+ },
371
+ {
372
+ "name": "stdout",
373
+ "output_type": "stream",
374
+ "text": [
375
+ "tensor(0.0085, device='cuda:0', grad_fn=<DivBackward0>)\n"
376
+ ]
377
+ },
378
+ {
379
+ "name": "stderr",
380
+ "output_type": "stream",
381
+ "text": [
382
+ "\n"
383
+ ]
384
+ },
385
+ {
386
+ "ename": "TypeError",
387
+ "evalue": "len() takes exactly one argument (0 given)",
388
+ "output_type": "error",
389
+ "traceback": [
390
+ "\u001b[31m---------------------------------------------------------------------------\u001b[39m",
391
+ "\u001b[31mTypeError\u001b[39m Traceback (most recent call last)",
392
+ "\u001b[36mCell\u001b[39m\u001b[36m \u001b[39m\u001b[32mIn[79]\u001b[39m\u001b[32m, line 26\u001b[39m\n\u001b[32m 23\u001b[39m progress.set_postfix_str(\u001b[33mf\u001b[39m\u001b[33m\"\u001b[39m\u001b[33mloss: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00mloss.cpu().detach().numpy()\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.04f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m, aucc: \u001b[39m\u001b[38;5;132;01m{\u001b[39;00maucc.cpu()/cnt\u001b[38;5;132;01m:\u001b[39;00m\u001b[33m.04f\u001b[39m\u001b[38;5;132;01m}\u001b[39;00m\u001b[33m\"\u001b[39m)\n\u001b[32m 25\u001b[39m \u001b[38;5;28mprint\u001b[39m(losses / \u001b[38;5;28mlen\u001b[39m(loader))\n\u001b[32m---> \u001b[39m\u001b[32m26\u001b[39m \u001b[38;5;28mprint\u001b[39m(aucc / \u001b[38;5;28;43mlen\u001b[39;49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m)\n",
393
+ "\u001b[31mTypeError\u001b[39m: len() takes exactly one argument (0 given)"
394
+ ]
395
+ }
396
+ ],
397
+ "source": [
398
+ "model.train()\n",
399
+ "# for epoch in tqdm(range(0, 1)):\n",
400
+ "losses = 0.\n",
401
+ "aucc = 0\n",
402
+ "cnt = 0\n",
403
+ "progress = tqdm(loader)\n",
404
+ "for x, y in progress:\n",
405
+ " pred = model(x).logits\n",
406
+ " \n",
407
+ " y = F.one_hot(y, num_classes=tokenizer_gpt.vocab_size)\n",
408
+ " \n",
409
+ " loss = criterion(pred.to(dtype=torch.float32), y.to(dtype=torch.float32))\n",
410
+ " optimizer.zero_grad()\n",
411
+ " loss.backward()\n",
412
+ " optimizer.step()\n",
413
+ " \n",
414
+ " pred = F.softmax(pred)\n",
415
+ " y = torch.argmax(y, dim=2)\n",
416
+ " aucc += torch.sum(pred == y)\n",
417
+ " cnt += (batch_size*99)\n",
418
+ " losses += loss\n",
419
+ " progress.set_postfix_str(f\"loss: {loss.cpu().detach().numpy():.04f}, aucc: {aucc.cpu()/cnt:.04f}\")\n",
420
+ " \n",
421
+ "print(losses / len(loader))\n",
422
+ "print(aucc / cnt)"
423
+ ]
424
+ },
425
+ {
426
+ "cell_type": "code",
427
+ "execution_count": 90,
428
+ "id": "5f0c3c7a",
429
+ "metadata": {},
430
+ "outputs": [],
431
+ "source": [
432
+ "def generate(start, model):\n",
433
+ " input_token_ids = tokenizer_gpt.encode(start, return_tensors='pt').cuda()\n",
434
+ " output = model.generate(\n",
435
+ " input_token_ids,\n",
436
+ " max_length= 500,\n",
437
+ " num_beams = 5,\n",
438
+ " temperature=0.7,\n",
439
+ " no_repeat_ngram_size=2,\n",
440
+ " num_return_sequences=1\n",
441
+ " )\n",
442
+ " return tokenizer_gpt.decode(output[0])"
443
+ ]
444
+ },
445
+ {
446
+ "cell_type": "code",
447
+ "execution_count": 92,
448
+ "id": "7acad8eb",
449
+ "metadata": {},
450
+ "outputs": [
451
+ {
452
+ "name": "stderr",
453
+ "output_type": "stream",
454
+ "text": [
455
+ "The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.\n",
456
+ "The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.\n",
457
+ "Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.\n"
458
+ ]
459
+ },
460
+ {
461
+ "data": {
462
+ "text/plain": [
463
+ "' yes!\" cried emma.--\"my dearest harriet, mr. weston, who had been walking away from hartfield, he would never have seen him before; and while she did not quite understand how it might be supposed that she could give up the idea of any body\\'s coming to such a thing by his manners. \"i am afraid,\" he replied, \"that you must be the greatest pleasure. you do not know what your father would have heard.\" \"oh! yes, my dear--but i dare say i am sure i shall think they will think you are very much obliged to be sure.--but this is an excellent miss smith, however, indeed; but there is being _you are quite enough to make one of course.--i can hardly ever hear of every thing to see nothing else.\" she comes in love with me, sir?--well--a very bad.--he is coming over this morning and yet quite complete in the whole.\" emma could not likely to call upon the same glance at all these words, if they walked on that sort of his feelings: i must take care about ten months ago, perhaps--what an old acquaintance with my father--and so far off in spite of the smallest degree or two men, i was forced to bring a few weeks ago as far as well, unless they are going to wait for _she is a great regard for a young lady\\'s conduct, though her husband and therefore.\" chapter xvi and mrs. frank churchill came out of having given me--he came back again, when they were all my mother might have done? she walked up in fact.--the same evening it was sitting down again--the case at this day?\" \"miss woodhouse\\'s eyes to keep your feelings did you walked off, and then?\" emma\\'s situation. chapter xiv some young woman\\'s manners to give me! 
he seemed to feel that period at weymouth.\" and nobody else, papa) the worst of hearing every body else--\" she came over-morrow.--they are so pleased with all that moment.--a little boys appeared till within half-day or four weeks, or three times a beautiful creature into the oddest creature!\" chapter xvii when she might not making a moment--\"the change?--you must wait a mile off!--but if this evening without being taken place, moreover, but one subject after making no account.--it seems every other people could never saw me; he made her mind was giving themselves off; her eyes made up these young'"
464
+ ]
465
+ },
466
+ "execution_count": 92,
467
+ "metadata": {},
468
+ "output_type": "execute_result"
469
+ }
470
+ ],
471
+ "source": [
472
+ "generate(\" \", model)"
473
+ ]
474
+ },
475
+ {
476
+ "cell_type": "code",
477
+ "execution_count": 93,
478
+ "id": "b169491a",
479
+ "metadata": {},
480
+ "outputs": [],
481
+ "source": [
482
+ "from transformers import (WEIGHTS_NAME, CONFIG_NAME)"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "code",
487
+ "execution_count": 94,
488
+ "id": "690df94b",
489
+ "metadata": {},
490
+ "outputs": [
491
+ {
492
+ "data": {
493
+ "text/plain": [
494
+ "'pytorch_model.bin'"
495
+ ]
496
+ },
497
+ "execution_count": 94,
498
+ "metadata": {},
499
+ "output_type": "execute_result"
500
+ }
501
+ ],
502
+ "source": [
503
+ "WEIGHTS_NAME"
504
+ ]
505
+ },
506
+ {
507
+ "cell_type": "code",
508
+ "execution_count": 95,
509
+ "id": "b6d192d9",
510
+ "metadata": {},
511
+ "outputs": [
512
+ {
513
+ "data": {
514
+ "text/plain": [
515
+ "'config.json'"
516
+ ]
517
+ },
518
+ "execution_count": 95,
519
+ "metadata": {},
520
+ "output_type": "execute_result"
521
+ }
522
+ ],
523
+ "source": [
524
+ "CONFIG_NAME"
525
+ ]
526
+ },
527
+ {
528
+ "cell_type": "code",
529
+ "execution_count": 96,
530
+ "id": "e4abd96c",
531
+ "metadata": {},
532
+ "outputs": [],
533
+ "source": [
534
+ "model.save_pretrained(WEIGHTS_NAME)"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": 97,
540
+ "id": "d719459b",
541
+ "metadata": {},
542
+ "outputs": [],
543
+ "source": [
544
+ "config.save_pretrained(CONFIG_NAME)"
545
+ ]
546
+ }
547
+ ],
548
+ "metadata": {
549
+ "kernelspec": {
550
+ "display_name": "venv",
551
+ "language": "python",
552
+ "name": "python3"
553
+ },
554
+ "language_info": {
555
+ "codemirror_mode": {
556
+ "name": "ipython",
557
+ "version": 3
558
+ },
559
+ "file_extension": ".py",
560
+ "mimetype": "text/x-python",
561
+ "name": "python",
562
+ "nbconvert_exporter": "python",
563
+ "pygments_lexer": "ipython3",
564
+ "version": "3.13.5"
565
+ }
566
+ },
567
+ "nbformat": 4,
568
+ "nbformat_minor": 5
569
+ }
pytorch_model.bin/config.json ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 0,
8
+ "embd_pdrop": 0.1,
9
+ "eos_token_id": 2,
10
+ "initializer_range": 0.02,
11
+ "layer_norm_epsilon": 1e-05,
12
+ "model_type": "gpt2",
13
+ "n_embd": 768,
14
+ "n_head": 12,
15
+ "n_inner": null,
16
+ "n_layer": 12,
17
+ "n_positions": 1024,
18
+ "reorder_and_upcast_attn": false,
19
+ "resid_pdrop": 0.1,
20
+ "scale_attn_by_inverse_layer_idx": false,
21
+ "scale_attn_weights": true,
22
+ "summary_activation": null,
23
+ "summary_first_dropout": 0.1,
24
+ "summary_proj_to_labels": true,
25
+ "summary_type": "cls_index",
26
+ "summary_use_proj": true,
27
+ "torch_dtype": "float32",
28
+ "transformers_version": "4.54.1",
29
+ "use_cache": true,
30
+ "vocab_size": 11954
31
+ }
pytorch_model.bin/generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 0,
4
+ "eos_token_id": 2,
5
+ "transformers_version": "4.54.1"
6
+ }
pytorch_model.bin/model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9b9efd942375a58ba5fd94dda20b8cdff0abbe41d52ae86752b2d44ad56bf86c
3
+ size 380107392
tokenizer_gpt/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff