Chydfile committed on
Commit
13ce9de
·
1 Parent(s): eb51d02

First model version_2

Browse files
Files changed (6) hide show
  1. merges.txt +1 -1
  2. special_tokens_map.json +1 -1
  3. test.ipynb +334 -0
  4. tokenizer.json +0 -0
  5. tokenizer_config.json +1 -1
  6. vocab.json +0 -0
merges.txt CHANGED
@@ -1,4 +1,4 @@
1
- #version: 0.2
2
  Ġ t
3
  Ġ a
4
  h e
 
1
+ #version: 0.2 - Trained by `huggingface/tokenizers`
2
  Ġ t
3
  Ġ a
4
  h e
special_tokens_map.json CHANGED
@@ -1 +1 @@
1
- {"bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}, "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true}}
 
1
+ {"bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "unk_token": "<|endoftext|>"}
test.ipynb ADDED
@@ -0,0 +1,334 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "metadata": {
3
+ "language_info": {
4
+ "codemirror_mode": {
5
+ "name": "ipython",
6
+ "version": 3
7
+ },
8
+ "file_extension": ".py",
9
+ "mimetype": "text/x-python",
10
+ "name": "python",
11
+ "nbconvert_exporter": "python",
12
+ "pygments_lexer": "ipython3",
13
+ "version": "3.7.3"
14
+ },
15
+ "orig_nbformat": 2,
16
+ "kernelspec": {
17
+ "name": "python3",
18
+ "display_name": "Python 3.7.3 64-bit",
19
+ "metadata": {
20
+ "interpreter": {
21
+ "hash": "31f2aee4e71d21fbe5cf8b01ff0e069b9275f58929596ceb00d14d90e3e16cd6"
22
+ }
23
+ }
24
+ }
25
+ },
26
+ "nbformat": 4,
27
+ "nbformat_minor": 2,
28
+ "cells": [
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 15,
32
+ "metadata": {},
33
+ "outputs": [],
34
+ "source": [
35
+ "from huggingface_hub.inference_api import InferenceApi"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 18,
41
+ "metadata": {},
42
+ "outputs": [],
43
+ "source": [
44
+ "inference = InferenceApi(repo_id=\"Mary222/made-ai-dungeon\", token=\"api_zGlifNbNLScvdQWYSXrLQJvMYetMLFCfuN\")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 19,
50
+ "metadata": {},
51
+ "outputs": [
52
+ {
53
+ "output_type": "execute_result",
54
+ "data": {
55
+ "text/plain": [
56
+ "{'error': 'Unrecognized model in Mary222/made-ai-dungeon. Should have a `model_type` key in its config.json, or contain one of the following strings in its name: vision-encoder-decoder, trocr, fnet, segformer, gptj, layoutlmv2, beit, rembert, visual_bert, canine, roformer, clip, bigbird_pegasus, deit, luke, detr, gpt_neo, big_bird, speech_to_text_2, speech_to_text, vit, wav2vec2, m2m_100, convbert, led, blenderbot-small, retribert, ibert, mt5, t5, mobilebert, distilbert, albert, bert-generation, camembert, xlm-roberta, pegasus, marian, mbart, megatron-bert, mpnet, bart, blenderbot, reformer, longformer, roberta, deberta-v2, deberta, flaubert, fsmt, squeezebert, hubert, bert, openai-gpt, gpt2, transfo-xl, xlnet, xlm-prophetnet, prophetnet, xlm, ctrl, electra, speech-encoder-decoder, encoder-decoder, funnel, lxmert, dpr, layoutlm, rag, tapas, splinter, sew-d, sew, unispeech-sat, unispeech'}"
57
+ ]
58
+ },
59
+ "metadata": {},
60
+ "execution_count": 19
61
+ }
62
+ ],
63
+ "source": [
64
+ "inference(inputs=\"The goal of life is\")"
65
+ ]
66
+ },
67
+ {
68
+ "cell_type": "code",
69
+ "execution_count": 20,
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "inference_gpt = InferenceApi(repo_id=\"Mary222/MADE_AI_Dungeon_model_RUS\", token=\"api_zGlifNbNLScvdQWYSXrLQJvMYetMLFCfuN\")"
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 21,
79
+ "metadata": {},
80
+ "outputs": [
81
+ {
82
+ "output_type": "execute_result",
83
+ "data": {
84
+ "text/plain": [
85
+ "{'error': 'Pipeline cannot infer suitable model classes from Mary222/MADE_AI_Dungeon_model_RUS'}"
86
+ ]
87
+ },
88
+ "metadata": {},
89
+ "execution_count": 21
90
+ }
91
+ ],
92
+ "source": [
93
+ "inference_gpt(inputs=\"The goal of life is\")"
94
+ ]
95
+ },
96
+ {
97
+ "cell_type": "code",
98
+ "execution_count": 13,
99
+ "metadata": {},
100
+ "outputs": [
101
+ {
102
+ "output_type": "stream",
103
+ "name": "stderr",
104
+ "text": [
105
+ "Downloading: 100%|██████████| 665/665 [00:00<00:00, 243kB/s]\n",
106
+ "Downloading: 100%|██████████| 548M/548M [00:52<00:00, 10.4MB/s]\n",
107
+ "Downloading: 100%|██████████| 1.04M/1.04M [00:00<00:00, 1.23MB/s]\n",
108
+ "Downloading: 100%|██████████| 456k/456k [00:00<00:00, 784kB/s]\n",
109
+ "Downloading: 100%|██████████| 1.36M/1.36M [00:00<00:00, 1.45MB/s]\n",
110
+ "Using pad_token, but it is not set yet.\n",
111
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n"
112
+ ]
113
+ },
114
+ {
115
+ "output_type": "execute_result",
116
+ "data": {
117
+ "text/plain": [
118
+ "[{'generated_text': \"Hello, I'm a language model, I'm writing a new language for you. But first, I'd like to tell you about the language itself\"},\n",
119
+ " {'generated_text': \"Hello, I'm a language model, and I'm trying to be as expressive as possible. In order to be expressive, it is necessary to know\"},\n",
120
+ " {'generated_text': \"Hello, I'm a language model, so I don't get much of a license anymore, but I'm probably more familiar with other languages on that\"},\n",
121
+ " {'generated_text': \"Hello, I'm a language model, a functional model... It's not me, it's me!\\n\\nI won't bore you with how\"},\n",
122
+ " {'generated_text': \"Hello, I'm a language model, not an object model.\\n\\nIn a nutshell, I need to give language model a set of properties that\"}]"
123
+ ]
124
+ },
125
+ "metadata": {},
126
+ "execution_count": 13
127
+ }
128
+ ],
129
+ "source": [
130
+ "from transformers import pipeline, set_seed\n",
131
+ "generator = pipeline('text-generation', model='gpt2')\n",
132
+ "set_seed(42)\n",
133
+ "generator(\"Hello, I'm a language model,\", max_length=30, num_return_sequences=5)"
134
+ ]
135
+ },
136
+ {
137
+ "cell_type": "code",
138
+ "execution_count": null,
139
+ "metadata": {},
140
+ "outputs": [],
141
+ "source": [
142
+ "from transformers import BertConfig, BertModel"
143
+ ]
144
+ },
145
+ {
146
+ "cell_type": "code",
147
+ "execution_count": null,
148
+ "metadata": {},
149
+ "outputs": [],
150
+ "source": [
151
+ "from transformers import GPT2Tokenizer"
152
+ ]
153
+ },
154
+ {
155
+ "cell_type": "code",
156
+ "execution_count": 27,
157
+ "metadata": {},
158
+ "outputs": [
159
+ {
160
+ "output_type": "error",
161
+ "ename": "ImportError",
162
+ "evalue": "cannot import name 'tokenizer' from 'transformers' (/Users/mariapopova/Library/Python/3.7/lib/python/site-packages/transformers/__init__.py)",
163
+ "traceback": [
164
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
165
+ "\u001b[0;31mImportError\u001b[0m Traceback (most recent call last)",
166
+ "\u001b[0;32m<ipython-input-27-6aff32822989>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mtransformers\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mAutoConfig\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mAutoModel\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mtokenizer\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
167
+ "\u001b[0;31mImportError\u001b[0m: cannot import name 'tokenizer' from 'transformers' (/Users/mariapopova/Library/Python/3.7/lib/python/site-packages/transformers/__init__.py)"
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "from transformers import AutoConfig, AutoModel"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 24,
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "model = AutoModel.from_pretrained('gpt2')"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 25,
187
+ "metadata": {},
188
+ "outputs": [],
189
+ "source": [
190
+ "import os\n",
191
+ "os.makedirs(\"Users/Project/GPT2_standard\")"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 32,
197
+ "metadata": {},
198
+ "outputs": [],
199
+ "source": [
200
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
201
+ ]
202
+ },
203
+ {
204
+ "cell_type": "code",
205
+ "execution_count": 36,
206
+ "metadata": {},
207
+ "outputs": [
208
+ {
209
+ "output_type": "execute_result",
210
+ "data": {
211
+ "text/plain": [
212
+ "('tokenizer/tokenizer_config.json',\n",
213
+ " 'tokenizer/special_tokens_map.json',\n",
214
+ " 'tokenizer/vocab.json',\n",
215
+ " 'tokenizer/merges.txt',\n",
216
+ " 'tokenizer/added_tokens.json')"
217
+ ]
218
+ },
219
+ "metadata": {},
220
+ "execution_count": 36
221
+ }
222
+ ],
223
+ "source": [
224
+ "model.save_pretrained(\"GPT2_model\")\n",
225
+ "tokenizer.save_pretrained(\"tokenizer\")"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 30,
231
+ "metadata": {},
232
+ "outputs": [],
233
+ "source": [
234
+ "from transformers import GPT2Tokenizer"
235
+ ]
236
+ },
237
+ {
238
+ "cell_type": "code",
239
+ "execution_count": 34,
240
+ "metadata": {},
241
+ "outputs": [],
242
+ "source": [
243
+ "inference_gpt_standard = InferenceApi(repo_id=\"Mary222/GPT2_standard\", token=\"api_zGlifNbNLScvdQWYSXrLQJvMYetMLFCfuN\")"
244
+ ]
245
+ },
246
+ {
247
+ "cell_type": "code",
248
+ "execution_count": 35,
249
+ "metadata": {},
250
+ "outputs": [
251
+ {
252
+ "output_type": "execute_result",
253
+ "data": {
254
+ "text/plain": [
255
+ "{'error': \"Can't load tokenizer using from_pretrained, please update its configuration: No such file or directory (os error 2)\"}"
256
+ ]
257
+ },
258
+ "metadata": {},
259
+ "execution_count": 35
260
+ }
261
+ ],
262
+ "source": [
263
+ "inference_gpt_standard(inputs=\"The goal of life is\")"
264
+ ]
265
+ },
266
+ {
267
+ "cell_type": "code",
268
+ "execution_count": null,
269
+ "metadata": {},
270
+ "outputs": [],
271
+ "source": [
272
+ "tokenizer = GPT2Tokenizer.from_pretrained(\"gpt2\")"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 38,
278
+ "metadata": {},
279
+ "outputs": [
280
+ {
281
+ "output_type": "execute_result",
282
+ "data": {
283
+ "text/plain": [
284
+ "('./tokenizer_config.json',\n",
285
+ " './special_tokens_map.json',\n",
286
+ " './vocab.json',\n",
287
+ " './merges.txt',\n",
288
+ " './added_tokens.json',\n",
289
+ " './tokenizer.json')"
290
+ ]
291
+ },
292
+ "metadata": {},
293
+ "execution_count": 38
294
+ }
295
+ ],
296
+ "source": [
297
+ "from transformers import AutoTokenizer\n",
298
+ "AutoTokenizer.from_pretrained(\"gpt2\").save_pretrained(\".\")"
299
+ ]
300
+ },
301
+ {
302
+ "cell_type": "code",
303
+ "execution_count": 39,
304
+ "metadata": {},
305
+ "outputs": [
306
+ {
307
+ "output_type": "execute_result",
308
+ "data": {
309
+ "text/plain": [
310
+ "('./tokenizer_config.json',\n",
311
+ " './special_tokens_map.json',\n",
312
+ " './vocab.json',\n",
313
+ " './merges.txt',\n",
314
+ " './added_tokens.json',\n",
315
+ " './tokenizer.json')"
316
+ ]
317
+ },
318
+ "metadata": {},
319
+ "execution_count": 39
320
+ }
321
+ ],
322
+ "source": [
323
+ "tokenizer"
324
+ ]
325
+ },
326
+ {
327
+ "cell_type": "code",
328
+ "execution_count": null,
329
+ "metadata": {},
330
+ "outputs": [],
331
+ "source": []
332
+ }
333
+ ]
334
+ }
tokenizer.json CHANGED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json CHANGED
@@ -1 +1 @@
1
- {"errors": "replace", "unk_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "bos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "eos_token": {"content": "<|endoftext|>", "single_word": false, "lstrip": false, "rstrip": false, "normalized": true, "__type": "AddedToken"}, "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "tokenizer_file": "/Users/mariapopova/.cache/huggingface/transformers/16a2f78023c8dc511294f0c97b5e10fde3ef9889ad6d11ffaa2a00714e73926e.cf2d0ecb83b6df91b3dbb53f1d1e4c311578bfd3aa0e04934215a49bf9898df0", "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}
 
1
+ {"unk_token": "<|endoftext|>", "bos_token": "<|endoftext|>", "eos_token": "<|endoftext|>", "add_prefix_space": false, "model_max_length": 1024, "special_tokens_map_file": null, "name_or_path": "gpt2", "tokenizer_class": "GPT2Tokenizer"}
vocab.json CHANGED
The diff for this file is too large to render. See raw diff