dilip025 committed
Commit 32c6518 · verified · 1 Parent(s): ee25a5b

Upload 3 files

Files changed (2)
  1. README.md +33 -0
  2. tokenizer_config.json +7 -22
README.md ADDED
@@ -0,0 +1,33 @@
+ # Mini GPT1 Clone
+
+ This is a decoder-only transformer language model (GPT-1-style) trained from scratch in PyTorch.
+
+ ## Model Details
+
+ - **Architecture**: Decoder-only Transformer
+ - **Layers**: 6
+ - **Embedding Size**: 512
+ - **Attention Heads**: 8
+ - **Feedforward Dim**: 2048
+ - **Sequence Length**: 256
+ - **Vocab Size**: 35,000
+
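+ The hyperparameters above map onto a PyTorch module roughly as follows. This is a minimal, hypothetical sketch; the class name and layer choices are assumptions, not this repo's actual training code:
+
+ ```python
+ import torch
+ import torch.nn as nn
+
+ class MiniGPT1(nn.Module):
+     def __init__(self, vocab_size=35_000, d_model=512, n_heads=8,
+                  n_layers=6, d_ff=2048, max_len=256):
+         super().__init__()
+         self.tok_emb = nn.Embedding(vocab_size, d_model)  # token embeddings
+         self.pos_emb = nn.Embedding(max_len, d_model)     # learned positions, GPT-1 style
+         block = nn.TransformerEncoderLayer(d_model, n_heads, d_ff, batch_first=True)
+         # An encoder stack driven with a causal mask behaves as a decoder-only model.
+         self.blocks = nn.TransformerEncoder(block, n_layers)
+         self.lm_head = nn.Linear(d_model, vocab_size)
+
+     def forward(self, idx):
+         seq_len = idx.size(1)
+         pos = torch.arange(seq_len, device=idx.device)
+         x = self.tok_emb(idx) + self.pos_emb(pos)
+         causal = nn.Transformer.generate_square_subsequent_mask(seq_len).to(idx.device)
+         x = self.blocks(x, mask=causal)
+         return self.lm_head(x)  # (batch, seq_len, vocab_size) logits
+ ```
+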
+ ## Tokenizer
+
+ The tokenizer was trained with `ByteLevelBPETokenizer` from the `tokenizers` library.
+
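+ A minimal sketch of that training step, assuming a single-file corpus (the corpus path and `min_frequency` value are illustrative; the vocab size and special tokens match this repo's config):
+
+ ```python
+ from tokenizers import ByteLevelBPETokenizer
+
+ tokenizer = ByteLevelBPETokenizer()
+ tokenizer.train(
+     files=["corpus.txt"],  # assumed training corpus
+     vocab_size=35_000,     # matches the model's vocab size
+     min_frequency=2,       # assumed frequency cutoff
+     special_tokens=["<s>", "</s>", "<unk>"],  # matches tokenizer_config.json below
+ )
+ # Single-file format that PreTrainedTokenizerFast can load directly.
+ tokenizer.save("tokenizer/tokenizer.json")
+ ```
+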
+ ## Inference Example
+
+ ```python
+ from transformers import PreTrainedTokenizerFast, AutoModelForCausalLM
+
+ # Load the tokenizer from the local tokenizer.json and the weights from the Hub.
+ tokenizer = PreTrainedTokenizerFast(tokenizer_file="tokenizer/tokenizer.json")
+ model = AutoModelForCausalLM.from_pretrained("dilip025/mini-gpt1")
+
+ prompt = "Once upon a time,"
+ input_ids = tokenizer(prompt, return_tensors="pt").input_ids
+ outputs = model.generate(input_ids, max_length=50)  # greedy decoding by default
+ print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+ ```
+
+ ## License
+
+ MIT
tokenizer_config.json CHANGED
@@ -1,23 +1,8 @@
  {
- "add_bos_token": false,
- "add_prefix_space": false,
- "added_tokens_decoder": {
- "30000": {
- "content": "<|endoftext|>",
- "lstrip": false,
- "normalized": true,
- "rstrip": false,
- "single_word": false,
- "special": true
- }
- },
- "bos_token": "<|endoftext|>",
- "clean_up_tokenization_spaces": false,
- "eos_token": "<|endoftext|>",
- "errors": "replace",
- "extra_special_tokens": {},
- "model_max_length": 1000000000000000019884624838656,
- "pad_token": null,
- "tokenizer_class": "GPT2Tokenizer",
- "unk_token": "<|endoftext|>"
- }
+ "add_prefix_space": true,
+ "model_max_length": 256,
+ "tokenizer_class": "PreTrainedTokenizerFast",
+ "unk_token": "<unk>",
+ "bos_token": "<s>",
+ "eos_token": "</s>"
+ }
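
The updated config is what `PreTrainedTokenizerFast.from_pretrained` picks up when the repo is loaded from the Hub. A quick hypothetical check (the repo id comes from the README example above):

```python
from transformers import PreTrainedTokenizerFast

tok = PreTrainedTokenizerFast.from_pretrained("dilip025/mini-gpt1")
print(tok.model_max_length)                          # 256, from the updated config
print(tok.bos_token, tok.eos_token, tok.unk_token)   # <s> </s> <unk>
```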