McClain committed on
Commit
ecaaa86
·
verified ·
1 Parent(s): b5656c9

Upload 7 files

Browse files
README.md ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # PlasmidGPT Model
2
+
3
+ This is a GPT-2-based model for engineered plasmid sequence generation, converted from a PyTorch `.pt` checkpoint to the Hugging Face Transformers format.
4
+
5
+ This is a supervised fine-tuned (SFT) version of [PlasmidGPT](https://github.com/lingxusb/PlasmidGPT) for engineered plasmids. The work was done by **Angus Cunningham** while in **Prof. Chris Barnes' lab at UCL**.
6
+
7
+ ## Model Details
8
+
9
+ - **Architecture**: GPT-2
10
+ - **Vocab Size**: 30,002
11
+ - **Hidden Size**: 768
12
+ - **Number of Layers**: 12
13
+ - **Number of Heads**: 12
14
+ - **Max Position Embeddings**: 2048
15
+ - **Parameters**: ~110M
16
+
17
+ ## Usage
18
+
19
+ ```python
20
+ from transformers import AutoModelForCausalLM, AutoTokenizer
21
+
22
+ model = AutoModelForCausalLM.from_pretrained("./plasmidgpt-model")
23
+ tokenizer = AutoTokenizer.from_pretrained("./plasmidgpt-model")
24
+
25
+ # Basic generation
26
+ inputs = tokenizer("ATGC", return_tensors="pt")
27
+ outputs = model.generate(**inputs, max_length=100)
28
+ generated_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)
29
+ print(generated_sequence)
30
+
31
+ # With sampling (for more diverse outputs)
32
+ outputs = model.generate(**inputs, max_length=100, do_sample=True, temperature=0.8, top_p=0.9)
33
+ generated_sequence = tokenizer.decode(outputs[0], skip_special_tokens=True)
34
+ print(generated_sequence)
35
+ ```
36
+
37
+ ### Example Outputs
38
+
39
+ **Input:** `ATGCGATCG`
40
+ **Generated:** `ATGCGATCGGTGGTAGGCACTGGATGATGGCCCTGCAGTGTAGCCGTAGTTATGAGCCTCGGGATTCTTTGATGATTCAGCCACCCTCATCATCCTCCTCCTCC...`
41
+
42
+ **Input:** `ATGGCC`
43
+ **Generated:** `ATGGCCTACATACCTTCAATTACCGAAACAAGGTGGTTCATCTCTAACGCTGTCCATAAAACCGCCCAGTCTAGCTATCGCCATTTGCGCATCTAACGTGGTAGGCACTCCGGGTCCGCGCC...`
44
+
45
+ ## Compatible With
46
+
47
+ This model uses the same architecture as [McClain/plasmidgpt-addgene-gpt2](https://huggingface.co/McClain/plasmidgpt-addgene-gpt2), but its weights differ from that pretrained model.
48
+
49
+ ## Files
50
+
51
+ - `config.json`: Model configuration
52
+ - `generation_config.json`: Generation parameters
53
+ - `model.safetensors`: Model weights in SafeTensors format
54
+ - `tokenizer.json`: Fast tokenizer data
55
+ - `tokenizer_config.json`: Tokenizer configuration
56
+ - `special_tokens_map.json`: Special token mappings
config.json ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "activation_function": "gelu_new",
3
+ "architectures": [
4
+ "GPT2LMHeadModel"
5
+ ],
6
+ "attn_pdrop": 0.1,
7
+ "bos_token_id": 50256,
8
+ "dtype": "float32",
9
+ "embd_pdrop": 0.1,
10
+ "eos_token_id": 50256,
11
+ "initializer_range": 0.02,
12
+ "layer_norm_epsilon": 1e-05,
13
+ "model_type": "gpt2",
14
+ "n_ctx": 2048,
15
+ "n_embd": 768,
16
+ "n_head": 12,
17
+ "n_inner": null,
18
+ "n_layer": 12,
19
+ "n_positions": 2048,
20
+ "reorder_and_upcast_attn": false,
21
+ "resid_pdrop": 0.1,
22
+ "scale_attn_by_inverse_layer_idx": false,
23
+ "scale_attn_weights": true,
24
+ "summary_activation": null,
25
+ "summary_first_dropout": 0.1,
26
+ "summary_proj_to_labels": true,
27
+ "summary_type": "cls_index",
28
+ "summary_use_proj": true,
29
+ "task_specific_params": {
30
+ "text-generation": {
31
+ "do_sample": true,
32
+ "max_length": 50
33
+ }
34
+ },
35
+ "transformers_version": "4.57.1",
36
+ "use_cache": true,
37
+ "vocab_size": 30002
38
+ }
generation_config.json ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "_from_model_config": true,
3
+ "bos_token_id": 50256,
4
+ "eos_token_id": 50256,
5
+ "transformers_version": "4.57.1"
6
+ }
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:437b7ecef6996e7c8cc523b59c2aaa492f344e17320551f2b797369f50b881f7
3
+ size 489030728
special_tokens_map.json ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "bos_token": {
3
+ "content": "<s>",
4
+ "lstrip": false,
5
+ "normalized": false,
6
+ "rstrip": false,
7
+ "single_word": false
8
+ },
9
+ "eos_token": {
10
+ "content": "[SEP]",
11
+ "lstrip": false,
12
+ "normalized": false,
13
+ "rstrip": false,
14
+ "single_word": false
15
+ },
16
+ "pad_token": {
17
+ "content": "[PAD]",
18
+ "lstrip": false,
19
+ "normalized": false,
20
+ "rstrip": false,
21
+ "single_word": false
22
+ }
23
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "[UNK]",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "[CLS]",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "[SEP]",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "[PAD]",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "4": {
36
+ "content": "[MASK]",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ },
43
+ "30000": {
44
+ "content": "<s>",
45
+ "lstrip": false,
46
+ "normalized": false,
47
+ "rstrip": false,
48
+ "single_word": false,
49
+ "special": true
50
+ },
51
+ "30001": {
52
+ "content": "</s>",
53
+ "lstrip": false,
54
+ "normalized": false,
55
+ "rstrip": false,
56
+ "single_word": false,
57
+ "special": true
58
+ }
59
+ },
60
+ "bos_token": "<s>",
61
+ "clean_up_tokenization_spaces": false,
62
+ "eos_token": "[SEP]",
63
+ "extra_special_tokens": {},
64
+ "max_length": null,
65
+ "model_max_length": 1000000000000000019884624838656,
66
+ "pad_to_multiple_of": null,
67
+ "pad_token": "[PAD]",
68
+ "pad_token_type_id": 0,
69
+ "padding_side": "left",
70
+ "tokenizer_class": "PreTrainedTokenizerFast"
71
+ }