Publish final model only and clean public repo
Browse files
- CodonTranslator/__pycache__/__init__.cpython-312.pyc +0 -0
- CodonTranslator/__pycache__/layers.cpython-312.pyc +0 -0
- CodonTranslator/__pycache__/models.cpython-312.pyc +0 -0
- CodonTranslator/__pycache__/tokenizer.cpython-312.pyc +0 -0
- CodonTranslator/__pycache__/translator.cpython-312.pyc +0 -0
- README.md +1 -2
- __pycache__/precompute_embeddings.cpython-312.pyc +0 -0
- __pycache__/resplit_data_v3.cpython-312.pyc +0 -0
- __pycache__/sampling.cpython-312.pyc +0 -0
- __pycache__/train.cpython-312.pyc +0 -0
- final_model/model.safetensors +1 -1
- final_model/trainer_state.json +1 -1
- src/__pycache__/__init__.cpython-312.pyc +0 -0
- src/__pycache__/dataset.cpython-312.pyc +0 -0
- src/__pycache__/layers.cpython-312.pyc +0 -0
- src/__pycache__/models.cpython-312.pyc +0 -0
- src/__pycache__/sampler.cpython-312.pyc +0 -0
- src/__pycache__/tokenizer.cpython-312.pyc +0 -0
- src/__pycache__/trainer.cpython-312.pyc +0 -0
- training_checkpoints/checkpoint-71000/config.json +0 -17
- training_checkpoints/checkpoint-71000/model.safetensors +0 -3
- training_checkpoints/checkpoint-71000/optimizer.pt +0 -3
- training_checkpoints/checkpoint-71000/scheduler.pt +0 -3
- training_checkpoints/checkpoint-71000/trainer_config.json +0 -17
- training_checkpoints/checkpoint-71000/trainer_state.json +0 -4
- training_checkpoints/checkpoint-71000/vocab.json +0 -78
CodonTranslator/__pycache__/__init__.cpython-312.pyc
DELETED
|
Binary file (222 Bytes)
|
|
|
CodonTranslator/__pycache__/layers.cpython-312.pyc
DELETED
|
Binary file (17.4 kB)
|
|
|
CodonTranslator/__pycache__/models.cpython-312.pyc
DELETED
|
Binary file (20.1 kB)
|
|
|
CodonTranslator/__pycache__/tokenizer.cpython-312.pyc
DELETED
|
Binary file (11.2 kB)
|
|
|
CodonTranslator/__pycache__/translator.cpython-312.pyc
DELETED
|
Binary file (29.9 kB)
|
|
|
README.md
CHANGED
|
@@ -18,7 +18,6 @@ CodonTranslator is a protein-conditioned codon sequence generation model trained
|
|
| 18 |
This repository is the public model and training-code release. It contains:
|
| 19 |
|
| 20 |
- `final_model/`: inference-ready weights
|
| 21 |
-
- `training_checkpoints/checkpoint-71000/`: a resumable training checkpoint
|
| 22 |
- `src/`, `train.py`, `sampling.py`: training and inference code
|
| 23 |
- `resplit_data_v3.py`: the `data_v3` reconstruction pipeline
|
| 24 |
- `slurm/`: the single-node H200 training and data rebuild submission scripts
|
|
@@ -104,7 +103,7 @@ python sampling.py \
|
|
| 104 |
|
| 105 |
- Training uses precomputed `embeddings_v2` for species conditioning.
|
| 106 |
- The data split is built in protein space with MMseqs clustering and binomial-species test holdout.
|
| 107 |
-
- `
|
| 108 |
- For compatibility, released model directories contain both `trainer_config.json` and `config.json`.
|
| 109 |
|
| 110 |
## Sampling arguments
|
|
|
|
| 18 |
This repository is the public model and training-code release. It contains:
|
| 19 |
|
| 20 |
- `final_model/`: inference-ready weights
|
|
|
|
| 21 |
- `src/`, `train.py`, `sampling.py`: training and inference code
|
| 22 |
- `resplit_data_v3.py`: the `data_v3` reconstruction pipeline
|
| 23 |
- `slurm/`: the single-node H200 training and data rebuild submission scripts
|
|
|
|
| 103 |
|
| 104 |
- Training uses precomputed `embeddings_v2` for species conditioning.
|
| 105 |
- The data split is built in protein space with MMseqs clustering and binomial-species test holdout.
|
| 106 |
+
- `final_model/` is the published inference entrypoint.
|
| 107 |
- For compatibility, released model directories contain both `trainer_config.json` and `config.json`.
|
| 108 |
|
| 109 |
## Sampling arguments
|
__pycache__/precompute_embeddings.cpython-312.pyc
DELETED
|
Binary file (24.1 kB)
|
|
|
__pycache__/resplit_data_v3.cpython-312.pyc
DELETED
|
Binary file (57.6 kB)
|
|
|
__pycache__/sampling.cpython-312.pyc
DELETED
|
Binary file (17.7 kB)
|
|
|
__pycache__/train.cpython-312.pyc
DELETED
|
Binary file (21.8 kB)
|
|
|
final_model/model.safetensors
CHANGED
|
@@ -1,3 +1,3 @@
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:
|
| 3 |
size 1284544520
|
|
|
|
| 1 |
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:8e2a8594fa00b268f493f3779f704f81a8bda9501480ba95a263c2479816d951
|
| 3 |
size 1284544520
|
final_model/trainer_state.json
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
{
|
| 2 |
"epoch": 2,
|
| 3 |
-
"global_step":
|
| 4 |
}
|
|
|
|
| 1 |
{
|
| 2 |
"epoch": 2,
|
| 3 |
+
"global_step": 72049
|
| 4 |
}
|
src/__pycache__/__init__.cpython-312.pyc
DELETED
|
Binary file (637 Bytes)
|
|
|
src/__pycache__/dataset.cpython-312.pyc
DELETED
|
Binary file (41 kB)
|
|
|
src/__pycache__/layers.cpython-312.pyc
DELETED
|
Binary file (21.5 kB)
|
|
|
src/__pycache__/models.cpython-312.pyc
DELETED
|
Binary file (25.8 kB)
|
|
|
src/__pycache__/sampler.cpython-312.pyc
DELETED
|
Binary file (36.1 kB)
|
|
|
src/__pycache__/tokenizer.cpython-312.pyc
DELETED
|
Binary file (17.1 kB)
|
|
|
src/__pycache__/trainer.cpython-312.pyc
DELETED
|
Binary file (65.7 kB)
|
|
|
training_checkpoints/checkpoint-71000/config.json
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"max_length": 2048,
|
| 3 |
-
"max_species_prefix": 0,
|
| 4 |
-
"max_protein_prefix": 1024,
|
| 5 |
-
"hidden_size": 750,
|
| 6 |
-
"num_hidden_layers": 20,
|
| 7 |
-
"num_attention_heads": 15,
|
| 8 |
-
"mlp_ratio": 3.2,
|
| 9 |
-
"prepend_species": true,
|
| 10 |
-
"prepend_protein": true,
|
| 11 |
-
"species_embedding_dim": 1024,
|
| 12 |
-
"esm_model_name": "esmc_300m",
|
| 13 |
-
"esm_device": "cuda:0",
|
| 14 |
-
"esm_dtype": "bf16",
|
| 15 |
-
"attn_impl": "mha",
|
| 16 |
-
"num_kv_groups": 5
|
| 17 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
training_checkpoints/checkpoint-71000/model.safetensors
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:07bc223f4d934e2baff5a8085a78348766b6a8324aa091a1459fce2b2c6d3837
|
| 3 |
-
size 1284544520
|
|
|
|
|
|
|
|
|
|
|
|
training_checkpoints/checkpoint-71000/optimizer.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:751570fed64f000a53218f2c9a7e47a4503a302760f1c0d6b52b63ce4a25cec8
|
| 3 |
-
size 1237115851
|
|
|
|
|
|
|
|
|
|
|
|
training_checkpoints/checkpoint-71000/scheduler.pt
DELETED
|
@@ -1,3 +0,0 @@
|
|
| 1 |
-
version https://git-lfs.github.com/spec/v1
|
| 2 |
-
oid sha256:bdca58db103d9ad6aba34334e8a03e08e780b7fe95ef0677f2519e7b16023ff8
|
| 3 |
-
size 1465
|
|
|
|
|
|
|
|
|
|
|
|
training_checkpoints/checkpoint-71000/trainer_config.json
DELETED
|
@@ -1,17 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"max_length": 2048,
|
| 3 |
-
"max_species_prefix": 0,
|
| 4 |
-
"max_protein_prefix": 1024,
|
| 5 |
-
"hidden_size": 750,
|
| 6 |
-
"num_hidden_layers": 20,
|
| 7 |
-
"num_attention_heads": 15,
|
| 8 |
-
"mlp_ratio": 3.2,
|
| 9 |
-
"prepend_species": true,
|
| 10 |
-
"prepend_protein": true,
|
| 11 |
-
"species_embedding_dim": 1024,
|
| 12 |
-
"esm_model_name": "esmc_300m",
|
| 13 |
-
"esm_device": "cuda:0",
|
| 14 |
-
"esm_dtype": "bf16",
|
| 15 |
-
"attn_impl": "mha",
|
| 16 |
-
"num_kv_groups": 5
|
| 17 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
training_checkpoints/checkpoint-71000/trainer_state.json
DELETED
|
@@ -1,4 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"epoch": 2,
|
| 3 |
-
"global_step": 71000
|
| 4 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
training_checkpoints/checkpoint-71000/vocab.json
DELETED
|
@@ -1,78 +0,0 @@
|
|
| 1 |
-
{
|
| 2 |
-
"special_token_str": {
|
| 3 |
-
"bos": "<bos>",
|
| 4 |
-
"eos": "<stop>",
|
| 5 |
-
"pad": "<pad>",
|
| 6 |
-
"unk": "<unk>"
|
| 7 |
-
},
|
| 8 |
-
"vocab": {
|
| 9 |
-
"<bos>": 2,
|
| 10 |
-
"<pad>": 0,
|
| 11 |
-
"<stop>": 3,
|
| 12 |
-
"<unk>": 1,
|
| 13 |
-
"AAA": 4,
|
| 14 |
-
"AAC": 5,
|
| 15 |
-
"AAG": 6,
|
| 16 |
-
"AAT": 7,
|
| 17 |
-
"ACA": 8,
|
| 18 |
-
"ACC": 9,
|
| 19 |
-
"ACG": 10,
|
| 20 |
-
"ACT": 11,
|
| 21 |
-
"AGA": 12,
|
| 22 |
-
"AGC": 13,
|
| 23 |
-
"AGG": 14,
|
| 24 |
-
"AGT": 15,
|
| 25 |
-
"ATA": 16,
|
| 26 |
-
"ATC": 17,
|
| 27 |
-
"ATG": 18,
|
| 28 |
-
"ATT": 19,
|
| 29 |
-
"CAA": 20,
|
| 30 |
-
"CAC": 21,
|
| 31 |
-
"CAG": 22,
|
| 32 |
-
"CAT": 23,
|
| 33 |
-
"CCA": 24,
|
| 34 |
-
"CCC": 25,
|
| 35 |
-
"CCG": 26,
|
| 36 |
-
"CCT": 27,
|
| 37 |
-
"CGA": 28,
|
| 38 |
-
"CGC": 29,
|
| 39 |
-
"CGG": 30,
|
| 40 |
-
"CGT": 31,
|
| 41 |
-
"CTA": 32,
|
| 42 |
-
"CTC": 33,
|
| 43 |
-
"CTG": 34,
|
| 44 |
-
"CTT": 35,
|
| 45 |
-
"GAA": 36,
|
| 46 |
-
"GAC": 37,
|
| 47 |
-
"GAG": 38,
|
| 48 |
-
"GAT": 39,
|
| 49 |
-
"GCA": 40,
|
| 50 |
-
"GCC": 41,
|
| 51 |
-
"GCG": 42,
|
| 52 |
-
"GCT": 43,
|
| 53 |
-
"GGA": 44,
|
| 54 |
-
"GGC": 45,
|
| 55 |
-
"GGG": 46,
|
| 56 |
-
"GGT": 47,
|
| 57 |
-
"GTA": 48,
|
| 58 |
-
"GTC": 49,
|
| 59 |
-
"GTG": 50,
|
| 60 |
-
"GTT": 51,
|
| 61 |
-
"TAA": 52,
|
| 62 |
-
"TAC": 53,
|
| 63 |
-
"TAG": 54,
|
| 64 |
-
"TAT": 55,
|
| 65 |
-
"TCA": 56,
|
| 66 |
-
"TCC": 57,
|
| 67 |
-
"TCG": 58,
|
| 68 |
-
"TCT": 59,
|
| 69 |
-
"TGA": 60,
|
| 70 |
-
"TGC": 61,
|
| 71 |
-
"TGG": 62,
|
| 72 |
-
"TGT": 63,
|
| 73 |
-
"TTA": 64,
|
| 74 |
-
"TTC": 65,
|
| 75 |
-
"TTG": 66,
|
| 76 |
-
"TTT": 67
|
| 77 |
-
}
|
| 78 |
-
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|