Finalize Bertose glycan encoder release names
Browse files
README.md
CHANGED
|
@@ -9,28 +9,28 @@ tags:
|
|
| 9 |
- pytorch
|
| 10 |
---
|
| 11 |
|
| 12 |
-
# Bertose
|
| 13 |
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
This repository contains the leakage-proof Bertose V5B1 checkpoint used for WURCS glycan embeddings. It is intended for inference through the companion cloud notebook.
|
| 17 |
|
| 18 |
## Files
|
| 19 |
|
| 20 |
-
- `checkpoints/
|
| 21 |
- `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
|
| 22 |
-
- `src/
|
| 23 |
-
- `src/
|
| 24 |
- `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
|
| 25 |
|
| 26 |
-
##
|
| 27 |
|
| 28 |
-
|
| 29 |
|
| 30 |
## Output
|
| 31 |
|
| 32 |
Dense glycan embeddings. The companion notebook defaults to `[CLS]` pooling and also supports mean pooling over valid glycan tokens.
|
| 33 |
|
| 34 |
-
##
|
|
|
|
|
|
|
| 35 |
|
| 36 |
-
|
|
|
|
| 9 |
- pytorch
|
| 10 |
---
|
| 11 |
|
| 12 |
+
# Bertose Glycan Encoder
|
| 13 |
|
| 14 |
+
This repository contains the Bertose checkpoint for WURCS glycan embedding inference. It is the release-facing glycan representation model used by the companion notebook.
|
|
|
|
|
|
|
| 15 |
|
| 16 |
## Files
|
| 17 |
|
| 18 |
+
- `checkpoints/bertose_glycan_encoder.pt` - Bertose glycan encoder checkpoint.
|
| 19 |
- `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary.
|
| 20 |
+
- `src/bertose_model.py` - Bertose model definition.
|
| 21 |
+
- `src/bertose_layers.py` - Transformer layers used by Bertose.
|
| 22 |
- `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
|
| 23 |
|
| 24 |
+
## Input
|
| 25 |
|
| 26 |
+
Provide one WURCS glycan string or a CSV batch with a WURCS column. The notebook examples use `wurcs` as the column name.
|
| 27 |
|
| 28 |
## Output
|
| 29 |
|
| 30 |
Dense glycan embeddings. The companion notebook defaults to `[CLS]` pooling and also supports mean pooling over valid glycan tokens.
|
| 31 |
|
| 32 |
+
## Notes
|
| 33 |
+
|
| 34 |
+
This repository does not perform IUPAC-condensed to WURCS conversion. For now, provide WURCS directly.
|
| 35 |
|
| 36 |
+
The final license and citation text should be set before public release.
|
SHA256SUMS
CHANGED
|
@@ -1,31 +1,9 @@
|
|
| 1 |
-
684888c0ebb17f374298b65ee2807526c066094c701bcc7ebbe1c1095f494fc1 ./.cache/huggingface/.gitignore
|
| 2 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/.gitattributes.lock
|
| 3 |
-
4f977599262d1c1dfa55f309216144478f7afa722e1728f2aa1a9b80f9108ff9 ./.cache/huggingface/upload/.gitattributes.metadata
|
| 4 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/README.md.lock
|
| 5 |
-
eb1e822610f91fafddda84f20a2d32b4cdedf1a6cfe0e218ac87cf487cd7ecbc ./.cache/huggingface/upload/README.md.metadata
|
| 6 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/SHA256SUMS.lock
|
| 7 |
-
08e938d1f7308ad98a9f2ae03b0efed2d8037248b2c3b85831a17f2fc27dbfeb ./.cache/huggingface/upload/SHA256SUMS.metadata
|
| 8 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/checkpoints/bertose_v5b_excluded.pt.lock
|
| 9 |
-
d9026e0f48c8644ce51f25966cb0508a41c73969758eaa523a17dbebe3198c07 ./.cache/huggingface/upload/checkpoints/bertose_v5b_excluded.pt.metadata
|
| 10 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/config.json.lock
|
| 11 |
-
95eee4589c49a337c15b2f1309655e780f8d4a979765baa4b98a820d4a28ffca ./.cache/huggingface/upload/config.json.metadata
|
| 12 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/requirements.txt.lock
|
| 13 |
-
ead93b0a87bf2a70c99da445f90a9eee6de5da42788acbf8d469575d2d0dd75f ./.cache/huggingface/upload/requirements.txt.metadata
|
| 14 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/src/multimodal_glycan_bert_v3.py.lock
|
| 15 |
-
2c560629b83d39c59e004cdb2d2ca4b375440860f75624378974504f30c8cfe1 ./.cache/huggingface/upload/src/multimodal_glycan_bert_v3.py.metadata
|
| 16 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/src/wurcs_bpe_tokenizer.py.lock
|
| 17 |
-
4034d7d05d0c8b83ddea008e2b718fc6603497c03e767eae7818727bfb906d2a ./.cache/huggingface/upload/src/wurcs_bpe_tokenizer.py.metadata
|
| 18 |
-
e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855 ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.lock
|
| 19 |
-
a36b656101e757be53780150c039df53ab5b10a2946c2df3e3a8120669d7ed43 ./.cache/huggingface/upload/vocab/bpe_vocabulary.json.metadata
|
| 20 |
622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5 ./.gitattributes
|
| 21 |
-
|
| 22 |
-
9ff260796c28e9f254d87da95592c686874d3954429d926d99afd2a8f9b6c08f ./checkpoints/bertose_v5b_excluded.pt
|
| 23 |
-
|
| 24 |
6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f ./requirements.txt
|
| 25 |
-
|
| 26 |
-
|
| 27 |
-
09d284eb0e13f05d3770449523c0079e4ced2f8dd44d741ce05c39571fbc5022 ./src/__pycache__/wurcs_bpe_tokenizer.cpython-312.pyc
|
| 28 |
-
b69f14c9976951325e3a0a4e8107a16126e67d410e966650f513f1f538a732bb ./src/glycan_bert.py
|
| 29 |
-
0d9ce16bf90242f38621d64cd974ea5679bff4c2013bea8d7bffe1b8dd120794 ./src/multimodal_glycan_bert_v3.py
|
| 30 |
0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839 ./src/wurcs_bpe_tokenizer.py
|
| 31 |
6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc ./vocab/bpe_vocabulary.json
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5 ./.gitattributes
|
| 2 |
+
acca581727a41c8ecc6f92e45539f5a5cec3a5d5bdf9113a38e9b911e24eb766 ./README.md
|
| 3 |
+
9ff260796c28e9f254d87da95592c686874d3954429d926d99afd2a8f9b6c08f ./checkpoints/bertose_glycan_encoder.pt
|
| 4 |
+
7233493ffa6eaf57fd3db9e1583ff59a831099175597d64e36e1b28aa31b4cf7 ./config.json
|
| 5 |
6a56e6f73b8f874470ecde6e538f3f5029ae23aa6c10559817d1c2a8b59b7c0f ./requirements.txt
|
| 6 |
+
b69f14c9976951325e3a0a4e8107a16126e67d410e966650f513f1f538a732bb ./src/bertose_layers.py
|
| 7 |
+
f247a6c09132a61cb649acfe022b269b5b94c37a5069fcb62045f3340b96b191 ./src/bertose_model.py
|
|
|
|
|
|
|
|
|
|
| 8 |
0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839 ./src/wurcs_bpe_tokenizer.py
|
| 9 |
6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc ./vocab/bpe_vocabulary.json
|
checkpoints/{bertose_v5b_excluded.pt → bertose_glycan_encoder.pt}
RENAMED
|
File without changes
|
config.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
{
|
| 2 |
"model_family": "Bertose",
|
| 3 |
-
"release_name": "bertose-
|
| 4 |
-
"checkpoint": "checkpoints/bertose_v5b_excluded.pt",
|
| 5 |
"vocabulary": "vocab/bpe_vocabulary.json",
|
| 6 |
"embedding_dim": 768,
|
| 7 |
"max_glycan_length": 256,
|
|
|
|
| 1 |
{
|
| 2 |
"model_family": "Bertose",
|
| 3 |
+
"release_name": "bertose-glycan-encoder",
|
| 4 |
+
"checkpoint": "checkpoints/bertose_glycan_encoder.pt",
|
| 5 |
"vocabulary": "vocab/bpe_vocabulary.json",
|
| 6 |
"embedding_dim": 768,
|
| 7 |
"max_glycan_length": 256,
|
src/{glycan_bert.py → bertose_layers.py}
RENAMED
|
File without changes
|
src/{multimodal_glycan_bert_v3.py → bertose_model.py}
RENAMED
|
@@ -15,9 +15,9 @@ from typing import Dict, Optional, Tuple
|
|
| 15 |
import math
|
| 16 |
|
| 17 |
try:
|
| 18 |
-
from .glycan_bert import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer
|
| 19 |
except ImportError:
|
| 20 |
-
from glycan_bert import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer
|
| 21 |
|
| 22 |
|
| 23 |
class ConvGlycanBERTEmbeddings(nn.Module):
|
|
|
|
| 15 |
import math
|
| 16 |
|
| 17 |
try:
|
| 18 |
+
from .bertose_layers import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer
|
| 19 |
except ImportError:
|
| 20 |
+
from bertose_layers import GlycanBERTConfig, GlycanBERTEmbeddings, GlycanBERTLayer
|
| 21 |
|
| 22 |
|
| 23 |
class ConvGlycanBERTEmbeddings(nn.Module):
|