Apply AFFINose display capitalization
Browse files
- README.md +9 -9
- SHA256SUMS +7 -7
- config.json +2 -2
- src/affinose_dataset.py +4 -4
- src/affinose_inference.py +6 -6
- src/affinose_model.py +20 -20
- src/bertose_layers.py +3 -3
- src/bertose_model.py +5 -5
README.md
CHANGED
|
@@ -11,19 +11,19 @@ tags:
|
|
| 11 |
- pytorch
|
| 12 |
---
|
| 13 |
|
| 14 |
-
#
|
| 15 |
|
| 16 |
-
This repository contains the
|
| 17 |
|
| 18 |
## Files
|
| 19 |
|
| 20 |
-
- `checkpoints/affinose_interaction_model.pt` -
|
| 21 |
- `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary for glycan tokenization.
|
| 22 |
-
- `src/affinose_model.py` -
|
| 23 |
- `src/affinose_inference.py` - standalone inference helper.
|
| 24 |
- `src/affinose_dataset.py` - tokenizer and data utility helpers.
|
| 25 |
-
- `src/bertose_model.py` -
|
| 26 |
-
- `src/bertose_layers.py` - Transformer layers used by
|
| 27 |
- `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
|
| 28 |
|
| 29 |
## Input
|
|
@@ -32,7 +32,7 @@ Provide one protein-glycan pair or a CSV batch. Glycans should be WURCS strings.
|
|
| 32 |
|
| 33 |
## Protein Embedding Requirement
|
| 34 |
|
| 35 |
-
|
| 36 |
|
| 37 |
ESM-C is a separate EvolutionaryScale protein model. The ESM-C weights are not included in this repository. Users should install the `esm` package and let it download ESM-C 300M into their own runtime cache.
|
| 38 |
|
|
@@ -50,11 +50,11 @@ output = esmc.logits(
|
|
| 50 |
protein_embeddings = output.embeddings # per-residue ESM-C 300M embeddings
|
| 51 |
```
|
| 52 |
|
| 53 |
-
If Hugging Face requests authentication for ESM-C, users should authenticate with their own Hugging Face account/token and accept any required EvolutionaryScale terms.
|
| 54 |
|
| 55 |
## Output
|
| 56 |
|
| 57 |
-
A scalar protein-glycan interaction score from the trained
|
| 58 |
|
| 59 |
## Scope
|
| 60 |
|
|
|
|
| 11 |
- pytorch
|
| 12 |
---
|
| 13 |
|
| 14 |
+
# AFFINose Interaction Model
|
| 15 |
|
| 16 |
+
This repository contains the AFFINose checkpoint for protein-glycan interaction inference. AFFINose combines BERTose glycan token representations with per-residue ESM-C protein embeddings and returns a scalar interaction score.
|
| 17 |
|
| 18 |
## Files
|
| 19 |
|
| 20 |
+
- `checkpoints/affinose_interaction_model.pt` - AFFINose interaction checkpoint.
|
| 21 |
- `vocab/bpe_vocabulary.json` - WURCS BPE vocabulary for glycan tokenization.
|
| 22 |
+
- `src/affinose_model.py` - AFFINose architecture.
|
| 23 |
- `src/affinose_inference.py` - standalone inference helper.
|
| 24 |
- `src/affinose_dataset.py` - tokenizer and data utility helpers.
|
| 25 |
+
- `src/bertose_model.py` - BERTose model definition used for glycan encoding.
|
| 26 |
+
- `src/bertose_layers.py` - Transformer layers used by BERTose.
|
| 27 |
- `src/wurcs_bpe_tokenizer.py` - WURCS BPE tokenizer.
|
| 28 |
|
| 29 |
## Input
|
|
|
|
| 32 |
|
| 33 |
## Protein Embedding Requirement
|
| 34 |
|
| 35 |
+
AFFINose expects per-residue ESM-C 300M embeddings with shape `[L, 960]`. Do not mean-pool the protein before passing it into AFFINose.
|
| 36 |
|
| 37 |
ESM-C is a separate EvolutionaryScale protein model. The ESM-C weights are not included in this repository. Users should install the `esm` package and let it download ESM-C 300M into their own runtime cache.
|
| 38 |
|
|
|
|
| 50 |
protein_embeddings = output.embeddings # per-residue ESM-C 300M embeddings
|
| 51 |
```
|
| 52 |
|
| 53 |
+
If Hugging Face requests authentication for ESM-C, users should authenticate with their own Hugging Face account/token and accept any required EvolutionaryScale terms. No separate access token is needed for the BERTose/AFFINose repositories once they are public.
|
| 54 |
|
| 55 |
## Output
|
| 56 |
|
| 57 |
+
A scalar protein-glycan interaction score from the trained AFFINose head.
|
| 58 |
|
| 59 |
## Scope
|
| 60 |
|
SHA256SUMS
CHANGED
|
@@ -1,12 +1,12 @@
|
|
| 1 |
622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5 ./.gitattributes
|
| 2 |
-
|
| 3 |
533fe4f9317782e39ad7980caa0f47ad92f1be999dd9489425a9429e2e7c15cb ./checkpoints/affinose_interaction_model.pt
|
| 4 |
-
|
| 5 |
1be88f15fd905882c711f7ceb59b619a93f9cee2c6c7c031f8dbabd35b29e9e0 ./requirements.txt
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
|
| 9 |
-
|
| 10 |
-
|
| 11 |
0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839 ./src/wurcs_bpe_tokenizer.py
|
| 12 |
6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc ./vocab/bpe_vocabulary.json
|
|
|
|
| 1 |
622368f62c23e97e9137c277eaadcc93ee3901cbb420b591422bb1c2e19689a5 ./.gitattributes
|
| 2 |
+
f57fc6622ab0a2eca11a18e28d94744334b62640e25a00e5e3893f6eeefe3e7a ./README.md
|
| 3 |
533fe4f9317782e39ad7980caa0f47ad92f1be999dd9489425a9429e2e7c15cb ./checkpoints/affinose_interaction_model.pt
|
| 4 |
+
b8621448bb34d81b66e2b0050f3663375841d8f333d3973892ece32e7cc31880 ./config.json
|
| 5 |
1be88f15fd905882c711f7ceb59b619a93f9cee2c6c7c031f8dbabd35b29e9e0 ./requirements.txt
|
| 6 |
+
81c746162e528c4469ffadb2e3d71ce9d2c34fd4e4a8b176cf0808e26ba60617 ./src/affinose_dataset.py
|
| 7 |
+
b4c729626e405cbe7564d7021f51e5b3ba1272e6318ad4439dcdf002ec88fde6 ./src/affinose_inference.py
|
| 8 |
+
8041084cbe694d26d356feb7c0f66173e89c98b0ef18c73b407084e7a7d1b7a3 ./src/affinose_model.py
|
| 9 |
+
3587aa789041e4ee215dcaa0286847d45621e9514c0ed3efa314fb89cf40f4d7 ./src/bertose_layers.py
|
| 10 |
+
773023d70be02ecf4a26b79b18b2b70ee1b39e01ec287386cb162590aaf90767 ./src/bertose_model.py
|
| 11 |
0bc54399362945601bcfd403441fc80968d173200dd0561f57568b2053a94839 ./src/wurcs_bpe_tokenizer.py
|
| 12 |
6a572afdf53f1494ab96c896876b824ca7ea749777352606aa9f96bf270ceecc ./vocab/bpe_vocabulary.json
|
config.json
CHANGED
|
@@ -1,9 +1,9 @@
|
|
| 1 |
{
|
| 2 |
-
"model_family": "
|
| 3 |
"release_name": "affinose-interaction-model",
|
| 4 |
"checkpoint": "checkpoints/affinose_interaction_model.pt",
|
| 5 |
"vocabulary": "vocab/bpe_vocabulary.json",
|
| 6 |
-
"glycan_encoder": "
|
| 7 |
"glycan_dim": 768,
|
| 8 |
"protein_encoder": "ESM-C 300M",
|
| 9 |
"protein_dim": 960,
|
|
|
|
| 1 |
{
|
| 2 |
+
"model_family": "AFFINose",
|
| 3 |
"release_name": "affinose-interaction-model",
|
| 4 |
"checkpoint": "checkpoints/affinose_interaction_model.pt",
|
| 5 |
"vocabulary": "vocab/bpe_vocabulary.json",
|
| 6 |
+
"glycan_encoder": "BERTose glycan encoder",
|
| 7 |
"glycan_dim": 768,
|
| 8 |
"protein_encoder": "ESM-C 300M",
|
| 9 |
"protein_dim": 960,
|
src/affinose_dataset.py
CHANGED
|
@@ -1,7 +1,7 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
-
|
| 5 |
"""
|
| 6 |
|
| 7 |
import json
|
|
@@ -26,7 +26,7 @@ logger = logging.getLogger(__name__)
|
|
| 26 |
|
| 27 |
def load_bpe_tokenizer(vocab_path: str):
|
| 28 |
"""
|
| 29 |
-
Load the
|
| 30 |
|
| 31 |
Bypasses downstream_tasks package imports. Adds utils
|
| 32 |
directory to sys.path and imports WURCSBPETokenizer directly.
|
|
@@ -75,7 +75,7 @@ class AffinoseInteractionDataset(Dataset):
|
|
| 75 |
Dataset for glycan-protein interaction with cross-attention support.
|
| 76 |
|
| 77 |
Returns:
|
| 78 |
-
- BPE-tokenized glycan sequences for live
|
| 79 |
- Per-residue ESM-C protein embeddings [Lp, D] (NOT mean-pooled)
|
| 80 |
- Masks for both sides (for cross-attention padding)
|
| 81 |
|
|
|
|
| 1 |
"""
|
| 2 |
+
AFFINose dataset utilities — per-residue protein embeddings for cross-attention
|
| 3 |
|
| 4 |
+
AFFINose keeps per-residue ESM-C embeddings [L, 960] rather than mean-pooled vectors so glycan tokens can cross-attend to protein residues.
|
| 5 |
"""
|
| 6 |
|
| 7 |
import json
|
|
|
|
| 26 |
|
| 27 |
def load_bpe_tokenizer(vocab_path: str):
|
| 28 |
"""
|
| 29 |
+
Load the BERTose BPE tokenizer directly from source.
|
| 30 |
|
| 31 |
Bypasses downstream_tasks package imports. Adds utils
|
| 32 |
directory to sys.path and imports WURCSBPETokenizer directly.
|
|
|
|
| 75 |
Dataset for glycan-protein interaction with cross-attention support.
|
| 76 |
|
| 77 |
Returns:
|
| 78 |
+
- BPE-tokenized glycan sequences for live BERTose forward pass
|
| 79 |
- Per-residue ESM-C protein embeddings [Lp, D] (NOT mean-pooled)
|
| 80 |
- Masks for both sides (for cross-attention padding)
|
| 81 |
|
src/affinose_inference.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
-
Architecture:
|
| 5 |
-
GLYCAN: WURCS -> BPE ->
|
| 6 |
PROTEIN: ESM-C per-residue -> [B, Lp, 960] -> proj -> [B, Lp, 512]
|
| 7 |
|
|
| 8 |
2x CrossAttentionBlock(d=512, 8H, FFN=1024)
|
|
@@ -50,11 +50,11 @@ class AffinosePredictor:
|
|
| 50 |
logger.info(f"Loading BPE tokenizer from {vocab_path}")
|
| 51 |
self.tokenizer = load_bpe_tokenizer(vocab_path)
|
| 52 |
|
| 53 |
-
logger.info(f"Loading
|
| 54 |
bertose_config, seq_embeddings, seq_layers = load_bertose_encoder(
|
| 55 |
bertose_checkpoint, freeze_layers=12)
|
| 56 |
|
| 57 |
-
logger.info("Building
|
| 58 |
self.model = AffinoseInteractionModel(
|
| 59 |
seq_embeddings=seq_embeddings,
|
| 60 |
seq_layers=seq_layers,
|
|
@@ -149,7 +149,7 @@ class AffinosePredictor:
|
|
| 149 |
|
| 150 |
|
| 151 |
def main():
|
| 152 |
-
parser = argparse.ArgumentParser(description="
|
| 153 |
parser.add_argument("--checkpoint", required=True)
|
| 154 |
parser.add_argument("--bertose_checkpoint", required=True)
|
| 155 |
parser.add_argument("--vocab_path", required=True)
|
|
|
|
| 1 |
"""
|
| 2 |
+
AFFINose inference — standalone prediction pipeline
|
| 3 |
|
| 4 |
+
Architecture: BERTose glycan encoding + ESM-C protein embeddings + cross-attention fusion.
|
| 5 |
+
GLYCAN: WURCS -> BPE -> BERTose (frozen) -> [B, Lg, 768] -> proj -> [B, Lg, 512]
|
| 6 |
PROTEIN: ESM-C per-residue -> [B, Lp, 960] -> proj -> [B, Lp, 512]
|
| 7 |
|
|
| 8 |
2x CrossAttentionBlock(d=512, 8H, FFN=1024)
|
|
|
|
| 50 |
logger.info(f"Loading BPE tokenizer from {vocab_path}")
|
| 51 |
self.tokenizer = load_bpe_tokenizer(vocab_path)
|
| 52 |
|
| 53 |
+
logger.info(f"Loading BERTose from {bertose_checkpoint}")
|
| 54 |
bertose_config, seq_embeddings, seq_layers = load_bertose_encoder(
|
| 55 |
bertose_checkpoint, freeze_layers=12)
|
| 56 |
|
| 57 |
+
logger.info("Building AFFINose interaction model")
|
| 58 |
self.model = AffinoseInteractionModel(
|
| 59 |
seq_embeddings=seq_embeddings,
|
| 60 |
seq_layers=seq_layers,
|
|
|
|
| 149 |
|
| 150 |
|
| 151 |
def main():
|
| 152 |
+
parser = argparse.ArgumentParser(description="AFFINose interaction inference")
|
| 153 |
parser.add_argument("--checkpoint", required=True)
|
| 154 |
parser.add_argument("--bertose_checkpoint", required=True)
|
| 155 |
parser.add_argument("--vocab_path", required=True)
|
src/affinose_model.py
CHANGED
|
@@ -1,8 +1,8 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
Architecture:
|
| 5 |
-
GLYCAN: WURCS → BPE →
|
| 6 |
↓ proj(768→512)
|
| 7 |
PROTEIN: precomputed ESM-C → [B, Lp, 960] ↓
|
| 8 |
↓ proj(960→512) ↓
|
|
@@ -15,7 +15,7 @@ Architecture:
|
|
| 15 |
[B, 1024]
|
| 16 |
↓ MLP → binding score
|
| 17 |
|
| 18 |
-
This release exposes the manuscript-facing
|
| 19 |
"""
|
| 20 |
|
| 21 |
import os
|
|
@@ -30,10 +30,10 @@ import torch.nn.functional as F
|
|
| 30 |
|
| 31 |
|
| 32 |
# ============================================================================
|
| 33 |
-
#
|
| 34 |
# ============================================================================
|
| 35 |
def _default_bertose_root() -> Path:
|
| 36 |
-
"""Resolve the
|
| 37 |
env_root = os.environ.get("BERTOSE_ROOT") or os.environ.get("BERTOSE_REPO_ROOT")
|
| 38 |
if env_root:
|
| 39 |
return Path(env_root).expanduser().resolve()
|
|
@@ -50,7 +50,7 @@ BERTOSE_ROOT = _default_bertose_root()
|
|
| 50 |
|
| 51 |
|
| 52 |
def _ensure_bertose_imports():
|
| 53 |
-
"""Add
|
| 54 |
source_dir = Path(__file__).resolve().parent
|
| 55 |
roots = [
|
| 56 |
str(source_dir),
|
|
@@ -63,7 +63,7 @@ def _ensure_bertose_imports():
|
|
| 63 |
|
| 64 |
|
| 65 |
def load_bertose_config():
|
| 66 |
-
"""Create
|
| 67 |
_ensure_bertose_imports()
|
| 68 |
try:
|
| 69 |
from model.bertose_model import MultimodalGlycanBERTConfig
|
|
@@ -80,10 +80,10 @@ def load_bertose_encoder(
|
|
| 80 |
checkpoint_path: str, freeze_layers: int = 4
|
| 81 |
):
|
| 82 |
"""
|
| 83 |
-
Load
|
| 84 |
|
| 85 |
Args:
|
| 86 |
-
checkpoint_path: Path to pretrained
|
| 87 |
freeze_layers: Number of leading transformer layers to freeze (layers 0 through freeze_layers - 1).
|
| 88 |
|
| 89 |
Returns:
|
|
@@ -145,7 +145,7 @@ def load_bertose_encoder(
|
|
| 145 |
p.numel() for layer in seq_layers for p in layer.parameters()
|
| 146 |
)
|
| 147 |
print(
|
| 148 |
-
f"
|
| 149 |
f"{trainable:,} trainable (frozen layers 0-{freeze_layers - 1})"
|
| 150 |
)
|
| 151 |
|
|
@@ -440,7 +440,7 @@ class CrossAttentionBlock(nn.Module):
|
|
| 440 |
|
| 441 |
|
| 442 |
# ============================================================================
|
| 443 |
-
#
|
| 444 |
# ============================================================================
|
| 445 |
|
| 446 |
|
|
@@ -448,7 +448,7 @@ class AffinoseInteractionModel(nn.Module):
|
|
| 448 |
"""
|
| 449 |
Glycan-protein interaction predictor with cross-attention.
|
| 450 |
|
| 451 |
-
Glycan: Live
|
| 452 |
Protein: Precomputed ESM-C per-residue [B, Lp, 960]
|
| 453 |
Cross-attention: 2 bidirectional layers in shared 512-dim space
|
| 454 |
SWE: Variable-length → fixed [B, 512] for each side
|
|
@@ -476,9 +476,9 @@ class AffinoseInteractionModel(nn.Module):
|
|
| 476 |
):
|
| 477 |
"""
|
| 478 |
Args:
|
| 479 |
-
seq_embeddings: Pretrained
|
| 480 |
-
seq_layers: Pretrained
|
| 481 |
-
glycan_dim:
|
| 482 |
protein_dim: ESM-C per-residue dimension (960).
|
| 483 |
shared_dim: Shared space for cross-attention (512).
|
| 484 |
num_cross_layers: Number of cross-attention blocks.
|
|
@@ -505,7 +505,7 @@ class AffinoseInteractionModel(nn.Module):
|
|
| 505 |
print(f" pooling_mode={pooling_mode}")
|
| 506 |
print(f" interaction_mode={interaction_mode}")
|
| 507 |
|
| 508 |
-
# ===
|
| 509 |
self.seq_embeddings = seq_embeddings
|
| 510 |
self.seq_layers = seq_layers
|
| 511 |
|
|
@@ -634,7 +634,7 @@ class AffinoseInteractionModel(nn.Module):
|
|
| 634 |
Returns:
|
| 635 |
[B] binding score predictions.
|
| 636 |
"""
|
| 637 |
-
# === 1.
|
| 638 |
x = self.seq_embeddings(token_ids, branch_depths, linkage_types)
|
| 639 |
for layer in self.seq_layers:
|
| 640 |
x = layer(x, attention_mask)
|
|
@@ -727,12 +727,12 @@ class AffinoseInteractionLoss(nn.Module):
|
|
| 727 |
# ============================================================================
|
| 728 |
if __name__ == "__main__":
|
| 729 |
print("=" * 60)
|
| 730 |
-
print("
|
| 731 |
print("=" * 60)
|
| 732 |
|
| 733 |
-
# Mock
|
| 734 |
class MockEmbeddings(nn.Module):
|
| 735 |
-
"""Mock
|
| 736 |
|
| 737 |
def __init__(self, dim: int = 768):
|
| 738 |
super().__init__()
|
|
|
|
| 1 |
"""
|
| 2 |
+
AFFINose interaction model — cross-attention with live BERTose encoding
|
| 3 |
|
| 4 |
Architecture:
|
| 5 |
+
GLYCAN: WURCS → BPE → BERTose (live, freeze layers 0-3) → [B, Lg, 768]
|
| 6 |
↓ proj(768→512)
|
| 7 |
PROTEIN: precomputed ESM-C → [B, Lp, 960] ↓
|
| 8 |
↓ proj(960→512) ↓
|
|
|
|
| 15 |
[B, 1024]
|
| 16 |
↓ MLP → binding score
|
| 17 |
|
| 18 |
+
This release exposes the manuscript-facing AFFINose architecture: BERTose glycan tokens, per-residue ESM-C protein embeddings, bidirectional cross-attention, pooled fusion and scalar interaction scoring.
|
| 19 |
"""
|
| 20 |
|
| 21 |
import os
|
|
|
|
| 30 |
|
| 31 |
|
| 32 |
# ============================================================================
|
| 33 |
+
# BERTose model imports
|
| 34 |
# ============================================================================
|
| 35 |
def _default_bertose_root() -> Path:
|
| 36 |
+
"""Resolve the BERTose source root without assuming a specific local path."""
|
| 37 |
env_root = os.environ.get("BERTOSE_ROOT") or os.environ.get("BERTOSE_REPO_ROOT")
|
| 38 |
if env_root:
|
| 39 |
return Path(env_root).expanduser().resolve()
|
|
|
|
| 50 |
|
| 51 |
|
| 52 |
def _ensure_bertose_imports():
|
| 53 |
+
"""Add BERTose source directories to sys.path if not already present."""
|
| 54 |
source_dir = Path(__file__).resolve().parent
|
| 55 |
roots = [
|
| 56 |
str(source_dir),
|
|
|
|
| 63 |
|
| 64 |
|
| 65 |
def load_bertose_config():
|
| 66 |
+
"""Create BERTose config matching the BERTose glycan encoder checkpoint."""
|
| 67 |
_ensure_bertose_imports()
|
| 68 |
try:
|
| 69 |
from model.bertose_model import MultimodalGlycanBERTConfig
|
|
|
|
| 80 |
checkpoint_path: str, freeze_layers: int = 4
|
| 81 |
):
|
| 82 |
"""
|
| 83 |
+
Load BERTose sequence encoder with pretrained weights.
|
| 84 |
|
| 85 |
Args:
|
| 86 |
+
checkpoint_path: Path to pretrained BERTose checkpoint.
|
| 87 |
freeze_layers: Number of leading transformer layers to freeze (layers 0 through freeze_layers - 1).
|
| 88 |
|
| 89 |
Returns:
|
|
|
|
| 145 |
p.numel() for layer in seq_layers for p in layer.parameters()
|
| 146 |
)
|
| 147 |
print(
|
| 148 |
+
f" BERTose encoder: {total:,} params total, "
|
| 149 |
f"{trainable:,} trainable (frozen layers 0-{freeze_layers - 1})"
|
| 150 |
)
|
| 151 |
|
|
|
|
| 440 |
|
| 441 |
|
| 442 |
# ============================================================================
|
| 443 |
+
# AFFINose interaction model
|
| 444 |
# ============================================================================
|
| 445 |
|
| 446 |
|
|
|
|
| 448 |
"""
|
| 449 |
Glycan-protein interaction predictor with cross-attention.
|
| 450 |
|
| 451 |
+
Glycan: Live BERTose (partially frozen) → per-token [B, Lg, 768]
|
| 452 |
Protein: Precomputed ESM-C per-residue [B, Lp, 960]
|
| 453 |
Cross-attention: 2 bidirectional layers in shared 512-dim space
|
| 454 |
SWE: Variable-length → fixed [B, 512] for each side
|
|
|
|
| 476 |
):
|
| 477 |
"""
|
| 478 |
Args:
|
| 479 |
+
seq_embeddings: Pretrained BERTose embedding layer.
|
| 480 |
+
seq_layers: Pretrained BERTose transformer layers.
|
| 481 |
+
glycan_dim: BERTose output dimension (768).
|
| 482 |
protein_dim: ESM-C per-residue dimension (960).
|
| 483 |
shared_dim: Shared space for cross-attention (512).
|
| 484 |
num_cross_layers: Number of cross-attention blocks.
|
|
|
|
| 505 |
print(f" pooling_mode={pooling_mode}")
|
| 506 |
print(f" interaction_mode={interaction_mode}")
|
| 507 |
|
| 508 |
+
# === BERTose sequence encoder (partially frozen) ===
|
| 509 |
self.seq_embeddings = seq_embeddings
|
| 510 |
self.seq_layers = seq_layers
|
| 511 |
|
|
|
|
| 634 |
Returns:
|
| 635 |
[B] binding score predictions.
|
| 636 |
"""
|
| 637 |
+
# === 1. BERTose forward: per-token embeddings ===
|
| 638 |
x = self.seq_embeddings(token_ids, branch_depths, linkage_types)
|
| 639 |
for layer in self.seq_layers:
|
| 640 |
x = layer(x, attention_mask)
|
|
|
|
| 727 |
# ============================================================================
|
| 728 |
if __name__ == "__main__":
|
| 729 |
print("=" * 60)
|
| 730 |
+
print("AFFINose interaction model architecture sanity check")
|
| 731 |
print("=" * 60)
|
| 732 |
|
| 733 |
+
# Mock BERTose encoder (for testing without cluster)
|
| 734 |
class MockEmbeddings(nn.Module):
|
| 735 |
+
"""Mock BERTose embeddings for local testing."""
|
| 736 |
|
| 737 |
def __init__(self, dim: int = 768):
|
| 738 |
super().__init__()
|
src/bertose_layers.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
Transformer blocks adapted for WURCS glycan tokenization.
|
| 5 |
"""
|
|
@@ -10,7 +10,7 @@ import math
|
|
| 10 |
|
| 11 |
|
| 12 |
class GlycanBERTConfig:
|
| 13 |
-
"""Configuration for the
|
| 14 |
|
| 15 |
def __init__(
|
| 16 |
self,
|
|
@@ -202,7 +202,7 @@ class GlycanBERTLayer(nn.Module):
|
|
| 202 |
|
| 203 |
class GlycanBERT(nn.Module):
|
| 204 |
"""
|
| 205 |
-
|
| 206 |
"""
|
| 207 |
|
| 208 |
def __init__(self, config: GlycanBERTConfig):
|
|
|
|
| 1 |
"""
|
| 2 |
+
BERTose transformer layers.
|
| 3 |
|
| 4 |
Transformer blocks adapted for WURCS glycan tokenization.
|
| 5 |
"""
|
|
|
|
| 10 |
|
| 11 |
|
| 12 |
class GlycanBERTConfig:
|
| 13 |
+
"""Configuration for the BERTose transformer stack."""
|
| 14 |
|
| 15 |
def __init__(
|
| 16 |
self,
|
|
|
|
| 202 |
|
| 203 |
class GlycanBERT(nn.Module):
|
| 204 |
"""
|
| 205 |
+
BERTose transformer stack for masked language modeling.
|
| 206 |
"""
|
| 207 |
|
| 208 |
def __init__(self, config: GlycanBERTConfig):
|
src/bertose_model.py
CHANGED
|
@@ -1,5 +1,5 @@
|
|
| 1 |
"""
|
| 2 |
-
|
| 3 |
|
| 4 |
Core glycan representation model with three modalities:
|
| 5 |
- Sequence (WURCS atomic tokenization)
|
|
@@ -148,7 +148,7 @@ def create_residue_level_mask(
|
|
| 148 |
|
| 149 |
|
| 150 |
class MultimodalGlycanBERTConfig:
|
| 151 |
-
"""Configuration for the
|
| 152 |
|
| 153 |
def __init__(
|
| 154 |
self,
|
|
@@ -610,7 +610,7 @@ class CrossAttentionLayer(nn.Module):
|
|
| 610 |
|
| 611 |
class MultimodalGlycanBERT(nn.Module):
|
| 612 |
"""
|
| 613 |
-
|
| 614 |
|
| 615 |
Architecture:
|
| 616 |
1. Separate encoders for each modality (sequence, MS, 3D structure)
|
|
@@ -722,7 +722,7 @@ class MultimodalGlycanBERT(nn.Module):
|
|
| 722 |
return_dict: bool = True,
|
| 723 |
) -> Dict[str, torch.Tensor]:
|
| 724 |
"""
|
| 725 |
-
Forward pass for
|
| 726 |
|
| 727 |
Args:
|
| 728 |
seq_token_ids: (batch_size, seq_len) - Sequence token IDs
|
|
@@ -970,7 +970,7 @@ class MultimodalGlycanBERT(nn.Module):
|
|
| 970 |
if __name__ == "__main__":
|
| 971 |
# Test the model
|
| 972 |
print("="*80)
|
| 973 |
-
print("Testing
|
| 974 |
print("="*80)
|
| 975 |
|
| 976 |
# Create config
|
|
|
|
| 1 |
"""
|
| 2 |
+
BERTose model
|
| 3 |
|
| 4 |
Core glycan representation model with three modalities:
|
| 5 |
- Sequence (WURCS atomic tokenization)
|
|
|
|
| 148 |
|
| 149 |
|
| 150 |
class MultimodalGlycanBERTConfig:
|
| 151 |
+
"""Configuration for the BERTose model."""
|
| 152 |
|
| 153 |
def __init__(
|
| 154 |
self,
|
|
|
|
| 610 |
|
| 611 |
class MultimodalGlycanBERT(nn.Module):
|
| 612 |
"""
|
| 613 |
+
BERTose model for glycan representation learning.
|
| 614 |
|
| 615 |
Architecture:
|
| 616 |
1. Separate encoders for each modality (sequence, MS, 3D structure)
|
|
|
|
| 722 |
return_dict: bool = True,
|
| 723 |
) -> Dict[str, torch.Tensor]:
|
| 724 |
"""
|
| 725 |
+
Forward pass for BERTose.
|
| 726 |
|
| 727 |
Args:
|
| 728 |
seq_token_ids: (batch_size, seq_len) - Sequence token IDs
|
|
|
|
| 970 |
if __name__ == "__main__":
|
| 971 |
# Test the model
|
| 972 |
print("="*80)
|
| 973 |
+
print("Testing BERTose model")
|
| 974 |
print("="*80)
|
| 975 |
|
| 976 |
# Create config
|