Initial upload: weights, config, code, README, requirements

Browse files

Files changed (6) hide show

README.md +57 -0
config.json +9 -0
model.pt +3 -0
model.py +48 -0
model.safetensors +3 -0
requirements.txt +5 -0

README.md ADDED Viewed

	@@ -0,0 +1,57 @@

+---
+library_name: "pytorch"
+tags:
+  - protein
+  - biosequence
+  - cnn
+  - embedding
+license: apache-2.0
+---
+# CNNED_Protein
+CNN-based embedding model for protein/bio sequences (triplet/contrastive training ready).
+## Model Summary
+- **Input**: one-hot encoded sequence of shape `(B, A, L)`
+- **Encoder**: 1D CNN + AvgPooling stacks
+- **Output**: L2-normalized embedding `(B, D)` via projection head
+- **Training**: Designed for triplet/contrastive loss (anchor, positive, negative)
+### Config
+- `alphabet_size`: 27
+- `target_size`: 128
+- `channel`: 256
+- `depth`: 3
+- `kernel_size`: 7
+- `l2norm`: True
+## Usage
+```python
+import json, torch
+from safetensors.torch import load_file
+# Load config
+cfg = json.load(open("config.json","r"))
+from model import CNNED_Protein
+model = CNNED_Protein(**cfg).eval()
+# Load weights
+try:
+    sd = load_file("model.safetensors")
+except Exception:
+    sd = torch.load("model.pt", map_location="cpu")
+model.load_state_dict(sd, strict=True)
+model.eval()
+# Dummy inference
+# x: (B, A, L) one-hot tensor
+x = torch.randn(2, cfg['alphabet_size'], 512)
+y, z = model.encode(x)
+print(y.shape)  # (2, target_size)
+```
+## Notes
+- TripletMarginLoss / InfoNCE 등 metric learning에 적합.
+- Sequence length 변동에 robust (global pooling).

config.json ADDED Viewed

	@@ -0,0 +1,9 @@

+{
+  "model_class": "CNNED_Protein",
+  "alphabet_size": 27,
+  "target_size": 128,
+  "channel": 256,
+  "depth": 3,
+  "kernel_size": 7,
+  "l2norm": true
+}

model.pt ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:b836a1eaccd791f88d122c2773a3c80ef75f7c47dbc3dcdec998600a0e6825ae
+size 4277097

model.py ADDED Viewed

	@@ -0,0 +1,48 @@

+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+class CNNED_Protein(nn.Module):
+    def __init__(self, alphabet_size: int, target_size: int,
+                 channel: int, depth: int, kernel_size: int, l2norm: bool = True):
+        super().__init__()
+        C_in = alphabet_size
+        C = channel
+        K = kernel_size
+        pad = K // 2
+        blocks = [
+            nn.Conv1d(C_in, C, K, stride=1, padding=pad, bias=False),
+            nn.BatchNorm1d(C),
+            nn.ReLU(inplace=True),
+        ]
+        for _ in range(depth - 1):
+            blocks += [
+                nn.Conv1d(C, C, K, stride=1, padding=pad, bias=False),
+                nn.BatchNorm1d(C),
+                nn.ReLU(inplace=True),
+                nn.AvgPool1d(2),
+            ]
+        self.conv = nn.Sequential(*blocks)
+        self.pool = nn.AdaptiveAvgPool1d(1)
+        self.proj = nn.Sequential(
+            nn.Linear(C, C),
+            nn.ReLU(inplace=True),
+            nn.Linear(C, target_size),
+        )
+        self.l2norm = l2norm
+    def encode(self, x: torch.Tensor):
+        # x: (B, A, L)
+        z = self.conv(x)              # (B, C, L')
+        z = self.pool(z).squeeze(-1)  # (B, C)
+        y = self.proj(z)              # (B, D)
+        if self.l2norm:
+            y = F.normalize(y, dim=-1)
+        return y, z
+    def forward(self, a: torch.Tensor, p: torch.Tensor, n: torch.Tensor):
+        ay, _ = self.encode(a)
+        py, _ = self.encode(p)
+        ny, _ = self.encode(n)
+        return ay, py, ny

model.safetensors ADDED Viewed

	@@ -0,0 +1,3 @@

+version https://git-lfs.github.com/spec/v1
+oid sha256:aee9a63233ca456f0f7a044ef706053698d96d4d52639e07f20be38854c61414
+size 4272400

requirements.txt ADDED Viewed

	@@ -0,0 +1,5 @@

+torch>=2.0.0
+numpy>=1.24
+safetensors>=0.4.0
+tqdm>=4.65
+biopython>=1.83