Upload folder using huggingface_hub
Browse files- README.md +66 -0
- pytorch_model.bin +3 -0
- trainable_codon_encoder.py +95 -0
README.md
ADDED
|
@@ -0,0 +1,66 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
---
|
| 2 |
+
language:
|
| 3 |
+
- en
|
| 4 |
+
tags:
|
| 5 |
+
- biology
|
| 6 |
+
- genomics
|
| 7 |
+
- codon-optimization
|
| 8 |
+
- p-adic-math
|
| 9 |
+
- hyperbolic-geometry
|
| 10 |
+
- ddg-prediction
|
| 11 |
+
license: other
|
| 12 |
+
metrics:
|
| 13 |
+
- spearmanr
|
| 14 |
+
---
|
| 15 |
+
|
| 16 |
+
# Ternary Codon Encoder: P-adic Hyperbolic Embeddings
|
| 17 |
+
|
| 18 |
+
The Ternary Codon Encoder is a neural embedding model that maps the 64 genetic codons into a 16-dimensional hyperbolic space. It is the first model to explicitly use **3-adic valuation** as a mathematical prior to organize the genetic code's hierarchical structure.
|
| 19 |
+
|
| 20 |
+
## Model Description
|
| 21 |
+
|
| 22 |
+
- **Architecture:** MLP-based encoder (12-dim one-hot codon input $\rightarrow$ 16-dim hyperbolic latent).
|
| 23 |
+
- **Mathematical Foundation:** Leverages 3-adic mathematics to represent the discrete hierarchy of the codon table.
|
| 24 |
+
- **Latent Space:** Poincaré ball where radial distance encodes 3-adic valuation (conservation/variability).
|
| 25 |
+
|
| 26 |
+
## Key Discoveries
|
| 27 |
+
|
| 28 |
+
- **Physics Dimension:** Latent dimension 13 correlates strongly (Spearman $\rho$; exact value truncated in source) with physicochemical codon properties.
|
| 29 |
+
- **Linear Stability Manifold:** Provides high-quality feature vectors for sequence-only protein stability ($\Delta\Delta G$) prediction.
|
| 30 |
+
- **Synonymous Cohesion:** Synonymous codons cluster together in hyperbolic space while maintaining clear boundaries between amino acid groups.
|
| 31 |
+
|
| 32 |
+
## Performance
|
| 33 |
+
|
| 34 |
+
- **DDG Spearman $\rho$:** strong rank correlation on sequence-only $\Delta\Delta G$ prediction benchmarks (exact value truncated in source).
|
| 35 |
+
- **Improvement:** +105% over baseline p-adic embedding models.
|
| 36 |
+
|
| 37 |
+
## Usage
|
| 38 |
+
|
| 39 |
+
```python
|
| 40 |
+
import torch
|
| 41 |
+
from trainable_codon_encoder import TrainableCodonEncoder
|
| 42 |
+
|
| 43 |
+
# Load model
|
| 44 |
+
encoder = TrainableCodonEncoder(latent_dim=16, hidden_dim=64)
|
| 45 |
+
checkpoint = torch.load("pytorch_model.bin", map_location="cpu")
|
| 46 |
+
encoder.load_state_dict(checkpoint["model_state_dict"])
|
| 47 |
+
encoder.eval()
|
| 48 |
+
|
| 49 |
+
# Get embedding for a codon (e.g., ATG index 14)
|
| 50 |
+
codon_idx = torch.tensor([14])
|
| 51 |
+
with torch.no_grad():
|
| 52 |
+
z_hyp = encoder(codon_idx)
|
| 53 |
+
|
| 54 |
+
print(f"Hyperbolic Embedding: {z_hyp}")
|
| 55 |
+
```
|
| 56 |
+
|
| 57 |
+
## Citation
|
| 58 |
+
|
| 59 |
+
```bibtex
|
| 60 |
+
@software{ternary_codon_2026,
|
| 61 |
+
author = {AI Whisperers},
|
| 62 |
+
title = {Ternary Codon Encoder: P-adic Hyperbolic Embeddings},
|
| 63 |
+
year = {2026},
|
| 64 |
+
url = {https://huggingface.co/ai-whisperers/ternary-codon-encoder}
|
| 65 |
+
}
|
| 66 |
+
```
|
pytorch_model.bin
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d6113ab0ebf09b6acd0f2ff8a9a47a479468752c5676a034337e32fa7add8861
|
| 3 |
+
size 51921
|
trainable_codon_encoder.py
ADDED
|
@@ -0,0 +1,95 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import torch
|
| 2 |
+
import torch.nn as nn
|
| 3 |
+
import torch.nn.functional as F
|
| 4 |
+
import math
|
| 5 |
+
from typing import Optional
|
| 6 |
+
|
| 7 |
+
# =============================================================================
|
| 8 |
+
# Biology Data
|
| 9 |
+
# =============================================================================
|
| 10 |
+
|
| 11 |
+
# Standard genetic code: codon (DNA alphabet) -> one-letter amino acid,
# with '*' marking the three stop codons (TAA, TAG, TGA).
# Fix: GGC codes for Glycine ('G'); the original table had a typo ('A')
# that was patched after the fact — corrected inline here.
GENETIC_CODE = {
    'ATA':'I', 'ATC':'I', 'ATT':'I', 'ATG':'M',
    'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
    'AAC':'N', 'AAT':'N', 'AAA':'K', 'AAG':'K',
    'AGC':'S', 'AGT':'S', 'AGA':'R', 'AGG':'R',
    'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
    'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
    'CAC':'H', 'CAT':'H', 'CAA':'Q', 'CAG':'Q',
    'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
    'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
    'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
    'GAC':'D', 'GAT':'D', 'GAA':'E', 'GAG':'E',
    'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
    'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
    'TTC':'F', 'TTT':'F', 'TTA':'L', 'TTG':'L',
    'TAC':'Y', 'TAT':'Y', 'TAA':'*', 'TAG':'*',
    'TGC':'C', 'TGT':'C', 'TGA':'*', 'TGG':'W',
}
|
| 31 |
+
|
| 32 |
+
BASES = ['A', 'C', 'G', 'T']

# Enumerate all 64 codons in lexicographic order over (pos1, pos2, pos3),
# so 'AAA' -> 0 and 'TTT' -> 63.
CODON_TO_INDEX = {}
for first in BASES:
    for second in BASES:
        for third in BASES:
            CODON_TO_INDEX[first + second + third] = len(CODON_TO_INDEX)

# Reverse lookup: integer index -> codon string.
INDEX_TO_CODON = {idx: codon for codon, idx in CODON_TO_INDEX.items()}
|
| 35 |
+
|
| 36 |
+
# =============================================================================
|
| 37 |
+
# Hyperbolic Utilities
|
| 38 |
+
# =============================================================================
|
| 39 |
+
|
| 40 |
+
def exp_map_zero(x: torch.Tensor, c: float = 1.0) -> torch.Tensor:
    """Exponential map at the origin of a Poincare ball with curvature ``c``.

    Maps a tangent-space vector into the ball: x -> tanh(sqrt(c)*|x|) * x / (sqrt(c)*|x|).
    The norm is clamped from below so zero vectors map to the origin instead
    of producing a division by zero.
    """
    root_c = math.sqrt(c)
    magnitude = torch.clamp(torch.norm(x, p=2, dim=-1, keepdim=True), min=1e-15)
    scaled = root_c * magnitude
    return torch.tanh(scaled) * x / scaled
|
| 46 |
+
|
| 47 |
+
def project_to_poincare(z: torch.Tensor, max_norm: float = 0.95, c: float = 1.0) -> torch.Tensor:
    """Clip points to lie within Euclidean radius ``max_norm`` of the Poincare ball.

    Points whose norm exceeds ``max_norm`` are radially rescaled onto that
    sphere; all other points pass through unchanged.

    Args:
        z: (..., dim) batch of points.
        max_norm: maximum allowed Euclidean norm (< 1 for the unit ball).
        c: curvature; unused here but kept for interface symmetry with
           ``exp_map_zero`` and existing callers.

    Returns:
        Tensor with the same shape as ``z``.
    """
    norm = torch.norm(z, p=2, dim=-1, keepdim=True)
    mask = norm > max_norm
    # Fix: torch.where evaluates BOTH branches, so an unguarded z / norm
    # yields NaN for zero-norm rows and poisons gradients in backward even
    # though the forward result discards that branch. Clamp the denominator.
    projected = (z / torch.clamp(norm, min=1e-15)) * max_norm
    return torch.where(mask, projected, z)
|
| 52 |
+
|
| 53 |
+
# =============================================================================
|
| 54 |
+
# Codon Encoder
|
| 55 |
+
# =============================================================================
|
| 56 |
+
|
| 57 |
+
class CodonEncoderMLP(nn.Module):
    """Three-layer MLP mapping a 12-dim codon one-hot to a latent tangent vector.

    Two hidden blocks of Linear -> LayerNorm -> SiLU -> Dropout, followed by a
    final linear projection to ``latent_dim``.
    """

    def __init__(self, latent_dim=16, hidden_dim=64, dropout=0.1):
        super().__init__()
        # Build layers in the same order as a hand-written Sequential so
        # parameter initialization is reproducible under a fixed seed.
        layers = []
        for in_features in (12, hidden_dim):
            layers.append(nn.Linear(in_features, hidden_dim))
            layers.append(nn.LayerNorm(hidden_dim))
            layers.append(nn.SiLU())
            layers.append(nn.Dropout(dropout))
        layers.append(nn.Linear(hidden_dim, latent_dim))
        self.encoder = nn.Sequential(*layers)

    def forward(self, x):
        return self.encoder(x)
|
| 66 |
+
|
| 67 |
+
class TrainableCodonEncoder(nn.Module):
    """Codon encoder producing Poincare-ball embeddings for the 64 codons.

    Wraps ``CodonEncoderMLP``: tangent-space outputs are mapped into
    hyperbolic space via ``exp_map_zero`` and clipped to ``max_radius``
    with ``project_to_poincare``.
    """

    def __init__(self, latent_dim=16, hidden_dim=64, curvature=1.0, max_radius=0.9, dropout=0.1):
        super().__init__()
        self.latent_dim = latent_dim
        self.curvature = curvature
        self.max_radius = max_radius
        self.encoder = CodonEncoderMLP(latent_dim, hidden_dim, dropout)

        # Precompute a fixed (64, 12) one-hot table: 3 positions x 4 bases.
        # 'U' aliases 'T' so RNA-alphabet codons would map identically.
        base_to_idx = {'A': 0, 'C': 1, 'G': 2, 'T': 3, 'U': 3}
        onehots = torch.zeros(64, 12)
        for codon_index in range(64):
            for position, base in enumerate(INDEX_TO_CODON[codon_index]):
                onehots[codon_index, position * 4 + base_to_idx[base]] = 1.0
        # Buffer (not a Parameter): moves with .to()/.cuda() but is not trained.
        self.register_buffer('codon_onehots', onehots)

    def encode_all(self):
        """Embed all 64 codons at once; returns a (64, latent_dim) tensor."""
        tangent = self.encoder(self.codon_onehots)
        ball = exp_map_zero(tangent, c=self.curvature)
        return project_to_poincare(ball, max_norm=self.max_radius, c=self.curvature)

    def forward(self, codon_indices):
        """Embed codon indices; output has shape ``codon_indices.shape + (latent_dim,)``.

        Args:
            codon_indices: integer tensor of codon indices in [0, 64).
        """
        flat = codon_indices.flatten()
        tangent = self.encoder(self.codon_onehots[flat])
        ball = exp_map_zero(tangent, c=self.curvature)
        ball = project_to_poincare(ball, max_norm=self.max_radius, c=self.curvature)
        if codon_indices.dim() > 1:
            ball = ball.view(*codon_indices.shape, self.latent_dim)
        return ball
|