Spaces:

KangjieXu
/

CASKP

Sleeping

App Files Files Community

KangjieXu committed on Feb 14

Commit

8b8182d

verified ·

1 Parent(s): cf5d567

Upload 8 files

Browse files

Files changed (8) hide show

Dockerfile +15 -0
README.md +11 -10
app.py +72 -0
model.py +277 -0
requirements.txt +15 -0
static/script.js +34 -0
static/style.css +10 -0
templates/index.html +32 -0

Dockerfile ADDED Viewed

	@@ -0,0 +1,15 @@

+FROM python:3.10-slim
+WORKDIR /app
+# Install low-level system dependencies (for RDKit and the build toolchain)
+RUN apt-get update && apt-get install -y \
+    git \
+    git-lfs \
+    build-essential \
+    libgl1 \
+    libglib2.0-0 \
+    && rm -rf /var/lib/apt/lists/*
+# Pin NumPy 1.x and Torch 2.4.1 (CPU build)
+# NOTE(review): only the NumPy pin is visible in this excerpt; the Torch install presumably follows - confirm.
+RUN pip install --no-cache-dir "numpy<2"

README.md CHANGED Viewed

@@ -1,10 +1,11 @@
----
-title: CASKP
-emoji: 🚀
-colorFrom: red
-colorTo: yellow
-sdk: docker
-pinned: false
----
-Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

+---
+title: CASKP Predictor
+emoji: 🧬
+colorFrom: green
+colorTo: blue
+sdk: docker
+pinned: false
+---
+# CASKP: Physics-Informed kcat Predictor for β-CAs
+This model integrates **ESM-2** embeddings with **Rosetta Docking Scores** to predict the kcat of β-Carbonic Anhydrases.

app.py ADDED Viewed

	@@ -0,0 +1,72 @@

+from flask import Flask, render_template, request, jsonify
+import torch
+from transformers import AutoTokenizer
+from huggingface_hub import hf_hub_download
+import os
+from model import FullKcatPredictor # model.py must sit in the same directory as this file
+app = Flask(__name__)
+# --- Configuration ---
+DEVICE = torch.device("cpu")
+ESM_MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
+REPO_ID = "KangjieXu/CASKP-model" # Change this to your own model repository
+# Both globals are assigned by load_model() and stay None until it has been called.
+MODEL = None
+TOKENIZER = None
+def load_model():
+    global MODEL, TOKENIZER
+    TOKENIZER = AutoTokenizer.from_pretrained(ESM_MODEL_NAME)
+    # 从 Hub 下载权重
+    weights_path = hf_hub_download(repo_id=REPO_ID, filename="caskp_final_model.pt")
+    MODEL = FullKcatPredictor(
+        esm_model_name=ESM_MODEL_NAME,
+        struct_dim=1,
+        d_model=256,
+        d_multiscale=128,
+        num_heads=8,
+        use_amsff=True
+    )
+    MODEL.load_state_dict(torch.load(weights_path, map_location=DEVICE))
+    MODEL.eval()
+@app.route('/')
+def index():
+    return render_template('index.html')
+@app.route('/predict', methods=['POST'])
+def predict():
+    try:
+        data = request.json
+        sequence = data.get('sequence', '').strip().upper()
+        struct_val = float(data.get('score', -7.5)) # 默认值
+        if not sequence:
+            return jsonify({'error': 'Sequence is empty'})
+        # 推理逻辑
+        inputs = TOKENIZER(sequence, return_tensors='pt', padding="max_length", max_length=512, truncation=True)
+        struct_features = torch.tensor([[struct_val]], dtype=torch.float)
+        with torch.no_grad():
+            log_kcat = MODEL(
+                input_ids=inputs['input_ids'],
+                attention_mask=inputs['attention_mask'],
+                struct_features=struct_features
+            ).item()
+        return jsonify({
+            'kcat': round(10**log_kcat, 4),
+            'log_kcat': round(log_kcat, 4),
+            'status': 'success'
+        })
+    except Exception as e:
+        return jsonify({'error': str(e)})
+if __name__ == '__main__':
+    load_model()
+    # HF Spaces requires the app to listen on port 7860
+    app.run(host='0.0.0.0', port=7860)

model.py ADDED Viewed

	@@ -0,0 +1,277 @@

+import torch
+import torch.nn as nn
+from transformers import EsmModel
+from torch_geometric.nn import GATv2Conv
+from torch_geometric.data import Data, Batch
+from rdkit import Chem
+from rdkit.Chem import AllChem
+# --- Helper Functions for Graph Creation ---
+def get_atom_features(atom):
+    possible_atoms = ['C', 'O', 'N', 'S', 'F', 'Cl', 'Br', 'I', 'P', 'Co', 'Fe', 'Cu', 'Zn', 'Mg', 'Mn', 'Cr', 'Ni']
+    features = [0] * (len(possible_atoms) + 1)
+    try:
+        idx = possible_atoms.index(atom.GetSymbol())
+        features[idx] = 1
+    except ValueError:
+        features[-1] = 1
+    return features
+def get_bond_features(bond):
+    bond_type = bond.GetBondType()
+    return [
+        bond_type == Chem.rdchem.BondType.SINGLE,
+        bond_type == Chem.rdchem.BondType.DOUBLE,
+        bond_type == Chem.rdchem.BondType.TRIPLE,
+        bond_type == Chem.rdchem.BondType.AROMATIC
+    ]
+def smiles_to_pyg_graph(smiles_string):
+    """
+    Converts a SMILES string into a PyTorch Geometric Data object.
+    Returns None if the SMILES string is invalid.
+    """
+    try:
+        mol = Chem.MolFromSmiles(smiles_string)
+        if mol is None: return None
+        mol = Chem.AddHs(mol)
+        AllChem.EmbedMolecule(mol, AllChem.ETKDG())
+        atom_features_list = [get_atom_features(atom) for atom in mol.GetAtoms()]
+        x = torch.tensor(atom_features_list, dtype=torch.float)
+        if mol.GetNumBonds() > 0:
+            edge_indices, edge_attrs = [], []
+            for bond in mol.GetBonds():
+                i = bond.GetBeginAtomIdx()
+                j = bond.GetEndAtomIdx()
+                edge_indices.append((i, j))
+                edge_indices.append((j, i))
+                bond_features = get_bond_features(bond)
+                edge_attrs.append(bond_features)
+                edge_attrs.append(bond_features)
+            edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
+            edge_attr = torch.tensor(edge_attrs, dtype=torch.float)
+        else:
+            edge_index = torch.empty((2, 0), dtype=torch.long)
+            edge_attr = torch.empty((0, 4), dtype=torch.float)
+        return Data(x=x, edge_index=edge_index, edge_attr=edge_attr)
+    except Exception:
+        return None
+# =====================================================================================
+# ==                      TransKP Model and Components (Locked)                      ==
+# =====================================================================================
+class SubstrateGNN(nn.Module):
+    """
+    Graph Attention Network (GATv2) that encodes a substrate graph into one embedding per graph.
+    Three GATv2Conv layers are applied to the node features, then node embeddings are mean-pooled.
+    """
+    def __init__(self, input_dim, hidden_dim, output_dim, heads=4, dropout=0.1):
+        super(SubstrateGNN, self).__init__()
+        # The first two layers concatenate their heads, so their output width is hidden_dim * heads
+        self.conv1 = GATv2Conv(input_dim, hidden_dim, heads=heads, dropout=dropout, concat=True)
+        self.conv2 = GATv2Conv(hidden_dim * heads, hidden_dim, heads=heads, dropout=dropout, concat=True)
+        # The last layer uses a single head and emits output_dim features per node
+        self.conv3 = GATv2Conv(hidden_dim * heads, output_dim, heads=1, dropout=dropout, concat=False)
+        self.elu = nn.ELU()
+        self.dropout = nn.Dropout(p=dropout)
+    def forward(self, data):
+        # Only node features and connectivity are used; data.edge_attr is not passed to the conv layers
+        x, edge_index = data.x, data.edge_index
+        x = self.dropout(self.elu(self.conv1(x, edge_index)))
+        x = self.dropout(self.elu(self.conv2(x, edge_index)))
+        x = self.conv3(x, edge_index)
+        if hasattr(data, 'batch') and data.batch is not None:
+            # Batched input: average the node embeddings of each graph separately
+            from torch_geometric.nn import global_mean_pool
+            graph_embedding = global_mean_pool(x, data.batch)
+        else:
+            # No batch vector: average over all nodes, keeping a leading dimension of size 1
+            graph_embedding = x.mean(dim=0, keepdim=True)
+        return graph_embedding
+class FusionBlock(nn.Module):
+    """
+    A single block for cross-modal fusion, combining self-attention and cross-attention.
+    The protein embedding passes through self-attention, cross-attention (protein as query,
+    substrate as key/value) and a feed-forward network; each step adds a residual connection
+    and is followed by LayerNorm.
+    """
+    def __init__(self, d_model, num_heads, dim_feedforward, dropout=0.1):
+        super(FusionBlock, self).__init__()
+        self.self_attn_protein = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
+        self.cross_attn_prot_to_sub = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
+        self.ffn_protein = nn.Sequential(
+            nn.Linear(d_model, dim_feedforward), nn.ReLU(), nn.Dropout(dropout),
+            nn.Linear(dim_feedforward, d_model), nn.Dropout(dropout)
+        )
+        self.norm1 = nn.LayerNorm(d_model)
+        self.norm2 = nn.LayerNorm(d_model)
+        self.norm3 = nn.LayerNorm(d_model)
+    def forward(self, protein_emb, substrate_emb, protein_mask=None):
+        # Returns the updated protein embedding; substrate_emb is only read, never updated
+        protein_emb = self.norm1(protein_emb + self._sa_block(protein_emb, protein_mask))
+        protein_emb = self.norm2(protein_emb + self._ca_block(protein_emb, substrate_emb))
+        protein_emb = self.norm3(protein_emb + self.ffn_protein(protein_emb))
+        return protein_emb
+    def _sa_block(self, x, key_padding_mask):
+        # Self-attention over x; the attention weights are discarded
+        x, _ = self.self_attn_protein(x, x, x, key_padding_mask=key_padding_mask)
+        return x
+    def _ca_block(self, query, key_value):
+        # Cross-attention with key_value used as both key and value; no padding mask is applied
+        x, _ = self.cross_attn_prot_to_sub(query, key_value, key_value)
+        return x
+class DeepFusionKcatPredictor(nn.Module):
+    """
+    The TransKP model that integrates ESM-2 for protein sequences and a GNN for substrates.
+    Protein token embeddings are projected to d_model, fused with the substrate embedding by a
+    stack of FusionBlocks, mean-pooled over non-padding tokens and passed to a small regressor.
+    """
+    def __init__(self, esm_model_name, gnn_input_dim, gnn_hidden_dim, gnn_heads, d_model,
+                 num_fusion_blocks, num_attn_heads, dim_feedforward, dropout=0.1):
+        super(DeepFusionKcatPredictor, self).__init__()
+        self.esm_model = EsmModel.from_pretrained(esm_model_name)
+        self.protein_projection = nn.Linear(self.esm_model.config.hidden_size, d_model)
+        self.gnn = SubstrateGNN(input_dim=gnn_input_dim, hidden_dim=gnn_hidden_dim, output_dim=d_model, heads=gnn_heads)
+        self.fusion_blocks = nn.ModuleList([
+            FusionBlock(d_model, num_attn_heads, dim_feedforward, dropout) for _ in range(num_fusion_blocks)
+        ])
+        self.output_regressor = nn.Sequential(
+            nn.Linear(d_model, d_model // 2),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(d_model // 2, 1)
+        )
+    def forward(self, input_ids, attention_mask, smiles_list):
+        # Returns one float32 prediction per input row
+        batch_size = input_ids.shape[0]
+        device = input_ids.device
+        # Rows whose SMILES cannot be converted to a graph keep this initial value of 0.0
+        final_predictions = torch.zeros(batch_size, device=device, dtype=torch.float32)
+        graphs = [smiles_to_pyg_graph(s) for s in smiles_list]
+        valid_indices = [i for i, g in enumerate(graphs) if g is not None]
+        if valid_indices:
+            valid_graphs = [graphs[i] for i in valid_indices]
+            graph_batch = Batch.from_data_list(valid_graphs).to(device)
+            # unsqueeze(1) adds a length-1 sequence axis so the substrate can act as key/value in attention
+            substrate_embedding = self.gnn(graph_batch).unsqueeze(1)
+            # Only the rows with a usable substrate graph are run through ESM
+            valid_input_ids = input_ids[valid_indices]
+            valid_attention_mask = attention_mask[valid_indices]
+            esm_outputs = self.esm_model(input_ids=valid_input_ids, attention_mask=valid_attention_mask)
+            protein_embedding = self.protein_projection(esm_outputs.last_hidden_state)
+            fused_output = protein_embedding
+            # True where attention_mask is 0 - assumes 0 marks padding tokens (TODO confirm against the tokenizer)
+            key_padding_mask = (valid_attention_mask == 0)
+            for block in self.fusion_blocks:
+                fused_output = block(fused_output, substrate_embedding, protein_mask=key_padding_mask)
+            # Mean over tokens weighted by attention_mask, so masked positions do not contribute
+            masked_fused_output = fused_output * valid_attention_mask.unsqueeze(-1)
+            summed_output = masked_fused_output.sum(dim=1)
+            non_pad_count = valid_attention_mask.sum(dim=1, keepdim=True)
+            # clamp guards against division by zero when a row has no unmasked token
+            pooled_output = summed_output / non_pad_count.clamp(min=1e-9)
+            predicted_kcat = self.output_regressor(pooled_output).squeeze(-1)
+            final_predictions[valid_indices] = predicted_kcat.to(torch.float32)
+        return final_predictions
+# =====================================================================================
+# ==                      CASKP Model and Components (New Code)                      ==
+# =====================================================================================
+class AMSFF(nn.Module):
+    """
+    Adaptive Multi-Scale Feature Fusion (AMSFF) block.
+    Extracts multi-scale features from sequence embeddings using 1D convolutions.
+    """
+    def __init__(self, d_model, d_multiscale, dropout=0.1):
+        super(AMSFF, self).__init__()
+        self.d_model = d_model
+        self.conv_k3 = nn.Conv1d(d_model, d_multiscale, kernel_size=3, padding=1)
+        self.conv_k9 = nn.Conv1d(d_model, d_multiscale, kernel_size=9, padding=4)
+        self.conv_k21 = nn.Conv1d(d_model, d_multiscale, kernel_size=21, padding=10)
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(dropout)
+        self.projection = nn.Linear(d_multiscale * 3, d_model)
+    def forward(self, seq_embedding):
+        x = seq_embedding.transpose(1, 2)
+        h_local = self.relu(self.conv_k3(x))
+        h_medium = self.relu(self.conv_k9(x))
+        h_global = self.relu(self.conv_k21(x))
+        h_multi_scale = torch.cat([h_local, h_medium, h_global], dim=1)
+        h_multi_scale = h_multi_scale.transpose(1, 2)
+        projected_features = self.dropout(self.projection(h_multi_scale))
+        return projected_features
+class HyperAttention(nn.Module):
+    """
+    HyperAttention Fusion block.
+    Fuses sequence and structure embeddings using spatial cross-attention.
+    """
+    def __init__(self, d_model, struct_dim, num_heads=8, dropout=0.1):
+        super(HyperAttention, self).__init__()
+        self.d_model = d_model
+        self.spatial_attention = nn.MultiheadAttention(d_model, num_heads, dropout=dropout, batch_first=True)
+        self.struct_projection = nn.Linear(struct_dim, d_model)
+        self.norm1 = nn.LayerNorm(d_model)
+    def forward(self, seq_embedding, struct_features):
+        struct_kv = self.struct_projection(struct_features).unsqueeze(1)
+        spatial_out, _ = self.spatial_attention(seq_embedding, struct_kv, struct_kv)
+        fused_embedding = self.norm1(seq_embedding + spatial_out)
+        return fused_embedding
+class FullKcatPredictor(nn.Module):
+    """
+    The CASKP model, integrating ESM-2, AMSFF, HyperAttention, and a regressor.
+    """
+    def __init__(self, esm_model_name, struct_dim, d_model=256, d_multiscale=128, num_heads=8, dropout=0.1, use_amsff=True):
+        super(FullKcatPredictor, self).__init__()
+        self.use_amsff = use_amsff
+        self.esm_model = EsmModel.from_pretrained(esm_model_name)
+        self.protein_projection = nn.Linear(self.esm_model.config.hidden_size, d_model)
+        if self.use_amsff:
+            self.amsff = AMSFF(d_model, d_multiscale, dropout)
+        self.hyper_attention = HyperAttention(d_model, struct_dim, num_heads, dropout)
+        self.output_regressor = nn.Sequential(
+            nn.Linear(d_model, d_model // 2),
+            nn.ReLU(),
+            nn.Dropout(dropout),
+            nn.Linear(d_model // 2, 1)
+        )
+    def forward(self, input_ids, attention_mask, struct_features):
+        esm_outputs = self.esm_model(input_ids=input_ids, attention_mask=attention_mask)
+        protein_embedding = self.protein_projection(esm_outputs.last_hidden_state)
+        if self.use_amsff:
+            seq_feat_multiscale = self.amsff(protein_embedding)
+            fused_output = self.hyper_attention(seq_feat_multiscale, struct_features)
+        else:
+            fused_output = self.hyper_attention(protein_embedding, struct_features)
+        masked_fused_output = fused_output * attention_mask.unsqueeze(-1)
+        summed_output = masked_fused_output.sum(dim=1)
+        non_pad_count = attention_mask.sum(dim=1, keepdim=True)
+        pooled_output = summed_output / non_pad_count.clamp(min=1e-9)
+        predicted_kcat = self.output_regressor(pooled_output)
+        return predicted_kcat

requirements.txt ADDED Viewed

	@@ -0,0 +1,15 @@

+huggingface_hub>=0.28.0
+transformers>=4.48.0
+numpy<2
+xgboost
+scikit-learn
+flask
+rdkit
+# Graph neural network components (wheels built for Torch 2.4)
+--find-links https://data.pyg.org/whl/torch-2.4.0+cpu.html
+torch_geometric
+torch-scatter
+torch-sparse
+torch-cluster
+torch-spline-conv

static/script.js ADDED Viewed

	@@ -0,0 +1,34 @@

+async function runPrediction() {
+    const seq = document.getElementById('sequence').value;
+    const score = document.getElementById('score').value;
+    const btn = document.getElementById('btn');
+    const resBox = document.getElementById('result');
+    if(!seq) { alert("Please enter a sequence!"); return; }
+    btn.innerText = "Processing...";
+    btn.disabled = true;
+    try {
+        const response = await fetch('/predict', {
+            method: 'POST',
+            headers: { 'Content-Type': 'application/json' },
+            body: JSON.stringify({ sequence: seq, score: score })
+        });
+        const data = await response.json();
+        if(data.status === 'success') {
+            document.getElementById('kcat_val').innerText = data.kcat;
+            document.getElementById('log_kcat_val').innerText = data.log_kcat;
+            resBox.style.display = "block";
+        } else {
+            alert("Error: " + data.error);
+        }
+    } catch (e) {
+        alert("Request failed!");
+    } finally {
+        btn.innerText = "Predict kcat";
+        btn.disabled = false;
+    }
+}

static/style.css ADDED Viewed

	@@ -0,0 +1,10 @@

+/* Page shell: light background with the card centered horizontally */
+body {
+    font-family: 'Segoe UI', Tahoma, Geneva, Verdana, sans-serif;
+    background: #f4f7f6;
+    display: flex;
+    justify-content: center;
+    padding: 50px;
+}
+/* White card that holds the form */
+.container {
+    background: white;
+    padding: 30px;
+    border-radius: 12px;
+    box-shadow: 0 4px 15px rgba(0,0,0,0.1);
+    width: 100%;
+    max-width: 600px;
+}
+h1 {
+    color: #2c3e50;
+    text-align: center;
+}
+/* Form fields */
+.input-group {
+    margin-bottom: 20px;
+}
+label {
+    display: block;
+    margin-bottom: 8px;
+    font-weight: bold;
+    color: #34495e;
+}
+textarea {
+    width: 100%;
+    height: 120px;
+    padding: 10px;
+    border: 1px solid #ddd;
+    border-radius: 6px;
+    box-sizing: border-box;
+}
+input {
+    width: 100%;
+    padding: 10px;
+    border: 1px solid #ddd;
+    border-radius: 6px;
+    box-sizing: border-box;
+}
+/* Submit button */
+button {
+    width: 100%;
+    padding: 12px;
+    background: #27ae60;
+    color: white;
+    border: none;
+    border-radius: 6px;
+    cursor: pointer;
+    font-size: 16px;
+}
+button:hover {
+    background: #219150;
+}
+/* Result panel, hidden until a prediction arrives */
+.result-box {
+    margin-top: 30px;
+    padding: 20px;
+    background: #e8f6ef;
+    border-radius: 8px;
+    border-left: 5px solid #27ae60;
+}

templates/index.html ADDED Viewed

	@@ -0,0 +1,32 @@

+<!DOCTYPE html>
+<!-- Single-page form for the CASKP predictor; the request logic lives in static/script.js -->
+<html lang="en">
+<head>
+    <meta charset="utf-8">
+    <meta name="viewport" content="width=device-width, initial-scale=1">
+    <title>CASKP Predictor</title>
+    <link rel="stylesheet" href="/static/style.css">
+</head>
+<body>
+    <div class="container">
+        <h1>🧬 CASKP Predictor</h1>
+        <p>Physics-Informed kcat Prediction for β-CAs</p>
+        <div class="input-group">
+            <label for="sequence">Protein Sequence:</label>
+            <textarea id="sequence" placeholder="Enter amino acid sequence (e.g., MSK...)"></textarea>
+        </div>
+        <div class="input-group">
+            <label for="score">Rosetta Docking Score (Physics Prior):</label>
+            <input type="number" id="score" value="-7.5" step="0.1">
+        </div>
+        <button type="button" onclick="runPrediction()" id="btn">Predict kcat</button>
+        <!-- Hidden until runPrediction() receives a successful response -->
+        <div id="result" class="result-box" style="display:none;">
+            <h3>Prediction Results:</h3>
+            <p>kcat (s⁻¹): <strong id="kcat_val">-</strong></p>
+            <p>log10(kcat): <strong id="log_kcat_val">-</strong></p>
+        </div>
+    </div>
+    <script src="/static/script.js"></script>
+</body>
+</html>