dsk129 committed on
Commit
4f558d2
·
verified ·
1 Parent(s): f408d1f

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +98 -0
app.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import numpy as np
3
+ import gradio as gr
4
+ import matplotlib.pyplot as plt
5
+ from transformers import AutoTokenizer, EsmModel
6
+ from sklearn.decomposition import PCA
7
+ from Bio.PDB import PDBParser, PDBIO
8
+ import py3Dmol
9
+ import tempfile
10
+ import os
11
+
12
# ESM-1b encoder and its matching tokenizer, loaded once at import time.
# The encoder is configured to also expose per-layer hidden states.
model = EsmModel.from_pretrained(
    "facebook/esm1b_t33_650M_UR50S",
    output_hidden_states=True,
)
tokenizer = AutoTokenizer.from_pretrained(
    "facebook/esm1b_t33_650M_UR50S",
)
15
+
16
# Compute PCA and return scaled values for selected components
def compute_scaled_pca_scores(seq, components):
    """Project per-residue ESM-1b embeddings onto PCA axes, scaled to 0-100.

    Args:
        seq: Protein sequence in one-letter code. All whitespace (spaces,
            line breaks from a pasted FASTA body, ...) is removed first so
            that the residue count matches the number of residue tokens.
        components: Zero-based indices of the principal components to return.

    Returns:
        A list with one 1-D NumPy array per requested component, in the order
        given. Each array holds one value per residue, min-max scaled to the
        range [0, 100]. An empty list is returned when ``components`` is empty.

    Raises:
        ValueError: If the sequence is empty after whitespace removal, or if
            the highest requested component cannot be computed from the
            available number of residues / embedding dimensions.
    """
    components = list(components)
    if not components:
        # Nothing requested: skip the (expensive) model forward pass entirely.
        return []

    # Whitespace would inflate len(seq) relative to the token count and shift
    # the slice below so that it no longer drops the EOS token.
    seq = "".join(seq.split())
    if not seq:
        raise ValueError("The protein sequence is empty.")

    inputs = tokenizer(seq, return_tensors="pt")
    with torch.no_grad():
        outputs = model(**inputs)
    embedding = outputs.last_hidden_state[0]

    n_residues = len(seq)
    # remove CLS and EOS
    residue_embeddings = embedding[1:n_residues + 1].detach().cpu().numpy()

    # PCA can produce at most min(n_samples, n_features) components; fail with
    # a readable message instead of an opaque error from the PCA fit.
    n_components = max(components) + 1
    max_components = min(residue_embeddings.shape)
    if n_components > max_components:
        raise ValueError(
            f"Component index {max(components)} is not available: at most "
            f"{max_components} components can be computed for this sequence."
        )

    pca = PCA(n_components=n_components)
    pca_result = pca.fit_transform(residue_embeddings)

    scaled_components = []
    for c in components:
        selected = pca_result[:, c]
        lowest = selected.min()
        span = selected.max() - lowest
        if span > 0:
            scaled = (selected - lowest) / span * 100
        else:
            # A constant component has no spread; avoid 0/0 -> NaN scores.
            scaled = np.zeros_like(selected)
        scaled_components.append(scaled)

    return scaled_components
36
+
37
# Inject scores into B-factor column and save each PDB separately
def inject_bfactors_and_save(pdb_file, scores_list, component_indices):
    """Write one PDB file per component with scores stored as B-factors.

    Residues are visited in file order (every model, chain and residue of the
    parsed structure) and the i-th residue receives the i-th score on all of
    its atoms. Residues beyond the end of ``scores`` are left untouched, so
    they keep whatever B-factor they carried before that pass.

    Args:
        pdb_file: Either a filesystem path or an object exposing the path as
            a ``.name`` attribute (such as an uploaded-file wrapper).
        scores_list: One sequence of per-residue scores per component.
        component_indices: Component index for each entry of ``scores_list``;
            used only to label the output file names (``..._PC<idx>.pdb``).

    Returns:
        List of paths to the temporary PDB files, one per component, in the
        same order as ``scores_list``. The files are not deleted here.

    Raises:
        ValueError: If ``pdb_file`` is ``None``.
    """
    if pdb_file is None:
        raise ValueError("No PDB file was provided.")
    # Accept both plain path strings and file-like wrappers carrying `.name`.
    pdb_path = getattr(pdb_file, "name", pdb_file)

    parser = PDBParser(QUIET=True)
    structure = parser.get_structure("prot", pdb_path)
    output_paths = []

    for scores, idx in zip(scores_list, component_indices):
        residues = (
            residue
            for pdb_model in structure
            for chain in pdb_model
            for residue in chain
        )
        # zip() stops at the shorter input, i.e. once the scores run out.
        for residue, score in zip(residues, scores):
            value = float(score)
            for atom in residue:
                atom.bfactor = value

        # Reserve a unique path and close the handle right away so it is not
        # leaked and the path can be reopened for writing on every platform.
        with tempfile.NamedTemporaryFile(delete=False, suffix=f"_PC{idx}.pdb") as tmp:
            out_path = tmp.name

        io = PDBIO()
        io.set_structure(structure)
        io.save(out_path)
        output_paths.append(out_path)

    return output_paths
60
+
61
# Render structure with py3Dmol
def render_structure(pdb_path):
    """Build the HTML for a py3Dmol viewer showing the PDB file at *pdb_path*.

    The file is read as text, added to a 600x400 viewer as a 'pdb' model and
    drawn in cartoon style with its colour option set to 'bfactor'. The HTML
    produced by the viewer is returned as a string.
    """
    with open(pdb_path, 'r') as handle:
        pdb_text = handle.read()

    cartoon_style = {'cartoon': {'color': 'bfactor'}}

    viewer = py3Dmol.view(width=600, height=400)
    viewer.addModel(pdb_text, 'pdb')
    viewer.setStyle(cartoon_style)
    viewer.zoomTo()

    # NOTE: relies on py3Dmol's private `_make_html` helper.
    return viewer._make_html()
70
+
71
# Gradio interface logic
def process(seq, pdb_file, component_string):
    """Validate the form inputs and run the PCA -> B-factor -> viewer pipeline.

    Args:
        seq: Protein sequence typed by the user (one-letter code).
        pdb_file: Uploaded PDB file as handed over by the UI, or ``None`` when
            nothing was uploaded.
        component_string: Comma-separated, zero-based PCA component indices,
            e.g. ``"0,1,2"``.

    Returns:
        A ``(files, html)`` pair matching the interface's two outputs. On
        success ``files`` is the list of generated PDB paths and ``html`` the
        viewer markup for the first one. On invalid input ``files`` is
        ``None`` and ``html`` carries a message starting with ``"Error:"``.
    """
    from html import escape  # local import: only needed for error display

    # Every entry must be a plain non-negative integer; entries are no longer
    # dropped silently, so a typo cannot quietly change what gets computed.
    tokens = [token.strip() for token in (component_string or "").split(",")]
    tokens = [token for token in tokens if token]
    if not tokens or not all(t.isascii() and t.isdigit() for t in tokens):
        return None, (
            "Error: Please input a comma-separated list of non-negative "
            "integers (e.g. 0,1,2)."
        )
    components = [int(token) for token in tokens]

    if not seq or not seq.strip():
        return None, "Error: Please enter a protein sequence."
    if pdb_file is None:
        return None, "Error: Please upload a PDB file."

    try:
        scores_list = compute_scaled_pca_scores(seq, components)
        pdb_paths = inject_bfactors_and_save(pdb_file, scores_list, components)
    except ValueError as exc:
        # Surface input-related failures (e.g. more components requested than
        # the sequence supports) in the UI instead of a generic crash. The
        # text is escaped because it is rendered by an HTML component.
        return None, f"Error: {escape(str(exc))}"

    html_view = render_structure(pdb_paths[0]) if pdb_paths else ""
    return pdb_paths, html_view
82
+
83
# Gradio UI: sequence text, PDB upload and component list are passed to
# `process`; its two results fill the file download and the HTML viewer.
demo = gr.Interface(
    title="ESM-1b PCA Component Projection with Interactive 3D Structure",
    fn=process,
    inputs=[
        gr.Textbox(label="Input Protein Sequence (1-letter code)"),
        gr.File(label="Upload PDB File", file_types=[".pdb"]),
        gr.Textbox(label="Comma-separated PCA Components (e.g. 0,1,2)"),
    ],
    outputs=[
        gr.File(
            label="Download PDBs with PCA Projections",
            file_types=[".pdb"],
            file_count="multiple",
        ),
        gr.HTML(label="Interactive Structure Viewer (first PC only)"),
    ],
)

# Start serving the interface.
demo.launch()