update from 10/06/2025
Browse files- app.py +144 -142
- introduction_page.md +106 -0
app.py
CHANGED
|
@@ -63,34 +63,39 @@ def select_best_model(
|
|
| 63 |
|
| 64 |
# Definition of the tools for the MCP server
|
| 65 |
# Function to return a fasta file
|
| 66 |
-
def create_fasta_file(
|
| 67 |
"""Create a FASTA file from a protein sequence string with a unique name.
|
| 68 |
|
| 69 |
Args:
|
| 70 |
-
|
| 71 |
-
name (str, optional):
|
| 72 |
-
seq_name (str, optional): The name/identifier for the sequence. Defaults to "
|
| 73 |
|
| 74 |
|
| 75 |
Returns:
|
| 76 |
str: Name of the created FASTA file
|
| 77 |
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 78 |
# Remove any trailing/leading whitespace but preserve line breaks
|
| 79 |
-
lines =
|
| 80 |
|
| 81 |
# Check if the first line is a FASTA header
|
| 82 |
if not lines[0].startswith('>'):
|
| 83 |
# If no header provided, add one
|
| 84 |
if seq_name is None:
|
| 85 |
-
seq_name = "
|
| 86 |
-
|
| 87 |
|
| 88 |
# Create FASTA content (preserving line breaks)
|
| 89 |
-
fasta_content =
|
| 90 |
|
| 91 |
# Generate a unique file name
|
| 92 |
unique_id = hashlib.sha256(uuid4().bytes).hexdigest()[:8]
|
| 93 |
-
file_name = f"
|
| 94 |
file_path = here / "inputs/fasta" / file_name
|
| 95 |
|
| 96 |
# Write the FASTA file
|
|
@@ -113,7 +118,7 @@ def create_json_config(
|
|
| 113 |
num_trunk_recycles (int): Number of trunk recycles from slider
|
| 114 |
seed (int): Random seed from slider
|
| 115 |
options (list): List of selected options from checkbox group
|
| 116 |
-
name (str, optional):
|
| 117 |
|
| 118 |
Returns:
|
| 119 |
str: Name of the created JSON file
|
|
@@ -132,7 +137,7 @@ def create_json_config(
|
|
| 132 |
}
|
| 133 |
|
| 134 |
# Generate file name based on provided name or unique ID
|
| 135 |
-
file_name = f"
|
| 136 |
file_path = here / "inputs/config" / file_name
|
| 137 |
|
| 138 |
# Write the JSON file
|
|
@@ -142,43 +147,43 @@ def create_json_config(
|
|
| 142 |
|
| 143 |
# Function to compute Chai1 inference
|
| 144 |
def compute_Chai1(
|
| 145 |
-
|
| 146 |
-
|
| 147 |
):
|
| 148 |
"""Compute a Chai1 simulation.
|
| 149 |
|
| 150 |
Args:
|
| 151 |
-
|
| 152 |
If not provided, uses the default input file.
|
| 153 |
-
|
| 154 |
If not provided, uses the default quick inference configuration.
|
| 155 |
|
| 156 |
Returns:
|
| 157 |
-
|
| 158 |
"""
|
|
|
|
| 159 |
with app.run():
|
| 160 |
-
|
| 161 |
force_redownload = False
|
| 162 |
|
| 163 |
print("🧬 checking inference dependencies")
|
| 164 |
download_inference_dependencies.remote(force=force_redownload)
|
| 165 |
|
| 166 |
-
#
|
| 167 |
-
if not
|
| 168 |
-
|
| 169 |
-
print(f"🧬 running Chai inference on {
|
| 170 |
-
|
| 171 |
-
print(
|
| 172 |
-
fasta_content = Path(
|
| 173 |
|
| 174 |
# Define inference config file
|
| 175 |
-
if not
|
| 176 |
-
|
| 177 |
-
|
| 178 |
-
print(f"🧬 loading Chai inference config from {
|
| 179 |
-
inference_config = json.loads(Path(
|
| 180 |
|
| 181 |
-
#
|
| 182 |
run_id = hashlib.sha256(uuid4().bytes).hexdigest()[:8] # short id
|
| 183 |
print(f"🧬 running inference with {run_id=}")
|
| 184 |
|
|
@@ -189,24 +194,82 @@ def compute_Chai1(
|
|
| 189 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 190 |
|
| 191 |
print(f"🧬 saving results to disk locally in {output_dir}")
|
|
|
|
|
|
|
|
|
|
|
|
|
| 192 |
for ii, (scores, cif) in enumerate(results):
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
|
| 197 |
-
|
| 198 |
-
scores_to_print=["aggregate_score", "ptm", "iptm"],
|
| 199 |
-
number_of_scores=len(results),
|
| 200 |
-
results_dir=str(Path(output_dir, "score"))
|
| 201 |
-
)
|
| 202 |
-
# Take the last cif file and convert it to pdb
|
| 203 |
-
cif_name = str(Path(output_dir, "molecules"))+"/"+str(run_id)+"-preds.model_idx_"+str(best_model)+".cif"
|
| 204 |
-
pdb_name = cif_name.split('.cif')[0] + '.pdb'
|
| 205 |
-
st = gemmi.read_structure(cif_name)
|
| 206 |
-
st.write_minimal_pdb(pdb_name)
|
| 207 |
|
| 208 |
-
return
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
# Create the Gradio interface
|
| 212 |
reps = [{"model": 0,"style": "cartoon","color": "hydrophobicity"}]
|
|
@@ -223,100 +286,16 @@ with gr.Blocks(theme=theme) as demo:
|
|
| 223 |
|
| 224 |
gr.Image("images/logo1.png", show_label=False,width=400)
|
| 225 |
|
| 226 |
-
|
| 227 |
-
|
| 228 |
-
|
| 229 |
-
# Stakes
|
| 230 |
-
|
| 231 |
-
The industry is being deeply changed by the development of LLMs and the recent possibilities to provide them access to external tools.
|
| 232 |
-
For years now companies are using simulation tools in order faster and reduce the development cost of a product.
|
| 233 |
-
One of the challenge in the coming years will be to create agents that can setup, run and process simulations to faster the development of new products.
|
| 234 |
-
|
| 235 |
-
# Objective
|
| 236 |
-
|
| 237 |
-
This project is a first step in this creating AI agents that perform simulations on existing softwares.
|
| 238 |
-
1) Several domains are of major interest:
|
| 239 |
-
- CFD (Computational Fluid Dynamics) simulations
|
| 240 |
-
- Biology simulations (Protein Folding, Molecular Dynamics, etc.)
|
| 241 |
-
- All applications that use neural networks
|
| 242 |
-
|
| 243 |
-
--> This project is focused on the protein folding domain, but the same principles can be applied to other domains.
|
| 244 |
-
|
| 245 |
-
2) Generally, industrial computations are performed on HPC clusters, which have access to large ressources.
|
| 246 |
-
|
| 247 |
-
--> The simulation need to run on a separate server
|
| 248 |
-
|
| 249 |
-
3) The LLM needs to be able to access the simulation results in order to provide a complete answer to the user.
|
| 250 |
-
|
| 251 |
-
--> The simulation results need to be accessible by the LLM
|
| 252 |
-
|
| 253 |
-
## Modal
|
| 254 |
-
|
| 255 |
-
Modal (https://modal.com/) is a serverless platform that provides a simple way to run any application with the latest CPU and GPU hardware.
|
| 256 |
-
|
| 257 |
-
## Chai-1 Model
|
| 258 |
-
|
| 259 |
-
Chai-1 (https://www.chaidiscovery.com/blog/introducing-chai-1) is a multi-modal foundation model for molecular structure prediction that performs at the state-of-the-art across a variety of benchmarks.
|
| 260 |
-
Chai-1 enables unified prediction of proteins, small molecules, DNA, RNA, glycosylations, and more.
|
| 261 |
-
Chai-1 use on Modal server is an example on how to run folding simulations.
|
| 262 |
-
Thus, it is a good choice to start with.
|
| 263 |
-
|
| 264 |
-
# Instructions
|
| 265 |
-
1. Upload a Fasta sequence file containing the molecule sequence.
|
| 266 |
-
2. Click the "Run" button to start the simulation.
|
| 267 |
-
3. The output will be a 3D visualization of the molecule.
|
| 268 |
-
|
| 269 |
-
## Simulation parameters choice
|
| 270 |
-
If no config or fasta files are created, default values are chosen:
|
| 271 |
-
- chai1_default_input.fasta
|
| 272 |
-
- chai1_quick_inference.json
|
| 273 |
-
|
| 274 |
-
The files content is diplayed at the bottom of the page.
|
| 275 |
-
The default json configuration makes the computation fast (about 2min) but results can be disappointing.
|
| 276 |
-
Please use chai1_default_inference.json to have a wonderful protein 😃.
|
| 277 |
-
|
| 278 |
-
- chai1_default_input.fasta
|
| 279 |
-
```
|
| 280 |
-
>protein|name=example-of-long-protein
|
| 281 |
-
AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMFGCDLGPDGRLLRGYDQSAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPAEITLTWQWDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPEPLTLRWEP
|
| 282 |
-
>protein|name=example-of-short-protein
|
| 283 |
-
AIQRTPKIQVYSRHPAENGKSNFLNCYVSGFHPSDIEVDLLKNGERIEKVEHSDLSFSKDWSFYLLYYTEFTPTEKDEYACRVNHVTLSQPKIVKWDRDM
|
| 284 |
-
>protein|name=example-peptide
|
| 285 |
-
GAAL
|
| 286 |
-
>ligand|name=example-ligand-as-smiles
|
| 287 |
-
CCCCCCCCCCCCCC(=O)O
|
| 288 |
-
```
|
| 289 |
-
- chai1_quick_inference.json
|
| 290 |
-
```json
|
| 291 |
-
{
|
| 292 |
-
"num_trunk_recycles": 1,
|
| 293 |
-
"num_diffn_timesteps": 10,
|
| 294 |
-
"seed": 42,
|
| 295 |
-
"use_esm_embeddings": true
|
| 296 |
-
"use_msa_server": false
|
| 297 |
-
}
|
| 298 |
-
```
|
| 299 |
-
|
| 300 |
-
# Work performed
|
| 301 |
-
This interface allows you to run Chai1 simulations on a given Fasta sequence file.
|
| 302 |
-
The Chai1 model is designed to predict the 3D structure of proteins based on their amino acid sequences.
|
| 303 |
-
You can input a Fasta file containing the sequence of the molecule you want to simulate, and the output will be a 3D representation of the molecule based on the Chai1 model.
|
| 304 |
-
|
| 305 |
-
You can input a Fasta file containing the sequence of the molecule you want to simulate.
|
| 306 |
-
The output will be a 3D representation of the molecule based on the Chai1 model.
|
| 307 |
-
|
| 308 |
-
# Disclaimer
|
| 309 |
-
This interface is for educational and research purposes only. The results may vary based on the input sequence and the Chai1 model's capabilities.
|
| 310 |
-
# Contact
|
| 311 |
-
For any issues or questions, please contact the developer or refer to the documentation.
|
| 312 |
-
""")
|
| 313 |
|
| 314 |
|
| 315 |
with gr.Tab("Configuration 📦"):
|
| 316 |
|
| 317 |
gr.Markdown(
|
| 318 |
"""
|
| 319 |
-
## Fasta file and configuration generator
|
| 320 |
""")
|
| 321 |
|
| 322 |
with gr.Row():
|
|
@@ -324,15 +303,15 @@ with gr.Blocks(theme=theme) as demo:
|
|
| 324 |
slider_nb = gr.Slider(1, 500, value=300, label="Number of diffusion time steps", info="Choose the number of diffusion time steps for the simulation", step=1, interactive=True, elem_id="num_iterations")
|
| 325 |
slider_trunk = gr.Slider(1, 5, value=3, label="Number of trunk recycles", info="Choose the number of iterations for the simulation", step=1, interactive=True, elem_id="trunk_number")
|
| 326 |
slider_seed = gr.Slider(1, 100, value=42, label="Seed", info="Choose the seed", step=1, interactive=True, elem_id="seed")
|
| 327 |
-
check_options = gr.CheckboxGroup(["ESM_embeddings", "MSA_server"], value=["ESM_embeddings",], label="
|
| 328 |
-
config_name = gr.Textbox(placeholder="Enter a name for the
|
| 329 |
button_json = gr.Button("Create Config file")
|
| 330 |
button_json.click(fn=create_json_config, inputs=[slider_nb, slider_trunk, slider_seed, check_options, config_name], outputs=[])
|
| 331 |
|
| 332 |
|
| 333 |
with gr.Column(scale=1):
|
| 334 |
fasta_input = gr.Textbox(placeholder="Fasta format sequences", label="Fasta content", lines=10)
|
| 335 |
-
fasta_name = gr.Textbox(placeholder="Enter
|
| 336 |
fasta_button = gr.Button("Create Fasta file")
|
| 337 |
fasta_button.click(fn=create_fasta_file, inputs=[fasta_input, fasta_name], outputs=[])
|
| 338 |
|
|
@@ -352,29 +331,52 @@ with gr.Blocks(theme=theme) as demo:
|
|
| 352 |
inp1 = gr.FileExplorer(root_dir=here / "inputs/fasta",
|
| 353 |
value="chai1_default_input.fasta",
|
| 354 |
label="Input Fasta file",
|
| 355 |
-
file_count='single'
|
| 356 |
-
glob="*.fasta")
|
| 357 |
|
| 358 |
with gr.Column(scale=1):
|
| 359 |
inp2 = gr.FileExplorer(root_dir=here / "inputs/config",
|
| 360 |
value="chai1_quick_inference.json",
|
| 361 |
label="Configuration file",
|
| 362 |
-
file_count='single'
|
| 363 |
-
glob="*.json")
|
| 364 |
btn_refresh = gr.Button("Refresh available files")
|
| 365 |
|
| 366 |
# Only workaround I found to update the file explorer
|
| 367 |
def update_file_explorer():
|
|
|
|
| 368 |
return gr.FileExplorer(root_dir=here), gr.FileExplorer(root_dir=here)
|
| 369 |
def update_file_explorer_2():
|
|
|
|
| 370 |
return gr.FileExplorer(root_dir=here / "inputs/fasta"), gr.FileExplorer(root_dir=here / "inputs/config")
|
| 371 |
|
| 372 |
btn_refresh.click(update_file_explorer, outputs=[inp1,inp2]).then(update_file_explorer_2, outputs=[inp1, inp2])
|
| 373 |
|
| 374 |
-
out = Molecule3D(label="Plot the 3D Molecule", reps=reps)
|
| 375 |
-
|
| 376 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 377 |
|
|
|
|
|
|
|
|
|
|
| 378 |
|
| 379 |
# Launch both the Gradio web interface and the MCP server
|
| 380 |
if __name__ == "__main__":
|
|
|
|
| 63 |
|
| 64 |
# Definition of the tools for the MCP server
|
| 65 |
# Function to return a fasta file
|
| 66 |
+
def create_fasta_file(file_content: str, name: Optional[str] = None, seq_name: Optional[str] = None) -> str:
|
| 67 |
"""Create a FASTA file from a protein sequence string with a unique name.
|
| 68 |
|
| 69 |
Args:
|
| 70 |
+
file_content (str): The content of the FASTA file required with optional line breaks
|
| 71 |
+
name (str, optional): FASTA file name ending with .fasta ideally. If not provided, a unique ID will be generated
|
| 72 |
+
seq_name (str, optional): The name/identifier for the sequence. Defaults to "protein"
|
| 73 |
|
| 74 |
|
| 75 |
Returns:
|
| 76 |
str: Name of the created FASTA file
|
| 77 |
"""
|
| 78 |
+
# If the file_content is empty, raise an error
|
| 79 |
+
if not file_content.strip():
|
| 80 |
+
print("Fasta file content cannot be empty so the example fasta file will be used")
|
| 81 |
+
file_content = ">protein|name=example-protein\nAGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVRFD"
|
| 82 |
+
|
| 83 |
# Remove any trailing/leading whitespace but preserve line breaks
|
| 84 |
+
lines = file_content.strip().split('\n')
|
| 85 |
|
| 86 |
# Check if the first line is a FASTA header
|
| 87 |
if not lines[0].startswith('>'):
|
| 88 |
# If no header provided, add one
|
| 89 |
if seq_name is None:
|
| 90 |
+
seq_name = "protein"
|
| 91 |
+
file_content = f">{seq_name}\n{file_content}"
|
| 92 |
|
| 93 |
# Create FASTA content (preserving line breaks)
|
| 94 |
+
fasta_content = file_content
|
| 95 |
|
| 96 |
# Generate a unique file name
|
| 97 |
unique_id = hashlib.sha256(uuid4().bytes).hexdigest()[:8]
|
| 98 |
+
file_name = f"{name if name else unique_id}"
|
| 99 |
file_path = here / "inputs/fasta" / file_name
|
| 100 |
|
| 101 |
# Write the FASTA file
|
|
|
|
| 118 |
num_trunk_recycles (int): Number of trunk recycles from slider
|
| 119 |
seed (int): Random seed from slider
|
| 120 |
options (list): List of selected options from checkbox group
|
| 121 |
+
name (str, optional): JSON config file name ending with .json ideally. If not provided, a unique ID will be generated
|
| 122 |
|
| 123 |
Returns:
|
| 124 |
str: Name of the created JSON file
|
|
|
|
| 137 |
}
|
| 138 |
|
| 139 |
# Generate file name based on provided name or unique ID
|
| 140 |
+
file_name = f"{name if name else hashlib.sha256(uuid4().bytes).hexdigest()[:8]}"
|
| 141 |
file_path = here / "inputs/config" / file_name
|
| 142 |
|
| 143 |
# Write the JSON file
|
|
|
|
| 147 |
|
| 148 |
# Function to compute Chai1 inference
|
| 149 |
def compute_Chai1(
|
| 150 |
+
fasta_file_name: Optional[str] = "",
|
| 151 |
+
inference_config_file_name: Optional[str] = "",
|
| 152 |
):
|
| 153 |
"""Compute a Chai1 simulation.
|
| 154 |
|
| 155 |
Args:
|
| 156 |
+
fasta_file_name (str, optional): FASTA file name to use for the Chai1 simulation.
|
| 157 |
If not provided, uses the default input file.
|
| 158 |
+
inference_config_file_name (str, optional): JSON configuration file name for inference.
|
| 159 |
If not provided, uses the default quick inference configuration.
|
| 160 |
|
| 161 |
Returns:
|
| 162 |
+
pd.DataFrame: DataFrame containing model scores and CIF file paths
|
| 163 |
"""
|
| 164 |
+
import pandas as pd
|
| 165 |
with app.run():
|
|
|
|
| 166 |
force_redownload = False
|
| 167 |
|
| 168 |
print("🧬 checking inference dependencies")
|
| 169 |
download_inference_dependencies.remote(force=force_redownload)
|
| 170 |
|
| 171 |
+
# Define fasta file
|
| 172 |
+
if not fasta_file_name:
|
| 173 |
+
fasta_file_name = here / "inputs/fasta" / "chai1_default_input.fasta"
|
| 174 |
+
print(f"🧬 running Chai inference on {fasta_file_name}")
|
| 175 |
+
fasta_file_name = here / "inputs/fasta" / fasta_file_name
|
| 176 |
+
print(fasta_file_name)
|
| 177 |
+
fasta_content = Path(fasta_file_name).read_text()
|
| 178 |
|
| 179 |
# Define inference config file
|
| 180 |
+
if not inference_config_file_name:
|
| 181 |
+
inference_config_file_name = here / "inputs/config" / "chai1_quick_inference.json"
|
| 182 |
+
inference_config_file_name = here / "inputs/config" / inference_config_file_name
|
| 183 |
+
print(f"🧬 loading Chai inference config from {inference_config_file_name}")
|
| 184 |
+
inference_config = json.loads(Path(inference_config_file_name).read_text())
|
| 185 |
|
| 186 |
+
# Generate a unique run ID
|
| 187 |
run_id = hashlib.sha256(uuid4().bytes).hexdigest()[:8] # short id
|
| 188 |
print(f"🧬 running inference with {run_id=}")
|
| 189 |
|
|
|
|
| 194 |
output_dir.mkdir(parents=True, exist_ok=True)
|
| 195 |
|
| 196 |
print(f"🧬 saving results to disk locally in {output_dir}")
|
| 197 |
+
|
| 198 |
+
# Create lists to store data for DataFrame
|
| 199 |
+
model_data = []
|
| 200 |
+
|
| 201 |
for ii, (scores, cif) in enumerate(results):
|
| 202 |
+
score_file = Path(output_dir, "score") / f"{run_id}-scores.model_idx_{ii}.npz"
|
| 203 |
+
cif_file = Path(output_dir, "molecules") / f"{run_id}-preds.model_idx_{ii}.cif"
|
| 204 |
+
|
| 205 |
+
score_file.write_bytes(scores)
|
| 206 |
+
cif_file.write_text(cif)
|
| 207 |
+
|
| 208 |
+
# Load score data
|
| 209 |
+
data = load(str(score_file))
|
| 210 |
+
|
| 211 |
+
if not data["has_inter_chain_clashes"][0]:
|
| 212 |
+
model_data.append({
|
| 213 |
+
"Model Index": ii,
|
| 214 |
+
"Aggregate Score": float(data["aggregate_score"][0]),
|
| 215 |
+
"PTM": float(data["ptm"][0]),
|
| 216 |
+
"IPTM": float(data["iptm"][0]),
|
| 217 |
+
"CIF File": str(cif_file).split("/")[-1], # Get just the file name
|
| 218 |
+
})
|
| 219 |
|
| 220 |
+
# Create DataFrame from collected data
|
| 221 |
+
results_df = pd.DataFrame(model_data).sort_values("Aggregate Score", ascending=False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 222 |
|
| 223 |
+
return results_df
|
| 224 |
+
|
| 225 |
+
|
| 226 |
+
# Function to plot the 3D protein structure
|
| 227 |
+
def plot_protein(result_df) -> str:
    """Return the PDB file path for the first (best) model listed in ``result_df``.

    The "CIF File" entry of the first row is resolved under
    ``results/molecules`` (relative to the current working directory) and
    converted to PDB with gemmi, unless a PDB file with the same stem is
    already present there.

    Args:
        result_df (pd.DataFrame): Table with a "CIF File" column holding CIF
            file names. The first row is used as-is, so the caller is
            expected to have sorted the table best-model-first.

    Returns:
        str: Path to the PDB file of the first model, or "" when the table
            is missing or has no rows.
    """
    # Return an empty string rather than None so the declared return type holds.
    if result_df is None or result_df.empty:
        return ""

    best_cif = Path("results/molecules") / str(result_df.iloc[0]["CIF File"])

    # Swap only the extension; a plain str.replace would also rewrite any
    # other ".cif" occurring earlier in the name.
    pdb_file = best_cif.with_suffix(".pdb")

    # Convert CIF to PDB only when the PDB file is not already on disk.
    if not pdb_file.exists():
        st = gemmi.read_structure(str(best_cif))
        st.write_minimal_pdb(str(pdb_file))

    return str(pdb_file)
|
| 251 |
+
|
| 252 |
+
# Function to plot a CIF file
|
| 253 |
+
def show_cif_file(cif_file):
    """Convert a CIF file to PDB and return the PDB path for the 3D viewer.

    Args:
        cif_file: The CIF file to display, given either as a path string or
            as an object whose ``name`` attribute holds the path (e.g. an
            uploaded file wrapper). A falsy value means no file was provided.

    Returns:
        str or None: Path of the PDB file written next to the CIF file, or
            None when no file was provided. Errors raised by gemmi while
            reading or writing are not caught here and propagate to the
            caller.
    """
    if not cif_file:
        return None

    # Accept both a plain path and a file-like wrapper exposing ``.name``.
    cif_path = Path(getattr(cif_file, "name", cif_file))
    st = gemmi.read_structure(str(cif_path))
    pdb_file = cif_path.with_suffix(".pdb")
    st.write_minimal_pdb(str(pdb_file))  # gemmi is given a str, not a Path

    return str(pdb_file)
|
| 273 |
|
| 274 |
# Create the Gradio interface
|
| 275 |
reps = [{"model": 0,"style": "cartoon","color": "hydrophobicity"}]
|
|
|
|
| 286 |
|
| 287 |
gr.Image("images/logo1.png", show_label=False,width=400)
|
| 288 |
|
| 289 |
+
with open("introduction_page.md", "r") as f:
|
| 290 |
+
intro_md = f.read()
|
| 291 |
+
gr.Markdown(intro_md)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 292 |
|
| 293 |
|
| 294 |
with gr.Tab("Configuration 📦"):
|
| 295 |
|
| 296 |
gr.Markdown(
|
| 297 |
"""
|
| 298 |
+
## Fasta file and configuration generator (optional)
|
| 299 |
""")
|
| 300 |
|
| 301 |
with gr.Row():
|
|
|
|
| 303 |
slider_nb = gr.Slider(1, 500, value=300, label="Number of diffusion time steps", info="Choose the number of diffusion time steps for the simulation", step=1, interactive=True, elem_id="num_iterations")
|
| 304 |
slider_trunk = gr.Slider(1, 5, value=3, label="Number of trunk recycles", info="Choose the number of iterations for the simulation", step=1, interactive=True, elem_id="trunk_number")
|
| 305 |
slider_seed = gr.Slider(1, 100, value=42, label="Seed", info="Choose the seed", step=1, interactive=True, elem_id="seed")
|
| 306 |
+
check_options = gr.CheckboxGroup(["ESM_embeddings", "MSA_server"], value=["ESM_embeddings",], label="Additional options", info="Options to use ESM embeddings and MSA server", elem_id="options")
|
| 307 |
+
config_name = gr.Textbox(placeholder="Enter a name for the json file (optional)", label="JSON file name")
|
| 308 |
button_json = gr.Button("Create Config file")
|
| 309 |
button_json.click(fn=create_json_config, inputs=[slider_nb, slider_trunk, slider_seed, check_options, config_name], outputs=[])
|
| 310 |
|
| 311 |
|
| 312 |
with gr.Column(scale=1):
|
| 313 |
fasta_input = gr.Textbox(placeholder="Fasta format sequences", label="Fasta content", lines=10)
|
| 314 |
+
fasta_name = gr.Textbox(placeholder="Enter the name of the fasta file name (optional)", label="Fasta file name")
|
| 315 |
fasta_button = gr.Button("Create Fasta file")
|
| 316 |
fasta_button.click(fn=create_fasta_file, inputs=[fasta_input, fasta_name], outputs=[])
|
| 317 |
|
|
|
|
| 331 |
inp1 = gr.FileExplorer(root_dir=here / "inputs/fasta",
|
| 332 |
value="chai1_default_input.fasta",
|
| 333 |
label="Input Fasta file",
|
| 334 |
+
file_count='single')
|
|
|
|
| 335 |
|
| 336 |
with gr.Column(scale=1):
|
| 337 |
inp2 = gr.FileExplorer(root_dir=here / "inputs/config",
|
| 338 |
value="chai1_quick_inference.json",
|
| 339 |
label="Configuration file",
|
| 340 |
+
file_count='single')
|
|
|
|
| 341 |
btn_refresh = gr.Button("Refresh available files")
|
| 342 |
|
| 343 |
# Only workaround I found to update the file explorer
|
| 344 |
def update_file_explorer():
|
| 345 |
+
"""Don't need to be used by LLMs, but useful for the interface to update the file explorer"""
|
| 346 |
return gr.FileExplorer(root_dir=here), gr.FileExplorer(root_dir=here)
|
| 347 |
def update_file_explorer_2():
|
| 348 |
+
"""Don't need to be used by LLMs, but useful for the interface to update the file explorer"""
|
| 349 |
return gr.FileExplorer(root_dir=here / "inputs/fasta"), gr.FileExplorer(root_dir=here / "inputs/config")
|
| 350 |
|
| 351 |
btn_refresh.click(update_file_explorer, outputs=[inp1,inp2]).then(update_file_explorer_2, outputs=[inp1, inp2])
|
| 352 |
|
| 353 |
+
#out = Molecule3D(label="Plot the 3D Molecule", reps=reps)
|
| 354 |
+
out = gr.DataFrame(
|
| 355 |
+
headers=["Model Index", "Aggregate Score", "PTM", "IPTM", "CIF File"],
|
| 356 |
+
datatype=["number", "number", "number", "number", "str"],
|
| 357 |
+
label="Inference Results sorted by Aggregate Score",
|
| 358 |
+
visible=True,
|
| 359 |
+
)
|
| 360 |
+
out2 = Molecule3D(label="Plot the 3D Molecule", reps=reps)
|
| 361 |
+
|
| 362 |
+
btn = gr.Button("Run Simulation")
|
| 363 |
+
btn.click(fn=compute_Chai1, inputs=[inp1 , inp2], outputs=[out]).then(
|
| 364 |
+
fn=plot_protein,
|
| 365 |
+
inputs=out,
|
| 366 |
+
outputs=out2
|
| 367 |
+
)
|
| 368 |
+
|
| 369 |
+
|
| 370 |
+
with gr.Tab("Show protein from a cif file 💻"):
|
| 371 |
+
|
| 372 |
+
gr.Markdown(
|
| 373 |
+
"""
|
| 374 |
+
## Plot a 3D structure from a CIF file
|
| 375 |
+
""")
|
| 376 |
|
| 377 |
+
cif_input = gr.File(label="Input CIF file", file_count='single')
|
| 378 |
+
cif_output = Molecule3D(label="Plot the 3D Molecule", reps=reps)
|
| 379 |
+
cif_input.change(fn=show_cif_file, inputs=cif_input, outputs=cif_output)
|
| 380 |
|
| 381 |
# Launch both the Gradio web interface and the MCP server
|
| 382 |
if __name__ == "__main__":
|
introduction_page.md
ADDED
|
@@ -0,0 +1,106 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
<style>
|
| 2 |
+
code[class*="language-bash"], pre[class*="language-bash"] {
|
| 3 |
+
background: #fff !important;
|
| 4 |
+
}
|
| 5 |
+
</style>
|
| 6 |
+
|
| 7 |
+
# Stakes
|
| 8 |
+
|
| 9 |
+
The industry is being deeply changed by the development of LLMs and the recent possibilities to provide them access to external tools. For years, companies have used simulation tools to accelerate and reduce the cost of product development. One of the main challenges in the coming years will be to create agents that can set up, run, and process simulations to further accelerate innovation.
|
| 10 |
+
|
| 11 |
+
# Objective
|
| 12 |
+
|
| 13 |
+
This project is a first step in creating AI agents that perform simulations on existing software. Key domains include:
|
| 14 |
+
- **CFD** (Computational Fluid Dynamics) simulations
|
| 15 |
+
- **Biology** (Protein Folding, Molecular Dynamics, etc.)
|
| 16 |
+
- **Neural network applications**
|
| 17 |
+
|
| 18 |
+
This project focuses on protein folding, but the same principles can be applied to other domains. In particular it uses [Chai-1](https://www.chaidiscovery.com/blog/introducing-chai-1), which is a multi-modal foundation model for molecular structure prediction, performing at state-of-the-art levels across a variety of benchmarks. Chai-1 enables unified prediction of proteins, small molecules, DNA, RNA, glycosylations, and more. Using Chai-1 on Modal is a great example of running folding simulations.
|
| 19 |
+
|
| 20 |
+
Industrial computations are often performed on HPC clusters with large resources, so simulations typically run on separate servers. The LLM must be able to access simulation results to provide complete answers to users. For this purpose, the project uses [Modal](https://modal.com/), a serverless platform that provides a simple way to run any application with the latest CPU and GPU hardware.
|
| 21 |
+
|
| 22 |
+
[Demonstration](https://www.youtube.com/watch?v=P9cAKxJ9Zh8)
|
| 23 |
+
|
| 24 |
+
---
|
| 25 |
+
|
| 26 |
+
# Instructions
|
| 27 |
+
|
| 28 |
+
<div style="background-color:#f5f5f5; border-radius:8px; padding:18px 24px; margin-bottom:24px; border:1px solid #cccccc;">
|
| 29 |
+
|
| 30 |
+
### 1. <span style="color:#2563eb;">(Optional) Create your JSON configuration file</span>
|
| 31 |
+
<small>Default configuration is available if you skip this step.</small>
|
| 32 |
+
|
| 33 |
+
- Set your simulation parameters and generate the JSON config file. A unique identifier will be assigned (e.g., `chai_{run_id}_config.json`).
|
| 34 |
+
- **Parameters:**
|
| 35 |
+
- <b>Number of diffusion time steps:</b> 1 to 500
|
| 36 |
+
- <b>Number of trunk recycles:</b> 1 to 5
|
| 37 |
+
- <b>Seed:</b> 1 to 100
|
| 38 |
+
- <b>ESM_embeddings:</b> Include or not
|
| 39 |
+
- <b>MSA_server:</b> Include or not
|
| 40 |
+
|
| 41 |
+
### 2. <span style="color:#2563eb;">(Optional) Upload a FASTA file with your molecule sequence</span>
|
| 42 |
+
<small>Default FASTA files are available if you skip this step.</small>
|
| 43 |
+
|
| 44 |
+
- Write your FASTA content and create the file. A unique identifier will be assigned (e.g., `chai_{run_id}_input.fasta`).
|
| 45 |
+
- <b style="color:#b91c1c;">Warning:</b> The header must be well formatted for Chai1 to process it.
|
| 46 |
+
|
| 47 |
+
**FASTA template:**
|
| 48 |
+
```fasta
|
| 49 |
+
>{molecule_type}|{molecule_name}
|
| 50 |
+
Sequence (for protein/RNA/DNA) or SMILES for ligand
|
| 51 |
+
```
|
| 52 |
+
|
| 53 |
+
**Accepted molecule types:**
|
| 54 |
+
`protein`/ `rna`/ `dna` / `ligand`
|
| 55 |
+
|
| 56 |
+
**Default input (provided by Chai1):**
|
| 57 |
+
```fasta
|
| 58 |
+
>protein|name=example-of-long-protein
|
| 59 |
+
AGSHSMRYFSTSVSRPGRGEPRFIAVGYVDDTQFVRFDSDAASPRGEPRAPWVEQEGPEYWDRETQKYKRQAQTDRVSLRNLRGYYNQSEAGSHTLQWMFGCDLGPDGRLLRGYDQSAYDGKDYIALNEDLRSWTAADTAAQITQRKWEAAREAEQRRAYLEGTCVEWLRRYLENGKETLQRAEHPKTHVTHHPVSDHEATLRCWALGFYPAEITLTWQWDGEDQTQDTELVETRPAGDGTFQKWAAVVVPSGEEQRYTCHVQHEGLPEPLTLRWEP
|
| 60 |
+
|
| 61 |
+
>protein|name=example-of-short-protein
|
| 62 |
+
AIQRTPKIQVYSRHPAENGKSNFLNCYVSGFHPSDIEVDLLKNGERIEKVEHSDLSFSKDWSFYLLYYTEFTPTEKDEYACRVNHVTLSQPKIVKWDRDM
|
| 63 |
+
|
| 64 |
+
>protein|name=example-peptide
|
| 65 |
+
GAAL
|
| 66 |
+
|
| 67 |
+
>ligand|name=example-ligand-as-smiles
|
| 68 |
+
CCCCCCCCCCCCCC(=O)O
|
| 69 |
+
```
|
| 70 |
+
<small>For a peptide, use `protein` as the molecule type.</small>
|
| 71 |
+
|
| 72 |
+
**Other example:**
|
| 73 |
+
```fasta
|
| 74 |
+
>protein|lysozyme
|
| 75 |
+
MNIFEMLRIDEGLRLKIYKDTEGYYTIGIGHLLTKSPDLNAAKSELDKAIGRNCNGVITKDEAEKLFNQDVDAAVRGILRNAKLKPVYDSLDAVRRCAAINQVFQMGETGVAGFTNSLRMLQQKRWDEAAVNLAKSRWYNQTPDRAKRVITTFRTGTWDAYKNL
|
| 76 |
+
```
|
| 77 |
+
### 3. <span style="color:#2563eb;">Select your config and FASTA files</span>
|
| 78 |
+
<small>Files are stored in your working directory as you create them.</small>
|
| 79 |
+
|
| 80 |
+
### 4. <span style="color:#2563eb;">Click the "Run" button to start the simulation</span>
|
| 81 |
+
|
| 82 |
+
### 5. <span style="color:#2563eb;">View the 3D visualization of your molecule</span>
|
| 83 |
+
</div>
|
| 84 |
+
|
| 85 |
+
## Simulation parameters choice
|
| 86 |
+
If no config or fasta files are created, default values are chosen:
|
| 87 |
+
- chai1_default_input.fasta
|
| 88 |
+
- chai1_quick_inference.json
|
| 89 |
+
|
| 90 |
+
The content of these files is displayed at the bottom of the page.
|
| 91 |
+
The default json configuration makes the computation fast (about 2min) but results can be disappointing.
|
| 92 |
+
Please use chai1_default_inference.json to have a wonderful protein 😃.
|
| 93 |
+
|
| 94 |
+
- chai1_quick_inference.json
|
| 95 |
+
```json
|
| 96 |
+
{
|
| 97 |
+
"num_trunk_recycles": 1,
|
| 98 |
+
"num_diffn_timesteps": 10,
|
| 99 |
+
"seed": 42,
|
| 100 |
+
"use_esm_embeddings": true,
|
| 101 |
+
"use_msa_server": false
|
| 102 |
+
}
|
| 103 |
+
```
|
| 104 |
+
|
| 105 |
+
# Contact
|
| 106 |
+
For any issues or questions, please contact the developer or refer to the documentation.
|