GitLab CI committed on
Commit
b8c7219
·
0 Parent(s):

Latest changes

Browse files
.gitattributes ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
2
+ *.bin filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ venv
2
+ .gradio
3
+ __pycache__
4
+ data.txt
README.md ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ title: Tmprot
3
+ emoji: 💻
4
+ colorFrom: yellow
5
+ colorTo: pink
6
+ sdk: gradio
7
+ sdk_version: 5.43.1
8
+ app_file: app.py
9
+ pinned: true
10
+ license: gpl-3.0
11
+ short_description: 'Application for protein melting temperature prediction '
12
+ ---
13
+
14
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,171 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import gradio as gr
2
+ import torch
3
+ from pathlib import Path
4
+ from helpers import load_model, parse_fasta_string
5
+ from io import StringIO
6
+ import csv
7
+ import tempfile
8
+ import transformers
9
+ # mute esm warning for weights
10
+ transformers.logging.set_verbosity_error()
11
+
12
+ # Constants
13
+ MODEL_NAME = "esm2_t33_650M_UR50D"
14
+ CURRENT_DIR = Path(__file__).parent
15
+ PATH_MODEL = CURRENT_DIR / "model"
16
+ DEVICE = "cpu"
17
+ # DEVICE = torch.device('cuda' if torch.cuda.is_available() else 'cpu') # around 2 mins for fireprot
18
+ VALID_AMINO_ACIDS = set("ACDEFGHIKLMNPQRSTVWY")
19
+
20
+ model, tokenizer = load_model(MODEL_NAME, PATH_MODEL, DEVICE)
21
+
22
def predict_tm(seq_text, seq_file, threshold):
    """Predict melting temperatures for FASTA input and rank the results.

    Args:
        seq_text: FASTA-formatted text pasted by the user (may be empty or None).
        seq_file: Uploaded FASTA file, or None. When given, it takes precedence
            over ``seq_text``.
        threshold: Tm cutoff (the UI offers it in °C); a sequence is labelled
            "Yes" when its predicted Tm is strictly greater than this value.

    Returns:
        A tuple ``(table, csv_path)``. ``table`` is a list of rows
        ``[rank, id, predicted_tm, "Yes" | "No"]`` sorted by predicted Tm in
        descending order; ``csv_path`` is the path of a temporary CSV file
        holding the same rows plus a header line.

    Raises:
        gr.Error: If no input is provided, the FASTA cannot be parsed, no
            records are found, or a sequence fails the length / alphabet checks.
            The handler is wired to two output components, so a lone returned
            string cannot be displayed; raising ``gr.Error`` surfaces the
            message to the user instead.
    """
    if seq_file is not None:
        # Accept both a plain path string and an object exposing the path as `.name`.
        fasta_path = getattr(seq_file, "name", seq_file)
        with open(fasta_path, "r", encoding="utf-8") as f:
            fasta_str = f.read()
    elif seq_text and seq_text.strip():
        fasta_str = seq_text
    else:
        raise gr.Error("Please provide a sequence via text or file.")

    try:
        records = parse_fasta_string(fasta_str)
    except Exception as e:
        raise gr.Error(f"FASTA parsing failed: {str(e)}") from e

    if not records:
        raise gr.Error("No valid sequences found.")

    # Validate every record up front so a bad entry late in the file is
    # reported before any (slow) model inference has been spent.
    sequences = []
    for record in records:
        seq = record["sequence"].upper()
        if len(seq) < 20:
            raise gr.Error(f"Sequence '{record['id']}' is too short (<20 amino acids).")
        if len(seq) > 2000:
            raise gr.Error(f"Sequence '{record['id']}' is too long (>2000 amino acids).")
        if not set(seq).issubset(VALID_AMINO_ACIDS):
            # Sorted so the message is deterministic across runs.
            invalid = "".join(sorted(set(seq) - VALID_AMINO_ACIDS))
            raise gr.Error(f"Invalid characters in sequence: {invalid}")
        sequences.append((record["id"], seq))

    results = []
    for seq_id, seq in sequences:
        # NOTE(review): max_length=512 with truncation=True means longer
        # sequences are truncated by the tokenizer even though inputs of up to
        # 2000 residues are accepted above — confirm this matches training.
        inputs = tokenizer(seq, return_tensors="pt", max_length=512, truncation=True, padding=True)
        inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
        with torch.no_grad():
            outputs = model(**inputs)
        prediction = outputs.logits.squeeze().item()
        results.append({"id": seq_id, "tm": round(prediction, 2)})

    cutoff = float(threshold)
    results_sorted = sorted(results, key=lambda x: x["tm"], reverse=True)
    table = [
        [rank, r["id"], r["tm"], "Yes" if r["tm"] > cutoff else "No"]
        for rank, r in enumerate(results_sorted, 1)
    ]

    # newline="" lets the csv module control line endings itself.
    with tempfile.NamedTemporaryFile(
        delete=False, suffix=".csv", mode="w", encoding="utf-8", newline=""
    ) as tmp:
        writer = csv.writer(tmp)
        writer.writerow(["Rank", "ID", "Predicted Tm [°C]", "Thermostable"])
        writer.writerows(table)
        tmp_path = tmp.name
    return table, tmp_path
70
+
71
+ demo = gr.Blocks(theme=gr.themes.Origin())
72
+ with demo:
73
+ with gr.Row():
74
+ with gr.Column(scale=1):
75
+ gr.Image("assets/TmProt_logo.png", width=100, height=100, show_label=False, show_download_button=False, container=False, show_share_button=False, show_fullscreen_button=False, interactive=False)
76
+ with gr.Column(scale=7):
77
+ gr.Markdown("""
78
+ # TmProt
79
+ ## Protein Thermostability Predictor
80
+ """)
81
+ gr.Markdown(value="""
82
+ ### TmProt is a machine-learning-based protein thermostability predictor that leverages a fine-tuned ESM-2 protein language model to estimate melting temperatures (Tm) of protein sequences. It enables users to upload protein sequences in FASTA format (either pasted as text or uploaded as a file), and outputs predicted Tm values ranked by a user-defined thermostability threshold.
83
+
84
+ **Paper:** [https://doi.org/10.64898/2026.05.07.723192](https://doi.org/10.64898/2026.05.07.723192)
85
+ **GitHub:** [https://github.com/loschmidt/TmProt](https://github.com/loschmidt/TmProt)
86
+ """
87
+ )
88
+
89
+ with gr.Column(scale=1):
90
+ gr.Image("assets/logo.png", width=100, height=100, show_label=False, show_download_button=False, container=False, show_share_button=False, show_fullscreen_button=False, interactive=False)
91
+ with gr.Row():
92
+ with gr.Column(scale=4):
93
+ seq_text = gr.Textbox(
94
+ label="FASTA sequences",
95
+ lines=6,
96
+ placeholder=">seq\nMKTIIALSYIFCLVFA",
97
+ value="",
98
+ )
99
+ seq_file = gr.File(label="Or upload FASTA file", file_types=[".fasta", ".fa", ".txt"], type="filepath")
100
+ btn = gr.Button("Predict")
101
+ cutoff_bins = [str(x) for x in range(20, 101, 10)]
102
+ cutoff_bar = gr.Radio(
103
+ choices=cutoff_bins,
104
+ label="Select thermostability threshold (°C)",
105
+ info="Default is 60°C",
106
+ value="60"
107
+ )
108
+ with gr.Column(scale=4):
109
+ output = gr.Dataframe(headers=["Rank", "ID", "Predicted Tm [°C]", "Thermostable"], label="Results")
110
+ download_btn = gr.DownloadButton(label="Download CSV")
111
+ btn.click(
112
+ predict_tm,
113
+ inputs=[seq_text, seq_file, cutoff_bar],
114
+ outputs=[output, download_btn]
115
+ )
116
+ with gr.Row():
117
+ gr.Examples(
118
+ examples = [
119
+ [""">I1W5V5
120
+ MSIENLSSNKSFGGWHKQYSHVSNTLNCAMRFAIYLPPQASTGAKVPVLYWLSGLTCSDENFMQKAGAQRLAAELGIAIVAPDTSPRGEGVADDEGYDLGQGAGFYVNATQAPWNRHYQMYDYVVNELPELIESMFPVSDKRAIAGHSMGGHGALTIALRNPERYQSVSAFSPINNPVNCPWGQKAFTAYLGKDTDTWREYDASLLMRAAKQYVPALVDQGEADNFLAEQLKPEVLEAAASSNNYPLELRSHEGYDHSYYFIASFIEDHLRFHSNYLNA
121
+ """,
122
+ None, # seq_file is None (we use text)
123
+ "60" # threshold
124
+ ],
125
+ [
126
+ """>R4YJ85
127
+ MINLEKALAGRRILIVDDLVEARSSLKKMATILGGDNIDVATDGIEAMSLIHEHEYDIVLSDYNLGRTKDGQQILEEARFTQRLRATSLFIVITGENAIDMVMGALEYDPDGYITKPYTLNMLKERLIRIITIKEELRKVNKAIDLQKYDLAIKYCLEVLDSNPRLRLPASRILGQLLMRQKRFQQALKIYSQLLNERSVSWAKLGQAICIFKLGDPNSALALLNRALVDHPLYVQCYDWIAKILLTLDKPLEAQAALEKAIVISPKAVLRQMELGRIAYENGDMVTAEPAFKYSVRLGRFSCHKSAKNYLQFVRSAQALLINPKERQTQNKANEAFRALTELKQDFSDDKDSLFEASIVESKTHLKMENLDEAKRSANDAEDMLAKLECPKIDYKLQMTETFIETDQSVKAQKMIDELKSAELSDKQIIMLNRLDNDLNGEALKRHSTSLNDQGVSHYEKGELEEAIIAFDQATHYEQAGISVLLNSIQAKISLMERDSPDKKILKNVRSLLIRIGEIAKDDERFARYSRLRKTYDRLCRAAAK
128
+ """,
129
+ None,
130
+ "50"
131
+ ],
132
+ ],
133
+ inputs=[seq_text, seq_file, cutoff_bar],
134
+ label="Click an example to try TmProt instantly",
135
+ examples_per_page=2,
136
+ )
137
+ with gr.Row():
138
+ with gr.Column(scale=1):
139
+ pass
140
+ with gr.Column(scale=7):
141
+ gr.Markdown(value="""
142
+ ## Features
143
+
144
+ - Predict protein melting temperature (Tm) from amino acid sequences
145
+
146
+ - Accepts input via FASTA text or FASTA file upload
147
+
148
+ - Supports sequences from 20 to 2000 amino acids in length
149
+
150
+ - Outputs a ranked table with predicted Tm and thermostability status based on user-chosen threshold
151
+
152
+ - CSV download option for easy export and downstream analysis
153
+
154
+ ## Model Overview
155
+
156
+ - Base Model: facebook/esm2_t33_650M_UR50D (650M parameters)
157
+
158
+ - Fine-tuning method: LoRA (Low-Rank Adaptation) using PEFT framework
159
+
160
+ - Task: Regression prediction of protein melting temperature (Tm)
161
+
162
+ - Training Data: ProMelt dataset (merged Meltome Atlas + ProTherm) with ~45,000 protein sequences and experimental Tm values
163
+
164
+ - Output: Single linear regression output neuron predicting Tm in °C
165
+ """
166
+ )
167
+ with gr.Column(scale=1):
168
+ pass
169
+
170
+ if __name__ == "__main__":
171
+ demo.launch(share=True, allowed_paths=['./assets'])
assets/TmProt_logo.png ADDED
assets/TmProt_logo.svg ADDED
assets/logo.png ADDED
helpers.py ADDED
@@ -0,0 +1,38 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ############################################ IMPORTS ###########################################################
2
+ import torch
3
+ from typing import Tuple, List, Dict
4
+ from transformers import AutoTokenizer, EsmForSequenceClassification
5
+ from peft import PeftModel
6
+ from Bio import SeqIO
7
+ from io import StringIO
8
+ ##################################################################################################################
9
+
10
def load_model(
    model_name: str, path_model: str, device: str
) -> Tuple[torch.nn.Module, AutoTokenizer]:
    """
    Build the inference model: a pre-trained ESM checkpoint with a local PEFT adapter on top.

    The tokenizer and an ``EsmForSequenceClassification`` model (``num_labels=1``)
    are loaded from ``facebook/<model_name>``; the adapter stored at ``path_model``
    is then applied with ``PeftModel.from_pretrained``. The resulting model is
    moved to ``device``, switched to eval mode, and has gradients disabled on
    all of its parameters.

    Args:
        model_name (str): Name of the base ESM model (e.g., 'esm2_t33_650M_UR50D').
        path_model (str): Path to the fine-tuned LoRA adapter; converted with ``str()`` before use.
        device (str): Device the base and adapted models are moved to.

    Returns:
        Tuple[torch.nn.Module, AutoTokenizer]: Loaded model and tokenizer.
    """
    hub_id = "facebook/" + model_name
    tokenizer = AutoTokenizer.from_pretrained(hub_id)

    backbone = EsmForSequenceClassification.from_pretrained(hub_id, num_labels=1)
    backbone = backbone.to(device)

    adapted = PeftModel.from_pretrained(
        backbone, str(path_model), is_local=True, local_files_only=True
    )
    adapted = adapted.to(device)
    adapted.eval()

    # Inference only: no parameter should track gradients.
    for weight in adapted.parameters():
        weight.requires_grad = False

    return adapted, tokenizer
34
+
35
def parse_fasta_string(fasta_str: str):
    """Return one ``{"id", "sequence"}`` dict per record parsed from a FASTA string."""
    parsed = []
    for entry in SeqIO.parse(StringIO(fasta_str), "fasta"):
        parsed.append({"id": entry.id, "sequence": str(entry.seq)})
    return parsed
model/README.md ADDED
@@ -0,0 +1,300 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ base_model: facebook/esm2_t33_650M_UR50D
3
+ library_name: peft
4
+ tags:
5
+ - protein
6
+ - esm2
7
+ - regression
8
+ - thermostability
9
+ - LoRA
10
+ - peft
11
+ license: lgpl-3.0
12
+ ---
13
+
14
+ # ESM-2 Protein Thermostability Predictor (LoRA Fine-Tuned)
15
+
16
+ This model is a parameter-efficient fine-tuned version of `facebook/esm2_t33_650M_UR50D` using the `PEFT` (`LoRA`) framework. The model is trained to predict protein thermostability (Tm) using the ProMelt dataset (a combination of Meltome and ProTherm). The output is produced by a single neuron, although modifications such as an MLP head for Tm prediction are planned. No additional fine-tuning using BRENDA was conducted.
17
+
18
+ The model uses a single output neuron for regression, though future improvements (e.g., replacing with an MLP head) are planned.
19
+
20
+
21
+ ## Model Details
22
+
23
+ ### Model Description
24
+ - **Base model:** facebook/esm2_t33_650M_UR50D (650M parameters)
25
+
26
+ - **Fine-tuning method:** LoRA (Low-Rank Adaptation) using PEFT
27
+
28
+ - **Task:** Protein thermostability prediction (regression)
29
+
30
+ - **Data:** ProMelt dataset (train/val/test CSV files)
31
+
32
+ - **Output layer:** Single linear regression head
33
+
34
+ - **Library stack:** Hugging Face Transformers, PEFT, PyTorch, Accelerate, MLflow, DagsHub
35
+
36
+ ### Model Features
37
+
38
+ - Parameter-efficient fine-tuning (LoRA) for memory and compute savings
39
+
40
+ - Cosine learning rate schedule
41
+
42
+ - Mixed precision (fp16) training via Accelerate
43
+
44
+ - Early stopping and best model selection based on RMSE
45
+
46
+ - Automatic MLflow logging and artifact tracking
47
+
48
+ ### Additional details
49
+
50
+ - **Developed by:** Loschmidt Laboratories
51
+ - **Model type:** Protein sequence regression model (ESM-2 backbone + LoRA adapter)
52
+ - **Language(s) (NLP):** Protein sequences (amino acids as chars)
53
+ - **License:** This project is licensed under the GNU Lesser General Public License v3.0.
54
+ - **Finetuned from model:** facebook/esm2_t33_650M_UR50D
55
+
56
+ ### Model Sources [optional]
57
+
58
+ <!-- Provide the basic links for the model. -->
59
+
60
+ - **Repository:** [\[LL repo\]](https://git.loschmidt.cz/tmprot/tmprot-predictor)
61
+ - **Paper [optional]:** [In progress]
62
+ - **Demo [optional]:** [In progress]
63
+
64
+ ## Usage
65
+
66
+ <!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->
67
+
68
+ ### Direct Use
69
+ ```
70
+ cd src/tmprot
71
+ python cli.py -i ../../test/FIR.fasta -o ../../predictions/ -d "\t"
72
+ ```
73
+
74
+ ### Out-of-Scope Use
75
+ The generated $Tm$-aware embeddings from the optimized ESM-2 model can be used as features for an MLPRegressor.
76
+
77
+ ## Bias, Risks, and Limitations
78
+ Predictions do not generalize well outside the proteomics-based ProMelt dataset, thus the results on the independent sets are worse.
79
+ Additionally:
80
+
81
+ - It does not account for post-translational modifications or environmental factors (e.g., pH, salt, ions).
82
+
83
+ ### Recommendations
84
+ - Use outputs in combination with experimental or domain expertise.
85
+
86
+ - Consider ensemble methods or downstream MLP for robustness.
87
+
88
+ ## How to Get Started with the Model
89
+ Prepare a FASTA file with your protein(s).
90
+
91
+ Use the CLI to predict:
92
+
93
+ python cli.py -i path/to/input.fasta -o path/to/output_directory -d "\t"
94
+
95
+ The output CSV file contains the following columns:
96
+ ```
97
+ Protein_ID, Sequence, Predicted_Tm
98
+ ```
99
+
100
+ For code integration, use the TmPredictor class in `src/tmprot/cli.py`.
101
+ ## Training Details
102
+
103
+ ### Training Data
104
+
105
+ The model was trained on the ProMelt dataset — a curated combination of the Meltome Atlas and ProTherm datasets, containing protein sequences with experimentally measured melting temperatures using proteomics-based approaches. Sequences were filtered to remove duplicates and split into train/val/test sets at a sequence identity threshold of 25%. The CSV files are stored in `../data/promelt/`.
106
+
107
+ ### Training Procedure
108
+ #### Preprocessing
109
+ - Sequences longer than 2000 AAs were filtered out.
110
+ - Sequences tokenized using ESM-2 tokenizer from Hugging Face Transformers.
111
+ - Batched using `DefaultDataCollator` with dynamic padding.
112
+ #### Training Hyperparameters
113
+
114
+ | Parameter | Value |
115
+ |------------------------|------------------------------|
116
+ | Model | facebook/esm2_t33_650M_UR50D |
117
+ | LoRA rank | 1 |
118
+ | LoRA alpha | 1 |
119
+ | LoRA dropout | 0.28 |
120
+ | Learning rate | 4.92e-4 |
121
+ | Weight decay | 1.56e-5 |
122
+ | Gradient clipping | 0.805 |
123
+ | Batch size | 4 |
124
+ | Epochs | 1 |
125
+ | Precision | fp16 (mixed) |
126
+ | Scheduler | Cosine |
127
+ | Optimizer | AdamW |
128
+ | Evaluation strategy | Per epoch |
129
+ | Save strategy | Per epoch |
130
+ | Best model selection | Based on RMSE |
131
+ | Gradient checkpointing | Enabled |
132
+ | MLflow tracking | Enabled (via DagsHub) |
133
+ | Seed | 8893 |
134
+
135
+ - LoRA target modules: query, key, and value
136
+
137
+ - Loss function: MSE loss (via Trainer for regression)
138
+
139
+ - Evaluation metrics: RMSE, R2, Pearson, Spearman
140
+ #### Speeds, Sizes, Times [optional]
141
+ - ~4200 seconds for training and evaluation.
142
+ - Inference speed: ~5 sec/protein
143
+ - 7.3M size for `model` folder with adapters and updated weights.
144
+
145
+ ## Evaluation
146
+
147
+ The model was evaluated on training, validation, and test datasets using multiple regression metrics to assess performance in predicting protein thermostability (Tm). Evaluation was performed after training for one epoch, with early stopping based on the validation RMSE.
148
+ ### Testing Data, Factors & Metrics
149
+
150
+ #### Testing Data
151
+
152
+ The test set consists of ~7300 proteins held out from ProMelt. Care was taken to ensure no >25% sequence identity with training samples.
153
+
154
+ #### Factors
155
+
156
+ [More Information Needed]
157
+
158
+ #### Metrics
159
+ - RMSE (Root Mean Square Error): Measures average prediction error magnitude.
160
+
161
+ - R2 Score (Coefficient of Determination): Indicates the proportion of variance explained by the model.
162
+
163
+ - PCC (Pearson's Correlation Coefficient): Measures linear correlation between predicted and actual Tm values.
164
+
165
+ - SCC (Spearman's Correlation Coefficient): Measures monotonic relationship between predicted and actual Tm values.
166
+
167
+ ### Results
168
+ #### Internal Evaluation Results (ProMelt Train/Val/Test)
169
+ | **Metric** | **Train** | **Validation** | **Test** |
170
+ | ------------------ | --------: | -------------: | -------: |
171
+ | **Loss** | 31.14 | 34.94 | 39.48 |
172
+ | **RMSE** | 5.58 | 5.91 | 6.28 |
173
+ | **R² Score** | 0.685 | 0.656 | 0.687 |
174
+ | **PCC (Pearson)** | 0.828 | 0.810 | 0.830 |
175
+ | **SCC (Spearman)** | 0.635 | 0.585 | 0.617 |
176
+ | **Runtime (s)** | 1602.62 | 178.19 | 337.08 |
177
+ | **Samples/sec** | 21.44 | 21.45 | 21.45 |
178
+ | **Steps/sec** | 5.36 | 5.37 | 5.36 |
179
+ | **Epoch** | 1 | 1 | 1 |
180
+
181
+ #### Independent evaluation
182
+ | **Dataset** | **RMSE** | **R² Score** | **PCC (Pearson)** | **SCC (Spearman)** |
183
+ | ----------------- | -------: | -----------: | ----------------: | -----------------: |
184
+ | **BRENDA** | 15.31 | 0.209 | 0.6693 | 0.5175 |
185
+ | **FireProt** | 14.01 | 0.0618 | 0.5802 | 0.4306 |
186
+ | **ASR** | 7.36 | -0.0749 | 0.2226 | 0.2515 |
187
+ | **CAS** | 6.50 | 0.223 | 0.6330 | 0.4461 |
188
+ | **HLD** | 6.70 | -0.232 | 0.3090 | 0.2722 |
189
+
190
+ These metrics indicate that the model achieves good regression performance on the protein thermostability prediction task, with reasonable generalization from training to test data.
191
+
192
+ #### Summary
193
+
194
+ This model is a LoRA fine-tuned version of the ESM-2 PLM (facebook/esm2_t33_650M_UR50D) designed to predict protein thermostability (Tm) from sequence data. The training was conducted on the ProMelt dataset with a single output regression head. Evaluation shows consistent performance across training, validation, and test splits with RMSE around 5.6-6.3 and good correlation metrics (R2 ~0.65-0.69, PCC ~0.81-0.83). This model provides a lightweight, efficient solution for protein thermostability prediction with potential applications in protein engineering and stability screening.
195
+
196
+ ---
197
+
198
+ ## Model Examination [optional]
199
+
200
+ Interpretability analyses for this model remain to be conducted. Future work may include:
201
+
202
+ - Visualization of attention maps to identify sequence regions most relevant for thermostability.
203
+ - Embedding space analysis to examine clustering of proteins by thermostability.
204
+
205
+
206
+ These studies will help illuminate how the LoRA adapters modulate the ESM-2 backbone to capture thermostability-related features.
207
+
208
+ ---
209
+ ## Environmental Impact
210
+
211
+ <!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->
212
+
213
+ Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).
214
+
215
+ - **Hardware Type:** 10 GB part (MIG) A100
216
+ - **Hours used:** ?
217
+ - **Cloud Provider:** Metacentrum
218
+ - **Compute Region:** Czech Republic
219
+ - **Carbon Emitted:** ?
220
+
221
+ The use of LoRA parameter-efficient fine-tuning significantly reduces training time and energy consumption compared to full model fine-tuning, contributing to lower carbon footprint.
222
+ ## Technical Specifications [optional]
223
+
224
+ ### Model Architecture and Objective
225
+
226
+ - **Backbone:** ESM-2 PLM with 650 million parameters
227
+ - **Fine-tuning:** LoRA adapters applied to attention query, key, and value modules
228
+ - **Output:** Single linear regression head predicting protein melting temperature (Tm)
229
+ - **Objective:** Minimize RMSE between predicted and measured Tm values
230
+
231
+ ### Compute Infrastructure
232
+
233
+ Training utilized a single NVIDIA A100 GPU with mixed precision enabled via the Accelerate library to optimize memory and speed.
234
+
235
+
236
+ #### Hardware
237
+
238
+ - GPU: NVIDIA A100 10GB
239
+ - RAM: 16 GB
240
+
241
+
242
+
243
+ #### Software
244
+
245
+ - Python 3.9+
246
+ - PyTorch==2.5.1
247
+ - transformers==4.47.1
248
+ - pandas==2.2.3
249
+ - accelerate==1.1.1
250
+ - datasets==3.1.0
251
+ - peft==0.13.2
252
+ - scipy==1.14.1
253
+ - scikit-learn==1.5.2
254
+ - prettytable==3.12.0
255
+ - mlflow==2.18.0
256
+ - dagshub (latest stable)
257
+ - optuna (latest stable)
258
+ - seaborn==0.13.2
259
+
260
+
261
+ ## Citation [optional]
262
+ Paper: In progress. A manuscript detailing this model's methodology and performance is currently being prepared and will be linked here once published.
263
+
264
+ **BibTeX:**
265
+
266
+ [TODO]
267
+
268
+ **APA:**
269
+
270
+ [TODO]
271
+
272
+ ## Glossary [optional]
273
+ Tm (Melting Temperature): The temperature at which half of the protein denatures.
274
+
275
+ LoRA (Low-Rank Adaptation): A parameter-efficient fine-tuning method that inserts trainable rank-decomposed matrices into each layer of the transformer.
276
+
277
+ RMSE (Root Mean Squared Error): Common regression metric measuring average model prediction error.
278
+
279
+ PCC (Pearson Correlation Coefficient): Measures the linear correlation between predicted and true values.
280
+
281
+ SCC (Spearman Correlation Coefficient): Measures the rank correlation between predicted and true values.
282
+
283
+ fp16 (Mixed Precision): A technique that uses 16-bit floating point numbers for faster and more memory-efficient training.
284
+
285
+ ## More Information [optional]
286
+
287
+ For additional details, updates, and community discussion:
288
+
289
+ Repository: https://git.loschmidt.cz/tmprot/tmprot-predictor
290
+ ## Model Card Authors [optional]
291
+ - karen.pailozian@fnusa.cz
292
+ - add contacts ...
293
+
294
+ Loschmidt Laboratories (Masaryk University)
295
+ ## Model Card Contact
296
+ Issue Tracker: https://git.loschmidt.cz/tmprot/tmprot-predictor/issues
297
+
298
+ ### Framework versions
299
+
300
+ - PEFT 0.13.2
model/adapter_config.json ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "alpha_pattern": {},
3
+ "auto_mapping": null,
4
+ "base_model_name_or_path": "facebook/esm2_t33_650M_UR50D",
5
+ "bias": "none",
6
+ "fan_in_fan_out": false,
7
+ "inference_mode": true,
8
+ "init_lora_weights": true,
9
+ "layer_replication": null,
10
+ "layers_pattern": null,
11
+ "layers_to_transform": null,
12
+ "loftq_config": {},
13
+ "lora_alpha": 1,
14
+ "lora_dropout": 0.2793910667846842,
15
+ "megatron_config": null,
16
+ "megatron_core": "megatron.core",
17
+ "modules_to_save": [
18
+ "classifier",
19
+ "score"
20
+ ],
21
+ "peft_type": "LORA",
22
+ "r": 1,
23
+ "rank_pattern": {},
24
+ "revision": null,
25
+ "target_modules": [
26
+ "value",
27
+ "query",
28
+ "key"
29
+ ],
30
+ "task_type": "TOKEN_CLS",
31
+ "use_dora": false,
32
+ "use_rslora": false
33
+ }
model/adapter_model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d7bef27e36de848973fe6b78864882f899c2ad59bd17100d7e9d0eac4c24ff85
3
+ size 7605796
model/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "<cls>",
3
+ "eos_token": "<eos>",
4
+ "mask_token": "<mask>",
5
+ "pad_token": "<pad>",
6
+ "unk_token": "<unk>"
7
+ }
model/tokenizer_config.json ADDED
@@ -0,0 +1,53 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "added_tokens_decoder": {
3
+ "0": {
4
+ "content": "<cls>",
5
+ "lstrip": false,
6
+ "normalized": false,
7
+ "rstrip": false,
8
+ "single_word": false,
9
+ "special": true
10
+ },
11
+ "1": {
12
+ "content": "<pad>",
13
+ "lstrip": false,
14
+ "normalized": false,
15
+ "rstrip": false,
16
+ "single_word": false,
17
+ "special": true
18
+ },
19
+ "2": {
20
+ "content": "<eos>",
21
+ "lstrip": false,
22
+ "normalized": false,
23
+ "rstrip": false,
24
+ "single_word": false,
25
+ "special": true
26
+ },
27
+ "3": {
28
+ "content": "<unk>",
29
+ "lstrip": false,
30
+ "normalized": false,
31
+ "rstrip": false,
32
+ "single_word": false,
33
+ "special": true
34
+ },
35
+ "32": {
36
+ "content": "<mask>",
37
+ "lstrip": false,
38
+ "normalized": false,
39
+ "rstrip": false,
40
+ "single_word": false,
41
+ "special": true
42
+ }
43
+ },
44
+ "clean_up_tokenization_spaces": false,
45
+ "cls_token": "<cls>",
46
+ "eos_token": "<eos>",
47
+ "extra_special_tokens": {},
48
+ "mask_token": "<mask>",
49
+ "model_max_length": 1000000000000000019884624838656,
50
+ "pad_token": "<pad>",
51
+ "tokenizer_class": "EsmTokenizer",
52
+ "unk_token": "<unk>"
53
+ }
model/training_args.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c922ec8f05bb769d8a866b0e2ca376e7ad800ada8814c7738724ebd6570df080
3
+ size 5432
model/vocab.txt ADDED
@@ -0,0 +1,33 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ <cls>
2
+ <pad>
3
+ <eos>
4
+ <unk>
5
+ L
6
+ A
7
+ G
8
+ V
9
+ S
10
+ E
11
+ R
12
+ T
13
+ I
14
+ D
15
+ P
16
+ K
17
+ Q
18
+ N
19
+ F
20
+ Y
21
+ M
22
+ H
23
+ W
24
+ C
25
+ X
26
+ B
27
+ U
28
+ Z
29
+ O
30
+ .
31
+ -
32
+ <null_1>
33
+ <mask>
requirements.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ torch==2.5.1
2
+ transformers==4.46.3
3
+ peft==0.16.0
4
+ biopython==1.85.0