File size: 11,336 Bytes
db4c2cd |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 |
{
"cells": [
{
"cell_type": "code",
"execution_count": 15,
"id": "78731790-cecc-4e7b-9599-c35a9fad1c11",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"A gerar embeddings …\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of EsmModel were not initialized from the model checkpoint at facebook/esm2_t33_650M_UR50D and are newly initialized: ['esm.pooler.dense.bias', 'esm.pooler.dense.weight']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"A fazer predições individuais …\n",
"1/1 [==============================] - 0s 47ms/step\n",
"1/1 [==============================] - 0s 33ms/step\n",
"1/1 [==============================] - 0s 30ms/step\n"
]
},
{
"ename": "ValueError",
"evalue": "in user code:\n\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2341, in predict_function *\n return step_function(self, iterator)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2327, in step_function **\n outputs = model.distribute_strategy.run(run_step, args=(data,))\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2315, in run_step **\n outputs = model.predict_step(data)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2283, in predict_step\n return self(x, training=False)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\utils\\traceback_utils.py\", line 70, in error_handler\n raise e.with_traceback(filtered_tb) from None\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\input_spec.py\", line 298, in assert_input_compatibility\n raise ValueError(\n\n ValueError: Input 0 of layer \"sequential\" is incompatible with the layer: expected shape=(None, 1779), found shape=(None, 1791)\n",
"output_type": "error",
"traceback": [
"\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[1;31mValueError\u001b[0m Traceback (most recent call last)",
"Cell \u001b[1;32mIn[15], line 47\u001b[0m\n\u001b[0;32m 45\u001b[0m \u001b[38;5;66;03m# --- 4. Ensemble (stacking) -----------------------------------------------\u001b[39;00m\n\u001b[0;32m 46\u001b[0m X_stack \u001b[38;5;241m=\u001b[39m np\u001b[38;5;241m.\u001b[39mconcatenate([y_pb, y_bfd, y_esm], axis\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m---> 47\u001b[0m y_ens \u001b[38;5;241m=\u001b[39m \u001b[43mstacking\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpredict\u001b[49m\u001b[43m(\u001b[49m\u001b[43mX_stack\u001b[49m\u001b[43m)\u001b[49m\n\u001b[0;32m 49\u001b[0m \u001b[38;5;66;03m# --- 5. Carregar MultiLabelBinarizer ---------------------------------------\u001b[39;00m\n\u001b[0;32m 50\u001b[0m mlb \u001b[38;5;241m=\u001b[39m joblib\u001b[38;5;241m.\u001b[39mload(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mdata/mlb_597.pkl\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n",
"File \u001b[1;32m~\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\utils\\traceback_utils.py:70\u001b[0m, in \u001b[0;36mfilter_traceback.<locals>.error_handler\u001b[1;34m(*args, **kwargs)\u001b[0m\n\u001b[0;32m 67\u001b[0m filtered_tb \u001b[38;5;241m=\u001b[39m _process_traceback_frames(e\u001b[38;5;241m.\u001b[39m__traceback__)\n\u001b[0;32m 68\u001b[0m \u001b[38;5;66;03m# To get the full stack trace, call:\u001b[39;00m\n\u001b[0;32m 69\u001b[0m \u001b[38;5;66;03m# `tf.debugging.disable_traceback_filtering()`\u001b[39;00m\n\u001b[1;32m---> 70\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m e\u001b[38;5;241m.\u001b[39mwith_traceback(filtered_tb) \u001b[38;5;28;01mfrom\u001b[39;00m\u001b[38;5;250m \u001b[39m\u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[0;32m 71\u001b[0m \u001b[38;5;28;01mfinally\u001b[39;00m:\n\u001b[0;32m 72\u001b[0m \u001b[38;5;28;01mdel\u001b[39;00m filtered_tb\n",
"File \u001b[1;32m~\\AppData\\Local\\Temp\\__autograph_generated_filen1meoyfq.py:15\u001b[0m, in \u001b[0;36mouter_factory.<locals>.inner_factory.<locals>.tf__predict_function\u001b[1;34m(iterator)\u001b[0m\n\u001b[0;32m 13\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[0;32m 14\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mTrue\u001b[39;00m\n\u001b[1;32m---> 15\u001b[0m retval_ \u001b[38;5;241m=\u001b[39m ag__\u001b[38;5;241m.\u001b[39mconverted_call(ag__\u001b[38;5;241m.\u001b[39mld(step_function), (ag__\u001b[38;5;241m.\u001b[39mld(\u001b[38;5;28mself\u001b[39m), ag__\u001b[38;5;241m.\u001b[39mld(iterator)), \u001b[38;5;28;01mNone\u001b[39;00m, fscope)\n\u001b[0;32m 16\u001b[0m \u001b[38;5;28;01mexcept\u001b[39;00m:\n\u001b[0;32m 17\u001b[0m do_return \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mFalse\u001b[39;00m\n",
"\u001b[1;31mValueError\u001b[0m: in user code:\n\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2341, in predict_function *\n return step_function(self, iterator)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2327, in step_function **\n outputs = model.distribute_strategy.run(run_step, args=(data,))\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2315, in run_step **\n outputs = model.predict_step(data)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\training.py\", line 2283, in predict_step\n return self(x, training=False)\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\utils\\traceback_utils.py\", line 70, in error_handler\n raise e.with_traceback(filtered_tb) from None\n File \"C:\\Users\\Melvin\\anaconda3\\envs\\protein_env\\lib\\site-packages\\keras\\src\\engine\\input_spec.py\", line 298, in assert_input_compatibility\n raise ValueError(\n\n ValueError: Input 0 of layer \"sequential\" is incompatible with the layer: expected shape=(None, 1779), found shape=(None, 1791)\n"
]
}
],
"source": [
"# %%\n",
"import numpy as np\n",
"import torch\n",
"from transformers import AutoTokenizer, AutoModel\n",
"from tensorflow.keras.models import load_model\n",
"import joblib\n",
"\n",
"# Parâmetros\n",
"SEQ_FASTA = \"MFNVESVERVELCESLLTWIQTFNVDAPCQTAEDLTNGVVMSQVLQKIDPVYFDDNWLNRIKTEVGDNWRLKISNLKKILKGILDYNHEILGQQINDFTLPDVNLIGEHSDAAELGRMLQLILGCAVNCEQKQEYIQAIMMMEESVQHVVMTAIQELMSKESPVSAGHDAYVDLDRQLKKTTEELNEALSAKEEIAQRCHELDMQVAALQEEKSSLLAENQILMERLNQSDSIEDPNSPAGRRHLQLQTQLEQLQEETFRLEAAKDDYRIRCEELEKEISELRQQNDELTTLADEAQSLKDEIDVLRHSSDKVSKLEGQVESYKKKLEDLGDLRRQVKLLEEKNTMYMQNTVSLEEELRKANAARGQLETYKRQVVELQNRLSDESKKADKLDFEYKRLKEKVDGLQKEKDRLRTERDSLKETIEELRCVQAQEGQLTTQGLMPLGSQESSDSLAAEIVTPEIREKLIRLQHENKMLKLNQEDSDNEKIALLQSLLDDANLRKNELETENRLVNQRLLEVQSQVEELQKSLQDQGSKAEDSVLLKKKLEEHLEKLHEANNELQKKRAIIEDLEPRFNNSSLRIEELQEALRKKEEEMKQMEERYKKYLEKAKSVIRTLDPKQNQGAAPEIQALKNQLQERDRLFHSLEKEYEKTKSQRDMEEKYIVSAWYNMGMTLHKKAAEDRLASTGSGQSFLARQRQATSTRRSYPGHVQPATAR\" # (mantém a tua sequência completa)\n",
"TOP_N = 10\n",
"THRESH = 0.37 \n",
"\n",
"# Helper functions\n",
"def get_embedding_mean(model_name, seq, chunk):\n",
"    \"\"\"Return a single embedding for `seq`, averaged over fixed-size chunks.\n",
"\n",
"    The sequence is cut into consecutive pieces of at most `chunk` residues.\n",
"    Each piece is tokenized with its residues separated by spaces, run through\n",
"    the model, and represented by the last hidden state at token position 0.\n",
"    The per-chunk vectors are then averaged with equal weight per chunk.\n",
"\n",
"    Args:\n",
"        model_name: Identifier handed to `AutoTokenizer.from_pretrained` and\n",
"            `AutoModel.from_pretrained`.\n",
"        seq: Amino-acid sequence as a plain, non-empty string.\n",
"        chunk: Maximum number of residues per chunk; must be positive.\n",
"\n",
"    Returns:\n",
"        NumPy array of shape (1, dim) holding the chunk-averaged embedding.\n",
"\n",
"    Raises:\n",
"        ValueError: If `seq` is empty or `chunk` is not positive. Without these\n",
"            guards an empty chunk list makes `np.mean` return NaN silently.\n",
"    \"\"\"\n",
"    if not seq:\n",
"        raise ValueError(\"seq must be a non-empty amino-acid sequence\")\n",
"    if chunk <= 0:\n",
"        raise ValueError(f\"chunk must be a positive integer, got {chunk!r}\")\n",
"\n",
"    tokenizer = AutoTokenizer.from_pretrained(model_name, do_lower_case=False)\n",
"    model = AutoModel.from_pretrained(model_name)\n",
"    model.eval()\n",
"\n",
"    reps = []\n",
"    # Inference only: a single no_grad context covers every chunk.\n",
"    with torch.no_grad():\n",
"        for start in range(0, len(seq), chunk):\n",
"            piece = seq[start:start + chunk]\n",
"            tokens = tokenizer(\" \".join(piece), return_tensors=\"pt\", truncation=False, padding=False)\n",
"            # Batch size is 1, so [0, 0, :] is the position-0 vector of this chunk.\n",
"            reps.append(model(**tokens).last_hidden_state[0, 0, :].numpy())\n",
"    return np.mean(reps, axis=0, keepdims=True)  # shape (1, dim)\n",
"\n",
"# Embeddings\n",
"print(\"A gerar embeddings …\")\n",
"emb_pb = get_embedding_mean(\"Rostlab/prot_bert\", SEQ_FASTA, 512)\n",
"emb_bfd = get_embedding_mean(\"Rostlab/prot_bert_bfd\", SEQ_FASTA, 512)\n",
"emb_esm = get_embedding_mean(\"facebook/esm2_t33_650M_UR50D\", SEQ_FASTA, 1024)\n",
"\n",
"# Load the three base MLPs and the stacking meta-model\n",
"mlp_pb = load_model(\"models/mlp_protbert.h5\")\n",
"mlp_bfd = load_model(\"models/mlp_protbertbfd.h5\")\n",
"mlp_esm = load_model(\"models/mlp_esm2.h5\")\n",
"stacking = load_model(\"models/ensemble_stack.h5\")\n",
"\n",
"# --- 5. Load the MultiLabelBinarizer\n",
"# Loaded before predicting so the label count comes from the binarizer itself\n",
"# instead of a repeated magic number.\n",
"# NOTE(review): joblib.load unpickles its input; only load files you trust.\n",
"mlb = joblib.load(\"data/mlb_597.pkl\")\n",
"GO = mlb.classes_\n",
"n_labels = len(GO)\n",
"\n",
"# Base MLP predictions, truncated to the labels the binarizer knows\n",
"print(\"A fazer predições individuais …\")\n",
"y_pb = mlp_pb.predict(emb_pb)[:, :n_labels]\n",
"y_bfd = mlp_bfd.predict(emb_bfd)[:, :n_labels]\n",
"y_esm = mlp_esm.predict(emb_esm)[:, :n_labels]\n",
"\n",
"# --- 4. Ensemble (stacking)\n",
"X_stack = np.concatenate([y_pb, y_bfd, y_esm], axis=1)\n",
"\n",
"# Fail fast with a per-model breakdown when the stacked width does not match\n",
"# what the meta-model expects, rather than surfacing a Keras-internal error.\n",
"expected_width = stacking.input_shape[-1]\n",
"if expected_width is not None and X_stack.shape[1] != expected_width:\n",
"    raise ValueError(\n",
"        f\"Stacking model expects {expected_width} input features but received \"\n",
"        f\"{X_stack.shape[1]} (ProtBERT={y_pb.shape[1]}, ProtBERT-BFD={y_bfd.shape[1]}, \"\n",
"        f\"ESM-2={y_esm.shape[1]}; label binarizer has {n_labels} classes). \"\n",
"        \"Use the base models and label set the stacking model was trained with.\"\n",
"    )\n",
"y_ens = stacking.predict(X_stack)\n",
"\n",
"# --- 6. Helper that prints the results for one model\n",
"def print_results(name, y_pred):\n",
"    \"\"\"Print the thresholded GO terms and the top-scoring GO terms for one model.\n",
"\n",
"    Only the first row of `y_pred` is reported. Relies on the module-level\n",
"    `mlb`, `GO`, `THRESH` and `TOP_N`.\n",
"\n",
"    Args:\n",
"        name: Heading printed above the results.\n",
"        y_pred: 2-D array of scores; columns are indexed like `GO`.\n",
"    \"\"\"\n",
"    print(f\"\\n {name}\")\n",
"\n",
"    # Decode every label whose score reaches the threshold.\n",
"    above_threshold = (y_pred >= THRESH).astype(int)\n",
"    decoded = mlb.inverse_transform(above_threshold)\n",
"    first_row_terms = decoded[0]\n",
"    print(f\" GO terms com prob ≥ {THRESH}:\")\n",
"    if first_row_terms:\n",
"        print(\" \", first_row_terms)\n",
"    else:\n",
"        print(\" \", \"Nenhum\")\n",
"\n",
"    # Rank the first row's scores in descending order and keep the best TOP_N.\n",
"    row = y_pred[0]\n",
"    ranking = np.argsort(-row)[:TOP_N]\n",
"    print(f\" Top {TOP_N} mais prováveis:\")\n",
"    for label_idx in ranking:\n",
"        print(f\" {GO[label_idx]} : {row[label_idx]:.4f}\")\n",
"\n",
"# Print the report for each base model, then for the ensemble\n",
"for label, scores in (\n",
"    (\"ProtBERT (MLP)\", y_pb),\n",
"    (\"ProtBERT-BFD (MLP)\", y_bfd),\n",
"    (\"ESM-2 (MLP)\", y_esm),\n",
"    (\"Ensemble (Stacking)\", y_ens),\n",
"):\n",
"    print_results(label, scores)\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "70a3035b-01cd-4c63-b34d-d520d2aa88bf",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
|