{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "from Bio import SeqIO\n",
    "from DeepPD.data_helper import Data2EqlTensor,Seqs2EqlTensor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('LLSEVEELNMSLTALREK', 18)"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "file_path = './homo_test.fa'\n",
    "\n",
    "# Gather each FASTA record as an (id, sequence string) pair.\n",
    "data = [(rec.id, str(rec.seq)) for rec in SeqIO.parse(file_path, 'fasta')]\n",
    "\n",
    "# Peek at the first sequence together with its length.\n",
    "data[0][1], len(data[0][1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "default_padding_value: 1\n",
      "length>40: 0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "torch.Size([6, 40])"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Convert the (id, sequence) pairs into one tensor; 40 is the length argument.\n",
    "# In the saved run this produced a tensor of shape [6, 40].\n",
    "seqs, ids = Data2EqlTensor(data, 40)\n",
    "seqs.shape"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "tensor([[ 4,  4,  8,  9,  7,  9,  9,  4, 17, 20,  8,  4, 11,  5,  4, 10,  9, 15,\n",
       "          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1],\n",
       "        [11,  5, 21, 19,  6,  8,  4, 14, 16, 15,  8, 21,  6, 10,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1],\n",
       "        [ 7, 17, 18, 21, 18, 12,  4, 18, 17, 17,  7, 13,  6, 21,  4, 19,  9,  4,\n",
       "         13,  6, 10,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1],\n",
       "        [17, 16, 22, 16,  4,  8,  5, 13, 13,  4, 15, 15,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1],\n",
       "        [ 7,  4,  7,  5,  4, 19,  9,  9, 14,  9, 15, 14, 17,  8,  5,  4, 13, 18,\n",
       "          4, 15,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1],\n",
       "        [16,  5, 11, 11, 12, 12,  5, 13, 17, 12, 12, 18,  4,  8, 13, 16, 11, 15,\n",
       "          9, 15,  9,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,\n",
       "          1,  1,  1,  1]])"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Show the whole encoded tensor; the log printed above reports a default padding value of 1.\n",
    "seqs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Some weights of the model checkpoint at ./DeepPD/BERT were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias']\n",
      "- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
      "- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n"
     ]
    }
   ],
   "source": [
    "from DeepPD.predictor import predict\n",
    "import torch"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.\n",
    "device = \"cuda\" if torch.cuda.is_available() else \"cpu\"\n",
    "\n",
    "def homo_classifier(file, threshold, max_len=40, weight_path='./weight-Homo/4.pth'):\n",
    "    \"\"\"Run the saved classifier over every sequence in a FASTA file.\n",
    "\n",
    "    Args:\n",
    "        file: FASTA source handed to ``SeqIO.parse`` (format 'fasta').\n",
    "        threshold: Cut-off value forwarded unchanged to ``predict``.\n",
    "        max_len: Length argument forwarded to ``Data2EqlTensor``; defaults to 40.\n",
    "        weight_path: Checkpoint path forwarded to ``predict``; defaults to\n",
    "            './weight-Homo/4.pth'.\n",
    "\n",
    "    Returns:\n",
    "        The value returned by ``predict``. In the saved run of this notebook\n",
    "        that was a list of ``[id, sequence, score, label]`` rows.\n",
    "    \"\"\"\n",
    "    records = [(record.id, str(record.seq)) for record in SeqIO.parse(file, 'fasta')]\n",
    "    # Data2EqlTensor returns a second value that this function does not use.\n",
    "    seqs, _unused_ids = Data2EqlTensor(records, max_len)\n",
    "    # NOTE(review): predict receives the full record list; if Data2EqlTensor ever\n",
    "    # drops over-length sequences, seqs and records would no longer line up - confirm.\n",
    "    return predict(seqs, records, weight_path, threshold, device)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "default_padding_value: 1\n",
      "length>40: 0\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "[['peptide_1', 'LLSEVEELNMSLTALREK', '0.296', 'Non-Peptide'],\n",
       " ['peptide_2', 'TAHYGSLPQKSHGR', '0.013', 'Non-Peptide'],\n",
       " ['peptide_3', 'VNFHFILFNNVDGHLYELDGR', '0.809', 'Peptide'],\n",
       " ['peptide_4', 'NQWQLSADDLKK', '0.827', 'Peptide'],\n",
       " ['peptide_5', 'VLVALYEEPEKPNSALDFLK', '0.868', 'Peptide'],\n",
       " ['peptide_6', 'QATTIIADNIIFLSDQTKEKE', '0.043', 'Non-Peptide']]"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Classify the example FASTA file with a cut-off of 0.5 and display the result.\n",
    "out = homo_classifier(file_path, threshold=0.5)\n",
    "out"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "env3.8",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.0"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}