Christina Theodoris
committed on
Commit
·
67f674c
1
Parent(s):
d20ad0a
Add uniform max len for padding for predictions
Browse files
examples/gene_classification.ipynb
CHANGED
|
@@ -139,14 +139,15 @@
|
|
| 139 |
"metadata": {},
|
| 140 |
"outputs": [],
|
| 141 |
"source": [
|
| 142 |
-
"def preprocess_classifier_batch(cell_batch):\n",
|
| 143 |
-
"
|
|
|
|
| 144 |
" def pad_label_example(example):\n",
|
| 145 |
" example[\"labels\"] = np.pad(example[\"labels\"], \n",
|
| 146 |
-
" (0,
|
| 147 |
" mode='constant', constant_values=-100)\n",
|
| 148 |
" example[\"input_ids\"] = np.pad(example[\"input_ids\"], \n",
|
| 149 |
-
" (0,
|
| 150 |
" mode='constant', constant_values=token_dictionary.get(\"<pad>\"))\n",
|
| 151 |
" example[\"attention_mask\"] = (example[\"input_ids\"] != token_dictionary.get(\"<pad>\")).astype(int)\n",
|
| 152 |
" return example\n",
|
|
@@ -158,10 +159,19 @@
|
|
| 158 |
" predict_logits = []\n",
|
| 159 |
" predict_labels = []\n",
|
| 160 |
" model.eval()\n",
|
| 161 |
-
"
|
| 162 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 163 |
" batch_evalset = evalset.select([i for i in range(i, max_range)])\n",
|
| 164 |
-
" padded_batch = preprocess_classifier_batch(batch_evalset)\n",
|
| 165 |
" padded_batch.set_format(type=\"torch\")\n",
|
| 166 |
" \n",
|
| 167 |
" input_data_batch = padded_batch[\"input_ids\"]\n",
|
|
@@ -224,7 +234,16 @@
|
|
| 224 |
" all_weighted_roc_auc = [a*b for a,b in zip(all_roc_auc, wts)]\n",
|
| 225 |
" roc_auc = np.sum(all_weighted_roc_auc)\n",
|
| 226 |
" roc_auc_sd = math.sqrt(np.average((all_roc_auc-roc_auc)**2, weights=wts))\n",
|
| 227 |
-
" return mean_tpr, roc_auc, roc_auc_sd"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 228 |
]
|
| 229 |
},
|
| 230 |
{
|
|
@@ -327,7 +346,7 @@
|
|
| 327 |
" \n",
|
| 328 |
" # load model\n",
|
| 329 |
" model = BertForTokenClassification.from_pretrained(\n",
|
| 330 |
-
" \"/
|
| 331 |
" num_labels=2,\n",
|
| 332 |
" output_attentions = False,\n",
|
| 333 |
" output_hidden_states = False\n",
|
|
|
|
| 139 |
"metadata": {},
|
| 140 |
"outputs": [],
|
| 141 |
"source": [
|
| 142 |
+
"def preprocess_classifier_batch(cell_batch, max_len):\n",
|
| 143 |
+
" if max_len == None:\n",
|
| 144 |
+
" max_len = max([len(i) for i in cell_batch[\"input_ids\"]])\n",
|
| 145 |
" def pad_label_example(example):\n",
|
| 146 |
" example[\"labels\"] = np.pad(example[\"labels\"], \n",
|
| 147 |
+
" (0, max_len-len(example[\"input_ids\"])), \n",
|
| 148 |
" mode='constant', constant_values=-100)\n",
|
| 149 |
" example[\"input_ids\"] = np.pad(example[\"input_ids\"], \n",
|
| 150 |
+
" (0, max_len-len(example[\"input_ids\"])), \n",
|
| 151 |
" mode='constant', constant_values=token_dictionary.get(\"<pad>\"))\n",
|
| 152 |
" example[\"attention_mask\"] = (example[\"input_ids\"] != token_dictionary.get(\"<pad>\")).astype(int)\n",
|
| 153 |
" return example\n",
|
|
|
|
| 159 |
" predict_logits = []\n",
|
| 160 |
" predict_labels = []\n",
|
| 161 |
" model.eval()\n",
|
| 162 |
+
" \n",
|
| 163 |
+
" # ensure there is at least 2 examples in each batch to avoid incorrect tensor dims\n",
|
| 164 |
+
" evalset_len = len(evalset)\n",
|
| 165 |
+
" max_divisible = find_largest_div(evalset_len, forward_batch_size)\n",
|
| 166 |
+
" if len(evalset) - max_divisible == 1:\n",
|
| 167 |
+
" evalset_len = max_divisible\n",
|
| 168 |
+
" \n",
|
| 169 |
+
" max_evalset_len = max(evalset.select([i for i in range(evalset_len)])[\"length\"])\n",
|
| 170 |
+
" \n",
|
| 171 |
+
" for i in range(0, evalset_len, forward_batch_size):\n",
|
| 172 |
+
" max_range = min(i+forward_batch_size, evalset_len)\n",
|
| 173 |
" batch_evalset = evalset.select([i for i in range(i, max_range)])\n",
|
| 174 |
+
" padded_batch = preprocess_classifier_batch(batch_evalset, max_evalset_len)\n",
|
| 175 |
" padded_batch.set_format(type=\"torch\")\n",
|
| 176 |
" \n",
|
| 177 |
" input_data_batch = padded_batch[\"input_ids\"]\n",
|
|
|
|
| 234 |
" all_weighted_roc_auc = [a*b for a,b in zip(all_roc_auc, wts)]\n",
|
| 235 |
" roc_auc = np.sum(all_weighted_roc_auc)\n",
|
| 236 |
" roc_auc_sd = math.sqrt(np.average((all_roc_auc-roc_auc)**2, weights=wts))\n",
|
| 237 |
+
" return mean_tpr, roc_auc, roc_auc_sd\n",
|
| 238 |
+
"\n",
|
| 239 |
+
"# Function to find the largest number smaller\n",
|
| 240 |
+
"# than or equal to N that is divisible by k\n",
|
| 241 |
+
"def find_largest_div(N, K):\n",
|
| 242 |
+
" rem = N % K\n",
|
| 243 |
+
" if(rem == 0):\n",
|
| 244 |
+
" return N\n",
|
| 245 |
+
" else:\n",
|
| 246 |
+
" return N - rem"
|
| 247 |
]
|
| 248 |
},
|
| 249 |
{
|
|
|
|
| 346 |
" \n",
|
| 347 |
" # load model\n",
|
| 348 |
" model = BertForTokenClassification.from_pretrained(\n",
|
| 349 |
+
" \"/gladstone/theodoris/lab/ctheodoris/archive/geneformer_files/geneformer/210602_111318_geneformer_27M_L6_emb256_SL2048_E3_B12_LR0.001_LSlinear_WU10000_Oadamw_DS12/models/\",\n",
|
| 350 |
" num_labels=2,\n",
|
| 351 |
" output_attentions = False,\n",
|
| 352 |
" output_hidden_states = False\n",
|