Fsoft-AIC
/

Codebert-docstring-inconsistency

@@ -1,186 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": 2,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "import os \n",
-    "\n",
-    "import torch\n",
-    "from transformers import AutoConfig, AutoTokenizer, AutoModelForSequenceClassification, DataCollatorWithPadding"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 3,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Checkpoint location: override with the CODEBERT_MODEL_PATH environment variable;\n",
-    "# falls back to the original local directory when the variable is unset.\n",
-    "model_name_or_path = os.environ.get(\n",
-    "    \"CODEBERT_MODEL_PATH\",\n",
-    "    \"/datadrive/namlh31/codebridge/Codebert-docstring-inconsistency\",\n",
-    ")\n",
-    "\n",
-    "# Load the config, tokenizer and sequence-classification model from the same checkpoint.\n",
-    "config = AutoConfig.from_pretrained(model_name_or_path)\n",
-    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)\n",
-    "model = AutoModelForSequenceClassification.from_pretrained(\n",
-    "    model_name_or_path,\n",
-    "    config=config,\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 5,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Sample input: a JavaScript snippet and the docstring paired with it.\n",
-    "examples = dict(\n",
-    "    code=\"function(str){\\r\\n  var ret = new Array(str.length), len = str.length;\\r\\n  while(len--) ret[len] = str.charCodeAt(len);\\r\\n  return Uint8Array.from(ret);\\r\\n}\",\n",
-    "    docstring='we do not need Buffer pollyfill for now',\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 17,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# Order matters: the docstring is passed as the first segment, the code as the second.\n",
-    "texts = (examples['docstring'], examples['code'])\n",
-    "\n",
-    "# Encode the pair as one example, padded/truncated to 512 tokens, as PyTorch tensors.\n",
-    "result = tokenizer(\n",
-    "    *texts,\n",
-    "    padding=\"max_length\",\n",
-    "    max_length=512,\n",
-    "    truncation=True,\n",
-    "    return_tensors='pt',\n",
-    ")"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 10,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "512\n"
-     ]
-    }
-   ],
-   "source": [
-    "# `result` was built with return_tensors='pt', so input_ids carries a leading batch\n",
-    "# dimension; select the single encoded example before decoding or measuring it.\n",
-    "input_ids = result['input_ids'][0]\n",
-    "decoded_text = tokenizer.decode(input_ids)\n",
-    "print(len(input_ids))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 22,
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "# The docstring and the code joined into one string with \"</s></s>\" between them.\n",
-    "# Named `paired_text` rather than `input` so the built-in input() is not shadowed.\n",
-    "paired_text = \"\"\"we do not need Buffer pollyfill for now</s></s>function(str){\\r\\n  var ret = new Array(str.length), len = str.length;\\r\\n  while(len--) ret[len] = str.charCodeAt(len);\\r\\n  return Uint8Array.from(ret);\\r\\n}\"\"\"\n",
-    "rs_2 = tokenizer(paired_text, padding=\"max_length\", max_length=512, truncation=True, return_tensors='pt')"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 23,
-   "metadata": {},
-   "outputs": [
-    {
-     "data": {
-      "text/plain": [
-       "SequenceClassifierOutput(loss=None, logits=tensor([[ 0.2598, -0.2636]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)"
-      ]
-     },
-     "execution_count": 23,
-     "metadata": {},
-     "output_type": "execute_result"
-    }
-   ],
-   "source": [
-    "# Inference only: run the forward pass without autograd tracking.\n",
-    "with torch.no_grad():\n",
-    "    outputs_2 = model(**rs_2)\n",
-    "outputs_2"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 24,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n",
-      "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n",
-      "To disable this warning, you can either:\n",
-      "\t- Avoid using `tokenizers` before the fork if possible\n",
-      "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Import the factory under an alias so that binding the resulting instance to\n",
-    "# `pipeline` below does not hide the factory function itself.\n",
-    "from transformers import pipeline as build_pipeline\n",
-    "\n",
-    "# Use CUDA device 0 when available; -1 selects the CPU.\n",
-    "device = 0 if torch.cuda.is_available() else -1\n",
-    "pipeline = build_pipeline(\"text-classification\", model=model, tokenizer=tokenizer, device=device)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": 28,
-   "metadata": {},
-   "outputs": [
-    {
-     "name": "stdout",
-     "output_type": "stream",
-     "text": [
-      "[{'label': 'Inconsistency', 'score': 0.5601343512535095}]\n"
-     ]
-    }
-   ],
-   "source": [
-    "# Docstring and code joined by \"</s></s>\" into the single string given to the pipeline.\n",
-    "inputs = (\n",
-    "    \"we do not need Buffer pollyfill for now</s></s>function(str){\\n\"\n",
-    "    \"  var ret = new Array(str.length), len = str.length;\\n\"\n",
-    "    \"  while(len--) ret[len] = str.charCodeAt(len);\\n\"\n",
-    "    \"  return Uint8Array.from(ret);\\n\"\n",
-    "    \"}\"\n",
-    ")\n",
-    "prediction = pipeline(inputs)\n",
-    "print(prediction)"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "metadata": {},
-   "outputs": [],
-   "source": []
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "namlh31",
-   "language": "python",
-   "name": "python3"
-  },
-  "language_info": {
-   "codemirror_mode": {
-    "name": "ipython",
-    "version": 3
-   },
-   "file_extension": ".py",
-   "mimetype": "text/x-python",
-   "name": "python",
-   "nbconvert_exporter": "python",
-   "pygments_lexer": "ipython3",
-   "version": "3.11.2"
-  },
-  "orig_nbformat": 4
- },
- "nbformat": 4,
- "nbformat_minor": 2
-}