{
"cells": [
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': tensor([[ 101, 1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607,\n",
" 2026, 2878, 2166, 1012, 102],\n",
" [ 101, 1045, 5223, 2023, 2061, 2172, 999, 102, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [ 101, 1045, 2293, 2023, 2061, 2172, 999, 102, 0, 0,\n",
" 0, 0, 0, 0, 0],\n",
" [ 101, 1045, 2572, 9364, 1998, 2026, 2154, 2003, 9868, 102,\n",
" 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0]])}\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"\n",
"sentences = [\n",
" 'I have been waiting for a hugging face course my whole life.',\n",
" 'I hate this so much!',\n",
" 'I love this so much!',\n",
" 'I am disappointed and my day is ruined'\n",
"]\n",
"\n",
"tokens = tokenizer(sentences, padding=True, truncation=True, return_tensors='pt')\n",
"print(tokens)"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 2572, 2200, 9364, 1999, 2017, 1012], [1045, 5223, 2023, 2061, 2172, 999], [1045, 2572, 10215, 2004, 1045, 2572, 2196, 9657, 2055, 2026, 4813, 1012]]\n",
"[1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012]\n",
"[1045, 2572, 2200, 9364, 1999, 2017, 1012]\n",
"[1045, 5223, 2023, 2061, 2172, 999]\n",
"[1045, 2572, 10215, 2004, 1045, 2572, 2196, 9657, 2055, 2026, 4813, 1012]\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n",
"tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n",
"\n",
"sentences = [\n",
" 'I have been waiting for a hugging face course my whole life.',\n",
" 'I am very disappointed in you.',\n",
" 'I hate this so much!',\n",
" 'I am terrified as I am never confident about my skills.'\n",
"]\n",
"\n",
"tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n",
"ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n",
"\n",
"print(ids)\n",
"print(ids[0])\n",
"print(ids[1])\n",
"print(ids[2])\n",
"print(ids[3])"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
}
],
"source": [
"final_inputs = tokenizer.prepare_for_model(ids)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': [101, [1045, 2031, 2042, 3403, 2005, 1037, 17662, 2227, 2607, 2026, 2878, 2166, 1012], [1045, 2572, 2200, 9364, 1999, 2017, 1012], [1045, 5223, 2023, 2061, 2172, 999], [1045, 2572, 10215, 2004, 1045, 2572, 2196, 9657, 2055, 2026, 4813, 1012], 102], 'attention_mask': [1, 1, 1, 1, 1, 1]}\n"
]
}
],
"source": [
"print(final_inputs)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Building a tensor (or NumPy array) straight from the list of ids fails:\n",
"# the list is not rectangular, because the sentences tokenize to\n",
"# different lengths.\n",
"import torch\n",
"\n",
"try:\n",
"    input_ids = torch.tensor(ids)\n",
"except ValueError as err:\n",
"    # The failure is the point of this cell; report it without halting\n",
"    # a full Run All.\n",
"    print(f'{type(err).__name__}: {err}')"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"'Therefore, you have to pad; here we pad manually. But be sure to\\ncheck out dynamic padding, which is almost always better on the CPU and\\nthe GPU!'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''Therefore, you have to pad; here we pad manually. But be sure to\n",
"check out dynamic padding, which is almost always better on the CPU and\n",
"the GPU!'''\n"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"BertTokenizerFast(name_or_path='bert-base-cased', vocab_size=28996, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True), added_tokens_decoder={\n",
"\t0: AddedToken(\"[PAD]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t100: AddedToken(\"[UNK]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t101: AddedToken(\"[CLS]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t102: AddedToken(\"[SEP]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"\t103: AddedToken(\"[MASK]\", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),\n",
"}\n"
]
},
{
"data": {
"text/plain": [
"0"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'''The value you use to pad the sentences should not be picked\n",
"randomly. Use tokenizer.pad_token_id to get the value of the pad token'''\n",
"\n",
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-cased')\n",
"print(tokenizer)\n",
"tokenizer.pad_token_id"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}\n",
"convert this to tokens.\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
"# Manual pipeline for a single sentence: text, then tokens, then ids,\n",
"# then model inputs.\n",
"tokens = tokenizer.tokenize('Convert this to tokens.')\n",
"input_ids = tokenizer.convert_tokens_to_ids(tokens)\n",
"inputs = tokenizer.prepare_for_model(input_ids)\n",
"print(inputs)\n",
"\n",
"# Decode the raw ids (the list from before prepare_for_model), so the text\n",
"# comes back without the special tokens that `inputs` contains.\n",
"decode = tokenizer.decode(input_ids)\n",
"print(decode)"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': [101, 10463, 2023, 2000, 19204, 2015, 1012, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}\n",
"[CLS] convert this to tokens. [SEP]\n"
]
}
],
"source": [
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer =AutoTokenizer.from_pretrained('bert-base-uncased')\n",
"inputs = tokenizer('Convert this to tokens.')\n",
"print(inputs)\n",
"decode = tokenizer.decode(inputs['input_ids'])\n",
"print(decode)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': [101, [[2023, 2003, 1037, 2862, 1997, 11746], [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012], [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012], [1045, 2572, 5458, 1012]], 102], 'token_type_ids': [0, 0, 0], 'attention_mask': [1, 1, 1]}\n"
]
}
],
"source": [
"'''Enough revision, now pad them:'''\n",
"from transformers import AutoTokenizer\n",
"\n",
"sentences = ['This is a list of sentences',\n",
" 'I will try my best to keep it short.',\n",
" 'It is hard to learn like this.',\n",
" 'I am tired.']\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
"tokens = [tokenizer.tokenize(sentence) for sentence in sentences]\n",
"input_ids = [tokenizer.convert_tokens_to_ids(token) for token in tokens]\n",
"inputs = tokenizer.prepare_for_model([input_ids])\n",
"print(inputs)\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"101\n",
"[[2023, 2003, 1037, 2862, 1997, 11746], [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012], [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012], [1045, 2572, 5458, 1012]]\n",
"102\n"
]
}
],
"source": [
"for i in range(len(inputs)): print(inputs['input_ids'][i])"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[[2023, 2003, 1037, 2862, 1997, 11746],\n",
" [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012],\n",
" [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012],\n",
" [1045, 2572, 5458, 1012]]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Entry 1 of input_ids is the whole nested batch; entries 0 and 2 are the\n",
"# single ids 101 and 102 printed by the loop above.\n",
"inputs['input_ids'][1]"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2023, 2003, 1037, 2862, 1997, 11746]"
]
},
"execution_count": 12,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ids of the first sentence ('This is a list of sentences') inside the nested batch.\n",
"inputs['input_ids'][1][0]"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012]"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ids of the second sentence ('I will try my best to keep it short.').\n",
"inputs['input_ids'][1][1]"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ids of the third sentence ('It is hard to learn like this.').\n",
"inputs['input_ids'][1][2]"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"[1045, 2572, 5458, 1012]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ids of the fourth sentence ('I am tired.'), the shortest of the batch.\n",
"inputs['input_ids'][1][3]"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {},
"outputs": [],
"source": [
"# Now we won't receive an error: every sequence is padded to the length of\n",
"# the longest one, so the batch is rectangular and torch.tensor() accepts it.\n",
"import torch\n",
"\n",
"# Row order is kept exactly as before, because the attention_mask built in\n",
"# a later cell has to line up with these rows.\n",
"unpadded_input_ids = [\n",
"    [1045, 2097, 3046, 2026, 2190, 2000, 2562, 2009, 2460, 1012],\n",
"    [2023, 2003, 1037, 2862, 1997, 11746],\n",
"    [2009, 2003, 2524, 2000, 4553, 2066, 2023, 1012],\n",
"    [1045, 2572, 5458, 1012],\n",
"]\n",
"\n",
"# Take the pad id from the tokenizer instead of hard-coding a number.\n",
"pad_id = tokenizer.pad_token_id\n",
"longest = max(len(row) for row in unpadded_input_ids)\n",
"padded_input_ids = [row + [pad_id] * (longest - len(row)) for row in unpadded_input_ids]\n",
"\n",
"input_ids = torch.tensor(padded_input_ids)"
]
},
{
"cell_type": "code",
"execution_count": 27,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n",
"You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n",
"We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"tensor([[0.0690, 0.7675]], grad_fn=)\n",
"tensor([[-0.2026, 0.1231]], grad_fn=)\n",
"tensor([[0.0924, 0.7572]], grad_fn=)\n",
"tensor([[0.2478, 0.6774]], grad_fn=)\n",
"tensor([[ 0.0690, 0.7675],\n",
" [-0.2026, 0.1231],\n",
" [ 0.0924, 0.7572],\n",
" [ 0.2478, 0.6774]], grad_fn=)\n"
]
}
],
"source": [
"from transformers import AutoModelForSequenceClassification\n",
"\n",
"ids1 = torch.tensor([padded_input_ids[0]])\n",
"ids2 = torch.tensor([padded_input_ids[1]])\n",
"ids3 = torch.tensor([padded_input_ids[2]])\n",
"ids4 = torch.tensor([padded_input_ids[3]])\n",
"\n",
"all_ids = torch.tensor([padded_input_ids[0], \n",
" padded_input_ids[1], \n",
" padded_input_ids[2],\n",
" padded_input_ids[3]])\n",
"# # same effect:\n",
"# all_ids =torch.tensor(padded_input_ids)\n",
"\n",
"model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')\n",
"print(model(ids1).logits)\n",
"print(model(ids2).logits)\n",
"print(model(ids3).logits)\n",
"print(model(ids4).logits)\n",
"print(model(all_ids).logits)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [],
"source": [
"all_ids = torch.tensor(padded_input_ids)\n",
"\n",
"# 1 marks a real token, 0 marks padding. Deriving the mask from the padded\n",
"# ids keeps the two in sync instead of maintaining a second hand-typed\n",
"# table. This is valid here because no real token id equals the pad id.\n",
"attention_mask = (all_ids != tokenizer.pad_token_id).long()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# The classification head is newly initialised on every load (see the\n",
"# warning transformers prints), so absolute logit values change between\n",
"# runs; only the comparison inside this cell is meaningful.\n",
"model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')\n",
"\n",
"with torch.no_grad():  # inference only, no gradients needed\n",
"    # Reference runs: every sentence on its own with the padding stripped\n",
"    # (the mask selects the real tokens), i.e. what the model sees when\n",
"    # the sentence is never batched.\n",
"    single_outputs = [\n",
"        model(row_ids[row_mask.bool()].unsqueeze(0))\n",
"        for row_ids, row_mask in zip(all_ids, attention_mask)\n",
"    ]\n",
"    # Batched run: the same sentences, padded, with the mask telling the\n",
"    # model which positions to ignore.\n",
"    output = model(all_ids, attention_mask=attention_mask)\n",
"\n",
"output1, output2, output3, output4 = single_outputs\n",
"for single in single_outputs:\n",
"    print(single.logits)\n",
"print(output.logits)\n",
"\n",
"# With the mask applied, the batched rows should agree with the single runs\n",
"# up to floating-point noise.\n",
"single_logits = torch.cat([single.logits for single in single_outputs])\n",
"print('max abs difference:', (single_logits - output.logits).abs().max().item())"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"# In short, this is how you batch inputs together: hand the tokenizer the\n",
"# whole list and let it pad, truncate and build tensors in one call.\n",
"from transformers import AutoTokenizer\n",
"\n",
"tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n",
"raw_inputs = [\n",
"    'I am so slow',\n",
"    'I wish I had more time in a day',\n",
"    'We all have equal time per day so we need to make the best use of it',\n",
"    'This is a very long sentence that will not fit in the model.. of will it?',\n",
"]\n",
"batch = tokenizer(\n",
"    raw_inputs,\n",
"    padding=True,\n",
"    truncation=True,\n",
"    return_tensors='pt',\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{'input_ids': tensor([[ 101, 1045, 2572, 2061, 4030, 102, 0, 0, 0, 0, 0, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 101, 1045, 4299, 1045, 2018, 2062, 2051, 1999, 1037, 2154, 102, 0,\n",
" 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [ 101, 2057, 2035, 2031, 5020, 2051, 2566, 2154, 2061, 2057, 2342, 2000,\n",
" 2191, 1996, 2190, 2224, 1997, 2009, 102, 0, 0],\n",
" [ 101, 2023, 2003, 1037, 2200, 2146, 6251, 2008, 2097, 2025, 4906, 1999,\n",
" 1996, 2944, 1012, 1012, 1997, 2097, 2009, 1029, 102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],\n",
" [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}\n"
]
}
],
"source": [
"# Every row is padded to the longest sentence (21 ids here); the zeros in\n",
"# attention_mask line up with the padded positions.\n",
"print(batch)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.6"
}
},
"nbformat": 4,
"nbformat_minor": 2
}