{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] } ], "source": [ "import transformers\n", "\n", "from transformers import pipeline\n", "\n", "checkpoint = 'bert-base-uncased'\n", "classifier = pipeline('sentiment-analysis', model=checkpoint)" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'label': 'LABEL_1', 'score': 0.5578101277351379}]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "classifier('This is a test sentence')" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n" ] }, { "data": { "text/plain": [ "[{'generated_text': 'In this course, I will teach you how to make a really big use of the language you\\u202are learning to use.\\u202a'},\n", " {'generated_text': 'In this course, I will teach you how to manipulate sound design to better enhance your sound design while also illustrating the application of certain audio and video technologies. 
In this section I will introduce some examples of how to manipulate sound design in my introductory video.'}]" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "generator = pipeline('text-generation', model='distilgpt2')\n", "generator('In this course, I will teach you how to',\n", " max_length=50, num_return_sequences = 2\n", " )" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']\n", "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n" ] }, { "data": { "text/plain": [ "[{'label': 'LABEL_0', 'score': 0.6686140894889832}]" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# inside the pipeline function;\n", "\n", "from transformers import pipeline\n", "classifier = pipeline('sentiment-analysis', model=checkpoint)\n", "classifier('I am very sad')\n" ] }, { "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "# Tokenization: Raw text -> Tokenizer -> Tokenized Text -> Input IDs for model" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "raw_inputs = ['This is a course on huggingface',\n", " 'I am very disgusted at my stupidity']\n", "\n", "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[ 101, 2023, 2003, 1037, 2607, 2006, 17662, 12172, 
102],\n", " [ 101, 1045, 2572, 2200, 17733, 2012, 2026, 28072, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " [1, 1, 1, 1, 1, 1, 1, 1, 1]])}" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Encoding(num_tokens=9, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])" ] }, "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs[0]" ] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 101, 2023, 2003, 1037, 2607, 2006, 17662, 12172, 102],\n", " [ 101, 1045, 2572, 2200, 17733, 2012, 2026, 28072, 102]])" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs['input_ids']" ] }, { "cell_type": "code", "execution_count": 12, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],\n", " [1, 1, 1, 1, 1, 1, 1, 1, 1]])" ] }, "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs['attention_mask']" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "raw_inputs = ['This is very good','I am learning slowly.. 
sad']\n", "\n", "inputs=tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "{'input_ids': tensor([[ 101, 2023, 2003, 2200, 2204, 102, 0, 0, 0],\n", " [ 101, 1045, 2572, 4083, 3254, 1012, 1012, 6517, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],\n", " [1, 1, 1, 1, 1, 1, 1, 1, 1]])}" ] }, "execution_count": 14, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[ 101, 2023, 2003, 2200, 2204, 102, 0, 0, 0],\n", " [ 101, 1045, 2572, 4083, 3254, 1012, 1012, 6517, 102]])" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs['input_ids']" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "tensor([[1, 1, 1, 1, 1, 1, 0, 0, 0],\n", " [1, 1, 1, 1, 1, 1, 1, 1, 1]])" ] }, "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ "inputs['attention_mask']" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "torch.Size([2, 9, 768])\n" ] } ], "source": [ "from transformers import AutoModel\n", "\n", "model = AutoModel.from_pretrained(checkpoint)\n", "outputs = model(**inputs)\n", "print(outputs.last_hidden_state.shape)" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "{'input_ids': tensor([[ 101, 2023, 2003, 2200, 2204, 102, 0, 0],\n", " [ 101, 1045, 2572, 2667, 2000, 4553, 2242, 102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 0, 0],\n", " [1, 1, 1, 1, 1, 1, 1, 1]])}\n", "torch.Size([2, 8, 768])\n", "torch.Size([2, 2])\n", "SequenceClassifierOutput(loss=None, logits=tensor([[-4.1928, 
4.5727],\n", " [ 1.9190, -1.6084]], grad_fn=), hidden_states=None, attentions=None)\n", "tensor([[-4.1928, 4.5727],\n", " [ 1.9190, -1.6084]], grad_fn=)\n", "tensor([[1.5600e-04, 9.9984e-01],\n", " [9.7146e-01, 2.8543e-02]], grad_fn=)\n", "{0: 'NEGATIVE', 1: 'POSITIVE'}\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "checkpoint = 'distilbert-base-uncased-finetuned-sst-2-english'\n", "tokenizer = AutoTokenizer.from_pretrained(checkpoint)\n", "\n", "raw_inputs = ['This is very good', 'I am trying to learn something']\n", "\n", "inputs = tokenizer(raw_inputs, padding=True, truncation=True, return_tensors='pt')\n", "\n", "print(inputs)\n", "\n", "from transformers import AutoModel\n", "\n", "model = AutoModel.from_pretrained(checkpoint)\n", "outputs = model(**inputs)\n", "print(outputs.last_hidden_state.shape)\n", "\n", "from transformers import AutoModelForSequenceClassification\n", "\n", "model = AutoModelForSequenceClassification.from_pretrained(checkpoint)\n", "outputs = model(**inputs)\n", "print(outputs.logits.shape)\n", "print(outputs)\n", "print(outputs.logits)\n", "\n", "import torch\n", "\n", "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n", "print(predictions)\n", "\n", "print(model.config.id2label)\n" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "# instantiate a transformers model:\n", "\n", "from transformers import AutoModel\n", "\n", "bert_model = AutoModel.from_pretrained('bert-base-uncased')\n", "print(type(bert_model))\n", "\n", "gpt_model = AutoModel.from_pretrained('gpt2')\n", "print(type(gpt_model))\n", "\n", "bart_model = AutoModel.from_pretrained('facebook/bart-base')\n", "print(type(bart_model))" ] }, { "cell_type": "code", "execution_count": 25, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "from 
transformers import AutoConfig\n", "\n", "bert_config = AutoConfig.from_pretrained('bert-base-uncased')\n", "print(type(bert_config))\n", "\n", "gpt_config = AutoConfig.from_pretrained('gpt2')\n", "print(type(gpt_config))\n", "\n", "bart_config = AutoConfig.from_pretrained('facebook/bart-base')\n", "print(type(bart_config))" ] }, { "cell_type": "code", "execution_count": 26, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n" ] } ], "source": [ "from transformers import BertConfig\n", "\n", "bert_config = BertConfig.from_pretrained('bert-base-uncased')\n", "print(type(bert_config))\n", "\n", "from transformers import GPT2Config\n", "gpt_config = GPT2Config.from_pretrained('gpt2')\n", "print(type(gpt_config))\n", "\n", "from transformers import BartConfig\n", "bart_config = BartConfig.from_pretrained('facebook/bart-base')\n", "print(type(bart_config))" ] }, { "cell_type": "code", "execution_count": 27, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BertConfig {\n", " \"architectures\": [\n", " \"BertForMaskedLM\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"classifier_dropout\": null,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 0,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.34.1\",\n", " \"type_vocab_size\": 2,\n", " \"use_cache\": true,\n", " \"vocab_size\": 30522\n", "}\n", "\n" ] } ], "source": [ "from transformers import BertConfig\n", "\n", "bert_config = BertConfig.from_pretrained('bert-base-uncased')\n", "print(bert_config)" ] }, { "cell_type": "code", 
"execution_count": 31, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BertModel(\n", " (embeddings): BertEmbeddings(\n", " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", " (position_embeddings): Embedding(512, 768)\n", " (token_type_embeddings): Embedding(2, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): BertEncoder(\n", " (layer): ModuleList(\n", " (0-11): 12 x BertLayer(\n", " (attention): BertAttention(\n", " (self): BertSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): BertSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): BertIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): BertOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (pooler): BertPooler(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", ")\n" ] } ], "source": [ "from transformers import BertConfig, BertModel\n", "\n", "bert_config = BertConfig.from_pretrained('bert-base-uncased')\n", "bert_model = BertModel(bert_config)\n", "print(bert_model)" ] }, { "cell_type": "code", "execution_count": 32, "metadata": {}, "outputs": [], "source": [ "from transformers import BertConfig, BertModel\n", "\n", 
"bert_config = BertConfig.from_pretrained('bert-base-uncased', num_hidden_layers=10)\n", "bert_model = BertModel(bert_config)" ] }, { "cell_type": "code", "execution_count": 33, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "BertModel(\n", " (embeddings): BertEmbeddings(\n", " (word_embeddings): Embedding(30522, 768, padding_idx=0)\n", " (position_embeddings): Embedding(512, 768)\n", " (token_type_embeddings): Embedding(2, 768)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (encoder): BertEncoder(\n", " (layer): ModuleList(\n", " (0-9): 10 x BertLayer(\n", " (attention): BertAttention(\n", " (self): BertSelfAttention(\n", " (query): Linear(in_features=768, out_features=768, bias=True)\n", " (key): Linear(in_features=768, out_features=768, bias=True)\n", " (value): Linear(in_features=768, out_features=768, bias=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " (output): BertSelfOutput(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " (intermediate): BertIntermediate(\n", " (dense): Linear(in_features=768, out_features=3072, bias=True)\n", " (intermediate_act_fn): GELUActivation()\n", " )\n", " (output): BertOutput(\n", " (dense): Linear(in_features=3072, out_features=768, bias=True)\n", " (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)\n", " (dropout): Dropout(p=0.1, inplace=False)\n", " )\n", " )\n", " )\n", " )\n", " (pooler): BertPooler(\n", " (dense): Linear(in_features=768, out_features=768, bias=True)\n", " (activation): Tanh()\n", " )\n", ")\n" ] } ], "source": [ "print(bert_model)" ] }, { "cell_type": "code", "execution_count": 34, "metadata": {}, "outputs": [], "source": [ "from transformers import BertConfig, BertModel\n", "\n", 
"bert_config = BertConfig.from_pretrained('bert-base-uncased', num_hidden_layers = 10)\n", "bert_model = BertModel(bert_config)\n", "\n", "bert_model.save_pretrained('my_bert_model')" ] }, { "cell_type": "code", "execution_count": 35, "metadata": {}, "outputs": [], "source": [ "from transformers import BertModel\n", "\n", "bert_model = BertModel.from_pretrained('my_bert_model')" ] }, { "cell_type": "code", "execution_count": 39, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "\n", "\n", "\n", "\n" ] } ], "source": [ "from transformers import AutoModel\n", "\n", "bert_model = AutoModel.from_pretrained('bert-base-uncased')\n", "print(type(bert_model))\n", "\n", "from transformers import AutoConfig, BertModel\n", "\n", "bert_config = AutoConfig.from_pretrained('bert-base-uncased')\n", "print(type(bert_config))\n", "bert_model = BertModel(bert_config)\n", "print(type(bert_model))\n", "\n", "from transformers import BertConfig, BertModel\n", "bert_config = BertConfig.from_pretrained('bert-base-uncased')\n", "print(type(bert_config))\n", "bert_model = BertModel(bert_config)\n", "print(type(bert_model))" ] }, { "cell_type": "code", "execution_count": 40, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "BertConfig {\n", " \"architectures\": [\n", " \"BertForMaskedLM\"\n", " ],\n", " \"attention_probs_dropout_prob\": 0.1,\n", " \"classifier_dropout\": null,\n", " \"gradient_checkpointing\": false,\n", " \"hidden_act\": \"gelu\",\n", " \"hidden_dropout_prob\": 0.1,\n", " \"hidden_size\": 768,\n", " \"initializer_range\": 0.02,\n", " \"intermediate_size\": 3072,\n", " \"layer_norm_eps\": 1e-12,\n", " \"max_position_embeddings\": 512,\n", " \"model_type\": \"bert\",\n", " \"num_attention_heads\": 12,\n", " \"num_hidden_layers\": 12,\n", " \"pad_token_id\": 0,\n", " \"position_embedding_type\": \"absolute\",\n", " \"transformers_version\": \"4.34.1\",\n", " \"type_vocab_size\": 2,\n", " \"use_cache\": true,\n", " 
\"vocab_size\": 30522\n", "}" ] }, "execution_count": 40, "metadata": {}, "output_type": "execute_result" } ], "source": [ "from transformers import BertConfig\n", "\n", "bert_config = BertConfig.from_pretrained('bert-base-uncased')\n", "bert_config" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "from transformers import BertConfig, BertModel\n", "new_bert_config = BertConfig.from_pretrained('bert-base-uncased', num_hidden_layers=10)\n", "new_bert_model = BertModel(new_bert_config)\n", "\n", "new_bert_model.save_pretrained('new-bert-model')\n", "\n" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [], "source": [ "load_new_bert_model = BertModel.from_pretrained('new-bert-model')\n" ] }, { "cell_type": "code", "execution_count": 57, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "['let', \"'\", 's', 'try', 'to', 'token', '##ize']\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "['▁let', \"'\", 's', '▁try', '▁to', '▁to', 'ken', 'ize']\n", "[2292, 1005, 1055, 3046, 2000, 19204, 4697]\n", "[101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 102]\n", "{'input_ids': [101, 2292, 1005, 1055, 3046, 2000, 19204, 4697, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]}\n", "[CLS] let's try to tokenize [SEP]\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "# split our input into tokens:\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize(\"Let's try to tokenize\")\n", "print(tokens)\n", "\n", "albert_tokenizer = AutoTokenizer.from_pretrained('albert-base-v1')\n", "albert_tokens = albert_tokenizer.tokenize(\"Let's try to tokenize\")\n", "print(albert_tokens)\n", "\n", "# map tokens to respective ids:\n", "input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "print(input_ids)\n", "\n", "# add special tokens:\n", "final_inputs = tokenizer.prepare_for_model(input_ids)\n", "print(final_inputs['input_ids'])\n", "print(final_inputs)\n", "\n", "print(tokenizer.decode(final_inputs['input_ids']))" ] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "['lets', 'try', 'to', 'token', '##ize']\n", "[11082, 3046, 2000, 19204, 4697]\n", "{'input_ids': [101, 11082, 3046, 2000, 19204, 4697, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}\n", "[CLS] lets try to tokenize [SEP]\n", "{'input_ids': [101, 11082, 3046, 2000, 19204, 4697, 102], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1]}\n", "[101, 11082, 3046, 2000, 19204, 4697, 102]\n", "[CLS] lets try to tokenize [SEP]\n" ] } ], "source": [ "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize(\"lets try to tokenize\")\n", "print(tokens)\n", "input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "print(input_ids)\n", "final_inputs = tokenizer.prepare_for_model(input_ids)\n", "print(final_inputs)\n", "decoded_inputs = tokenizer.decode(final_inputs['input_ids'])\n", "print(decoded_inputs)\n", "\n", "from transformers import AutoTokenizer\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "inputs = tokenizer('lets try to tokenize')\n", "print(inputs)\n", "print(inputs['input_ids'])\n", "print(tokenizer.decode(inputs['input_ids']))" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "You're using a BertTokenizerFast tokenizer. 
Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.\n" ] }, { "ename": "AttributeError", "evalue": "'list' object has no attribute 'size'", "output_type": "error", "traceback": [ "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", "\u001b[1;31mAttributeError\u001b[0m Traceback (most recent call last)", "\u001b[1;32mc:\\Users\\HP\\Desktop\\PythonProjects\\HuggingFace_Beginners\\practise basics.ipynb Cell 32\u001b[0m line \u001b[0;36m1\n\u001b[0;32m 9\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtransformers\u001b[39;00m \u001b[39mimport\u001b[39;00m BertModel\n\u001b[0;32m 10\u001b[0m Bert_model \u001b[39m=\u001b[39m BertModel(Bert_config)\n\u001b[1;32m---> 11\u001b[0m outputs \u001b[39m=\u001b[39m Bert_model(\u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49minputs)\n\u001b[0;32m 12\u001b[0m \u001b[39mprint\u001b[39m(outputs\u001b[39m.\u001b[39mlast_hidden_state\u001b[39m.\u001b[39mshape)\n\u001b[0;32m 13\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39mtransformers\u001b[39;00m \u001b[39mimport\u001b[39;00m AutoModelForSequenceClassification\n", "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\torch\\nn\\modules\\module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1516\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_compiled_call_impl(\u001b[39m*\u001b[39margs, \u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs) \u001b[39m# type: ignore[misc]\u001b[39;00m\n\u001b[0;32m 1517\u001b[0m \u001b[39melse\u001b[39;00m:\n\u001b[1;32m-> 1518\u001b[0m \u001b[39mreturn\u001b[39;00m \u001b[39mself\u001b[39;49m\u001b[39m.\u001b[39;49m_call_impl(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n", "File 
\u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\torch\\nn\\modules\\module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[1;34m(self, *args, **kwargs)\u001b[0m\n\u001b[0;32m 1522\u001b[0m \u001b[39m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[0;32m 1523\u001b[0m \u001b[39m# this function, and just call forward.\u001b[39;00m\n\u001b[0;32m 1524\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mnot\u001b[39;00m (\u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_backward_pre_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_hooks \u001b[39mor\u001b[39;00m \u001b[39mself\u001b[39m\u001b[39m.\u001b[39m_forward_pre_hooks\n\u001b[0;32m 1525\u001b[0m \u001b[39mor\u001b[39;00m _global_backward_pre_hooks \u001b[39mor\u001b[39;00m _global_backward_hooks\n\u001b[0;32m 1526\u001b[0m \u001b[39mor\u001b[39;00m _global_forward_hooks \u001b[39mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[1;32m-> 1527\u001b[0m \u001b[39mreturn\u001b[39;00m forward_call(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n\u001b[0;32m 1529\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m 1530\u001b[0m result \u001b[39m=\u001b[39m \u001b[39mNone\u001b[39;00m\n", "File \u001b[1;32m~\\AppData\\Roaming\\Python\\Python311\\site-packages\\transformers\\models\\bert\\modeling_bert.py:970\u001b[0m, in \u001b[0;36mBertModel.forward\u001b[1;34m(self, input_ids, attention_mask, token_type_ids, position_ids, head_mask, inputs_embeds, encoder_hidden_states, encoder_attention_mask, past_key_values, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[0;32m 968\u001b[0m \u001b[39melif\u001b[39;00m input_ids \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 969\u001b[0m 
\u001b[39mself\u001b[39m\u001b[39m.\u001b[39mwarn_if_padding_and_no_attention_mask(input_ids, attention_mask)\n\u001b[1;32m--> 970\u001b[0m input_shape \u001b[39m=\u001b[39m input_ids\u001b[39m.\u001b[39;49msize()\n\u001b[0;32m 971\u001b[0m \u001b[39melif\u001b[39;00m inputs_embeds \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[0;32m 972\u001b[0m input_shape \u001b[39m=\u001b[39m inputs_embeds\u001b[39m.\u001b[39msize()[:\u001b[39m-\u001b[39m\u001b[39m1\u001b[39m]\n", "\u001b[1;31mAttributeError\u001b[0m: 'list' object has no attribute 'size'" ] } ], "source": [ "from transformers import pipeline\n", "from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, AutoConfig\n", "\n", "tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')\n", "tokens = tokenizer.tokenize(\"Let's try to tokenize\")\n", "input_ids = tokenizer.convert_tokens_to_ids(tokens)\n", "inputs = tokenizer.prepare_for_model(input_ids, return_tensors='pt')\n", "Bert_config = AutoConfig.from_pretrained('bert-base-uncased')\n", "from transformers import BertModel\n", "Bert_model = BertModel(Bert_config)\n", "outputs = Bert_model(**inputs)\n", "print(outputs.last_hidden_state.shape)\n", "from transformers import AutoModelForSequenceClassification\n", "model = AutoModelForSequenceClassification.from_pretrained('bert-base-uncased')\n", "outputs = model(**inputs)\n", "print(outputs.logits)\n", "import torch\n", "predictions = torch.nn.functional.softmax(outputs.logits, dim=-1)\n", "print(predictions)\n", "print(model.config.id2label)\n", "\n", "from transformers import AutoModel, AutoConfig, BertModel, BertConfig\n", "\n", "bert_config = BertConfig.from_pretrained('bert-base-uncased')\n", "bert_model = BertModel(bert_config)\n", "bert_model.save_pretrained('bert_model')\n", "initialize_model = BertModel.from_pretrained('bert_model')\n", "outputs = initialize_model(**inputs)\n", "print(outputs.last_hidden_state.shape)" ] }, { "cell_type": "code",
"execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.11.6" } }, "nbformat": 4, "nbformat_minor": 2 }