juliensimon
/

reviews-sentiment-analysis

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Training and deploying Hugging Face models on Amazon SageMaker\n",
+    "\n",
+    "* https://huggingface.co/distilbert-base-uncased\n",
+    "* https://huggingface.co/transformers/model_doc/distilbert.html\n",
+    "* https://huggingface.co/datasets/generated_reviews_enth"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 1 - Setup"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "!pip -q install sagemaker \"transformers>=4.4.2\" \"datasets[s3]==1.5.0\" widgetsnbextension ipywidgets huggingface_hub --upgrade"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!curl -s https://packagecloud.io/install/repositories/github/git-lfs/script.deb.sh | bash\n",
+    "!apt-get install git-lfs"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import sagemaker\n",
+    "import transformers\n",
+    "import datasets\n",
+    "\n",
+    "print(sagemaker.__version__)\n",
+    "print(transformers.__version__)\n",
+    "print(datasets.__version__)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 2 - Preprocessing"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets import load_dataset\n",
+    "\n",
+    "train_dataset, valid_dataset = load_dataset('generated_reviews_enth', split=['train', 'validation'])\n",
+    "\n",
+    "print(train_dataset.shape)\n",
+    "print(valid_dataset.shape)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def map_stars_to_sentiment(row):\n",
+    "    return {\n",
+    "        'labels': 1 if row['review_star'] >= 4 else 0\n",
+    "    }"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.map(map_stars_to_sentiment)\n",
+    "valid_dataset = valid_dataset.map(map_stars_to_sentiment)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.flatten()\n",
+    "valid_dataset = valid_dataset.flatten()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.remove_columns(['correct', 'translation.th', 'review_star'])\n",
+    "valid_dataset = valid_dataset.remove_columns(['correct', 'translation.th', 'review_star'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.rename_column('translation.en', 'text')\n",
+    "valid_dataset = valid_dataset.rename_column('translation.en', 'text')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Tokenize"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import AutoTokenizer\n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained('distilbert-base-uncased')\n",
+    "\n",
+    "def tokenize(batch):\n",
+    "    return tokenizer(batch['text'], padding='max_length', truncation=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.map(tokenize, batched=True, batch_size=len(train_dataset))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "valid_dataset = valid_dataset.map(tokenize, batched=True, batch_size=len(valid_dataset))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import json\n",
+    "\n",
+    "json.dumps(train_dataset[0])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "train_dataset = train_dataset.remove_columns(['text'])\n",
+    "valid_dataset = valid_dataset.remove_columns(['text'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 3 - Upload data to S3"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from datasets.filesystems import S3FileSystem\n",
+    "\n",
+    "s3 = S3FileSystem()  \n",
+    "\n",
+    "s3_prefix = 'hugging-face/sentiment-analysis'\n",
+    "bucket = sagemaker.Session().default_bucket()\n",
+    "\n",
+    "train_input_path = 's3://{}/{}/training'.format(bucket, s3_prefix)\n",
+    "train_dataset.save_to_disk(train_input_path, fs=s3)\n",
+    "\n",
+    "valid_input_path = 's3://{}/{}/validation'.format(bucket, s3_prefix)\n",
+    "valid_dataset.save_to_disk(valid_input_path, fs=s3)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(train_input_path)\n",
+    "print(valid_input_path)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 4 - Fine-tune a Hugging Face model on SageMaker"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!pygmentize train.py"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Each key is passed to train.py as '--<key>', so it must match an argparse flag there exactly\n",
+    "hyperparameters={\n",
+    "    'epochs': 1,\n",
+    "    'train-batch-size': 32,\n",
+    "    'model-name':'distilbert-base-uncased'\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.huggingface import HuggingFace\n",
+    "\n",
+    "huggingface_estimator = HuggingFace(\n",
+    "    role=sagemaker.get_execution_role(),\n",
+    "    # Fine-tuning script\n",
+    "    entry_point='train.py',\n",
+    "    hyperparameters=hyperparameters,\n",
+    "    # Infrastructure\n",
+    "    transformers_version='4.6.1',\n",
+    "    pytorch_version='1.7.1',\n",
+    "    py_version='py36',\n",
+    "    instance_type='ml.p3.2xlarge',  # 1 GPU, $4.131/hour in eu-west-1\n",
+    "    instance_count=1,\n",
+    "    # Enable spot instances\n",
+    "    use_spot_instances=True,        # 70% discount is typical\n",
+    "    max_run = 3600,\n",
+    "    max_wait = 7200\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true
+   },
+   "outputs": [],
+   "source": [
+    "huggingface_estimator.fit({'train': train_input_path, 'valid': valid_input_path})"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 5 - Deploy the model on SageMaker"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "huggingface_predictor = huggingface_estimator.deploy(\n",
+    "    initial_instance_count=1,\n",
+    "    instance_type='ml.m5.xlarge')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_data = {\n",
+    "   \"inputs\": \"This is a very nice camera, I'm super happy with it.\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prediction = huggingface_predictor.predict(test_data)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_data = {\n",
+    "   \"inputs\": \"Terrible purchase, I want my money back!\"\n",
+    "}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "prediction = huggingface_predictor.predict(test_data)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "huggingface_predictor.delete_endpoint()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# 6 - Push our model to the Hugging Face hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# In a terminal, log in to the Hub with 'huggingface-cli login' and your hub credentials"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Create a new repo on the Hugging Face hub"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "repo_name='reviews-sentiment-analysis'"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%sh -s $repo_name\n",
+    "huggingface-cli repo create -y $1\n",
+    "git clone https://huggingface.co/juliensimon/$1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Extract our model and push files to our hub repo"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%sh -s $huggingface_estimator.model_data $repo_name\n",
+    "aws s3 cp $1 .\n",
+    "tar xvz -C $2 -f model.tar.gz"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%%sh -s $repo_name\n",
+    "cd $1\n",
+    "git add .\n",
+    "git commit -m 'Initial version'\n",
+    "git push"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Grab our model from the hub and work locally"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# With the Auto* API\n",
+    "from transformers import AutoTokenizer, AutoModelForSequenceClassification \n",
+    "\n",
+    "tokenizer = AutoTokenizer.from_pretrained('juliensimon/'+repo_name)\n",
+    "model = AutoModelForSequenceClassification.from_pretrained('juliensimon/'+repo_name)\n",
+    "\n",
+    "# With the pipeline API\n",
+    "from transformers import pipeline\n",
+    "\n",
+    "classifier = pipeline('sentiment-analysis', model='juliensimon/'+repo_name)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "classifier(\"This is a very nice camera, I'm super happy with it.\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "classifier(\"Terrible purchase, I want my money back!\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Grab our model from the hub and deploy it on a SageMaker endpoint"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from sagemaker.huggingface.model import HuggingFaceModel\n",
+    "\n",
+    "hub = {\n",
+    "  'HF_MODEL_ID':'juliensimon/'+repo_name, \n",
+    "  'HF_TASK':'sentiment-analysis'\n",
+    "}\n",
+    "\n",
+    "huggingface_model = HuggingFaceModel(\n",
+    "   env=hub,                                                \n",
+    "   role=sagemaker.get_execution_role(),                                        \n",
+    "   transformers_version='4.6.1',                           \n",
+    "   pytorch_version='1.7.1',                                \n",
+    "   py_version='py36'                                      \n",
+    ")\n",
+    "\n",
+    "huggingface_predictor = huggingface_model.deploy(\n",
+    "   initial_instance_count=1,\n",
+    "   instance_type='ml.m5.xlarge'\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "test_data = {\n",
+    "   'inputs': \"This is a very nice camera, I'm super happy with it.\"\n",
+    "}\n",
+    "\n",
+    "prediction = huggingface_predictor.predict(test_data)\n",
+    "print(prediction)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "huggingface_predictor.delete_endpoint()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "instance_type": "ml.m5.4xlarge",
+  "kernelspec": {
+   "display_name": "Python 3 (PyTorch 1.6 Python 3.6 CPU Optimized)",
+   "language": "python",
+   "name": "python3__SAGEMAKER_INTERNAL__arn:aws:sagemaker:eu-west-1:470317259841:image/pytorch-1.6-cpu-py36-ubuntu16.04-v1"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.6.13"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+}

code/train.py ADDED Viewed

	@@ -0,0 +1,88 @@

+import random, sys, argparse, os, logging, torch
+from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer, TrainingArguments
+from sklearn.metrics import accuracy_score, precision_recall_fscore_support
+from datasets import load_from_disk
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    # hyperparameters sent by the client are passed as command-line arguments to the script.
+    parser.add_argument("--epochs", type=int, default=3)
+    parser.add_argument("--train-batch-size", type=int, default=32)
+    parser.add_argument("--eval-batch-size", type=int, default=64)
+    parser.add_argument("--save-strategy", type=str, default='no')
+    parser.add_argument("--save-steps", type=int, default=500)
+    parser.add_argument("--model-name", type=str)
+    parser.add_argument("--learning-rate", type=str, default=5e-5)
+    # Data, model, and output directories
+    parser.add_argument("--output-data-dir", type=str, default=os.environ["SM_OUTPUT_DATA_DIR"])
+    parser.add_argument("--model-dir", type=str, default=os.environ["SM_MODEL_DIR"])
+    parser.add_argument("--n-gpus", type=str, default=os.environ["SM_NUM_GPUS"])
+    parser.add_argument("--train-dir", type=str, default=os.environ["SM_CHANNEL_TRAIN"])
+    parser.add_argument("--valid-dir", type=str, default=os.environ["SM_CHANNEL_VALID"])
+    args, _ = parser.parse_known_args()
+    # load datasets
+    train_dataset = load_from_disk(args.train_dir)
+    valid_dataset = load_from_disk(args.valid_dir)
+    logger = logging.getLogger(__name__)
+    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
+    logger.info(f" loaded valid_dataset length is: {len(valid_dataset)}")
+    # compute metrics function for binary classification
+    def compute_metrics(pred):
+        labels = pred.label_ids
+        preds = pred.predictions.argmax(-1)
+        precision, recall, f1, _ = precision_recall_fscore_support(labels, preds, average="binary")
+        acc = accuracy_score(labels, preds)
+        return {"accuracy": acc, "f1": f1, "precision": precision, "recall": recall}
+    # download model from model hub
+    model = AutoModelForSequenceClassification.from_pretrained(args.model_name)
+    # download the tokenizer too, which will be saved in the model artifact
+    # and used at prediction time
+    tokenizer = AutoTokenizer.from_pretrained(args.model_name)
+    # define training args
+    training_args = TrainingArguments(
+        output_dir=args.model_dir,
+        num_train_epochs=args.epochs,
+        per_device_train_batch_size=args.train_batch_size,
+        per_device_eval_batch_size=args.eval_batch_size,
+        save_strategy=args.save_strategy,
+        save_steps=args.save_steps,
+        evaluation_strategy="epoch",
+        logging_dir=f"{args.output_data_dir}/logs",
+        learning_rate=float(args.learning_rate),
+    )
+    # create Trainer instance
+    trainer = Trainer(
+        model=model,
+        args=training_args,
+        tokenizer=tokenizer,
+        compute_metrics=compute_metrics,
+        train_dataset=train_dataset,
+        eval_dataset=valid_dataset,
+    )
+    # train model
+    trainer.train()
+    # evaluate model
+    eval_result = trainer.evaluate(eval_dataset=valid_dataset)
+    # writes eval result to file which can be accessed later in s3 output
+    with open(os.path.join(args.output_data_dir, "eval_results.txt"), "w") as writer:
+        print(f"***** Eval results *****")
+        for key, value in sorted(eval_result.items()):
+            writer.write(f"{key} = {value}\n")
+    # Saves the model to s3
+    trainer.save_model(args.model_dir)