Spaces:

Ankit15nov
/

EXL_AI_Engineer_Training

Running

App Files Files Community

Ankit15nov committed on Sep 13, 2023

Commit

8cfdceb

1 Parent(s): 6bb0190

week 3 assignment 2

Browse files

Files changed (1) hide show

Stanford's_HELM_LM_Evaluation.ipynb +717 -0

Stanford's_HELM_LM_Evaluation.ipynb ADDED Viewed

	@@ -0,0 +1,717 @@

+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "hcPOD7h1s68l"
+   },
+   "source": [
+    "### Stanford's Holistic Evaluation of Language Models\n",
+    "\n",
+    "Based on the work in this [repository](https://github.com/stanford-crfm/helm), we'll be using HELM to evaluate a model today!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "1XneqmcEtpRe"
+   },
+   "source": [
+    "As always, let's grab some dependencies! \n",
+    "\n",
+    "**PLEASE RESTART YOUR ENV AFTER INSTALLING THESE DEPENDENCIES**"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "id": "vAbfmcNjtsPr"
+   },
+   "outputs": [],
+   "source": [
+    "!pip install -q crfm-helm\n",
+    "!pip install -q typing_extensions==4.5.0"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {
+    "id": "HgAjBesZyZJx"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Looking in indexes: https://download.pytorch.org/whl/cu118\n",
+      "Requirement already satisfied: torch in /home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages (1.12.1)\n",
+      "Collecting torch\n",
+      "  Downloading https://download.pytorch.org/whl/cu118/torch-2.0.1%2Bcu118-cp310-cp310-linux_x86_64.whl (2267.3 MB)\n",
+      "\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/2.3 GB\u001b[0m \u001b[31m318.5 MB/s\u001b[0m eta \u001b[36m0:00:03\u001b[0m^C\n",
+      "\u001b[2K     \u001b[91m━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m\u001b[91m╸\u001b[0m\u001b[90m━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/2.3 GB\u001b[0m \u001b[31m302.0 MB/s\u001b[0m eta \u001b[36m0:00:04\u001b[0m\n",
+      "\u001b[?25h\u001b[31mERROR: Operation cancelled by user\u001b[0m\u001b[31m\n",
+      "\u001b[0m"
+     ]
+    }
+   ],
+   "source": [
+    "!pip install -U torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "dckk0SXyweMm"
+   },
+   "source": [
+    "Now, let's dive into a simple evaluation!"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {
+    "id": "3Li97dKSttn-"
+   },
+   "outputs": [],
+   "source": [
+    "!echo 'entries: [{description: \"mmlu:subject=philosophy,model=huggingface/gpt2\", priority: 1}]' > run_specs.conf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "BFK0KChJwlI-",
+    "outputId": "8385bfce-0e7b-409f-c436-0f0de4b02348"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "main {\n",
+      "  Read 1 run entries from run_specs.conf\n",
+      "  1 entries produced 1 run specs\n",
+      "  run_specs {\n",
+      "    RunSpec(name='mmlu:subject=philosophy,method=multiple_choice_joint,model=huggingface_gpt2', scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), adapter_spec=AdapterSpec(method='multiple_choice_joint', global_prefix='', instructions='The following are multiple choice questions (with answers) about philosophy.\\n', input_prefix='Question: ', input_suffix='\\n', reference_prefix='A. ', reference_suffix='\\n', output_prefix='Answer: ', output_suffix='\\n', instance_prefix='\\n', substitutions=[], max_train_instances=5, max_eval_instances=1, num_outputs=5, num_train_trials=1, sample_train=True, model='huggingface/gpt2', temperature=0.0, max_tokens=1, stop_sequences=['\\n'], random=None), metric_specs=[MetricSpec(class_name='helm.benchmark.basic_metrics.BasicMetric', args={'names': ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']})], data_augmenter_spec=DataAugmenterSpec(perturbation_specs=[], should_augment_train_instances=False, should_include_original_train=False, should_skip_unchanged_train=False, should_augment_eval_instances=False, should_include_original_eval=False, should_skip_unchanged_eval=False, seeds_per_instance=1), groups=['mmlu'])\n",
+      "  } [0.0s]\n",
+      "  Running in local mode with base path: prod_env\n",
+      "Looking in path: prod_env\n",
+      "  AutoClient: cache_path = prod_env/cache\n",
+      "  AutoClient: mongo_uri = \n",
+      "  Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
+      "  Found 1 account(s).\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]  Running mmlu:subject=philosophy,method=multiple_choice_joint,model=huggingface_gpt2 {\n",
+      "    scenario.get_instances {\n",
+      "      ensure_file_downloaded {\n",
+      "        Not downloading https://people.eecs.berkeley.edu/~hendrycks/data.tar because benchmark_output/scenarios/mmlu/data already exists\n",
+      "      } [0.0s]\n",
+      "      benchmark_output/scenarios/mmlu/data/auxiliary_train/philosophy_auxiliary_train.csv doesn't exist, skipping\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/dev/philosophy_dev.csv\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/val/philosophy_val.csv\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/test/philosophy_test.csv\n",
+      "    } [0.004s]\n",
+      "    350 instances, 5 train instances, 1/345 eval instances\n",
+      "    DataPreprocessor.preprocess {\n",
+      "    } [0.0s]\n",
+      "    MultipleChoiceJointAdapter.adapt {\n",
+      "      6 instances, choosing 5/5 train instances, 1 eval instances\n",
+      "      Adapting with train_trial_index=0 {\n",
+      "        Sampled 5 examples for trial #0.\n",
+      "        Parallelizing computation on 1 items over 4 threads {\n",
+      "          Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
+      "\n",
+      "          Loading huggingface/gpt2 with Hugging Face Transformers {\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]\u001b[A          } [0.053s]\n",
+      "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 18.02it/s]\n",
+      "        } [0.056s]\n",
+      "        Sample prompts {\n",
+      "          reference index = None, request_mode = None {\n",
+      "            The following are multiple choice questions (with answers) about philosophy.\n",
+      "            \n",
+      "            Question: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n",
+      "            A. metaphysics\n",
+      "            B. epistemology\n",
+      "            C. quantum physics\n",
+      "            D. axiology\n",
+      "            Answer: A\n",
+      "            \n",
+      "            Question: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\n",
+      "            A. pleasure.\n",
+      "            B. happiness.\n",
+      "            C. good.\n",
+      "            D. virtue.\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: Psychological egoism is:\n",
+      "            A. an ethical theory about how we ought to behave.\n",
+      "            B. a generalization concerning the way people tend to behave.\n",
+      "            C. a claim about human nature and the ways people are capable of behaving.\n",
+      "            D. none of the above.\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n",
+      "            A. optimist\n",
+      "            B. satisfied\n",
+      "            C. nominally religious\n",
+      "            D. pessimist\n",
+      "            Answer: D\n",
+      "            \n",
+      "            Question: According to d'Holbach, people always act according to _____.\n",
+      "            A. free choices\n",
+      "            B. dictates of the soul\n",
+      "            C. necessary natural laws\n",
+      "            D. undetermined will\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: What does the notion of “meaning in life” refer to?\n",
+      "            A. external meaning\n",
+      "            B. god's plan\n",
+      "            C. internalmeaning\n",
+      "            D. meaning in an afterlife\n",
+      "            Answer:\n",
+      "          } [0.0s]\n",
+      "        } [0.0s]\n",
+      "      } [0.056s]\n",
+      "      1 requests\n",
+      "    } [0.056s]\n",
+      "    Executor.execute {\n",
+      "      Parallelizing computation on 1 items over 4 threads {\n",
+      "        Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
+      "\n",
+      "        CUDA is available, initializing with a GPU...\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]        Loading Hugging Face model for config gpt2 {\n",
+      "\u001b[A/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/torch/cuda/__init__.py:146: UserWarning: \n",
+      "NVIDIA A10G with CUDA capability sm_86 is not compatible with the current PyTorch installation.\n",
+      "The current PyTorch install supports CUDA capabilities sm_37 sm_50 sm_60 sm_70.\n",
+      "If you want to use the NVIDIA A10G GPU with PyTorch, please check the instructions at https://pytorch.org/get-started/locally/\n",
+      "\n",
+      "  warnings.warn(incompatible_device_warn.format(device_name, capability, \" \".join(arch_list), device_name))\n",
+      "        } [1.554s]\n",
+      "        Loading Hugging Face tokenizer model for config gpt2 {\n",
+      "        } [0.128s]\n",
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
+      "        HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
+      "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
+      "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
+      "        Request failed. Retrying (attempt #2) in 1 seconds... (See above for error details)\n",
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
+      "        HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
+      "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
+      "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
+      "        Request failed. Retrying (attempt #3) in 11 seconds... (See above for error details)\n",
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
+      "        HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
+      "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
+      "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
+      "        Request failed. Retrying (attempt #4) in 31 seconds... (See above for error details)\n",
+      "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
+      "        HuggingFace error: CUDA error: no kernel image is available for execution on the device\n",
+      "CUDA kernel errors might be asynchronously reported at some other API call,so the stacktrace below might be incorrect.\n",
+      "For debugging consider passing CUDA_LAUNCH_BLOCKING=1.\n",
+      "        Request failed. Retrying (attempt #5) in 71 seconds... (See above for error details)\n"
+     ]
+    }
+   ],
+   "source": [
+    "!helm-run --conf-paths run_specs.conf --local --max-eval-instances 1 --suite v1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "colab": {
+     "base_uri": "https://localhost:8080/"
+    },
+    "id": "2nMIkB4Sz-4W",
+    "outputId": "e0821e91-4af2-42e4-8b6e-13ec560841e5"
+   },
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "main {\n",
+      "  Reading schema from schema.yaml...\n",
+      "  Reading contamination information from contamination.yaml...\n",
+      "  validate_contamination {\n",
+      "  } [0.0s]\n",
+      "  0%|                                                     | 0/7 [00:00<?, ?it/s]  WARNING: costs.json doesn't have run_spec.json or stats.json, skipping\n",
+      "  WARNING: groups.json doesn't have run_spec.json or stats.json, skipping\n",
+      "  WARNING: groups_metadata.json doesn't have run_spec.json or stats.json, skipping\n",
+      "  WARNING: mmlu:subject=philosophy,method=multiple_choice_joint,model=huggingface_gpt2 doesn't have run_spec.json or stats.json, skipping\n",
+      "  WARNING: run_specs.json doesn't have run_spec.json or stats.json, skipping\n",
+      "  WARNING: runs.json doesn't have run_spec.json or stats.json, skipping\n",
+      "  WARNING: summary.json doesn't have run_spec.json or stats.json, skipping\n",
+      "100%|██████████████████████████████████████████| 7/7 [00:00<00:00, 36024.70it/s]\n",
+      "  Summarizer.check_metrics_defined {\n",
+      "  } [0.0s]\n",
+      "  Summarizer.write_executive_summary {\n",
+      "    Writing 43 characters to benchmark_output/runs/v1/summary.json\n",
+      "  } [0.0s]\n",
+      "  Writing 2 characters to benchmark_output/runs/v1/runs.json\n",
+      "  Writing 2 characters to benchmark_output/runs/v1/run_specs.json\n",
+      "  Writing 5062 characters to benchmark_output/runs/v1/groups.json\n",
+      "  Writing 29556 characters to benchmark_output/runs/v1/groups_metadata.json\n",
+      "  Summarizer.write_cost_report {\n",
+      "    Writing 2 characters to benchmark_output/runs/v1/costs.json\n",
+      "  } [0.0s]\n",
+      "  Parallelizing computation on 0 items over 8 threads {\n",
+      "0it [00:00, ?it/s]\n",
+      "  } [0.0s]\n",
+      "  Symlinking benchmark_output/runs/v1 to latest.\n",
+      "  Done.\n",
+      "} [0.36s]\n"
+     ]
+    }
+   ],
+   "source": [
+    "!helm-summarize --suite v1"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "-vu7kWwK5aMm"
+   },
+   "source": [
+    "You can now check out the results in the `benchmark_output` folder!"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {
+    "id": "23kU_nPO5pRi"
+   },
+   "source": [
+    "### Assignment Part 3:\n",
+    "\n",
+    "Try this process out on any Hugging Face model (maybe `BLOOMz`, as an example) and use an alternate suite or metric to examine!\n",
+    "\n",
+    "Refer to the [docs](https://crfm-helm.readthedocs.io/en/latest/) for a comprehensive overview of the available options!\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {
+    "id": "a4C7shhp6Q8u"
+   },
+   "outputs": [],
+   "source": [
+    "### YOUR CODE HERE"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "!echo 'entries: [{description: \"mmlu:subject=philosophy,model=gooseai/gpt-j-6b\", priority: 1}]' > run_specs.conf"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "main {\n",
+      "  Read 1 run entries from run_specs.conf\n",
+      "  1 entries produced 1 run specs\n",
+      "  run_specs {\n",
+      "    RunSpec(name='mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b', scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), adapter_spec=AdapterSpec(method='multiple_choice_joint', global_prefix='', instructions='The following are multiple choice questions (with answers) about philosophy.\\n', input_prefix='Question: ', input_suffix='\\n', reference_prefix='A. ', reference_suffix='\\n', output_prefix='Answer: ', output_suffix='\\n', instance_prefix='\\n', substitutions=[], max_train_instances=5, max_eval_instances=1, num_outputs=5, num_train_trials=1, sample_train=True, model='gooseai/gpt-j-6b', temperature=0.0, max_tokens=1, stop_sequences=['\\n'], random=None), metric_specs=[MetricSpec(class_name='helm.benchmark.basic_metrics.BasicMetric', args={'names': ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']})], data_augmenter_spec=DataAugmenterSpec(perturbation_specs=[], should_augment_train_instances=False, should_include_original_train=False, should_skip_unchanged_train=False, should_augment_eval_instances=False, should_include_original_eval=False, should_skip_unchanged_eval=False, seeds_per_instance=1), groups=['mmlu'])\n",
+      "  } [0.0s]\n",
+      "  Running in local mode with base path: prod_env\n",
+      "Looking in path: prod_env\n",
+      "  AutoClient: cache_path = prod_env/cache\n",
+      "  AutoClient: mongo_uri = \n",
+      "  Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
+      "  Found 1 account(s).\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]  Running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b {\n",
+      "    scenario.get_instances {\n",
+      "      ensure_file_downloaded {\n",
+      "        Not downloading https://people.eecs.berkeley.edu/~hendrycks/data.tar because benchmark_output/scenarios/mmlu/data already exists\n",
+      "      } [0.0s]\n",
+      "      benchmark_output/scenarios/mmlu/data/auxiliary_train/philosophy_auxiliary_train.csv doesn't exist, skipping\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/dev/philosophy_dev.csv\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/val/philosophy_val.csv\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/test/philosophy_test.csv\n",
+      "    } [0.004s]\n",
+      "    350 instances, 5 train instances, 1/345 eval instances\n",
+      "    DataPreprocessor.preprocess {\n",
+      "    } [0.0s]\n",
+      "    MultipleChoiceJointAdapter.adapt {\n",
+      "      6 instances, choosing 5/5 train instances, 1 eval instances\n",
+      "      Adapting with train_trial_index=0 {\n",
+      "        Sampled 5 examples for trial #0.\n",
+      "        Parallelizing computation on 1 items over 4 threads {\n",
+      "          Created cache with config: SqliteCacheConfig(path='prod_env/cache/EleutherAI.sqlite')\n",
+      "\n",
+      "          Loading EleutherAI/gpt-j-6B with Hugging Face Transformers {\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]\u001b[A            Local files do not exist for HuggingFace tokenizer: EleutherAI/gpt-j-6B. Downloading...\n",
+      "\n",
+      "\n",
+      "Downloading (…)okenizer_config.json: 100%|█████| 619/619 [00:00<00:00, 7.77MB/s]\u001b[A\u001b[A\n",
+      "\n",
+      "\n",
+      "Downloading (…)olve/main/vocab.json: 100%|███| 798k/798k [00:00<00:00, 48.5MB/s]\u001b[A\u001b[A\n",
+      "\n",
+      "\n",
+      "Downloading (…)olve/main/merges.txt: 100%|███| 456k/456k [00:00<00:00, 84.1MB/s]\u001b[A\u001b[A\n",
+      "\n",
+      "\n",
+      "Downloading (…)/main/tokenizer.json: 100%|██| 1.37M/1.37M [00:00<00:00, 114MB/s]\u001b[A\u001b[A\n",
+      "\n",
+      "\n",
+      "Downloading (…)in/added_tokens.json: 100%|█| 4.04k/4.04k [00:00<00:00, 58.4MB/s]\u001b[A\u001b[A\n",
+      "\n",
+      "\n",
+      "Downloading (…)cial_tokens_map.json: 100%|█████| 357/357 [00:00<00:00, 5.15MB/s]\u001b[A\u001b[A\n",
+      "          } [0.804s]\n",
+      "\n",
+      "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.24it/s]\u001b[A\n",
+      "        } [0.809s]\n",
+      "        Sample prompts {\n",
+      "          reference index = None, request_mode = None {\n",
+      "            The following are multiple choice questions (with answers) about philosophy.\n",
+      "            \n",
+      "            Question: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n",
+      "            A. metaphysics\n",
+      "            B. epistemology\n",
+      "            C. quantum physics\n",
+      "            D. axiology\n",
+      "            Answer: A\n",
+      "            \n",
+      "            Question: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\n",
+      "            A. pleasure.\n",
+      "            B. happiness.\n",
+      "            C. good.\n",
+      "            D. virtue.\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: Psychological egoism is:\n",
+      "            A. an ethical theory about how we ought to behave.\n",
+      "            B. a generalization concerning the way people tend to behave.\n",
+      "            C. a claim about human nature and the ways people are capable of behaving.\n",
+      "            D. none of the above.\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n",
+      "            A. optimist\n",
+      "            B. satisfied\n",
+      "            C. nominally religious\n",
+      "            D. pessimist\n",
+      "            Answer: D\n",
+      "            \n",
+      "            Question: According to d'Holbach, people always act according to _____.\n",
+      "            A. free choices\n",
+      "            B. dictates of the soul\n",
+      "            C. necessary natural laws\n",
+      "            D. undetermined will\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: What does the notion of “meaning in life” refer to?\n",
+      "            A. external meaning\n",
+      "            B. god's plan\n",
+      "            C. internalmeaning\n",
+      "            D. meaning in an afterlife\n",
+      "            Answer:\n",
+      "          } [0.0s]\n",
+      "        } [0.0s]\n",
+      "      } [0.81s]\n",
+      "      1 requests\n",
+      "    } [0.81s]\n",
+      "    Executor.execute {\n",
+      "      Parallelizing computation on 1 items over 4 threads {\n",
+      "\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]\u001b[A\n",
+      "      } [0.007s]\n",
+      "    } [0.008s]\n",
+      "  } [0.825s]\n",
+      "  Error when running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 84, in process\n",
+      "    result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/services/server_service.py\", line 96, in make_request\n",
+      "    request_result: RequestResult = self.client.make_request(request)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 172, in make_request\n",
+      "    client: Client = self._get_client(request.model)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 108, in _get_client\n",
+      "    api_key=self.credentials[\"gooseaiApiKey\"], cache_config=cache_config, org_id=org_id\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 393, in __getitem__\n",
+      "    val = self.get(item)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 236, in get\n",
+      "    return self._get(ConfigTree.parse_key(key), 0, default)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 176, in _get\n",
+      "    raise ConfigMissingException(\n",
+      "pyhocon.exceptions.ConfigMissingException: 'No configuration setting found for key gooseaiApiKey'\n",
+      "\n",
+      "The above exception was the direct cause of the following exception:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 140, in run_all\n",
+      "    self.run_one(run_spec)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 214, in run_one\n",
+      "    scenario_state = self.executor.execute(scenario_state)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
+      "    return fn(*args, **kwargs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 73, in execute\n",
+      "    request_states = parallel_map(\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/general.py\", line 227, in parallel_map\n",
+      "    results = list(tqdm(executor.map(process, items), total=len(items), disable=None))\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tqdm/std.py\", line 1195, in __iter__\n",
+      "    for obj in iterable:\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 621, in result_iterator\n",
+      "    yield _result_or_cancel(fs.pop())\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 319, in _result_or_cancel\n",
+      "    return fut.result(timeout)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 458, in result\n",
+      "    return self.__get_result()\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 403, in __get_result\n",
+      "    raise self._exception\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/thread.py\", line 58, in run\n",
+      "    result = self.fn(*self.args, **self.kwargs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 86, in process\n",
+      "    raise ExecutorError(f\"{str(e)} Request: {state.request}\") from e\n",
+      "helm.benchmark.executor.ExecutorError: 'No configuration setting found for key gooseaiApiKey' Request: Request(model='gooseai/gpt-j-6b', embedding=False, prompt=\"The following are multiple choice questions (with answers) about philosophy.\\n\\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\\nA. metaphysics\\nB. epistemology\\nC. quantum physics\\nD. axiology\\nAnswer: A\\n\\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\\nA. pleasure.\\nB. happiness.\\nC. good.\\nD. virtue.\\nAnswer: C\\n\\nQuestion: Psychological egoism is:\\nA. an ethical theory about how we ought to behave.\\nB. a generalization concerning the way people tend to behave.\\nC. a claim about human nature and the ways people are capable of behaving.\\nD. none of the above.\\nAnswer: C\\n\\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\\nA. optimist\\nB. satisfied\\nC. nominally religious\\nD. pessimist\\nAnswer: D\\n\\nQuestion: According to d'Holbach, people always act according to _____.\\nA. free choices\\nB. dictates of the soul\\nC. necessary natural laws\\nD. undetermined will\\nAnswer: C\\n\\nQuestion: What does the notion of “meaning in life” refer to?\\nA. external meaning\\nB. god's plan\\nC. internalmeaning\\nD. meaning in an afterlife\\nAnswer:\", temperature=0.0, num_completions=1, top_k_per_token=5, max_tokens=1, stop_sequences=[], echo_prompt=False, top_p=1, presence_penalty=0, frequency_penalty=0, random=None, messages=None)\n",
+      "\n",
+      "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  1.21it/s]\n",
+      "} [0.843s]\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/bin/helm-run\", line 8, in <module>\n",
+      "    sys.exit(main())\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
+      "    return fn(*args, **kwargs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 289, in main\n",
+      "    run_benchmarking(\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 107, in run_benchmarking\n",
+      "    runner.run_all(run_specs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 149, in run_all\n",
+      "    raise RunnerError(f\"Failed runs: [{failed_runs_str}]\")\n",
+      "helm.benchmark.runner.RunnerError: Failed runs: [\"mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b\"]\n"
+     ]
+    }
+   ],
+   "source": [
+    "!helm-run --conf-paths run_specs.conf --local --max-eval-instances 1 --suite v1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 7,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "main {\n",
+      "  Read 1 run entries from run_specs.conf\n",
+      "  1 entries produced 1 run specs\n",
+      "  run_specs {\n",
+      "    RunSpec(name='mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b', scenario_spec=ScenarioSpec(class_name='helm.benchmark.scenarios.mmlu_scenario.MMLUScenario', args={'subject': 'philosophy'}), adapter_spec=AdapterSpec(method='multiple_choice_joint', global_prefix='', instructions='The following are multiple choice questions (with answers) about philosophy.\\n', input_prefix='Question: ', input_suffix='\\n', reference_prefix='A. ', reference_suffix='\\n', output_prefix='Answer: ', output_suffix='\\n', instance_prefix='\\n', substitutions=[], max_train_instances=5, max_eval_instances=1, num_outputs=5, num_train_trials=1, sample_train=True, model='gooseai/gpt-j-6b', temperature=0.0, max_tokens=1, stop_sequences=['\\n'], random=None), metric_specs=[MetricSpec(class_name='helm.benchmark.basic_metrics.BasicMetric', args={'names': ['exact_match', 'quasi_exact_match', 'prefix_exact_match', 'quasi_prefix_exact_match']})], data_augmenter_spec=DataAugmenterSpec(perturbation_specs=[], should_augment_train_instances=False, should_include_original_train=False, should_skip_unchanged_train=False, should_augment_eval_instances=False, should_include_original_eval=False, should_skip_unchanged_eval=False, seeds_per_instance=1), groups=['mmlu'])\n",
+      "  } [0.0s]\n",
+      "  Running in local mode with base path: prod_env\n",
+      "Looking in path: prod_env\n",
+      "  AutoClient: cache_path = prod_env/cache\n",
+      "  AutoClient: mongo_uri = \n",
+      "  Created cache with config: SqliteCacheConfig(path='prod_env/cache/huggingface.sqlite')\n",
+      "  Found 1 account(s).\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]  Running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b {\n",
+      "    scenario.get_instances {\n",
+      "      ensure_file_downloaded {\n",
+      "        Not downloading https://people.eecs.berkeley.edu/~hendrycks/data.tar because benchmark_output/scenarios/mmlu/data already exists\n",
+      "      } [0.0s]\n",
+      "      benchmark_output/scenarios/mmlu/data/auxiliary_train/philosophy_auxiliary_train.csv doesn't exist, skipping\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/dev/philosophy_dev.csv\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/val/philosophy_val.csv\n",
+      "      Reading benchmark_output/scenarios/mmlu/data/test/philosophy_test.csv\n",
+      "    } [0.004s]\n",
+      "    350 instances, 5 train instances, 1/345 eval instances\n",
+      "    DataPreprocessor.preprocess {\n",
+      "    } [0.0s]\n",
+      "    MultipleChoiceJointAdapter.adapt {\n",
+      "      6 instances, choosing 5/5 train instances, 1 eval instances\n",
+      "      Adapting with train_trial_index=0 {\n",
+      "        Sampled 5 examples for trial #0.\n",
+      "        Parallelizing computation on 1 items over 4 threads {\n",
+      "          Created cache with config: SqliteCacheConfig(path='prod_env/cache/EleutherAI.sqlite')\n",
+      "\n",
+      "          Loading EleutherAI/gpt-j-6B with Hugging Face Transformers {\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]\u001b[A          } [0.095s]\n",
+      "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00, 10.32it/s]\n",
+      "        } [0.097s]\n",
+      "        Sample prompts {\n",
+      "          reference index = None, request_mode = None {\n",
+      "            The following are multiple choice questions (with answers) about philosophy.\n",
+      "            \n",
+      "            Question: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\n",
+      "            A. metaphysics\n",
+      "            B. epistemology\n",
+      "            C. quantum physics\n",
+      "            D. axiology\n",
+      "            Answer: A\n",
+      "            \n",
+      "            Question: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\n",
+      "            A. pleasure.\n",
+      "            B. happiness.\n",
+      "            C. good.\n",
+      "            D. virtue.\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: Psychological egoism is:\n",
+      "            A. an ethical theory about how we ought to behave.\n",
+      "            B. a generalization concerning the way people tend to behave.\n",
+      "            C. a claim about human nature and the ways people are capable of behaving.\n",
+      "            D. none of the above.\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\n",
+      "            A. optimist\n",
+      "            B. satisfied\n",
+      "            C. nominally religious\n",
+      "            D. pessimist\n",
+      "            Answer: D\n",
+      "            \n",
+      "            Question: According to d'Holbach, people always act according to _____.\n",
+      "            A. free choices\n",
+      "            B. dictates of the soul\n",
+      "            C. necessary natural laws\n",
+      "            D. undetermined will\n",
+      "            Answer: C\n",
+      "            \n",
+      "            Question: What does the notion of “meaning in life” refer to?\n",
+      "            A. external meaning\n",
+      "            B. god's plan\n",
+      "            C. internalmeaning\n",
+      "            D. meaning in an afterlife\n",
+      "            Answer:\n",
+      "          } [0.0s]\n",
+      "        } [0.0s]\n",
+      "      } [0.098s]\n",
+      "      1 requests\n",
+      "    } [0.098s]\n",
+      "    Executor.execute {\n",
+      "      Parallelizing computation on 1 items over 4 threads {\n",
+      "\n",
+      "  0%|                                                     | 0/1 [00:00<?, ?it/s]\u001b[A\n",
+      "      } [0.008s]\n",
+      "    } [0.008s]\n",
+      "  } [0.113s]\n",
+      "  Error when running mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b:\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 84, in process\n",
+      "    result: RequestResult = self.service.make_request(self.execution_spec.auth, state.request)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/services/server_service.py\", line 96, in make_request\n",
+      "    request_result: RequestResult = self.client.make_request(request)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 172, in make_request\n",
+      "    client: Client = self._get_client(request.model)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/proxy/clients/auto_client.py\", line 108, in _get_client\n",
+      "    api_key=self.credentials[\"gooseaiApiKey\"], cache_config=cache_config, org_id=org_id\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 393, in __getitem__\n",
+      "    val = self.get(item)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 236, in get\n",
+      "    return self._get(ConfigTree.parse_key(key), 0, default)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/pyhocon/config_tree.py\", line 176, in _get\n",
+      "    raise ConfigMissingException(\n",
+      "pyhocon.exceptions.ConfigMissingException: 'No configuration setting found for key gooseaiApiKey'\n",
+      "\n",
+      "The above exception was the direct cause of the following exception:\n",
+      "\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 140, in run_all\n",
+      "    self.run_one(run_spec)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 214, in run_one\n",
+      "    scenario_state = self.executor.execute(scenario_state)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
+      "    return fn(*args, **kwargs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 73, in execute\n",
+      "    request_states = parallel_map(\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/general.py\", line 227, in parallel_map\n",
+      "    results = list(tqdm(executor.map(process, items), total=len(items), disable=None))\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/tqdm/std.py\", line 1195, in __iter__\n",
+      "    for obj in iterable:\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 621, in result_iterator\n",
+      "    yield _result_or_cancel(fs.pop())\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 319, in _result_or_cancel\n",
+      "    return fut.result(timeout)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 458, in result\n",
+      "    return self.__get_result()\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/_base.py\", line 403, in __get_result\n",
+      "    raise self._exception\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/concurrent/futures/thread.py\", line 58, in run\n",
+      "    result = self.fn(*self.args, **self.kwargs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/executor.py\", line 86, in process\n",
+      "    raise ExecutorError(f\"{str(e)} Request: {state.request}\") from e\n",
+      "helm.benchmark.executor.ExecutorError: 'No configuration setting found for key gooseaiApiKey' Request: Request(model='gooseai/gpt-j-6b', embedding=False, prompt=\"The following are multiple choice questions (with answers) about philosophy.\\n\\nQuestion: The study of reality in the broadest sense, an inquiry into the elemental nature of the universe and the things in it, is known as _____.\\nA. metaphysics\\nB. epistemology\\nC. quantum physics\\nD. axiology\\nAnswer: A\\n\\nQuestion: According to Moore’s “ideal utilitarianism,” the right action is the one that brings about the greatest amount of:\\nA. pleasure.\\nB. happiness.\\nC. good.\\nD. virtue.\\nAnswer: C\\n\\nQuestion: Psychological egoism is:\\nA. an ethical theory about how we ought to behave.\\nB. a generalization concerning the way people tend to behave.\\nC. a claim about human nature and the ways people are capable of behaving.\\nD. none of the above.\\nAnswer: C\\n\\nQuestion: Before Tolstoy's Christian conversion, what was his perspective on the meaning of life?\\nA. optimist\\nB. satisfied\\nC. nominally religious\\nD. pessimist\\nAnswer: D\\n\\nQuestion: According to d'Holbach, people always act according to _____.\\nA. free choices\\nB. dictates of the soul\\nC. necessary natural laws\\nD. undetermined will\\nAnswer: C\\n\\nQuestion: What does the notion of “meaning in life” refer to?\\nA. external meaning\\nB. god's plan\\nC. internalmeaning\\nD. meaning in an afterlife\\nAnswer:\", temperature=0.0, num_completions=1, top_k_per_token=5, max_tokens=1, stop_sequences=[], echo_prompt=False, top_p=1, presence_penalty=0, frequency_penalty=0, random=None, messages=None)\n",
+      "\n",
+      "100%|█████████████████████████████████████████████| 1/1 [00:00<00:00,  8.74it/s]\n",
+      "} [0.131s]\n",
+      "Traceback (most recent call last):\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/bin/helm-run\", line 8, in <module>\n",
+      "    sys.exit(main())\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/common/hierarchical_logger.py\", line 104, in wrapper\n",
+      "    return fn(*args, **kwargs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 289, in main\n",
+      "    run_benchmarking(\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/run.py\", line 107, in run_benchmarking\n",
+      "    runner.run_all(run_specs)\n",
+      "  File \"/home/ec2-user/anaconda3/envs/pytorch_p310/lib/python3.10/site-packages/helm/benchmark/runner.py\", line 149, in run_all\n",
+      "    raise RunnerError(f\"Failed runs: [{failed_runs_str}]\")\n",
+      "helm.benchmark.runner.RunnerError: Failed runs: [\"mmlu:subject=philosophy,method=multiple_choice_joint,model=gooseai_gpt-j-6b\"]\n"
+     ]
+    }
+   ],
+   "source": [
+    "!helm-run --conf-paths run_specs.conf --local --max-eval-instances 1 --suite v1"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "accelerator": "GPU",
+  "colab": {
+   "gpuType": "A100",
+   "machine_shape": "hm",
+   "provenance": []
+  },
+  "kernelspec": {
+   "display_name": "conda_pytorch_p310",
+   "language": "python",
+   "name": "conda_pytorch_p310"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.10.12"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}