Buckets:

hf-doc-build
/

doc-dev

Files

xet

hf-doc-build/doc-dev / sagemaker /pr_1995 /en /examples /sagemaker-sdk-evaluate-llm-lighteval.html

rtrm

about 1 month ago

download

raw

34 kB

	<meta charset="utf-8"><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Evaluate LLMs with Hugging Face Lighteval on Amazon SageMaker&quot;,&quot;local&quot;:&quot;evaluate-llms-with-hugging-face-lighteval-on-amazon-sagemaker&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;2. Prepare the evaluation configuration&quot;,&quot;local&quot;:&quot;2-prepare-the-evaluation-configuraiton&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;3. Evaluate Zephyr 7B on TruthfulQA on Amazon SageMaker&quot;,&quot;local&quot;:&quot;3-evaluate-zephyr-7b-on-truthfulqa-on-amazon-sagemaker&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
	<!-- The metadata JSON above is entity-escaped so its quotes do not terminate the content attribute. The "local" slug for section 2 keeps its legacy spelling because it must match the existing heading anchor id. -->
	<!-- The CSS asset is loaded as a stylesheet; modulepreload is only valid for JavaScript modules and would leave the page unstyled. -->
	<link href="/docs/sagemaker/pr_1995/en/_app/immutable/assets/0.e3b0c442.css" rel="stylesheet">
	<!-- Preload the JavaScript modules that the inline bootstrap script and its page nodes import. -->
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/entry/start.c99dd462.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/scheduler.aec39e6a.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/singletons.3bc577c3.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/paths.12b7f279.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/entry/app.cd901cb4.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/preload-helper.73d9078d.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/index.4ee0a2d0.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/nodes/0.381271ed.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/nodes/6.239d537a.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/Tip.e2132029.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.8184f5a3.js">
	<link rel="modulepreload" href="/docs/sagemaker/pr_1995/en/_app/immutable/chunks/CodeBlock.db6247f1.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Evaluate LLMs with Hugging Face Lighteval on Amazon SageMaker","local":"evaluate-llms-with-hugging-face-lighteval-on-amazon-sagemaker","sections":[{"title":"2. Prepare the evaluation configuraiton","local":"2-prepare-the-evaluation-configuraiton","sections":[],"depth":2},{"title":"3. Evaluate Zephyr 7B on TruthfulQA on Amazon SageMaker","local":"3-evaluate-zephyr-7b-on-truthfulqa-on-amazon-sagemaker","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 max-sm:gap-0.5 h-6 max-sm:h-5 px-2 max-sm:px-1.5 text-[11px] max-sm:text-[9px] font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0"><svg class="w-3 h-3 max-sm:w-2.5 max-sm:h-2.5" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-6 max-sm:h-5 disabled:pointer-events-none text-sm 
text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible w-3 h-3 max-sm:w-2.5 max-sm:h-2.5 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="evaluate-llms-with-hugging-face-lighteval-on-amazon-sagemaker" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluate-llms-with-hugging-face-lighteval-on-amazon-sagemaker"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluate LLMs with Hugging Face Lighteval on Amazon SageMaker</span></h1> <p data-svelte-h="svelte-1dytuw0">In this sagemaker example, we are going to learn how to evaluate LLMs using Hugging Face <a href="https://github.com/huggingface/lighteval/tree/main" rel="nofollow">lighteval</a>. 
LightEval is a lightweight LLM evaluation suite that powers the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" rel="nofollow">Hugging Face Open LLM Leaderboard</a>.</p> <p data-svelte-h="svelte-1tkejc9">Evaluating LLMs is crucial for understanding their capabilities and limitations, yet it poses significant challenges due to their complex and opaque nature. LightEval facilitates this evaluation process by enabling LLMs to be assessed on academic benchmarks like MMLU or IFEval, providing a structured approach to gauge their performance across diverse tasks.</p> <p data-svelte-h="svelte-1xbim1c">In detail, you will learn how to:</p> <ol data-svelte-h="svelte-iu956w"><li>Setup Development Environment</li> <li>Prepare the evaluation configuration</li> <li>Evaluate Zephyr 7B on TruthfulQA on Amazon SageMaker</li></ol> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> 
Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install sagemaker --upgrade --quiet<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-s0u97w">If you are going to use Sagemaker in a local environment. You need access to an IAM Role with the required permissions for Sagemaker. You can find <a href="https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html" rel="nofollow">here</a> more about it.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> sagemaker
	<span class="hljs-keyword">import</span> boto3
	sess = sagemaker.Session()
	<span class="hljs-comment"># sagemaker session bucket -> used for uploading data, models and logs</span>
	<span class="hljs-comment"># sagemaker will automatically create this bucket if it does not exist</span>
	sagemaker_session_bucket=<span class="hljs-literal">None</span>
	<span class="hljs-keyword">if</span> sagemaker_session_bucket <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">and</span> sess <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
	<span class="hljs-comment"># set to default bucket if a bucket name is not given</span>
	sagemaker_session_bucket = sess.default_bucket()

	<span class="hljs-keyword">try</span>:
	role = sagemaker.get_execution_role()
	<span class="hljs-keyword">except</span> ValueError:
	iam = boto3.client(<span class="hljs-string">'iam'</span>)
	role = iam.get_role(RoleName=<span class="hljs-string">'sagemaker_execution_role'</span>)[<span class="hljs-string">'Role'</span>][<span class="hljs-string">'Arn'</span>]

	sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"sagemaker role arn: <span class="hljs-subst">{role}</span>"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"sagemaker bucket: <span class="hljs-subst">{sess.default_bucket()}</span>"</span>)
	<span class="hljs-built_in">print</span>(<span class="hljs-string">f"sagemaker session region: <span class="hljs-subst">{sess.boto_region_name}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="2-prepare-the-evaluation-configuraiton" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#2-prepare-the-evaluation-configuraiton"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>2. Prepare the evaluation configuration</span></h2> <p data-svelte-h="svelte-14s330d"><a href="https://github.com/huggingface/lighteval/tree/main" rel="nofollow">LightEval</a> includes scripts to evaluate LLMs on common benchmarks like MMLU, TruthfulQA, IFEval, and more. It is used to evaluate models on the <a href="https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard" rel="nofollow">Hugging Face Open LLM Leaderboard</a>. 
Lighteval is built on top of the great <a href="https://github.com/EleutherAI/lm-evaluation-harness" rel="nofollow">Eleuther AI Harness</a> with some additional features and improvements.</p> <p data-svelte-h="svelte-1yoz672">You can find all available benchmarks <a href="https://github.com/huggingface/lighteval/blob/main/examples/tasks/all_tasks.txt" rel="nofollow">here</a>.</p> <p data-svelte-h="svelte-k8g2se">We are going to use Amazon SageMaker Managed Training to evaluate the model. Therefore, we will leverage the script available in <a href="https://github.com/huggingface/lighteval/blob/main/run_evals_accelerate.py" rel="nofollow">lighteval</a>. The Hugging Face DLC does not have lighteval installed. This means we need to provide a <code>requirements.txt</code> file to install the required dependencies.</p> <p data-svelte-h="svelte-i6nszn">First, let's load the <code>run_evals_accelerate.py</code> script and create a <code>requirements.txt</code> file with the required dependencies.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 
transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> os
	<span class="hljs-keyword">import</span> requests <span class="hljs-keyword">as</span> r

	lighteval_version = <span class="hljs-string">"0.2.0"</span>

	<span class="hljs-comment"># create scripts directory if not exists</span>
	os.makedirs(<span class="hljs-string">"scripts"</span>, exist_ok=<span class="hljs-literal">True</span>)

	<span class="hljs-comment"># load custom scripts from git</span>
	raw_github_url = <span class="hljs-string">f"https://raw.githubusercontent.com/huggingface/lighteval/v<span class="hljs-subst">{lighteval_version}</span>/run_evals_accelerate.py"</span>
	res = r.get(raw_github_url)
	<span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">"scripts/run_evals_accelerate.py"</span>, <span class="hljs-string">"w"</span>) <span class="hljs-keyword">as</span> f:
	f.write(res.text)

	<span class="hljs-comment"># write requirements.txt </span>
	<span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">"scripts/requirements.txt"</span>, <span class="hljs-string">"w"</span>) <span class="hljs-keyword">as</span> f:
	f.write(<span class="hljs-string">f"lighteval==<span class="hljs-subst">{lighteval_version}</span>"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1709a7">In lighteval, the evaluation is done by running the <code>run_evals_accelerate.py</code> script. The script takes a <code>task</code> argument which is defined as <code>suite\|task\|num_few_shot\|{0 or 1 to automatically reduce num_few_shot if prompt is too long}</code>. Alternatively, you can also provide a path to a txt file with the tasks you want to evaluate the model on, which we are going to do. This makes it easier for you to extend the evaluation to other benchmarks.</p> <p data-svelte-h="svelte-8hjwst">We are going to evaluate the model on the Truthfulqa benchmark with 0 few-shot examples. <a href="https://paperswithcode.com/dataset/truthfulqa" rel="nofollow">TruthfulQA</a> is a benchmark designed to measure whether a language model generates truthful answers to questions, encompassing 817 questions across 38 categories including health, law, finance, and politics.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 
translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">with</span> <span class="hljs-built_in">open</span>(<span class="hljs-string">"scripts/tasks.txt"</span>, <span class="hljs-string">"w"</span>) <span class="hljs-keyword">as</span> f:
	f.write(<span class="hljs-string">f"lighteval|truthfulqa:mc|0|0"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1kdp07x">To evaluate a model on all the benchmarks of the Open LLM Leaderboard you can copy this <a href="https://github.com/huggingface/lighteval/blob/v0.2.0/tasks_examples/open_llm_leaderboard_tasks.txt" rel="nofollow">file</a>.</p> <h2 class="relative group"><a id="3-evaluate-zephyr-7b-on-truthfulqa-on-amazon-sagemaker" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#3-evaluate-zephyr-7b-on-truthfulqa-on-amazon-sagemaker"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>3. 
Evaluate Zephyr 7B on TruthfulQA on Amazon SageMaker</span></h2> <p data-svelte-h="svelte-1nagj1d">In this example we are going to evaluate the <a href="https://huggingface.co/HuggingFaceH4/zephyr-7b-beta" rel="nofollow">HuggingFaceH4/zephyr-7b-beta</a> on the TruthfulQA benchmark, which is part of the Open LLM Leaderboard.</p> <p data-svelte-h="svelte-1ftyhj8">In addition to the <code>task</code> argument we need to define:</p> <ul data-svelte-h="svelte-njlmgk"><li><code>model_args</code>: Hugging Face Model ID or path, defined as <code>pretrained=HuggingFaceH4/zephyr-7b-beta</code></li> <li><code>model_dtype</code>: The model data type, defined as <code>bfloat16</code>, <code>float16</code> or <code>float32</code></li> <li><code>output_dir</code>: The directory where the evaluation results will be saved, e.g. <code>/opt/ml/model</code></li></ul> <p data-svelte-h="svelte-1h2b2yh">Lighteval can also evaluate PEFT models or use <code>chat_templates</code>; you can find more about it <a href="https://github.com/huggingface/lighteval/blob/v0.2.0/run_evals_accelerate.py" rel="nofollow">here</a>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full 
transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> sagemaker.huggingface <span class="hljs-keyword">import</span> HuggingFace

	<span class="hljs-comment"># hyperparameters, which are passed into the training job</span>
	hyperparameters = {
	<span class="hljs-string">'model_args'</span>: <span class="hljs-string">"pretrained=HuggingFaceH4/zephyr-7b-beta"</span>, <span class="hljs-comment"># Hugging Face Model ID</span>
	<span class="hljs-string">'task'</span>: <span class="hljs-string">'tasks.txt'</span>, <span class="hljs-comment"># 'lighteval|truthfulqa:mc|0|0', </span>
	<span class="hljs-string">'model_dtype'</span>: <span class="hljs-string">'bfloat16'</span>, <span class="hljs-comment"># Torch dtype to load model weights</span>
	<span class="hljs-string">'output_dir'</span>: <span class="hljs-string">'/opt/ml/model'</span> <span class="hljs-comment"># Directory, which sagemaker uploads to s3 after training</span>
	}

	<span class="hljs-comment"># create the Estimator</span>
	huggingface_estimator = HuggingFace(
	entry_point = <span class="hljs-string">'run_evals_accelerate.py'</span>, <span class="hljs-comment"># train script</span>
	source_dir = <span class="hljs-string">'scripts'</span>, <span class="hljs-comment"># directory which includes all the files needed for training</span>
	instance_type = <span class="hljs-string">'ml.g5.4xlarge'</span>, <span class="hljs-comment"># instances type used for the training job</span>
	instance_count = <span class="hljs-number">1</span>, <span class="hljs-comment"># the number of instances used for training</span>
	base_job_name = <span class="hljs-string">"lighteval"</span>, <span class="hljs-comment"># the name of the training job</span>
	role = role, <span class="hljs-comment"># IAM role used in the training job to access AWS resources, e.g. S3</span>
	volume_size = <span class="hljs-number">300</span>, <span class="hljs-comment"># the size of the EBS volume in GB</span>
	transformers_version = <span class="hljs-string">'4.36'</span>, <span class="hljs-comment"># the transformers version used in the training job</span>
	pytorch_version = <span class="hljs-string">'2.1'</span>, <span class="hljs-comment"># the pytorch version used in the training job</span>
	py_version = <span class="hljs-string">'py310'</span>, <span class="hljs-comment"># the python version used in the training job</span>
	hyperparameters = hyperparameters,
	environment = {
	<span class="hljs-string">"HUGGINGFACE_HUB_CACHE"</span>: <span class="hljs-string">"/tmp/.cache"</span>,
	<span class="hljs-comment"># "HF_TOKEN": "REPLACE_WITH_YOUR_TOKEN" # needed for private models</span>
	}, <span class="hljs-comment"># set env variable to cache models in /tmp</span>
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1alpixz">We can now start our evaluation job, with the <code>.fit()</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># starting the train job with our uploaded datasets as input</span>
	huggingface_estimator.fit()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1yn6w5d">After the evaluation job is finished, we can download the evaluation results from the S3 bucket. Lighteval will save the results and generations in the <code>output_dir</code>. The results are saved as JSON and include detailed information about each task and the model’s performance. The results are available in the <code>results</code> key.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> tarfile
	<span class="hljs-keyword">import</span> json
	<span class="hljs-keyword">import</span> io
	<span class="hljs-keyword">import</span> os
	<span class="hljs-keyword">from</span> sagemaker.s3 <span class="hljs-keyword">import</span> S3Downloader


	<span class="hljs-comment"># download results from s3</span>
	results_tar = S3Downloader.read_bytes(huggingface_estimator.model_data)
	model_id = hyperparameters[<span class="hljs-string">"model_args"</span>].split(<span class="hljs-string">"="</span>)[<span class="hljs-number">1</span>]
	result={}

	<span class="hljs-comment"># Use tarfile to open the tar content directly from bytes</span>
	<span class="hljs-keyword">with</span> tarfile.<span class="hljs-built_in">open</span>(fileobj=io.BytesIO(results_tar), mode=<span class="hljs-string">"r:gz"</span>) <span class="hljs-keyword">as</span> tar:
	<span class="hljs-comment"># Iterate over items in tar archive to find your json file by its path</span>
	<span class="hljs-keyword">for</span> member <span class="hljs-keyword">in</span> tar.getmembers():
	<span class="hljs-comment"># get path of results based on model id used to evaluate</span>
	<span class="hljs-keyword">if</span> os.path.join(<span class="hljs-string">"details"</span>, model_id) <span class="hljs-keyword">in</span> member.name <span class="hljs-keyword">and</span> member.name.endswith(<span class="hljs-string">'.json'</span>):
	<span class="hljs-comment"># Extract the file content</span>
	f = tar.extractfile(member)
	<span class="hljs-keyword">if</span> f <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>:
	content = f.read()
	result = json.loads(content)
	<span class="hljs-keyword">break</span>

	<span class="hljs-comment"># print results</span>
	<span class="hljs-built_in">print</span>(result[<span class="hljs-string">"results"</span>])
	<span class="hljs-comment"># {'lighteval|truthfulqa:mc|0': {'truthfulqa_mc1': 0.40636474908200737, 'truthfulqa_mc1_stderr': 0.017193835812093897, 'truthfulqa_mc2': 0.5747003398184238, 'truthfulqa_mc2_stderr': 0.015742356478301463}}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1d75i5s">In our test we achieved an <code>mc1</code> score of 40.6% and an <code>mc2</code> score of 57.47%. The <code>mc2</code> is the score used in the Open LLM Leaderboard. Zephyr 7B achieved an <code>mc2</code> score of 57.47% on the TruthfulQA benchmark, which is identical to the score on the Open LLM Leaderboard.
	The evaluation on Truthfulqa took <code>999 seconds</code>. The ml.g5.4xlarge instance we used costs <code>$2.03 per hour</code> for on-demand usage. As a result, the total cost for evaluating Zephyr 7B on Truthfulqa was <code>$0.56</code>.</p> <hr> <blockquote class="tip"><p data-svelte-h="svelte-l9qsnl">📍 Find the complete example on GitHub <a href="https://github.com/huggingface/hub-docs/tree/main/notebooks/sagemaker-sdk/evaluate-llm-lighteval/sagemaker-notebook.ipynb" rel="nofollow">here</a>!</p></blockquote> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/hub-docs/blob/main/docs/sagemaker/source/examples/sagemaker-sdk-evaluate-llm-lighteval.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
	// Inline bootstrap for this page: publishes a small config object on a
	// global, then dynamically imports the "start" and "app" entry modules and
	// calls kit.start() to take over the surrounding element.

	// Assigned without const/let/var, so in this classic (non-module) script it
	// becomes a global property.
	// NOTE(review): presumably the imported entry modules look this exact name
	// up — confirm before renaming or scoping it.
	__sveltekit_16nm56f = {
	// Both values are the same URL prefix used by the import() calls below.
	assets: "/docs/sagemaker/pr_1995/en",
	base: "/docs/sagemaker/pr_1995/en",
	env: {}
	};

	// Read synchronously on purpose: document.currentScript is only set while
	// this script is executing, so it would be null inside the .then() callback.
	const element = document.currentScript.parentElement;

	// NOTE(review): two entries, matching the two node_ids passed to kit.start()
	// below — presumably one data slot per node; confirm.
	const data = [null,null];

	// Load both entry modules in parallel; kit.start() runs only after both resolve.
	Promise.all([
	import("/docs/sagemaker/pr_1995/en/_app/immutable/entry/start.c99dd462.js"),
	import("/docs/sagemaker/pr_1995/en/_app/immutable/entry/app.cd901cb4.js")
	]).then(([kit, app]) => {
	kit.start(app, element, {
	// NOTE(review): likely refers to the nodes/0.*.js and nodes/6.*.js modules
	// preloaded in the head of this file — confirm.
	node_ids: [0, 6],
	data,
	form: null,
	error: null
	});
	});
	}
	</script>

Xet Storage Details

Size:: 34 kB
Xet hash:: b1a6e7e6e2b9e07f78f570c0df3bcb261de8598e68136415a6c8ae93cafdb2f5

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.