Buckets:
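| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;A quick tour&quot;,&quot;local&quot;:&quot;a-quick-tour&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Types of evaluations&quot;,&quot;local&quot;:&quot;types-of-evaluations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Load&quot;,&quot;local&quot;:&quot;load&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Community modules&quot;,&quot;local&quot;:&quot;community-modules&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;List available modules&quot;,&quot;local&quot;:&quot;list-available-modules&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Module attributes&quot;,&quot;local&quot;:&quot;module-attributes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Compute&quot;,&quot;local&quot;:&quot;compute&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;How to compute&quot;,&quot;local&quot;:&quot;how-to-compute&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Calculate a single metric or a batch of metrics&quot;,&quot;local&quot;:&quot;calculate-a-single-metric-or-a-batch-of-metrics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Distributed evaluation&quot;,&quot;local&quot;:&quot;distributed-evaluation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Combining several evaluations&quot;,&quot;local&quot;:&quot;combining-several-evaluations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Save and push to the Hub&quot;,&quot;local&quot;:&quot;save-and-push-to-the-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Evaluator&quot;,&quot;local&quot;:&quot;evaluator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Visualization&quot;,&quot;local&quot;:&quot;visualization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running evaluation on a suite of tasks&quot;,&quot;local&quot;:&quot;running-evaluation-on-a-suite-of-tasks&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"> | |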
| <link href="/docs/evaluate/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/entry/start.138f3e02.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/scheduler.5f3e6389.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/singletons.3420a244.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/paths.65ada1b9.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/entry/app.ad076786.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/preload-helper.0572feea.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/index.62df735e.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/nodes/0.11bd9b89.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/nodes/2.811fa8c3.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e306bd5b.js"> | |
| <link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/CodeBlock.dc1e8be0.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;A quick tour&quot;,&quot;local&quot;:&quot;a-quick-tour&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Types of evaluations&quot;,&quot;local&quot;:&quot;types-of-evaluations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Load&quot;,&quot;local&quot;:&quot;load&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Community modules&quot;,&quot;local&quot;:&quot;community-modules&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;List available modules&quot;,&quot;local&quot;:&quot;list-available-modules&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Module attributes&quot;,&quot;local&quot;:&quot;module-attributes&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Compute&quot;,&quot;local&quot;:&quot;compute&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;How to compute&quot;,&quot;local&quot;:&quot;how-to-compute&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Calculate a single metric or a batch of metrics&quot;,&quot;local&quot;:&quot;calculate-a-single-metric-or-a-batch-of-metrics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Distributed evaluation&quot;,&quot;local&quot;:&quot;distributed-evaluation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Combining several evaluations&quot;,&quot;local&quot;:&quot;combining-several-evaluations&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Save and push to the Hub&quot;,&quot;local&quot;:&quot;save-and-push-to-the-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Evaluator&quot;,&quot;local&quot;:&quot;evaluator&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Visualization&quot;,&quot;local&quot;:&quot;visualization&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Running evaluation on a suite of tasks&quot;,&quot;local&quot;:&quot;running-evaluation-on-a-suite-of-tasks&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 
dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="a-quick-tour" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#a-quick-tour"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 
11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>A quick tour</span></h1> <p data-svelte-h="svelte-1j07n93">🤗 Evaluate provides access to a wide range of evaluation tools. It covers a range of modalities such as text, computer vision, audio, etc. as well as tools to evaluate models or datasets. These tools are split into three categories.</p> <h2 class="relative group"><a id="types-of-evaluations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#types-of-evaluations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Types of evaluations</span></h2> <p data-svelte-h="svelte-1m3k4ru">There are different aspects of a typical machine learning pipeline that can be evaluated and for each aspect 🤗 Evaluate provides a tool:</p> <ul 
data-svelte-h="svelte-dsr3mv"><li><strong>Metric</strong>: A metric is used to evaluate a model’s performance and usually involves the model’s predictions as well as some ground truth labels. You can find all integrated metrics at <a href="https://huggingface.co/evaluate-metric" rel="nofollow">evaluate-metric</a>.</li> <li><strong>Comparison</strong>: A comparison is used to compare two models. This can for example be done by comparing their predictions to ground truth labels and computing their agreement. You can find all integrated comparisons at <a href="https://huggingface.co/evaluate-comparison" rel="nofollow">evaluate-comparison</a>.</li> <li><strong>Measurement</strong>: The dataset is as important as the model trained on it. With measurements one can investigate a dataset’s properties. You can find all integrated measurements at <a href="https://huggingface.co/evaluate-measurement" rel="nofollow">evaluate-measurement</a>.</li></ul> <p data-svelte-h="svelte-10lnpjf">Each of these evaluation modules lives on the Hugging Face Hub as a Space. They come with an interactive widget and a documentation card documenting their use and limitations. 
For example <a href="https://huggingface.co/spaces/evaluate-metric/accuracy" rel="nofollow">accuracy</a>:</p> <div class="flex justify-center" data-svelte-h="svelte-1u0ctbl"><img src="https://huggingface.co/datasets/evaluate/media/resolve/main/metric-widget.png" width="400"></div> <p data-svelte-h="svelte-7mwz24">Each metric, comparison, and measurement is a separate Python module, but for using any of them, there is a single entry point: <a href="/docs/evaluate/main/en/package_reference/loading_methods#evaluate.load">evaluate.load()</a>!</p> <h2 class="relative group"><a id="load" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#load"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Load</span></h2> <p data-svelte-h="svelte-1skqbca">Any metric, comparison, or measurement is loaded with the <code>evaluate.load</code> function:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> evaluate | |
| <span class="hljs-meta">>>> </span>accuracy = evaluate.load(<span class="hljs-string">"accuracy"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1u9jlzi">If you want to make sure you are loading the right type of evaluation (especially if there are name clashes) you can explicitly pass the type:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>word_length = evaluate.load(<span class="hljs-string">"word_length"</span>, module_type=<span class="hljs-string">"measurement"</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="community-modules" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#community-modules"><span><svg 
class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Community modules</span></h3> <p data-svelte-h="svelte-172k455">Besides the modules implemented in 🤗 Evaluate you can also load any community module by specifying the repository ID of the metric implementation:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 
w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>element_count = evaluate.load(<span class="hljs-string">"lvwerra/element_count"</span>, module_type=<span class="hljs-string">"measurement"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-14gaawr">See the <a href="/docs/evaluate/main/en/creating_and_sharing">Creating and Sharing Guide</a> for information about uploading custom metrics.</p> <h3 class="relative group"><a id="list-available-modules" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#list-available-modules"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>List available modules</span></h3> <p data-svelte-h="svelte-tdby5t">With <a href="/docs/evaluate/main/en/package_reference/loading_methods#evaluate.list_evaluation_modules">list_evaluation_modules()</a> you can check what modules are available on the Hub. You can also filter for specific modules and skip community metrics if you want. 
You can also see additional information such as likes:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>evaluate.list_evaluation_modules( | |
| <span class="hljs-meta">... </span> module_type=<span class="hljs-string">"comparison"</span>, | |
| <span class="hljs-meta">... </span> include_community=<span class="hljs-literal">False</span>, | |
| <span class="hljs-meta">... </span> with_details=<span class="hljs-literal">True</span>) | |
| [{<span class="hljs-string">'name'</span>: <span class="hljs-string">'mcnemar'</span>, <span class="hljs-string">'type'</span>: <span class="hljs-string">'comparison'</span>, <span class="hljs-string">'community'</span>: <span class="hljs-literal">False</span>, <span class="hljs-string">'likes'</span>: <span class="hljs-number">1</span>}, | |
| {<span class="hljs-string">'name'</span>: <span class="hljs-string">'exact_match'</span>, <span class="hljs-string">'type'</span>: <span class="hljs-string">'comparison'</span>, <span class="hljs-string">'community'</span>: <span class="hljs-literal">False</span>, <span class="hljs-string">'likes'</span>: <span class="hljs-number">0</span>}]<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="module-attributes" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#module-attributes"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Module attributes</span></h2> <p data-svelte-h="svelte-12b2kxj">All evaluation modules come with a range of useful attributes that help to use a module stored in an <a href="/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModuleInfo">EvaluationModuleInfo</a> object.</p> <table data-svelte-h="svelte-5veeh0"><thead><tr><th>Attribute</th> <th>Description</th></tr></thead> <tbody><tr><td><code>description</code></td> <td>A short description of the evaluation module.</td></tr> <tr><td><code>citation</code></td> <td>A BibTeX string for citation when available.</td></tr> 
<tr><td><code>features</code></td> <td>A <code>Features</code> object defining the input format.</td></tr> <tr><td><code>inputs_description</code></td> <td>This is equivalent to the module’s docstring.</td></tr> <tr><td><code>homepage</code></td> <td>The homepage of the module.</td></tr> <tr><td><code>license</code></td> <td>The license of the module.</td></tr> <tr><td><code>codebase_urls</code></td> <td>Link to the code behind the module.</td></tr> <tr><td><code>reference_urls</code></td> <td>Additional reference URLs.</td></tr></tbody></table> <p data-svelte-h="svelte-1414ywc">Let’s have a look at a few examples. First, let’s look at the <code>description</code> attribute of the accuracy metric:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>accuracy = evaluate.load(<span 
class="hljs-string">"accuracy"</span>) | |
| <span class="hljs-meta">>>> </span>accuracy.description | |
| Accuracy <span class="hljs-keyword">is</span> the proportion of correct predictions among the total number of cases processed. It can be computed <span class="hljs-keyword">with</span>: | |
| Accuracy = (TP + TN) / (TP + TN + FP + FN) | |
| Where: | |
| TP: <span class="hljs-literal">True</span> positive | |
| TN: <span class="hljs-literal">True</span> negative | |
| FP: <span class="hljs-literal">False</span> positive | |
| FN: <span class="hljs-literal">False</span> negative<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1anocpn">You can see that it describes how the metric works in theory. If you use this metric in your work, especially in an academic publication, you will want to reference it properly. For that you can look at the <code>citation</code> attribute:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>accuracy.citation | |
| <span class="hljs-meta">@article{scikit-learn,</span> | |
| title={Scikit-learn: Machine Learning <span class="hljs-keyword">in</span> {P}ython}, | |
| author={Pedregosa, F. <span class="hljs-keyword">and</span> Varoquaux, G. <span class="hljs-keyword">and</span> Gramfort, A. <span class="hljs-keyword">and</span> Michel, V. | |
| <span class="hljs-keyword">and</span> Thirion, B. <span class="hljs-keyword">and</span> Grisel, O. <span class="hljs-keyword">and</span> Blondel, M. <span class="hljs-keyword">and</span> Prettenhofer, P. | |
| <span class="hljs-keyword">and</span> Weiss, R. <span class="hljs-keyword">and</span> Dubourg, V. <span class="hljs-keyword">and</span> Vanderplas, J. <span class="hljs-keyword">and</span> Passos, A. <span class="hljs-keyword">and</span> | |
| Cournapeau, D. <span class="hljs-keyword">and</span> Brucher, M. <span class="hljs-keyword">and</span> Perrot, M. <span class="hljs-keyword">and</span> Duchesnay, E.}, | |
| journal={Journal of Machine Learning Research}, | |
| volume={<span class="hljs-number">12</span>}, | |
| pages={<span class="hljs-number">2825</span>--<span class="hljs-number">2830</span>}, | |
| year={<span class="hljs-number">2011</span>} | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1jd7w6h">Before we can apply a metric or other evaluation module to a use-case, we need to know what the input format of the metric is:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>accuracy.features | |
| { | |
| <span class="hljs-string">'predictions'</span>: Value(dtype=<span class="hljs-string">'int32'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'references'</span>: Value(dtype=<span class="hljs-string">'int32'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>) | |
| }<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-2566m3"><p>Note that features always describe the type of a single input element. In general we will add lists of elements so you can always think of a list around the types in <code>features</code>. Evaluate accepts various input formats (Python lists, NumPy arrays, PyTorch tensors, etc.) and converts them to an appropriate format for storage and computation.</p></blockquote> <h2 class="relative group"><a id="compute" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#compute"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Compute</span></h2> <p data-svelte-h="svelte-vmt7rv">Now that we know how the evaluation module works and what should go in there we want to actually use it! 
When it comes to computing the actual score there are two main ways to do it:</p> <ol data-svelte-h="svelte-1wb6m9u"><li>All-in-one</li> <li>Incremental</li></ol> <p data-svelte-h="svelte-dd9fwy">In the incremental approach the necessary inputs are added to the module with <a href="/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.add">EvaluationModule.add()</a> or <a href="/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.add_batch">EvaluationModule.add_batch()</a> and the score is calculated at the end with <a href="/docs/evaluate/main/en/package_reference/main_classes#evaluate.EvaluationModule.compute">EvaluationModule.compute()</a>. Alternatively, one can pass all the inputs at once to <code>compute()</code>. Let’s have a look at the two approaches.</p> <h3 class="relative group"><a id="how-to-compute" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#how-to-compute"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>How to compute</span></h3> <p data-svelte-h="svelte-1qlkzhd">The simplest way to calculate the score of an evaluation module is by calling 
<code>compute()</code> directly with the necessary inputs. Simply pass the inputs as seen in <code>features</code> to the <code>compute()</code> method.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>accuracy.compute(references=[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>,<span class="hljs-number">0</span>,<span class="hljs-number">1</span>], predictions=[<span class="hljs-number">1</span>,<span class="hljs-number">0</span>,<span class="hljs-number">0</span>,<span class="hljs-number">1</span>]) | |
| {<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.5</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gjnmuk">Evaluation modules return the results in a dictionary. However, in some instances you build up the predictions iteratively or in a distributed fashion in which case <code>add()</code> or <code>add_batch()</code> are useful.</p> <h3 class="relative group"><a id="calculate-a-single-metric-or-a-batch-of-metrics" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#calculate-a-single-metric-or-a-batch-of-metrics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Calculate a single metric or a batch of metrics</span></h3> <p data-svelte-h="svelte-1m051l4">In many evaluation pipelines you build the predictions iteratively such as in a for-loop. In that case you could store the predictions in a list and at the end pass them to <code>compute()</code>. With <code>add()</code> and <code>add_batch()</code> you can circumvent the step of storing the predictions separately. 
If you are only creating single predictions at a time you can use <code>add()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> ref, pred <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>([<span class="hljs-number">0</span>,<span class="hljs-number">1</span>,<span class="hljs-number">0</span>,<span class="hljs-number">1</span>], [<span class="hljs-number">1</span>,<span class="hljs-number">0</span>,<span class="hljs-number">0</span>,<span class="hljs-number">1</span>]): | |
| <span class="hljs-meta">>>> </span> accuracy.add(references=ref, predictions=pred) | |
| <span class="hljs-meta">>>> </span>accuracy.compute() | |
| {<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.5</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dy6vdz">Once you have gathered all predictions you can call <code>compute()</code> to compute the score based on all stored values. When getting predictions and references in batches you can use <code>add_batch()</code> which adds a list of elements for later processing. The rest works as with <code>add()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> refs, preds <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>([[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>],[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>]], [[<span 
class="hljs-number">1</span>,<span class="hljs-number">0</span>],[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>]]): | |
| <span class="hljs-meta">>>> </span> accuracy.add_batch(references=refs, predictions=preds) | |
| <span class="hljs-meta">>>> </span>accuracy.compute() | |
| {<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.5</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1xf1wgv">This is especially useful when you need to get the predictions from your model in batches:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span><span class="hljs-keyword">for</span> model_inputs, gold_standards <span class="hljs-keyword">in</span> evaluation_dataset: | |
| <span class="hljs-meta">>>> </span> predictions = model(model_inputs) | |
| <span class="hljs-meta">>>> </span> metric.add_batch(references=gold_standards, predictions=predictions) | |
| <span class="hljs-meta">>>> </span>metric.compute()<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="distributed-evaluation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#distributed-evaluation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Distributed evaluation</span></h3> <p data-svelte-h="svelte-fetmj">Computing metrics in a distributed environment can be tricky. Metric evaluation is executed in separate Python processes, or nodes, on different subsets of a dataset. Typically, when a metric score is additive (<code>f(A∪B) = f(A) + f(B)</code>), you can use distributed reduce operations to gather the scores for each subset of the dataset. But when a metric is non-additive (<code>f(A∪B) ≠ f(A) + f(B)</code>), it’s not that simple. For example, you can’t take the sum of the <a href="https://huggingface.co/spaces/evaluate-metric/f1" rel="nofollow">F1</a> scores of each data subset as your <strong>final metric</strong>.</p> <p data-svelte-h="svelte-1tqog7c">A common way to overcome this issue is to fall back on single-process evaluation. 
The metrics are evaluated on a single GPU, which becomes inefficient.</p> <p data-svelte-h="svelte-gyyzuk">🤗 Evaluate solves this issue by only computing the final metric on the first node. The predictions and references are computed and provided to the metric separately for each node. These are temporarily stored in an Apache Arrow table, avoiding cluttering the GPU or CPU memory. When you are ready to <code>compute()</code> the final metric, the first node is able to access the predictions and references stored on all the other nodes. Once it has gathered all the predictions and references, <code>compute()</code> will perform the final metric evaluation.</p> <p data-svelte-h="svelte-ukalmq">This solution allows 🤗 Evaluate to perform distributed predictions, which is important for evaluation speed in distributed settings. At the same time, you can also use complex non-additive metrics without wasting valuable GPU or CPU memory.</p> <h2 class="relative group"><a id="combining-several-evaluations" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#combining-several-evaluations"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> 
<span>Combining several evaluations</span></h2> <p data-svelte-h="svelte-wme5rr">Often one wants to not only evaluate a single metric but a range of different metrics capturing different aspects of a model. E.g. for classification it is usually a good idea to compute F1-score, recall, and precision in addition to accuracy to get a better picture of model performance. Naturally, you can load a bunch of metrics and call them sequentially. However, a more convenient way is to use the <a href="/docs/evaluate/main/en/package_reference/main_classes#evaluate.combine">combine()</a> function to bundle them together:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>clf_metrics = evaluate.combine([<span class="hljs-string">"accuracy"</span>, <span class="hljs-string">"f1"</span>, <span 
class="hljs-string">"precision"</span>, <span class="hljs-string">"recall"</span>])<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-13z9gnh">The <code>combine</code> function accepts both a list of metric names and instantiated modules. The <code>compute</code> call then computes each metric:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>clf_metrics.compute(predictions=[<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">0</span>], references=[<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]) | |
| { | |
| <span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.667</span>, | |
| <span class="hljs-string">'f1'</span>: <span class="hljs-number">0.667</span>, | |
| <span class="hljs-string">'precision'</span>: <span class="hljs-number">1.0</span>, | |
| <span class="hljs-string">'recall'</span>: <span class="hljs-number">0.5</span> | |
| }<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="save-and-push-to-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#save-and-push-to-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Save and push to the Hub</span></h2> <p data-svelte-h="svelte-bjlw33">Saving and sharing evaluation results is an important step. We provide the <a href="/docs/evaluate/main/en/package_reference/saving_methods#evaluate.save">evaluate.save()</a> function to easily save metrics results. You can either pass a specific filename or a directory. In the latter case, the results are saved in a file with an automatically created file name. 
Besides the directory or file name, the function takes any key-value pairs as inputs and stores them in a JSON file.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>result = accuracy.compute(references=[<span class="hljs-number">0</span>,<span class="hljs-number">1</span>,<span class="hljs-number">0</span>,<span class="hljs-number">1</span>], predictions=[<span class="hljs-number">1</span>,<span class="hljs-number">0</span>,<span class="hljs-number">0</span>,<span class="hljs-number">1</span>]) | |
| <span class="hljs-meta">>>> </span>hyperparams = {<span class="hljs-string">"model"</span>: <span class="hljs-string">"bert-base-uncased"</span>} | |
| <span class="hljs-meta">>>> </span>evaluate.save(<span class="hljs-string">"./results/"</span>, experiment=<span class="hljs-string">"run 42"</span>, **result, **hyperparams) | |
| PosixPath(<span class="hljs-string">'results/result-2022_05_30-22_09_11.json'</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x28agk">The content of the JSON file looks like the following:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-punctuation">{</span> | |
| <span class="hljs-attr">"experiment"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"run 42"</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"accuracy"</span><span class="hljs-punctuation">:</span> <span class="hljs-number">0.5</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"model"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"bert-base-uncased"</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"_timestamp"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"2022-05-30T22:09:11.959469"</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"_git_commit_hash"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"123456789abcdefghijkl"</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"_evaluate_version"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"0.1.0"</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"_python_version"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"3.9.12 (main, Mar 26 2022, 15:51:15) \n[Clang 13.1.6 (clang-1316.0.21.2)]"</span><span class="hljs-punctuation">,</span> | |
| <span class="hljs-attr">"_interpreter_path"</span><span class="hljs-punctuation">:</span> <span class="hljs-string">"/Users/leandro/git/evaluate/env/bin/python"</span> | |
| <span class="hljs-punctuation">}</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1j5fdcg">In addition to the specified fields, it also contains useful system information for reproducing the results.</p> <p data-svelte-h="svelte-18ykifj">Besides storing the results locally, you should report them on the model’s repository on the Hub. With the <a href="/docs/evaluate/main/en/package_reference/hub_methods#evaluate.push_to_hub">evaluate.push_to_hub()</a> function, you can easily report evaluation results to the model’s repository:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->evaluate.push_to_hub( | |
| model_id=<span class="hljs-string">"huggingface/gpt2-wikitext2"</span>, <span class="hljs-comment"># model repository on hub</span> | |
| metric_value=<span class="hljs-number">0.5</span>, <span class="hljs-comment"># metric value</span> | |
| metric_type=<span class="hljs-string">"bleu"</span>, <span class="hljs-comment"># metric name, e.g. accuracy.name</span> | |
| metric_name=<span class="hljs-string">"BLEU"</span>, <span class="hljs-comment"># pretty name which is displayed</span> | |
| dataset_type=<span class="hljs-string">"wikitext"</span>, <span class="hljs-comment"># dataset name on the hub</span> | |
| dataset_name=<span class="hljs-string">"WikiText"</span>, <span class="hljs-comment"># pretty name</span> | |
| dataset_split=<span class="hljs-string">"test"</span>, <span class="hljs-comment"># dataset split used</span> | |
| task_type=<span class="hljs-string">"text-generation"</span>, <span class="hljs-comment"># task id, see https://github.com/huggingface/evaluate/blob/main/src/evaluate/config.py#L154-L192</span> | |
| task_name=<span class="hljs-string">"Text Generation"</span> <span class="hljs-comment"># pretty name for task</span> | |
| )<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="evaluator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluator</span></h2> <p data-svelte-h="svelte-1sa5tcd">The <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.evaluator">evaluate.evaluator()</a> provides automated evaluation and only requires a model, dataset, metric in contrast to the metrics in <code>EvaluationModule</code>s that require the model’s predictions. As such it is easier to evaluate a model on a dataset with a given metric as the inference is handled internally. To make that possible it uses the <a href="https://huggingface.co/docs/transformers/main/en/main_classes/pipelines#transformers.pipeline" rel="nofollow">pipeline</a> abstraction from <code>transformers</code>. 
However, you can use your own framework as long as it follows the <code>pipeline</code> interface.</p> <p data-svelte-h="svelte-myrcia">To make an evaluation with the <code>evaluator</code>, let’s load a <code>transformers</code> pipeline (but you can pass your own custom inference class for any framework as long as it follows the pipeline call API) with a model trained on IMDb, the IMDb test split, and the accuracy metric.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> evaluator | |
| <span class="hljs-keyword">import</span> evaluate | |
| pipe = pipeline(<span class="hljs-string">"text-classification"</span>, model=<span class="hljs-string">"lvwerra/distilbert-imdb"</span>, device=<span class="hljs-number">0</span>) | |
| data = load_dataset(<span class="hljs-string">"imdb"</span>, split=<span class="hljs-string">"test"</span>).shuffle().select(<span class="hljs-built_in">range</span>(<span class="hljs-number">1000</span>)) | |
| metric = evaluate.load(<span class="hljs-string">"accuracy"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19jk2fo">Then you can create an evaluator for text classification and pass the three objects to the <code>compute()</code> method. With the label mapping <code>evaluate</code> provides a method to align the pipeline outputs with the label column in the dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>task_evaluator = evaluator(<span class="hljs-string">"text-classification"</span>) | |
| <span class="hljs-meta">>>> </span>results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric, | |
| <span class="hljs-meta">... </span> label_mapping={<span class="hljs-string">"NEGATIVE"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"POSITIVE"</span>: <span class="hljs-number">1</span>},) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(results) | |
| {<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.934</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fvn7gh">Calculating the value of the metric alone is often not enough to know if a model performs significantly better than another one. With <em>bootstrapping</em>, <code>evaluate</code> computes confidence intervals and the standard error, which help estimate how stable a score is:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">>>> </span>results = task_evaluator.compute(model_or_pipeline=pipe, data=data, metric=metric, | |
| <span class="hljs-meta">... </span> label_mapping={<span class="hljs-string">"NEGATIVE"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"POSITIVE"</span>: <span class="hljs-number">1</span>}, | |
| <span class="hljs-meta">... </span> strategy=<span class="hljs-string">"bootstrap"</span>, n_resamples=<span class="hljs-number">200</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(results) | |
| {<span class="hljs-string">'accuracy'</span>: | |
| { | |
| <span class="hljs-string">'confidence_interval'</span>: (<span class="hljs-number">0.906</span>, <span class="hljs-number">0.9406749892841922</span>), | |
| <span class="hljs-string">'standard_error'</span>: <span class="hljs-number">0.00865213251082787</span>, | |
| <span class="hljs-string">'score'</span>: <span class="hljs-number">0.923</span> | |
| } | |
| }<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-53c6xd">The evaluator expects a <code>"text"</code> and <code>"label"</code> column for the data input. If your dataset differs you can provide the columns with the keywords <code>input_column="text"</code> and <code>label_column="label"</code>. Currently only <code>"text-classification"</code> is supported with more tasks being added in the future.</p> <h2 class="relative group"><a id="visualization" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#visualization"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Visualization</span></h2> <p data-svelte-h="svelte-2emoqr">When comparing several models, sometimes it’s hard to spot the differences in their performance simply by looking at their scores. Also often there is not a single best model but there are trade-offs between e.g. latency and accuracy as larger models might have better performance but are also slower. 
We are gradually adding different visualization approaches, such as plots, to make choosing the best model for a use-case easier.</p> <p data-svelte-h="svelte-7j394z">For instance, if you have a list of results from multiple models (as dictionaries), you can feed them into the <code>radar_plot()</code> function:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> evaluate | |
| <span class="hljs-keyword">from</span> evaluate.visualization <span class="hljs-keyword">import</span> radar_plot | |
| <span class="hljs-meta">>>> </span>data = [ | |
| {<span class="hljs-string">"accuracy"</span>: <span class="hljs-number">0.99</span>, <span class="hljs-string">"precision"</span>: <span class="hljs-number">0.8</span>, <span class="hljs-string">"f1"</span>: <span class="hljs-number">0.95</span>, <span class="hljs-string">"latency_in_seconds"</span>: <span class="hljs-number">33.6</span>}, | |
| {<span class="hljs-string">"accuracy"</span>: <span class="hljs-number">0.98</span>, <span class="hljs-string">"precision"</span>: <span class="hljs-number">0.87</span>, <span class="hljs-string">"f1"</span>: <span class="hljs-number">0.91</span>, <span class="hljs-string">"latency_in_seconds"</span>: <span class="hljs-number">11.2</span>}, | |
| {<span class="hljs-string">"accuracy"</span>: <span class="hljs-number">0.98</span>, <span class="hljs-string">"precision"</span>: <span class="hljs-number">0.78</span>, <span class="hljs-string">"f1"</span>: <span class="hljs-number">0.88</span>, <span class="hljs-string">"latency_in_seconds"</span>: <span class="hljs-number">87.6</span>}, | |
| {<span class="hljs-string">"accuracy"</span>: <span class="hljs-number">0.88</span>, <span class="hljs-string">"precision"</span>: <span class="hljs-number">0.78</span>, <span class="hljs-string">"f1"</span>: <span class="hljs-number">0.81</span>, <span class="hljs-string">"latency_in_seconds"</span>: <span class="hljs-number">101.6</span>} | |
| ] | |
| <span class="hljs-meta">>>> </span>model_names = [<span class="hljs-string">"Model 1"</span>, <span class="hljs-string">"Model 2"</span>, <span class="hljs-string">"Model 3"</span>, <span class="hljs-string">"Model 4"</span>] | |
| <span class="hljs-meta">>>> </span>plot = radar_plot(data=data, model_names=model_names) | |
| <span class="hljs-meta">>>> </span>plot.show()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-a677ee">Which lets you visually compare the 4 models and choose the optimal one for you, based on one or several metrics:</p> <div class="flex justify-center" data-svelte-h="svelte-1kqtfj4"><img src="https://huggingface.co/datasets/evaluate/media/resolve/main/example_viz.png" alt="Radar plot comparing the four example models across accuracy, precision, f1 and latency_in_seconds" width="400"></div> <h2 class="relative group"><a id="running-evaluation-on-a-suite-of-tasks" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#running-evaluation-on-a-suite-of-tasks"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Running evaluation on a suite of tasks</span></h2> <p data-svelte-h="svelte-n87bz9">It can be useful to evaluate models on a variety of different tasks to understand their downstream performance. The <a href="evaluation_suite">EvaluationSuite</a> enables evaluation of models on a collection of tasks. 
Tasks can be constructed as (<a href="base_evaluator">evaluator</a>, dataset, metric) tuples and passed to an <a href="evaluation_suite">EvaluationSuite</a> stored on the Hugging Face Hub as a Space, or locally as a Python script. See the <a href="base_evaluator">evaluator documentation</a> for a list of currently supported tasks.</p> <p data-svelte-h="svelte-x8r59q"><code>EvaluationSuite</code> scripts can be defined as follows, and support Python code for data preprocessing.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> evaluate | |
| <span class="hljs-keyword">from</span> evaluate.evaluation_suite <span class="hljs-keyword">import</span> SubTask | |
| <span class="hljs-keyword">class</span> <span class="hljs-title class_">Suite</span>(evaluate.EvaluationSuite): | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">__init__</span>(<span class="hljs-params">self, name</span>): | |
| <span class="hljs-built_in">super</span>().__init__(name) | |
| self.suite = [ | |
| SubTask( | |
| task_type=<span class="hljs-string">"text-classification"</span>, | |
| data=<span class="hljs-string">"imdb"</span>, | |
| split=<span class="hljs-string">"test[:1]"</span>, | |
| args_for_task={ | |
| <span class="hljs-string">"metric"</span>: <span class="hljs-string">"accuracy"</span>, | |
| <span class="hljs-string">"input_column"</span>: <span class="hljs-string">"text"</span>, | |
| <span class="hljs-string">"label_column"</span>: <span class="hljs-string">"label"</span>, | |
| <span class="hljs-string">"label_mapping"</span>: { | |
| <span class="hljs-string">"LABEL_0"</span>: <span class="hljs-number">0.0</span>, | |
| <span class="hljs-string">"LABEL_1"</span>: <span class="hljs-number">1.0</span> | |
| } | |
| } | |
| ), | |
| SubTask( | |
| task_type=<span class="hljs-string">"text-classification"</span>, | |
| data=<span class="hljs-string">"sst2"</span>, | |
| split=<span class="hljs-string">"test[:1]"</span>, | |
| args_for_task={ | |
| <span class="hljs-string">"metric"</span>: <span class="hljs-string">"accuracy"</span>, | |
| <span class="hljs-string">"input_column"</span>: <span class="hljs-string">"sentence"</span>, | |
| <span class="hljs-string">"label_column"</span>: <span class="hljs-string">"label"</span>, | |
| <span class="hljs-string">"label_mapping"</span>: { | |
| <span class="hljs-string">"LABEL_0"</span>: <span class="hljs-number">0.0</span>, | |
| <span class="hljs-string">"LABEL_1"</span>: <span class="hljs-number">1.0</span> | |
| } | |
| } | |
| ) | |
| ]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mtm0nc">Evaluation can be run by loading the <code>EvaluationSuite</code> and calling the <code>run()</code> method with a model or pipeline.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta prompt_">>>></span> <span class="language-python"><span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> EvaluationSuite</span> | |
| <span class="hljs-meta prompt_">>>></span> <span class="language-python">suite = EvaluationSuite.load(<span class="hljs-string">'mathemakitten/sentiment-evaluation-suite'</span>)</span> | |
| <span class="hljs-meta prompt_">>>></span> <span class="language-python">results = suite.run(<span class="hljs-string">"huggingface/prunebert-base-uncased-6-finepruned-w-distil-mnli"</span>)</span><!-- HTML_TAG_END --></pre></div> <table data-svelte-h="svelte-g1mim"><thead><tr><th align="right">accuracy</th> <th align="right">total_time_in_seconds</th> <th align="right">samples_per_second</th> <th align="left">latency_in_seconds</th> <th align="left">task_name</th></tr></thead> <tbody><tr><td align="right">0.3</td> <td align="right">4.62804</td> <td align="right">2.16074</td> <td align="left">0.462804</td> <td align="left">imdb</td></tr> <tr><td align="right">0</td> <td align="right">0.686388</td> <td align="right">14.569</td> <td align="left">0.0686388</td> <td align="left">sst2</td></tr></tbody></table> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/evaluate/blob/main/docs/source/a_quick_tour.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { /* Page bootstrap: publishes the path config, then loads the two entry modules and calls kit.start(). The braces give the const declarations below a block scope. */ | |
| __sveltekit_95ljrq = { /* assigned without var/let/const, i.e. to an undeclared identifier */ | |
| assets: "/docs/evaluate/main/en", | |
| base: "/docs/evaluate/main/en", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; /* parent element of the script tag that is currently executing; handed to kit.start() below */ | |
| const data = [null,null]; /* forwarded unchanged as the `data` option of kit.start() */ | |
| Promise.all([ /* both dynamic imports are issued together; the callback runs once both have resolved */ | |
| import("/docs/evaluate/main/en/_app/immutable/entry/start.138f3e02.js"), | |
| import("/docs/evaluate/main/en/_app/immutable/entry/app.ad076786.js") | |
| ]).then(([kit, app]) => { /* kit = the start.*.js module, app = the app.*.js module (same order as the import list) */ | |
| kit.start(app, element, { | |
| node_ids: [0, 2], /* NOTE(review): presumably the indices of the route nodes to hydrate; confirm against the SvelteKit build output */ | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 87.9 kB
- Xet hash:
- e675e408250cb50c5ad6cc027b972e14e5e793f55e820141bc3362678c14ccc1
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.