Buckets:

hf-doc-build
/

doc

Files

xet

hf-doc-build/doc / evaluate /main /en /base_evaluator.html

rtrm

about 1 month ago

download

raw

53.8 kB

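	<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Using the evaluator&quot;,&quot;local&quot;:&quot;using-the-evaluator&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Text classification&quot;,&quot;local&quot;:&quot;text-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Evaluate models on the Hub&quot;,&quot;local&quot;:&quot;evaluate-models-on-the-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Evaluate multiple metrics&quot;,&quot;local&quot;:&quot;evaluate-multiple-metrics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Token Classification&quot;,&quot;local&quot;:&quot;token-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Benchmarking several models&quot;,&quot;local&quot;:&quot;benchmarking-several-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Visualizing results&quot;,&quot;local&quot;:&quot;visualizing-results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Question Answering&quot;,&quot;local&quot;:&quot;question-answering&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Confidence intervals&quot;,&quot;local&quot;:&quot;confidence-intervals&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image classification&quot;,&quot;local&quot;:&quot;image-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Handling large datasets&quot;,&quot;local&quot;:&quot;handling-large-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}">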
	<link href="/docs/evaluate/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/entry/start.138f3e02.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/scheduler.5f3e6389.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/singletons.3420a244.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/paths.65ada1b9.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/entry/app.ad076786.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/preload-helper.0572feea.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/index.62df735e.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/nodes/0.11bd9b89.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/each.e59479a4.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/nodes/3.0324acca.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/MermaidChart.svelte_svelte_type_style_lang.e306bd5b.js">
	<link rel="modulepreload" href="/docs/evaluate/main/en/_app/immutable/chunks/CodeBlock.dc1e8be0.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Using the evaluator&quot;,&quot;local&quot;:&quot;using-the-evaluator&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Text classification&quot;,&quot;local&quot;:&quot;text-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Evaluate models on the Hub&quot;,&quot;local&quot;:&quot;evaluate-models-on-the-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Evaluate multiple metrics&quot;,&quot;local&quot;:&quot;evaluate-multiple-metrics&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Token Classification&quot;,&quot;local&quot;:&quot;token-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Benchmarking several models&quot;,&quot;local&quot;:&quot;benchmarking-several-models&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Visualizing results&quot;,&quot;local&quot;:&quot;visualizing-results&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Question Answering&quot;,&quot;local&quot;:&quot;question-answering&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Confidence intervals&quot;,&quot;local&quot;:&quot;confidence-intervals&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Image classification&quot;,&quot;local&quot;:&quot;image-classification&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Handling large datasets&quot;,&quot;local&quot;:&quot;handling-large-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <div class="items-center shrink-0 min-w-[100px] max-sm:min-w-[50px] justify-end ml-auto flex" style="float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"><div class="inline-flex rounded-md max-sm:rounded-sm"><button class="inline-flex items-center gap-1 h-7 max-sm:h-7 px-2 max-sm:px-1.5 text-sm font-medium text-gray-800 border border-r-0 rounded-l-md max-sm:rounded-l-sm border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-live="polite"><span class="inline-flex items-center justify-center rounded-md p-0.5 max-sm:p-0 hover:text-gray-800 dark:hover:text-gray-200"><svg class="sm:size-3.5 size-3" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" 
focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg></span> <span>Copy page</span></button> <button class="inline-flex items-center justify-center w-6 max-sm:w-5 h-7 max-sm:h-7 disabled:pointer-events-none text-sm text-gray-500 hover:text-gray-700 dark:hover:text-white rounded-r-md max-sm:rounded-r-sm border border-l transition border-gray-200 bg-white hover:shadow-inner dark:border-gray-850 dark:bg-gray-950 dark:text-gray-200 dark:hover:bg-gray-800" aria-haspopup="menu" aria-expanded="false" aria-label="Open copy menu"><svg class="transition-transform text-gray-400 overflow-visible sm:size-3.5 size-3 rotate-0" width="1em" height="1em" viewBox="0 0 12 7" fill="none" xmlns="http://www.w3.org/2000/svg"><path d="M1 1L6 6L11 1" stroke="currentColor"></path></svg></button></div> </div> <h1 class="relative group"><a id="using-the-evaluator" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#using-the-evaluator"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 
56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Using the evaluator</span></h1> <p data-svelte-h="svelte-qprdpr">The <code>Evaluator</code> classes allow you to evaluate a triplet of model, dataset, and metric. The model is wrapped in a pipeline, which is responsible for handling all preprocessing and post-processing. Out of the box, <code>Evaluator</code>s support transformers pipelines for the supported tasks, but custom pipelines can be passed, as showcased in the section <a href="custom_evaluator">Using the <code>evaluator</code> with custom pipelines</a>.</p> <p data-svelte-h="svelte-15teb2a">Currently supported tasks are:</p> <ul data-svelte-h="svelte-1srkakb"><li><code>"text-classification"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.TextClassificationEvaluator">TextClassificationEvaluator</a>.</li> <li><code>"token-classification"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.TokenClassificationEvaluator">TokenClassificationEvaluator</a>.</li> <li><code>"question-answering"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.QuestionAnsweringEvaluator">QuestionAnsweringEvaluator</a>.</li> <li><code>"image-classification"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.ImageClassificationEvaluator">ImageClassificationEvaluator</a>.</li> <li><code>"text-generation"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.TextGenerationEvaluator">TextGenerationEvaluator</a>.</li> <li><code>"text2text-generation"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.Text2TextGenerationEvaluator">Text2TextGenerationEvaluator</a>.</li> <li><code>"summarization"</code>: will use the <a 
href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.SummarizationEvaluator">SummarizationEvaluator</a>.</li> <li><code>"translation"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.TranslationEvaluator">TranslationEvaluator</a>.</li> <li><code>"automatic-speech-recognition"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.AutomaticSpeechRecognitionEvaluator">AutomaticSpeechRecognitionEvaluator</a>.</li> <li><code>"audio-classification"</code>: will use the <a href="/docs/evaluate/main/en/package_reference/evaluator_classes#evaluate.AudioClassificationEvaluator">AudioClassificationEvaluator</a>.</li></ul> <p data-svelte-h="svelte-7bnn4r">To run an <code>Evaluator</code> with several tasks in a single call, use the <a href="evaluation_suite">EvaluationSuite</a>, which runs evaluations on a collection of <code>SubTask</code>s.</p> <p data-svelte-h="svelte-4inrih">Each task has its own set of requirements for the dataset format and pipeline output, make sure to check them out for your custom use case. 
Let’s have a look at some of them and see how you can use the evaluator to evaluate one or more models, datasets, and metrics at the same time.</p> <h2 class="relative group"><a id="text-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#text-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Text classification</span></h2> <p data-svelte-h="svelte-1gxq28j">The text classification evaluator can be used to evaluate text models on classification datasets such as IMDb. Besides the model, data, and metric inputs, it takes the following optional inputs:</p> <ul data-svelte-h="svelte-1qzzvz5"><li><code>input_column="text"</code>: with this argument the column with the data for the pipeline can be specified.</li> <li><code>label_column="label"</code>: with this argument the column with the labels for the evaluation can be specified.</li> <li><code>label_mapping=None</code>: the label mapping aligns the labels in the pipeline output with the labels needed for evaluation. E.g. 
the labels in <code>label_column</code> can be integers (<code>0</code>/<code>1</code>) whereas the pipeline can produce label names such as <code>"positive"</code>/<code>"negative"</code>. With that dictionary the pipeline outputs are mapped to the labels.</li></ul> <p data-svelte-h="svelte-15gevtk">By default the <code>"accuracy"</code> metric is computed.</p> <h3 class="relative group"><a id="evaluate-models-on-the-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluate-models-on-the-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluate models on the Hub</span></h3> <p data-svelte-h="svelte-154vxj0">There are several ways to pass a model to the evaluator: you can pass the name of a model on the Hub, you can load a <code>transformers</code> model and pass it to the evaluator or you can pass an initialized <code>transformers.Pipeline</code>. 
Alternatively you can pass any callable function that behaves like a <code>pipeline</code> call for the task in any framework.</p> <p data-svelte-h="svelte-1b2vk7d">So any of the following works:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
	<span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> evaluator
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForSequenceClassification, pipeline

	data = load_dataset(<span class="hljs-string">"imdb"</span>, split=<span class="hljs-string">"test"</span>).shuffle(seed=<span class="hljs-number">42</span>).select(<span class="hljs-built_in">range</span>(<span class="hljs-number">1000</span>))
	task_evaluator = evaluator(<span class="hljs-string">"text-classification"</span>)

	<span class="hljs-comment"># 1. Pass a model name or path</span>
	eval_results = task_evaluator.compute(
	model_or_pipeline=<span class="hljs-string">"lvwerra/distilbert-imdb"</span>,
	data=data,
	label_mapping={<span class="hljs-string">"NEGATIVE"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"POSITIVE"</span>: <span class="hljs-number">1</span>}
	)

	<span class="hljs-comment"># 2. Pass an instantiated model</span>
	model = AutoModelForSequenceClassification.from_pretrained(<span class="hljs-string">"lvwerra/distilbert-imdb"</span>)

	eval_results = task_evaluator.compute(
	model_or_pipeline=model,
	data=data,
	label_mapping={<span class="hljs-string">"NEGATIVE"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"POSITIVE"</span>: <span class="hljs-number">1</span>}
	)

	<span class="hljs-comment"># 3. Pass an instantiated pipeline</span>
	pipe = pipeline(<span class="hljs-string">"text-classification"</span>, model=<span class="hljs-string">"lvwerra/distilbert-imdb"</span>)

	eval_results = task_evaluator.compute(
	model_or_pipeline=pipe,
	data=data,
	label_mapping={<span class="hljs-string">"NEGATIVE"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"POSITIVE"</span>: <span class="hljs-number">1</span>}
	)
	<span class="hljs-built_in">print</span>(eval_results)<!-- HTML_TAG_END --></pre></div> <blockquote class="tip" data-svelte-h="svelte-725r0a"><p>Without specifying a device, the default for model inference will be the first GPU on the machine if one is available, and the CPU otherwise. If you want to use a specific device you can pass <code>device</code> to <code>compute</code>, where -1 will use the CPU and a non-negative integer (starting with 0) will use the associated CUDA device.</p></blockquote> <p data-svelte-h="svelte-15lc2ym">The results will look as follows:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.918</span>,
	<span class="hljs-string">'latency_in_seconds'</span>: <span class="hljs-number">0.013</span>,
	<span class="hljs-string">'samples_per_second'</span>: <span class="hljs-number">78.887</span>,
	<span class="hljs-string">'total_time_in_seconds'</span>: <span class="hljs-number">12.676</span>
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15s0v89">Note that evaluation results include both the requested metric, and information about the time it took to obtain predictions through the pipeline.</p> <blockquote class="tip" data-svelte-h="svelte-1baxlti"><p>The time performances can give useful indication on model speed for inference but should be taken with a grain of salt: they include all the processing that goes on in the pipeline. This may include tokenizing, post-processing, that may be different depending on the model. Furthermore, it depends a lot on the hardware you are running the evaluation on and you may be able to improve the performance by optimizing things like the batch size.</p></blockquote> <h3 class="relative group"><a id="evaluate-multiple-metrics" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluate-multiple-metrics"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluate multiple metrics</span></h3> <p data-svelte-h="svelte-1sfvd91">With the <a href="/docs/evaluate/main/en/package_reference/main_classes#evaluate.combine">combine()</a> function one can bundle several metrics 
into an object that behaves like a single metric. We can use this to evaluate several metrics at once with the evaluator:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> evaluate

	eval_results = task_evaluator.compute(
	model_or_pipeline=<span class="hljs-string">"lvwerra/distilbert-imdb"</span>,
	data=data,
	metric=evaluate.combine([<span class="hljs-string">"accuracy"</span>, <span class="hljs-string">"recall"</span>, <span class="hljs-string">"precision"</span>, <span class="hljs-string">"f1"</span>]),
	label_mapping={<span class="hljs-string">"NEGATIVE"</span>: <span class="hljs-number">0</span>, <span class="hljs-string">"POSITIVE"</span>: <span class="hljs-number">1</span>}
	)
	<span class="hljs-built_in">print</span>(eval_results)
	<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15lc2ym">The results will look as follows:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-string">'accuracy'</span>: <span class="hljs-number">0.918</span>,
	<span class="hljs-string">'f1'</span>: <span class="hljs-number">0.916</span>,
	<span class="hljs-string">'precision'</span>: <span class="hljs-number">0.9147</span>,
	<span class="hljs-string">'recall'</span>: <span class="hljs-number">0.9187</span>,
	<span class="hljs-string">'latency_in_seconds'</span>: <span class="hljs-number">0.013</span>,
	<span class="hljs-string">'samples_per_second'</span>: <span class="hljs-number">78.887</span>,
	<span class="hljs-string">'total_time_in_seconds'</span>: <span class="hljs-number">12.676</span>
	}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-dtvqwy">Next let’s have a look at token classification.</p> <h2 class="relative group"><a id="token-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#token-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Token Classification</span></h2> <p data-svelte-h="svelte-geubj8">With the token classification evaluator one can evaluate models for tasks such as NER or POS tagging. It has the following specific arguments:</p> <ul data-svelte-h="svelte-1el9bak"><li><code>input_column="text"</code>: with this argument the column with the data for the pipeline can be specified.</li> <li><code>label_column="label"</code>: with this argument the column with the labels for the evaluation can be specified.</li> <li><code>label_mapping=None</code>: the label mapping aligns the labels in the pipeline output with the labels needed for evaluation. E.g. the labels in <code>label_column</code> can be integers (<code>0</code>/<code>1</code>) whereas the pipeline can produce label names such as <code>"positive"</code>/<code>"negative"</code>. 
With that dictionary the pipeline outputs are mapped to the labels.</li> <li><code>join_by=" "</code>: While most datasets are already tokenized the pipeline expects a string. Thus the tokens need to be joined before passing to the pipeline. By default they are joined with a whitespace.</li></ul> <p data-svelte-h="svelte-1ufl7ls">Let’s have a look how we can use the evaluator to benchmark several models.</p> <h3 class="relative group"><a id="benchmarking-several-models" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#benchmarking-several-models"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Benchmarking several models</span></h3> <p data-svelte-h="svelte-yjocce">Here is an example where several models can be compared thanks to the <code>evaluator</code> in only a few lines of code, abstracting away the preprocessing, inference, postprocessing, metric computation:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " 
title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
	<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
	<span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> evaluator
	<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline

	models = [
	<span class="hljs-string">"xlm-roberta-large-finetuned-conll03-english"</span>,
	<span class="hljs-string">"dbmdz/bert-large-cased-finetuned-conll03-english"</span>,
	<span class="hljs-string">"elastic/distilbert-base-uncased-finetuned-conll03-english"</span>,
	<span class="hljs-string">"dbmdz/electra-large-discriminator-finetuned-conll03-english"</span>,
	<span class="hljs-string">"gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner"</span>,
	<span class="hljs-string">"philschmid/distilroberta-base-ner-conll2003"</span>,
	<span class="hljs-string">"Jorgeutd/albert-base-v2-finetuned-ner"</span>,
	]

	data = load_dataset(<span class="hljs-string">"conll2003"</span>, split=<span class="hljs-string">"validation"</span>).shuffle().select(<span class="hljs-built_in">range</span>(<span class="hljs-number">1000</span>))
	task_evaluator = evaluator(<span class="hljs-string">"token-classification"</span>)

	results = []
	<span class="hljs-keyword">for</span> model <span class="hljs-keyword">in</span> models:
	    results.append(
	        task_evaluator.compute(
	            model_or_pipeline=model, data=data, metric=<span class="hljs-string">"seqeval"</span>
	        )
	    )

	df = pd.DataFrame(results, index=models)
	df[[<span class="hljs-string">"overall_f1"</span>, <span class="hljs-string">"overall_accuracy"</span>, <span class="hljs-string">"total_time_in_seconds"</span>, <span class="hljs-string">"samples_per_second"</span>, <span class="hljs-string">"latency_in_seconds"</span>]]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-xdkq6d">The result is a table that looks like this:</p> <table data-svelte-h="svelte-w5xmg2"><thead><tr><th align="left">model</th> <th align="right">overall_f1</th> <th align="right">overall_accuracy</th> <th align="right">total_time_in_seconds</th> <th align="right">samples_per_second</th> <th align="right">latency_in_seconds</th></tr></thead> <tbody><tr><td align="left">Jorgeutd/albert-base-v2-finetuned-ner</td> <td align="right">0.941</td> <td align="right">0.989</td> <td align="right">4.515</td> <td align="right">221.468</td> <td align="right">0.005</td></tr> <tr><td align="left">dbmdz/bert-large-cased-finetuned-conll03-english</td> <td align="right">0.962</td> <td align="right">0.881</td> <td align="right">11.648</td> <td align="right">85.850</td> <td align="right">0.012</td></tr> <tr><td align="left">dbmdz/electra-large-discriminator-finetuned-conll03-english</td> <td align="right">0.965</td> <td align="right">0.881</td> <td align="right">11.456</td> <td align="right">87.292</td> <td align="right">0.011</td></tr> <tr><td align="left">elastic/distilbert-base-uncased-finetuned-conll03-english</td> <td align="right">0.940</td> <td align="right">0.989</td> <td align="right">2.318</td> <td align="right">431.378</td> <td align="right">0.002</td></tr> <tr><td align="left">gunghio/distilbert-base-multilingual-cased-finetuned-conll2003-ner</td> <td align="right">0.947</td> <td align="right">0.991</td> <td align="right">2.376</td> <td align="right">420.873</td> <td align="right">0.002</td></tr> <tr><td align="left">philschmid/distilroberta-base-ner-conll2003</td> <td align="right">0.961</td> <td align="right">0.994</td> <td 
align="right">2.436</td> <td align="right">410.579</td> <td align="right">0.002</td></tr> <tr><td align="left">xlm-roberta-large-finetuned-conll03-english</td> <td align="right">0.969</td> <td align="right">0.882</td> <td align="right">11.996</td> <td align="right">83.359</td> <td align="right">0.012</td></tr></tbody></table> <h3 class="relative group"><a id="visualizing-results" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#visualizing-results"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Visualizing results</span></h3> <p data-svelte-h="svelte-105xi6e">You can feed the <code>results</code> list above into the <code>radar_plot()</code> function to visualize different aspects of their performance and choose the model that is the best fit, depending on the metric(s) that are relevant to your use case:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" 
xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> evaluate
	<span class="hljs-keyword">from</span> evaluate.visualization <span class="hljs-keyword">import</span> radar_plot

	<span class="hljs-meta">>>> </span>plot = radar_plot(data=results, model_names=models, invert_range=[<span class="hljs-string">"latency_in_seconds"</span>])
	<span class="hljs-meta">>>> </span>plot.show()<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-gubqhj"><img src="https://huggingface.co/datasets/evaluate/media/resolve/main/viz.png" alt="Radar plot comparing the evaluated models across the selected metrics" width="400"></div> <p data-svelte-h="svelte-6fiehk">Don’t forget to specify <code>invert_range</code> for metrics for which smaller is better (as is the case for latency in seconds).</p> <p data-svelte-h="svelte-13ag5wi">If you want to save the plot locally, you can use the <code>plot.savefig()</code> function with the option <code>bbox_inches='tight'</code>, to make sure no part of the image gets cut off.</p> <h2 class="relative group"><a id="question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Question Answering</span></h2> <p data-svelte-h="svelte-1b3dqfk">With the question-answering evaluator one can evaluate models for QA without needing to worry about the complicated pre- and post-processing that’s required for these models. 
It has the following specific arguments:</p> <ul data-svelte-h="svelte-tbjiyo"><li><code>question_column="question"</code>: the name of the column containing the question in the dataset</li> <li><code>context_column="context"</code>: the name of the column containing the context</li> <li><code>id_column="id"</code>: the name of the column containing the identification field of the question and answer pair</li> <li><code>label_column="answers"</code>: the name of the column containing the answers</li> <li><code>squad_v2_format=None</code>: whether the dataset follows the format of the squad_v2 dataset where a question may have no answer in the context. If this parameter is not provided, the format will be automatically inferred.</li></ul> <p data-svelte-h="svelte-vay9sj">Let’s have a look at how we can evaluate QA models and compute confidence intervals at the same time.</p> <h3 class="relative group"><a id="confidence-intervals" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#confidence-intervals"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Confidence intervals</span></h3> <p data-svelte-h="svelte-mw7fge">Every evaluator comes 
with the option to compute confidence intervals using <a href="https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.bootstrap.html" rel="nofollow">bootstrapping</a>. Simply pass <code>strategy="bootstrap"</code> and set the number of resamples with <code>n_resamples</code>.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
	<span class="hljs-keyword">from</span> evaluate <span class="hljs-keyword">import</span> evaluator

	task_evaluator = evaluator(<span class="hljs-string">"question-answering"</span>)

	data = load_dataset(<span class="hljs-string">"squad"</span>, split=<span class="hljs-string">"validation[:1000]"</span>)
	eval_results = task_evaluator.compute(
	model_or_pipeline=<span class="hljs-string">"distilbert-base-uncased-distilled-squad"</span>,
	data=data,
	metric=<span class="hljs-string">"squad"</span>,
	strategy=<span class="hljs-string">"bootstrap"</span>,
	n_resamples=<span class="hljs-number">30</span>
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x62g11">Results include confidence intervals as well as error estimates as follows:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{
	<span class="hljs-string">'exact_match'</span>:
	{
	<span class="hljs-string">'confidence_interval'</span>: (<span class="hljs-number">79.67</span>, <span class="hljs-number">84.54</span>),
	<span class="hljs-string">'score'</span>: <span class="hljs-number">82.30</span>,
	<span class="hljs-string">'standard_error'</span>: <span class="hljs-number">1.28</span>
	},
	<span class="hljs-string">'f1'</span>:
	{
	<span class="hljs-string">'confidence_interval'</span>: (<span class="hljs-number">85.30</span>, <span class="hljs-number">88.88</span>),
	<span class="hljs-string">'score'</span>: <span class="hljs-number">87.23</span>,
	<span class="hljs-string">'standard_error'</span>: <span class="hljs-number">0.97</span>
	},
	<span class="hljs-string">'latency_in_seconds'</span>: <span class="hljs-number">0.0085</span>,
	<span class="hljs-string">'samples_per_second'</span>: <span class="hljs-number">117.31</span>,
	<span class="hljs-string">'total_time_in_seconds'</span>: <span class="hljs-number">8.52</span>
	}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="image-classification" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#image-classification"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Image classification</span></h2> <p data-svelte-h="svelte-ksrm73">With the image classification evaluator we can evaluate any image classifier. 
It uses the same keyword arguments as the text classifier:</p> <ul data-svelte-h="svelte-y0u0z3"><li><code>input_column="image"</code>: the name of the column containing the images as PIL ImageFile</li> <li><code>label_column="label"</code>: the name of the column containing the labels</li> <li><code>label_mapping=None</code>: We want to map class labels defined by the model in the pipeline to values consistent with those defined in the <code>label_column</code></li></ul> <p data-svelte-h="svelte-wu4hrz">Let’s have a look at how we can evaluate image classification models on large datasets.</p> <h3 class="relative group"><a id="handling-large-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#handling-large-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Handling large datasets</span></h3> <p data-svelte-h="svelte-1ewtkuj">The evaluator can be used on large datasets! Below, an example shows how to use it on ImageNet-1k for image classification. 
Beware that this example requires downloading ~150 GB of data.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data = load_dataset(<span class="hljs-string">"imagenet-1k"</span>, split=<span class="hljs-string">"validation"</span>, token=<span class="hljs-literal">True</span>)

	pipe = pipeline(
	task=<span class="hljs-string">"image-classification"</span>,
	model=<span class="hljs-string">"facebook/deit-small-distilled-patch16-224"</span>
	)

	task_evaluator = evaluator(<span class="hljs-string">"image-classification"</span>)
	eval_results = task_evaluator.compute(
	model_or_pipeline=pipe,
	data=data,
	metric=<span class="hljs-string">"accuracy"</span>,
	label_mapping=pipe.model.config.label2id
	)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1lgn5m5">Since we are using <code>datasets</code> to store data, we make use of a technique called memory mapping. This means that the dataset is never fully loaded into memory, which saves a lot of RAM. Running the above code only uses roughly 1.5 GB of RAM while the validation split is more than 30 GB in size.</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/evaluate/blob/main/docs/source/base_evaluator.mdx" target="_blank"><svg class="mr-1" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M31,16l-7,7l-1.41-1.41L28.17,16l-5.58-5.59L24,9l7,7z"></path><path d="M1,16l7-7l1.41,1.41L3.83,16l5.58,5.59L8,23l-7-7z"></path><path d="M12.419,25.484L17.639,6.552l1.932,0.518L14.351,26.002z"></path></svg> <span data-svelte-h="svelte-zjs2n5"><span class="underline">Update</span> on GitHub</span></a> <p></p>

	<script>
	{
		// Page bootstrap: publish the path settings, load the two entry modules,
		// then start the app inside the element that contains this <script>.

		// Deliberately an undeclared (global) assignment; presumably the imported
		// entry modules read this object by name — confirm before renaming.
		__sveltekit_95ljrq = {
			assets: "/docs/evaluate/main/en",
			base: "/docs/evaluate/main/en",
			env: {}
		};

		// Captured synchronously: document.currentScript is only available while
		// this script is executing, not later inside the promise callback.
		const mountPoint = document.currentScript.parentElement;

		// Per-node data handed to start(); both entries are null on this page.
		const pageData = [null, null];

		const entryModules = [
			import("/docs/evaluate/main/en/_app/immutable/entry/start.138f3e02.js"),
			import("/docs/evaluate/main/en/_app/immutable/entry/app.ad076786.js")
		];

		Promise.all(entryModules).then((loaded) => {
			const kit = loaded[0];
			const app = loaded[1];
			const startOptions = {
				node_ids: [0, 3],
				data: pageData,
				form: null,
				error: null
			};
			kit.start(app, mountPoint, startOptions);
		});
	}
	</script>

Xet Storage Details

Size:: 53.8 kB
Xet hash:: 6e34d34b387a64bacfb765817dba9dc7d45ebc015781ff38de84b6419c104563

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.