Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / lighteval /pr_722 /en /_app /immutable /nodes /9.cc64f3d2.js

rtrm's picture

about 2 months ago

14.2 kB

/**
 * Page node for the lighteval "Metric List" documentation page.
 *
 * NOTE(review): this looks like compiled, minified Svelte output (the `source`
 * prop below points at docs/source/metric-list.mdx in huggingface/lighteval) —
 * confirm, and prefer fixing text in that .mdx rather than in this file.
 *
 * Layout of this chunk:
 * - `be(X)`: builds the page fragment (the `X` argument is not used). It
 *   instantiates five `E` heading components ("Metric List", "Automatic
 *   metrics for multiple-choice tasks", "Automatic metrics for perplexity and
 *   language modeling", "Automatic metrics for generative tasks",
 *   "LLM-as-Judge") and one `_e` component that receives the .mdx source URL,
 *   then returns an object with the methods `c`, `l`, `h`, `m`, `p`, `i`, `o`
 *   and `d`. The strings `Y`, `ie`, `ae` are assigned through `textContent`;
 *   `ee`, `te`, `oe`, `le`, `ne`, `re` are assigned through `innerHTML`.
 * - In `l(e)`, the content of each node requested with `"data-svelte-h"` is
 *   reassigned only when `s(node)` differs from a hard-coded "svelte-…"
 *   literal (presumably a content hash — verify before hand-editing any of
 *   the strings, since the literal would have to change with them).
 * - `ve`: JSON string (title, local anchor, nested sections with depth) that
 *   `h()` passes to `se(c,"content",ve)` for the node named "hf:doc:metadata".
 * - `we(X)`: registers a callback through `he` that reads the `fw` query
 *   parameter from `window.location.search` and discards the value; returns
 *   an empty array (the `X` argument is not used).
 * - `$e`: class extending `ue`, wired up via `ge(this,c,we,be,de,{})` and
 *   exported as `component`.
 *
 * NOTE(review): in `ne`, `f1_score_macro` is listed twice; the second entry is
 * described as "micro" F1, so it presumably should read `f1_score_micro`.
 * NOTE(review): `te` refers to `recall@2_single_token` while `ee` uses
 * `recall_at_2_single_token` — confirm which name is correct.
 */
import{s as de,n as me,o as he}from"../chunks/scheduler.7da89386.js";import{S as ue,i as ge,g as n,s as l,r as L,A as fe,h as r,f as o,c as a,j as ce,u as C,x as s,k as se,y as pe,a as i,v as A,d as z,t as T,w as P}from"../chunks/index.20910acc.js";import{H as E,E as _e}from"../chunks/getInferenceSnippets.d539cff9.js";function be(X){let c,q,M,H,d,U,m,j,h,Y="These metrics use log-likelihood of the different possible targets.",R,u,ee="<li><code>loglikelihood_acc</code>: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token (<code>loglikelihood_acc_single_token</code>).</li> <li><code>loglikelihood_acc_norm</code>: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct - also exists in a faster version for tasks where the possible choices include only one token (<code>loglikelihood_acc_norm_single_token</code>).</li> <li><code>loglikelihood_acc_norm_nospace</code>: Fraction of instances where the choice with the best logprob, normalized by sequence length, was correct, with the first space ignored.</li> <li><code>loglikelihood_f1</code>: Corpus level F1 score of the multichoice selection - also exists in a faster version for tasks where the possible choices include only one token (<code>loglikelihood_f1_single_token</code>).</li> <li><code>mcc</code>: Matthew’s correlation coefficient (a measure of agreement between statistical distributions).</li> <li><code>recall_at_1</code>: Fraction of instances where the choice with the best logprob was correct - also exists in a faster version for tasks where the possible choices include only one token per choice (<code>recall_at_1_single_token</code>).</li> <li><code>recall_at_2</code>: Fraction of instances where the choice with the 2nd best logprob or better was correct - also exists in a faster version for tasks where the possible choices include only one token per 
choice (<code>recall_at_2_single_token</code>).</li> <li><code>mrr</code>: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance - also exists in a faster version for tasks where the possible choices include only one token (<code>mrr_single_token</code>).</li> <li><code>target_perplexity</code>: Perplexity of the different choices available.</li> <li><code>acc_golds_likelihood</code>: A bit different, it actually checks if the average logprob of a single target is above or below 0.5.</li> <li><code>multi_f1_numeric</code>: Loglikelihood F1 score for multiple gold targets.</li>",G,g,te="All these metrics also exist in a “single token” version (<code>loglikelihood_acc_single_token</code>, <code>loglikelihood_acc_norm_single_token</code>, <code>loglikelihood_f1_single_token</code>, <code>mcc_single_token</code>, <code>recall@2_single_token</code> and <code>mrr_single_token</code>). When the multichoice option compares only one token (ex: “A” vs “B” vs “C” vs “D”, or “yes” vs “no”), using these metrics in the single token version will divide the time spent by the number of choices. Single token evals also include:",O,f,oe="<li><code>multi_f1_numeric</code>: Computes the f1 score of all possible choices and averages it.</li>",B,p,I,_,ie="These metrics use log-likelihood of prompt.",S,b,le="<li><code>word_perplexity</code>: Perplexity (log probability of the input) weighted by the number of words of the sequence.</li> <li><code>byte_perplexity</code>: Perplexity (log probability of the input) weighted by the number of bytes of the sequence.</li> <li><code>bits_per_byte</code>: Average number of bits per byte according to model probabilities.</li> <li><code>log_prob</code>: Predicted output’s average log probability (input’s log prob for language modeling).</li>",W,v,D,w,ae="These metrics need the model to generate an output. 
They are therefore slower.",J,k,ne='<li>Base:<ul><li><code>perfect_exact_match</code>: Fraction of instances where the prediction matches the gold exactly.</li> <li><code>exact_match</code>: Fraction of instances where the prediction matches the gold with the exception of the border whitespaces (= after a <code>strip</code> has been applied to both).</li> <li><code>quasi_exact_match</code>: Fraction of instances where the normalized prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, …). Other variations exist, with other normalizers, such as <code>quasi_exact_match_triviaqa</code>, which only normalizes the predictions after applying a strip to all sentences.</li> <li><code>prefix_exact_match</code>: Fraction of instances where the beginning of the prediction matches the gold at the exception of the border whitespaces (= after a <code>strip</code> has been applied to both).</li> <li><code>prefix_quasi_exact_match</code>: Fraction of instances where the normalized beginning of the prediction matches the normalized gold (normalization done on whitespace, articles, capitalization, …).</li> <li><code>exact_match_indicator</code>: Exact match with some preceding context (before an indicator) removed.</li> <li><code>f1_score_quasi</code>: Average F1 score in terms of word overlap between the model output and gold, with both being normalized first.</li> <li><code>f1_score</code>: Average F1 score in terms of word overlap between the model output and gold without normalisation.</li> <li><code>f1_score_macro</code>: Corpus level macro F1 score.</li> <li><code>f1_score_macro</code>: Corpus level micro F1 score.</li> <li><code>maj_at_5</code> and <code>maj_at_8</code>: Model majority vote. 
Takes n (5 or 8) generations from the model and assumes the most frequent is the actual prediction.</li></ul></li> <li>Summarization:<ul><li><code>rouge</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a>.</li> <li><code>rouge1</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on 1-gram overlap.</li> <li><code>rouge2</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on 2-gram overlap.</li> <li><code>rougeL</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on longest common subsequence overlap.</li> <li><code>rougeLsum</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on longest common subsequence overlap.</li> <li><code>rouge_t5</code> (BigBench): Corpus level ROUGE score for all available ROUGE metrics.</li> <li><code>faithfulness</code>: Faithfulness scores based on the SummaC method of <a href="https://aclanthology.org/2022.tacl-1.10/" rel="nofollow">Laban et al. 
(2022)</a>.</li> <li><code>extractiveness</code>: Reports, based on <a href="https://aclanthology.org/N18-1065/" rel="nofollow">(Grusky et al., 2018)</a>:<ul><li><code>summarization_coverage</code>: Extent to which the model-generated summaries are extractive fragments from the source document,</li> <li><code>summarization_density</code>: Extent to which the model-generated summaries are extractive summaries based on the source document,</li> <li><code>summarization_compression</code>: Extent to which the model-generated summaries are compressed relative to the source document.</li></ul></li> <li><code>bert_score</code>: Reports the average BERTScore precision, recall, and f1 score <a href="https://openreview.net/pdf?id=SkeHuCVFDr" rel="nofollow">(Zhang et al., 2020)</a> between model generation and gold summary.</li></ul></li> <li>Translation:<ul><li><code>bleu</code>: Corpus level BLEU score <a href="https://aclanthology.org/P02-1040/" rel="nofollow">(Papineni et al., 2002)</a> - uses the sacrebleu implementation.</li> <li><code>bleu_1</code>: Average sample BLEU score <a href="https://aclanthology.org/P02-1040/" rel="nofollow">(Papineni et al., 2002)</a> based on 1-gram overlap - uses the nltk implementation.</li> <li><code>bleu_4</code>: Average sample BLEU score <a href="https://aclanthology.org/P02-1040/" rel="nofollow">(Papineni et al., 2002)</a> based on 4-gram overlap - uses the nltk implementation.</li> <li><code>chrf</code>: Character n-gram matches f-score.</li> <li><code>ter</code>: Translation edit/error rate.</li></ul></li> <li>Copyright:<ul><li><code>copyright</code>: Reports:<ul><li><code>longest_common_prefix_length</code>: Average length of longest common prefix between model generation and reference,</li> <li><code>edit_distance</code>: Average Levenshtein edit distance between model generation and reference,</li> <li><code>edit_similarity</code>: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model 
generation and reference.</li></ul></li></ul></li> <li>Math:<ul><li><code>quasi_exact_match_math</code>: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for math, where latex symbols, units, etc are removed).</li> <li><code>maj_at_4_math</code>: Majority choice evaluation, using the math normalisation for the predictions and gold.</li> <li><code>quasi_exact_match_gsm8k</code>: Fraction of instances where the normalized prediction matches the normalized gold (normalization done for gsm8k, where latex symbols, units, etc are removed).</li> <li><code>maj_at_8_gsm8k</code>: Majority choice evaluation, using the gsm8k normalisation for the predictions and gold.</li></ul></li>',N,y,V,x,re="<li><code>llm_judge_gpt3p5</code>: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API.</li> <li><code>llm_judge_llama_3_405b</code>: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API.</li> <li><code>llm_judge_multi_turn_gpt3p5</code>: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. It is used for multiturn tasks like mt-bench.</li> <li><code>llm_judge_multi_turn_llama_3_405b</code>: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. 
It is used for multiturn tasks like mt-bench.</li>",Z,$,K,F,Q;return d=new E({props:{title:"Metric List",local:"metric-list",headingTag:"h1"}}),m=new E({props:{title:"Automatic metrics for multiple-choice tasks",local:"automatic-metrics-for-multiple-choice-tasks",headingTag:"h2"}}),p=new E({props:{title:"Automatic metrics for perplexity and language modeling",local:"automatic-metrics-for-perplexity-and-language-modeling",headingTag:"h2"}}),v=new E({props:{title:"Automatic metrics for generative tasks",local:"automatic-metrics-for-generative-tasks",headingTag:"h2"}}),y=new E({props:{title:"LLM-as-Judge",local:"llm-as-judge",headingTag:"h2"}}),$=new _e({props:{source:"https://github.com/huggingface/lighteval/blob/main/docs/source/metric-list.mdx"}}),{c(){c=n("meta"),q=l(),M=n("p"),H=l(),L(d.$$.fragment),U=l(),L(m.$$.fragment),j=l(),h=n("p"),h.textContent=Y,R=l(),u=n("ul"),u.innerHTML=ee,G=l(),g=n("p"),g.innerHTML=te,O=l(),f=n("ul"),f.innerHTML=oe,B=l(),L(p.$$.fragment),I=l(),_=n("p"),_.textContent=ie,S=l(),b=n("ul"),b.innerHTML=le,W=l(),L(v.$$.fragment),D=l(),w=n("p"),w.textContent=ae,J=l(),k=n("ul"),k.innerHTML=ne,N=l(),L(y.$$.fragment),V=l(),x=n("ul"),x.innerHTML=re,Z=l(),L($.$$.fragment),K=l(),F=n("p"),this.h()},l(e){const 
t=fe("svelte-u9bgzb",document.head);c=r(t,"META",{name:!0,content:!0}),t.forEach(o),q=a(e),M=r(e,"P",{}),ce(M).forEach(o),H=a(e),C(d.$$.fragment,e),U=a(e),C(m.$$.fragment,e),j=a(e),h=r(e,"P",{"data-svelte-h":!0}),s(h)!=="svelte-108o49i"&&(h.textContent=Y),R=a(e),u=r(e,"UL",{"data-svelte-h":!0}),s(u)!=="svelte-17kxjf7"&&(u.innerHTML=ee),G=a(e),g=r(e,"P",{"data-svelte-h":!0}),s(g)!=="svelte-1u8onu4"&&(g.innerHTML=te),O=a(e),f=r(e,"UL",{"data-svelte-h":!0}),s(f)!=="svelte-hkq1ua"&&(f.innerHTML=oe),B=a(e),C(p.$$.fragment,e),I=a(e),_=r(e,"P",{"data-svelte-h":!0}),s(_)!=="svelte-3tccvl"&&(_.textContent=ie),S=a(e),b=r(e,"UL",{"data-svelte-h":!0}),s(b)!=="svelte-zznuqn"&&(b.innerHTML=le),W=a(e),C(v.$$.fragment,e),D=a(e),w=r(e,"P",{"data-svelte-h":!0}),s(w)!=="svelte-14ncypm"&&(w.textContent=ae),J=a(e),k=r(e,"UL",{"data-svelte-h":!0}),s(k)!=="svelte-hbi7zb"&&(k.innerHTML=ne),N=a(e),C(y.$$.fragment,e),V=a(e),x=r(e,"UL",{"data-svelte-h":!0}),s(x)!=="svelte-kd4sgy"&&(x.innerHTML=re),Z=a(e),C($.$$.fragment,e),K=a(e),F=r(e,"P",{}),ce(F).forEach(o),this.h()},h(){se(c,"name","hf:doc:metadata"),se(c,"content",ve)},m(e,t){pe(document.head,c),i(e,q,t),i(e,M,t),i(e,H,t),A(d,e,t),i(e,U,t),A(m,e,t),i(e,j,t),i(e,h,t),i(e,R,t),i(e,u,t),i(e,G,t),i(e,g,t),i(e,O,t),i(e,f,t),i(e,B,t),A(p,e,t),i(e,I,t),i(e,_,t),i(e,S,t),i(e,b,t),i(e,W,t),A(v,e,t),i(e,D,t),i(e,w,t),i(e,J,t),i(e,k,t),i(e,N,t),A(y,e,t),i(e,V,t),i(e,x,t),i(e,Z,t),A($,e,t),i(e,K,t),i(e,F,t),Q=!0},p:me,i(e){Q||(z(d.$$.fragment,e),z(m.$$.fragment,e),z(p.$$.fragment,e),z(v.$$.fragment,e),z(y.$$.fragment,e),z($.$$.fragment,e),Q=!0)},o(e){T(d.$$.fragment,e),T(m.$$.fragment,e),T(p.$$.fragment,e),T(v.$$.fragment,e),T(y.$$.fragment,e),T($.$$.fragment,e),Q=!1},d(e){e&&(o(q),o(M),o(H),o(U),o(j),o(h),o(R),o(u),o(G),o(g),o(O),o(f),o(B),o(I),o(_),o(S),o(b),o(W),o(D),o(w),o(J),o(k),o(N),o(V),o(x),o(Z),o(K),o(F)),o(c),P(d,e),P(m,e),P(p,e),P(v,e),P(y,e),P($,e)}}}const ve='{"title":"Metric List","local":"metric-list","sections":[{"title":"Automatic 
metrics for multiple-choice tasks","local":"automatic-metrics-for-multiple-choice-tasks","sections":[],"depth":2},{"title":"Automatic metrics for perplexity and language modeling","local":"automatic-metrics-for-perplexity-and-language-modeling","sections":[],"depth":2},{"title":"Automatic metrics for generative tasks","local":"automatic-metrics-for-generative-tasks","sections":[],"depth":2},{"title":"LLM-as-Judge","local":"llm-as-judge","sections":[],"depth":2}],"depth":1}';function we(X){return he(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class $e extends ue{constructor(c){super(),ge(this,c,we,be,de,{})}}export{$e as component};

Xet Storage Details

Size: 14.2 kB
Xet hash: b1a1544147544a45d2ab98675bf3abb1d5e7d31f92b2b462f9c5f682a786905d

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.