Buckets:

hf-doc-build
/

doc-dev

hf-doc-build/doc-dev / lighteval /pr_994 /en /_app /immutable /nodes /10.f78cd7ae.js

rtrm's picture

about 2 months ago

10.7 kB

import{s as re,n as ne,o as se}from"../chunks/scheduler.5f3e6389.js";import{S as ce,i as me,e as r,s as i,c as x,h as de,a as n,d as l,b as a,f as ie,g as k,j as L,k as ae,l as ue,m as o,n as C,t as A,o as P,p as T}from"../chunks/index.373ab25c.js";import{C as ge,H as z,E as pe}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.256a018f.js";function fe(K){let s,U,E,H,c,F,m,R,d,j,u,Q="These metrics use log-likelihood of the different possible targets.",G,g,X="<li><code>loglikelihood_acc</code>: Fraction of instances where the choice with the best logprob was correct - we recommend using a normalization by length</li> <li><code>loglikelihood_f1</code>: Corpus level F1 score of the multichoice selection</li> <li><code>mcc</code>: Matthew’s correlation coefficient (a measure of agreement between statistical distributions).</li> <li><code>recall_at_k</code>: Fraction of instances where the choice with the k-st best logprob or better was correct</li> <li><code>mrr</code>: Mean reciprocal rank, a measure of the quality of a ranking of choices ordered by correctness/relevance</li> <li><code>target_perplexity</code>: Perplexity of the different choices available.</li> <li><code>acc_golds_likelihood</code>: A bit different, it actually checks if the average logprob of a single target is above or below 0.5.</li> <li><code>multi_f1_numeric</code>: Loglikelihood F1 score for multiple gold targets.</li>",B,p,O,f,Y="These metrics use log-likelihood of prompt.",S,h,ee="<li><code>word_perplexity</code>: Perplexity (log probability of the input) weighted by the number of words of the sequence.</li> <li><code>byte_perplexity</code>: Perplexity (log probability of the input) weighted by the number of bytes of the sequence.</li> <li><code>bits_per_byte</code>: Average number of bits per byte according to model probabilities.</li> <li><code>log_prob</code>: Predicted output’s average log probability (input’s log prob for language modeling).</li>",q,b,I,_,te="These metrics need the model to generate an output. They are therefore slower.",W,v,le='<li>Base:<ul><li><code>exact_match</code>: Fraction of instances where the prediction matches the gold. Several variations can be made through parametrization:<ul><li>normalization on string pre-comparision on whitespace, articles, capitalization, …</li> <li>comparing the full string, or only subsets (prefix, suffix, …)</li></ul></li> <li><code>maj_at_k</code>: Model majority vote. Samples k generations from the model and assumes the most frequent is the actual prediction.</li> <li><code>f1_score</code>: Average F1 score in terms of word overlap between the model output and gold (normalisation optional).</li> <li><code>f1_score_macro</code>: Corpus level macro F1 score.</li> <li><code>f1_score_macro</code>: Corpus level micro F1 score.</li></ul></li> <li>Summarization:<ul><li><code>rouge</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a>.</li> <li><code>rouge1</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on 1-gram overlap.</li> <li><code>rouge2</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on 2-gram overlap.</li> <li><code>rougeL</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on longest common subsequence overlap.</li> <li><code>rougeLsum</code>: Average ROUGE score <a href="https://aclanthology.org/W04-1013/" rel="nofollow">(Lin, 2004)</a> based on longest common subsequence overlap.</li> <li><code>rouge_t5</code> (BigBench): Corpus level ROUGE score for all available ROUGE metrics.</li> <li><code>faithfulness</code>: Faithfulness scores based on the SummaC method of <a href="https://aclanthology.org/2022.tacl-1.10/" rel="nofollow">Laban et al. (2022)</a>.</li> <li><code>extractiveness</code>: Reports, based on <a href="https://aclanthology.org/N18-1065/" rel="nofollow">(Grusky et al., 2018)</a>:<ul><li><code>summarization_coverage</code>: Extent to which the model-generated summaries are extractive fragments from the source document,</li> <li><code>summarization_density</code>: Extent to which the model-generated summaries are extractive summaries based on the source document,</li> <li><code>summarization_compression</code>: Extent to which the model-generated summaries are compressed relative to the source document.</li></ul></li> <li><code>bert_score</code>: Reports the average BERTScore precision, recall, and f1 score <a href="https://openreview.net/pdf?id=SkeHuCVFDr" rel="nofollow">(Zhang et al., 2020)</a> between model generation and gold summary.</li></ul></li> <li>Translation:<ul><li><code>bleu</code>: Corpus level BLEU score <a href="https://aclanthology.org/P02-1040/" rel="nofollow">(Papineni et al., 2002)</a> - uses the sacrebleu implementation.</li> <li><code>bleu_1</code>: Average sample BLEU score <a href="https://aclanthology.org/P02-1040/" rel="nofollow">(Papineni et al., 2002)</a> based on 1-gram overlap - uses the nltk implementation.</li> <li><code>bleu_4</code>: Average sample BLEU score <a href="https://aclanthology.org/P02-1040/" rel="nofollow">(Papineni et al., 2002)</a> based on 4-gram overlap - uses the nltk implementation.</li> <li><code>chrf</code>: Character n-gram matches f-score.</li> <li><code>ter</code>: Translation edit/error rate.</li></ul></li> <li>Copyright:<ul><li><code>copyright</code>: Reports:<ul><li><code>longest_common_prefix_length</code>: Average length of longest common prefix between model generation and reference,</li> <li><code>edit_distance</code>: Average Levenshtein edit distance between model generation and reference,</li> <li><code>edit_similarity</code>: Average Levenshtein edit similarity (normalized by the length of longer sequence) between model generation and reference.</li></ul></li></ul></li> <li>Math:<ul><li>Both <code>exact_match</code> and <code>maj_at_k</code> can be used to evaluate mathematics tasks with math specific normalization to remove and filter latex.</li></ul></li>',J,y,D,$,oe="<li><code>llm_judge_gpt3p5</code>: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API.</li> <li><code>llm_judge_llama_3_405b</code>: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API.</li> <li><code>llm_judge_multi_turn_gpt3p5</code>: Can be used for any generative task, the model will be scored by a GPT3.5 model using the OpenAI API. It is used for multiturn tasks like mt-bench.</li> <li><code>llm_judge_multi_turn_llama_3_405b</code>: Can be used for any generative task, the model will be scored by a Llama 3.405B model using the HuggingFace API. It is used for multiturn tasks like mt-bench.</li>",N,w,V,M,Z;return c=new ge({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),m=new z({props:{title:"Metric List",local:"metric-list",headingTag:"h1"}}),d=new z({props:{title:"Automatic metrics for multiple-choice tasks",local:"automatic-metrics-for-multiple-choice-tasks",headingTag:"h2"}}),p=new z({props:{title:"Automatic metrics for perplexity and language modeling",local:"automatic-metrics-for-perplexity-and-language-modeling",headingTag:"h2"}}),b=new z({props:{title:"Automatic metrics for generative tasks",local:"automatic-metrics-for-generative-tasks",headingTag:"h2"}}),y=new z({props:{title:"LLM-as-Judge",local:"llm-as-judge",headingTag:"h2"}}),w=new pe({props:{source:"https://github.com/huggingface/lighteval/blob/main/docs/source/metric-list.mdx"}}),{c(){s=r("meta"),U=i(),E=r("p"),H=i(),x(c.$$.fragment),F=i(),x(m.$$.fragment),R=i(),x(d.$$.fragment),j=i(),u=r("p"),u.textContent=Q,G=i(),g=r("ul"),g.innerHTML=X,B=i(),x(p.$$.fragment),O=i(),f=r("p"),f.textContent=Y,S=i(),h=r("ul"),h.innerHTML=ee,q=i(),x(b.$$.fragment),I=i(),_=r("p"),_.textContent=te,W=i(),v=r("ul"),v.innerHTML=le,J=i(),x(y.$$.fragment),D=i(),$=r("ul"),$.innerHTML=oe,N=i(),x(w.$$.fragment),V=i(),M=r("p"),this.h()},l(e){const t=de("svelte-u9bgzb",document.head);s=n(t,"META",{name:!0,content:!0}),t.forEach(l),U=a(e),E=n(e,"P",{}),ie(E).forEach(l),H=a(e),k(c.$$.fragment,e),F=a(e),k(m.$$.fragment,e),R=a(e),k(d.$$.fragment,e),j=a(e),u=n(e,"P",{"data-svelte-h":!0}),L(u)!=="svelte-108o49i"&&(u.textContent=Q),G=a(e),g=n(e,"UL",{"data-svelte-h":!0}),L(g)!=="svelte-7pjl1z"&&(g.innerHTML=X),B=a(e),k(p.$$.fragment,e),O=a(e),f=n(e,"P",{"data-svelte-h":!0}),L(f)!=="svelte-3tccvl"&&(f.textContent=Y),S=a(e),h=n(e,"UL",{"data-svelte-h":!0}),L(h)!=="svelte-zznuqn"&&(h.innerHTML=ee),q=a(e),k(b.$$.fragment,e),I=a(e),_=n(e,"P",{"data-svelte-h":!0}),L(_)!=="svelte-14ncypm"&&(_.textContent=te),W=a(e),v=n(e,"UL",{"data-svelte-h":!0}),L(v)!=="svelte-3lgsi0"&&(v.innerHTML=le),J=a(e),k(y.$$.fragment,e),D=a(e),$=n(e,"UL",{"data-svelte-h":!0}),L($)!=="svelte-kd4sgy"&&($.innerHTML=oe),N=a(e),k(w.$$.fragment,e),V=a(e),M=n(e,"P",{}),ie(M).forEach(l),this.h()},h(){ae(s,"name","hf:doc:metadata"),ae(s,"content",he)},m(e,t){ue(document.head,s),o(e,U,t),o(e,E,t),o(e,H,t),C(c,e,t),o(e,F,t),C(m,e,t),o(e,R,t),C(d,e,t),o(e,j,t),o(e,u,t),o(e,G,t),o(e,g,t),o(e,B,t),C(p,e,t),o(e,O,t),o(e,f,t),o(e,S,t),o(e,h,t),o(e,q,t),C(b,e,t),o(e,I,t),o(e,_,t),o(e,W,t),o(e,v,t),o(e,J,t),C(y,e,t),o(e,D,t),o(e,$,t),o(e,N,t),C(w,e,t),o(e,V,t),o(e,M,t),Z=!0},p:ne,i(e){Z||(A(c.$$.fragment,e),A(m.$$.fragment,e),A(d.$$.fragment,e),A(p.$$.fragment,e),A(b.$$.fragment,e),A(y.$$.fragment,e),A(w.$$.fragment,e),Z=!0)},o(e){P(c.$$.fragment,e),P(m.$$.fragment,e),P(d.$$.fragment,e),P(p.$$.fragment,e),P(b.$$.fragment,e),P(y.$$.fragment,e),P(w.$$.fragment,e),Z=!1},d(e){e&&(l(U),l(E),l(H),l(F),l(R),l(j),l(u),l(G),l(g),l(B),l(O),l(f),l(S),l(h),l(q),l(I),l(_),l(W),l(v),l(J),l(D),l($),l(N),l(V),l(M)),l(s),T(c,e),T(m,e),T(d,e),T(p,e),T(b,e),T(y,e),T(w,e)}}}const he='{"title":"Metric List","local":"metric-list","sections":[{"title":"Automatic metrics for multiple-choice tasks","local":"automatic-metrics-for-multiple-choice-tasks","sections":[],"depth":2},{"title":"Automatic metrics for perplexity and language modeling","local":"automatic-metrics-for-perplexity-and-language-modeling","sections":[],"depth":2},{"title":"Automatic metrics for generative tasks","local":"automatic-metrics-for-generative-tasks","sections":[],"depth":2},{"title":"LLM-as-Judge","local":"llm-as-judge","sections":[],"depth":2}],"depth":1}';function be(K){return se(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class $e extends ce{constructor(s){super(),me(this,s,be,fe,re,{})}}export{$e as component};

Xet Storage Details

Size:: 10.7 kB
Xet hash:: e6d774ec6d1a364ee807b0888f267ed68c43ff3b7a27114adaf69c7032e79aba

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.