Spaces:
Running
Running
| <!doctype html> | |
| <html lang="en"> | |
| <head> | |
| <!-- The encoding declaration comes first so it is seen before any other head content is parsed. --> | |
| <meta charset="utf-8"> | |
| <!-- Both bundles are ES modules, so they are fetched without blocking the parser and run after the document is parsed. --> | |
| <script src="distill.bundle.js" type="module" fetchpriority="high" blocking></script> | |
| <script src="main.bundle.js" type="module" fetchpriority="low" defer></script> | |
| <meta name="viewport" content="width=device-width, initial-scale=1, shrink-to-fit=no"> | |
| <!-- Links open in a new tab unless they set their own target. --> | |
| <base target="_blank"> | |
| <title>Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks</title> | |
| <link rel="stylesheet" href="style.css"> | |
| </head> | |
| <body> | |
| <d-front-matter> | |
| <script id='distill-front-matter' type="text/json">{ | |
| "title": "📝 Scaling FineWeb to 1000+ languages: Step 1: finding signal in 100s of evaluation tasks", | |
| "description": "This blog post discusses multilingual evaluation and task signal, the process for selecting existing evaluation tasks based on signal (resulting in FineTasks), and a comparison of open- and closed-source models on FineTasks.", | |
| "published": "Oct 23, 2024", | |
| "affiliation": {"name": "HuggingFace"}, | |
| "authors": [ | |
| { | |
| "author":"Hynek Kydlíček", | |
| "authorURL":"https://huggingface.co/hynky" | |
| }, | |
| { | |
| "author":"Guilherme Penedo", | |
| "authorURL":"https://huggingface.co/guipenedo" | |
| }, | |
| { | |
| "author":"Clémentine Fourrier", | |
| "authorURL":"https://huggingface.co/clefourrier" | |
| }, | |
| { | |
| "author":"Nathan Habib", | |
| "authorURL":"https://huggingface.co/SaylorTwift" | |
| }, | |
| { | |
| "author":"Thomas Wolf", | |
| "authorURL":"https://huggingface.co/thomwolf" | |
| } | |
| ] | |
| }</script> | |
| </d-front-matter> | |
| <!-- Byline placeholder, empty in the markup; presumably filled in by distill.bundle.js from the front matter (confirm). --> | |
| <d-byline></d-byline> | |
| <d-article> | |
| <!-- Table-of-contents placeholder: the inline script at the end of <body> fills it from this article's h2/h3/h4 headings. --> | |
| <d-contents> | |
| </d-contents> | |
| <p>We're looking forward to revisiting this analysis in the future, not with just 9 languages, but at least 50—thanks to community contributions! Let's level the playing field between English and other languages together! 🤗</p> | |
| <d-math> 1+1=2 </d-math> | |
| </d-article> | |
| <d-appendix> | |
| <d-bibliography src="bibliography.bib"></d-bibliography> | |
| <style> | |
| /* Citation boxes in the appendix: small, muted text inside a lightly shaded, bordered box. */ | |
| d-appendix .citation { | |
|   font-size: 11px; | |
|   line-height: 15px; | |
|   /* A single shorthand each for border and padding sets all four sides. */ | |
|   border: 1px solid rgba(0, 0, 0, 0.1); | |
|   border-radius: 3px; | |
|   padding: 10px 18px; | |
|   /* Pulls the box up towards the paragraph that introduces it. */ | |
|   margin-top: -12px; | |
|   background: rgba(0, 0, 0, 0.02); | |
|   color: rgba(150, 150, 150, 1); | |
|   overflow: hidden; | |
|   /* Keep the authored line breaks, but wrap lines (and break long words such as URLs) that would overflow the box. */ | |
|   white-space: pre-wrap; | |
|   overflow-wrap: break-word; | |
| } | |
| </style> | |
| <h3 id="citation">Citation</h3> | |
| <p>For attribution in academic contexts, please cite this work as</p> | |
| <pre class="citation short">Kydlicek, et al., "FineTasks: Finding signal in a haystack of 200+ multilingual tasks", 2024.</pre> | |
| <p>BibTeX citation</p> | |
| <pre class="citation long">@misc{kydlicek2024finetasksmultilingualtasks, | |
| title={FineTasks: Finding signal in a haystack of 200+ multilingual tasks}, | |
| author={Hynek Kydlíček and Guilherme Penedo and Clémentine Fourrier and Nathan Habib and Thomas Wolf}, | |
| url={https://huggingface.co/spaces/HuggingFaceFW/blogpost-fine-tasks}, | |
| }</pre> | |
| </d-appendix> | |
| <script> | |
| const article = document.querySelector('d-article'); | |
| const toc = document.querySelector('d-contents'); | |
| if (toc) { | |
| const headings = article.querySelectorAll('h2, h3, h4'); | |
| let ToC = `<nav role="navigation" class="l-text figcaption"><h3>Table of contents</h3>`; | |
| let prevLevel = 0; | |
| for (const el of headings) { | |
| // should element be included in TOC? | |
| const isInTitle = el.parentElement.tagName == 'D-TITLE'; | |
| const isException = el.getAttribute('no-toc'); | |
| if (isInTitle || isException) continue; | |
| el.setAttribute('id', el.textContent.toLowerCase().replaceAll(" ", "_")) | |
| const link = '<a target="_self" href="' + '#' + el.getAttribute('id') + '">' + el.textContent + '</a>'; | |
| const level = el.tagName === 'H2' ? 0 : (el.tagName === 'H3' ? 1 : 2); | |
| while (prevLevel < level) { | |
| ToC += '<ul>' | |
| prevLevel++; | |
| } | |
| while (prevLevel > level) { | |
| ToC += '</ul>' | |
| prevLevel--; | |
| } | |
| if (level === 0) | |
| ToC += '<div>' + link + '</div>'; | |
| else | |
| ToC += '<li>' + link + '</li>'; | |
| } | |
| while (prevLevel > 0) { | |
| ToC += '</ul>' | |
| prevLevel--; | |
| } | |
| ToC += '</nav>'; | |
| toc.innerHTML = ToC; | |
| toc.setAttribute('prerendered', 'true'); | |
| const toc_links = document.querySelectorAll('d-contents > nav a'); | |
| window.addEventListener('scroll', (_event) => { | |
| if (typeof (headings) != 'undefined' && headings != null && typeof (toc_links) != 'undefined' && toc_links != null) { | |
| // Then iterate forwards, on the first match highlight it and break | |
| find_active: { | |
| for (let i = headings.length - 1; i >= 0; i--) { | |
| if (headings[i].getBoundingClientRect().top - 50 <= 0) { | |
| if (!toc_links[i].classList.contains("active")) { | |
| toc_links.forEach((link, _index) => { | |
| link.classList.remove("active"); | |
| }); | |
| toc_links[i].classList.add('active'); | |
| } | |
| break find_active; | |
| } | |
| } | |
| toc_links.forEach((link, _index) => { | |
| link.classList.remove("active"); | |
| }); | |
| } | |
| } | |
| }); | |
| } | |
| </script> | |
| </body> | |
| </html> | |