Buckets:

rtrm's picture
download
raw
99.8 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Criando seu próprio dataset&quot;,&quot;local&quot;:&quot;criando-seu-próprio-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Obtendo os dados&quot;,&quot;local&quot;:&quot;obtendo-os-dados&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Limpando os dados&quot;,&quot;local&quot;:&quot;limpando-os-dados&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Aumentando o conjunto de dados&quot;,&quot;local&quot;:&quot;aumentando-o-conjunto-de-dados&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Carregando o conjunto de dados para o Hugging Face Hub&quot;,&quot;local&quot;:&quot;carregando-o-conjunto-de-dados-para-o-hugging-face-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Criando um cartão do datasets&quot;,&quot;local&quot;:&quot;criando-um-cartão-do-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/pt/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/entry/start.854f6ddb.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/singletons.5a4db441.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/paths.052a2e73.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/entry/app.99d5705b.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/nodes/0.24f28b67.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/nodes/32.6ec3f6e3.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/pt/_app/immutable/chunks/getInferenceSnippets.24b50994.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Criando seu próprio dataset&quot;,&quot;local&quot;:&quot;criando-seu-próprio-dataset&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Obtendo os dados&quot;,&quot;local&quot;:&quot;obtendo-os-dados&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Limpando os dados&quot;,&quot;local&quot;:&quot;limpando-os-dados&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Aumentando o conjunto de dados&quot;,&quot;local&quot;:&quot;aumentando-o-conjunto-de-dados&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Carregando o conjunto de dados para o Hugging Face Hub&quot;,&quot;local&quot;:&quot;carregando-o-conjunto-de-dados-para-o-hugging-face-hub&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Criando um cartão do datasets&quot;,&quot;local&quot;:&quot;criando-um-cartão-do-datasets&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="criando-seu-próprio-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#criando-seu-próprio-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Criando seu próprio dataset</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/pt/chapter5/section5.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/pt/chapter5/section5.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-qzjzzu">Às vezes, o conjunto de dados de que você precisa para criar um aplicativo de PLN não existe, portanto, você mesmo precisará criá-lo. Nesta seção, mostraremos como criar um corpus de <a href="https://github.com/features/issues/" rel="nofollow">issues do GitHub</a>, que são comumente usados ​​para rastrear bugs ou recursos nos repositórios do GitHub. Este corpus pode ser usado para vários fins, incluindo:</p> <ul data-svelte-h="svelte-13fgdbo"><li>Explorar quanto tempo leva para fechar as issues abertos ou pull requests</li> <li>Treinar um <em>classificador multilabel</em> que pode marcar issues com metadados com base na descrição da issue (por exemplo, “bug”, “melhoria” ou “pergunta”)</li> <li>Criando um mecanismo de pesquisa semântica para descobrir quais issues correspondem à consulta de um usuário</li></ul> <p data-svelte-h="svelte-1t92jpo">Aqui nos concentraremos na criação do corpus e, na próxima seção, abordaremos o aplicativo de pesquisa semântica. Para manter a meta, usaremos as issues do GitHub associados a um projeto de código aberto popular: 🤗 Datasets! Vamos dar uma olhada em como obter os dados e explorar as informações contidas nessas edições.</p> <h2 class="relative group"><a id="obtendo-os-dados" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#obtendo-os-dados"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Obtendo os dados</span></h2> <p data-svelte-h="svelte-724jk">Você pode encontrar todos as issues em 🤗 Datasets navegando até a <a href="https://github.com/huggingface/datasets/issues" rel="nofollow">guia de issues</a> do repositório. Conforme mostrado na captura de tela a seguir, no momento da redação, havia 331 issues abertos e 668 fechados.</p> <div class="flex justify-center" data-svelte-h="svelte-1htetkm"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues.png" alt="The GitHub issues associated with 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-1abvgj5">Se você clicar em uma dessas issues, verá que ele contém um título, uma descrição e um conjunto de rótulos que caracterizam a issue. Um exemplo é mostrado na captura de tela abaixo.</p> <div class="flex justify-center" data-svelte-h="svelte-1jsgvzc"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-single.png" alt="A typical GitHub issue in the 🤗 Datasets repository." width="80%"></div> <p data-svelte-h="svelte-jyvc3q">Para baixar todos as issues do repositório, usaremos a <a href="https://docs.github.com/en/rest" rel="nofollow">GitHub REST API</a> para pesquisar o [<code>Issues</code> endpoint](<a href="https://docs.github." rel="nofollow">https://docs.github.</a> com/en/rest/reference/issues#list-repository-issues). Esse endpoint retorna uma lista de objetos JSON, com cada objeto contendo um grande número de campos que incluem o título e a descrição, bem como metadados sobre o status da issue e assim por diante.</p> <p data-svelte-h="svelte-2cc0nx">Uma maneira conveniente de baixar as issues é por meio da biblioteca <code>requests</code>, que é a maneira padrão de fazer solicitações HTTP em Python. Você pode instalar a biblioteca executando:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install requests<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-nc35xk">Uma vez que a biblioteca esteja instalada, você pode fazer solicitações GET para o endpoint <code>Issues</code> invocando a função <code>requests.get()</code>. Por exemplo, você pode executar o seguinte comando para recuperar a primeira issue na primeira página:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> requests
url = <span class="hljs-string">&quot;https://api.github.com/repos/huggingface/datasets/issues?page=1&amp;per_page=1&quot;</span>
response = requests.get(url)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ag02sh">O objeto <code>response</code> contém muitas informações úteis sobre a solicitação, incluindo o código de status HTTP:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.status_code<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">200</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1t5kbkx">onde um status <code>200</code> significa que a solicitação foi bem-sucedida (você pode encontrar uma lista de possíveis códigos de status HTTP <a href="https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" rel="nofollow">aqui</a>). O que realmente nos interessa, porém, é o <em>payload</em>, que pode ser acessado em vários formatos como bytes, strings ou JSON. Como sabemos que nossas issues estão no formato JSON, vamos inspecionar o payload da seguinte forma:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792&#x27;</span>,
<span class="hljs-string">&#x27;repository_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets&#x27;</span>,
<span class="hljs-string">&#x27;labels_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}&#x27;</span>,
<span class="hljs-string">&#x27;comments_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/comments&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792/events&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">968650274</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0&#x27;</span>,
<span class="hljs-string">&#x27;number&#x27;</span>: <span class="hljs-number">2792</span>,
<span class="hljs-string">&#x27;title&#x27;</span>: <span class="hljs-string">&#x27;Update GooAQ&#x27;</span>,
<span class="hljs-string">&#x27;user&#x27;</span>: {<span class="hljs-string">&#x27;login&#x27;</span>: <span class="hljs-string">&#x27;bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">19718818</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDQ6VXNlcjE5NzE4ODE4&#x27;</span>,
<span class="hljs-string">&#x27;avatar_url&#x27;</span>: <span class="hljs-string">&#x27;https://avatars.githubusercontent.com/u/19718818?v=4&#x27;</span>,
<span class="hljs-string">&#x27;gravatar_id&#x27;</span>: <span class="hljs-string">&#x27;&#x27;</span>,
<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;followers_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/followers&#x27;</span>,
<span class="hljs-string">&#x27;following_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/following{/other_user}&#x27;</span>,
<span class="hljs-string">&#x27;gists_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/gists{/gist_id}&#x27;</span>,
<span class="hljs-string">&#x27;starred_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}&#x27;</span>,
<span class="hljs-string">&#x27;subscriptions_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/subscriptions&#x27;</span>,
<span class="hljs-string">&#x27;organizations_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/orgs&#x27;</span>,
<span class="hljs-string">&#x27;repos_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/repos&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/events{/privacy}&#x27;</span>,
<span class="hljs-string">&#x27;received_events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/received_events&#x27;</span>,
<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;User&#x27;</span>,
<span class="hljs-string">&#x27;site_admin&#x27;</span>: <span class="hljs-literal">False</span>},
<span class="hljs-string">&#x27;labels&#x27;</span>: [],
<span class="hljs-string">&#x27;state&#x27;</span>: <span class="hljs-string">&#x27;open&#x27;</span>,
<span class="hljs-string">&#x27;locked&#x27;</span>: <span class="hljs-literal">False</span>,
<span class="hljs-string">&#x27;assignee&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;assignees&#x27;</span>: [],
<span class="hljs-string">&#x27;milestone&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;comments&#x27;</span>: <span class="hljs-number">1</span>,
<span class="hljs-string">&#x27;created_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T11:40:18Z&#x27;</span>,
<span class="hljs-string">&#x27;updated_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:31:17Z&#x27;</span>,
<span class="hljs-string">&#x27;closed_at&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;author_association&#x27;</span>: <span class="hljs-string">&#x27;CONTRIBUTOR&#x27;</span>,
<span class="hljs-string">&#x27;active_lock_reason&#x27;</span>: <span class="hljs-literal">None</span>,
<span class="hljs-string">&#x27;pull_request&#x27;</span>: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/2792&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792&#x27;</span>,
<span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792.diff&#x27;</span>,
<span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792.patch&#x27;</span>},
<span class="hljs-string">&#x27;body&#x27;</span>: <span class="hljs-string">&#x27;[GooAQ](https://github.com/allenai/gooaq) dataset was recently updated after splits were added for the same. This PR contains new updated GooAQ with train/val/test splits and updated README as well.&#x27;</span>,
<span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15zd9ok">Uau, é muita informação! Podemos ver campos úteis como <code>title</code>, <code>body</code> e <code>number</code> que descrevem a issue, bem como informações sobre o usuário do GitHub que abriu a issue.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1e5rp6z">✏️ <strong>Experimente!</strong> Clique em alguns dos URLs na carga JSON acima para ter uma ideia de que tipo de informação cada issue do GitHub está vinculado.</p></div> <p data-svelte-h="svelte-nznw16">Conforme descrito na [documentação] do GitHub (<a href="https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting" rel="nofollow">https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting</a>), as solicitações não autenticadas são limitadas a 60 solicitações por hora. Embora você possa aumentar o parâmetro de consulta <code>per_page</code> para reduzir o número de solicitações feitas, você ainda atingirá o limite de taxa em qualquer repositório que tenha mais do que alguns milhares de issues. Então, em vez disso, você deve seguir as [instruções] do GitHub (<a href="https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token" rel="nofollow">https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token</a>) sobre como criar um <em>token de acesso pessoal</em> para que você pode aumentar o limite de taxa para 5.000 solicitações por hora. Depois de ter seu token, você pode incluí-lo como parte do cabeçalho da solicitação:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->GITHUB_TOKEN = xxx <span class="hljs-comment"># Copy your GitHub token here</span>
headers = {<span class="hljs-string">&quot;Authorization&quot;</span>: <span class="hljs-string">f&quot;token <span class="hljs-subst">{GITHUB_TOKEN}</span>&quot;</span>}<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1vzn3qq">⚠️ Não compartilhe um notebook com seu <code>GITHUB_TOKEN</code> colado nele. Recomendamos que você exclua a última célula depois de executá-la para evitar o vazamento dessas informações acidentalmente. Melhor ainda, armazene o token em um arquivo <em>.env</em> e use a <a href="https://github.com/theskumar/python-dotenv" rel="nofollow"><code>python-dotenv</code> library</a> para carregá-lo automaticamente para você como uma variável de ambiente.</p></div> <p data-svelte-h="svelte-1u5zgji">Agora que temos nosso token de acesso, vamos criar uma função que possa baixar todas as issues de um repositório do GitHub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time
<span class="hljs-keyword">import</span> math
<span class="hljs-keyword">from</span> pathlib <span class="hljs-keyword">import</span> Path
<span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
<span class="hljs-keyword">from</span> tqdm.notebook <span class="hljs-keyword">import</span> tqdm
<span class="hljs-keyword">def</span> <span class="hljs-title function_">fetch_issues</span>(<span class="hljs-params">
owner=<span class="hljs-string">&quot;huggingface&quot;</span>,
repo=<span class="hljs-string">&quot;datasets&quot;</span>,
num_issues=<span class="hljs-number">10_000</span>,
rate_limit=<span class="hljs-number">5_000</span>,
issues_path=Path(<span class="hljs-params"><span class="hljs-string">&quot;.&quot;</span></span>),
</span>):
<span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> issues_path.is_dir():
issues_path.mkdir(exist_ok=<span class="hljs-literal">True</span>)
batch = []
all_issues = []
per_page = <span class="hljs-number">100</span> <span class="hljs-comment"># Number of issues to return per page</span>
num_pages = math.ceil(num_issues / per_page)
base_url = <span class="hljs-string">&quot;https://api.github.com/repos&quot;</span>
<span class="hljs-keyword">for</span> page <span class="hljs-keyword">in</span> tqdm(<span class="hljs-built_in">range</span>(num_pages)):
<span class="hljs-comment"># Query with state=all to get both open and closed issues</span>
query = <span class="hljs-string">f&quot;issues?page=<span class="hljs-subst">{page}</span>&amp;per_page=<span class="hljs-subst">{per_page}</span>&amp;state=all&quot;</span>
issues = requests.get(<span class="hljs-string">f&quot;<span class="hljs-subst">{base_url}</span>/<span class="hljs-subst">{owner}</span>/<span class="hljs-subst">{repo}</span>/<span class="hljs-subst">{query}</span>&quot;</span>, headers=headers)
batch.extend(issues.json())
<span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(batch) &gt; rate_limit <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(all_issues) &lt; num_issues:
all_issues.extend(batch)
batch = [] <span class="hljs-comment"># Flush batch for next time period</span>
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Reached GitHub rate limit. Sleeping for one hour ...&quot;</span>)
time.sleep(<span class="hljs-number">60</span> * <span class="hljs-number">60</span> + <span class="hljs-number">1</span>)
all_issues.extend(batch)
df = pd.DataFrame.from_records(all_issues)
df.to_json(<span class="hljs-string">f&quot;<span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl&quot;</span>, orient=<span class="hljs-string">&quot;records&quot;</span>, lines=<span class="hljs-literal">True</span>)
<span class="hljs-built_in">print</span>(
<span class="hljs-string">f&quot;Downloaded all the issues for <span class="hljs-subst">{repo}</span>! Dataset stored at <span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl&quot;</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1enlzi5">Agora, quando chamamos <code>fetch_issues()</code>, ele fará o download de todas as issues em lotes para evitar exceder o limite do GitHub no número de solicitações por hora; o resultado será armazenado em um arquivo <em>repository_name-issues.jsonl</em>, onde cada linha é um objeto JSON que representa uma issue. Vamos usar esta função para pegar todas as issues de 🤗 Datasets:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Depending on your internet connection, this can take several minutes to run...</span>
fetch_issues()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-16hcfk7">Depois que as issues forem baixadas, podemos carregá-las localmente usando nossas novas habilidades da <a href="/course/chapter5/2">seção 2</a>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=<span class="hljs-string">&quot;datasets-issues.jsonl&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;timeline_url&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>],
num_rows: <span class="hljs-number">3019</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-97971k">Ótimo, criamos nosso primeiro conjunto de dados do zero! Mas por que existem vários milhares de issues quando a <a href="https://github.com/huggingface/datasets/issues" rel="nofollow">guia Issue</a> do repositório 🤗 Datasets mostra apenas cerca de 1.000 issues no total 🤔? Conforme descrito na [documentação] do GitHub (<a href="https://docs.github.com/en/rest/reference/issues#list-issues-assigned-to-the-authenticated-user" rel="nofollow">https://docs.github.com/en/rest/reference/issues#list-issues-assigned-to-the-authenticated-user</a>), isso ocorre porque baixamos todos os pull request também:</p> <blockquote data-svelte-h="svelte-127vg01"><p>A API REST v3 do GitHub considera cada pull request como uma issue, mas nem toda issue é um pull request. Por esse motivo, os endpoints de “issues” podem retornar issues e solicitações de pull na resposta. Você pode identificar solicitações de pull pela chave <code>pull_request</code>. Esteja ciente de que o <code>id</code> de uma solicitação pull retornada de endpoints “issues” será um ID de issue.</p></blockquote> <p data-svelte-h="svelte-1uym4u1">Como o conteúdo das issues e dos pull request são bem diferentes, vamos fazer um pequeno pré-processamento para nos permitir distinguir entre eles.</p> <h2 class="relative group"><a id="limpando-os-dados" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#limpando-os-dados"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Limpando os dados</span></h2> <p data-svelte-h="svelte-r1iiw0">O trecho acima da documentação do GitHub nos diz que a coluna <code>pull_request</code> pode ser usada para diferenciar entre issues e solicitações de pull request. Vamos olhar para uma amostra aleatória para ver qual é a diferença. Como fizemos na <a href="/course/chapter5/3">seção 3</a>, vamos encadear <code>Dataset.shuffle()</code> e <code>Dataset.select()</code> para criar uma amostra aleatória e então compactar o <code>html_url</code> e <code>pull_request</code> para que possamos comparar os vários URLs:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sample = issues_dataset.shuffle(seed=<span class="hljs-number">666</span>).select(<span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>))
<span class="hljs-comment"># Print out the URL and pull request entries</span>
<span class="hljs-keyword">for</span> url, pr <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(sample[<span class="hljs-string">&quot;html_url&quot;</span>], sample[<span class="hljs-string">&quot;pull_request&quot;</span>]):
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;&gt;&gt; URL: <span class="hljs-subst">{url}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;&gt;&gt; Pull request: <span class="hljs-subst">{pr}</span>\n&quot;</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->&gt;&gt; URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">850</span>
&gt;&gt; Pull request: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/850&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850&#x27;</span>, <span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850.diff&#x27;</span>, <span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/850.patch&#x27;</span>}
&gt;&gt; URL: https://github.com/huggingface/datasets/issues/<span class="hljs-number">2773</span>
&gt;&gt; Pull request: <span class="hljs-literal">None</span>
&gt;&gt; URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">783</span>
&gt;&gt; Pull request: {<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/pulls/783&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783&#x27;</span>, <span class="hljs-string">&#x27;diff_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783.diff&#x27;</span>, <span class="hljs-string">&#x27;patch_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/783.patch&#x27;</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-z4trhs">Aqui podemos ver que cada pull request está associado a vários URLs, enquanto as issues comuns têm uma entrada <code>None</code>. Podemos usar essa distinção para criar uma nova coluna <code>is_pull_request</code> que verifica se o campo <code>pull_request</code> é <code>None</code> ou não:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = issues_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;is_pull_request&quot;</span>: <span class="hljs-literal">False</span> <span class="hljs-keyword">if</span> x[<span class="hljs-string">&quot;pull_request&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">True</span>}
)<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-19u3bxy">✏️ <strong>Experimente!</strong> Calcule o tempo médio que leva para fechar as issues em 🤗 Datasets. Você pode achar a função <code>Dataset.filter()</code> útil para filtrar os pull requests e as issues abertas, e você pode usar a função <code>Dataset.set_format()</code> para converter o conjunto de dados em um <code>DataFrame</code> para que você possa manipular facilmente os timestamps <code>created_at</code> e <code>closed_at</code>. Para pontos de bônus, calcule o tempo médio que leva para fechar os pull requests.</p></div> <p data-svelte-h="svelte-58dtlq">Embora possamos continuar a limpar o conjunto de dados descartando ou renomeando algumas colunas, geralmente é uma boa prática manter o conjunto de dados o mais “bruto” possível neste estágio para que possa ser facilmente usado em vários aplicativos.</p> <p data-svelte-h="svelte-p477r1">Antes de enviarmos nosso conjunto de dados para o Hugging Face Hub, vamos lidar com uma coisa que está faltando: os comentários associados a cada issue e pull request. Vamos adicioná-los a seguir - você adivinhou - a API REST do GitHub!</p> <h2 class="relative group"><a id="aumentando-o-conjunto-de-dados" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#aumentando-o-conjunto-de-dados"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Aumentando o conjunto de dados</span></h2> <p data-svelte-h="svelte-18m93nm">Conforme mostrado na captura de tela a seguir, os comentários associados a uma issue ou a pull request fornecem uma rica fonte de informações, especialmente se estivermos interessados ​​em criar um mecanismo de pesquisa para responder às consultas dos usuários sobre a biblioteca.</p> <div class="flex justify-center" data-svelte-h="svelte-1fxxwaz"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-comment.png" alt="Comments associated with an issue about 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-1aq93c6">A API REST do GitHub fornece um <a href="https://docs.github.com/en/rest/reference/issues#list-issue-comments" rel="nofollow">endpoint <code>Comments</code></a> que retorna todos os comentários associados a uma issue. Vamos testar o endpoint para ver o que ele retorna:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issue_number = <span class="hljs-number">2792</span>
url = <span class="hljs-string">f&quot;https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments&quot;</span>
response = requests.get(url, headers=headers)
response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/comments/897594128&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128&#x27;</span>,
<span class="hljs-string">&#x27;issue_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/repos/huggingface/datasets/issues/2792&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">897594128</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;IC_kwDODunzps41gDMQ&#x27;</span>,
<span class="hljs-string">&#x27;user&#x27;</span>: {<span class="hljs-string">&#x27;login&#x27;</span>: <span class="hljs-string">&#x27;bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;id&#x27;</span>: <span class="hljs-number">19718818</span>,
<span class="hljs-string">&#x27;node_id&#x27;</span>: <span class="hljs-string">&#x27;MDQ6VXNlcjE5NzE4ODE4&#x27;</span>,
<span class="hljs-string">&#x27;avatar_url&#x27;</span>: <span class="hljs-string">&#x27;https://avatars.githubusercontent.com/u/19718818?v=4&#x27;</span>,
<span class="hljs-string">&#x27;gravatar_id&#x27;</span>: <span class="hljs-string">&#x27;&#x27;</span>,
<span class="hljs-string">&#x27;url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;html_url&#x27;</span>: <span class="hljs-string">&#x27;https://github.com/bhavitvyamalik&#x27;</span>,
<span class="hljs-string">&#x27;followers_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/followers&#x27;</span>,
<span class="hljs-string">&#x27;following_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/following{/other_user}&#x27;</span>,
<span class="hljs-string">&#x27;gists_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/gists{/gist_id}&#x27;</span>,
<span class="hljs-string">&#x27;starred_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}&#x27;</span>,
<span class="hljs-string">&#x27;subscriptions_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/subscriptions&#x27;</span>,
<span class="hljs-string">&#x27;organizations_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/orgs&#x27;</span>,
<span class="hljs-string">&#x27;repos_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/repos&#x27;</span>,
<span class="hljs-string">&#x27;events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/events{/privacy}&#x27;</span>,
<span class="hljs-string">&#x27;received_events_url&#x27;</span>: <span class="hljs-string">&#x27;https://api.github.com/users/bhavitvyamalik/received_events&#x27;</span>,
<span class="hljs-string">&#x27;type&#x27;</span>: <span class="hljs-string">&#x27;User&#x27;</span>,
<span class="hljs-string">&#x27;site_admin&#x27;</span>: <span class="hljs-literal">False</span>},
<span class="hljs-string">&#x27;created_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:21:52Z&#x27;</span>,
<span class="hljs-string">&#x27;updated_at&#x27;</span>: <span class="hljs-string">&#x27;2021-08-12T12:31:17Z&#x27;</span>,
<span class="hljs-string">&#x27;author_association&#x27;</span>: <span class="hljs-string">&#x27;CONTRIBUTOR&#x27;</span>,
<span class="hljs-string">&#x27;body&#x27;</span>: <span class="hljs-string">&quot;@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = &#x27;gooaq&#x27;\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n&gt; self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) &gt; 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?&quot;</span>,
<span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1m47af7">Podemos ver que o comentário está armazenado no campo <code>body</code>, então vamos escrever uma função simples que retorna todos os comentários associados a uma issue selecionando o conteúdo do <code>body</code> para cada elemento em <code>response.json()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_comments</span>(<span class="hljs-params">issue_number</span>):
url = <span class="hljs-string">f&quot;https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments&quot;</span>
response = requests.get(url, headers=headers)
<span class="hljs-keyword">return</span> [r[<span class="hljs-string">&quot;body&quot;</span>] <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> response.json()]
<span class="hljs-comment"># Test our function works as expected</span>
get_comments(<span class="hljs-number">2792</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&quot;@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = &#x27;gooaq&#x27;\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n&gt; self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) &gt; 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?&quot;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-an9xq1">Isso parece certo, então vamos usar <code>Dataset.map()</code> para adicionar uma nova coluna <code>comments</code> para cada issue em nosso conjunto de dados:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Depending on your internet connection, this can take a few minutes...</span>
issues_with_comments_dataset = issues_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;comments&quot;</span>: get_comments(x[<span class="hljs-string">&quot;number&quot;</span>])}
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-inkiuv">A etapa final é salvar o conjunto de dados aumentado junto com nossos dados brutos para que possamos enviá-los para o Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_with_comments_dataset.to_json(<span class="hljs-string">&quot;issues-datasets-with-comments.jsonl&quot;</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="carregando-o-conjunto-de-dados-para-o-hugging-face-hub" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#carregando-o-conjunto-de-dados-para-o-hugging-face-hub"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Carregando o conjunto de dados para o Hugging Face Hub</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HaN6qCr_Afc" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1pq8ng9">Agora que temos nosso conjunto de dados aumentado, é hora de enviá-lo para o Hub para que possamos compartilhá-lo com a comunidade! Para fazer o upload do conjunto de dados, usaremos a <a href="https://github.com/huggingface/huggingface_hub" rel="nofollow">🤗 Hub library</a>, que nos permite interagir com o Hugging Face Hub por meio de uma API Python. 🤗 Hub vem pré-instalado com 🤗 Transformers, para que possamos usá-lo diretamente. Por exemplo, podemos usar a função <code>list_datasets()</code> para obter informações sobre todos os conjuntos de dados públicos atualmente hospedados no Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> list_datasets
all_datasets = list_datasets()
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;Number of datasets on Hub: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(all_datasets)}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(all_datasets[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Number of datasets on Hub: <span class="hljs-number">1487</span>
Dataset Name: acronym_identification, Tags: [<span class="hljs-string">&#x27;annotations_creators:expert-generated&#x27;</span>, <span class="hljs-string">&#x27;language_creators:found&#x27;</span>, <span class="hljs-string">&#x27;languages:en&#x27;</span>, <span class="hljs-string">&#x27;licenses:mit&#x27;</span>, <span class="hljs-string">&#x27;multilinguality:monolingual&#x27;</span>, <span class="hljs-string">&#x27;size_categories:10K&lt;n&lt;100K&#x27;</span>, <span class="hljs-string">&#x27;source_datasets:original&#x27;</span>, <span class="hljs-string">&#x27;task_categories:structure-prediction&#x27;</span>, <span class="hljs-string">&#x27;task_ids:structure-prediction-other-acronym-identification&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-elk9an">Podemos ver que atualmente existem cerca de 1.500 conjuntos de dados no Hub, e a função <code>list_datasets()</code> também fornece alguns metadados básicos sobre cada repositório de conjuntos de dados.</p> <p data-svelte-h="svelte-mwq422">Para nossos propósitos, a primeira coisa que precisamos fazer é criar um novo repositório de conjunto de dados no Hub. Para fazer isso, precisamos de um token de autenticação, que pode ser obtido primeiro entrando no Hugging Face Hub com a função <code>notebook_login()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-3ay613">Isso criará um widget onde você poderá inserir seu nome de usuário e senha, e um token de API será salvo em <em>~/.huggingface/token</em>. Se você estiver executando o código em um terminal, poderá fazer login via CLI:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sk1cve">Feito isso, podemos criar um novo repositório de conjunto de dados com a função <code>create_repo()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> create_repo
repo_url = create_repo(name=<span class="hljs-string">&quot;github-issues&quot;</span>, repo_type=<span class="hljs-string">&quot;dataset&quot;</span>)
repo_url<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&#x27;https://huggingface.co/datasets/lewtun/github-issues&#x27;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1b7m61j">Neste exemplo, criamos um repositório de conjunto de dados vazio chamado <code>github-issues</code> sob o nome de usuário <code>lewtun</code> (o nome de usuário deve ser seu nome de usuário do Hub quando você estiver executando este código!).</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ld82he">✏️ <strong>Experimente!</strong> Use seu nome de usuário e senha do Hugging Face Hub para obter um token e criar um repositório vazio chamado <code>github-issues</code>. Lembre-se de <strong>nunca salvar suas credenciais</strong> no Colab ou em qualquer outro repositório, pois essas informações podem ser exploradas por agentes mal-intencionados.</p></div> <p data-svelte-h="svelte-ksirvg">Em seguida, vamos clonar o repositório do Hub para nossa máquina local e copiar nosso arquivo de conjunto de dados para ele. O 🤗 Hub fornece uma classe <code>Repository</code> útil que envolve muitos dos comandos comuns do Git, portanto, para clonar o repositório remoto, basta fornecer o URL e o caminho local para o qual desejamos clonar:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> Repository
repo = Repository(local_dir=<span class="hljs-string">&quot;github-issues&quot;</span>, clone_from=repo_url)
!cp datasets-issues-<span class="hljs-keyword">with</span>-comments.jsonl github-issues/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-946zbw">Por padrão, várias extensões de arquivo (como <em>.bin</em>, <em>.gz</em> e <em>.zip</em>) são rastreadas com o Git LFS para que arquivos grandes possam ser versionados no mesmo fluxo de trabalho do Git. Você pode encontrar uma lista de extensões de arquivos rastreados dentro do arquivo <em>.gitattributes</em> do repositório. Para incluir o formato JSON Lines na lista, podemos executar o seguinte comando:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->repo.lfs_track(<span class="hljs-string">&quot;*.jsonl&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-uq1g87">Então podemos usar <code>Repository.push_to_hub()</code> para enviar o conjunto de dados para o Hub:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->repo.push_to_hub()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ov0i84">Se navegarmos para a URL contida em <code>repo_url</code>, veremos agora que nosso arquivo de conjunto de dados foi carregado.</p> <div class="flex justify-center" data-svelte-h="svelte-18puw29"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/hub-repo.png" alt="Our dataset repository on the Hugging Face Hub." width="80%"></div> <p data-svelte-h="svelte-1v7sua7">A partir daqui, qualquer um pode baixar o conjunto de dados simplesmente fornecendo <code>load_dataset()</code> com o ID do repositório como o argumento <code>path</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->remote_dataset = load_dataset(<span class="hljs-string">&quot;lewtun/github-issues&quot;</span>, split=<span class="hljs-string">&quot;train&quot;</span>)
remote_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">2855</span>
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zqtc7l">Legal, nós enviamos nosso conjunto de dados para o Hub e está disponível para outros usarem! Há apenas uma coisa importante a fazer: adicionar um <em>cartão de conjunto de dados</em> que explica como o corpus foi criado e fornece outras informações úteis para a comunidade.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-rrol3s">💡 Você também pode enviar um conjunto de dados para o Hugging Face Hub diretamente do terminal usando <code>huggingface-cli</code> e um pouco de magia Git. Consulte o <a href="https://huggingface.co/docs/datasets/share#share-a-dataset-using-the-cli" rel="nofollow">guia do 🤗 Datasets</a> para obter detalhes sobre como fazer isso.</p></div> <h2 class="relative group"><a id="criando-um-cartão-do-datasets" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#criando-um-cartão-do-datasets"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Criando um cartão do datasets</span></h2> <p data-svelte-h="svelte-dylmg9">Conjuntos de dados bem documentados são mais propensos a serem úteis para outras pessoas (incluindo você mesmo no futuro!), pois fornecem o contexto para permitir que os usuários decidam se o conjunto de dados é relevante para sua tarefa e avaliem possíveis vieses ou riscos associados ao uso o conjunto de dados.</p> <p data-svelte-h="svelte-17tg70u">No Hugging Face Hub, essas informações são armazenadas no arquivo <em>README.md</em> de cada repositório de conjunto de dados. Há duas etapas principais que você deve seguir antes de criar este arquivo:</p> <ol data-svelte-h="svelte-gkbv46"><li>Use a aplicação <a href="https://huggingface.co/datasets/tagging/" rel="nofollow"><code>datasets-tagging</code></a> para criar tags de metadados no formato YAML. Essas tags são usadas para uma variedade de recursos de pesquisa no Hugging Face Hub e garantem que seu conjunto de dados possa ser facilmente encontrado pelos membros da comunidade. Como criamos um conjunto de dados personalizado aqui, você precisará clonar o repositório <code>datasets-tagging</code> e executar o aplicativo localmente. Veja como é a interface:</li></ol> <div class="flex justify-center" data-svelte-h="svelte-1gqifn5"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-tagger.png" alt="The 'datasets-tagging' interface." width="80%"></div> <ol start="2" data-svelte-h="svelte-1u3mfkz"><li>Leia o <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">guia do 🤗 datasets</a> sobre como criar cartões informativos de conjuntos de dados e use-os como modelo.</li></ol> <p data-svelte-h="svelte-1tv90dr">Você pode criar o arquivo <em>README.md</em> diretamente no Hub e encontrar um cartão de conjunto de dados de modelo no repositório de conjunto de dados <code>lewtun/github-issues</code>. Uma captura de tela do cartão de conjunto de dados preenchido é mostrada abaixo.</p> <div class="flex justify-center" data-svelte-h="svelte-ct1wn8"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/dataset-card.png" alt="A dataset card." width="80%"></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1vt20th">✏️ <strong>Experimente!</strong> Use o aplicativo <code>dataset-tagging</code> e <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">guia do 🤗 datasets</a> para concluir o <em>Arquivo README.md</em> para o conjunto de dados de issues do GitHub.</p></div> <p data-svelte-h="svelte-k6tkf8">É isso! Vimos nesta seção que criar um bom conjunto de dados pode ser bastante complicado, mas felizmente carregá-lo e compartilhá-lo com a comunidade não é. Na próxima seção, usaremos nosso novo conjunto de dados para criar um mecanismo de pesquisa semântica com o 🤗 datasets que podem corresponder perguntas as issues e comentários mais relevantes.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-eptrxz">✏️ <strong>Experimente!</strong> Siga as etapas que seguimos nesta seção para criar um conjunto de dados de issues do GitHub para sua biblioteca de código aberto favorita (escolha algo diferente do 🤗 datasets, é claro!). Para pontos de bônus, ajuste um classificador multilabel para prever as tags presentes no campo <code>labels</code>.</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/pt/chapter5/5.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_1ef170a = {
assets: "/docs/course/pr_1069/pt",
base: "/docs/course/pr_1069/pt",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/pt/_app/immutable/entry/start.854f6ddb.js"),
import("/docs/course/pr_1069/pt/_app/immutable/entry/app.99d5705b.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 32],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
99.8 kB
·
Xet hash:
80fd96c505a09188bf07a2f608ded7405b8201639ce9a80e290f8f193d16706c

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.