Buckets:
| <meta charset="utf-8" /><meta name="hf:doc:metadata" content="{"title":"Création de votre propre jeu de données","local":"création-de-votre-propre-jeu-de-données","sections":[{"title":"Obtenir les données","local":"obtenir-les-données","sections":[],"depth":2},{"title":"Nettoyer les données","local":"nettoyer-les-données","sections":[],"depth":2},{"title":"Enrichir le jeu de données","local":"enrichir-le-jeu-de-données","sections":[],"depth":2},{"title":"Téléchargement du jeu de données sur le <i> Hub </i>","local":"téléchargement-du-jeu-de-données-sur-le-i-hub-i","sections":[],"depth":2},{"title":"Création d’une carte pour un jeu de données","local":"création-dune-carte-pour-un-jeu-de-données","sections":[],"depth":2}],"depth":1}"> | |
| <link href="/docs/course/pr_1007/fr/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/entry/start.0dcef240.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/scheduler.37c15a92.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/singletons.f3bf6ff5.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/index.18351ede.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/paths.4129b87c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/entry/app.e06c0eda.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/index.2bf4358c.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/nodes/0.d3d72fb7.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/each.e59479a4.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/nodes/38.0133d2ab.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/Tip.363c041f.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/Youtube.1e50a667.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/CodeBlock.4e987730.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/CourseFloatingBanner.6add7356.js"> | |
| <link rel="modulepreload" href="/docs/course/pr_1007/fr/_app/immutable/chunks/getInferenceSnippets.1837c472.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{"title":"Création de votre propre jeu de données","local":"création-de-votre-propre-jeu-de-données","sections":[{"title":"Obtenir les données","local":"obtenir-les-données","sections":[],"depth":2},{"title":"Nettoyer les données","local":"nettoyer-les-données","sections":[],"depth":2},{"title":"Enrichir le jeu de données","local":"enrichir-le-jeu-de-données","sections":[],"depth":2},{"title":"Téléchargement du jeu de données sur le <i> Hub </i>","local":"téléchargement-du-jeu-de-données-sur-le-i-hub-i","sections":[],"depth":2},{"title":"Création d’une carte pour un jeu de données","local":"création-dune-carte-pour-un-jeu-de-données","sections":[],"depth":2}],"depth":1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="création-de-votre-propre-jeu-de-données" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#création-de-votre-propre-jeu-de-données"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Création de votre propre jeu de données</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-1c4rbu8">Parfois, le jeu de données dont vous avez besoin pour créer une application de NLP n’existe pas. Vous devrez donc le créer vous-même. Dans cette section, nous allons vous montrer comment créer un corpus de <a href="https://github.com/features/issues/" rel="nofollow">problèmes GitHub</a>, qui sont couramment utilisés pour suivre les bogues ou les fonctionnalités dans les dépôts GitHub. Ce corpus pourrait être utilisé à diverses fins, notamment :</p> <ul data-svelte-h="svelte-1jgco8a"><li>explorer combien de temps il faut pour fermer les problèmes ouverts ou les <em>pull requests</em></li> <li>entraîner d’un <em>classifieur multilabel</em> capable d’étiqueter les problèmes avec des métadonnées basées sur la description du problème (par exemple : « bug », « amélioration » ou « question »)</li> <li>créer un moteur de recherche sémantique pour trouver les problèmes correspondant à la requête d’un utilisateur</li></ul> <p data-svelte-h="svelte-12y74og">Ici, nous nous concentrerons sur la création du corpus, et dans la section suivante, nous aborderons l’application de recherche sémantique. Pour garder les choses méta, nous utiliserons les problèmes GitHub associés à un projet open source populaire : 🤗 <em>Datasets</em> ! Voyons comment obtenir les données et explorons les informations contenues dans ces problèmes.</p> <h2 class="relative group"><a id="obtenir-les-données" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#obtenir-les-données"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Obtenir les données</span></h2> <p data-svelte-h="svelte-pg12ky">Vous pouvez trouver tous les problèmes dans 🤗 <em>Datasets</em> en accédant à l’<a href="https://github.com/huggingface/datasets/issues" rel="nofollow">onglet « Issues »</a> du dépôt. Comme le montre la capture d’écran suivante, au moment de la rédaction, il y avait 331 problèmes ouverts et 668 problèmes fermés.</p> <div class="flex justify-center" data-svelte-h="svelte-1htetkm"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues.png" alt="The GitHub issues associated with 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-11pbnfk">Si vous cliquez sur l’un de ces problèmes, vous constaterez qu’il contient un titre, une description et un ensemble d’étiquettes qui caractérisent le problème. Un exemple est montré dans la capture d’écran ci-dessous.</p> <div class="flex justify-center" data-svelte-h="svelte-1jsgvzc"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-single.png" alt="A typical GitHub issue in the 🤗 Datasets repository." width="80%"></div> <p data-svelte-h="svelte-1dz5l5x">Pour télécharger tous les problèmes du dépôt, nous utilisons l’<a href="https://docs.github.com/en/rest" rel="nofollow">API REST GitHub</a> pour interroger le point de terminaison <a href="https://docs.github.com/en/rest/reference/issues#list-repository-issues" rel="nofollow"><code>Issues</code></a>. Ce point de terminaison renvoie une liste d’objets JSON. Chaque objet contenant un grand nombre de champs qui incluent le titre et la description ainsi que des métadonnées sur l’état du problème, etc.</p> <p data-svelte-h="svelte-y00q0f">Un moyen pratique de télécharger les problèmes consiste à utiliser la bibliothèque <code>requests</code>, qui est la méthode standard pour effectuer des requêtes HTTP en Python. Vous pouvez installer la bibliothèque en exécutant :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!pip install requests<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1d8tvic">Une fois la bibliothèque installée, vous pouvez envoyer des requêtes GET au point de terminaison <code>Issues</code> en appelant la fonction <code>requests.get()</code>. Par exemple, vous pouvez exécuter la commande suivante pour récupérer le premier numéro sur la première page :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> requests | |
| url = <span class="hljs-string">"https://api.github.com/repos/huggingface/datasets/issues?page=1&per_page=1"</span> | |
| response = requests.get(url)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ovgb1u">L’objet <code>response</code> contient de nombreuses informations utiles sur la requête, y compris le code d’état HTTP :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.status_code<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">200</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-tl8yto">où un statut <code>200</code> signifie que la requête a réussi (vous pouvez trouver une liste des codes de statut HTTP possibles <a href="https://en.wikipedia.org/wiki/List_of_HTTP_status_codes" rel="nofollow">ici</a>). Ce qui nous intéresse vraiment, cependant, c’est le <em>payload</em>, qui peut être consulté dans différents formats comme les octets, les chaînes ou JSON. Comme nous savons que nos problèmes sont au format JSON, examinons la charge utile comme suit :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/issues/2792'</span>, | |
| <span class="hljs-string">'repository_url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets'</span>, | |
| <span class="hljs-string">'labels_url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/issues/2792/labels{/name}'</span>, | |
| <span class="hljs-string">'comments_url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/issues/2792/comments'</span>, | |
| <span class="hljs-string">'events_url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/issues/2792/events'</span>, | |
| <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/2792'</span>, | |
| <span class="hljs-string">'id'</span>: <span class="hljs-number">968650274</span>, | |
| <span class="hljs-string">'node_id'</span>: <span class="hljs-string">'MDExOlB1bGxSZXF1ZXN0NzEwNzUyMjc0'</span>, | |
| <span class="hljs-string">'number'</span>: <span class="hljs-number">2792</span>, | |
| <span class="hljs-string">'title'</span>: <span class="hljs-string">'Update GooAQ'</span>, | |
| <span class="hljs-string">'user'</span>: {<span class="hljs-string">'login'</span>: <span class="hljs-string">'bhavitvyamalik'</span>, | |
| <span class="hljs-string">'id'</span>: <span class="hljs-number">19718818</span>, | |
| <span class="hljs-string">'node_id'</span>: <span class="hljs-string">'MDQ6VXNlcjE5NzE4ODE4'</span>, | |
| <span class="hljs-string">'avatar_url'</span>: <span class="hljs-string">'https://avatars.githubusercontent.com/u/19718818?v=4'</span>, | |
| <span class="hljs-string">'gravatar_id'</span>: <span class="hljs-string">''</span>, | |
| <span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik'</span>, | |
| <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/bhavitvyamalik'</span>, | |
| <span class="hljs-string">'followers_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/followers'</span>, | |
| <span class="hljs-string">'following_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/following{/other_user}'</span>, | |
| <span class="hljs-string">'gists_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}'</span>, | |
| <span class="hljs-string">'starred_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}'</span>, | |
| <span class="hljs-string">'subscriptions_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/subscriptions'</span>, | |
| <span class="hljs-string">'organizations_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/orgs'</span>, | |
| <span class="hljs-string">'repos_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/repos'</span>, | |
| <span class="hljs-string">'events_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/events{/privacy}'</span>, | |
| <span class="hljs-string">'received_events_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/received_events'</span>, | |
| <span class="hljs-string">'type'</span>: <span class="hljs-string">'User'</span>, | |
| <span class="hljs-string">'site_admin'</span>: <span class="hljs-literal">False</span>}, | |
| <span class="hljs-string">'labels'</span>: [], | |
| <span class="hljs-string">'state'</span>: <span class="hljs-string">'open'</span>, | |
| <span class="hljs-string">'locked'</span>: <span class="hljs-literal">False</span>, | |
| <span class="hljs-string">'assignee'</span>: <span class="hljs-literal">None</span>, | |
| <span class="hljs-string">'assignees'</span>: [], | |
| <span class="hljs-string">'milestone'</span>: <span class="hljs-literal">None</span>, | |
| <span class="hljs-string">'comments'</span>: <span class="hljs-number">1</span>, | |
| <span class="hljs-string">'created_at'</span>: <span class="hljs-string">'2021-08-12T11:40:18Z'</span>, | |
| <span class="hljs-string">'updated_at'</span>: <span class="hljs-string">'2021-08-12T12:31:17Z'</span>, | |
| <span class="hljs-string">'closed_at'</span>: <span class="hljs-literal">None</span>, | |
| <span class="hljs-string">'author_association'</span>: <span class="hljs-string">'CONTRIBUTOR'</span>, | |
| <span class="hljs-string">'active_lock_reason'</span>: <span class="hljs-literal">None</span>, | |
| <span class="hljs-string">'pull_request'</span>: {<span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/pulls/2792'</span>, | |
| <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/2792'</span>, | |
| <span class="hljs-string">'diff_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/2792.diff'</span>, | |
| <span class="hljs-string">'patch_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/2792.patch'</span>}, | |
| <span class="hljs-string">'body'</span>: <span class="hljs-string">'[GooAQ](https://github.com/allenai/gooaq) dataset was recently updated after splits were added for the same. This PR contains new updated GooAQ with train/val/test splits and updated README as well.'</span>, | |
| <span class="hljs-string">'performed_via_github_app'</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-rrrs97">Waouh, ça fait beaucoup d’informations ! Nous pouvons voir des champs utiles comme <code>title</code>, <code>body</code> et <code>number</code> qui décrivent le problème, ainsi que des informations sur l’utilisateur GitHub qui a ouvert le problème.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-156k6y">✏️ <strong>Essayez !</strong> Cliquez sur quelques-unes des URL pour avoir une idée du type d’informations auxquelles chaque problème GitHub est lié.</p></div> <p data-svelte-h="svelte-1gjgd66">Comme décrit dans la <a href="https://docs.github.com/en/rest/overview/resources-in-the-rest-api#rate-limiting" rel="nofollow">documentation GitHub</a>, les requêtes non authentifiées sont limitées à 60 requêtes par heure. Bien que vous puissiez augmenter le paramètre de requête <code>per_page</code> pour réduire le nombre de requêtes que vous effectuez, vous atteindrez toujours la limite de débit sur tout dépôt contenant des milliers de problèmes. Donc, à la place, vous devez suivre les <a href="https://docs.github.com/en/github/authenticating-to-github/creating-a-personal-access-token" rel="nofollow">instructions de GitHub</a> sur la création d’un <em>jeton d’accès personnel</em> afin que vous peut augmenter la limite de débit à 5 000 requêtes par heure. Une fois que vous avez votre <em>token</em>, vous pouvez l’inclure dans l’en-tête de la requête :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->GITHUB_TOKEN = xxx <span class="hljs-comment"># Copiez votre jeton GitHub ici</span> | |
| headers = {<span class="hljs-string">"Authorization"</span>: <span class="hljs-string">f"token <span class="hljs-subst">{GITHUB_TOKEN}</span>"</span>}<!-- HTML_TAG_END --></pre></div> <div class="course-tip course-tip-orange bg-gradient-to-br dark:bg-gradient-to-r before:border-orange-500 dark:before:border-orange-800 from-orange-50 dark:from-gray-900 to-white dark:to-gray-950 border border-orange-50 text-orange-700 dark:text-gray-400"><p data-svelte-h="svelte-1m7v2og">⚠️ Ne partagez pas un <em>notebook</em> avec votre <code>GITHUB_TOKEN</code> collé dedans. Nous vous recommandons de supprimer la dernière cellule une fois que vous l’avez exécutée pour éviter de divulguer accidentellement ces informations. Mieux encore, stockez le jeton dans un fichier <em>.env</em> et utilisez la <a href="https://github.com/theskumar/python-dotenv" rel="nofollow">bibliothèque <code>python-dotenv</code></a> pour le charger automatiquement pour vous en tant que variable d’environnement.</p></div> <p data-svelte-h="svelte-foeklq">Maintenant que nous avons notre jeton d’accès, créons une fonction qui peut télécharger tous les problèmes depuis un référentiel GitHub :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> time | |
| <span class="hljs-keyword">import</span> math | |
| <span class="hljs-keyword">from</span> pathlib <span class="hljs-keyword">import</span> Path | |
| <span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd | |
| <span class="hljs-keyword">from</span> tqdm.notebook <span class="hljs-keyword">import</span> tqdm | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">fetch_issues</span>(<span class="hljs-params"> | |
| owner=<span class="hljs-string">"huggingface"</span>, | |
| repo=<span class="hljs-string">"datasets"</span>, | |
| num_issues=<span class="hljs-number">10_000</span>, | |
| rate_limit=<span class="hljs-number">5_000</span>, | |
| issues_path=Path(<span class="hljs-params"><span class="hljs-string">"."</span></span>), | |
| </span>): | |
| <span class="hljs-keyword">if</span> <span class="hljs-keyword">not</span> issues_path.is_dir(): | |
| issues_path.mkdir(exist_ok=<span class="hljs-literal">True</span>) | |
| batch = [] | |
| all_issues = [] | |
| per_page = <span class="hljs-number">100</span> <span class="hljs-comment"># Nombre d'issues à retourner par page</span> | |
| num_pages = math.ceil(num_issues / per_page) | |
| base_url = <span class="hljs-string">"https://api.github.com/repos"</span> | |
| <span class="hljs-keyword">for</span> page <span class="hljs-keyword">in</span> tqdm(<span class="hljs-built_in">range</span>(num_pages)): | |
| <span class="hljs-comment"># Requête avec state=all pour obtenir les issues ouvertes et fermées</span> | |
| query = <span class="hljs-string">f"issues?page=<span class="hljs-subst">{page}</span>&per_page=<span class="hljs-subst">{per_page}</span>&state=all"</span> | |
| issues = requests.get(<span class="hljs-string">f"<span class="hljs-subst">{base_url}</span>/<span class="hljs-subst">{owner}</span>/<span class="hljs-subst">{repo}</span>/<span class="hljs-subst">{query}</span>"</span>, headers=headers) | |
| batch.extend(issues.json()) | |
| <span class="hljs-keyword">if</span> <span class="hljs-built_in">len</span>(batch) > rate_limit <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(all_issues) < num_issues: | |
| all_issues.extend(batch) | |
| batch = [] <span class="hljs-comment"># Vider le batch pour la période de temps suivante</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Reached GitHub rate limit. Sleeping for one hour ..."</span>) | |
| time.sleep(<span class="hljs-number">60</span> * <span class="hljs-number">60</span> + <span class="hljs-number">1</span>) | |
| all_issues.extend(batch) | |
| df = pd.DataFrame.from_records(all_issues) | |
| df.to_json(<span class="hljs-string">f"<span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl"</span>, orient=<span class="hljs-string">"records"</span>, lines=<span class="hljs-literal">True</span>) | |
| <span class="hljs-built_in">print</span>( | |
| <span class="hljs-string">f"Downloaded all the issues for <span class="hljs-subst">{repo}</span>! Dataset stored at <span class="hljs-subst">{issues_path}</span>/<span class="hljs-subst">{repo}</span>-issues.jsonl"</span> | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-fx6b30">Désormais, lorsque nous appellerons <code>fetch_issues()</code>, tous les problèmes seront téléchargés par batchs pour éviter de dépasser la limite de GitHub sur le nombre de requêtes par heure. Le résultat sera stocké dans un fichier <em>repository_name-issues.jsonl</em>, où chaque ligne est un objet JSON qui représente un problème. Utilisons cette fonction pour saisir tous les problèmes de 🤗 <em>Datasets</em> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># En fonction de votre connexion Internet, l'exécution peut prendre plusieurs minutes...</span> | |
| fetch_issues()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19rh6hc">Une fois les problèmes téléchargés, nous pouvons les charger localement en utilisant nos nouvelles compétences de <a href="/course/fr/chapter5/2">section 2</a> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=<span class="hljs-string">"datasets-issues.jsonl"</span>, split=<span class="hljs-string">"train"</span>) | |
| issues_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'url'</span>, <span class="hljs-string">'repository_url'</span>, <span class="hljs-string">'labels_url'</span>, <span class="hljs-string">'comments_url'</span>, <span class="hljs-string">'events_url'</span>, <span class="hljs-string">'html_url'</span>, <span class="hljs-string">'id'</span>, <span class="hljs-string">'node_id'</span>, <span class="hljs-string">'number'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'user'</span>, <span class="hljs-string">'labels'</span>, <span class="hljs-string">'state'</span>, <span class="hljs-string">'locked'</span>, <span class="hljs-string">'assignee'</span>, <span class="hljs-string">'assignees'</span>, <span class="hljs-string">'milestone'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'created_at'</span>, <span class="hljs-string">'updated_at'</span>, <span class="hljs-string">'closed_at'</span>, <span class="hljs-string">'author_association'</span>, <span class="hljs-string">'active_lock_reason'</span>, <span class="hljs-string">'pull_request'</span>, <span class="hljs-string">'body'</span>, <span class="hljs-string">'timeline_url'</span>, <span class="hljs-string">'performed_via_github_app'</span>], | |
| num_rows: <span class="hljs-number">3019</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ak6bsw">Génial, nous avons créé notre premier jeu de données à partir de rien ! Mais pourquoi y a-t-il plusieurs milliers de problèmes alors que l’<a href="https://github.com/huggingface/datasets/issues" rel="nofollow">onglet « Issues »</a> de la librairie 🤗 <em>Datasets</em> n’affiche qu’environ 1 000 problèmes au total 🤔 ? Comme décrit dans la <a href="https://docs.github.com/en/rest/reference/issues#list-issues-assigned-to-the-authenticated-user" rel="nofollow">documentation GitHub</a>, c’est parce que nous avons téléchargé toutes les <em>pull requests</em> également :</p> <blockquote data-svelte-h="svelte-10q4vnq"><p>L’API REST v3 de GitHub considère chaque <em>pull request</em> comme un problème, mais chaque problème n’est pas une <em>pull request</em>. Pour cette raison, les points de terminaison « Issues » peuvent renvoyer à la fois des problèmes et des <em>pull requests</em> dans la réponse. Vous pouvez identifier les <em>pull requests</em> par la clé <code>pull_request</code>. Sachez que l’identifiant d’une <em>pull request</em> renvoyée par les points de terminaison « Issues » sera un identifiant de problème.</p></blockquote> <p data-svelte-h="svelte-1ipumdj">Étant donné que le contenu des « Issues » et des <em>pull request</em> est assez différent, procédons à un prétraitement mineur pour nous permettre de les distinguer.</p> <h2 class="relative group"><a id="nettoyer-les-données" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#nettoyer-les-données"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Nettoyer les données</span></h2> <p data-svelte-h="svelte-1ejkqkz">L’extrait ci-dessus de la documentation de GitHub nous indique que la colonne <code>pull_request</code> peut être utilisée pour différencier les <em>issues</em> et les <em>pull requests</em>. Regardons un échantillon aléatoire pour voir quelle est la différence. Comme nous l’avons fait dans <a href="/course/fr/chapter5/3">section 3</a>, nous allons enchaîner <code>Dataset.shuffle()</code> et <code>Dataset.select()</code> pour créer un échantillon aléatoire, puis compresser <code>html_url</code> et <code>pull_request</code> afin que nous puissions comparer les différentes URL :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sample = issues_dataset.shuffle(seed=<span class="hljs-number">666</span>).select(<span class="hljs-built_in">range</span>(<span class="hljs-number">3</span>)) | |
| <span class="hljs-comment"># Afficher l'URL et les entrées de la PR</span> | |
| <span class="hljs-keyword">for</span> url, pr <span class="hljs-keyword">in</span> <span class="hljs-built_in">zip</span>(sample[<span class="hljs-string">"html_url"</span>], sample[<span class="hljs-string">"pull_request"</span>]): | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f">> URL: <span class="hljs-subst">{url}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f">> Pull request: <span class="hljs-subst">{pr}</span>\n"</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->>> URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">850</span> | |
| >> Pull request: {<span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/pulls/850'</span>, <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/850'</span>, <span class="hljs-string">'diff_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/850.diff'</span>, <span class="hljs-string">'patch_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/850.patch'</span>} | |
| >> URL: https://github.com/huggingface/datasets/issues/<span class="hljs-number">2773</span> | |
| >> Pull request: <span class="hljs-literal">None</span> | |
| >> URL: https://github.com/huggingface/datasets/pull/<span class="hljs-number">783</span> | |
| >> Pull request: {<span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/pulls/783'</span>, <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/783'</span>, <span class="hljs-string">'diff_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/783.diff'</span>, <span class="hljs-string">'patch_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/783.patch'</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1rhsws8">Ici, nous pouvons voir que chaque <em>pull request</em> est associée à diverses URL, tandis que les problèmes ordinaires ont une entrée <code>None</code>. Nous pouvons utiliser cette distinction pour créer une nouvelle colonne <code>is_pull_request</code> qui vérifie si le champ <code>pull_request</code> est <code>None</code> ou non :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_dataset = issues_dataset.<span class="hljs-built_in">map</span>( | |
| <span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">"is_pull_request"</span>: <span class="hljs-literal">False</span> <span class="hljs-keyword">if</span> x[<span class="hljs-string">"pull_request"</span>] <span class="hljs-keyword">is</span> <span class="hljs-literal">None</span> <span class="hljs-keyword">else</span> <span class="hljs-literal">True</span>} | |
| )<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1urwimw">✏️ <strong>Essayez !</strong> Calculez le temps moyen nécessaire pour résoudre les problèmes dans 🤗 <em>Datasets</em>. Vous pouvez trouver la fonction <code>Dataset.filter()</code> utile pour filtrer les demandes d’extraction et les problèmes ouverts. Vous pouvez utiliser la fonction <code>Dataset.set_format()</code> pour convertir le jeu de données en un <code>DataFrame</code> afin que vous puissiez facilement manipuler les horodatages <code>created_at</code> et <code>closed_at</code>. Pour les points bonus, calculez le temps moyen nécessaire pour fermer les <em>pull_requests</em>.</p></div> <p data-svelte-h="svelte-1ksc479">Bien que nous puissions continuer à nettoyer davantage le jeu de données en supprimant ou en renommant certaines colonnes, il est généralement recommandé de le conserver aussi brut que possible à ce stade afin qu’il puisse être facilement utilisé dans plusieurs applications.</p> <p data-svelte-h="svelte-1twjm4b">Avant de pousser notre jeu de données vers le <em>Hub</em> d’Hugging Face, traitons une chose manquante : les commentaires associés à chaque problème et <em>pull requests</em>. Nous les ajouterons ensuite avec l’API GitHub REST !</p> <h2 class="relative group"><a id="enrichir-le-jeu-de-données" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#enrichir-le-jeu-de-données"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Enrichir le jeu de données</span></h2> <p data-svelte-h="svelte-lk042x">Comme le montre la capture d’écran suivante, les commentaires associés à un problème ou à une <em>pull request</em> fournissent une riche source d’informations, en particulier si nous souhaitons créer un moteur de recherche pour répondre aux requêtes des utilisateurs sur la bibliothèque.</p> <div class="flex justify-center" data-svelte-h="svelte-1fxxwaz"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-issues-comment.png" alt="Comments associated with an issue about 🤗 Datasets." width="80%"></div> <p data-svelte-h="svelte-1xlbzjd">L’API REST GitHub fournit un point de terminaison <a href="https://docs.github.com/en/rest/reference/issues#list-issue-comments" rel="nofollow"><code>Comments</code></a> qui renvoie tous les commentaires associés à un numéro de problème. Testons le point de terminaison pour voir ce qu’il renvoie :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issue_number = <span class="hljs-number">2792</span> | |
| url = <span class="hljs-string">f"https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments"</span> | |
| response = requests.get(url, headers=headers) | |
| response.json()<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[{<span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/issues/comments/897594128'</span>, | |
| <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/huggingface/datasets/pull/2792#issuecomment-897594128'</span>, | |
| <span class="hljs-string">'issue_url'</span>: <span class="hljs-string">'https://api.github.com/repos/huggingface/datasets/issues/2792'</span>, | |
| <span class="hljs-string">'id'</span>: <span class="hljs-number">897594128</span>, | |
| <span class="hljs-string">'node_id'</span>: <span class="hljs-string">'IC_kwDODunzps41gDMQ'</span>, | |
| <span class="hljs-string">'user'</span>: {<span class="hljs-string">'login'</span>: <span class="hljs-string">'bhavitvyamalik'</span>, | |
| <span class="hljs-string">'id'</span>: <span class="hljs-number">19718818</span>, | |
| <span class="hljs-string">'node_id'</span>: <span class="hljs-string">'MDQ6VXNlcjE5NzE4ODE4'</span>, | |
| <span class="hljs-string">'avatar_url'</span>: <span class="hljs-string">'https://avatars.githubusercontent.com/u/19718818?v=4'</span>, | |
| <span class="hljs-string">'gravatar_id'</span>: <span class="hljs-string">''</span>, | |
| <span class="hljs-string">'url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik'</span>, | |
| <span class="hljs-string">'html_url'</span>: <span class="hljs-string">'https://github.com/bhavitvyamalik'</span>, | |
| <span class="hljs-string">'followers_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/followers'</span>, | |
| <span class="hljs-string">'following_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/following{/other_user}'</span>, | |
| <span class="hljs-string">'gists_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/gists{/gist_id}'</span>, | |
| <span class="hljs-string">'starred_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/starred{/owner}{/repo}'</span>, | |
| <span class="hljs-string">'subscriptions_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/subscriptions'</span>, | |
| <span class="hljs-string">'organizations_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/orgs'</span>, | |
| <span class="hljs-string">'repos_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/repos'</span>, | |
| <span class="hljs-string">'events_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/events{/privacy}'</span>, | |
| <span class="hljs-string">'received_events_url'</span>: <span class="hljs-string">'https://api.github.com/users/bhavitvyamalik/received_events'</span>, | |
| <span class="hljs-string">'type'</span>: <span class="hljs-string">'User'</span>, | |
| <span class="hljs-string">'site_admin'</span>: <span class="hljs-literal">False</span>}, | |
| <span class="hljs-string">'created_at'</span>: <span class="hljs-string">'2021-08-12T12:21:52Z'</span>, | |
| <span class="hljs-string">'updated_at'</span>: <span class="hljs-string">'2021-08-12T12:31:17Z'</span>, | |
| <span class="hljs-string">'author_association'</span>: <span class="hljs-string">'CONTRIBUTOR'</span>, | |
| <span class="hljs-string">'body'</span>: <span class="hljs-string">"@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n> self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) > 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?"</span>, | |
| <span class="hljs-string">'performed_via_github_app'</span>: <span class="hljs-literal">None</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ptmnzt">Nous pouvons voir que le commentaire est stocké dans le champ <code>body</code>. Ecrivons donc une fonction simple qui renvoie tous les commentaires associés à un problème en sélectionnant le contenu <code>body</code> pour chaque élément dans <code>response.json()</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_comments</span>(<span class="hljs-params">issue_number</span>): | |
| url = <span class="hljs-string">f"https://api.github.com/repos/huggingface/datasets/issues/<span class="hljs-subst">{issue_number}</span>/comments"</span> | |
| response = requests.get(url, headers=headers) | |
| <span class="hljs-keyword">return</span> [r[<span class="hljs-string">"body"</span>] <span class="hljs-keyword">for</span> r <span class="hljs-keyword">in</span> response.json()] | |
| <span class="hljs-comment"># Tester notre fonction fonctionne comme prévu</span> | |
| get_comments(<span class="hljs-number">2792</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">"@albertvillanova my tests are failing here:\r\n```\r\ndataset_name = 'gooaq'\r\n\r\n def test_load_dataset(self, dataset_name):\r\n configs = self.dataset_tester.load_all_configs(dataset_name, is_local=True)[:1]\r\n> self.dataset_tester.check_load_dataset(dataset_name, configs, is_local=True, use_local_dummy_data=True)\r\n\r\ntests/test_dataset_common.py:234: \r\n_ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ _ \r\ntests/test_dataset_common.py:187: in check_load_dataset\r\n self.parent.assertTrue(len(dataset[split]) > 0)\r\nE AssertionError: False is not true\r\n```\r\nWhen I try loading dataset on local machine it works fine. Any suggestions on how can I avoid this error?"</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ion8oe">Cela a l’air bien. Utilisons <code>Dataset.map()</code> pour ajouter une nouvelle colonne <code>comments</code> à chaque problème de notre jeu de données :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-comment"># Selon votre connexion internet, cela peut prendre quelques minutes...</span> | |
| issues_with_comments_dataset = issues_dataset.<span class="hljs-built_in">map</span>( | |
| <span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">"comments"</span>: get_comments(x[<span class="hljs-string">"number"</span>])} | |
| )<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9fqk6j">La dernière étape consiste à enregistrer le jeu de données augmentées avec nos données brutes afin que nous puissions les pousser tous les deux vers le <em>Hub</em> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->issues_with_comments_dataset.to_json(<span class="hljs-string">"issues-datasets-with-comments.jsonl"</span>)<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="téléchargement-du-jeu-de-données-sur-le-i-hub-i" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#téléchargement-du-jeu-de-données-sur-le-i-hub-i"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Téléchargement du jeu de données sur le <i> Hub </i></span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/HaN6qCr_Afc" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-ecixms">Maintenant que nous avons notre jeu de données augmenté, il est temps de le pousser vers le <em>Hub</em> afin que nous puissions le partager avec la communauté ! Pour télécharger le jeu de données, nous utilisons la <a href="https://github.com/huggingface/huggingface_hub" rel="nofollow">bibliothèque 🤗 <em>Hub</em></a>, qui nous permet d’interagir avec le <em>Hub</em> d’Hugging Face via une API Python. 🤗 <em>Hub</em> est préinstallé avec 🤗 <em>Transformers</em>, nous pouvons donc l’utiliser directement. Par exemple, nous pouvons utiliser la fonction <code>list_datasets()</code> pour obtenir des informations sur tous les ensembles de données publics actuellement hébergés sur le <em>Hub</em>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> list_datasets | |
| all_datasets = list_datasets() | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Number of datasets on Hub: <span class="hljs-subst">{<span class="hljs-built_in">len</span>(all_datasets)}</span>"</span>) | |
| <span class="hljs-built_in">print</span>(all_datasets[<span class="hljs-number">0</span>])<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Number of datasets on Hub: <span class="hljs-number">1487</span> | |
| Dataset Name: acronym_identification, Tags: [<span class="hljs-string">'annotations_creators:expert-generated'</span>, <span class="hljs-string">'language_creators:found'</span>, <span class="hljs-string">'languages:en'</span>, <span class="hljs-string">'licenses:mit'</span>, <span class="hljs-string">'multilinguality:monolingual'</span>, <span class="hljs-string">'size_categories:10K<n<100K'</span>, <span class="hljs-string">'source_datasets:original'</span>, <span class="hljs-string">'task_categories:structure-prediction'</span>, <span class="hljs-string">'task_ids:structure-prediction-other-acronym-identification'</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6hd80v">Nous pouvons voir qu’il y a actuellement près de 1 500 jeux de données sur le <em>Hub</em> et la fonction <code>list_datasets()</code> fournit également des métadonnées de base sur chaque dépôts de jeux de données.</p> <p data-svelte-h="svelte-pbsvpn">Pour nos besoins, la première chose que nous devons faire est de créer un nouveau dépôt de jeux de données sur le <em>Hub</em>. Pour ce faire, nous avons besoin d’un jeton d’authentification, qui peut être obtenu en se connectant d’abord au <em>Hub</em> d’Hugging Face avec la fonction <code>notebook_login()</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login | |
| notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18k15mu">Cela créé un <em>widget</em> dans lequel vous pouvez entrer votre nom d’utilisateur et votre mot de passe. Un jeton API sera enregistré dans <em>~/.huggingface/token</em>. Si vous exécutez le code dans un terminal, vous pouvez vous connecter via la CLI à la place :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->huggingface-cli login<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-10i2s51">Une fois que nous avons fait cela, nous pouvons créer un nouveau dépôt de jeux de données avec la fonction <code>create_repo()</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> create_repo | |
| repo_url = create_repo(name=<span class="hljs-string">"github-issues"</span>, repo_type=<span class="hljs-string">"dataset"</span>) | |
| repo_url<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">'https://huggingface.co/datasets/lewtun/github-issues'</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1unoukx">Dans cet exemple, nous avons créé un dépôt vide appelé <code>github-issues</code> sous le nom d’utilisateur <code>lewtun</code> (le nom d’utilisateur doit être votre nom d’utilisateur Hub lorsque vous exécutez ce code !).</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-u3spgc">✏️ <strong>Essayez !</strong> Utilisez votre nom d’utilisateur et votre mot de passe Hugging Face pour obtenir un jeton et créer un dépôt vide appelé <code>github-issues</code>. N’oubliez pas de <strong>n’enregistrez jamais vos informations d’identification</strong> dans Colab ou tout autre référentiel car ces informations peuvent être exploitées par de mauvais individus.</p></div> <p data-svelte-h="svelte-1vna43z">Ensuite, clonons le dépôt du Hub sur notre machine locale et copions-y notre fichier jeu de données. 🤗 <em>Hub</em> fournit une classe <code>Repository</code> pratique qui encapsule de nombreuses commandes Git courantes. Donc pour cloner le dépôt distant, nous devons simplement fournir l’URL et le chemin local vers lesquels nous souhaitons cloner :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> Repository | |
| repo = Repository(local_dir=<span class="hljs-string">"github-issues"</span>, clone_from=repo_url) | |
| !cp datasets-issues-<span class="hljs-keyword">with</span>-comments.jsonl github-issues/<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-8bremv">Par défaut, diverses extensions de fichiers (telles que <em>.bin</em>, <em>.gz</em> et <em>.zip</em>) sont suivies avec Git LFS afin que les fichiers volumineux puissent être versionnés dans le même workflow Git. Vous pouvez trouver une liste des extensions de fichiers suivis dans le fichier <em>.gitattributes</em> du référentiel. Pour inclure le format JSON Lines dans la liste, nous pouvons exécuter la commande suivante :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->repo.lfs_track(<span class="hljs-string">"*.jsonl"</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-28hnkh">Ensuite, nous pouvons utiliser <code>Repository.push_to_hub()</code> pour pousser le jeu de données vers le <em>Hub</em> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->repo.push_to_hub()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-w8yiax">Si nous naviguons vers l’URL contenue dans <code>repo_url</code>, nous devrions maintenant voir que notre fichier de jeu de données a été téléchargé.</p> <div class="flex justify-center" data-svelte-h="svelte-18puw29"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/hub-repo.png" alt="Our dataset repository on the Hugging Face Hub." width="80%"></div> <p data-svelte-h="svelte-1mf4gla">À partir de là, n’importe qui peut télécharger le jeu de données en fournissant simplement <code>load_dataset()</code> avec l’ID du référentiel comme argument <code>path</code> :</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->remote_dataset = load_dataset(<span class="hljs-string">"lewtun/github-issues"</span>, split=<span class="hljs-string">"train"</span>) | |
| remote_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({ | |
| features: [<span class="hljs-string">'url'</span>, <span class="hljs-string">'repository_url'</span>, <span class="hljs-string">'labels_url'</span>, <span class="hljs-string">'comments_url'</span>, <span class="hljs-string">'events_url'</span>, <span class="hljs-string">'html_url'</span>, <span class="hljs-string">'id'</span>, <span class="hljs-string">'node_id'</span>, <span class="hljs-string">'number'</span>, <span class="hljs-string">'title'</span>, <span class="hljs-string">'user'</span>, <span class="hljs-string">'labels'</span>, <span class="hljs-string">'state'</span>, <span class="hljs-string">'locked'</span>, <span class="hljs-string">'assignee'</span>, <span class="hljs-string">'assignees'</span>, <span class="hljs-string">'milestone'</span>, <span class="hljs-string">'comments'</span>, <span class="hljs-string">'created_at'</span>, <span class="hljs-string">'updated_at'</span>, <span class="hljs-string">'closed_at'</span>, <span class="hljs-string">'author_association'</span>, <span class="hljs-string">'active_lock_reason'</span>, <span class="hljs-string">'pull_request'</span>, <span class="hljs-string">'body'</span>, <span class="hljs-string">'performed_via_github_app'</span>, <span class="hljs-string">'is_pull_request'</span>], | |
| num_rows: <span class="hljs-number">2855</span> | |
| })<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1e92u5c">Cool, nous avons poussé notre jeu de données vers le <em>Hub</em> et il est disponible pour que d’autres puissent l’utiliser ! Il ne reste plus qu’une chose importante à faire : ajouter une <em>carte de jeu de données</em> qui explique comment le corpus a été créé et fournit d’autres informations utiles à la communauté.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-19botje">💡 Vous pouvez également télécharger un jeu de données sur le <em>Hub</em> directement depuis le terminal en utilisant <code>huggingface-cli</code> et un peu de magie Git. Consultez le <a href="https://huggingface.co/docs/datasets/share#share-a-dataset-using-the-cli" rel="nofollow">guide de 🤗 <em>Datasets</em></a> pour savoir comment procéder.</p></div> <h2 class="relative group"><a id="création-dune-carte-pour-un-jeu-de-données" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#création-dune-carte-pour-un-jeu-de-données"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Création d’une carte pour un jeu de données</span></h2> <p data-svelte-h="svelte-187mg26">Des jeux de données bien documentés sont plus susceptibles d’être utiles aux autres (y compris à vous-même) car ils fournissent le contexte permettant aux utilisateurs de décider si le jeu de données est pertinent pour leur tâche et d’évaluer les biais potentiels ou les risques associés à l’utilisation du jeu de données.</p> <p data-svelte-h="svelte-18gug1p">Sur le <em>Hub</em>, ces informations sont stockées dans le fichier <em>README.md</em> de chaque dépôt de jeux de données. Il y a deux étapes principales que vous devez suivre avant de créer ce fichier :</p> <ol data-svelte-h="svelte-6sif9q"><li>Utilisez l’<a href="https://huggingface.co/datasets/tagging/" rel="nofollow">application <code>datasets-tagging</code></a> pour créer des balises de métadonnées au format YAML. Ces balises sont utilisées pour une variété de fonctionnalités de recherche sur le <em>Hub</em> d’Hugging Face et garantissent que votre jeu de données peut être facilement trouvé par les membres de la communauté. Puisque nous avons créé un jeu de données personnalisé ici, vous devrez cloner le référentiel <code>datasets-tagging</code> et exécuter l’application localement. Voici à quoi ressemble l’interface :</li></ol> <div class="flex justify-center" data-svelte-h="svelte-1gqifn5"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/datasets-tagger.png" alt="The 'datasets-tagging' interface." width="80%"></div> <ol start="2" data-svelte-h="svelte-bnren5"><li>Lisez le <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">guide de 🤗 <em>Datasets</em></a> sur la création des cartes informatives des jeux de données et utilisez-le comme modèle.</li></ol> <p data-svelte-h="svelte-t3xgso">Vous pouvez créer le fichier <em>README.md</em> directement sur le <em>Hub</em> et vous pouvez trouver un modèle de carte dans le dépot <code>lewtun/github-issues</code>. Une capture d’écran de la carte remplie est illustrée ci-dessous.</p> <div class="flex justify-center" data-svelte-h="svelte-ct1wn8"><img src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/dataset-card.png" alt="A dataset card." width="80%"></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-19pnpqk">✏️ <strong>Essayez !</strong> Utilisez l’application <code>dataset-tagging</code> et <a href="https://github.com/huggingface/datasets/blob/master/templates/README_guide.md" rel="nofollow">le guide de 🤗 <em>Datasets</em></a> pour compléter le fichier <em>README.md</em> de votre jeu de données de problèmes GitHub.</p></div> <p data-svelte-h="svelte-1kodeli">C’est tout ! Nous avons vu dans cette section que la création d’un bon jeu de données peut être assez complexe, mais heureusement, le télécharger et le partager avec la communauté ne l’est pas. Dans la section suivante, nous utiliserons notre nouveau jeu de données pour créer un moteur de recherche sémantique avec 🤗 <em>Datasets</em> qui peut faire correspondre les questions aux problèmes et commentaires les plus pertinents.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1fa657v">✏️ <strong>Essayez !</strong> Suivez les étapes que nous avons suivies dans cette section pour créer un jeu de données de problèmes GitHub pour votre bibliothèque open source préférée (choisissez autre chose que 🤗 <em>Datasets</em>, bien sûr !). Pour obtenir des points bonus, <em>finetunez</em> un classifieur multilabel pour prédire les balises présentes dans le champ <code>labels</code>.</p></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/fr/chapter5/5.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1"><</span> <span data-svelte-h="svelte-x0xyl0">></span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p> | |
| <script> | |
| { | |
| __sveltekit_4ugnx6 = { | |
| assets: "/docs/course/pr_1007/fr", | |
| base: "/docs/course/pr_1007/fr", | |
| env: {} | |
| }; | |
| const element = document.currentScript.parentElement; | |
| const data = [null,null]; | |
| Promise.all([ | |
| import("/docs/course/pr_1007/fr/_app/immutable/entry/start.0dcef240.js"), | |
| import("/docs/course/pr_1007/fr/_app/immutable/entry/app.e06c0eda.js") | |
| ]).then(([kit, app]) => { | |
| kit.start(app, element, { | |
| node_ids: [0, 38], | |
| data, | |
| form: null, | |
| error: null | |
| }); | |
| }); | |
| } | |
| </script> | |
Xet Storage Details
- Size:
- 102 kB
- Xet hash:
- 7493ef3efe2790a2cd92da4cd3388a3cbee2822682333c3e3e9215fee683c240
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.