Buckets:

rtrm's picture
download
raw
149 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Es momento de subdividir&quot;,&quot;local&quot;:&quot;es-momento-de-subdividir&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Subdividiendo nuestros datos&quot;,&quot;local&quot;:&quot;subdividiendo-nuestros-datos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creando nuevas columnas&quot;,&quot;local&quot;:&quot;creando-nuevas-columnas&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Los superpoderes del método map()&quot;,&quot;local&quot;:&quot;los-superpoderes-del-método-map&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;De Dataset s a DataFrame s y viceversa&quot;,&quot;local&quot;:&quot;de-dataset-s-a-dataframe-s-y-viceversa&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creando un conjunto de validación&quot;,&quot;local&quot;:&quot;creando-un-conjunto-de-validación&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Saving a dataset&quot;,&quot;local&quot;:&quot;saving-a-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/course/pr_1069/es/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/entry/start.b7b528c6.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/scheduler.37c15a92.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/singletons.e1c0df1c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/index.18351ede.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/paths.2d1184ba.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/entry/app.ea1cc000.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/index.2bf4358c.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/nodes/0.4f78a9c4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/nodes/27.c5a2140d.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/Tip.363c041f.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/Youtube.1e50a667.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/CodeBlock.4e987730.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/CourseFloatingBanner.6add7356.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/Heading.8ada512a.js">
<link rel="modulepreload" href="/docs/course/pr_1069/es/_app/immutable/chunks/getInferenceSnippets.b37612c0.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Es momento de subdividir&quot;,&quot;local&quot;:&quot;es-momento-de-subdividir&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Subdividiendo nuestros datos&quot;,&quot;local&quot;:&quot;subdividiendo-nuestros-datos&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creando nuevas columnas&quot;,&quot;local&quot;:&quot;creando-nuevas-columnas&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Los superpoderes del método map()&quot;,&quot;local&quot;:&quot;los-superpoderes-del-método-map&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;De Dataset s a DataFrame s y viceversa&quot;,&quot;local&quot;:&quot;de-dataset-s-a-dataframe-s-y-viceversa&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Creando un conjunto de validación&quot;,&quot;local&quot;:&quot;creando-un-conjunto-de-validación&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Saving a dataset&quot;,&quot;local&quot;:&quot;saving-a-dataset&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="es-momento-de-subdividir" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#es-momento-de-subdividir"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Es momento de subdividir</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"><a href="https://discuss.huggingface.co/t/chapter-5-questions" target="_blank"><img alt="Ask a Question" class="!m-0" src="https://img.shields.io/badge/Ask%20a%20question-ffcb4c.svg?logo=data:image/svg+xml;base64,PHN2ZyB4bWxucz0iaHR0cDovL3d3dy53My5vcmcvMjAwMC9zdmciIHZpZXdCb3g9IjAgLTEgMTA0IDEwNiI+PGRlZnM+PHN0eWxlPi5jbHMtMXtmaWxsOiMyMzFmMjA7fS5jbHMtMntmaWxsOiNmZmY5YWU7fS5jbHMtM3tmaWxsOiMwMGFlZWY7fS5jbHMtNHtmaWxsOiMwMGE5NGY7fS5jbHMtNXtmaWxsOiNmMTVkMjI7fS5jbHMtNntmaWxsOiNlMzFiMjM7fTwvc3R5bGU+PC9kZWZzPjx0aXRsZT5EaXNjb3Vyc2VfbG9nbzwvdGl0bGU+PGcgaWQ9IkxheWVyXzIiPjxnIGlkPSJMYXllcl8zIj48cGF0aCBjbGFzcz0iY2xzLTEiIGQ9Ik01MS44NywwQzIzLjcxLDAsMCwyMi44MywwLDUxYzAsLjkxLDAsNTIuODEsMCw1Mi44MWw1MS44Ni0uMDVjMjguMTYsMCw1MS0yMy43MSw1MS01MS44N1M4MCwwLDUxLjg3LDBaIi8+PHBhdGggY2xhc3M9ImNscy0yIiBkPSJNNTIuMzcsMTkuNzRBMzEuNjIsMzEuNjIsMCwwLDAsMjQuNTgsNjYuNDFsLTUuNzIsMTguNEwzOS40LDgwLjE3YTMxLjYxLDMxLjYxLDAsMSwwLDEzLTYwLjQzWiIvPjxwYXRoIGNsYXNzPSJjbHMtMyIgZD0iTTc3LjQ1LDMyLjEyYTMxLjYsMzEuNiwwLDAsMS0zOC4wNSw0OEwxOC44Niw4NC44MmwyMC45MS0yLjQ3QTMxLjYsMzEuNiwwLDAsMCw3Ny40NSwzMi4xMloiLz48cGF0aCBjbGFzcz0iY2xzLTQiIGQ9Ik03MS42MywyNi4yOUEzMS42LDMxLjYsMCwwLDEsMzguOCw3OEwxOC44Niw4NC44MiwzOS40LDgwLjE3QTMxLjYsMzEuNiwwLDAsMCw3MS42MywyNi4yOVoiLz48cGF0aCBjbGFzcz0iY2xzLTUiIGQ9Ik0yNi40Nyw2Ny4xMWEzMS42MSwzMS42MSwwLDAsMSw1MS0zNUEzMS42MSwzMS42MSwwLDAsMCwyNC41OCw2Ni40MWwtNS43MiwxOC40WiIvPjxwYXRoIGNsYXNzPSJjbHMtNiIgZD0iTTI0LjU4LDY2LjQxQTMxLjYxLDMxLjYxLDAsMCwxLDcxLjYzLDI2LjI5YTMxLjYxLDMxLjYxLDAsMCwwLTQ5LDM5LjYzbC0zLjc2LDE4LjlaIi8+PC9nPjwvZz48L3N2Zz4="></a> <a href="https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter5/section3.ipynb" target="_blank"><img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"></a> <a href="https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter5/section3.ipynb" target="_blank"><img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"></a></div> <p data-svelte-h="svelte-1ft98vh">La mayor parte del tiempo tus datos no estarán perfectamente listos para entrenar modelos. En esta sección vamos a explorar distintas funciones que tiene 🤗 Datasets para limpiar tus conjuntos de datos.</p> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/tqfSFcPMgOI" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <h2 class="relative group"><a id="subdividiendo-nuestros-datos" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#subdividiendo-nuestros-datos"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Subdividiendo nuestros datos</span></h2> <p data-svelte-h="svelte-f3uyce">De manera similar a Pandas, 🤗 Datasets incluye varias funciones para manipular el contenido de los objetos <code>Dataset</code> y <code>DatasetDict</code>. Ya vimos el método <code>Dataset.map()</code> en el <a href="/course/chapter3">Capítulo 3</a> y en esta sección vamos a explorar otras funciones que tenemos a nuestra disposición.</p> <p data-svelte-h="svelte-1fu9ejq">Para este ejemplo, vamos a usar el <a href="https://archive.ics.uci.edu/ml/datasets/Drug+Review+Dataset+%28Drugs.com%29" rel="nofollow">Dataset de reseñas de medicamentos</a> alojado en el <a href="https://archive.ics.uci.edu/ml/index.php" rel="nofollow">Repositorio de Machine Learning de UC Irvine</a>, que contiene la evaluación de varios medicamentos por parte de pacientes, junto con la condición por la que los estaban tratando y una calificación en una escala de 10 estrellas sobre su satisfacción.</p> <p data-svelte-h="svelte-1t8s9yk">Primero, tenemos que descargar y extraer los datos, que se puede hacer con los comandos <code>wget</code> y <code>unzip</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!wget <span class="hljs-string">&quot;https://archive.ics.uci.edu/ml/machine-learning-databases/00462/drugsCom_raw.zip&quot;</span>
!unzip drugsCom_raw.<span class="hljs-built_in">zip</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1le9kzg">Dado que TSV es una variación de CSV en la que se usan tabulaciones en vez de comas como separadores, podemos cargar estos archivos usando el script de carga <code>csv</code> y especificando el argumento <code>delimiter</code> en la función <code>load_dataset</code> de la siguiente manera:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
data_files = {<span class="hljs-string">&quot;train&quot;</span>: <span class="hljs-string">&quot;drugsComTrain_raw.tsv&quot;</span>, <span class="hljs-string">&quot;test&quot;</span>: <span class="hljs-string">&quot;drugsComTest_raw.tsv&quot;</span>}
<span class="hljs-comment"># \t es el carácter para tabulaciones en Python</span>
drug_dataset = load_dataset(<span class="hljs-string">&quot;csv&quot;</span>, data_files=data_files, delimiter=<span class="hljs-string">&quot;\t&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-elug1x">Una buena práctica al hacer cualquier tipo de análisis de datos es tomar una muestra aleatoria del dataset para tener una vista rápida del tipo de datos con los que estás trabajando. En 🤗 Datasets, podemos crear una muestra aleatoria al encadenar las funciones <code>Dataset.shuffle()</code> y <code>Dataset.select()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_sample = drug_dataset[<span class="hljs-string">&quot;train&quot;</span>].shuffle(seed=<span class="hljs-number">42</span>).select(<span class="hljs-built_in">range</span>(<span class="hljs-number">1000</span>))
<span class="hljs-comment"># Mirar los primeros ejemplos</span>
drug_sample[:<span class="hljs-number">3</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;Unnamed: 0&#x27;</span>: [<span class="hljs-number">87571</span>, <span class="hljs-number">178045</span>, <span class="hljs-number">80482</span>],
<span class="hljs-string">&#x27;drugName&#x27;</span>: [<span class="hljs-string">&#x27;Naproxen&#x27;</span>, <span class="hljs-string">&#x27;Duloxetine&#x27;</span>, <span class="hljs-string">&#x27;Mobic&#x27;</span>],
<span class="hljs-string">&#x27;condition&#x27;</span>: [<span class="hljs-string">&#x27;Gout, Acute&#x27;</span>, <span class="hljs-string">&#x27;ibromyalgia&#x27;</span>, <span class="hljs-string">&#x27;Inflammatory Conditions&#x27;</span>],
<span class="hljs-string">&#x27;review&#x27;</span>: [<span class="hljs-string">&#x27;&quot;like the previous person mention, I&amp;#039;m a strong believer of aleve, it works faster for my gout than the prescription meds I take. No more going to the doctor for refills.....Aleve works!&quot;&#x27;</span>,
<span class="hljs-string">&#x27;&quot;I have taken Cymbalta for about a year and a half for fibromyalgia pain. It is great\r\nas a pain reducer and an anti-depressant, however, the side effects outweighed \r\nany benefit I got from it. I had trouble with restlessness, being tired constantly,\r\ndizziness, dry mouth, numbness and tingling in my feet, and horrible sweating. I am\r\nbeing weaned off of it now. Went from 60 mg to 30mg and now to 15 mg. I will be\r\noff completely in about a week. The fibro pain is coming back, but I would rather deal with it than the side effects.&quot;&#x27;</span>,
<span class="hljs-string">&#x27;&quot;I have been taking Mobic for over a year with no side effects other than an elevated blood pressure. I had severe knee and ankle pain which completely went away after taking Mobic. I attempted to stop the medication however pain returned after a few days.&quot;&#x27;</span>],
<span class="hljs-string">&#x27;rating&#x27;</span>: [<span class="hljs-number">9.0</span>, <span class="hljs-number">3.0</span>, <span class="hljs-number">10.0</span>],
<span class="hljs-string">&#x27;date&#x27;</span>: [<span class="hljs-string">&#x27;September 2, 2015&#x27;</span>, <span class="hljs-string">&#x27;November 7, 2011&#x27;</span>, <span class="hljs-string">&#x27;June 5, 2013&#x27;</span>],
<span class="hljs-string">&#x27;usefulCount&#x27;</span>: [<span class="hljs-number">36</span>, <span class="hljs-number">13</span>, <span class="hljs-number">128</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-w6qej5">Puedes ver que hemos fijado la semilla en <code>Dataset.shuffle()</code> por motivos de reproducibilidad. <code>Dataset.select()</code> espera un iterable de índices, así que incluimos <code>range(1000)</code> para tomar los primeros 1.000 ejemplos del conjunto de datos aleatorizado. Ya podemos ver algunos detalles para esta muestra:</p> <ul data-svelte-h="svelte-uotdqn"><li>La columna <code>Unnamed: 0</code> se ve sospechosamente como un ID anonimizado para cada paciente.</li> <li>La columna <code>condition</code> incluye una mezcla de niveles en mayúscula y minúscula.</li> <li>Las reseñas tienen longitud variable y contienen una mezcla de separadores de línea de Python (<code>\r\n</code>), así como caracteres de HTML como <code>&amp;\#039;</code>.</li></ul> <p data-svelte-h="svelte-wclb86">Veamos cómo podemos usar 🤗 Datasets para lidiar con cada uno de estos asuntos. Para probar la hipótesis de que la columna <code>Unnamed: 0</code> es un ID de los pacientes, podemos usar la función <code>Dataset.unique()</code> para verificar que el número de los ID corresponda con el número de filas de cada conjunto:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> split <span class="hljs-keyword">in</span> drug_dataset.keys():
<span class="hljs-keyword">assert</span> <span class="hljs-built_in">len</span>(drug_dataset[split]) == <span class="hljs-built_in">len</span>(drug_dataset[split].unique(<span class="hljs-string">&quot;Unnamed: 0&quot;</span>))<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qvor5p">Esto parece confirmar nuestra hipótesis, así que limpiemos el dataset un poco al cambiar el nombre de la columna <code>Unnamed: 0</code> a algo más legible. Podemos usar la función <code>DatasetDict.rename_column()</code> para renombrar la columna en ambos conjuntos en una sola operación:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset = drug_dataset.rename_column(
original_column_name=<span class="hljs-string">&quot;Unnamed: 0&quot;</span>, new_column_name=<span class="hljs-string">&quot;patient_id&quot;</span>
)
drug_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>],
num_rows: <span class="hljs-number">161297</span>
})
test: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>],
num_rows: <span class="hljs-number">53766</span>
})
})<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1qfpcg0">✏️ <strong>¡Inténtalo!</strong> Usa la función <code>Dataset.unique()</code> para encontrar el número de medicamentos y condiciones únicas en los conjuntos de entrenamiento y de prueba.</p></div> <p data-svelte-h="svelte-aa5kta">Ahora normalicemos todas las etiquetas de <code>condition</code> usando <code>Dataset.map()</code>. Tal como lo hicimos con la tokenización en el <a href="/course/chapter3">Capítulo 3</a>, podemos definir una función simple que pueda ser aplicada en todas las filas de cada conjunto en el <code>drug_dataset</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">lowercase_condition</span>(<span class="hljs-params">example</span>):
<span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;condition&quot;</span>: example[<span class="hljs-string">&quot;condition&quot;</span>].lower()}
drug_dataset.<span class="hljs-built_in">map</span>(lowercase_condition)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->AttributeError: <span class="hljs-string">&#x27;NoneType&#x27;</span> <span class="hljs-built_in">object</span> has no attribute <span class="hljs-string">&#x27;lower&#x27;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-jzl4v6">¡Tenemos un problema en nuestra función de mapeo! Del error podemos inferir que algunas de las entradas de la columna <code>condición</code> son <code>None</code>, que no puede transformarse en minúscula al no ser un string. Filtremos estas filas usando <code>Dataset.filter()</code>, que funciona de una forma similar <code>Dataset.map()</code> y recibe como argumento una función que toma un ejemplo particular del dataset. En vez de escribir una función explícita como:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">filter_nones</span>(<span class="hljs-params">x</span>):
<span class="hljs-keyword">return</span> x[<span class="hljs-string">&quot;condition&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-b4m5az">y luego ejecutar <code>drug_dataset.filter(filter_nones)</code>, podemos hacerlo en una línea usando una <em>función lambda</em>. En Python, las funciones lambda son funciones pequeñas que puedes definir sin nombrarlas explícitamente. Estas toman la forma general:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->lambda <span class="hljs-tag">&lt;<span class="hljs-name">arguments</span>&gt;</span> : <span class="hljs-tag">&lt;<span class="hljs-name">expression</span>&gt;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zbvbit">en la que <code>lambda</code> es una de las <a href="https://docs.python.org/3/reference/lexical_analysis.html#keywords" rel="nofollow">palabras especiales</a> de Python, <code>&lt;arguments&gt;</code> es una lista o conjunto de valores separados con coma que definen los argumentos de la función y <code>&lt;expression&gt;</code> representa las operaciones que quieres ejecutar. Por ejemplo, podemos definir una función lambda simple que eleve un número al cuadrado de la siguiente manera:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->lambda <span class="hljs-keyword">x</span> : <span class="hljs-keyword">x</span> * <span class="hljs-keyword">x</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y7rpsl">Para aplicar esta función a un <em>input</em>, tenemos que envolverla a ella y al <em>input</em> en paréntesis:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-keyword">lambda</span> x: x * x)(<span class="hljs-number">3</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">9</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-g744bn">De manera similar, podemos definir funciones lambda con múltiples argumentos separándolos con comas. Por ejemplo, podemos calcular el área de un triángulo así:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-keyword">lambda</span> base, height: <span class="hljs-number">0.5</span> * base * height)(<span class="hljs-number">4</span>, <span class="hljs-number">8</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-number">16.0</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-6serfh">Las funciones lambda son útiles cuando quieres definir funciones pequeñas de un único uso (para más información sobre ellas, te recomendamos leer este excelente <a href="https://realpython.com/python-lambda/" rel="nofollow">tutorial de Real Python</a> escrito por Andre Burgaud). En el contexto de 🤗 Datasets, podemos usar las funciones lambda para definir operaciones simples de mapeo y filtrado, así que usemos este truco para eliminar las entradas <code>None</code> de nuestro dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset = drug_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-string">&quot;condition&quot;</span>] <span class="hljs-keyword">is</span> <span class="hljs-keyword">not</span> <span class="hljs-literal">None</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15xj71k">Ahora que eliminamos los <code>None</code>, podemos normalizar nuestra columna <code>condition</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset = drug_dataset.<span class="hljs-built_in">map</span>(lowercase_condition)
<span class="hljs-comment"># Revisar que se pasaron a minúscula</span>
drug_dataset[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-string">&quot;condition&quot;</span>][:<span class="hljs-number">3</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-string">&#x27;left ventricular dysfunction&#x27;</span>, <span class="hljs-string">&#x27;adhd&#x27;</span>, <span class="hljs-string">&#x27;birth control&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1nq8uzf">¡Funcionó! Como ya limpiamos las etiquetas, veamos cómo podemos limpiar las reseñas.</p> <h2 class="relative group"><a id="creando-nuevas-columnas" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creando-nuevas-columnas"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Creando nuevas columnas</span></h2> <p data-svelte-h="svelte-13icydf">Cuando estás lidiando con reseñas de clientes, es una buena práctica revisar el número de palabras de cada reseña. Una reseña puede ser una única palabra como “¡Genial!” o un ensayo completo con miles de palabras y, según el caso de uso, tendrás que abordar estos extremos de forma diferente. Para calcular el número de palabras en cada reseña, usaremos una heurística aproximada basada en dividir cada texto por los espacios en blanco.</p> <p data-svelte-h="svelte-1vldu6x">Definamos una función simple que cuente el número de palabras en cada reseña:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">compute_review_length</span>(<span class="hljs-params">example</span>):
<span class="hljs-keyword">return</span> {<span class="hljs-string">&quot;review_length&quot;</span>: <span class="hljs-built_in">len</span>(example[<span class="hljs-string">&quot;review&quot;</span>].split())}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fur9me">Contrario a la función <code>lowercase_condition()</code>, <code>compute_review_length()</code> devuelve un diccionario cuya llave no corresponde a uno de los nombres de las columnas en el conjunto de datos. En este caso, cuando se pasa <code>compute_review_length()</code> a <code>Dataset.map()</code>, la función se aplicará a todas las filas en el dataset para crear una nueva columna <code>review_length()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset = drug_dataset.<span class="hljs-built_in">map</span>(compute_review_length)
<span class="hljs-comment"># Inspeccionar el primer ejemplo de entrenamiento</span>
drug_dataset[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">0</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;patient_id&#x27;</span>: <span class="hljs-number">206461</span>,
<span class="hljs-string">&#x27;drugName&#x27;</span>: <span class="hljs-string">&#x27;Valsartan&#x27;</span>,
<span class="hljs-string">&#x27;condition&#x27;</span>: <span class="hljs-string">&#x27;left ventricular dysfunction&#x27;</span>,
<span class="hljs-string">&#x27;review&#x27;</span>: <span class="hljs-string">&#x27;&quot;It has no side effect, I take it in combination of Bystolic 5 Mg and Fish Oil&quot;&#x27;</span>,
<span class="hljs-string">&#x27;rating&#x27;</span>: <span class="hljs-number">9.0</span>,
<span class="hljs-string">&#x27;date&#x27;</span>: <span class="hljs-string">&#x27;May 20, 2012&#x27;</span>,
<span class="hljs-string">&#x27;usefulCount&#x27;</span>: <span class="hljs-number">27</span>,
<span class="hljs-string">&#x27;review_length&#x27;</span>: <span class="hljs-number">17</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1daay00">Tal como lo esperábamos, podemos ver que se añadió la columna <code>review_length</code> al conjunto de entrenamiento. Podemos ordenar esta columna nueva con <code>Dataset.sort()</code> para ver cómo son los valores extremos:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset[<span class="hljs-string">&quot;train&quot;</span>].sort(<span class="hljs-string">&quot;review_length&quot;</span>)[:<span class="hljs-number">3</span>]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;patient_id&#x27;</span>: [<span class="hljs-number">103488</span>, <span class="hljs-number">23627</span>, <span class="hljs-number">20558</span>],
<span class="hljs-string">&#x27;drugName&#x27;</span>: [<span class="hljs-string">&#x27;Loestrin 21 1 / 20&#x27;</span>, <span class="hljs-string">&#x27;Chlorzoxazone&#x27;</span>, <span class="hljs-string">&#x27;Nucynta&#x27;</span>],
<span class="hljs-string">&#x27;condition&#x27;</span>: [<span class="hljs-string">&#x27;birth control&#x27;</span>, <span class="hljs-string">&#x27;muscle spasm&#x27;</span>, <span class="hljs-string">&#x27;pain&#x27;</span>],
<span class="hljs-string">&#x27;review&#x27;</span>: [<span class="hljs-string">&#x27;&quot;Excellent.&quot;&#x27;</span>, <span class="hljs-string">&#x27;&quot;useless&quot;&#x27;</span>, <span class="hljs-string">&#x27;&quot;ok&quot;&#x27;</span>],
<span class="hljs-string">&#x27;rating&#x27;</span>: [<span class="hljs-number">10.0</span>, <span class="hljs-number">1.0</span>, <span class="hljs-number">6.0</span>],
<span class="hljs-string">&#x27;date&#x27;</span>: [<span class="hljs-string">&#x27;November 4, 2008&#x27;</span>, <span class="hljs-string">&#x27;March 24, 2017&#x27;</span>, <span class="hljs-string">&#x27;August 20, 2016&#x27;</span>],
<span class="hljs-string">&#x27;usefulCount&#x27;</span>: [<span class="hljs-number">5</span>, <span class="hljs-number">2</span>, <span class="hljs-number">10</span>],
<span class="hljs-string">&#x27;review_length&#x27;</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>]}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1i1xtkk">Como lo discutimos anteriormente, algunas reseñas incluyen una sola palabra, que si bien puede ser útil para el análisis de sentimientos, no sería tan informativa si quisiéramos predecir la condición.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1t4fet9">🙋 Una forma alternativa de añadir nuevas columnas al dataset es a través de la función <code>Dataset.add_column()</code>. Esta te permite incluir la columna como una lista de Python o un array de NumPy y puede ser útil en situaciones en las que <code>Dataset.map()</code> no se ajusta a tu caso de uso.</p></div> <p data-svelte-h="svelte-1iez2yj">Usemos la función <code>Dataset.filter()</code> para quitar las reseñas que contienen menos de 30 palabras. Similar a lo que hicimos con la columna <code>condition</code>, podemos filtrar las reseñas cortas al incluir una condición de que su longitud esté por encima de este umbral:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset = drug_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-string">&quot;review_length&quot;</span>] &gt; <span class="hljs-number">30</span>)
<span class="hljs-built_in">print</span>(drug_dataset.num_rows)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&#x27;train&#x27;</span>: <span class="hljs-number">138514</span>, <span class="hljs-string">&#x27;test&#x27;</span>: <span class="hljs-number">46108</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ega5u0">Como puedes ver, esto ha eliminado alrededor del 15% de las reseñas de nuestros conjuntos originales de entrenamiento y prueba.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-nd86z3">✏️ <strong>¡Inténtalo!</strong> Usa la función <code>Dataset.sort()</code> para inspeccionar las reseñas con el mayor número de palabras. Revisa la <a href="https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.sort" rel="nofollow">documentación</a> para ver cuál argumento necesitas para ordenar las reseñas de mayor a menor.</p></div> <p data-svelte-h="svelte-1fapn21">Por último, tenemos que lidiar con la presencia de códigos de caracteres HTML en las reseñas. Podemos usar el módulo <code>html</code> de Python para transformar estos códigos así:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">import</span> html
text = <span class="hljs-string">&quot;I&amp;#039;m a transformer called BERT&quot;</span>
html.unescape(text)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-string">&quot;I&#x27;m a transformer called BERT&quot;</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-kg3xjp">Usaremos <code>Dataset.map()</code> para transformar todos los caracteres HTML en el corpus:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset = drug_dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;review&quot;</span>: html.unescape(x[<span class="hljs-string">&quot;review&quot;</span>])})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1isc3v0">Como puedes ver, el método <code>Dataset.map()</code> es muy útil para procesar datos y esta es apenas la punta del iceberg de lo que puede hacer.</p> <h2 class="relative group"><a id="los-superpoderes-del-método-map" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#los-superpoderes-del-método-map"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Los superpoderes del método map()</span></h2> <p data-svelte-h="svelte-x7ltoh">El método <code>Dataset.map()</code> recibe un argumento <code>matched</code> que, al definirse como <code>True</code>, envía un lote de ejemplos a la función de mapeo a la vez (el tamaño del lote se puede configurar, pero tiene un valor por defecto de 1.000). Por ejemplo, la función anterior de mapeo que transformó todos los HTML se demoró un poco en su ejecución (puedes leer el tiempo en las barras de progreso). Podemos reducir el tiempo al procesar varios elementos a la vez usando un <em>list comprehension</em>.</p> <p data-svelte-h="svelte-1a4knlr">Cuando especificas <code>batched=True</code>, la función recibe un diccionario con los campos del dataset, pero cada valor es ahora una <em>lista de valores</em> y no un valor individual. La salida de <code>Dataset.map()</code> debería ser igual: un diccionario con los campos que queremos actualizar o añadir a nuestro dataset y una lista de valores. Por ejemplo, aquí puedes ver otra forma de transformar todos los caracteres HTML usando <code>batched=True</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->new_drug_dataset = drug_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;review&quot;</span>: [html.unescape(o) <span class="hljs-keyword">for</span> o <span class="hljs-keyword">in</span> x[<span class="hljs-string">&quot;review&quot;</span>]]}, batched=<span class="hljs-literal">True</span>
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1qf7cjn">Si estás ejecutando este código en un cuaderno, verás que este comando se ejecuta mucho más rápido que el anterior. Y no es porque los caracteres HTML de las reseñas ya se hubieran procesado; si vuelves a ejecutar la instrucción de la sección anterior (sin <code>batched=True</code>), se tomará el mismo tiempo de ejecución que antes. Esto es porque las <em>list comprehensions</em> suelen ser más rápidas que ejecutar el mismo código en un ciclo <code>for</code> y porque también ganamos rendimiento al acceder a muchos elementos a la vez en vez de uno por uno.</p> <p data-svelte-h="svelte-1vouaf5">Usar <code>Dataset.map()</code> con <code>batched=True</code> será fundamental para desbloquear la velocidad de los tokenizadores “rápidos” que nos vamos a encontrar en el <a href="/course/chapter6">Capítulo 6</a>, que pueden tokenizar velozmente grandes listas de textos. Por ejemplo, para tokenizar todas las reseñas de medicamentos con un tokenizador rápido, podríamos usar una función como la siguiente:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;bert-base-cased&quot;</span>)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize_function</span>(<span class="hljs-params">examples</span>):
<span class="hljs-keyword">return</span> tokenizer(examples[<span class="hljs-string">&quot;review&quot;</span>], truncation=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1v21g6w">Como viste en el <a href="/course/chapter3">Capítulo 3</a>, podemos pasar uno o varios ejemplos al tokenizador, así que podemos usar esta función con o sin <code>batched=True</code>. Aprovechemos esta oportunidad para comparar el desempeño de las distintas opciones. En un cuaderno, puedes medir el tiempo de ejecución de una instrucción de una línea añadiendo <code>%time</code> antes de la línea de código de tu interés:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->%time tokenized_dataset = drug_dataset.<span class="hljs-built_in">map</span>(tokenize_function, batched=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-7t311a">También puedes medir el tiempo de una celda completa añadiendo <code>%%time</code> al inicio de la celda. En el hardware en el que lo ejecutamos, nos arrojó 10.8s para esta instrucción (es el número que aparece después de “Wall time”).</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-115eogg">✏️ <strong>¡Inténtalo!</strong> Ejecuta la misma instrucción con y sin <code>batched=True</code> y luego usa un tokenizador “lento” (añade <code>use_fast=False</code> en el método <code>AutoTokenizer.from_pretrained()</code>) para ver cuánto tiempo se toman en tu computador.</p></div> <p data-svelte-h="svelte-1uftzft">Estos son los resultados que obtuvimos con y sin la ejecución por lotes, con un tokenizador rápido y lento:</p> <table data-svelte-h="svelte-8qajbz"><thead><tr><th align="center">Opciones</th> <th align="center">Tokenizador rápido</th> <th align="center">Tokenizador lento</th></tr></thead> <tbody><tr><td align="center"><code>batched=True</code></td> <td align="center">10.8s</td> <td align="center">4min41s</td></tr> <tr><td align="center"><code>batched=False</code></td> <td align="center">59.2s</td> <td align="center">5min3s</td></tr></tbody></table> <p data-svelte-h="svelte-1vwo6tk">Esto significa que usar un tokenizador rápido con la opción <code>batched=True</code> es 30 veces más rápido que su contraparte lenta sin usar lotes. ¡Realmente impresionante! Esta es la razón principal por la que los tokenizadores rápidos son la opción por defecto al usar <code>AutoTokenizer</code> (y por qué se denominan “rápidos”). Estos logran tal rapidez gracias a que el código de los tokenizadores corre en Rust, que es un lenguaje que facilita la ejecución del código en paralelo.</p> <p data-svelte-h="svelte-ggdes7">La paralelización también es la razón para el incremento de 6x en la velocidad del tokenizador al ejecutarse por lotes: No puedes ejecutar una única operación de tokenización en paralelo, pero cuando quieres tokenizar muchos textos al mismo tiempo puedes dividir la ejecución en diferentes procesos, cada uno responsable de sus propios textos.</p> <p data-svelte-h="svelte-1kz29fc"><code>Dataset.map()</code> también tiene algunas capacidades de paralelización. Dado que no funcionan con Rust, no van a hacer que un tokenizador lento alcance el rendimiento de uno rápido, pero aún así pueden ser útiles (especialmente si estás usando un tokenizador que no tiene una versión rápida). Para habilitar el multiprocesamiento, usa el argumento <code>num_proc</code> y especifica el número de procesos para usar en <code>Dataset.map()</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->slow_tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">&quot;bert-base-cased&quot;</span>, use_fast=<span class="hljs-literal">False</span>)
<span class="hljs-keyword">def</span> <span class="hljs-title function_">slow_tokenize_function</span>(<span class="hljs-params">examples</span>):
<span class="hljs-keyword">return</span> slow_tokenizer(examples[<span class="hljs-string">&quot;review&quot;</span>], truncation=<span class="hljs-literal">True</span>)
tokenized_dataset = drug_dataset.<span class="hljs-built_in">map</span>(slow_tokenize_function, batched=<span class="hljs-literal">True</span>, num_proc=<span class="hljs-number">8</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-sutmnm">También puedes medir el tiempo para determinar el número de procesos que vas a usar. En nuestro caso, usar 8 procesos produjo la mayor ganancia de velocidad. Aquí están algunos de los números que obtuvimos con y sin multiprocesamiento:</p> <table data-svelte-h="svelte-1h7l6xc"><thead><tr><th align="center">Opciones</th> <th align="center">Tokenizador rápido</th> <th align="center">Tokenizador lento</th></tr></thead> <tbody><tr><td align="center"><code>batched=True</code></td> <td align="center">10.8s</td> <td align="center">4min41s</td></tr> <tr><td align="center"><code>batched=False</code></td> <td align="center">59.2s</td> <td align="center">5min3s</td></tr> <tr><td align="center"><code>batched=True</code>, <code>num_proc=8</code></td> <td align="center">6.52s</td> <td align="center">41.3s</td></tr> <tr><td align="center"><code>batched=False</code>, <code>num_proc=8</code></td> <td align="center">9.49s</td> <td align="center">45.2s</td></tr></tbody></table> <p data-svelte-h="svelte-1snt99c">Estos son resultados mucho más razonables para el tokenizador lento, aunque el desempeño del rápido también mejoró sustancialmente. Sin embargo, este no siempre será el caso: para valores de <code>num_proc</code> diferentes a 8, nuestras pruebas mostraron que era más rápido usar <code>batched=true</code> sin esta opción. En general, no recomendamos usar el multiprocesamiento de Python para tokenizadores rápidos con <code>batched=True</code>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-ze6prh">Usar <code>num_proc</code> para acelerar tu procesamiento suele ser una buena idea, siempre y cuando la función que uses no esté usando multiples procesos por si misma.</p></div> <p data-svelte-h="svelte-16serkw">Que toda esta funcionalidad está incluida en un método es algo impresionante en si mismo, ¡pero hay más!. Con <code>Dataset.map()</code> y <code>batched=True</code> puedes cambiar el número de elementos en tu dataset. Esto es súper útil en situaciones en las que quieres crear varias características de entrenamiento de un ejemplo, algo que haremos en el preprocesamiento para varias de las tareas de PLN que abordaremos en el <a href="/course/chapter7">Capítulo 7</a>.</p> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-qa77d6">💡 Un <em>ejemplo</em> en Machine Learning se suele definir como el conjunto de <em>features</em> que le damos al modelo. En algunos contextos estos features serán el conjunto de columnas en un <code>Dataset</code>, mientras que en otros se pueden extraer múltiples features de un solo ejemplo que pertenecen a una columna –como aquí y en tareas de responder preguntas-.</p></div> <p data-svelte-h="svelte-192stp6">¡Veamos cómo funciona! En este ejemplo vamos a tokenizar nuestros ejemplos y limitarlos a una longitud máxima de 128, pero le pediremos al tokenizador que devuelva <em>todos</em> los fragmentos de texto en vez de unicamente el primero. Esto se puede lograr con el argumento <code>return_overflowing_tokens=True</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize_and_split</span>(<span class="hljs-params">examples</span>):
<span class="hljs-keyword">return</span> tokenizer(
examples[<span class="hljs-string">&quot;review&quot;</span>],
truncation=<span class="hljs-literal">True</span>,
max_length=<span class="hljs-number">128</span>,
return_overflowing_tokens=<span class="hljs-literal">True</span>,
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1sosiga">Probémoslo en un ejemplo puntual antes de usar <code>Dataset.map()</code> en todo el dataset:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->result = tokenize_and_split(drug_dataset[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">0</span>])
[<span class="hljs-built_in">len</span>(inp) <span class="hljs-keyword">for</span> inp <span class="hljs-keyword">in</span> result[<span class="hljs-string">&quot;input_ids&quot;</span>]]<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->[<span class="hljs-number">128</span>, <span class="hljs-number">49</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-f2gix1">El primer ejemplo en el conjunto de entrenamiento se convirtió en dos features porque fue tokenizado en un número superior de tokens al que especificamos: el primero de longitud 128 y el segundo de longitud 49. ¡Vamos a aplicarlo a todo el dataset!</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenized_dataset = drug_dataset.<span class="hljs-built_in">map</span>(tokenize_and_split, batched=<span class="hljs-literal">True</span>)<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->ArrowInvalid: Column <span class="hljs-number">1</span> named condition expected length <span class="hljs-number">1463</span> but got length <span class="hljs-number">1000</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vdku5q">¿Por qué no funcionó? El mensaje de error nos da una pista: hay un desajuste en las longitudes de una de las columnas, siendo una de longitud 1.463 y otra de longitud 1.000. Si has revisado la <a href="https://huggingface.co/docs/datasets/package_reference/main_classes#datasets.Dataset.map" rel="nofollow">documentación de <code>Dataset.map()</code></a>, te habrás dado cuenta que estamos mapeando el número de muestras que le pasamos a la función: en este caso los 1.000 ejemplos nos devuelven 1.463 features, arrojando un error.</p> <p data-svelte-h="svelte-192bw40">El problema es que estamos tratando de mezclar dos datasets de tamaños diferentes: las columnas de <code>drug_dataset</code> tendrán un cierto número de ejemplos (los 1.000 en el error), pero el <code>tokenized_dataset</code> que estamos construyendo tendrá más (los 1.463 en el mensaje de error). Esto no funciona para un <code>Dataset</code>, así que tenemos que eliminar las columnas del anterior dataset o volverlas del mismo tamaño del nuevo. Podemos hacer la primera operación con el argumento <code>remove_columns</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenized_dataset = drug_dataset.<span class="hljs-built_in">map</span>(
tokenize_and_split, batched=<span class="hljs-literal">True</span>, remove_columns=drug_dataset[<span class="hljs-string">&quot;train&quot;</span>].column_names
)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-11arle1">Ahora funciona sin errores. Podemos revisar que nuestro dataset nuevo tiene más elementos que el original al comparar sus longitudes:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-built_in">len</span>(tokenized_dataset[<span class="hljs-string">&quot;train&quot;</span>]), <span class="hljs-built_in">len</span>(drug_dataset[<span class="hljs-string">&quot;train&quot;</span>])<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->(<span class="hljs-number">206772</span>, <span class="hljs-number">138514</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-lwfjei">También mencionamos que podemos trabajar con el problema de longitudes que no coinciden al convertir las columnas viejas en el mismo tamaño de las nuevas. Para eso, vamos a necesitar el campo <code>overflow_to_sample_mapping</code> que devuelve el tokenizer cuando definimos <code>return_overflowing_tokens=True</code>. Esto devuelve un mapeo del índice de un nuevo feature al índice de la muestra de la que se originó. Usando lo anterior, podemos asociar cada llave presente en el dataset original con una lista de valores del tamaño correcto al repetir los valores de cada ejemplo tantas veces como genere nuevos features:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">def</span> <span class="hljs-title function_">tokenize_and_split</span>(<span class="hljs-params">examples</span>):
result = tokenizer(
examples[<span class="hljs-string">&quot;review&quot;</span>],
truncation=<span class="hljs-literal">True</span>,
max_length=<span class="hljs-number">128</span>,
return_overflowing_tokens=<span class="hljs-literal">True</span>,
)
<span class="hljs-comment"># Extraer el mapeo entre los índices nuevos y viejos</span>
sample_map = result.pop(<span class="hljs-string">&quot;overflow_to_sample_mapping&quot;</span>)
<span class="hljs-keyword">for</span> key, values <span class="hljs-keyword">in</span> examples.items():
result[key] = [values[i] <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> sample_map]
<span class="hljs-keyword">return</span> result<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1y88w13">De esta forma, podemos ver que funciona con <code>Dataset.map()</code> sin necesidad de eliminar las columnas viejas.</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->tokenized_dataset = drug_dataset.<span class="hljs-built_in">map</span>(tokenize_and_split, batched=<span class="hljs-literal">True</span>)
tokenized_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;attention_mask&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;input_ids&#x27;</span>, <span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>, <span class="hljs-string">&#x27;token_type_ids&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>],
num_rows: <span class="hljs-number">206772</span>
})
test: Dataset({
features: [<span class="hljs-string">&#x27;attention_mask&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;input_ids&#x27;</span>, <span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>, <span class="hljs-string">&#x27;token_type_ids&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>],
num_rows: <span class="hljs-number">68876</span>
})
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-bqk7eo">Como resultado, tenemos el mismo número de features de entrenamiento que antes, pero conservando todos los campos anteriores. Quizás prefieras usar esta opción si necesitas conservarlos para algunas tareas de post-procesamiento después de aplicar tu modelo.</p> <p data-svelte-h="svelte-1046ddo">Ya has visto como usar 🤗 Datasets para preprocesar un dataset de varias formas. Si bien las funciones de procesamiento de 🤗 Datasets van a suplir la mayor parte de tus necesidades de entrenamiento de modelos, hay ocasiones en las que puedes necesitar Pandas para tener acceso a herramientas más poderosas, como <code>DataFrame.groupby()</code> o algún API de alto nivel para visualización. Afortunadamente, 🤗 Datasets está diseñado para ser interoperable con librerías como Pandas, NumPy, PyTorch, TensoFlow y JAX. Veamos cómo funciona.</p> <h2 class="relative group"><a id="de-dataset-s-a-dataframe-s-y-viceversa" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#de-dataset-s-a-dataframe-s-y-viceversa"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>De Dataset s a DataFrame s y viceversa</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/tfcY1067A5Q" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-1cjapf7">Para habilitar la conversión entre varias librerías de terceros, 🤗 Datasets provee la función <code>Dataset.set_format()</code>. Esta función sólo cambia el <em>formato de salida</em> del dataset, de tal manera que puedas cambiar a otro formato sin cambiar el <em>formato de datos subyacente</em>, que es Apache Arrow. Este cambio de formato se hace <em>in place</em>. Para verlo en acción, convirtamos el dataset a Pandas:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset.set_format(<span class="hljs-string">&quot;pandas&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-zru1uz">Ahora, cuando accedemos a los elementos del dataset obtenemos un <code>pandas.DataFrame</code> en vez de un diccionario:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset[<span class="hljs-string">&quot;train&quot;</span>][:<span class="hljs-number">3</span>]<!-- HTML_TAG_END --></pre></div> <table border="1" class="dataframe" data-svelte-h="svelte-fhhlil"><thead><tr style="text-align: right;"><th></th> <th>patient_id</th> <th>drugName</th> <th>condition</th> <th>review</th> <th>rating</th> <th>date</th> <th>usefulCount</th> <th>review_length</th></tr></thead> <tbody><tr><th>0</th> <td>95260</td> <td>Guanfacine</td> <td>adhd</td> <td>&quot;My son is halfway through his fourth week of Intuniv...&quot;</td> <td>8.0</td> <td>April 27, 2010</td> <td>192</td> <td>141</td></tr> <tr><th>1</th> <td>92703</td> <td>Lybrel</td> <td>birth control</td> <td>&quot;I used to take another oral contraceptive, which had 21 pill cycle, and was very happy- very light periods, max 5 days, no other side effects...&quot;</td> <td>5.0</td> <td>December 14, 2009</td> <td>17</td> <td>134</td></tr> <tr><th>2</th> <td>138000</td> <td>Ortho Evra</td> <td>birth control</td> <td>&quot;This is my first time using any form of birth control...&quot;</td> <td>8.0</td> <td>November 3, 2015</td> <td>10</td> <td>89</td></tr></tbody></table> <p data-svelte-h="svelte-h3y6te">Creemos un <code>pandas.DataFrame</code> para el conjunto de entrenamiento entero al seleccionar los elementos de <code>drug_dataset[&quot;train&quot;]</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->train_df = drug_dataset[<span class="hljs-string">&quot;train&quot;</span>][:]<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-jjrk22">🚨 Internamente, <code>Dataset.set_format()</code> cambia el formato de devolución del método <em>dunder</em> <code>__getitem()__</code>. Esto significa que cuando queremos crear un objeto nuevo como <code>train_df</code> de un <code>Dataset</code> en formato <code>&quot;pandas&quot;</code>, tenemos que seleccionar el dataset completo para obtener un <code>pandas.DataFrame</code>. Puedes verificar por ti mismo que el tipo de <code>drug_dataset[&quot;train&quot;]</code> es <code>Dataset</code> sin importar el formato de salida.</p></div> <p data-svelte-h="svelte-lke1gb">De aquí en adelante podemos usar toda la funcionalidad de pandas cuando queramos. Por ejemplo, podemos hacer un encadenamiento sofisticado para calcular la distribución de clase entre las entradas de <code>condition</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->frequencies = (
train_df[<span class="hljs-string">&quot;condition&quot;</span>]
.value_counts()
.to_frame()
.reset_index()
.rename(columns={<span class="hljs-string">&quot;index&quot;</span>: <span class="hljs-string">&quot;condition&quot;</span>, <span class="hljs-string">&quot;count&quot;</span>: <span class="hljs-string">&quot;frequency&quot;</span>})
)
frequencies.head()<!-- HTML_TAG_END --></pre></div> <table border="1" class="dataframe" data-svelte-h="svelte-10crns6"><thead><tr style="text-align: right;"><th></th> <th>condition</th> <th>frequency</th></tr></thead> <tbody><tr><th>0</th> <td>birth control</td> <td>27655</td></tr> <tr><th>1</th> <td>depression</td> <td>8023</td></tr> <tr><th>2</th> <td>acne</td> <td>5209</td></tr> <tr><th>3</th> <td>anxiety</td> <td>4991</td></tr> <tr><th>4</th> <td>pain</td> <td>4744</td></tr></tbody></table> <p data-svelte-h="svelte-jcxeaa">Y una vez hemos concluido el análisis con Pandas, tenemos la posibilidad de crear un nuevo objeto <code>Dataset</code> usando la función <code>Dataset.from_pandas()</code> de la siguiente manera:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
freq_dataset = Dataset.from_pandas(frequencies)
freq_dataset<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->Dataset({
features: [<span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;frequency&#x27;</span>],
num_rows: <span class="hljs-number">819</span>
})<!-- HTML_TAG_END --></pre></div> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-3lsbyv">✏️ <strong>¡Inténtalo!</strong> Calcula la calificación promedio por medicamento y guarda el resultado en un nuevo <code>Dataset</code>.</p></div> <p data-svelte-h="svelte-bpgcn1">Con esto terminamos nuestro tour de las múltiples técnicas de preprocesamiento disponibles en 🤗 Datasets. Para concluir, creemos un set de validación para preparar el conjunto de datos y entrenar el clasificador. Antes de hacerlo, vamos a reiniciar el formato de salida de <code>drug_dataset</code> de <code>&quot;pandas&quot;</code> a <code>&quot;arrow&quot;</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset.reset_format()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="creando-un-conjunto-de-validación" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#creando-un-conjunto-de-validación"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Creando un conjunto de validación</span></h2> <p data-svelte-h="svelte-eqztri">Si bien tenemos un conjunto de prueba que podríamos usar para la evaluación, es una buena práctica dejar el conjunto de prueba intacto y crear un conjunto de validación aparte durante el desarrollo. Una vez estés satisfecho con el desempeño de tus modelos en el conjunto de validación, puedes hacer un último chequeo con el conjunto de prueba. Este proceso ayuda a reducir el riesgo de sobreajustar al conjunto de prueba y desplegar un modelo que falle en datos reales.</p> <p data-svelte-h="svelte-1h6ahql">🤗 Datasets provee la función <code>Dataset.train_test_split()</code> que está basada en la famosa funcionalidad de <code>scikit-learn</code>. Usémosla para separar nuestro conjunto de entrenamiento en dos partes <code>train</code> y <code>validation</code> (definiendo el argumento <code>seed</code> por motivos de reproducibilidad):</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset_clean = drug_dataset[<span class="hljs-string">&quot;train&quot;</span>].train_test_split(train_size=<span class="hljs-number">0.8</span>, seed=<span class="hljs-number">42</span>)
<span class="hljs-comment"># Renombrar el conjunto &quot;test&quot; a &quot;validation&quot;</span>
drug_dataset_clean[<span class="hljs-string">&quot;validation&quot;</span>] = drug_dataset_clean.pop(<span class="hljs-string">&quot;test&quot;</span>)
<span class="hljs-comment"># Añadir el conjunto &quot;test&quot; al `DatasetDict`</span>
drug_dataset_clean[<span class="hljs-string">&quot;test&quot;</span>] = drug_dataset[<span class="hljs-string">&quot;test&quot;</span>]
drug_dataset_clean<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>, <span class="hljs-string">&#x27;review_clean&#x27;</span>],
num_rows: <span class="hljs-number">110811</span>
})
validation: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>, <span class="hljs-string">&#x27;review_clean&#x27;</span>],
num_rows: <span class="hljs-number">27703</span>
})
test: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>, <span class="hljs-string">&#x27;review_clean&#x27;</span>],
num_rows: <span class="hljs-number">46108</span>
})
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1orrttc">Súper, ya preparamos un dataset que está listo para entrenar modelos. En la <a href="/course/chapter5/5">sección 5</a> veremos cómo subir datasets al Hub de Hugging Face, pero por ahora terminemos el análisis estudiando algunas formas de guardarlos en tu máquina local.</p> <h2 class="relative group"><a id="saving-a-dataset" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#saving-a-dataset"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Saving a dataset</span></h2> <iframe class="w-full xl:w-4/6 h-80" src="https://www.youtube-nocookie.com/embed/blF9uxYcKHo" title="YouTube video player" frameborder="0" allow="accelerometer; autoplay; clipboard-write; encrypted-media; gyroscope; picture-in-picture" allowfullscreen></iframe> <p data-svelte-h="svelte-ayz2sz">A pesar de que 🤗 Datasets va a guardar en caché todo dataset que descargues, así como las operaciones que se ejecutan en él, hay ocasiones en las que querrás guardar un dataset en memoria (e.g., en caso que el caché se elimine). Como se muestra en la siguiente tabla, 🤗 Datasets tiene 3 funciones para guardar tu dataset en distintos formatos:</p> <table data-svelte-h="svelte-94htba"><thead><tr><th align="center">Formato</th> <th align="center">Función</th></tr></thead> <tbody><tr><td align="center">Arrow</td> <td align="center"><code>Dataset.save_to_disk()</code></td></tr> <tr><td align="center">CSV</td> <td align="center"><code>Dataset.to_csv()</code></td></tr> <tr><td align="center">JSON</td> <td align="center"><code>Dataset.to_json()</code></td></tr></tbody></table> <p data-svelte-h="svelte-1gdc94b">Por ejemplo, guardemos el dataset limpio en formato Arrow:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug_dataset_clean.save_to_disk(<span class="hljs-string">&quot;drug-reviews&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-9r8bgz">Esto creará una carpeta con la siguiente estructura:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->drug-reviews/
├── dataset_dict.json
├── test
│ ├── dataset.arrow
│ ├── dataset_info.json
│ └── <span class="hljs-keyword">state</span>.json
├── train
│ ├── dataset.arrow
│ ├── dataset_info.json
│ ├── indices.arrow
│ └── <span class="hljs-keyword">state</span>.json
└── validation
├── dataset.arrow
├── dataset_info.json
├── indices.arrow
└── <span class="hljs-keyword">state</span>.json<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-17u8ear">en las que podemos ver que cada parte del dataset está asociada con una tabla <em>dataset.arrow</em> y algunos metadatos en <em>dataset_info.json</em> y <em>state.json</em>. Puedes pensar en el formato Arrow como una tabla sofisticada de columnas y filas que está optimizada para construir aplicaciones de alto rendimiento que procesan y transportan datasets grandes.</p> <p data-svelte-h="svelte-1jueylv">Una vez el dataset está guardado, podemos cargarlo usando la función <code>load_from_disk()</code> así:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_from_disk
drug_dataset_reloaded = load_from_disk(<span class="hljs-string">&quot;drug-reviews&quot;</span>)
drug_dataset_reloaded<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->DatasetDict({
train: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>],
num_rows: <span class="hljs-number">110811</span>
})
validation: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>],
num_rows: <span class="hljs-number">27703</span>
})
test: Dataset({
features: [<span class="hljs-string">&#x27;patient_id&#x27;</span>, <span class="hljs-string">&#x27;drugName&#x27;</span>, <span class="hljs-string">&#x27;condition&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;rating&#x27;</span>, <span class="hljs-string">&#x27;date&#x27;</span>, <span class="hljs-string">&#x27;usefulCount&#x27;</span>, <span class="hljs-string">&#x27;review_length&#x27;</span>],
num_rows: <span class="hljs-number">46108</span>
})
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-151z3do">Para los formatos CSV y JSON, tenemos que guardar cada parte en un archivo separado. Una forma de hacerlo es iterando sobre las llaves y valores del objeto <code>DatasetDict</code>:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-keyword">for</span> split, dataset <span class="hljs-keyword">in</span> drug_dataset_clean.items():
dataset.to_json(<span class="hljs-string">f&quot;drug-reviews-<span class="hljs-subst">{split}</span>.jsonl&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mqsiji">Esto guarda cada parte en formato <a href="https://jsonlines.org" rel="nofollow">JSON Lines</a>, donde cada fila del dataset está almacenada como una única línea de JSON. Así se ve el primer ejemplo:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->!head -n <span class="hljs-number">1</span> drug-reviews-train.jsonl<!-- HTML_TAG_END --></pre></div> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->{<span class="hljs-string">&quot;patient_id&quot;</span>:<span class="hljs-number">141780</span>,<span class="hljs-string">&quot;drugName&quot;</span>:<span class="hljs-string">&quot;Escitalopram&quot;</span>,<span class="hljs-string">&quot;condition&quot;</span>:<span class="hljs-string">&quot;depression&quot;</span>,<span class="hljs-string">&quot;review&quot;</span>:<span class="hljs-string">&quot;\&quot;I seemed to experience the regular side effects of LEXAPRO, insomnia, low sex drive, sleepiness during the day. I am taking it at night because my doctor said if it made me tired to take it at night. I assumed it would and started out taking it at night. Strange dreams, some pleasant. I was diagnosed with fibromyalgia. Seems to be helping with the pain. Have had anxiety and depression in my family, and have tried quite a few other medications that haven&#x27;t worked. Only have been on it for two weeks but feel more positive in my mind, want to accomplish more in my life. Hopefully the side effects will dwindle away, worth it to stick with it from hearing others responses. Great medication.\&quot;&quot;</span>,<span class="hljs-string">&quot;rating&quot;</span>:<span class="hljs-number">9.0</span>,<span class="hljs-string">&quot;date&quot;</span>:<span class="hljs-string">&quot;May 29, 2011&quot;</span>,<span class="hljs-string">&quot;usefulCount&quot;</span>:<span class="hljs-number">10</span>,<span class="hljs-string">&quot;review_length&quot;</span>:<span class="hljs-number">125</span>}<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1dtgzip">Podemos usar las técnicas de la <a href="/course/chapter5/2">sección 2</a> para cargar los archivos JSON de la siguiente manera:</p> <div class="code-block relative "><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->data_files = {
<span class="hljs-string">&quot;train&quot;</span>: <span class="hljs-string">&quot;drug-reviews-train.jsonl&quot;</span>,
<span class="hljs-string">&quot;validation&quot;</span>: <span class="hljs-string">&quot;drug-reviews-validation.jsonl&quot;</span>,
<span class="hljs-string">&quot;test&quot;</span>: <span class="hljs-string">&quot;drug-reviews-test.jsonl&quot;</span>,
}
drug_dataset_reloaded = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ddcoie">Esto es todo lo que vamos a ver en nuestra revisión del manejo de datos con 🤗 Datasets. Ahora que tenemos un dataset limpio para entrenar un modelo, aquí van algunas ideas que podrías intentar:</p> <ol data-svelte-h="svelte-trdxcg"><li>Usa las técnicas del <a href="/course/chapter3">Capítulo 3</a> para entrenar un clasificador que pueda predecir la condición del paciente con base en las reseñas de los medicamentos.</li> <li>Usa el pipeline de <code>summarization</code> del <a href="/course/chapter1">Capítulo 1</a> para generar resúmenes de las reseñas.</li></ol> <p data-svelte-h="svelte-1h80x5y">En la siguiente sección veremos cómo 🤗 Datasets te puede ayudar a trabajar con datasets enormes ¡sin explotar tu computador!</p> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/course/blob/main/chapters/es/chapter5/3.mdx" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
__sveltekit_brjtkr = {
assets: "/docs/course/pr_1069/es",
base: "/docs/course/pr_1069/es",
env: {}
};
const element = document.currentScript.parentElement;
const data = [null,null];
Promise.all([
import("/docs/course/pr_1069/es/_app/immutable/entry/start.b7b528c6.js"),
import("/docs/course/pr_1069/es/_app/immutable/entry/app.ea1cc000.js")
]).then(([kit, app]) => {
kit.start(app, element, {
node_ids: [0, 27],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
149 kB
·
Xet hash:
19ba9261c457b86204b4693315e831ee4146e3258ae23ec5672be1bab53a0d25

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.