Buckets:
| import{s as Zt,o as Gt,n as Ce}from"../chunks/scheduler.37c15a92.js";import{S as kt,i as Vt,g as m,s as a,r as p,A as Wt,h as u,f as t,c as n,j as xt,u as o,x as y,k as Bt,y as Rt,a as l,v as r,d as c,t as M,w as d}from"../chunks/index.7cb9c9b8.js";import{T as Re}from"../chunks/Tip.d10b3fc9.js";import{Y as Ct}from"../chunks/Youtube.8666c400.js";import{C as J}from"../chunks/CodeBlock.abae2786.js";import{C as Xt}from"../chunks/CourseFloatingBanner.df82c153.js";import{H as Ds,E as Nt}from"../chunks/getInferenceSnippets.f9350a3f.js";/** Et(w): compiled fragment for the static <p> note about DownloadConfig(delete_extracted=True); it is passed as the default slot of the Re instance `b` (Re is the T export of the Tip chunk) further down in this file. Returns an object {c, l, m, p, d}: c creates the <p> with m("p") and assigns the markup f to innerHTML; l obtains the node with u(h,"P",{"data-svelte-h":true}) and reassigns innerHTML only when y(i) is not "svelte-1j3uv2u"; m places the node with l(h,i,T); p is Ce; d calls t(i) when h is truthy. The argument w is never read. NOTE(review): m/u/y/l/t/Ce are imported from ../chunks and are not visible here - presumably Svelte's create-element / claim-for-hydration / content-hash lookup / insert / detach / no-op helpers; confirm against those chunks. */function Et(w){let i,f='✎ By default, 🤗 Datasets will decompress the files needed to load a dataset. If you want to preserve hard drive space, you can pass <code>DownloadConfig(delete_extracted=True)</code> to the <code>download_config</code> argument of <code>load_dataset()</code>. See the <a href="https://huggingface.co/docs/datasets/package_reference/builder_classes#datasets.DownloadConfig" rel="nofollow">documentation</a> for more details.';return{c(){i=m("p"),i.innerHTML=f},l(h){i=u(h,"P",{"data-svelte-h":!0}),y(i)!=="svelte-1j3uv2u"&&(i.innerHTML=f)},m(h,T){l(h,i,T)},p:Ce,d(h){h&&t(i)}}}/** vt(w): same {c, l, m, p, d} shape as Et, for the static "Try it out!" paragraph about measuring RAM while loading a Pile subset; compared against the id "svelte-16kedbl" in l. Passed as the default slot of the Re instance `j` further down. w is never read. */function vt(w){let i,f='✏️ <strong>Try it out!</strong> Pick one of the <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">subsets</a> from the Pile that is larger than your laptop or desktop’s RAM, load it with 🤗 Datasets, and measure the amount of RAM used. Note that to get an accurate measurement, you’ll want to do this in a new process. 
You can find the decompressed sizes of each subset in Table 1 of <a href="https://arxiv.org/abs/2101.00027" rel="nofollow">the Pile paper</a>.';return{c(){i=m("p"),i.innerHTML=f},l(h){i=u(h,"P",{"data-svelte-h":!0}),y(i)!=="svelte-16kedbl"&&(i.innerHTML=f)},m(h,T){l(h,i,T)},p:Ce,d(h){h&&t(i)}}}/** _t(w): same {c, l, m, p, d} shape as Et, for the static tip paragraph about the %%timeit notebook magic; compared against the id "svelte-8b0mae" in l. Passed as the default slot of the Re instance `U` further down. w is never read. */function _t(w){let i,f='💡 In Jupyter notebooks you can also time cells using the <a href="https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-timeit" rel="nofollow"><code>%%timeit</code> magic function</a>.';return{c(){i=m("p"),i.innerHTML=f},l(h){i=u(h,"P",{"data-svelte-h":!0}),y(i)!=="svelte-8b0mae"&&(i.innerHTML=f)},m(h,T){l(h,i,T)},p:Ce,d(h){h&&t(i)}}}/** zt(w): same {c, l, m, p, d} shape as Et, for the static tip paragraph about passing batched=True when tokenizing a streamed dataset; compared against the id "svelte-81vf72" in l. Passed as the default slot of the Re instance `g` further down. w is never read. */function zt(w){let i,f="💡 To speed up tokenization with streaming you can pass <code>batched=True</code>, as we saw in the last section. It will process the examples batch by batch; the default batch size is 1,000 and can be specified with the <code>batch_size</code> argument.";return{c(){i=m("p"),i.innerHTML=f},l(h){i=u(h,"P",{"data-svelte-h":!0}),y(i)!=="svelte-81vf72"&&(i.innerHTML=f)},m(h,T){l(h,i,T)},p:Ce,d(h){h&&t(i)}}}/** Qt(w): same {c, l, m, p, d} shape as Et, for the static "Try it out!" paragraph about building a streaming multilingual corpus from mc4 / oscar; compared against the id "svelte-1e4y0rz" in l. w is never read. NOTE(review): the place where Qt is consumed is not in the visible part of this file - presumably another Re default slot; confirm. */function Qt(w){let i,f='✏️ <strong>Try it out!</strong> Use one of the large Common Crawl corpora like <a href="https://huggingface.co/datasets/mc4" rel="nofollow"><code>mc4</code></a> or <a href="https://huggingface.co/datasets/oscar" rel="nofollow"><code>oscar</code></a> to create a streaming multilingual dataset that represents the spoken proportions of languages in a country of your choice. 
For example, the four national languages in Switzerland are German, French, Italian, and Romansh, so you could try creating a Swiss corpus by sampling the Oscar subsets according to their spoken proportion.';return{c(){i=m("p"),i.innerHTML=f},l(h){i=u(h,"P",{"data-svelte-h":!0}),y(i)!=="svelte-1e4y0rz"&&(i.innerHTML=f)},m(h,T){l(h,i,T)},p:Ce,d(h){h&&t(i)}}}function Ht(w){let i,f,h,T,$,Xe,x,Ne,B,Ps="Nowadays it is not uncommon to find yourself working with multi-gigabyte datasets, especially if you’re planning to pretrain a transformer like BERT or GPT-2 from scratch. In these cases, even <em>loading</em> the data can be a challenge. For example, the WebText corpus used to pretrain GPT-2 consists of over 8 million documents and 40 GB of text — loading this into your laptop’s RAM is likely to give it a heart attack!",Ee,Z,qs="Fortunately, 🤗 Datasets has been designed to overcome these limitations. It frees you from memory management problems by treating datasets as <em>memory-mapped</em> files, and from hard drive limits by <em>streaming</em> the entries in a corpus.",ve,G,_e,k,Os='In this section we’ll explore these features of 🤗 Datasets with a huge 825 GB corpus known as <a href="https://pile.eleuther.ai" rel="nofollow">the Pile</a>. Let’s get started!',ze,V,Qe,W,Ks='The Pile is an English text corpus that was created by <a href="https://www.eleuther.ai" rel="nofollow">EleutherAI</a> for training large-scale language models. It includes a diverse range of datasets, spanning scientific articles, GitHub code repositories, and filtered web text. The training corpus is available in <a href="https://the-eye.eu/public/AI/pile/" rel="nofollow">14 GB chunks</a>, and you can also download several of the <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">individual components</a>. 
Let’s start by taking a look at the PubMed Abstracts dataset, which is a corpus of abstracts from 15 million biomedical publications on <a href="https://pubmed.ncbi.nlm.nih.gov/" rel="nofollow">PubMed</a>. The dataset is in <a href="https://jsonlines.org" rel="nofollow">JSON Lines format</a> and is compressed using the <code>zstandard</code> library, so first we need to install that:',He,R,Fe,C,et='Next, we can load the dataset using the method for remote files that we learned in <a href="/course/chapter5/2">section 2</a>:',Ye,X,Ae,N,Se,E,st="We can see that there are 15,518,009 rows and 2 columns in our dataset — that’s a lot!",Le,b,De,v,tt="Let’s inspect the contents of the first example:",Pe,_,qe,z,Oe,Q,lt="Okay, this looks like the abstract from a medical article. Now let’s see how much RAM we’ve used to load the dataset!",Ke,H,es,F,at='A simple way to measure memory usage in Python is with the <a href="https://psutil.readthedocs.io/en/latest/" rel="nofollow"><code>psutil</code></a> library, which can be installed with <code>pip</code> as follows:',ss,Y,ts,A,nt="It provides a <code>Process</code> class that allows us to check the memory usage of the current process as follows:",ls,S,as,L,ns,D,it="Here the <code>rss</code> attribute refers to the <em>resident set size</em>, which is the fraction of memory that a process occupies in RAM. This measurement also includes the memory used by the Python interpreter and the libraries we’ve loaded, so the actual amount of memory used to load the dataset is a bit smaller. For comparison, let’s see how large the dataset is on disk, using the <code>dataset_size</code> attribute. 
Since the result is expressed in bytes like before, we need to manually convert it to gigabytes:",is,P,ps,q,os,O,pt="Nice — despite it being almost 20 GB large, we’re able to load and access the dataset with much less RAM!",rs,j,cs,K,ot='If you’re familiar with Pandas, this result might come as a surprise because of Wes Kinney’s famous <a href="https://wesmckinney.com/blog/apache-arrow-pandas-internals/" rel="nofollow">rule of thumb</a> that you typically need 5 to 10 times as much RAM as the size of your dataset. So how does 🤗 Datasets solve this memory management problem? 🤗 Datasets treats each dataset as a <a href="https://en.wikipedia.org/wiki/Memory-mapped_file" rel="nofollow">memory-mapped file</a>, which provides a mapping between RAM and filesystem storage that allows the library to access and operate on elements of the dataset without needing to fully load it into memory.',Ms,ee,rt='Memory-mapped files can also be shared across multiple processes, which enables methods like <code>Dataset.map()</code> to be parallelized without needing to move or copy the dataset. Under the hood, these capabilities are all realized by the <a href="https://arrow.apache.org" rel="nofollow">Apache Arrow</a> memory format and <a href="https://arrow.apache.org/docs/python/index.html" rel="nofollow"><code>pyarrow</code></a> library, which make the data loading and processing lightning fast. (For more details about Apache Arrow and comparisons to Pandas, check out <a href="https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a" rel="nofollow">Dejan Simic’s blog post</a>.) To see this in action, let’s run a little speed test by iterating over all the elements in the PubMed Abstracts dataset:',ds,se,hs,te,ms,le,ct="Here we’ve used Python’s <code>timeit</code> module to measure the execution time taken by <code>code_snippet</code>. You’ll typically be able to iterate over a dataset at speed of a few tenths of a GB/s to several GB/s. 
This works great for the vast majority of applications, but sometimes you’ll have to work with a dataset that is too large to even store on your laptop’s hard drive. For example, if we tried to download the Pile in its entirety, we’d need 825 GB of free disk space! To handle these cases, 🤗 Datasets provides a streaming feature that allows us to download and access elements on the fly, without needing to download the whole dataset. Let’s take a look at how this works.",us,U,ys,ae,Js,ne,Mt="To enable dataset streaming you just need to pass the <code>streaming=True</code> argument to the <code>load_dataset()</code> function. For example, let’s load the PubMed Abstracts dataset again, but in streaming mode:",fs,ie,Ts,pe,dt="Instead of the familiar <code>Dataset</code> that we’ve encountered elsewhere in this chapter, the object returned with <code>streaming=True</code> is an <code>IterableDataset</code>. As the name suggests, to access the elements of an <code>IterableDataset</code> we need to iterate over it. We can access the first element of our streamed dataset as follows:",ws,oe,bs,re,js,ce,ht='The elements from a streamed dataset can be processed on the fly using <code>IterableDataset.map()</code>, which is useful during training if you need to tokenize the inputs. The process is exactly the same as the one we used to tokenize our dataset in <a href="/course/chapter3">Chapter 3</a>, with the only difference being that outputs are returned one by one:',Us,Me,gs,de,Is,g,$s,he,mt="You can also shuffle a streamed dataset using <code>IterableDataset.shuffle()</code>, but unlike <code>Dataset.shuffle()</code> this only shuffles the elements in a predefined <code>buffer_size</code>:",xs,me,Bs,ue,Zs,ye,ut="In this example, we selected a random example from the first 10,000 examples in the buffer. Once an example is accessed, its spot in the buffer is filled with the next example in the corpus (i.e., the 10,001st example in the case above). 
You can also select elements from a streamed dataset using the <code>IterableDataset.take()</code> and <code>IterableDataset.skip()</code> functions, which act in a similar way to <code>Dataset.select()</code>. For example, to select the first 5 examples in the PubMed Abstracts dataset we can do the following:",Gs,Je,ks,fe,Vs,Te,yt="Similarly, you can use the <code>IterableDataset.skip()</code> function to create training and validation splits from a shuffled dataset as follows:",Ws,we,Rs,be,Jt="Let’s round out our exploration of dataset streaming with a common application: combining multiple datasets together to create a single corpus. 🤗 Datasets provides an <code>interleave_datasets()</code> function that converts a list of <code>IterableDataset</code> objects into a single <code>IterableDataset</code>, where the elements of the new dataset are obtained by alternating among the source examples. This function is especially useful when you’re trying to combine large datasets, so as an example let’s stream the FreeLaw subset of the Pile, which is a 51 GB dataset of legal opinions from US courts:",Cs,je,Xs,Ue,Ns,ge,ft="This dataset is large enough to stress the RAM of most laptops, yet we’ve been able to load and access it without breaking a sweat! 
Let’s now combine the examples from the FreeLaw and PubMed Abstracts datasets with the <code>interleave_datasets()</code> function:",Es,Ie,vs,$e,_s,xe,Tt="Here we’ve used the <code>islice()</code> function from Python’s <code>itertools</code> module to select the first two examples from the combined dataset, and we can see that they match the first examples from each of the two source datasets.",zs,Be,wt="Finally, if you want to stream the Pile in its 825 GB entirety, you can grab all the prepared files as follows:",Qs,Ze,Hs,Ge,Fs,I,Ys,ke,bt="You now have all the tools you need to load and process datasets of all shapes and sizes — but unless you’re exceptionally lucky, there will come a point in your NLP journey where you’ll have to actually create a dataset to solve the problem at hand. That’s the topic of the next section!",As,Ve,Ss,We,Ls;return $=new Ds({props:{title:"Big data? 🤗 Datasets to the rescue!",local:"big-data-datasets-to-the-rescue",headingTag:"h1"}}),x=new Xt({props:{chapter:5,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/en/chapter5/section4.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/en/chapter5/section4.ipynb"}]}}),G=new Ct({props:{id:"JwISwTCPPWo"}}),V=new Ds({props:{title:"What is the Pile?",local:"what-is-the-pile",headingTag:"h2"}}),R=new J({props:{code:"IXBpcCUyMGluc3RhbGwlMjB6c3RhbmRhcmQ=",highlighted:"!pip install zstandard",wrap:!1}}),X=new 
J({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBJTIzJTIwVGhpcyUyMHRha2VzJTIwYSUyMGZldyUyMG1pbnV0ZXMlMjB0byUyMHJ1biUyQyUyMHNvJTIwZ28lMjBncmFiJTIwYSUyMHRlYSUyMG9yJTIwY29mZmVlJTIwd2hpbGUlMjB5b3UlMjB3YWl0JTIwJTNBKSUwQWRhdGFfZmlsZXMlMjAlM0QlMjAlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGVfcHJlbGltaW5hcnlfY29tcG9uZW50cyUyRlBVQk1FRF90aXRsZV9hYnN0cmFjdHNfMjAxOV9iYXNlbGluZS5qc29ubC56c3QlMjIlMEFwdWJtZWRfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJqc29uJTIyJTJDJTIwZGF0YV9maWxlcyUzRGRhdGFfZmlsZXMlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUwQXB1Ym1lZF9kYXRhc2V0",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-comment"># This takes a few minutes to run, so go grab a tea or coffee while you wait :)</span> | |
| data_files = <span class="hljs-string">"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"</span> | |
| pubmed_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>) | |
| pubmed_dataset`,wrap:!1}}),N=new J({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1QidtZXRhJyUyQyUyMCd0ZXh0JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwMTU1MTgwMDklMEElN0Qp",highlighted:`Dataset({ | |
| features: [<span class="hljs-string">'meta'</span>, <span class="hljs-string">'text'</span>], | |
| num_rows: <span class="hljs-number">15518009</span> | |
| })`,wrap:!1}}),b=new Re({props:{$$slots:{default:[Et]},$$scope:{ctx:w}}}),_=new J({props:{code:"cHVibWVkX2RhdGFzZXQlNUIwJTVE",highlighted:'pubmed_dataset[<span class="hljs-number">0</span>]',wrap:!1}}),z=new J({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24uJTVDblRvJTIwZGV0ZXJtaW5lJTIwdGhlJTIwcHJldmFsZW5jZSUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjBhZ2VkJTIwdW5kZXIlMjA1JTIweWVhcnMlMjBzdWZmZXJpbmclMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb25zJTIwKEFMUkkpJTJDJTIwdGhlJTIwcmlzayUyMGZhY3RvcnMlMjBmb3IlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHVuZGVyJTIwNSUyMHllYXJzJTIwb2YlMjBhZ2UlMjB3aXRoJTIwQUxSSSUyQyUyMGFuZCUyMHRoZSUyMGFzc29jaWF0aW9uJTIwb2YlMjBoeXBveGFlbWlhJTIwd2l0aCUyMGFuJTIwaW5jcmVhc2VkJTIwcmlzayUyMG9mJTIwZHlpbmclMjBpbiUyMGNoaWxkcmVuJTIwb2YlMjB0aGUlMjBzYW1lJTIwYWdlJTIwLi4uJyU3RA==",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'</span>}`,wrap:!1}}),H=new Ds({props:{title:"The magic of memory mapping",local:"the-magic-of-memory-mapping",headingTag:"h2"}}),Y=new J({props:{code:"IXBpcCUyMGluc3RhbGwlMjBwc3V0aWw=",highlighted:"!pip install psutil",wrap:!1}}),S=new J({props:{code:"aW1wb3J0JTIwcHN1dGlsJTBBJTBBJTIzJTIwUHJvY2Vzcy5tZW1vcnlfaW5mbyUyMGlzJTIwZXhwcmVzc2VkJTIwaW4lMjBieXRlcyUyQyUyMHNvJTIwY29udmVydCUyMHRvJTIwbWVnYWJ5dGVzJTBBcHJpbnQoZiUyMlJBTSUyMHVzZWQlM0ElMjAlN0Jwc3V0aWwuUHJvY2VzcygpLm1lbW9yeV9pbmZvKCkucnNzJTIwJTJGJTIwKDEwMjQlMjAqJTIwMTAyNCklM0EuMmYlN0QlMjBNQiUyMik=",highlighted:`<span class="hljs-keyword">import</span> psutil | |
| <span class="hljs-comment"># Process.memory_info is expressed in bytes, so convert to megabytes</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"RAM used: <span class="hljs-subst">{psutil.Process().memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>):<span class="hljs-number">.2</span>f}</span> MB"</span>)`,wrap:!1}}),L=new J({props:{code:"UkFNJTIwdXNlZCUzQSUyMDU2NzguMzMlMjBNQg==",highlighted:'RAM used: <span class="hljs-number">5678.33</span> MB',wrap:!1}}),P=new J({props:{code:"cHJpbnQoZiUyMkRhdGFzZXQlMjBzaXplJTIwaW4lMjBieXRlcyUzQSUyMCU3QnB1Ym1lZF9kYXRhc2V0LmRhdGFzZXRfc2l6ZSU3RCUyMiklMEFzaXplX2diJTIwJTNEJTIwcHVibWVkX2RhdGFzZXQuZGF0YXNldF9zaXplJTIwJTJGJTIwKDEwMjQqKjMpJTBBcHJpbnQoZiUyMkRhdGFzZXQlMjBzaXplJTIwKGNhY2hlJTIwZmlsZSklMjAlM0ElMjAlN0JzaXplX2diJTNBLjJmJTdEJTIwR0IlMjIp",highlighted:`<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Dataset size in bytes: <span class="hljs-subst">{pubmed_dataset.dataset_size}</span>"</span>) | |
| size_gb = pubmed_dataset.dataset_size / (<span class="hljs-number">1024</span>**<span class="hljs-number">3</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Dataset size (cache file) : <span class="hljs-subst">{size_gb:<span class="hljs-number">.2</span>f}</span> GB"</span>)`,wrap:!1}}),q=new J({props:{code:"RGF0YXNldCUyMHNpemUlMjBpbiUyMGJ5dGVzJTIwJTNBJTIwMjA5Nzk0MzcwNTElMEFEYXRhc2V0JTIwc2l6ZSUyMChjYWNoZSUyMGZpbGUpJTIwJTNBJTIwMTkuNTQlMjBHQg==",highlighted:`Dataset size <span class="hljs-keyword">in</span> <span class="hljs-built_in">bytes</span> : <span class="hljs-number">20979437051</span> | |
| Dataset size (cache file) : <span class="hljs-number">19.54</span> GB`,wrap:!1}}),j=new Re({props:{$$slots:{default:[vt]},$$scope:{ctx:w}}}),se=new J({props:{code:"aW1wb3J0JTIwdGltZWl0JTBBJTBBY29kZV9zbmlwcGV0JTIwJTNEJTIwJTIyJTIyJTIyYmF0Y2hfc2l6ZSUyMCUzRCUyMDEwMDAlMEElMEFmb3IlMjBpZHglMjBpbiUyMHJhbmdlKDAlMkMlMjBsZW4ocHVibWVkX2RhdGFzZXQpJTJDJTIwYmF0Y2hfc2l6ZSklM0ElMEElMjAlMjAlMjAlMjBfJTIwJTNEJTIwcHVibWVkX2RhdGFzZXQlNUJpZHglM0FpZHglMjAlMkIlMjBiYXRjaF9zaXplJTVEJTBBJTIyJTIyJTIyJTBBJTBBdGltZSUyMCUzRCUyMHRpbWVpdC50aW1laXQoc3RtdCUzRGNvZGVfc25pcHBldCUyQyUyMG51bWJlciUzRDElMkMlMjBnbG9iYWxzJTNEZ2xvYmFscygpKSUwQXByaW50KCUwQSUyMCUyMCUyMCUyMGYlMjJJdGVyYXRlZCUyMG92ZXIlMjAlN0JsZW4ocHVibWVkX2RhdGFzZXQpJTdEJTIwZXhhbXBsZXMlMjAoYWJvdXQlMjAlN0JzaXplX2diJTNBLjFmJTdEJTIwR0IpJTIwaW4lMjAlMjIlMEElMjAlMjAlMjAlMjBmJTIyJTdCdGltZSUzQS4xZiU3RHMlMkMlMjBpLmUuJTIwJTdCc2l6ZV9nYiUyRnRpbWUlM0EuM2YlN0QlMjBHQiUyRnMlMjIlMEEp",highlighted:`<span class="hljs-keyword">import</span> timeit | |
| code_snippet = <span class="hljs-string">"""batch_size = 1000 | |
| for idx in range(0, len(pubmed_dataset), batch_size): | |
| _ = pubmed_dataset[idx:idx + batch_size] | |
| """</span> | |
| time = timeit.timeit(stmt=code_snippet, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>()) | |
| <span class="hljs-built_in">print</span>( | |
| <span class="hljs-string">f"Iterated over <span class="hljs-subst">{<span class="hljs-built_in">len</span>(pubmed_dataset)}</span> examples (about <span class="hljs-subst">{size_gb:<span class="hljs-number">.1</span>f}</span> GB) in "</span> | |
| <span class="hljs-string">f"<span class="hljs-subst">{time:<span class="hljs-number">.1</span>f}</span>s, i.e. <span class="hljs-subst">{size_gb/time:<span class="hljs-number">.3</span>f}</span> GB/s"</span> | |
| )`,wrap:!1}}),te=new J({props:{code:"J0l0ZXJhdGVkJTIwb3ZlciUyMDE1NTE4MDA5JTIwZXhhbXBsZXMlMjAoYWJvdXQlMjAxOS41JTIwR0IpJTIwaW4lMjA2NC4ycyUyQyUyMGkuZS4lMjAwLjMwNCUyMEdCJTJGcyc=",highlighted:'<span class="hljs-string">'Iterated over 15518009 examples (about 19.5 GB) in 64.2s, i.e. 0.304 GB/s'</span>',wrap:!1}}),U=new Re({props:{$$slots:{default:[_t]},$$scope:{ctx:w}}}),ae=new Ds({props:{title:"Streaming datasets",local:"streaming-datasets",headingTag:"h2"}}),ie=new J({props:{code:"cHVibWVkX2RhdGFzZXRfc3RyZWFtZWQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTBBJTIwJTIwJTIwJTIwJTIyanNvbiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUyMHN0cmVhbWluZyUzRFRydWUlMEEp",highlighted:`pubmed_dataset_streamed = load_dataset( | |
| <span class="hljs-string">"json"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span> | |
| )`,wrap:!1}}),oe=new J({props:{code:"bmV4dChpdGVyKHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkKSk=",highlighted:'<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pubmed_dataset_streamed))',wrap:!1}}),re=new J({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24uJTVDblRvJTIwZGV0ZXJtaW5lJTIwdGhlJTIwcHJldmFsZW5jZSUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjBhZ2VkJTIwdW5kZXIlMjA1JTIweWVhcnMlMjBzdWZmZXJpbmclMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb25zJTIwKEFMUkkpJTJDJTIwdGhlJTIwcmlzayUyMGZhY3RvcnMlMjBmb3IlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHVuZGVyJTIwNSUyMHllYXJzJTIwb2YlMjBhZ2UlMjB3aXRoJTIwQUxSSSUyQyUyMGFuZCUyMHRoZSUyMGFzc29jaWF0aW9uJTIwb2YlMjBoeXBveGFlbWlhJTIwd2l0aCUyMGFuJTIwaW5jcmVhc2VkJTIwcmlzayUyMG9mJTIwZHlpbmclMjBpbiUyMGNoaWxkcmVuJTIwb2YlMjB0aGUlMjBzYW1lJTIwYWdlJTIwLi4uJyU3RA==",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'</span>}`,wrap:!1}}),Me=new J({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJkaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZCUyMiklMEF0b2tlbml6ZWRfZGF0YXNldCUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkLm1hcChsYW1iZGElMjB4JTNBJTIwdG9rZW5pemVyKHglNUIlMjJ0ZXh0JTIyJTVEKSklMEFuZXh0KGl0ZXIodG9rZW5pemVkX2RhdGFzZXQpKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"distilbert-base-uncased"</span>) | |
| tokenized_dataset = pubmed_dataset_streamed.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> x: tokenizer(x[<span class="hljs-string">"text"</span>])) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(tokenized_dataset))`,wrap:!1}}),de=new J({props:{code:"JTdCJ2lucHV0X2lkcyclM0ElMjAlNUIxMDElMkMlMjA0OTU4JTJDJTIwNTE3OCUyQyUyMDQzMjglMkMlMjA2Nzc5JTJDJTIwLi4uJTVEJTJDJTIwJ2F0dGVudGlvbl9tYXNrJyUzQSUyMCU1QjElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTJDJTIwLi4uJTVEJTdE",highlighted:'{<span class="hljs-string">'input_ids'</span>: [<span class="hljs-number">101</span>, <span class="hljs-number">4958</span>, <span class="hljs-number">5178</span>, <span class="hljs-number">4328</span>, <span class="hljs-number">6779</span>, ...], <span class="hljs-string">'attention_mask'</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ...]}',wrap:!1}}),g=new Re({props:{$$slots:{default:[zt]},$$scope:{ctx:w}}}),me=new J({props:{code:"c2h1ZmZsZWRfZGF0YXNldCUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkLnNodWZmbGUoYnVmZmVyX3NpemUlM0QxMF8wMDAlMkMlMjBzZWVkJTNENDIpJTBBbmV4dChpdGVyKHNodWZmbGVkX2RhdGFzZXQpKQ==",highlighted:`shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=<span class="hljs-number">10_000</span>, seed=<span class="hljs-number">42</span>) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(shuffled_dataset))`,wrap:!1}}),ue=new J({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MTA3OTklMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnUmFuZG9taXplZCUyMHN0dWR5JTIwb2YlMjBkb3NlJTIwb3IlMjBzY2hlZHVsZSUyMG1vZGlmaWNhdGlvbiUyMG9mJTIwZ3JhbnVsb2N5dGUlMjBjb2xvbnktc3RpbXVsYXRpbmclMjBmYWN0b3IlMjBpbiUyMHBsYXRpbnVtLWJhc2VkJTIwY2hlbW90aGVyYXB5JTIwZm9yJTIwZWxkZXJseSUyMHBhdGllbnRzJTIwd2l0aCUyMGx1bmclMjBjYW5jZXIlMjAuLi4nJTdE",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11410799</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer ...'</span>}`,wrap:!1}}),Je=new J({props:{code:"ZGF0YXNldF9oZWFkJTIwJTNEJTIwcHVibWVkX2RhdGFzZXRfc3RyZWFtZWQudGFrZSg1KSUwQWxpc3QoZGF0YXNldF9oZWFkKQ==",highlighted:`dataset_head = pubmed_dataset_streamed.take(<span class="hljs-number">5</span>) | |
| <span class="hljs-built_in">list</span>(dataset_head)`,wrap:!1}}),fe=new J({props:{code:"JTVCJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24lMjAuLi4nJTdEJTJDJTBBJTIwJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzUlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnQ2xpbmljYWwlMjBzaWducyUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjB3aXRoJTIwYWN1dGUlMjBsb3dlciUyMHJlc3BpcmF0b3J5JTIwaW5mZWN0aW9uJTNBJTIwaW5kaWNhdG9ycyUyMG9mJTIwb3h5Z2VuJTIwdGhlcmFweSUyMC4uLiclN0QlMkMlMEElMjAlN0InbWV0YSclM0ElMjAlN0IncG1pZCclM0ElMjAxMTQwOTU3NiUyQyUyMCdsYW5ndWFnZSclM0ElMjAnZW5nJyU3RCUyQyUwQSUyMCUyMCd0ZXh0JyUzQSUyMCUyMkh5cG94YWVtaWElMjBpbiUyMGNoaWxkcmVuJTIwd2l0aCUyMHNldmVyZSUyMHBuZXVtb25pYSUyMGluJTIwUGFwdWElMjBOZXclMjBHdWluZWElMjAuLi4lMjIlN0QlMkMlMEElMjAlN0InbWV0YSclM0ElMjAlN0IncG1pZCclM0ElMjAxMTQwOTU3NyUyQyUyMCdsYW5ndWFnZSclM0ElMjAnZW5nJyU3RCUyQyUwQSUyMCUyMCd0ZXh0JyUzQSUyMCdPeHlnZW4lMjBjb25jZW50cmF0b3JzJTIwYW5kJTIwY3lsaW5kZXJzJTIwLi4uJyU3RCUyQyUwQSUyMCU3QidtZXRhJyUzQSUyMCU3QidwbWlkJyUzQSUyMDExNDA5NTc4JTJDJTIwJ2xhbmd1YWdlJyUzQSUyMCdlbmcnJTdEJTJDJTBBJTIwJTIwJ3RleHQnJTNBJTIwJ094eWdlbiUyMHN1cHBseSUyMGluJTIwcnVyYWwlMjBhZnJpY2ElM0ElMjBhJTIwcGVyc29uYWwlMjBleHBlcmllbmNlJTIwLi4uJyU3RCU1RA==",highlighted:`[{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409575</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409576</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">"Hypoxaemia in children with severe pneumonia in Papua New Guinea ..."</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409577</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Oxygen concentrators and cylinders ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409578</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Oxygen supply in rural africa: a personal experience ...'</span>}]`,wrap:!1}}),we=new J({props:{code:"JTIzJTIwU2tpcCUyMHRoZSUyMGZpcnN0JTIwMSUyQzAwMCUyMGV4YW1wbGVzJTIwYW5kJTIwaW5jbHVkZSUyMHRoZSUyMHJlc3QlMjBpbiUyMHRoZSUyMHRyYWluaW5nJTIwc2V0JTBBdHJhaW5fZGF0YXNldCUyMCUzRCUyMHNodWZmbGVkX2RhdGFzZXQuc2tpcCgxMDAwKSUwQSUyMyUyMFRha2UlMjB0aGUlMjBmaXJzdCUyMDElMkMwMDAlMjBleGFtcGxlcyUyMGZvciUyMHRoZSUyMHZhbGlkYXRpb24lMjBzZXQlMEF2YWxpZGF0aW9uX2RhdGFzZXQlMjAlM0QlMjBzaHVmZmxlZF9kYXRhc2V0LnRha2UoMTAwMCk=",highlighted:`<span class="hljs-comment"># Skip the first 1,000 examples and include the rest in the training set</span> | |
| train_dataset = shuffled_dataset.skip(<span class="hljs-number">1000</span>) | |
| <span class="hljs-comment"># Take the first 1,000 examples for the validation set</span> | |
| validation_dataset = shuffled_dataset.take(<span class="hljs-number">1000</span>)`,wrap:!1}}),je=new J({props:{code:"bGF3X2RhdGFzZXRfc3RyZWFtZWQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTBBJTIwJTIwJTIwJTIwJTIyanNvbiUyMiUyQyUwQSUyMCUyMCUyMCUyMGRhdGFfZmlsZXMlM0QlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGVfcHJlbGltaW5hcnlfY29tcG9uZW50cyUyRkZyZWVMYXdfT3BpbmlvbnMuanNvbmwuenN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUwQSUyMCUyMCUyMCUyMHN0cmVhbWluZyUzRFRydWUlMkMlMEEpJTBBbmV4dChpdGVyKGxhd19kYXRhc2V0X3N0cmVhbWVkKSk=",highlighted:`law_dataset_streamed = load_dataset( | |
| <span class="hljs-string">"json"</span>, | |
| data_files=<span class="hljs-string">"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst"</span>, | |
| split=<span class="hljs-string">"train"</span>, | |
| streaming=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(law_dataset_streamed))`,wrap:!1}}),Ue=new J({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ2Nhc2VfSUQnJTNBJTIwJzExMDkyMS5qc29uJyUyQyUwQSUyMCUyMCdjYXNlX2p1cmlzZGljdGlvbiclM0ElMjAnc2NvdHVzLnRhci5neiclMkMlMEElMjAlMjAnZGF0ZV9jcmVhdGVkJyUzQSUyMCcyMDEwLTA0LTI4VDE3JTNBMTIlM0E0OVonJTdEJTJDJTBBJTIwJ3RleHQnJTNBJTIwJyU1Q240NjElMjBVLlMuJTIwMjM4JTIwKDE5ODMpJTVDbk9MSU0lMjBFVCUyMEFMLiU1Q252LiU1Q25XQUtJTkVLT05BJTVDbk5vLiUyMDgxLTE1ODEuJTVDblN1cHJlbWUlMjBDb3VydCUyMG9mJTIwVW5pdGVkJTIwU3RhdGVzLiU1Q25Bcmd1ZWQlMjBKYW51YXJ5JTIwMTklMkMlMjAxOTgzLiU1Q25EZWNpZGVkJTIwQXByaWwlMjAyNiUyQyUyMDE5ODMuJTVDbkNFUlRJT1JBUkklMjBUTyUyMFRIRSUyMFVOSVRFRCUyMFNUQVRFUyUyMENPVVJUJTIwT0YlMjBBUFBFQUxTJTIwRk9SJTIwVEhFJTIwTklOVEglMjBDSVJDVUlUJTVDbioyMzklMjBNaWNoYWVsJTIwQS4lMjBMaWxseSUyQyUyMEZpcnN0JTIwRGVwdXR5JTIwQXR0b3JuZXklMjBHZW5lcmFsJTIwb2YlMjBIYXdhaWklMkMlMjBhcmd1ZWQlMjB0aGUlMjBjYXVzZSUyMGZvciUyMHBldGl0aW9uZXJzLiUyMFdpdGglMjBoaW0lMjBvbiUyMHRoZSUyMGJyaWVmJTIwd2FzJTIwSmFtZXMlMjBILiUyMERhbm5lbmJlcmclMkMlMjBEZXB1dHklMjBBdHRvcm5leSUyMEdlbmVyYWwuLi4nJTdE",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'case_ID'</span>: <span class="hljs-string">'110921.json'</span>, | |
| <span class="hljs-string">'case_jurisdiction'</span>: <span class="hljs-string">'scotus.tar.gz'</span>, | |
| <span class="hljs-string">'date_created'</span>: <span class="hljs-string">'2010-04-28T17:12:49Z'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'</span>}`,wrap:!1}}),Ie=new J({props:{code:"ZnJvbSUyMGl0ZXJ0b29scyUyMGltcG9ydCUyMGlzbGljZSUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGludGVybGVhdmVfZGF0YXNldHMlMEElMEFjb21iaW5lZF9kYXRhc2V0JTIwJTNEJTIwaW50ZXJsZWF2ZV9kYXRhc2V0cyglNUJwdWJtZWRfZGF0YXNldF9zdHJlYW1lZCUyQyUyMGxhd19kYXRhc2V0X3N0cmVhbWVkJTVEKSUwQWxpc3QoaXNsaWNlKGNvbWJpbmVkX2RhdGFzZXQlMkMlMjAyKSk=",highlighted:`<span class="hljs-keyword">from</span> itertools <span class="hljs-keyword">import</span> islice | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> interleave_datasets | |
| combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed]) | |
| <span class="hljs-built_in">list</span>(islice(combined_dataset, <span class="hljs-number">2</span>))`,wrap:!1}}),$e=new J({props:{code:"JTVCJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24lMjAuLi4nJTdEJTJDJTBBJTIwJTdCJ21ldGEnJTNBJTIwJTdCJ2Nhc2VfSUQnJTNBJTIwJzExMDkyMS5qc29uJyUyQyUwQSUyMCUyMCUyMCdjYXNlX2p1cmlzZGljdGlvbiclM0ElMjAnc2NvdHVzLnRhci5neiclMkMlMEElMjAlMjAlMjAnZGF0ZV9jcmVhdGVkJyUzQSUyMCcyMDEwLTA0LTI4VDE3JTNBMTIlM0E0OVonJTdEJTJDJTBBJTIwJTIwJ3RleHQnJTNBJTIwJyU1Q240NjElMjBVLlMuJTIwMjM4JTIwKDE5ODMpJTVDbk9MSU0lMjBFVCUyMEFMLiU1Q252LiU1Q25XQUtJTkVLT05BJTVDbk5vLiUyMDgxLTE1ODEuJTVDblN1cHJlbWUlMjBDb3VydCUyMG9mJTIwVW5pdGVkJTIwU3RhdGVzLiU1Q25Bcmd1ZWQlMjBKYW51YXJ5JTIwMTklMkMlMjAxOTgzLiU1Q25EZWNpZGVkJTIwQXByaWwlMjAyNiUyQyUyMDE5ODMuJTVDbkNFUlRJT1JBUkklMjBUTyUyMFRIRSUyMFVOSVRFRCUyMFNUQVRFUyUyMENPVVJUJTIwT0YlMjBBUFBFQUxTJTIwRk9SJTIwVEhFJTIwTklOVEglMjBDSVJDVUlUJTVDbioyMzklMjBNaWNoYWVsJTIwQS4lMjBMaWxseSUyQyUyMEZpcnN0JTIwRGVwdXR5JTIwQXR0b3JuZXklMjBHZW5lcmFsJTIwb2YlMjBIYXdhaWklMkMlMjBhcmd1ZWQlMjB0aGUlMjBjYXVzZSUyMGZvciUyMHBldGl0aW9uZXJzLiUyMFdpdGglMjBoaW0lMjBvbiUyMHRoZSUyMGJyaWVmJTIwd2FzJTIwSmFtZXMlMjBILiUyMERhbm5lbmJlcmclMkMlMjBEZXB1dHklMjBBdHRvcm5leSUyMEdlbmVyYWwuLi4nJTdEJTVE",highlighted:`[{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'case_ID'</span>: <span class="hljs-string">'110921.json'</span>, | |
| <span class="hljs-string">'case_jurisdiction'</span>: <span class="hljs-string">'scotus.tar.gz'</span>, | |
| <span class="hljs-string">'date_created'</span>: <span class="hljs-string">'2010-04-28T17:12:49Z'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'</span>}]`,wrap:!1}}),Ze=new J({props:{code:"YmFzZV91cmwlMjAlM0QlMjAlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGUlMkYlMjIlMEFkYXRhX2ZpbGVzJTIwJTNEJTIwJTdCJTBBJTIwJTIwJTIwJTIwJTIydHJhaW4lMjIlM0ElMjAlNUJiYXNlX3VybCUyMCUyQiUyMCUyMnRyYWluJTJGJTIyJTIwJTJCJTIwZiUyMiU3QmlkeCUzQTAyZCU3RC5qc29ubC56c3QlMjIlMjBmb3IlMjBpZHglMjBpbiUyMHJhbmdlKDMwKSU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMnZhbGlkYXRpb24lMjIlM0ElMjBiYXNlX3VybCUyMCUyQiUyMCUyMnZhbC5qc29ubC56c3QlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjJ0ZXN0JTIyJTNBJTIwYmFzZV91cmwlMjAlMkIlMjAlMjJ0ZXN0Lmpzb25sLnpzdCUyMiUyQyUwQSU3RCUwQXBpbGVfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJqc29uJTIyJTJDJTIwZGF0YV9maWxlcyUzRGRhdGFfZmlsZXMlMkMlMjBzdHJlYW1pbmclM0RUcnVlKSUwQW5leHQoaXRlcihwaWxlX2RhdGFzZXQlNUIlMjJ0cmFpbiUyMiU1RCkp",highlighted:`base_url = <span class="hljs-string">"https://the-eye.eu/public/AI/pile/"</span> | |
| data_files = { | |
| <span class="hljs-string">"train"</span>: [base_url + <span class="hljs-string">"train/"</span> + <span class="hljs-string">f"<span class="hljs-subst">{idx:02d}</span>.jsonl.zst"</span> <span class="hljs-keyword">for</span> idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">30</span>)], | |
| <span class="hljs-string">"validation"</span>: base_url + <span class="hljs-string">"val.jsonl.zst"</span>, | |
| <span class="hljs-string">"test"</span>: base_url + <span class="hljs-string">"test.jsonl.zst"</span>, | |
| } | |
| pile_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=data_files, streaming=<span class="hljs-literal">True</span>) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pile_dataset[<span class="hljs-string">"train"</span>]))`,wrap:!1}}),Ge=new J({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BpbGVfc2V0X25hbWUnJTNBJTIwJ1BpbGUtQ0MnJTdEJTJDJTBBJTIwJ3RleHQnJTNBJTIwJ0l0JTIwaXMlMjBkb25lJTJDJTIwYW5kJTIwc3VibWl0dGVkLiUyMFlvdSUyMGNhbiUyMHBsYXklMjAlRTIlODAlOUNTdXJ2aXZhbCUyMG9mJTIwdGhlJTIwVGFzdGllc3QlRTIlODAlOUQlMjBvbiUyMEFuZHJvaWQlMkMlMjBhbmQlMjBvbiUyMHRoZSUyMHdlYi4uLiclN0Q=",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pile_set_name'</span>: <span class="hljs-string">'Pile-CC'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'</span>}`,wrap:!1}}),I=new Re({props:{$$slots:{default:[Qt]},$$scope:{ctx:w}}}),Ve=new Nt({props:{source:"https://github.com/huggingface/course/blob/main/chapters/en/chapter5/4.mdx"}}),{c(){i=m("meta"),f=a(),h=m("p"),T=a(),p($.$$.fragment),Xe=a(),p(x.$$.fragment),Ne=a(),B=m("p"),B.innerHTML=Ps,Ee=a(),Z=m("p"),Z.innerHTML=qs,ve=a(),p(G.$$.fragment),_e=a(),k=m("p"),k.innerHTML=Os,ze=a(),p(V.$$.fragment),Qe=a(),W=m("p"),W.innerHTML=Ks,He=a(),p(R.$$.fragment),Fe=a(),C=m("p"),C.innerHTML=et,Ye=a(),p(X.$$.fragment),Ae=a(),p(N.$$.fragment),Se=a(),E=m("p"),E.textContent=st,Le=a(),p(b.$$.fragment),De=a(),v=m("p"),v.textContent=tt,Pe=a(),p(_.$$.fragment),qe=a(),p(z.$$.fragment),Oe=a(),Q=m("p"),Q.textContent=lt,Ke=a(),p(H.$$.fragment),es=a(),F=m("p"),F.innerHTML=at,ss=a(),p(Y.$$.fragment),ts=a(),A=m("p"),A.innerHTML=nt,ls=a(),p(S.$$.fragment),as=a(),p(L.$$.fragment),ns=a(),D=m("p"),D.innerHTML=it,is=a(),p(P.$$.fragment),ps=a(),p(q.$$.fragment),os=a(),O=m("p"),O.textContent=pt,rs=a(),p(j.$$.fragment),cs=a(),K=m("p"),K.innerHTML=ot,Ms=a(),ee=m("p"),ee.innerHTML=rt,ds=a(),p(se.$$.fragment),hs=a(),p(te.$$.fragment),ms=a(),le=m("p"),le.innerHTML=ct,us=a(),p(U.$$.fragment),ys=a(),p(ae.$$.fragment),Js=a(),ne=m("p"),ne.innerHTML=Mt,fs=a(),p(ie.$$.fragment),Ts=a(),pe=m("p"),pe.innerHTML=dt,ws=a(),p(oe.$$.fragment),bs=a(),p(re.$$.fragment),js=a(),ce=m("p"),ce.innerHTML=ht,Us=a(),p(Me.$$.fragment),gs=a(),p(de.$$.fragment),Is=a(),p(g.$$.fragment),$s=a(),he=m("p"),he.innerHTML=mt,xs=a(),p(me.$$.fragment),Bs=a(),p(ue.$$.fragment),Zs=a(),ye=m("p"),ye.innerHTML=ut,Gs=a(),p(Je.$$.fragment),ks=a(),p(fe.$$.fragment),Vs=a(),Te=m("p"),Te.innerHTML=yt,Ws=a(),p(we.$$.fragment),Rs=a(),be=m("p"),be.innerHTML=Jt,Cs=a(),p(je.$$.fragment),Xs=a(),p(Ue.$$.fragment),Ns=a(),ge=m("p"),ge.innerHTML=ft,Es=a(),p(Ie.$$.fragment),vs=a(),p($e.$$.fragm
ent),_s=a(),xe=m("p"),xe.innerHTML=Tt,zs=a(),Be=m("p"),Be.textContent=wt,Qs=a(),p(Ze.$$.fragment),Hs=a(),p(Ge.$$.fragment),Fs=a(),p(I.$$.fragment),Ys=a(),ke=m("p"),ke.textContent=bt,As=a(),p(Ve.$$.fragment),Ss=a(),We=m("p"),this.h()},l(e){const s=Wt("svelte-u9bgzb",document.head);i=u(s,"META",{name:!0,content:!0}),s.forEach(t),f=n(e),h=u(e,"P",{}),xt(h).forEach(t),T=n(e),o($.$$.fragment,e),Xe=n(e),o(x.$$.fragment,e),Ne=n(e),B=u(e,"P",{"data-svelte-h":!0}),y(B)!=="svelte-kyt5th"&&(B.innerHTML=Ps),Ee=n(e),Z=u(e,"P",{"data-svelte-h":!0}),y(Z)!=="svelte-101a1li"&&(Z.innerHTML=qs),ve=n(e),o(G.$$.fragment,e),_e=n(e),k=u(e,"P",{"data-svelte-h":!0}),y(k)!=="svelte-akmr9k"&&(k.innerHTML=Os),ze=n(e),o(V.$$.fragment,e),Qe=n(e),W=u(e,"P",{"data-svelte-h":!0}),y(W)!=="svelte-14mo1po"&&(W.innerHTML=Ks),He=n(e),o(R.$$.fragment,e),Fe=n(e),C=u(e,"P",{"data-svelte-h":!0}),y(C)!=="svelte-16qj1dd"&&(C.innerHTML=et),Ye=n(e),o(X.$$.fragment,e),Ae=n(e),o(N.$$.fragment,e),Se=n(e),E=u(e,"P",{"data-svelte-h":!0}),y(E)!=="svelte-cbv5ht"&&(E.textContent=st),Le=n(e),o(b.$$.fragment,e),De=n(e),v=u(e,"P",{"data-svelte-h":!0}),y(v)!=="svelte-1w1s5mw"&&(v.textContent=tt),Pe=n(e),o(_.$$.fragment,e),qe=n(e),o(z.$$.fragment,e),Oe=n(e),Q=u(e,"P",{"data-svelte-h":!0}),y(Q)!=="svelte-jeugzh"&&(Q.textContent=lt),Ke=n(e),o(H.$$.fragment,e),es=n(e),F=u(e,"P",{"data-svelte-h":!0}),y(F)!=="svelte-1ym3qey"&&(F.innerHTML=at),ss=n(e),o(Y.$$.fragment,e),ts=n(e),A=u(e,"P",{"data-svelte-h":!0}),y(A)!=="svelte-703q8t"&&(A.innerHTML=nt),ls=n(e),o(S.$$.fragment,e),as=n(e),o(L.$$.fragment,e),ns=n(e),D=u(e,"P",{"data-svelte-h":!0}),y(D)!=="svelte-1g80mam"&&(D.innerHTML=it),is=n(e),o(P.$$.fragment,e),ps=n(e),o(q.$$.fragment,e),os=n(e),O=u(e,"P",{"data-svelte-h":!0}),y(O)!=="svelte-e299qv"&&(O.textContent=pt),rs=n(e),o(j.$$.fragment,e),cs=n(e),K=u(e,"P",{"data-svelte-h":!0}),y(K)!=="svelte-19s0dv4"&&(K.innerHTML=ot),Ms=n(e),ee=u(e,"P",{"data-svelte-h":!0}),y(ee)!=="svelte-rzk2et"&&(ee.innerHTML=rt),ds=n(e),o(se.$$.fragmen
t,e),hs=n(e),o(te.$$.fragment,e),ms=n(e),le=u(e,"P",{"data-svelte-h":!0}),y(le)!=="svelte-ozqcdp"&&(le.innerHTML=ct),us=n(e),o(U.$$.fragment,e),ys=n(e),o(ae.$$.fragment,e),Js=n(e),ne=u(e,"P",{"data-svelte-h":!0}),y(ne)!=="svelte-2xtgtq"&&(ne.innerHTML=Mt),fs=n(e),o(ie.$$.fragment,e),Ts=n(e),pe=u(e,"P",{"data-svelte-h":!0}),y(pe)!=="svelte-xqykn4"&&(pe.innerHTML=dt),ws=n(e),o(oe.$$.fragment,e),bs=n(e),o(re.$$.fragment,e),js=n(e),ce=u(e,"P",{"data-svelte-h":!0}),y(ce)!=="svelte-vmgfpy"&&(ce.innerHTML=ht),Us=n(e),o(Me.$$.fragment,e),gs=n(e),o(de.$$.fragment,e),Is=n(e),o(g.$$.fragment,e),$s=n(e),he=u(e,"P",{"data-svelte-h":!0}),y(he)!=="svelte-128fecr"&&(he.innerHTML=mt),xs=n(e),o(me.$$.fragment,e),Bs=n(e),o(ue.$$.fragment,e),Zs=n(e),ye=u(e,"P",{"data-svelte-h":!0}),y(ye)!=="svelte-13vs8h4"&&(ye.innerHTML=ut),Gs=n(e),o(Je.$$.fragment,e),ks=n(e),o(fe.$$.fragment,e),Vs=n(e),Te=u(e,"P",{"data-svelte-h":!0}),y(Te)!=="svelte-88ekky"&&(Te.innerHTML=yt),Ws=n(e),o(we.$$.fragment,e),Rs=n(e),be=u(e,"P",{"data-svelte-h":!0}),y(be)!=="svelte-iuph5y"&&(be.innerHTML=Jt),Cs=n(e),o(je.$$.fragment,e),Xs=n(e),o(Ue.$$.fragment,e),Ns=n(e),ge=u(e,"P",{"data-svelte-h":!0}),y(ge)!=="svelte-cn8r79"&&(ge.innerHTML=ft),Es=n(e),o(Ie.$$.fragment,e),vs=n(e),o($e.$$.fragment,e),_s=n(e),xe=u(e,"P",{"data-svelte-h":!0}),y(xe)!=="svelte-a5vxnl"&&(xe.innerHTML=Tt),zs=n(e),Be=u(e,"P",{"data-svelte-h":!0}),y(Be)!=="svelte-1j6091b"&&(Be.textContent=wt),Qs=n(e),o(Ze.$$.fragment,e),Hs=n(e),o(Ge.$$.fragment,e),Fs=n(e),o(I.$$.fragment,e),Ys=n(e),ke=u(e,"P",{"data-svelte-h":!0}),y(ke)!=="svelte-9015n6"&&(ke.textContent=bt),As=n(e),o(Ve.$$.fragment,e),Ss=n(e),We=u(e,"P",{}),xt(We).forEach(t),this.h()},h(){Bt(i,"name","hf:doc:metadata"),Bt(i,"content",Ft)},m(e,s){Rt(document.head,i),l(e,f,s),l(e,h,s),l(e,T,s),r($,e,s),l(e,Xe,s),r(x,e,s),l(e,Ne,s),l(e,B,s),l(e,Ee,s),l(e,Z,s),l(e,ve,s),r(G,e,s),l(e,_e,s),l(e,k,s),l(e,ze,s),r(V,e,s),l(e,Qe,s),l(e,W,s),l(e,He,s),r(R,e,s),l(e,Fe,s),l(e,C,s),l(e,Ye,s),r(X,e,s),l(e,Ae,s
),r(N,e,s),l(e,Se,s),l(e,E,s),l(e,Le,s),r(b,e,s),l(e,De,s),l(e,v,s),l(e,Pe,s),r(_,e,s),l(e,qe,s),r(z,e,s),l(e,Oe,s),l(e,Q,s),l(e,Ke,s),r(H,e,s),l(e,es,s),l(e,F,s),l(e,ss,s),r(Y,e,s),l(e,ts,s),l(e,A,s),l(e,ls,s),r(S,e,s),l(e,as,s),r(L,e,s),l(e,ns,s),l(e,D,s),l(e,is,s),r(P,e,s),l(e,ps,s),r(q,e,s),l(e,os,s),l(e,O,s),l(e,rs,s),r(j,e,s),l(e,cs,s),l(e,K,s),l(e,Ms,s),l(e,ee,s),l(e,ds,s),r(se,e,s),l(e,hs,s),r(te,e,s),l(e,ms,s),l(e,le,s),l(e,us,s),r(U,e,s),l(e,ys,s),r(ae,e,s),l(e,Js,s),l(e,ne,s),l(e,fs,s),r(ie,e,s),l(e,Ts,s),l(e,pe,s),l(e,ws,s),r(oe,e,s),l(e,bs,s),r(re,e,s),l(e,js,s),l(e,ce,s),l(e,Us,s),r(Me,e,s),l(e,gs,s),r(de,e,s),l(e,Is,s),r(g,e,s),l(e,$s,s),l(e,he,s),l(e,xs,s),r(me,e,s),l(e,Bs,s),r(ue,e,s),l(e,Zs,s),l(e,ye,s),l(e,Gs,s),r(Je,e,s),l(e,ks,s),r(fe,e,s),l(e,Vs,s),l(e,Te,s),l(e,Ws,s),r(we,e,s),l(e,Rs,s),l(e,be,s),l(e,Cs,s),r(je,e,s),l(e,Xs,s),r(Ue,e,s),l(e,Ns,s),l(e,ge,s),l(e,Es,s),r(Ie,e,s),l(e,vs,s),r($e,e,s),l(e,_s,s),l(e,xe,s),l(e,zs,s),l(e,Be,s),l(e,Qs,s),r(Ze,e,s),l(e,Hs,s),r(Ge,e,s),l(e,Fs,s),r(I,e,s),l(e,Ys,s),l(e,ke,s),l(e,As,s),r(Ve,e,s),l(e,Ss,s),l(e,We,s),Ls=!0},p(e,[s]){const jt={};s&2&&(jt.$$scope={dirty:s,ctx:e}),b.$set(jt);const Ut={};s&2&&(Ut.$$scope={dirty:s,ctx:e}),j.$set(Ut);const gt={};s&2&&(gt.$$scope={dirty:s,ctx:e}),U.$set(gt);const It={};s&2&&(It.$$scope={dirty:s,ctx:e}),g.$set(It);const 
$t={};s&2&&($t.$$scope={dirty:s,ctx:e}),I.$set($t)},i(e){Ls||(c($.$$.fragment,e),c(x.$$.fragment,e),c(G.$$.fragment,e),c(V.$$.fragment,e),c(R.$$.fragment,e),c(X.$$.fragment,e),c(N.$$.fragment,e),c(b.$$.fragment,e),c(_.$$.fragment,e),c(z.$$.fragment,e),c(H.$$.fragment,e),c(Y.$$.fragment,e),c(S.$$.fragment,e),c(L.$$.fragment,e),c(P.$$.fragment,e),c(q.$$.fragment,e),c(j.$$.fragment,e),c(se.$$.fragment,e),c(te.$$.fragment,e),c(U.$$.fragment,e),c(ae.$$.fragment,e),c(ie.$$.fragment,e),c(oe.$$.fragment,e),c(re.$$.fragment,e),c(Me.$$.fragment,e),c(de.$$.fragment,e),c(g.$$.fragment,e),c(me.$$.fragment,e),c(ue.$$.fragment,e),c(Je.$$.fragment,e),c(fe.$$.fragment,e),c(we.$$.fragment,e),c(je.$$.fragment,e),c(Ue.$$.fragment,e),c(Ie.$$.fragment,e),c($e.$$.fragment,e),c(Ze.$$.fragment,e),c(Ge.$$.fragment,e),c(I.$$.fragment,e),c(Ve.$$.fragment,e),Ls=!0)},o(e){M($.$$.fragment,e),M(x.$$.fragment,e),M(G.$$.fragment,e),M(V.$$.fragment,e),M(R.$$.fragment,e),M(X.$$.fragment,e),M(N.$$.fragment,e),M(b.$$.fragment,e),M(_.$$.fragment,e),M(z.$$.fragment,e),M(H.$$.fragment,e),M(Y.$$.fragment,e),M(S.$$.fragment,e),M(L.$$.fragment,e),M(P.$$.fragment,e),M(q.$$.fragment,e),M(j.$$.fragment,e),M(se.$$.fragment,e),M(te.$$.fragment,e),M(U.$$.fragment,e),M(ae.$$.fragment,e),M(ie.$$.fragment,e),M(oe.$$.fragment,e),M(re.$$.fragment,e),M(Me.$$.fragment,e),M(de.$$.fragment,e),M(g.$$.fragment,e),M(me.$$.fragment,e),M(ue.$$.fragment,e),M(Je.$$.fragment,e),M(fe.$$.fragment,e),M(we.$$.fragment,e),M(je.$$.fragment,e),M(Ue.$$.fragment,e),M(Ie.$$.fragment,e),M($e.$$.fragment,e),M(Ze.$$.fragment,e),M(Ge.$$.fragment,e),M(I.$$.fragment,e),M(Ve.$$.fragment,e),Ls=!1},d(e){e&&(t(f),t(h),t(T),t(Xe),t(Ne),t(B),t(Ee),t(Z),t(ve),t(_e),t(k),t(ze),t(Qe),t(W),t(He),t(Fe),t(C),t(Ye),t(Ae),t(Se),t(E),t(Le),t(De),t(v),t(Pe),t(qe),t(Oe),t(Q),t(Ke),t(es),t(F),t(ss),t(ts),t(A),t(ls),t(as),t(ns),t(D),t(is),t(ps),t(os),t(O),t(rs),t(cs),t(K),t(Ms),t(ee),t(ds),t(hs),t(ms),t(le),t(us),t(ys),t(Js),t(ne),t(fs),t(Ts),t(pe),t(ws),t(bs),t(js)
,t(ce),t(Us),t(gs),t(Is),t($s),t(he),t(xs),t(Bs),t(Zs),t(ye),t(Gs),t(ks),t(Vs),t(Te),t(Ws),t(Rs),t(be),t(Cs),t(Xs),t(Ns),t(ge),t(Es),t(vs),t(_s),t(xe),t(zs),t(Be),t(Qs),t(Hs),t(Fs),t(Ys),t(ke),t(As),t(Ss),t(We)),t(i),d($,e),d(x,e),d(G,e),d(V,e),d(R,e),d(X,e),d(N,e),d(b,e),d(_,e),d(z,e),d(H,e),d(Y,e),d(S,e),d(L,e),d(P,e),d(q,e),d(j,e),d(se,e),d(te,e),d(U,e),d(ae,e),d(ie,e),d(oe,e),d(re,e),d(Me,e),d(de,e),d(g,e),d(me,e),d(ue,e),d(Je,e),d(fe,e),d(we,e),d(je,e),d(Ue,e),d(Ie,e),d($e,e),d(Ze,e),d(Ge,e),d(I,e),d(Ve,e)}}}
/**
 * Page metadata, stored as a JSON-encoded string (it is not parsed in this module).
 * The encoded object carries the page title, its "local" anchor slug, and a list of
 * depth-2 sections, each with its own title and slug.
 * The fragment's h() method passes this string to Bt(i,"content",Ft), where `i` is the
 * element created with m("meta") and given name "hf:doc:metadata".
 */
const Ft='{"title":"Big data? 🤗 Datasets to the rescue!","local":"big-data-datasets-to-the-rescue","sections":[{"title":"What is the Pile?","local":"what-is-the-pile","sections":[],"depth":2},{"title":"The magic of memory mapping","local":"the-magic-of-memory-mapping","sections":[],"depth":2},{"title":"Streaming datasets","local":"streaming-datasets","sections":[],"depth":2}],"depth":1}';
/**
 * Instance function handed to Vt() by the Kt constructor.
 * Registers a callback through Gt (imported from the scheduler chunk; presumably
 * Svelte's onMount - confirm against that chunk). The callback looks up the "fw"
 * query parameter of the current URL, but the looked-up value is neither stored nor
 * returned, so the callback itself yields undefined.
 *
 * @param {*} w - Not read anywhere in the body.
 * @returns {Array} Always a new empty array.
 */
function Yt(w){return Gt(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}
/**
 * Component class for this page, exported under the name `component`.
 * Extends kt (imported from the index chunk; presumably Svelte's SvelteComponent base
 * class - confirm). The constructor calls super() and then
 * Vt(this, i, Yt, Ht, Zt, {}), i.e. it hands the constructor argument, the instance
 * function Yt, the fragment factory Ht (defined earlier in this file) and the imported
 * Zt to Vt (presumably Svelte's init - confirm), with an empty object as the last
 * argument.
 */
class Kt extends kt{constructor(i){super(),Vt(this,i,Yt,Ht,Zt,{})}}
export{Kt as component}; | |
Xet Storage Details
- Size:
- 50.4 kB
- Xet hash:
- 4a0bc35b38cb3e587ed28b6d29bf233ed3a218e6b3cccac9c0f512d0508bd2d6
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.