Buckets:
| import{s as ms,n as os,o as hs}from"../chunks/scheduler.bdbef820.js";import{S as fs,i as ys,g as i,s as l,r as d,A as Ms,h as r,f as s,c as n,j as ds,u as c,x as p,k as cs,y as us,a as t,v as m,d as o,t as h,w as f}from"../chunks/index.c0aea24a.js";import{C as y}from"../chunks/CodeBlock.6ccca92e.js";import{H as u,E as bs}from"../chunks/EditOnGithub.725ee0c1.js";function ws(Ca){let M,Ue,Te,ge,b,Ze,w,$a=`There are two types of dataset objects, a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>. | |
| Whichever type of dataset you choose to use or create depends on the size of the dataset. | |
| In general, an <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> is ideal for big datasets (think hundreds of GBs!) due to its lazy behavior and speed advantages, while a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> is great for everything else. | |
| This page will compare the differences between a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> to help you pick the right dataset object for you.`,Xe,j,Ie,J,Va=`When you have a regular <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a>, you can access it using <code>my_dataset[0]</code>. This provides random access to the rows. | |
| Such datasets are also called “map-style” datasets. | |
| For example you can download ImageNet-1k like this and access any row:`,ke,T,Ge,_,Ra=`But one caveat is that you must have the entire dataset stored on your disk or in memory, which blocks you from accessing datasets bigger than the disk. | |
| Because it can become inconvenient for big datasets, there exists another type of dataset, the <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>. | |
| When you have an <code>IterableDataset</code>, you can access it using a <code>for</code> loop to load the data progressively as you iterate over the dataset. | |
| This way, only a small fraction of examples is loaded in memory, and you don’t write anything on disk.`,Ce,U,va="For example, you can stream the ImageNet-1k dataset without downloading it on disk:",$e,g,Ve,Z,Ba=`Streaming can read online data without writing any file to disk. | |
| For example, you can stream datasets made out of multiple shards, each of which is hundreds of gigabytes like <a href="https://huggingface.co/datasets/c4" rel="nofollow">C4</a>, <a href="https://huggingface.co/datasets/oscar" rel="nofollow">OSCAR</a> or <a href="https://huggingface.co/datasets/laion/laion2B-en" rel="nofollow">LAION-2B</a>. | |
| Learn more about how to stream a dataset in the <a href="./stream">Dataset Streaming Guide</a>.`,Re,X,Fa="This is not the only difference though, because the “lazy” behavior of an <code>IterableDataset</code> is also present when it comes to dataset creation and processing.",ve,I,Be,k,Ya='You can create a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using lists or dictionaries, and the data is entirely converted to Arrow so you can easily access any row:',Fe,G,Ye,C,xa=`To create an <code>IterableDataset</code> on the other hand, you must provide a “lazy” way to load the data. | |
| In Python, we generally use generator functions. These functions <code>yield</code> one example at a time, which means you can’t access a row by slicing it like a regular <code>Dataset</code>:`,xe,$,ze,V,We,R,za='It is possible to convert local or remote data files to an Arrow <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/main/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>:',Qe,v,Ne,B,Wa="However, this requires a conversion step from CSV to Arrow format, which takes time and disk space if your dataset is big.",De,F,Qa=`To save disk space and skip the conversion step, you can define an <code>IterableDataset</code> by streaming from the local files directly. | |
| This way, the data is read progressively from the local files as you iterate over the dataset:`,He,Y,Se,x,Na=`Many file formats are supported, like CSV, JSONL, and Parquet, as well as image and audio files. | |
| You can find more information in the corresponding guides for loading <a href="./tabular_load">tabular</a>, <a href="./nlp_load">text</a>, <a href="./image_load">vision</a>, and <a href="./audio_load%5D">audio</a> datasets.`,Ee,z,Ae,W,Da=`When you process a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> object using <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a>, the entire dataset is processed immediately and returned. | |
| This is similar to how <code>pandas</code> works for example.`,Le,Q,qe,N,Ha=`On the other hand, due to the “lazy” nature of an <code>IterableDataset</code>, calling <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset.map">IterableDataset.map()</a> does not apply your <code>map</code> function over the full dataset. | |
| Instead, your <code>map</code> function is applied on-the-fly.`,Pe,D,Sa="Because of that, you can chain multiple processing steps and they will all run at once when you start iterating over the dataset:",Ke,H,Oe,S,ea,E,Ea=`When you shuffle a <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a>, you apply an exact shuffling of the dataset. | |
| It works by taking a list of indices <code>[0, 1, 2, ... len(my_dataset) - 1]</code> and shuffling this list. | |
| Then, accessing <code>my_dataset[0]</code> returns the row and index defined by the first element of the indices mapping that has been shuffled:`,aa,A,sa,L,Aa=`Since we don’t have random access to the rows in the case of an <code>IterableDataset</code>, we can’t use a shuffled list of indices and access a row at an arbitrary position. | |
| This prevents the use of exact shuffling. | |
| Instead, a fast approximate shuffling is used in <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>. | |
| It uses a shuffle buffer to sample random examples iteratively from the dataset. | |
| Since the dataset is still read iteratively, it provides excellent speed performance:`,ta,q,la,P,La='But using a shuffle buffer is not enough to provide a satisfactory shuffling for machine learning model training. So <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a> also shuffles the dataset shards if your dataset is made of multiple files or sources:',na,K,ia,O,ra,ee,qa=`Regular <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> objects are based on Arrow which provides fast random access to the rows. | |
| Thanks to memory mapping and the fact that Arrow is an in-memory format, reading data from disk doesn’t do expensive system calls and deserialization. | |
| It provides even faster data loading when iterating using a <code>for</code> loop by iterating on contiguous Arrow record batches.`,pa,ae,Pa=`However as soon as your <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> has an indices mapping (via <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a> for example), the speed can become 10x slower. | |
| This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren’t reading contiguous chunks of data anymore. | |
| To restore the speed, you’d need to rewrite the entire dataset on your disk again using <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.flatten_indices">Dataset.flatten_indices()</a>, which removes the indices mapping. | |
| This may take a lot of time depending on the size of your dataset though:`,da,se,ca,te,Ka=`In this case, we recommend switching to an <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> and leveraging its fast approximate shuffling method <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>. | |
| It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal. | |
| You can also reshuffle the dataset easily:`,ma,le,oa,ne,Oa=`If you’re using your dataset on multiple epochs, the effective seed to shuffle the shards order in the shuffle buffer is <code>seed + epoch</code>. | |
| It makes it easy to reshuffle a dataset between epochs:`,ha,ie,fa,re,es="To restart the iteration of a map-style dataset, you can simply skip the first examples:",ya,pe,Ma,de,as="But if you use a <code>DataLoader</code> with a <code>Sampler</code>, you should instead save the state of your sampler (you might have written a custom sampler that allows resuming).",ua,ce,ss='On the other hand, iterable datasets don’t provide random access to a specific example index to resume from. But you can use <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset.state_dict">IterableDataset.state_dict()</a> and <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset.load_state_dict">IterableDataset.load_state_dict()</a> to resume from a checkpoint instead, similarly to what you can do for models and optimizers:',ba,me,wa,oe,ts="Under the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard and it stores this info in the <code>state_dict</code>.",ja,he,ls=`To resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard. | |
| Then it reads the shard and skips examples until it reaches the exact example from the checkpoint.`,Ja,fe,ns="Therefore restarting a dataset is quite fast, since it will not re-read the shards that have already been iterated on. Still, resuming a dataset is generally not instantaneous since it has to restart reading from the beginning of the current shard and skip examples until it reaches the checkpoint location.",Ta,ye,is='This can be used with the <code>StatefulDataLoader</code> from <code>torchdata</code>, see <a href="./use_with_pytorch#stream-data">streaming with a PyTorch DataLoader</a>.',_a,Me,Ua,ue,rs='If you want to benefit from the “lazy” behavior of an <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> or their speed advantages, you can switch your map-style <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset">Dataset</a> to an <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:',ga,be,Za,we,ps='If you want to shuffle your dataset or <a href="./use_with_pytorch#stream-data">use it with a PyTorch DataLoader</a>, we recommend generating a sharded <a href="/docs/datasets/main/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:',Xa,je,Ia,Je,ka,_e,Ga;return b=new u({props:{title:"Differences between Dataset and IterableDataset",local:"differences-between-dataset-and-iterabledataset",headingTag:"h1"}}),j=new u({props:{title:"Downloading and streaming",local:"downloading-and-streaming",headingTag:"h2"}}),T=new y({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBaW1hZ2VuZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIyaW1hZ2VuZXQtMWslMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUyMCUyMCUyMyUyMGRvd25sb2FkcyUyMHRoZSUyMGZ1bGwlMjBkYXRhc2V0JTBBcHJpbnQoaW1hZ2VuZXQlNUIwJTVEKQ==",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset 
| |
| imagenet = load_dataset(<span class="hljs-string">"imagenet-1k"</span>, split=<span class="hljs-string">"train"</span>) <span class="hljs-comment"># downloads the full dataset</span> | |
| <span class="hljs-built_in">print</span>(imagenet[<span class="hljs-number">0</span>])`,wrap:!1}}),g=new y({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBaW1hZ2VuZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIyaW1hZ2VuZXQtMWslMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyJTJDJTIwc3RyZWFtaW5nJTNEVHJ1ZSklMjAlMjAlMjMlMjB3aWxsJTIwc3RhcnQlMjBsb2FkaW5nJTIwdGhlJTIwZGF0YSUyMHdoZW4lMjBpdGVyYXRlZCUyMG92ZXIlMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBpbWFnZW5ldCUzQSUwQSUyMCUyMCUyMCUyMHByaW50KGV4YW1wbGUpJTBBJTIwJTIwJTIwJTIwYnJlYWs=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| imagenet = load_dataset(<span class="hljs-string">"imagenet-1k"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) <span class="hljs-comment"># will start loading the data when iterated over</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> imagenet: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),I=new u({props:{title:"Creating map-style datasets and iterable datasets",local:"creating-map-style-datasets-and-iterable-datasets",headingTag:"h2"}}),G=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMERhdGFzZXQuZnJvbV9kaWN0KCU3QiUyMmNvbF8xJTIyJTNBJTIwJTVCMCUyQyUyMDElMkMlMjAyJTJDJTIwMyUyQyUyMDQlMkMlMjA1JTJDJTIwNiUyQyUyMDclMkMlMjA4JTJDJTIwOSU1RCU3RCklMEFwcmludChteV9kYXRhc2V0JTVCMCU1RCk=",highlighted:`my_dataset = Dataset.from_dict({<span class="hljs-string">"col_1"</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>]}) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),$=new y({props:{code:"ZGVmJTIwbXlfZ2VuZXJhdG9yKG4pJTNBJTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UobiklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB5aWVsZCUyMCU3QiUyMmNvbF8xJTIyJTNBJTIwaSU3RCUwQSUwQW15X2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBJdGVyYWJsZURhdGFzZXQuZnJvbV9nZW5lcmF0b3IobXlfZ2VuZXJhdG9yJTJDJTIwZ2VuX2t3YXJncyUzRCU3QiUyMm4lMjIlM0ElMjAxMCU3RCklMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBteV9pdGVyYWJsZV9kYXRhc2V0JTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n</span>): | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n): | |
| <span class="hljs-keyword">yield</span> {<span class="hljs-string">"col_1"</span>: i} | |
| my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs={<span class="hljs-string">"n"</span>: <span class="hljs-number">10</span>}) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),V=new u({props:{title:"Loading local files entirely and progressively",local:"loading-local-files-entirely-and-progressively",headingTag:"h2"}}),v=new y({props:{code:"ZGF0YV9maWxlcyUyMCUzRCUyMCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCJTIycGF0aCUyRnRvJTJGZGF0YS5jc3YlMjIlNUQlN0QlMEFteV9kYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmNzdiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEFwcmludChteV9kYXRhc2V0JTVCMCU1RCk=",highlighted:`data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">"path/to/data.csv"</span>]} | |
| my_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),Y=new y({props:{code:"ZGF0YV9maWxlcyUyMCUzRCUyMCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCJTIycGF0aCUyRnRvJTJGZGF0YS5jc3YlMjIlNUQlN0QlMEFteV9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmNzdiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUyMHN0cmVhbWluZyUzRFRydWUpJTBBZm9yJTIwZXhhbXBsZSUyMGluJTIwbXlfaXRlcmFibGVfZGF0YXNldCUzQSUyMCUyMCUyMyUyMHRoaXMlMjByZWFkcyUyMHRoZSUyMENTViUyMGZpbGUlMjBwcm9ncmVzc2l2ZWx5JTIwYXMlMjB5b3UlMjBpdGVyYXRlJTIwb3ZlciUyMHRoZSUyMGRhdGFzZXQlMEElMjAlMjAlMjAlMjBwcmludChleGFtcGxlKSUwQSUyMCUyMCUyMCUyMGJyZWFr",highlighted:`data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">"path/to/data.csv"</span>]} | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># this reads the CSV file progressively as you iterate over the dataset</span> | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),z=new u({props:{title:"Eager data processing and lazy data processing",local:"eager-data-processing-and-lazy-data-processing",headingTag:"h2"}}),Q=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQubWFwKHByb2Nlc3NfZm4pJTIwJTIwJTIzJTIwcHJvY2Vzc19mbiUyMGlzJTIwYXBwbGllZCUyMG9uJTIwYWxsJTIwdGhlJTIwZXhhbXBsZXMlMjBvZiUyMHRoZSUyMGRhdGFzZXQlMEFwcmludChteV9kYXRhc2V0JTVCMCU1RCk=",highlighted:`my_dataset = my_dataset.<span class="hljs-built_in">map</span>(process_fn) <span class="hljs-comment"># process_fn is applied on all the examples of the dataset</span> | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),H=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2l0ZXJhYmxlX2RhdGFzZXQubWFwKHByb2Nlc3NfZm5fMSklMEFteV9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbXlfaXRlcmFibGVfZGF0YXNldC5maWx0ZXIoZmlsdGVyX2ZuKSUwQW15X2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBteV9pdGVyYWJsZV9kYXRhc2V0Lm1hcChwcm9jZXNzX2ZuXzIpJTBBJTBBJTIzJTIwcHJvY2Vzc19mbl8xJTJDJTIwZmlsdGVyX2ZuJTIwYW5kJTIwcHJvY2Vzc19mbl8yJTIwYXJlJTIwYXBwbGllZCUyMG9uLXRoZS1mbHklMjB3aGVuJTIwaXRlcmF0aW5nJTIwb3ZlciUyMHRoZSUyMGRhdGFzZXQlMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBteV9pdGVyYWJsZV9kYXRhc2V0JTNBJTIwJTIwJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_1) | |
| my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">filter</span>(filter_fn) | |
| my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_2) | |
| <span class="hljs-comment"># process_fn_1, filter_fn and process_fn_2 are applied on-the-fly when iterating over the dataset</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),S=new u({props:{title:"Exact and fast approximate shuffling",local:"exact-and-fast-approximate-shuffling",headingTag:"h2"}}),A=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuc2h1ZmZsZShzZWVkJTNENDIpJTBBcHJpbnQobXlfZGF0YXNldCU1QjAlNUQp",highlighted:`my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),q=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2l0ZXJhYmxlX2RhdGFzZXQuc2h1ZmZsZShzZWVkJTNENDIlMkMlMjBidWZmZXJfc2l6ZSUzRDEwMCklMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBteV9pdGVyYWJsZV9kYXRhc2V0JTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`my_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),K=new y({props:{code:"JTIzJTIwU3RyZWFtJTIwZnJvbSUyMHRoZSUyMGludGVybmV0JTBBbXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJkZWVwbWluZCUyRmNvZGVfY29udGVzdHMlMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyJTJDJTIwc3RyZWFtaW5nJTNEVHJ1ZSklMEFteV9pdGVyYWJsZV9kYXRhc2V0Lm5fc2hhcmRzJTIwJTIwJTIzJTIwMzklMEElMEElMjMlMjBTdHJlYW0lMjBmcm9tJTIwbG9jYWwlMjBmaWxlcyUwQWRhdGFfZmlsZXMlMjAlM0QlMjAlN0IlMjJ0cmFpbiUyMiUzQSUyMCU1QmYlMjJwYXRoJTJGdG8lMkZkYXRhXyU3QmklN0QuY3N2JTIyJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UoMTAyNCklNUQlN0QlMEFteV9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmNzdiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUyMHN0cmVhbWluZyUzRFRydWUpJTBBbXlfaXRlcmFibGVfZGF0YXNldC5uX3NoYXJkcyUyMCUyMCUyMyUyMDEwMjQlMEElMEElMjMlMjBGcm9tJTIwYSUyMGdlbmVyYXRvciUyMGZ1bmN0aW9uJTBBZGVmJTIwbXlfZ2VuZXJhdG9yKG4lMkMlMjBzb3VyY2VzKSUzQSUwQSUyMCUyMCUyMCUyMGZvciUyMHNvdXJjZSUyMGluJTIwc291cmNlcyUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGZvciUyMGV4YW1wbGVfaWRfZm9yX2N1cnJlbnRfc291cmNlJTIwaW4lMjByYW5nZShuKSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHlpZWxkJTIwJTdCJTIyZXhhbXBsZV9pZCUyMiUzQSUyMGYlMjIlN0Jzb3VyY2UlN0RfJTdCZXhhbXBsZV9pZF9mb3JfY3VycmVudF9zb3VyY2UlN0QlMjIlN0QlMEElMEFnZW5fa3dhcmdzJTIwJTNEJTIwJTdCJTIybiUyMiUzQSUyMDEwJTJDJTIwJTIyc291cmNlcyUyMiUzQSUyMCU1QmYlMjJwYXRoJTJGdG8lMkZkYXRhXyU3QmklN0QlMjIlMjBmb3IlMjBpJTIwaW4lMjByYW5nZSgxMDI0KSU1RCU3RCUwQW15X2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBJdGVyYWJsZURhdGFzZXQuZnJvbV9nZW5lcmF0b3IobXlfZ2VuZXJhdG9yJTJDJTIwZ2VuX2t3YXJncyUzRGdlbl9rd2FyZ3MpJTBBbXlfaXRlcmFibGVfZGF0YXNldC5uX3NoYXJkcyUyMCUyMCUyMyUyMDEwMjQ=",highlighted:`<span class="hljs-comment"># Stream from the internet</span> | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"deepmind/code_contests"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| my_iterable_dataset.n_shards <span class="hljs-comment"># 39</span> | |
| <span class="hljs-comment"># Stream from local files</span> | |
| data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">f"path/to/data_<span class="hljs-subst">{i}</span>.csv"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]} | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| my_iterable_dataset.n_shards <span class="hljs-comment"># 1024</span> | |
| <span class="hljs-comment"># From a generator function</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n, sources</span>): | |
| <span class="hljs-keyword">for</span> source <span class="hljs-keyword">in</span> sources: | |
| <span class="hljs-keyword">for</span> example_id_for_current_source <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n): | |
| <span class="hljs-keyword">yield</span> {<span class="hljs-string">"example_id"</span>: <span class="hljs-string">f"<span class="hljs-subst">{source}</span>_<span class="hljs-subst">{example_id_for_current_source}</span>"</span>} | |
| gen_kwargs = {<span class="hljs-string">"n"</span>: <span class="hljs-number">10</span>, <span class="hljs-string">"sources"</span>: [<span class="hljs-string">f"path/to/data_<span class="hljs-subst">{i}</span>"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]} | |
| my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs=gen_kwargs) | |
| my_iterable_dataset.n_shards <span class="hljs-comment"># 1024</span>`,wrap:!1}}),O=new u({props:{title:"Speed differences",local:"speed-differences",headingTag:"h2"}}),se=new y({props:{code:"bXlfZGF0YXNldCU1QjAlNUQlMjAlMjAlMjMlMjBmYXN0JTBBbXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuc2h1ZmZsZShzZWVkJTNENDIpJTBBbXlfZGF0YXNldCU1QjAlNUQlMjAlMjAlMjMlMjB1cCUyMHRvJTIwMTB4JTIwc2xvd2VyJTBBbXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuZmxhdHRlbl9pbmRpY2VzKCklMjAlMjAlMjMlMjByZXdyaXRlJTIwdGhlJTIwc2h1ZmZsZWQlMjBkYXRhc2V0JTIwb24lMjBkaXNrJTIwYXMlMjBjb250aWd1b3VzJTIwY2h1bmtzJTIwb2YlMjBkYXRhJTBBbXlfZGF0YXNldCU1QjAlNUQlMjAlMjAlMjMlMjBmYXN0JTIwYWdhaW4=",highlighted:`my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast</span> | |
| my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>) | |
| my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># up to 10x slower</span> | |
| my_dataset = my_dataset.flatten_indices() <span class="hljs-comment"># rewrite the shuffled dataset on disk as contiguous chunks of data</span> | |
| my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast again</span>`,wrap:!1}}),le=new y({props:{code:"Zm9yJTIwZXhhbXBsZSUyMGluJTIwZW51bWVyYXRlKG15X2l0ZXJhYmxlX2RhdGFzZXQpJTNBJTIwJTIwJTIzJTIwZmFzdCUwQSUyMCUyMCUyMCUyMHBhc3MlMEElMEFzaHVmZmxlZF9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbXlfaXRlcmFibGVfZGF0YXNldC5zaHVmZmxlKHNlZWQlM0Q0MiUyQyUyMGJ1ZmZlcl9zaXplJTNEMTAwKSUwQSUwQWZvciUyMGV4YW1wbGUlMjBpbiUyMGVudW1lcmF0ZShzaHVmZmxlZF9pdGVyYWJsZV9kYXRhc2V0KSUzQSUyMCUyMCUyMyUyMGFzJTIwZmFzdCUyMGFzJTIwYmVmb3JlJTBBJTIwJTIwJTIwJTIwcGFzcyUwQSUwQXNodWZmbGVkX2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBteV9pdGVyYWJsZV9kYXRhc2V0LnNodWZmbGUoc2VlZCUzRDEzMzclMkMlMjBidWZmZXJfc2l6ZSUzRDEwMCklMjAlMjAlMjMlMjByZXNodWZmbGluZyUyMHVzaW5nJTIwYW5vdGhlciUyMHNlZWQlMjBpcyUyMGluc3RhbnRhbmVvdXMlMEElMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBlbnVtZXJhdGUoc2h1ZmZsZWRfaXRlcmFibGVfZGF0YXNldCklM0ElMjAlMjAlMjMlMjBzdGlsbCUyMGFzJTIwZmFzdCUyMGFzJTIwYmVmb3JlJTBBJTIwJTIwJTIwJTIwcGFzcw==",highlighted:`<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(my_iterable_dataset): <span class="hljs-comment"># fast</span> | |
| <span class="hljs-keyword">pass</span> | |
| shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(shuffled_iterable_dataset): <span class="hljs-comment"># as fast as before</span> | |
| <span class="hljs-keyword">pass</span> | |
| shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">1337</span>, buffer_size=<span class="hljs-number">100</span>) <span class="hljs-comment"># reshuffling using another seed is instantaneous</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(shuffled_iterable_dataset): <span class="hljs-comment"># still as fast as before</span> | |
| <span class="hljs-keyword">pass</span>`,wrap:!1}}),ie=new y({props:{code:"Zm9yJTIwZXBvY2glMjBpbiUyMHJhbmdlKG5fZXBvY2hzKSUzQSUwQSUyMCUyMCUyMCUyMG15X2l0ZXJhYmxlX2RhdGFzZXQuc2V0X2Vwb2NoKGVwb2NoKSUwQSUyMCUyMCUyMCUyMGZvciUyMGV4YW1wbGUlMjBpbiUyMG15X2l0ZXJhYmxlX2RhdGFzZXQlM0ElMjAlMjAlMjMlMjBmYXN0JTIwJTJCJTIwcmVzaHVmZmxlZCUyMGF0JTIwZWFjaCUyMGVwb2NoJTIwdXNpbmclMjAlNjBlZmZlY3RpdmVfc2VlZCUyMCUzRCUyMHNlZWQlMjAlMkIlMjBlcG9jaCU2MCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHBhc3M=",highlighted:`<span class="hljs-keyword">for</span> epoch <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_epochs): | |
| my_iterable_dataset.set_epoch(epoch) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># fast + reshuffled at each epoch using \`effective_seed = seed + epoch\`</span> | |
| <span class="hljs-keyword">pass</span>`,wrap:!1}}),pe=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuc2VsZWN0KHJhbmdlKHN0YXJ0X2luZGV4JTJDJTIwbGVuKGRhdGFzZXQpKSk=",highlighted:'my_dataset = my_dataset.select(<span class="hljs-built_in">range</span>(start_index, <span class="hljs-built_in">len</span>(dataset)))',wrap:!1}}),me=new y({props:{code:"aXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMERhdGFzZXQuZnJvbV9kaWN0KCU3QiUyMmElMjIlM0ElMjByYW5nZSg2KSU3RCkudG9faXRlcmFibGVfZGF0YXNldChudW1fc2hhcmRzJTNEMyklMEElMjMlMjBzYXZlJTIwaW4lMjB0aGUlMjBtaWRkbGUlMjBvZiUyMHRyYWluaW5nJTBBc3RhdGVfZGljdCUyMCUzRCUyMGl0ZXJhYmxlX2RhdGFzZXQuc3RhdGVfZGljdCgpJTBBJTIzJTIwYW5kJTIwcmVzdW1lJTIwbGF0ZXIlMEFpdGVyYWJsZV9kYXRhc2V0LmxvYWRfc3RhdGVfZGljdChzdGF0ZV9kaWN0KQ==",highlighted:`<span class="hljs-meta">>>> </span>iterable_dataset = Dataset.from_dict({<span class="hljs-string">"a"</span>: <span class="hljs-built_in">range</span>(<span class="hljs-number">6</span>)}).to_iterable_dataset(num_shards=<span class="hljs-number">3</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># save in the middle of training</span> | |
| <span class="hljs-meta">>>> </span>state_dict = iterable_dataset.state_dict() | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># and resume later</span> | |
| <span class="hljs-meta">>>> </span>iterable_dataset.load_state_dict(state_dict)`,wrap:!1}}),Me=new u({props:{title:"Switch from map-style to iterable",local:"switch-from-map-style-to-iterable",headingTag:"h2"}}),be=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQudG9faXRlcmFibGVfZGF0YXNldCgp",highlighted:"my_iterable_dataset = my_dataset.to_iterable_dataset()",wrap:!1}}),je=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQudG9faXRlcmFibGVfZGF0YXNldChudW1fc2hhcmRzJTNEMTAyNCklMEFteV9pdGVyYWJsZV9kYXRhc2V0Lm5fc2hhcmRzJTIwJTIwJTIzJTIwMTAyNA==",highlighted:`my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=<span class="hljs-number">1024</span>) | |
| my_iterable_dataset.n_shards <span class="hljs-comment"># 1024</span>`,wrap:!1}}),Je=new bs({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/about_mapstyle_vs_iterable.mdx"}}),{c(){M=i("meta"),Ue=l(),Te=i("p"),ge=l(),d(b.$$.fragment),Ze=l(),w=i("p"),w.innerHTML=$a,Xe=l(),d(j.$$.fragment),Ie=l(),J=i("p"),J.innerHTML=Va,ke=l(),d(T.$$.fragment),Ge=l(),_=i("p"),_.innerHTML=Ra,Ce=l(),U=i("p"),U.textContent=va,$e=l(),d(g.$$.fragment),Ve=l(),Z=i("p"),Z.innerHTML=Ba,Re=l(),X=i("p"),X.innerHTML=Fa,ve=l(),d(I.$$.fragment),Be=l(),k=i("p"),k.innerHTML=Ya,Fe=l(),d(G.$$.fragment),Ye=l(),C=i("p"),C.innerHTML=xa,xe=l(),d($.$$.fragment),ze=l(),d(V.$$.fragment),We=l(),R=i("p"),R.innerHTML=za,Qe=l(),d(v.$$.fragment),Ne=l(),B=i("p"),B.textContent=Wa,De=l(),F=i("p"),F.innerHTML=Qa,He=l(),d(Y.$$.fragment),Se=l(),x=i("p"),x.innerHTML=Na,Ee=l(),d(z.$$.fragment),Ae=l(),W=i("p"),W.innerHTML=Da,Le=l(),d(Q.$$.fragment),qe=l(),N=i("p"),N.innerHTML=Ha,Pe=l(),D=i("p"),D.textContent=Sa,Ke=l(),d(H.$$.fragment),Oe=l(),d(S.$$.fragment),ea=l(),E=i("p"),E.innerHTML=Ea,aa=l(),d(A.$$.fragment),sa=l(),L=i("p"),L.innerHTML=Aa,ta=l(),d(q.$$.fragment),la=l(),P=i("p"),P.innerHTML=La,na=l(),d(K.$$.fragment),ia=l(),d(O.$$.fragment),ra=l(),ee=i("p"),ee.innerHTML=qa,pa=l(),ae=i("p"),ae.innerHTML=Pa,da=l(),d(se.$$.fragment),ca=l(),te=i("p"),te.innerHTML=Ka,ma=l(),d(le.$$.fragment),oa=l(),ne=i("p"),ne.innerHTML=Oa,ha=l(),d(ie.$$.fragment),fa=l(),re=i("p"),re.textContent=es,ya=l(),d(pe.$$.fragment),Ma=l(),de=i("p"),de.innerHTML=as,ua=l(),ce=i("p"),ce.innerHTML=ss,ba=l(),d(me.$$.fragment),wa=l(),oe=i("p"),oe.innerHTML=ts,ja=l(),he=i("p"),he.textContent=ls,Ja=l(),fe=i("p"),fe.textContent=ns,Ta=l(),ye=i("p"),ye.innerHTML=is,_a=l(),d(Me.$$.fragment),Ua=l(),ue=i("p"),ue.innerHTML=rs,ga=l(),d(be.$$.fragment),Za=l(),we=i("p"),we.innerHTML=ps,Xa=l(),d(je.$$.fragment),Ia=l(),d(Je.$$.fragment),ka=l(),_e=i("p"),this.h()},l(e){const 
a=Ms("svelte-u9bgzb",document.head);M=r(a,"META",{name:!0,content:!0}),a.forEach(s),Ue=n(e),Te=r(e,"P",{}),ds(Te).forEach(s),ge=n(e),c(b.$$.fragment,e),Ze=n(e),w=r(e,"P",{"data-svelte-h":!0}),p(w)!=="svelte-1nndet4"&&(w.innerHTML=$a),Xe=n(e),c(j.$$.fragment,e),Ie=n(e),J=r(e,"P",{"data-svelte-h":!0}),p(J)!=="svelte-1hxjfay"&&(J.innerHTML=Va),ke=n(e),c(T.$$.fragment,e),Ge=n(e),_=r(e,"P",{"data-svelte-h":!0}),p(_)!=="svelte-mjmkw0"&&(_.innerHTML=Ra),Ce=n(e),U=r(e,"P",{"data-svelte-h":!0}),p(U)!=="svelte-p27tn9"&&(U.textContent=va),$e=n(e),c(g.$$.fragment,e),Ve=n(e),Z=r(e,"P",{"data-svelte-h":!0}),p(Z)!=="svelte-qlt0xf"&&(Z.innerHTML=Ba),Re=n(e),X=r(e,"P",{"data-svelte-h":!0}),p(X)!=="svelte-1ik5e8b"&&(X.innerHTML=Fa),ve=n(e),c(I.$$.fragment,e),Be=n(e),k=r(e,"P",{"data-svelte-h":!0}),p(k)!=="svelte-xsnnlf"&&(k.innerHTML=Ya),Fe=n(e),c(G.$$.fragment,e),Ye=n(e),C=r(e,"P",{"data-svelte-h":!0}),p(C)!=="svelte-z2r7k0"&&(C.innerHTML=xa),xe=n(e),c($.$$.fragment,e),ze=n(e),c(V.$$.fragment,e),We=n(e),R=r(e,"P",{"data-svelte-h":!0}),p(R)!=="svelte-1ewb8lm"&&(R.innerHTML=za),Qe=n(e),c(v.$$.fragment,e),Ne=n(e),B=r(e,"P",{"data-svelte-h":!0}),p(B)!=="svelte-gryvti"&&(B.textContent=Wa),De=n(e),F=r(e,"P",{"data-svelte-h":!0}),p(F)!=="svelte-gn3sd"&&(F.innerHTML=Qa),He=n(e),c(Y.$$.fragment,e),Se=n(e),x=r(e,"P",{"data-svelte-h":!0}),p(x)!=="svelte-u916f6"&&(x.innerHTML=Na),Ee=n(e),c(z.$$.fragment,e),Ae=n(e),W=r(e,"P",{"data-svelte-h":!0}),p(W)!=="svelte-15ijpck"&&(W.innerHTML=Da),Le=n(e),c(Q.$$.fragment,e),qe=n(e),N=r(e,"P",{"data-svelte-h":!0}),p(N)!=="svelte-ro2n64"&&(N.innerHTML=Ha),Pe=n(e),D=r(e,"P",{"data-svelte-h":!0}),p(D)!=="svelte-vsuw18"&&(D.textContent=Sa),Ke=n(e),c(H.$$.fragment,e),Oe=n(e),c(S.$$.fragment,e),ea=n(e),E=r(e,"P",{"data-svelte-h":!0}),p(E)!=="svelte-1ecbtre"&&(E.innerHTML=Ea),aa=n(e),c(A.$$.fragment,e),sa=n(e),L=r(e,"P",{"data-svelte-h":!0}),p(L)!=="svelte-q1ca4g"&&(L.innerHTML=Aa),ta=n(e),c(q.$$.fragment,e),la=n(e),P=r(e,"P",{"data-svelte-h":!0}),p(P)!=="svelte-
yru43"&&(P.innerHTML=La),na=n(e),c(K.$$.fragment,e),ia=n(e),c(O.$$.fragment,e),ra=n(e),ee=r(e,"P",{"data-svelte-h":!0}),p(ee)!=="svelte-1mv00j9"&&(ee.innerHTML=qa),pa=n(e),ae=r(e,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-vg0fdl"&&(ae.innerHTML=Pa),da=n(e),c(se.$$.fragment,e),ca=n(e),te=r(e,"P",{"data-svelte-h":!0}),p(te)!=="svelte-qyjr4f"&&(te.innerHTML=Ka),ma=n(e),c(le.$$.fragment,e),oa=n(e),ne=r(e,"P",{"data-svelte-h":!0}),p(ne)!=="svelte-1yhobsx"&&(ne.innerHTML=Oa),ha=n(e),c(ie.$$.fragment,e),fa=n(e),re=r(e,"P",{"data-svelte-h":!0}),p(re)!=="svelte-1vtsyk"&&(re.textContent=es),ya=n(e),c(pe.$$.fragment,e),Ma=n(e),de=r(e,"P",{"data-svelte-h":!0}),p(de)!=="svelte-1qarbjh"&&(de.innerHTML=as),ua=n(e),ce=r(e,"P",{"data-svelte-h":!0}),p(ce)!=="svelte-1wb4guy"&&(ce.innerHTML=ss),ba=n(e),c(me.$$.fragment,e),wa=n(e),oe=r(e,"P",{"data-svelte-h":!0}),p(oe)!=="svelte-ul64r3"&&(oe.innerHTML=ts),ja=n(e),he=r(e,"P",{"data-svelte-h":!0}),p(he)!=="svelte-fghrva"&&(he.textContent=ls),Ja=n(e),fe=r(e,"P",{"data-svelte-h":!0}),p(fe)!=="svelte-1i92o70"&&(fe.textContent=ns),Ta=n(e),ye=r(e,"P",{"data-svelte-h":!0}),p(ye)!=="svelte-1r4w1a2"&&(ye.innerHTML=is),_a=n(e),c(Me.$$.fragment,e),Ua=n(e),ue=r(e,"P",{"data-svelte-h":!0}),p(ue)!=="svelte-1auhyc3"&&(ue.innerHTML=rs),ga=n(e),c(be.$$.fragment,e),Za=n(e),we=r(e,"P",{"data-svelte-h":!0}),p(we)!=="svelte-604gzo"&&(we.innerHTML=ps),Xa=n(e),c(je.$$.fragment,e),Ia=n(e),c(Je.$$.fragment,e),ka=n(e),_e=r(e,"P",{}),ds(_e).forEach(s),this.h()},h(){cs(M,"name","hf:doc:metadata"),cs(M,"content",js)},m(e,a){us(document.head,M),t(e,Ue,a),t(e,Te,a),t(e,ge,a),m(b,e,a),t(e,Ze,a),t(e,w,a),t(e,Xe,a),m(j,e,a),t(e,Ie,a),t(e,J,a),t(e,ke,a),m(T,e,a),t(e,Ge,a),t(e,_,a),t(e,Ce,a),t(e,U,a),t(e,$e,a),m(g,e,a),t(e,Ve,a),t(e,Z,a),t(e,Re,a),t(e,X,a),t(e,ve,a),m(I,e,a),t(e,Be,a),t(e,k,a),t(e,Fe,a),m(G,e,a),t(e,Ye,a),t(e,C,a),t(e,xe,a),m($,e,a),t(e,ze,a),m(V,e,a),t(e,We,a),t(e,R,a),t(e,Qe,a),m(v,e,a),t(e,Ne,a),t(e,B,a),t(e,De,a),t(e,F,a),t(e,He,a),m(Y,e,a),t(e,Se,a),t(
e,x,a),t(e,Ee,a),m(z,e,a),t(e,Ae,a),t(e,W,a),t(e,Le,a),m(Q,e,a),t(e,qe,a),t(e,N,a),t(e,Pe,a),t(e,D,a),t(e,Ke,a),m(H,e,a),t(e,Oe,a),m(S,e,a),t(e,ea,a),t(e,E,a),t(e,aa,a),m(A,e,a),t(e,sa,a),t(e,L,a),t(e,ta,a),m(q,e,a),t(e,la,a),t(e,P,a),t(e,na,a),m(K,e,a),t(e,ia,a),m(O,e,a),t(e,ra,a),t(e,ee,a),t(e,pa,a),t(e,ae,a),t(e,da,a),m(se,e,a),t(e,ca,a),t(e,te,a),t(e,ma,a),m(le,e,a),t(e,oa,a),t(e,ne,a),t(e,ha,a),m(ie,e,a),t(e,fa,a),t(e,re,a),t(e,ya,a),m(pe,e,a),t(e,Ma,a),t(e,de,a),t(e,ua,a),t(e,ce,a),t(e,ba,a),m(me,e,a),t(e,wa,a),t(e,oe,a),t(e,ja,a),t(e,he,a),t(e,Ja,a),t(e,fe,a),t(e,Ta,a),t(e,ye,a),t(e,_a,a),m(Me,e,a),t(e,Ua,a),t(e,ue,a),t(e,ga,a),m(be,e,a),t(e,Za,a),t(e,we,a),t(e,Xa,a),m(je,e,a),t(e,Ia,a),m(Je,e,a),t(e,ka,a),t(e,_e,a),Ga=!0},p:os,i(e){Ga||(o(b.$$.fragment,e),o(j.$$.fragment,e),o(T.$$.fragment,e),o(g.$$.fragment,e),o(I.$$.fragment,e),o(G.$$.fragment,e),o($.$$.fragment,e),o(V.$$.fragment,e),o(v.$$.fragment,e),o(Y.$$.fragment,e),o(z.$$.fragment,e),o(Q.$$.fragment,e),o(H.$$.fragment,e),o(S.$$.fragment,e),o(A.$$.fragment,e),o(q.$$.fragment,e),o(K.$$.fragment,e),o(O.$$.fragment,e),o(se.$$.fragment,e),o(le.$$.fragment,e),o(ie.$$.fragment,e),o(pe.$$.fragment,e),o(me.$$.fragment,e),o(Me.$$.fragment,e),o(be.$$.fragment,e),o(je.$$.fragment,e),o(Je.$$.fragment,e),Ga=!0)},o(e){h(b.$$.fragment,e),h(j.$$.fragment,e),h(T.$$.fragment,e),h(g.$$.fragment,e),h(I.$$.fragment,e),h(G.$$.fragment,e),h($.$$.fragment,e),h(V.$$.fragment,e),h(v.$$.fragment,e),h(Y.$$.fragment,e),h(z.$$.fragment,e),h(Q.$$.fragment,e),h(H.$$.fragment,e),h(S.$$.fragment,e),h(A.$$.fragment,e),h(q.$$.fragment,e),h(K.$$.fragment,e),h(O.$$.fragment,e),h(se.$$.fragment,e),h(le.$$.fragment,e),h(ie.$$.fragment,e),h(pe.$$.fragment,e),h(me.$$.fragment,e),h(Me.$$.fragment,e),h(be.$$.fragment,e),h(je.$$.fragment,e),h(Je.$$.fragment,e),Ga=!1},d(e){e&&(s(Ue),s(Te),s(ge),s(Ze),s(w),s(Xe),s(Ie),s(J),s(ke),s(Ge),s(_),s(Ce),s(U),s($e),s(Ve),s(Z),s(Re),s(X),s(ve),s(Be),s(k),s(Fe),s(Ye),s(C),s(xe),s(ze),s(We),s(R),s(Qe),s(Ne),s
(B),s(De),s(F),s(He),s(Se),s(x),s(Ee),s(Ae),s(W),s(Le),s(qe),s(N),s(Pe),s(D),s(Ke),s(Oe),s(ea),s(E),s(aa),s(sa),s(L),s(ta),s(la),s(P),s(na),s(ia),s(ra),s(ee),s(pa),s(ae),s(da),s(ca),s(te),s(ma),s(oa),s(ne),s(ha),s(fa),s(re),s(ya),s(Ma),s(de),s(ua),s(ce),s(ba),s(wa),s(oe),s(ja),s(he),s(Ja),s(fe),s(Ta),s(ye),s(_a),s(Ua),s(ue),s(ga),s(Za),s(we),s(Xa),s(Ia),s(ka),s(_e)),s(M),f(b,e),f(j,e),f(T,e),f(g,e),f(I,e),f(G,e),f($,e),f(V,e),f(v,e),f(Y,e),f(z,e),f(Q,e),f(H,e),f(S,e),f(A,e),f(q,e),f(K,e),f(O,e),f(se,e),f(le,e),f(ie,e),f(pe,e),f(me,e),f(Me,e),f(be,e),f(je,e),f(Je,e)}}}
// Serialized page metadata: a JSON string holding the page title, its anchor
// ("local"), and one entry per depth-2 section. The fragment's h() hook writes
// this string into the "content" attribute of the <meta name="hf:doc:metadata">
// element it creates in document.head.
const js='{"title":"Differences between Dataset and IterableDataset","local":"differences-between-dataset-and-iterabledataset","sections":[{"title":"Downloading and streaming","local":"downloading-and-streaming","sections":[],"depth":2},{"title":"Creating map-style datasets and iterable datasets","local":"creating-map-style-datasets-and-iterable-datasets","sections":[],"depth":2},{"title":"Loading local files entirely and progressively","local":"loading-local-files-entirely-and-progressively","sections":[],"depth":2},{"title":"Eager data processing and lazy data processing","local":"eager-data-processing-and-lazy-data-processing","sections":[],"depth":2},{"title":"Exact and fast approximate shuffling","local":"exact-and-fast-approximate-shuffling","sections":[],"depth":2},{"title":"Speed differences","local":"speed-differences","sections":[],"depth":2},{"title":"Switch from map-style to iterable","local":"switch-from-map-style-to-iterable","sections":[],"depth":2}],"depth":1}';
// Instance function handed to the component initializer below.
// It passes a callback to `hs` (imported as `o` from the scheduler chunk;
// presumably Svelte's onMount -- TODO confirm) and then, via the comma
// operator, returns an empty array. The callback only reads the "fw" query
// parameter from the current URL and discards the value. The `Ca` parameter
// is never used.
function Js(Ca){return hs(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}
// Page component, exported under the name `component`. Its constructor calls
// super() and then `ys` (imported as `i` from the index chunk; presumably
// Svelte's init -- TODO confirm) with this instance, the constructor argument,
// the instance function `Js`, the fragment factory `ws`, `ms` (imported as `s`
// from the scheduler chunk; presumably safe_not_equal -- TODO confirm), and an
// empty object literal as the last argument.
class Zs extends fs{constructor(M){super(),ys(this,M,Js,ws,ms,{})}}export{Zs as component}; | |
Xet Storage Details
- Size: 40.4 kB
- Xet hash: 97ffb04ebc29bbc5172f23fc79698fdddc4567821e825d0d55a7e4a8f562f5a0
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.