Buckets:
| import{s as hs,n as fs,o as ys}from"../chunks/scheduler.d75c11ed.js";import{S as Ms,i as us,e as i,s as l,c as d,h as bs,a as r,d as s,b as n,f as ms,g as c,j as p,k as os,l as ws,m as t,n as m,t as o,o as h,p as f}from"../chunks/index.4ec9dfe9.js";import{C as js,H as u,E as _s}from"../chunks/MermaidChart.svelte_svelte_type_style_lang.ee0f129e.js";import{C as y}from"../chunks/CodeBlock.5919a092.js";function Js(Va){let M,Ue,Te,Ze,b,Xe,w,Ge,j,Ra=`There are two types of dataset objects, a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>. | |
| Whichever type of dataset you choose to use or create depends on the size of the dataset. | |
| In general, an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> is ideal for big datasets (think hundreds of GBs!) due to its lazy behavior and speed advantages, while a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> is great for everything else. | |
| This page will compare the differences between a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> and an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> to help you pick the right dataset object for you.`,Ie,_,ke,J,va=`When you have a regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a>, you can access it using <code>my_dataset[0]</code>. This provides random access to the rows. | |
| Such datasets are also called “map-style” datasets. | |
| For example you can download ImageNet-1k like this and access any row:`,$e,T,Ce,g,Ba=`But one caveat is that you must have the entire dataset stored on your disk or in memory, which blocks you from accessing datasets bigger than the disk. | |
| Because it can become inconvenient for big datasets, there exists another type of dataset, the <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>. | |
| When you have an <code>IterableDataset</code>, you can access it using a <code>for</code> loop to load the data progressively as you iterate over the dataset. | |
| This way, only a small fraction of examples is loaded in memory, and you don’t write anything on disk.`,Ve,U,Fa="For example, you can stream the ImageNet-1k dataset without downloading it on disk:",Re,Z,ve,X,xa=`Streaming can read online data without writing any file to disk. | |
| For example, you can stream datasets made out of multiple shards, each of which is hundreds of gigabytes like <a href="https://huggingface.co/datasets/c4" rel="nofollow">C4</a> or <a href="https://huggingface.co/datasets/laion/laion2B-en" rel="nofollow">LAION-2B</a>. | |
| Learn more about how to stream a dataset in the <a href="./stream">Dataset Streaming Guide</a>.`,Be,G,Ya="This is not the only difference though, because the “lazy” behavior of an <code>IterableDataset</code> is also present when it comes to dataset creation and processing.",Fe,I,xe,k,za='You can create a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using lists or dictionaries, and the data is entirely converted to Arrow so you can easily access any row:',Ye,$,ze,C,Wa=`To create an <code>IterableDataset</code> on the other hand, you must provide a “lazy” way to load the data. | |
| In Python, we generally use generator functions. These functions <code>yield</code> one example at a time, which means you can’t access a row by slicing it like a regular <code>Dataset</code>:`,We,V,Ne,R,Qe,v,Na='It is possible to convert local or remote data files to an Arrow <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/pr_8021/en/package_reference/loading_methods#datasets.load_dataset">load_dataset()</a>:',De,B,He,F,Qa="However, this requires a conversion step from CSV to Arrow format, which takes time and disk space if your dataset is big.",Ae,x,Da=`To save disk space and skip the conversion step, you can define an <code>IterableDataset</code> by streaming from the local files directly. | |
| This way, the data is read progressively from the local files as you iterate over the dataset:`,Ee,Y,Se,z,Ha=`Many file formats are supported, like CSV, JSONL, and Parquet, as well as image and audio files. | |
| You can find more information in the corresponding guides for loading <a href="./tabular_load">tabular</a>, <a href="./nlp_load">text</a>, <a href="./image_load">vision</a>, and <a href="./audio_load%5D">audio</a> datasets.`,Le,W,qe,N,Aa=`When you process a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> object using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.map">Dataset.map()</a>, the entire dataset is processed immediately and returned. | |
| This is similar to how <code>pandas</code> works for example.`,Pe,Q,Ke,D,Ea=`On the other hand, due to the “lazy” nature of an <code>IterableDataset</code>, calling <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.map">IterableDataset.map()</a> does not apply your <code>map</code> function over the full dataset. | |
| Instead, your <code>map</code> function is applied on-the-fly.`,Oe,H,Sa="Because of that, you can chain multiple processing steps and they will all run at once when you start iterating over the dataset:",ea,A,aa,E,sa,S,La=`When you shuffle a <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a>, you apply an exact shuffling of the dataset. | |
| It works by taking a list of indices <code>[0, 1, 2, ... len(my_dataset) - 1]</code> and shuffling this list. | |
| Then, accessing <code>my_dataset[0]</code> returns the row and index defined by the first element of the indices mapping that has been shuffled:`,ta,L,la,q,qa=`Since we don’t have random access to the rows in the case of an <code>IterableDataset</code>, we can’t use a shuffled list of indices and access a row at an arbitrary position. | |
| This prevents the use of exact shuffling. | |
| Instead, a fast approximate shuffling is used in <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>. | |
| It uses a shuffle buffer to sample random examples iteratively from the dataset. | |
| Since the dataset is still read iteratively, it provides excellent speed performance:`,na,P,ia,K,Pa='But using a shuffle buffer is not enough to provide a satisfactory shuffling for machine learning model training. So <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a> also shuffles the dataset shards if your dataset is made of multiple files or sources:',ra,O,pa,ee,da,ae,Ka=`Regular <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> objects are based on Arrow which provides fast random access to the rows. | |
| Thanks to memory mapping and the fact that Arrow is an in-memory format, reading data from disk doesn’t do expensive system calls and deserialization. | |
| It provides even faster data loading when iterating using a <code>for</code> loop by iterating on contiguous Arrow record batches.`,ca,se,Oa=`However as soon as your <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> has an indices mapping (via <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.shuffle">Dataset.shuffle()</a> for example), the speed can become 10x slower. | |
| This is because there is an extra step to get the row index to read using the indices mapping, and most importantly, you aren’t reading contiguous chunks of data anymore. | |
| To restore the speed, you’d need to rewrite the entire dataset on your disk again using <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset.flatten_indices">Dataset.flatten_indices()</a>, which removes the indices mapping. | |
| This may take a lot of time depending on the size of your dataset though:`,ma,te,oa,le,es=`In this case, we recommend switching to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> and leveraging its fast approximate shuffling method <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.shuffle">IterableDataset.shuffle()</a>. | |
| It only shuffles the shards order and adds a shuffle buffer to your dataset, which keeps the speed of your dataset optimal. | |
| You can also reshuffle the dataset easily:`,ha,ne,fa,ie,as=`If you’re using your dataset on multiple epochs, the effective seed to shuffle the shards order in the shuffle buffer is <code>seed + epoch</code>. | |
| It makes it easy to reshuffle a dataset between epochs:`,ya,re,Ma,pe,ss="To restart the iteration of a map-style dataset, you can simply skip the first examples:",ua,de,ba,ce,ts="But if you use a <code>DataLoader</code> with a <code>Sampler</code>, you should instead save the state of your sampler (you might have written a custom sampler that allows resuming).",wa,me,ls='On the other hand, iterable datasets don’t provide random access to a specific example index to resume from. But you can use <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.state_dict">IterableDataset.state_dict()</a> and <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset.load_state_dict">IterableDataset.load_state_dict()</a> to resume from a checkpoint instead, similarly to what you can do for models and optimizers:',ja,oe,_a,he,ns="Under the hood, the iterable dataset keeps track of the current shard being read and the example index in the current shard and it stores this info in the <code>state_dict</code>.",Ja,fe,is=`To resume from a checkpoint, the dataset skips all the shards that were previously read to restart from the current shard. | |
| Then it reads the shard and skips examples until it reaches the exact example from the checkpoint.`,Ta,ye,rs="Therefore restarting a dataset is quite fast, since it will not re-read the shards that have already been iterated on. Still, resuming a dataset is generally not instantaneous since it has to restart reading from the beginning of the current shard and skip examples until it reaches the checkpoint location.",ga,Me,ps='This can be used with the <code>StatefulDataLoader</code> from <code>torchdata</code>, see <a href="./use_with_pytorch#stream-data">streaming with a PyTorch DataLoader</a>.',Ua,ue,Za,be,ds='If you want to benefit from the “lazy” behavior of an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a> or their speed advantages, you can switch your map-style <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.Dataset">Dataset</a> to an <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:',Xa,we,Ga,je,cs='If you want to shuffle your dataset or <a href="./use_with_pytorch#stream-data">use it with a PyTorch DataLoader</a>, we recommend generating a sharded <a href="/docs/datasets/pr_8021/en/package_reference/main_classes#datasets.IterableDataset">IterableDataset</a>:',Ia,_e,ka,Je,$a,ge,Ca;return b=new js({props:{containerStyle:"float: right; margin-left: 10px; display: inline-flex; position: relative; z-index: 10;"}}),w=new u({props:{title:"Differences between Dataset and IterableDataset",local:"differences-between-dataset-and-iterabledataset",headingTag:"h1"}}),_=new u({props:{title:"Downloading and streaming",local:"downloading-and-streaming",headingTag:"h2"}}),T=new 
y({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBaW1hZ2VuZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydGltbSUyRmltYWdlbmV0LTFrLXdkcyUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTIwJTIwJTIzJTIwZG93bmxvYWRzJTIwdGhlJTIwZnVsbCUyMGRhdGFzZXQlMEFwcmludChpbWFnZW5ldCU1QjAlNUQp",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| imagenet = load_dataset(<span class="hljs-string">"timm/imagenet-1k-wds"</span>, split=<span class="hljs-string">"train"</span>) <span class="hljs-comment"># downloads the full dataset</span> | |
| <span class="hljs-built_in">print</span>(imagenet[<span class="hljs-number">0</span>])`,wrap:!1}}),Z=new y({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBaW1hZ2VuZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydGltbSUyRmltYWdlbmV0LTFrLXdkcyUyMiUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIlMkMlMjBzdHJlYW1pbmclM0RUcnVlKSUyMCUyMCUyMyUyMHdpbGwlMjBzdGFydCUyMGxvYWRpbmclMjB0aGUlMjBkYXRhJTIwd2hlbiUyMGl0ZXJhdGVkJTIwb3ZlciUwQWZvciUyMGV4YW1wbGUlMjBpbiUyMGltYWdlbmV0JTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| imagenet = load_dataset(<span class="hljs-string">"timm/imagenet-1k-wds"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) <span class="hljs-comment"># will start loading the data when iterated over</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> imagenet: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),I=new u({props:{title:"Creating map-style datasets and iterable datasets",local:"creating-map-style-datasets-and-iterable-datasets",headingTag:"h2"}}),$=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMERhdGFzZXQuZnJvbV9kaWN0KCU3QiUyMmNvbF8xJTIyJTNBJTIwJTVCMCUyQyUyMDElMkMlMjAyJTJDJTIwMyUyQyUyMDQlMkMlMjA1JTJDJTIwNiUyQyUyMDclMkMlMjA4JTJDJTIwOSU1RCU3RCklMEFwcmludChteV9kYXRhc2V0JTVCMCU1RCk=",highlighted:`my_dataset = Dataset.from_dict({<span class="hljs-string">"col_1"</span>: [<span class="hljs-number">0</span>, <span class="hljs-number">1</span>, <span class="hljs-number">2</span>, <span class="hljs-number">3</span>, <span class="hljs-number">4</span>, <span class="hljs-number">5</span>, <span class="hljs-number">6</span>, <span class="hljs-number">7</span>, <span class="hljs-number">8</span>, <span class="hljs-number">9</span>]}) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),V=new y({props:{code:"ZGVmJTIwbXlfZ2VuZXJhdG9yKG4pJTNBJTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UobiklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB5aWVsZCUyMCU3QiUyMmNvbF8xJTIyJTNBJTIwaSU3RCUwQSUwQW15X2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBJdGVyYWJsZURhdGFzZXQuZnJvbV9nZW5lcmF0b3IobXlfZ2VuZXJhdG9yJTJDJTIwZ2VuX2t3YXJncyUzRCU3QiUyMm4lMjIlM0ElMjAxMCU3RCklMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBteV9pdGVyYWJsZV9kYXRhc2V0JTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n</span>): | |
| <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n): | |
| <span class="hljs-keyword">yield</span> {<span class="hljs-string">"col_1"</span>: i} | |
| my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs={<span class="hljs-string">"n"</span>: <span class="hljs-number">10</span>}) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),R=new u({props:{title:"Loading local files entirely and progressively",local:"loading-local-files-entirely-and-progressively",headingTag:"h2"}}),B=new y({props:{code:"ZGF0YV9maWxlcyUyMCUzRCUyMCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCJTIycGF0aCUyRnRvJTJGZGF0YS5jc3YlMjIlNUQlN0QlMEFteV9kYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmNzdiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEFwcmludChteV9kYXRhc2V0JTVCMCU1RCk=",highlighted:`data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">"path/to/data.csv"</span>]} | |
| my_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),Y=new y({props:{code:"ZGF0YV9maWxlcyUyMCUzRCUyMCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCJTIycGF0aCUyRnRvJTJGZGF0YS5jc3YlMjIlNUQlN0QlMEFteV9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmNzdiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUyMHN0cmVhbWluZyUzRFRydWUpJTBBZm9yJTIwZXhhbXBsZSUyMGluJTIwbXlfaXRlcmFibGVfZGF0YXNldCUzQSUyMCUyMCUyMyUyMHRoaXMlMjByZWFkcyUyMHRoZSUyMENTViUyMGZpbGUlMjBwcm9ncmVzc2l2ZWx5JTIwYXMlMjB5b3UlMjBpdGVyYXRlJTIwb3ZlciUyMHRoZSUyMGRhdGFzZXQlMEElMjAlMjAlMjAlMjBwcmludChleGFtcGxlKSUwQSUyMCUyMCUyMCUyMGJyZWFr",highlighted:`data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">"path/to/data.csv"</span>]} | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># this reads the CSV file progressively as you iterate over the dataset</span> | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),W=new u({props:{title:"Eager data processing and lazy data processing",local:"eager-data-processing-and-lazy-data-processing",headingTag:"h2"}}),Q=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQubWFwKHByb2Nlc3NfZm4pJTIwJTIwJTIzJTIwcHJvY2Vzc19mbiUyMGlzJTIwYXBwbGllZCUyMG9uJTIwYWxsJTIwdGhlJTIwZXhhbXBsZXMlMjBvZiUyMHRoZSUyMGRhdGFzZXQlMEFwcmludChteV9kYXRhc2V0JTVCMCU1RCk=",highlighted:`my_dataset = my_dataset.<span class="hljs-built_in">map</span>(process_fn) <span class="hljs-comment"># process_fn is applied on all the examples of the dataset</span> | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),A=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2l0ZXJhYmxlX2RhdGFzZXQubWFwKHByb2Nlc3NfZm5fMSklMEFteV9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbXlfaXRlcmFibGVfZGF0YXNldC5maWx0ZXIoZmlsdGVyX2ZuKSUwQW15X2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBteV9pdGVyYWJsZV9kYXRhc2V0Lm1hcChwcm9jZXNzX2ZuXzIpJTBBJTBBJTIzJTIwcHJvY2Vzc19mbl8xJTJDJTIwZmlsdGVyX2ZuJTIwYW5kJTIwcHJvY2Vzc19mbl8yJTIwYXJlJTIwYXBwbGllZCUyMG9uLXRoZS1mbHklMjB3aGVuJTIwaXRlcmF0aW5nJTIwb3ZlciUyMHRoZSUyMGRhdGFzZXQlMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBteV9pdGVyYWJsZV9kYXRhc2V0JTNBJTIwJTIwJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_1) | |
| my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">filter</span>(filter_fn) | |
| my_iterable_dataset = my_iterable_dataset.<span class="hljs-built_in">map</span>(process_fn_2) | |
| <span class="hljs-comment"># process_fn_1, filter_fn and process_fn_2 are applied on-the-fly when iterating over the dataset</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),E=new u({props:{title:"Exact and fast approximate shuffling",local:"exact-and-fast-approximate-shuffling",headingTag:"h2"}}),L=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuc2h1ZmZsZShzZWVkJTNENDIpJTBBcHJpbnQobXlfZGF0YXNldCU1QjAlNUQp",highlighted:`my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>) | |
| <span class="hljs-built_in">print</span>(my_dataset[<span class="hljs-number">0</span>])`,wrap:!1}}),P=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2l0ZXJhYmxlX2RhdGFzZXQuc2h1ZmZsZShzZWVkJTNENDIlMkMlMjBidWZmZXJfc2l6ZSUzRDEwMCklMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBteV9pdGVyYWJsZV9kYXRhc2V0JTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZXhhbXBsZSklMEElMjAlMjAlMjAlMjBicmVhaw==",highlighted:`my_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: | |
| <span class="hljs-built_in">print</span>(example) | |
| <span class="hljs-keyword">break</span>`,wrap:!1}}),O=new y({props:{code:"JTIzJTIwU3RyZWFtJTIwZnJvbSUyMHRoZSUyMGludGVybmV0JTBBbXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJkZWVwbWluZCUyRmNvZGVfY29udGVzdHMlMjIlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyJTJDJTIwc3RyZWFtaW5nJTNEVHJ1ZSklMEFteV9pdGVyYWJsZV9kYXRhc2V0Lm51bV9zaGFyZHMlMjAlMjAlMjMlMjAzOSUwQSUwQSUyMyUyMFN0cmVhbSUyMGZyb20lMjBsb2NhbCUyMGZpbGVzJTBBZGF0YV9maWxlcyUyMCUzRCUyMCU3QiUyMnRyYWluJTIyJTNBJTIwJTVCZiUyMnBhdGglMkZ0byUyRmRhdGFfJTdCaSU3RC5jc3YlMjIlMjBmb3IlMjBpJTIwaW4lMjByYW5nZSgxMDI0KSU1RCU3RCUwQW15X2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIyY3N2JTIyJTJDJTIwZGF0YV9maWxlcyUzRGRhdGFfZmlsZXMlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyJTJDJTIwc3RyZWFtaW5nJTNEVHJ1ZSklMEFteV9pdGVyYWJsZV9kYXRhc2V0Lm51bV9zaGFyZHMlMjAlMjAlMjMlMjAxMDI0JTBBJTBBJTIzJTIwRnJvbSUyMGElMjBnZW5lcmF0b3IlMjBmdW5jdGlvbiUwQWRlZiUyMG15X2dlbmVyYXRvcihuJTJDJTIwc291cmNlcyklM0ElMEElMjAlMjAlMjAlMjBmb3IlMjBzb3VyY2UlMjBpbiUyMHNvdXJjZXMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmb3IlMjBleGFtcGxlX2lkX2Zvcl9jdXJyZW50X3NvdXJjZSUyMGluJTIwcmFuZ2UobiklM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB5aWVsZCUyMCU3QiUyMmV4YW1wbGVfaWQlMjIlM0ElMjBmJTIyJTdCc291cmNlJTdEXyU3QmV4YW1wbGVfaWRfZm9yX2N1cnJlbnRfc291cmNlJTdEJTIyJTdEJTBBJTBBZ2VuX2t3YXJncyUyMCUzRCUyMCU3QiUyMm4lMjIlM0ElMjAxMCUyQyUyMCUyMnNvdXJjZXMlMjIlM0ElMjAlNUJmJTIycGF0aCUyRnRvJTJGZGF0YV8lN0JpJTdEJTIyJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UoMTAyNCklNUQlN0QlMEFteV9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwSXRlcmFibGVEYXRhc2V0LmZyb21fZ2VuZXJhdG9yKG15X2dlbmVyYXRvciUyQyUyMGdlbl9rd2FyZ3MlM0RnZW5fa3dhcmdzKSUwQW15X2l0ZXJhYmxlX2RhdGFzZXQubnVtX3NoYXJkcyUyMCUyMCUyMyUyMDEwMjQ=",highlighted:`<span class="hljs-comment"># Stream from the internet</span> | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"deepmind/code_contests"</span>, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 39</span> | |
| <span class="hljs-comment"># Stream from local files</span> | |
| data_files = {<span class="hljs-string">"train"</span>: [<span class="hljs-string">f"path/to/data_<span class="hljs-subst">{i}</span>.csv"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]} | |
| my_iterable_dataset = load_dataset(<span class="hljs-string">"csv"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span>) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span> | |
| <span class="hljs-comment"># From a generator function</span> | |
| <span class="hljs-keyword">def</span> <span class="hljs-title function_">my_generator</span>(<span class="hljs-params">n, sources</span>): | |
| <span class="hljs-keyword">for</span> source <span class="hljs-keyword">in</span> sources: | |
| <span class="hljs-keyword">for</span> example_id_for_current_source <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n): | |
| <span class="hljs-keyword">yield</span> {<span class="hljs-string">"example_id"</span>: <span class="hljs-string">f"<span class="hljs-subst">{source}</span>_<span class="hljs-subst">{example_id_for_current_source}</span>"</span>} | |
| gen_kwargs = {<span class="hljs-string">"n"</span>: <span class="hljs-number">10</span>, <span class="hljs-string">"sources"</span>: [<span class="hljs-string">f"path/to/data_<span class="hljs-subst">{i}</span>"</span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">1024</span>)]} | |
| my_iterable_dataset = IterableDataset.from_generator(my_generator, gen_kwargs=gen_kwargs) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span>`,wrap:!1}}),ee=new u({props:{title:"Speed differences",local:"speed-differences",headingTag:"h2"}}),te=new y({props:{code:"bXlfZGF0YXNldCU1QjAlNUQlMjAlMjAlMjMlMjBmYXN0JTBBbXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuc2h1ZmZsZShzZWVkJTNENDIpJTBBbXlfZGF0YXNldCU1QjAlNUQlMjAlMjAlMjMlMjB1cCUyMHRvJTIwMTB4JTIwc2xvd2VyJTBBbXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuZmxhdHRlbl9pbmRpY2VzKCklMjAlMjAlMjMlMjByZXdyaXRlJTIwdGhlJTIwc2h1ZmZsZWQlMjBkYXRhc2V0JTIwb24lMjBkaXNrJTIwYXMlMjBjb250aWd1b3VzJTIwY2h1bmtzJTIwb2YlMjBkYXRhJTBBbXlfZGF0YXNldCU1QjAlNUQlMjAlMjAlMjMlMjBmYXN0JTIwYWdhaW4=",highlighted:`my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast</span> | |
| my_dataset = my_dataset.shuffle(seed=<span class="hljs-number">42</span>) | |
| my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># up to 10x slower</span> | |
| my_dataset = my_dataset.flatten_indices() <span class="hljs-comment"># rewrite the shuffled dataset on disk as contiguous chunks of data</span> | |
| my_dataset[<span class="hljs-number">0</span>] <span class="hljs-comment"># fast again</span>`,wrap:!1}}),ne=new y({props:{code:"Zm9yJTIwZXhhbXBsZSUyMGluJTIwZW51bWVyYXRlKG15X2l0ZXJhYmxlX2RhdGFzZXQpJTNBJTIwJTIwJTIzJTIwZmFzdCUwQSUyMCUyMCUyMCUyMHBhc3MlMEElMEFzaHVmZmxlZF9pdGVyYWJsZV9kYXRhc2V0JTIwJTNEJTIwbXlfaXRlcmFibGVfZGF0YXNldC5zaHVmZmxlKHNlZWQlM0Q0MiUyQyUyMGJ1ZmZlcl9zaXplJTNEMTAwKSUwQSUwQWZvciUyMGV4YW1wbGUlMjBpbiUyMGVudW1lcmF0ZShzaHVmZmxlZF9pdGVyYWJsZV9kYXRhc2V0KSUzQSUyMCUyMCUyMyUyMGFzJTIwZmFzdCUyMGFzJTIwYmVmb3JlJTBBJTIwJTIwJTIwJTIwcGFzcyUwQSUwQXNodWZmbGVkX2l0ZXJhYmxlX2RhdGFzZXQlMjAlM0QlMjBteV9pdGVyYWJsZV9kYXRhc2V0LnNodWZmbGUoc2VlZCUzRDEzMzclMkMlMjBidWZmZXJfc2l6ZSUzRDEwMCklMjAlMjAlMjMlMjByZXNodWZmbGluZyUyMHVzaW5nJTIwYW5vdGhlciUyMHNlZWQlMjBpcyUyMGluc3RhbnRhbmVvdXMlMEElMEFmb3IlMjBleGFtcGxlJTIwaW4lMjBlbnVtZXJhdGUoc2h1ZmZsZWRfaXRlcmFibGVfZGF0YXNldCklM0ElMjAlMjAlMjMlMjBzdGlsbCUyMGFzJTIwZmFzdCUyMGFzJTIwYmVmb3JlJTBBJTIwJTIwJTIwJTIwcGFzcw==",highlighted:`<span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(my_iterable_dataset): <span class="hljs-comment"># fast</span> | |
| <span class="hljs-keyword">pass</span> | |
| shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">42</span>, buffer_size=<span class="hljs-number">100</span>) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(shuffled_iterable_dataset): <span class="hljs-comment"># as fast as before</span> | |
| <span class="hljs-keyword">pass</span> | |
| shuffled_iterable_dataset = my_iterable_dataset.shuffle(seed=<span class="hljs-number">1337</span>, buffer_size=<span class="hljs-number">100</span>) <span class="hljs-comment"># reshuffling using another seed is instantaneous</span> | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(shuffled_iterable_dataset): <span class="hljs-comment"># still as fast as before</span> | |
| <span class="hljs-keyword">pass</span>`,wrap:!1}}),re=new y({props:{code:"Zm9yJTIwZXBvY2glMjBpbiUyMHJhbmdlKG5fZXBvY2hzKSUzQSUwQSUyMCUyMCUyMCUyMG15X2l0ZXJhYmxlX2RhdGFzZXQuc2V0X2Vwb2NoKGVwb2NoKSUwQSUyMCUyMCUyMCUyMGZvciUyMGV4YW1wbGUlMjBpbiUyMG15X2l0ZXJhYmxlX2RhdGFzZXQlM0ElMjAlMjAlMjMlMjBmYXN0JTIwJTJCJTIwcmVzaHVmZmxlZCUyMGF0JTIwZWFjaCUyMGVwb2NoJTIwdXNpbmclMjAlNjBlZmZlY3RpdmVfc2VlZCUyMCUzRCUyMHNlZWQlMjAlMkIlMjBlcG9jaCU2MCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHBhc3M=",highlighted:`<span class="hljs-keyword">for</span> epoch <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(n_epochs): | |
| my_iterable_dataset.set_epoch(epoch) | |
| <span class="hljs-keyword">for</span> example <span class="hljs-keyword">in</span> my_iterable_dataset: <span class="hljs-comment"># fast + reshuffled at each epoch using \`effective_seed = seed + epoch\`</span> | |
| <span class="hljs-keyword">pass</span>`,wrap:!1}}),de=new y({props:{code:"bXlfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQuc2VsZWN0KHJhbmdlKHN0YXJ0X2luZGV4JTJDJTIwbGVuKGRhdGFzZXQpKSk=",highlighted:'my_dataset = my_dataset.select(<span class="hljs-built_in">range</span>(start_index, <span class="hljs-built_in">len</span>(dataset)))',wrap:!1}}),oe=new y({props:{code:"aXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMERhdGFzZXQuZnJvbV9kaWN0KCU3QiUyMmElMjIlM0ElMjByYW5nZSg2KSU3RCkudG9faXRlcmFibGVfZGF0YXNldChudW1fc2hhcmRzJTNEMyklMEElMjMlMjBzYXZlJTIwaW4lMjB0aGUlMjBtaWRkbGUlMjBvZiUyMHRyYWluaW5nJTBBc3RhdGVfZGljdCUyMCUzRCUyMGl0ZXJhYmxlX2RhdGFzZXQuc3RhdGVfZGljdCgpJTBBJTIzJTIwYW5kJTIwcmVzdW1lJTIwbGF0ZXIlMEFpdGVyYWJsZV9kYXRhc2V0LmxvYWRfc3RhdGVfZGljdChzdGF0ZV9kaWN0KQ==",highlighted:`<span class="hljs-meta">>>> </span>iterable_dataset = Dataset.from_dict({<span class="hljs-string">"a"</span>: <span class="hljs-built_in">range</span>(<span class="hljs-number">6</span>)}).to_iterable_dataset(num_shards=<span class="hljs-number">3</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># save in the middle of training</span> | |
| <span class="hljs-meta">>>> </span>state_dict = iterable_dataset.state_dict() | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># and resume later</span> | |
| <span class="hljs-meta">>>> </span>iterable_dataset.load_state_dict(state_dict)`,wrap:!1}}),ue=new u({props:{title:"Switch from map-style to iterable",local:"switch-from-map-style-to-iterable",headingTag:"h2"}}),we=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQudG9faXRlcmFibGVfZGF0YXNldCgp",highlighted:"my_iterable_dataset = my_dataset.to_iterable_dataset()",wrap:!1}}),_e=new y({props:{code:"bXlfaXRlcmFibGVfZGF0YXNldCUyMCUzRCUyMG15X2RhdGFzZXQudG9faXRlcmFibGVfZGF0YXNldChudW1fc2hhcmRzJTNEMTAyNCklMEFteV9pdGVyYWJsZV9kYXRhc2V0Lm51bV9zaGFyZHMlMjAlMjAlMjMlMjAxMDI0",highlighted:`my_iterable_dataset = my_dataset.to_iterable_dataset(num_shards=<span class="hljs-number">1024</span>) | |
| my_iterable_dataset.num_shards <span class="hljs-comment"># 1024</span>`,wrap:!1}}),Je=new _s({props:{source:"https://github.com/huggingface/datasets/blob/main/docs/source/about_mapstyle_vs_iterable.mdx"}}),{c(){M=i("meta"),Ue=l(),Te=i("p"),Ze=l(),d(b.$$.fragment),Xe=l(),d(w.$$.fragment),Ge=l(),j=i("p"),j.innerHTML=Ra,Ie=l(),d(_.$$.fragment),ke=l(),J=i("p"),J.innerHTML=va,$e=l(),d(T.$$.fragment),Ce=l(),g=i("p"),g.innerHTML=Ba,Ve=l(),U=i("p"),U.textContent=Fa,Re=l(),d(Z.$$.fragment),ve=l(),X=i("p"),X.innerHTML=xa,Be=l(),G=i("p"),G.innerHTML=Ya,Fe=l(),d(I.$$.fragment),xe=l(),k=i("p"),k.innerHTML=za,Ye=l(),d($.$$.fragment),ze=l(),C=i("p"),C.innerHTML=Wa,We=l(),d(V.$$.fragment),Ne=l(),d(R.$$.fragment),Qe=l(),v=i("p"),v.innerHTML=Na,De=l(),d(B.$$.fragment),He=l(),F=i("p"),F.textContent=Qa,Ae=l(),x=i("p"),x.innerHTML=Da,Ee=l(),d(Y.$$.fragment),Se=l(),z=i("p"),z.innerHTML=Ha,Le=l(),d(W.$$.fragment),qe=l(),N=i("p"),N.innerHTML=Aa,Pe=l(),d(Q.$$.fragment),Ke=l(),D=i("p"),D.innerHTML=Ea,Oe=l(),H=i("p"),H.textContent=Sa,ea=l(),d(A.$$.fragment),aa=l(),d(E.$$.fragment),sa=l(),S=i("p"),S.innerHTML=La,ta=l(),d(L.$$.fragment),la=l(),q=i("p"),q.innerHTML=qa,na=l(),d(P.$$.fragment),ia=l(),K=i("p"),K.innerHTML=Pa,ra=l(),d(O.$$.fragment),pa=l(),d(ee.$$.fragment),da=l(),ae=i("p"),ae.innerHTML=Ka,ca=l(),se=i("p"),se.innerHTML=Oa,ma=l(),d(te.$$.fragment),oa=l(),le=i("p"),le.innerHTML=es,ha=l(),d(ne.$$.fragment),fa=l(),ie=i("p"),ie.innerHTML=as,ya=l(),d(re.$$.fragment),Ma=l(),pe=i("p"),pe.textContent=ss,ua=l(),d(de.$$.fragment),ba=l(),ce=i("p"),ce.innerHTML=ts,wa=l(),me=i("p"),me.innerHTML=ls,ja=l(),d(oe.$$.fragment),_a=l(),he=i("p"),he.innerHTML=ns,Ja=l(),fe=i("p"),fe.textContent=is,Ta=l(),ye=i("p"),ye.textContent=rs,ga=l(),Me=i("p"),Me.innerHTML=ps,Ua=l(),d(ue.$$.fragment),Za=l(),be=i("p"),be.innerHTML=ds,Xa=l(),d(we.$$.fragment),Ga=l(),je=i("p"),je.innerHTML=cs,Ia=l(),d(_e.$$.fragment),ka=l(),d(Je.$$.fragment),$a=l(),ge=i("p"),this.h()},l(e){const 
a=bs("svelte-u9bgzb",document.head);M=r(a,"META",{name:!0,content:!0}),a.forEach(s),Ue=n(e),Te=r(e,"P",{}),ms(Te).forEach(s),Ze=n(e),c(b.$$.fragment,e),Xe=n(e),c(w.$$.fragment,e),Ge=n(e),j=r(e,"P",{"data-svelte-h":!0}),p(j)!=="svelte-obbfk4"&&(j.innerHTML=Ra),Ie=n(e),c(_.$$.fragment,e),ke=n(e),J=r(e,"P",{"data-svelte-h":!0}),p(J)!=="svelte-noa0nl"&&(J.innerHTML=va),$e=n(e),c(T.$$.fragment,e),Ce=n(e),g=r(e,"P",{"data-svelte-h":!0}),p(g)!=="svelte-152i6l3"&&(g.innerHTML=Ba),Ve=n(e),U=r(e,"P",{"data-svelte-h":!0}),p(U)!=="svelte-p27tn9"&&(U.textContent=Fa),Re=n(e),c(Z.$$.fragment,e),ve=n(e),X=r(e,"P",{"data-svelte-h":!0}),p(X)!=="svelte-1m5el5o"&&(X.innerHTML=xa),Be=n(e),G=r(e,"P",{"data-svelte-h":!0}),p(G)!=="svelte-1ik5e8b"&&(G.innerHTML=Ya),Fe=n(e),c(I.$$.fragment,e),xe=n(e),k=r(e,"P",{"data-svelte-h":!0}),p(k)!=="svelte-1i29srk"&&(k.innerHTML=za),Ye=n(e),c($.$$.fragment,e),ze=n(e),C=r(e,"P",{"data-svelte-h":!0}),p(C)!=="svelte-z2r7k0"&&(C.innerHTML=Wa),We=n(e),c(V.$$.fragment,e),Ne=n(e),c(R.$$.fragment,e),Qe=n(e),v=r(e,"P",{"data-svelte-h":!0}),p(v)!=="svelte-zma22i"&&(v.innerHTML=Na),De=n(e),c(B.$$.fragment,e),He=n(e),F=r(e,"P",{"data-svelte-h":!0}),p(F)!=="svelte-gryvti"&&(F.textContent=Qa),Ae=n(e),x=r(e,"P",{"data-svelte-h":!0}),p(x)!=="svelte-gn3sd"&&(x.innerHTML=Da),Ee=n(e),c(Y.$$.fragment,e),Se=n(e),z=r(e,"P",{"data-svelte-h":!0}),p(z)!=="svelte-u916f6"&&(z.innerHTML=Ha),Le=n(e),c(W.$$.fragment,e),qe=n(e),N=r(e,"P",{"data-svelte-h":!0}),p(N)!=="svelte-19wo518"&&(N.innerHTML=Aa),Pe=n(e),c(Q.$$.fragment,e),Ke=n(e),D=r(e,"P",{"data-svelte-h":!0}),p(D)!=="svelte-n9zp2j"&&(D.innerHTML=Ea),Oe=n(e),H=r(e,"P",{"data-svelte-h":!0}),p(H)!=="svelte-vsuw18"&&(H.textContent=Sa),ea=n(e),c(A.$$.fragment,e),aa=n(e),c(E.$$.fragment,e),sa=n(e),S=r(e,"P",{"data-svelte-h":!0}),p(S)!=="svelte-3vxzt6"&&(S.innerHTML=La),ta=n(e),c(L.$$.fragment,e),la=n(e),q=r(e,"P",{"data-svelte-h":!0}),p(q)!=="svelte-v7lvxb"&&(q.innerHTML=qa),na=n(e),c(P.$$.fragment,e),ia=n(e),K=r(e,"P",{"data-svel
te-h":!0}),p(K)!=="svelte-nx8g88"&&(K.innerHTML=Pa),ra=n(e),c(O.$$.fragment,e),pa=n(e),c(ee.$$.fragment,e),da=n(e),ae=r(e,"P",{"data-svelte-h":!0}),p(ae)!=="svelte-cfps1k"&&(ae.innerHTML=Ka),ca=n(e),se=r(e,"P",{"data-svelte-h":!0}),p(se)!=="svelte-fna18m"&&(se.innerHTML=Oa),ma=n(e),c(te.$$.fragment,e),oa=n(e),le=r(e,"P",{"data-svelte-h":!0}),p(le)!=="svelte-mfr8up"&&(le.innerHTML=es),ha=n(e),c(ne.$$.fragment,e),fa=n(e),ie=r(e,"P",{"data-svelte-h":!0}),p(ie)!=="svelte-1yhobsx"&&(ie.innerHTML=as),ya=n(e),c(re.$$.fragment,e),Ma=n(e),pe=r(e,"P",{"data-svelte-h":!0}),p(pe)!=="svelte-1vtsyk"&&(pe.textContent=ss),ua=n(e),c(de.$$.fragment,e),ba=n(e),ce=r(e,"P",{"data-svelte-h":!0}),p(ce)!=="svelte-1qarbjh"&&(ce.innerHTML=ts),wa=n(e),me=r(e,"P",{"data-svelte-h":!0}),p(me)!=="svelte-177bwe6"&&(me.innerHTML=ls),ja=n(e),c(oe.$$.fragment,e),_a=n(e),he=r(e,"P",{"data-svelte-h":!0}),p(he)!=="svelte-ul64r3"&&(he.innerHTML=ns),Ja=n(e),fe=r(e,"P",{"data-svelte-h":!0}),p(fe)!=="svelte-fghrva"&&(fe.textContent=is),Ta=n(e),ye=r(e,"P",{"data-svelte-h":!0}),p(ye)!=="svelte-1i92o70"&&(ye.textContent=rs),ga=n(e),Me=r(e,"P",{"data-svelte-h":!0}),p(Me)!=="svelte-1r4w1a2"&&(Me.innerHTML=ps),Ua=n(e),c(ue.$$.fragment,e),Za=n(e),be=r(e,"P",{"data-svelte-h":!0}),p(be)!=="svelte-1dgl4fa"&&(be.innerHTML=ds),Xa=n(e),c(we.$$.fragment,e),Ga=n(e),je=r(e,"P",{"data-svelte-h":!0}),p(je)!=="svelte-8w5p2f"&&(je.innerHTML=cs),Ia=n(e),c(_e.$$.fragment,e),ka=n(e),c(Je.$$.fragment,e),$a=n(e),ge=r(e,"P",{}),ms(ge).forEach(s),this.h()},h(){os(M,"name","hf:doc:metadata"),os(M,"content",Ts)},m(e,a){ws(document.head,M),t(e,Ue,a),t(e,Te,a),t(e,Ze,a),m(b,e,a),t(e,Xe,a),m(w,e,a),t(e,Ge,a),t(e,j,a),t(e,Ie,a),m(_,e,a),t(e,ke,a),t(e,J,a),t(e,$e,a),m(T,e,a),t(e,Ce,a),t(e,g,a),t(e,Ve,a),t(e,U,a),t(e,Re,a),m(Z,e,a),t(e,ve,a),t(e,X,a),t(e,Be,a),t(e,G,a),t(e,Fe,a),m(I,e,a),t(e,xe,a),t(e,k,a),t(e,Ye,a),m($,e,a),t(e,ze,a),t(e,C,a),t(e,We,a),m(V,e,a),t(e,Ne,a),m(R,e,a),t(e,Qe,a),t(e,v,a),t(e,De,a),m(B,e,a),t(e,He,a),t(e,F,a),t(e,
Ae,a),t(e,x,a),t(e,Ee,a),m(Y,e,a),t(e,Se,a),t(e,z,a),t(e,Le,a),m(W,e,a),t(e,qe,a),t(e,N,a),t(e,Pe,a),m(Q,e,a),t(e,Ke,a),t(e,D,a),t(e,Oe,a),t(e,H,a),t(e,ea,a),m(A,e,a),t(e,aa,a),m(E,e,a),t(e,sa,a),t(e,S,a),t(e,ta,a),m(L,e,a),t(e,la,a),t(e,q,a),t(e,na,a),m(P,e,a),t(e,ia,a),t(e,K,a),t(e,ra,a),m(O,e,a),t(e,pa,a),m(ee,e,a),t(e,da,a),t(e,ae,a),t(e,ca,a),t(e,se,a),t(e,ma,a),m(te,e,a),t(e,oa,a),t(e,le,a),t(e,ha,a),m(ne,e,a),t(e,fa,a),t(e,ie,a),t(e,ya,a),m(re,e,a),t(e,Ma,a),t(e,pe,a),t(e,ua,a),m(de,e,a),t(e,ba,a),t(e,ce,a),t(e,wa,a),t(e,me,a),t(e,ja,a),m(oe,e,a),t(e,_a,a),t(e,he,a),t(e,Ja,a),t(e,fe,a),t(e,Ta,a),t(e,ye,a),t(e,ga,a),t(e,Me,a),t(e,Ua,a),m(ue,e,a),t(e,Za,a),t(e,be,a),t(e,Xa,a),m(we,e,a),t(e,Ga,a),t(e,je,a),t(e,Ia,a),m(_e,e,a),t(e,ka,a),m(Je,e,a),t(e,$a,a),t(e,ge,a),Ca=!0},p:fs,i(e){Ca||(o(b.$$.fragment,e),o(w.$$.fragment,e),o(_.$$.fragment,e),o(T.$$.fragment,e),o(Z.$$.fragment,e),o(I.$$.fragment,e),o($.$$.fragment,e),o(V.$$.fragment,e),o(R.$$.fragment,e),o(B.$$.fragment,e),o(Y.$$.fragment,e),o(W.$$.fragment,e),o(Q.$$.fragment,e),o(A.$$.fragment,e),o(E.$$.fragment,e),o(L.$$.fragment,e),o(P.$$.fragment,e),o(O.$$.fragment,e),o(ee.$$.fragment,e),o(te.$$.fragment,e),o(ne.$$.fragment,e),o(re.$$.fragment,e),o(de.$$.fragment,e),o(oe.$$.fragment,e),o(ue.$$.fragment,e),o(we.$$.fragment,e),o(_e.$$.fragment,e),o(Je.$$.fragment,e),Ca=!0)},o(e){h(b.$$.fragment,e),h(w.$$.fragment,e),h(_.$$.fragment,e),h(T.$$.fragment,e),h(Z.$$.fragment,e),h(I.$$.fragment,e),h($.$$.fragment,e),h(V.$$.fragment,e),h(R.$$.fragment,e),h(B.$$.fragment,e),h(Y.$$.fragment,e),h(W.$$.fragment,e),h(Q.$$.fragment,e),h(A.$$.fragment,e),h(E.$$.fragment,e),h(L.$$.fragment,e),h(P.$$.fragment,e),h(O.$$.fragment,e),h(ee.$$.fragment,e),h(te.$$.fragment,e),h(ne.$$.fragment,e),h(re.$$.fragment,e),h(de.$$.fragment,e),h(oe.$$.fragment,e),h(ue.$$.fragment,e),h(we.$$.fragment,e),h(_e.$$.fragment,e),h(Je.$$.fragment,e),Ca=!1},d(e){e&&(s(Ue),s(Te),s(Ze),s(Xe),s(Ge),s(j),s(Ie),s(ke),s(J),s($e),s(Ce),s(g),s(Ve),s(U),s(Re)
/*
 * Tail of the minified bundle. The single line below contains, in order:
 *
 * 1. The remainder of the fragment's `d(e)` method: the rest of the `s(...)`
 *    calls guarded by `e&&(...)`, then an unconditional `s(M)` and one
 *    `f(child, e)` call per child component instance.
 *    (`s` and `f` are imported from the index chunk as `d` and `p`;
 *    presumably Svelte's detach / destroy_component — TODO confirm.)
 *
 * 2. `Ts`: a JSON string with the page title, its `local` anchor and seven
 *    depth-2 section entries. The fragment's `h()` passes it to
 *    `os(M,"content",Ts)` for the element named "hf:doc:metadata".
 *
 * 3. `gs(Va)`: hands `ys` a callback that reads the "fw" query parameter from
 *    `window.location.search` and discards the value, then returns an empty
 *    array. `Va` is not used. (`ys` is imported from the scheduler chunk as
 *    `o`; presumably Svelte's onMount — TODO confirm.)
 *
 * 4. `Is`: class extending `Ms` whose constructor calls `super()` and then
 *    `us(this, M, gs, Js, hs, {})`. It is exported under the name `component`.
 *
 * NOTE(review): the trailing `| |` after the export statement looks like
 * table-extraction residue rather than JavaScript — confirm against the
 * original build output.
 */
,s(ve),s(X),s(Be),s(G),s(Fe),s(xe),s(k),s(Ye),s(ze),s(C),s(We),s(Ne),s(Qe),s(v),s(De),s(He),s(F),s(Ae),s(x),s(Ee),s(Se),s(z),s(Le),s(qe),s(N),s(Pe),s(Ke),s(D),s(Oe),s(H),s(ea),s(aa),s(sa),s(S),s(ta),s(la),s(q),s(na),s(ia),s(K),s(ra),s(pa),s(da),s(ae),s(ca),s(se),s(ma),s(oa),s(le),s(ha),s(fa),s(ie),s(ya),s(Ma),s(pe),s(ua),s(ba),s(ce),s(wa),s(me),s(ja),s(_a),s(he),s(Ja),s(fe),s(Ta),s(ye),s(ga),s(Me),s(Ua),s(Za),s(be),s(Xa),s(Ga),s(je),s(Ia),s(ka),s($a),s(ge)),s(M),f(b,e),f(w,e),f(_,e),f(T,e),f(Z,e),f(I,e),f($,e),f(V,e),f(R,e),f(B,e),f(Y,e),f(W,e),f(Q,e),f(A,e),f(E,e),f(L,e),f(P,e),f(O,e),f(ee,e),f(te,e),f(ne,e),f(re,e),f(de,e),f(oe,e),f(ue,e),f(we,e),f(_e,e),f(Je,e)}}}const Ts='{"title":"Differences between Dataset and IterableDataset","local":"differences-between-dataset-and-iterabledataset","sections":[{"title":"Downloading and streaming","local":"downloading-and-streaming","sections":[],"depth":2},{"title":"Creating map-style datasets and iterable datasets","local":"creating-map-style-datasets-and-iterable-datasets","sections":[],"depth":2},{"title":"Loading local files entirely and progressively","local":"loading-local-files-entirely-and-progressively","sections":[],"depth":2},{"title":"Eager data processing and lazy data processing","local":"eager-data-processing-and-lazy-data-processing","sections":[],"depth":2},{"title":"Exact and fast approximate shuffling","local":"exact-and-fast-approximate-shuffling","sections":[],"depth":2},{"title":"Speed differences","local":"speed-differences","sections":[],"depth":2},{"title":"Switch from map-style to iterable","local":"switch-from-map-style-to-iterable","sections":[],"depth":2}],"depth":1}';function gs(Va){return ys(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Is extends Ms{constructor(M){super(),us(this,M,gs,Js,hs,{})}}export{Is as component}; | |
Xet Storage Details
- Size: 40.8 kB
- Xet hash: e6e5a698ccb48e465e594ca44594fab675ee32533e223ba2f4901b162a3008bd
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.