Buckets:
| import{s as Za,o as Ga,n as Ce}from"../chunks/scheduler.37c15a92.js";import{S as za,i as Va,g as u,s as t,r as p,A as ka,h as J,f as a,c as n,j as Ba,u as o,x as T,k as xa,y as Wa,a as l,v as r,d,t as c,w as m}from"../chunks/index.2bf4358c.js";import{T as We}from"../chunks/Tip.363c041f.js";import{Y as Ca}from"../chunks/Youtube.1e50a667.js";import{C as h}from"../chunks/CodeBlock.4e987730.js";import{C as va}from"../chunks/CourseFloatingBanner.9ff4c771.js";import{H as Ls,E as Ra}from"../chunks/getInferenceSnippets.24b50994.js";function Xa(j){let i,b='✎ Di base, 🤗 Datasets decomprimerà i file necessari a caricare un dataset. Se vuoi risparmiare sullo spazio dell’hard disk, puoi passare <code>DownloadConfig(delete_extracted_True)</code> all’argomento <code>download_config</code> di <code>load_dataset()</code>. Per maggiori dettagli leggi la <a href="https://huggingface.co/docs/datasets/package_reference/builder_classes#datasets.DownloadConfig" rel="nofollow">documentazione</a>.';return{c(){i=u("p"),i.innerHTML=b},l(M){i=J(M,"P",{"data-svelte-h":!0}),T(i)!=="svelte-ohmup"&&(i.innerHTML=b)},m(M,y){l(M,i,y)},p:Ce,d(M){M&&a(i)}}}function Ea(j){let i,b='✏️ <strong>Provaci tu!</strong> Scegli uno dei <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">subset</a> di Pile che è più grande della RAM del tuo PC o del tuo portatile, caricalo utilizzando 🤗 Datasets e calcola la quantità di RAM utilizzata. Nota che per avere un valore preciso, dovrai creare un nuovo processo. Puoi trovare le grandezze decompresse di ogni subset nella Tavola 1 dell’<a href="https://arxiv.org/abs/2101.00027" rel="nofollow">articolo su Pile</a>';return{c(){i=u("p"),i.innerHTML=b},l(M){i=J(M,"P",{"data-svelte-h":!0}),T(i)!=="svelte-1p4smth"&&(i.innerHTML=b)},m(M,y){l(M,i,y)},p:Ce,d(M){M&&a(i)}}}function Na(j){let i,b='💡 Nei notebook Jupyter, puoi cronometrare le celle utilizzando la <a href="https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-timeit" rel="nofollow">funzione magica <code>%%timeit</code></a>';return{c(){i=u("p"),i.innerHTML=b},l(M){i=J(M,"P",{"data-svelte-h":!0}),T(i)!=="svelte-10cqh4y"&&(i.innerHTML=b)},m(M,y){l(M,i,y)},p:Ce,d(M){M&&a(i)}}}function _a(j){let i,b="💡 Per velocizzare la tokenizzazione con lo streaming puoi passare <code>batchet=True</code>, come abbiamo visto nell’ultima sezione. Questo processerà gli esempi per batch. Di default, la grandezza di un batch è 1.000, e può essere specificata attraverso l’argomento <code>batch_size</code>.";return{c(){i=u("p"),i.innerHTML=b},l(M){i=J(M,"P",{"data-svelte-h":!0}),T(i)!=="svelte-shytyc"&&(i.innerHTML=b)},m(M,y){l(M,i,y)},p:Ce,d(M){M&&a(i)}}}function Qa(j){let i,b='✏️ <strong>Prova tu!</strong> Usa uno dei corpora Common Crawl come <a href="https://huggingface.co/datasets/mc4" rel="nofollow"><code>mc4</code></a> oppure <a href="https://huggingface.co/datasets/oscar" rel="nofollow"><code>oscar</code></a> per crare un dataset multilingue in streaming, che rappresenta le proporzioni delle lingue parlate in un paese a tua scelta. Ad esempio, le quattro lingue ufficiali in Svizzera sono il tedesco, il francesce, l’italiano e il romancio, per cui potresti creare un corpus della Svizzera raccogliendo i campioni da Oscar, secondo la percentuale di parlanti di ognuna.';return{c(){i=u("p"),i.innerHTML=b},l(M){i=J(M,"P",{"data-svelte-h":!0}),T(i)!=="svelte-1ir16xe"&&(i.innerHTML=b)},m(M,y){l(M,i,y)},p:Ce,d(M){M&&a(i)}}}function Ya(j){let i,b,M,y,$,ve,B,Re,x,qs="Al giorno d’oggi non è raro trovarsi a lavorare con dataset grandi diversi gigabyte, soprattutto quando si vuole addestrare un transformer come BERT o GPT-2 da zero. In questi casi, persino <em>caricare</em> i dati può essere un’impresa difficile. Ad esempio, il corpus WebText utilizzato per preaddestrare GPT-2 contiente più di 8 milioni di documenti e 40gb di testo — caricare un dataset del genere sulla RAM del tuo portatile gli farebbe venire un colpo!",Xe,Z,Ps="Per fortuna, 🤗 Datasets è stato sviluppato per superare queste limitazioni, e può risolvere i problemi relativi alla gestione della memoria trattando i dataset come file <em>memory-mapped</em>, e quelli relativi ai limiti del disco rigido attraverso lo <em>stream processing</em> delle voci del corpus.",Ee,G,Ne,z,Os='In questa sezione esploreremo queste funzionalità di 🤗 Datasets con un enorme corpus di 825 GB conosciuto come <a href="https://pile.eleuther.ai" rel="nofollow">Pile</a>. Iniziamo!',_e,V,Qe,k,Ks='The Pile è un corpus testuale creato da <a href="https://www.eleuther.ai" rel="nofollow">EleutherAI</a> per addestrare modelli di linguaggio su grande scala. Include un grande varietà di dataset, a partire da articoli scientifici, repository di codici da GitHub, e testi dal web filtrati. Il corpus di addestramento è disponibili in <a href="https://the-eye.eu/public/AI/pile/" rel="nofollow">frammenti da 14 GB</a>, ed è possibile scaricare diverse delle <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">componenti singole</a>. Iniziamo dando uno sguardo al dataset PubMed Abstracts, un corpus di abstract da 15 milioni di pubblicazioni in ambito biomedico da <a href="https://pubmed.ncbi.nlm.nih.gov/" rel="nofollow">PubMed</a>. Il dataset è in <a href="https://jsonlines.org" rel="nofollow">formato JSON Lines</a> ed è stato compressato usando la libreria <code>zstandard</code>, per cui dobbiamo prima installarla:',Ye,W,Ae,C,ea='Ora, possiamo caricare il dataset utilizzando il meotodo per file remoti che abbiamo visto nella <a href="/course/chapter5/2">sezione 2</a>:',Fe,v,He,R,Se,X,sa="Possiamo vedere che ci sono 15.518.009 righe e 2 colonne nel nostro dataset — un bel po’!",De,f,Le,E,aa="Ispezioniamo i contenuti del primo esempio:",qe,N,Pe,_,Oe,Q,la="Okay, questo sembra proprio l’abstract di un articolo di medicina. Ora vediamo quanta RAM è stata usata per caricare il dataset!",Ke,Y,es,A,ta='Un modo semplice per calcolare l’uso di memoria su Python è utilizzando la libreria <a href="https://psutil.readthedocs.io/en/latest/" rel="nofollow"><code>psutil</code></a>, che può essere installata con <code>pip</code> come segue:',ss,F,as,H,na="<code>psutil</code> offre una classe <code>Process</code> che permette di controllare l’utilizzo della memoria del processo attuale come segue::",ls,S,ts,D,ns,L,ia="L’attributo <code>rss</code> qui fa riferimento alla <em>grandezza del resident set</em>, che equivale alla frazione di memoria che il processo occupa nella RAM. Questo valore include inoltre la memoria utilizzata dall’interprete Python e dalle librerie caricate, per cui l’ammontare effettivo utilizzato per caricare il dataset è un po’ più piccolo. Per fare un confronto, vediamo quant’è grande il dataset su disco utilizzando l’attributo <code>dataset_size</code>. Come prima, il risultato è espresso in byte, e abbiamo bisogno di convertirlo in gigabyte:",is,q,ps,P,os,O,pa="Bene — nonostante sia grande quasi 30 GB, siamo in grado di caricare e accedere al dataset utilizzando molta meno RAM!",rs,w,ds,K,oa='Se hai dimestichezza con Pandas, questo risultato potrebbe sorprenderti, vista la famosa <a href="https://wesmckinney.com/blog/apache-arrow-pandas-internals/" rel="nofollow">regola di Wes Kinney</a>, ovvero che, in linea di massima, serve una RAM 5-10 volte più grande del dataset che vuoi caricare. Come fa 🤗 Datasets a risolvere questo problema di gestione della memoria? 🤗 Datasets tratta ogni dataset come un <a href="https://it.wikipedia.org/wiki/File_mappato_in_memoria" rel="nofollow">file mappato in memoria</a>, il che permette di avere un mapping tra la RAM e l’archiviazione dei file di sistema, che permette alla librera di accedere e operare su elementi del dataset senza doverli caricare completamente in memoria.',cs,ee,ra='I file mappati in memoria possono inoltre essre condivisi su più processi, il che permette a metodi come <code>Dataset.map()</code> di poter essere eseguiti in parallelo senza bisogno di spostare o copiare il dataset. Dietro le quinte, tutto ciò è realizzato dal formato di memoria <a href="https://arrow.apache.org" rel="nofollow">Apache Arrow</a> e dalla libreria <a href="https://arrow.apache.org/docs/python/index.html" rel="nofollow"><code>pyarrow</code></a>, che rendono più veloci il caricamento e il processamento dei dati. (per maggiori dettagli su Apache Arrow, e per un confronto con Pandas, dai un’occhiata al <a href="https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a" rel="nofollow">post di Dejan Simic</a>.) Per vederlo in azione, eseguiamo un piccolo test di velocità con un loop su tutti gli elementi nel dataset PubMed Abstracts:',ms,se,Ms,ae,us,le,da="Abbiamo usato il modulo di Python <code>timeit</code> per calcolare il tempo di esecuzione impiegato da <code>code_snippet</code>. Tipicamente l’iterazione su un dataset impiega un tempo che va da un decimo di GB al secondo, a diversi GB al secondo. Questo funziona perfettamente per la maggior parte delle applicazioni, ma a volte avrai bisogno di lavorare con un dataset che è troppo grande persino per essere salvato sul tuo portatile. Ad esempio, se cercassimo di scaricare Pile per intero, avremo bisogno di 825 GB di spazio libero su disko! In questi casi, 🤗 Datasets permette di utilizzare processi di streaming che ci permettono di scaricare e accedere al volo ai dati, senza bisogno di scaricare l’intero dataset. Diamo un’occhiata a come funziona.",Js,g,Ts,te,hs,ne,ca="Per abilitare lo streaming dei dataset devi semplicemente passare l’argomento <code>streaming=True</code> alla funzione <code>load_dataset()</code>. Ad esempio, carichiamo un’altra volta il dataset PubMed Abstract, ma in modalità streaming:",bs,ie,ys,pe,ma="Invece del solito <code>Dataset</code> che abbiamo incontrato in precedenza in questo capitolo, l’oggetto ritornato con <code>streaming=True' è un </code>IterableDataset<code>. Come suggerito dal nome, per accedere agli elementi di un </code>IterableDataset`, dobbiamo iterare di esso. Possiamo accedere al primo elemento del nostro dataset in streaming come segue:",js,oe,fs,re,ws,de,Ma='Gli elementi di un dataset in streaming possono essere processati al volo utilizzando <code>IterableDataset.map()</code>, che è utile durante l’addestramento se hai bisogno di tokenizzare gli input. Il processo è uguale a quello che abbiamo utilizzato per tokenizzare il nostro dataset nel <a href="/course/chapter3">Capitolo 3</a>, con l’unica differenza che ora ritorneremo gli output uno alla volta:',gs,ce,Us,me,Is,U,$s,Me,ua="È anche possibile mescolare un dataset in streaming utilizzato <code>Iterabledataset.shuffle()</code>, ma a differenza di <code>Dataset.shuffle()</code>, questo metodo mescola solo gli elementi in un <code>buffer_size</code> predefinito:",Bs,ue,xs,Je,Zs,Te,Ja="In questo esempio, abbiamo selezionato un esempio casuale dai primi 10.000 esempi nel buffer. Una volta che accediamo a un esempio, il suo posto nel buffer è subito occupato dall’esempio successivo nel corpus (in questo caso l’esempio 10.0001). Puoi inoltre selezionare gli elementi da un dataset in streaming utilizzando le funzioni <code>IterableDataset.take()</code> a <code>IterableDataset.skip()</code>, che funzionano un po’ come <code>Dataset.select()</code>. Ad esempio, per selezionare i primi 5 esempi nel dataset PubMed Abstract dovremmo fare come segue:",Gs,he,zs,be,Vs,ye,Ta="Allo stesso modo, è possibile utilizzare la funzione <code>IterableDataset.skip()</code> per creare sezioni di addestramento e di validazione da un dataset mescolato, come segue:",ks,je,Ws,fe,ha="Concludiamo la nostra ricognizione dello streaming di dataset con un’applicazione comune: la combinazione di più dataset per creare un unico corpus. 🤗 Datasets fornisce una funzione <code>interleave_datasets()</code>, che converte una lista di oggetti <code>IterableDataset</code> in un unico <code>IterableDataset</code>, dove gli elementi del nuovo dataset sono ottenuti alternando tra gli esempi forniti. Questa funzione è particolarmente utile quando cerchiamo di combinare dataset di grandi dimensioni, come esempio possiamo utilizzare in streaming la sezione FreeLaw del Pile, un dataset di 51 GB di pareri legali dai tribunali statunitensi:",Cs,we,vs,ge,Rs,Ue,ba="Questo dataset è abbastanza grande da mettere sotto sforzo la RAM di molto portatili, ma siamo riusciti a caricarlo e accedervi senza alcun problema! Ora cominiamo gli esempi di FreeLaw e di PubMed Abstracts con la funzione <code>interleave_datasets()</code>:",Xs,Ie,Es,$e,Ns,Be,ya="Abbiamo utilizzato la funzione <code>islice()</code> del modulo Python <code>itertools</code> per selezionare i primi due esempi dai dataset combinati, e abbiamo visto che corrispondono ai primi esempi di ognuno dei due dataset originali.",_s,xe,ja="Infine, se vuoi processare il Pile in streaming, in tutti i suoi 825 GB, puoi recuperare tutti i file preparati, come segue:",Qs,Ze,Ys,Ge,As,I,Fs,ze,fa="Ora hai a tua disposizione tutti gli strumenti per caricare e processare dataset di ogni tipo — ma a meno che tu non sia estremamente fortunato, arriverà un momento nel tuo cammino in cui dovrai effettivamente creare un dataset per risolvere i tuoi problemi. Questo sarà argomento della prossima sezione!",Hs,Ve,Ss,ke,Ds;return $=new Ls({props:{title:"Big data? Ci pensa 🤗 Datasets!",local:"big-data-ci-pensa--datasets",headingTag:"h1"}}),B=new va({props:{chapter:5,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/it/chapter5/section4.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/it/chapter5/section4.ipynb"}]}}),G=new Ca({props:{id:"JwISwTCPPWo"}}),V=new Ls({props:{title:"Cos’è Pile?",local:"cosè-pile",headingTag:"h2"}}),W=new h({props:{code:"IXBpcCUyMGluc3RhbGwlMjB6c3RhbmRhcmQ=",highlighted:"!pip install zstandard",wrap:!1}}),v=new h({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBJTIzJTIwQ2klMjB2dW9sZSUyMHF1YWxjaGUlMjBtaW51dG8lMjBwZXIlMjBsJ2VzZWN1emlvbmUlMkMlMjBxdWluZGklMjBwcmVwYXJhdGklMjB1biUyMHQlQzMlQTglMjBvJTIwdW4lMjBjYWZmJUMzJUE4JTIwbmVsbCdhdHRlc2ElMjAlM0EpJTBBZGF0YV9maWxlcyUyMCUzRCUyMCUyMmh0dHBzJTNBJTJGJTJGdGhlLWV5ZS5ldSUyRnB1YmxpYyUyRkFJJTJGcGlsZV9wcmVsaW1pbmFyeV9jb21wb25lbnRzJTJGUFVCTUVEX3RpdGxlX2Fic3RyYWN0c18yMDE5X2Jhc2VsaW5lLmpzb25sLnpzdCUyMiUwQXB1Ym1lZF9kYXRhc2V0JTIwJTNEJTIwbG9hZF9kYXRhc2V0KCUyMmpzb24lMjIlMkMlMjBkYXRhX2ZpbGVzJTNEZGF0YV9maWxlcyUyQyUyMHNwbGl0JTNEJTIydHJhaW4lMjIpJTBBcHVibWVkX2RhdGFzZXQ=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-comment"># Ci vuole qualche minuto per l'esecuzione, quindi preparati un tè o un caffè nell'attesa :)</span> | |
| data_files = <span class="hljs-string">"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"</span> | |
| pubmed_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>) | |
| pubmed_dataset`,wrap:!1}}),R=new h({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1QidtZXRhJyUyQyUyMCd0ZXh0JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwMTU1MTgwMDklMEElN0Qp",highlighted:`Dataset({ | |
| features: [<span class="hljs-string">'meta'</span>, <span class="hljs-string">'text'</span>], | |
| num_rows: <span class="hljs-number">15518009</span> | |
| })`,wrap:!1}}),f=new We({props:{$$slots:{default:[Xa]},$$scope:{ctx:j}}}),N=new h({props:{code:"cHVibWVkX2RhdGFzZXQlNUIwJTVE",highlighted:'pubmed_dataset[<span class="hljs-number">0</span>]',wrap:!1}}),_=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24uJTVDblRvJTIwZGV0ZXJtaW5lJTIwdGhlJTIwcHJldmFsZW5jZSUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjBhZ2VkJTIwdW5kZXIlMjA1JTIweWVhcnMlMjBzdWZmZXJpbmclMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb25zJTIwKEFMUkkpJTJDJTIwdGhlJTIwcmlzayUyMGZhY3RvcnMlMjBmb3IlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHVuZGVyJTIwNSUyMHllYXJzJTIwb2YlMjBhZ2UlMjB3aXRoJTIwQUxSSSUyQyUyMGFuZCUyMHRoZSUyMGFzc29jaWF0aW9uJTIwb2YlMjBoeXBveGFlbWlhJTIwd2l0aCUyMGFuJTIwaW5jcmVhc2VkJTIwcmlzayUyMG9mJTIwZHlpbmclMjBpbiUyMGNoaWxkcmVuJTIwb2YlMjB0aGUlMjBzYW1lJTIwYWdlJTIwLi4uJyU3RA==",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'</span>}`,wrap:!1}}),Y=new Ls({props:{title:"La magia del memory mapping",local:"la-magia-del-memory-mapping",headingTag:"h2"}}),F=new h({props:{code:"IXBpcCUyMGluc3RhbGwlMjBwc3V0aWw=",highlighted:"!pip install psutil",wrap:!1}}),S=new h({props:{code:"aW1wb3J0JTIwcHN1dGlsJTBBJTBBJTIzJTIwUHJvY2Vzcy5tZW1vcnlfaW5mbyUyMG1vc3RyYSUyMGklMjBkYXRpJTIwaW4lMjBieXRlJTJDJTIwcXVpbmRpJTIwY29udmVydGlhbW8lMjBpbiUyMG1lZ2FieXRlJTBBcHJpbnQoZiUyMlJBTSUyMHVzZWQlM0ElMjAlN0Jwc3V0aWwuUHJvY2VzcygpLm1lbW9yeV9pbmZvKCkucnNzJTIwJTJGJTIwKDEwMjQlMjAqJTIwMTAyNCklM0EuMmYlN0QlMjBNQiUyMik=",highlighted:`<span class="hljs-keyword">import</span> psutil | |
| <span class="hljs-comment"># Process.memory_info mostra i dati in byte, quindi convertiamo in megabyte</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"RAM used: <span class="hljs-subst">{psutil.Process().memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>):<span class="hljs-number">.2</span>f}</span> MB"</span>)`,wrap:!1}}),D=new h({props:{code:"UkFNJTIwdXNlZCUzQSUyMDU2NzguMzMlMjBNQg==",highlighted:'RAM used: <span class="hljs-number">5678.33</span> MB',wrap:!1}}),q=new h({props:{code:"cHJpbnQoZiUyMk51bWJlciUyMG9mJTIwZmlsZXMlMjBpbiUyMGRhdGFzZXQlMjAlM0ElMjAlN0JwdWJtZWRfZGF0YXNldC5kYXRhc2V0X3NpemUlN0QlMjIpJTBBc2l6ZV9nYiUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0LmRhdGFzZXRfc2l6ZSUyMCUyRiUyMCgxMDI0KiozKSUwQXByaW50KGYlMjJEYXRhc2V0JTIwc2l6ZSUyMChjYWNoZSUyMGZpbGUpJTIwJTNBJTIwJTdCc2l6ZV9nYiUzQS4yZiU3RCUyMEdCJTIyKQ==",highlighted:`<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Number of files in dataset : <span class="hljs-subst">{pubmed_dataset.dataset_size}</span>"</span>) | |
| size_gb = pubmed_dataset.dataset_size / (<span class="hljs-number">1024</span>**<span class="hljs-number">3</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Dataset size (cache file) : <span class="hljs-subst">{size_gb:<span class="hljs-number">.2</span>f}</span> GB"</span>)`,wrap:!1}}),P=new h({props:{code:"TnVtYmVyJTIwb2YlMjBmaWxlcyUyMGluJTIwZGF0YXNldCUyMCUzQSUyMDIwOTc5NDM3MDUxJTBBRGF0YXNldCUyMHNpemUlMjAoY2FjaGUlMjBmaWxlKSUyMCUzQSUyMDE5LjU0JTIwR0I=",highlighted:`Number of files <span class="hljs-keyword">in</span> dataset : <span class="hljs-number">20979437051</span> | |
| Dataset size (cache file) : <span class="hljs-number">19.54</span> GB`,wrap:!1}}),w=new We({props:{$$slots:{default:[Ea]},$$scope:{ctx:j}}}),se=new h({props:{code:"aW1wb3J0JTIwdGltZWl0JTBBJTBBY29kZV9zbmlwcGV0JTIwJTNEJTIwJTIyJTIyJTIyYmF0Y2hfc2l6ZSUyMCUzRCUyMDEwMDAlMEElMEFmb3IlMjBpZHglMjBpbiUyMHJhbmdlKDAlMkMlMjBsZW4ocHVibWVkX2RhdGFzZXQpJTJDJTIwYmF0Y2hfc2l6ZSklM0ElMEElMjAlMjAlMjAlMjBfJTIwJTNEJTIwcHVibWVkX2RhdGFzZXQlNUJpZHglM0FpZHglMjAlMkIlMjBiYXRjaF9zaXplJTVEJTBBJTIyJTIyJTIyJTBBJTBBdGltZSUyMCUzRCUyMHRpbWVpdC50aW1laXQoc3RtdCUzRGNvZGVfc25pcHBldCUyQyUyMG51bWJlciUzRDElMkMlMjBnbG9iYWxzJTNEZ2xvYmFscygpKSUwQXByaW50KCUwQSUyMCUyMCUyMCUyMGYlMjJJdGVyYXRlZCUyMG92ZXIlMjAlN0JsZW4ocHVibWVkX2RhdGFzZXQpJTdEJTIwZXhhbXBsZXMlMjAoYWJvdXQlMjAlN0JzaXplX2diJTNBLjFmJTdEJTIwR0IpJTIwaW4lMjAlMjIlMEElMjAlMjAlMjAlMjBmJTIyJTdCdGltZSUzQS4xZiU3RHMlMkMlMjBpLmUuJTIwJTdCc2l6ZV9nYiUyRnRpbWUlM0EuM2YlN0QlMjBHQiUyRnMlMjIlMEEp",highlighted:`<span class="hljs-keyword">import</span> timeit | |
| code_snippet = <span class="hljs-string">"""batch_size = 1000 | |
| for idx in range(0, len(pubmed_dataset), batch_size): | |
| _ = pubmed_dataset[idx:idx + batch_size] | |
| """</span> | |
| time = timeit.timeit(stmt=code_snippet, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>()) | |
| <span class="hljs-built_in">print</span>( | |
| <span class="hljs-string">f"Iterated over <span class="hljs-subst">{<span class="hljs-built_in">len</span>(pubmed_dataset)}</span> examples (about <span class="hljs-subst">{size_gb:<span class="hljs-number">.1</span>f}</span> GB) in "</span> | |
| <span class="hljs-string">f"<span class="hljs-subst">{time:<span class="hljs-number">.1</span>f}</span>s, i.e. <span class="hljs-subst">{size_gb/time:<span class="hljs-number">.3</span>f}</span> GB/s"</span> | |
| )`,wrap:!1}}),ae=new h({props:{code:"J0l0ZXJhdGVkJTIwb3ZlciUyMDE1NTE4MDA5JTIwZXhhbXBsZXMlMjAoYWJvdXQlMjAxOS41JTIwR0IpJTIwaW4lMjA2NC4ycyUyQyUyMGkuZS4lMjAwLjMwNCUyMEdCJTJGcyc=",highlighted:'<span class="hljs-string">'Iterated over 15518009 examples (about 19.5 GB) in 64.2s, i.e. 0.304 GB/s'</span>',wrap:!1}}),g=new We({props:{$$slots:{default:[Na]},$$scope:{ctx:j}}}),te=new Ls({props:{title:"Streaming di dataset",local:"streaming-di-dataset",headingTag:"h2"}}),ie=new h({props:{code:"cHVibWVkX2RhdGFzZXRfc3RyZWFtZWQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTBBJTIwJTIwJTIwJTIwJTIyanNvbiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUyMHN0cmVhbWluZyUzRFRydWUlMEEp",highlighted:`pubmed_dataset_streamed = load_dataset( | |
| <span class="hljs-string">"json"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span> | |
| )`,wrap:!1}}),oe=new h({props:{code:"bmV4dChpdGVyKHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkKSk=",highlighted:'<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pubmed_dataset_streamed))',wrap:!1}}),re=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24uJTVDblRvJTIwZGV0ZXJtaW5lJTIwdGhlJTIwcHJldmFsZW5jZSUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjBhZ2VkJTIwdW5kZXIlMjA1JTIweWVhcnMlMjBzdWZmZXJpbmclMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb25zJTIwKEFMUkkpJTJDJTIwdGhlJTIwcmlzayUyMGZhY3RvcnMlMjBmb3IlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHVuZGVyJTIwNSUyMHllYXJzJTIwb2YlMjBhZ2UlMjB3aXRoJTIwQUxSSSUyQyUyMGFuZCUyMHRoZSUyMGFzc29jaWF0aW9uJTIwb2YlMjBoeXBveGFlbWlhJTIwd2l0aCUyMGFuJTIwaW5jcmVhc2VkJTIwcmlzayUyMG9mJTIwZHlpbmclMjBpbiUyMGNoaWxkcmVuJTIwb2YlMjB0aGUlMjBzYW1lJTIwYWdlJTIwLi4uJyU3RA==",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'</span>}`,wrap:!1}}),ce=new h({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJkaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZCUyMiklMEF0b2tlbml6ZWRfZGF0YXNldCUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkLm1hcChsYW1iZGElMjB4JTNBJTIwdG9rZW5pemVyKHglNUIlMjJ0ZXh0JTIyJTVEKSklMEFuZXh0KGl0ZXIodG9rZW5pemVkX2RhdGFzZXQpKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"distilbert-base-uncased"</span>) | |
| tokenized_dataset = pubmed_dataset_streamed.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> x: tokenizer(x[<span class="hljs-string">"text"</span>])) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(tokenized_dataset))`,wrap:!1}}),me=new h({props:{code:"JTdCJ2lucHV0X2lkcyclM0ElMjAlNUIxMDElMkMlMjA0OTU4JTJDJTIwNTE3OCUyQyUyMDQzMjglMkMlMjA2Nzc5JTJDJTIwLi4uJTVEJTJDJTIwJ2F0dGVudGlvbl9tYXNrJyUzQSUyMCU1QjElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTJDJTIwLi4uJTVEJTdE",highlighted:'{<span class="hljs-string">'input_ids'</span>: [<span class="hljs-number">101</span>, <span class="hljs-number">4958</span>, <span class="hljs-number">5178</span>, <span class="hljs-number">4328</span>, <span class="hljs-number">6779</span>, ...], <span class="hljs-string">'attention_mask'</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ...]}',wrap:!1}}),U=new We({props:{$$slots:{default:[_a]},$$scope:{ctx:j}}}),ue=new h({props:{code:"c2h1ZmZsZWRfZGF0YXNldCUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkLnNodWZmbGUoYnVmZmVyX3NpemUlM0QxMF8wMDAlMkMlMjBzZWVkJTNENDIpJTBBbmV4dChpdGVyKHNodWZmbGVkX2RhdGFzZXQpKQ==",highlighted:`shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=<span class="hljs-number">10_000</span>, seed=<span class="hljs-number">42</span>) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(shuffled_dataset))`,wrap:!1}}),Je=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MTA3OTklMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnUmFuZG9taXplZCUyMHN0dWR5JTIwb2YlMjBkb3NlJTIwb3IlMjBzY2hlZHVsZSUyMG1vZGlmaWNhdGlvbiUyMG9mJTIwZ3JhbnVsb2N5dGUlMjBjb2xvbnktc3RpbXVsYXRpbmclMjBmYWN0b3IlMjBpbiUyMHBsYXRpbnVtLWJhc2VkJTIwY2hlbW90aGVyYXB5JTIwZm9yJTIwZWxkZXJseSUyMHBhdGllbnRzJTIwd2l0aCUyMGx1bmclMjBjYW5jZXIlMjAuLi4nJTdE",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11410799</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer ...'</span>}`,wrap:!1}}),he=new h({props:{code:"ZGF0YXNldF9oZWFkJTIwJTNEJTIwcHVibWVkX2RhdGFzZXRfc3RyZWFtZWQudGFrZSg1KSUwQWxpc3QoZGF0YXNldF9oZWFkKQ==",highlighted:`dataset_head = pubmed_dataset_streamed.take(<span class="hljs-number">5</span>) | |
| <span class="hljs-built_in">list</span>(dataset_head)`,wrap:!1}}),be=new h({props:{code:"JTVCJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24lMjAuLi4nJTdEJTJDJTBBJTIwJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzUlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnQ2xpbmljYWwlMjBzaWducyUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjB3aXRoJTIwYWN1dGUlMjBsb3dlciUyMHJlc3BpcmF0b3J5JTIwaW5mZWN0aW9uJTNBJTIwaW5kaWNhdG9ycyUyMG9mJTIwb3h5Z2VuJTIwdGhlcmFweSUyMC4uLiclN0QlMkMlMEElMjAlN0InbWV0YSclM0ElMjAlN0IncG1pZCclM0ElMjAxMTQwOTU3NiUyQyUyMCdsYW5ndWFnZSclM0ElMjAnZW5nJyU3RCUyQyUwQSUyMCUyMCd0ZXh0JyUzQSUyMCUyMkh5cG94YWVtaWElMjBpbiUyMGNoaWxkcmVuJTIwd2l0aCUyMHNldmVyZSUyMHBuZXVtb25pYSUyMGluJTIwUGFwdWElMjBOZXclMjBHdWluZWElMjAuLi4lMjIlN0QlMkMlMEElMjAlN0InbWV0YSclM0ElMjAlN0IncG1pZCclM0ElMjAxMTQwOTU3NyUyQyUyMCdsYW5ndWFnZSclM0ElMjAnZW5nJyU3RCUyQyUwQSUyMCUyMCd0ZXh0JyUzQSUyMCdPeHlnZW4lMjBjb25jZW50cmF0b3JzJTIwYW5kJTIwY3lsaW5kZXJzJTIwLi4uJyU3RCUyQyUwQSUyMCU3QidtZXRhJyUzQSUyMCU3QidwbWlkJyUzQSUyMDExNDA5NTc4JTJDJTIwJ2xhbmd1YWdlJyUzQSUyMCdlbmcnJTdEJTJDJTBBJTIwJTIwJ3RleHQnJTNBJTIwJ094eWdlbiUyMHN1cHBseSUyMGluJTIwcnVyYWwlMjBhZnJpY2ElM0ElMjBhJTIwcGVyc29uYWwlMjBleHBlcmllbmNlJTIwLi4uJyU3RCU1RA==",highlighted:`[{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409575</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409576</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">"Hypoxaemia in children with severe pneumonia in Papua New Guinea ..."</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409577</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Oxygen concentrators and cylinders ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409578</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Oxygen supply in rural africa: a personal experience ...'</span>}]`,wrap:!1}}),je=new h({props:{code:"JTIzJTIwU2FsdGElMjBpJTIwcHJpbWklMjAxLjAwMCUyMGVzZW1waSUyQyUyMGlsJTIwcmVzdG8lMjB2aWVuZSUyMGluY2x1c28lMjBuZWxsJ2luc2llbWUlMjBkaSUyMGFkZGVzdHJhbWVudG8lMEF0cmFpbl9kYXRhc2V0JTIwJTNEJTIwc2h1ZmZsZWRfZGF0YXNldC5za2lwKDEwMDApJTBBJTIzJTIwSW5jbHVkaSUyMGklMjBwcmltaSUyMDEuMDAwJTIwZXNlbXBpJTIwbmVsbCdpbnNpZW1lJTIwZGklMjB2YWxpZGF6aW9uZSUwQXZhbGlkYXRpb25fZGF0YXNldCUyMCUzRCUyMHNodWZmbGVkX2RhdGFzZXQudGFrZSgxMDAwKQ==",highlighted:`<span class="hljs-comment"># Salta i primi 1.000 esempi, il resto viene incluso nell'insieme di addestramento</span> | |
| train_dataset = shuffled_dataset.skip(<span class="hljs-number">1000</span>) | |
| <span class="hljs-comment"># Includi i primi 1.000 esempi nell'insieme di validazione</span> | |
| validation_dataset = shuffled_dataset.take(<span class="hljs-number">1000</span>)`,wrap:!1}}),we=new h({props:{code:"bGF3X2RhdGFzZXRfc3RyZWFtZWQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTBBJTIwJTIwJTIwJTIwJTIyanNvbiUyMiUyQyUwQSUyMCUyMCUyMCUyMGRhdGFfZmlsZXMlM0QlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGVfcHJlbGltaW5hcnlfY29tcG9uZW50cyUyRkZyZWVMYXdfT3BpbmlvbnMuanNvbmwuenN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUwQSUyMCUyMCUyMCUyMHN0cmVhbWluZyUzRFRydWUlMkMlMEEpJTBBbmV4dChpdGVyKGxhd19kYXRhc2V0X3N0cmVhbWVkKSk=",highlighted:`law_dataset_streamed = load_dataset( | |
| <span class="hljs-string">"json"</span>, | |
| data_files=<span class="hljs-string">"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst"</span>, | |
| split=<span class="hljs-string">"train"</span>, | |
| streaming=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(law_dataset_streamed))`,wrap:!1}}),ge=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ2Nhc2VfSUQnJTNBJTIwJzExMDkyMS5qc29uJyUyQyUwQSUyMCUyMCdjYXNlX2p1cmlzZGljdGlvbiclM0ElMjAnc2NvdHVzLnRhci5neiclMkMlMEElMjAlMjAnZGF0ZV9jcmVhdGVkJyUzQSUyMCcyMDEwLTA0LTI4VDE3JTNBMTIlM0E0OVonJTdEJTJDJTBBJTIwJ3RleHQnJTNBJTIwJyU1Q240NjElMjBVLlMuJTIwMjM4JTIwKDE5ODMpJTVDbk9MSU0lMjBFVCUyMEFMLiU1Q252LiU1Q25XQUtJTkVLT05BJTVDbk5vLiUyMDgxLTE1ODEuJTVDblN1cHJlbWUlMjBDb3VydCUyMG9mJTIwVW5pdGVkJTIwU3RhdGVzLiU1Q25Bcmd1ZWQlMjBKYW51YXJ5JTIwMTklMkMlMjAxOTgzLiU1Q25EZWNpZGVkJTIwQXByaWwlMjAyNiUyQyUyMDE5ODMuJTVDbkNFUlRJT1JBUkklMjBUTyUyMFRIRSUyMFVOSVRFRCUyMFNUQVRFUyUyMENPVVJUJTIwT0YlMjBBUFBFQUxTJTIwRk9SJTIwVEhFJTIwTklOVEglMjBDSVJDVUlUJTVDbioyMzklMjBNaWNoYWVsJTIwQS4lMjBMaWxseSUyQyUyMEZpcnN0JTIwRGVwdXR5JTIwQXR0b3JuZXklMjBHZW5lcmFsJTIwb2YlMjBIYXdhaWklMkMlMjBhcmd1ZWQlMjB0aGUlMjBjYXVzZSUyMGZvciUyMHBldGl0aW9uZXJzLiUyMFdpdGglMjBoaW0lMjBvbiUyMHRoZSUyMGJyaWVmJTIwd2FzJTIwSmFtZXMlMjBILiUyMERhbm5lbmJlcmclMkMlMjBEZXB1dHklMjBBdHRvcm5leSUyMEdlbmVyYWwuLi4nJTdE",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'case_ID'</span>: <span class="hljs-string">'110921.json'</span>, | |
| <span class="hljs-string">'case_jurisdiction'</span>: <span class="hljs-string">'scotus.tar.gz'</span>, | |
| <span class="hljs-string">'date_created'</span>: <span class="hljs-string">'2010-04-28T17:12:49Z'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'</span>}`,wrap:!1}}),Ie=new h({props:{code:"ZnJvbSUyMGl0ZXJ0b29scyUyMGltcG9ydCUyMGlzbGljZSUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGludGVybGVhdmVfZGF0YXNldHMlMEElMEFjb21iaW5lZF9kYXRhc2V0JTIwJTNEJTIwaW50ZXJsZWF2ZV9kYXRhc2V0cyglNUJwdWJtZWRfZGF0YXNldF9zdHJlYW1lZCUyQyUyMGxhd19kYXRhc2V0X3N0cmVhbWVkJTVEKSUwQWxpc3QoaXNsaWNlKGNvbWJpbmVkX2RhdGFzZXQlMkMlMjAyKSk=",highlighted:`<span class="hljs-keyword">from</span> itertools <span class="hljs-keyword">import</span> islice | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> interleave_datasets | |
| combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed]) | |
| <span class="hljs-built_in">list</span>(islice(combined_dataset, <span class="hljs-number">2</span>))`,wrap:!1}}),$e=new h({props:{code:"JTVCJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24lMjAuLi4nJTdEJTJDJTBBJTIwJTdCJ21ldGEnJTNBJTIwJTdCJ2Nhc2VfSUQnJTNBJTIwJzExMDkyMS5qc29uJyUyQyUwQSUyMCUyMCUyMCdjYXNlX2p1cmlzZGljdGlvbiclM0ElMjAnc2NvdHVzLnRhci5neiclMkMlMEElMjAlMjAlMjAnZGF0ZV9jcmVhdGVkJyUzQSUyMCcyMDEwLTA0LTI4VDE3JTNBMTIlM0E0OVonJTdEJTJDJTBBJTIwJTIwJ3RleHQnJTNBJTIwJyU1Q240NjElMjBVLlMuJTIwMjM4JTIwKDE5ODMpJTVDbk9MSU0lMjBFVCUyMEFMLiU1Q252LiU1Q25XQUtJTkVLT05BJTVDbk5vLiUyMDgxLTE1ODEuJTVDblN1cHJlbWUlMjBDb3VydCUyMG9mJTIwVW5pdGVkJTIwU3RhdGVzLiU1Q25Bcmd1ZWQlMjBKYW51YXJ5JTIwMTklMkMlMjAxOTgzLiU1Q25EZWNpZGVkJTIwQXByaWwlMjAyNiUyQyUyMDE5ODMuJTVDbkNFUlRJT1JBUkklMjBUTyUyMFRIRSUyMFVOSVRFRCUyMFNUQVRFUyUyMENPVVJUJTIwT0YlMjBBUFBFQUxTJTIwRk9SJTIwVEhFJTIwTklOVEglMjBDSVJDVUlUJTVDbioyMzklMjBNaWNoYWVsJTIwQS4lMjBMaWxseSUyQyUyMEZpcnN0JTIwRGVwdXR5JTIwQXR0b3JuZXklMjBHZW5lcmFsJTIwb2YlMjBIYXdhaWklMkMlMjBhcmd1ZWQlMjB0aGUlMjBjYXVzZSUyMGZvciUyMHBldGl0aW9uZXJzLiUyMFdpdGglMjBoaW0lMjBvbiUyMHRoZSUyMGJyaWVmJTIwd2FzJTIwSmFtZXMlMjBILiUyMERhbm5lbmJlcmclMkMlMjBEZXB1dHklMjBBdHRvcm5leSUyMEdlbmVyYWwuLi4nJTdEJTVE",highlighted:`[{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'case_ID'</span>: <span class="hljs-string">'110921.json'</span>, | |
| <span class="hljs-string">'case_jurisdiction'</span>: <span class="hljs-string">'scotus.tar.gz'</span>, | |
| <span class="hljs-string">'date_created'</span>: <span class="hljs-string">'2010-04-28T17:12:49Z'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'</span>}]`,wrap:!1}}),Ze=new h({props:{code:"YmFzZV91cmwlMjAlM0QlMjAlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGUlMkYlMjIlMEFkYXRhX2ZpbGVzJTIwJTNEJTIwJTdCJTBBJTIwJTIwJTIwJTIwJTIydHJhaW4lMjIlM0ElMjAlNUJiYXNlX3VybCUyMCUyQiUyMCUyMnRyYWluJTJGJTIyJTIwJTJCJTIwZiUyMiU3QmlkeCUzQTAyZCU3RC5qc29ubC56c3QlMjIlMjBmb3IlMjBpZHglMjBpbiUyMHJhbmdlKDMwKSU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMnZhbGlkYXRpb24lMjIlM0ElMjBiYXNlX3VybCUyMCUyQiUyMCUyMnZhbC5qc29ubC56c3QlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjJ0ZXN0JTIyJTNBJTIwYmFzZV91cmwlMjAlMkIlMjAlMjJ0ZXN0Lmpzb25sLnpzdCUyMiUyQyUwQSU3RCUwQXBpbGVfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJqc29uJTIyJTJDJTIwZGF0YV9maWxlcyUzRGRhdGFfZmlsZXMlMkMlMjBzdHJlYW1pbmclM0RUcnVlKSUwQW5leHQoaXRlcihwaWxlX2RhdGFzZXQlNUIlMjJ0cmFpbiUyMiU1RCkp",highlighted:`base_url = <span class="hljs-string">"https://the-eye.eu/public/AI/pile/"</span> | |
| data_files = { | |
| <span class="hljs-string">"train"</span>: [base_url + <span class="hljs-string">"train/"</span> + <span class="hljs-string">f"<span class="hljs-subst">{idx:02d}</span>.jsonl.zst"</span> <span class="hljs-keyword">for</span> idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">30</span>)], | |
| <span class="hljs-string">"validation"</span>: base_url + <span class="hljs-string">"val.jsonl.zst"</span>, | |
| <span class="hljs-string">"test"</span>: base_url + <span class="hljs-string">"test.jsonl.zst"</span>, | |
| } | |
| pile_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=data_files, streaming=<span class="hljs-literal">True</span>) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pile_dataset[<span class="hljs-string">"train"</span>]))`,wrap:!1}}),Ge=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BpbGVfc2V0X25hbWUnJTNBJTIwJ1BpbGUtQ0MnJTdEJTJDJTBBJTIwJ3RleHQnJTNBJTIwJ0l0JTIwaXMlMjBkb25lJTJDJTIwYW5kJTIwc3VibWl0dGVkLiUyMFlvdSUyMGNhbiUyMHBsYXklMjAlRTIlODAlOUNTdXJ2aXZhbCUyMG9mJTIwdGhlJTIwVGFzdGllc3QlRTIlODAlOUQlMjBvbiUyMEFuZHJvaWQlMkMlMjBhbmQlMjBvbiUyMHRoZSUyMHdlYi4uLiclN0Q=",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pile_set_name'</span>: <span class="hljs-string">'Pile-CC'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'</span>}`,wrap:!1}}),I=new We({props:{$$slots:{default:[Qa]},$$scope:{ctx:j}}}),Ve=new Ra({props:{source:"https://github.com/huggingface/course/blob/main/chapters/it/chapter5/4.mdx"}}),{c(){i=u("meta"),b=t(),M=u("p"),y=t(),p($.$$.fragment),ve=t(),p(B.$$.fragment),Re=t(),x=u("p"),x.innerHTML=qs,Xe=t(),Z=u("p"),Z.innerHTML=Ps,Ee=t(),p(G.$$.fragment),Ne=t(),z=u("p"),z.innerHTML=Os,_e=t(),p(V.$$.fragment),Qe=t(),k=u("p"),k.innerHTML=Ks,Ye=t(),p(W.$$.fragment),Ae=t(),C=u("p"),C.innerHTML=ea,Fe=t(),p(v.$$.fragment),He=t(),p(R.$$.fragment),Se=t(),X=u("p"),X.textContent=sa,De=t(),p(f.$$.fragment),Le=t(),E=u("p"),E.textContent=aa,qe=t(),p(N.$$.fragment),Pe=t(),p(_.$$.fragment),Oe=t(),Q=u("p"),Q.textContent=la,Ke=t(),p(Y.$$.fragment),es=t(),A=u("p"),A.innerHTML=ta,ss=t(),p(F.$$.fragment),as=t(),H=u("p"),H.innerHTML=na,ls=t(),p(S.$$.fragment),ts=t(),p(D.$$.fragment),ns=t(),L=u("p"),L.innerHTML=ia,is=t(),p(q.$$.fragment),ps=t(),p(P.$$.fragment),os=t(),O=u("p"),O.textContent=pa,rs=t(),p(w.$$.fragment),ds=t(),K=u("p"),K.innerHTML=oa,cs=t(),ee=u("p"),ee.innerHTML=ra,ms=t(),p(se.$$.fragment),Ms=t(),p(ae.$$.fragment),us=t(),le=u("p"),le.innerHTML=da,Js=t(),p(g.$$.fragment),Ts=t(),p(te.$$.fragment),hs=t(),ne=u("p"),ne.innerHTML=ca,bs=t(),p(ie.$$.fragment),ys=t(),pe=u("p"),pe.innerHTML=ma,js=t(),p(oe.$$.fragment),fs=t(),p(re.$$.fragment),ws=t(),de=u("p"),de.innerHTML=Ma,gs=t(),p(ce.$$.fragment),Us=t(),p(me.$$.fragment),Is=t(),p(U.$$.fragment),$s=t(),Me=u("p"),Me.innerHTML=ua,Bs=t(),p(ue.$$.fragment),xs=t(),p(Je.$$.fragment),Zs=t(),Te=u("p"),Te.innerHTML=Ja,Gs=t(),p(he.$$.fragment),zs=t(),p(be.$$.fragment),Vs=t(),ye=u("p"),ye.innerHTML=Ta,ks=t(),p(je.$$.fragment),Ws=t(),fe=u("p"),fe.innerHTML=ha,Cs=t(),p(we.$$.fragment),vs=t(),p(ge.$$.fragment),Rs=t(),Ue=u("p"),Ue.innerHTML=ba,Xs=t(),p(Ie.$$.fragment),Es=t(),p($e.$$.fragment),Ns=t(),Be=u("p"),Be.innerHTML=ya,_s=t(),xe=u("p"),xe.textContent=ja,Qs=t(),p(Ze.$$.fragment),Ys=t(),p(Ge.$$.fragment),As=t(),p(I.$$.fragment),Fs=t(),ze=u("p"),ze.textContent=fa,Hs=t(),p(Ve.$$.fragment),Ss=t(),ke=u("p"),this.h()},l(e){const s=ka("svelte-u9bgzb",document.head);i=J(s,"META",{name:!0,content:!0}),s.forEach(a),b=n(e),M=J(e,"P",{}),Ba(M).forEach(a),y=n(e),o($.$$.fragment,e),ve=n(e),o(B.$$.fragment,e),Re=n(e),x=J(e,"P",{"data-svelte-h":!0}),T(x)!=="svelte-ife82b"&&(x.innerHTML=qs),Xe=n(e),Z=J(e,"P",{"data-svelte-h":!0}),T(Z)!=="svelte-1kyw64i"&&(Z.innerHTML=Ps),Ee=n(e),o(G.$$.fragment,e),Ne=n(e),z=J(e,"P",{"data-svelte-h":!0}),T(z)!=="svelte-36huww"&&(z.innerHTML=Os),_e=n(e),o(V.$$.fragment,e),Qe=n(e),k=J(e,"P",{"data-svelte-h":!0}),T(k)!=="svelte-e1fzvi"&&(k.innerHTML=Ks),Ye=n(e),o(W.$$.fragment,e),Ae=n(e),C=J(e,"P",{"data-svelte-h":!0}),T(C)!=="svelte-ueahrw"&&(C.innerHTML=ea),Fe=n(e),o(v.$$.fragment,e),He=n(e),o(R.$$.fragment,e),Se=n(e),X=J(e,"P",{"data-svelte-h":!0}),T(X)!=="svelte-ldt8f5"&&(X.textContent=sa),De=n(e),o(f.$$.fragment,e),Le=n(e),E=J(e,"P",{"data-svelte-h":!0}),T(E)!=="svelte-13o6uv9"&&(E.textContent=aa),qe=n(e),o(N.$$.fragment,e),Pe=n(e),o(_.$$.fragment,e),Oe=n(e),Q=J(e,"P",{"data-svelte-h":!0}),T(Q)!=="svelte-1kk4pfk"&&(Q.textContent=la),Ke=n(e),o(Y.$$.fragment,e),es=n(e),A=J(e,"P",{"data-svelte-h":!0}),T(A)!=="svelte-ol6i82"&&(A.innerHTML=ta),ss=n(e),o(F.$$.fragment,e),as=n(e),H=J(e,"P",{"data-svelte-h":!0}),T(H)!=="svelte-1yadugw"&&(H.innerHTML=na),ls=n(e),o(S.$$.fragment,e),ts=n(e),o(D.$$.fragment,e),ns=n(e),L=J(e,"P",{"data-svelte-h":!0}),T(L)!=="svelte-dgnood"&&(L.innerHTML=ia),is=n(e),o(q.$$.fragment,e),ps=n(e),o(P.$$.fragment,e),os=n(e),O=J(e,"P",{"data-svelte-h":!0}),T(O)!=="svelte-1p89ulg"&&(O.textContent=pa),rs=n(e),o(w.$$.fragment,e),ds=n(e),K=J(e,"P",{"data-svelte-h":!0}),T(K)!=="svelte-uataba"&&(K.innerHTML=oa),cs=n(e),ee=J(e,"P",{"data-svelte-h":!0}),T(ee)!=="svelte-inyq4t"&&(ee.innerHTML=ra),ms=n(e),o(se.$$.fragment,e),Ms=n(e),o(ae.$$.fragment,e),us=n(e),le=J(e,"P",{"data-svelte-h":!0}),T(le)!=="svelte-1fz2u01"&&(le.innerHTML=da),Js=n(e),o(g.$$.fragment,e),Ts=n(e),o(te.$$.fragment,e),hs=n(e),ne=J(e,"P",{"data-svelte-h":!0}),T(ne)!=="svelte-1ejx5xr"&&(ne.innerHTML=ca),bs=n(e),o(ie.$$.fragment,e),ys=n(e),pe=J(e,"P",{"data-svelte-h":!0}),T(pe)!=="svelte-dqnei4"&&(pe.innerHTML=ma),js=n(e),o(oe.$$.fragment,e),fs=n(e),o(re.$$.fragment,e),ws=n(e),de=J(e,"P",{"data-svelte-h":!0}),T(de)!=="svelte-jp8nni"&&(de.innerHTML=Ma),gs=n(e),o(ce.$$.fragment,e),Us=n(e),o(me.$$.fragment,e),Is=n(e),o(U.$$.fragment,e),$s=n(e),Me=J(e,"P",{"data-svelte-h":!0}),T(Me)!=="svelte-18o8mco"&&(Me.innerHTML=ua),Bs=n(e),o(ue.$$.fragment,e),xs=n(e),o(Je.$$.fragment,e),Zs=n(e),Te=J(e,"P",{"data-svelte-h":!0}),T(Te)!=="svelte-dm7z72"&&(Te.innerHTML=Ja),Gs=n(e),o(he.$$.fragment,e),zs=n(e),o(be.$$.fragment,e),Vs=n(e),ye=J(e,"P",{"data-svelte-h":!0}),T(ye)!=="svelte-1lfebd3"&&(ye.innerHTML=Ta),ks=n(e),o(je.$$.fragment,e),Ws=n(e),fe=J(e,"P",{"data-svelte-h":!0}),T(fe)!=="svelte-vvs954"&&(fe.innerHTML=ha),Cs=n(e),o(we.$$.fragment,e),vs=n(e),o(ge.$$.fragment,e),Rs=n(e),Ue=J(e,"P",{"data-svelte-h":!0}),T(Ue)!=="svelte-30gh6j"&&(Ue.innerHTML=ba),Xs=n(e),o(Ie.$$.fragment,e),Es=n(e),o($e.$$.fragment,e),Ns=n(e),Be=J(e,"P",{"data-svelte-h":!0}),T(Be)!=="svelte-qtvrif"&&(Be.innerHTML=ya),_s=n(e),xe=J(e,"P",{"data-svelte-h":!0}),T(xe)!=="svelte-13odihg"&&(xe.textContent=ja),Qs=n(e),o(Ze.$$.fragment,e),Ys=n(e),o(Ge.$$.fragment,e),As=n(e),o(I.$$.fragment,e),Fs=n(e),ze=J(e,"P",{"data-svelte-h":!0}),T(ze)!=="svelte-3idnlg"&&(ze.textContent=fa),Hs=n(e),o(Ve.$$.fragment,e),Ss=n(e),ke=J(e,"P",{}),Ba(ke).forEach(a),this.h()},h(){xa(i,"name","hf:doc:metadata"),xa(i,"content",Aa)},m(e,s){Wa(document.head,i),l(e,b,s),l(e,M,s),l(e,y,s),r($,e,s),l(e,ve,s),r(B,e,s),l(e,Re,s),l(e,x,s),l(e,Xe,s),l(e,Z,s),l(e,Ee,s),r(G,e,s),l(e,Ne,s),l(e,z,s),l(e,_e,s),r(V,e,s),l(e,Qe,s),l(e,k,s),l(e,Ye,s),r(W,e,s),l(e,Ae,s),l(e,C,s),l(e,Fe,s),r(v,e,s),l(e,He,s),r(R,e,s),l(e,Se,s),l(e,X,s),l(e,De,s),r(f,e,s),l(e,Le,s),l(e,E,s),l(e,qe,s),r(N,e,s),l(e,Pe,s),r(_,e,s),l(e,Oe,s),l(e,Q,s),l(e,Ke,s),r(Y,e,s),l(e,es,s),l(e,A,s),l(e,ss,s),r(F,e,s),l(e,as,s),l(e,H,s),l(e,ls,s),r(S,e,s),l(e,ts,s),r(D,e,s),l(e,ns,s),l(e,L,s),l(e,is,s),r(q,e,s),l(e,ps,s),r(P,e,s),l(e,os,s),l(e,O,s),l(e,rs,s),r(w,e,s),l(e,ds,s),l(e,K,s),l(e,cs,s),l(e,ee,s),l(e,ms,s),r(se,e,s),l(e,Ms,s),r(ae,e,s),l(e,us,s),l(e,le,s),l(e,Js,s),r(g,e,s),l(e,Ts,s),r(te,e,s),l(e,hs,s),l(e,ne,s),l(e,bs,s),r(ie,e,s),l(e,ys,s),l(e,pe,s),l(e,js,s),r(oe,e,s),l(e,fs,s),r(re,e,s),l(e,ws,s),l(e,de,s),l(e,gs,s),r(ce,e,s),l(e,Us,s),r(me,e,s),l(e,Is,s),r(U,e,s),l(e,$s,s),l(e,Me,s),l(e,Bs,s),r(ue,e,s),l(e,xs,s),r(Je,e,s),l(e,Zs,s),l(e,Te,s),l(e,Gs,s),r(he,e,s),l(e,zs,s),r(be,e,s),l(e,Vs,s),l(e,ye,s),l(e,ks,s),r(je,e,s),l(e,Ws,s),l(e,fe,s),l(e,Cs,s),r(we,e,s),l(e,vs,s),r(ge,e,s),l(e,Rs,s),l(e,Ue,s),l(e,Xs,s),r(Ie,e,s),l(e,Es,s),r($e,e,s),l(e,Ns,s),l(e,Be,s),l(e,_s,s),l(e,xe,s),l(e,Qs,s),r(Ze,e,s),l(e,Ys,s),r(Ge,e,s),l(e,As,s),r(I,e,s),l(e,Fs,s),l(e,ze,s),l(e,Hs,s),r(Ve,e,s),l(e,Ss,s),l(e,ke,s),Ds=!0},p(e,[s]){const wa={};s&2&&(wa.$$scope={dirty:s,ctx:e}),f.$set(wa);const ga={};s&2&&(ga.$$scope={dirty:s,ctx:e}),w.$set(ga);const Ua={};s&2&&(Ua.$$scope={dirty:s,ctx:e}),g.$set(Ua);const Ia={};s&2&&(Ia.$$scope={dirty:s,ctx:e}),U.$set(Ia);const $a={};s&2&&($a.$$scope={dirty:s,ctx:e}),I.$set($a)},i(e){Ds||(d($.$$.fragment,e),d(B.$$.fragment,e),d(G.$$.fragment,e),d(V.$$.fragment,e),d(W.$$.fragment,e),d(v.$$.fragment,e),d(R.$$.fragment,e),d(f.$$.fragment,e),d(N.$$.fragment,e),d(_.$$.fragment,e),d(Y.$$.fragment,e),d(F.$$.fragment,e),d(S.$$.fragment,e),d(D.$$.fragment,e),d(q.$$.fragment,e),d(P.$$.fragment,e),d(w.$$.fragment,e),d(se.$$.fragment,e),d(ae.$$.fragment,e),d(g.$$.fragment,e),d(te.$$.fragment,e),d(ie.$$.fragment,e),d(oe.$$.fragment,e),d(re.$$.fragment,e),d(ce.$$.fragment,e),d(me.$$.fragment,e),d(U.$$.fragment,e),d(ue.$$.fragment,e),d(Je.$$.fragment,e),d(he.$$.fragment,e),d(be.$$.fragment,e),d(je.$$.fragment,e),d(we.$$.fragment,e),d(ge.$$.fragment,e),d(Ie.$$.fragment,e),d($e.$$.fragment,e),d(Ze.$$.fragment,e),d(Ge.$$.fragment,e),d(I.$$.fragment,e),d(Ve.$$.fragment,e),Ds=!0)},o(e){c($.$$.fragment,e),c(B.$$.fragment,e),c(G.$$.fragment,e),c(V.$$.fragment,e),c(W.$$.fragment,e),c(v.$$.fragment,e),c(R.$$.fragment,e),c(f.$$.fragment,e),c(N.$$.fragment,e),c(_.$$.fragment,e),c(Y.$$.fragment,e),c(F.$$.fragment,e),c(S.$$.fragment,e),c(D.$$.fragment,e),c(q.$$.fragment,e),c(P.$$.fragment,e),c(w.$$.fragment,e),c(se.$$.fragment,e),c(ae.$$.fragment,e),c(g.$$.fragment,e),c(te.$$.fragment,e),c(ie.$$.fragment,e),c(oe.$$.fragment,e),c(re.$$.fragment,e),c(ce.$$.fragment,e),c(me.$$.fragment,e),c(U.$$.fragment,e),c(ue.$$.fragment,e),c(Je.$$.fragment,e),c(he.$$.fragment,e),c(be.$$.fragment,e),c(je.$$.fragment,e),c(we.$$.fragment,e),c(ge.$$.fragment,e),c(Ie.$$.fragment,e),c($e.$$.fragment,e),c(Ze.$$.fragment,e),c(Ge.$$.fragment,e),c(I.$$.fragment,e),c(Ve.$$.fragment,e),Ds=!1},d(e){e&&(a(b),a(M),a(y),a(ve),a(Re),a(x),a(Xe),a(Z),a(Ee),a(Ne),a(z),a(_e),a(Qe),a(k),a(Ye),a(Ae),a(C),a(Fe),a(He),a(Se),a(X),a(De),a(Le),a(E),a(qe),a(Pe),a(Oe),a(Q),a(Ke),a(es),a(A),a(ss),a(as),a(H),a(ls),a(ts),a(ns),a(L),a(is),a(ps),a(os),a(O),a(rs),a(ds),a(K),a(cs),a(ee),a(ms),a(Ms),a(us),a(le),a(Js),a(Ts),a(hs),a(ne),a(bs),a(ys),a(pe),a(js),a(fs),a(ws),a(de),a(gs),a(Us),a(Is),a($s),a(Me),a(Bs),a(xs),a(Zs),a(Te),a(Gs),a(zs),a(Vs),a(ye),a(ks),a(Ws),a(fe),a(Cs),a(vs),a(Rs),a(Ue),a(Xs),a(Es),a(Ns),a(Be),a(_s),a(xe),a(Qs),a(Ys),a(As),a(Fs),a(ze),a(Hs),a(Ss),a(ke)),a(i),m($,e),m(B,e),m(G,e),m(V,e),m(W,e),m(v,e),m(R,e),m(f,e),m(N,e),m(_,e),m(Y,e),m(F,e),m(S,e),m(D,e),m(q,e),m(P,e),m(w,e),m(se,e),m(ae,e),m(g,e),m(te,e),m(ie,e),m(oe,e),m(re,e),m(ce,e),m(me,e),m(U,e),m(ue,e),m(Je,e),m(he,e),m(be,e),m(je,e),m(we,e),m(ge,e),m(Ie,e),m($e,e),m(Ze,e),m(Ge,e),m(I,e),m(Ve,e)}}}const Aa='{"title":"Big data? Ci pensa 🤗 Datasets!","local":"big-data-ci-pensa--datasets","sections":[{"title":"Cos’è Pile?","local":"cosè-pile","sections":[],"depth":2},{"title":"La magia del memory mapping","local":"la-magia-del-memory-mapping","sections":[],"depth":2},{"title":"Streaming di dataset","local":"streaming-di-dataset","sections":[],"depth":2}],"depth":1}';function Fa(j){return Ga(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Ka extends za{constructor(i){super(),Va(this,i,Fa,Ya,Za,{})}}export{Ka as component}; | |
Xet Storage Details
- Size:
- 51.2 kB
- Xet hash:
- 075f24c95e2fe1b109857df25983b9237f918e09a15b9e8183e1d3a2d8eeffb3
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.