Buckets:
| import{s as Ba,o as Ga,n as Re}from"../chunks/scheduler.37c15a92.js";import{S as Va,i as ka,g as u,s as t,r as i,A as Wa,h as J,f as a,c as n,j as Ia,u as p,x as j,k as Za,y as Ca,a as l,v as r,d,t as m,w as c}from"../chunks/index.2bf4358c.js";import{T as Ce}from"../chunks/Tip.363c041f.js";import{Y as Ra}from"../chunks/Youtube.1e50a667.js";import{C as h}from"../chunks/CodeBlock.4e987730.js";import{C as va}from"../chunks/CourseFloatingBanner.6add7356.js";import{H as Ds,E as Ea}from"../chunks/getInferenceSnippets.24b50994.js";function Na(b){let o,T='✎ Por padrão, 🤗 Datasets descompactará os arquivos necessários para carregar um dataset. Se você quiser preservar espaço no disco rígido, você pode passar <code>DownloadConfig(delete_extracted=True)</code> para o argumento <code>download_config</code> de <code>load_dataset()</code>. Consulte a <a href="https://huggingface.co/docs/datasets/package_reference/builder_classes#datasets.DownloadConfig" rel="nofollow">documentação</a> para obter mais detalhes.';return{c(){o=u("p"),o.innerHTML=T},l(M){o=J(M,"P",{"data-svelte-h":!0}),j(o)!=="svelte-a6xli5"&&(o.innerHTML=T)},m(M,y){l(M,o,y)},p:Re,d(M){M&&a(o)}}}function Xa(b){let o,T='✏️ <strong>Experimente!</strong> Escolha um dos <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">subconjuntos</a> da <code>The Pile</code> que é maior que a RAM do seu laptop ou desktop, carregue com 🤗 Datasets e meça a quantidade de RAM usada. Observe que, para obter uma medição precisa, você desejará fazer isso em um novo processo. 
Você pode encontrar os tamanhos descompactados de cada subconjunto na Tabela 1 do <a href="https://arxiv.org/abs/2101.00027" rel="nofollow">artigo do <code>The Pile</code></a>.';return{c(){o=u("p"),o.innerHTML=T},l(M){o=J(M,"P",{"data-svelte-h":!0}),j(o)!=="svelte-1lvcqa6"&&(o.innerHTML=T)},m(M,y){l(M,o,y)},p:Re,d(M){M&&a(o)}}}function _a(b){let o,T='💡 Nos notebooks Jupyter, você também pode cronometrar células usando a <a href="https://ipython.readthedocs.io/en/stable/interactive/magics.html#magic-timeit" rel="nofollow"><code>%%timeit</code> função mágica</a>.';return{c(){o=u("p"),o.innerHTML=T},l(M){o=J(M,"P",{"data-svelte-h":!0}),j(o)!=="svelte-trq3jj"&&(o.innerHTML=T)},m(M,y){l(M,o,y)},p:Re,d(M){M&&a(o)}}}function za(b){let o,T="💡 Para acelerar a tokenização com streaming você pode passar <code>batched=True</code>, como vimos na última seção. Ele processará os exemplos lote por lote; o tamanho do lote padrão é 1.000 e pode ser especificado com o argumento <code>batch_size</code>.";return{c(){o=u("p"),o.innerHTML=T},l(M){o=J(M,"P",{"data-svelte-h":!0}),j(o)!=="svelte-1kciw7b"&&(o.innerHTML=T)},m(M,y){l(M,o,y)},p:Re,d(M){M&&a(o)}}}function Qa(b){let o,T='✏️ <strong>Experimente!</strong> Use um dos grandes corpora Common Crawl como <a href="https://huggingface.co/datasets/mc4" rel="nofollow"><code>mc4</code></a> ou <a href="https://huggingface.co/datasets/oscar" rel="nofollow"><code>oscar</code></a> para criar um conjunto de dados multilíngue de streaming que represente as proporções faladas de idiomas em um país de sua escolha. 
Por exemplo, as quatro línguas nacionais na Suíça são alemão, francês, italiano e romanche, então você pode tentar criar um corpus suíço amostrando os subconjuntos do Oscar de acordo com sua proporção falada.';return{c(){o=u("p"),o.innerHTML=T},l(M){o=J(M,"P",{"data-svelte-h":!0}),j(o)!=="svelte-6wh2xc"&&(o.innerHTML=T)},m(M,y){l(M,o,y)},p:Re,d(M){M&&a(o)}}}function Ha(b){let o,T,M,y,$,ve,I,Ee,Z,Ls="Hoje em dia, não é incomum encontrar-se trabalhando com conjuntos de dados de vários gigabytes, especialmente se você planeja pré-treinar um transformer como BERT ou GPT-2 do zero. Nesses casos, até mesmo <em>carregar</em> os dados pode ser um desafio. Por exemplo, o corpus WebText usado para pré-treinar o GPT-2 consiste em mais de 8 milhões de documentos e 40 GB de texto - carregar isso na RAM do seu laptop provavelmente lhe causará um ataque cardíaco!",Ne,B,Ps="Felizmente, 🤗 Datasets foram projetados para superar essas limitações. Ele libera você de problemas de gerenciamento de memória tratando conjuntos de dados como arquivos <em>memory-mapped</em> e de limites de disco rígido por <em>streaming</em> das entradas em um corpus.",Xe,G,_e,V,Os='Nesta seção, exploraremos esses recursos de 🤗 Conjuntos de dados com um enorme corpus de 825 GB conhecido como <a href="https://pile.eleuther.ai" rel="nofollow">the Pile</a>. Vamos começar!',ze,k,Qe,W,Ks='O <code>The Pile</code> é um corpus de texto em inglês que foi criado por <a href="https://www.eleuther.ai" rel="nofollow">EleutherAI</a> para treinar modelos de linguagem em larga escala. Ele inclui uma gama diversificada de conjuntos de dados, abrangendo artigos científicos, repositórios de código do GitHub e texto da web filtrado. O corpus de treinamento está disponível em <a href="https://the-eye.eu/public/AI/pile/" rel="nofollow">blocos de 14 GB</a>, e você também pode baixar vários dos <a href="https://the-eye.eu/public/AI/pile_preliminary_components/" rel="nofollow">componentes individuais</a>. 
Vamos começar dando uma olhada no conjunto de dados PubMed Abstracts, que é um corpus de resumos de 15 milhões de publicações biomédicas no <a href="https://pubmed.ncbi.nlm.nih.gov/" rel="nofollow">PubMed</a>. O conjunto de dados está em <a href="https://jsonlines.org" rel="nofollow">formato JSON Lines</a> e é compactado usando a biblioteca <code>zstandard</code>, então primeiro precisamos instalá-lo:',He,C,Ye,R,ea='Em seguida, podemos carregar o conjunto de dados usando o método para arquivos remotos que aprendemos na <a href="/course/chapter5/2">seção 2</a>:',Ae,v,Fe,E,Se,N,sa="Podemos ver que há 15.518.009 linhas e 2 colunas em nosso conjunto de dados - isso é muito!",qe,f,De,X,aa="Vamos inspecionar o conteúdo do primeiro exemplo:",Le,_,Pe,z,Oe,Q,la="Ok, isso parece o resumo de um artigo médico. Agora vamos ver quanta RAM usamos para carregar o conjunto de dados!",Ke,H,es,Y,ta='Uma maneira simples de medir o uso de memória em Python é com a biblioteca <a href="https://psutil.readthedocs.io/en/latest/" rel="nofollow"><code>psutil</code></a>, que pode ser instalada com <code>pip</code> da seguinte forma:',ss,A,as,F,na="Ele fornece uma classe <code>Process</code> que nos permite verificar o uso de memória do processo atual da seguinte forma:",ls,S,ts,q,ns,D,oa="Aqui o atributo <code>rss</code> refere-se ao <em>tamanho do conjunto residente</em>, que é a fração de memória que um processo ocupa na RAM. Essa medida também inclui a memória usada pelo interpretador Python e as bibliotecas que carregamos, portanto, a quantidade real de memória usada para carregar o conjunto de dados é um pouco menor. Para comparação, vamos ver o tamanho do conjunto de dados no disco, usando o atributo <code>dataset_size</code>. 
Como o resultado é expresso em bytes como antes, precisamos convertê-lo manualmente para gigabytes:",os,L,is,P,ps,O,ia="Legal — apesar de ter quase 20 GB de tamanho, podemos carregar e acessar o conjunto de dados com muito menos RAM!",rs,w,ds,K,pa='Se você estiver familiarizado com Pandas, esse resultado pode ser uma surpresa por causa da famosa [regra de ouro] de Wes Kinney (<a href="https://wesmckinney.com/blog/apache-arrow-pandas-internals/" rel="nofollow">https://wesmckinney.com/blog/apache-arrow-pandas-internals/</a>) de que você normalmente precisa de 5 para 10 vezes mais RAM do que o tamanho do seu conjunto de dados. Então, como 🤗 Datasets resolve esse problema de gerenciamento de memória? 🤗 Os conjuntos de dados tratam cada conjunto de dados como um <a href="https://en.wikipedia.org/wiki/Memory-mapped_file" rel="nofollow">arquivo mapeado em memória</a>, que fornece um mapeamento entre RAM e armazenamento do sistema de arquivos que permite que a biblioteca acesse e opere em elementos do conjunto de dados sem precisar carregá-lo totalmente na memória.',ms,ee,ra='Arquivos mapeados em memória também podem ser compartilhados em vários processos, o que permite que métodos como <code>Dataset.map()</code> sejam paralelizados sem a necessidade de mover ou copiar o conjunto de dados. Sob o capô, esses recursos são todos realizados pelo formato de memória <a href="https://arrow.apache.org" rel="nofollow">Apache Arrow</a> e <a href="https://arrow.apache.org/docs/python/index.html" rel="nofollow"><code>pyarrow</code></a>, que tornam o carregamento e o processamento de dados extremamente rápidos. (Para mais detalhes sobre o Apache Arrow e comparações com o Pandas, confira <a href="https://towardsdatascience.com/apache-arrow-read-dataframe-with-zero-memory-69634092b1a" rel="nofollow">post do blog de Dejan Simic</a>.) 
Para ver isso em ação, vamos executar um pequeno teste de velocidade iterando sobre todos os elementos no conjunto de dados PubMed Abstracts:',cs,se,Ms,ae,us,le,da="Aqui usamos o módulo <code>timeit</code> do Python para medir o tempo de execução do <code>code_snippet</code>. Normalmente, você poderá iterar em um conjunto de dados a uma velocidade de alguns décimos de GB/s a vários GB/s. Isso funciona muito bem para a grande maioria dos aplicativos, mas às vezes você terá que trabalhar com um conjunto de dados grande demais para ser armazenado no disco rígido do seu laptop. Por exemplo, se tentássemos baixar o Pile por completo, precisaríamos de 825 GB de espaço livre em disco! Para lidar com esses casos, 🤗 Datasets fornece um recurso de streaming que nos permite baixar e acessar elementos em tempo real, sem a necessidade de baixar todo o conjunto de dados. Vamos dar uma olhada em como isso funciona.",Js,U,js,te,hs,ne,ma="Para habilitar o streaming do conjunto de dados você só precisa passar o argumento <code>streaming=True</code> para a função <code>load_dataset()</code>. Por exemplo, vamos carregar o conjunto de dados PubMed Abstracts novamente, mas em modo streaming:",Ts,oe,ys,ie,ca="Em vez do familiar <code>Dataset</code> que encontramos em outro lugar neste capítulo, o objeto retornado com <code>streaming=True</code> é um <code>IterableDataset</code>. Como o nome sugere, para acessar os elementos de um <code>IterableDataset</code> precisamos iterar sobre ele. Podemos acessar o primeiro elemento do nosso conjunto de dados transmitido da seguinte forma:",bs,pe,fs,re,ws,de,Ma='Os elementos de um conjunto de dados transmitido podem ser processados dinamicamente usando <code>IterableDataset.map()</code>, o que é útil durante o treinamento se você precisar tokenizar as entradas. 
O processo é exatamente o mesmo que usamos para tokenizar nosso conjunto de dados no <a href="/course/chapter3">Capítulo 3</a>, com a única diferença de que as saídas são retornadas uma a uma:',Us,me,gs,ce,xs,g,$s,Me,ua="Você também pode embaralhar um conjunto de dados transmitido usando <code>IterableDataset.shuffle()</code>, mas, diferentemente de <code>Dataset.shuffle()</code>, isso apenas embaralha os elementos em um <code>buffer_size</code> predefinido:",Is,ue,Zs,Je,Bs,je,Ja="Neste exemplo, selecionamos um exemplo aleatório dos primeiros 10.000 exemplos no buffer. Uma vez que um exemplo é acessado, seu lugar no buffer é preenchido com o próximo exemplo no corpus (ou seja, o 10.001º exemplo no caso acima). Você também pode selecionar elementos de um conjunto de dados transmitido usando as funções <code>IterableDataset.take()</code> e <code>IterableDataset.skip()</code>, que agem de maneira semelhante a <code>Dataset.select()</code>. Por exemplo, para selecionar os primeiros 5 exemplos no conjunto de dados PubMed Abstracts, podemos fazer o seguinte:",Gs,he,Vs,Te,ks,ye,ja="Da mesma forma, você pode usar a função <code>IterableDataset.skip()</code> para criar divisões de treinamento e validação de um conjunto de dados embaralhado da seguinte forma:",Ws,be,Cs,fe,ha="Vamos completar nossa exploração de streaming de conjuntos de dados com um aplicativo comum: combinar vários conjuntos de dados para criar um único corpus. 🤗 Datasets fornece uma função <code>interleave_datasets()</code> que converte uma lista de objetos <code>IterableDataset</code> em um único <code>IterableDataset</code>, onde os elementos do novo conjunto de dados são obtidos alternando entre os exemplos de origem. 
Essa função é especialmente útil quando você está tentando combinar grandes conjuntos de dados, então, como exemplo, vamos transmitir o subconjunto FreeLaw do Pile, que é um conjunto de dados de 51 GB de pareceres jurídicos dos tribunais dos EUA:",Rs,we,vs,Ue,Es,ge,Ta="Esse conjunto de dados é grande o suficiente para sobrecarregar a RAM da maioria dos laptops, mas conseguimos carregá-lo e acessá-lo sem suar a camisa! Vamos agora combinar os exemplos dos conjuntos de dados FreeLaw e PubMed Abstracts com a função <code>interleave_datasets()</code>:",Ns,xe,Xs,$e,_s,Ie,ya="Aqui usamos a função <code>islice()</code> do módulo <code>itertools</code> do Python para selecionar os dois primeiros exemplos do conjunto de dados combinado e podemos ver que eles correspondem aos primeiros exemplos de cada um dos dois conjuntos de dados de origem.",zs,Ze,ba="Por fim, se você quiser transmitir o Pile em sua totalidade de 825 GB, poderá pegar todos os arquivos preparados da seguinte maneira:",Qs,Be,Hs,Ge,Ys,x,As,Ve,fa="Agora você tem todas as ferramentas necessárias para carregar e processar conjuntos de dados de todas as formas e tamanhos, mas, a menos que tenha muita sorte, chegará um ponto em sua jornada de PNL em que você terá que criar um conjunto de dados para resolver o problema. problema em mãos. Esse é o tema da próxima seção!",Fs,ke,Ss,We,qs;return $=new Ds({props:{title:"Big data? 
🤗 Datasets ao resgate",local:"big-data--datasets-ao-resgate",headingTag:"h1"}}),I=new va({props:{chapter:5,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/pt/chapter5/section4.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/pt/chapter5/section4.ipynb"}]}}),G=new Ra({props:{id:"JwISwTCPPWo"}}),k=new Ds({props:{title:"O que é the Pile?",local:"o-que-é-the-pile",headingTag:"h2"}}),C=new h({props:{code:"IXBpcCUyMGluc3RhbGwlMjB6c3RhbmRhcmQ=",highlighted:"!pip install zstandard",wrap:!1}}),v=new h({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBJTIzJTIwVGhpcyUyMHRha2VzJTIwYSUyMGZldyUyMG1pbnV0ZXMlMjB0byUyMHJ1biUyQyUyMHNvJTIwZ28lMjBncmFiJTIwYSUyMHRlYSUyMG9yJTIwY29mZmVlJTIwd2hpbGUlMjB5b3UlMjB3YWl0JTIwJTNBKSUwQWRhdGFfZmlsZXMlMjAlM0QlMjAlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGVfcHJlbGltaW5hcnlfY29tcG9uZW50cyUyRlBVQk1FRF90aXRsZV9hYnN0cmFjdHNfMjAxOV9iYXNlbGluZS5qc29ubC56c3QlMjIlMEFwdWJtZWRfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJqc29uJTIyJTJDJTIwZGF0YV9maWxlcyUzRGRhdGFfZmlsZXMlMkMlMjBzcGxpdCUzRCUyMnRyYWluJTIyKSUwQXB1Ym1lZF9kYXRhc2V0",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-comment"># This takes a few minutes to run, so go grab a tea or coffee while you wait :)</span> | |
| data_files = <span class="hljs-string">"https://the-eye.eu/public/AI/pile_preliminary_components/PUBMED_title_abstracts_2019_baseline.jsonl.zst"</span> | |
| pubmed_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>) | |
| pubmed_dataset`,wrap:!1}}),E=new h({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1QidtZXRhJyUyQyUyMCd0ZXh0JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwMTU1MTgwMDklMEElN0Qp",highlighted:`Dataset({ | |
| features: [<span class="hljs-string">'meta'</span>, <span class="hljs-string">'text'</span>], | |
| num_rows: <span class="hljs-number">15518009</span> | |
| })`,wrap:!1}}),f=new Ce({props:{$$slots:{default:[Na]},$$scope:{ctx:b}}}),_=new h({props:{code:"cHVibWVkX2RhdGFzZXQlNUIwJTVE",highlighted:'pubmed_dataset[<span class="hljs-number">0</span>]',wrap:!1}}),z=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24uJTVDblRvJTIwZGV0ZXJtaW5lJTIwdGhlJTIwcHJldmFsZW5jZSUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjBhZ2VkJTIwdW5kZXIlMjA1JTIweWVhcnMlMjBzdWZmZXJpbmclMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb25zJTIwKEFMUkkpJTJDJTIwdGhlJTIwcmlzayUyMGZhY3RvcnMlMjBmb3IlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHVuZGVyJTIwNSUyMHllYXJzJTIwb2YlMjBhZ2UlMjB3aXRoJTIwQUxSSSUyQyUyMGFuZCUyMHRoZSUyMGFzc29jaWF0aW9uJTIwb2YlMjBoeXBveGFlbWlhJTIwd2l0aCUyMGFuJTIwaW5jcmVhc2VkJTIwcmlzayUyMG9mJTIwZHlpbmclMjBpbiUyMGNoaWxkcmVuJTIwb2YlMjB0aGUlMjBzYW1lJTIwYWdlJTIwLi4uJyU3RA==",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'</span>}`,wrap:!1}}),H=new Ds({props:{title:"A magia do mapeamento de memória",local:"a-magia-do-mapeamento-de-memória",headingTag:"h2"}}),A=new h({props:{code:"IXBpcCUyMGluc3RhbGwlMjBwc3V0aWw=",highlighted:"!pip install psutil",wrap:!1}}),S=new h({props:{code:"aW1wb3J0JTIwcHN1dGlsJTBBJTBBJTIzJTIwUHJvY2Vzcy5tZW1vcnlfaW5mbyUyMGlzJTIwZXhwcmVzc2VkJTIwaW4lMjBieXRlcyUyQyUyMHNvJTIwY29udmVydCUyMHRvJTIwbWVnYWJ5dGVzJTBBcHJpbnQoZiUyMlJBTSUyMHVzZWQlM0ElMjAlN0Jwc3V0aWwuUHJvY2VzcygpLm1lbW9yeV9pbmZvKCkucnNzJTIwJTJGJTIwKDEwMjQlMjAqJTIwMTAyNCklM0EuMmYlN0QlMjBNQiUyMik=",highlighted:`<span class="hljs-keyword">import</span> psutil | |
| <span class="hljs-comment"># Process.memory_info is expressed in bytes, so convert to megabytes</span> | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"RAM used: <span class="hljs-subst">{psutil.Process().memory_info().rss / (<span class="hljs-number">1024</span> * <span class="hljs-number">1024</span>):<span class="hljs-number">.2</span>f}</span> MB"</span>)`,wrap:!1}}),q=new h({props:{code:"UkFNJTIwdXNlZCUzQSUyMDU2NzguMzMlMjBNQg==",highlighted:'RAM used: <span class="hljs-number">5678.33</span> MB',wrap:!1}}),L=new h({props:{code:"cHJpbnQoZiUyMk51bWJlciUyMG9mJTIwZmlsZXMlMjBpbiUyMGRhdGFzZXQlMjAlM0ElMjAlN0JwdWJtZWRfZGF0YXNldC5kYXRhc2V0X3NpemUlN0QlMjIpJTBBc2l6ZV9nYiUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0LmRhdGFzZXRfc2l6ZSUyMCUyRiUyMCgxMDI0KiozKSUwQXByaW50KGYlMjJEYXRhc2V0JTIwc2l6ZSUyMChjYWNoZSUyMGZpbGUpJTIwJTNBJTIwJTdCc2l6ZV9nYiUzQS4yZiU3RCUyMEdCJTIyKQ==",highlighted:`<span class="hljs-built_in">print</span>(<span class="hljs-string">f"Number of files in dataset : <span class="hljs-subst">{pubmed_dataset.dataset_size}</span>"</span>) | |
| size_gb = pubmed_dataset.dataset_size / (<span class="hljs-number">1024</span>**<span class="hljs-number">3</span>) | |
| <span class="hljs-built_in">print</span>(<span class="hljs-string">f"Dataset size (cache file) : <span class="hljs-subst">{size_gb:<span class="hljs-number">.2</span>f}</span> GB"</span>)`,wrap:!1}}),P=new h({props:{code:"TnVtYmVyJTIwb2YlMjBmaWxlcyUyMGluJTIwZGF0YXNldCUyMCUzQSUyMDIwOTc5NDM3MDUxJTBBRGF0YXNldCUyMHNpemUlMjAoY2FjaGUlMjBmaWxlKSUyMCUzQSUyMDE5LjU0JTIwR0I=",highlighted:`Number of files <span class="hljs-keyword">in</span> dataset : <span class="hljs-number">20979437051</span> | |
| Dataset size (cache file) : <span class="hljs-number">19.54</span> GB`,wrap:!1}}),w=new Ce({props:{$$slots:{default:[Xa]},$$scope:{ctx:b}}}),se=new h({props:{code:"aW1wb3J0JTIwdGltZWl0JTBBJTBBY29kZV9zbmlwcGV0JTIwJTNEJTIwJTIyJTIyJTIyYmF0Y2hfc2l6ZSUyMCUzRCUyMDEwMDAlMEElMEFmb3IlMjBpZHglMjBpbiUyMHJhbmdlKDAlMkMlMjBsZW4ocHVibWVkX2RhdGFzZXQpJTJDJTIwYmF0Y2hfc2l6ZSklM0ElMEElMjAlMjAlMjAlMjBfJTIwJTNEJTIwcHVibWVkX2RhdGFzZXQlNUJpZHglM0FpZHglMjAlMkIlMjBiYXRjaF9zaXplJTVEJTBBJTIyJTIyJTIyJTBBJTBBdGltZSUyMCUzRCUyMHRpbWVpdC50aW1laXQoc3RtdCUzRGNvZGVfc25pcHBldCUyQyUyMG51bWJlciUzRDElMkMlMjBnbG9iYWxzJTNEZ2xvYmFscygpKSUwQXByaW50KCUwQSUyMCUyMCUyMCUyMGYlMjJJdGVyYXRlZCUyMG92ZXIlMjAlN0JsZW4ocHVibWVkX2RhdGFzZXQpJTdEJTIwZXhhbXBsZXMlMjAoYWJvdXQlMjAlN0JzaXplX2diJTNBLjFmJTdEJTIwR0IpJTIwaW4lMjAlMjIlMEElMjAlMjAlMjAlMjBmJTIyJTdCdGltZSUzQS4xZiU3RHMlMkMlMjBpLmUuJTIwJTdCc2l6ZV9nYiUyRnRpbWUlM0EuM2YlN0QlMjBHQiUyRnMlMjIlMEEp",highlighted:`<span class="hljs-keyword">import</span> timeit | |
| code_snippet = <span class="hljs-string">"""batch_size = 1000 | |
| for idx in range(0, len(pubmed_dataset), batch_size): | |
| _ = pubmed_dataset[idx:idx + batch_size] | |
| """</span> | |
| time = timeit.timeit(stmt=code_snippet, number=<span class="hljs-number">1</span>, <span class="hljs-built_in">globals</span>=<span class="hljs-built_in">globals</span>()) | |
| <span class="hljs-built_in">print</span>( | |
| <span class="hljs-string">f"Iterated over <span class="hljs-subst">{<span class="hljs-built_in">len</span>(pubmed_dataset)}</span> examples (about <span class="hljs-subst">{size_gb:<span class="hljs-number">.1</span>f}</span> GB) in "</span> | |
| <span class="hljs-string">f"<span class="hljs-subst">{time:<span class="hljs-number">.1</span>f}</span>s, i.e. <span class="hljs-subst">{size_gb/time:<span class="hljs-number">.3</span>f}</span> GB/s"</span> | |
| )`,wrap:!1}}),ae=new h({props:{code:"J0l0ZXJhdGVkJTIwb3ZlciUyMDE1NTE4MDA5JTIwZXhhbXBsZXMlMjAoYWJvdXQlMjAxOS41JTIwR0IpJTIwaW4lMjA2NC4ycyUyQyUyMGkuZS4lMjAwLjMwNCUyMEdCJTJGcyc=",highlighted:'<span class="hljs-string">'Iterated over 15518009 examples (about 19.5 GB) in 64.2s, i.e. 0.304 GB/s'</span>',wrap:!1}}),U=new Ce({props:{$$slots:{default:[_a]},$$scope:{ctx:b}}}),te=new Ds({props:{title:"Conjuntos de dados em streaming",local:"conjuntos-de-dados-em-streaming",headingTag:"h2"}}),oe=new h({props:{code:"cHVibWVkX2RhdGFzZXRfc3RyZWFtZWQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTBBJTIwJTIwJTIwJTIwJTIyanNvbiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUyMHN0cmVhbWluZyUzRFRydWUlMEEp",highlighted:`pubmed_dataset_streamed = load_dataset( | |
| <span class="hljs-string">"json"</span>, data_files=data_files, split=<span class="hljs-string">"train"</span>, streaming=<span class="hljs-literal">True</span> | |
| )`,wrap:!1}}),pe=new h({props:{code:"bmV4dChpdGVyKHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkKSk=",highlighted:'<span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pubmed_dataset_streamed))',wrap:!1}}),re=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24uJTVDblRvJTIwZGV0ZXJtaW5lJTIwdGhlJTIwcHJldmFsZW5jZSUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjBhZ2VkJTIwdW5kZXIlMjA1JTIweWVhcnMlMjBzdWZmZXJpbmclMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb25zJTIwKEFMUkkpJTJDJTIwdGhlJTIwcmlzayUyMGZhY3RvcnMlMjBmb3IlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHVuZGVyJTIwNSUyMHllYXJzJTIwb2YlMjBhZ2UlMjB3aXRoJTIwQUxSSSUyQyUyMGFuZCUyMHRoZSUyMGFzc29jaWF0aW9uJTIwb2YlMjBoeXBveGFlbWlhJTIwd2l0aCUyMGFuJTIwaW5jcmVhc2VkJTIwcmlzayUyMG9mJTIwZHlpbmclMjBpbiUyMGNoaWxkcmVuJTIwb2YlMjB0aGUlMjBzYW1lJTIwYWdlJTIwLi4uJyU3RA==",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection.\\nTo determine the prevalence of hypoxaemia in children aged under 5 years suffering acute lower respiratory infections (ALRI), the risk factors for hypoxaemia in children under 5 years of age with ALRI, and the association of hypoxaemia with an increased risk of dying in children of the same age ...'</span>}`,wrap:!1}}),me=new h({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZCglMjJkaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZCUyMiklMEF0b2tlbml6ZWRfZGF0YXNldCUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkLm1hcChsYW1iZGElMjB4JTNBJTIwdG9rZW5pemVyKHglNUIlMjJ0ZXh0JTIyJTVEKSklMEFuZXh0KGl0ZXIodG9rZW5pemVkX2RhdGFzZXQpKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| tokenizer = AutoTokenizer.from_pretrained(<span class="hljs-string">"distilbert-base-uncased"</span>) | |
| tokenized_dataset = pubmed_dataset_streamed.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> x: tokenizer(x[<span class="hljs-string">"text"</span>])) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(tokenized_dataset))`,wrap:!1}}),ce=new h({props:{code:"JTdCJ2lucHV0X2lkcyclM0ElMjAlNUIxMDElMkMlMjA0OTU4JTJDJTIwNTE3OCUyQyUyMDQzMjglMkMlMjA2Nzc5JTJDJTIwLi4uJTVEJTJDJTIwJ2F0dGVudGlvbl9tYXNrJyUzQSUyMCU1QjElMkMlMjAxJTJDJTIwMSUyQyUyMDElMkMlMjAxJTJDJTIwLi4uJTVEJTdE",highlighted:'{<span class="hljs-string">'input_ids'</span>: [<span class="hljs-number">101</span>, <span class="hljs-number">4958</span>, <span class="hljs-number">5178</span>, <span class="hljs-number">4328</span>, <span class="hljs-number">6779</span>, ...], <span class="hljs-string">'attention_mask'</span>: [<span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, <span class="hljs-number">1</span>, ...]}',wrap:!1}}),g=new Ce({props:{$$slots:{default:[za]},$$scope:{ctx:b}}}),ue=new h({props:{code:"c2h1ZmZsZWRfZGF0YXNldCUyMCUzRCUyMHB1Ym1lZF9kYXRhc2V0X3N0cmVhbWVkLnNodWZmbGUoYnVmZmVyX3NpemUlM0QxMF8wMDAlMkMlMjBzZWVkJTNENDIpJTBBbmV4dChpdGVyKHNodWZmbGVkX2RhdGFzZXQpKQ==",highlighted:`shuffled_dataset = pubmed_dataset_streamed.shuffle(buffer_size=<span class="hljs-number">10_000</span>, seed=<span class="hljs-number">42</span>) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(shuffled_dataset))`,wrap:!1}}),Je=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MTA3OTklMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAndGV4dCclM0ElMjAnUmFuZG9taXplZCUyMHN0dWR5JTIwb2YlMjBkb3NlJTIwb3IlMjBzY2hlZHVsZSUyMG1vZGlmaWNhdGlvbiUyMG9mJTIwZ3JhbnVsb2N5dGUlMjBjb2xvbnktc3RpbXVsYXRpbmclMjBmYWN0b3IlMjBpbiUyMHBsYXRpbnVtLWJhc2VkJTIwY2hlbW90aGVyYXB5JTIwZm9yJTIwZWxkZXJseSUyMHBhdGllbnRzJTIwd2l0aCUyMGx1bmclMjBjYW5jZXIlMjAuLi4nJTdE",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11410799</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Randomized study of dose or schedule modification of granulocyte colony-stimulating factor in platinum-based chemotherapy for elderly patients with lung cancer ...'</span>}`,wrap:!1}}),he=new h({props:{code:"ZGF0YXNldF9oZWFkJTIwJTNEJTIwcHVibWVkX2RhdGFzZXRfc3RyZWFtZWQudGFrZSg1KSUwQWxpc3QoZGF0YXNldF9oZWFkKQ==",highlighted:`dataset_head = pubmed_dataset_streamed.take(<span class="hljs-number">5</span>) | |
| <span class="hljs-built_in">list</span>(dataset_head)`,wrap:!1}}),Te=new h({props:{code:"JTVCJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24lMjAuLi4nJTdEJTJDJTBBJTIwJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzUlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnQ2xpbmljYWwlMjBzaWducyUyMG9mJTIwaHlwb3hhZW1pYSUyMGluJTIwY2hpbGRyZW4lMjB3aXRoJTIwYWN1dGUlMjBsb3dlciUyMHJlc3BpcmF0b3J5JTIwaW5mZWN0aW9uJTNBJTIwaW5kaWNhdG9ycyUyMG9mJTIwb3h5Z2VuJTIwdGhlcmFweSUyMC4uLiclN0QlMkMlMEElMjAlN0InbWV0YSclM0ElMjAlN0IncG1pZCclM0ElMjAxMTQwOTU3NiUyQyUyMCdsYW5ndWFnZSclM0ElMjAnZW5nJyU3RCUyQyUwQSUyMCUyMCd0ZXh0JyUzQSUyMCUyMkh5cG94YWVtaWElMjBpbiUyMGNoaWxkcmVuJTIwd2l0aCUyMHNldmVyZSUyMHBuZXVtb25pYSUyMGluJTIwUGFwdWElMjBOZXclMjBHdWluZWElMjAuLi4lMjIlN0QlMkMlMEElMjAlN0InbWV0YSclM0ElMjAlN0IncG1pZCclM0ElMjAxMTQwOTU3NyUyQyUyMCdsYW5ndWFnZSclM0ElMjAnZW5nJyU3RCUyQyUwQSUyMCUyMCd0ZXh0JyUzQSUyMCdPeHlnZW4lMjBjb25jZW50cmF0b3JzJTIwYW5kJTIwY3lsaW5kZXJzJTIwLi4uJyU3RCUyQyUwQSUyMCU3QidtZXRhJyUzQSUyMCU3QidwbWlkJyUzQSUyMDExNDA5NTc4JTJDJTIwJ2xhbmd1YWdlJyUzQSUyMCdlbmcnJTdEJTJDJTBBJTIwJTIwJ3RleHQnJTNBJTIwJ094eWdlbiUyMHN1cHBseSUyMGluJTIwcnVyYWwlMjBhZnJpY2ElM0ElMjBhJTIwcGVyc29uYWwlMjBleHBlcmllbmNlJTIwLi4uJyU3RCU1RA==",highlighted:`[{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409575</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Clinical signs of hypoxaemia in children with acute lower respiratory infection: indicators of oxygen therapy ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409576</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">"Hypoxaemia in children with severe pneumonia in Papua New Guinea ..."</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409577</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Oxygen concentrators and cylinders ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409578</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Oxygen supply in rural africa: a personal experience ...'</span>}]`,wrap:!1}}),be=new h({props:{code:"JTIzJTIwU2tpcCUyMHRoZSUyMGZpcnN0JTIwMSUyQzAwMCUyMGV4YW1wbGVzJTIwYW5kJTIwaW5jbHVkZSUyMHRoZSUyMHJlc3QlMjBpbiUyMHRoZSUyMHRyYWluaW5nJTIwc2V0JTBBdHJhaW5fZGF0YXNldCUyMCUzRCUyMHNodWZmbGVkX2RhdGFzZXQuc2tpcCgxMDAwKSUwQSUyMyUyMFRha2UlMjB0aGUlMjBmaXJzdCUyMDElMkMwMDAlMjBleGFtcGxlcyUyMGZvciUyMHRoZSUyMHZhbGlkYXRpb24lMjBzZXQlMEF2YWxpZGF0aW9uX2RhdGFzZXQlMjAlM0QlMjBzaHVmZmxlZF9kYXRhc2V0LnRha2UoMTAwMCk=",highlighted:`<span class="hljs-comment"># Skip the first 1,000 examples and include the rest in the training set</span> | |
| train_dataset = shuffled_dataset.skip(<span class="hljs-number">1000</span>) | |
| <span class="hljs-comment"># Take the first 1,000 examples for the validation set</span> | |
| validation_dataset = shuffled_dataset.take(<span class="hljs-number">1000</span>)`,wrap:!1}}),we=new h({props:{code:"bGF3X2RhdGFzZXRfc3RyZWFtZWQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTBBJTIwJTIwJTIwJTIwJTIyanNvbiUyMiUyQyUwQSUyMCUyMCUyMCUyMGRhdGFfZmlsZXMlM0QlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGVfcHJlbGltaW5hcnlfY29tcG9uZW50cyUyRkZyZWVMYXdfT3BpbmlvbnMuanNvbmwuenN0JTIyJTJDJTBBJTIwJTIwJTIwJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiUyQyUwQSUyMCUyMCUyMCUyMHN0cmVhbWluZyUzRFRydWUlMkMlMEEpJTBBbmV4dChpdGVyKGxhd19kYXRhc2V0X3N0cmVhbWVkKSk=",highlighted:`law_dataset_streamed = load_dataset( | |
| <span class="hljs-string">"json"</span>, | |
| data_files=<span class="hljs-string">"https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst"</span>, | |
| split=<span class="hljs-string">"train"</span>, | |
| streaming=<span class="hljs-literal">True</span>, | |
| ) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(law_dataset_streamed))`,wrap:!1}}),Ue=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ2Nhc2VfSUQnJTNBJTIwJzExMDkyMS5qc29uJyUyQyUwQSUyMCUyMCdjYXNlX2p1cmlzZGljdGlvbiclM0ElMjAnc2NvdHVzLnRhci5neiclMkMlMEElMjAlMjAnZGF0ZV9jcmVhdGVkJyUzQSUyMCcyMDEwLTA0LTI4VDE3JTNBMTIlM0E0OVonJTdEJTJDJTBBJTIwJ3RleHQnJTNBJTIwJyU1Q240NjElMjBVLlMuJTIwMjM4JTIwKDE5ODMpJTVDbk9MSU0lMjBFVCUyMEFMLiU1Q252LiU1Q25XQUtJTkVLT05BJTVDbk5vLiUyMDgxLTE1ODEuJTVDblN1cHJlbWUlMjBDb3VydCUyMG9mJTIwVW5pdGVkJTIwU3RhdGVzLiU1Q25Bcmd1ZWQlMjBKYW51YXJ5JTIwMTklMkMlMjAxOTgzLiU1Q25EZWNpZGVkJTIwQXByaWwlMjAyNiUyQyUyMDE5ODMuJTVDbkNFUlRJT1JBUkklMjBUTyUyMFRIRSUyMFVOSVRFRCUyMFNUQVRFUyUyMENPVVJUJTIwT0YlMjBBUFBFQUxTJTIwRk9SJTIwVEhFJTIwTklOVEglMjBDSVJDVUlUJTVDbioyMzklMjBNaWNoYWVsJTIwQS4lMjBMaWxseSUyQyUyMEZpcnN0JTIwRGVwdXR5JTIwQXR0b3JuZXklMjBHZW5lcmFsJTIwb2YlMjBIYXdhaWklMkMlMjBhcmd1ZWQlMjB0aGUlMjBjYXVzZSUyMGZvciUyMHBldGl0aW9uZXJzLiUyMFdpdGglMjBoaW0lMjBvbiUyMHRoZSUyMGJyaWVmJTIwd2FzJTIwSmFtZXMlMjBILiUyMERhbm5lbmJlcmclMkMlMjBEZXB1dHklMjBBdHRvcm5leSUyMEdlbmVyYWwuLi4nJTdE",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'case_ID'</span>: <span class="hljs-string">'110921.json'</span>, | |
| <span class="hljs-string">'case_jurisdiction'</span>: <span class="hljs-string">'scotus.tar.gz'</span>, | |
| <span class="hljs-string">'date_created'</span>: <span class="hljs-string">'2010-04-28T17:12:49Z'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'</span>}`,wrap:!1}}),xe=new h({props:{code:"ZnJvbSUyMGl0ZXJ0b29scyUyMGltcG9ydCUyMGlzbGljZSUwQWZyb20lMjBkYXRhc2V0cyUyMGltcG9ydCUyMGludGVybGVhdmVfZGF0YXNldHMlMEElMEFjb21iaW5lZF9kYXRhc2V0JTIwJTNEJTIwaW50ZXJsZWF2ZV9kYXRhc2V0cyglNUJwdWJtZWRfZGF0YXNldF9zdHJlYW1lZCUyQyUyMGxhd19kYXRhc2V0X3N0cmVhbWVkJTVEKSUwQWxpc3QoaXNsaWNlKGNvbWJpbmVkX2RhdGFzZXQlMkMlMjAyKSk=",highlighted:`<span class="hljs-keyword">from</span> itertools <span class="hljs-keyword">import</span> islice | |
| <span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> interleave_datasets | |
| combined_dataset = interleave_datasets([pubmed_dataset_streamed, law_dataset_streamed]) | |
| <span class="hljs-built_in">list</span>(islice(combined_dataset, <span class="hljs-number">2</span>))`,wrap:!1}}),$e=new h({props:{code:"JTVCJTdCJ21ldGEnJTNBJTIwJTdCJ3BtaWQnJTNBJTIwMTE0MDk1NzQlMkMlMjAnbGFuZ3VhZ2UnJTNBJTIwJ2VuZyclN0QlMkMlMEElMjAlMjAndGV4dCclM0ElMjAnRXBpZGVtaW9sb2d5JTIwb2YlMjBoeXBveGFlbWlhJTIwaW4lMjBjaGlsZHJlbiUyMHdpdGglMjBhY3V0ZSUyMGxvd2VyJTIwcmVzcGlyYXRvcnklMjBpbmZlY3Rpb24lMjAuLi4nJTdEJTJDJTBBJTIwJTdCJ21ldGEnJTNBJTIwJTdCJ2Nhc2VfSUQnJTNBJTIwJzExMDkyMS5qc29uJyUyQyUwQSUyMCUyMCUyMCdjYXNlX2p1cmlzZGljdGlvbiclM0ElMjAnc2NvdHVzLnRhci5neiclMkMlMEElMjAlMjAlMjAnZGF0ZV9jcmVhdGVkJyUzQSUyMCcyMDEwLTA0LTI4VDE3JTNBMTIlM0E0OVonJTdEJTJDJTBBJTIwJTIwJ3RleHQnJTNBJTIwJyU1Q240NjElMjBVLlMuJTIwMjM4JTIwKDE5ODMpJTVDbk9MSU0lMjBFVCUyMEFMLiU1Q252LiU1Q25XQUtJTkVLT05BJTVDbk5vLiUyMDgxLTE1ODEuJTVDblN1cHJlbWUlMjBDb3VydCUyMG9mJTIwVW5pdGVkJTIwU3RhdGVzLiU1Q25Bcmd1ZWQlMjBKYW51YXJ5JTIwMTklMkMlMjAxOTgzLiU1Q25EZWNpZGVkJTIwQXByaWwlMjAyNiUyQyUyMDE5ODMuJTVDbkNFUlRJT1JBUkklMjBUTyUyMFRIRSUyMFVOSVRFRCUyMFNUQVRFUyUyMENPVVJUJTIwT0YlMjBBUFBFQUxTJTIwRk9SJTIwVEhFJTIwTklOVEglMjBDSVJDVUlUJTVDbioyMzklMjBNaWNoYWVsJTIwQS4lMjBMaWxseSUyQyUyMEZpcnN0JTIwRGVwdXR5JTIwQXR0b3JuZXklMjBHZW5lcmFsJTIwb2YlMjBIYXdhaWklMkMlMjBhcmd1ZWQlMjB0aGUlMjBjYXVzZSUyMGZvciUyMHBldGl0aW9uZXJzLiUyMFdpdGglMjBoaW0lMjBvbiUyMHRoZSUyMGJyaWVmJTIwd2FzJTIwSmFtZXMlMjBILiUyMERhbm5lbmJlcmclMkMlMjBEZXB1dHklMjBBdHRvcm5leSUyMEdlbmVyYWwuLi4nJTdEJTVE",highlighted:`[{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pmid'</span>: <span class="hljs-number">11409574</span>, <span class="hljs-string">'language'</span>: <span class="hljs-string">'eng'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'Epidemiology of hypoxaemia in children with acute lower respiratory infection ...'</span>}, | |
| {<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'case_ID'</span>: <span class="hljs-string">'110921.json'</span>, | |
| <span class="hljs-string">'case_jurisdiction'</span>: <span class="hljs-string">'scotus.tar.gz'</span>, | |
| <span class="hljs-string">'date_created'</span>: <span class="hljs-string">'2010-04-28T17:12:49Z'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'\\n461 U.S. 238 (1983)\\nOLIM ET AL.\\nv.\\nWAKINEKONA\\nNo. 81-1581.\\nSupreme Court of United States.\\nArgued January 19, 1983.\\nDecided April 26, 1983.\\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'</span>}]`,wrap:!1}}),Be=new h({props:{code:"YmFzZV91cmwlMjAlM0QlMjAlMjJodHRwcyUzQSUyRiUyRnRoZS1leWUuZXUlMkZwdWJsaWMlMkZBSSUyRnBpbGUlMkYlMjIlMEFkYXRhX2ZpbGVzJTIwJTNEJTIwJTdCJTBBJTIwJTIwJTIwJTIwJTIydHJhaW4lMjIlM0ElMjAlNUJiYXNlX3VybCUyMCUyQiUyMCUyMnRyYWluJTJGJTIyJTIwJTJCJTIwZiUyMiU3QmlkeCUzQTAyZCU3RC5qc29ubC56c3QlMjIlMjBmb3IlMjBpZHglMjBpbiUyMHJhbmdlKDMwKSU1RCUyQyUwQSUyMCUyMCUyMCUyMCUyMnZhbGlkYXRpb24lMjIlM0ElMjBiYXNlX3VybCUyMCUyQiUyMCUyMnZhbC5qc29ubC56c3QlMjIlMkMlMEElMjAlMjAlMjAlMjAlMjJ0ZXN0JTIyJTNBJTIwYmFzZV91cmwlMjAlMkIlMjAlMjJ0ZXN0Lmpzb25sLnpzdCUyMiUyQyUwQSU3RCUwQXBpbGVfZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJqc29uJTIyJTJDJTIwZGF0YV9maWxlcyUzRGRhdGFfZmlsZXMlMkMlMjBzdHJlYW1pbmclM0RUcnVlKSUwQW5leHQoaXRlcihwaWxlX2RhdGFzZXQlNUIlMjJ0cmFpbiUyMiU1RCkp",highlighted:`base_url = <span class="hljs-string">"https://the-eye.eu/public/AI/pile/"</span> | |
| data_files = { | |
| <span class="hljs-string">"train"</span>: [base_url + <span class="hljs-string">"train/"</span> + <span class="hljs-string">f"<span class="hljs-subst">{idx:02d}</span>.jsonl.zst"</span> <span class="hljs-keyword">for</span> idx <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-number">30</span>)], | |
| <span class="hljs-string">"validation"</span>: base_url + <span class="hljs-string">"val.jsonl.zst"</span>, | |
| <span class="hljs-string">"test"</span>: base_url + <span class="hljs-string">"test.jsonl.zst"</span>, | |
| } | |
| pile_dataset = load_dataset(<span class="hljs-string">"json"</span>, data_files=data_files, streaming=<span class="hljs-literal">True</span>) | |
| <span class="hljs-built_in">next</span>(<span class="hljs-built_in">iter</span>(pile_dataset[<span class="hljs-string">"train"</span>]))`,wrap:!1}}),Ge=new h({props:{code:"JTdCJ21ldGEnJTNBJTIwJTdCJ3BpbGVfc2V0X25hbWUnJTNBJTIwJ1BpbGUtQ0MnJTdEJTJDJTBBJTIwJ3RleHQnJTNBJTIwJ0l0JTIwaXMlMjBkb25lJTJDJTIwYW5kJTIwc3VibWl0dGVkLiUyMFlvdSUyMGNhbiUyMHBsYXklMjAlRTIlODAlOUNTdXJ2aXZhbCUyMG9mJTIwdGhlJTIwVGFzdGllc3QlRTIlODAlOUQlMjBvbiUyMEFuZHJvaWQlMkMlMjBhbmQlMjBvbiUyMHRoZSUyMHdlYi4uLiclN0Q=",highlighted:`{<span class="hljs-string">'meta'</span>: {<span class="hljs-string">'pile_set_name'</span>: <span class="hljs-string">'Pile-CC'</span>}, | |
| <span class="hljs-string">'text'</span>: <span class="hljs-string">'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'</span>}`,wrap:!1}}),x=new Ce({props:{$$slots:{default:[Qa]},$$scope:{ctx:b}}}),ke=new Ea({props:{source:"https://github.com/huggingface/course/blob/main/chapters/pt/chapter5/4.mdx"}}),{c(){o=u("meta"),T=t(),M=u("p"),y=t(),i($.$$.fragment),ve=t(),i(I.$$.fragment),Ee=t(),Z=u("p"),Z.innerHTML=Ls,Ne=t(),B=u("p"),B.innerHTML=Ps,Xe=t(),i(G.$$.fragment),_e=t(),V=u("p"),V.innerHTML=Os,ze=t(),i(k.$$.fragment),Qe=t(),W=u("p"),W.innerHTML=Ks,He=t(),i(C.$$.fragment),Ye=t(),R=u("p"),R.innerHTML=ea,Ae=t(),i(v.$$.fragment),Fe=t(),i(E.$$.fragment),Se=t(),N=u("p"),N.textContent=sa,qe=t(),i(f.$$.fragment),De=t(),X=u("p"),X.textContent=aa,Le=t(),i(_.$$.fragment),Pe=t(),i(z.$$.fragment),Oe=t(),Q=u("p"),Q.textContent=la,Ke=t(),i(H.$$.fragment),es=t(),Y=u("p"),Y.innerHTML=ta,ss=t(),i(A.$$.fragment),as=t(),F=u("p"),F.innerHTML=na,ls=t(),i(S.$$.fragment),ts=t(),i(q.$$.fragment),ns=t(),D=u("p"),D.innerHTML=oa,os=t(),i(L.$$.fragment),is=t(),i(P.$$.fragment),ps=t(),O=u("p"),O.textContent=ia,rs=t(),i(w.$$.fragment),ds=t(),K=u("p"),K.innerHTML=pa,ms=t(),ee=u("p"),ee.innerHTML=ra,cs=t(),i(se.$$.fragment),Ms=t(),i(ae.$$.fragment),us=t(),le=u("p"),le.innerHTML=da,Js=t(),i(U.$$.fragment),js=t(),i(te.$$.fragment),hs=t(),ne=u("p"),ne.innerHTML=ma,Ts=t(),i(oe.$$.fragment),ys=t(),ie=u("p"),ie.innerHTML=ca,bs=t(),i(pe.$$.fragment),fs=t(),i(re.$$.fragment),ws=t(),de=u("p"),de.innerHTML=Ma,Us=t(),i(me.$$.fragment),gs=t(),i(ce.$$.fragment),xs=t(),i(g.$$.fragment),$s=t(),Me=u("p"),Me.innerHTML=ua,Is=t(),i(ue.$$.fragment),Zs=t(),i(Je.$$.fragment),Bs=t(),je=u("p"),je.innerHTML=Ja,Gs=t(),i(he.$$.fragment),Vs=t(),i(Te.$$.fragment),ks=t(),ye=u("p"),ye.innerHTML=ja,Ws=t(),i(be.$$.fragment),Cs=t(),fe=u("p"),fe.innerHTML=ha,Rs=t(),i(we.$$.fragment),vs=t(),i(Ue.$$.fragment),Es=t(),ge=u("p"),ge.innerHTML=Ta,Ns=t(),i(xe.$$.fragment),Xs=t(),i($e.$$.fragm
ent),_s=t(),Ie=u("p"),Ie.innerHTML=ya,zs=t(),Ze=u("p"),Ze.textContent=ba,Qs=t(),i(Be.$$.fragment),Hs=t(),i(Ge.$$.fragment),Ys=t(),i(x.$$.fragment),As=t(),Ve=u("p"),Ve.textContent=fa,Fs=t(),i(ke.$$.fragment),Ss=t(),We=u("p"),this.h()},l(e){const s=Wa("svelte-u9bgzb",document.head);o=J(s,"META",{name:!0,content:!0}),s.forEach(a),T=n(e),M=J(e,"P",{}),Ia(M).forEach(a),y=n(e),p($.$$.fragment,e),ve=n(e),p(I.$$.fragment,e),Ee=n(e),Z=J(e,"P",{"data-svelte-h":!0}),j(Z)!=="svelte-1q486zx"&&(Z.innerHTML=Ls),Ne=n(e),B=J(e,"P",{"data-svelte-h":!0}),j(B)!=="svelte-9tb4dk"&&(B.innerHTML=Ps),Xe=n(e),p(G.$$.fragment,e),_e=n(e),V=J(e,"P",{"data-svelte-h":!0}),j(V)!=="svelte-1uw732t"&&(V.innerHTML=Os),ze=n(e),p(k.$$.fragment,e),Qe=n(e),W=J(e,"P",{"data-svelte-h":!0}),j(W)!=="svelte-1qjf1np"&&(W.innerHTML=Ks),He=n(e),p(C.$$.fragment,e),Ye=n(e),R=J(e,"P",{"data-svelte-h":!0}),j(R)!=="svelte-afsc53"&&(R.innerHTML=ea),Ae=n(e),p(v.$$.fragment,e),Fe=n(e),p(E.$$.fragment,e),Se=n(e),N=J(e,"P",{"data-svelte-h":!0}),j(N)!=="svelte-iemqix"&&(N.textContent=sa),qe=n(e),p(f.$$.fragment,e),De=n(e),X=J(e,"P",{"data-svelte-h":!0}),j(X)!=="svelte-1xjyfts"&&(X.textContent=aa),Le=n(e),p(_.$$.fragment,e),Pe=n(e),p(z.$$.fragment,e),Oe=n(e),Q=J(e,"P",{"data-svelte-h":!0}),j(Q)!=="svelte-1bdr1nw"&&(Q.textContent=la),Ke=n(e),p(H.$$.fragment,e),es=n(e),Y=J(e,"P",{"data-svelte-h":!0}),j(Y)!=="svelte-1m7fz3m"&&(Y.innerHTML=ta),ss=n(e),p(A.$$.fragment,e),as=n(e),F=J(e,"P",{"data-svelte-h":!0}),j(F)!=="svelte-3lzw0l"&&(F.innerHTML=na),ls=n(e),p(S.$$.fragment,e),ts=n(e),p(q.$$.fragment,e),ns=n(e),D=J(e,"P",{"data-svelte-h":!0}),j(D)!=="svelte-1a7sky6"&&(D.innerHTML=oa),os=n(e),p(L.$$.fragment,e),is=n(e),p(P.$$.fragment,e),ps=n(e),O=J(e,"P",{"data-svelte-h":!0}),j(O)!=="svelte-c8hqos"&&(O.textContent=ia),rs=n(e),p(w.$$.fragment,e),ds=n(e),K=J(e,"P",{"data-svelte-h":!0}),j(K)!=="svelte-lemoq5"&&(K.innerHTML=pa),ms=n(e),ee=J(e,"P",{"data-svelte-h":!0}),j(ee)!=="svelte-1x65nwt"&&(ee.innerHTML=ra),cs=n(e),p(se.$$.fragme
nt,e),Ms=n(e),p(ae.$$.fragment,e),us=n(e),le=J(e,"P",{"data-svelte-h":!0}),j(le)!=="svelte-1ffobuu"&&(le.innerHTML=da),Js=n(e),p(U.$$.fragment,e),js=n(e),p(te.$$.fragment,e),hs=n(e),ne=J(e,"P",{"data-svelte-h":!0}),j(ne)!=="svelte-teh51o"&&(ne.innerHTML=ma),Ts=n(e),p(oe.$$.fragment,e),ys=n(e),ie=J(e,"P",{"data-svelte-h":!0}),j(ie)!=="svelte-1dh8umf"&&(ie.innerHTML=ca),bs=n(e),p(pe.$$.fragment,e),fs=n(e),p(re.$$.fragment,e),ws=n(e),de=J(e,"P",{"data-svelte-h":!0}),j(de)!=="svelte-1vpifmk"&&(de.innerHTML=Ma),Us=n(e),p(me.$$.fragment,e),gs=n(e),p(ce.$$.fragment,e),xs=n(e),p(g.$$.fragment,e),$s=n(e),Me=J(e,"P",{"data-svelte-h":!0}),j(Me)!=="svelte-16pdpnn"&&(Me.innerHTML=ua),Is=n(e),p(ue.$$.fragment,e),Zs=n(e),p(Je.$$.fragment,e),Bs=n(e),je=J(e,"P",{"data-svelte-h":!0}),j(je)!=="svelte-jif4zv"&&(je.innerHTML=Ja),Gs=n(e),p(he.$$.fragment,e),Vs=n(e),p(Te.$$.fragment,e),ks=n(e),ye=J(e,"P",{"data-svelte-h":!0}),j(ye)!=="svelte-1mv032g"&&(ye.innerHTML=ja),Ws=n(e),p(be.$$.fragment,e),Cs=n(e),fe=J(e,"P",{"data-svelte-h":!0}),j(fe)!=="svelte-7b97jo"&&(fe.innerHTML=ha),Rs=n(e),p(we.$$.fragment,e),vs=n(e),p(Ue.$$.fragment,e),Es=n(e),ge=J(e,"P",{"data-svelte-h":!0}),j(ge)!=="svelte-lgjetr"&&(ge.innerHTML=Ta),Ns=n(e),p(xe.$$.fragment,e),Xs=n(e),p($e.$$.fragment,e),_s=n(e),Ie=J(e,"P",{"data-svelte-h":!0}),j(Ie)!=="svelte-14cmuu4"&&(Ie.innerHTML=ya),zs=n(e),Ze=J(e,"P",{"data-svelte-h":!0}),j(Ze)!=="svelte-sgohav"&&(Ze.textContent=ba),Qs=n(e),p(Be.$$.fragment,e),Hs=n(e),p(Ge.$$.fragment,e),Ys=n(e),p(x.$$.fragment,e),As=n(e),Ve=J(e,"P",{"data-svelte-h":!0}),j(Ve)!=="svelte-1k4043d"&&(Ve.textContent=fa),Fs=n(e),p(ke.$$.fragment,e),Ss=n(e),We=J(e,"P",{}),Ia(We).forEach(a),this.h()},h(){Za(o,"name","hf:doc:metadata"),Za(o,"content",Ya)},m(e,s){Ca(document.head,o),l(e,T,s),l(e,M,s),l(e,y,s),r($,e,s),l(e,ve,s),r(I,e,s),l(e,Ee,s),l(e,Z,s),l(e,Ne,s),l(e,B,s),l(e,Xe,s),r(G,e,s),l(e,_e,s),l(e,V,s),l(e,ze,s),r(k,e,s),l(e,Qe,s),l(e,W,s),l(e,He,s),r(C,e,s),l(e,Ye,s),l(e,R,s),l(e,Ae,s),r(v,e,s),l(e
,Fe,s),r(E,e,s),l(e,Se,s),l(e,N,s),l(e,qe,s),r(f,e,s),l(e,De,s),l(e,X,s),l(e,Le,s),r(_,e,s),l(e,Pe,s),r(z,e,s),l(e,Oe,s),l(e,Q,s),l(e,Ke,s),r(H,e,s),l(e,es,s),l(e,Y,s),l(e,ss,s),r(A,e,s),l(e,as,s),l(e,F,s),l(e,ls,s),r(S,e,s),l(e,ts,s),r(q,e,s),l(e,ns,s),l(e,D,s),l(e,os,s),r(L,e,s),l(e,is,s),r(P,e,s),l(e,ps,s),l(e,O,s),l(e,rs,s),r(w,e,s),l(e,ds,s),l(e,K,s),l(e,ms,s),l(e,ee,s),l(e,cs,s),r(se,e,s),l(e,Ms,s),r(ae,e,s),l(e,us,s),l(e,le,s),l(e,Js,s),r(U,e,s),l(e,js,s),r(te,e,s),l(e,hs,s),l(e,ne,s),l(e,Ts,s),r(oe,e,s),l(e,ys,s),l(e,ie,s),l(e,bs,s),r(pe,e,s),l(e,fs,s),r(re,e,s),l(e,ws,s),l(e,de,s),l(e,Us,s),r(me,e,s),l(e,gs,s),r(ce,e,s),l(e,xs,s),r(g,e,s),l(e,$s,s),l(e,Me,s),l(e,Is,s),r(ue,e,s),l(e,Zs,s),r(Je,e,s),l(e,Bs,s),l(e,je,s),l(e,Gs,s),r(he,e,s),l(e,Vs,s),r(Te,e,s),l(e,ks,s),l(e,ye,s),l(e,Ws,s),r(be,e,s),l(e,Cs,s),l(e,fe,s),l(e,Rs,s),r(we,e,s),l(e,vs,s),r(Ue,e,s),l(e,Es,s),l(e,ge,s),l(e,Ns,s),r(xe,e,s),l(e,Xs,s),r($e,e,s),l(e,_s,s),l(e,Ie,s),l(e,zs,s),l(e,Ze,s),l(e,Qs,s),r(Be,e,s),l(e,Hs,s),r(Ge,e,s),l(e,Ys,s),r(x,e,s),l(e,As,s),l(e,Ve,s),l(e,Fs,s),r(ke,e,s),l(e,Ss,s),l(e,We,s),qs=!0},p(e,[s]){const wa={};s&2&&(wa.$$scope={dirty:s,ctx:e}),f.$set(wa);const Ua={};s&2&&(Ua.$$scope={dirty:s,ctx:e}),w.$set(Ua);const ga={};s&2&&(ga.$$scope={dirty:s,ctx:e}),U.$set(ga);const xa={};s&2&&(xa.$$scope={dirty:s,ctx:e}),g.$set(xa);const 
$a={};s&2&&($a.$$scope={dirty:s,ctx:e}),x.$set($a)},i(e){qs||(d($.$$.fragment,e),d(I.$$.fragment,e),d(G.$$.fragment,e),d(k.$$.fragment,e),d(C.$$.fragment,e),d(v.$$.fragment,e),d(E.$$.fragment,e),d(f.$$.fragment,e),d(_.$$.fragment,e),d(z.$$.fragment,e),d(H.$$.fragment,e),d(A.$$.fragment,e),d(S.$$.fragment,e),d(q.$$.fragment,e),d(L.$$.fragment,e),d(P.$$.fragment,e),d(w.$$.fragment,e),d(se.$$.fragment,e),d(ae.$$.fragment,e),d(U.$$.fragment,e),d(te.$$.fragment,e),d(oe.$$.fragment,e),d(pe.$$.fragment,e),d(re.$$.fragment,e),d(me.$$.fragment,e),d(ce.$$.fragment,e),d(g.$$.fragment,e),d(ue.$$.fragment,e),d(Je.$$.fragment,e),d(he.$$.fragment,e),d(Te.$$.fragment,e),d(be.$$.fragment,e),d(we.$$.fragment,e),d(Ue.$$.fragment,e),d(xe.$$.fragment,e),d($e.$$.fragment,e),d(Be.$$.fragment,e),d(Ge.$$.fragment,e),d(x.$$.fragment,e),d(ke.$$.fragment,e),qs=!0)},o(e){m($.$$.fragment,e),m(I.$$.fragment,e),m(G.$$.fragment,e),m(k.$$.fragment,e),m(C.$$.fragment,e),m(v.$$.fragment,e),m(E.$$.fragment,e),m(f.$$.fragment,e),m(_.$$.fragment,e),m(z.$$.fragment,e),m(H.$$.fragment,e),m(A.$$.fragment,e),m(S.$$.fragment,e),m(q.$$.fragment,e),m(L.$$.fragment,e),m(P.$$.fragment,e),m(w.$$.fragment,e),m(se.$$.fragment,e),m(ae.$$.fragment,e),m(U.$$.fragment,e),m(te.$$.fragment,e),m(oe.$$.fragment,e),m(pe.$$.fragment,e),m(re.$$.fragment,e),m(me.$$.fragment,e),m(ce.$$.fragment,e),m(g.$$.fragment,e),m(ue.$$.fragment,e),m(Je.$$.fragment,e),m(he.$$.fragment,e),m(Te.$$.fragment,e),m(be.$$.fragment,e),m(we.$$.fragment,e),m(Ue.$$.fragment,e),m(xe.$$.fragment,e),m($e.$$.fragment,e),m(Be.$$.fragment,e),m(Ge.$$.fragment,e),m(x.$$.fragment,e),m(ke.$$.fragment,e),qs=!1},d(e){e&&(a(T),a(M),a(y),a(ve),a(Ee),a(Z),a(Ne),a(B),a(Xe),a(_e),a(V),a(ze),a(Qe),a(W),a(He),a(Ye),a(R),a(Ae),a(Fe),a(Se),a(N),a(qe),a(De),a(X),a(Le),a(Pe),a(Oe),a(Q),a(Ke),a(es),a(Y),a(ss),a(as),a(F),a(ls),a(ts),a(ns),a(D),a(os),a(is),a(ps),a(O),a(rs),a(ds),a(K),a(ms),a(ee),a(cs),a(Ms),a(us),a(le),a(Js),a(js),a(hs),a(ne),a(Ts),a(ys),a(ie),a(bs),a(fs),a(ws)
,a(de),a(Us),a(gs),a(xs),a($s),a(Me),a(Is),a(Zs),a(Bs),a(je),a(Gs),a(Vs),a(ks),a(ye),a(Ws),a(Cs),a(fe),a(Rs),a(vs),a(Es),a(ge),a(Ns),a(Xs),a(_s),a(Ie),a(zs),a(Ze),a(Qs),a(Hs),a(Ys),a(As),a(Ve),a(Fs),a(Ss),a(We)),a(o),c($,e),c(I,e),c(G,e),c(k,e),c(C,e),c(v,e),c(E,e),c(f,e),c(_,e),c(z,e),c(H,e),c(A,e),c(S,e),c(q,e),c(L,e),c(P,e),c(w,e),c(se,e),c(ae,e),c(U,e),c(te,e),c(oe,e),c(pe,e),c(re,e),c(me,e),c(ce,e),c(g,e),c(ue,e),c(Je,e),c(he,e),c(Te,e),c(be,e),c(we,e),c(Ue,e),c(xe,e),c($e,e),c(Be,e),c(Ge,e),c(x,e),c(ke,e)}}}
/**
 * Serialized JSON describing this page: its title, its anchor id ("local"),
 * and the nested list of section headings with their depth. The fragment
 * code above writes this string into the `content` attribute of the
 * <meta name="hf:doc:metadata"> element it appends to document.head.
 */
const Ya='{"title":"Big data? 🤗 Datasets ao resgate","local":"big-data--datasets-ao-resgate","sections":[{"title":"O que é the Pile?","local":"o-que-é-the-pile","sections":[],"depth":2},{"title":"A magia do mapeamento de memória","local":"a-magia-do-mapeamento-de-memória","sections":[],"depth":2},{"title":"Conjuntos de dados em streaming","local":"conjuntos-de-dados-em-streaming","sections":[],"depth":2}],"depth":1}';
/**
 * Instance function handed to `ka` by the component class below.
 * Registers a callback through `Ga` (imported as `o` from the scheduler chunk;
 * presumably Svelte's onMount - confirm against that chunk). The callback reads
 * the `fw` query parameter from the current URL and discards the value, so it
 * has no visible effect here. The parameter `b` is unused.
 * Returns an empty array (the comma expression evaluates to `[]`).
 */
function Aa(b){return Ga(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}
/**
 * Page component class. Extends `Va` (imported as `S` from the index chunk) and,
 * after calling `super()`, passes itself, the constructor options `o`, the
 * instance function `Aa`, `Ha`, `Ba` and an empty object to `ka` (imported as
 * `i` from the index chunk).
 * NOTE(review): `Ha` is not defined in the visible part of this file; it is
 * presumably the fragment factory whose body ends just above - confirm.
 */
class Ka extends Va{constructor(o){super(),ka(this,o,Aa,Ha,Ba,{})}}
/* The class is exposed to importers under the name `component`. */
export{Ka as component}; | |
Xet Storage Details
- Size:
- 51.6 kB
- Xet hash:
- 348414ff3c089e3df6d64d49364606beb02d26cee23932b108b304137b8dd17f
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.