Buckets:

rtrm's picture
download
raw
67.4 kB
import{s as et,o as st,n as Pl}from"../chunks/scheduler.37c15a92.js";import{S as lt,i as tt,g as h,s as i,r as u,A as at,h as j,f as l,c as r,j as Ll,u as U,x as f,k as Ke,l as ml,y as nt,a as t,v as y,t as m,b as al,d as M,w as J,p as nl}from"../chunks/index.2bf4358c.js";import{T as Kl}from"../chunks/Tip.363c041f.js";import{Y as ot}from"../chunks/Youtube.1e50a667.js";import{C as T}from"../chunks/CodeBlock.4e987730.js";import{C as Ol}from"../chunks/CourseFloatingBanner.6add7356.js";import{F as it}from"../chunks/FrameworkSwitchCourse.8d4d4ab6.js";import{H as Pe,E as rt}from"../chunks/getInferenceSnippets.6f5a2721.js";function dt(C){let a,p;return a=new Ol({props:{chapter:5,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/pt/chapter5/section6_tf.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/pt/chapter5/section6_tf.ipynb"}]}}),{c(){u(a.$$.fragment)},l(n){U(a.$$.fragment,n)},m(n,w){y(a,n,w),p=!0},i(n){p||(M(a.$$.fragment,n),p=!0)},o(n){m(a.$$.fragment,n),p=!1},d(n){J(a,n)}}}function mt(C){let a,p;return a=new Ol({props:{chapter:5,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/pt/chapter5/section6_pt.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/pt/chapter5/section6_pt.ipynb"}]}}),{c(){u(a.$$.fragment)},l(n){U(a.$$.fragment,n)},m(n,w){y(a,n,w),p=!0},i(n){p||(M(a.$$.fragment,n),p=!0)},o(n){m(a.$$.fragment,n),p=!1},d(n){J(a,n)}}}function Mt(C){let a,p='✏️ <strong>Experimente!</strong> Veja se você pode usar <code>Dataset.map()</code> para explodir a coluna <code>comments</code> de <code>issues_dataset</code> <em>sem</em> recorrer ao uso de Pandas. Isso é um pouco complicado; você pode achar útil para esta tarefa a seção <a href="https://huggingface.co/docs/datasets/v1.12.1/about_map_batch#batch-mapping" rel="nofollow">“Mapeamento em lote”</a> da documentação do 🤗 Dataset.';return{c(){a=h("p"),a.innerHTML=p},l(n){a=j(n,"P",{"data-svelte-h":!0}),f(a)!=="svelte-83kmej"&&(a.innerHTML=p)},m(n,w){t(n,a,w)},p:Pl,d(n){n&&l(a)}}}function pt(C){let a,p,n,w="Observe que definimos <code>from_pt=True</code> como um argumento do método <code>from_pretrained()</code>. Isso ocorre porque o checkpoint <code>multi-qa-mpnet-base-dot-v1</code> só tem pesos PyTorch, portanto, definir <code>from_pt=True</code> irá convertê-los automaticamente para o formato TensorFlow para nós. Como você pode ver, é muito simples alternar entre frameworks no 🤗 Transformers!",d;return a=new T({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMkMlMjBURkF1dG9Nb2RlbCUwQSUwQW1vZGVsX2NrcHQlMjAlM0QlMjAlMjJzZW50ZW5jZS10cmFuc2Zvcm1lcnMlMkZtdWx0aS1xYS1tcG5ldC1iYXNlLWRvdC12MSUyMiUwQXRva2VuaXplciUyMCUzRCUyMEF1dG9Ub2tlbml6ZXIuZnJvbV9wcmV0cmFpbmVkKG1vZGVsX2NrcHQpJTBBbW9kZWwlMjAlM0QlMjBURkF1dG9Nb2RlbC5mcm9tX3ByZXRyYWluZWQobW9kZWxfY2twdCUyQyUyMGZyb21fcHQlM0RUcnVlKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, TFAutoModel
model_ckpt = <span class="hljs-string">&quot;sentence-transformers/multi-qa-mpnet-base-dot-v1&quot;</span>
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = TFAutoModel.from_pretrained(model_ckpt, from_pt=<span class="hljs-literal">True</span>)`,wrap:!1}}),{c(){u(a.$$.fragment),p=i(),n=h("p"),n.innerHTML=w},l(c){U(a.$$.fragment,c),p=r(c),n=j(c,"P",{"data-svelte-h":!0}),f(n)!=="svelte-m83kxk"&&(n.innerHTML=w)},m(c,G){y(a,c,G),t(c,p,G),t(c,n,G),d=!0},i(c){d||(M(a.$$.fragment,c),d=!0)},o(c){m(a.$$.fragment,c),d=!1},d(c){c&&(l(p),l(n)),J(a,c)}}}function ct(C){let a,p,n,w="Para acelerar o processo de embedding, é útil colocar o modelo e as entradas em um dispositivo GPU, então vamos fazer isso agora:",d,c,G;return a=new T({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMkMlMjBBdXRvTW9kZWwlMEElMEFtb2RlbF9ja3B0JTIwJTNEJTIwJTIyc2VudGVuY2UtdHJhbnNmb3JtZXJzJTJGbXVsdGktcWEtbXBuZXQtYmFzZS1kb3QtdjElMjIlMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZChtb2RlbF9ja3B0KSUwQW1vZGVsJTIwJTNEJTIwQXV0b01vZGVsLmZyb21fcHJldHJhaW5lZChtb2RlbF9ja3B0KQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModel
model_ckpt = <span class="hljs-string">&quot;sentence-transformers/multi-qa-mpnet-base-dot-v1&quot;</span>
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)
model = AutoModel.from_pretrained(model_ckpt)`,wrap:!1}}),c=new T({props:{code:"aW1wb3J0JTIwdG9yY2glMEElMEFkZXZpY2UlMjAlM0QlMjB0b3JjaC5kZXZpY2UoJTIyY3VkYSUyMiklMEFtb2RlbC50byhkZXZpY2Up",highlighted:`<span class="hljs-keyword">import</span> torch
device = torch.device(<span class="hljs-string">&quot;cuda&quot;</span>)
model.to(device)`,wrap:!1}}),{c(){u(a.$$.fragment),p=i(),n=h("p"),n.textContent=w,d=i(),u(c.$$.fragment)},l(b){U(a.$$.fragment,b),p=r(b),n=j(b,"P",{"data-svelte-h":!0}),f(n)!=="svelte-19fxoub"&&(n.textContent=w),d=r(b),U(c.$$.fragment,b)},m(b,Q){y(a,b,Q),t(b,p,Q),t(b,n,Q),t(b,d,Q),y(c,b,Q),G=!0},i(b){G||(M(a.$$.fragment,b),M(c.$$.fragment,b),G=!0)},o(b){m(a.$$.fragment,b),m(c.$$.fragment,b),G=!1},d(b){b&&(l(p),l(n),l(d)),J(a,b),J(c,b)}}}function ut(C){let a,p,n,w="Podemos testar o funcionamento da função alimentando-a com a primeira entrada de texto em nosso corpus e inspecionando a forma de saída:",d,c,G,b,Q,Z,R="Ótimo, convertemos a primeira entrada em nosso corpus em um vetor de 768 dimensões! Podemos usar <code>Dataset.map()</code> para aplicar nossa função <code>get_embeddings()</code> a cada linha em nosso corpus, então vamos criar uma nova coluna <code>embeddings</code> da seguinte forma:",k,I,$;return a=new T({props:{code:"ZGVmJTIwZ2V0X2VtYmVkZGluZ3ModGV4dF9saXN0KSUzQSUwQSUyMCUyMCUyMCUyMGVuY29kZWRfaW5wdXQlMjAlM0QlMjB0b2tlbml6ZXIoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdGV4dF9saXN0JTJDJTIwcGFkZGluZyUzRFRydWUlMkMlMjB0cnVuY2F0aW9uJTNEVHJ1ZSUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIydGYlMjIlMEElMjAlMjAlMjAlMjApJTBBJTIwJTIwJTIwJTIwZW5jb2RlZF9pbnB1dCUyMCUzRCUyMCU3QmslM0ElMjB2JTIwZm9yJTIwayUyQyUyMHYlMjBpbiUyMGVuY29kZWRfaW5wdXQuaXRlbXMoKSU3RCUwQSUyMCUyMCUyMCUyMG1vZGVsX291dHB1dCUyMCUzRCUyMG1vZGVsKCoqZW5jb2RlZF9pbnB1dCklMEElMjAlMjAlMjAlMjByZXR1cm4lMjBjbHNfcG9vbGluZyhtb2RlbF9vdXRwdXQp",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_embeddings</span>(<span class="hljs-params">text_list</span>):
encoded_input = tokenizer(
text_list, padding=<span class="hljs-literal">True</span>, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">&quot;tf&quot;</span>
)
encoded_input = {k: v <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> encoded_input.items()}
model_output = model(**encoded_input)
<span class="hljs-keyword">return</span> cls_pooling(model_output)`,wrap:!1}}),c=new T({props:{code:"ZW1iZWRkaW5nJTIwJTNEJTIwZ2V0X2VtYmVkZGluZ3MoY29tbWVudHNfZGF0YXNldCU1QiUyMnRleHQlMjIlNUQlNUIwJTVEKSUwQWVtYmVkZGluZy5zaGFwZQ==",highlighted:`embedding = get_embeddings(comments_dataset[<span class="hljs-string">&quot;text&quot;</span>][<span class="hljs-number">0</span>])
embedding.shape`,wrap:!1}}),b=new T({props:{code:"VGVuc29yU2hhcGUoJTVCMSUyQyUyMDc2OCU1RCk=",highlighted:'TensorShape([<span class="hljs-number">1</span>, <span class="hljs-number">768</span>])',wrap:!1}}),I=new T({props:{code:"ZW1iZWRkaW5nc19kYXRhc2V0JTIwJTNEJTIwY29tbWVudHNfZGF0YXNldC5tYXAoJTBBJTIwJTIwJTIwJTIwbGFtYmRhJTIweCUzQSUyMCU3QiUyMmVtYmVkZGluZ3MlMjIlM0ElMjBnZXRfZW1iZWRkaW5ncyh4JTVCJTIydGV4dCUyMiU1RCkubnVtcHkoKSU1QjAlNUQlN0QlMEEp",highlighted:`embeddings_dataset = comments_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;embeddings&quot;</span>: get_embeddings(x[<span class="hljs-string">&quot;text&quot;</span>]).numpy()[<span class="hljs-number">0</span>]}
)`,wrap:!1}}),{c(){u(a.$$.fragment),p=i(),n=h("p"),n.textContent=w,d=i(),u(c.$$.fragment),G=i(),u(b.$$.fragment),Q=i(),Z=h("p"),Z.innerHTML=R,k=i(),u(I.$$.fragment)},l(o){U(a.$$.fragment,o),p=r(o),n=j(o,"P",{"data-svelte-h":!0}),f(n)!=="svelte-1yecorf"&&(n.textContent=w),d=r(o),U(c.$$.fragment,o),G=r(o),U(b.$$.fragment,o),Q=r(o),Z=j(o,"P",{"data-svelte-h":!0}),f(Z)!=="svelte-whkpu9"&&(Z.innerHTML=R),k=r(o),U(I.$$.fragment,o)},m(o,g){y(a,o,g),t(o,p,g),t(o,n,g),t(o,d,g),y(c,o,g),t(o,G,g),y(b,o,g),t(o,Q,g),t(o,Z,g),t(o,k,g),y(I,o,g),$=!0},i(o){$||(M(a.$$.fragment,o),M(c.$$.fragment,o),M(b.$$.fragment,o),M(I.$$.fragment,o),$=!0)},o(o){m(a.$$.fragment,o),m(c.$$.fragment,o),m(b.$$.fragment,o),m(I.$$.fragment,o),$=!1},d(o){o&&(l(p),l(n),l(d),l(G),l(Q),l(Z),l(k)),J(a,o),J(c,o),J(b,o),J(I,o)}}}function Ut(C){let a,p,n,w="Podemos testar o funcionamento da função alimentando-a com a primeira entrada de texto em nosso corpus e inspecionando a forma de saída:",d,c,G,b,Q,Z,R="Ótimo, convertemos a primeira entrada em nosso corpus em um vetor de 768 dimensões! Podemos usar <code>Dataset.map()</code> para aplicar nossa função <code>get_embeddings()</code> a cada linha em nosso corpus, então vamos criar uma nova coluna <code>embeddings</code> da seguinte forma:",k,I,$;return a=new T({props:{code:"ZGVmJTIwZ2V0X2VtYmVkZGluZ3ModGV4dF9saXN0KSUzQSUwQSUyMCUyMCUyMCUyMGVuY29kZWRfaW5wdXQlMjAlM0QlMjB0b2tlbml6ZXIoJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwdGV4dF9saXN0JTJDJTIwcGFkZGluZyUzRFRydWUlMkMlMjB0cnVuY2F0aW9uJTNEVHJ1ZSUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIycHQlMjIlMEElMjAlMjAlMjAlMjApJTBBJTIwJTIwJTIwJTIwZW5jb2RlZF9pbnB1dCUyMCUzRCUyMCU3QmslM0ElMjB2LnRvKGRldmljZSklMjBmb3IlMjBrJTJDJTIwdiUyMGluJTIwZW5jb2RlZF9pbnB1dC5pdGVtcygpJTdEJTBBJTIwJTIwJTIwJTIwbW9kZWxfb3V0cHV0JTIwJTNEJTIwbW9kZWwoKiplbmNvZGVkX2lucHV0KSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMGNsc19wb29saW5nKG1vZGVsX291dHB1dCk=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">get_embeddings</span>(<span class="hljs-params">text_list</span>):
encoded_input = tokenizer(
text_list, padding=<span class="hljs-literal">True</span>, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>
)
encoded_input = {k: v.to(device) <span class="hljs-keyword">for</span> k, v <span class="hljs-keyword">in</span> encoded_input.items()}
model_output = model(**encoded_input)
<span class="hljs-keyword">return</span> cls_pooling(model_output)`,wrap:!1}}),c=new T({props:{code:"ZW1iZWRkaW5nJTIwJTNEJTIwZ2V0X2VtYmVkZGluZ3MoY29tbWVudHNfZGF0YXNldCU1QiUyMnRleHQlMjIlNUQlNUIwJTVEKSUwQWVtYmVkZGluZy5zaGFwZQ==",highlighted:`embedding = get_embeddings(comments_dataset[<span class="hljs-string">&quot;text&quot;</span>][<span class="hljs-number">0</span>])
embedding.shape`,wrap:!1}}),b=new T({props:{code:"dG9yY2guU2l6ZSglNUIxJTJDJTIwNzY4JTVEKQ==",highlighted:'torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">768</span>])',wrap:!1}}),I=new T({props:{code:"ZW1iZWRkaW5nc19kYXRhc2V0JTIwJTNEJTIwY29tbWVudHNfZGF0YXNldC5tYXAoJTBBJTIwJTIwJTIwJTIwbGFtYmRhJTIweCUzQSUyMCU3QiUyMmVtYmVkZGluZ3MlMjIlM0ElMjBnZXRfZW1iZWRkaW5ncyh4JTVCJTIydGV4dCUyMiU1RCkuZGV0YWNoKCkuY3B1KCkubnVtcHkoKSU1QjAlNUQlN0QlMEEp",highlighted:`embeddings_dataset = comments_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;embeddings&quot;</span>: get_embeddings(x[<span class="hljs-string">&quot;text&quot;</span>]).detach().cpu().numpy()[<span class="hljs-number">0</span>]}
)`,wrap:!1}}),{c(){u(a.$$.fragment),p=i(),n=h("p"),n.textContent=w,d=i(),u(c.$$.fragment),G=i(),u(b.$$.fragment),Q=i(),Z=h("p"),Z.innerHTML=R,k=i(),u(I.$$.fragment)},l(o){U(a.$$.fragment,o),p=r(o),n=j(o,"P",{"data-svelte-h":!0}),f(n)!=="svelte-1yecorf"&&(n.textContent=w),d=r(o),U(c.$$.fragment,o),G=r(o),U(b.$$.fragment,o),Q=r(o),Z=j(o,"P",{"data-svelte-h":!0}),f(Z)!=="svelte-whkpu9"&&(Z.innerHTML=R),k=r(o),U(I.$$.fragment,o)},m(o,g){y(a,o,g),t(o,p,g),t(o,n,g),t(o,d,g),y(c,o,g),t(o,G,g),y(b,o,g),t(o,Q,g),t(o,Z,g),t(o,k,g),y(I,o,g),$=!0},i(o){$||(M(a.$$.fragment,o),M(c.$$.fragment,o),M(b.$$.fragment,o),M(I.$$.fragment,o),$=!0)},o(o){m(a.$$.fragment,o),m(c.$$.fragment,o),m(b.$$.fragment,o),m(I.$$.fragment,o),$=!1},d(o){o&&(l(p),l(n),l(d),l(G),l(Q),l(Z),l(k)),J(a,o),J(c,o),J(b,o),J(I,o)}}}function yt(C){let a,p,n,w;return a=new T({props:{code:"cXVlc3Rpb24lMjAlM0QlMjAlMjJIb3clMjBjYW4lMjBJJTIwbG9hZCUyMGElMjBkYXRhc2V0JTIwb2ZmbGluZSUzRiUyMiUwQXF1ZXN0aW9uX2VtYmVkZGluZyUyMCUzRCUyMGdldF9lbWJlZGRpbmdzKCU1QnF1ZXN0aW9uJTVEKS5udW1weSgpJTBBcXVlc3Rpb25fZW1iZWRkaW5nLnNoYXBl",highlighted:`question = <span class="hljs-string">&quot;How can I load a dataset offline?&quot;</span>
question_embedding = get_embeddings([question]).numpy()
question_embedding.shape`,wrap:!1}}),n=new T({props:{code:"KDElMkMlMjA3Njgp",highlighted:'(<span class="hljs-number">1</span>, <span class="hljs-number">768</span>)',wrap:!1}}),{c(){u(a.$$.fragment),p=i(),u(n.$$.fragment)},l(d){U(a.$$.fragment,d),p=r(d),U(n.$$.fragment,d)},m(d,c){y(a,d,c),t(d,p,c),y(n,d,c),w=!0},i(d){w||(M(a.$$.fragment,d),M(n.$$.fragment,d),w=!0)},o(d){m(a.$$.fragment,d),m(n.$$.fragment,d),w=!1},d(d){d&&l(p),J(a,d),J(n,d)}}}function Jt(C){let a,p,n,w;return a=new T({props:{code:"cXVlc3Rpb24lMjAlM0QlMjAlMjJIb3clMjBjYW4lMjBJJTIwbG9hZCUyMGElMjBkYXRhc2V0JTIwb2ZmbGluZSUzRiUyMiUwQXF1ZXN0aW9uX2VtYmVkZGluZyUyMCUzRCUyMGdldF9lbWJlZGRpbmdzKCU1QnF1ZXN0aW9uJTVEKS5jcHUoKS5kZXRhY2goKS5udW1weSgpJTBBcXVlc3Rpb25fZW1iZWRkaW5nLnNoYXBl",highlighted:`question = <span class="hljs-string">&quot;How can I load a dataset offline?&quot;</span>
question_embedding = get_embeddings([question]).cpu().detach().numpy()
question_embedding.shape`,wrap:!1}}),n=new T({props:{code:"dG9yY2guU2l6ZSglNUIxJTJDJTIwNzY4JTVEKQ==",highlighted:'torch.Size([<span class="hljs-number">1</span>, <span class="hljs-number">768</span>])',wrap:!1}}),{c(){u(a.$$.fragment),p=i(),u(n.$$.fragment)},l(d){U(a.$$.fragment,d),p=r(d),U(n.$$.fragment,d)},m(d,c){y(a,d,c),t(d,p,c),y(n,d,c),w=!0},i(d){w||(M(a.$$.fragment,d),M(n.$$.fragment,d),w=!0)},o(d){m(a.$$.fragment,d),m(n.$$.fragment,d),w=!1},d(d){d&&l(p),J(a,d),J(n,d)}}}function bt(C){let a,p="✏️ <strong>Experimente!</strong> Crie sua própria consulta e veja se consegue encontrar uma resposta nos documentos recuperados. Você pode ter que aumentar o parâmetro <code>k</code> em <code>Dataset.get_nearest_examples()</code> para ampliar a pesquisa.";return{c(){a=h("p"),a.innerHTML=p},l(n){a=j(n,"P",{"data-svelte-h":!0}),f(a)!=="svelte-1c1kivx"&&(a.innerHTML=p)},m(n,w){t(n,a,w)},p:Pl,d(n){n&&l(a)}}}function ht(C){let a,p,n,w,d,c,G,b,Q,Z,R,k,I='Na <a href="/course/chapter5/5">seção 5</a>, criamos um conjunto de dados de issues e comentários do GitHub do repositório 🤗 Datasets. Nesta seção, usaremos essas informações para construir um mecanismo de pesquisa que pode nos ajudar a encontrar respostas para nossas perguntas mais urgentes sobre a biblioteca!',$,o,g,F,Oe,S,Ml='Como vimos no <a href="/course/chapter1">Capítulo 1</a>, os modelos de linguagem baseados em Transformer representam cada token em um intervalo de texto como um <em>vetor de incorporação</em>. Acontece que é possível “agrupar” as incorporações individuais para criar uma representação vetorial para frases inteiras, parágrafos ou (em alguns casos) documentos. Essas incorporações podem ser usadas para encontrar documentos semelhantes no corpus calculando a similaridade do produto escalar (ou alguma outra métrica de similaridade) entre cada incorporação e retornando os documentos com maior sobreposição.',es,E,pl="Nesta seção, usaremos embeddings para desenvolver um mecanismo de pesquisa semântica. Esses mecanismos de pesquisa oferecem várias vantagens sobre as abordagens convencionais que se baseiam na correspondência de palavras-chave em uma consulta com os documentos.",ss,W,cl='<img class="block dark:hidden" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/semantic-search.svg" alt="Semantic search."/> <img class="hidden dark:block" src="https://huggingface.co/datasets/huggingface-course/documentation-images/resolve/main/en/chapter5/semantic-search-dark.svg" alt="Semantic search."/>',ls,A,ts,H,ul="A primeira coisa que precisamos fazer é baixar nosso conjunto de dados de issues do GitHub, então vamos usar a biblioteca 🤗 Hub para resolver a URL onde nosso arquivo está armazenado no Hugging Face Hub:",as,q,ns,D,Ul='Com a URL armazenada em <code>data_files</code>, podemos carregar o conjunto de dados remoto usando o método apresentado na <a href="/course/chapter5/2">seção 2</a>:',os,L,is,K,rs,P,yl="Aqui nós especificamos a divisão padrão <code>train</code> em <code>load_dataset()</code>, então ele retorna um <code>Dataset</code> em vez de um <code>DatasetDict</code>. A primeira ordem de negócios é filtrar os pull request, pois elas tendem a ser raramente usadas para responder a consultas de usuários e introduzirão ruído em nosso mecanismo de pesquisa. Como já deve ser familiar, podemos usar a função <code>Dataset.filter()</code> para excluir essas linhas em nosso conjunto de dados. Enquanto estamos nisso, também vamos filtrar as linhas sem comentários, pois elas não fornecem respostas às consultas dos usuários:",ds,O,ms,ee,Ms,se,Jl="Podemos ver que há muitas colunas em nosso conjunto de dados, a maioria das quais não precisamos para construir nosso mecanismo de pesquisa. De uma perspectiva de pesquisa, as colunas mais informativas são <code>title</code>, <code>body</code> e <code>comments</code>, enquanto <code>html_url</code> nos fornece um link de volta para a issue de origem. Vamos usar a função <code>Dataset.remove_columns()</code> para descartar o resto:",ps,le,cs,te,us,ae,bl='Para criar nossos embeddings, aumentaremos cada comentário com o título e o corpo da issue, pois esses campos geralmente incluem informações contextuais úteis. Como nossa coluna <code>comments</code> é atualmente uma lista de comentários para cada issue, precisamos “explodir” a coluna para que cada linha consista em uma tupla <code>(html_url, title, body, comment)</code>. No Pandas podemos fazer isso com a função <a href="https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.explode.html" rel="nofollow"><code>DataFrame.explode()</code></a>, que cria uma nova linha para cada elemento em uma coluna semelhante a uma lista, enquanto replica todos os outros valores de coluna. Para ver isso em ação, vamos primeiro mudar para o formato <code>DataFrame</code> do Pandas:',Us,ne,ys,oe,hl="Se inspecionarmos a primeira linha neste <code>DataFrame</code>, podemos ver que há quatro comentários associados a esta issue:",Js,ie,bs,re,hs,de,jl="Quando explodimos <code>df</code>, esperamos obter uma linha para cada um desses comentários. Vamos verificar se é o caso:",js,me,Ts,_,Tl='<thead><tr style="text-align: right;"><th></th> <th>html_url</th> <th>title</th> <th>comments</th> <th>body</th></tr></thead> <tbody><tr><th>0</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>the bug code locate in :\\r\\n if data_args.task_name is not None...</td> <td>Hello,\\r\\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>1</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>Hi @jinec,\\r\\n\\r\\nFrom time to time we get this kind of `ConnectionError` coming from the github.com website: https://raw.githubusercontent.com...</td> <td>Hello,\\r\\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>2</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>cannot connect,even by Web browser,please check that there is some problems。</td> <td>Hello,\\r\\nI am trying to run run_glue.py and it gives me this error...</td></tr> <tr><th>3</th> <td>https://github.com/huggingface/datasets/issues/2787</td> <td>ConnectionError: Couldn&#39;t reach https://raw.githubusercontent.com</td> <td>I can access https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/glue/glue.py without problem...</td> <td>Hello,\\r\\nI am trying to run run_glue.py and it gives me this error...</td></tr></tbody>',fs,Me,fl="Ótimo, podemos ver que as linhas foram replicadas, com a coluna <code>comments</code> contendo os comentários individuais! Agora que terminamos com o Pandas, podemos voltar rapidamente para um <code>Dataset</code> carregando o <code>DataFrame</code> na memória",ws,pe,gs,ce,Qs,ue,wl="Ok, isso nos deu alguns milhares de comentários para trabalhar!",Zs,z,Gs,Ue,gl="Agora que temos um comentário por linha, vamos criar uma nova coluna <code>comments_length</code> que contém o número de palavras por comentário:",Cs,ye,Is,Je,Ql="Podemos usar essa nova coluna para filtrar comentários curtos, que normalmente incluem coisas como “cc @lewtun” ou “Obrigado!” que não são relevantes para o nosso motor de busca. Não há um número preciso para selecionar o filtro, mas cerca de 15 palavras parece um bom começo:",ks,be,$s,he,Rs,je,Zl="Depois de limpar um pouco nosso conjunto de dados, vamos concatenar o título, a descrição e os comentários da issue em uma nova coluna <code>text</code>. Como de costume, escreveremos uma função simples que podemos passar para <code>Dataset.map()</code>:",_s,Te,Ns,fe,Gl="Finalmente estamos prontos para criar alguns embeddings! Vamos dar uma olhada.",xs,we,Bs,ge,Cl='Vimos no <a href="/course/chapter2">Capítulo 2</a> que podemos obter tokens embeddings usando a classe <code>AutoModel</code>. Tudo o que precisamos fazer é escolher um checkpoint adequado para carregar o modelo. Felizmente, existe uma biblioteca chamada <code>sentence-transformers</code> dedicada à criação de embeddings. Conforme descrito na <a href="https://www.sbert.net/examples/applications/semantic-search/README.html#symmetric-vs-asymmetric-semantic-search" rel="nofollow">documentação da biblioteca</a>, nosso caso de uso é um exemplo de <em>asymmetric semantic search</em> porque temos uma consulta curta cuja resposta gostaríamos de encontrar em um documento mais longo, como um comentário da issue. A útil <a href="https://www.sbert.net/docs/pretrained_models.html#model-overview" rel="nofollow">tabela de visão geral do modelo</a> na documentação indica que o checkpoint <code>multi-qa-mpnet-base-dot-v1</code> tem o melhor desempenho para pesquisa semântica, então usaremos isso para nosso aplicativo. Também carregaremos o tokenizer usando o mesmo checkpoint:',vs,N,x,He,Qe,Il="Como mencionamos anteriormente, gostaríamos de representar cada entrada em nosso corpus de issues do GitHub como um único vetor, portanto, precisamos “pool” ou calcular a média de nossas incorporações de token de alguma forma. Uma abordagem popular é realizar <em>CLS pooling</em> nas saídas do nosso modelo, onde simplesmente coletamos o último estado oculto para o token especial <code>[CLS]</code>. A função a seguir faz o truque para nós:",Xs,Ze,Vs,Ge,kl="Em seguida, criaremos uma função auxiliar que tokenizará uma lista de documentos, colocará os tensores na GPU, os alimentará no modelo e, finalmente, aplicará o agrupamento CLS às saídas:",Ws,B,v,qe,Ce,$l="Observe que convertemos os embeddings em arrays NumPy — isso porque 🤗 Datasets requer esse formato quando tentamos indexá-los com FAISS, o que faremos a seguir.",zs,Ie,Ys,ke,Rl='Agora que temos um conjunto de dados de embeddings, precisamos de alguma maneira de pesquisá-los. Para fazer isso, usaremos uma estrutura de dados especial em 🤗 Datasets chamada <em>FAISS index</em>. <a href="https://faiss.ai/" rel="nofollow">FAISS</a> (abreviação de Facebook AI Similarity Search) é uma biblioteca que fornece algoritmos eficientes para pesquisar rapidamente e agrupar vetores de incorporação.',Fs,$e,_l="A idéia básica por trás do FAISS é criar uma estrutura de dados especial chamada <em>index</em> que permite descobrir quais embeddings são semelhantes a um embedding de entrada. Criar um índice FAISS em 🤗 Datasets é simples — usamos a função <code>Dataset.add_faiss_index()</code> e especificamos qual coluna do nosso conjunto de dados gostaríamos de indexar:",Ss,Re,Es,_e,Nl="Agora podemos realizar consultas neste índice fazendo uma pesquisa do vizinho mais próximo com a função <code>Dataset.get_nearest_examples()</code>. Vamos testar isso primeiro incorporando uma pergunta da seguinte forma:",As,X,V,De,Ne,xl="Assim como com os documentos, agora temos um vetor de 768 dimensões representando a consulta, que podemos comparar com todo o corpus para encontrar os embeddings mais semelhantes:",Hs,xe,qs,Be,Bl="A função <code>Dataset.get_nearest_examples()</code> retorna uma tupla de pontuações que classificam a sobreposição entre a consulta e o documento e um conjunto correspondente de amostras (aqui, as 5 melhores correspondências). Vamos coletá-los em um <code>pandas.DataFrame</code> para que possamos classificá-los facilmente:",Ds,ve,Ls,Xe,vl="Agora podemos iterar nas primeiras linhas para ver como nossa consulta correspondeu aos comentários disponíveis:",Ks,Ve,Ps,We,Os,ze,Xl="Nada mal! Nosso segundo resultado parece corresponder à consulta.",el,Y,sl,Ye,ll,Le,tl;d=new it({props:{fw:C[0]}}),G=new Pe({props:{title:"Busca semântica com o FAISS",local:"busca-semântica-com-o-faiss",headingTag:"h1"}});const Vl=[mt,dt],Fe=[];function Wl(e,s){return e[0]==="pt"?0:1}Q=Wl(C),Z=Fe[Q]=Vl[Q](C),o=new ot({props:{id:"OATCgQtNX2o"}}),F=new Pe({props:{title:"Usando embeddings para pesquisa semântica",local:"usando-embeddings-para-pesquisa-semântica",headingTag:"h2"}}),A=new Pe({props:{title:"Carregando e preparando o conjunto de dados",local:"carregando-e-preparando-o-conjunto-de-dados",headingTag:"h2"}}),q=new T({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMGhmX2h1Yl91cmwlMEElMEFkYXRhX2ZpbGVzJTIwJTNEJTIwaGZfaHViX3VybCglMEElMjAlMjAlMjAlMjByZXBvX2lkJTNEJTIybGV3dHVuJTJGZ2l0aHViLWlzc3VlcyUyMiUyQyUwQSUyMCUyMCUyMCUyMGZpbGVuYW1lJTNEJTIyZGF0YXNldHMtaXNzdWVzLXdpdGgtY29tbWVudHMuanNvbmwlMjIlMkMlMEElMjAlMjAlMjAlMjByZXBvX3R5cGUlM0QlMjJkYXRhc2V0JTIyJTJDJTBBKQ==",highlighted:`<span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> hf_hub_url
data_files = hf_hub_url(
repo_id=<span class="hljs-string">&quot;lewtun/github-issues&quot;</span>,
filename=<span class="hljs-string">&quot;datasets-issues-with-comments.jsonl&quot;</span>,
repo_type=<span class="hljs-string">&quot;dataset&quot;</span>,
)`,wrap:!1}}),L=new T({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBaXNzdWVzX2RhdGFzZXQlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIyanNvbiUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzJTJDJTIwc3BsaXQlM0QlMjJ0cmFpbiUyMiklMEFpc3N1ZXNfZGF0YXNldA==",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
issues_dataset = load_dataset(<span class="hljs-string">&quot;json&quot;</span>, data_files=data_files, split=<span class="hljs-string">&quot;train&quot;</span>)
issues_dataset`,wrap:!1}}),K=new T({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1Qid1cmwnJTJDJTIwJ3JlcG9zaXRvcnlfdXJsJyUyQyUyMCdsYWJlbHNfdXJsJyUyQyUyMCdjb21tZW50c191cmwnJTJDJTIwJ2V2ZW50c191cmwnJTJDJTIwJ2h0bWxfdXJsJyUyQyUyMCdpZCclMkMlMjAnbm9kZV9pZCclMkMlMjAnbnVtYmVyJyUyQyUyMCd0aXRsZSclMkMlMjAndXNlciclMkMlMjAnbGFiZWxzJyUyQyUyMCdzdGF0ZSclMkMlMjAnbG9ja2VkJyUyQyUyMCdhc3NpZ25lZSclMkMlMjAnYXNzaWduZWVzJyUyQyUyMCdtaWxlc3RvbmUnJTJDJTIwJ2NvbW1lbnRzJyUyQyUyMCdjcmVhdGVkX2F0JyUyQyUyMCd1cGRhdGVkX2F0JyUyQyUyMCdjbG9zZWRfYXQnJTJDJTIwJ2F1dGhvcl9hc3NvY2lhdGlvbiclMkMlMjAnYWN0aXZlX2xvY2tfcmVhc29uJyUyQyUyMCdwdWxsX3JlcXVlc3QnJTJDJTIwJ2JvZHknJTJDJTIwJ3BlcmZvcm1lZF92aWFfZ2l0aHViX2FwcCclMkMlMjAnaXNfcHVsbF9yZXF1ZXN0JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwMjg1NSUwQSU3RCk=",highlighted:`Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">2855</span>
})`,wrap:!1}}),O=new T({props:{code:"aXNzdWVzX2RhdGFzZXQlMjAlM0QlMjBpc3N1ZXNfZGF0YXNldC5maWx0ZXIoJTBBJTIwJTIwJTIwJTIwbGFtYmRhJTIweCUzQSUyMCh4JTVCJTIyaXNfcHVsbF9yZXF1ZXN0JTIyJTVEJTIwJTNEJTNEJTIwRmFsc2UlMjBhbmQlMjBsZW4oeCU1QiUyMmNvbW1lbnRzJTIyJTVEKSUyMCUzRSUyMDApJTBBKSUwQWlzc3Vlc19kYXRhc2V0",highlighted:`issues_dataset = issues_dataset.<span class="hljs-built_in">filter</span>(
<span class="hljs-keyword">lambda</span> x: (x[<span class="hljs-string">&quot;is_pull_request&quot;</span>] == <span class="hljs-literal">False</span> <span class="hljs-keyword">and</span> <span class="hljs-built_in">len</span>(x[<span class="hljs-string">&quot;comments&quot;</span>]) &gt; <span class="hljs-number">0</span>)
)
issues_dataset`,wrap:!1}}),ee=new T({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1Qid1cmwnJTJDJTIwJ3JlcG9zaXRvcnlfdXJsJyUyQyUyMCdsYWJlbHNfdXJsJyUyQyUyMCdjb21tZW50c191cmwnJTJDJTIwJ2V2ZW50c191cmwnJTJDJTIwJ2h0bWxfdXJsJyUyQyUyMCdpZCclMkMlMjAnbm9kZV9pZCclMkMlMjAnbnVtYmVyJyUyQyUyMCd0aXRsZSclMkMlMjAndXNlciclMkMlMjAnbGFiZWxzJyUyQyUyMCdzdGF0ZSclMkMlMjAnbG9ja2VkJyUyQyUyMCdhc3NpZ25lZSclMkMlMjAnYXNzaWduZWVzJyUyQyUyMCdtaWxlc3RvbmUnJTJDJTIwJ2NvbW1lbnRzJyUyQyUyMCdjcmVhdGVkX2F0JyUyQyUyMCd1cGRhdGVkX2F0JyUyQyUyMCdjbG9zZWRfYXQnJTJDJTIwJ2F1dGhvcl9hc3NvY2lhdGlvbiclMkMlMjAnYWN0aXZlX2xvY2tfcmVhc29uJyUyQyUyMCdwdWxsX3JlcXVlc3QnJTJDJTIwJ2JvZHknJTJDJTIwJ3BlcmZvcm1lZF92aWFfZ2l0aHViX2FwcCclMkMlMjAnaXNfcHVsbF9yZXF1ZXN0JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwNzcxJTBBJTdEKQ==",highlighted:`Dataset({
features: [<span class="hljs-string">&#x27;url&#x27;</span>, <span class="hljs-string">&#x27;repository_url&#x27;</span>, <span class="hljs-string">&#x27;labels_url&#x27;</span>, <span class="hljs-string">&#x27;comments_url&#x27;</span>, <span class="hljs-string">&#x27;events_url&#x27;</span>, <span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;node_id&#x27;</span>, <span class="hljs-string">&#x27;number&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;user&#x27;</span>, <span class="hljs-string">&#x27;labels&#x27;</span>, <span class="hljs-string">&#x27;state&#x27;</span>, <span class="hljs-string">&#x27;locked&#x27;</span>, <span class="hljs-string">&#x27;assignee&#x27;</span>, <span class="hljs-string">&#x27;assignees&#x27;</span>, <span class="hljs-string">&#x27;milestone&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;created_at&#x27;</span>, <span class="hljs-string">&#x27;updated_at&#x27;</span>, <span class="hljs-string">&#x27;closed_at&#x27;</span>, <span class="hljs-string">&#x27;author_association&#x27;</span>, <span class="hljs-string">&#x27;active_lock_reason&#x27;</span>, <span class="hljs-string">&#x27;pull_request&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;performed_via_github_app&#x27;</span>, <span class="hljs-string">&#x27;is_pull_request&#x27;</span>],
num_rows: <span class="hljs-number">771</span>
})`,wrap:!1}}),le=new T({props:{code:"Y29sdW1ucyUyMCUzRCUyMGlzc3Vlc19kYXRhc2V0LmNvbHVtbl9uYW1lcyUwQWNvbHVtbnNfdG9fa2VlcCUyMCUzRCUyMCU1QiUyMnRpdGxlJTIyJTJDJTIwJTIyYm9keSUyMiUyQyUyMCUyMmh0bWxfdXJsJTIyJTJDJTIwJTIyY29tbWVudHMlMjIlNUQlMEFjb2x1bW5zX3RvX3JlbW92ZSUyMCUzRCUyMHNldChjb2x1bW5zX3RvX2tlZXApLnN5bW1ldHJpY19kaWZmZXJlbmNlKGNvbHVtbnMpJTBBaXNzdWVzX2RhdGFzZXQlMjAlM0QlMjBpc3N1ZXNfZGF0YXNldC5yZW1vdmVfY29sdW1ucyhjb2x1bW5zX3RvX3JlbW92ZSklMEFpc3N1ZXNfZGF0YXNldA==",highlighted:`columns = issues_dataset.column_names
columns_to_keep = [<span class="hljs-string">&quot;title&quot;</span>, <span class="hljs-string">&quot;body&quot;</span>, <span class="hljs-string">&quot;html_url&quot;</span>, <span class="hljs-string">&quot;comments&quot;</span>]
columns_to_remove = <span class="hljs-built_in">set</span>(columns_to_keep).symmetric_difference(columns)
issues_dataset = issues_dataset.remove_columns(columns_to_remove)
issues_dataset`,wrap:!1}}),te=new T({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1QidodG1sX3VybCclMkMlMjAndGl0bGUnJTJDJTIwJ2NvbW1lbnRzJyUyQyUyMCdib2R5JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwNzcxJTBBJTdEKQ==",highlighted:`Dataset({
features: [<span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>],
num_rows: <span class="hljs-number">771</span>
})`,wrap:!1}}),ne=new T({props:{code:"aXNzdWVzX2RhdGFzZXQuc2V0X2Zvcm1hdCglMjJwYW5kYXMlMjIpJTBBZGYlMjAlM0QlMjBpc3N1ZXNfZGF0YXNldCU1QiUzQSU1RA==",highlighted:`issues_dataset.set_format(<span class="hljs-string">&quot;pandas&quot;</span>)
df = issues_dataset[:]`,wrap:!1}}),ie=new T({props:{code:"ZGYlNUIlMjJjb21tZW50cyUyMiU1RCU1QjAlNUQudG9saXN0KCk=",highlighted:'df[<span class="hljs-string">&quot;comments&quot;</span>][<span class="hljs-number">0</span>].tolist()',wrap:!1}}),re=new T({props:{code:"JTVCJ3RoZSUyMGJ1ZyUyMGNvZGUlMjBsb2NhdGUlMjBpbiUyMCVFRiVCQyU5QSU1Q3IlNUNuJTIwJTIwJTIwJTIwaWYlMjBkYXRhX2FyZ3MudGFza19uYW1lJTIwaXMlMjBub3QlMjBOb25lJTNBJTVDciU1Q24lMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjBEb3dubG9hZGluZyUyMGFuZCUyMGxvYWRpbmclMjBhJTIwZGF0YXNldCUyMGZyb20lMjB0aGUlMjBodWIuJTVDciU1Q24lMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBkYXRhc2V0cyUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJnbHVlJTIyJTJDJTIwZGF0YV9hcmdzLnRhc2tfbmFtZSUyQyUyMGNhY2hlX2RpciUzRG1vZGVsX2FyZ3MuY2FjaGVfZGlyKSclMkMlMEElMjAnSGklMjAlNDBqaW5lYyUyQyU1Q3IlNUNuJTVDciU1Q25Gcm9tJTIwdGltZSUyMHRvJTIwdGltZSUyMHdlJTIwZ2V0JTIwdGhpcyUyMGtpbmQlMjBvZiUyMCU2MENvbm5lY3Rpb25FcnJvciU2MCUyMGNvbWluZyUyMGZyb20lMjB0aGUlMjBnaXRodWIuY29tJTIwd2Vic2l0ZSUzQSUyMGh0dHBzJTNBJTJGJTJGcmF3LmdpdGh1YnVzZXJjb250ZW50LmNvbSU1Q3IlNUNuJTVDciU1Q25Ob3JtYWxseSUyQyUyMGl0JTIwc2hvdWxkJTIwd29yayUyMGlmJTIweW91JTIwd2FpdCUyMGElMjBsaXR0bGUlMjBhbmQlMjB0aGVuJTIwcmV0cnkuJTVDciU1Q24lNUNyJTVDbkNvdWxkJTIweW91JTIwcGxlYXNlJTIwY29uZmlybSUyMGlmJTIwdGhlJTIwcHJvYmxlbSUyMHBlcnNpc3RzJTNGJyUyQyUwQSUyMCdjYW5ub3QlMjBjb25uZWN0JUVGJUJDJThDZXZlbiUyMGJ5JTIwV2ViJTIwYnJvd3NlciVFRiVCQyU4Q3BsZWFzZSUyMGNoZWNrJTIwdGhhdCUyMCUyMHRoZXJlJTIwaXMlMjBzb21lJTIwJTIwcHJvYmxlbXMlRTMlODAlODInJTJDJTBBJTIwJ0klMjBjYW4lMjBhY2Nlc3MlMjBodHRwcyUzQSUyRiUyRnJhdy5naXRodWJ1c2VyY29udGVudC5jb20lMkZodWdnaW5nZmFjZSUyRmRhdGFzZXRzJTJGMS43LjAlMkZkYXRhc2V0cyUyRmdsdWUlMkZnbHVlLnB5JTIwd2l0aG91dCUyMHByb2JsZW0uLi4nJTVE",highlighted:`[<span class="hljs-string">&#x27;the bug code locate in :\\r\\n if data_args.task_name is not None:\\r\\n # Downloading and loading a dataset from the hub.\\r\\n datasets = load_dataset(&quot;glue&quot;, data_args.task_name, cache_dir=model_args.cache_dir)&#x27;</span>,
<span class="hljs-string">&#x27;Hi @jinec,\\r\\n\\r\\nFrom time to time we get this kind of \`ConnectionError\` coming from the github.com website: https://raw.githubusercontent.com\\r\\n\\r\\nNormally, it should work if you wait a little and then retry.\\r\\n\\r\\nCould you please confirm if the problem persists?&#x27;</span>,
<span class="hljs-string">&#x27;cannot connect,even by Web browser,please check that there is some problems。&#x27;</span>,
<span class="hljs-string">&#x27;I can access https://raw.githubusercontent.com/huggingface/datasets/1.7.0/datasets/glue/glue.py without problem...&#x27;</span>]`,wrap:!1}}),me=new T({props:{code:"Y29tbWVudHNfZGYlMjAlM0QlMjBkZi5leHBsb2RlKCUyMmNvbW1lbnRzJTIyJTJDJTIwaWdub3JlX2luZGV4JTNEVHJ1ZSklMEFjb21tZW50c19kZi5oZWFkKDQp",highlighted:`comments_df = df.explode(<span class="hljs-string">&quot;comments&quot;</span>, ignore_index=<span class="hljs-literal">True</span>)
comments_df.head(<span class="hljs-number">4</span>)`,wrap:!1}}),pe=new T({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwRGF0YXNldCUwQSUwQWNvbW1lbnRzX2RhdGFzZXQlMjAlM0QlMjBEYXRhc2V0LmZyb21fcGFuZGFzKGNvbW1lbnRzX2RmKSUwQWNvbW1lbnRzX2RhdGFzZXQ=",highlighted:`<span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> Dataset
comments_dataset = Dataset.from_pandas(comments_df)
comments_dataset`,wrap:!1}}),ce=new T({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1QidodG1sX3VybCclMkMlMjAndGl0bGUnJTJDJTIwJ2NvbW1lbnRzJyUyQyUyMCdib2R5JyU1RCUyQyUwQSUyMCUyMCUyMCUyMG51bV9yb3dzJTNBJTIwMjg0MiUwQSU3RCk=",highlighted:`Dataset({
features: [<span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>],
num_rows: <span class="hljs-number">2842</span>
})`,wrap:!1}}),z=new Kl({props:{$$slots:{default:[Mt]},$$scope:{ctx:C}}}),ye=new T({props:{code:"Y29tbWVudHNfZGF0YXNldCUyMCUzRCUyMGNvbW1lbnRzX2RhdGFzZXQubWFwKCUwQSUyMCUyMCUyMCUyMGxhbWJkYSUyMHglM0ElMjAlN0IlMjJjb21tZW50X2xlbmd0aCUyMiUzQSUyMGxlbih4JTVCJTIyY29tbWVudHMlMjIlNUQuc3BsaXQoKSklN0QlMEEp",highlighted:`comments_dataset = comments_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-keyword">lambda</span> x: {<span class="hljs-string">&quot;comment_length&quot;</span>: <span class="hljs-built_in">len</span>(x[<span class="hljs-string">&quot;comments&quot;</span>].split())}
)`,wrap:!1}}),be=new T({props:{code:"Y29tbWVudHNfZGF0YXNldCUyMCUzRCUyMGNvbW1lbnRzX2RhdGFzZXQuZmlsdGVyKGxhbWJkYSUyMHglM0ElMjB4JTVCJTIyY29tbWVudF9sZW5ndGglMjIlNUQlMjAlM0UlMjAxNSklMEFjb21tZW50c19kYXRhc2V0",highlighted:`comments_dataset = comments_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: x[<span class="hljs-string">&quot;comment_length&quot;</span>] &gt; <span class="hljs-number">15</span>)
comments_dataset`,wrap:!1}}),he=new T({props:{code:"RGF0YXNldCglN0IlMEElMjAlMjAlMjAlMjBmZWF0dXJlcyUzQSUyMCU1QidodG1sX3VybCclMkMlMjAndGl0bGUnJTJDJTIwJ2NvbW1lbnRzJyUyQyUyMCdib2R5JyUyQyUyMCdjb21tZW50X2xlbmd0aCclNUQlMkMlMEElMjAlMjAlMjAlMjBudW1fcm93cyUzQSUyMDIwOTglMEElN0Qp",highlighted:`Dataset({
features: [<span class="hljs-string">&#x27;html_url&#x27;</span>, <span class="hljs-string">&#x27;title&#x27;</span>, <span class="hljs-string">&#x27;comments&#x27;</span>, <span class="hljs-string">&#x27;body&#x27;</span>, <span class="hljs-string">&#x27;comment_length&#x27;</span>],
num_rows: <span class="hljs-number">2098</span>
})`,wrap:!1}}),Te=new T({props:{code:"ZGVmJTIwY29uY2F0ZW5hdGVfdGV4dChleGFtcGxlcyklM0ElMEElMjAlMjAlMjAlMjByZXR1cm4lMjAlN0IlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjJ0ZXh0JTIyJTNBJTIwZXhhbXBsZXMlNUIlMjJ0aXRsZSUyMiU1RCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyQiUyMCUyMiUyMCU1Q24lMjAlMjIlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMkIlMjBleGFtcGxlcyU1QiUyMmJvZHklMjIlNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMkIlMjAlMjIlMjAlNUNuJTIwJTIyJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTJCJTIwZXhhbXBsZXMlNUIlMjJjb21tZW50cyUyMiU1RCUwQSUyMCUyMCUyMCUyMCU3RCUwQSUwQSUwQWNvbW1lbnRzX2RhdGFzZXQlMjAlM0QlMjBjb21tZW50c19kYXRhc2V0Lm1hcChjb25jYXRlbmF0ZV90ZXh0KQ==",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">concatenate_text</span>(<span class="hljs-params">examples</span>):
<span class="hljs-keyword">return</span> {
<span class="hljs-string">&quot;text&quot;</span>: examples[<span class="hljs-string">&quot;title&quot;</span>]
+ <span class="hljs-string">&quot; \\n &quot;</span>
+ examples[<span class="hljs-string">&quot;body&quot;</span>]
+ <span class="hljs-string">&quot; \\n &quot;</span>
+ examples[<span class="hljs-string">&quot;comments&quot;</span>]
}
comments_dataset = comments_dataset.<span class="hljs-built_in">map</span>(concatenate_text)`,wrap:!1}}),we=new Pe({props:{title:"Criando embeddings de texto",local:"criando-embeddings-de-texto",headingTag:"h2"}});const zl=[ct,pt],Se=[];function Yl(e,s){return e[0]==="pt"?0:1}N=Yl(C),x=Se[N]=zl[N](C),Ze=new T({props:{code:"ZGVmJTIwY2xzX3Bvb2xpbmcobW9kZWxfb3V0cHV0KSUzQSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMG1vZGVsX291dHB1dC5sYXN0X2hpZGRlbl9zdGF0ZSU1QiUzQSUyQyUyMDAlNUQ=",highlighted:`<span class="hljs-keyword">def</span> <span class="hljs-title function_">cls_pooling</span>(<span class="hljs-params">model_output</span>):
<span class="hljs-keyword">return</span> model_output.last_hidden_state[:, <span class="hljs-number">0</span>]`,wrap:!1}});const Fl=[Ut,ut],Ee=[];function Sl(e,s){return e[0]==="pt"?0:1}B=Sl(C),v=Ee[B]=Fl[B](C),Ie=new Pe({props:{title:"Usando FAISS para busca de similaridade",local:"usando-faiss-para-busca-de-similaridade",headingTag:"h2"}}),Re=new T({props:{code:"ZW1iZWRkaW5nc19kYXRhc2V0LmFkZF9mYWlzc19pbmRleChjb2x1bW4lM0QlMjJlbWJlZGRpbmdzJTIyKQ==",highlighted:'embeddings_dataset.add_faiss_index(column=<span class="hljs-string">&quot;embeddings&quot;</span>)',wrap:!1}});const El=[Jt,yt],Ae=[];function Al(e,s){return e[0]==="pt"?0:1}return X=Al(C),V=Ae[X]=El[X](C),xe=new T({props:{code:"c2NvcmVzJTJDJTIwc2FtcGxlcyUyMCUzRCUyMGVtYmVkZGluZ3NfZGF0YXNldC5nZXRfbmVhcmVzdF9leGFtcGxlcyglMEElMjAlMjAlMjAlMjAlMjJlbWJlZGRpbmdzJTIyJTJDJTIwcXVlc3Rpb25fZW1iZWRkaW5nJTJDJTIwayUzRDUlMEEp",highlighted:`scores, samples = embeddings_dataset.get_nearest_examples(
<span class="hljs-string">&quot;embeddings&quot;</span>, question_embedding, k=<span class="hljs-number">5</span>
)`,wrap:!1}}),ve=new T({props:{code:"aW1wb3J0JTIwcGFuZGFzJTIwYXMlMjBwZCUwQSUwQXNhbXBsZXNfZGYlMjAlM0QlMjBwZC5EYXRhRnJhbWUuZnJvbV9kaWN0KHNhbXBsZXMpJTBBc2FtcGxlc19kZiU1QiUyMnNjb3JlcyUyMiU1RCUyMCUzRCUyMHNjb3JlcyUwQXNhbXBsZXNfZGYuc29ydF92YWx1ZXMoJTIyc2NvcmVzJTIyJTJDJTIwYXNjZW5kaW5nJTNERmFsc2UlMkMlMjBpbnBsYWNlJTNEVHJ1ZSk=",highlighted:`<span class="hljs-keyword">import</span> pandas <span class="hljs-keyword">as</span> pd
samples_df = pd.DataFrame.from_dict(samples)
samples_df[<span class="hljs-string">&quot;scores&quot;</span>] = scores
samples_df.sort_values(<span class="hljs-string">&quot;scores&quot;</span>, ascending=<span class="hljs-literal">False</span>, inplace=<span class="hljs-literal">True</span>)`,wrap:!1}}),Ve=new T({props:{code:"Zm9yJTIwXyUyQyUyMHJvdyUyMGluJTIwc2FtcGxlc19kZi5pdGVycm93cygpJTNBJTBBJTIwJTIwJTIwJTIwcHJpbnQoZiUyMkNPTU1FTlQlM0ElMjAlN0Jyb3cuY29tbWVudHMlN0QlMjIpJTBBJTIwJTIwJTIwJTIwcHJpbnQoZiUyMlNDT1JFJTNBJTIwJTdCcm93LnNjb3JlcyU3RCUyMiklMEElMjAlMjAlMjAlMjBwcmludChmJTIyVElUTEUlM0ElMjAlN0Jyb3cudGl0bGUlN0QlMjIpJTBBJTIwJTIwJTIwJTIwcHJpbnQoZiUyMlVSTCUzQSUyMCU3QnJvdy5odG1sX3VybCU3RCUyMiklMEElMjAlMjAlMjAlMjBwcmludCglMjIlM0QlMjIlMjAqJTIwNTApJTBBJTIwJTIwJTIwJTIwcHJpbnQoKQ==",highlighted:`<span class="hljs-keyword">for</span> _, row <span class="hljs-keyword">in</span> samples_df.iterrows():
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;COMMENT: <span class="hljs-subst">{row.comments}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;SCORE: <span class="hljs-subst">{row.scores}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;TITLE: <span class="hljs-subst">{row.title}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">f&quot;URL: <span class="hljs-subst">{row.html_url}</span>&quot;</span>)
<span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;=&quot;</span> * <span class="hljs-number">50</span>)
<span class="hljs-built_in">print</span>()`,wrap:!1}}),We=new T({props:{code:"JTIyJTIyJTIyJTBBQ09NTUVOVCUzQSUyMFJlcXVpcmluZyUyMG9ubGluZSUyMGNvbm5lY3Rpb24lMjBpcyUyMGElMjBkZWFsJTIwYnJlYWtlciUyMGluJTIwc29tZSUyMGNhc2VzJTIwdW5mb3J0dW5hdGVseSUyMHNvJTIwaXQnZCUyMGJlJTIwZ3JlYXQlMjBpZiUyMG9mZmxpbmUlMjBtb2RlJTIwaXMlMjBhZGRlZCUyMHNpbWlsYXIlMjB0byUyMGhvdyUyMCU2MHRyYW5zZm9ybWVycyU2MCUyMGxvYWRzJTIwbW9kZWxzJTIwb2ZmbGluZSUyMGZpbmUuJTBBJTBBJTQwbWFuZHViaWFuJ3MlMjBzZWNvbmQlMjBidWxsZXQlMjBwb2ludCUyMHN1Z2dlc3RzJTIwdGhhdCUyMHRoZXJlJ3MlMjBhJTIwd29ya2Fyb3VuZCUyMGFsbG93aW5nJTIweW91JTIwdG8lMjB1c2UlMjB5b3VyJTIwb2ZmbGluZSUyMChjdXN0b20lM0YpJTIwZGF0YXNldCUyMHdpdGglMjAlNjBkYXRhc2V0cyU2MC4lMjBDb3VsZCUyMHlvdSUyMHBsZWFzZSUyMGVsYWJvcmF0ZSUyMG9uJTIwaG93JTIwdGhhdCUyMHNob3VsZCUyMGxvb2slMjBsaWtlJTNGJTBBU0NPUkUlM0ElMjAyNS41MDUwNDY4NDQ0ODI0MjIlMEFUSVRMRSUzQSUyMERpc2N1c3Npb24lMjB1c2luZyUyMGRhdGFzZXRzJTIwaW4lMjBvZmZsaW5lJTIwbW9kZSUwQVVSTCUzQSUyMGh0dHBzJTNBJTJGJTJGZ2l0aHViLmNvbSUyRmh1Z2dpbmdmYWNlJTJGZGF0YXNldHMlMkZpc3N1ZXMlMkY4MjQlMEElM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlMEElMEFDT01NRU5UJTNBJTIwVGhlJTIwbG9jYWwlMjBkYXRhc2V0JTIwYnVpbGRlcnMlMjAoY3N2JTJDJTIwdGV4dCUyMCUyQyUyMGpzb24lMjBhbmQlMjBwYW5kYXMpJTIwYXJlJTIwbm93JTIwcGFydCUyMG9mJTIwdGhlJTIwJTYwZGF0YXNldHMlNjAlMjBwYWNrYWdlJTIwc2luY2UlMjAlMjMxNzI2JTIwJTNBKSUwQVlvdSUyMGNhbiUyMG5vdyUyMHVzZSUyMHRoZW0lMjBvZmZsaW5lJTBBJTVDJTYwJTVDJTYwJTVDJTYwcHl0aG9uJTBBZGF0YXNldHMlMjAlM0QlMjBsb2FkX2RhdGFzZXQoJTIydGV4dCUyMiUyQyUyMGRhdGFfZmlsZXMlM0RkYXRhX2ZpbGVzKSUwQSU1QyU2MCU1QyU2MCU1QyU2MCUwQSUwQVdlJ2xsJTIwZG8lMjBhJTIwbmV3JTIwcmVsZWFzZSUyMHNvb24lMEFTQ09SRSUzQSUyMDI0LjU1NTUwOTU2NzI2MDc0MiUwQVRJVExFJTNBJTIwRGlzY3Vzc2lvbiUyMHVzaW5nJTIwZGF0YXNldHMlMjBpbiUyMG9mZmxpbmUlMjBtb2RlJTBBVVJMJTNBJTIwaHR0cHMlM0ElMkYlMkZnaXRodWIuY29tJTJGaHVnZ2luZ2ZhY2UlMkZkYXRhc2V0cyUyRmlzc3VlcyUyRjgyNCUwQSUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUzRCUwQSUwQUNPTU1FTlQlM0ElMjBJJTIwb3BlbmVkJTIwYSUyMFBSJTIwdGhhdCUyMGFsbG93cyUyMHRvJTIwcmVsb2FkJTIwbW9kdWxlcyUyMHRoYXQlMjBoYXZlJTIwYWxyZWFkeSUyMGJlZW4lMjBsb2FkZWQlMjBvbmNlJTIwZXZlbiUyMGlmJTIwdGhlcmUncyUyMG5vJTIwaW50ZXJuZXQuJTBBJTBBTGV0JTIwbWUlMjBrbm93JTIwaWYlMjB5b3UlMjBrbm93JTIwb3RoZXIlMjB3YXlzJTIwdGhhdCUyMGNhbiUyMG1ha2UlMjB0aGUlMjBvZmZsaW5lJTIwbW9kZSUyMGV4cGVyaWVuY2UlMjBiZXR0ZXIuJTIwSSdkJTIwYmUlMjBoYXBweSUyMHRvJTIwYWRkJTIwdGhlbSUyMCUzQSklMEElMEFJJTIwYWxyZWFkeSUyMG5vdGUlMjB0aGUlMjAlMjJmcmVlemUlMjIlMjBtb2R1bGVzJTIwb3B0aW9uJTJDJTIwdG8lMjBwcmV2ZW50JTIwbG9jYWwlMjBtb2R1bGVzJTIwdXBkYXRlcy4lMjBJdCUyMHdvdWxkJTIwYmUlMjBhJTIwY29vbCUyMGZlYXR1cmUuJTBBJTBBLS0tLS0tLS0tLSUwQSUwQSUzRSUyMCU0MG1hbmR1YmlhbidzJTIwc2Vjb25kJTIwYnVsbGV0JTIwcG9pbnQlMjBzdWdnZXN0cyUyMHRoYXQlMjB0aGVyZSdzJTIwYSUyMHdvcmthcm91bmQlMjBhbGxvd2luZyUyMHlvdSUyMHRvJTIwdXNlJTIweW91ciUyMG9mZmxpbmUlMjAoY3VzdG9tJTNGKSUyMGRhdGFzZXQlMjB3aXRoJTIwJTYwZGF0YXNldHMlNjAuJTIwQ291bGQlMjB5b3UlMjBwbGVhc2UlMjBlbGFib3JhdGUlMjBvbiUyMGhvdyUyMHRoYXQlMjBzaG91bGQlMjBsb29rJTIwbGlrZSUzRiUwQSUwQUluZGVlZCUyMCU2MGxvYWRfZGF0YXNldCU2MCUyMGFsbG93cyUyMHRvJTIwbG9hZCUyMHJlbW90ZSUyMGRhdGFzZXQlMjBzY3JpcHQlMjAoc3F1YWQlMkMlMjBnbHVlJTJDJTIwZXRjLiklMjBidXQlMjBhbHNvJTIweW91JTIwb3duJTIwbG9jYWwlMjBvbmVzLiUwQUZvciUyMGV4YW1wbGUlMjBpZiUyMHlvdSUyMGhhdmUlMjBhJTIwZGF0YXNldCUyMHNjcmlwdCUyMGF0JTIwJTYwLiUyRm15X2RhdGFzZXQlMkZteV9kYXRhc2V0LnB5JTYwJTIwdGhlbiUyMHlvdSUyMGNhbiUyMGRvJTBBJTVDJTYwJTVDJTYwJTVDJTYwcHl0aG9uJTBBbG9hZF9kYXRhc2V0KCUyMi4lMkZteV9kYXRhc2V0JTIyKSUwQSU1QyU2MCU1QyU2MCU1QyU2MCUwQWFuZCUyMHRoZSUyMGRhdGFzZXQlMjBzY3JpcHQlMjB3aWxsJTIwZ2VuZXJhdGUlMjB5b3VyJTIwZGF0YXNldCUyMG9uY2UlMjBhbmQlMjBmb3IlMjBhbGwuJTBBJTBBLS0tLS0tLS0tLSUwQSUwQUFib3V0JTIwSSdtJTIwbG9va2luZyUyMGludG8lMjBoYXZpbmclMjAlNjBjc3YlNjAlMkMlMjAlNjBqc29uJTYwJTJDJTIwJTYwdGV4dCU2MCUyQyUyMCU2MHBhbmRhcyU2MCUyMGRhdGFzZXQlMjBidWlsZGVycyUyMGFscmVhZHklMjBpbmNsdWRlZCUyMGluJTIwdGhlJTIwJTYwZGF0YXNldHMlNjAlMjBwYWNrYWdlJTJDJTIwc28lMjB0aGF0JTIwdGhleSUyMGFyZSUyMGF2YWlsYWJsZSUyMG9mZmxpbmUlMjBieSUyMGRlZmF1bHQlMkMlMjBhcyUyMG9wcG9zZWQlMjB0byUyMHRoZSUyMG90aGVyJTIwZGF0YXNldHMlMjB0aGF0JTIwcmVxdWlyZSUyMHRoZSUyMHNjcmlwdCUyMHRvJTIwYmUlMjBkb3dubG9hZGVkLiUwQWNmJTIwJTIzMTcyNCUwQVNDT1JFJTNBJTIwMjQuMTQ4OTY1ODM1NTcxMjklMEFUSVRMRSUzQSUyMERpc2N1c3Npb24lMjB1c2luZyUyMGRhdGFzZXRzJTIwaW4lMjBvZmZsaW5lJTIwbW9kZSUwQVVSTCUzQSUyMGh0dHBzJTNBJTJGJTJGZ2l0aHViLmNvbSUyRmh1Z2dpbmdmYWNlJTJGZGF0YXNldHMlMkZpc3N1ZXMlMkY4MjQlMEElM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlMEElMEFDT01NRU5UJTNBJTIwJTNFJTIwaGVyZSUyMGlzJTIwbXklMjB3YXklMjB0byUyMGxvYWQlMjBhJTIwZGF0YXNldCUyMG9mZmxpbmUlMkMlMjBidXQlMjBpdCUyMCoqcmVxdWlyZXMqKiUyMGFuJTIwb25saW5lJTIwbWFjaGluZSUwQSUzRSUwQSUzRSUyMDEuJTIwKG9ubGluZSUyMG1hY2hpbmUpJTBBJTNFJTBBJTNFJTIwJTYwJTYwJTYwJTBBJTNFJTBBJTNFJTIwaW1wb3J0JTIwZGF0YXNldHMlMEElM0UlMEElM0UlMjBkYXRhJTIwJTNEJTIwZGF0YXNldHMubG9hZF9kYXRhc2V0KC4uLiklMEElM0UlMEElM0UlMjBkYXRhLnNhdmVfdG9fZGlzayglMkZZT1VSJTJGREFUQVNFVCUyRkRJUiklMEElM0UlMEElM0UlMjAlNjAlNjAlNjAlMEElM0UlMEElM0UlMjAyLiUyMGNvcHklMjB0aGUlMjBkaXIlMjBmcm9tJTIwb25saW5lJTIwdG8lMjB0aGUlMjBvZmZsaW5lJTIwbWFjaGluZSUwQSUzRSUwQSUzRSUyMDMuJTIwKG9mZmxpbmUlMjBtYWNoaW5lKSUwQSUzRSUwQSUzRSUyMCU2MCU2MCU2MCUwQSUzRSUwQSUzRSUyMGltcG9ydCUyMGRhdGFzZXRzJTBBJTNFJTBBJTNFJTIwZGF0YSUyMCUzRCUyMGRhdGFzZXRzLmxvYWRfZnJvbV9kaXNrKCUyRlNBVkVEJTJGREFUQSUyRkRJUiklMEElM0UlMEElM0UlMjAlNjAlNjAlNjAlMEElM0UlMEElM0UlMEElM0UlMEElM0UlMjBIVEguJTBBJTBBJTBBU0NPUkUlM0ElMjAyMi44OTM5OTMzNzc2ODU1NDclMEFUSVRMRSUzQSUyMERpc2N1c3Npb24lMjB1c2luZyUyMGRhdGFzZXRzJTIwaW4lMjBvZmZsaW5lJTIwbW9kZSUwQVVSTCUzQSUyMGh0dHBzJTNBJTJGJTJGZ2l0aHViLmNvbSUyRmh1Z2dpbmdmYWNlJTJGZGF0YXNldHMlMkZpc3N1ZXMlMkY4MjQlMEElM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlMEElMEFDT01NRU5UJTNBJTIwaGVyZSUyMGlzJTIwbXklMjB3YXklMjB0byUyMGxvYWQlMjBhJTIwZGF0YXNldCUyMG9mZmxpbmUlMkMlMjBidXQlMjBpdCUyMCoqcmVxdWlyZXMqKiUyMGFuJTIwb25saW5lJTIwbWFjaGluZSUwQTEuJTIwKG9ubGluZSUyMG1hY2hpbmUpJTBBJTVDJTYwJTVDJTYwJTVDJTYwJTBBaW1wb3J0JTIwZGF0YXNldHMlMEFkYXRhJTIwJTNEJTIwZGF0YXNldHMubG9hZF9kYXRhc2V0KC4uLiklMEFkYXRhLnNhdmVfdG9fZGlzayglMkZZT1VSJTJGREFUQVNFVCUyRkRJUiklMEElNUMlNjAlNUMlNjAlNUMlNjAlMEEyLiUyMGNvcHklMjB0aGUlMjBkaXIlMjBmcm9tJTIwb25saW5lJTIwdG8lMjB0aGUlMjBvZmZsaW5lJTIwbWFjaGluZSUwQTMuJTIwKG9mZmxpbmUlMjBtYWNoaW5lKSUwQSU1QyU2MCU1QyU2MCU1QyU2MCUwQWltcG9ydCUyMGRhdGFzZXRzJTBBZGF0YSUyMCUzRCUyMGRhdGFzZXRzLmxvYWRfZnJvbV9kaXNrKCUyRlNBVkVEJTJGREFUQSUyRkRJUiklMEElNUMlNjAlNUMlNjAlNUMlNjAlMEElMEFIVEguJTBBU0NPUkUlM0ElMjAyMi40MDY2MzUyODQ0MjM4MjglMEFUSVRMRSUzQSUyMERpc2N1c3Npb24lMjB1c2luZyUyMGRhdGFzZXRzJTIwaW4lMjBvZmZsaW5lJTIwbW9kZSUwQVVSTCUzQSUyMGh0dHBzJTNBJTJGJTJGZ2l0aHViLmNvbSUyRmh1Z2dpbmdmYWNlJTJGZGF0YXNldHMlMkZpc3N1ZXMlMkY4MjQlMEElM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlM0QlMEElMjIlMjIlMjI=",highlighted:`<span class="hljs-string">&quot;&quot;&quot;
COMMENT: Requiring online connection is a deal breaker in some cases unfortunately so it&#x27;d be great if offline mode is added similar to how \`transformers\` loads models offline fine.
@mandubian&#x27;s second bullet point suggests that there&#x27;s a workaround allowing you to use your offline (custom?) dataset with \`datasets\`. Could you please elaborate on how that should look like?
SCORE: 25.505046844482422
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: The local dataset builders (csv, text , json and pandas) are now part of the \`datasets\` package since #1726 :)
You can now use them offline
\\\`\\\`\\\`python
datasets = load_dataset(&quot;text&quot;, data_files=data_files)
\\\`\\\`\\\`
We&#x27;ll do a new release soon
SCORE: 24.555509567260742
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: I opened a PR that allows to reload modules that have already been loaded once even if there&#x27;s no internet.
Let me know if you know other ways that can make the offline mode experience better. I&#x27;d be happy to add them :)
I already note the &quot;freeze&quot; modules option, to prevent local modules updates. It would be a cool feature.
----------
&gt; @mandubian&#x27;s second bullet point suggests that there&#x27;s a workaround allowing you to use your offline (custom?) dataset with \`datasets\`. Could you please elaborate on how that should look like?
Indeed \`load_dataset\` allows to load remote dataset script (squad, glue, etc.) but also you own local ones.
For example if you have a dataset script at \`./my_dataset/my_dataset.py\` then you can do
\\\`\\\`\\\`python
load_dataset(&quot;./my_dataset&quot;)
\\\`\\\`\\\`
and the dataset script will generate your dataset once and for all.
----------
About I&#x27;m looking into having \`csv\`, \`json\`, \`text\`, \`pandas\` dataset builders already included in the \`datasets\` package, so that they are available offline by default, as opposed to the other datasets that require the script to be downloaded.
cf #1724
SCORE: 24.14896583557129
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: &gt; here is my way to load a dataset offline, but it **requires** an online machine
&gt;
&gt; 1. (online machine)
&gt;
&gt; \`\`\`
&gt;
&gt; import datasets
&gt;
&gt; data = datasets.load_dataset(...)
&gt;
&gt; data.save_to_disk(/YOUR/DATASET/DIR)
&gt;
&gt; \`\`\`
&gt;
&gt; 2. copy the dir from online to the offline machine
&gt;
&gt; 3. (offline machine)
&gt;
&gt; \`\`\`
&gt;
&gt; import datasets
&gt;
&gt; data = datasets.load_from_disk(/SAVED/DATA/DIR)
&gt;
&gt; \`\`\`
&gt;
&gt;
&gt;
&gt; HTH.
SCORE: 22.893993377685547
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
COMMENT: here is my way to load a dataset offline, but it **requires** an online machine
1. (online machine)
\\\`\\\`\\\`
import datasets
data = datasets.load_dataset(...)
data.save_to_disk(/YOUR/DATASET/DIR)
\\\`\\\`\\\`
2. copy the dir from online to the offline machine
3. (offline machine)
\\\`\\\`\\\`
import datasets
data = datasets.load_from_disk(/SAVED/DATA/DIR)
\\\`\\\`\\\`
HTH.
SCORE: 22.406635284423828
TITLE: Discussion using datasets in offline mode
URL: https://github.com/huggingface/datasets/issues/824
==================================================
&quot;&quot;&quot;</span>`,wrap:!1}}),Y=new Kl({props:{$$slots:{default:[bt]},$$scope:{ctx:C}}}),Ye=new rt({props:{source:"https://github.com/huggingface/course/blob/main/chapters/pt/chapter5/6.mdx"}}),{c(){a=h("meta"),p=i(),n=h("p"),w=i(),u(d.$$.fragment),c=i(),u(G.$$.fragment),b=i(),Z.c(),R=i(),k=h("p"),k.innerHTML=I,$=i(),u(o.$$.fragment),g=i(),u(F.$$.fragment),Oe=i(),S=h("p"),S.innerHTML=Ml,es=i(),E=h("p"),E.textContent=pl,ss=i(),W=h("div"),W.innerHTML=cl,ls=i(),u(A.$$.fragment),ts=i(),H=h("p"),H.textContent=ul,as=i(),u(q.$$.fragment),ns=i(),D=h("p"),D.innerHTML=Ul,os=i(),u(L.$$.fragment),is=i(),u(K.$$.fragment),rs=i(),P=h("p"),P.innerHTML=yl,ds=i(),u(O.$$.fragment),ms=i(),u(ee.$$.fragment),Ms=i(),se=h("p"),se.innerHTML=Jl,ps=i(),u(le.$$.fragment),cs=i(),u(te.$$.fragment),us=i(),ae=h("p"),ae.innerHTML=bl,Us=i(),u(ne.$$.fragment),ys=i(),oe=h("p"),oe.innerHTML=hl,Js=i(),u(ie.$$.fragment),bs=i(),u(re.$$.fragment),hs=i(),de=h("p"),de.innerHTML=jl,js=i(),u(me.$$.fragment),Ts=i(),_=h("table"),_.innerHTML=Tl,fs=i(),Me=h("p"),Me.innerHTML=fl,ws=i(),u(pe.$$.fragment),gs=i(),u(ce.$$.fragment),Qs=i(),ue=h("p"),ue.textContent=wl,Zs=i(),u(z.$$.fragment),Gs=i(),Ue=h("p"),Ue.innerHTML=gl,Cs=i(),u(ye.$$.fragment),Is=i(),Je=h("p"),Je.textContent=Ql,ks=i(),u(be.$$.fragment),$s=i(),u(he.$$.fragment),Rs=i(),je=h("p"),je.innerHTML=Zl,_s=i(),u(Te.$$.fragment),Ns=i(),fe=h("p"),fe.textContent=Gl,xs=i(),u(we.$$.fragment),Bs=i(),ge=h("p"),ge.innerHTML=Cl,vs=i(),x.c(),He=i(),Qe=h("p"),Qe.innerHTML=Il,Xs=i(),u(Ze.$$.fragment),Vs=i(),Ge=h("p"),Ge.textContent=kl,Ws=i(),v.c(),qe=i(),Ce=h("p"),Ce.textContent=$l,zs=i(),u(Ie.$$.fragment),Ys=i(),ke=h("p"),ke.innerHTML=Rl,Fs=i(),$e=h("p"),$e.innerHTML=_l,Ss=i(),u(Re.$$.fragment),Es=i(),_e=h("p"),_e.innerHTML=Nl,As=i(),V.c(),De=i(),Ne=h("p"),Ne.textContent=xl,Hs=i(),u(xe.$$.fragment),qs=i(),Be=h("p"),Be.innerHTML=Bl,Ds=i(),u(ve.$$.fragment),Ls=i(),Xe=h("p"),Xe.textContent=vl,Ks=i(),u(Ve.$$.fragment),Ps=i(),u(We.$$.fragment),Os=i(),ze=h("p"),ze.textContent=Xl,el=i(),u(Y.$$.fragment),sl=i(),u(Ye.$$.fragment),ll=i(),Le=h("p"),this.h()},l(e){const s=at("svelte-u9bgzb",document.head);a=j(s,"META",{name:!0,content:!0}),s.forEach(l),p=r(e),n=j(e,"P",{}),Ll(n).forEach(l),w=r(e),U(d.$$.fragment,e),c=r(e),U(G.$$.fragment,e),b=r(e),Z.l(e),R=r(e),k=j(e,"P",{"data-svelte-h":!0}),f(k)!=="svelte-gjg3xh"&&(k.innerHTML=I),$=r(e),U(o.$$.fragment,e),g=r(e),U(F.$$.fragment,e),Oe=r(e),S=j(e,"P",{"data-svelte-h":!0}),f(S)!=="svelte-isds6h"&&(S.innerHTML=Ml),es=r(e),E=j(e,"P",{"data-svelte-h":!0}),f(E)!=="svelte-1xgdub6"&&(E.textContent=pl),ss=r(e),W=j(e,"DIV",{class:!0,"data-svelte-h":!0}),f(W)!=="svelte-yxatr"&&(W.innerHTML=cl),ls=r(e),U(A.$$.fragment,e),ts=r(e),H=j(e,"P",{"data-svelte-h":!0}),f(H)!=="svelte-r8uu0o"&&(H.textContent=ul),as=r(e),U(q.$$.fragment,e),ns=r(e),D=j(e,"P",{"data-svelte-h":!0}),f(D)!=="svelte-136cz13"&&(D.innerHTML=Ul),os=r(e),U(L.$$.fragment,e),is=r(e),U(K.$$.fragment,e),rs=r(e),P=j(e,"P",{"data-svelte-h":!0}),f(P)!=="svelte-1dvqw0"&&(P.innerHTML=yl),ds=r(e),U(O.$$.fragment,e),ms=r(e),U(ee.$$.fragment,e),Ms=r(e),se=j(e,"P",{"data-svelte-h":!0}),f(se)!=="svelte-nwy9uh"&&(se.innerHTML=Jl),ps=r(e),U(le.$$.fragment,e),cs=r(e),U(te.$$.fragment,e),us=r(e),ae=j(e,"P",{"data-svelte-h":!0}),f(ae)!=="svelte-xiuizr"&&(ae.innerHTML=bl),Us=r(e),U(ne.$$.fragment,e),ys=r(e),oe=j(e,"P",{"data-svelte-h":!0}),f(oe)!=="svelte-b7um6f"&&(oe.innerHTML=hl),Js=r(e),U(ie.$$.fragment,e),bs=r(e),U(re.$$.fragment,e),hs=r(e),de=j(e,"P",{"data-svelte-h":!0}),f(de)!=="svelte-tnp87o"&&(de.innerHTML=jl),js=r(e),U(me.$$.fragment,e),Ts=r(e),_=j(e,"TABLE",{border:!0,class:!0,style:!0,"data-svelte-h":!0}),f(_)!=="svelte-1g5whzd"&&(_.innerHTML=Tl),fs=r(e),Me=j(e,"P",{"data-svelte-h":!0}),f(Me)!=="svelte-11z2n8l"&&(Me.innerHTML=fl),ws=r(e),U(pe.$$.fragment,e),gs=r(e),U(ce.$$.fragment,e),Qs=r(e),ue=j(e,"P",{"data-svelte-h":!0}),f(ue)!=="svelte-11iemom"&&(ue.textContent=wl),Zs=r(e),U(z.$$.fragment,e),Gs=r(e),Ue=j(e,"P",{"data-svelte-h":!0}),f(Ue)!=="svelte-1y0kl1b"&&(Ue.innerHTML=gl),Cs=r(e),U(ye.$$.fragment,e),Is=r(e),Je=j(e,"P",{"data-svelte-h":!0}),f(Je)!=="svelte-rgaukq"&&(Je.textContent=Ql),ks=r(e),U(be.$$.fragment,e),$s=r(e),U(he.$$.fragment,e),Rs=r(e),je=j(e,"P",{"data-svelte-h":!0}),f(je)!=="svelte-yjini6"&&(je.innerHTML=Zl),_s=r(e),U(Te.$$.fragment,e),Ns=r(e),fe=j(e,"P",{"data-svelte-h":!0}),f(fe)!=="svelte-tbk9kh"&&(fe.textContent=Gl),xs=r(e),U(we.$$.fragment,e),Bs=r(e),ge=j(e,"P",{"data-svelte-h":!0}),f(ge)!=="svelte-ep5a0v"&&(ge.innerHTML=Cl),vs=r(e),x.l(e),He=r(e),Qe=j(e,"P",{"data-svelte-h":!0}),f(Qe)!=="svelte-1vq7v6g"&&(Qe.innerHTML=Il),Xs=r(e),U(Ze.$$.fragment,e),Vs=r(e),Ge=j(e,"P",{"data-svelte-h":!0}),f(Ge)!=="svelte-wrdycy"&&(Ge.textContent=kl),Ws=r(e),v.l(e),qe=r(e),Ce=j(e,"P",{"data-svelte-h":!0}),f(Ce)!=="svelte-183luvk"&&(Ce.textContent=$l),zs=r(e),U(Ie.$$.fragment,e),Ys=r(e),ke=j(e,"P",{"data-svelte-h":!0}),f(ke)!=="svelte-70zzak"&&(ke.innerHTML=Rl),Fs=r(e),$e=j(e,"P",{"data-svelte-h":!0}),f($e)!=="svelte-z1611h"&&($e.innerHTML=_l),Ss=r(e),U(Re.$$.fragment,e),Es=r(e),_e=j(e,"P",{"data-svelte-h":!0}),f(_e)!=="svelte-1w5adwn"&&(_e.innerHTML=Nl),As=r(e),V.l(e),De=r(e),Ne=j(e,"P",{"data-svelte-h":!0}),f(Ne)!=="svelte-v8tlmq"&&(Ne.textContent=xl),Hs=r(e),U(xe.$$.fragment,e),qs=r(e),Be=j(e,"P",{"data-svelte-h":!0}),f(Be)!=="svelte-vun2q"&&(Be.innerHTML=Bl),Ds=r(e),U(ve.$$.fragment,e),Ls=r(e),Xe=j(e,"P",{"data-svelte-h":!0}),f(Xe)!=="svelte-j214oi"&&(Xe.textContent=vl),Ks=r(e),U(Ve.$$.fragment,e),Ps=r(e),U(We.$$.fragment,e),Os=r(e),ze=j(e,"P",{"data-svelte-h":!0}),f(ze)!=="svelte-mo1i2k"&&(ze.textContent=Xl),el=r(e),U(Y.$$.fragment,e),sl=r(e),U(Ye.$$.fragment,e),ll=r(e),Le=j(e,"P",{}),Ll(Le).forEach(l),this.h()},h(){Ke(a,"name","hf:doc:metadata"),Ke(a,"content",jt),Ke(W,"class","flex justify-center"),Ke(_,"border","1"),Ke(_,"class","dataframe"),ml(_,"table-layout","fixed"),ml(_,"word-wrap","break-word"),ml(_,"width","100%")},m(e,s){nt(document.head,a),t(e,p,s),t(e,n,s),t(e,w,s),y(d,e,s),t(e,c,s),y(G,e,s),t(e,b,s),Fe[Q].m(e,s),t(e,R,s),t(e,k,s),t(e,$,s),y(o,e,s),t(e,g,s),y(F,e,s),t(e,Oe,s),t(e,S,s),t(e,es,s),t(e,E,s),t(e,ss,s),t(e,W,s),t(e,ls,s),y(A,e,s),t(e,ts,s),t(e,H,s),t(e,as,s),y(q,e,s),t(e,ns,s),t(e,D,s),t(e,os,s),y(L,e,s),t(e,is,s),y(K,e,s),t(e,rs,s),t(e,P,s),t(e,ds,s),y(O,e,s),t(e,ms,s),y(ee,e,s),t(e,Ms,s),t(e,se,s),t(e,ps,s),y(le,e,s),t(e,cs,s),y(te,e,s),t(e,us,s),t(e,ae,s),t(e,Us,s),y(ne,e,s),t(e,ys,s),t(e,oe,s),t(e,Js,s),y(ie,e,s),t(e,bs,s),y(re,e,s),t(e,hs,s),t(e,de,s),t(e,js,s),y(me,e,s),t(e,Ts,s),t(e,_,s),t(e,fs,s),t(e,Me,s),t(e,ws,s),y(pe,e,s),t(e,gs,s),y(ce,e,s),t(e,Qs,s),t(e,ue,s),t(e,Zs,s),y(z,e,s),t(e,Gs,s),t(e,Ue,s),t(e,Cs,s),y(ye,e,s),t(e,Is,s),t(e,Je,s),t(e,ks,s),y(be,e,s),t(e,$s,s),y(he,e,s),t(e,Rs,s),t(e,je,s),t(e,_s,s),y(Te,e,s),t(e,Ns,s),t(e,fe,s),t(e,xs,s),y(we,e,s),t(e,Bs,s),t(e,ge,s),t(e,vs,s),Se[N].m(e,s),t(e,He,s),t(e,Qe,s),t(e,Xs,s),y(Ze,e,s),t(e,Vs,s),t(e,Ge,s),t(e,Ws,s),Ee[B].m(e,s),t(e,qe,s),t(e,Ce,s),t(e,zs,s),y(Ie,e,s),t(e,Ys,s),t(e,ke,s),t(e,Fs,s),t(e,$e,s),t(e,Ss,s),y(Re,e,s),t(e,Es,s),t(e,_e,s),t(e,As,s),Ae[X].m(e,s),t(e,De,s),t(e,Ne,s),t(e,Hs,s),y(xe,e,s),t(e,qs,s),t(e,Be,s),t(e,Ds,s),y(ve,e,s),t(e,Ls,s),t(e,Xe,s),t(e,Ks,s),y(Ve,e,s),t(e,Ps,s),y(We,e,s),t(e,Os,s),t(e,ze,s),t(e,el,s),y(Y,e,s),t(e,sl,s),y(Ye,e,s),t(e,ll,s),t(e,Le,s),tl=!0},p(e,[s]){const Hl={};s&1&&(Hl.fw=e[0]),d.$set(Hl);let ol=Q;Q=Wl(e),Q!==ol&&(nl(),m(Fe[ol],1,1,()=>{Fe[ol]=null}),al(),Z=Fe[Q],Z||(Z=Fe[Q]=Vl[Q](e),Z.c()),M(Z,1),Z.m(R.parentNode,R));const ql={};s&2&&(ql.$$scope={dirty:s,ctx:e}),z.$set(ql);let il=N;N=Yl(e),N!==il&&(nl(),m(Se[il],1,1,()=>{Se[il]=null}),al(),x=Se[N],x||(x=Se[N]=zl[N](e),x.c()),M(x,1),x.m(He.parentNode,He));let rl=B;B=Sl(e),B!==rl&&(nl(),m(Ee[rl],1,1,()=>{Ee[rl]=null}),al(),v=Ee[B],v||(v=Ee[B]=Fl[B](e),v.c()),M(v,1),v.m(qe.parentNode,qe));let dl=X;X=Al(e),X!==dl&&(nl(),m(Ae[dl],1,1,()=>{Ae[dl]=null}),al(),V=Ae[X],V||(V=Ae[X]=El[X](e),V.c()),M(V,1),V.m(De.parentNode,De));const Dl={};s&2&&(Dl.$$scope={dirty:s,ctx:e}),Y.$set(Dl)},i(e){tl||(M(d.$$.fragment,e),M(G.$$.fragment,e),M(Z),M(o.$$.fragment,e),M(F.$$.fragment,e),M(A.$$.fragment,e),M(q.$$.fragment,e),M(L.$$.fragment,e),M(K.$$.fragment,e),M(O.$$.fragment,e),M(ee.$$.fragment,e),M(le.$$.fragment,e),M(te.$$.fragment,e),M(ne.$$.fragment,e),M(ie.$$.fragment,e),M(re.$$.fragment,e),M(me.$$.fragment,e),M(pe.$$.fragment,e),M(ce.$$.fragment,e),M(z.$$.fragment,e),M(ye.$$.fragment,e),M(be.$$.fragment,e),M(he.$$.fragment,e),M(Te.$$.fragment,e),M(we.$$.fragment,e),M(x),M(Ze.$$.fragment,e),M(v),M(Ie.$$.fragment,e),M(Re.$$.fragment,e),M(V),M(xe.$$.fragment,e),M(ve.$$.fragment,e),M(Ve.$$.fragment,e),M(We.$$.fragment,e),M(Y.$$.fragment,e),M(Ye.$$.fragment,e),tl=!0)},o(e){m(d.$$.fragment,e),m(G.$$.fragment,e),m(Z),m(o.$$.fragment,e),m(F.$$.fragment,e),m(A.$$.fragment,e),m(q.$$.fragment,e),m(L.$$.fragment,e),m(K.$$.fragment,e),m(O.$$.fragment,e),m(ee.$$.fragment,e),m(le.$$.fragment,e),m(te.$$.fragment,e),m(ne.$$.fragment,e),m(ie.$$.fragment,e),m(re.$$.fragment,e),m(me.$$.fragment,e),m(pe.$$.fragment,e),m(ce.$$.fragment,e),m(z.$$.fragment,e),m(ye.$$.fragment,e),m(be.$$.fragment,e),m(he.$$.fragment,e),m(Te.$$.fragment,e),m(we.$$.fragment,e),m(x),m(Ze.$$.fragment,e),m(v),m(Ie.$$.fragment,e),m(Re.$$.fragment,e),m(V),m(xe.$$.fragment,e),m(ve.$$.fragment,e),m(Ve.$$.fragment,e),m(We.$$.fragment,e),m(Y.$$.fragment,e),m(Ye.$$.fragment,e),tl=!1},d(e){e&&(l(p),l(n),l(w),l(c),l(b),l(R),l(k),l($),l(g),l(Oe),l(S),l(es),l(E),l(ss),l(W),l(ls),l(ts),l(H),l(as),l(ns),l(D),l(os),l(is),l(rs),l(P),l(ds),l(ms),l(Ms),l(se),l(ps),l(cs),l(us),l(ae),l(Us),l(ys),l(oe),l(Js),l(bs),l(hs),l(de),l(js),l(Ts),l(_),l(fs),l(Me),l(ws),l(gs),l(Qs),l(ue),l(Zs),l(Gs),l(Ue),l(Cs),l(Is),l(Je),l(ks),l($s),l(Rs),l(je),l(_s),l(Ns),l(fe),l(xs),l(Bs),l(ge),l(vs),l(He),l(Qe),l(Xs),l(Vs),l(Ge),l(Ws),l(qe),l(Ce),l(zs),l(Ys),l(ke),l(Fs),l($e),l(Ss),l(Es),l(_e),l(As),l(De),l(Ne),l(Hs),l(qs),l(Be),l(Ds),l(Ls),l(Xe),l(Ks),l(Ps),l(Os),l(ze),l(el),l(sl),l(ll),l(Le)),l(a),J(d,e),J(G,e),Fe[Q].d(e),J(o,e),J(F,e),J(A,e),J(q,e),J(L,e),J(K,e),J(O,e),J(ee,e),J(le,e),J(te,e),J(ne,e),J(ie,e),J(re,e),J(me,e),J(pe,e),J(ce,e),J(z,e),J(ye,e),J(be,e),J(he,e),J(Te,e),J(we,e),Se[N].d(e),J(Ze,e),Ee[B].d(e),J(Ie,e),J(Re,e),Ae[X].d(e),J(xe,e),J(ve,e),J(Ve,e),J(We,e),J(Y,e),J(Ye,e)}}}const jt='{"title":"Busca semântica com o FAISS","local":"busca-semântica-com-o-faiss","sections":[{"title":"Usando embeddings para pesquisa semântica","local":"usando-embeddings-para-pesquisa-semântica","sections":[],"depth":2},{"title":"Carregando e preparando o conjunto de dados","local":"carregando-e-preparando-o-conjunto-de-dados","sections":[],"depth":2},{"title":"Criando embeddings de texto","local":"criando-embeddings-de-texto","sections":[],"depth":2},{"title":"Usando FAISS para busca de similaridade","local":"usando-faiss-para-busca-de-similaridade","sections":[],"depth":2}],"depth":1}';function Tt(C,a,p){let n="pt";return st(()=>{const w=new URLSearchParams(window.location.search);p(0,n=w.get("fw")||"pt")}),[n]}class kt extends lt{constructor(a){super(),tt(this,a,Tt,ht,et,{})}}export{kt as component};

Xet Storage Details

Size:
67.4 kB
·
Xet hash:
212338ce50ff36acea1e6c57fcbcab3f73ebfa811072e68a9ee4d149b2cd8ba2

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.