Buckets:
| import{s as et,o as nt}from"../chunks/scheduler.37c15a92.js";import{S as st,i as at,g as h,s as c,r as u,A as ct,h as T,f as e,c as i,j as Pl,u as r,x as y,k as Ll,y as it,a as s,v as m,t as p,b as Kl,d as M,w as J,p as lt}from"../chunks/index.2bf4358c.js";import{C as U}from"../chunks/CodeBlock.4e987730.js";import{C as tt}from"../chunks/CourseFloatingBanner.9ff4c771.js";import{F as ot}from"../chunks/FrameworkSwitchCourse.8d4d4ab6.js";import{H as El,E as pt}from"../chunks/getInferenceSnippets.80a69898.js";function Mt(j){let a,o;return a=new tt({props:{chapter:2,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/vi/chapter2/section6_tf.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/vi/chapter2/section6_tf.ipynb"}]}}),{c(){u(a.$$.fragment)},l(n){r(a.$$.fragment,n)},m(n,b){m(a,n,b),o=!0},i(n){o||(M(a.$$.fragment,n),o=!0)},o(n){p(a.$$.fragment,n),o=!1},d(n){J(a,n)}}}function ut(j){let a,o;return a=new tt({props:{chapter:2,classNames:"absolute z-10 right-0 top-0",notebooks:[{label:"Google Colab",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/master/course/vi/chapter2/section6_pt.ipynb"},{label:"Aws Studio",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/master/course/vi/chapter2/section6_pt.ipynb"}]}}),{c(){u(a.$$.fragment)},l(n){r(a.$$.fragment,n)},m(n,b){m(a,n,b),o=!0},i(n){o||(M(a.$$.fragment,n),o=!0)},o(n){p(a.$$.fragment,n),o=!1},d(n){J(a,n)}}}function rt(j){let a,o;return a=new U({props:{code:"aW1wb3J0JTIwdGVuc29yZmxvdyUyMGFzJTIwdGYlMEFmcm9tJTIwdHJhbnNmb3JtZXJzJTIwaW1wb3J0JTIwQXV0b1Rva2VuaXplciUyQyUyMFRGQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbiUwQSUwQWNoZWNrcG9pbnQlMjAlM0QlMjAlMjJkaXN0aWxiZXJ0LWJhc2UtdW5jYXNlZC1maW5ldHVuZWQtc3N0LTItZW5nbGlzaCUyMiUwQXRva2VuaXplciUyMCUzRCUyMEF1dG9Ub2tlbml6ZXIuZnJvbV9wcmV0cmFpbmVkKGNoZWNrcG9pbnQpJTBBbW9kZWwlMjAlM0QlMjBURkF1dG9Nb2RlbEZvclNlcXVlbmNlQ2xhc3NpZmljYXRpb24uZnJvbV9wcmV0cmFpbmVkKGNoZWNrcG9pbnQpJTBBc2VxdWVuY2VzJTIwJTNEJTIwJTVCJTIySSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMEh1Z2dpbmdGYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIyJTJDJTIwJTIyU28lMjBoYXZlJTIwSSElMjIlNUQlMEElMEF0b2tlbnMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2VzJTJDJTIwcGFkZGluZyUzRFRydWUlMkMlMjB0cnVuY2F0aW9uJTNEVHJ1ZSUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIydGYlMjIpJTBBb3V0cHV0JTIwJTNEJTIwbW9kZWwoKip0b2tlbnMp",highlighted:`<span class="hljs-keyword">import</span> tensorflow <span class="hljs-keyword">as</span> tf | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, TFAutoModelForSequenceClassification | |
| checkpoint = <span class="hljs-string">"distilbert-base-uncased-finetuned-sst-2-english"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(checkpoint) | |
| model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint) | |
| sequences = [<span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span>, <span class="hljs-string">"So have I!"</span>] | |
| tokens = tokenizer(sequences, padding=<span class="hljs-literal">True</span>, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"tf"</span>) | |
| output = model(**tokens)`,wrap:!1}}),{c(){u(a.$$.fragment)},l(n){r(a.$$.fragment,n)},m(n,b){m(a,n,b),o=!0},i(n){o||(M(a.$$.fragment,n),o=!0)},o(n){p(a.$$.fragment,n),o=!1},d(n){J(a,n)}}}function mt(j){let a,o;return a=new U({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdHJhbnNmb3JtZXJzJTIwaW1wb3J0JTIwQXV0b1Rva2VuaXplciUyQyUyMEF1dG9Nb2RlbEZvclNlcXVlbmNlQ2xhc3NpZmljYXRpb24lMEElMEFjaGVja3BvaW50JTIwJTNEJTIwJTIyZGlzdGlsYmVydC1iYXNlLXVuY2FzZWQtZmluZXR1bmVkLXNzdC0yLWVuZ2xpc2glMjIlMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZChjaGVja3BvaW50KSUwQW1vZGVsJTIwJTNEJTIwQXV0b01vZGVsRm9yU2VxdWVuY2VDbGFzc2lmaWNhdGlvbi5mcm9tX3ByZXRyYWluZWQoY2hlY2twb2ludCklMEFzZXF1ZW5jZXMlMjAlM0QlMjAlNUIlMjJJJ3ZlJTIwYmVlbiUyMHdhaXRpbmclMjBmb3IlMjBhJTIwSHVnZ2luZ0ZhY2UlMjBjb3Vyc2UlMjBteSUyMHdob2xlJTIwbGlmZS4lMjIlMkMlMjAlMjJTbyUyMGhhdmUlMjBJISUyMiU1RCUwQSUwQXRva2VucyUyMCUzRCUyMHRva2VuaXplcihzZXF1ZW5jZXMlMkMlMjBwYWRkaW5nJTNEVHJ1ZSUyQyUyMHRydW5jYXRpb24lM0RUcnVlJTJDJTIwcmV0dXJuX3RlbnNvcnMlM0QlMjJwdCUyMiklMEFvdXRwdXQlMjAlM0QlMjBtb2RlbCgqKnRva2Vucyk=",highlighted:`<span class="hljs-keyword">import</span> torch | |
| <span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer, AutoModelForSequenceClassification | |
| checkpoint = <span class="hljs-string">"distilbert-base-uncased-finetuned-sst-2-english"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(checkpoint) | |
| model = AutoModelForSequenceClassification.from_pretrained(checkpoint) | |
| sequences = [<span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span>, <span class="hljs-string">"So have I!"</span>] | |
| tokens = tokenizer(sequences, padding=<span class="hljs-literal">True</span>, truncation=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"pt"</span>) | |
| output = model(**tokens)`,wrap:!1}}),{c(){u(a.$$.fragment)},l(n){r(a.$$.fragment,n)},m(n,b){m(a,n,b),o=!0},i(n){o||(M(a.$$.fragment,n),o=!0)},o(n){p(a.$$.fragment,n),o=!1},d(n){J(a,n)}}}function Jt(j){let a,o,n,b,k,nl,f,sl,w,d,ll,V,Sl="Trong vài phần trước, chúng ta đã cố gắng hết sức để làm hầu hết các tác vụ bằng tay. Chúng ta đã khám phá cách thức hoạt động của các công cụ tokenize và xem xét quá trình tokenize, chuyển đổi dữ liệu sang ID đầu vào, đệm, cắt bớt và các lớp che attention.",al,Q,zl="Tuy nhiên, như chúng ta đã thấy trong phần 2, API 🤗 Transformers có thể xử lý tất cả những điều này cho chúng ta bằng một chức năng cấp cao mà chúng ta sẽ đi sâu vào đây. Khi bạn gọi trực tiếp <code>tokenizer</code> trên câu, bạn sẽ nhận lại được các thông tin đầu vào sẵn sàng chuyển qua mô hình của bạn:",cl,Z,il,$,Wl="Ở đây, biến <code>model_inputs</code> chứa mọi thứ cần thiết để một mô hình hoạt động tốt. Đối với DistilBERT, điều đó bao gồm các ID đầu vào cũng như lớp che attention. Các mô hình khác chấp nhận đầu vào bổ sung cũng sẽ có đầu ra đó từ đối tượng <code>tokenizer</code>.",ol,B,vl="Như chúng ta sẽ thấy trong một số ví dụ bên dưới, phương pháp này rất mạnh mẽ. Đầu tiên, nó có thể mã hóa một chuỗi duy nhất:",pl,C,Ml,E,Nl="Nó cũng xử lý nhiều chuỗi cùng một lúc mà không cần thay đổi trong API:",ul,S,rl,z,Gl="Nó có thể đệm thêm tuỳ theo một số mục tiêu như sau:",ml,W,Jl,v,xl="Nó cũng có thể cắt bớt các chuỗi:",hl,N,Tl,G,Rl="Đối tượng <code>tokenizer</code> có thể xử lý việc chuyển đổi sang các tensor cụ thể, sau đó có thể được gửi trực tiếp đến mô hình. Ví dụ: trong đoạn mã sau, chúng tôi đang nhắc tokenizer trả về tensors từ các khung khác nhau - <code>"pt"</code> trả về tensors PyTorch, <code>"tf"</code> trả về tensors TensorFlow và <code>"np"</code> trả về mảng NumPy:",bl,x,yl,R,Ul,_,_l="Nếu chúng ta xem xét các ID đầu vào được trả về bởi tokenizer, chúng ta sẽ thấy chúng hơi khác một chút so với những gì chúng ta đã có trước đó:",jl,F,wl,q,dl,Y,Fl="Một token ID đã được thêm vào vị trí đầu và cuối. Hãy giải mã hai chuỗi ID ở trên để xem nó là gì:",gl,X,Il,H,kl,D,ql="Tokenizer đã thêm từ đặc biệt <code>[CLS]</code> vào đầu và từ đặc biệt <code>[SEP]</code> ở cuối. Điều này là do mô hình đã được huấn luyện trước với chúng, vì vậy để có được kết quả tương tự để luận suy, chúng ta cũng cần thêm chúng vào. Lưu ý rằng một số mô hình không thêm các từ đặc biệt hoặc thêm các từ khác; mô hình cũng có thể chỉ thêm những từ đặc biệt này vào đầu hoặc chỉ ở cuối. Trong mọi trường hợp, tokenizer biết cái nào được mong đợi và sẽ giải quyết việc này cho bạn.",fl,A,Vl,O,Yl="Giờ chúng ta đã thấy tất cả các bước riêng lẻ mà <code>tokenizer</code> sử dụng khi áp dụng lên văn bản, chúng ta hãy xem lần cuối cách nó có thể xử lý nhiều chuỗi (đệm thêm!), chuỗi rất dài (cắt ngắn!) Và nhiều kiểu tensor với API chính của nó:",Ql,g,I,tl,P,Zl,el,$l;k=new ot({props:{fw:j[0]}}),f=new El({props:{title:"Kết hợp lại",local:"kết-hợp-lại",headingTag:"h1"}});const Xl=[ut,Mt],L=[];function Hl(l,t){return l[0]==="pt"?0:1}w=Hl(j),d=L[w]=Xl[w](j),Z=new U({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Ub2tlbml6ZXIlMEElMEFjaGVja3BvaW50JTIwJTNEJTIwJTIyZGlzdGlsYmVydC1iYXNlLXVuY2FzZWQtZmluZXR1bmVkLXNzdC0yLWVuZ2xpc2glMjIlMEF0b2tlbml6ZXIlMjAlM0QlMjBBdXRvVG9rZW5pemVyLmZyb21fcHJldHJhaW5lZChjaGVja3BvaW50KSUwQSUwQXNlcXVlbmNlJTIwJTNEJTIwJTIySSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMEh1Z2dpbmdGYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIyJTBBJTBBbW9kZWxfaW5wdXRzJTIwJTNEJTIwdG9rZW5pemVyKHNlcXVlbmNlKQ==",highlighted:`<span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoTokenizer | |
| checkpoint = <span class="hljs-string">"distilbert-base-uncased-finetuned-sst-2-english"</span> | |
| tokenizer = AutoTokenizer.from_pretrained(checkpoint) | |
| sequence = <span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span> | |
| model_inputs = tokenizer(sequence)`,wrap:!1}}),C=new U({props:{code:"c2VxdWVuY2UlMjAlM0QlMjAlMjJJJ3ZlJTIwYmVlbiUyMHdhaXRpbmclMjBmb3IlMjBhJTIwSHVnZ2luZ0ZhY2UlMjBjb3Vyc2UlMjBteSUyMHdob2xlJTIwbGlmZS4lMjIlMEElMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2Up",highlighted:`sequence = <span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span> | |
| model_inputs = tokenizer(sequence)`,wrap:!1}}),S=new U({props:{code:"c2VxdWVuY2VzJTIwJTNEJTIwJTVCJTIySSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMEh1Z2dpbmdGYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIyJTJDJTIwJTIyU28lMjBoYXZlJTIwSSElMjIlNUQlMEElMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2VzKQ==",highlighted:`sequences = [<span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span>, <span class="hljs-string">"So have I!"</span>] | |
| model_inputs = tokenizer(sequences)`,wrap:!1}}),W=new U({props:{code:"JTIzJTIwUyVFMSVCQSVCRCUyMCVDNCU5MSVFMSVCQiU4N20lMjB0aCVDMyVBQW0lMjB2JUMzJUEwbyUyMGNodSVFMSVCQiU5N2klMjBzYW8lMjBjaG8lMjAlQzQlOTElRTElQkIlOTklMjBkJUMzJUEwaSUyMGIlRTElQkElQjFuZyUyMCVDNCU5MSVFMSVCQiU5OSUyMGQlQzMlQTBpJTIwdCVFMSVCQiU5MWklMjAlQzQlOTFhJTIwYyVFMSVCQiVBN2ElMjBjaHUlRTElQkIlOTdpJTBBbW9kZWxfaW5wdXRzJTIwJTNEJTIwdG9rZW5pemVyKHNlcXVlbmNlcyUyQyUyMHBhZGRpbmclM0QlMjJsb25nZXN0JTIyKSUwQSUwQSUyMyUyMFMlRTElQkElQkQlMjAlQzQlOTElRTElQkIlODdtJTIwdGglQzMlQUFtJTIwdiVDMyVBMG8lMjBjaHUlRTElQkIlOTdpJTIwc2FvJTIwY2hvJTIwJUM0JTkxJUUxJUJCJTk5JTIwZCVDMyVBMGklMjBiJUUxJUJBJUIxbmclMjAlQzQlOTElRTElQkIlOTklMjBkJUMzJUEwaSUyMHQlRTElQkIlOTFpJTIwJUM0JTkxYSUyMGMlRTElQkIlQTdhJTIwbSVDMyVCNCUyMGglQzMlQUNuaCUwQSUyMyUyMCg1MTIlMjBjaG8lMjBCRVJUJTIwaG8lRTElQkElQjdjJTIwRGlzdGlsQkVSVCklMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2VzJTJDJTIwcGFkZGluZyUzRCUyMm1heF9sZW5ndGglMjIpJTBBJTBBJTIzJTIwUyVFMSVCQSVCRCUyMCVDNCU5MSVFMSVCQiU4N20lMjB0aCVDMyVBQW0lMjB2JUMzJUEwbyUyMGNodSVFMSVCQiU5N2klMjBzYW8lMjBjaG8lMjAlQzQlOTElRTElQkIlOTklMjBkJUMzJUEwaSUyMGIlRTElQkElQjFuZyUyMCVDNCU5MSVFMSVCQiU5OSUyMGQlQzMlQTBpJTIwdCVFMSVCQiU5MWklMjAlQzQlOTFhJTIwJUM0JTkxJUM2JUIwJUUxJUJCJUEzYyUyMGNoJUUxJUJCJTg5JTIwJUM0JTkxJUUxJUJCJThCbmglMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2VzJTJDJTIwcGFkZGluZyUzRCUyMm1heF9sZW5ndGglMjIlMkMlMjBtYXhfbGVuZ3RoJTNEOCk=",highlighted:`<span class="hljs-comment"># Sẽ đệm thêm vào chuỗi sao cho độ dài bằng độ dài tối đa của chuỗi</span> | |
| model_inputs = tokenizer(sequences, padding=<span class="hljs-string">"longest"</span>) | |
| <span class="hljs-comment"># Sẽ đệm thêm vào chuỗi sao cho độ dài bằng độ dài tối đa của mô hình</span> | |
| <span class="hljs-comment"># (512 cho BERT hoặc DistilBERT)</span> | |
| model_inputs = tokenizer(sequences, padding=<span class="hljs-string">"max_length"</span>) | |
| <span class="hljs-comment"># Sẽ đệm thêm vào chuỗi sao cho độ dài bằng độ dài tối đa được chỉ định</span> | |
| model_inputs = tokenizer(sequences, padding=<span class="hljs-string">"max_length"</span>, max_length=<span class="hljs-number">8</span>)`,wrap:!1}}),N=new U({props:{code:"c2VxdWVuY2VzJTIwJTNEJTIwJTVCJTIySSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMEh1Z2dpbmdGYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIyJTJDJTIwJTIyU28lMjBoYXZlJTIwSSElMjIlNUQlMEElMEElMjMlMjBTJUUxJUJBJUJEJTIwYyVFMSVCQSVBRnQlMjBiJUUxJUJCJTlCdCUyMGNodSVFMSVCQiU5N2klMjBjaG8lMjBiJUUxJUJBJUIxbmclMjAlQzQlOTElRTElQkIlOTklMjBkJUMzJUEwaSUyMHQlRTElQkIlOTFpJTIwJUM0JTkxYSUyMGMlRTElQkIlQTdhJTIwbSVDMyVCNCUyMGglQzMlQUNuaCUwQSUyMyUyMCg1MTIlMjBjaG8lMjBCRVJUJTIwaG8lRTElQkElQjdjJTIwRGlzdGlsQkVSVCklMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2VzJTJDJTIwdHJ1bmNhdGlvbiUzRFRydWUpJTBBJTBBJTIzJTIwUyVFMSVCQSVCRCUyMGMlRTElQkElQUZ0JTIwYiVFMSVCQiU5QnQlMjBjaHUlRTElQkIlOTdpJTIwYyVDMyVCMyUyMCVDNCU5MSVFMSVCQiU5OSUyMGQlQzMlQTBpJTIwZCVDMyVBMGklMjBoJUM2JUExbiUyMCVDNCU5MSVFMSVCQiU5OSUyMGQlQzMlQTBpJTIwdCVFMSVCQiU5MWklMjAlQzQlOTFhJTIwJUM0JTkxJUM2JUIwJUUxJUJCJUEzYyUyMGNoJUUxJUJCJTg5JTIwJUM0JTkxJUUxJUJCJThCbmglMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2VzJTJDJTIwbWF4X2xlbmd0aCUzRDglMkMlMjB0cnVuY2F0aW9uJTNEVHJ1ZSk=",highlighted:`sequences = [<span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span>, <span class="hljs-string">"So have I!"</span>] | |
| <span class="hljs-comment"># Sẽ cắt bớt chuỗi cho bằng độ dài tối đa của mô hình</span> | |
| <span class="hljs-comment"># (512 cho BERT hoặc DistilBERT)</span> | |
| model_inputs = tokenizer(sequences, truncation=<span class="hljs-literal">True</span>) | |
| <span class="hljs-comment"># Sẽ cắt bớt chuỗi có độ dài dài hơn độ dài tối đa được chỉ định</span> | |
| model_inputs = tokenizer(sequences, max_length=<span class="hljs-number">8</span>, truncation=<span class="hljs-literal">True</span>)`,wrap:!1}}),x=new U({props:{code:"c2VxdWVuY2VzJTIwJTNEJTIwJTVCJTIySSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMEh1Z2dpbmdGYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIyJTJDJTIwJTIyU28lMjBoYXZlJTIwSSElMjIlNUQlMEElMEElMjMlMjBUciVFMSVCQSVBMyUyMHYlRTElQkIlODElMjB0ZW5zb3IlMjBQeVRvcmNoJTBBbW9kZWxfaW5wdXRzJTIwJTNEJTIwdG9rZW5pemVyKHNlcXVlbmNlcyUyQyUyMHBhZGRpbmclM0RUcnVlJTJDJTIwcmV0dXJuX3RlbnNvcnMlM0QlMjJwdCUyMiklMEElMEElMjMlMjBUciVFMSVCQSVBMyUyMHYlRTElQkIlODElMjB0ZW5zb3IlMjBUZW5zb3JGbG93JTBBbW9kZWxfaW5wdXRzJTIwJTNEJTIwdG9rZW5pemVyKHNlcXVlbmNlcyUyQyUyMHBhZGRpbmclM0RUcnVlJTJDJTIwcmV0dXJuX3RlbnNvcnMlM0QlMjJ0ZiUyMiklMEElMEElMjMlMjBUciVFMSVCQSVBMyUyMHYlRTElQkIlODElMjBtJUUxJUJBJUEzbmclMjBOdW1QeSUwQW1vZGVsX2lucHV0cyUyMCUzRCUyMHRva2VuaXplcihzZXF1ZW5jZXMlMkMlMjBwYWRkaW5nJTNEVHJ1ZSUyQyUyMHJldHVybl90ZW5zb3JzJTNEJTIybnAlMjIp",highlighted:`sequences = [<span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span>, <span class="hljs-string">"So have I!"</span>] | |
| <span class="hljs-comment"># Trả về tensor PyTorch</span> | |
| model_inputs = tokenizer(sequences, padding=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"pt"</span>) | |
| <span class="hljs-comment"># Trả về tensor TensorFlow</span> | |
| model_inputs = tokenizer(sequences, padding=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"tf"</span>) | |
| <span class="hljs-comment"># Trả về mảng NumPy</span> | |
| model_inputs = tokenizer(sequences, padding=<span class="hljs-literal">True</span>, return_tensors=<span class="hljs-string">"np"</span>)`,wrap:!1}}),R=new El({props:{title:"Các token đặc biệt",local:"các-token-đặc-biệt",headingTag:"h2"}}),F=new U({props:{code:"c2VxdWVuY2UlMjAlM0QlMjAlMjJJJ3ZlJTIwYmVlbiUyMHdhaXRpbmclMjBmb3IlMjBhJTIwSHVnZ2luZ0ZhY2UlMjBjb3Vyc2UlMjBteSUyMHdob2xlJTIwbGlmZS4lMjIlMEElMEFtb2RlbF9pbnB1dHMlMjAlM0QlMjB0b2tlbml6ZXIoc2VxdWVuY2UpJTBBcHJpbnQobW9kZWxfaW5wdXRzJTVCJTIyaW5wdXRfaWRzJTIyJTVEKSUwQSUwQXRva2VucyUyMCUzRCUyMHRva2VuaXplci50b2tlbml6ZShzZXF1ZW5jZSklMEFpZHMlMjAlM0QlMjB0b2tlbml6ZXIuY29udmVydF90b2tlbnNfdG9faWRzKHRva2VucyklMEFwcmludChpZHMp",highlighted:`sequence = <span class="hljs-string">"I've been waiting for a HuggingFace course my whole life."</span> | |
| model_inputs = tokenizer(sequence) | |
| <span class="hljs-built_in">print</span>(model_inputs[<span class="hljs-string">"input_ids"</span>]) | |
| tokens = tokenizer.tokenize(sequence) | |
| ids = tokenizer.convert_tokens_to_ids(tokens) | |
| <span class="hljs-built_in">print</span>(ids)`,wrap:!1}}),q=new U({props:{code:"JTVCMTAxJTJDJTIwMTA0NSUyQyUyMDEwMDUlMkMlMjAyMzEwJTJDJTIwMjA0MiUyQyUyMDM0MDMlMkMlMjAyMDA1JTJDJTIwMTAzNyUyQyUyMDE3NjYyJTJDJTIwMTIxNzIlMkMlMjAyNjA3JTJDJTIwMjAyNiUyQyUyMDI4NzglMkMlMjAyMTY2JTJDJTIwMTAxMiUyQyUyMDEwMiU1RCUwQSU1QjEwNDUlMkMlMjAxMDA1JTJDJTIwMjMxMCUyQyUyMDIwNDIlMkMlMjAzNDAzJTJDJTIwMjAwNSUyQyUyMDEwMzclMkMlMjAxNzY2MiUyQyUyMDEyMTcyJTJDJTIwMjYwNyUyQyUyMDIwMjYlMkMlMjAyODc4JTJDJTIwMjE2NiUyQyUyMDEwMTIlNUQ=",highlighted:`[<span class="hljs-number">101</span>, <span class="hljs-number">1045</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">2310</span>, <span class="hljs-number">2042</span>, <span class="hljs-number">3403</span>, <span class="hljs-number">2005</span>, <span class="hljs-number">1037</span>, <span class="hljs-number">17662</span>, <span class="hljs-number">12172</span>, <span class="hljs-number">2607</span>, <span class="hljs-number">2026</span>, <span class="hljs-number">2878</span>, <span class="hljs-number">2166</span>, <span class="hljs-number">1012</span>, <span class="hljs-number">102</span>] | |
| [<span class="hljs-number">1045</span>, <span class="hljs-number">1005</span>, <span class="hljs-number">2310</span>, <span class="hljs-number">2042</span>, <span class="hljs-number">3403</span>, <span class="hljs-number">2005</span>, <span class="hljs-number">1037</span>, <span class="hljs-number">17662</span>, <span class="hljs-number">12172</span>, <span class="hljs-number">2607</span>, <span class="hljs-number">2026</span>, <span class="hljs-number">2878</span>, <span class="hljs-number">2166</span>, <span class="hljs-number">1012</span>]`,wrap:!1}}),X=new U({props:{code:"cHJpbnQodG9rZW5pemVyLmRlY29kZShtb2RlbF9pbnB1dHMlNUIlMjJpbnB1dF9pZHMlMjIlNUQpKSUwQXByaW50KHRva2VuaXplci5kZWNvZGUoaWRzKSk=",highlighted:`<span class="hljs-built_in">print</span>(tokenizer.decode(model_inputs[<span class="hljs-string">"input_ids"</span>])) | |
| <span class="hljs-built_in">print</span>(tokenizer.decode(ids))`,wrap:!1}}),H=new U({props:{code:"JTIyJTVCQ0xTJTVEJTIwaSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMGh1Z2dpbmdmYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIwJTVCU0VQJTVEJTIyJTBBJTIyaSd2ZSUyMGJlZW4lMjB3YWl0aW5nJTIwZm9yJTIwYSUyMGh1Z2dpbmdmYWNlJTIwY291cnNlJTIwbXklMjB3aG9sZSUyMGxpZmUuJTIy",highlighted:`<span class="hljs-string">"[CLS] i've been waiting for a huggingface course my whole life. [SEP]"</span> | |
| <span class="hljs-string">"i've been waiting for a huggingface course my whole life."</span>`,wrap:!1}}),A=new El({props:{title:"Tổng kết: Từ tokenizer đến mô hình",local:"tổng-kết-từ-tokenizer-đến-mô-hình",headingTag:"h2"}});const Dl=[mt,rt],K=[];function Al(l,t){return l[0]==="pt"?0:1}return g=Al(j),I=K[g]=Dl[g](j),P=new pt({props:{source:"https://github.com/huggingface/course/blob/main/chapters/vi/chapter2/6.mdx"}}),{c(){a=h("meta"),o=c(),n=h("p"),b=c(),u(k.$$.fragment),nl=c(),u(f.$$.fragment),sl=c(),d.c(),ll=c(),V=h("p"),V.textContent=Sl,al=c(),Q=h("p"),Q.innerHTML=zl,cl=c(),u(Z.$$.fragment),il=c(),$=h("p"),$.innerHTML=Wl,ol=c(),B=h("p"),B.textContent=vl,pl=c(),u(C.$$.fragment),Ml=c(),E=h("p"),E.textContent=Nl,ul=c(),u(S.$$.fragment),rl=c(),z=h("p"),z.textContent=Gl,ml=c(),u(W.$$.fragment),Jl=c(),v=h("p"),v.textContent=xl,hl=c(),u(N.$$.fragment),Tl=c(),G=h("p"),G.innerHTML=Rl,bl=c(),u(x.$$.fragment),yl=c(),u(R.$$.fragment),Ul=c(),_=h("p"),_.textContent=_l,jl=c(),u(F.$$.fragment),wl=c(),u(q.$$.fragment),dl=c(),Y=h("p"),Y.textContent=Fl,gl=c(),u(X.$$.fragment),Il=c(),u(H.$$.fragment),kl=c(),D=h("p"),D.innerHTML=ql,fl=c(),u(A.$$.fragment),Vl=c(),O=h("p"),O.innerHTML=Yl,Ql=c(),I.c(),tl=c(),u(P.$$.fragment),Zl=c(),el=h("p"),this.h()},l(l){const t=ct("svelte-u9bgzb",document.head);a=T(t,"META",{name:!0,content:!0}),t.forEach(e),o=i(l),n=T(l,"P",{}),Pl(n).forEach(e),b=i(l),r(k.$$.fragment,l),nl=i(l),r(f.$$.fragment,l),sl=i(l),d.l(l),ll=i(l),V=T(l,"P",{"data-svelte-h":!0}),y(V)!=="svelte-1u8130m"&&(V.textContent=Sl),al=i(l),Q=T(l,"P",{"data-svelte-h":!0}),y(Q)!=="svelte-1h8wj95"&&(Q.innerHTML=zl),cl=i(l),r(Z.$$.fragment,l),il=i(l),$=T(l,"P",{"data-svelte-h":!0}),y($)!=="svelte-2oyymw"&&($.innerHTML=Wl),ol=i(l),B=T(l,"P",{"data-svelte-h":!0}),y(B)!=="svelte-5zlfre"&&(B.textContent=vl),pl=i(l),r(C.$$.fragment,l),Ml=i(l),E=T(l,"P",{"data-svelte-h":!0}),y(E)!=="svelte-mvqlrn"&&(E.textContent=Nl),ul=i(l),r(S.$$.fragment,l),rl=i(l),z=T(l,"P",{"data-svelte-h":!0}),y(z)!=="svelte-ulchx8"&&(z.textContent=Gl),ml=i(l),r(W.$$.fragment,l),Jl=i(l),v=T(l,"P",{"data-svelte-h":!0}),y(v)!=="svelte-19xof0h"&&(v.textContent=xl),hl=i(l),r(N.$$.fragment,l),Tl=i(l),G=T(l,"P",{"data-svelte-h":!0}),y(G)!=="svelte-2u7qwj"&&(G.innerHTML=Rl),bl=i(l),r(x.$$.fragment,l),yl=i(l),r(R.$$.fragment,l),Ul=i(l),_=T(l,"P",{"data-svelte-h":!0}),y(_)!=="svelte-zhuz9l"&&(_.textContent=_l),jl=i(l),r(F.$$.fragment,l),wl=i(l),r(q.$$.fragment,l),dl=i(l),Y=T(l,"P",{"data-svelte-h":!0}),y(Y)!=="svelte-d3k5ao"&&(Y.textContent=Fl),gl=i(l),r(X.$$.fragment,l),Il=i(l),r(H.$$.fragment,l),kl=i(l),D=T(l,"P",{"data-svelte-h":!0}),y(D)!=="svelte-m99wg0"&&(D.innerHTML=ql),fl=i(l),r(A.$$.fragment,l),Vl=i(l),O=T(l,"P",{"data-svelte-h":!0}),y(O)!=="svelte-nuly3r"&&(O.innerHTML=Yl),Ql=i(l),I.l(l),tl=i(l),r(P.$$.fragment,l),Zl=i(l),el=T(l,"P",{}),Pl(el).forEach(e),this.h()},h(){Ll(a,"name","hf:doc:metadata"),Ll(a,"content",ht)},m(l,t){it(document.head,a),s(l,o,t),s(l,n,t),s(l,b,t),m(k,l,t),s(l,nl,t),m(f,l,t),s(l,sl,t),L[w].m(l,t),s(l,ll,t),s(l,V,t),s(l,al,t),s(l,Q,t),s(l,cl,t),m(Z,l,t),s(l,il,t),s(l,$,t),s(l,ol,t),s(l,B,t),s(l,pl,t),m(C,l,t),s(l,Ml,t),s(l,E,t),s(l,ul,t),m(S,l,t),s(l,rl,t),s(l,z,t),s(l,ml,t),m(W,l,t),s(l,Jl,t),s(l,v,t),s(l,hl,t),m(N,l,t),s(l,Tl,t),s(l,G,t),s(l,bl,t),m(x,l,t),s(l,yl,t),m(R,l,t),s(l,Ul,t),s(l,_,t),s(l,jl,t),m(F,l,t),s(l,wl,t),m(q,l,t),s(l,dl,t),s(l,Y,t),s(l,gl,t),m(X,l,t),s(l,Il,t),m(H,l,t),s(l,kl,t),s(l,D,t),s(l,fl,t),m(A,l,t),s(l,Vl,t),s(l,O,t),s(l,Ql,t),K[g].m(l,t),s(l,tl,t),m(P,l,t),s(l,Zl,t),s(l,el,t),$l=!0},p(l,[t]){const Ol={};t&1&&(Ol.fw=l[0]),k.$set(Ol);let Bl=w;w=Hl(l),w!==Bl&&(lt(),p(L[Bl],1,1,()=>{L[Bl]=null}),Kl(),d=L[w],d||(d=L[w]=Xl[w](l),d.c()),M(d,1),d.m(ll.parentNode,ll));let Cl=g;g=Al(l),g!==Cl&&(lt(),p(K[Cl],1,1,()=>{K[Cl]=null}),Kl(),I=K[g],I||(I=K[g]=Dl[g](l),I.c()),M(I,1),I.m(tl.parentNode,tl))},i(l){$l||(M(k.$$.fragment,l),M(f.$$.fragment,l),M(d),M(Z.$$.fragment,l),M(C.$$.fragment,l),M(S.$$.fragment,l),M(W.$$.fragment,l),M(N.$$.fragment,l),M(x.$$.fragment,l),M(R.$$.fragment,l),M(F.$$.fragment,l),M(q.$$.fragment,l),M(X.$$.fragment,l),M(H.$$.fragment,l),M(A.$$.fragment,l),M(I),M(P.$$.fragment,l),$l=!0)},o(l){p(k.$$.fragment,l),p(f.$$.fragment,l),p(d),p(Z.$$.fragment,l),p(C.$$.fragment,l),p(S.$$.fragment,l),p(W.$$.fragment,l),p(N.$$.fragment,l),p(x.$$.fragment,l),p(R.$$.fragment,l),p(F.$$.fragment,l),p(q.$$.fragment,l),p(X.$$.fragment,l),p(H.$$.fragment,l),p(A.$$.fragment,l),p(I),p(P.$$.fragment,l),$l=!1},d(l){l&&(e(o),e(n),e(b),e(nl),e(sl),e(ll),e(V),e(al),e(Q),e(cl),e(il),e($),e(ol),e(B),e(pl),e(Ml),e(E),e(ul),e(rl),e(z),e(ml),e(Jl),e(v),e(hl),e(Tl),e(G),e(bl),e(yl),e(Ul),e(_),e(jl),e(wl),e(dl),e(Y),e(gl),e(Il),e(kl),e(D),e(fl),e(Vl),e(O),e(Ql),e(tl),e(Zl),e(el)),e(a),J(k,l),J(f,l),L[w].d(l),J(Z,l),J(C,l),J(S,l),J(W,l),J(N,l),J(x,l),J(R,l),J(F,l),J(q,l),J(X,l),J(H,l),J(A,l),K[g].d(l),J(P,l)}}}const ht='{"title":"Kết hợp lại","local":"kết-hợp-lại","sections":[{"title":"Các token đặc biệt","local":"các-token-đặc-biệt","sections":[],"depth":2},{"title":"Tổng kết: Từ tokenizer đến mô hình","local":"tổng-kết-từ-tokenizer-đến-mô-hình","sections":[],"depth":2}],"depth":1}';function Tt(j,a,o){let n="pt";return nt(()=>{const b=new URLSearchParams(window.location.search);o(0,n=b.get("fw")||"pt")}),[n]}class gt extends st{constructor(a){super(),at(this,a,Tt,Jt,et,{})}}export{gt as component}; | |
Xet Storage Details
- Size:
- 25.3 kB
- Xet hash:
- 95be04dc8760df61014dddeb62f7b1794e3db4f729bd6f931e5cb77a20cadaef
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.