Buckets:
| import{s as Jn,o as yn,n as wn}from"../chunks/scheduler.56730f09.js";import{S as Tn,i as un,g as p,s as t,r as j,A as Cn,h as i,f as a,c as e,j as dn,u as r,x as c,k as fa,y as bn,a as n,v as M,d as h,t as o,w as x}from"../chunks/index.1f144517.js";import{T as fn}from"../chunks/Tip.41e845e5.js";import{C as g}from"../chunks/CodeBlock.738eeccb.js";import{D as In}from"../chunks/DocNotebookDropdown.b2e82107.js";import{H as C}from"../chunks/Heading.57d46534.js";/**
 * _n: fragment factory used as the `default` slot of the `fn` (Tip) component
 * that Vn instantiates further down in this file.
 *
 * Returns an object of lifecycle hooks that manage three nodes: a first <p>
 * (U), a separator (y) and a second <p> (d).
 *   c()     - builds the nodes with p("p") / t(); U receives the plain-text
 *             sentence `b`, d receives the HTML string `w` (links to the
 *             LayoutLM, LayoutLMv2 and LayoutLMv3 model docs).
 *   l(m)    - obtains the nodes from `m` via i(...) / e(...); a paragraph's
 *             content is rewritten only when c(node) differs from the
 *             expected "svelte-..." marker string.
 *   m(m, J) - calls n(m, node, J) for each of the three nodes, in order.
 *   p       - the imported `wn`; presumably a no-op since nothing here
 *             depends on changing state - TODO confirm.
 *   d(m)    - when `m` is truthy, calls a(...) on all three nodes.
 *
 * The parameter `Os` is accepted but never read inside this function.
 *
 * NOTE(review): p/t/i/e/c/n/a are minified aliases imported from
 * ../chunks/index.*.js; they look like Svelte's element / space /
 * claim_element / claim_space / get_svelte_dataset / insert_hydration /
 * detach helpers - verify against that chunk before relying on this.
 * This file is build output; prefer changing the source page over editing it.
 */function _n(Os){let U,b="이 튜토리얼에서 설명하는 태스크는 다음과 같은 모델 아키텍처에서 지원됩니다:",y,d,w='<a href="../model_doc/layoutlm">LayoutLM</a>, <a href="../model_doc/layoutlmv2">LayoutLMv2</a>, <a href="../model_doc/layoutlmv3">LayoutLMv3</a>';return{c(){U=p("p"),U.textContent=b,y=t(),d=p("p"),d.innerHTML=w},l(m){U=i(m,"P",{"data-svelte-h":!0}),c(U)!=="svelte-6xt7gu"&&(U.textContent=b),y=e(m),d=i(m,"P",{"data-svelte-h":!0}),c(d)!=="svelte-4nltzx"&&(d.innerHTML=w)},m(m,J){n(m,U,J),n(m,y,J),n(m,d,J)},p:wn,d(m){m&&(a(U),a(y),a(d))}}}function Vn(Os){let U,b,y,d,w,m,J,Ps,f,Ia=`문서 시각적 질의 응답(Document Visual Question Answering)이라고도 하는 | |
| 문서 질의 응답(Document Question Answering)은 문서 이미지에 대한 질문에 답변을 주는 태스크입니다. | |
| 이 태스크를 지원하는 모델의 입력은 일반적으로 이미지와 질문의 조합이고, 출력은 자연어로 된 답변입니다. 이러한 모델은 텍스트, 단어의 위치(바운딩 박스), 이미지 등 다양한 모달리티를 활용합니다.`,Ks,I,_a="이 가이드는 다음 내용을 설명합니다:",sl,_,Va='<li><a href="https://huggingface.co/datasets/nielsr/docvqa_1200_examples_donut" rel="nofollow">DocVQA dataset</a>을 사용해 <a href="../model_doc/layoutlmv2">LayoutLMv2</a> 미세 조정하기</li> <li>추론을 위해 미세 조정된 모델을 사용하기</li>',ll,T,al,V,Ra=`LayoutLMv2는 토큰의 마지막 은닉층 위에 질의 응답 헤드를 추가해 답변의 시작 토큰과 끝 토큰의 위치를 예측함으로써 문서 질의 응답 태스크를 해결합니다. 즉, 문맥이 주어졌을 때 질문에 답하는 정보를 추출하는 추출형 질의 응답(Extractive question answering)으로 문제를 처리합니다. | |
| 문맥은 OCR 엔진의 출력에서 가져오며, 여기서는 Google의 Tesseract를 사용합니다.`,nl,R,ka="시작하기 전에 필요한 라이브러리가 모두 설치되어 있는지 확인하세요. LayoutLMv2는 detectron2, torchvision 및 테서랙트를 필요로 합니다.",tl,k,el,Z,pl,Q,il,A,Za="필요한 라이브러리들을 모두 설치한 후 런타임을 다시 시작합니다.",cl,$,Qa=`커뮤니티에 당신의 모델을 공유하는 것을 권장합니다. Hugging Face 계정에 로그인해서 모델을 🤗 Hub에 업로드하세요. | |
| 프롬프트가 실행되면, 로그인을 위해 토큰을 입력하세요:`,jl,X,rl,v,Aa="몇 가지 전역 변수를 정의해 보겠습니다.",Ml,N,hl,G,ol,E,$a=`이 가이드에서는 🤗 Hub에서 찾을 수 있는 전처리된 DocVQA의 작은 샘플을 사용합니다. | |
| DocVQA의 전체 데이터 세트를 사용하고 싶다면, <a href="https://rrc.cvc.uab.es/?ch=17" rel="nofollow">DocVQA homepage</a>에 가입 후 다운로드 할 수 있습니다. 전체 데이터 세트를 다운로드 했다면, 이 가이드를 계속 진행하기 위해 <a href="https://huggingface.co/docs/datasets/loading#local-and-remote-files" rel="nofollow">🤗 dataset에 파일을 가져오는 방법</a>을 확인하세요.`,xl,B,gl,F,Xa="보시다시피, 데이터 세트는 이미 훈련 세트와 테스트 세트로 나누어져 있습니다. 무작위로 예제를 살펴보면서 특성을 확인해보세요.",Ul,W,ml,z,va="각 필드가 나타내는 내용은 다음과 같습니다:",dl,q,Na="<li><code>id</code>: 예제의 id</li> <li><code>image</code>: 문서 이미지를 포함하는 PIL.Image.Image 객체</li> <li><code>query</code>: 질문 문자열 - 여러 언어의 자연어로 된 질문</li> <li><code>answers</code>: 사람이 주석을 단 정답 리스트</li> <li><code>words</code> and <code>bounding_boxes</code>: OCR의 결과값들이며 이 가이드에서는 사용하지 않을 예정</li> <li><code>answer</code>: 다른 모델과 일치하는 답변이며 이 가이드에서는 사용하지 않을 예정</li>",Jl,S,Ga=`영어로 된 질문만 남기고 다른 모델에 대한 예측을 포함하는 <code>answer</code> 특성을 삭제하겠습니다. | |
| 그리고 주석 작성자가 제공한 데이터 세트에서 첫 번째 답변을 가져옵니다. 또는 무작위로 샘플을 추출할 수도 있습니다.`,yl,Y,wl,H,Ea=`이 가이드에서 사용하는 LayoutLMv2 체크포인트는 <code>max_position_embeddings = 512</code>로 훈련되었습니다(이 정보는 <a href="https://huggingface.co/microsoft/layoutlmv2-base-uncased/blob/main/config.json#L18" rel="nofollow">체크포인트의 <code>config.json</code> 파일</a>에서 확인할 수 있습니다). | |
| 바로 예제를 잘라낼 수도 있지만, 긴 문서의 끝에 답변이 있어 잘리는 상황을 피하기 위해 여기서는 임베딩이 512보다 길어질 가능성이 있는 몇 가지 예제를 제거하겠습니다. | |
| 데이터 세트에 있는 대부분의 문서가 긴 경우 슬라이딩 윈도우 방법을 사용할 수 있습니다 - 자세한 내용을 확인하고 싶으면 이 <a href="https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb" rel="nofollow">노트북</a>을 확인하세요.`,Tl,D,ul,L,Ba=`이 시점에서 이 데이터 세트의 OCR 특성도 제거해 보겠습니다. OCR 특성은 다른 모델을 미세 조정하기 위한 것으로, 이 가이드에서 사용하는 모델의 입력 요구 사항과 일치하지 않기 때문에 이 특성을 사용하기 위해서는 일부 처리가 필요합니다. | |
| 대신, 원본 데이터에 <code>LayoutLMv2Processor</code>를 사용하여 OCR 및 토큰화를 모두 수행할 수 있습니다. | |
| 이렇게 하면 모델이 요구하는 입력을 얻을 수 있습니다. | |
| 이미지를 수동으로 처리하려면, <a href="../model_doc/layoutlmv2"><code>LayoutLMv2</code> model documentation</a>에서 모델이 요구하는 입력 포맷을 확인해보세요.`,Cl,O,bl,P,Fa="마지막으로, 데이터 탐색을 완료하기 위해 이미지 예시를 살펴봅시다.",fl,K,Il,u,Wa='<img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/docvqa_example.jpg" alt="DocVQA Image Example"/>',_l,ss,Vl,ls,za=`문서 질의 응답 태스크는 멀티모달 태스크이며, 각 모달리티의 입력이 모델의 요구에 맞게 전처리 되었는지 확인해야 합니다. | |
| 이미지 데이터를 처리할 수 있는 이미지 프로세서와 텍스트 데이터를 인코딩할 수 있는 토크나이저를 결합한 <code>LayoutLMv2Processor</code>를 가져오는 것부터 시작해 보겠습니다.`,Rl,as,kl,ns,Zl,ts,qa=`먼저, 프로세서의 <code>image_processor</code>를 사용해 모델에 대한 문서 이미지를 준비해 보겠습니다. | |
| 기본값으로, 이미지 프로세서는 이미지 크기를 224x224로 조정하고 색상 채널의 순서가 올바른지 확인한 후 단어와 정규화된 바운딩 박스를 얻기 위해 테서랙트를 사용해 OCR를 적용합니다. | |
| 이 튜토리얼에서 우리가 필요한 것과 기본값은 완전히 동일합니다. 이미지 배치에 기본 이미지 처리를 적용하고 OCR의 결과를 변환하는 함수를 작성합니다.`,Ql,es,Al,ps,Sa="이 전처리를 데이터 세트 전체에 빠르게 적용하려면 <code>map</code>를 사용하세요.",$l,is,Xl,cs,vl,js,Ya=`이미지에 OCR을 적용했으면 데이터 세트의 텍스트 부분을 모델에 맞게 인코딩해야 합니다. | |
| 이 인코딩에는 이전 단계에서 가져온 단어와 박스를 토큰 수준의 <code>input_ids</code>, <code>attention_mask</code>, <code>token_type_ids</code> 및 <code>bbox</code>로 변환하는 작업이 포함됩니다. | |
| 텍스트를 전처리하려면 프로세서의 <code>tokenizer</code>가 필요합니다.`,Nl,rs,Gl,Ms,Ha="위에서 언급한 전처리 외에도 모델을 위해 레이블을 추가해야 합니다. 🤗 Transformers의 <code>xxxForQuestionAnswering</code> 모델의 경우, 레이블은 <code>start_positions</code>와 <code>end_positions</code>로 구성되며 어떤 토큰이 답변의 시작과 끝에 있는지를 나타냅니다.",El,hs,Da="레이블 추가를 위해서, 먼저 더 큰 리스트(단어 리스트)에서 하위 리스트(단어로 분할된 답변)을 찾을 수 있는 헬퍼 함수를 정의합니다.",Bl,os,La=`이 함수는 <code>words_list</code>와 <code>answer_list</code>, 이렇게 두 리스트를 입력으로 받습니다. | |
| 그런 다음 <code>words_list</code>를 반복하여 <code>words_list</code>의 현재 단어(words_list[i])가 <code>answer_list</code>의 첫 번째 단어(answer_list[0])와 같은지, | |
| 현재 단어에서 시작해 <code>answer_list</code>와 같은 길이만큼의 <code>words_list</code>의 하위 리스트가 <code>answer_list</code>와 일치하는지 확인합니다. | |
| 이 조건이 참이라면 일치하는 항목을 발견했음을 의미하며, 함수는 일치 항목, 시작 인덱스(idx) 및 종료 인덱스(idx + len(answer_list) - 1)를 기록합니다. 일치하는 항목이 두 개 이상 발견되면 함수는 첫 번째 항목만 반환합니다. 일치하는 항목이 없다면 함수는 (<code>None</code>, 0, 0)을 반환합니다.`,Fl,xs,Wl,gs,Oa="이 함수가 어떻게 정답의 위치를 찾는지 설명하기 위해 다음 예제에서 함수를 사용해 보겠습니다:",zl,Us,ql,ms,Pa="한편, 위 예제가 인코딩되면 다음과 같이 표시됩니다:",Sl,ds,Yl,Js,Ka="이제 인코딩된 입력에서 정답의 위치를 찾아야 합니다.",Hl,ys,sn="<li><code>token_type_ids</code>는 어떤 토큰이 질문에 속하는지, 그리고 어떤 토큰이 문서의 단어에 포함되는지를 알려줍니다.</li> <li><code>tokenizer.cls_token_id</code> 입력의 시작 부분에 있는 특수 토큰을 찾는 데 도움을 줍니다.</li> <li><code>word_ids</code>는 원본 <code>words</code>에서 찾은 답변을 전체 인코딩된 입력의 동일한 답과 일치시키고 인코딩된 입력에서 답변의 시작/끝 위치를 결정합니다.</li>",Dl,ws,ln="위 내용들을 염두에 두고 데이터 세트 예제의 배치를 인코딩하는 함수를 만들어 보겠습니다:",Ll,Ts,Ol,us,an="이제 이 전처리 함수가 있으니 전체 데이터 세트를 인코딩할 수 있습니다:",Pl,Cs,Kl,bs,nn="인코딩된 데이터 세트의 특성이 어떻게 생겼는지 확인해 보겠습니다:",sa,fs,la,Is,aa,_s,tn=`문서 질의 응답을 평가하려면 상당한 양의 후처리가 필요합니다. 시간이 너무 많이 걸리지 않도록 이 가이드에서는 평가 단계를 생략합니다. | |
| <code>Trainer</code>가 훈련 과정에서 평가 손실(evaluation loss)을 계속 계산하기 때문에 모델의 성능을 대략적으로 알 수 있습니다. | |
| 추출적(Extractive) 질의 응답은 보통 F1/exact match 방법을 사용해 평가됩니다. | |
| 직접 구현해보고 싶으시다면, Hugging Face course의 <a href="https://huggingface.co/course/chapter7/7?fw=pt#postprocessing" rel="nofollow">Question Answering chapter</a>을 참고하세요.`,na,Vs,ta,Rs,en=`축하합니다! 이 가이드의 가장 어려운 부분을 성공적으로 처리했으니 이제 나만의 모델을 훈련할 준비가 되었습니다. | |
| 훈련은 다음과 같은 단계로 이루어져 있습니다:`,ea,ks,pn="<li>전처리에서의 동일한 체크포인트를 사용하기 위해 <code>AutoModelForDocumentQuestionAnswering</code>으로 모델을 가져옵니다.</li> <li><code>TrainingArguments</code>로 훈련 하이퍼파라미터를 정합니다.</li> <li>예제를 배치 처리하는 함수를 정의합니다. 여기서는 <code>DefaultDataCollator</code>가 적당합니다.</li> <li>모델, 데이터 세트, 데이터 콜레이터(Data collator)와 함께 <code>Trainer</code>에 훈련 인수들을 전달합니다.</li> <li><code>train()</code>을 호출해서 모델을 미세 조정합니다.</li>",pa,Zs,ia,Qs,cn=`<code>TrainingArguments</code>에서 <code>output_dir</code>을 사용하여 모델을 저장할 위치를 지정하고, 적절한 하이퍼파라미터를 설정합니다. | |
| 모델을 커뮤니티와 공유하려면 <code>push_to_hub</code>를 <code>True</code>로 설정하세요 (모델을 업로드하려면 Hugging Face에 로그인해야 합니다). | |
| 이 경우 <code>output_dir</code>은 모델의 체크포인트를 푸시할 레포지토리의 이름이 됩니다.`,ca,As,ja,$s,jn="간단한 데이터 콜레이터를 정의하여 예제를 함께 배치합니다.",ra,Xs,Ma,vs,rn="마지막으로, 모든 것을 한 곳에 모아 <code>train()</code>을 호출합니다:",ha,Ns,oa,Gs,Mn="최종 모델을 🤗 Hub에 추가하려면, 모델 카드를 생성하고 <code>push_to_hub</code>를 호출합니다:",xa,Es,ga,Bs,Ua,Fs,hn=`이제 LayoutLMv2 모델을 미세 조정하고 🤗 Hub에 업로드했으니 추론에도 사용할 수 있습니다. | |
| 추론을 위해 미세 조정된 모델을 사용해 보는 가장 간단한 방법은 <code>Pipeline</code>을 사용하는 것 입니다.`,ma,Ws,on="예를 들어 보겠습니다:",da,zs,Ja,qs,xn="그 다음, 모델로 문서 질의 응답을 하기 위해 파이프라인을 인스턴스화하고 이미지 + 질문 조합을 전달합니다.",ya,Ss,wa,Ys,gn="원한다면 파이프라인의 결과를 수동으로 복제할 수도 있습니다:",Ta,Hs,Un="<li>이미지와 질문을 가져와 모델의 프로세서를 사용해 모델에 맞게 준비합니다.</li> <li>모델을 통해 결과 또는 전처리를 전달합니다.</li> <li>모델은 어떤 토큰이 답변의 시작에 있는지, 어떤 토큰이 답변이 끝에 있는지를 나타내는 <code>start_logits</code>와 <code>end_logits</code>를 반환합니다. 둘 다 (batch_size, sequence_length) 형태를 갖습니다.</li> <li><code>start_logits</code>와 <code>end_logits</code>의 마지막 차원을 최대로 만드는 값을 찾아 예상 <code>start_idx</code>와 <code>end_idx</code>를 얻습니다.</li> <li>토크나이저로 답변을 디코딩합니다.</li>",ua,Ds,Ca,Ls,ba;return w=new C({props:{title:"문서 질의 응답(Document Question Answering)",local:"document_question_answering",headingTag:"h1"}}),J=new In({props:{classNames:"absolute z-10 right-0 top-0",options:[{label:"Mixed",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/ko/document_question_answering.ipynb"},{label:"PyTorch",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/ko/pytorch/document_question_answering.ipynb"},{label:"TensorFlow",value:"https://colab.research.google.com/github/huggingface/notebooks/blob/main/transformers_doc/ko/tensorflow/document_question_answering.ipynb"},{label:"Mixed",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/ko/document_question_answering.ipynb"},{label:"PyTorch",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/ko/pytorch/document_question_answering.ipynb"},{label:"TensorFlow",value:"https://studiolab.sagemaker.aws/import/github/huggingface/notebooks/blob/main/transformers_doc/ko/tensorflow/document_question_answering.ipynb"}]}}),T=new fn({props:{$$slots:{default:[_n]},$$scope:{ctx:Os}}}),k=new g({props:{code:"cGlwJTIwaW5zdGFsbCUyMC1xJTIwdHJhbnNmb3JtZXJzJTIwZGF0YXNldHM=",highlighted:"pip install 
-q transformers datasets",wrap:!1}}),Z=new g({props:{code:"cGlwJTIwaW5zdGFsbCUyMCdnaXQlMkJodHRwcyUzQSUyRiUyRmdpdGh1Yi5jb20lMkZmYWNlYm9va3Jlc2VhcmNoJTJGZGV0ZWN0cm9uMi5naXQnJTBBcGlwJTIwaW5zdGFsbCUyMHRvcmNodmlzaW9u",highlighted:`pip install <span class="hljs-string">'git+https://github.com/facebookresearch/detectron2.git'</span> | |
| pip install torchvision`,wrap:!1}}),Q=new g({props:{code:"c3VkbyUyMGFwdCUyMGluc3RhbGwlMjB0ZXNzZXJhY3Qtb2NyJTBBcGlwJTIwaW5zdGFsbCUyMC1xJTIwcHl0ZXNzZXJhY3Q=",highlighted:`sudo apt install tesseract-ocr | |
| pip install -q pytesseract`,wrap:!1}}),X=new g({props:{code:"ZnJvbSUyMGh1Z2dpbmdmYWNlX2h1YiUyMGltcG9ydCUyMG5vdGVib29rX2xvZ2luJTBBJTBBbm90ZWJvb2tfbG9naW4oKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login | |
| <span class="hljs-meta">>>> </span>notebook_login()`,wrap:!1}}),N=new g({props:{code:"bW9kZWxfY2hlY2twb2ludCUyMCUzRCUyMCUyMm1pY3Jvc29mdCUyRmxheW91dGxtdjItYmFzZS11bmNhc2VkJTIyJTBBYmF0Y2hfc2l6ZSUyMCUzRCUyMDQ=",highlighted:`<span class="hljs-meta">>>> </span>model_checkpoint = <span class="hljs-string">"microsoft/layoutlmv2-base-uncased"</span> | |
| <span class="hljs-meta">>>> </span>batch_size = <span class="hljs-number">4</span>`,wrap:!1}}),G=new C({props:{title:"데이터 불러오기",local:"load-the-data",headingTag:"h2"}}),B=new g({props:{code:"ZnJvbSUyMGRhdGFzZXRzJTIwaW1wb3J0JTIwbG9hZF9kYXRhc2V0JTBBJTBBZGF0YXNldCUyMCUzRCUyMGxvYWRfZGF0YXNldCglMjJuaWVsc3IlMkZkb2N2cWFfMTIwMF9leGFtcGxlcyUyMiklMEFkYXRhc2V0",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset | |
| <span class="hljs-meta">>>> </span>dataset = load_dataset(<span class="hljs-string">"nielsr/docvqa_1200_examples"</span>) | |
| <span class="hljs-meta">>>> </span>dataset | |
| DatasetDict({ | |
| train: Dataset({ | |
| features: [<span class="hljs-string">'id'</span>, <span class="hljs-string">'image'</span>, <span class="hljs-string">'query'</span>, <span class="hljs-string">'answers'</span>, <span class="hljs-string">'words'</span>, <span class="hljs-string">'bounding_boxes'</span>, <span class="hljs-string">'answer'</span>], | |
| num_rows: <span class="hljs-number">1000</span> | |
| }) | |
| test: Dataset({ | |
| features: [<span class="hljs-string">'id'</span>, <span class="hljs-string">'image'</span>, <span class="hljs-string">'query'</span>, <span class="hljs-string">'answers'</span>, <span class="hljs-string">'words'</span>, <span class="hljs-string">'bounding_boxes'</span>, <span class="hljs-string">'answer'</span>], | |
| num_rows: <span class="hljs-number">200</span> | |
| }) | |
| })`,wrap:!1}}),W=new g({props:{code:"ZGF0YXNldCU1QiUyMnRyYWluJTIyJTVELmZlYXR1cmVz",highlighted:'<span class="hljs-meta">>>> </span>dataset[<span class="hljs-string">"train"</span>].features',wrap:!1}}),Y=new g({props:{code:"dXBkYXRlZF9kYXRhc2V0JTIwJTNEJTIwZGF0YXNldC5tYXAobGFtYmRhJTIwZXhhbXBsZSUzQSUyMCU3QiUyMnF1ZXN0aW9uJTIyJTNBJTIwZXhhbXBsZSU1QiUyMnF1ZXJ5JTIyJTVEJTVCJTIyZW4lMjIlNUQlN0QlMkMlMjByZW1vdmVfY29sdW1ucyUzRCU1QiUyMnF1ZXJ5JTIyJTVEKSUwQXVwZGF0ZWRfZGF0YXNldCUyMCUzRCUyMHVwZGF0ZWRfZGF0YXNldC5tYXAoJTBBJTIwJTIwJTIwJTIwbGFtYmRhJTIwZXhhbXBsZSUzQSUyMCU3QiUyMmFuc3dlciUyMiUzQSUyMGV4YW1wbGUlNUIlMjJhbnN3ZXJzJTIyJTVEJTVCMCU1RCU3RCUyQyUyMHJlbW92ZV9jb2x1bW5zJTNEJTVCJTIyYW5zd2VyJTIyJTJDJTIwJTIyYW5zd2VycyUyMiU1RCUwQSk=",highlighted:`<span class="hljs-meta">>>> </span>updated_dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> example: {<span class="hljs-string">"question"</span>: example[<span class="hljs-string">"query"</span>][<span class="hljs-string">"en"</span>]}, remove_columns=[<span class="hljs-string">"query"</span>]) | |
| <span class="hljs-meta">>>> </span>updated_dataset = updated_dataset.<span class="hljs-built_in">map</span>( | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">lambda</span> example: {<span class="hljs-string">"answer"</span>: example[<span class="hljs-string">"answers"</span>][<span class="hljs-number">0</span>]}, remove_columns=[<span class="hljs-string">"answer"</span>, <span class="hljs-string">"answers"</span>] | |
| <span class="hljs-meta">... </span>)`,wrap:!1}}),D=new g({props:{code:"dXBkYXRlZF9kYXRhc2V0JTIwJTNEJTIwdXBkYXRlZF9kYXRhc2V0LmZpbHRlcihsYW1iZGElMjB4JTNBJTIwbGVuKHglNUIlMjJ3b3JkcyUyMiU1RCklMjAlMkIlMjBsZW4oeCU1QiUyMnF1ZXN0aW9uJTIyJTVELnNwbGl0KCkpJTIwJTNDJTIwNTEyKQ==",highlighted:'<span class="hljs-meta">>>> </span>updated_dataset = updated_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: <span class="hljs-built_in">len</span>(x[<span class="hljs-string">"words"</span>]) + <span class="hljs-built_in">len</span>(x[<span class="hljs-string">"question"</span>].split()) < <span class="hljs-number">512</span>)',wrap:!1}}),O=new g({props:{code:"dXBkYXRlZF9kYXRhc2V0JTIwJTNEJTIwdXBkYXRlZF9kYXRhc2V0LnJlbW92ZV9jb2x1bW5zKCUyMndvcmRzJTIyKSUwQXVwZGF0ZWRfZGF0YXNldCUyMCUzRCUyMHVwZGF0ZWRfZGF0YXNldC5yZW1vdmVfY29sdW1ucyglMjJib3VuZGluZ19ib3hlcyUyMik=",highlighted:`<span class="hljs-meta">>>> </span>updated_dataset = updated_dataset.remove_columns(<span class="hljs-string">"words"</span>) | |
| <span class="hljs-meta">>>> </span>updated_dataset = updated_dataset.remove_columns(<span class="hljs-string">"bounding_boxes"</span>)`,wrap:!1}}),K=new g({props:{code:"dXBkYXRlZF9kYXRhc2V0JTVCJTIydHJhaW4lMjIlNUQlNUIxMSU1RCU1QiUyMmltYWdlJTIyJTVE",highlighted:'<span class="hljs-meta">>>> </span>updated_dataset[<span class="hljs-string">"train"</span>][<span class="hljs-number">11</span>][<span class="hljs-string">"image"</span>]',wrap:!1}}),ss=new C({props:{title:"데이터 전처리",local:"preprocess-the-data",headingTag:"h2"}}),as=new g({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Qcm9jZXNzb3IlMEElMEFwcm9jZXNzb3IlMjAlM0QlMjBBdXRvUHJvY2Vzc29yLmZyb21fcHJldHJhaW5lZChtb2RlbF9jaGVja3BvaW50KQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor | |
| <span class="hljs-meta">>>> </span>processor = AutoProcessor.from_pretrained(model_checkpoint)`,wrap:!1}}),ns=new C({props:{title:"문서 이미지 전처리",local:"preprocessing-document-images",headingTag:"h3"}}),es=new g({props:{code:"aW1hZ2VfcHJvY2Vzc29yJTIwJTNEJTIwcHJvY2Vzc29yLmltYWdlX3Byb2Nlc3NvciUwQSUwQSUwQWRlZiUyMGdldF9vY3Jfd29yZHNfYW5kX2JveGVzKGV4YW1wbGVzKSUzQSUwQSUyMCUyMCUyMCUyMGltYWdlcyUyMCUzRCUyMCU1QmltYWdlLmNvbnZlcnQoJTIyUkdCJTIyKSUyMGZvciUyMGltYWdlJTIwaW4lMjBleGFtcGxlcyU1QiUyMmltYWdlJTIyJTVEJTVEJTBBJTIwJTIwJTIwJTIwZW5jb2RlZF9pbnB1dHMlMjAlM0QlMjBpbWFnZV9wcm9jZXNzb3IoaW1hZ2VzKSUwQSUwQSUyMCUyMCUyMCUyMGV4YW1wbGVzJTVCJTIyaW1hZ2UlMjIlNUQlMjAlM0QlMjBlbmNvZGVkX2lucHV0cy5waXhlbF92YWx1ZXMlMEElMjAlMjAlMjAlMjBleGFtcGxlcyU1QiUyMndvcmRzJTIyJTVEJTIwJTNEJTIwZW5jb2RlZF9pbnB1dHMud29yZHMlMEElMjAlMjAlMjAlMjBleGFtcGxlcyU1QiUyMmJveGVzJTIyJTVEJTIwJTNEJTIwZW5jb2RlZF9pbnB1dHMuYm94ZXMlMEElMEElMjAlMjAlMjAlMjByZXR1cm4lMjBleGFtcGxlcw==",highlighted:`<span class="hljs-meta">>>> </span>image_processor = processor.image_processor | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_ocr_words_and_boxes</span>(<span class="hljs-params">examples</span>): | |
| <span class="hljs-meta">... </span> images = [image.convert(<span class="hljs-string">"RGB"</span>) <span class="hljs-keyword">for</span> image <span class="hljs-keyword">in</span> examples[<span class="hljs-string">"image"</span>]] | |
| <span class="hljs-meta">... </span> encoded_inputs = image_processor(images) | |
| <span class="hljs-meta">... </span> examples[<span class="hljs-string">"image"</span>] = encoded_inputs.pixel_values | |
| <span class="hljs-meta">... </span> examples[<span class="hljs-string">"words"</span>] = encoded_inputs.words | |
| <span class="hljs-meta">... </span> examples[<span class="hljs-string">"boxes"</span>] = encoded_inputs.boxes | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> examples`,wrap:!1}}),is=new g({props:{code:"ZGF0YXNldF93aXRoX29jciUyMCUzRCUyMHVwZGF0ZWRfZGF0YXNldC5tYXAoZ2V0X29jcl93b3Jkc19hbmRfYm94ZXMlMkMlMjBiYXRjaGVkJTNEVHJ1ZSUyQyUyMGJhdGNoX3NpemUlM0QyKQ==",highlighted:'<span class="hljs-meta">>>> </span>dataset_with_ocr = updated_dataset.<span class="hljs-built_in">map</span>(get_ocr_words_and_boxes, batched=<span class="hljs-literal">True</span>, batch_size=<span class="hljs-number">2</span>)',wrap:!1}}),cs=new C({props:{title:"텍스트 데이터 전처리",local:"preprocessing-text-data",headingTag:"h3"}}),rs=new g({props:{code:"dG9rZW5pemVyJTIwJTNEJTIwcHJvY2Vzc29yLnRva2VuaXplcg==",highlighted:'<span class="hljs-meta">>>> </span>tokenizer = processor.tokenizer',wrap:!1}}),xs=new g({props:{code:"ZGVmJTIwc3ViZmluZGVyKHdvcmRzX2xpc3QlMkMlMjBhbnN3ZXJfbGlzdCklM0ElMEElMjAlMjAlMjAlMjBtYXRjaGVzJTIwJTNEJTIwJTVCJTVEJTBBJTIwJTIwJTIwJTIwc3RhcnRfaW5kaWNlcyUyMCUzRCUyMCU1QiU1RCUwQSUyMCUyMCUyMCUyMGVuZF9pbmRpY2VzJTIwJTNEJTIwJTVCJTVEJTBBJTIwJTIwJTIwJTIwZm9yJTIwaWR4JTJDJTIwaSUyMGluJTIwZW51bWVyYXRlKHJhbmdlKGxlbih3b3Jkc19saXN0KSkpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwaWYlMjB3b3Jkc19saXN0JTVCaSU1RCUyMCUzRCUzRCUyMGFuc3dlcl9saXN0JTVCMCU1RCUyMGFuZCUyMHdvcmRzX2xpc3QlNUJpJTIwJTNBJTIwaSUyMCUyQiUyMGxlbihhbnN3ZXJfbGlzdCklNUQlMjAlM0QlM0QlMjBhbnN3ZXJfbGlzdCUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMG1hdGNoZXMuYXBwZW5kKGFuc3dlcl9saXN0KSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHN0YXJ0X2luZGljZXMuYXBwZW5kKGlkeCklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbmRfaW5kaWNlcy5hcHBlbmQoaWR4JTIwJTJCJTIwbGVuKGFuc3dlcl9saXN0KSUyMC0lMjAxKSUwQSUyMCUyMCUyMCUyMGlmJTIwbWF0Y2hlcyUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHJldHVybiUyMG1hdGNoZXMlNUIwJTVEJTJDJTIwc3RhcnRfaW5kaWNlcyU1QjAlNUQlMkMlMjBlbmRfaW5kaWNlcyU1QjAlNUQlMEElMjAlMjAlMjAlMjBlbHNlJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwcmV0dXJuJTIwTm9uZSUyQyUyMDAlMkMlMjAw",highlighted:`<span class="hljs-meta">>>> </span><span 
class="hljs-keyword">def</span> <span class="hljs-title function_">subfinder</span>(<span class="hljs-params">words_list, answer_list</span>): | |
| <span class="hljs-meta">... </span> matches = [] | |
| <span class="hljs-meta">... </span> start_indices = [] | |
| <span class="hljs-meta">... </span> end_indices = [] | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> idx, i <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(<span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(words_list))): | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> words_list[i] == answer_list[<span class="hljs-number">0</span>] <span class="hljs-keyword">and</span> words_list[i : i + <span class="hljs-built_in">len</span>(answer_list)] == answer_list: | |
| <span class="hljs-meta">... </span> matches.append(answer_list) | |
| <span class="hljs-meta">... </span> start_indices.append(idx) | |
| <span class="hljs-meta">... </span> end_indices.append(idx + <span class="hljs-built_in">len</span>(answer_list) - <span class="hljs-number">1</span>) | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> matches: | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> matches[<span class="hljs-number">0</span>], start_indices[<span class="hljs-number">0</span>], end_indices[<span class="hljs-number">0</span>] | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>: | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> <span class="hljs-literal">None</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span>`,wrap:!1}}),Us=new g({props:{code:"ZXhhbXBsZSUyMCUzRCUyMGRhdGFzZXRfd2l0aF9vY3IlNUIlMjJ0cmFpbiUyMiU1RCU1QjElNUQlMEF3b3JkcyUyMCUzRCUyMCU1QndvcmQubG93ZXIoKSUyMGZvciUyMHdvcmQlMjBpbiUyMGV4YW1wbGUlNUIlMjJ3b3JkcyUyMiU1RCU1RCUwQW1hdGNoJTJDJTIwd29yZF9pZHhfc3RhcnQlMkMlMjB3b3JkX2lkeF9lbmQlMjAlM0QlMjBzdWJmaW5kZXIod29yZHMlMkMlMjBleGFtcGxlJTVCJTIyYW5zd2VyJTIyJTVELmxvd2VyKCkuc3BsaXQoKSklMEFwcmludCglMjJRdWVzdGlvbiUzQSUyMCUyMiUyQyUyMGV4YW1wbGUlNUIlMjJxdWVzdGlvbiUyMiU1RCklMEFwcmludCglMjJXb3JkcyUzQSUyMiUyQyUyMHdvcmRzKSUwQXByaW50KCUyMkFuc3dlciUzQSUyMCUyMiUyQyUyMGV4YW1wbGUlNUIlMjJhbnN3ZXIlMjIlNUQpJTBBcHJpbnQoJTIyc3RhcnRfaW5kZXglMjIlMkMlMjB3b3JkX2lkeF9zdGFydCklMEFwcmludCglMjJlbmRfaW5kZXglMjIlMkMlMjB3b3JkX2lkeF9lbmQp",highlighted:`<span class="hljs-meta">>>> </span>example = dataset_with_ocr[<span class="hljs-string">"train"</span>][<span class="hljs-number">1</span>] | |
| <span class="hljs-meta">>>> </span>words = [word.lower() <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> example[<span class="hljs-string">"words"</span>]] | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">match</span>, word_idx_start, word_idx_end = subfinder(words, example[<span class="hljs-string">"answer"</span>].lower().split()) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">"Question: "</span>, example[<span class="hljs-string">"question"</span>]) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">"Words:"</span>, words) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">"Answer: "</span>, example[<span class="hljs-string">"answer"</span>]) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">"start_index"</span>, word_idx_start) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(<span class="hljs-string">"end_index"</span>, word_idx_end) | |
| Question: Who <span class="hljs-keyword">is</span> <span class="hljs-keyword">in</span> cc <span class="hljs-keyword">in</span> this letter? | |
| Words: [<span class="hljs-string">'wie'</span>, <span class="hljs-string">'baw'</span>, <span class="hljs-string">'brown'</span>, <span class="hljs-string">'&'</span>, <span class="hljs-string">'williamson'</span>, <span class="hljs-string">'tobacco'</span>, <span class="hljs-string">'corporation'</span>, <span class="hljs-string">'research'</span>, <span class="hljs-string">'&'</span>, <span class="hljs-string">'development'</span>, <span class="hljs-string">'internal'</span>, <span class="hljs-string">'correspondence'</span>, <span class="hljs-string">'to:'</span>, <span class="hljs-string">'r.'</span>, <span class="hljs-string">'h.'</span>, <span class="hljs-string">'honeycutt'</span>, <span class="hljs-string">'ce:'</span>, <span class="hljs-string">'t.f.'</span>, <span class="hljs-string">'riehl'</span>, <span class="hljs-string">'from:'</span>, <span class="hljs-string">'.'</span>, <span class="hljs-string">'c.j.'</span>, <span class="hljs-string">'cook'</span>, <span class="hljs-string">'date:'</span>, <span class="hljs-string">'may'</span>, <span class="hljs-string">'8,'</span>, <span class="hljs-string">'1995'</span>, <span class="hljs-string">'subject:'</span>, <span class="hljs-string">'review'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'existing'</span>, <span class="hljs-string">'brainstorming'</span>, <span class="hljs-string">'ideas/483'</span>, <span class="hljs-string">'the'</span>, <span class="hljs-string">'major'</span>, <span class="hljs-string">'function'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'the'</span>, <span class="hljs-string">'product'</span>, <span class="hljs-string">'innovation'</span>, <span class="hljs-string">'graup'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'develop'</span>, <span class="hljs-string">'marketable'</span>, <span class="hljs-string">'nove!'</span>, <span 
class="hljs-string">'products'</span>, <span class="hljs-string">'that'</span>, <span class="hljs-string">'would'</span>, <span class="hljs-string">'be'</span>, <span class="hljs-string">'profitable'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'manufacture'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'sell.'</span>, <span class="hljs-string">'novel'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'defined'</span>, <span class="hljs-string">'as:'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'new'</span>, <span class="hljs-string">'kind,'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'different'</span>, <span class="hljs-string">'from'</span>, <span class="hljs-string">'anything'</span>, <span class="hljs-string">'seen'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'known'</span>, <span class="hljs-string">'before.'</span>, <span class="hljs-string">'innovation'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'defined'</span>, <span class="hljs-string">'as:'</span>, <span class="hljs-string">'something'</span>, <span class="hljs-string">'new'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'different'</span>, <span class="hljs-string">'introduced;'</span>, <span class="hljs-string">'act'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'innovating;'</span>, <span class="hljs-string">'introduction'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'new'</span>, <span class="hljs-string">'things'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'methods.'</span>, <span class="hljs-string">'the'</span>, <span class="hljs-string">'products'</span>, <span class="hljs-string">'may'</span>, <span class="hljs-string">'incorporate'</span>, <span 
class="hljs-string">'the'</span>, <span class="hljs-string">'latest'</span>, <span class="hljs-string">'technologies,'</span>, <span class="hljs-string">'materials'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'know-how'</span>, <span class="hljs-string">'available'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'give'</span>, <span class="hljs-string">'then'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'unique'</span>, <span class="hljs-string">'taste'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'look.'</span>, <span class="hljs-string">'the'</span>, <span class="hljs-string">'first'</span>, <span class="hljs-string">'task'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'the'</span>, <span class="hljs-string">'product'</span>, <span class="hljs-string">'innovation'</span>, <span class="hljs-string">'group'</span>, <span class="hljs-string">'was'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'assemble,'</span>, <span class="hljs-string">'review'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'categorize'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'list'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'existing'</span>, <span class="hljs-string">'brainstorming'</span>, <span class="hljs-string">'ideas.'</span>, <span class="hljs-string">'ideas'</span>, <span class="hljs-string">'were'</span>, <span class="hljs-string">'grouped'</span>, <span class="hljs-string">'into'</span>, <span class="hljs-string">'two'</span>, <span class="hljs-string">'major'</span>, <span class="hljs-string">'categories'</span>, <span class="hljs-string">'labeled'</span>, <span class="hljs-string">'appearance'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'taste/aroma.'</span>, <span class="hljs-string">'these'</span>, <span 
class="hljs-string">'categories'</span>, <span class="hljs-string">'are'</span>, <span class="hljs-string">'used'</span>, <span class="hljs-string">'for'</span>, <span class="hljs-string">'novel'</span>, <span class="hljs-string">'products'</span>, <span class="hljs-string">'that'</span>, <span class="hljs-string">'may'</span>, <span class="hljs-string">'differ'</span>, <span class="hljs-string">'from'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'visual'</span>, <span class="hljs-string">'and/or'</span>, <span class="hljs-string">'taste/aroma'</span>, <span class="hljs-string">'point'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'view'</span>, <span class="hljs-string">'compared'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'canventional'</span>, <span class="hljs-string">'cigarettes.'</span>, <span class="hljs-string">'other'</span>, <span class="hljs-string">'categories'</span>, <span class="hljs-string">'include'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'combination'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'the'</span>, <span class="hljs-string">'above,'</span>, <span class="hljs-string">'filters,'</span>, <span class="hljs-string">'packaging'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'brand'</span>, <span class="hljs-string">'extensions.'</span>, <span class="hljs-string">'appearance'</span>, <span class="hljs-string">'this'</span>, <span class="hljs-string">'category'</span>, <span class="hljs-string">'is'</span>, <span class="hljs-string">'used'</span>, <span class="hljs-string">'for'</span>, <span class="hljs-string">'novel'</span>, <span class="hljs-string">'cigarette'</span>, <span class="hljs-string">'constructions'</span>, <span class="hljs-string">'that'</span>, <span class="hljs-string">'yield'</span>, <span class="hljs-string">'visually'</span>, <span 
class="hljs-string">'different'</span>, <span class="hljs-string">'products'</span>, <span class="hljs-string">'with'</span>, <span class="hljs-string">'minimal'</span>, <span class="hljs-string">'changes'</span>, <span class="hljs-string">'in'</span>, <span class="hljs-string">'smoke'</span>, <span class="hljs-string">'chemistry'</span>, <span class="hljs-string">'two'</span>, <span class="hljs-string">'cigarettes'</span>, <span class="hljs-string">'in'</span>, <span class="hljs-string">'cne.'</span>, <span class="hljs-string">'emulti-plug'</span>, <span class="hljs-string">'te'</span>, <span class="hljs-string">'build'</span>, <span class="hljs-string">'yaur'</span>, <span class="hljs-string">'awn'</span>, <span class="hljs-string">'cigarette.'</span>, <span class="hljs-string">'eswitchable'</span>, <span class="hljs-string">'menthol'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'non'</span>, <span class="hljs-string">'menthol'</span>, <span class="hljs-string">'cigarette.'</span>, <span class="hljs-string">'*cigarettes'</span>, <span class="hljs-string">'with'</span>, <span class="hljs-string">'interspaced'</span>, <span class="hljs-string">'perforations'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'enable'</span>, <span class="hljs-string">'smoker'</span>, <span class="hljs-string">'to'</span>, <span class="hljs-string">'separate'</span>, <span class="hljs-string">'unburned'</span>, <span class="hljs-string">'section'</span>, <span class="hljs-string">'for'</span>, <span class="hljs-string">'future'</span>, <span class="hljs-string">'smoking.'</span>, <span class="hljs-string">'«short'</span>, <span class="hljs-string">'cigarette,'</span>, <span class="hljs-string">'tobacco'</span>, <span class="hljs-string">'section'</span>, <span class="hljs-string">'30'</span>, <span class="hljs-string">'mm.'</span>, <span class="hljs-string">'«extremely'</span>, <span class="hljs-string">'fast'</span>, <span 
class="hljs-string">'buming'</span>, <span class="hljs-string">'cigarette.'</span>, <span class="hljs-string">'«novel'</span>, <span class="hljs-string">'cigarette'</span>, <span class="hljs-string">'constructions'</span>, <span class="hljs-string">'that'</span>, <span class="hljs-string">'permit'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'significant'</span>, <span class="hljs-string">'reduction'</span>, <span class="hljs-string">'iretobacco'</span>, <span class="hljs-string">'weight'</span>, <span class="hljs-string">'while'</span>, <span class="hljs-string">'maintaining'</span>, <span class="hljs-string">'smoking'</span>, <span class="hljs-string">'mechanics'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'visual'</span>, <span class="hljs-string">'characteristics.'</span>, <span class="hljs-string">'higher'</span>, <span class="hljs-string">'basis'</span>, <span class="hljs-string">'weight'</span>, <span class="hljs-string">'paper:'</span>, <span class="hljs-string">'potential'</span>, <span class="hljs-string">'reduction'</span>, <span class="hljs-string">'in'</span>, <span class="hljs-string">'tobacco'</span>, <span class="hljs-string">'weight.'</span>, <span class="hljs-string">'«more'</span>, <span class="hljs-string">'rigid'</span>, <span class="hljs-string">'tobacco'</span>, <span class="hljs-string">'column;'</span>, <span class="hljs-string">'stiffing'</span>, <span class="hljs-string">'agent'</span>, <span class="hljs-string">'for'</span>, <span class="hljs-string">'tobacco;'</span>, <span class="hljs-string">'e.g.'</span>, <span class="hljs-string">'starch'</span>, <span class="hljs-string">'*colored'</span>, <span class="hljs-string">'tow'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'cigarette'</span>, <span class="hljs-string">'papers;'</span>, <span class="hljs-string">'seasonal'</span>, <span class="hljs-string">'promotions,'</span>, <span 
class="hljs-string">'e.g.'</span>, <span class="hljs-string">'pastel'</span>, <span class="hljs-string">'colored'</span>, <span class="hljs-string">'cigarettes'</span>, <span class="hljs-string">'for'</span>, <span class="hljs-string">'easter'</span>, <span class="hljs-string">'or'</span>, <span class="hljs-string">'in'</span>, <span class="hljs-string">'an'</span>, <span class="hljs-string">'ebony'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'ivory'</span>, <span class="hljs-string">'brand'</span>, <span class="hljs-string">'containing'</span>, <span class="hljs-string">'a'</span>, <span class="hljs-string">'mixture'</span>, <span class="hljs-string">'of'</span>, <span class="hljs-string">'all'</span>, <span class="hljs-string">'black'</span>, <span class="hljs-string">'(black'</span>, <span class="hljs-string">'paper'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'tow)'</span>, <span class="hljs-string">'and'</span>, <span class="hljs-string">'ail'</span>, <span class="hljs-string">'white'</span>, <span class="hljs-string">'cigarettes.'</span>, <span class="hljs-string">'499150498'</span>] | |
| Answer: T.F. Riehl | |
| start_index <span class="hljs-number">17</span> | |
| end_index <span class="hljs-number">18</span>`,wrap:!1}}),ds=new g({props:{code:"ZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIoZXhhbXBsZSU1QiUyMnF1ZXN0aW9uJTIyJTVEJTJDJTIwZXhhbXBsZSU1QiUyMndvcmRzJTIyJTVEJTJDJTIwZXhhbXBsZSU1QiUyMmJveGVzJTIyJTVEKSUwQXRva2VuaXplci5kZWNvZGUoZW5jb2RpbmclNUIlMjJpbnB1dF9pZHMlMjIlNUQp",highlighted:`<span class="hljs-meta">>>> </span>encoding = tokenizer(example[<span class="hljs-string">"question"</span>], example[<span class="hljs-string">"words"</span>], example[<span class="hljs-string">"boxes"</span>]) | |
| <span class="hljs-meta">>>> </span>tokenizer.decode(encoding[<span class="hljs-string">"input_ids"</span>]) | |
| [CLS] who <span class="hljs-keyword">is</span> <span class="hljs-keyword">in</span> cc <span class="hljs-keyword">in</span> this letter? [SEP] wie baw brown & williamson tobacco corporation research & development ...`,wrap:!1}}),Ts=new g({props:{code:"ZGVmJTIwZW5jb2RlX2RhdGFzZXQoZXhhbXBsZXMlMkMlMjBtYXhfbGVuZ3RoJTNENTEyKSUzQSUwQSUyMCUyMCUyMCUyMHF1ZXN0aW9ucyUyMCUzRCUyMGV4YW1wbGVzJTVCJTIycXVlc3Rpb24lMjIlNUQlMEElMjAlMjAlMjAlMjB3b3JkcyUyMCUzRCUyMGV4YW1wbGVzJTVCJTIyd29yZHMlMjIlNUQlMEElMjAlMjAlMjAlMjBib3hlcyUyMCUzRCUyMGV4YW1wbGVzJTVCJTIyYm94ZXMlMjIlNUQlMEElMjAlMjAlMjAlMjBhbnN3ZXJzJTIwJTNEJTIwZXhhbXBsZXMlNUIlMjJhbnN3ZXIlMjIlNUQlMEElMEElMjAlMjAlMjAlMjAlMjMlMjAlRUMlOTglODglRUMlQTAlOUMlMjAlRUIlQjAlQjAlRUMlQjklOTglRUIlQTUlQkMlMjAlRUMlOUQlQjglRUMlQkQlOTQlRUIlOTQlQTklRUQlOTUlOTglRUElQjMlQTAlMjBzdGFydF9wb3NpdGlvbnMlRUMlOTklODAlMjBlbmRfcG9zaXRpb25zJUVCJUE1JUJDJTIwJUVDJUI0JTg4JUVBJUI4JUIwJUVEJTk5JTk0JUVEJTk1JUE5JUVCJThCJTg4JUVCJThCJUE0JTBBJTIwJTIwJTIwJTIwZW5jb2RpbmclMjAlM0QlMjB0b2tlbml6ZXIocXVlc3Rpb25zJTJDJTIwd29yZHMlMkMlMjBib3hlcyUyQyUyMG1heF9sZW5ndGglM0RtYXhfbGVuZ3RoJTJDJTIwcGFkZGluZyUzRCUyMm1heF9sZW5ndGglMjIlMkMlMjB0cnVuY2F0aW9uJTNEVHJ1ZSklMEElMjAlMjAlMjAlMjBzdGFydF9wb3NpdGlvbnMlMjAlM0QlMjAlNUIlNUQlMEElMjAlMjAlMjAlMjBlbmRfcG9zaXRpb25zJTIwJTNEJTIwJTVCJTVEJTBBJTBBJTIwJTIwJTIwJTIwJTIzJTIwJUVCJUIwJUIwJUVDJUI5JTk4JUVDJTlEJTk4JTIwJUVDJTk4JTg4JUVDJUEwJTlDJUVCJUE1JUJDJTIwJUVCJUIwJTk4JUVCJUIzJUI1JUVEJTk1JUE5JUVCJThCJTg4JUVCJThCJUE0JTBBJTIwJTIwJTIwJTIwZm9yJTIwaSUyMGluJTIwcmFuZ2UobGVuKHF1ZXN0aW9ucykpJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwY2xzX2luZGV4JTIwJTNEJTIwZW5jb2RpbmclNUIlMjJpbnB1dF9pZHMlMjIlNUQlNUJpJTVELmluZGV4KHRva2VuaXplci5jbHNfdG9rZW5faWQpJTBBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIzJTIwJUVDJTk4JTg4JUVDJUEwJTlDJUVDJTlEJTk4JTIwd29yZHMlRUMlOTclOTAlRUMlODQlOUMlMjAlRUIlOEIlQjUlRUIlQjMlODAlRUMlOUQlOTglMjAlRUMlOUMlODQlRUMlQjklOTglRUIlQTUlQkMlMjAlRUMlQjAlQkUlRUMlOEElQjUlRUIlOEIlODglRUIlOEIlQTQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3b3Jkc19leGFtcGxlJTIwJTNEJTIwJTVCd29yZC5sb3dlcigpJTIwZm9yJT
Iwd29yZCUyMGluJTIwd29yZHMlNUJpJTVEJTVEJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwYW5zd2VyJTIwJTNEJTIwYW5zd2VycyU1QmklNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBtYXRjaCUyQyUyMHdvcmRfaWR4X3N0YXJ0JTJDJTIwd29yZF9pZHhfZW5kJTIwJTNEJTIwc3ViZmluZGVyKHdvcmRzX2V4YW1wbGUlMkMlMjBhbnN3ZXIubG93ZXIoKS5zcGxpdCgpKSUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGlmJTIwbWF0Y2glM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjMlMjAlRUMlOUQlQkMlRUMlQjklOTglRUQlOTUlOTglRUIlOEElOTQlMjAlRUQlOTUlQUQlRUIlQUElQTklRUMlOUQlODQlMjAlRUIlQjAlOUMlRUElQjIlQUMlRUQlOTUlOTglRUIlQTklQjQlMkMlMjAlNjB0b2tlbl90eXBlX2lkcyU2MCVFQiVBNSVCQyUyMCVFQyU4MiVBQyVFQyU5QSVBOSVFRCU5NSVCNCUyMCVFQyU5RCVCOCVFQyVCRCU5NCVFQiU5NCVBOSVFQyU5NyU5MCVFQyU4NCU5QyUyMCVFQiU4QiVBOCVFQyU5NiVCNCVFQSVCMCU4MCUyMCVFQyU4QiU5QyVFQyU5RSU5MSVFRCU5NSU5OCVFQiU4QSU5NCUyMCVFQyU5QyU4NCVFQyVCOSU5OCVFQiVBNSVCQyUyMCVFQyVCMCVCRSVFQyU4QSVCNSVFQiU4QiU4OCVFQiU4QiVBNCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuX3R5cGVfaWRzJTIwJTNEJTIwZW5jb2RpbmclNUIlMjJ0b2tlbl90eXBlX2lkcyUyMiU1RCU1QmklNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB0b2tlbl9zdGFydF9pbmRleCUyMCUzRCUyMDAlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aGlsZSUyMHRva2VuX3R5cGVfaWRzJTVCdG9rZW5fc3RhcnRfaW5kZXglNUQlMjAhJTNEJTIwMSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuX3N0YXJ0X2luZGV4JTIwJTJCJTNEJTIwMSUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuX2VuZF9pbmRleCUyMCUzRCUyMGxlbihlbmNvZGluZyU1QiUyMmlucHV0X2lkcyUyMiU1RCU1QmklNUQpJTIwLSUyMDElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB3aGlsZSUyMHRva2VuX3R5cGVfaWRzJTVCdG9rZW5fZW5kX2luZGV4JTVEJTIwISUzRCUyMDElM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjB0b2tlbl9lbmRfaW5kZXglMjAtJTNEJTIwMSUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHdvcmRfaWRzJTIwJTNEJTIwZW5jb2Rpbmcud29yZF9pZHMoaSklNUJ0b2tlbl9zdGFydF9pbmRleCUyMCUzQSUyMHRva2VuX2VuZF9pbmRleCUyMCUyQiUyMDElNUQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzdGFydF9wb3NpdGlvbiUyMCUzRCUyMGNsc19pbm
RleCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVuZF9wb3NpdGlvbiUyMCUzRCUyMGNsc19pbmRleCUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMHdvcmRzJUVDJTlEJTk4JTIwJUVCJThCJUI1JUVCJUIzJTgwJTIwJUVDJTlDJTg0JUVDJUI5JTk4JUVDJTk5JTgwJTIwJUVDJTlEJUJDJUVDJUI5JTk4JUVEJTk1JUEwJTIwJUVCJTk1JThDJUVBJUI5JThDJUVDJUE3JTgwJTIwd29yZF9pZHMlRUIlQTUlQkMlMjAlRUIlQjAlOTglRUIlQjMlQjUlRUQlOTUlOTglRUElQjMlQTAlMjAlNjB0b2tlbl9zdGFydF9pbmRleCU2MCVFQiVBNSVCQyUyMCVFQiU4QSU5OCVFQiVBNiVCRCVFQiU4QiU4OCVFQiU4QiVBNCUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMCVFQyU5RCVCQyVFQyVCOSU5OCVFRCU5NSU5OCVFQiVBOSVCNCUyMCU2MHRva2VuX3N0YXJ0X2luZGV4JTYwJUVCJUE1JUJDJTIwJUVDJTlEJUI4JUVDJUJEJTk0JUVCJTk0JUE5JUVDJTk3JTkwJUVDJTg0JTlDJTIwJUVCJThCJUI1JUVCJUIzJTgwJUVDJTlEJTk4JTIwJTYwc3RhcnRfcG9zaXRpb24lNjAlRUMlOUMlQkMlRUIlQTElOUMlMjAlRUMlQTAlODAlRUMlOUUlQTUlRUQlOTUlQTklRUIlOEIlODglRUIlOEIlQTQlMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBmb3IlMjBpZCUyMGluJTIwd29yZF9pZHMlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBpZiUyMGlkJTIwJTNEJTNEJTIwd29yZF9pZHhfc3RhcnQlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBzdGFydF9wb3NpdGlvbiUyMCUzRCUyMHRva2VuX3N0YXJ0X2luZGV4JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuX3N0YXJ0X2luZGV4JTIwJTJCJTNEJTIwMSUwQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMyUyMCVFQiVCOSU4NCVFQyU4QSVCNyVFRCU5NSU5OCVFQSVCMiU4QyUyQyUyMCVFQiU4MSU5RCVFQyU5NyU5MCVFQyU4NCU5QyUyMCVFQyU4QiU5QyVFQyU5RSU5MSVFRCU5NSVCNCUyMCU2MHdvcmRfaWRzJTYwJUVCJUE1JUJDJTIwJUVCJUIwJTk4JUVCJUIzJUI1JUVEJTk1JTk4JUVCJUE5JUIwJTIwJUVCJThCJUI1JUVCJUIzJTgwJUVDJTlEJTk4JTIwJTYwZW5kX3Bvc2l0aW9uJTYwJUVDJTlEJTg0JTIwJUVDJUIwJUJFJUVDJThBJUI1JUVCJThCJTg4JUVCJThCJUE0JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZm9yJTIwaWQlMjBpbiUyMHdvcmRfaWRzJTVCJTNBJTNBLTElNUQlM0ElMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMj
AlMjAlMjAlMjBpZiUyMGlkJTIwJTNEJTNEJTIwd29yZF9pZHhfZW5kJTNBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZW5kX3Bvc2l0aW9uJTIwJTNEJTIwdG9rZW5fZW5kX2luZGV4JTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHRva2VuX2VuZF9pbmRleCUyMC0lM0QlMjAxJTBBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwc3RhcnRfcG9zaXRpb25zLmFwcGVuZChzdGFydF9wb3NpdGlvbiklMEElMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjAlMjBlbmRfcG9zaXRpb25zLmFwcGVuZChlbmRfcG9zaXRpb24pJTBBJTBBJTIwJTIwJTIwJTIwJTIwJTIwJTIwJTIwZWxzZSUzQSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMHN0YXJ0X3Bvc2l0aW9ucy5hcHBlbmQoY2xzX2luZGV4KSUwQSUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMCUyMGVuZF9wb3NpdGlvbnMuYXBwZW5kKGNsc19pbmRleCklMEElMEElMjAlMjAlMjAlMjBlbmNvZGluZyU1QiUyMmltYWdlJTIyJTVEJTIwJTNEJTIwZXhhbXBsZXMlNUIlMjJpbWFnZSUyMiU1RCUwQSUyMCUyMCUyMCUyMGVuY29kaW5nJTVCJTIyc3RhcnRfcG9zaXRpb25zJTIyJTVEJTIwJTNEJTIwc3RhcnRfcG9zaXRpb25zJTBBJTIwJTIwJTIwJTIwZW5jb2RpbmclNUIlMjJlbmRfcG9zaXRpb25zJTIyJTVEJTIwJTNEJTIwZW5kX3Bvc2l0aW9ucyUwQSUwQSUyMCUyMCUyMCUyMHJldHVybiUyMGVuY29kaW5n",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_dataset</span>(<span class="hljs-params">examples, max_length=<span class="hljs-number">512</span></span>): | |
| <span class="hljs-meta">... </span> questions = examples[<span class="hljs-string">"question"</span>] | |
| <span class="hljs-meta">... </span> words = examples[<span class="hljs-string">"words"</span>] | |
| <span class="hljs-meta">... </span> boxes = examples[<span class="hljs-string">"boxes"</span>] | |
| <span class="hljs-meta">... </span> answers = examples[<span class="hljs-string">"answer"</span>] | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># 예제 배치를 인코딩하고 start_positions와 end_positions를 초기화합니다</span> | |
| <span class="hljs-meta">... </span> encoding = tokenizer(questions, words, boxes, max_length=max_length, padding=<span class="hljs-string">"max_length"</span>, truncation=<span class="hljs-literal">True</span>) | |
| <span class="hljs-meta">... </span> start_positions = [] | |
| <span class="hljs-meta">... </span> end_positions = [] | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># 배치의 예제를 반복합니다</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(questions)): | |
| <span class="hljs-meta">... </span> cls_index = encoding[<span class="hljs-string">"input_ids"</span>][i].index(tokenizer.cls_token_id) | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># 예제의 words에서 답변의 위치를 찾습니다</span> | |
| <span class="hljs-meta">... </span> words_example = [word.lower() <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> words[i]] | |
| <span class="hljs-meta">... </span> answer = answers[i] | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">match</span>, word_idx_start, word_idx_end = subfinder(words_example, answer.lower().split()) | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span>: | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># 일치하는 항목을 발견하면, \`token_type_ids\`를 사용해 인코딩에서 단어가 시작하는 위치를 찾습니다</span> | |
| <span class="hljs-meta">... </span> token_type_ids = encoding[<span class="hljs-string">"token_type_ids"</span>][i] | |
| <span class="hljs-meta">... </span> token_start_index = <span class="hljs-number">0</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">while</span> token_type_ids[token_start_index] != <span class="hljs-number">1</span>: | |
| <span class="hljs-meta">... </span> token_start_index += <span class="hljs-number">1</span> | |
| <span class="hljs-meta">... </span> token_end_index = <span class="hljs-built_in">len</span>(encoding[<span class="hljs-string">"input_ids"</span>][i]) - <span class="hljs-number">1</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">while</span> token_type_ids[token_end_index] != <span class="hljs-number">1</span>: | |
| <span class="hljs-meta">... </span> token_end_index -= <span class="hljs-number">1</span> | |
| <span class="hljs-meta">... </span> word_ids = encoding.word_ids(i)[token_start_index : token_end_index + <span class="hljs-number">1</span>] | |
| <span class="hljs-meta">... </span> start_position = cls_index | |
| <span class="hljs-meta">... </span> end_position = cls_index | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># words의 답변 위치와 일치할 때까지 word_ids를 반복하고 \`token_start_index\`를 늘립니다</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># 일치하면 \`token_start_index\`를 인코딩에서 답변의 \`start_position\`으로 저장합니다</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> <span class="hljs-built_in">id</span> <span class="hljs-keyword">in</span> word_ids: | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> <span class="hljs-built_in">id</span> == word_idx_start: | |
| <span class="hljs-meta">... </span> start_position = token_start_index | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>: | |
| <span class="hljs-meta">... </span> token_start_index += <span class="hljs-number">1</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-comment"># 비슷하게, 끝에서 시작해 \`word_ids\`를 반복하며 답변의 \`end_position\`을 찾습니다</span> | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> <span class="hljs-built_in">id</span> <span class="hljs-keyword">in</span> word_ids[::-<span class="hljs-number">1</span>]: | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> <span class="hljs-built_in">id</span> == word_idx_end: | |
| <span class="hljs-meta">... </span> end_position = token_end_index | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>: | |
| <span class="hljs-meta">... </span> token_end_index -= <span class="hljs-number">1</span> | |
| <span class="hljs-meta">... </span> start_positions.append(start_position) | |
| <span class="hljs-meta">... </span> end_positions.append(end_position) | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>: | |
| <span class="hljs-meta">... </span> start_positions.append(cls_index) | |
| <span class="hljs-meta">... </span> end_positions.append(cls_index) | |
| <span class="hljs-meta">... </span> encoding[<span class="hljs-string">"image"</span>] = examples[<span class="hljs-string">"image"</span>] | |
| <span class="hljs-meta">... </span> encoding[<span class="hljs-string">"start_positions"</span>] = start_positions | |
| <span class="hljs-meta">... </span> encoding[<span class="hljs-string">"end_positions"</span>] = end_positions | |
| <span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> encoding`,wrap:!1}}),Cs=new g({props:{code:"ZW5jb2RlZF90cmFpbl9kYXRhc2V0JTIwJTNEJTIwZGF0YXNldF93aXRoX29jciU1QiUyMnRyYWluJTIyJTVELm1hcCglMEElMjAlMjAlMjAlMjBlbmNvZGVfZGF0YXNldCUyQyUyMGJhdGNoZWQlM0RUcnVlJTJDJTIwYmF0Y2hfc2l6ZSUzRDIlMkMlMjByZW1vdmVfY29sdW1ucyUzRGRhdGFzZXRfd2l0aF9vY3IlNUIlMjJ0cmFpbiUyMiU1RC5jb2x1bW5fbmFtZXMlMEEpJTBBZW5jb2RlZF90ZXN0X2RhdGFzZXQlMjAlM0QlMjBkYXRhc2V0X3dpdGhfb2NyJTVCJTIydGVzdCUyMiU1RC5tYXAoJTBBJTIwJTIwJTIwJTIwZW5jb2RlX2RhdGFzZXQlMkMlMjBiYXRjaGVkJTNEVHJ1ZSUyQyUyMGJhdGNoX3NpemUlM0QyJTJDJTIwcmVtb3ZlX2NvbHVtbnMlM0RkYXRhc2V0X3dpdGhfb2NyJTVCJTIydGVzdCUyMiU1RC5jb2x1bW5fbmFtZXMlMEEp",highlighted:`<span class="hljs-meta">>>> </span>encoded_train_dataset = dataset_with_ocr[<span class="hljs-string">"train"</span>].<span class="hljs-built_in">map</span>( | |
| <span class="hljs-meta">... </span> encode_dataset, batched=<span class="hljs-literal">True</span>, batch_size=<span class="hljs-number">2</span>, remove_columns=dataset_with_ocr[<span class="hljs-string">"train"</span>].column_names | |
| <span class="hljs-meta">... </span>) | |
| <span class="hljs-meta">>>> </span>encoded_test_dataset = dataset_with_ocr[<span class="hljs-string">"test"</span>].<span class="hljs-built_in">map</span>( | |
| <span class="hljs-meta">... </span> encode_dataset, batched=<span class="hljs-literal">True</span>, batch_size=<span class="hljs-number">2</span>, remove_columns=dataset_with_ocr[<span class="hljs-string">"test"</span>].column_names | |
| <span class="hljs-meta">... </span>)`,wrap:!1}}),fs=new g({props:{code:"ZW5jb2RlZF90cmFpbl9kYXRhc2V0LmZlYXR1cmVz",highlighted:`<span class="hljs-meta">>>> </span>encoded_train_dataset.features | |
| {<span class="hljs-string">'image'</span>: <span class="hljs-type">Sequence</span>(feature=<span class="hljs-type">Sequence</span>(feature=<span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">'uint8'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'input_ids'</span>: <span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">'int32'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'token_type_ids'</span>: <span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">'int8'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'attention_mask'</span>: <span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">'int8'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'bbox'</span>: <span class="hljs-type">Sequence</span>(feature=<span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">'int64'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'start_positions'</span>: Value(dtype=<span class="hljs-string">'int64'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), | |
| <span class="hljs-string">'end_positions'</span>: Value(dtype=<span class="hljs-string">'int64'</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>)}`,wrap:!1}}),Is=new C({props:{title:"평가",local:"evaluation",headingTag:"h2"}}),Vs=new C({props:{title:"훈련",local:"train",headingTag:"h2"}}),Zs=new g({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMEF1dG9Nb2RlbEZvckRvY3VtZW50UXVlc3Rpb25BbnN3ZXJpbmclMEElMEFtb2RlbCUyMCUzRCUyMEF1dG9Nb2RlbEZvckRvY3VtZW50UXVlc3Rpb25BbnN3ZXJpbmcuZnJvbV9wcmV0cmFpbmVkKG1vZGVsX2NoZWNrcG9pbnQp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForDocumentQuestionAnswering | |
| <span class="hljs-meta">>>> </span>model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint)`,wrap:!1}}),As=new g({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFRyYWluaW5nQXJndW1lbnRzJTBBJTBBJTIzJTIwJUVCJUIzJUI4JUVDJTlEJUI4JUVDJTlEJTk4JTIwJUVCJUEwJTg4JUVEJThGJUFDJUVDJUE3JTgwJUVEJTg2JUEwJUVCJUE2JUFDJTIwSUQlRUIlQTElOUMlMjAlRUIlQjAlOTQlRUElQkUlQjglRUMlODQlQjglRUMlOUElOTQlMEFyZXBvX2lkJTIwJTNEJTIwJTIyTWFyaWFLJTJGbGF5b3V0bG12Mi1iYXNlLXVuY2FzZWRfZmluZXR1bmVkX2RvY3ZxYSUyMiUwQSUwQXRyYWluaW5nX2FyZ3MlMjAlM0QlMjBUcmFpbmluZ0FyZ3VtZW50cyglMEElMjAlMjAlMjAlMjBvdXRwdXRfZGlyJTNEcmVwb19pZCUyQyUwQSUyMCUyMCUyMCUyMHBlcl9kZXZpY2VfdHJhaW5fYmF0Y2hfc2l6ZSUzRDQlMkMlMEElMjAlMjAlMjAlMjBudW1fdHJhaW5fZXBvY2hzJTNEMjAlMkMlMEElMjAlMjAlMjAlMjBzYXZlX3N0ZXBzJTNEMjAwJTJDJTBBJTIwJTIwJTIwJTIwbG9nZ2luZ19zdGVwcyUzRDUwJTJDJTBBJTIwJTIwJTIwJTIwZXZhbHVhdGlvbl9zdHJhdGVneSUzRCUyMnN0ZXBzJTIyJTJDJTBBJTIwJTIwJTIwJTIwbGVhcm5pbmdfcmF0ZSUzRDVlLTUlMkMlMEElMjAlMjAlMjAlMjBzYXZlX3RvdGFsX2xpbWl0JTNEMiUyQyUwQSUyMCUyMCUyMCUyMHJlbW92ZV91bnVzZWRfY29sdW1ucyUzREZhbHNlJTJDJTBBJTIwJTIwJTIwJTIwcHVzaF90b19odWIlM0RUcnVlJTJDJTBBKQ==",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TrainingArguments | |
| <span class="hljs-meta">>>> </span><span class="hljs-comment"># 본인의 레포지토리 ID로 바꾸세요</span> | |
| <span class="hljs-meta">>>> </span>repo_id = <span class="hljs-string">"MariaK/layoutlmv2-base-uncased_finetuned_docvqa"</span> | |
| <span class="hljs-meta">>>> </span>training_args = TrainingArguments( | |
| <span class="hljs-meta">... </span> output_dir=repo_id, | |
| <span class="hljs-meta">... </span> per_device_train_batch_size=<span class="hljs-number">4</span>, | |
| <span class="hljs-meta">... </span> num_train_epochs=<span class="hljs-number">20</span>, | |
| <span class="hljs-meta">... </span> save_steps=<span class="hljs-number">200</span>, | |
| <span class="hljs-meta">... </span> logging_steps=<span class="hljs-number">50</span>, | |
| <span class="hljs-meta">... </span> evaluation_strategy=<span class="hljs-string">"steps"</span>, | |
| <span class="hljs-meta">... </span> learning_rate=<span class="hljs-number">5e-5</span>, | |
| <span class="hljs-meta">... </span> save_total_limit=<span class="hljs-number">2</span>, | |
| <span class="hljs-meta">... </span> remove_unused_columns=<span class="hljs-literal">False</span>, | |
| <span class="hljs-meta">... </span> push_to_hub=<span class="hljs-literal">True</span>, | |
| <span class="hljs-meta">... </span>)`,wrap:!1}}),Xs=new g({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMERlZmF1bHREYXRhQ29sbGF0b3IlMEElMEFkYXRhX2NvbGxhdG9yJTIwJTNEJTIwRGVmYXVsdERhdGFDb2xsYXRvcigp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> DefaultDataCollator | |
| <span class="hljs-meta">>>> </span>data_collator = DefaultDataCollator()`,wrap:!1}}),Ns=new g({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMFRyYWluZXIlMEElMEF0cmFpbmVyJTIwJTNEJTIwVHJhaW5lciglMEElMjAlMjAlMjAlMjBtb2RlbCUzRG1vZGVsJTJDJTBBJTIwJTIwJTIwJTIwYXJncyUzRHRyYWluaW5nX2FyZ3MlMkMlMEElMjAlMjAlMjAlMjBkYXRhX2NvbGxhdG9yJTNEZGF0YV9jb2xsYXRvciUyQyUwQSUyMCUyMCUyMCUyMHRyYWluX2RhdGFzZXQlM0RlbmNvZGVkX3RyYWluX2RhdGFzZXQlMkMlMEElMjAlMjAlMjAlMjBldmFsX2RhdGFzZXQlM0RlbmNvZGVkX3Rlc3RfZGF0YXNldCUyQyUwQSUyMCUyMCUyMCUyMHRva2VuaXplciUzRHByb2Nlc3NvciUyQyUwQSklMEElMEF0cmFpbmVyLnRyYWluKCk=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> Trainer | |
| <span class="hljs-meta">>>> </span>trainer = Trainer( | |
| <span class="hljs-meta">... </span> model=model, | |
| <span class="hljs-meta">... </span> args=training_args, | |
| <span class="hljs-meta">... </span> data_collator=data_collator, | |
| <span class="hljs-meta">... </span> train_dataset=encoded_train_dataset, | |
| <span class="hljs-meta">... </span> eval_dataset=encoded_test_dataset, | |
| <span class="hljs-meta">... </span> tokenizer=processor, | |
| <span class="hljs-meta">... </span>) | |
| <span class="hljs-meta">>>> </span>trainer.train()`,wrap:!1}}),Es=new g({props:{code:"dHJhaW5lci5jcmVhdGVfbW9kZWxfY2FyZCgpJTBBdHJhaW5lci5wdXNoX3RvX2h1Yigp",highlighted:`<span class="hljs-meta">>>> </span>trainer.create_model_card() | |
| <span class="hljs-meta">>>> </span>trainer.push_to_hub()`,wrap:!1}}),Bs=new C({props:{title:"추론",local:"inference",headingTag:"h2"}}),zs=new g({props:{code:"ZXhhbXBsZSUyMCUzRCUyMGRhdGFzZXQlNUIlMjJ0ZXN0JTIyJTVEJTVCMiU1RCUwQXF1ZXN0aW9uJTIwJTNEJTIwZXhhbXBsZSU1QiUyMnF1ZXJ5JTIyJTVEJTVCJTIyZW4lMjIlNUQlMEFpbWFnZSUyMCUzRCUyMGV4YW1wbGUlNUIlMjJpbWFnZSUyMiU1RCUwQXByaW50KHF1ZXN0aW9uKSUwQXByaW50KGV4YW1wbGUlNUIlMjJhbnN3ZXJzJTIyJTVEKQ==",highlighted:`<span class="hljs-meta">>>> </span>example = dataset[<span class="hljs-string">"test"</span>][<span class="hljs-number">2</span>] | |
| <span class="hljs-meta">>>> </span>question = example[<span class="hljs-string">"query"</span>][<span class="hljs-string">"en"</span>] | |
| <span class="hljs-meta">>>> </span>image = example[<span class="hljs-string">"image"</span>] | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(question) | |
| <span class="hljs-meta">>>> </span><span class="hljs-built_in">print</span>(example[<span class="hljs-string">"answers"</span>]) | |
| <span class="hljs-string">'Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?'</span> | |
| [<span class="hljs-string">'TRRF Vice President'</span>, <span class="hljs-string">'lee a. waller'</span>]`,wrap:!1}}),Ss=new g({props:{code:"ZnJvbSUyMHRyYW5zZm9ybWVycyUyMGltcG9ydCUyMHBpcGVsaW5lJTBBJTBBcWFfcGlwZWxpbmUlMjAlM0QlMjBwaXBlbGluZSglMjJkb2N1bWVudC1xdWVzdGlvbi1hbnN3ZXJpbmclMjIlMkMlMjBtb2RlbCUzRCUyMk1hcmlhSyUyRmxheW91dGxtdjItYmFzZS11bmNhc2VkX2ZpbmV0dW5lZF9kb2N2cWElMjIpJTBBcWFfcGlwZWxpbmUoaW1hZ2UlMkMlMjBxdWVzdGlvbik=",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline | |
| <span class="hljs-meta">>>> </span>qa_pipeline = pipeline(<span class="hljs-string">"document-question-answering"</span>, model=<span class="hljs-string">"MariaK/layoutlmv2-base-uncased_finetuned_docvqa"</span>) | |
| <span class="hljs-meta">>>> </span>qa_pipeline(image, question) | |
| [{<span class="hljs-string">'score'</span>: <span class="hljs-number">0.9949808120727539</span>, | |
| <span class="hljs-string">'answer'</span>: <span class="hljs-string">'Lee A. Waller'</span>, | |
| <span class="hljs-string">'start'</span>: <span class="hljs-number">55</span>, | |
| <span class="hljs-string">'end'</span>: <span class="hljs-number">57</span>}]`,wrap:!1}}),Ds=new g({props:{code:"aW1wb3J0JTIwdG9yY2glMEFmcm9tJTIwdHJhbnNmb3JtZXJzJTIwaW1wb3J0JTIwQXV0b1Byb2Nlc3NvciUwQWZyb20lMjB0cmFuc2Zvcm1lcnMlMjBpbXBvcnQlMjBBdXRvTW9kZWxGb3JEb2N1bWVudFF1ZXN0aW9uQW5zd2VyaW5nJTBBJTBBcHJvY2Vzc29yJTIwJTNEJTIwQXV0b1Byb2Nlc3Nvci5mcm9tX3ByZXRyYWluZWQoJTIyTWFyaWFLJTJGbGF5b3V0bG12Mi1iYXNlLXVuY2FzZWRfZmluZXR1bmVkX2RvY3ZxYSUyMiklMEFtb2RlbCUyMCUzRCUyMEF1dG9Nb2RlbEZvckRvY3VtZW50UXVlc3Rpb25BbnN3ZXJpbmcuZnJvbV9wcmV0cmFpbmVkKCUyMk1hcmlhSyUyRmxheW91dGxtdjItYmFzZS11bmNhc2VkX2ZpbmV0dW5lZF9kb2N2cWElMjIpJTBBJTBBd2l0aCUyMHRvcmNoLm5vX2dyYWQoKSUzQSUwQSUyMCUyMCUyMCUyMGVuY29kaW5nJTIwJTNEJTIwcHJvY2Vzc29yKGltYWdlLmNvbnZlcnQoJTIyUkdCJTIyKSUyQyUyMHF1ZXN0aW9uJTJDJTIwcmV0dXJuX3RlbnNvcnMlM0QlMjJwdCUyMiklMEElMjAlMjAlMjAlMjBvdXRwdXRzJTIwJTNEJTIwbW9kZWwoKiplbmNvZGluZyklMEElMjAlMjAlMjAlMjBzdGFydF9sb2dpdHMlMjAlM0QlMjBvdXRwdXRzLnN0YXJ0X2xvZ2l0cyUwQSUyMCUyMCUyMCUyMGVuZF9sb2dpdHMlMjAlM0QlMjBvdXRwdXRzLmVuZF9sb2dpdHMlMEElMjAlMjAlMjAlMjBwcmVkaWN0ZWRfc3RhcnRfaWR4JTIwJTNEJTIwc3RhcnRfbG9naXRzLmFyZ21heCgtMSkuaXRlbSgpJTBBJTIwJTIwJTIwJTIwcHJlZGljdGVkX2VuZF9pZHglMjAlM0QlMjBlbmRfbG9naXRzLmFyZ21heCgtMSkuaXRlbSgpJTBBJTBBcHJvY2Vzc29yLnRva2VuaXplci5kZWNvZGUoZW5jb2RpbmcuaW5wdXRfaWRzLnNxdWVlemUoKSU1QnByZWRpY3RlZF9zdGFydF9pZHglMjAlM0ElMjBwcmVkaWN0ZWRfZW5kX2lkeCUyMCUyQiUyMDElNUQp",highlighted:`<span class="hljs-meta">>>> </span><span class="hljs-keyword">import</span> torch | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForDocumentQuestionAnswering | |
| <span class="hljs-meta">>>> </span>processor = AutoProcessor.from_pretrained(<span class="hljs-string">"MariaK/layoutlmv2-base-uncased_finetuned_docvqa"</span>) | |
| <span class="hljs-meta">>>> </span>model = AutoModelForDocumentQuestionAnswering.from_pretrained(<span class="hljs-string">"MariaK/layoutlmv2-base-uncased_finetuned_docvqa"</span>) | |
| <span class="hljs-meta">>>> </span><span class="hljs-keyword">with</span> torch.no_grad(): | |
| <span class="hljs-meta">... </span> encoding = processor(image.convert(<span class="hljs-string">"RGB"</span>), question, return_tensors=<span class="hljs-string">"pt"</span>) | |
| <span class="hljs-meta">... </span> outputs = model(**encoding) | |
| <span class="hljs-meta">... </span> start_logits = outputs.start_logits | |
| <span class="hljs-meta">... </span> end_logits = outputs.end_logits | |
| <span class="hljs-meta">... </span> predicted_start_idx = start_logits.argmax(-<span class="hljs-number">1</span>).item() | |
| <span class="hljs-meta">... </span> predicted_end_idx = end_logits.argmax(-<span class="hljs-number">1</span>).item() | |
| <span class="hljs-meta">>>> </span>processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + <span class="hljs-number">1</span>]) | |
| <span class="hljs-string">'lee a. waller'</span>`,wrap:!1}}),{c(){U=p("meta"),b=t(),y=p("p"),d=t(),j(w.$$.fragment),m=t(),j(J.$$.fragment),Ps=t(),f=p("p"),f.textContent=Ia,Ks=t(),I=p("p"),I.textContent=_a,sl=t(),_=p("ul"),_.innerHTML=Va,ll=t(),j(T.$$.fragment),al=t(),V=p("p"),V.textContent=Ra,nl=t(),R=p("p"),R.textContent=ka,tl=t(),j(k.$$.fragment),el=t(),j(Z.$$.fragment),pl=t(),j(Q.$$.fragment),il=t(),A=p("p"),A.textContent=Za,cl=t(),$=p("p"),$.textContent=Qa,jl=t(),j(X.$$.fragment),rl=t(),v=p("p"),v.textContent=Aa,Ml=t(),j(N.$$.fragment),hl=t(),j(G.$$.fragment),ol=t(),E=p("p"),E.innerHTML=$a,xl=t(),j(B.$$.fragment),gl=t(),F=p("p"),F.textContent=Xa,Ul=t(),j(W.$$.fragment),ml=t(),z=p("p"),z.textContent=va,dl=t(),q=p("ul"),q.innerHTML=Na,Jl=t(),S=p("p"),S.innerHTML=Ga,yl=t(),j(Y.$$.fragment),wl=t(),H=p("p"),H.innerHTML=Ea,Tl=t(),j(D.$$.fragment),ul=t(),L=p("p"),L.innerHTML=Ba,Cl=t(),j(O.$$.fragment),bl=t(),P=p("p"),P.textContent=Fa,fl=t(),j(K.$$.fragment),Il=t(),u=p("div"),u.innerHTML=Wa,_l=t(),j(ss.$$.fragment),Vl=t(),ls=p("p"),ls.innerHTML=za,Rl=t(),j(as.$$.fragment),kl=t(),j(ns.$$.fragment),Zl=t(),ts=p("p"),ts.innerHTML=qa,Ql=t(),j(es.$$.fragment),Al=t(),ps=p("p"),ps.innerHTML=Sa,$l=t(),j(is.$$.fragment),Xl=t(),j(cs.$$.fragment),vl=t(),js=p("p"),js.innerHTML=Ya,Nl=t(),j(rs.$$.fragment),Gl=t(),Ms=p("p"),Ms.innerHTML=Ha,El=t(),hs=p("p"),hs.textContent=Da,Bl=t(),os=p("p"),os.innerHTML=La,Fl=t(),j(xs.$$.fragment),Wl=t(),gs=p("p"),gs.textContent=Oa,zl=t(),j(Us.$$.fragment),ql=t(),ms=p("p"),ms.textContent=Pa,Sl=t(),j(ds.$$.fragment),Yl=t(),Js=p("p"),Js.textContent=Ka,Hl=t(),ys=p("ul"),ys.innerHTML=sn,Dl=t(),ws=p("p"),ws.textContent=ln,Ll=t(),j(Ts.$$.fragment),Ol=t(),us=p("p"),us.textContent=an,Pl=t(),j(Cs.$$.fragment),Kl=t(),bs=p("p"),bs.textContent=nn,sa=t(),j(fs.$$.fragment),la=t(),j(Is.$$.fragment),aa=t(),_s=p("p"),_s.innerHTML=tn,na=t(),j(Vs.$$.fragment),ta=t(),Rs=p("p"),Rs.textContent=en,ea=t(),ks=p("ul"),ks.innerHTML=pn,pa=t(),j(Zs.$$.fragment),ia=t(),Qs=p("p"),
Qs.innerHTML=cn,ca=t(),j(As.$$.fragment),ja=t(),$s=p("p"),$s.textContent=jn,ra=t(),j(Xs.$$.fragment),Ma=t(),vs=p("p"),vs.innerHTML=rn,ha=t(),j(Ns.$$.fragment),oa=t(),Gs=p("p"),Gs.innerHTML=Mn,xa=t(),j(Es.$$.fragment),ga=t(),j(Bs.$$.fragment),Ua=t(),Fs=p("p"),Fs.innerHTML=hn,ma=t(),Ws=p("p"),Ws.textContent=on,da=t(),j(zs.$$.fragment),Ja=t(),qs=p("p"),qs.textContent=xn,ya=t(),j(Ss.$$.fragment),wa=t(),Ys=p("p"),Ys.textContent=gn,Ta=t(),Hs=p("ol"),Hs.innerHTML=Un,ua=t(),j(Ds.$$.fragment),Ca=t(),Ls=p("p"),this.h()},l(s){const l=Cn("svelte-u9bgzb",document.head);U=i(l,"META",{name:!0,content:!0}),l.forEach(a),b=e(s),y=i(s,"P",{}),dn(y).forEach(a),d=e(s),r(w.$$.fragment,s),m=e(s),r(J.$$.fragment,s),Ps=e(s),f=i(s,"P",{"data-svelte-h":!0}),c(f)!=="svelte-oqnn9h"&&(f.textContent=Ia),Ks=e(s),I=i(s,"P",{"data-svelte-h":!0}),c(I)!=="svelte-k9bbb9"&&(I.textContent=_a),sl=e(s),_=i(s,"UL",{"data-svelte-h":!0}),c(_)!=="svelte-1t9y1pd"&&(_.innerHTML=Va),ll=e(s),r(T.$$.fragment,s),al=e(s),V=i(s,"P",{"data-svelte-h":!0}),c(V)!=="svelte-1t45j8o"&&(V.textContent=Ra),nl=e(s),R=i(s,"P",{"data-svelte-h":!0}),c(R)!=="svelte-ddzvb0"&&(R.textContent=ka),tl=e(s),r(k.$$.fragment,s),el=e(s),r(Z.$$.fragment,s),pl=e(s),r(Q.$$.fragment,s),il=e(s),A=i(s,"P",{"data-svelte-h":!0}),c(A)!=="svelte-1uvmu64"&&(A.textContent=Za),cl=e(s),$=i(s,"P",{"data-svelte-h":!0}),c($)!=="svelte-xyvcw8"&&($.textContent=Qa),jl=e(s),r(X.$$.fragment,s),rl=e(s),v=i(s,"P",{"data-svelte-h":!0}),c(v)!=="svelte-13bifrs"&&(v.textContent=Aa),Ml=e(s),r(N.$$.fragment,s),hl=e(s),r(G.$$.fragment,s),ol=e(s),E=i(s,"P",{"data-svelte-h":!0}),c(E)!=="svelte-12y9xdp"&&(E.innerHTML=$a),xl=e(s),r(B.$$.fragment,s),gl=e(s),F=i(s,"P",{"data-svelte-h":!0}),c(F)!=="svelte-9rg4tz"&&(F.textContent=Xa),Ul=e(s),r(W.$$.fragment,s),ml=e(s),z=i(s,"P",{"data-svelte-h":!0}),c(z)!=="svelte-zq0ej4"&&(z.textContent=va),dl=e(s),q=i(s,"UL",{"data-svelte-h":!0}),c(q)!=="svelte-g2ws24"&&(q.innerHTML=Na),Jl=e(s),S=i(s,"P",{"data-svelte-h":!0}),c(S)!=="svelte-xeap
og"&&(S.innerHTML=Ga),yl=e(s),r(Y.$$.fragment,s),wl=e(s),H=i(s,"P",{"data-svelte-h":!0}),c(H)!=="svelte-m4zkx9"&&(H.innerHTML=Ea),Tl=e(s),r(D.$$.fragment,s),ul=e(s),L=i(s,"P",{"data-svelte-h":!0}),c(L)!=="svelte-1xnig6m"&&(L.innerHTML=Ba),Cl=e(s),r(O.$$.fragment,s),bl=e(s),P=i(s,"P",{"data-svelte-h":!0}),c(P)!=="svelte-lp4iy7"&&(P.textContent=Fa),fl=e(s),r(K.$$.fragment,s),Il=e(s),u=i(s,"DIV",{class:!0,"data-svelte-h":!0}),c(u)!=="svelte-q63tj1"&&(u.innerHTML=Wa),_l=e(s),r(ss.$$.fragment,s),Vl=e(s),ls=i(s,"P",{"data-svelte-h":!0}),c(ls)!=="svelte-1y93l9d"&&(ls.innerHTML=za),Rl=e(s),r(as.$$.fragment,s),kl=e(s),r(ns.$$.fragment,s),Zl=e(s),ts=i(s,"P",{"data-svelte-h":!0}),c(ts)!=="svelte-1kcsbqg"&&(ts.innerHTML=qa),Ql=e(s),r(es.$$.fragment,s),Al=e(s),ps=i(s,"P",{"data-svelte-h":!0}),c(ps)!=="svelte-1rksgl2"&&(ps.innerHTML=Sa),$l=e(s),r(is.$$.fragment,s),Xl=e(s),r(cs.$$.fragment,s),vl=e(s),js=i(s,"P",{"data-svelte-h":!0}),c(js)!=="svelte-1l26czp"&&(js.innerHTML=Ya),Nl=e(s),r(rs.$$.fragment,s),Gl=e(s),Ms=i(s,"P",{"data-svelte-h":!0}),c(Ms)!=="svelte-1yiqpne"&&(Ms.innerHTML=Ha),El=e(s),hs=i(s,"P",{"data-svelte-h":!0}),c(hs)!=="svelte-rkm9nf"&&(hs.textContent=Da),Bl=e(s),os=i(s,"P",{"data-svelte-h":!0}),c(os)!=="svelte-1pxhw5r"&&(os.innerHTML=La),Fl=e(s),r(xs.$$.fragment,s),Wl=e(s),gs=i(s,"P",{"data-svelte-h":!0}),c(gs)!=="svelte-mdy9jm"&&(gs.textContent=Oa),zl=e(s),r(Us.$$.fragment,s),ql=e(s),ms=i(s,"P",{"data-svelte-h":!0}),c(ms)!=="svelte-1gff4qz"&&(ms.textContent=Pa),Sl=e(s),r(ds.$$.fragment,s),Yl=e(s),Js=i(s,"P",{"data-svelte-h":!0}),c(Js)!=="svelte-zzn8kk"&&(Js.textContent=Ka),Hl=e(s),ys=i(s,"UL",{"data-svelte-h":!0}),c(ys)!=="svelte-f0s3pn"&&(ys.innerHTML=sn),Dl=e(s),ws=i(s,"P",{"data-svelte-h":!0}),c(ws)!=="svelte-1h1oq5v"&&(ws.textContent=ln),Ll=e(s),r(Ts.$$.fragment,s),Ol=e(s),us=i(s,"P",{"data-svelte-h":!0}),c(us)!=="svelte-10ovv0a"&&(us.textContent=an),Pl=e(s),r(Cs.$$.fragment,s),Kl=e(s),bs=i(s,"P",{"data-svelte-h":!0}),c(bs)!=="svelte-1cmvv07"&&(bs.textContent
=nn),sa=e(s),r(fs.$$.fragment,s),la=e(s),r(Is.$$.fragment,s),aa=e(s),_s=i(s,"P",{"data-svelte-h":!0}),c(_s)!=="svelte-8iuplh"&&(_s.innerHTML=tn),na=e(s),r(Vs.$$.fragment,s),ta=e(s),Rs=i(s,"P",{"data-svelte-h":!0}),c(Rs)!=="svelte-1r96dak"&&(Rs.textContent=en),ea=e(s),ks=i(s,"UL",{"data-svelte-h":!0}),c(ks)!=="svelte-dhr9ud"&&(ks.innerHTML=pn),pa=e(s),r(Zs.$$.fragment,s),ia=e(s),Qs=i(s,"P",{"data-svelte-h":!0}),c(Qs)!=="svelte-1wyv6ig"&&(Qs.innerHTML=cn),ca=e(s),r(As.$$.fragment,s),ja=e(s),$s=i(s,"P",{"data-svelte-h":!0}),c($s)!=="svelte-1hdarm6"&&($s.textContent=jn),ra=e(s),r(Xs.$$.fragment,s),Ma=e(s),vs=i(s,"P",{"data-svelte-h":!0}),c(vs)!=="svelte-1heolva"&&(vs.innerHTML=rn),ha=e(s),r(Ns.$$.fragment,s),oa=e(s),Gs=i(s,"P",{"data-svelte-h":!0}),c(Gs)!=="svelte-1gum9w7"&&(Gs.innerHTML=Mn),xa=e(s),r(Es.$$.fragment,s),ga=e(s),r(Bs.$$.fragment,s),Ua=e(s),Fs=i(s,"P",{"data-svelte-h":!0}),c(Fs)!=="svelte-1flysn8"&&(Fs.innerHTML=hn),ma=e(s),Ws=i(s,"P",{"data-svelte-h":!0}),c(Ws)!=="svelte-1dcjn75"&&(Ws.textContent=on),da=e(s),r(zs.$$.fragment,s),Ja=e(s),qs=i(s,"P",{"data-svelte-h":!0}),c(qs)!=="svelte-gaec9h"&&(qs.textContent=xn),ya=e(s),r(Ss.$$.fragment,s),wa=e(s),Ys=i(s,"P",{"data-svelte-h":!0}),c(Ys)!=="svelte-4epvs7"&&(Ys.textContent=gn),Ta=e(s),Hs=i(s,"OL",{"data-svelte-h":!0}),c(Hs)!=="svelte-10zqnq5"&&(Hs.innerHTML=Un),ua=e(s),r(Ds.$$.fragment,s),Ca=e(s),Ls=i(s,"P",{}),dn(Ls).forEach(a),this.h()},h(){fa(U,"name","hf:doc:metadata"),fa(U,"content",Rn),fa(u,"class","flex 
justify-center")},m(s,l){bn(document.head,U),n(s,b,l),n(s,y,l),n(s,d,l),M(w,s,l),n(s,m,l),M(J,s,l),n(s,Ps,l),n(s,f,l),n(s,Ks,l),n(s,I,l),n(s,sl,l),n(s,_,l),n(s,ll,l),M(T,s,l),n(s,al,l),n(s,V,l),n(s,nl,l),n(s,R,l),n(s,tl,l),M(k,s,l),n(s,el,l),M(Z,s,l),n(s,pl,l),M(Q,s,l),n(s,il,l),n(s,A,l),n(s,cl,l),n(s,$,l),n(s,jl,l),M(X,s,l),n(s,rl,l),n(s,v,l),n(s,Ml,l),M(N,s,l),n(s,hl,l),M(G,s,l),n(s,ol,l),n(s,E,l),n(s,xl,l),M(B,s,l),n(s,gl,l),n(s,F,l),n(s,Ul,l),M(W,s,l),n(s,ml,l),n(s,z,l),n(s,dl,l),n(s,q,l),n(s,Jl,l),n(s,S,l),n(s,yl,l),M(Y,s,l),n(s,wl,l),n(s,H,l),n(s,Tl,l),M(D,s,l),n(s,ul,l),n(s,L,l),n(s,Cl,l),M(O,s,l),n(s,bl,l),n(s,P,l),n(s,fl,l),M(K,s,l),n(s,Il,l),n(s,u,l),n(s,_l,l),M(ss,s,l),n(s,Vl,l),n(s,ls,l),n(s,Rl,l),M(as,s,l),n(s,kl,l),M(ns,s,l),n(s,Zl,l),n(s,ts,l),n(s,Ql,l),M(es,s,l),n(s,Al,l),n(s,ps,l),n(s,$l,l),M(is,s,l),n(s,Xl,l),M(cs,s,l),n(s,vl,l),n(s,js,l),n(s,Nl,l),M(rs,s,l),n(s,Gl,l),n(s,Ms,l),n(s,El,l),n(s,hs,l),n(s,Bl,l),n(s,os,l),n(s,Fl,l),M(xs,s,l),n(s,Wl,l),n(s,gs,l),n(s,zl,l),M(Us,s,l),n(s,ql,l),n(s,ms,l),n(s,Sl,l),M(ds,s,l),n(s,Yl,l),n(s,Js,l),n(s,Hl,l),n(s,ys,l),n(s,Dl,l),n(s,ws,l),n(s,Ll,l),M(Ts,s,l),n(s,Ol,l),n(s,us,l),n(s,Pl,l),M(Cs,s,l),n(s,Kl,l),n(s,bs,l),n(s,sa,l),M(fs,s,l),n(s,la,l),M(Is,s,l),n(s,aa,l),n(s,_s,l),n(s,na,l),M(Vs,s,l),n(s,ta,l),n(s,Rs,l),n(s,ea,l),n(s,ks,l),n(s,pa,l),M(Zs,s,l),n(s,ia,l),n(s,Qs,l),n(s,ca,l),M(As,s,l),n(s,ja,l),n(s,$s,l),n(s,ra,l),M(Xs,s,l),n(s,Ma,l),n(s,vs,l),n(s,ha,l),M(Ns,s,l),n(s,oa,l),n(s,Gs,l),n(s,xa,l),M(Es,s,l),n(s,ga,l),M(Bs,s,l),n(s,Ua,l),n(s,Fs,l),n(s,ma,l),n(s,Ws,l),n(s,da,l),M(zs,s,l),n(s,Ja,l),n(s,qs,l),n(s,ya,l),M(Ss,s,l),n(s,wa,l),n(s,Ys,l),n(s,Ta,l),n(s,Hs,l),n(s,ua,l),M(Ds,s,l),n(s,Ca,l),n(s,Ls,l),ba=!0},p(s,[l]){const 
mn={};l&2&&(mn.$$scope={dirty:l,ctx:s}),T.$set(mn)},i(s){ba||(h(w.$$.fragment,s),h(J.$$.fragment,s),h(T.$$.fragment,s),h(k.$$.fragment,s),h(Z.$$.fragment,s),h(Q.$$.fragment,s),h(X.$$.fragment,s),h(N.$$.fragment,s),h(G.$$.fragment,s),h(B.$$.fragment,s),h(W.$$.fragment,s),h(Y.$$.fragment,s),h(D.$$.fragment,s),h(O.$$.fragment,s),h(K.$$.fragment,s),h(ss.$$.fragment,s),h(as.$$.fragment,s),h(ns.$$.fragment,s),h(es.$$.fragment,s),h(is.$$.fragment,s),h(cs.$$.fragment,s),h(rs.$$.fragment,s),h(xs.$$.fragment,s),h(Us.$$.fragment,s),h(ds.$$.fragment,s),h(Ts.$$.fragment,s),h(Cs.$$.fragment,s),h(fs.$$.fragment,s),h(Is.$$.fragment,s),h(Vs.$$.fragment,s),h(Zs.$$.fragment,s),h(As.$$.fragment,s),h(Xs.$$.fragment,s),h(Ns.$$.fragment,s),h(Es.$$.fragment,s),h(Bs.$$.fragment,s),h(zs.$$.fragment,s),h(Ss.$$.fragment,s),h(Ds.$$.fragment,s),ba=!0)},o(s){o(w.$$.fragment,s),o(J.$$.fragment,s),o(T.$$.fragment,s),o(k.$$.fragment,s),o(Z.$$.fragment,s),o(Q.$$.fragment,s),o(X.$$.fragment,s),o(N.$$.fragment,s),o(G.$$.fragment,s),o(B.$$.fragment,s),o(W.$$.fragment,s),o(Y.$$.fragment,s),o(D.$$.fragment,s),o(O.$$.fragment,s),o(K.$$.fragment,s),o(ss.$$.fragment,s),o(as.$$.fragment,s),o(ns.$$.fragment,s),o(es.$$.fragment,s),o(is.$$.fragment,s),o(cs.$$.fragment,s),o(rs.$$.fragment,s),o(xs.$$.fragment,s),o(Us.$$.fragment,s),o(ds.$$.fragment,s),o(Ts.$$.fragment,s),o(Cs.$$.fragment,s),o(fs.$$.fragment,s),o(Is.$$.fragment,s),o(Vs.$$.fragment,s),o(Zs.$$.fragment,s),o(As.$$.fragment,s),o(Xs.$$.fragment,s),o(Ns.$$.fragment,s),o(Es.$$.fragment,s),o(Bs.$$.fragment,s),o(zs.$$.fragment,s),o(Ss.$$.fragment,s),o(Ds.$$.fragment,s),ba=!1},d(s){s&&(a(b),a(y),a(d),a(m),a(Ps),a(f),a(Ks),a(I),a(sl),a(_),a(ll),a(al),a(V),a(nl),a(R),a(tl),a(el),a(pl),a(il),a(A),a(cl),a($),a(jl),a(rl),a(v),a(Ml),a(hl),a(ol),a(E),a(xl),a(gl),a(F),a(Ul),a(ml),a(z),a(dl),a(q),a(Jl),a(S),a(yl),a(wl),a(H),a(Tl),a(ul),a(L),a(Cl),a(bl),a(P),a(fl),a(Il),a(u),a(_l),a(Vl),a(ls),a(Rl),a(kl),a(Zl),a(ts),a(Ql),a(Al),a(ps),a($l),a(Xl),a(vl),a(js),a(Nl),a(Gl
),a(Ms),a(El),a(hs),a(Bl),a(os),a(Fl),a(Wl),a(gs),a(zl),a(ql),a(ms),a(Sl),a(Yl),a(Js),a(Hl),a(ys),a(Dl),a(ws),a(Ll),a(Ol),a(us),a(Pl),a(Kl),a(bs),a(sa),a(la),a(aa),a(_s),a(na),a(ta),a(Rs),a(ea),a(ks),a(pa),a(ia),a(Qs),a(ca),a(ja),a($s),a(ra),a(Ma),a(vs),a(ha),a(oa),a(Gs),a(xa),a(ga),a(Ua),a(Fs),a(ma),a(Ws),a(da),a(Ja),a(qs),a(ya),a(wa),a(Ys),a(Ta),a(Hs),a(ua),a(Ca),a(Ls)),a(U),x(w,s),x(J,s),x(T,s),x(k,s),x(Z,s),x(Q,s),x(X,s),x(N,s),x(G,s),x(B,s),x(W,s),x(Y,s),x(D,s),x(O,s),x(K,s),x(ss,s),x(as,s),x(ns,s),x(es,s),x(is,s),x(cs,s),x(rs,s),x(xs,s),x(Us,s),x(ds,s),x(Ts,s),x(Cs,s),x(fs,s),x(Is,s),x(Vs,s),x(Zs,s),x(As,s),x(Xs,s),x(Ns,s),x(Es,s),x(Bs,s),x(zs,s),x(Ss,s),x(Ds,s)}}}const Rn='{"title":"문서 질의 응답(Document Question Answering)","local":"document_question_answering","sections":[{"title":"데이터 불러오기","local":"load-the-data","sections":[],"depth":2},{"title":"데이터 전처리","local":"preprocess-the-data","sections":[{"title":"문서 이미지 전처리","local":"preprocessing-document-images","sections":[],"depth":3},{"title":"텍스트 데이터 전처리","local":"preprocessing-text-data","sections":[],"depth":3}],"depth":2},{"title":"평가","local":"evaluation","sections":[],"depth":2},{"title":"훈련","local":"train","sections":[],"depth":2},{"title":"추론","local":"inference","sections":[],"depth":2}],"depth":1}';function kn(Os){return yn(()=>{new URLSearchParams(window.location.search).get("fw")}),[]}class Nn extends Tn{constructor(U){super(),un(this,U,kn,Vn,Jn,{})}}export{Nn as component}; | |
Xet Storage Details
- Size:
- 88.9 kB
- Xet hash:
- 782442d6cb8d2d0c0e42e97ebe192472618e8a195b57c527cf4d0fd8c2910219
·
Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.