Buckets:

hf-doc-build/doc-dev / transformers /main /en /tasks /document_question_answering.html
rtrm's picture
download
raw
107 kB
<meta charset="utf-8" /><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Document Question Answering&quot;,&quot;local&quot;:&quot;document-question-answering&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Load the data&quot;,&quot;local&quot;:&quot;load-the-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Preprocess the data&quot;,&quot;local&quot;:&quot;preprocess-the-data&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Preprocessing document images&quot;,&quot;local&quot;:&quot;preprocessing-document-images&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Preprocessing text data&quot;,&quot;local&quot;:&quot;preprocessing-text-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Evaluation&quot;,&quot;local&quot;:&quot;evaluation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Train&quot;,&quot;local&quot;:&quot;train&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Inference&quot;,&quot;local&quot;:&quot;inference&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}">
<link href="/docs/transformers/main/en/_app/immutable/assets/0.e3b0c442.css" rel="modulepreload">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/scheduler.25b97de1.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/singletons.0f2b7d5f.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.e188933d.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/paths.3d04d2c6.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/index.d9030fc9.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/0.026d2fdd.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/each.e59479a4.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/nodes/401.ab8a29e2.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/Tip.baa67368.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/CodeBlock.e6cd0d95.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/DocNotebookDropdown.5ea6cb78.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/globals.7f7f1b26.js">
<link rel="modulepreload" href="/docs/transformers/main/en/_app/immutable/chunks/EditOnGithub.91d95064.js"><!-- HEAD_svelte-u9bgzb_START --><meta name="hf:doc:metadata" content="{&quot;title&quot;:&quot;Document Question Answering&quot;,&quot;local&quot;:&quot;document-question-answering&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Load the data&quot;,&quot;local&quot;:&quot;load-the-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Preprocess the data&quot;,&quot;local&quot;:&quot;preprocess-the-data&quot;,&quot;sections&quot;:[{&quot;title&quot;:&quot;Preprocessing document images&quot;,&quot;local&quot;:&quot;preprocessing-document-images&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3},{&quot;title&quot;:&quot;Preprocessing text data&quot;,&quot;local&quot;:&quot;preprocessing-text-data&quot;,&quot;sections&quot;:[],&quot;depth&quot;:3}],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Evaluation&quot;,&quot;local&quot;:&quot;evaluation&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Train&quot;,&quot;local&quot;:&quot;train&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2},{&quot;title&quot;:&quot;Inference&quot;,&quot;local&quot;:&quot;inference&quot;,&quot;sections&quot;:[],&quot;depth&quot;:2}],&quot;depth&quot;:1}"><!-- HEAD_svelte-u9bgzb_END --> <p></p> <h1 class="relative group"><a id="document-question-answering" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#document-question-answering"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 
40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Document Question Answering</span></h1> <div class="flex space-x-1 absolute z-10 right-0 top-0"> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Colab" class="!m-0" src="https://colab.research.google.com/assets/colab-badge.svg"> </button> </div> <div class="relative colab-dropdown "> <button class=" " type="button"> <img alt="Open In Studio Lab" class="!m-0" src="https://studiolab.sagemaker.aws/studiolab.svg"> </button> </div></div> <p data-svelte-h="svelte-1c1m6de">Document Question Answering, also referred to as Document Visual Question Answering, is a task that involves providing
answers to questions posed about document images. The input to models supporting this task is typically a combination of an image and
a question, and the output is an answer expressed in natural language. These models utilize multiple modalities, including
text, the positions of words (bounding boxes), and the image itself.</p> <p data-svelte-h="svelte-ku8orh">This guide illustrates how to:</p> <ul data-svelte-h="svelte-1g8eree"><li>Fine-tune <a href="../model_doc/layoutlmv2">LayoutLMv2</a> on the <a href="https://huggingface.co/datasets/nielsr/docvqa_1200_examples_donut" rel="nofollow">DocVQA dataset</a>.</li> <li>Use your fine-tuned model for inference.</li></ul> <div class="course-tip bg-gradient-to-br dark:bg-gradient-to-r before:border-green-500 dark:before:border-green-800 from-green-50 dark:from-gray-900 to-white dark:to-gray-950 border border-green-50 text-green-700 dark:text-gray-400"><p data-svelte-h="svelte-1nrnqa3">To see all architectures and checkpoints compatible with this task, we recommend checking the <a href="https://huggingface.co/tasks/document-question-answering" rel="nofollow">task page</a>.</p></div> <p data-svelte-h="svelte-1svbrv5">LayoutLMv2 solves the document question-answering task by adding a question-answering head on top of the final hidden
states of the tokens, to predict the positions of the start and end tokens of the
answer. In other words, the problem is treated as extractive question answering: given the context, extract which piece
of information answers the question. The context comes from the output of an OCR engine, here it is Google’s Tesseract.</p> <p data-svelte-h="svelte-17fjxql">Before you begin, make sure you have all the necessary libraries installed. LayoutLMv2 depends on detectron2, torchvision and tesseract.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install -q transformers datasets<!-- HTML_TAG_END --></pre></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" 
height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->pip install <span class="hljs-string">&#x27;git+https://github.com/facebookresearch/detectron2.git&#x27;</span>
pip install torchvision<!-- HTML_TAG_END --></pre></div> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START -->sudo apt install tesseract-ocr
pip install -q pytesseract<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-hsz112">Once you have installed all of the dependencies, restart your runtime.</p> <p data-svelte-h="svelte-1yqpblu">We encourage you to share your model with the community. Log in to your Hugging Face account to upload it to the 🤗 Hub.
When prompted, enter your token to log in:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> huggingface_hub <span class="hljs-keyword">import</span> notebook_login
<span class="hljs-meta">&gt;&gt;&gt; </span>notebook_login()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1us2g34">Let’s define some global variables.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>model_checkpoint = <span class="hljs-string">&quot;microsoft/layoutlmv2-base-uncased&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>batch_size = <span class="hljs-number">4</span><!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="load-the-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#load-the-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Load the data</span></h2> <p data-svelte-h="svelte-xkaeyi">In this guide we use a small sample of preprocessed DocVQA that you can find on 🤗 Hub. If you’d like to use the full
DocVQA dataset, you can register and download it on <a href="https://rrc.cvc.uab.es/?ch=17" rel="nofollow">DocVQA homepage</a>. If you do so, to
proceed with this guide check out <a href="https://huggingface.co/docs/datasets/loading#local-and-remote-files" rel="nofollow">how to load files into a 🤗 dataset</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> datasets <span class="hljs-keyword">import</span> load_dataset
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset = load_dataset(<span class="hljs-string">&quot;nielsr/docvqa_1200_examples&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>dataset
DatasetDict({
    train: Dataset({
        features: [<span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;image&#x27;</span>, <span class="hljs-string">&#x27;query&#x27;</span>, <span class="hljs-string">&#x27;answers&#x27;</span>, <span class="hljs-string">&#x27;words&#x27;</span>, <span class="hljs-string">&#x27;bounding_boxes&#x27;</span>, <span class="hljs-string">&#x27;answer&#x27;</span>],
        num_rows: <span class="hljs-number">1000</span>
    })
    test: Dataset({
        features: [<span class="hljs-string">&#x27;id&#x27;</span>, <span class="hljs-string">&#x27;image&#x27;</span>, <span class="hljs-string">&#x27;query&#x27;</span>, <span class="hljs-string">&#x27;answers&#x27;</span>, <span class="hljs-string">&#x27;words&#x27;</span>, <span class="hljs-string">&#x27;bounding_boxes&#x27;</span>, <span class="hljs-string">&#x27;answer&#x27;</span>],
        num_rows: <span class="hljs-number">200</span>
    })
})<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-18ggx10">As you can see, the dataset is split into train and test sets already. Take a look at a random example to familiarize
yourself with the features.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset[<span class="hljs-string">&quot;train&quot;</span>].features<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1fi388d">Here’s what the individual fields represent:</p> <ul data-svelte-h="svelte-12b5dxa"><li><code>id</code>: the example’s id</li> <li><code>image</code>: a PIL.Image.Image object containing the document image</li> <li><code>query</code>: the question string - natural language asked question, in several languages</li> <li><code>answers</code>: a list of correct answers provided by human annotators</li> <li><code>words</code> and <code>bounding_boxes</code>: the results of OCR, which we will not use here</li> <li><code>answer</code>: an answer matched by a different model which 
we will not use here</li></ul> <p data-svelte-h="svelte-1h0f0qo">Let’s keep only the English questions and drop the <code>answer</code> feature, which appears to contain predictions by another model.
We’ll also take the first of the answers from the set provided by the annotators. Alternatively, you can randomly sample it.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = dataset.<span class="hljs-built_in">map</span>(<span class="hljs-keyword">lambda</span> example: {<span class="hljs-string">&quot;question&quot;</span>: example[<span class="hljs-string">&quot;query&quot;</span>][<span class="hljs-string">&quot;en&quot;</span>]}, remove_columns=[<span class="hljs-string">&quot;query&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = updated_dataset.<span class="hljs-built_in">map</span>(
<span class="hljs-meta">... </span> <span class="hljs-keyword">lambda</span> example: {<span class="hljs-string">&quot;answer&quot;</span>: example[<span class="hljs-string">&quot;answers&quot;</span>][<span class="hljs-number">0</span>]}, remove_columns=[<span class="hljs-string">&quot;answer&quot;</span>, <span class="hljs-string">&quot;answers&quot;</span>]
<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-x5p0j2">Note that the LayoutLMv2 checkpoint that we use in this guide has been trained with <code>max_position_embeddings = 512</code> (you can
find this information in the <a href="https://huggingface.co/microsoft/layoutlmv2-base-uncased/blob/main/config.json#L18" rel="nofollow">checkpoint’s <code>config.json</code> file</a>).
We could truncate the examples, but to avoid the situation where the answer is at the end of a long document and ends up truncated,
here we’ll remove the few examples whose tokenized input is likely to end up longer than 512 tokens.
If most of the documents in your dataset are long, you can implement a sliding window strategy - check out <a href="https://github.com/huggingface/notebooks/blob/main/examples/question_answering.ipynb" rel="nofollow">this notebook</a> for details.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = updated_dataset.<span class="hljs-built_in">filter</span>(<span class="hljs-keyword">lambda</span> x: <span class="hljs-built_in">len</span>(x[<span class="hljs-string">&quot;words&quot;</span>]) + <span class="hljs-built_in">len</span>(x[<span class="hljs-string">&quot;question&quot;</span>].split()) &lt; <span class="hljs-number">512</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-ydtan0">At this point let’s also remove the OCR features from this dataset. 
These are a result of OCR for fine-tuning a different
model. They would still require some processing if we wanted to use them, as they do not match the input requirements
of the model we use in this guide. Instead, we can use the <a href="/docs/transformers/main/en/model_doc/layoutlmv2#transformers.LayoutLMv2Processor">LayoutLMv2Processor</a> on the original data for both OCR and
tokenization. This way we’ll get inputs that match the model’s expected input format. If you want to process images manually,
check out the <a href="../model_doc/layoutlmv2"><code>LayoutLMv2</code> model documentation</a> to learn what input format the model expects.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = updated_dataset.remove_columns(<span class="hljs-string">&quot;words&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset = updated_dataset.remove_columns(<span class="hljs-string">&quot;bounding_boxes&quot;</span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1vy80t">Finally, the data exploration won’t be complete if we don’t peek at an image example.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>updated_dataset[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">11</span>][<span class="hljs-string">&quot;image&quot;</span>]<!-- HTML_TAG_END --></pre></div> <div class="flex justify-center" data-svelte-h="svelte-q63tj1"><img src="https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/transformers/tasks/docvqa_example.jpg" alt="DocVQA Image Example"></div> <h2 class="relative 
group"><a id="preprocess-the-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#preprocess-the-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Preprocess the data</span></h2> <p data-svelte-h="svelte-1hlj9yo">The Document Question Answering task is a multimodal task, and you need to make sure that the inputs from each modality
are preprocessed according to the model’s expectations. Let’s start by loading the <a href="/docs/transformers/main/en/model_doc/layoutlmv2#transformers.LayoutLMv2Processor">LayoutLMv2Processor</a>, which internally combines an image processor that can handle image data and a tokenizer that can encode text data.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(model_checkpoint)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="preprocessing-document-images" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#preprocessing-document-images"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Preprocessing document images</span></h3> <p data-svelte-h="svelte-1u7369n">First, let’s prepare the document images for the model with the help of the <code>image_processor</code> from the processor.
By default, the image processor resizes the images to 224x224, makes sure they have the correct order of color channels,
and applies OCR with Tesseract to get words and normalized bounding boxes. In this tutorial, all of these defaults are exactly what we need.
Write a function that applies the default image processing to a batch of images and returns the results of OCR.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>image_processor = processor.image_processor
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">get_ocr_words_and_boxes</span>(<span class="hljs-params">examples</span>):
<span class="hljs-meta">... </span> images = [image.convert(<span class="hljs-string">&quot;RGB&quot;</span>) <span class="hljs-keyword">for</span> image <span class="hljs-keyword">in</span> examples[<span class="hljs-string">&quot;image&quot;</span>]]
<span class="hljs-meta">... </span> encoded_inputs = image_processor(images)
<span class="hljs-meta">... </span> examples[<span class="hljs-string">&quot;image&quot;</span>] = encoded_inputs.pixel_values
<span class="hljs-meta">... </span> examples[<span class="hljs-string">&quot;words&quot;</span>] = encoded_inputs.words
<span class="hljs-meta">... </span> examples[<span class="hljs-string">&quot;boxes&quot;</span>] = encoded_inputs.boxes
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> examples<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1um3786">To apply this preprocessing to the entire dataset in a fast way, use <a href="https://huggingface.co/docs/datasets/main/en/package_reference/main_classes#datasets.Dataset.map" rel="nofollow">map</a>.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>dataset_with_ocr = updated_dataset.<span class="hljs-built_in">map</span>(get_ocr_words_and_boxes, batched=<span class="hljs-literal">True</span>, batch_size=<span class="hljs-number">2</span>)<!-- HTML_TAG_END --></pre></div> <h3 class="relative group"><a id="preprocessing-text-data" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 
with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#preprocessing-text-data"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Preprocessing text data</span></h3> <p data-svelte-h="svelte-dfarfe">Once we have applied OCR to the images, we need to encode the text part of the dataset to prepare it for the model.
This involves converting the words and boxes that we got in the previous step to token-level <code>input_ids</code>, <code>attention_mask</code>,
<code>token_type_ids</code> and <code>bbox</code>. For preprocessing text, we’ll need the <code>tokenizer</code> from the processor.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer = processor.tokenizer<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-12sojfo">On top of the preprocessing mentioned above, we also need to add the labels for the model. For <code>xxxForQuestionAnswering</code> models
in 🤗 Transformers, the labels consist of the <code>start_positions</code> and <code>end_positions</code>, indicating which token is at the
start and which token is at the end of the answer.</p> <p data-svelte-h="svelte-1kkerbo">Let’s start with that. Define a helper function that can find a sublist (the answer split into words) in a larger list (the words list).</p> <p data-svelte-h="svelte-1wppb4o">This function will take two lists as input, <code>words_list</code> and <code>answer_list</code>. It will then iterate over the <code>words_list</code> and check
if the current word in the <code>words_list</code> (<code>words_list[i]</code>) is equal to the first word of <code>answer_list</code> (<code>answer_list[0]</code>) and if
the sublist of <code>words_list</code> starting from the current word and of the same length as <code>answer_list</code> is equal to <code>answer_list</code>.
If this condition is true, it means that a match has been found, and the function will record the match, its starting index (<code>idx</code>),
and its ending index (<code>idx + len(answer_list) - 1</code>). If more than one match is found, the function will return only the first one.
If no match is found, the function returns (<code>None</code>, 0, and 0).</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">subfinder</span>(<span class="hljs-params">words_list, answer_list</span>):
<span class="hljs-meta">... </span> matches = []
<span class="hljs-meta">... </span> start_indices = []
<span class="hljs-meta">... </span> end_indices = []
<span class="hljs-meta">... </span> <span class="hljs-keyword">for</span> idx, i <span class="hljs-keyword">in</span> <span class="hljs-built_in">enumerate</span>(<span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(words_list))):
<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> words_list[i] == answer_list[<span class="hljs-number">0</span>] <span class="hljs-keyword">and</span> words_list[i : i + <span class="hljs-built_in">len</span>(answer_list)] == answer_list:
<span class="hljs-meta">... </span> matches.append(answer_list)
<span class="hljs-meta">... </span> start_indices.append(idx)
<span class="hljs-meta">... </span> end_indices.append(idx + <span class="hljs-built_in">len</span>(answer_list) - <span class="hljs-number">1</span>)
<span class="hljs-meta">... </span> <span class="hljs-keyword">if</span> matches:
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> matches[<span class="hljs-number">0</span>], start_indices[<span class="hljs-number">0</span>], end_indices[<span class="hljs-number">0</span>]
<span class="hljs-meta">... </span> <span class="hljs-keyword">else</span>:
<span class="hljs-meta">... </span> <span class="hljs-keyword">return</span> <span class="hljs-literal">None</span>, <span class="hljs-number">0</span>, <span class="hljs-number">0</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19pibjd">To illustrate how this function finds the position of the answer, let’s use it on an example:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>example = dataset_with_ocr[<span class="hljs-string">&quot;train&quot;</span>][<span class="hljs-number">1</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>words = [word.lower() <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> example[<span class="hljs-string">&quot;words&quot;</span>]]
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">match</span>, word_idx_start, word_idx_end = subfinder(words, example[<span class="hljs-string">&quot;answer&quot;</span>].lower().split())
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Question: &quot;</span>, example[<span class="hljs-string">&quot;question&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Words:&quot;</span>, words)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;Answer: &quot;</span>, example[<span class="hljs-string">&quot;answer&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;start_index&quot;</span>, word_idx_start)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(<span class="hljs-string">&quot;end_index&quot;</span>, word_idx_end)
Question: Who <span class="hljs-keyword">is</span> <span class="hljs-keyword">in</span> cc <span class="hljs-keyword">in</span> this letter?
Words: [<span class="hljs-string">&#x27;wie&#x27;</span>, <span class="hljs-string">&#x27;baw&#x27;</span>, <span class="hljs-string">&#x27;brown&#x27;</span>, <span class="hljs-string">&#x27;&amp;&#x27;</span>, <span class="hljs-string">&#x27;williamson&#x27;</span>, <span class="hljs-string">&#x27;tobacco&#x27;</span>, <span class="hljs-string">&#x27;corporation&#x27;</span>, <span class="hljs-string">&#x27;research&#x27;</span>, <span class="hljs-string">&#x27;&amp;&#x27;</span>, <span class="hljs-string">&#x27;development&#x27;</span>, <span class="hljs-string">&#x27;internal&#x27;</span>, <span class="hljs-string">&#x27;correspondence&#x27;</span>, <span class="hljs-string">&#x27;to:&#x27;</span>, <span class="hljs-string">&#x27;r.&#x27;</span>, <span class="hljs-string">&#x27;h.&#x27;</span>, <span class="hljs-string">&#x27;honeycutt&#x27;</span>, <span class="hljs-string">&#x27;ce:&#x27;</span>, <span class="hljs-string">&#x27;t.f.&#x27;</span>, <span class="hljs-string">&#x27;riehl&#x27;</span>, <span class="hljs-string">&#x27;from:&#x27;</span>, <span class="hljs-string">&#x27;.&#x27;</span>, <span class="hljs-string">&#x27;c.j.&#x27;</span>, <span class="hljs-string">&#x27;cook&#x27;</span>, <span class="hljs-string">&#x27;date:&#x27;</span>, <span class="hljs-string">&#x27;may&#x27;</span>, <span class="hljs-string">&#x27;8,&#x27;</span>, <span class="hljs-string">&#x27;1995&#x27;</span>, <span class="hljs-string">&#x27;subject:&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;existing&#x27;</span>, <span class="hljs-string">&#x27;brainstorming&#x27;</span>, <span class="hljs-string">&#x27;ideas/483&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;major&#x27;</span>, <span class="hljs-string">&#x27;function&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span 
class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;product&#x27;</span>, <span class="hljs-string">&#x27;innovation&#x27;</span>, <span class="hljs-string">&#x27;graup&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;develop&#x27;</span>, <span class="hljs-string">&#x27;marketable&#x27;</span>, <span class="hljs-string">&#x27;nove!&#x27;</span>, <span class="hljs-string">&#x27;products&#x27;</span>, <span class="hljs-string">&#x27;that&#x27;</span>, <span class="hljs-string">&#x27;would&#x27;</span>, <span class="hljs-string">&#x27;be&#x27;</span>, <span class="hljs-string">&#x27;profitable&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;manufacture&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;sell.&#x27;</span>, <span class="hljs-string">&#x27;novel&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;defined&#x27;</span>, <span class="hljs-string">&#x27;as:&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;new&#x27;</span>, <span class="hljs-string">&#x27;kind,&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;different&#x27;</span>, <span class="hljs-string">&#x27;from&#x27;</span>, <span class="hljs-string">&#x27;anything&#x27;</span>, <span class="hljs-string">&#x27;seen&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;known&#x27;</span>, <span class="hljs-string">&#x27;before.&#x27;</span>, <span class="hljs-string">&#x27;innovation&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;defined&#x27;</span>, <span class="hljs-string">&#x27;as:&#x27;</span>, <span 
class="hljs-string">&#x27;something&#x27;</span>, <span class="hljs-string">&#x27;new&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;different&#x27;</span>, <span class="hljs-string">&#x27;introduced;&#x27;</span>, <span class="hljs-string">&#x27;act&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;innovating;&#x27;</span>, <span class="hljs-string">&#x27;introduction&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;new&#x27;</span>, <span class="hljs-string">&#x27;things&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;methods.&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;products&#x27;</span>, <span class="hljs-string">&#x27;may&#x27;</span>, <span class="hljs-string">&#x27;incorporate&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;latest&#x27;</span>, <span class="hljs-string">&#x27;technologies,&#x27;</span>, <span class="hljs-string">&#x27;materials&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;know-how&#x27;</span>, <span class="hljs-string">&#x27;available&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;give&#x27;</span>, <span class="hljs-string">&#x27;then&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;unique&#x27;</span>, <span class="hljs-string">&#x27;taste&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;look.&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;first&#x27;</span>, <span class="hljs-string">&#x27;task&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span 
class="hljs-string">&#x27;product&#x27;</span>, <span class="hljs-string">&#x27;innovation&#x27;</span>, <span class="hljs-string">&#x27;group&#x27;</span>, <span class="hljs-string">&#x27;was&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;assemble,&#x27;</span>, <span class="hljs-string">&#x27;review&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;categorize&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;list&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;existing&#x27;</span>, <span class="hljs-string">&#x27;brainstorming&#x27;</span>, <span class="hljs-string">&#x27;ideas.&#x27;</span>, <span class="hljs-string">&#x27;ideas&#x27;</span>, <span class="hljs-string">&#x27;were&#x27;</span>, <span class="hljs-string">&#x27;grouped&#x27;</span>, <span class="hljs-string">&#x27;into&#x27;</span>, <span class="hljs-string">&#x27;two&#x27;</span>, <span class="hljs-string">&#x27;major&#x27;</span>, <span class="hljs-string">&#x27;categories&#x27;</span>, <span class="hljs-string">&#x27;labeled&#x27;</span>, <span class="hljs-string">&#x27;appearance&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;taste/aroma.&#x27;</span>, <span class="hljs-string">&#x27;these&#x27;</span>, <span class="hljs-string">&#x27;categories&#x27;</span>, <span class="hljs-string">&#x27;are&#x27;</span>, <span class="hljs-string">&#x27;used&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;novel&#x27;</span>, <span class="hljs-string">&#x27;products&#x27;</span>, <span class="hljs-string">&#x27;that&#x27;</span>, <span class="hljs-string">&#x27;may&#x27;</span>, <span class="hljs-string">&#x27;differ&#x27;</span>, <span class="hljs-string">&#x27;from&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, 
<span class="hljs-string">&#x27;visual&#x27;</span>, <span class="hljs-string">&#x27;and/or&#x27;</span>, <span class="hljs-string">&#x27;taste/aroma&#x27;</span>, <span class="hljs-string">&#x27;point&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;view&#x27;</span>, <span class="hljs-string">&#x27;compared&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;canventional&#x27;</span>, <span class="hljs-string">&#x27;cigarettes.&#x27;</span>, <span class="hljs-string">&#x27;other&#x27;</span>, <span class="hljs-string">&#x27;categories&#x27;</span>, <span class="hljs-string">&#x27;include&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;combination&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;the&#x27;</span>, <span class="hljs-string">&#x27;above,&#x27;</span>, <span class="hljs-string">&#x27;filters,&#x27;</span>, <span class="hljs-string">&#x27;packaging&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;brand&#x27;</span>, <span class="hljs-string">&#x27;extensions.&#x27;</span>, <span class="hljs-string">&#x27;appearance&#x27;</span>, <span class="hljs-string">&#x27;this&#x27;</span>, <span class="hljs-string">&#x27;category&#x27;</span>, <span class="hljs-string">&#x27;is&#x27;</span>, <span class="hljs-string">&#x27;used&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;novel&#x27;</span>, <span class="hljs-string">&#x27;cigarette&#x27;</span>, <span class="hljs-string">&#x27;constructions&#x27;</span>, <span class="hljs-string">&#x27;that&#x27;</span>, <span class="hljs-string">&#x27;yield&#x27;</span>, <span class="hljs-string">&#x27;visually&#x27;</span>, <span class="hljs-string">&#x27;different&#x27;</span>, <span class="hljs-string">&#x27;products&#x27;</span>, <span 
class="hljs-string">&#x27;with&#x27;</span>, <span class="hljs-string">&#x27;minimal&#x27;</span>, <span class="hljs-string">&#x27;changes&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>, <span class="hljs-string">&#x27;smoke&#x27;</span>, <span class="hljs-string">&#x27;chemistry&#x27;</span>, <span class="hljs-string">&#x27;two&#x27;</span>, <span class="hljs-string">&#x27;cigarettes&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>, <span class="hljs-string">&#x27;cne.&#x27;</span>, <span class="hljs-string">&#x27;emulti-plug&#x27;</span>, <span class="hljs-string">&#x27;te&#x27;</span>, <span class="hljs-string">&#x27;build&#x27;</span>, <span class="hljs-string">&#x27;yaur&#x27;</span>, <span class="hljs-string">&#x27;awn&#x27;</span>, <span class="hljs-string">&#x27;cigarette.&#x27;</span>, <span class="hljs-string">&#x27;eswitchable&#x27;</span>, <span class="hljs-string">&#x27;menthol&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;non&#x27;</span>, <span class="hljs-string">&#x27;menthol&#x27;</span>, <span class="hljs-string">&#x27;cigarette.&#x27;</span>, <span class="hljs-string">&#x27;*cigarettes&#x27;</span>, <span class="hljs-string">&#x27;with&#x27;</span>, <span class="hljs-string">&#x27;interspaced&#x27;</span>, <span class="hljs-string">&#x27;perforations&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;enable&#x27;</span>, <span class="hljs-string">&#x27;smoker&#x27;</span>, <span class="hljs-string">&#x27;to&#x27;</span>, <span class="hljs-string">&#x27;separate&#x27;</span>, <span class="hljs-string">&#x27;unburned&#x27;</span>, <span class="hljs-string">&#x27;section&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;future&#x27;</span>, <span class="hljs-string">&#x27;smoking.&#x27;</span>, <span class="hljs-string">&#x27;«short&#x27;</span>, <span 
class="hljs-string">&#x27;cigarette,&#x27;</span>, <span class="hljs-string">&#x27;tobacco&#x27;</span>, <span class="hljs-string">&#x27;section&#x27;</span>, <span class="hljs-string">&#x27;30&#x27;</span>, <span class="hljs-string">&#x27;mm.&#x27;</span>, <span class="hljs-string">&#x27;«extremely&#x27;</span>, <span class="hljs-string">&#x27;fast&#x27;</span>, <span class="hljs-string">&#x27;buming&#x27;</span>, <span class="hljs-string">&#x27;cigarette.&#x27;</span>, <span class="hljs-string">&#x27;«novel&#x27;</span>, <span class="hljs-string">&#x27;cigarette&#x27;</span>, <span class="hljs-string">&#x27;constructions&#x27;</span>, <span class="hljs-string">&#x27;that&#x27;</span>, <span class="hljs-string">&#x27;permit&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;significant&#x27;</span>, <span class="hljs-string">&#x27;reduction&#x27;</span>, <span class="hljs-string">&#x27;iretobacco&#x27;</span>, <span class="hljs-string">&#x27;weight&#x27;</span>, <span class="hljs-string">&#x27;while&#x27;</span>, <span class="hljs-string">&#x27;maintaining&#x27;</span>, <span class="hljs-string">&#x27;smoking&#x27;</span>, <span class="hljs-string">&#x27;mechanics&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;visual&#x27;</span>, <span class="hljs-string">&#x27;characteristics.&#x27;</span>, <span class="hljs-string">&#x27;higher&#x27;</span>, <span class="hljs-string">&#x27;basis&#x27;</span>, <span class="hljs-string">&#x27;weight&#x27;</span>, <span class="hljs-string">&#x27;paper:&#x27;</span>, <span class="hljs-string">&#x27;potential&#x27;</span>, <span class="hljs-string">&#x27;reduction&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>, <span class="hljs-string">&#x27;tobacco&#x27;</span>, <span class="hljs-string">&#x27;weight.&#x27;</span>, <span class="hljs-string">&#x27;«more&#x27;</span>, <span class="hljs-string">&#x27;rigid&#x27;</span>, <span 
class="hljs-string">&#x27;tobacco&#x27;</span>, <span class="hljs-string">&#x27;column;&#x27;</span>, <span class="hljs-string">&#x27;stiffing&#x27;</span>, <span class="hljs-string">&#x27;agent&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;tobacco;&#x27;</span>, <span class="hljs-string">&#x27;e.g.&#x27;</span>, <span class="hljs-string">&#x27;starch&#x27;</span>, <span class="hljs-string">&#x27;*colored&#x27;</span>, <span class="hljs-string">&#x27;tow&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;cigarette&#x27;</span>, <span class="hljs-string">&#x27;papers;&#x27;</span>, <span class="hljs-string">&#x27;seasonal&#x27;</span>, <span class="hljs-string">&#x27;promotions,&#x27;</span>, <span class="hljs-string">&#x27;e.g.&#x27;</span>, <span class="hljs-string">&#x27;pastel&#x27;</span>, <span class="hljs-string">&#x27;colored&#x27;</span>, <span class="hljs-string">&#x27;cigarettes&#x27;</span>, <span class="hljs-string">&#x27;for&#x27;</span>, <span class="hljs-string">&#x27;easter&#x27;</span>, <span class="hljs-string">&#x27;or&#x27;</span>, <span class="hljs-string">&#x27;in&#x27;</span>, <span class="hljs-string">&#x27;an&#x27;</span>, <span class="hljs-string">&#x27;ebony&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;ivory&#x27;</span>, <span class="hljs-string">&#x27;brand&#x27;</span>, <span class="hljs-string">&#x27;containing&#x27;</span>, <span class="hljs-string">&#x27;a&#x27;</span>, <span class="hljs-string">&#x27;mixture&#x27;</span>, <span class="hljs-string">&#x27;of&#x27;</span>, <span class="hljs-string">&#x27;all&#x27;</span>, <span class="hljs-string">&#x27;black&#x27;</span>, <span class="hljs-string">&#x27;(black&#x27;</span>, <span class="hljs-string">&#x27;paper&#x27;</span>, <span class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;tow)&#x27;</span>, <span 
class="hljs-string">&#x27;and&#x27;</span>, <span class="hljs-string">&#x27;ail&#x27;</span>, <span class="hljs-string">&#x27;white&#x27;</span>, <span class="hljs-string">&#x27;cigarettes.&#x27;</span>, <span class="hljs-string">&#x27;499150498&#x27;</span>]
Answer: T.F. Riehl
start_index <span class="hljs-number">17</span>
end_index <span class="hljs-number">18</span><!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-19lp6r8">Once examples are encoded, however, they will look like this:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>encoding = tokenizer(example[<span class="hljs-string">&quot;question&quot;</span>], example[<span class="hljs-string">&quot;words&quot;</span>], example[<span class="hljs-string">&quot;boxes&quot;</span>])
<span class="hljs-meta">&gt;&gt;&gt; </span>tokenizer.decode(encoding[<span class="hljs-string">&quot;input_ids&quot;</span>])
[CLS] who <span class="hljs-keyword">is</span> <span class="hljs-keyword">in</span> cc <span class="hljs-keyword">in</span> this letter? [SEP] wie baw brown &amp; williamson tobacco corporation research &amp; development ...<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1tk94l">We’ll need to find the position of the answer in the encoded input.</p> <ul data-svelte-h="svelte-zfehno"><li><code>token_type_ids</code> tells us which tokens are part of the question, and which ones are part of the document’s words.</li> <li><code>tokenizer.cls_token_id</code> will help find the special token at the beginning of the input.</li> <li><code>word_ids</code> will help match the answer found in the original <code>words</code> to the same answer in the full encoded input and determine
the start/end position of the answer in the encoded input.</li></ul> <p data-svelte-h="svelte-701rvg">With that in mind, let’s create a function to encode a batch of examples in the dataset:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">def</span> <span class="hljs-title function_">encode_dataset</span>(<span class="hljs-params">examples, max_length=<span class="hljs-number">512</span></span>):
<span class="hljs-meta">... </span>    questions = examples[<span class="hljs-string">&quot;question&quot;</span>]
<span class="hljs-meta">... </span>    words = examples[<span class="hljs-string">&quot;words&quot;</span>]
<span class="hljs-meta">... </span>    boxes = examples[<span class="hljs-string">&quot;boxes&quot;</span>]
<span class="hljs-meta">... </span>    answers = examples[<span class="hljs-string">&quot;answer&quot;</span>]
<span class="hljs-meta">... </span>    <span class="hljs-comment"># encode the batch of examples and initialize the start_positions and end_positions</span>
<span class="hljs-meta">... </span>    encoding = tokenizer(questions, words, boxes, max_length=max_length, padding=<span class="hljs-string">&quot;max_length&quot;</span>, truncation=<span class="hljs-literal">True</span>)
<span class="hljs-meta">... </span>    start_positions = []
<span class="hljs-meta">... </span>    end_positions = []
<span class="hljs-meta">... </span>    <span class="hljs-comment"># loop through the examples in the batch</span>
<span class="hljs-meta">... </span>    <span class="hljs-keyword">for</span> i <span class="hljs-keyword">in</span> <span class="hljs-built_in">range</span>(<span class="hljs-built_in">len</span>(questions)):
<span class="hljs-meta">... </span>        cls_index = encoding[<span class="hljs-string">&quot;input_ids&quot;</span>][i].index(tokenizer.cls_token_id)
<span class="hljs-meta">... </span>        <span class="hljs-comment"># find the position of the answer in example&#x27;s words</span>
<span class="hljs-meta">... </span>        words_example = [word.lower() <span class="hljs-keyword">for</span> word <span class="hljs-keyword">in</span> words[i]]
<span class="hljs-meta">... </span>        answer = answers[i]
<span class="hljs-meta">... </span>        <span class="hljs-keyword">match</span>, word_idx_start, word_idx_end = subfinder(words_example, answer.lower().split())
<span class="hljs-meta">... </span>        <span class="hljs-keyword">if</span> <span class="hljs-keyword">match</span>:
<span class="hljs-meta">... </span>            <span class="hljs-comment"># if match is found, use `token_type_ids` to find where words start in the encoding</span>
<span class="hljs-meta">... </span>            token_type_ids = encoding[<span class="hljs-string">&quot;token_type_ids&quot;</span>][i]
<span class="hljs-meta">... </span>            token_start_index = <span class="hljs-number">0</span>
<span class="hljs-meta">... </span>            <span class="hljs-keyword">while</span> token_type_ids[token_start_index] != <span class="hljs-number">1</span>:
<span class="hljs-meta">... </span>                token_start_index += <span class="hljs-number">1</span>
<span class="hljs-meta">... </span>            token_end_index = <span class="hljs-built_in">len</span>(encoding[<span class="hljs-string">&quot;input_ids&quot;</span>][i]) - <span class="hljs-number">1</span>
<span class="hljs-meta">... </span>            <span class="hljs-keyword">while</span> token_type_ids[token_end_index] != <span class="hljs-number">1</span>:
<span class="hljs-meta">... </span>                token_end_index -= <span class="hljs-number">1</span>
<span class="hljs-meta">... </span>            word_ids = encoding.word_ids(i)[token_start_index : token_end_index + <span class="hljs-number">1</span>]
<span class="hljs-meta">... </span>            start_position = cls_index
<span class="hljs-meta">... </span>            end_position = cls_index
<span class="hljs-meta">... </span>            <span class="hljs-comment"># loop over word_ids and increase `token_start_index` until it matches the answer position in words</span>
<span class="hljs-meta">... </span>            <span class="hljs-comment"># once it matches, save the `token_start_index` as the `start_position` of the answer in the encoding</span>
<span class="hljs-meta">... </span>            <span class="hljs-keyword">for</span> <span class="hljs-built_in">id</span> <span class="hljs-keyword">in</span> word_ids:
<span class="hljs-meta">... </span>                <span class="hljs-keyword">if</span> <span class="hljs-built_in">id</span> == word_idx_start:
<span class="hljs-meta">... </span>                    start_position = token_start_index
<span class="hljs-meta">... </span>                <span class="hljs-keyword">else</span>:
<span class="hljs-meta">... </span>                    token_start_index += <span class="hljs-number">1</span>
<span class="hljs-meta">... </span>            <span class="hljs-comment"># similarly loop over `word_ids` starting from the end to find the `end_position` of the answer</span>
<span class="hljs-meta">... </span>            <span class="hljs-keyword">for</span> <span class="hljs-built_in">id</span> <span class="hljs-keyword">in</span> word_ids[::-<span class="hljs-number">1</span>]:
<span class="hljs-meta">... </span>                <span class="hljs-keyword">if</span> <span class="hljs-built_in">id</span> == word_idx_end:
<span class="hljs-meta">... </span>                    end_position = token_end_index
<span class="hljs-meta">... </span>                <span class="hljs-keyword">else</span>:
<span class="hljs-meta">... </span>                    token_end_index -= <span class="hljs-number">1</span>
<span class="hljs-meta">... </span>            start_positions.append(start_position)
<span class="hljs-meta">... </span>            end_positions.append(end_position)
<span class="hljs-meta">... </span>        <span class="hljs-keyword">else</span>:
<span class="hljs-meta">... </span>            start_positions.append(cls_index)
<span class="hljs-meta">... </span>            end_positions.append(cls_index)
<span class="hljs-meta">... </span>    encoding[<span class="hljs-string">&quot;image&quot;</span>] = examples[<span class="hljs-string">&quot;image&quot;</span>]
<span class="hljs-meta">... </span>    encoding[<span class="hljs-string">&quot;start_positions&quot;</span>] = start_positions
<span class="hljs-meta">... </span>    encoding[<span class="hljs-string">&quot;end_positions&quot;</span>] = end_positions
<span class="hljs-meta">... </span>    <span class="hljs-keyword">return</span> encoding<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1ori799">Now that we have this preprocessing function, we can encode the entire dataset:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>encoded_train_dataset = dataset_with_ocr[<span class="hljs-string">&quot;train&quot;</span>].<span class="hljs-built_in">map</span>(
<span class="hljs-meta">... </span>    encode_dataset, batched=<span class="hljs-literal">True</span>, batch_size=<span class="hljs-number">2</span>, remove_columns=dataset_with_ocr[<span class="hljs-string">&quot;train&quot;</span>].column_names
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>encoded_test_dataset = dataset_with_ocr[<span class="hljs-string">&quot;test&quot;</span>].<span class="hljs-built_in">map</span>(
<span class="hljs-meta">... </span>    encode_dataset, batched=<span class="hljs-literal">True</span>, batch_size=<span class="hljs-number">2</span>, remove_columns=dataset_with_ocr[<span class="hljs-string">&quot;test&quot;</span>].column_names
<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-upxsp">Let’s check what the features of the encoded dataset look like:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>encoded_train_dataset.features
{<span class="hljs-string">&#x27;image&#x27;</span>: <span class="hljs-type">Sequence</span>(feature=<span class="hljs-type">Sequence</span>(feature=<span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">&#x27;uint8&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;input_ids&#x27;</span>: <span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">&#x27;int32&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;token_type_ids&#x27;</span>: <span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">&#x27;int8&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;attention_mask&#x27;</span>: <span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">&#x27;int8&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;bbox&#x27;</span>: <span class="hljs-type">Sequence</span>(feature=<span class="hljs-type">Sequence</span>(feature=Value(dtype=<span class="hljs-string">&#x27;int64&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>), length=-<span class="hljs-number">1</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;start_positions&#x27;</span>: Value(dtype=<span class="hljs-string">&#x27;int64&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>),
<span class="hljs-string">&#x27;end_positions&#x27;</span>: Value(dtype=<span class="hljs-string">&#x27;int64&#x27;</span>, <span class="hljs-built_in">id</span>=<span class="hljs-literal">None</span>)}<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="evaluation" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#evaluation"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Evaluation</span></h2> <p data-svelte-h="svelte-1ro13gx">Evaluation for document question answering requires a significant amount of postprocessing. To avoid taking up too much
of your time, this guide skips the evaluation step. The <a href="/docs/transformers/main/en/main_classes/trainer#transformers.Trainer">Trainer</a> still calculates the evaluation loss during training so
you’re not completely in the dark about your model’s performance. Extractive question answering is typically evaluated using F1/exact match.
If you’d like to implement it yourself, check out the <a href="https://huggingface.co/course/chapter7/7?fw=pt#postprocessing" rel="nofollow">Question Answering chapter</a>
of the Hugging Face course for inspiration.</p> <h2 class="relative group"><a id="train" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#train"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Train</span></h2> <p data-svelte-h="svelte-10f6ay">Congratulations! You’ve successfully navigated the toughest part of this guide and now you are ready to train your own model.
Training involves the following steps:</p> <ul data-svelte-h="svelte-pmt07p"><li>Load the model with <a href="/docs/transformers/main/en/model_doc/auto#transformers.AutoModelForDocumentQuestionAnswering">AutoModelForDocumentQuestionAnswering</a> using the same checkpoint as in the preprocessing.</li> <li>Define your training hyperparameters in <a href="/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a>.</li> <li>Define a function to batch examples together; here the <a href="/docs/transformers/main/en/main_classes/data_collator#transformers.DefaultDataCollator">DefaultDataCollator</a> will do just fine.</li> <li>Pass the training arguments to <a href="/docs/transformers/main/en/main_classes/trainer#transformers.Trainer">Trainer</a> along with the model, dataset, and data collator.</li> <li>Call <a href="/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train">train()</a> to finetune your model.</li></ul> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform 
-translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForDocumentQuestionAnswering
<span class="hljs-meta">&gt;&gt;&gt; </span>model = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-b9l6b1">In the <a href="/docs/transformers/main/en/main_classes/trainer#transformers.TrainingArguments">TrainingArguments</a> use <code>output_dir</code> to specify where to save your model, and configure hyperparameters as you see fit.
If you wish to share your model with the community, set <code>push_to_hub</code> to <code>True</code> (you must be signed in to Hugging Face to upload your model).
In this case the <code>output_dir</code> will also be the name of the repo where your model checkpoint will be pushed.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> TrainingArguments
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-comment"># REPLACE THIS WITH YOUR REPO ID</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>repo_id = <span class="hljs-string">&quot;MariaK/layoutlmv2-base-uncased_finetuned_docvqa&quot;</span>
<span class="hljs-meta">&gt;&gt;&gt; </span>training_args = TrainingArguments(
<span class="hljs-meta">... </span>    output_dir=repo_id,
<span class="hljs-meta">... </span>    per_device_train_batch_size=<span class="hljs-number">4</span>,
<span class="hljs-meta">... </span>    num_train_epochs=<span class="hljs-number">20</span>,
<span class="hljs-meta">... </span>    save_steps=<span class="hljs-number">200</span>,
<span class="hljs-meta">... </span>    logging_steps=<span class="hljs-number">50</span>,
<span class="hljs-meta">... </span>    eval_strategy=<span class="hljs-string">&quot;steps&quot;</span>,
<span class="hljs-meta">... </span>    learning_rate=<span class="hljs-number">5e-5</span>,
<span class="hljs-meta">... </span>    save_total_limit=<span class="hljs-number">2</span>,
<span class="hljs-meta">... </span>    remove_unused_columns=<span class="hljs-literal">False</span>,
<span class="hljs-meta">... </span>    push_to_hub=<span class="hljs-literal">True</span>,
<span class="hljs-meta">... </span>)<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-1gq6t8w">Define a simple data collator to batch examples together.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> DefaultDataCollator
<span class="hljs-meta">&gt;&gt;&gt; </span>data_collator = DefaultDataCollator()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-15i5vaz">Finally, bring everything together, and call <a href="/docs/transformers/main/en/main_classes/trainer#transformers.Trainer.train">train()</a>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> Trainer
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer = Trainer(
<span class="hljs-meta">... </span>    model=model,
<span class="hljs-meta">... </span>    args=training_args,
<span class="hljs-meta">... </span>    data_collator=data_collator,
<span class="hljs-meta">... </span>    train_dataset=encoded_train_dataset,
<span class="hljs-meta">... </span>    eval_dataset=encoded_test_dataset,
<span class="hljs-meta">... </span>    tokenizer=processor,
<span class="hljs-meta">... </span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer.train()<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-gilssp">To add the final model to 🤗 Hub, create a model card and call <code>push_to_hub</code>:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>trainer.create_model_card()
<span class="hljs-meta">&gt;&gt;&gt; </span>trainer.push_to_hub()<!-- HTML_TAG_END --></pre></div> <h2 class="relative group"><a id="inference" class="header-link block pr-1.5 text-lg no-hover:hidden with-hover:absolute with-hover:p-1.5 with-hover:opacity-0 with-hover:group-hover:opacity-100 with-hover:right-full" href="#inference"><span><svg class="" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" aria-hidden="true" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 256 256"><path d="M167.594 88.393a8.001 8.001 0 0 1 0 11.314l-67.882 67.882a8 8 0 1 1-11.314-11.315l67.882-67.881a8.003 8.003 0 0 1 11.314 0zm-28.287 84.86l-28.284 28.284a40 40 0 0 1-56.567-56.567l28.284-28.284a8 8 0 0 0-11.315-11.315l-28.284 28.284a56 56 0 0 0 79.196 79.197l28.285-28.285a8 8 0 1 0-11.315-11.314zM212.852 43.14a56.002 56.002 0 0 0-79.196 0l-28.284 28.284a8 8 0 1 0 11.314 11.314l28.284-28.284a40 40 0 0 1 56.568 56.567l-28.285 28.285a8 8 0 0 0 11.315 11.314l28.284-28.284a56.065 56.065 0 0 0 0-79.196z" fill="currentColor"></path></svg></span></a> <span>Inference</span></h2> <p data-svelte-h="svelte-1pszhza">Now that you have finetuned a LayoutLMv2 model, and uploaded it to the 🤗 Hub, you can use it for inference. The simplest
way to try out your finetuned model for inference is to use it in a <a href="/docs/transformers/main/en/main_classes/pipelines#transformers.Pipeline">Pipeline</a>.</p> <p data-svelte-h="svelte-1wtngfz">Let’s take an example:</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span>example = dataset[<span class="hljs-string">&quot;test&quot;</span>][<span class="hljs-number">2</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>question = example[<span class="hljs-string">&quot;query&quot;</span>][<span class="hljs-string">&quot;en&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span>image = example[<span class="hljs-string">&quot;image&quot;</span>]
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(question)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-built_in">print</span>(example[<span class="hljs-string">&quot;answers&quot;</span>])
<span class="hljs-string">&#x27;Who is ‘presiding’ TRRF GENERAL SESSION (PART 1)?&#x27;</span>
[<span class="hljs-string">&#x27;TRRF Vice President&#x27;</span>, <span class="hljs-string">&#x27;lee a. waller&#x27;</span>]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-mcchgg">Next, instantiate a pipeline for
document question answering with your model, and pass the image + question combination to it.</p> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> pipeline
<span class="hljs-meta">&gt;&gt;&gt; </span>qa_pipeline = pipeline(<span class="hljs-string">&quot;document-question-answering&quot;</span>, model=<span class="hljs-string">&quot;MariaK/layoutlmv2-base-uncased_finetuned_docvqa&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>qa_pipeline(image, question)
[{<span class="hljs-string">&#x27;score&#x27;</span>: <span class="hljs-number">0.9949808120727539</span>,
<span class="hljs-string">&#x27;answer&#x27;</span>: <span class="hljs-string">&#x27;Lee A. Waller&#x27;</span>,
<span class="hljs-string">&#x27;start&#x27;</span>: <span class="hljs-number">55</span>,
<span class="hljs-string">&#x27;end&#x27;</span>: <span class="hljs-number">57</span>}]<!-- HTML_TAG_END --></pre></div> <p data-svelte-h="svelte-o6117l">You can also manually replicate the results of the pipeline if you’d like:</p> <ol data-svelte-h="svelte-19rdijs"><li>Take an image and a question, prepare them for the model using the processor from your model.</li> <li>Forward the result of preprocessing through the model.</li> <li>The model returns <code>start_logits</code> and <code>end_logits</code>, which indicate which token is at the start of the answer and
which token is at the end of the answer. Both have shape (batch_size, sequence_length).</li> <li>Take an argmax on the last dimension of both the <code>start_logits</code> and <code>end_logits</code> to get the predicted <code>start_idx</code> and <code>end_idx</code>.</li> <li>Decode the answer with the tokenizer.</li></ol> <div class="code-block relative"><div class="absolute top-2.5 right-4"><button class="inline-flex items-center relative text-sm focus:text-green-500 cursor-pointer focus:outline-none transition duration-200 ease-in-out opacity-0 mx-0.5 text-gray-600 " title="code excerpt" type="button"><svg class="" xmlns="http://www.w3.org/2000/svg" aria-hidden="true" fill="currentColor" focusable="false" role="img" width="1em" height="1em" preserveAspectRatio="xMidYMid meet" viewBox="0 0 32 32"><path d="M28,10V28H10V10H28m0-2H10a2,2,0,0,0-2,2V28a2,2,0,0,0,2,2H28a2,2,0,0,0,2-2V10a2,2,0,0,0-2-2Z" transform="translate(0)"></path><path d="M4,18H2V4A2,2,0,0,1,4,2H18V4H4Z" transform="translate(0)"></path><rect fill="none" width="32" height="32"></rect></svg> <div class="absolute pointer-events-none transition-opacity bg-black text-white py-1 px-2 leading-tight rounded font-normal shadow left-1/2 top-full transform -translate-x-1/2 translate-y-2 opacity-0"><div class="absolute bottom-full left-1/2 transform -translate-x-1/2 w-0 h-0 border-black border-4 border-t-0" style="border-left-color: transparent; border-right-color: transparent; "></div> Copied</div></button></div> <pre class=""><!-- HTML_TAG_START --><span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">import</span> torch
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoProcessor
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">from</span> transformers <span class="hljs-keyword">import</span> AutoModelForDocumentQuestionAnswering
<span class="hljs-meta">&gt;&gt;&gt; </span>processor = AutoProcessor.from_pretrained(<span class="hljs-string">&quot;MariaK/layoutlmv2-base-uncased_finetuned_docvqa&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span>model = AutoModelForDocumentQuestionAnswering.from_pretrained(<span class="hljs-string">&quot;MariaK/layoutlmv2-base-uncased_finetuned_docvqa&quot;</span>)
<span class="hljs-meta">&gt;&gt;&gt; </span><span class="hljs-keyword">with</span> torch.no_grad():
<span class="hljs-meta">... </span> encoding = processor(image.convert(<span class="hljs-string">&quot;RGB&quot;</span>), question, return_tensors=<span class="hljs-string">&quot;pt&quot;</span>)
<span class="hljs-meta">... </span> outputs = model(**encoding)
<span class="hljs-meta">... </span> start_logits = outputs.start_logits
<span class="hljs-meta">... </span> end_logits = outputs.end_logits
<span class="hljs-meta">... </span> predicted_start_idx = start_logits.argmax(-<span class="hljs-number">1</span>).item()
<span class="hljs-meta">... </span> predicted_end_idx = end_logits.argmax(-<span class="hljs-number">1</span>).item()
<span class="hljs-meta">&gt;&gt;&gt; </span>processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx : predicted_end_idx + <span class="hljs-number">1</span>])
<span class="hljs-string">&#x27;lee a. waller&#x27;</span><!-- HTML_TAG_END --></pre></div> <a class="!text-gray-400 !no-underline text-sm flex items-center not-prose mt-4" href="https://github.com/huggingface/transformers/blob/main/docs/source/en/tasks/document_question_answering.md" target="_blank"><span data-svelte-h="svelte-1kd6by1">&lt;</span> <span data-svelte-h="svelte-x0xyl0">&gt;</span> <span data-svelte-h="svelte-1dajgef"><span class="underline ml-1.5">Update</span> on GitHub</span></a> <p></p>
<script>
{
// Inline bootstrap: publishes path configuration on a global, then loads the
// two entry modules and starts the app on this script's parent element.
// The bare block scopes `element` and `data` so they do not leak as globals.

// Deliberately assigned without const/let: in this classic (non-module)
// script that makes it a global. NOTE(review): presumably the imported
// bundles read this name to resolve asset/base paths — confirm before renaming.
__sveltekit_1xexzbk = {
assets: "/docs/transformers/main/en",
base: "/docs/transformers/main/en",
env: {}
};
// document.currentScript is only set while this script is executing
// synchronously, so it must be captured here rather than inside the
// promise callback below.
const element = document.currentScript.parentElement;
// One entry per id in node_ids below; both are null here.
// NOTE(review): presumably server-loaded data per route node — confirm.
const data = [null,null];
// Fetch both entry modules in parallel; start only after both have loaded.
Promise.all([
import("/docs/transformers/main/en/_app/immutable/entry/start.2135b7e6.js"),
import("/docs/transformers/main/en/_app/immutable/entry/app.24372c84.js")
]).then(([kit, app]) => {
// Hand the app module, the mount element and the initial state to start().
kit.start(app, element, {
// NOTE(review): looks like the ids of the layout/page nodes to render
// for this route — verify against the build manifest.
node_ids: [0, 401],
data,
form: null,
error: null
});
});
}
</script>

Xet Storage Details

Size:
107 kB
·
Xet hash:
fe7560bad558fa4b8eb4f75f20d05fbe49558dd2027a7b4cd281b3e9402519cc

Xet efficiently stores files, intelligently splitting them into unique chunks and accelerating uploads and downloads. More info.