ethanbradley
/

fintabqa

@@ -16,14 +16,73 @@ A model for financial table question-answering using the [LayoutLM](https://hugg
 ## Quick start
-To get started with FinTabQA, load it, and the tokenizer, like you would any other Hugging Face Transformer model.
 ```python3
-from transformers import LayoutLMForQuestionAnswering, LayoutLMTokenizer
-model = LayoutLMForQuestionAnswering.from_pretrained("ethanbradley/fintabqa")
-tokenizer = LayoutLMTokenizer.from_pretrained(
-    "microsoft/layoutlm-base-uncased")
 ```
 ## Citation

 ## Quick start
+To get started with FinTabQA, load the model and a fast tokenizer as you would any other Hugging Face Transformers model and tokenizer. Below is a minimal working example using the [SynFinTabs](https://huggingface.co/datasets/ethanbradley/synfintabs) dataset.
 ```python3
+>>> from typing import List, Tuple
+>>> from datasets import load_dataset
+>>> from transformers import LayoutLMForQuestionAnswering, LayoutLMTokenizerFast
+>>> import torch
+>>>
+>>> synfintabs_dataset = load_dataset("ethanbradley/synfintabs")
+>>> model = LayoutLMForQuestionAnswering.from_pretrained("ethanbradley/fintabqa")
+>>> tokenizer = LayoutLMTokenizerFast.from_pretrained(
+...     "microsoft/layoutlm-base-uncased")
+>>>
+>>> def normalise_boxes(
+...         boxes: List[List[int]],
+...         old_image_size: Tuple[int, int],
+...         new_image_size: Tuple[int, int]) -> List[List[int]]:
+...     old_im_w, old_im_h = old_image_size
+...     new_im_w, new_im_h = new_image_size
+...
+...     return [[
+...         max(min(int(x1 / old_im_w * new_im_w), new_im_w), 0),
+...         max(min(int(y1 / old_im_h * new_im_h), new_im_h), 0),
+...         max(min(int(x2 / old_im_w * new_im_w), new_im_w), 0),
+...         max(min(int(y2 / old_im_h * new_im_h), new_im_h), 0)
+...     ] for (x1, y1, x2, y2) in boxes]
+>>>
+>>> item = synfintabs_dataset['test'][0]
+>>> question_dict = next(question for question in item['questions']
+...     if question['id'] == item['question_id'])
+>>> encoding = tokenizer(
+...     question_dict['question'].split(),
+...     item['ocr_results']['words'],
+...     max_length=512,
+...     padding="max_length",
+...     truncation="only_second",
+...     is_split_into_words=True,
+...     return_token_type_ids=True,
+...     return_tensors="pt")
+>>>
+>>> word_boxes = normalise_boxes(
+...     item['ocr_results']['bboxes'],
+...     item['image'].crop(item['bbox']).size,
+...     (1000, 1000))
+>>> token_boxes = []
+>>>
+>>> for i, s, w in zip(
+...         encoding['input_ids'][0],
+...         encoding.sequence_ids(0),
+...         encoding.word_ids(0)):
+...     if s == 1:
+...         token_boxes.append(word_boxes[w])
+...     elif i == tokenizer.sep_token_id:
+...         token_boxes.append([1000] * 4)
+...     else:
+...         token_boxes.append([0] * 4)
+>>>
+>>> encoding['bbox'] = torch.tensor([token_boxes])
+>>> outputs = model(**encoding)
+>>> start = encoding.word_ids(0)[outputs['start_logits'].argmax(-1)]
+>>> end = encoding.word_ids(0)[outputs['end_logits'].argmax(-1)]
+>>>
+>>> print(f"Target: {question_dict['answer']}")
+Target: 6,980
+>>>
+>>> print(f"Prediction: {' '.join(item['ocr_results']['words'][start : end])}")
+Prediction: 6,980
 ```
 ## Citation