| | --- |
| | license: mit |
| | language: |
| | - en |
| | pipeline_tag: document-question-answering |
| | --- |
| | This model was trained on [DocVQA](https://www.docvqa.org/) Dataset questions |
| |
|
| | Code for Training and Prediction (v1): https://www.kaggle.com/tusharcode/training-layoutlm-docvqa |
| |
|
| | **How to use:** |
| | ```python |
| | from transformers import AutoTokenizer, AutoModelForDocumentQuestionAnswering |
| | from datasets import load_dataset |
| | |
| | model_checkpoint = "TusharGoel/LayoutLM-Finetuned-DocVQA" |
| | tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, add_prefix_space=True) |
| | model_predict = AutoModelForDocumentQuestionAnswering.from_pretrained(model_checkpoint) |
| | |
| | model_predict.eval() |
| | dataset = load_dataset("nielsr/funsd", split="train") |
| | example = dataset[0] |
| | |
| | question = "What's Licensee Number?" |
| | |
| | words = example["words"] |
| | boxes = example["bboxes"] |
| | |
| | encoding = tokenizer(question.split(), words, |
| | is_split_into_words=True, return_token_type_ids=True, return_tensors="pt") |
| | |
| | bbox = [] |
| | for i, s, w in zip(encoding.input_ids[0], encoding.sequence_ids(0), encoding.word_ids(0)): |
| | if s == 1: |
| | bbox.append(boxes[w]) |
| | elif i == tokenizer.sep_token_id: |
| | bbox.append([1000] * 4) |
| | else: |
| | bbox.append([0] * 4) |
| | encoding["bbox"] = torch.tensor([bbox]) |
| | |
| | word_ids = encoding.word_ids(0) |
| | outputs = model_predict(**encoding) |
| | |
| | loss = outputs.loss |
| | start_scores = outputs.start_logits |
| | end_scores = outputs.end_logits |
| | |
| | start, end = word_ids[start_scores.argmax(-1).item()], word_ids[end_scores.argmax(-1).item()] |
| | print(" ".join(words[start : end + 1])) |
| | ``` |