Amanda committed on
Commit
cf7d42e
·
1 Parent(s): 6e16eba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -10,13 +10,14 @@ model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-fin
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
- def process_document(image):
14
  # prepare encoder inputs
15
  pixel_values = processor(image, return_tensors="pt").pixel_values
16
 
17
  # prepare decoder inputs
18
- task_prompt = "<s_cord-v2>"
19
- decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
 
20
 
21
  # generate answer
22
  outputs = model.generate(
@@ -39,18 +40,18 @@ def process_document(image):
39
 
40
  return processor.token2json(sequence)
41
 
42
- description = "Gradio Demo for Donut, an instance of `VisionEncoderDecoderModel` fine-tuned on CORD (document parsing). To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
43
  article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
44
 
45
  demo = gr.Interface(
46
  fn=process_document,
47
- inputs="image",
48
  outputs="json",
49
- title="Demo: Donut 🍩 for Document Parsing",
50
  description=description,
51
  article=article,
52
  enable_queue=True,
53
- examples=[["DL.jpg"], ["EAC.png"], ["BC.jfif"]],
54
  cache_examples=False)
55
 
56
- demo.launch()
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
+ def process_document(image, question):
14
  # prepare encoder inputs
15
  pixel_values = processor(image, return_tensors="pt").pixel_values
16
 
17
  # prepare decoder inputs
18
+ task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
19
+ prompt = task_prompt.replace("{user_input}", question)
20
+ decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
21
 
22
  # generate answer
23
  outputs = model.generate(
 
40
 
41
  return processor.token2json(sequence)
42
 
43
+ description = "Gradio Demo for Donut, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
44
  article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
45
 
46
  demo = gr.Interface(
47
  fn=process_document,
48
+ inputs=["image", "text"],
49
  outputs="json",
50
+ title="Demo: Donut 🍩 for DocVQA",
51
  description=description,
52
  article=article,
53
  enable_queue=True,
54
+ examples=[["DL.jpg", "What is the Surname?"], ["EAC.png", "What is the Address?"]],
55
  cache_examples=False)
56
 
57
+ demo.launch()