Amanda committed on
Commit
cf7d42e
·
1 Parent(s): 6e16eba

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +9 -8
app.py CHANGED
@@ -10,13 +10,14 @@ model = VisionEncoderDecoderModel.from_pretrained("naver-clova-ix/donut-base-fin
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
- def process_document(image):
14
  # prepare encoder inputs
15
  pixel_values = processor(image, return_tensors="pt").pixel_values
16
 
17
  # prepare decoder inputs
18
- task_prompt = "<s_cord-v2>"
19
- decoder_input_ids = processor.tokenizer(task_prompt, add_special_tokens=False, return_tensors="pt").input_ids
 
20
 
21
  # generate answer
22
  outputs = model.generate(
@@ -39,18 +40,18 @@ def process_document(image):
39
 
40
  return processor.token2json(sequence)
41
 
42
- description = "Gradio Demo for Donut, an instance of `VisionEncoderDecoderModel` fine-tuned on CORD (document parsing). To use it, simply upload your image and click 'submit', or click one of the examples to load them. Read more at the links below."
43
  article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
44
 
45
  demo = gr.Interface(
46
  fn=process_document,
47
- inputs="image",
48
  outputs="json",
49
- title="Demo: Donut 🍩 for Document Parsing",
50
  description=description,
51
  article=article,
52
  enable_queue=True,
53
- examples=[["DL.jpg"], ["EAC.png"], ["BC.jfif"]],
54
  cache_examples=False)
55
 
56
- demo.launch()
 
10
  device = "cuda" if torch.cuda.is_available() else "cpu"
11
  model.to(device)
12
 
13
+ def process_document(image, question):
14
  # prepare encoder inputs
15
  pixel_values = processor(image, return_tensors="pt").pixel_values
16
 
17
  # prepare decoder inputs
18
+ task_prompt = "<s_docvqa><s_question>{user_input}</s_question><s_answer>"
19
+ prompt = task_prompt.replace("{user_input}", question)
20
+ decoder_input_ids = processor.tokenizer(prompt, add_special_tokens=False, return_tensors="pt").input_ids
21
 
22
  # generate answer
23
  outputs = model.generate(
 
40
 
41
  return processor.token2json(sequence)
42
 
43
+ description = "Gradio Demo for Donut, an instance of `VisionEncoderDecoderModel` fine-tuned on DocVQA (document visual question answering). To use it, simply upload your image and type a question and click 'submit', or click one of the examples to load them. Read more at the links below."
44
  article = "<p style='text-align: center'><a href='https://arxiv.org/abs/2111.15664' target='_blank'>Donut: OCR-free Document Understanding Transformer</a> | <a href='https://github.com/clovaai/donut' target='_blank'>Github Repo</a></p>"
45
 
46
  demo = gr.Interface(
47
  fn=process_document,
48
+ inputs=["image", "text"],
49
  outputs="json",
50
+ title="Demo: Donut 🍩 for DocVQA",
51
  description=description,
52
  article=article,
53
  enable_queue=True,
54
+ examples=[["DL.jpg", "What is the Surname?"], ["EAC.png", "What is the Address?"]],
55
  cache_examples=False)
56
 
57
+ demo.launch()