Spaces:

valyx
/

layout-lm

Runtime error

App Files Community

anirudh-valyx committed on May 15, 2025

Commit

28cce98

1 Parent(s): 1327cd1

fix input text

Browse files

Files changed (1) hide show

app.py +107 -51

app.py CHANGED Viewed

@@ -1,43 +1,66 @@
 import gradio as gr
 from transformers import AutoProcessor, AutoModelForDocumentQuestionAnswering
-from PIL import Image
 # Load processor and model
-processor = AutoProcessor.from_pretrained("impira/layoutlm-invoices")
-model = AutoModelForDocumentQuestionAnswering.from_pretrained("impira/layoutlm-invoices")
-def answer_question(image, question):
-    """
-    Process an invoice image and answer a question about its content
-    Args:
-        image: PIL image of the invoice
-        question: String question about the invoice
-    Returns:
-        String answer extracted from the invoice
-    """
-    # Input validation
-    if image is None:
-        return "Please upload an image"
-    if question is None or question.strip() == "":
-        return "Please enter a question"
-    # Ensure RGB mode
-    if image.mode != "RGB":
-        image = image.convert("RGB")
-    # Ensure question is a string
-    if not isinstance(question, str):
-        question = str(question)
     try:
-        # Following the exact format from the model documentation
-        # First position is for image, second is for text/question
         encoding = processor(image, question, return_tensors="pt")
-        # Forward pass through model
         outputs = model(**encoding)
         # Extract answer span
@@ -50,30 +73,63 @@ def answer_question(image, question):
         # Clean up answer
         answer = answer.replace("[CLS]", "").replace("[SEP]", "").strip()
-        if not answer:
-            return "No answer found in the document"
-        return answer
     except Exception as e:
         import traceback
-        tb = traceback.format_exc()
-        return f"Error processing document: {str(e)}\n\nDetails:\n{tb}"
 # Create Gradio interface
-iface = gr.Interface(
-    fn=answer_question,
-    inputs=[
-        gr.Image(type="pil", label="Upload Invoice Image"),
-        gr.Textbox(placeholder="Ask a question about the invoice...", label="Question")
-    ],
-    outputs=gr.Textbox(label="Answer"),
-    title="Invoice Question Answering with LayoutLM",
-    description="Upload an invoice image and ask questions like 'What is the invoice number?', 'What is the total amount?', 'Who is the vendor?', etc.",
-    # No hardcoded examples since we don't have sample files
-    examples=None,
-    allow_flagging="never"
-)
-# Launch the app
 if __name__ == "__main__":
-    iface.launch()

 import gradio as gr
+import os
+import torch
 from transformers import AutoProcessor, AutoModelForDocumentQuestionAnswering
+from PIL import Image, ImageDraw
+# Disable tokenizers parallelism to avoid warnings
+os.environ["TOKENIZERS_PARALLELISM"] = "false"
 # Load processor and model
+# Both are loaded by name at module import, before the UI is built.
+MODEL_NAME = "impira/layoutlm-invoices"
+processor = AutoProcessor.from_pretrained(MODEL_NAME)
+model = AutoModelForDocumentQuestionAnswering.from_pretrained(MODEL_NAME)
+# Use GPU if available
+# `device` is reused by answer_question to move the encoded inputs to the
+# same device as the model.
+device = "cuda" if torch.cuda.is_available() else "cpu"
+model = model.to(device)
+def ensure_list(x):
+    """Return ``x`` as a list.
+
+    A value that is already a ``list`` is returned as-is (the same object,
+    not a copy); anything else, including tuples and ``None``, is wrapped
+    in a new one-element list.
+    """
+    return x if isinstance(x, list) else [x]
+def normalize_bbox(box, width, height, padding=0.005):
+    """Convert a four-coordinate box into pixel coordinates.
+
+    Each coordinate of ``box`` (min_x, min_y, max_x, max_y) is divided by
+    1000 to obtain a fraction, so the input is presumably on a 0-1000
+    scale. When ``padding`` is non-zero the fractional box is grown by
+    ``padding`` on every side and clamped to the [0, 1] range. The result
+    is then scaled by ``width`` (x values) and ``height`` (y values).
+
+    Returns:
+        A list ``[min_x, min_y, max_x, max_y]`` in pixel units.
+    """
+    left, top, right, bottom = (coord / 1000 for coord in box)
+    if padding == 0:
+        # No padding requested: scale the raw fractions directly.
+        return [left * width, top * height, right * width, bottom * height]
+    # Grow the box, keeping it inside the unit square.
+    left, top = max(0, left - padding), max(0, top - padding)
+    right, bottom = min(right + padding, 1), min(bottom + padding, 1)
+    return [left * width, top * height, right * width, bottom * height]
+def process_document(image_file):
+    """Load an uploaded file as an RGB image for the rest of the UI.
+
+    Args:
+        image_file: Value delivered by the upload component. May be ``None``
+            (nothing uploaded / upload cleared), an object exposing the
+            file path as ``.name``, or a plain path string.
+
+    Returns:
+        A ``(document, preview_update)`` pair. On success ``document`` is
+        the RGB image and the preview update makes it visible. When there
+        is no upload, or the file cannot be read as an image, ``document``
+        is ``None`` and the preview is hidden.
+    """
+    if image_file is None:
+        return None, gr.update(visible=False)
+    # Accept both a file object (path in ``.name``) and a bare path string.
+    path = getattr(image_file, "name", image_file)
+    try:
+        # convert() returns a fully loaded copy, so the underlying file can
+        # be closed as soon as the ``with`` block exits.
+        with Image.open(path) as source:
+            image = source.convert("RGB")
+    except Exception as e:
+        # The preview is an image component, so an error string is not a
+        # valid value for it: log the failure and clear the preview instead.
+        print(f"Error loading document: {e}")
+        return None, gr.update(visible=False, value=None)
+    # Return the document and show the image
+    return image, gr.update(visible=True, value=image)
+def answer_question(question, image):
+    """Process question with LayoutLM model"""
+    if image is None or question.strip() == "":
+        return None, None
     try:
+        # Process inputs
         encoding = processor(image, question, return_tensors="pt")
+        for key in encoding.keys():
+            encoding[key] = encoding[key].to(device)
+        # Get model predictions
         outputs = model(**encoding)
         # Extract answer span
         # Clean up answer
         answer = answer.replace("[CLS]", "").replace("[SEP]", "").strip()
+        # Highlight answer in image if word_ids are available
+        result_image = image.copy().convert("RGB")
+        # Return results
+        return answer, result_image
     except Exception as e:
         import traceback
+        error_msg = f"Error processing document: {str(e)}\n{traceback.format_exc()}"
+        return error_msg, None
 # Create Gradio interface
+# Two-column layout: upload + question on the left, results on the right.
+with gr.Blocks() as demo:
+    gr.Markdown("# Invoice Question Answering with LayoutLM")
+    gr.Markdown("Upload an invoice image and ask questions like 'What is the invoice number?', 'What is the total amount?', etc.")
+    # Document storage
+    # Receives the first value returned by process_document and is passed
+    # to answer_question as its `image` argument.
+    document = gr.State(None)
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## 1. Upload a document")
+            upload = gr.File(label="Upload Invoice Image")
+            # Starts hidden; process_document returns the update that
+            # shows or hides it.
+            image_preview = gr.Image(label="Preview", visible=False)
+            gr.Markdown("## 2. Ask a question")
+            question = gr.Textbox(
+                label="Question",
+                placeholder="e.g. What is the invoice number?",
+                lines=1
+            )
+            submit_button = gr.Button("Submit", variant="primary")
+        with gr.Column():
+            gr.Markdown("## Results")
+            answer_text = gr.Textbox(label="Answer", lines=2)
+            result_image = gr.Image(label="Document with Answer")
+    # Set up event handlers
+    # A new or cleared upload refreshes both the stored document and the preview.
+    upload.change(
+        fn=process_document,
+        inputs=[upload],
+        outputs=[document, image_preview]
+    )
+    # Input order matches answer_question(question, image).
+    submit_button.click(
+        fn=answer_question,
+        inputs=[question, document],
+        outputs=[answer_text, result_image]
+    )
+    # Also trigger on pressing Enter in question box
+    question.submit(
+        fn=answer_question,
+        inputs=[question, document],
+        outputs=[answer_text, result_image]
+    )
 if __name__ == "__main__":
+    demo.launch(debug=True)