rosemariafontana commited on
Commit
f9937f7
·
verified ·
1 Parent(s): a54945b

Changed to tokenization

Browse files
Files changed (1) hide show
  1. app.py +159 -94
app.py CHANGED
@@ -1,34 +1,98 @@
1
  import gradio as gr
2
  import pandas as pd
3
 
4
- from PIL import Image
5
- from transformers import LayoutLMv3Processor, LayoutLMv3ForQuestionAnswering
 
6
 
7
  processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
8
- model = LayoutLMv3ForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
9
 
10
- def process_question(question, document):
11
- #print(f"Debug - Processing Question: {question}")
12
-
13
- encoding = processor(document, question, return_tensors="pt")
14
- #print(f"Debug - Encoding Input IDs: {encoding.input_ids}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
15
 
16
  outputs = model(**encoding)
17
- #print(f"Debug - Model Outputs: {outputs}")
18
 
19
- predicted_start_idx = outputs.start_logits.argmax(-1).item()
20
- predicted_end_idx = outputs.end_logits.argmax(-1).item()
21
 
22
- # Check if indices are valid
23
- if predicted_start_idx < 0 or predicted_end_idx < 0:
24
- print(f"Warning - Invalid prediction indices: start={predicted_start_idx}, end={predicted_end_idx}")
25
- return ""
26
 
27
- answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx: predicted_end_idx + 1]
28
- answer = processor.tokenizer.decode(answer_tokens)
 
29
 
30
- return answer
31
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  #def process_question(question, document):
33
  # if not question or document is None:
34
  # return None, None, None
@@ -46,82 +110,83 @@ def process_question(question, document):
46
  #
47
  # return text_value
48
 
49
- def parse_ticket_image(image, question):
50
- """Basically just runs through these questions for the document"""
51
- # Processing the image
52
- if image:
53
- try:
54
- if image.mode != "RGB":
55
- document = image.convert("RGB")
56
- else:
57
- document = image
58
- except Exception as e:
59
- traceback.print_exc()
60
- error = str(e)
61
-
62
-
63
- # Define questions you want to ask the model
64
-
65
- questions = [
66
- "What is the ticket number?",
67
- "What is the type of grain (For example: corn, soybeans, wheat)?",
68
- "What is the date?",
69
- "What is the time?",
70
- "What is the gross weight?",
71
- "What is the tare weight?",
72
- "What is the net weight?",
73
- "What is the moisture (moist) percentage?",
74
- "What is the damage percentage?",
75
- "What is the gross units?",
76
- "What is the dock units?",
77
- "What is the comment?",
78
- "What is the assembly number?",
79
- ]
80
-
81
- # Use the model to answer each question
82
- answers = {}
83
- for q in questions:
84
- print(f"Question: {q}")
85
- answer_text = process_question(q, document)
86
- print(f"Answer Text extracted here: {answer_text}")
87
- answers[q] = answer_text
88
-
89
-
90
- ticket_number = answers["What is the ticket number?"]
91
- grain_type = answers["What is the type of grain (For example: corn, soybeans, wheat)?"]
92
- date = answers["What is the date?"]
93
- time = answers["What is the time?"]
94
- gross_weight = answers["What is the gross weight?"]
95
- tare_weight = answers["What is the tare weight?"]
96
- net_weight = answers["What is the net weight?"]
97
- moisture = answers["What is the moisture (moist) percentage?"]
98
- damage = answers["What is the damage percentage?"]
99
- gross_units = answers["What is the gross units?"]
100
- dock_units = answers["What is the dock units?"]
101
- comment = answers["What is the comment?"]
102
- assembly_number = answers["What is the assembly number?"]
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
-
105
- # Create a structured format (like a table) using pandas
106
- data = {
107
- "Ticket Number": [ticket_number],
108
- "Grain Type": [grain_type],
109
- "Assembly Number": [assembly_number],
110
- "Date": [date],
111
- "Time": [time],
112
- "Gross Weight": [gross_weight],
113
- "Tare Weight": [tare_weight],
114
- "Net Weight": [net_weight],
115
- "Moisture": [moisture],
116
- "Damage": [damage],
117
- "Gross Units": [gross_units],
118
- "Dock Units": [dock_units],
119
- "Comment": [comment],
120
- }
121
- df = pd.DataFrame(data)
122
-
123
- return df
124
-
125
 
126
  """
127
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
 
1
  import gradio as gr
2
  import pandas as pd
3
 
4
+ from PIL import Image, ImageDraw, ImageFont
5
+ import torch
6
+ from transformers import LayoutLMv3Processor, LayoutLMv3ForQuestionAnswering, LayoutLMv3ForTokenClassification
7
 
8
  processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
 
9
 
10
+ # More traditional approach that works from token classification basis (not questions)
11
+ model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
12
+ device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
13
+ model.to(device)
14
+
15
+ labels = model.config.id2label
16
+ print(labels)
17
+
18
+ # helper function to unnormalize bounding boxes
19
+ def unnormalize_box(bbox, width, height):
20
+ return [
21
+ width * (bbox[0] / 1000),
22
+ height * (bbox[1] / 1000),
23
+ width * (bbox[2] / 1000),
24
+ height * (bbox[3] / 1000),
25
+ ]
26
+
27
+ # process the image in the correct format
28
+ # extract token classifications
29
+ def parse_ticket_image(image):
30
+ if image:
31
+ document = image.convert("RGB") if image.mode != "RGB" else image
32
+ else:
33
+ print(f"Warning - no image or malformed image!")
34
+ return pd.DataFrame()
35
+
36
+ encoding = processor(document, return_tensors="pt", truncation=True)
37
+
38
+ for k, v in encoding.items():
39
+ encoding[k] = v.to(device)
40
 
41
  outputs = model(**encoding)
 
42
 
43
+ predictions = outputs.logits.argmax(-1).squeeze().tolist()
44
+ token_boxes = encoding.bbox.squeeze().tolist()
45
 
46
+ input_ids = encoding.input_ids.squeeze().tolist()
47
+ words = [processor.tokenizer.decode(id) for id in input_ids]
 
 
48
 
49
+ width, height = document.size
50
+ true_predictions = []
51
+ true_boxes = []
52
 
53
+ for idx, pred in enumerate(predictions):
54
+ label = model.config.id2label[pred]
55
+ # apparently 'O' stands for non-entity tokens
56
+ if label != 'O':
57
+ true_predictions.append(label)
58
+ true_boxes.append(unnormalize_box(token_boxes[idx], width, height))
59
+
60
+ data = {
61
+ "Field": true_predictions,
62
+ "Value": words[1:len(true_predictions)+1]
63
+ }
64
+ df = pd.DataFrame(data)
65
+
66
+ return df
67
+
68
+
69
+ # This is how to use questions to find answers in the document
70
+ # Less traditional approach, less flexibility, easier to implement/understand (didnt provide robust answers)
71
+ #model = LayoutLMv3ForQuestionAnswering.from_pretrained("microsoft/layoutlmv3-base")
72
+
73
+ #def process_question(question, document):
74
+ # #print(f"Debug - Processing Question: {question}")
75
+ #
76
+ # encoding = processor(document, question, return_tensors="pt")
77
+ # #print(f"Debug - Encoding Input IDs: {encoding.input_ids}")
78
+ #
79
+ # outputs = model(**encoding)
80
+ # #print(f"Debug - Model Outputs: {outputs}")
81
+ #
82
+ # predicted_start_idx = outputs.start_logits.argmax(-1).item()
83
+ # predicted_end_idx = outputs.end_logits.argmax(-1).item()
84
+ #
85
+ # # Check if indices are valid
86
+ # if predicted_start_idx < 0 or predicted_end_idx < 0:
87
+ # print(f"Warning - Invalid prediction indices: start={predicted_start_idx}, end={predicted_end_idx}")
88
+ # return ""
89
+ #
90
+ # answer_tokens = encoding.input_ids.squeeze()[predicted_start_idx: predicted_end_idx + 1]
91
+ # answer = processor.tokenizer.decode(answer_tokens)
92
+ #
93
+ # return answer
94
+
95
+ # Older iteration of the code, retaining for emergencies ?
96
  #def process_question(question, document):
97
  # if not question or document is None:
98
  # return None, None, None
 
110
  #
111
  # return text_value
112
 
113
+ #def parse_ticket_image(image, question):
114
+ # """Basically just runs through these questions for the document"""
115
+ # # Processing the image
116
+ # if image:
117
+ # try:
118
+ # if image.mode != "RGB":
119
+ # document = image.convert("RGB")
120
+ # else:
121
+ # document = image
122
+ # except Exception as e:
123
+ # traceback.print_exc()
124
+ # error = str(e)
125
+ #
126
+ #
127
+ # # Define questions you want to ask the model
128
+ #
129
+ # questions = [
130
+ # "What is the ticket number?",
131
+ # "What is the type of grain (For example: corn, soybeans, wheat)?",
132
+ # "What is the date?",
133
+ # "What is the time?",
134
+ # "What is the gross weight?",
135
+ # "What is the tare weight?",
136
+ # "What is the net weight?",
137
+ # "What is the moisture (moist) percentage?",
138
+ # "What is the damage percentage?",
139
+ # "What is the gross units?",
140
+ # "What is the dock units?",
141
+ # "What is the comment?",
142
+ # "What is the assembly number?",
143
+ # ]
144
+ #
145
+ # # Use the model to answer each question
146
+ # answers = {}
147
+ # for q in questions:
148
+ # print(f"Question: {q}")
149
+ # answer_text = process_question(q, document)
150
+ # print(f"Answer Text extracted here: {answer_text}")
151
+ # answers[q] = answer_text
152
+ #
153
+ #
154
+ # ticket_number = answers["What is the ticket number?"]
155
+ # grain_type = answers["What is the type of grain (For example: corn, soybeans, wheat)?"]
156
+ # date = answers["What is the date?"]
157
+ # time = answers["What is the time?"]
158
+ # gross_weight = answers["What is the gross weight?"]
159
+ # tare_weight = answers["What is the tare weight?"]
160
+ # net_weight = answers["What is the net weight?"]
161
+ # moisture = answers["What is the moisture (moist) percentage?"]
162
+ # damage = answers["What is the damage percentage?"]
163
+ # gross_units = answers["What is the gross units?"]
164
+ # dock_units = answers["What is the dock units?"]
165
+ # comment = answers["What is the comment?"]
166
+ # assembly_number = answers["What is the assembly number?"]
167
+ #
168
+ #
169
+ # # Create a structured format (like a table) using pandas
170
+ # data = {
171
+ # "Ticket Number": [ticket_number],
172
+ # "Grain Type": [grain_type],
173
+ # "Assembly Number": [assembly_number],
174
+ # "Date": [date],
175
+ # "Time": [time],
176
+ # "Gross Weight": [gross_weight],
177
+ # "Tare Weight": [tare_weight],
178
+ # "Net Weight": [net_weight],
179
+ # "Moisture": [moisture],
180
+ # "Damage": [damage],
181
+ # "Gross Units": [gross_units],
182
+ # "Dock Units": [dock_units],
183
+ # "Comment": [comment],
184
+ # }
185
+ # df = pd.DataFrame(data)
186
+ #
187
+ # return df
188
+
189
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
 
191
  """
192
  For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface