Spaces:

our-sci
/

agriculture-experiments-document-processing

Runtime error

App Files Files Community

rosemariafontana committed on Aug 30, 2024

Commit

7e409fd

verified ·

1 Parent(s): 7252bf3

update to extract date entity

Browse files

Files changed (1) hide show

app.py +34 -0

app.py CHANGED Viewed

@@ -10,11 +10,42 @@ processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")
 # More traditional approach that works from token classification basis (not questions)
 model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
 model.to(device)
 labels = model.config.id2label
 print(labels)
 # process the image in the correct format
 # extract token classifications
 def parse_ticket_image(image):
@@ -69,6 +100,9 @@ def parse_ticket_image(image):
     min_length = min(len(fields), len(values))
     fields = fields[:min_length]
     values = values[:min_length]
     data = {
         "Field": fields,

 # More traditional approach that works from token classification basis (not questions)
 model = LayoutLMv3ForTokenClassification.from_pretrained("microsoft/layoutlmv3-base")
 device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
+print(f"Debug -- Using device: {device}")
 model.to(device)
 labels = model.config.id2label
 print(labels)
# Homemade feature extraction
def extract_features(tokens, labels):
    """Merge date fragments among labelled tokens into single date strings.

    Tokens are walked in lockstep with their labels (``zip`` stops at the
    shorter of the two sequences).

    * A token labelled ``'LABEL_1'`` is treated as a potential date piece:
      fragments shaped like ``"8/"`` / ``"08/"`` (one or two digits followed
      by a slash) or a bare four-digit year are concatenated into a running
      date string. When a four-digit year arrives and the running string
      contains exactly two slashes, the date is considered complete and is
      emitted.
    * A token with any other label flushes a pending (possibly partial)
      date first and is then emitted unchanged.

    Args:
        tokens: Iterable of token strings.
        labels: Iterable of label strings, parallel to ``tokens``.

    Returns:
        A list of strings: merged dates and pass-through tokens, in order.
        The result may be shorter than the input because date fragments are
        merged.

    NOTE(review): a ``'LABEL_1'`` token that matches neither date pattern is
    neither accumulated nor emitted, i.e. it is dropped. Confirm that this
    is intended before relying on output/input alignment.
    """
    # Compile once; both patterns are loop-invariant.
    day_or_month = re.compile(r"^\d{1,2}/$")
    four_digit_year = re.compile(r"^\d{4}$")

    merged_entities = []
    current_date = ""

    for token, label in zip(tokens, labels):
        # Fixed: the original used `===`, which is not a Python operator and
        # made the whole module fail with a SyntaxError.
        if label == 'LABEL_1':
            is_year = four_digit_year.match(token) is not None

            # Accumulate anything that looks like a piece of a date.
            if is_year or day_or_month.match(token):
                current_date += token

            # "MM/DD/" followed by a year means the date is complete.
            if is_year and current_date.count('/') == 2:
                merged_entities.append(current_date)
                current_date = ""
        else:
            # A non-date token ends any date in progress; keep what we have.
            if current_date:
                merged_entities.append(current_date)
                current_date = ""
            merged_entities.append(token)

    # Flush a trailing partial date so no accumulated text is lost.
    if current_date:
        merged_entities.append(current_date)

    return merged_entities
 # process the image in the correct format
 # extract token classifications
 def parse_ticket_image(image):
     min_length = min(len(fields), len(values))
     fields = fields[:min_length]
     values = values[:min_length]
+    #Homemade feature extraction
+    values = extract_features(values, fields)
     data = {
         "Field": fields,