rosemariafontana committed on
Commit
9af0f9c
·
verified ·
1 Parent(s): 09dced8

updated to not use labels

Browse files
Files changed (1) hide show
  1. app.py +61 -32
app.py CHANGED
@@ -34,46 +34,75 @@ def extract_features(tokens, labels):
34
  partial_date_pattern = r"^\d{1,2}/?$|^[/-]$" # Matches partial date components like "12", "/", "02/", etc.
35
 
36
 
37
-
38
- # Loop through tokens and labels
39
- for token, label in zip(tokens, labels):
40
- print(f"Debug -- Potentially creating date,, token: {token} label: {label}")
41
-
42
- if label == 'LABEL_1':
43
- # Check for partial date fragments (like '12' or '/')
44
- if re.match(date_pattern, current_date):
45
- merged_entities.append((current_date, 'date'))
46
- print(f"Debug -- Complete date added: {token}")
47
- current_date = "" # Reset for next entity
48
- # If the accumulated entity matches a full date
49
- elif re.match(partial_date_pattern, token):
50
- print(f"Debug -- Potentially building date: Token Start {token} After Token")
51
- current_date += token # Append token to the current entity
52
- else:
53
- # No partial or completed patterns are detected, but it's still LABEL_1
54
- # If there were any accumulated data so far
55
- if current_date:
56
- merged_entities.append((current_date, 'date'))
57
- print(f"Debug -- Date finalized: {current_date}")
58
- current_date = "" # Reset
59
-
60
- merged_entities.append((token, label))
61
  else:
62
- # These are LABEL_0, supposedly trash but keep them for now
63
- if current_date: # If there's a leftover date fragment, add it first
64
  merged_entities.append((current_date, 'date'))
65
- print(f"Debug -- Finalizing leftover date added: {current_date}")
66
- current_date = "" # Reset
67
 
68
- # Append LABEL_0 token
69
- print(f"Debug -- Appending LABEL_0 Token: Token Start {token} Token After")
70
- merged_entities.append((token, label))
71
 
72
- if current_date:
73
  print(f"Debug -- Dangling leftover date added: {current_date}")
74
  merged_entities.append((current_date, 'date'))
75
 
76
  return merged_entities
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
77
 
78
 
79
  # process the image in the correct format
 
34
  partial_date_pattern = r"^\d{1,2}/?$|^[/-]$" # Matches partial date components like "12", "/", "02/", etc.
35
 
36
 
37
+ # This is a label AGNOSTIC approach
38
+ for token, label in zip(tokens, labels):
39
+ print(f"Debug -- Processing token: {token}")
40
+
41
+ # Check complete date pattern
42
+ if re.match(date_pattern, current_date):
43
+ merged_entities.append((current_date, 'date'))
44
+ print(f"Debug -- Completed date added: {current_date}")
45
+ current_date = ""
46
+ # Check if the token matches a partial date pattern
47
+ elif re.match(partial_date_pattern, token):
48
+ print(f"Debug -- Potentially building date: Token Start {token} After Token")
49
+ current_date += LayoutLMv3ForTokenClassification
 
 
 
 
 
 
 
 
 
 
 
50
  else:
51
+ if current_date:
 
52
  merged_entities.append((current_date, 'date'))
53
+ print(f"Debug -- Date finalized: {current_date}")
54
+ current_date = ""
55
 
56
+ print(f"Debug -- Appending non-date Token: {token}")
57
+ merged_entities.append((token, 'other'))
 
58
 
59
+ if current_date:
60
  print(f"Debug -- Dangling leftover date added: {current_date}")
61
  merged_entities.append((current_date, 'date'))
62
 
63
  return merged_entities
64
+
65
+ # NOTE: labels aren't being applied properly ... This is the LABEL approach
66
+ #
67
+ # Loop through tokens and labels
68
+ #for token, label in zip(tokens, labels):
69
+ # print(f"Debug -- Potentially creating date,, token: {token} label: {label}")
70
+ #
71
+ # if label == 'LABEL_1':
72
+ # # Check for partial date fragments (like '12' or '/')
73
+ # if re.match(date_pattern, current_date):
74
+ # merged_entities.append((current_date, 'date'))
75
+ # print(f"Debug -- Complete date added: {token}")
76
+ # current_date = "" # Reset for next entity
77
+ # # If the accumulated entity matches a full date
78
+ # elif re.match(partial_date_pattern, token):
79
+ # print(f"Debug -- Potentially building date: Token Start {token} After Token")
80
+ # current_date += token # Append token to the current entity
81
+ # else:
82
+ # # No partial or completed patterns are detected, but it's still LABEL_1
83
+ # # If there were any accumulated data so far
84
+ # if current_date:
85
+ # merged_entities.append((current_date, 'date'))
86
+ # print(f"Debug -- Date finalized: {current_date}")
87
+ # current_date = "" # Reset
88
+ #
89
+ # merged_entities.append((token, label))
90
+ # else:
91
+ # # These are LABEL_0, supposedly trash but keep them for now
92
+ # if current_date: # If there's a leftover date fragment, add it first
93
+ # merged_entities.append((current_date, 'date'))
94
+ # print(f"Debug -- Finalizing leftover date added: {current_date}")
95
+ # current_date = "" # Reset
96
+ #
97
+ # # Append LABEL_0 token
98
+ # print(f"Debug -- Appending LABEL_0 Token: Token Start {token} Token After")
99
+ # merged_entities.append((token, label))
100
+ #
101
+ # if current_date:
102
+ # print(f"Debug -- Dangling leftover date added: {current_date}")
103
+ # merged_entities.append((current_date, 'date'))
104
+ #
105
+ # return merged_entities
106
 
107
 
108
  # process the image in the correct format