rosemariafontana committed on
Commit
996d5b1
·
verified ·
1 Parent(s): f5a15e3

updates to handling partial dates

Browse files
Files changed (1) hide show
  1. app.py +26 -15
app.py CHANGED
@@ -26,30 +26,41 @@ def extract_features(tokens, labels):
26
  merged_entities = []
27
  current_date = ""
28
 
29
- print(f"Debug -- Potentially creating date")
 
 
 
 
30
  # Loop through tokens and labels
31
  for token, label in zip(tokens, labels):
32
  print(f"Debug -- Potentially creating date,, token: {token} label: {label}")
33
  if label == 'LABEL_1':
34
- # Date logic
35
- if re.match(r"^\d{1,2}/$", token) or re.match(r"^\d{4}$", token):
36
  print(f"Debug -- Potentially building date: Token Start {token} After Token")
37
- current_date += token
38
-
39
- # Date logic
40
- if re.match(r"^\d{4}$", token) and current_date.count('/') == 2:
41
- merged_entities.append(current_date)
42
- print(f"Debug -- Added date 1: Token Start {curent_date} After Token")
 
 
 
 
 
 
 
 
43
  current_date = ""
44
  else:
45
- if current_date:
46
- merged_entities.append(current_date)
47
- print(f"Debug -- Added date 2: Token Start {current_date} After Token")
48
  current_date = ""
49
- merged_entities.append(token)
50
 
51
- if current_date:
52
- merged_entities.append(current_date)
53
 
54
  return merged_entities
55
 
 
26
  merged_entities = []
27
  current_date = ""
28
 
29
+ print(f"Debug -- Starting entity extraction")
30
+ date_pattern = r"\d{1,2}/\d{1,2}/\d{2,4}" # Matches full date formats like MM/DD/YYYY or DD/MM/YYYY
31
+ partial_date_pattern = r"\d{1,2}$|[/-]$" # Matches partial date components like "12" or "/" at the end
32
+
33
+
34
  # Loop through tokens and labels
35
  for token, label in zip(tokens, labels):
36
  print(f"Debug -- Potentially creating date,, token: {token} label: {label}")
37
  if label == 'LABEL_1':
38
+ # Check for partial date fragments (like '12' or '/')
39
+ if re.match(partial_date_pattern, token):
40
  print(f"Debug -- Potentially building date: Token Start {token} After Token")
41
+ current_date += token # Append token to the current entity
42
+
43
+ # If the accumulated entity matches a full date
44
+ if re.match(date_pattern, current_date):
45
+ merged_entities.append((current_date, 'date'))
46
+ print(f"Debug -- Complete date added: {token}")
47
+ current_date = "" # Reset for next entity
48
+ else:
49
+ # No partial or completed patterns are detected, but it's still LABEL_1
50
+ # If there were any accumulated data so far
51
+ if current_date:
52
+ merged_entities.append((current_date, 'date'))
53
+ else:
54
+ merged_entities.append((token, label))
55
  current_date = ""
56
  else:
57
+ # These are LABEL_0, supposedly trash but keep them for now
58
+ if token:
59
+ merged_entities.append((token, label))
60
  current_date = ""
 
61
 
62
+ if current_date:
63
+ merged_entities.append(current_date, 'date')
64
 
65
  return merged_entities
66