Spaces:
Runtime error
Runtime error
updates to handling partial dates
Browse files
app.py
CHANGED
|
@@ -26,30 +26,41 @@ def extract_features(tokens, labels):
|
|
| 26 |
merged_entities = []
|
| 27 |
current_date = ""
|
| 28 |
|
| 29 |
-
print(f"Debug --
|
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Loop through tokens and labels
|
| 31 |
for token, label in zip(tokens, labels):
|
| 32 |
print(f"Debug -- Potentially creating date,, token: {token} label: {label}")
|
| 33 |
if label == 'LABEL_1':
|
| 34 |
-
#
|
| 35 |
-
if re.match(
|
| 36 |
print(f"Debug -- Potentially building date: Token Start {token} After Token")
|
| 37 |
-
current_date += token
|
| 38 |
-
|
| 39 |
-
#
|
| 40 |
-
if re.match(
|
| 41 |
-
merged_entities.append(current_date)
|
| 42 |
-
print(f"Debug --
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 43 |
current_date = ""
|
| 44 |
else:
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
current_date = ""
|
| 49 |
-
merged_entities.append(token)
|
| 50 |
|
| 51 |
-
if current_date:
|
| 52 |
-
merged_entities.append(current_date)
|
| 53 |
|
| 54 |
return merged_entities
|
| 55 |
|
|
|
|
| 26 |
merged_entities = []
|
| 27 |
current_date = ""
|
| 28 |
|
| 29 |
+
print(f"Debug -- Starting entity extraction")
|
| 30 |
+
date_pattern = r"\d{1,2}/\d{1,2}/\d{2,4}" # Matches slash-separated dates such as MM/DD/YYYY or DD/MM/YYYY (1-2 digit day/month, 2-4 digit year)
|
| 31 |
+
partial_date_pattern = r"\d{1,2}$|[/-]$" # Matches a token that is itself a date fragment: one or two digits (e.g. "12"), or a single "/" or "-"
|
| 32 |
+
|
| 33 |
+
|
| 34 |
# Loop through tokens and labels
|
| 35 |
for token, label in zip(tokens, labels):
|
| 36 |
print(f"Debug -- Potentially creating date,, token: {token} label: {label}")
|
| 37 |
if label == 'LABEL_1':
|
| 38 |
+
# Check for partial date fragments (like '12' or '/')
|
| 39 |
+
if re.match(partial_date_pattern, token):
|
| 40 |
print(f"Debug -- Potentially building date: Token Start {token} After Token")
|
| 41 |
+
current_date += token # Append token to the current entity
|
| 42 |
+
|
| 43 |
+
# If the accumulated entity matches a full date
|
| 44 |
+
if re.match(date_pattern, current_date):
|
| 45 |
+
merged_entities.append((current_date, 'date'))
|
| 46 |
+
print(f"Debug -- Complete date added: {token}")
|
| 47 |
+
current_date = "" # Reset for next entity
|
| 48 |
+
else:
|
| 49 |
+
# No partial or completed patterns are detected, but it's still LABEL_1
|
| 50 |
+
# If any text has been accumulated so far, emit it as a date
|
| 51 |
+
if current_date:
|
| 52 |
+
merged_entities.append((current_date, 'date'))
|
| 53 |
+
else:
|
| 54 |
+
merged_entities.append((token, label))
|
| 55 |
current_date = ""
|
| 56 |
else:
|
| 57 |
+
# These are LABEL_0, supposedly trash but keep them for now
|
| 58 |
+
if token:
|
| 59 |
+
merged_entities.append((token, label))
|
| 60 |
current_date = ""
|
|
|
|
| 61 |
|
| 62 |
+
if current_date:
|
| 63 |
+
merged_entities.append(current_date, 'date')
|
| 64 |
|
| 65 |
return merged_entities
|
| 66 |
|