# Scraped file-viewer header: Spaces (status: Running) - file size 9,386 bytes - revision 64deb3c
"""
Date Parser Service
Handles parsing of various date formats commonly found in insurance documents.
Supports:
- 1-1-25, 01-01-2025, 1/1/25, 01/01/2025
- January 1, 2025, Jan 1, 2025, 1 January 2025
- 2025-01-01 (ISO format)
- Date ranges and period calculations
"""
import re
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Tuple
class DateParser:
    """Parse and normalize dates from various formats.

    Recognized formats:
        - Numeric day-first dates with ``-`` or ``/`` (``1-1-25``,
          ``01/01/2025``); falls back to month-first when day-first is
          not a valid calendar date.
        - ISO-style ``YYYY-MM-DD`` (``-`` or ``/`` separators).
        - Month-name dates (``January 1, 2025``, ``1st Jan 2025``).

    All returned ``datetime`` objects are naive and set to midnight.
    """

    # Month name (full and abbreviated, lowercase) -> month number.
    MONTHS = {
        'january': 1, 'jan': 1,
        'february': 2, 'feb': 2,
        'march': 3, 'mar': 3,
        'april': 4, 'apr': 4,
        'may': 5,
        'june': 6, 'jun': 6,
        'july': 7, 'jul': 7,
        'august': 8, 'aug': 8,
        'september': 9, 'sep': 9, 'sept': 9,
        'october': 10, 'oct': 10,
        'november': 11, 'nov': 11,
        'december': 12, 'dec': 12
    }

    # Date context keywords for identifying date types. Dict order is the
    # tie-break priority when two types match at the same distance.
    DATE_CONTEXTS = {
        'start': ['start', 'commence', 'inception', 'effective', 'from', 'begins', 'starting'],
        'end': ['end', 'expiry', 'expire', 'expiration', 'until', 'to', 'ending', 'valid till', 'valid until'],
        'renewal': ['renewal', 'renew', 'next renewal', 'due for renewal'],
        'issue': ['issue', 'issued', 'date of issue', 'policy date']
    }

    # Inflection endings tolerated after a context keyword so that e.g.
    # "expires", "renewed" and "commencement" still count as keyword hits.
    _KEYWORD_SUFFIXES = ('s', 'd', 'ed', 'es', 'ing', 'ment')

    # Number of characters before a date that are searched for keywords.
    _CONTEXT_WINDOW = 100

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile regex patterns for date extraction and context lookup."""
        # DD-MM-YY or DD-MM-YYYY (with - or /). The year must be exactly
        # 2 or 4 digits; a 3-digit year is a typo/truncation, not a date.
        self.pattern_dmy = re.compile(
            r'\b(\d{1,2})[-/](\d{1,2})[-/](\d{4}|\d{2})\b'
        )
        # YYYY-MM-DD (ISO format)
        self.pattern_iso = re.compile(
            r'\b(\d{4})[-/](\d{1,2})[-/](\d{1,2})\b'
        )
        # Month DD, YYYY or DD Month YYYY
        month_names = '|'.join(self.MONTHS.keys())
        self.pattern_month_name = re.compile(
            rf'\b(\d{{1,2}})\s*(?:st|nd|rd|th)?\s*({month_names})[,]?\s*(\d{{4}})\b|'
            rf'\b({month_names})\s*(\d{{1,2}})(?:st|nd|rd|th)?[,]?\s*(\d{{4}})\b',
            re.IGNORECASE
        )
        # One whole-word pattern per context type. Plain substring tests
        # would let 'to' match "motor"/"total" and 'end' match "pending".
        suffixes = '|'.join(self._KEYWORD_SUFFIXES)
        self._context_patterns = {}
        for dtype, keywords in self.DATE_CONTEXTS.items():
            alternatives = '|'.join(re.escape(kw) for kw in keywords)
            self._context_patterns[dtype] = re.compile(
                rf'\b(?:{alternatives})(?:{suffixes})?\b'
            )

    def parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse a date string in various formats to datetime object.

        Formats are tried in this order: ISO (YYYY-MM-DD), numeric
        day-first (DD-MM-YY / DD-MM-YYYY), then month-name forms. The
        first date found anywhere in the string is used.

        Args:
            date_str: Date string to parse (non-strings are passed
                through ``str()`` first)

        Returns:
            datetime object (midnight) or None if parsing fails
        """
        if not date_str:
            return None
        date_str = str(date_str).strip()

        # Try ISO format first (YYYY-MM-DD)
        match = self.pattern_iso.search(date_str)
        if match:
            year, month, day = match.groups()
            try:
                return datetime(int(year), int(month), int(day))
            except ValueError:
                pass  # Not a real calendar date; try the other formats.

        # Try DMY format (DD-MM-YY or DD-MM-YYYY)
        match = self.pattern_dmy.search(date_str)
        if match:
            day, month, year = match.groups()
            year = int(year)
            # Two-digit years pivot at 50: 00-49 -> 20xx, 50-99 -> 19xx.
            if year < 100:
                year = 2000 + year if year < 50 else 1900 + year
            try:
                return datetime(year, int(month), int(day))
            except ValueError:
                # Day-first is impossible (e.g. 12/25/2024); try US
                # month-first order before giving up on this format.
                try:
                    return datetime(year, int(day), int(month))
                except ValueError:
                    pass

        # Try month name format
        match = self.pattern_month_name.search(date_str)
        if match:
            groups = match.groups()
            if groups[0]:  # DD Month YYYY format
                day, month_name, year = groups[0], groups[1], groups[2]
            else:  # Month DD, YYYY format
                month_name, day, year = groups[3], groups[4], groups[5]
            month = self.MONTHS.get(month_name.lower())
            if month:
                try:
                    return datetime(int(year), month, int(day))
                except ValueError:
                    pass
        return None

    def _classify_context(self, context_text: str) -> str:
        """Return the context type whose keyword ends closest to the date.

        Args:
            context_text: Lowercased text immediately preceding a date.

        Returns:
            A key of ``DATE_CONTEXTS``, or ``'unknown'`` when no keyword
            occurs in ``context_text``.
        """
        best_type = 'unknown'
        best_end = -1
        for dtype, pattern in self._context_patterns.items():
            for keyword_match in pattern.finditer(context_text):
                # Strict '>' keeps the earlier DATE_CONTEXTS entry on ties.
                if keyword_match.end() > best_end:
                    best_end = keyword_match.end()
                    best_type = dtype
        return best_type

    def extract_dates_from_text(self, text: str) -> List[Dict]:
        """
        Extract all dates from text with their context.

        Each date is labelled with the context type of the nearest
        preceding keyword (whole-word match, common inflections allowed)
        within the previous 100 characters. Results are in document
        order; when the same calendar date occurs more than once, only
        its first occurrence is kept.

        Args:
            text: Text to search for dates

        Returns:
            List of dicts with date info:
            [{"date": datetime, "date_str": "2025-01-01",
              "context": "start/end/renewal/issue/unknown",
              "original": "01-01-2025", "position": 123}]
        """
        if not text:
            return []
        text_lower = text.lower()

        # Find all date matches across the three supported formats.
        all_matches = []
        for pattern in (self.pattern_dmy, self.pattern_iso, self.pattern_month_name):
            for match in pattern.finditer(text):
                parsed = self.parse_date(match.group())
                if parsed is not None:
                    all_matches.append({
                        'date': parsed,
                        'original': match.group(),
                        'position': match.start()
                    })

        # Document order, so that output and de-duplication do not depend
        # on which regex happened to find a date.
        all_matches.sort(key=lambda found: found['position'])

        seen_dates = set()
        unique_results = []
        for found in all_matches:
            date_key = found['date'].strftime('%Y-%m-%d')
            if date_key in seen_dates:
                continue
            seen_dates.add(date_key)
            pos = found['position']
            # Only text *before* the date is inspected for keywords.
            context_text = text_lower[max(0, pos - self._CONTEXT_WINDOW):pos]
            unique_results.append({
                'date': found['date'],
                'date_str': date_key,
                'context': self._classify_context(context_text),
                'original': found['original'],
                'position': pos
            })
        return unique_results

    def calculate_renewal_date(self, policy_start: datetime,
                               term_months: int = 12) -> datetime:
        """
        Calculate policy renewal date.

        When the start day does not exist in the target month (e.g.
        Jan 31 + 1 month) the last day of the target month is used. The
        result is always at midnight; any time-of-day on ``policy_start``
        is not carried over.

        Args:
            policy_start: Policy start date
            term_months: Policy term in months (default 12); may be
                negative to step backwards

        Returns:
            Renewal date (policy_start + term_months)

        Raises:
            ValueError: If the resulting year is outside the range
                supported by ``datetime``.
        """
        # Work with a zero-based month index so divmod handles year
        # roll-over (including negative terms) in one step.
        year_delta, month_index = divmod(policy_start.month - 1 + term_months, 12)
        new_year = policy_start.year + year_delta
        new_month = month_index + 1

        if new_month == 12:
            last_day = 31
        else:
            # Day before the first of the following month.
            last_day = (datetime(new_year, new_month + 1, 1) - timedelta(days=1)).day
        return datetime(new_year, new_month, min(policy_start.day, last_day))

    def is_date_in_range(self, date: datetime,
                         year: Optional[int] = None,
                         before: Optional[datetime] = None,
                         after: Optional[datetime] = None) -> bool:
        """
        Check if date matches filter criteria.

        Args:
            date: Date to check
            year: Match specific year (None or 0 disables the filter)
            before: Date must be strictly before this
            after: Date must be strictly after this

        Returns:
            True if date matches all criteria; False if ``date`` is
            missing or any supplied criterion fails
        """
        if not date:
            return False
        # Truthiness is kept on purpose: a year of 0 has always meant
        # "no year filter" for existing callers.
        if year and date.year != year:
            return False
        if before is not None and date >= before:
            return False
        if after is not None and date <= after:
            return False
        return True

    def get_year_from_query(self, query: str) -> Optional[int]:
        """Extract year from query like 'policies renewing in 2026'.

        An explicit four-digit year (20xx) wins; otherwise the phrases
        'this year', 'next year' and 'last year' are resolved against the
        current local year.

        Args:
            query: Free-text query

        Returns:
            The year as an int, or None if the query is empty or names
            no year
        """
        if not query:
            return None
        match = re.search(r'\b(20\d{2})\b', query)
        if match:
            return int(match.group(1))

        # Handle relative years
        current_year = datetime.now().year
        query_lower = query.lower()
        if 'this year' in query_lower:
            return current_year
        if 'next year' in query_lower:
            return current_year + 1
        if 'last year' in query_lower:
            return current_year - 1
        return None
# Singleton instance: built once at import time (which compiles the regex
# patterns in DateParser.__init__) and intended to be shared by importers.
date_parser = DateParser()
|