""" Date Parser Service Handles parsing of various date formats commonly found in insurance documents. Supports: - 1-1-25, 01-01-2025, 1/1/25, 01/01/2025 - January 1, 2025, Jan 1, 2025, 1 January 2025 - 2025-01-01 (ISO format) - Date ranges and period calculations """ import re from datetime import datetime, timedelta from typing import Optional, List, Dict, Tuple class DateParser: """Parse and normalize dates from various formats.""" # Month name mappings MONTHS = { 'january': 1, 'jan': 1, 'february': 2, 'feb': 2, 'march': 3, 'mar': 3, 'april': 4, 'apr': 4, 'may': 5, 'june': 6, 'jun': 6, 'july': 7, 'jul': 7, 'august': 8, 'aug': 8, 'september': 9, 'sep': 9, 'sept': 9, 'october': 10, 'oct': 10, 'november': 11, 'nov': 11, 'december': 12, 'dec': 12 } # Date context keywords for identifying date types DATE_CONTEXTS = { 'start': ['start', 'commence', 'inception', 'effective', 'from', 'begins', 'starting'], 'end': ['end', 'expiry', 'expire', 'expiration', 'until', 'to', 'ending', 'valid till', 'valid until'], 'renewal': ['renewal', 'renew', 'next renewal', 'due for renewal'], 'issue': ['issue', 'issued', 'date of issue', 'policy date'] } def __init__(self): self._compile_patterns() def _compile_patterns(self): """Compile regex patterns for date extraction.""" # DD-MM-YY or DD-MM-YYYY (with - or /) self.pattern_dmy = re.compile( r'\b(\d{1,2})[-/](\d{1,2})[-/](\d{2,4})\b' ) # YYYY-MM-DD (ISO format) self.pattern_iso = re.compile( r'\b(\d{4})[-/](\d{1,2})[-/](\d{1,2})\b' ) # Month DD, YYYY or DD Month YYYY month_names = '|'.join(self.MONTHS.keys()) self.pattern_month_name = re.compile( rf'\b(\d{{1,2}})\s*(?:st|nd|rd|th)?\s*({month_names})[,]?\s*(\d{{4}})\b|' rf'\b({month_names})\s*(\d{{1,2}})(?:st|nd|rd|th)?[,]?\s*(\d{{4}})\b', re.IGNORECASE ) def parse_date(self, date_str: str) -> Optional[datetime]: """ Parse a date string in various formats to datetime object. Args: date_str: Date string to parse Returns: datetime object or None if parsing fails """ if not date_str: return None date_str = str(date_str).strip() # Try ISO format first (YYYY-MM-DD) match = self.pattern_iso.search(date_str) if match: year, month, day = match.groups() try: return datetime(int(year), int(month), int(day)) except ValueError: pass # Try DMY format (DD-MM-YY or DD-MM-YYYY) match = self.pattern_dmy.search(date_str) if match: day, month, year = match.groups() year = int(year) # Handle 2-digit years if year < 100: year = 2000 + year if year < 50 else 1900 + year try: return datetime(year, int(month), int(day)) except ValueError: # Try swapping day/month for US format try: return datetime(year, int(day), int(month)) except ValueError: pass # Try month name format match = self.pattern_month_name.search(date_str) if match: groups = match.groups() if groups[0]: # DD Month YYYY format day, month_name, year = groups[0], groups[1], groups[2] else: # Month DD, YYYY format month_name, day, year = groups[3], groups[4], groups[5] month = self.MONTHS.get(month_name.lower()) if month: try: return datetime(int(year), month, int(day)) except ValueError: pass return None def extract_dates_from_text(self, text: str) -> List[Dict]: """ Extract all dates from text with their context. Args: text: Text to search for dates Returns: List of dicts with date info: [{"date": datetime, "context": "start/end/renewal/issue/unknown", "original": "01-01-2025", "position": 123}] """ if not text: return [] results = [] text_lower = text.lower() # Find all date matches all_matches = [] # DMY format for match in self.pattern_dmy.finditer(text): parsed = self.parse_date(match.group()) if parsed: all_matches.append({ 'date': parsed, 'original': match.group(), 'position': match.start() }) # ISO format for match in self.pattern_iso.finditer(text): parsed = self.parse_date(match.group()) if parsed: all_matches.append({ 'date': parsed, 'original': match.group(), 'position': match.start() }) # Month name format for match in self.pattern_month_name.finditer(text): parsed = self.parse_date(match.group()) if parsed: all_matches.append({ 'date': parsed, 'original': match.group(), 'position': match.start() }) # Determine context for each date for match in all_matches: pos = match['position'] # Look at surrounding text (100 chars before) context_start = max(0, pos - 100) context_text = text_lower[context_start:pos] date_type = 'unknown' for dtype, keywords in self.DATE_CONTEXTS.items(): if any(kw in context_text for kw in keywords): date_type = dtype break results.append({ 'date': match['date'], 'date_str': match['date'].strftime('%Y-%m-%d'), 'context': date_type, 'original': match['original'], 'position': pos }) # Remove duplicates based on date seen_dates = set() unique_results = [] for r in results: date_key = r['date_str'] if date_key not in seen_dates: seen_dates.add(date_key) unique_results.append(r) return unique_results def calculate_renewal_date(self, policy_start: datetime, term_months: int = 12) -> datetime: """ Calculate policy renewal date. Args: policy_start: Policy start date term_months: Policy term in months (default 12) Returns: Renewal date (policy_start + term_months) """ # Add months new_month = policy_start.month + term_months new_year = policy_start.year + (new_month - 1) // 12 new_month = ((new_month - 1) % 12) + 1 # Handle day overflow try: return datetime(new_year, new_month, policy_start.day) except ValueError: # Last day of month for dates like Jan 31 + 1 month if new_month == 12: next_month = datetime(new_year + 1, 1, 1) else: next_month = datetime(new_year, new_month + 1, 1) return next_month - timedelta(days=1) def is_date_in_range(self, date: datetime, year: int = None, before: datetime = None, after: datetime = None) -> bool: """ Check if date matches filter criteria. Args: date: Date to check year: Match specific year before: Date must be before this after: Date must be after this Returns: True if date matches all criteria """ if not date: return False if year and date.year != year: return False if before and date >= before: return False if after and date <= after: return False return True def get_year_from_query(self, query: str) -> Optional[int]: """Extract year from query like 'policies renewing in 2026'.""" match = re.search(r'\b(20\d{2})\b', query) if match: return int(match.group(1)) # Handle relative years current_year = datetime.now().year if 'this year' in query.lower(): return current_year if 'next year' in query.lower(): return current_year + 1 if 'last year' in query.lower(): return current_year - 1 return None # Singleton instance date_parser = DateParser()