Spaces:
Running
Running
| """ | |
| Date Parser Service | |
| Handles parsing of various date formats commonly found in insurance documents. | |
| Supports: | |
| - 1-1-25, 01-01-2025, 1/1/25, 01/01/2025 | |
| - January 1, 2025, Jan 1, 2025, 1 January 2025 | |
| - 2025-01-01 (ISO format) | |
| - Date ranges and period calculations | |
| """ | |
| import re | |
| from datetime import datetime, timedelta | |
| from typing import Optional, List, Dict, Tuple | |
class DateParser:
    """Parse and normalize dates from various formats.

    Recognizes numeric day-month-year dates (``1-1-25``, ``01/01/2025``),
    ISO dates (``2025-01-01``) and month-name dates (``Jan 1, 2025``,
    ``1 January 2025``). Dates found in free text are labelled as
    start/end/renewal/issue according to the closest preceding keyword.
    """

    # Month name mappings (full names and common abbreviations, lowercase)
    MONTHS = {
        'january': 1, 'jan': 1,
        'february': 2, 'feb': 2,
        'march': 3, 'mar': 3,
        'april': 4, 'apr': 4,
        'may': 5,
        'june': 6, 'jun': 6,
        'july': 7, 'jul': 7,
        'august': 8, 'aug': 8,
        'september': 9, 'sep': 9, 'sept': 9,
        'october': 10, 'oct': 10,
        'november': 11, 'nov': 11,
        'december': 12, 'dec': 12
    }

    # Date context keywords for identifying date types
    DATE_CONTEXTS = {
        'start': ['start', 'commence', 'inception', 'effective', 'from', 'begins', 'starting'],
        'end': ['end', 'expiry', 'expire', 'expiration', 'until', 'to', 'ending', 'valid till', 'valid until'],
        'renewal': ['renewal', 'renew', 'next renewal', 'due for renewal'],
        'issue': ['issue', 'issued', 'date of issue', 'policy date']
    }

    # Number of characters before a date that are scanned for context keywords
    CONTEXT_WINDOW = 100

    def __init__(self):
        """Compile all regexes once so every later call reuses them."""
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile regex patterns for date extraction and context detection."""
        # DD-MM-YY or DD-MM-YYYY (with - or /). The year must be exactly
        # 2 or 4 digits; a 3-digit "year" is a typo, not a date.
        self.pattern_dmy = re.compile(
            r'\b(\d{1,2})[-/](\d{1,2})[-/](\d{4}|\d{2})\b'
        )

        # YYYY-MM-DD (ISO format)
        self.pattern_iso = re.compile(
            r'\b(\d{4})[-/](\d{1,2})[-/](\d{1,2})\b'
        )

        # DD Month YYYY (groups 1-3) or Month DD, YYYY (groups 4-6)
        month_names = '|'.join(self.MONTHS.keys())
        self.pattern_month_name = re.compile(
            rf'\b(\d{{1,2}})\s*(?:st|nd|rd|th)?\s*({month_names})[,]?\s*(\d{{4}})\b|'
            rf'\b({month_names})\s*(\d{{1,2}})(?:st|nd|rd|th)?[,]?\s*(\d{{4}})\b',
            re.IGNORECASE
        )

        # One whole-word pattern per context type. Word boundaries stop
        # short keywords such as "to" or "end" from matching inside
        # unrelated words ("customer", "endorsement"); the optional suffix
        # still accepts simple inflections ("expires", "commencement").
        self.context_patterns = {}
        for dtype, keywords in self.DATE_CONTEXTS.items():
            alternatives = '|'.join(
                r'\s+'.join(re.escape(word) for word in keyword.split())
                for keyword in sorted(keywords, key=len, reverse=True)
            )
            self.context_patterns[dtype] = re.compile(
                rf'\b(?:{alternatives})(?:s|d|ed|ing|ment)?\b',
                re.IGNORECASE
            )

    def parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse a date string in various formats to datetime object.

        Formats are tried in this order: ISO (YYYY-MM-DD), numeric
        day-month-year, then month-name. The first format that yields a
        valid calendar date wins.

        Args:
            date_str: Date string to parse (non-strings are converted with str())

        Returns:
            datetime object or None if parsing fails
        """
        if not date_str:
            return None
        date_str = str(date_str).strip()

        # Try ISO format first (YYYY-MM-DD)
        match = self.pattern_iso.search(date_str)
        if match:
            year, month, day = (int(part) for part in match.groups())
            try:
                return datetime(year, month, day)
            except ValueError:
                pass  # not a real calendar date; fall through to other formats

        # Try DMY format (DD-MM-YY or DD-MM-YYYY)
        match = self.pattern_dmy.search(date_str)
        if match:
            day, month, year = (int(part) for part in match.groups())
            # Handle 2-digit years: 00-49 -> 20xx, 50-99 -> 19xx
            if year < 100:
                year += 2000 if year < 50 else 1900
            # Day-first is preferred; if that is not a valid date, retry
            # with the fields swapped (US month-first format).
            for month_value, day_value in ((month, day), (day, month)):
                try:
                    return datetime(year, month_value, day_value)
                except ValueError:
                    continue

        # Try month name format
        match = self.pattern_month_name.search(date_str)
        if match:
            if match.group(1):  # DD Month YYYY format
                day, month_name, year = match.group(1, 2, 3)
            else:  # Month DD, YYYY format
                month_name, day, year = match.group(4, 5, 6)
            month = self.MONTHS.get(month_name.lower())
            if month:
                try:
                    return datetime(int(year), month, int(day))
                except ValueError:
                    pass

        return None

    def _nearest_context(self, text: str, start: int, end: int) -> str:
        """Return the context type whose keyword ends closest before ``end``.

        Only ``text[start:end]`` is searched. When two keywords end at the
        same offset the longer one wins. Returns 'unknown' if none match.
        """
        best_type = 'unknown'
        best_rank = (-1, -1)  # (end offset of keyword, keyword length)
        for dtype, pattern in self.context_patterns.items():
            # Passing start/end instead of slicing lets \b see the character
            # before the window, so a word cut by the window edge is not
            # mistaken for a keyword.
            for found in pattern.finditer(text, start, end):
                rank = (found.end(), found.end() - found.start())
                if rank > best_rank:
                    best_rank, best_type = rank, dtype
        return best_type

    def extract_dates_from_text(self, text: str) -> List[Dict]:
        """
        Extract all dates from text with their context.

        Each date is labelled using the keyword from DATE_CONTEXTS that
        appears closest before it (within CONTEXT_WINDOW characters).
        Results are ordered by position in the text; when the same calendar
        date occurs more than once only its first occurrence is returned.

        Args:
            text: Text to search for dates

        Returns:
            List of dicts with date info:
            [{"date": datetime, "date_str": "2025-01-01",
              "context": "start/end/renewal/issue/unknown",
              "original": "01-01-2025", "position": 123}]
        """
        if not text:
            return []

        # Collect every parseable match from all three pattern families
        candidates: List[Tuple[int, str, datetime]] = []
        for pattern in (self.pattern_dmy, self.pattern_iso, self.pattern_month_name):
            for match in pattern.finditer(text):
                parsed = self.parse_date(match.group())
                if parsed:
                    candidates.append((match.start(), match.group(), parsed))

        # Document order, so that de-duplication keeps the first occurrence
        candidates.sort(key=lambda candidate: candidate[0])

        seen_dates = set()
        unique_results = []
        for position, original, parsed in candidates:
            date_key = parsed.strftime('%Y-%m-%d')
            if date_key in seen_dates:
                continue
            seen_dates.add(date_key)
            window_start = max(0, position - self.CONTEXT_WINDOW)
            unique_results.append({
                'date': parsed,
                'date_str': date_key,
                'context': self._nearest_context(text, window_start, position),
                'original': original,
                'position': position
            })
        return unique_results

    def calculate_renewal_date(self, policy_start: datetime,
                               term_months: int = 12) -> datetime:
        """
        Calculate policy renewal date.

        If the start day does not exist in the target month (e.g. Jan 31
        plus one month), the last day of the target month is returned.
        The result carries only year, month and day.

        Args:
            policy_start: Policy start date
            term_months: Policy term in months (default 12)

        Returns:
            Renewal date (policy_start + term_months)
        """
        # Work with a zero-based month index so divmod handles year rollover
        extra_years, month_index = divmod(policy_start.month - 1 + term_months, 12)
        new_year = policy_start.year + extra_years
        new_month = month_index + 1

        try:
            return datetime(new_year, new_month, policy_start.day)
        except ValueError:
            # Day overflow: step back one day from the 1st of the following month
            if new_month == 12:
                first_of_next = datetime(new_year + 1, 1, 1)
            else:
                first_of_next = datetime(new_year, new_month + 1, 1)
            return first_of_next - timedelta(days=1)

    def is_date_in_range(self, date: datetime,
                         year: Optional[int] = None,
                         before: Optional[datetime] = None,
                         after: Optional[datetime] = None) -> bool:
        """
        Check if date matches filter criteria.

        Criteria left as None are ignored. Both bounds are exclusive.

        Args:
            date: Date to check
            year: Match specific year
            before: Date must be strictly before this
            after: Date must be strictly after this

        Returns:
            True if date matches all criteria, False otherwise
            (including when date is None)
        """
        if not date:
            return False
        if year and date.year != year:
            return False
        if before and date >= before:
            return False
        if after and date <= after:
            return False
        return True

    def get_year_from_query(self, query: str) -> Optional[int]:
        """Extract year from query like 'policies renewing in 2026'.

        An explicit four-digit year (2000-2099) takes precedence; otherwise
        the phrases 'this year', 'next year' and 'last year' are resolved
        against the current year.

        Args:
            query: Free-text query; None or empty yields None

        Returns:
            The year as an int, or None if the query names no year
        """
        if not query:
            return None

        match = re.search(r'\b(20\d{2})\b', query)
        if match:
            return int(match.group(1))

        # Handle relative years
        current_year = datetime.now().year
        lowered = query.lower()
        if 'this year' in lowered:
            return current_year
        if 'next year' in lowered:
            return current_year + 1
        if 'last year' in lowered:
            return current_year - 1
        return None
# Shared module-level parser instance; building it here compiles the
# date regexes a single time when the module is imported.
date_parser = DateParser()