# notebooklm-fast/services/date_parser.py
# jashdoshi77
# feat: Add AI-powered query understanding with DeepSeek parsing
# 64deb3c
"""
Date Parser Service
Handles parsing of various date formats commonly found in insurance documents.
Supports:
- 1-1-25, 01-01-2025, 1/1/25, 01/01/2025
- January 1, 2025, Jan 1, 2025, 1 January 2025
- 2025-01-01 (ISO format)
- Date ranges and period calculations
"""
import re
from datetime import datetime, timedelta
from typing import Optional, List, Dict, Tuple
class DateParser:
    """Parse and normalize dates from various formats.

    Supported inputs are numeric day-month-year dates (``1-1-25``,
    ``01/01/2025``), ISO-style dates (``2025-01-01``) and dates written with
    a month name (``January 1, 2025``, ``1st Jan 2025``).  The class also
    extracts dates from free text together with a guessed role
    (start / end / renewal / issue) and offers a few date helpers.
    """

    # Month name mappings
    MONTHS = {
        'january': 1, 'jan': 1,
        'february': 2, 'feb': 2,
        'march': 3, 'mar': 3,
        'april': 4, 'apr': 4,
        'may': 5,
        'june': 6, 'jun': 6,
        'july': 7, 'jul': 7,
        'august': 8, 'aug': 8,
        'september': 9, 'sep': 9, 'sept': 9,
        'october': 10, 'oct': 10,
        'november': 11, 'nov': 11,
        'december': 12, 'dec': 12
    }

    # Date context keywords for identifying date types
    DATE_CONTEXTS = {
        'start': ['start', 'commence', 'inception', 'effective', 'from', 'begins', 'starting'],
        'end': ['end', 'expiry', 'expire', 'expiration', 'until', 'to', 'ending', 'valid till', 'valid until'],
        'renewal': ['renewal', 'renew', 'next renewal', 'due for renewal'],
        'issue': ['issue', 'issued', 'date of issue', 'policy date']
    }

    # How many characters before a date are searched for context keywords.
    CONTEXT_WINDOW = 100

    def __init__(self):
        self._compile_patterns()

    def _compile_patterns(self):
        """Compile regex patterns for date extraction and context detection."""
        # DD-MM-YY or DD-MM-YYYY (with - or /).  The year must be exactly two
        # or four digits; a three-digit "year" is not treated as a date.
        self.pattern_dmy = re.compile(
            r'\b(\d{1,2})[-/](\d{1,2})[-/](\d{4}|\d{2})\b'
        )
        # YYYY-MM-DD (ISO format)
        self.pattern_iso = re.compile(
            r'\b(\d{4})[-/](\d{1,2})[-/](\d{1,2})\b'
        )
        # Month DD, YYYY or DD Month YYYY
        month_names = '|'.join(self.MONTHS)
        self.pattern_month_name = re.compile(
            rf'\b(\d{{1,2}})\s*(?:st|nd|rd|th)?\s*({month_names})[,]?\s*(\d{{4}})\b|'
            rf'\b({month_names})\s*(\d{{1,2}})(?:st|nd|rd|th)?[,]?\s*(\d{{4}})\b',
            re.IGNORECASE
        )
        # One pattern per context type.  Keywords are matched as whole words
        # (so 'to' does not fire inside "motor" or "customer"), with an
        # optional simple inflection so "expires", "expired", "starts" or
        # "ended" are still recognised.  Multi-word keywords tolerate any
        # amount of whitespace between their words.
        self._context_patterns = {
            dtype: re.compile(
                r'\b(?:'
                + '|'.join(
                    r'\s+'.join(re.escape(word) for word in keyword.split())
                    for keyword in keywords
                )
                + r')(?:s|d|ed|es)?\b',
                re.IGNORECASE
            )
            for dtype, keywords in self.DATE_CONTEXTS.items()
        }

    @staticmethod
    def _build_date(year: int, month: int, day: int) -> Optional[datetime]:
        """Return ``datetime(year, month, day)`` or None if it is not a real date."""
        try:
            return datetime(year, month, day)
        except ValueError:
            return None

    def _parse_iso(self, text: str) -> Optional[datetime]:
        """Parse the first YYYY-MM-DD style date in ``text``, if any."""
        match = self.pattern_iso.search(text)
        if not match:
            return None
        year, month, day = match.groups()
        return self._build_date(int(year), int(month), int(day))

    def _parse_dmy(self, text: str) -> Optional[datetime]:
        """Parse the first DD-MM-YY / DD-MM-YYYY style date in ``text``, if any."""
        match = self.pattern_dmy.search(text)
        if not match:
            return None
        day, month, year = (int(part) for part in match.groups())
        # Handle 2-digit years: 00-49 -> 2000s, 50-99 -> 1900s
        if year < 100:
            year = 2000 + year if year < 50 else 1900 + year
        parsed = self._build_date(year, month, day)
        if parsed is None:
            # Day/month order did not produce a valid date; try the US
            # month/day order instead (e.g. 12/31/2025).
            parsed = self._build_date(year, day, month)
        return parsed

    def _parse_month_name(self, text: str) -> Optional[datetime]:
        """Parse the first date written with a month name in ``text``, if any."""
        match = self.pattern_month_name.search(text)
        if not match:
            return None
        groups = match.groups()
        if groups[0]:  # DD Month YYYY format
            day, month_name, year = groups[0], groups[1], groups[2]
        else:  # Month DD, YYYY format
            month_name, day, year = groups[3], groups[4], groups[5]
        month = self.MONTHS.get(month_name.lower())
        if not month:
            return None
        return self._build_date(int(year), month, int(day))

    def parse_date(self, date_str: str) -> Optional[datetime]:
        """
        Parse a date string in various formats to datetime object.

        Formats are tried in this order: ISO (YYYY-MM-DD), numeric
        day-month-year (falling back to month-day-year when the first reading
        is not a valid date), then month-name formats.  The first format that
        yields a valid calendar date wins.

        Args:
            date_str: Date string to parse (converted with ``str()`` first)

        Returns:
            datetime object or None if parsing fails
        """
        if not date_str:
            return None
        text = str(date_str).strip()
        for parser in (self._parse_iso, self._parse_dmy, self._parse_month_name):
            parsed = parser(text)
            if parsed is not None:
                return parsed
        return None

    def _detect_context(self, text: str, window_start: int, date_pos: int) -> str:
        """Return the context type of the keyword closest before ``date_pos``.

        Only ``text[window_start:date_pos]`` is searched.  The search uses the
        pattern's pos/endpos arguments instead of slicing the string, so that
        word-boundary checks at the window start still consider the
        character before the window.  Returns 'unknown' when no keyword
        is found.
        """
        best_type = 'unknown'
        best_end = -1
        for dtype, pattern in self._context_patterns.items():
            for hit in pattern.finditer(text, window_start, date_pos):
                # Strictly greater: on a tie the earlier-declared type is kept.
                if hit.end() > best_end:
                    best_type, best_end = dtype, hit.end()
        return best_type

    def extract_dates_from_text(self, text: str) -> List[Dict]:
        """
        Extract all dates from text with their context.

        Each date's context is decided by the context keyword that appears
        nearest before it (within ``CONTEXT_WINDOW`` characters).  Results
        are returned in order of appearance and de-duplicated by calendar
        date, keeping the earliest occurrence.

        Args:
            text: Text to search for dates

        Returns:
            List of dicts with date info:
            [{"date": datetime, "date_str": "2025-01-01",
              "context": "start/end/renewal/issue/unknown",
              "original": "01-01-2025", "position": 123}]
        """
        if not text:
            return []

        # Collect every candidate from all three patterns.
        candidates = []
        for pattern in (self.pattern_dmy, self.pattern_iso, self.pattern_month_name):
            for match in pattern.finditer(text):
                parsed = self.parse_date(match.group())
                if parsed is not None:
                    candidates.append((match.start(), match.group(), parsed))

        # Report dates in document order (the sort is stable, so matches at
        # the same position keep their pattern order).
        candidates.sort(key=lambda candidate: candidate[0])

        seen_dates = set()
        unique_results = []
        for position, original, parsed in candidates:
            date_key = parsed.strftime('%Y-%m-%d')
            if date_key in seen_dates:
                continue
            seen_dates.add(date_key)
            window_start = max(0, position - self.CONTEXT_WINDOW)
            unique_results.append({
                'date': parsed,
                'date_str': date_key,
                'context': self._detect_context(text, window_start, position),
                'original': original,
                'position': position
            })
        return unique_results

    def calculate_renewal_date(self, policy_start: datetime,
                               term_months: int = 12) -> datetime:
        """
        Calculate policy renewal date.

        The result is built from year, month and day only; any time-of-day
        on ``policy_start`` is not carried over.

        Args:
            policy_start: Policy start date
            term_months: Policy term in months (default 12)

        Returns:
            Renewal date (policy_start + term_months).  When the start day
            does not exist in the target month (e.g. Jan 31 + 1 month), the
            last day of the target month is returned.
        """
        # Work with a zero-based month index so divmod gives year carry and month.
        year_carry, month_index = divmod(policy_start.month - 1 + term_months, 12)
        new_year = policy_start.year + year_carry
        new_month = month_index + 1
        try:
            return datetime(new_year, new_month, policy_start.day)
        except ValueError:
            # Day overflow: step back one day from the first of the next month.
            if new_month == 12:
                first_of_next = datetime(new_year + 1, 1, 1)
            else:
                first_of_next = datetime(new_year, new_month + 1, 1)
            return first_of_next - timedelta(days=1)

    def is_date_in_range(self, date: Optional[datetime],
                         year: Optional[int] = None,
                         before: Optional[datetime] = None,
                         after: Optional[datetime] = None) -> bool:
        """
        Check if date matches filter criteria.

        Args:
            date: Date to check; a missing date never matches
            year: Match specific year (None or 0 disables this filter)
            before: Date must be strictly before this
            after: Date must be strictly after this

        Returns:
            True if date matches all criteria
        """
        if not date:
            return False
        if year and date.year != year:
            return False
        if before is not None and date >= before:
            return False
        if after is not None and date <= after:
            return False
        return True

    def get_year_from_query(self, query: str) -> Optional[int]:
        """Extract year from query like 'policies renewing in 2026'.

        An explicit four-digit year of the form 20xx takes precedence;
        otherwise the phrases 'this year', 'next year' and 'last year' are
        resolved against the current local year.

        Args:
            query: Free-text query; empty or None yields None

        Returns:
            The year as an int, or None if the query mentions no year
        """
        if not query:
            return None
        match = re.search(r'\b(20\d{2})\b', query)
        if match:
            return int(match.group(1))
        # Handle relative years
        lowered = query.lower()
        current_year = datetime.now().year
        relative_offsets = (('this year', 0), ('next year', 1), ('last year', -1))
        for phrase, offset in relative_offsets:
            if phrase in lowered:
                return current_year + offset
        return None
# Singleton instance: created once at import time (which also compiles the
# regex patterns via DateParser.__init__) so callers can share one parser.
date_parser = DateParser()