import re
from datetime import datetime

# Month names and common abbreviations; patterns are compiled case-insensitively.
_MONTH = (
    r'(?:jan(?:uary)?|feb(?:ruary)?|mar(?:ch)?|apr(?:il)?|may|june?|july?'
    r'|aug(?:ust)?|sep(?:t(?:ember)?)?|oct(?:ober)?|nov(?:ember)?'
    r'|dec(?:ember)?)'
)
# A range separator: ASCII hyphen, en dash or em dash, with optional spaces.
_RANGE_SEP = r'\s*[-\u2013\u2014]\s*'
# Open-ended range end; the trailing \b rejects words such as "presently".
_OPEN_END = r'(?:present|current)\b'

# Ordered from most to least specific.  extract_dates() relies on this order
# so that a specific match suppresses the generic year-only match inside it.
_DATE_RANGE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # 2020/01 - 2021/06, 2020/01 - present
        r'(?<!\d)(\d{4}/\d{2})(?!\d)' + _RANGE_SEP
        + r'(\d{4}/\d{2}(?!\d)|' + _OPEN_END + r')',
        # January 2020 - March 2021, Jan 2020 - present
        r'\b(' + _MONTH + r'\s+\d{4})(?!\d)' + _RANGE_SEP
        + r'(' + _MONTH + r'\s+\d{4}(?!\d)|' + _OPEN_END + r')',
        # 2019 - 2021, 2019 - present
        r'(?<!\d)(\d{4})' + _RANGE_SEP
        + r'(\d{4}(?!\d)|' + _OPEN_END + r')',
    )
)

# Maps en/em dashes to a plain hyphen so date ranges survive preprocessing.
_DASH_TABLE = str.maketrans({'\u2013': '-', '\u2014': '-'})

# Everything replaced by a space during preprocessing: any character that is
# not a word character, whitespace, hyphen or slash, plus any slash that is
# not sitting between two digits.
_NOISE_RE = re.compile(r'[^\w\s/-]|(?<!\d)/|/(?!\d)')
_WHITESPACE_RE = re.compile(r'\s+')

_OPEN_ENDED_WORDS = ('present', 'current')
_DATE_FORMATS = ('%Y/%m', '%B %Y', '%b %Y', '%Y')


def preprocess_text(text):
    """Lower-case *text* and reduce it to words, hyphens and numeric dates.

    Punctuation is replaced by spaces and runs of whitespace are collapsed.
    Hyphens are kept, en/em dashes are converted to hyphens, and a slash is
    kept only between two digits (e.g. ``2020/01``), so that date ranges
    remain recognisable after preprocessing.

    Args:
        text: The raw input string.

    Returns:
        The normalised string with no leading or trailing whitespace.
    """
    text = text.lower().translate(_DASH_TABLE)
    text = _NOISE_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text.strip()


def _overlaps(span, taken):
    """Return True if the (start, end) *span* intersects any span in *taken*."""
    start, end = span
    return any(start < t_end and t_start < end for t_start, t_end in taken)


def extract_dates(text):
    """Find date ranges in *text*.

    Recognised forms (case-insensitive; separator is ``-``, an en dash or an
    em dash):

    * ``YYYY/MM - YYYY/MM``
    * ``Month YYYY - Month YYYY`` (full or abbreviated month names)
    * ``YYYY - YYYY``

    In every form the end of the range may be ``present`` or ``current``.

    Args:
        text: The string to search.

    Returns:
        A list of ``(start, end)`` string tuples exactly as they appear in
        *text*.  Results are grouped by pattern in the order listed above,
        and in text order within each pattern.  A stretch of text is reported
        at most once: the most specific pattern wins.
    """
    dates = []
    taken = []  # spans already claimed by a more specific pattern
    for pattern in _DATE_RANGE_PATTERNS:
        for match in pattern.finditer(text):
            span = match.span()
            # Without this check "jan 2020 - present" would also be reported
            # as ("2020", "present") by the year-only pattern.
            if _overlaps(span, taken):
                continue
            taken.append(span)
            dates.append((match.group(1), match.group(2)))
    return dates


def parse_date(date_str):
    """Convert one endpoint of a date range into a ``datetime``.

    Args:
        date_str: A string such as ``"2020/03"``, ``"March 2020"``,
            ``"Mar 2020"``, ``"2020"``, ``"present"`` or ``"current"``.
            Surrounding and repeated whitespace is ignored.

    Returns:
        ``datetime.now()`` for an empty/``None`` value or for
        ``present``/``current`` (any case); the parsed ``datetime`` when the
        string matches a supported format; ``None`` when it matches none of
        them or is not a string.
    """
    if not date_str:
        return datetime.now()
    if not isinstance(date_str, str):
        return None

    cleaned = ' '.join(date_str.split())
    if cleaned.lower() in _OPEN_ENDED_WORDS:
        return datetime.now()

    # "%b" only knows the three-letter form, so fold "Sept" into "Sep".
    cleaned = re.sub(r'^sept\b', 'sep', cleaned, flags=re.IGNORECASE)

    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(cleaned, fmt)
        except ValueError:
            continue
    return None