File size: 1,229 Bytes
d052038
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import calendar
import re
from datetime import datetime

def preprocess_text(text):
    """Normalize raw text for matching.

    The text is lowercased, every character that is not a word character,
    whitespace, or a hyphen is replaced by a space, runs of whitespace are
    collapsed to a single space, and the ends are trimmed.
    """
    lowered = text.lower()
    # Hyphens survive this step so that ranges such as "2019-2020" stay intact.
    without_punctuation = re.sub(r'[^\w\s-]', ' ', lowered)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.strip()

def _month_name_pattern():
    """Return a regex alternation of full and abbreviated month names.

    The names come from the ``calendar`` module, so they follow the current
    locale.
    """
    names = {
        name
        for name in list(calendar.month_name) + list(calendar.month_abbr)
        if name  # index 0 of both sequences is an empty placeholder
    }
    # Longest first so a full name is preferred over its abbreviation.
    ordered = sorted(names, key=lambda name: (-len(name), name))
    return '|'.join(re.escape(name) for name in ordered)


def extract_dates(text):
    """Extract ``(start, end)`` date-range strings from *text*.

    Three range shapes are recognised, matched case-insensitively:

    * ``YYYY/MM - YYYY/MM``
    * ``<month name> YYYY - <month name> YYYY``
    * ``YYYY - YYYY``

    In every shape the end may also be the word ``present`` or ``current``.

    The month-name shape only accepts real month names, so an arbitrary word
    in front of a year (``"corp 2018 - present"``) is not mistaken for a
    month. A match that overlaps text already claimed by an earlier, more
    specific shape is skipped, so ``"jan 2019 - present"`` is reported once
    rather than a second time as ``("2019", "present")``.

    Args:
        text: The string to scan. Empty or ``None`` yields an empty list.

    Returns:
        A list of ``(start, end)`` string tuples, grouped by shape in the
        order listed above and, within a shape, in order of appearance.
    """
    if not text:
        return []

    month_year = r'\b(?:' + _month_name_pattern() + r')\s+\d{4}'
    date_patterns = [
        r'(\d{4}/\d{2})\s*-\s*(\d{4}/\d{2}|present|current)',
        '(' + month_year + r')\s*-\s*(' + month_year + '|present|current)',
        r'(\d{4})\s*-\s*(\d{4}|present|current)',
    ]

    dates = []
    claimed_spans = []
    for pattern in date_patterns:
        for match in re.finditer(pattern, text, re.IGNORECASE):
            start, end = match.span()
            # Skip ranges that reuse characters of an already accepted range.
            if any(start < hi and lo < end for lo, hi in claimed_spans):
                continue
            claimed_spans.append((start, end))
            dates.append((match.group(1), match.group(2)))
    return dates

def parse_date(date_str):
    """Parse a date string in one of the supported formats.

    Surrounding whitespace is ignored. The formats are tried in this order:
    ``YYYY/MM``, full month name plus year, abbreviated month name plus
    year, and a bare four-digit year.

    Args:
        date_str: The text to parse. An empty/``None`` value, or the words
            ``present`` / ``current`` in any letter case, stand for an
            open-ended range.

    Returns:
        The current local ``datetime`` for an open-ended value, the parsed
        ``datetime`` when a format matches, or ``None`` when no format
        matches.
    """
    if not date_str:
        return datetime.now()

    # Text cut out of a larger document often carries stray whitespace,
    # which strptime would otherwise reject.
    cleaned = date_str.strip()
    if not cleaned or cleaned.lower() in ('present', 'current'):
        return datetime.now()

    for fmt in ('%Y/%m', '%B %Y', '%b %Y', '%Y'):
        try:
            return datetime.strptime(cleaned, fmt)
        except (ValueError, TypeError):
            # ValueError: format mismatch; TypeError: non-text input.
            continue
    return None