File size: 3,876 Bytes
897a32a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
be4faf2
897a32a
be4faf2
897a32a
be4faf2
 
 
 
 
 
 
897a32a
be4faf2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
dbfb079
 
 
 
be4faf2
 
dbfb079
be4faf2
dbfb079
be4faf2
 
dbfb079
897a32a
 
dbfb079
 
 
 
 
897a32a
be4faf2
897a32a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
"""
Date Parsing and Normalization Utility
Quality control for article publication dates.

Ensures all dates are in strict ISO-8601 UTC format for reliable sorting.
"""

from typing import Optional
from datetime import datetime, timezone
import re
from dateutil import parser as dateutil_parser
from dateutil.tz import tzutc


def parse_date_to_iso(date_str: str) -> str:
    """
    Parse a date string and convert it to strict ISO-8601 UTC.

    Handles:
    - ISO-8601: "2026-01-22T05:58:33Z"
    - RFC-822: "Mon, 22 Jan 2026 05:58:33 GMT"
    - Unix timestamps: "1737525513" (10-digit seconds) or
      "1737525513123" (13-digit milliseconds)
    - Any other absolute format understood by dateutil

    Relative phrases such as "2 hours ago" or "yesterday" are not
    supported; like any other unparsable input they fall back to the
    current time.

    Values without timezone information are assumed to be UTC.

    Args:
        date_str: The raw date text. Empty/None yields the current time.

    Returns: "2026-01-22T05:58:33.000Z" (always UTC, always millisecond
    precision, so results compare correctly as plain strings).
    """
    def _format_utc(moment: datetime) -> str:
        # Fixed millisecond precision keeps every result the same width.
        return moment.isoformat(timespec='milliseconds').replace('+00:00', 'Z')

    if not date_str:
        return _format_utc(datetime.now(timezone.utc))

    try:
        text = date_str.strip()

        # Only 10- and 13-digit strings are treated as epoch values; other
        # all-digit lengths (8, 12, 14) are compact calendar forms such as
        # YYYYMMDD and are left to dateutil.
        if re.fullmatch(r'[0-9]{10}', text):
            parsed_date = datetime.fromtimestamp(int(text), tz=timezone.utc)
        elif re.fullmatch(r'[0-9]{13}', text):
            # Split with integer arithmetic to avoid float rounding.
            seconds, millis = divmod(int(text), 1000)
            parsed_date = datetime.fromtimestamp(seconds, tz=timezone.utc)
            parsed_date = parsed_date.replace(microsecond=millis * 1000)
        else:
            # Try parsing with dateutil (handles most formats)
            parsed_date = dateutil_parser.parse(text)

            if parsed_date.tzinfo is not None:
                # Convert to UTC if timezone-aware
                parsed_date = parsed_date.astimezone(timezone.utc)
            else:
                # Assume UTC if no timezone
                parsed_date = parsed_date.replace(tzinfo=timezone.utc)

        return _format_utc(parsed_date)

    except Exception as e:
        # Deliberately broad: this function must never raise, so anything
        # unparsable (including non-string input) becomes the current time.
        print(f"⚠️  Date parsing failed for '{date_str}': {e}. Using current time.")
        return _format_utc(datetime.now(timezone.utc))


def normalize_article_date(article) -> dict:
    """
    Normalize the publishedAt field in an article.

    Accepts Pydantic models (v1 or v2), plain dicts, or anything that
    `dict()` can convert. The input is never mutated; a new dict is returned.

    The date is read from 'publishedAt', falling back to 'published_at':
    - datetime values are converted to UTC (naive values are assumed to
      already be UTC, matching how parse_date_to_iso treats naive strings)
    - str values are delegated to parse_date_to_iso
    - missing, empty, or any other type of value becomes the current time

    Args:
        article: Article model or dict

    Returns:
        dict with both 'publishedAt' and 'published_at' set to the same
        ISO-8601 UTC string
    """
    def _utc_iso(moment: datetime) -> str:
        # A datetime whose utcoffset() is None is naive; astimezone() would
        # interpret it in the host's local zone, so tag it as UTC instead.
        if moment.utcoffset() is None:
            moment = moment.replace(tzinfo=timezone.utc)
        else:
            moment = moment.astimezone(timezone.utc)
        # Fixed millisecond precision keeps every result the same width.
        return moment.isoformat(timespec='milliseconds').replace('+00:00', 'Z')

    if hasattr(article, 'model_dump'):
        # Pydantic v2 model
        article_dict = article.model_dump()
    elif hasattr(article, 'dict'):
        # Pydantic v1 model
        article_dict = article.dict()
    elif isinstance(article, dict):
        # Already a dict - copy to avoid mutating the caller's object
        article_dict = article.copy()
    else:
        # Unknown type - try to convert to dict
        article_dict = dict(article)

    # Check both key spellings; camelCase wins when both are present.
    published_at = article_dict.get('publishedAt') or article_dict.get('published_at')

    if isinstance(published_at, datetime):
        iso_date = _utc_iso(published_at)
    elif published_at and isinstance(published_at, str):
        iso_date = parse_date_to_iso(published_at)
    else:
        # Missing, empty, or of an unsupported type: use the current time.
        iso_date = _utc_iso(datetime.now(timezone.utc))

    # Set both keys to ensure compatibility
    article_dict['publishedAt'] = iso_date
    article_dict['published_at'] = iso_date

    return article_dict


def validate_date_format(date_str: str) -> bool:
    """
    Validate that a date string is in strict ISO-8601 UTC format.

    Expected format: "YYYY-MM-DDTHH:MM:SS.sssZ" or "YYYY-MM-DDTHH:MM:SSZ"

    The whole string must match (no trailing newline), only ASCII digits
    are accepted, and the fields must form a real calendar date and time.

    Args:
        date_str: The candidate string. Non-string or empty values are
            reported as invalid rather than raising.

    Returns: True if valid, False otherwise
    """
    if not isinstance(date_str, str) or not date_str:
        return False

    # [0-9] rather than \d: \d also matches non-ASCII Unicode digits.
    iso_pattern = (
        r'[0-9]{4}-[0-9]{2}-[0-9]{2}'
        r'T[0-9]{2}:[0-9]{2}:[0-9]{2}(\.[0-9]{3})?Z'
    )

    # fullmatch, not match + '$': '$' would also accept a trailing newline.
    if re.fullmatch(iso_pattern, date_str) is None:
        return False

    # The shape is right; now reject impossible values such as month 13.
    # The first 19 characters are always "YYYY-MM-DDTHH:MM:SS" at this point.
    try:
        datetime.strptime(date_str[:19], '%Y-%m-%dT%H:%M:%S')
    except ValueError:
        return False

    return True


# Public API: the names exported by a wildcard import of this module.
__all__ = [
    'parse_date_to_iso',
    'normalize_article_date',
    'validate_date_format'
]