File size: 2,286 Bytes
86b932c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
from datetime import datetime, timezone
import pandas as pd
import numpy as np

def _assume_utc_if_naive(moment):
    """Return *moment* tagged as UTC when it is a naive datetime-like value.

    Aware datetimes and non-datetime values are returned unchanged.
    ``pd.Timestamp`` is a ``datetime`` subclass, so it is covered as well.
    """
    if isinstance(moment, datetime) and moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment


def calculate_freshness(
    published_date, 
    has_date: bool, 
    is_inference: bool = False,
    reference_date: datetime = None
) -> float:
    """
    Calculate the temporal freshness score for a single article.
    
    Rules:
      - score = 1.0 if article is < 30 days old
      - score = max(0.1, 1 - (days_old / 365)) for older articles
      - score = 0.5 if has_date is False (neutral for training)
      - score = 0.35 if has_date is False AND called from inference
      
    A missing value in ``published_date`` (``None``/``NaN``/``NaT``) is
    treated the same way as ``has_date=False``. Dates in the future are
    clamped to zero days old and therefore score 1.0.
      
    Args:
        published_date: The published date of the article (datetime or NaT).
            A naive value is assumed to be UTC.
        has_date: Boolean flag indicating if a valid date is present.
        is_inference: Whether the scoring is happening during live inference.
        reference_date: The date to compute 'days_old' against (defaults to
            the current UTC time). A naive value is assumed to be UTC.
        
    Returns:
        Float score between 0.1 and 1.0.
    """
    if not has_date or pd.isna(published_date):
        return 0.35 if is_inference else 0.50
        
    if reference_date is None:
        reference_date = datetime.now(timezone.utc)
        
    # Normalize BOTH operands: Python refuses to subtract a naive datetime
    # from an aware one, so a naive reference_date must get the same
    # "assume UTC" treatment as a naive published_date.
    published_date = _assume_utc_if_naive(published_date)
    reference_date = _assume_utc_if_naive(reference_date)
            
    # Handle future dates gracefully (e.g., bad parsed data) by clamping at 0.
    days_old = max((reference_date - published_date).days, 0)
        
    if days_old < 30:
        return 1.0
        
    return max(0.1, 1.0 - (days_old / 365.0))

def apply_freshness_score(df: pd.DataFrame, is_inference: bool = False) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a ``freshness_score`` column added.

    Each row is scored with ``calculate_freshness`` using the row's
    ``published_date`` and ``has_date`` values. When a row has no
    ``has_date`` entry, the flag falls back to whether ``published_date``
    is non-null. A single reference timestamp is captured once so every
    row is measured against the same instant.

    Args:
        df: Input frame; it is not modified.
        is_inference: Forwarded to ``calculate_freshness`` for every row.

    Returns:
        A new DataFrame containing the extra ``freshness_score`` column.
    """
    now_utc = datetime.now(timezone.utc)

    def _score_row(row):
        # Scores one row; invoked by DataFrame.apply once per row (not vectorized).
        published = row.get("published_date")
        date_present = row.get("has_date", pd.notna(published))
        return calculate_freshness(published, date_present, is_inference, now_utc)

    scored = df.copy()
    scored["freshness_score"] = scored.apply(_score_row, axis=1)
    return scored