from datetime import datetime, timezone
from typing import Optional

import numpy as np
import pandas as pd

# Scoring parameters, named so the rules in the docstrings map onto the code.
_FRESH_WINDOW_DAYS = 30  # articles younger than this score 1.0
_DECAY_PERIOD_DAYS = 365.0  # linear decay denominator for older articles
_MIN_SCORE = 0.1  # floor for dated articles, however old
_NO_DATE_TRAINING_SCORE = 0.50  # neutral score when the date is missing
_NO_DATE_INFERENCE_SCORE = 0.35  # missing-date score during live inference


def _assume_utc(moment):
    """Return *moment* tagged as UTC when it is a tz-naive datetime.

    ``pd.Timestamp`` is a ``datetime`` subclass, so it is covered as well.
    Anything that is not a datetime, or is already tz-aware, is returned
    unchanged.
    """
    if isinstance(moment, datetime) and moment.tzinfo is None:
        return moment.replace(tzinfo=timezone.utc)
    return moment


def calculate_freshness(
    published_date,
    has_date: bool,
    is_inference: bool = False,
    reference_date: Optional[datetime] = None,
) -> float:
    """Calculate the temporal freshness score for a single article.

    Rules:
    - score = 1.0 if the article is < 30 days old (future dates count as
      0 days old)
    - score = max(0.1, 1 - (days_old / 365)) for older articles
    - score = 0.5 if the date is missing (neutral for training)
    - score = 0.35 if the date is missing AND called from inference

    The date counts as missing when ``has_date`` is falsy or when
    ``published_date`` is null (``None`` / ``NaN`` / ``NaT``).

    Args:
        published_date: The published date of the article (datetime,
            ``pd.Timestamp`` or a null value). A tz-naive value is assumed
            to be UTC.
        has_date: Flag indicating whether a valid date is present. If it is
            ``pd.NA`` (e.g. from a nullable boolean column), date presence
            is inferred from ``published_date`` instead.
        is_inference: Whether the scoring is happening during live
            inference.
        reference_date: The date to compute ``days_old`` against (defaults
            to the current UTC time). A tz-naive value is assumed to be UTC.

    Returns:
        Float score between 0.1 and 1.0.

    Raises:
        TypeError: If ``published_date`` is non-null but cannot be
            subtracted from a datetime (for example a plain string).
    """
    # `not pd.NA` raises, so resolve an unknown flag from the date itself.
    if has_date is pd.NA:
        has_date = pd.notna(published_date)

    if not has_date or pd.isna(published_date):
        return _NO_DATE_INFERENCE_SCORE if is_inference else _NO_DATE_TRAINING_SCORE

    if reference_date is None:
        reference_date = datetime.now(timezone.utc)

    # Both operands must be tz-aware: Python refuses to subtract a naive
    # datetime from an aware one (and vice versa).
    reference_date = _assume_utc(reference_date)
    published_date = _assume_utc(published_date)

    # Clamp at zero so future dates (e.g. badly parsed data) count as fresh.
    days_old = max(0, (reference_date - published_date).days)

    if days_old < _FRESH_WINDOW_DAYS:
        return 1.0

    return max(_MIN_SCORE, 1.0 - (days_old / _DECAY_PERIOD_DAYS))


def apply_freshness_score(
    df: pd.DataFrame,
    is_inference: bool = False,
    reference_date: Optional[datetime] = None,
) -> pd.DataFrame:
    """Apply freshness scoring to a DataFrame.

    Each row is scored with :func:`calculate_freshness` using its
    ``published_date`` value. When the frame has no ``has_date`` column, a
    row is treated as dated if its ``published_date`` is non-null.

    Args:
        df: Input frame; it is not modified.
        is_inference: Forwarded to :func:`calculate_freshness`.
        reference_date: Date every row is aged against. Defaults to the
            current UTC time, captured once so all rows share it.

    Returns:
        A copy of ``df`` with an added ``freshness_score`` column.
    """
    scored = df.copy()
    ref_date = datetime.now(timezone.utc) if reference_date is None else reference_date

    def _score_row(row: pd.Series) -> float:
        published = row.get("published_date")
        return calculate_freshness(
            published,
            row.get("has_date", pd.notna(published)),
            is_inference,
            ref_date,
        )

    # Row-wise apply: this calls the scalar scorer once per row (it is not
    # a vectorised computation).
    scored["freshness_score"] = scored.apply(_score_row, axis=1)

    return scored