Spaces:
Sleeping
Sleeping
File size: 2,286 Bytes
86b932c | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | from datetime import datetime, timezone
import pandas as pd
import numpy as np
def _as_utc_aware(value):
    """Return *value* as a timezone-aware datetime, treating naive values as UTC.

    ``datetime`` instances (``pd.Timestamp`` is a subclass) are used as-is.
    ISO-like strings, ``numpy.datetime64`` and ``datetime.date`` values are
    coerced through ``pd.Timestamp``. Anything else raises ``TypeError``;
    in particular bare numbers are rejected because ``pd.Timestamp`` would
    silently interpret them as nanoseconds since the epoch.

    Raises:
        TypeError: If the value is of an unsupported type.
        ValueError: If a string cannot be parsed as a date.
    """
    if not isinstance(value, datetime):
        from datetime import date  # local: the module only imports datetime/timezone

        if not isinstance(value, (str, np.datetime64, date)):
            raise TypeError(
                f"cannot interpret {type(value).__name__} as a date"
            )
        value = pd.Timestamp(value)
    if value.tzinfo is None:
        # Assume UTC for naive values, typical for web dates.
        value = value.replace(tzinfo=timezone.utc)
    return value


def calculate_freshness(
    published_date,
    has_date: bool,
    is_inference: bool = False,
    reference_date: datetime = None
) -> float:
    """
    Calculate the temporal freshness score for a single article.

    Rules:
    - score = 1.0 if the article is < 30 days old (future dates count as 0 days)
    - score = max(0.1, 1 - (days_old / 365)) for older articles
    - score = 0.5 if no usable date is available (neutral for training)
    - score = 0.35 if no usable date is available AND called from inference

    A date is "not usable" when has_date is falsy, when published_date is
    null (None / NaN / NaT), or when it cannot be interpreted as a date.

    Args:
        published_date: The published date of the article. Accepts datetime /
            pd.Timestamp, and also ISO-like strings, numpy.datetime64 and
            datetime.date values. Naive values are assumed to be UTC.
        has_date: Boolean flag indicating if a valid date is present.
        is_inference: Whether the scoring is happening during live inference.
        reference_date: The date to compute 'days_old' against (defaults to
            the current UTC time). Naive values are assumed to be UTC.

    Returns:
        Float score between 0.1 and 1.0.

    Raises:
        TypeError / ValueError: If reference_date is given but cannot be
            interpreted as a date.
    """
    missing_score = 0.35 if is_inference else 0.50
    if not has_date or pd.isna(published_date):
        return missing_score

    try:
        published = _as_utc_aware(published_date)
    except (TypeError, ValueError):
        # An uninterpretable date is scored like a missing one rather than
        # aborting the whole scoring run.
        return missing_score
    if pd.isna(published):
        # e.g. the literal string "NaT" parses to pd.NaT.
        return missing_score

    if reference_date is None:
        reference = datetime.now(timezone.utc)
    else:
        # Normalise the reference too: subtracting a naive datetime from an
        # aware one raises TypeError.
        reference = _as_utc_aware(reference_date)

    # Clamp future dates (e.g. badly parsed data) to "published today".
    days_old = max((reference - published).days, 0)
    if days_old < 30:
        return 1.0
    return max(0.1, 1.0 - (days_old / 365.0))
def apply_freshness_score(df: pd.DataFrame, is_inference: bool = False) -> pd.DataFrame:
    """
    Return a copy of ``df`` with a "freshness_score" column added.

    Each row is scored with ``calculate_freshness`` using its
    "published_date" value. The row's "has_date" value is used when that
    column exists; otherwise the flag is derived from whether
    "published_date" is non-null. Every row is scored against the same
    reference time, captured once per call. The input frame is not modified.

    Args:
        df: Frame holding the articles to score.
        is_inference: Forwarded to ``calculate_freshness`` for every row.

    Returns:
        The copied frame including the "freshness_score" column.
    """
    scored = df.copy()
    now_utc = datetime.now(timezone.utc)

    def _score_row(row):
        published = row.get("published_date")
        date_present = row.get("has_date", pd.notna(published))
        return calculate_freshness(published, date_present, is_inference, now_utc)

    # Row-wise apply: the scorer is invoked once per row.
    scored["freshness_score"] = scored.apply(_score_row, axis=1)
    return scored
|