# Minerv4 / time_signals.py
# Imaginethat's picture
# Upload 4 files
# 0e20813 verified
from __future__ import annotations
import re
from datetime import date
from typing import Dict, List, Optional, Sequence
# Holiday keyword -> calendar month number (1-12).
#
# detect_month_from_token() walks this mapping in insertion order and returns
# the month of the first keyword that occurs as a *substring* of the
# normalized token, so the ordering below (ascending month, then keyword) is
# significant and must be preserved.
HOLIDAY_MONTH_MAP: Dict[str, int] = {
    keyword: month
    for month, keywords in (
        (1, ("newyear", "newyears", "nye")),
        (2, ("valentine", "valentines")),
        (3, ("stpatrick",)),
        (4, ("easter",)),
        (5, ("mothersday", "memorial")),
        (6, ("juneteenth", "pride", "father")),
        (7, ("independence", "july4")),
        (9, ("labor",)),
        (10, ("halloween",)),
        (11, ("thanksgiving", "blackfriday", "cybermonday")),
        (12, ("christmas", "xmas", "hanukkah")),
    )
    for keyword in keywords
}
# Month name / abbreviation -> calendar month number (1-12).
#
# Looked up by exact match against the normalized token (see
# detect_month_from_token). Each inner tuple lists the accepted spellings for
# one month; its position in the outer tuple gives the month number.
MONTH_KEYWORDS: Dict[str, int] = {
    spelling: number
    for number, spellings in enumerate(
        (
            ("january", "jan"),
            ("february", "feb"),
            ("march", "mar"),
            ("april", "apr"),
            ("may",),
            ("june", "jun"),
            ("july", "jul"),
            ("august", "aug"),
            ("september", "sept", "sep"),
            ("october", "oct"),
            ("november", "nov"),
            ("december", "dec"),
        ),
        start=1,
    )
    for spelling in spellings
}
# Normalized tokens that each count as one "seasonal" hit in
# compute_time_scores(); membership is tested by exact match.
SEASON_TERMS = {
    # Calendar seasons.
    "spring", "summer", "fall", "autumn", "winter",
} | {
    # Recurring school-calendar events.
    "backtoschool", "graduation",
}
# Normalized tokens that each count as one "viral" hit in
# compute_time_scores(); membership is tested by exact match.
VIRAL_TOKENS = {
    "fyp", "foryou", "foryoupage",
    "viral", "trending", "trend",
    "xyzbca",
}
# Matches every run of characters that is not a lowercase ASCII letter or digit.
ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_token(token: str) -> str:
    """Return *token* lowercased with everything but ``a-z`` / ``0-9`` removed.

    Falsy input (``None``, ``""``) yields the empty string. Because the
    pattern is ASCII-only, accented and other non-ASCII characters are
    dropped rather than transliterated.
    """
    if not token:
        return ""
    lowered = token.lower()
    return ALNUM_RE.sub("", lowered)
def parse_created_month(created_date: Optional[str]) -> Optional[int]:
    """Extract the month number from a ``YYYY-MM[-DD][T.../ ...]`` style value.

    The value is stringified, any time portion after a ``T`` or a space is
    discarded, and the second ``-``-separated field is read as the month.

    Returns:
        The month as an int in ``1..12``, or ``None`` when the input is
        falsy, has no second field, is not numeric, or is out of range.
        Parsing is best-effort: any exception results in ``None``.
    """
    if not created_date:
        return None
    try:
        text = str(created_date)
        date_part = text.split("T", 1)[0].split(" ", 1)[0]
        fields = date_part.split("-", 2)
        if len(fields) < 2:
            return None
        month = int(fields[1])
    except Exception:
        # Deliberate best-effort: malformed dates simply carry no month signal.
        return None
    return month if 1 <= month <= 12 else None
def detect_month_from_token(token: str) -> Optional[int]:
    """Map a token to the calendar month it refers to, if any.

    The token is normalized first. An exact match against ``MONTH_KEYWORDS``
    wins; otherwise the first ``HOLIDAY_MONTH_MAP`` keyword (in insertion
    order) that occurs anywhere inside the token decides the month.

    Note that holiday matching is by substring containment, so a keyword
    such as ``"easter"`` also matches a longer token like ``"eastern"``.

    Returns:
        The month number, or ``None`` when the token is empty after
        normalization or matches nothing.
    """
    normalized = normalize_token(token)
    if not normalized:
        return None

    exact_month = MONTH_KEYWORDS.get(normalized)
    if exact_month is not None:
        return exact_month

    holiday_months = (
        month
        for keyword, month in HOLIDAY_MONTH_MAP.items()
        if keyword in normalized
    )
    return next(holiday_months, None)
def _squash_hits(hits: int, *, base: float = 0.35, step: float = 0.15) -> float:
if hits <= 0:
return 0.0
return float(min(1.0, base + step * (hits - 1)))
def compute_time_scores(
    tokens: Sequence[str],
    created_date: Optional[str],
    label_order: Sequence[str],
) -> List[float]:
    """
    Derived time scores (not lexicon-mined).

    Supports the default System 7.1 time labels: ['seasonal', 'viral'].

    Each token is normalized and counted as:
      * a viral hit when it is in ``VIRAL_TOKENS``;
      * a seasonal hit when it is in ``SEASON_TERMS``;
      * a seasonal hit when ``detect_month_from_token`` maps it to a month
        (that month is also tallied).

    Hit counts are turned into scores with ``_squash_hits``. The seasonal
    score gets a +0.05 bonus (capped at 1.0) when the month parsed from
    ``created_date`` is among the most-mentioned months in the tokens.

    Args:
        tokens: Tokens to inspect; ``None``/empty is treated as no tokens.
            Each item is passed through ``str()`` before normalization.
        created_date: Creation date in a form ``parse_created_month``
            understands, or ``None``.
        label_order: Labels defining the order of the returned scores.

    Returns:
        One float per entry of ``label_order``; labels other than
        ``'seasonal'`` and ``'viral'`` score ``0.0``.
    """
    seasonal_hits = 0
    viral_hits = 0
    token_month_hits: Dict[int, int] = {}

    for tok in tokens or []:
        norm = normalize_token(str(tok))
        if not norm:
            continue
        if norm in VIRAL_TOKENS:
            viral_hits += 1
        if norm in SEASON_TERMS:
            seasonal_hits += 1
        month = detect_month_from_token(norm)
        if month is not None:
            seasonal_hits += 1
            token_month_hits[month] = token_month_hits.get(month, 0) + 1

    seasonal = _squash_hits(seasonal_hits)
    viral = _squash_hits(viral_hits)

    created_month = parse_created_month(created_date)
    if created_month and token_month_hits:
        # Compare against the maximum hit count rather than picking a single
        # "dominant" month with max(): max() returns the first maximal entry,
        # which made the bonus depend on token order whenever months tied.
        top_hits = max(token_month_hits.values())
        created_is_dominant = token_month_hits.get(created_month, 0) == top_hits
        if created_is_dominant and seasonal > 0:
            seasonal = float(min(1.0, seasonal + 0.05))

    by_label = {"seasonal": seasonal, "viral": viral}
    return [float(by_label.get(label, 0.0)) for label in label_order]