Spaces:

Imaginethat
/

Minerv4

Paused

App Files Files Community

Minerv4 / time_signals.py

Imaginethat

Upload 4 files

0e20813 verified 5 months ago

raw

history blame contribute delete

3.52 kB

	from __future__ import annotations

	import re
	from datetime import date
	from typing import Dict, List, Optional, Sequence


	# Holiday keywords grouped by the calendar month (1-12) they map to.
	# Keywords are lowercase and alphanumeric-only. Groups are listed in
	# calendar order so the resulting dict keeps a stable insertion order for
	# code that scans its items and stops at the first match.
	_HOLIDAYS_BY_MONTH = (
	    (1, ("newyear", "newyears", "nye")),
	    (2, ("valentine", "valentines")),
	    (3, ("stpatrick",)),
	    (4, ("easter",)),
	    (5, ("mothersday", "memorial")),
	    (6, ("juneteenth", "pride", "father")),
	    (7, ("independence", "july4")),
	    (9, ("labor",)),
	    (10, ("halloween",)),
	    (11, ("thanksgiving", "blackfriday", "cybermonday")),
	    (12, ("christmas", "xmas", "hanukkah")),
	)

	# Flat lookup: holiday keyword -> month number.
	HOLIDAY_MONTH_MAP: Dict[str, int] = {
	    keyword: month
	    for month, keywords in _HOLIDAYS_BY_MONTH
	    for keyword in keywords
	}

	# Accepted spellings for each month, January first. The position in this
	# tuple (1-based) is the month number; the full name precedes abbreviations.
	_MONTH_ALIASES = (
	    ("january", "jan"),
	    ("february", "feb"),
	    ("march", "mar"),
	    ("april", "apr"),
	    ("may",),
	    ("june", "jun"),
	    ("july", "jul"),
	    ("august", "aug"),
	    ("september", "sept", "sep"),
	    ("october", "oct"),
	    ("november", "nov"),
	    ("december", "dec"),
	)

	# Flat lookup: month name or abbreviation -> month number (1-12).
	MONTH_KEYWORDS: Dict[str, int] = {
	    alias: number
	    for number, aliases in enumerate(_MONTH_ALIASES, start=1)
	    for alias in aliases
	}

	# Season and school-calendar keywords (lowercase, alphanumeric-only).
	SEASON_TERMS = {
	    "spring", "summer", "fall", "autumn", "winter",  # the four seasons
	    "backtoschool", "graduation",  # school-calendar events
	}

	# Keywords treated as virality markers (lowercase, alphanumeric-only).
	VIRAL_TOKENS = {
	    "fyp", "foryou", "foryoupage",
	    "viral", "trending", "trend",
	    "xyzbca",
	}

	# Despite the name, this matches runs of characters that are NOT lowercase
	# ASCII letters or digits; substituting "" for each match leaves only
	# [a-z0-9]. Uppercase letters match too, so lowercase the text first.
	ALNUM_RE = re.compile(r"[^a-z0-9]+")


	def normalize_token(token: str) -> str:
	    """Return *token* lowercased with every character outside ``[a-z0-9]`` removed.

	    Falsy input (``None`` or ``""``) yields the empty string.
	    """
	    lowered = token.lower() if token else ""
	    return ALNUM_RE.sub("", lowered)


	def parse_created_month(created_date: Optional[str]) -> Optional[int]:
	    """Extract the month number from a ``YYYY-MM-...`` style date string.

	    Only the date part is inspected: everything from the first ``T`` or
	    space onward (a time component) is discarded, and the second
	    ``-``-separated field is read as the month. Surrounding whitespace is
	    ignored.

	    Args:
	        created_date: Date or timestamp text such as ``"2024-03-15"``,
	            ``"2024-03-15T10:00:00Z"`` or ``"2024-03-15 10:00:00"``.
	            Non-string values are converted with ``str()`` first.

	    Returns:
	        The month as an int in ``1..12``, or ``None`` when the input is
	        empty, has no month field, or the field is not a valid month.
	    """
	    if not created_date:
	        return None

	    # Strip first: a leading space would otherwise make the split on " "
	    # below return an empty date part and discard an otherwise valid date.
	    text = str(created_date).strip()
	    date_part = text.split("T", 1)[0].split(" ", 1)[0]
	    fields = date_part.split("-", 2)
	    if len(fields) < 2:
	        return None

	    try:
	        month = int(fields[1])
	    except ValueError:
	        # Non-numeric month field: report "no usable month", not an error.
	        return None
	    return month if 1 <= month <= 12 else None


	def detect_month_from_token(token: str) -> Optional[int]:
	    """Map a single token to a month number, or ``None`` if it names none.

	    The token is normalized first. An exact match against ``MONTH_KEYWORDS``
	    wins; otherwise the first ``HOLIDAY_MONTH_MAP`` key (in the map's
	    iteration order) that occurs anywhere inside the token decides the month.

	    NOTE(review): the holiday check is a substring test, so a token such as
	    "eastern" matches "easter" -- confirm this looseness is intended.
	    """
	    cleaned = normalize_token(token)
	    if not cleaned:
	        return None

	    exact = MONTH_KEYWORDS.get(cleaned)
	    if exact is not None:
	        return exact

	    return next(
	        (
	            month
	            for keyword, month in HOLIDAY_MONTH_MAP.items()
	            if keyword in cleaned
	        ),
	        None,
	    )


	def _squash_hits(hits: int, *, base: float = 0.35, step: float = 0.15) -> float:
	if hits <= 0:
	return 0.0
	return float(min(1.0, base + step * (hits - 1)))


	def compute_time_scores(
	    tokens: Sequence[str],
	    created_date: Optional[str],
	    label_order: Sequence[str],
	) -> List[float]:
	    """
	    Derived time scores (not lexicon-mined).

	    Supports the default System 7.1 time labels: ['seasonal', 'viral'].

	    Each token is normalized; tokens that normalize to "" are ignored.
	    ``viral`` counts tokens in ``VIRAL_TOKENS``. ``seasonal`` counts tokens
	    in ``SEASON_TERMS`` plus tokens that resolve to a month through
	    ``detect_month_from_token``. Both counts go through ``_squash_hits``.
	    If the month named most often by the tokens equals the month parsed
	    from ``created_date``, ``seasonal`` gets a +0.05 bonus (capped at 1.0).

	    Returns one float per entry of ``label_order``, in that order; labels
	    other than 'seasonal' and 'viral' score 0.0.
	    """
	    # Normalize once up front and drop tokens that normalize to nothing.
	    normalized = [
	        norm for norm in (normalize_token(str(raw)) for raw in tokens or []) if norm
	    ]

	    viral_count = sum(1 for norm in normalized if norm in VIRAL_TOKENS)
	    season_count = sum(1 for norm in normalized if norm in SEASON_TERMS)

	    # Tally which months the tokens point at, in first-seen order.
	    month_counts: Dict[int, int] = {}
	    for norm in normalized:
	        month = detect_month_from_token(norm)
	        if month is None:
	            continue
	        month_counts[month] = month_counts.get(month, 0) + 1
	    # Every month-bearing token also counts as a seasonal hit.
	    season_count += sum(month_counts.values())

	    scores = {
	        "seasonal": _squash_hits(season_count),
	        "viral": _squash_hits(viral_count),
	    }

	    posted_month = parse_created_month(created_date)
	    if posted_month and month_counts:
	        # Ties go to the month that was seen first.
	        top_month = max(month_counts, key=month_counts.get)
	        if top_month == posted_month and scores["seasonal"] > 0:
	            scores["seasonal"] = float(min(1.0, scores["seasonal"] + 0.05))

	    return [float(scores.get(label, 0.0)) for label in label_order]