Spaces:

Imaginethat
/

Minerv4

Paused

App Files Files Community

Imaginethat committed on Dec 15, 2025

Commit

0e20813

verified ·

1 Parent(s): 663f82a

Upload 4 files

Browse files

Files changed (4) hide show

__init__.py +4 -0
labels.py +35 -0
sys7_miner_2.py +152 -0
time_signals.py +157 -0

__init__.py ADDED Viewed

	@@ -0,0 +1,4 @@

+from .text import fuse_text, tokenize  # noqa: F401
+from .lexicon import load_json, prepare_slang_map, orient_lexicons, compute_raw_scores  # noqa: F401
+from .labels import dominant_label_gated, UNKNOWN_LABEL  # noqa: F401
+from .time_signals import compute_time_scores  # noqa: F401

labels.py ADDED Viewed

	@@ -0,0 +1,35 @@

+from __future__ import annotations
+from typing import Sequence, Tuple
+import numpy as np
+UNKNOWN_LABEL = "Unknown"
+def dominant_label_gated(
+    scores: Sequence[float] | None,
+    labels: Sequence[str] | None,
+    min_score: float,
+    *,
+    unknown_label: str = UNKNOWN_LABEL,
+) -> Tuple[str, float]:
+    """
+    Return the dominant label only if it's confidently supported.
+    Rule:
+      - If max(score) >= min_score: return (label, max_score)
+      - Else: return (unknown_label, max_score)
+    """
+    if not scores or not labels:
+        return unknown_label, 0.0
+    arr = np.asarray(list(scores), dtype=float)
+    if arr.size == 0:
+        return unknown_label, 0.0
+    idx = int(arr.argmax())
+    max_score = float(arr[idx])
+    if idx >= len(labels) or max_score < float(min_score):
+        return unknown_label, max_score
+    return str(labels[idx]), max_score

sys7_miner_2.py CHANGED Viewed

@@ -61,6 +61,147 @@ WHITESPACE_RE = re.compile(r"\s+")
 HASHTAG_SPLIT_RE = re.compile(r"[A-Z]?[a-z]+|[0-9]+")
 TOKENIZER_RE = re.compile(r"[a-z0-9']+")
 @dataclass
 class MinerConfig:
@@ -479,6 +620,17 @@ def process_chunk(
                     created_date = str(pd.to_datetime(created_raw)).split(" ")[0]
                 except Exception:
                     created_date = None
         video_id = coerce_id(first_present(row, "video_id", "aweme_id", "id"))
         author_id = coerce_id(first_present(row, "author_id", "user_id"))
         record: Dict[str, object] = {

 HASHTAG_SPLIT_RE = re.compile(r"[A-Z]?[a-z]+|[0-9]+")
 TOKENIZER_RE = re.compile(r"[a-z0-9']+")
# Matches every run of characters that is not a lowercase ASCII letter or digit.
TIME_ALNUM_RE = re.compile(r"[^a-z0-9]+")

# Holiday keyword -> calendar month (1-12). Insertion order is significant:
# the holiday lookup scans this mapping in order and returns the first
# keyword found inside a token.
TIME_HOLIDAY_MONTH_MAP: Dict[str, int] = {
    keyword: month
    for month, keywords in (
        (1, ("newyear", "newyears", "nye")),
        (2, ("valentine", "valentines")),
        (3, ("stpatrick",)),
        (4, ("easter",)),
        (5, ("mothersday", "memorial")),
        (6, ("juneteenth", "pride", "father")),
        (7, ("independence", "july4")),
        (9, ("labor",)),
        (10, ("halloween",)),
        (11, ("thanksgiving", "blackfriday", "cybermonday")),
        (12, ("christmas", "xmas", "hanukkah")),
    )
    for keyword in keywords
}

# Month names and abbreviations -> calendar month (1-12); matched exactly.
TIME_MONTH_KEYWORDS: Dict[str, int] = {
    keyword: month
    for month, keywords in (
        (1, ("january", "jan")),
        (2, ("february", "feb")),
        (3, ("march", "mar")),
        (4, ("april", "apr")),
        (5, ("may",)),
        (6, ("june", "jun")),
        (7, ("july", "jul")),
        (8, ("august", "aug")),
        (9, ("september", "sept", "sep")),
        (10, ("october", "oct")),
        (11, ("november", "nov")),
        (12, ("december", "dec")),
    )
    for keyword in keywords
}

# Tokens counted as seasonal hits without implying a specific month.
TIME_SEASON_TERMS = {
    "spring", "summer", "fall", "autumn", "winter",
    "backtoschool", "graduation",
}

# Tokens counted as "viral" hits.
TIME_VIRAL_TOKENS = {
    "fyp", "foryou", "foryoupage",
    "viral", "trending", "trend",
    "xyzbca",
}
def _time_normalize_token(token: str) -> str:
    """Lowercase *token* and strip everything except ``a-z`` and ``0-9``.

    A falsy token (``None`` or ``""``) normalizes to the empty string.
    """
    lowered = (token or "").lower()
    return TIME_ALNUM_RE.sub("", lowered)
+def _time_parse_created_month(created_date: Optional[str]) -> Optional[int]:
+    if not created_date:
+        return None
+    try:
+        parts = str(created_date).split("T", 1)[0].split(" ", 1)[0].split("-", 2)
+        if len(parts) >= 2:
+            m = int(parts[1])
+            if 1 <= m <= 12:
+                return m
+    except Exception:
+        return None
+    return None
def _time_detect_month_from_token(token: str) -> Optional[int]:
    """Map a token to a calendar month (1-12), or ``None`` if it implies none.

    The token is normalized first. An exact month-name match takes priority.
    Otherwise the first holiday keyword (in mapping order) contained anywhere
    in the token decides the month.

    Holiday matching is by substring, so longer words that merely contain a
    keyword also match (e.g. "grandfather" contains "father").
    """
    cleaned = _time_normalize_token(token)
    if not cleaned:
        return None
    try:
        return TIME_MONTH_KEYWORDS[cleaned]
    except KeyError:
        pass
    return next(
        (month for holiday, month in TIME_HOLIDAY_MONTH_MAP.items() if holiday in cleaned),
        None,
    )
+def _time_squash_hits(hits: int, *, base: float = 0.35, step: float = 0.15) -> float:
+    if hits <= 0:
+        return 0.0
+    return float(min(1.0, base + step * (hits - 1)))
def compute_time_scores_derived(tokens: Sequence[str], created_date: Optional[str], label_order: Sequence[str]) -> List[float]:
    """Derive "seasonal" and "viral" scores from tokens and the creation date.

    Each token is normalized and then counted as:
      - a viral hit when it is in ``TIME_VIRAL_TOKENS``;
      - a seasonal hit when it is in ``TIME_SEASON_TERMS``;
      - a seasonal hit plus a vote for a month when a month can be detected.

    Hit counts are squashed into scores by ``_time_squash_hits``. When the
    most-voted month equals the month parsed from ``created_date``, the
    seasonal score gets a +0.05 bonus (capped at 1.0).

    Returns one float per entry of ``label_order``; labels other than
    "seasonal" and "viral" score 0.0.
    """
    month_votes: Dict[int, int] = {}
    season_count = 0
    viral_count = 0

    cleaned_tokens = (_time_normalize_token(str(raw)) for raw in tokens or [])
    for cleaned in filter(None, cleaned_tokens):
        if cleaned in TIME_VIRAL_TOKENS:
            viral_count += 1
        if cleaned in TIME_SEASON_TERMS:
            season_count += 1
        month = _time_detect_month_from_token(cleaned)
        if month is None:
            continue
        season_count += 1
        month_votes[month] = month_votes.get(month, 0) + 1

    seasonal_score = _time_squash_hits(season_count)
    viral_score = _time_squash_hits(viral_count)

    posted_month = _time_parse_created_month(created_date)
    if posted_month and month_votes:
        # Ties resolve to the month that was seen first (dict insertion order).
        top_month = max(month_votes, key=month_votes.get)
        if top_month == posted_month and seasonal_score > 0:
            seasonal_score = float(min(1.0, seasonal_score + 0.05))

    scores = {"seasonal": seasonal_score, "viral": viral_score}
    return [float(scores.get(name, 0.0)) for name in label_order]
 @dataclass
 class MinerConfig:
                     created_date = str(pd.to_datetime(created_raw)).split(" ")[0]
                 except Exception:
                     created_date = None
+        # Derived time signals (time lexicon mining can be empty depending on inputs).
+        time_labels = label_orders.get("time", []) or []
+        if time_labels:
+            derived_time = compute_time_scores_derived(tokens, created_date, time_labels)
+            base_time = raw_scores.get("time") or [0.0] * len(time_labels)
+            if len(base_time) < len(time_labels):
+                base_time = list(base_time) + [0.0] * (len(time_labels) - len(base_time))
+            elif len(base_time) > len(time_labels):
+                base_time = list(base_time)[: len(time_labels)]
+            raw_scores["time"] = [float(max(a, b)) for a, b in zip(base_time, derived_time)]
         video_id = coerce_id(first_present(row, "video_id", "aweme_id", "id"))
         author_id = coerce_id(first_present(row, "author_id", "user_id"))
         record: Dict[str, object] = {

time_signals.py ADDED Viewed

	@@ -0,0 +1,157 @@

+from __future__ import annotations
+import re
+from datetime import date
+from typing import Dict, List, Optional, Sequence
# Holiday keyword -> calendar month (1-12). Insertion order is significant:
# detect_month_from_token scans this mapping in order and returns the first
# keyword found inside a token.
HOLIDAY_MONTH_MAP: Dict[str, int] = {
    keyword: month
    for month, keywords in (
        (1, ("newyear", "newyears", "nye")),
        (2, ("valentine", "valentines")),
        (3, ("stpatrick",)),
        (4, ("easter",)),
        (5, ("mothersday", "memorial")),
        (6, ("juneteenth", "pride", "father")),
        (7, ("independence", "july4")),
        (9, ("labor",)),
        (10, ("halloween",)),
        (11, ("thanksgiving", "blackfriday", "cybermonday")),
        (12, ("christmas", "xmas", "hanukkah")),
    )
    for keyword in keywords
}

# Month names and abbreviations -> calendar month (1-12); matched exactly.
MONTH_KEYWORDS: Dict[str, int] = {
    keyword: month
    for month, keywords in (
        (1, ("january", "jan")),
        (2, ("february", "feb")),
        (3, ("march", "mar")),
        (4, ("april", "apr")),
        (5, ("may",)),
        (6, ("june", "jun")),
        (7, ("july", "jul")),
        (8, ("august", "aug")),
        (9, ("september", "sept", "sep")),
        (10, ("october", "oct")),
        (11, ("november", "nov")),
        (12, ("december", "dec")),
    )
    for keyword in keywords
}

# Tokens counted as seasonal hits without implying a specific month.
SEASON_TERMS = {
    "spring", "summer", "fall", "autumn", "winter",
    "backtoschool", "graduation",
}

# Tokens counted as "viral" hits.
VIRAL_TOKENS = {
    "fyp", "foryou", "foryoupage",
    "viral", "trending", "trend",
    "xyzbca",
}
# Matches every run of characters that is not a lowercase ASCII letter or digit.
ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_token(token: str) -> str:
    """Lowercase *token* and strip everything except ``a-z`` and ``0-9``.

    A falsy token (``None`` or ``""``) normalizes to the empty string.
    """
    lowered = (token or "").lower()
    return ALNUM_RE.sub("", lowered)
def parse_created_month(created_date: Optional[str]) -> Optional[int]:
    """Extract the month (1-12) from a ``YYYY-MM-...`` style date string.

    Anything after a ``T`` or a space (the time part) is ignored. Returns
    ``None`` for falsy input, for input without a second ``-``-separated
    field, for a non-numeric month, or for a month outside 1-12.
    """
    if not created_date:
        return None
    try:
        day_part = str(created_date).split("T", 1)[0].split(" ", 1)[0]
        fields = day_part.split("-", 2)
        if len(fields) < 2:
            return None
        month = int(fields[1])
    except Exception:
        # Best-effort parsing: any failure simply means "month unknown".
        return None
    return month if 1 <= month <= 12 else None
def detect_month_from_token(token: str) -> Optional[int]:
    """Map a token to a calendar month (1-12), or ``None`` if it implies none.

    The token is normalized first. An exact month-name match takes priority.
    Otherwise the first holiday keyword (in mapping order) contained anywhere
    in the token decides the month.

    Holiday matching is by substring, so longer words that merely contain a
    keyword also match (e.g. "grandfather" contains "father").
    """
    cleaned = normalize_token(token)
    if not cleaned:
        return None
    try:
        return MONTH_KEYWORDS[cleaned]
    except KeyError:
        pass
    return next(
        (month for holiday, month in HOLIDAY_MONTH_MAP.items() if holiday in cleaned),
        None,
    )
+def _squash_hits(hits: int, *, base: float = 0.35, step: float = 0.15) -> float:
+    if hits <= 0:
+        return 0.0
+    return float(min(1.0, base + step * (hits - 1)))
def compute_time_scores(
    tokens: Sequence[str],
    created_date: Optional[str],
    label_order: Sequence[str],
) -> List[float]:
    """
    Derived time scores (not lexicon-mined).
    Supports the default System 7.1 time labels: ['seasonal', 'viral'].

    Each token is normalized and then counted as:
      - a viral hit when it is in ``VIRAL_TOKENS``;
      - a seasonal hit when it is in ``SEASON_TERMS``;
      - a seasonal hit plus a vote for a month when a month can be detected.

    Hit counts are squashed into scores by ``_squash_hits``. When the
    most-voted month equals the month parsed from ``created_date``, the
    seasonal score gets a +0.05 bonus (capped at 1.0).

    Returns one float per entry of ``label_order``; labels other than
    'seasonal' and 'viral' score 0.0.
    """
    month_votes: Dict[int, int] = {}
    season_count = 0
    viral_count = 0

    cleaned_tokens = (normalize_token(str(raw)) for raw in tokens or [])
    for cleaned in filter(None, cleaned_tokens):
        if cleaned in VIRAL_TOKENS:
            viral_count += 1
        if cleaned in SEASON_TERMS:
            season_count += 1
        month = detect_month_from_token(cleaned)
        if month is None:
            continue
        season_count += 1
        month_votes[month] = month_votes.get(month, 0) + 1

    seasonal_score = _squash_hits(season_count)
    viral_score = _squash_hits(viral_count)

    posted_month = parse_created_month(created_date)
    if posted_month and month_votes:
        # Ties resolve to the month that was seen first (dict insertion order).
        top_month = max(month_votes, key=month_votes.get)
        if top_month == posted_month and seasonal_score > 0:
            seasonal_score = float(min(1.0, seasonal_score + 0.05))

    scores = {"seasonal": seasonal_score, "viral": viral_score}
    return [float(scores.get(name, 0.0)) for name in label_order]