Imaginethat committed on
Commit
0e20813
·
verified ·
1 Parent(s): 663f82a

Upload 4 files

Browse files
Files changed (4) hide show
  1. __init__.py +4 -0
  2. labels.py +35 -0
  3. sys7_miner_2.py +152 -0
  4. time_signals.py +157 -0
__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .text import fuse_text, tokenize # noqa: F401
2
+ from .lexicon import load_json, prepare_slang_map, orient_lexicons, compute_raw_scores # noqa: F401
3
+ from .labels import dominant_label_gated, UNKNOWN_LABEL # noqa: F401
4
+ from .time_signals import compute_time_scores # noqa: F401
labels.py ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ from typing import Sequence, Tuple
4
+
5
+ import numpy as np
6
+
7
+
8
UNKNOWN_LABEL = "Unknown"


def dominant_label_gated(
    scores: Sequence[float] | None,
    labels: Sequence[str] | None,
    min_score: float,
    *,
    unknown_label: str = UNKNOWN_LABEL,
) -> Tuple[str, float]:
    """
    Return the dominant label only if it's confidently supported.

    Rule:
    - If max(score) >= min_score: return (label, max_score)
    - Else: return (unknown_label, max_score)

    NaN scores are ignored when picking the maximum, so a NaN can never
    be reported as the winning score.

    Args:
        scores: Per-label scores, positionally aligned with ``labels``.
            Lists, tuples and NumPy arrays are accepted.
        labels: Label names, one per score.
        min_score: Inclusive threshold the top score must reach.
        unknown_label: Label returned when the gate is not passed.

    Returns:
        ``(label, max_score)``. When either input is missing or empty, or
        every score is NaN, the result is ``(unknown_label, 0.0)``.
    """
    # Explicit None / length checks: truth-testing a multi-element NumPy
    # array raises ValueError, so ``not scores`` is not safe here.
    if scores is None or labels is None:
        return unknown_label, 0.0
    arr = np.asarray(list(scores), dtype=float)
    if arr.size == 0 or len(labels) == 0:
        return unknown_label, 0.0

    # Plain argmax would select a NaN entry, and ``nan < min_score`` is
    # False, which would let the NaN pass the gate below.
    if np.isnan(arr).all():
        return unknown_label, 0.0
    idx = int(np.nanargmax(arr))
    max_score = float(arr[idx])

    # More scores than labels: the winning index has no name to report.
    if idx >= len(labels) or max_score < float(min_score):
        return unknown_label, max_score
    return str(labels[idx]), max_score
35
+
sys7_miner_2.py CHANGED
@@ -61,6 +61,147 @@ WHITESPACE_RE = re.compile(r"\s+")
61
  HASHTAG_SPLIT_RE = re.compile(r"[A-Z]?[a-z]+|[0-9]+")
62
  TOKENIZER_RE = re.compile(r"[a-z0-9']+")
63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
  @dataclass
66
  class MinerConfig:
@@ -479,6 +620,17 @@ def process_chunk(
479
  created_date = str(pd.to_datetime(created_raw)).split(" ")[0]
480
  except Exception:
481
  created_date = None
 
 
 
 
 
 
 
 
 
 
 
482
  video_id = coerce_id(first_present(row, "video_id", "aweme_id", "id"))
483
  author_id = coerce_id(first_present(row, "author_id", "user_id"))
484
  record: Dict[str, object] = {
 
61
  HASHTAG_SPLIT_RE = re.compile(r"[A-Z]?[a-z]+|[0-9]+")
62
  TOKENIZER_RE = re.compile(r"[a-z0-9']+")
63
 
64
# Matches every run of characters that is not a lowercase ASCII letter or digit.
TIME_ALNUM_RE = re.compile(r"[^a-z0-9]+")

# Holiday keyword -> calendar month number (entries kept in month order).
TIME_HOLIDAY_MONTH_MAP: Dict[str, int] = {
    "newyear": 1, "newyears": 1, "nye": 1,
    "valentine": 2, "valentines": 2,
    "stpatrick": 3,
    "easter": 4,
    "mothersday": 5, "memorial": 5,
    "juneteenth": 6, "pride": 6, "father": 6,
    "independence": 7, "july4": 7,
    "labor": 9,
    "halloween": 10,
    "thanksgiving": 11, "blackfriday": 11, "cybermonday": 11,
    "christmas": 12, "xmas": 12, "hanukkah": 12,
}

# Accepted spellings per month; the tuple position (1-based) is the month number.
_TIME_MONTH_NAME_FORMS = (
    ("january", "jan"),
    ("february", "feb"),
    ("march", "mar"),
    ("april", "apr"),
    ("may",),
    ("june", "jun"),
    ("july", "jul"),
    ("august", "aug"),
    ("september", "sept", "sep"),
    ("october", "oct"),
    ("november", "nov"),
    ("december", "dec"),
)

# Month name or abbreviation -> calendar month number.
TIME_MONTH_KEYWORDS: Dict[str, int] = {
    spelling: number
    for number, spellings in enumerate(_TIME_MONTH_NAME_FORMS, start=1)
    for spelling in spellings
}

# Tokens counted as seasonal without being tied to a specific month.
TIME_SEASON_TERMS = {
    "spring", "summer", "fall", "autumn", "winter",
    "backtoschool", "graduation",
}

# Tokens counted as "viral" hits.
TIME_VIRAL_TOKENS = {
    "fyp", "foryou", "foryoupage",
    "viral", "trending", "trend",
    "xyzbca",
}
137
+
138
+
139
def _time_normalize_token(token: str) -> str:
    """Lower-case *token* and drop every character outside ``a-z0-9``.

    A falsy token (``None`` or ``""``) yields the empty string.
    """
    if not token:
        return ""
    lowered = token.lower()
    return TIME_ALNUM_RE.sub("", lowered)
141
+
142
+
143
+ def _time_parse_created_month(created_date: Optional[str]) -> Optional[int]:
144
+ if not created_date:
145
+ return None
146
+ try:
147
+ parts = str(created_date).split("T", 1)[0].split(" ", 1)[0].split("-", 2)
148
+ if len(parts) >= 2:
149
+ m = int(parts[1])
150
+ if 1 <= m <= 12:
151
+ return m
152
+ except Exception:
153
+ return None
154
+ return None
155
+
156
+
157
def _time_detect_month_from_token(token: str) -> Optional[int]:
    """Map a token to the calendar month it refers to, if any.

    The token is normalized first. An exact month name or abbreviation
    wins; otherwise the first holiday keyword (in
    ``TIME_HOLIDAY_MONTH_MAP`` order) contained anywhere in the token
    decides the month.

    Returns:
        The month number, or ``None`` when nothing matches.
    """
    cleaned = _time_normalize_token(token)
    if not cleaned:
        return None

    exact = TIME_MONTH_KEYWORDS.get(cleaned)
    if exact is not None:
        return exact

    # NOTE(review): containment also fires inside longer unrelated words
    # (e.g. "labor" in "collaboration", "nye" in "kanye") - confirm that
    # this looseness is intended.
    return next(
        (
            month
            for holiday, month in TIME_HOLIDAY_MONTH_MAP.items()
            if holiday in cleaned
        ),
        None,
    )
167
+
168
+
169
+ def _time_squash_hits(hits: int, *, base: float = 0.35, step: float = 0.15) -> float:
170
+ if hits <= 0:
171
+ return 0.0
172
+ return float(min(1.0, base + step * (hits - 1)))
173
+
174
+
175
def compute_time_scores_derived(tokens: Sequence[str], created_date: Optional[str], label_order: Sequence[str]) -> List[float]:
    """Derive 'seasonal' and 'viral' time scores directly from tokens.

    Same algorithm as ``compute_time_scores`` in time_signals.py; keep the
    two in sync.

    Args:
        tokens: Tokens for one record; each is stringified and normalized
            before matching. ``None`` is treated as no tokens.
        created_date: Creation date string (``YYYY-MM-DD``-like), used
            only for the month-alignment bonus. May be ``None``.
        label_order: Label names that fix the order of the output. Labels
            other than ``"seasonal"`` / ``"viral"`` score 0.0.

    Returns:
        One float per entry of ``label_order``.
    """
    seasonal_hits = 0
    viral_hits = 0
    token_month_hits: Dict[int, int] = {}

    # ``tokens or []`` would raise on array-like inputs whose truth value
    # is ambiguous, so test for None explicitly.
    for tok in tokens if tokens is not None else ():
        norm = _time_normalize_token(str(tok))
        if not norm:
            continue
        if norm in TIME_VIRAL_TOKENS:
            viral_hits += 1
        if norm in TIME_SEASON_TERMS:
            seasonal_hits += 1
        m = _time_detect_month_from_token(norm)
        if m is not None:
            # A month or holiday mention is itself a seasonal signal.
            seasonal_hits += 1
            token_month_hits[m] = token_month_hits.get(m, 0) + 1

    seasonal = _time_squash_hits(seasonal_hits)
    viral = _time_squash_hits(viral_hits)

    created_month = _time_parse_created_month(created_date)
    if created_month and token_month_hits and seasonal > 0:
        # Small bonus when the most-mentioned month matches the creation
        # month. Comparing against the peak count (instead of picking one
        # "winner") keeps the result independent of token order on ties.
        peak = max(token_month_hits.values())
        if token_month_hits.get(created_month, 0) == peak:
            seasonal = float(min(1.0, seasonal + 0.05))

    by_label = {"seasonal": seasonal, "viral": viral}
    return [float(by_label.get(label, 0.0)) for label in (label_order if label_order is not None else ())]
204
+
205
 
206
  @dataclass
207
  class MinerConfig:
 
620
  created_date = str(pd.to_datetime(created_raw)).split(" ")[0]
621
  except Exception:
622
  created_date = None
623
+
624
+ # Derived time signals (time lexicon mining can be empty depending on inputs).
625
+ time_labels = label_orders.get("time", []) or []
626
+ if time_labels:
627
+ derived_time = compute_time_scores_derived(tokens, created_date, time_labels)
628
+ base_time = raw_scores.get("time") or [0.0] * len(time_labels)
629
+ if len(base_time) < len(time_labels):
630
+ base_time = list(base_time) + [0.0] * (len(time_labels) - len(base_time))
631
+ elif len(base_time) > len(time_labels):
632
+ base_time = list(base_time)[: len(time_labels)]
633
+ raw_scores["time"] = [float(max(a, b)) for a, b in zip(base_time, derived_time)]
634
  video_id = coerce_id(first_present(row, "video_id", "aweme_id", "id"))
635
  author_id = coerce_id(first_present(row, "author_id", "user_id"))
636
  record: Dict[str, object] = {
time_signals.py ADDED
@@ -0,0 +1,157 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import re
4
+ from datetime import date
5
+ from typing import Dict, List, Optional, Sequence
6
+
7
+
8
# Holiday keyword -> calendar month number (entries kept in month order).
HOLIDAY_MONTH_MAP: Dict[str, int] = {
    "newyear": 1, "newyears": 1, "nye": 1,
    "valentine": 2, "valentines": 2,
    "stpatrick": 3,
    "easter": 4,
    "mothersday": 5, "memorial": 5,
    "juneteenth": 6, "pride": 6, "father": 6,
    "independence": 7, "july4": 7,
    "labor": 9,
    "halloween": 10,
    "thanksgiving": 11, "blackfriday": 11, "cybermonday": 11,
    "christmas": 12, "xmas": 12, "hanukkah": 12,
}

# Accepted spellings per month; the tuple position (1-based) is the month number.
_MONTH_NAME_FORMS = (
    ("january", "jan"),
    ("february", "feb"),
    ("march", "mar"),
    ("april", "apr"),
    ("may",),
    ("june", "jun"),
    ("july", "jul"),
    ("august", "aug"),
    ("september", "sept", "sep"),
    ("october", "oct"),
    ("november", "nov"),
    ("december", "dec"),
)

# Month name or abbreviation -> calendar month number.
MONTH_KEYWORDS: Dict[str, int] = {
    spelling: number
    for number, spellings in enumerate(_MONTH_NAME_FORMS, start=1)
    for spelling in spellings
}

# Tokens counted as seasonal without being tied to a specific month.
SEASON_TERMS = {
    "spring", "summer", "fall", "autumn", "winter",
    "backtoschool", "graduation",
}

# Tokens counted as "viral" hits.
VIRAL_TOKENS = {
    "fyp", "foryou", "foryoupage",
    "viral", "trending", "trend",
    "xyzbca",
}
79
+
80
# Matches every run of characters that is not a lowercase ASCII letter or digit.
ALNUM_RE = re.compile(r"[^a-z0-9]+")


def normalize_token(token: str) -> str:
    """Lower-case *token* and drop every character outside ``a-z0-9``.

    A falsy token (``None`` or ``""``) yields the empty string.
    """
    if not token:
        return ""
    lowered = token.lower()
    return ALNUM_RE.sub("", lowered)
85
+
86
+
87
def parse_created_month(created_date: Optional[str]) -> Optional[int]:
    """Extract the month number from a ``YYYY-MM[-DD]``-style date string.

    Anything after a ``T`` or a space (a time component) is ignored, and
    the second dash-separated field is read as the month.

    Returns:
        The month as an int in ``1..12``, or ``None`` when the input is
        empty, has no second field, or that field is not a valid month.
    """
    if not created_date:
        return None
    try:
        date_part = str(created_date).partition("T")[0].partition(" ")[0]
        _, dash, remainder = date_part.partition("-")
        if not dash:
            # No dash at all: there is no month field to read.
            return None
        month = int(remainder.partition("-")[0])
    except Exception:
        return None
    return month if 1 <= month <= 12 else None
99
+
100
+
101
def detect_month_from_token(token: str) -> Optional[int]:
    """Map a token to the calendar month it refers to, if any.

    The token is normalized first. An exact month name or abbreviation
    wins; otherwise the first holiday keyword (in ``HOLIDAY_MONTH_MAP``
    order) contained anywhere in the token decides the month.

    Returns:
        The month number, or ``None`` when nothing matches.
    """
    cleaned = normalize_token(token)
    if not cleaned:
        return None

    exact = MONTH_KEYWORDS.get(cleaned)
    if exact is not None:
        return exact

    # NOTE(review): containment also fires inside longer unrelated words
    # (e.g. "labor" in "collaboration", "nye" in "kanye") - confirm that
    # this looseness is intended.
    return next(
        (
            month
            for holiday, month in HOLIDAY_MONTH_MAP.items()
            if holiday in cleaned
        ),
        None,
    )
111
+
112
+
113
+ def _squash_hits(hits: int, *, base: float = 0.35, step: float = 0.15) -> float:
114
+ if hits <= 0:
115
+ return 0.0
116
+ return float(min(1.0, base + step * (hits - 1)))
117
+
118
+
119
def compute_time_scores(
    tokens: Sequence[str],
    created_date: Optional[str],
    label_order: Sequence[str],
) -> List[float]:
    """
    Derived time scores (not lexicon-mined).

    Supports the default System 7.1 time labels: ['seasonal', 'viral'].

    Args:
        tokens: Tokens for one record; each is stringified and normalized
            before matching. ``None`` is treated as no tokens.
        created_date: Creation date string (``YYYY-MM-DD``-like), used
            only for the month-alignment bonus. May be ``None``.
        label_order: Label names that fix the order of the output. Labels
            other than ``"seasonal"`` / ``"viral"`` score 0.0.

    Returns:
        One float per entry of ``label_order``.
    """
    seasonal_hits = 0
    viral_hits = 0
    token_month_hits: Dict[int, int] = {}

    # ``tokens or []`` would raise on array-like inputs whose truth value
    # is ambiguous, so test for None explicitly.
    for tok in tokens if tokens is not None else ():
        norm = normalize_token(str(tok))
        if not norm:
            continue
        if norm in VIRAL_TOKENS:
            viral_hits += 1
        if norm in SEASON_TERMS:
            seasonal_hits += 1
        m = detect_month_from_token(norm)
        if m is not None:
            # A month or holiday mention is itself a seasonal signal.
            seasonal_hits += 1
            token_month_hits[m] = token_month_hits.get(m, 0) + 1

    seasonal = _squash_hits(seasonal_hits)
    viral = _squash_hits(viral_hits)

    created_month = parse_created_month(created_date)
    if created_month and token_month_hits and seasonal > 0:
        # Small bonus when the most-mentioned month matches the creation
        # month. Comparing against the peak count (instead of picking one
        # "winner") keeps the result independent of token order on ties.
        peak = max(token_month_hits.values())
        if token_month_hits.get(created_month, 0) == peak:
            seasonal = float(min(1.0, seasonal + 0.05))

    by_label = {"seasonal": seasonal, "viral": viral}
    return [
        float(by_label.get(label, 0.0))
        for label in (label_order if label_order is not None else ())
    ]
157
+