elfsong commited on
Commit
a603af9
·
1 Parent(s): 5f73cbf

feat: add lifecycle_retrieve script to compute and upload bimonthly topic hype cycle snapshots to HuggingFace

Browse files
Files changed (2) hide show
  1. src/lifecycle_retrieve.py +377 -0
  2. src/streamlit_app.py +51 -151
src/lifecycle_retrieve.py ADDED
@@ -0,0 +1,377 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Lifecycle Snapshot Retriever -- compute bimonthly topic lifecycle snapshots.
3
+
4
+ Computes Gartner-style hype cycle classification for research topics using
5
+ all available paper data up to each snapshot month (every 2 months).
6
+
7
+ Results are pushed to Elfsong/hf_paper_lifecycle.
8
+
9
+ Usage:
10
+ uv run python src/lifecycle_retrieve.py # latest snapshot
11
+ uv run python src/lifecycle_retrieve.py --snapshot 2025-06 # specific snapshot
12
+ uv run python src/lifecycle_retrieve.py --all # all missing snapshots
13
+ uv run python src/lifecycle_retrieve.py --no-push # dry run
14
+ """
15
+
16
+ import os
17
+
18
+ os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
19
+ os.environ["DATASETS_VERBOSITY"] = "error"
20
+ from tqdm import tqdm # noqa: E402
21
+ from functools import partialmethod # noqa: E402
22
+
23
+ tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
24
+
25
+ import argparse # noqa: E402
26
+ import json # noqa: E402
27
+ import logging # noqa: E402
28
+ import sys # noqa: E402
29
+ import time # noqa: E402
30
+ from collections import Counter, defaultdict # noqa: E402
31
+ from datetime import datetime, timezone # noqa: E402
32
+ from pathlib import Path # noqa: E402
33
+
34
+ import numpy as np # noqa: E402
35
+ from scipy.stats import linregress # noqa: E402
36
+ from dotenv import load_dotenv # noqa: E402
37
+
38
+ ROOT = Path(__file__).resolve().parent.parent
39
+ load_dotenv(ROOT / ".env")
40
+
41
+ for _name in ("datasets", "huggingface_hub", "huggingface_hub.utils",
42
+ "fsspec", "datasets.utils", "datasets.arrow_writer"):
43
+ logging.getLogger(_name).setLevel(logging.ERROR)
44
+
45
+ # ---------------------------------------------------------------------------
46
+ # ANSI helpers
47
+ # ---------------------------------------------------------------------------
48
+ _RESET = "\033[0m"
49
+ _BOLD = "\033[1m"
50
+ _DIM = "\033[2m"
51
+ _GREEN = "\033[32m"
52
+ _YELLOW = "\033[33m"
53
+ _CYAN = "\033[36m"
54
+ _GRAY = "\033[90m"
55
+
56
+ # ---------------------------------------------------------------------------
57
+ # Config
58
+ # ---------------------------------------------------------------------------
59
+ HF_DATASET_REPO = "Elfsong/hf_paper_summary"
60
+ HF_LIFECYCLE_REPO = "Elfsong/hf_paper_lifecycle"
61
+
62
+ # Bimonthly snapshot months (even months)
63
+ SNAPSHOT_MONTHS = {2, 4, 6, 8, 10, 12}
64
+
65
+
66
+ # ---------------------------------------------------------------------------
67
+ # Helpers
68
+ # ---------------------------------------------------------------------------
69
+ def _get_env(key: str) -> str:
70
+ val = os.getenv(key, "")
71
+ if val:
72
+ return val
73
+ env_path = ROOT / ".env"
74
+ if env_path.exists():
75
+ for line in env_path.read_text().splitlines():
76
+ if line.startswith(f"{key}="):
77
+ return line.split("=", 1)[1].strip()
78
+ return ""
79
+
80
+
81
+ def _snapshot_to_split(snapshot_str: str) -> str:
82
+ return "snapshot_" + snapshot_str.replace("-", "_")
83
+
84
+
85
+ def _parse_paper_row(paper: dict) -> dict:
86
+ for key in ("detailed_analysis", "detailed_analysis_zh"):
87
+ v = paper.get(key, "{}")
88
+ if isinstance(v, str):
89
+ paper[key] = json.loads(v) if v else {}
90
+ for key in ("topics", "topics_zh", "keywords", "keywords_zh"):
91
+ v = paper.get(key, "[]")
92
+ if isinstance(v, str):
93
+ paper[key] = json.loads(v) if v else []
94
+ if not isinstance(paper.get("authors"), list):
95
+ try:
96
+ paper["authors"] = list(paper["authors"])
97
+ except Exception:
98
+ paper["authors"] = []
99
+ return paper
100
+
101
+
102
+ def _list_repo_files(repo: str) -> list[str]:
103
+ from huggingface_hub import HfApi
104
+
105
+ token = _get_env("HF_TOKEN")
106
+ if not token:
107
+ return []
108
+ try:
109
+ api = HfApi(token=token)
110
+ return list(api.list_repo_files(repo, repo_type="dataset"))
111
+ except Exception:
112
+ return []
113
+
114
+
115
+ def _load_all_papers(files: list[str]) -> list[dict]:
116
+ """Download all parquet files and return papers with _date and _month."""
117
+ import pandas as pd
118
+ from huggingface_hub import hf_hub_download
119
+
120
+ token = _get_env("HF_TOKEN")
121
+ parquet_files = [f for f in files if f.endswith(".parquet")]
122
+
123
+ seen_ids: set[str] = set()
124
+ papers: list[dict] = []
125
+
126
+ for i, pf in enumerate(parquet_files):
127
+ fname = pf.split("/")[-1]
128
+ date_part = fname.split("-00")[0]
129
+ date_str = date_part.replace("date_", "").replace("_", "-")
130
+
131
+ try:
132
+ local_path = hf_hub_download(
133
+ HF_DATASET_REPO, pf, repo_type="dataset", token=token,
134
+ )
135
+ df = pd.read_parquet(local_path)
136
+ for _, row in df.iterrows():
137
+ paper = row.to_dict()
138
+ pid = paper.get("paper_id", "")
139
+ if pid and pid not in seen_ids:
140
+ seen_ids.add(pid)
141
+ paper["_date"] = date_str
142
+ paper["_month"] = date_str[:7]
143
+ papers.append(_parse_paper_row(paper))
144
+ except Exception:
145
+ continue
146
+
147
+ if sys.stdout.isatty() and (i + 1) % 20 == 0:
148
+ sys.stdout.write(f"\r {_DIM}Loading papers... {i+1}/{len(parquet_files)} files, {len(papers)} papers{_RESET}")
149
+ sys.stdout.flush()
150
+
151
+ if sys.stdout.isatty():
152
+ sys.stdout.write("\r\033[K")
153
+ sys.stdout.flush()
154
+
155
+ return papers
156
+
157
+
158
+ # ---------------------------------------------------------------------------
159
+ # Lifecycle computation
160
+ # ---------------------------------------------------------------------------
161
+ def _get_paper_topics(paper: dict, lang: str) -> list[str]:
162
+ if lang == "zh":
163
+ return paper.get("topics_zh", []) or paper.get("topics", [])
164
+ return paper.get("topics", [])
165
+
166
+
167
+ def compute_lifecycle(papers: list[str], lang: str = "en") -> tuple[dict, list[str]]:
168
+ """Compute lifecycle metrics for all topics from papers."""
169
+ topics_by_month: dict[str, Counter] = defaultdict(Counter)
170
+ all_topics: Counter = Counter()
171
+
172
+ for p in papers:
173
+ month = p.get("_month", "")
174
+ if not month:
175
+ continue
176
+ topics = _get_paper_topics(p, lang)
177
+ topics_by_month[month].update(topics)
178
+ all_topics.update(topics)
179
+
180
+ sorted_months = sorted(topics_by_month.keys())
181
+ if len(sorted_months) < 2:
182
+ return {}, sorted_months
183
+
184
+ total_by_month = {m: sum(topics_by_month[m].values()) for m in sorted_months}
185
+ n_months = len(sorted_months)
186
+ min_papers = max(3, n_months)
187
+ candidates = [t for t, c in all_topics.items() if c >= min_papers]
188
+
189
+ lifecycle: dict = {}
190
+ for topic in candidates:
191
+ proportions = np.array([
192
+ topics_by_month[m].get(topic, 0) / total_by_month[m]
193
+ if total_by_month[m] > 0 else 0
194
+ for m in sorted_months
195
+ ])
196
+ counts = np.array([topics_by_month[m].get(topic, 0) for m in sorted_months])
197
+ nonzero = np.where(proportions > 0)[0]
198
+ if len(nonzero) < 2:
199
+ continue
200
+
201
+ first_idx = int(nonzero[0])
202
+ peak_idx = int(np.argmax(proportions))
203
+ peak_val = float(proportions[peak_idx])
204
+ current_avg = float(np.mean(proportions[-min(3, n_months):]))
205
+
206
+ window = min(6, n_months)
207
+ recent = proportions[-window:]
208
+ slope = float(linregress(np.arange(len(recent)), recent).slope) if len(recent) >= 3 else 0.0
209
+
210
+ decline_ratio = current_avg / peak_val if peak_val > 0 else 0
211
+ months_since_peak = n_months - 1 - peak_idx
212
+ months_active = n_months - first_idx
213
+ recent_window = min(8, len(counts))
214
+ recent_fraction = float(counts[-recent_window:].sum() / max(counts.sum(), 1))
215
+
216
+ # Phase classification (same thresholds as reference analysis script)
217
+ dr, sl, ma, msp = decline_ratio, slope, months_active, months_since_peak
218
+ tc = int(counts.sum())
219
+ rf = recent_fraction
220
+
221
+ if ma <= 8 or (rf > 0.60 and tc < 200):
222
+ phase = "Innovation Trigger"
223
+ elif (dr > 0.70 and msp <= 6) or (sl > 0.001 and dr > 0.65):
224
+ phase = "Peak of Inflated Expectations"
225
+ elif dr < 0.65:
226
+ phase = "Slope of Enlightenment" if sl > 0.0003 else "Trough of Disillusionment"
227
+ elif sl < -0.001 and dr < 0.75:
228
+ phase = "Trough of Disillusionment"
229
+ elif dr < 0.85 and sl > 0.0005 and msp > 4:
230
+ phase = "Slope of Enlightenment"
231
+ else:
232
+ phase = "Plateau of Productivity"
233
+
234
+ lifecycle[topic] = {
235
+ "topic": topic, "phase": phase,
236
+ "total_count": tc, "peak_val": peak_val,
237
+ "peak_month": sorted_months[peak_idx],
238
+ "current_avg": current_avg, "slope": slope,
239
+ "decline_ratio": decline_ratio,
240
+ "months_since_peak": months_since_peak,
241
+ "months_active": months_active,
242
+ }
243
+
244
+ return lifecycle, sorted_months
245
+
246
+
247
+ # ---------------------------------------------------------------------------
248
+ # Push to HuggingFace
249
+ # ---------------------------------------------------------------------------
250
+ def push_lifecycle_to_hf(lifecycle_en: dict, lifecycle_zh: dict,
251
+ sorted_months: list[str], n_papers: int,
252
+ snapshot_month: str):
253
+ from datasets import Dataset
254
+
255
+ token = _get_env("HF_TOKEN")
256
+ if not token:
257
+ raise RuntimeError("HF_TOKEN not set")
258
+
259
+ row = {
260
+ "lifecycle_data": json.dumps(lifecycle_en, ensure_ascii=False),
261
+ "lifecycle_data_zh": json.dumps(lifecycle_zh, ensure_ascii=False),
262
+ "sorted_months": json.dumps(sorted_months, ensure_ascii=False),
263
+ "n_papers": n_papers,
264
+ "n_months": len(sorted_months),
265
+ }
266
+ ds = Dataset.from_list([row])
267
+ split_name = _snapshot_to_split(snapshot_month)
268
+ ds.push_to_hub(HF_LIFECYCLE_REPO, split=split_name, token=token)
269
+
270
+
271
+ # ---------------------------------------------------------------------------
272
+ # Run one snapshot
273
+ # ---------------------------------------------------------------------------
274
+ def run_snapshot(snapshot_month: str, all_papers: list[dict],
275
+ existing_splits: set[str], no_push: bool = False):
276
+ split_name = _snapshot_to_split(snapshot_month)
277
+
278
+ if split_name in existing_splits:
279
+ print(f" {_GRAY}⊘ {snapshot_month} — already on HF, skipping{_RESET}")
280
+ return
281
+
282
+ papers = [p for p in all_papers if p.get("_month", "") <= snapshot_month]
283
+ if not papers:
284
+ print(f" {_YELLOW}⊘ {snapshot_month} — no papers, skipping{_RESET}")
285
+ return
286
+
287
+ print(f" {_CYAN}⟳ {snapshot_month}{_RESET} — {len(papers)} papers...", end="", flush=True)
288
+
289
+ lc_en, months_en = compute_lifecycle(papers, lang="en")
290
+ lc_zh, _ = compute_lifecycle(papers, lang="zh")
291
+
292
+ print(f" {len(lc_en)} topics (en), {len(lc_zh)} topics (zh)", end="", flush=True)
293
+
294
+ if no_push:
295
+ print(f" {_GRAY}[--no-push]{_RESET}")
296
+ else:
297
+ try:
298
+ push_lifecycle_to_hf(lc_en, lc_zh, months_en, len(papers), snapshot_month)
299
+ print(f" {_GREEN}✓ pushed{_RESET}")
300
+ except Exception as e:
301
+ print(f" {_YELLOW}✗ push failed: {e}{_RESET}")
302
+
303
+
304
+ # ---------------------------------------------------------------------------
305
+ # Main
306
+ # ---------------------------------------------------------------------------
307
+ def main():
308
+ parser = argparse.ArgumentParser(
309
+ description="Compute bimonthly topic lifecycle snapshots and push to HuggingFace"
310
+ )
311
+ parser.add_argument("--snapshot", type=str, default=None,
312
+ help="Snapshot month (YYYY-MM, even month). Default: latest bimonthly.")
313
+ parser.add_argument("--all", action="store_true",
314
+ help="Compute all missing bimonthly snapshots")
315
+ parser.add_argument("--no-push", action="store_true",
316
+ help="Skip pushing results to HuggingFace")
317
+ args = parser.parse_args()
318
+
319
+ print(f"\n {_BOLD}📊 Lifecycle Snapshot Retriever{_RESET}\n")
320
+
321
+ # Step 1: List dataset files
322
+ print(f" {_DIM}Listing dataset files...{_RESET}", end="", flush=True)
323
+ all_files = _list_repo_files(HF_DATASET_REPO)
324
+ if not all_files:
325
+ print(f"\n {_YELLOW}Error: could not list files — check HF_TOKEN{_RESET}")
326
+ return
327
+ print(f" {len(all_files)} files")
328
+
329
+ # Step 2: Load all papers
330
+ print(f" {_DIM}Loading all papers...{_RESET}", end="", flush=True)
331
+ t0 = time.time()
332
+ all_papers = _load_all_papers(all_files)
333
+ elapsed = time.time() - t0
334
+ print(f" {len(all_papers)} papers in {elapsed:.1f}s")
335
+
336
+ if not all_papers:
337
+ print(f" {_YELLOW}No papers found{_RESET}")
338
+ return
339
+
340
+ # Step 3: Determine data range
341
+ all_months = sorted(set(p["_month"] for p in all_papers if p.get("_month")))
342
+ print(f" {_DIM}Data range: {all_months[0]} → {all_months[-1]} ({len(all_months)} months){_RESET}")
343
+
344
+ # List existing lifecycle splits
345
+ lifecycle_files = _list_repo_files(HF_LIFECYCLE_REPO)
346
+ existing_splits: set[str] = set()
347
+ for f in lifecycle_files:
348
+ name = f.split("/")[-1].replace(".parquet", "").replace(".arrow", "")
349
+ for part in name.split("-"):
350
+ if part.startswith("snapshot_"):
351
+ existing_splits.add(part)
352
+
353
+ # Step 4: Determine snapshots to compute
354
+ if args.all:
355
+ snapshots = [m for m in all_months if int(m[5:7]) in SNAPSHOT_MONTHS]
356
+ elif args.snapshot:
357
+ snapshots = [args.snapshot]
358
+ else:
359
+ now = datetime.now(timezone.utc)
360
+ last_completed = now.month - 1 if now.month > 1 else 12
361
+ snap_year = now.year if now.month > 1 else now.year - 1
362
+ snap_month = last_completed if last_completed % 2 == 0 else last_completed - 1
363
+ if snap_month <= 0:
364
+ snap_month = 12
365
+ snap_year -= 1
366
+ snapshots = [f"{snap_year}-{snap_month:02d}"]
367
+
368
+ print(f" {_DIM}Snapshots to process: {len(snapshots)}{_RESET}\n")
369
+
370
+ for snapshot in snapshots:
371
+ run_snapshot(snapshot, all_papers, existing_splits, no_push=args.no_push)
372
+
373
+ print(f"\n {_GREEN}{_BOLD}✓{_RESET} Done\n")
374
+
375
+
376
+ if __name__ == "__main__":
377
+ main()
src/streamlit_app.py CHANGED
@@ -282,6 +282,7 @@ DATA_DIR = Path(__file__).resolve().parent.parent / "data"
282
  HF_DATASET_REPO = "Elfsong/hf_paper_summary"
283
  HF_TRENDING_REPO = "Elfsong/hf_paper_daily_trending"
284
  HF_MONTHLY_TRENDING_REPO = "Elfsong/hf_paper_monthly_trending"
 
285
 
286
 
287
  def _get_hf_token() -> str | None:
@@ -1001,48 +1002,9 @@ def pull_monthly_trending_from_hf(month_str: str) -> dict | None:
1001
 
1002
 
1003
  # ---------------------------------------------------------------------------
1004
- # Topic lifecycle analysis
1005
  # ---------------------------------------------------------------------------
1006
 
1007
- @st.cache_data(ttl=3600, show_spinner=False)
1008
- def _load_papers_for_months(months: tuple[str, ...]) -> list[dict]:
1009
- """Bulk-load papers for multiple months directly from parquet files."""
1010
- import pandas as pd
1011
- from huggingface_hub import hf_hub_download
1012
-
1013
- token = _get_hf_token()
1014
- files = _list_repo_files_cached(HF_DATASET_REPO)
1015
- parquet_files = [f for f in files if f.endswith(".parquet")]
1016
-
1017
- month_set = set(months)
1018
- seen_ids: set[str] = set()
1019
- papers: list[dict] = []
1020
-
1021
- for pf in parquet_files:
1022
- fname = pf.split("/")[-1]
1023
- date_part = fname.split("-00")[0]
1024
- date_str = date_part.replace("date_", "").replace("_", "-")
1025
- if date_str[:7] not in month_set:
1026
- continue
1027
- try:
1028
- local_path = hf_hub_download(
1029
- HF_DATASET_REPO, pf, repo_type="dataset", token=token,
1030
- )
1031
- df = pd.read_parquet(local_path)
1032
- for _, row in df.iterrows():
1033
- paper = row.to_dict()
1034
- pid = paper.get("paper_id", "")
1035
- if pid and pid not in seen_ids:
1036
- seen_ids.add(pid)
1037
- paper["_date"] = date_str
1038
- paper["_month"] = date_str[:7]
1039
- papers.append(_parse_paper_row(paper))
1040
- except Exception:
1041
- continue
1042
-
1043
- return papers
1044
-
1045
-
1046
  _PHASES_ORDER = [
1047
  "Innovation Trigger",
1048
  "Peak of Inflated Expectations",
@@ -1052,86 +1014,26 @@ _PHASES_ORDER = [
1052
  ]
1053
 
1054
 
1055
- def _compute_lifecycle(papers: list[dict], lang: bool) -> tuple[dict, list[str]]:
1056
- """Compute lifecycle metrics and classify phases for all topics."""
1057
- from collections import Counter, defaultdict
1058
- import numpy as np
1059
- from scipy.stats import linregress
1060
-
1061
- topics_by_month: dict[str, Counter] = defaultdict(Counter)
1062
- all_topics: Counter = Counter()
1063
-
1064
- for p in papers:
1065
- month = (p.get("_date", "") or p.get("published_at", "")[:10])[:7]
1066
- if not month:
1067
- continue
1068
- topics = _get_paper_topics(p, lang)
1069
- topics_by_month[month].update(topics)
1070
- all_topics.update(topics)
1071
-
1072
- sorted_months = sorted(topics_by_month.keys())
1073
- if len(sorted_months) < 2:
1074
- return {}, sorted_months
1075
-
1076
- total_by_month = {m: sum(topics_by_month[m].values()) for m in sorted_months}
1077
- n_months = len(sorted_months)
1078
- min_papers = max(3, n_months)
1079
- candidates = [t for t, c in all_topics.items() if c >= min_papers]
1080
-
1081
- lifecycle: dict = {}
1082
- for topic in candidates:
1083
- proportions = np.array([
1084
- topics_by_month[m].get(topic, 0) / total_by_month[m]
1085
- if total_by_month[m] > 0 else 0
1086
- for m in sorted_months
1087
- ])
1088
- counts = np.array([topics_by_month[m].get(topic, 0) for m in sorted_months])
1089
- nonzero = np.where(proportions > 0)[0]
1090
- if len(nonzero) < 2:
1091
- continue
1092
-
1093
- first_idx = int(nonzero[0])
1094
- peak_idx = int(np.argmax(proportions))
1095
- peak_val = float(proportions[peak_idx])
1096
- current_avg = float(np.mean(proportions[-min(3, n_months):]))
1097
-
1098
- window = min(6, n_months)
1099
- recent = proportions[-window:]
1100
- slope = float(linregress(np.arange(len(recent)), recent).slope) if len(recent) >= 3 else 0.0
1101
-
1102
- decline_ratio = current_avg / peak_val if peak_val > 0 else 0
1103
- months_since_peak = n_months - 1 - peak_idx
1104
- months_active = n_months - first_idx
1105
- recent_window = min(8, len(counts))
1106
- recent_fraction = float(counts[-recent_window:].sum() / max(counts.sum(), 1))
1107
-
1108
- # Phase classification (thresholds adapted for variable-length windows)
1109
- dr, sl, ma, msp = decline_ratio, slope, months_active, months_since_peak
1110
- tc = int(counts.sum())
1111
- if ma <= max(3, n_months * 0.4) and tc < n_months * 10:
1112
- phase = "Innovation Trigger"
1113
- elif (dr > 0.70 and msp <= max(2, n_months // 3)) or (sl > 0.001 and dr > 0.65):
1114
- phase = "Peak of Inflated Expectations"
1115
- elif dr < 0.65:
1116
- phase = "Slope of Enlightenment" if sl > 0.0003 else "Trough of Disillusionment"
1117
- elif sl < -0.001 and dr < 0.75:
1118
- phase = "Trough of Disillusionment"
1119
- elif dr < 0.85 and sl > 0.0005 and msp > max(2, n_months // 4):
1120
- phase = "Slope of Enlightenment"
1121
- else:
1122
- phase = "Plateau of Productivity"
1123
-
1124
- lifecycle[topic] = {
1125
- "topic": topic, "phase": phase,
1126
- "total_count": tc, "peak_val": peak_val,
1127
- "peak_month": sorted_months[peak_idx],
1128
- "current_avg": current_avg, "slope": slope,
1129
- "decline_ratio": decline_ratio,
1130
- "months_since_peak": months_since_peak,
1131
- "months_active": months_active,
1132
- }
1133
-
1134
- return lifecycle, sorted_months
1135
 
1136
 
1137
  def _render_hype_cycle(lifecycle_data: dict, lang: bool):
@@ -1835,47 +1737,45 @@ elif active_tab == "Monthly":
1835
 
1836
  # ---- Lifecycle tab ----
1837
  elif active_tab == "Lifecycle":
1838
- _lc_splits_key = "lifecycle_available_halves"
1839
  if _lc_splits_key not in st.session_state:
1840
- splits = _list_dataset_splits()
1841
- all_dates = [_split_to_date(s) for s in splits]
1842
- halves: set[str] = set()
1843
- for d in all_dates:
1844
- half = "H1" if int(d[5:7]) <= 6 else "H2"
1845
- halves.add(f"{d[:4]}-{half}")
1846
- st.session_state[_lc_splits_key] = sorted(halves, reverse=True)
1847
- half_year_options = st.session_state[_lc_splits_key]
1848
-
1849
- if not half_year_options:
1850
- st.info("No data available for lifecycle analysis.")
1851
  else:
1852
  with hdr[1]:
1853
- selected_half = st.selectbox(
1854
- "Select period", options=half_year_options,
1855
  label_visibility="collapsed", key="lifecycle_select",
1856
  )
1857
 
1858
- year = int(selected_half[:4])
1859
- if selected_half.endswith("H1"):
1860
- month_range = tuple(f"{year}-{m:02d}" for m in range(1, 7))
 
1861
  else:
1862
- month_range = tuple(f"{year}-{m:02d}" for m in range(7, 13))
 
 
1863
 
1864
- _lc_papers_key = f"lifecycle_papers_{selected_half}"
1865
- if _lc_papers_key not in st.session_state:
1866
- with st.spinner(f"Loading papers for {selected_half}..."):
1867
- st.session_state[_lc_papers_key] = _load_papers_for_months(month_range)
1868
- lc_papers = st.session_state[_lc_papers_key]
1869
-
1870
- if not lc_papers:
1871
- st.warning(f"No papers found for {selected_half}")
1872
  else:
1873
- st.metric("Papers", f"{len(lc_papers):,}")
 
1874
 
1875
- _lc_data_key = f"lifecycle_data_{selected_half}_{lang}"
1876
- if _lc_data_key not in st.session_state:
1877
- st.session_state[_lc_data_key] = _compute_lifecycle(lc_papers, lang)
1878
- lc_data, lc_months = st.session_state[_lc_data_key]
 
1879
 
1880
  if not lc_data:
1881
  st.warning("Not enough data for lifecycle analysis.")
 
282
  HF_DATASET_REPO = "Elfsong/hf_paper_summary"
283
  HF_TRENDING_REPO = "Elfsong/hf_paper_daily_trending"
284
  HF_MONTHLY_TRENDING_REPO = "Elfsong/hf_paper_monthly_trending"
285
+ HF_LIFECYCLE_REPO = "Elfsong/hf_paper_lifecycle"
286
 
287
 
288
  def _get_hf_token() -> str | None:
 
1002
 
1003
 
1004
  # ---------------------------------------------------------------------------
1005
+ # Topic lifecycle (read-only from HF, generated by lifecycle_retrieve.py)
1006
  # ---------------------------------------------------------------------------
1007
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1008
  _PHASES_ORDER = [
1009
  "Innovation Trigger",
1010
  "Peak of Inflated Expectations",
 
1014
  ]
1015
 
1016
 
1017
+ @st.cache_data(ttl=300, show_spinner=False)
1018
+ def pull_lifecycle_from_hf(snapshot_str: str) -> dict | None:
1019
+ """Load a pre-computed lifecycle snapshot from HF."""
1020
+ log.info("[pull_lifecycle] snapshot_str=%s", snapshot_str)
1021
+ files = _list_repo_files_cached(HF_LIFECYCLE_REPO)
1022
+ splits = _extract_splits(files, prefix="snapshot_")
1023
+ target_split = "snapshot_" + snapshot_str.replace("-", "_")
1024
+ if target_split not in splits:
1025
+ return None
1026
+ rows = _download_split_rows(HF_LIFECYCLE_REPO, target_split)
1027
+ if not rows:
1028
+ return None
1029
+ row = rows[0]
1030
+ return {
1031
+ "lifecycle_data": json.loads(row.get("lifecycle_data", "{}")),
1032
+ "lifecycle_data_zh": json.loads(row.get("lifecycle_data_zh", "{}")),
1033
+ "sorted_months": json.loads(row.get("sorted_months", "[]")),
1034
+ "n_papers": row.get("n_papers", 0),
1035
+ "n_months": row.get("n_months", 0),
1036
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1037
 
1038
 
1039
  def _render_hype_cycle(lifecycle_data: dict, lang: bool):
 
1737
 
1738
  # ---- Lifecycle tab ----
1739
  elif active_tab == "Lifecycle":
1740
+ _lc_splits_key = "lifecycle_available_snapshots"
1741
  if _lc_splits_key not in st.session_state:
1742
+ lc_files = _list_repo_files_cached(HF_LIFECYCLE_REPO)
1743
+ st.session_state[_lc_splits_key] = sorted(
1744
+ [s.replace("snapshot_", "").replace("_", "-")
1745
+ for s in _extract_splits(lc_files, prefix="snapshot_")],
1746
+ reverse=True,
1747
+ )
1748
+ snapshot_options = st.session_state[_lc_splits_key]
1749
+
1750
+ if not snapshot_options:
1751
+ st.info("No lifecycle data available yet. Run `uv run python src/lifecycle_retrieve.py --all` to generate.")
 
1752
  else:
1753
  with hdr[1]:
1754
+ selected_snapshot = st.selectbox(
1755
+ "Select snapshot", options=snapshot_options,
1756
  label_visibility="collapsed", key="lifecycle_select",
1757
  )
1758
 
1759
+ _lc_cache_key = f"lifecycle_{selected_snapshot}"
1760
+ lc_raw = None
1761
+ if _lc_cache_key in st.session_state:
1762
+ lc_raw = st.session_state[_lc_cache_key]
1763
  else:
1764
+ lc_raw = pull_lifecycle_from_hf(selected_snapshot)
1765
+ if lc_raw:
1766
+ st.session_state[_lc_cache_key] = lc_raw
1767
 
1768
+ if not lc_raw:
1769
+ st.warning(f"Could not load lifecycle data for {selected_snapshot}")
 
 
 
 
 
 
1770
  else:
1771
+ lc_data = lc_raw["lifecycle_data_zh"] if lang else lc_raw["lifecycle_data"]
1772
+ sorted_months = lc_raw["sorted_months"]
1773
 
1774
+ st.metric("Papers", f"{lc_raw['n_papers']:,}")
1775
+ if sorted_months:
1776
+ st.caption(
1777
+ f"{lc_raw['n_months']} months ({sorted_months[0]} → {sorted_months[-1]})"
1778
+ )
1779
 
1780
  if not lc_data:
1781
  st.warning("Not enough data for lifecycle analysis.")