Spaces:

Elfsong
/

Paper_Espresso

Running

App Files Files Community

elfsong committed 25 days ago

Commit

a603af9

1 Parent(s): 5f73cbf

feat: add lifecycle_retrieve script to compute and upload bimonthly topic hype cycle snapshots to HuggingFace

Browse files

Files changed (2) hide show

src/lifecycle_retrieve.py +377 -0
src/streamlit_app.py +51 -151

src/lifecycle_retrieve.py ADDED Viewed

	@@ -0,0 +1,377 @@

+"""
+Lifecycle Snapshot Retriever -- compute bimonthly topic lifecycle snapshots.
+Computes Gartner-style hype cycle classification for research topics using
+all available paper data up to each snapshot month (every 2 months).
+Results are pushed to Elfsong/hf_paper_lifecycle.
+Usage:
+    uv run python src/lifecycle_retrieve.py                          # latest snapshot
+    uv run python src/lifecycle_retrieve.py --snapshot 2025-06       # specific snapshot
+    uv run python src/lifecycle_retrieve.py --all                    # all missing snapshots
+    uv run python src/lifecycle_retrieve.py --no-push                # dry run
+"""
+import os
+# These two variables are set before the remaining imports (hence the E402
+# suppressions below) -- presumably so the Hugging Face libraries pick them up
+# when they are loaded; confirm against the library docs before reordering.
+os.environ["HF_HUB_DISABLE_PROGRESS_BARS"] = "1"
+os.environ["DATASETS_VERBOSITY"] = "error"
+from tqdm import tqdm  # noqa: E402
+from functools import partialmethod  # noqa: E402
+# Patch tqdm so every bar created in this process defaults to disable=True.
+tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)
+import argparse  # noqa: E402
+import json  # noqa: E402
+import logging  # noqa: E402
+import sys  # noqa: E402
+import time  # noqa: E402
+from collections import Counter, defaultdict  # noqa: E402
+from datetime import datetime, timezone  # noqa: E402
+from pathlib import Path  # noqa: E402
+import numpy as np  # noqa: E402
+from scipy.stats import linregress  # noqa: E402
+from dotenv import load_dotenv  # noqa: E402
+# Directory two levels above this file (the parent of ``src/`` when the script
+# is run as ``src/lifecycle_retrieve.py``); the ``.env`` file is read from here.
+ROOT = Path(__file__).resolve().parent.parent
+load_dotenv(ROOT / ".env")
+# Raise the level of these third-party loggers so only errors are emitted.
+for _name in ("datasets", "huggingface_hub", "huggingface_hub.utils",
+              "fsspec", "datasets.utils", "datasets.arrow_writer"):
+    logging.getLogger(_name).setLevel(logging.ERROR)
+# ---------------------------------------------------------------------------
+# ANSI helpers
+# ---------------------------------------------------------------------------
+# Escape sequences interpolated into the console messages printed below.
+_RESET = "\033[0m"
+_BOLD = "\033[1m"
+_DIM = "\033[2m"
+_GREEN = "\033[32m"
+_YELLOW = "\033[33m"
+_CYAN = "\033[36m"
+_GRAY = "\033[90m"
+# ---------------------------------------------------------------------------
+# Config
+# ---------------------------------------------------------------------------
+# Dataset repo the paper parquet files are downloaded from.
+HF_DATASET_REPO = "Elfsong/hf_paper_summary"
+# Dataset repo the computed snapshots are pushed to (one split per snapshot).
+HF_LIFECYCLE_REPO = "Elfsong/hf_paper_lifecycle"
+# Bimonthly snapshot months (even months)
+SNAPSHOT_MONTHS = {2, 4, 6, 8, 10, 12}
+# ---------------------------------------------------------------------------
+# Helpers
+# ---------------------------------------------------------------------------
+def _get_env(key: str) -> str:
+    val = os.getenv(key, "")
+    if val:
+        return val
+    env_path = ROOT / ".env"
+    if env_path.exists():
+        for line in env_path.read_text().splitlines():
+            if line.startswith(f"{key}="):
+                return line.split("=", 1)[1].strip()
+    return ""
+def _snapshot_to_split(snapshot_str: str) -> str:
+    return "snapshot_" + snapshot_str.replace("-", "_")
+def _parse_paper_row(paper: dict) -> dict:
+    for key in ("detailed_analysis", "detailed_analysis_zh"):
+        v = paper.get(key, "{}")
+        if isinstance(v, str):
+            paper[key] = json.loads(v) if v else {}
+    for key in ("topics", "topics_zh", "keywords", "keywords_zh"):
+        v = paper.get(key, "[]")
+        if isinstance(v, str):
+            paper[key] = json.loads(v) if v else []
+    if not isinstance(paper.get("authors"), list):
+        try:
+            paper["authors"] = list(paper["authors"])
+        except Exception:
+            paper["authors"] = []
+    return paper
+def _list_repo_files(repo: str) -> list[str]:
+    from huggingface_hub import HfApi
+    token = _get_env("HF_TOKEN")
+    if not token:
+        return []
+    try:
+        api = HfApi(token=token)
+        return list(api.list_repo_files(repo, repo_type="dataset"))
+    except Exception:
+        return []
+def _load_all_papers(files: list[str]) -> list[dict]:
+    """Download all parquet files and return papers with _date and _month."""
+    import pandas as pd
+    from huggingface_hub import hf_hub_download
+    token = _get_env("HF_TOKEN")
+    parquet_files = [f for f in files if f.endswith(".parquet")]
+    seen_ids: set[str] = set()
+    papers: list[dict] = []
+    for i, pf in enumerate(parquet_files):
+        fname = pf.split("/")[-1]
+        date_part = fname.split("-00")[0]
+        date_str = date_part.replace("date_", "").replace("_", "-")
+        try:
+            local_path = hf_hub_download(
+                HF_DATASET_REPO, pf, repo_type="dataset", token=token,
+            )
+            df = pd.read_parquet(local_path)
+            for _, row in df.iterrows():
+                paper = row.to_dict()
+                pid = paper.get("paper_id", "")
+                if pid and pid not in seen_ids:
+                    seen_ids.add(pid)
+                    paper["_date"] = date_str
+                    paper["_month"] = date_str[:7]
+                    papers.append(_parse_paper_row(paper))
+        except Exception:
+            continue
+        if sys.stdout.isatty() and (i + 1) % 20 == 0:
+            sys.stdout.write(f"\r  {_DIM}Loading papers... {i+1}/{len(parquet_files)} files, {len(papers)} papers{_RESET}")
+            sys.stdout.flush()
+    if sys.stdout.isatty():
+        sys.stdout.write("\r\033[K")
+        sys.stdout.flush()
+    return papers
+# ---------------------------------------------------------------------------
+# Lifecycle computation
+# ---------------------------------------------------------------------------
+def _get_paper_topics(paper: dict, lang: str) -> list[str]:
+    if lang == "zh":
+        return paper.get("topics_zh", []) or paper.get("topics", [])
+    return paper.get("topics", [])
+def compute_lifecycle(papers: list[dict], lang: str = "en") -> tuple[dict, list[str]]:
+    """Compute per-topic lifecycle metrics and a hype-cycle phase from *papers*.
+    Topic mentions are counted per ``_month``; papers without a ``_month`` are
+    ignored. Every "month" index below is a position in the sorted list of
+    months that actually occur in the data, so calendar gaps are not counted.
+    Args:
+        papers: Paper dicts carrying ``_month`` plus the topic fields read by
+            ``_get_paper_topics``.
+        lang: Language passed through to ``_get_paper_topics``.
+    Returns:
+        ``(lifecycle, sorted_months)``. ``lifecycle`` maps each retained topic
+        to a dict of metrics and its ``phase``; it is empty when fewer than two
+        months are present. ``sorted_months`` lists the months seen, ascending.
+    """
+    topics_by_month: dict[str, Counter] = defaultdict(Counter)
+    all_topics: Counter = Counter()
+    for p in papers:
+        month = p.get("_month", "")
+        if not month:
+            continue
+        topics = _get_paper_topics(p, lang)
+        # Indexing the defaultdict registers the month even if topics is empty.
+        topics_by_month[month].update(topics)
+        all_topics.update(topics)
+    sorted_months = sorted(topics_by_month.keys())
+    if len(sorted_months) < 2:
+        return {}, sorted_months
+    # Total topic mentions per month: the denominator of every proportion.
+    total_by_month = {m: sum(topics_by_month[m].values()) for m in sorted_months}
+    n_months = len(sorted_months)
+    # Only keep topics mentioned at least max(3, number of months) times overall.
+    min_papers = max(3, n_months)
+    candidates = [t for t, c in all_topics.items() if c >= min_papers]
+    lifecycle: dict = {}
+    for topic in candidates:
+        # Share of the month's topic mentions that belong to this topic.
+        proportions = np.array([
+            topics_by_month[m].get(topic, 0) / total_by_month[m]
+            if total_by_month[m] > 0 else 0
+            for m in sorted_months
+        ])
+        counts = np.array([topics_by_month[m].get(topic, 0) for m in sorted_months])
+        nonzero = np.where(proportions > 0)[0]
+        # Topics seen in fewer than two months are dropped.
+        if len(nonzero) < 2:
+            continue
+        first_idx = int(nonzero[0])
+        peak_idx = int(np.argmax(proportions))
+        peak_val = float(proportions[peak_idx])
+        # Mean share over the last (up to) 3 months.
+        current_avg = float(np.mean(proportions[-min(3, n_months):]))
+        # Linear-regression slope of the share over the last (up to) 6 months;
+        # 0.0 when fewer than 3 points are available.
+        window = min(6, n_months)
+        recent = proportions[-window:]
+        slope = float(linregress(np.arange(len(recent)), recent).slope) if len(recent) >= 3 else 0.0
+        # Current share relative to the peak share.
+        decline_ratio = current_avg / peak_val if peak_val > 0 else 0
+        months_since_peak = n_months - 1 - peak_idx
+        months_active = n_months - first_idx
+        # Fraction of all mentions that fall in the last (up to) 8 months.
+        recent_window = min(8, len(counts))
+        recent_fraction = float(counts[-recent_window:].sum() / max(counts.sum(), 1))
+        # Phase classification (same thresholds as reference analysis script)
+        # NOTE: the branches are order-dependent; the first match wins.
+        dr, sl, ma, msp = decline_ratio, slope, months_active, months_since_peak
+        tc = int(counts.sum())
+        rf = recent_fraction
+        if ma <= 8 or (rf > 0.60 and tc < 200):
+            phase = "Innovation Trigger"
+        elif (dr > 0.70 and msp <= 6) or (sl > 0.001 and dr > 0.65):
+            phase = "Peak of Inflated Expectations"
+        elif dr < 0.65:
+            phase = "Slope of Enlightenment" if sl > 0.0003 else "Trough of Disillusionment"
+        elif sl < -0.001 and dr < 0.75:
+            phase = "Trough of Disillusionment"
+        elif dr < 0.85 and sl > 0.0005 and msp > 4:
+            phase = "Slope of Enlightenment"
+        else:
+            phase = "Plateau of Productivity"
+        # recent_fraction is used for classification only; it is not stored.
+        lifecycle[topic] = {
+            "topic": topic, "phase": phase,
+            "total_count": tc, "peak_val": peak_val,
+            "peak_month": sorted_months[peak_idx],
+            "current_avg": current_avg, "slope": slope,
+            "decline_ratio": decline_ratio,
+            "months_since_peak": months_since_peak,
+            "months_active": months_active,
+        }
+    return lifecycle, sorted_months
+# ---------------------------------------------------------------------------
+# Push to HuggingFace
+# ---------------------------------------------------------------------------
+def push_lifecycle_to_hf(lifecycle_en: dict, lifecycle_zh: dict,
+                         sorted_months: list[str], n_papers: int,
+                         snapshot_month: str):
+    from datasets import Dataset
+    token = _get_env("HF_TOKEN")
+    if not token:
+        raise RuntimeError("HF_TOKEN not set")
+    row = {
+        "lifecycle_data": json.dumps(lifecycle_en, ensure_ascii=False),
+        "lifecycle_data_zh": json.dumps(lifecycle_zh, ensure_ascii=False),
+        "sorted_months": json.dumps(sorted_months, ensure_ascii=False),
+        "n_papers": n_papers,
+        "n_months": len(sorted_months),
+    }
+    ds = Dataset.from_list([row])
+    split_name = _snapshot_to_split(snapshot_month)
+    ds.push_to_hub(HF_LIFECYCLE_REPO, split=split_name, token=token)
+# ---------------------------------------------------------------------------
+# Run one snapshot
+# ---------------------------------------------------------------------------
+def run_snapshot(snapshot_month: str, all_papers: list[dict],
+                 existing_splits: set[str], no_push: bool = False):
+    split_name = _snapshot_to_split(snapshot_month)
+    if split_name in existing_splits:
+        print(f"  {_GRAY}⊘  {snapshot_month} — already on HF, skipping{_RESET}")
+        return
+    papers = [p for p in all_papers if p.get("_month", "") <= snapshot_month]
+    if not papers:
+        print(f"  {_YELLOW}⊘  {snapshot_month} — no papers, skipping{_RESET}")
+        return
+    print(f"  {_CYAN}⟳  {snapshot_month}{_RESET} — {len(papers)} papers...", end="", flush=True)
+    lc_en, months_en = compute_lifecycle(papers, lang="en")
+    lc_zh, _ = compute_lifecycle(papers, lang="zh")
+    print(f" {len(lc_en)} topics (en), {len(lc_zh)} topics (zh)", end="", flush=True)
+    if no_push:
+        print(f"  {_GRAY}[--no-push]{_RESET}")
+    else:
+        try:
+            push_lifecycle_to_hf(lc_en, lc_zh, months_en, len(papers), snapshot_month)
+            print(f"  {_GREEN}✓ pushed{_RESET}")
+        except Exception as e:
+            print(f"  {_YELLOW}✗ push failed: {e}{_RESET}")
+# ---------------------------------------------------------------------------
+# Main
+# ---------------------------------------------------------------------------
+def main():
+    parser = argparse.ArgumentParser(
+        description="Compute bimonthly topic lifecycle snapshots and push to HuggingFace"
+    )
+    parser.add_argument("--snapshot", type=str, default=None,
+                        help="Snapshot month (YYYY-MM, even month). Default: latest bimonthly.")
+    parser.add_argument("--all", action="store_true",
+                        help="Compute all missing bimonthly snapshots")
+    parser.add_argument("--no-push", action="store_true",
+                        help="Skip pushing results to HuggingFace")
+    args = parser.parse_args()
+    print(f"\n  {_BOLD}📊 Lifecycle Snapshot Retriever{_RESET}\n")
+    # Step 1: List dataset files
+    print(f"  {_DIM}Listing dataset files...{_RESET}", end="", flush=True)
+    all_files = _list_repo_files(HF_DATASET_REPO)
+    if not all_files:
+        print(f"\n  {_YELLOW}Error: could not list files — check HF_TOKEN{_RESET}")
+        return
+    print(f" {len(all_files)} files")
+    # Step 2: Load all papers
+    print(f"  {_DIM}Loading all papers...{_RESET}", end="", flush=True)
+    t0 = time.time()
+    all_papers = _load_all_papers(all_files)
+    elapsed = time.time() - t0
+    print(f" {len(all_papers)} papers in {elapsed:.1f}s")
+    if not all_papers:
+        print(f"  {_YELLOW}No papers found{_RESET}")
+        return
+    # Step 3: Determine data range
+    all_months = sorted(set(p["_month"] for p in all_papers if p.get("_month")))
+    print(f"  {_DIM}Data range: {all_months[0]} → {all_months[-1]} ({len(all_months)} months){_RESET}")
+    # List existing lifecycle splits
+    lifecycle_files = _list_repo_files(HF_LIFECYCLE_REPO)
+    existing_splits: set[str] = set()
+    for f in lifecycle_files:
+        name = f.split("/")[-1].replace(".parquet", "").replace(".arrow", "")
+        for part in name.split("-"):
+            if part.startswith("snapshot_"):
+                existing_splits.add(part)
+    # Step 4: Determine snapshots to compute
+    if args.all:
+        snapshots = [m for m in all_months if int(m[5:7]) in SNAPSHOT_MONTHS]
+    elif args.snapshot:
+        snapshots = [args.snapshot]
+    else:
+        now = datetime.now(timezone.utc)
+        last_completed = now.month - 1 if now.month > 1 else 12
+        snap_year = now.year if now.month > 1 else now.year - 1
+        snap_month = last_completed if last_completed % 2 == 0 else last_completed - 1
+        if snap_month <= 0:
+            snap_month = 12
+            snap_year -= 1
+        snapshots = [f"{snap_year}-{snap_month:02d}"]
+    print(f"  {_DIM}Snapshots to process: {len(snapshots)}{_RESET}\n")
+    for snapshot in snapshots:
+        run_snapshot(snapshot, all_papers, existing_splits, no_push=args.no_push)
+    print(f"\n  {_GREEN}{_BOLD}✓{_RESET} Done\n")
+if __name__ == "__main__":
+    main()

src/streamlit_app.py CHANGED Viewed

@@ -282,6 +282,7 @@ DATA_DIR = Path(__file__).resolve().parent.parent / "data"
 HF_DATASET_REPO = "Elfsong/hf_paper_summary"
 HF_TRENDING_REPO = "Elfsong/hf_paper_daily_trending"
 HF_MONTHLY_TRENDING_REPO = "Elfsong/hf_paper_monthly_trending"
 def _get_hf_token() -> str | None:
@@ -1001,48 +1002,9 @@ def pull_monthly_trending_from_hf(month_str: str) -> dict | None:
 # ---------------------------------------------------------------------------
-# Topic lifecycle analysis
 # ---------------------------------------------------------------------------
-@st.cache_data(ttl=3600, show_spinner=False)
-def _load_papers_for_months(months: tuple[str, ...]) -> list[dict]:
-    """Bulk-load papers for multiple months directly from parquet files."""
-    import pandas as pd
-    from huggingface_hub import hf_hub_download
-    token = _get_hf_token()
-    files = _list_repo_files_cached(HF_DATASET_REPO)
-    parquet_files = [f for f in files if f.endswith(".parquet")]
-    month_set = set(months)
-    seen_ids: set[str] = set()
-    papers: list[dict] = []
-    for pf in parquet_files:
-        fname = pf.split("/")[-1]
-        date_part = fname.split("-00")[0]
-        date_str = date_part.replace("date_", "").replace("_", "-")
-        if date_str[:7] not in month_set:
-            continue
-        try:
-            local_path = hf_hub_download(
-                HF_DATASET_REPO, pf, repo_type="dataset", token=token,
-            )
-            df = pd.read_parquet(local_path)
-            for _, row in df.iterrows():
-                paper = row.to_dict()
-                pid = paper.get("paper_id", "")
-                if pid and pid not in seen_ids:
-                    seen_ids.add(pid)
-                    paper["_date"] = date_str
-                    paper["_month"] = date_str[:7]
-                    papers.append(_parse_paper_row(paper))
-        except Exception:
-            continue
-    return papers
 _PHASES_ORDER = [
     "Innovation Trigger",
     "Peak of Inflated Expectations",
@@ -1052,86 +1014,26 @@ _PHASES_ORDER = [
 ]
-def _compute_lifecycle(papers: list[dict], lang: bool) -> tuple[dict, list[str]]:
-    """Compute lifecycle metrics and classify phases for all topics."""
-    from collections import Counter, defaultdict
-    import numpy as np
-    from scipy.stats import linregress
-    topics_by_month: dict[str, Counter] = defaultdict(Counter)
-    all_topics: Counter = Counter()
-    for p in papers:
-        month = (p.get("_date", "") or p.get("published_at", "")[:10])[:7]
-        if not month:
-            continue
-        topics = _get_paper_topics(p, lang)
-        topics_by_month[month].update(topics)
-        all_topics.update(topics)
-    sorted_months = sorted(topics_by_month.keys())
-    if len(sorted_months) < 2:
-        return {}, sorted_months
-    total_by_month = {m: sum(topics_by_month[m].values()) for m in sorted_months}
-    n_months = len(sorted_months)
-    min_papers = max(3, n_months)
-    candidates = [t for t, c in all_topics.items() if c >= min_papers]
-    lifecycle: dict = {}
-    for topic in candidates:
-        proportions = np.array([
-            topics_by_month[m].get(topic, 0) / total_by_month[m]
-            if total_by_month[m] > 0 else 0
-            for m in sorted_months
-        ])
-        counts = np.array([topics_by_month[m].get(topic, 0) for m in sorted_months])
-        nonzero = np.where(proportions > 0)[0]
-        if len(nonzero) < 2:
-            continue
-        first_idx = int(nonzero[0])
-        peak_idx = int(np.argmax(proportions))
-        peak_val = float(proportions[peak_idx])
-        current_avg = float(np.mean(proportions[-min(3, n_months):]))
-        window = min(6, n_months)
-        recent = proportions[-window:]
-        slope = float(linregress(np.arange(len(recent)), recent).slope) if len(recent) >= 3 else 0.0
-        decline_ratio = current_avg / peak_val if peak_val > 0 else 0
-        months_since_peak = n_months - 1 - peak_idx
-        months_active = n_months - first_idx
-        recent_window = min(8, len(counts))
-        recent_fraction = float(counts[-recent_window:].sum() / max(counts.sum(), 1))
-        # Phase classification (thresholds adapted for variable-length windows)
-        dr, sl, ma, msp = decline_ratio, slope, months_active, months_since_peak
-        tc = int(counts.sum())
-        if ma <= max(3, n_months * 0.4) and tc < n_months * 10:
-            phase = "Innovation Trigger"
-        elif (dr > 0.70 and msp <= max(2, n_months // 3)) or (sl > 0.001 and dr > 0.65):
-            phase = "Peak of Inflated Expectations"
-        elif dr < 0.65:
-            phase = "Slope of Enlightenment" if sl > 0.0003 else "Trough of Disillusionment"
-        elif sl < -0.001 and dr < 0.75:
-            phase = "Trough of Disillusionment"
-        elif dr < 0.85 and sl > 0.0005 and msp > max(2, n_months // 4):
-            phase = "Slope of Enlightenment"
-        else:
-            phase = "Plateau of Productivity"
-        lifecycle[topic] = {
-            "topic": topic, "phase": phase,
-            "total_count": tc, "peak_val": peak_val,
-            "peak_month": sorted_months[peak_idx],
-            "current_avg": current_avg, "slope": slope,
-            "decline_ratio": decline_ratio,
-            "months_since_peak": months_since_peak,
-            "months_active": months_active,
-        }
-    return lifecycle, sorted_months
 def _render_hype_cycle(lifecycle_data: dict, lang: bool):
@@ -1835,47 +1737,45 @@ elif active_tab == "Monthly":
 # ---- Lifecycle tab ----
 elif active_tab == "Lifecycle":
-    _lc_splits_key = "lifecycle_available_halves"
     if _lc_splits_key not in st.session_state:
-        splits = _list_dataset_splits()
-        all_dates = [_split_to_date(s) for s in splits]
-        halves: set[str] = set()
-        for d in all_dates:
-            half = "H1" if int(d[5:7]) <= 6 else "H2"
-            halves.add(f"{d[:4]}-{half}")
-        st.session_state[_lc_splits_key] = sorted(halves, reverse=True)
-    half_year_options = st.session_state[_lc_splits_key]
-    if not half_year_options:
-        st.info("No data available for lifecycle analysis.")
     else:
         with hdr[1]:
-            selected_half = st.selectbox(
-                "Select period", options=half_year_options,
                 label_visibility="collapsed", key="lifecycle_select",
             )
-        year = int(selected_half[:4])
-        if selected_half.endswith("H1"):
-            month_range = tuple(f"{year}-{m:02d}" for m in range(1, 7))
         else:
-            month_range = tuple(f"{year}-{m:02d}" for m in range(7, 13))
-        _lc_papers_key = f"lifecycle_papers_{selected_half}"
-        if _lc_papers_key not in st.session_state:
-            with st.spinner(f"Loading papers for {selected_half}..."):
-                st.session_state[_lc_papers_key] = _load_papers_for_months(month_range)
-        lc_papers = st.session_state[_lc_papers_key]
-        if not lc_papers:
-            st.warning(f"No papers found for {selected_half}")
         else:
-            st.metric("Papers", f"{len(lc_papers):,}")
-            _lc_data_key = f"lifecycle_data_{selected_half}_{lang}"
-            if _lc_data_key not in st.session_state:
-                st.session_state[_lc_data_key] = _compute_lifecycle(lc_papers, lang)
-            lc_data, lc_months = st.session_state[_lc_data_key]
             if not lc_data:
                 st.warning("Not enough data for lifecycle analysis.")

 HF_DATASET_REPO = "Elfsong/hf_paper_summary"
 HF_TRENDING_REPO = "Elfsong/hf_paper_daily_trending"
 HF_MONTHLY_TRENDING_REPO = "Elfsong/hf_paper_monthly_trending"
+HF_LIFECYCLE_REPO = "Elfsong/hf_paper_lifecycle"
 def _get_hf_token() -> str | None:
 # ---------------------------------------------------------------------------
+# Topic lifecycle (read-only from HF, generated by lifecycle_retrieve.py)
 # ---------------------------------------------------------------------------
 _PHASES_ORDER = [
     "Innovation Trigger",
     "Peak of Inflated Expectations",
 ]
+@st.cache_data(ttl=300, show_spinner=False)
+def pull_lifecycle_from_hf(snapshot_str: str) -> dict | None:
+    """Load a pre-computed lifecycle snapshot from HF."""
+    log.info("[pull_lifecycle] snapshot_str=%s", snapshot_str)
+    files = _list_repo_files_cached(HF_LIFECYCLE_REPO)
+    splits = _extract_splits(files, prefix="snapshot_")
+    target_split = "snapshot_" + snapshot_str.replace("-", "_")
+    if target_split not in splits:
+        return None
+    rows = _download_split_rows(HF_LIFECYCLE_REPO, target_split)
+    if not rows:
+        return None
+    row = rows[0]
+    return {
+        "lifecycle_data": json.loads(row.get("lifecycle_data", "{}")),
+        "lifecycle_data_zh": json.loads(row.get("lifecycle_data_zh", "{}")),
+        "sorted_months": json.loads(row.get("sorted_months", "[]")),
+        "n_papers": row.get("n_papers", 0),
+        "n_months": row.get("n_months", 0),
+    }
 def _render_hype_cycle(lifecycle_data: dict, lang: bool):
 # ---- Lifecycle tab ----
 elif active_tab == "Lifecycle":
+    _lc_splits_key = "lifecycle_available_snapshots"
     if _lc_splits_key not in st.session_state:
+        lc_files = _list_repo_files_cached(HF_LIFECYCLE_REPO)
+        st.session_state[_lc_splits_key] = sorted(
+            [s.replace("snapshot_", "").replace("_", "-")
+             for s in _extract_splits(lc_files, prefix="snapshot_")],
+            reverse=True,
+        )
+    snapshot_options = st.session_state[_lc_splits_key]
+    if not snapshot_options:
+        st.info("No lifecycle data available yet. Run `uv run python src/lifecycle_retrieve.py --all` to generate.")
     else:
         with hdr[1]:
+            selected_snapshot = st.selectbox(
+                "Select snapshot", options=snapshot_options,
                 label_visibility="collapsed", key="lifecycle_select",
             )
+        _lc_cache_key = f"lifecycle_{selected_snapshot}"
+        lc_raw = None
+        if _lc_cache_key in st.session_state:
+            lc_raw = st.session_state[_lc_cache_key]
         else:
+            lc_raw = pull_lifecycle_from_hf(selected_snapshot)
+            if lc_raw:
+                st.session_state[_lc_cache_key] = lc_raw
+        if not lc_raw:
+            st.warning(f"Could not load lifecycle data for {selected_snapshot}")
         else:
+            lc_data = lc_raw["lifecycle_data_zh"] if lang else lc_raw["lifecycle_data"]
+            sorted_months = lc_raw["sorted_months"]
+            st.metric("Papers", f"{lc_raw['n_papers']:,}")
+            if sorted_months:
+                st.caption(
+                    f"{lc_raw['n_months']} months ({sorted_months[0]} → {sorted_months[-1]})"
+                )
             if not lc_data:
                 st.warning("Not enough data for lifecycle analysis.")