Spaces:
Running
Running
| import json | |
| import glob | |
| import logging | |
| from datetime import datetime, timedelta, timezone | |
| from pathlib import Path | |
| import streamlit as st | |
# Configure root logging once at import time so every logger in the process
# shares the same concise "HH:MM:SS [LEVEL] message" format.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    datefmt="%H:%M:%S",
)
# Module-level logger used by all data-loading helpers below.
log = logging.getLogger("paper_reader")
# ---------------------------------------------------------------------------
# Page config
# ---------------------------------------------------------------------------
# Must run before any other Streamlit call. Wide layout; the sidebar starts
# collapsed (and is hidden entirely by the CSS injected further down).
st.set_page_config(
    page_title="Paper Espresso",
    page_icon="☕️",
    layout="wide",
    initial_sidebar_state="collapsed",
)
# ---------------------------------------------------------------------------
# Custom CSS – HuggingFace-inspired design
# ---------------------------------------------------------------------------
# Single global <style> injection: light palette, card/pill styling, equal-
# height card columns, and removal of Streamlit chrome (sidebar, footer,
# main menu, deploy button). Selectors target Streamlit's data-testid DOM,
# which can change between Streamlit versions — revisit on upgrades.
st.markdown(
    """
<style>
/* ---------- global ---------- */
[data-testid="stAppViewContainer"] { background: #f6f8fa; }
[data-testid="stHeader"] { background: #f6f8fa; }
.block-container { padding-top: 3rem !important; }
h1, h2, h3, h4 { color: #1f2328 !important; }
p, li, span, label { color: #424a53; }
/* ---------- upvote / rank ---------- */
.upvote-badge {
display: inline-flex; align-items: center; gap: 5px;
background: #fff8e1;
border: 1px solid #f0d060;
padding: 4px 12px; border-radius: 20px;
font-size: 13px; font-weight: 700; color: #9a6700;
flex-shrink: 0;
}
.paper-rank {
display: inline-flex; align-items: center; justify-content: center;
width: 28px; height: 28px; border-radius: 8px;
font-weight: 700; font-size: 13px;
background: #eef1f5; color: #656d76;
flex-shrink: 0;
}
.paper-rank.top3 {
background: linear-gradient(135deg, #dbeafe, #ede9fe);
color: #2563eb;
}
.paper-authors {
font-size: 13px;
color: #656d76;
margin-bottom: 12px;
line-height: 1.5;
}
.paper-links {
display: flex; gap: 8px; flex-wrap: wrap;
}
.paper-links a {
display: inline-flex; align-items: center; gap: 4px;
padding: 4px 12px; border-radius: 8px;
border: 1px solid #d1d9e0; color: #656d76;
text-decoration: none; font-size: 12px; font-weight: 500;
transition: all 0.2s;
}
.paper-links a:hover {
border-color: #2563eb; color: #2563eb;
background: rgba(37,99,235,0.05);
}
/* ---------- stats bar ---------- */
.stats-bar {
display: flex; gap: 32px; padding: 16px 24px;
background: #ffffff; border: 1px solid #d1d9e0; border-radius: 14px;
margin-bottom: 28px; flex-wrap: wrap;
}
.stat-item { font-size: 13px; color: #656d76; }
.stat-value { font-weight: 700; color: #1f2328; font-size: 18px; margin-right: 6px; }
/* ---------- dialog styles ---------- */
div[role="dialog"] {
background: #ffffff !important;
border: 1px solid #d1d9e0 !important;
border-radius: 16px !important;
}
div[role="dialog"] h3, div[role="dialog"] h4 { color: #1f2328 !important; }
div[role="dialog"] p, div[role="dialog"] li { color: #424a53 !important; }
div[role="dialog"] hr { border-color: #d1d9e0 !important; }
/* pros / cons in dialog */
.pros-box, .cons-box { padding: 14px 16px; border-radius: 10px; margin-bottom: 12px; }
.pros-box { background: #f0fdf4; border: 1px solid #bbf7d0; }
.cons-box { background: #fef2f2; border: 1px solid #fecaca; }
.section-label {
font-size: 11px; font-weight: 700; text-transform: uppercase;
letter-spacing: .8px; margin-bottom: 10px;
}
.pros-box .section-label { color: #16a34a; }
.cons-box .section-label { color: #dc2626; }
.point {
font-size: 13px; line-height: 1.6; color: #424a53;
padding: 6px 0 6px 18px; position: relative;
border-bottom: 1px solid rgba(0,0,0,.05);
}
.point:last-child { border-bottom: none; }
.point::before {
content: ''; position: absolute; left: 0; top: 14px;
width: 6px; height: 6px; border-radius: 50%;
}
.pros-box .point::before { background: #16a34a; }
.cons-box .point::before { background: #dc2626; }
/* card image – full width flush to container */
div[data-testid="stColumn"] div[data-testid="stImage"] {
aspect-ratio: 2 / 1;
overflow: hidden !important;
margin: 0 !important;
padding: 0 !important;
}
div[data-testid="stColumn"] div[data-testid="stImage"] img {
width: 100% !important;
height: 100% !important;
object-fit: cover !important;
border-radius: 14px 14px 0 0 !important;
}
/* ---------- topic filter pills — override Streamlit primary color ---------- */
:root {
--primary-color: #2563eb !important;
}
div[data-testid="stPills"] button {
padding: 2px 10px !important;
border-radius: 12px !important;
font-size: 12px !important;
font-weight: 600 !important;
min-height: 0 !important;
height: auto !important;
line-height: 1.4 !important;
}
div[data-testid="stPills"] button p {
font-size: 12px !important;
font-weight: 600 !important;
line-height: 1.4 !important;
margin: 0 !important;
padding: 0 !important;
}
div[data-testid="stPills"] label {
font-size: 12px !important;
color: #656d76 !important;
}
div[data-testid="stPills"] [role="tablist"] {
gap: 6px !important;
}
/* ---------- hide streamlit defaults ---------- */
.stDeployButton, footer, #MainMenu,
[data-testid="stSidebar"], [data-testid="collapsedControl"] { display: none !important; }
/* style the card button (title) – max 3 lines */
div[data-testid="stColumn"] button[data-testid="stBaseButton-secondary"] {
background: transparent !important;
border: none !important;
padding: 0 !important;
text-align: left !important;
color: #1f2328 !important;
font-size: 16px !important;
font-weight: 700 !important;
line-height: 1.4 !important;
width: 100% !important;
display: -webkit-box !important;
-webkit-line-clamp: 3 !important;
-webkit-box-orient: vertical !important;
overflow: hidden !important;
min-height: calc(16px * 1.4 * 3) !important;
max-height: calc(16px * 1.4 * 3) !important;
}
div[data-testid="stColumn"] button[data-testid="stBaseButton-secondary"]:hover {
color: #2563eb !important;
background: transparent !important;
border: none !important;
}
/* authors – max 2 lines */
.paper-authors {
display: -webkit-box;
-webkit-line-clamp: 2;
-webkit-box-orient: vertical;
overflow: hidden;
min-height: calc(13px * 1.5 * 2);
max-height: calc(13px * 1.5 * 2);
}
/* card topic tags – max 2 lines, reserve space for 2 rows */
.card-topics {
display: flex;
align-items: flex-start;
align-content: flex-start;
gap: 4px;
flex-wrap: wrap;
padding: 0 4px;
margin-top: 4px;
margin-bottom: 8px;
overflow: hidden;
min-height: 42px;
max-height: 42px;
}
/* container styling – equal height + clear border */
div[data-testid="stVerticalBlockBorderWrapper"] {
border: 2px solid #d1d9e0 !important;
border-radius: 16px !important;
background: #ffffff !important;
overflow: hidden !important;
height: 100%;
padding: 0 !important;
}
/* remove all inner padding from bordered container */
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] > div {
padding: 0 !important;
}
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] > div > div {
padding: 0 !important;
gap: 0 !important;
}
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] > div > div > div {
padding: 0 !important;
gap: 0.25rem !important;
}
/* add padding back to non-image elements */
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] button,
div[data-testid="stColumn"] div[data-testid="stVerticalBlockBorderWrapper"] div[data-testid="stMarkdownContainer"] {
margin-left: 1rem !important;
margin-right: 1rem !important;
}
div[data-testid="stVerticalBlockBorderWrapper"]:hover {
border-color: #2563eb !important;
box-shadow: 0 4px 16px rgba(0,0,0,0.08);
}
/* make columns stretch to equal height */
div[data-testid="stHorizontalBlock"] {
align-items: stretch !important;
}
div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] {
display: flex !important;
flex-direction: column !important;
}
div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] > div {
flex: 1 !important;
display: flex !important;
flex-direction: column !important;
}
div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] > div > div[data-testid="stVerticalBlockBorderWrapper"] {
flex: 1 !important;
}
</style>
""",
    unsafe_allow_html=True,
)
| # --------------------------------------------------------------------------- | |
| # Data helpers | |
| # --------------------------------------------------------------------------- | |
# Local fallback directory with pre-summarized JSON files (../data relative
# to this app file).
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
# HF dataset repos this app reads from (populated by companion scripts,
# e.g. monthly_retrieve.py / lifecycle_retrieve.py — see sections below).
HF_DATASET_REPO = "Elfsong/hf_paper_summary"                      # per-paper summaries, one split per day
HF_TRENDING_REPO = "Elfsong/hf_paper_daily_trending"              # daily trending digest, one split per day
HF_MONTHLY_TRENDING_REPO = "Elfsong/hf_paper_monthly_trending"    # monthly trending digest
HF_LIFECYCLE_REPO = "Elfsong/hf_paper_lifecycle"                  # topic lifecycle snapshots
| def _get_hf_token() -> str | None: | |
| import os | |
| token = os.getenv("HF_TOKEN", "") | |
| if token: | |
| return token | |
| env_path = Path(__file__).resolve().parent.parent / ".env" | |
| if env_path.exists(): | |
| for line in env_path.read_text().splitlines(): | |
| if line.startswith("HF_TOKEN="): | |
| return line.split("=", 1)[1].strip() | |
| return None | |
| def _date_to_split(date_str: str) -> str: | |
| """Convert '2026-03-11' to 'date_2026_03_11' for valid split name.""" | |
| return "date_" + date_str.replace("-", "_") | |
| def _split_to_date(split_name: str) -> str: | |
| """Convert 'date_2026_03_11' back to '2026-03-11'.""" | |
| return split_name.replace("date_", "", 1).replace("_", "-") | |
| def _month_to_split(month_str: str) -> str: | |
| """Convert '2026-03' to 'month_2026_03'.""" | |
| return "month_" + month_str.replace("-", "_") | |
| def _last_day_of_month(year: int, month: int): | |
| """Return the last date of the given month.""" | |
| if month == 12: | |
| return datetime(year + 1, 1, 1, tzinfo=timezone.utc).date() - timedelta(days=1) | |
| return datetime(year, month + 1, 1, tzinfo=timezone.utc).date() - timedelta(days=1) | |
def _list_repo_files(repo: str) -> list[str]:
    """List all files in a HF dataset repo (uncached, usable from any thread)."""
    from huggingface_hub import HfApi

    log.info("[_list_repo_files] listing files for %s", repo)
    api = HfApi(token=_get_hf_token())
    try:
        files = [path for path in api.list_repo_files(repo, repo_type="dataset")]
        log.info("[_list_repo_files] %s → %d files", repo, len(files))
        return files
    except Exception as exc:
        # Best effort: an unreachable Hub degrades to an empty listing.
        log.error("[_list_repo_files] %s failed: %s", repo, exc)
        return []
# Fix: the docstring promised Streamlit caching, but the function was never
# decorated — every caller hit the Hub API again, defeating the purpose of the
# cached/uncached split. st.cache_data memoizes per `repo` across reruns; the
# 1-hour TTL lets newly pushed daily splits show up without a restart.
@st.cache_data(ttl=3600, show_spinner=False)
def _list_repo_files_cached(repo: str) -> list[str]:
    """List all files in a HF dataset repo (Streamlit cached, 1h TTL).

    Use ``_list_repo_files`` directly when calling from a non-script thread.
    """
    return _list_repo_files(repo)
| def _extract_splits(files: list[str], prefix: str = "date_") -> list[str]: | |
| """Extract sorted split names from a list of repo file paths.""" | |
| splits = set() | |
| for f in files: | |
| name = f.split("/")[-1] | |
| for part in name.replace(".parquet", "").replace(".arrow", "").split("-"): | |
| if part.startswith(prefix): | |
| splits.add(part) | |
| break | |
| return sorted(splits, reverse=True) | |
def _list_dataset_splits() -> list[str]:
    """List available date splits from the HF dataset repo without loading data."""
    repo_files = _list_repo_files_cached(HF_DATASET_REPO)
    return _extract_splits(repo_files)
def _download_split_rows(repo: str, split_name: str) -> list[dict]:
    """Download only the parquet files for ONE split, return rows as list[dict].

    Uses hf_hub_download (per-file) instead of load_dataset (all-splits), so a
    single day/month/snapshot can be fetched without pulling the whole repo.

    Args:
        repo: HF dataset repo id.
        split_name: split token embedded in the parquet file names
            (e.g. ``date_2026_03_11``); matched as a substring of the path.

    Returns:
        All rows of the matched files concatenated, as a list of dicts;
        ``[]`` when nothing matches or every file fails to download/read.
    """
    import pandas as pd
    from huggingface_hub import hf_hub_download
    log.info("[_download_split_rows] repo=%s split=%s", repo, split_name)
    token = _get_hf_token()
    files = _list_repo_files_cached(repo)
    # NOTE(review): substring match — relies on no split name being a prefix
    # of another (holds for zero-padded date/month/snapshot names).
    split_files = [f for f in files if split_name in f and f.endswith(".parquet")]
    log.debug("[_download_split_rows] matched %d parquet files: %s", len(split_files), split_files)
    if not split_files:
        return []
    dfs = []
    for f in split_files:
        try:
            log.info("[_download_split_rows] downloading %s", f)
            local_path = hf_hub_download(
                repo, f, repo_type="dataset", token=token
            )
            log.info("[_download_split_rows] reading parquet %s", local_path)
            dfs.append(pd.read_parquet(local_path))
        except Exception as e:
            # Best effort: skip a broken file rather than failing the split.
            log.error("[_download_split_rows] failed on %s: %s", f, e)
            continue
    if not dfs:
        return []
    result = pd.concat(dfs, ignore_index=True).to_dict("records")
    log.info("[_download_split_rows] returning %d rows", len(result))
    return result
| def _parse_paper_row(paper: dict) -> dict: | |
| """Decode JSON string fields in a paper row.""" | |
| for key in ("detailed_analysis", "detailed_analysis_zh"): | |
| v = paper.get(key, "{}") | |
| if isinstance(v, str): | |
| paper[key] = json.loads(v) if v else {} | |
| for key in ("topics", "topics_zh", "keywords", "keywords_zh"): | |
| v = paper.get(key, "[]") | |
| if isinstance(v, str): | |
| paper[key] = json.loads(v) if v else [] | |
| # pandas may convert list columns to numpy arrays | |
| if not isinstance(paper.get("authors"), list): | |
| try: | |
| paper["authors"] = list(paper["authors"]) | |
| except Exception: | |
| paper["authors"] = [] | |
| return paper | |
def pull_from_hf_dataset(target_date: str | None = None) -> dict[str, list[dict]]:
    """Load a single date split from HF dataset.

    Returns {date_str: papers_list}, or {} when nothing is available."""
    log.info("[pull_from_hf_dataset] target_date=%s", target_date)
    splits = _list_dataset_splits()
    if not splits:
        return {}
    # Pick the requested split, or fall back to the most recent one.
    if target_date:
        chosen = _date_to_split(target_date)
        if chosen not in splits:
            return {}
    else:
        chosen = splits[0]
    rows = _download_split_rows(HF_DATASET_REPO, chosen)
    if not rows:
        return {}
    return {_split_to_date(chosen): [_parse_paper_row(r) for r in rows]}
def list_available_dates() -> list[str]:
    """Return available dates (YYYY-MM-DD) from HF dataset and local files, sorted descending."""
    log.info("[list_available_dates] START")
    # Union of dates known to the HF dataset and dates cached locally.
    dates = {_split_to_date(split) for split in _list_dataset_splits()}
    dates.update(find_json_files().keys())
    result = sorted(dates, reverse=True)
    log.info("[list_available_dates] found %d dates", len(result))
    return result
def find_json_files() -> dict[str, Path]:
    """Return {date_str: path} for all summarized JSON files in DATA_DIR.

    File names look like ``hf_papers_<YYYY-MM-DD>_summarized.json``; the date
    is the 10-character ``YYYY-MM-DD`` token of the stem. The result is sorted
    by date, newest first.
    """
    files: dict[str, Path] = {}
    # Idiom: Path.glob instead of glob.glob(str(...)) — same matches, no
    # str/Path round-trip; yields nothing when DATA_DIR does not exist.
    for p in DATA_DIR.glob("hf_papers_*_summarized.json"):
        for part in p.stem.split("_"):
            if len(part) == 10 and part[4] == "-" and part[7] == "-":
                files[part] = p
                break
    return dict(sorted(files.items(), reverse=True))
def load_papers(source) -> list[dict]:
    """Load a papers list from a JSON file path (str/Path) or an open file-like object."""
    if isinstance(source, (str, Path)):
        text = Path(source).read_text(encoding="utf-8")
        return json.loads(text)
    return json.loads(source.read())
# Social-card thumbnail URL template; {pid} is the paper_id used on HF papers.
HF_THUMB = "https://cdn-thumbnails.huggingface.co/social-thumbnails/papers/{pid}.png"
def load_papers_for_dates(dates: tuple[str, ...]) -> list[dict]:
    """Load and deduplicate papers across multiple dates (for monthly)."""
    deduped: list[dict] = []
    seen: set[str] = set()
    for day in dates:
        # HF dataset first, local summarized JSON as fallback.
        papers = pull_from_hf_dataset(target_date=day).get(day) or []
        if not papers:
            local = find_json_files()
            if day in local:
                papers = load_papers(local[day])
        for paper in papers:
            pid = paper.get("paper_id", "")
            if pid and pid not in seen:
                seen.add(pid)
                deduped.append(paper)
    return deduped
| # --------------------------------------------------------------------------- | |
| # Trending summary | |
| # --------------------------------------------------------------------------- | |
| def _deserialize_trending_row(row: dict) -> dict: | |
| """Deserialize JSON string fields in a trending row.""" | |
| for key in ("top_topics", "top_topics_zh", "keywords", "keywords_zh"): | |
| v = row.get(key, "[]") | |
| if isinstance(v, str): | |
| row[key] = json.loads(v) if v else [] | |
| for key in ("topic_mapping", "topic_mapping_zh"): | |
| v = row.get(key) | |
| if isinstance(v, str): | |
| row[key] = json.loads(v) if v else {} | |
| return row | |
def pull_trending_from_hf(target_date: str | None = None) -> dict | None:
    """Load trending summary from HF dataset. Returns dict or None."""
    log.info("[pull_trending_from_hf] target_date=%s", target_date)
    splits = _extract_splits(_list_repo_files_cached(HF_TRENDING_REPO))
    if not splits:
        return None
    # Requested date, or the latest available split.
    if target_date:
        wanted = _date_to_split(target_date)
        if wanted not in splits:
            return None
    else:
        wanted = splits[0]
    rows = _download_split_rows(HF_TRENDING_REPO, wanted)
    return _deserialize_trending_row(rows[0]) if rows else None
def get_cached_trending(date_str: str) -> tuple[dict | None, str]:
    """Try to load trending from HF cache only (no generation).

    Returns (trending_dict, date_range_str); (None, "") when unavailable."""
    log.info("[get_cached_trending] date_str=%s", date_str)
    trending = pull_trending_from_hf(target_date=date_str)
    if not trending:
        return None, ""
    return trending, trending.get("date_range", "")
| # --------------------------------------------------------------------------- | |
| # Monthly trending (read-only from HF, generated by monthly_retrieve.py) | |
| # --------------------------------------------------------------------------- | |
def pull_monthly_trending_from_hf(month_str: str) -> dict | None:
    """Load monthly trending summary from HF dataset."""
    log.info("[pull_monthly_trending] month_str=%s", month_str)
    available = _extract_splits(
        _list_repo_files_cached(HF_MONTHLY_TRENDING_REPO), prefix="month_"
    )
    wanted = _month_to_split(month_str)
    # Covers both "repo empty" and "month not published yet".
    if wanted not in available:
        return None
    rows = _download_split_rows(HF_MONTHLY_TRENDING_REPO, wanted)
    return _deserialize_trending_row(rows[0]) if rows else None
| # --------------------------------------------------------------------------- | |
| # Topic lifecycle (read-only from HF, generated by lifecycle_retrieve.py) | |
| # --------------------------------------------------------------------------- | |
# Gartner hype-cycle phases in left-to-right curve order; this order also
# drives topic selection and label placement in _render_hype_cycle.
_PHASES_ORDER = [
    "Innovation Trigger",
    "Peak of Inflated Expectations",
    "Trough of Disillusionment",
    "Slope of Enlightenment",
    "Plateau of Productivity",
]
def pull_lifecycle_from_hf(snapshot_str: str) -> dict | None:
    """Load a pre-computed lifecycle snapshot from HF.

    Args:
        snapshot_str: snapshot id with dashes (they become underscores in the
            split name, e.g. ``snapshot_2026_03_11``).

    Returns:
        Dict with the decoded lifecycle payloads, or None when the snapshot
        split does not exist or contains no rows.

    Robustness fix: an empty-string cell no longer crashes ``json.loads`` —
    it falls back to the field's empty default (also DRYs seven copy-pasted
    decode lines into one table-driven loop).
    """
    log.info("[pull_lifecycle] snapshot_str=%s", snapshot_str)
    files = _list_repo_files_cached(HF_LIFECYCLE_REPO)
    splits = _extract_splits(files, prefix="snapshot_")
    target_split = "snapshot_" + snapshot_str.replace("-", "_")
    if target_split not in splits:
        return None
    rows = _download_split_rows(HF_LIFECYCLE_REPO, target_split)
    if not rows:
        return None
    row = rows[0]
    # JSON-encoded columns and their empty defaults.
    json_fields = {
        "lifecycle_data": "{}",
        "lifecycle_data_zh": "{}",
        "sorted_months": "[]",
        "topics_by_month": "{}",
        "total_by_month": "{}",
        "topics_by_month_zh": "{}",
        "total_by_month_zh": "{}",
    }
    result = {
        key: json.loads(row.get(key, default) or default)
        for key, default in json_fields.items()
    }
    result["n_papers"] = row.get("n_papers", 0)
    result["n_months"] = row.get("n_months", 0)
    return result
def _render_hype_cycle(lifecycle_data: dict, lang: bool):
    """Render a Gartner-style hype cycle figure with matplotlib.

    Args:
        lifecycle_data: mapping topic-id -> dict; each value is read for
            ``phase``, ``topic``, ``total_count`` and ``current_avg``.
        lang: True renders Chinese labels (and tries to pick a CJK-capable
            font), False renders English labels.

    Returns:
        A matplotlib Figure, or None when no topic survives per-phase
        selection.
    """
    import matplotlib
    matplotlib.use("Agg")  # headless backend — figure is only rasterized for the page
    import matplotlib.pyplot as plt
    import matplotlib.patheffects as pe
    import matplotlib.font_manager as fm
    import numpy as np
    from scipy.interpolate import CubicSpline
    from collections import defaultdict
    # Use a CJK-capable font when rendering Chinese
    if lang:
        cjk_candidates = [
            "Noto Sans CJK JP", "Noto Sans CJK SC", "Noto Sans CJK TC",
            "Noto Sans CJK", "PingFang SC", "Heiti SC", "Heiti TC",
            "Microsoft YaHei", "Noto Sans SC", "WenQuanYi Micro Hei",
            "Source Han Sans SC", "SimHei", "AR PL UMing CN",
        ]
        available = {f.name for f in fm.fontManager.ttflist}
        # First candidate actually installed wins; otherwise keep defaults
        # (CJK glyphs may then render as boxes).
        cjk_font = next((f for f in cjk_candidates if f in available), None)
        if cjk_font:
            plt.rcParams["font.family"] = "sans-serif"
            plt.rcParams["font.sans-serif"] = [cjk_font]
            plt.rcParams["axes.unicode_minus"] = False
    # Bucket topics by lifecycle phase.
    phase_groups: dict[str, list] = defaultdict(list)
    for lc in lifecycle_data.values():
        phase_groups[lc["phase"]].append(lc)
    # Per-phase ranking: emerging topics by recent average, the rest by volume.
    sort_keys = {
        "Innovation Trigger": lambda x: -x["current_avg"],
        "Peak of Inflated Expectations": lambda x: -x["total_count"],
        "Trough of Disillusionment": lambda x: -x["total_count"],
        "Slope of Enlightenment": lambda x: -x["total_count"],
        "Plateau of Productivity": lambda x: -x["total_count"],
    }
    for phase, key_fn in sort_keys.items():
        if phase in phase_groups:
            phase_groups[phase].sort(key=key_fn)
    # Cap labels per phase so annotations stay legible.
    max_per_phase = {
        "Innovation Trigger": 3,
        "Peak of Inflated Expectations": 4,
        "Trough of Disillusionment": 4,
        "Slope of Enlightenment": 3,
        "Plateau of Productivity": 2,
    }
    selected = []
    for phase in _PHASES_ORDER:
        selected.extend(phase_groups.get(phase, [])[:max_per_phase[phase]])
    if not selected:
        return None
    # Hype cycle curve (cubic spline through control points)
    ctrl_x = np.array([0, 0.5, 1, 1.5, 2, 2.3, 2.8, 3.2, 3.8, 4.5, 5.5, 6.5, 7.5, 8.5, 10])
    ctrl_y = np.array([.02, .08, .22, .58, .98, .78, .38, .16, .10, .15, .26, .36, .42, .45, .47])
    cs = CubicSpline(ctrl_x, ctrl_y)
    curve_x = np.linspace(0, 10, 500)
    curve_y = cs(curve_x)
    # X-interval of the curve assigned to each phase (data coordinates).
    phase_ranges = {
        "Innovation Trigger": (0.3, 1.3),
        "Peak of Inflated Expectations": (1.4, 2.6),
        "Trough of Disillusionment": (2.8, 4.3),
        "Slope of Enlightenment": (4.8, 7.2),
        "Plateau of Productivity": (7.5, 9.5),
    }
    phase_colors = {
        "Innovation Trigger": "#16a34a",
        "Peak of Inflated Expectations": "#dc2626",
        "Trough of Disillusionment": "#2563eb",
        "Slope of Enlightenment": "#d97706",
        "Plateau of Productivity": "#6b7280",
    }
    phase_labels = {
        "Innovation Trigger": "技术\n萌芽期" if lang else "Innovation\nTrigger",
        "Peak of Inflated Expectations": "期望\n膨胀期" if lang else "Peak of Inflated\nExpectations",
        "Trough of Disillusionment": "泡沫\n破裂期" if lang else "Trough of\nDisillusionment",
        "Slope of Enlightenment": "稳步\n爬升期" if lang else "Slope of\nEnlightenment",
        "Plateau of Productivity": "生产\n成熟期" if lang else "Plateau of\nProductivity",
    }
    # Alternating (dy, vertical-alignment) offsets so neighbouring topic
    # labels don't overlap.
    offset_patterns = {
        "Innovation Trigger": [(0.10, "bottom"), (-0.08, "top"), (0.15, "bottom")],
        "Peak of Inflated Expectations": [(0.16, "bottom"), (-0.13, "top"), (0.10, "bottom"), (-0.09, "top")],
        "Trough of Disillusionment": [(0.10, "bottom"), (-0.07, "top"), (0.14, "bottom"), (-0.10, "top")],
        "Slope of Enlightenment": [(0.10, "bottom"), (-0.08, "top"), (0.14, "bottom")],
        "Plateau of Productivity": [(0.10, "bottom"), (-0.08, "top")],
    }
    fig, ax = plt.subplots(figsize=(10, 5))
    ax.plot(curve_x, curve_y, color="#d1d5db", linewidth=3.5, zorder=1, solid_capstyle="round")
    ax.fill_between(curve_x, 0, curve_y, alpha=0.03, color="#9ca3af")
    # Faint vertical separators between phases.
    for bx in [1.35, 2.7, 4.55, 7.35]:
        ax.axvline(bx, color="#e5e7eb", linewidth=0.6, linestyle="--", zorder=0)
    # Place each selected topic as a dot on the curve plus an annotated label.
    for phase in _PHASES_ORDER:
        x_lo, x_hi = phase_ranges[phase]
        pts = [lc for lc in selected if lc["phase"] == phase]
        if not pts:
            continue
        # Evenly spaced inside the phase range, skipping the two endpoints.
        x_positions = np.linspace(x_lo, x_hi, len(pts) + 2)[1:-1]
        color = phase_colors[phase]
        offsets = offset_patterns[phase]
        for i, lc in enumerate(pts):
            xp = x_positions[i]
            yp = float(cs(xp))
            # Dot size scales with paper volume, clamped to [4, 10].
            dot_size = max(4, min(10, lc["total_count"] / 30))
            ax.plot(xp, yp, "o", color=color, markersize=dot_size, zorder=3,
                    markeredgecolor="white", markeredgewidth=0.6)
            offset_y, va = offsets[i % len(offsets)]
            ax.annotate(
                lc["topic"], xy=(xp, yp), xytext=(xp, yp + offset_y),
                fontsize=7, color=color, fontweight="bold", ha="center", va=va,
                arrowprops=dict(arrowstyle="-", color=color, alpha=0.3, lw=0.5),
                path_effects=[pe.withStroke(linewidth=2.5, foreground="white")],
            )
    # Phase names below the axis.
    for phase in _PHASES_ORDER:
        x_lo, x_hi = phase_ranges[phase]
        ax.text((x_lo + x_hi) / 2, -0.10, phase_labels[phase],
                fontsize=7, ha="center", va="top",
                color=phase_colors[phase], fontweight="bold", style="italic")
    ax.set_xlim(-0.3, 10.3)
    ax.set_ylim(-0.22, 1.20)
    ax.set_ylabel("关注度" if lang else "Visibility", fontsize=10)
    # Maturity arrow along the x direction.
    ax.annotate(
        "成熟度" if lang else "Maturity",
        xy=(10.2, -0.03), xytext=(8.5, -0.03), fontsize=9,
        arrowprops=dict(arrowstyle="->", color="#6b7280", lw=1.2),
        color="#6b7280", va="center",
    )
    # Minimal chrome: no top/right/bottom spines, no ticks.
    ax.spines["top"].set_visible(False)
    ax.spines["right"].set_visible(False)
    ax.spines["bottom"].set_visible(False)
    ax.set_xticks([])
    ax.set_yticks([])
    fig.tight_layout()
    return fig
| # --------------------------------------------------------------------------- | |
| # Summary dialog | |
| # --------------------------------------------------------------------------- | |
def show_summary(paper: dict):
    """Render the full summary view for one paper (title, links, tags,
    TL;DR, detailed analysis with pros/cons columns).

    Reads the global ``lang_toggle`` session flag: True shows Chinese fields,
    falling back to English ones when the Chinese field is empty.
    """
    st.markdown(f"### {paper.get('title', '')}")
    # Authors
    authors = paper.get("authors", [])
    if authors:
        st.caption(", ".join(authors))
    # Resource links
    links_html = f"""<div class="paper-links" style="margin-bottom:12px;">
<a href="{paper.get('hf_url','#')}" target="_blank">🤗 HuggingFace</a>
<a href="{paper.get('arxiv_url','#')}" target="_blank">📄 arXiv</a>
<a href="{paper.get('pdf_url','#')}" target="_blank">📥 PDF</a>
</div>"""
    st.markdown(links_html, unsafe_allow_html=True)
    # Use global language toggle
    lang = st.session_state.get("lang_toggle", False)
    # Topics & Keywords
    if lang:
        topics = paper.get("topics_zh", []) or paper.get("topics", [])
        kws = paper.get("keywords_zh", []) or paper.get("keywords", [])
    else:
        topics = paper.get("topics", [])
        kws = paper.get("keywords", [])
    if topics or kws:
        lines = []
        if topics:
            # Blue pills for topics.
            topic_spans = "".join(
                f'<span style="background:#eef1f5;padding:3px 10px;border-radius:12px;'
                f'font-size:12px;font-weight:600;color:#2563eb;">{t}</span>'
                for t in topics
            )
            lines.append(f'<div style="display:flex;gap:6px;flex-wrap:wrap;">{topic_spans}</div>')
        if kws:
            # Amber pills for keywords.
            kw_spans = "".join(
                f'<span style="background:#fff8e1;padding:3px 10px;border-radius:12px;'
                f'font-size:11px;font-weight:500;color:#9a6700;border:1px solid #f0d060;">{k}</span>'
                for k in kws
            )
            lines.append(f'<div style="display:flex;gap:6px;flex-wrap:wrap;">{kw_spans}</div>')
        st.markdown(
            f'<div style="display:flex;flex-direction:column;gap:8px;margin-bottom:12px;">{"".join(lines)}</div>',
            unsafe_allow_html=True,
        )
    # TL;DR
    if lang:
        concise = paper.get("concise_summary_zh", "") or paper.get(
            "concise_summary", ""
        )
    else:
        concise = paper.get("concise_summary", "")
    if concise:
        st.markdown("#### 📝 TL;DR")
        st.markdown(concise)
    # Detailed Analysis
    if lang:
        analysis = paper.get("detailed_analysis_zh", {}) or paper.get(
            "detailed_analysis", {}
        )
    else:
        analysis = paper.get("detailed_analysis", {})
    if analysis:
        st.divider()
        st.markdown("#### 🔬 Detailed Analysis" if not lang else "#### 🔬 详细分析")
        st.markdown(analysis.get("summary", ""))
        st.divider()
        # Two-column strengths / limitations layout.
        col_a, col_b = st.columns(2)
        with col_a:
            pros = analysis.get("pros", [])
            pros_html = "".join(f'<div class="point">{p}</div>' for p in pros)
            label = "✓ Strengths" if not lang else "✓ 优势"
            st.markdown(
                f'<div class="pros-box"><div class="section-label">{label}</div>{pros_html}</div>',
                unsafe_allow_html=True,
            )
        with col_b:
            cons = analysis.get("cons", [])
            cons_html = "".join(f'<div class="point">{c}</div>' for c in cons)
            label = "✗ Limitations" if not lang else "✗ 不足"
            st.markdown(
                f'<div class="cons-box"><div class="section-label">{label}</div>{cons_html}</div>',
                unsafe_allow_html=True,
            )
| # --------------------------------------------------------------------------- | |
| # Render paper card | |
| # --------------------------------------------------------------------------- | |
def render_card(paper: dict, rank: int, tab_key: str = ""):
    """Render one paper card: thumbnail, clickable title, authors, topic tags.

    Args:
        paper: parsed paper row (see ``_parse_paper_row``).
        rank: position of the card in the current list; only used to build a
            unique widget key.
        tab_key: extra key prefix so the same paper can appear under several
            tabs without Streamlit key collisions.
    """
    pid = paper.get("paper_id", "")
    title = paper.get("title", "Untitled")
    authors = paper.get("authors", [])
    thumb_url = HF_THUMB.format(pid=pid)
    if authors:
        authors_str = ", ".join(authors)
    else:
        authors_str = "Unknown authors"
    with st.container(border=True):
        # Thumbnail
        st.image(thumb_url, width="stretch")
        # Title as clickable button
        # NOTE(review): use_container_width is deprecated in newer Streamlit
        # while st.image above already uses the new width="stretch" API —
        # confirm the targeted Streamlit version and align the two.
        if st.button(f"**{title}**", key=f"card-{tab_key}-{rank}", use_container_width=True):
            show_summary(paper)
        # Authors
        lang = st.session_state.get("lang_toggle", False)
        if lang:
            topics = paper.get("topics_zh", []) or paper.get("topics", [])
        else:
            topics = paper.get("topics", [])
        topic_spans = "".join(
            f'<span style="background:#eef1f5;padding:2px 8px;border-radius:10px;'
            f'font-size:11px;font-weight:600;color:#2563eb;white-space:nowrap;">{t}</span>'
            for t in topics
        )
        html = f"""
<div style="padding: 0 4px;">
<div class="paper-authors">{authors_str}</div>
</div>
<div class="card-topics">{topic_spans}</div>"""
        st.markdown(html, unsafe_allow_html=True)
| # --------------------------------------------------------------------------- | |
| # Shared rendering helpers | |
| # --------------------------------------------------------------------------- | |
| def _render_trending_content(trending: dict, trending_date_range: str, lang: bool, placeholder): | |
| """Render a trending summary dict into the given placeholder.""" | |
| if lang: | |
| summary_text = trending.get("trending_summary_zh", "") or trending.get("trending_summary", "") | |
| topics = trending.get("top_topics_zh", []) or trending.get("top_topics", []) | |
| keywords = trending.get("keywords_zh", []) or trending.get("keywords", []) | |
| else: | |
| summary_text = trending.get("trending_summary", "") | |
| topics = trending.get("top_topics", []) | |
| keywords = trending.get("keywords", []) | |
| topics_html = " ".join( | |
| f'<span style="background:#eef1f5;padding:2px 10px;border-radius:12px;' | |
| f'font-size:12px;font-weight:600;color:#2563eb;">{t}</span>' | |
| for t in topics | |
| ) | |
| keywords_html = " ".join( | |
| f'<span style="background:#fff8e1;padding:2px 10px;border-radius:12px;' | |
| f'font-size:11px;font-weight:500;color:#9a6700;border:1px solid #f0d060;">{k}</span>' | |
| for k in keywords | |
| ) | |
| date_range_label = ( | |
| f'<span style="font-size:12px;color:#9a6700;font-weight:600;">({trending_date_range})</span>' | |
| if trending_date_range | |
| else "" | |
| ) | |
| placeholder.markdown( | |
| f"""<div class="stats-bar"> | |
| <div style="flex:1;min-width:200px;"> | |
| <div style="font-size:13px;color:#656d76;margin-bottom:4px;"> | |
| {"🔥 趋势" if lang else "🔥 Trending"} {date_range_label} | |
| </div> | |
| <div style="font-size:13px;color:#424a53;line-height:1.5;">{summary_text}</div> | |
| <div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{topics_html}</div> | |
| <div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{keywords_html}</div> | |
| </div> | |
| </div>""", | |
| unsafe_allow_html=True, | |
| ) | |
def _render_trending(date_str: str, lang: bool, placeholder):
    """Load and render trending summary into the given placeholder."""
    cache_key = f"trending_{date_str}"
    cached = st.session_state.get(cache_key)
    if cached is not None:
        # Session-cached: (trending dict, date-range string).
        trending, trending_date_range = cached
    else:
        with st.spinner("Loading trending summary..."):
            trending, trending_date_range = get_cached_trending(date_str)
        # Only cache successful loads so a failed fetch is retried next rerun.
        if trending:
            st.session_state[cache_key] = (trending, trending_date_range)
    if trending:
        _render_trending_content(trending, trending_date_range, lang, placeholder)
| def _get_paper_topics(paper: dict, lang: bool) -> list[str]: | |
| """Get topic labels for a paper, respecting language preference.""" | |
| if lang: | |
| return paper.get("topics_zh", []) or paper.get("topics", []) | |
| return paper.get("topics", []) | |
def _render_papers_section(
    papers: list[dict],
    lang: bool,
    date_str: str,
    tab_key: str,
    clustered_topics: list[str] | None = None,
    topic_mapping: dict[str, list[str]] | None = None,
    trending_data: tuple[dict, str] | None = None,
):
    """Render trending, topic filters, and paper grid for a list of papers.

    Args:
        papers: Paper dicts; sorted IN PLACE by upvotes (descending), so the
            caller's list is mutated.
        lang: True for Chinese labels (with English fallback), False for English.
        date_str: Date used to lazily load the trending summary when
            ``trending_data`` is not supplied.
        tab_key: Unique suffix for Streamlit widget keys so several tabs can
            render this section without key collisions.
        clustered_topics: Optional pre-clustered labels for the filter pills;
            when omitted, topics are collected from the papers themselves.
        topic_mapping: Optional map from a clustered label to the raw topic
            labels it covers; used to expand pill selections while filtering.
        trending_data: Optional pre-loaded ``(trending_dict, date_range)``.
    """
    if not papers:
        st.error("No papers retrieved. Please check back later.")
        return
    papers.sort(key=lambda p: p.get("upvotes", 0), reverse=True)
    # Reserve the trending slot now; it is filled LAST (bottom of this
    # function) so the paper grid appears before the trending fetch finishes.
    trending_placeholder = st.empty()
    if clustered_topics:
        all_topics = clustered_topics
    else:
        # Collect unique topics in first-seen order across all papers.
        all_topics = []
        seen_topics: set[str] = set()
        for p in papers:
            for t in _get_paper_topics(p, lang):
                if t not in seen_topics:
                    seen_topics.add(t)
                    all_topics.append(t)
    selected_topics: list[str] = []
    if all_topics:
        # Multi-select pills; empty selection means "show everything".
        selected_topics = st.pills(
            "🏷️ Filter by topic" if not lang else "🏷️ 按主题筛选",
            options=all_topics,
            selection_mode="multi",
            default=None,
            key=f"topic_filter_{tab_key}",
        )
    if selected_topics:
        if topic_mapping:
            # Expand each clustered label into the raw labels it represents
            # (falling back to the label itself when unmapped).
            match_set: set[str] = set()
            for sel in selected_topics:
                match_set.update(topic_mapping.get(sel, [sel]))
        else:
            match_set = set(selected_topics)
        display_papers = [
            p for p in papers
            if any(t in match_set for t in _get_paper_topics(p, lang))
        ]
    else:
        display_papers = papers
    # Card grid: 3 columns per row, row-major order preserving the upvote sort.
    NUM_COLS = 3
    for row_start in range(0, len(display_papers), NUM_COLS):
        cols = st.columns(NUM_COLS, gap="medium")
        for col_idx, col in enumerate(cols):
            paper_idx = row_start + col_idx
            if paper_idx >= len(display_papers):
                break
            with col:
                render_card(display_papers[paper_idx], rank=paper_idx + 1, tab_key=tab_key)
    # Fill the placeholder reserved above: pre-loaded data wins, otherwise
    # fetch (with spinner + session caching) for date_str.
    if trending_data:
        _render_trending_content(trending_data[0], trending_data[1], lang, trending_placeholder)
    else:
        _render_trending(date_str, lang, trending_placeholder)
| # --------------------------------------------------------------------------- | |
| # Main content | |
| # --------------------------------------------------------------------------- | |
# NOTE(review): yesterday_str is not referenced in this section — confirm it
# is used elsewhere in the file before removing.
yesterday_str = (datetime.now(timezone.utc) - timedelta(days=1)).strftime("%Y-%m-%d")
# --- Header ---
today = datetime.now(timezone.utc).date()  # today's date in UTC (a datetime.date)
# Header row: [brand, per-mode selector slot, mode select, language toggle, spacer].
# hdr[1] is intentionally left empty here; each mode branch fills it with its
# own date/month/snapshot selector.
hdr = st.columns([1, 2, 1.2, 0.8, 4.5], vertical_alignment="center")
with hdr[0]:
    st.markdown("**☕️ Paper Espresso**")
with hdr[2]:
    active_tab = st.selectbox(
        "Mode", ["Daily", "Monthly", "Lifecycle"],
        label_visibility="collapsed", key="mode_select",
    )
with hdr[3]:
    # True → Chinese UI strings and *_zh data fields throughout the page.
    is_chinese = st.toggle("中文", key="lang_toggle")
lang = is_chinese
# ---- Daily ----
if active_tab == "Daily":
    with hdr[1]:
        # Dates that have a dataset split available, newest first.
        available_dates = sorted(
            [_split_to_date(s) for s in _list_dataset_splits()],
            reverse=True,
        )
        selected_date = st.date_input(
            "Select date",
            value=(
                datetime.strptime(available_dates[0], "%Y-%m-%d").date()
                if available_dates
                # BUG FIX: `today` is already a datetime.date, so the previous
                # `(today - timedelta(days=1)).date()` raised AttributeError
                # whenever no dataset splits were available.
                else today - timedelta(days=1)
            ),
            format="YYYY-MM-DD",
            label_visibility="collapsed",
            key="daily_date",
        )
    selected_date_str = selected_date.strftime("%Y-%m-%d")
    papers: list[dict] = []
    _papers_cache_key = f"papers_daily_{selected_date_str}"
    if _papers_cache_key not in st.session_state:
        with st.spinner("Loading papers..."):
            # Primary source: the HF dataset; fall back to local JSON files.
            hf_data = pull_from_hf_dataset(target_date=selected_date_str)
            if hf_data:
                papers = hf_data[selected_date_str]
            if not papers:
                json_files = find_json_files()
                if selected_date_str in json_files:
                    papers = load_papers(json_files[selected_date_str])
        # Cache even an empty result so we don't refetch on every rerun.
        st.session_state[_papers_cache_key] = papers
    else:
        papers = st.session_state[_papers_cache_key]
    if papers:
        st.toast(f"**{len(papers)}** papers found for {selected_date_str}", icon="📰")
    else:
        st.toast(f"No papers found for {selected_date_str}", icon="⚠️")
    _render_papers_section(papers, lang, selected_date_str, "daily")
# ---- Monthly tab ----
elif active_tab == "Monthly":
    # Discover available month splits once per session; split names look like
    # "month_YYYY_MM" and are shown as "YYYY-MM", newest first.
    _monthly_splits_key = "monthly_available_splits"
    if _monthly_splits_key not in st.session_state:
        trending_files = _list_repo_files_cached(HF_MONTHLY_TRENDING_REPO)
        st.session_state[_monthly_splits_key] = sorted(
            [s.replace("month_", "").replace("_", "-")
             for s in _extract_splits(trending_files, prefix="month_")],
            reverse=True,
        )
    month_options = st.session_state[_monthly_splits_key]
    if not month_options:
        st.info("No monthly data available yet. Run `uv run python src/monthly_retrieve.py` to generate.")
    else:
        with hdr[1]:
            selected_month = st.selectbox(
                "Select month", options=month_options,
                label_visibility="collapsed", key="monthly_select",
            )
        # Expand "YYYY-MM" into every date string of that month.
        year, month_num = int(selected_month[:4]), int(selected_month[5:7])
        first_day = datetime(year, month_num, 1, tzinfo=timezone.utc).date()
        last_day = _last_day_of_month(year, month_num)
        month_dates = tuple(
            (first_day + timedelta(days=i)).strftime("%Y-%m-%d")
            for i in range((last_day - first_day).days + 1)
        )
        # --- Load trending from HF (pre-generated by monthly_retrieve.py) ---
        _mt_cache_key = f"monthly_trending_{selected_month}"
        monthly_trending = None
        if _mt_cache_key in st.session_state:
            monthly_trending = st.session_state[_mt_cache_key]
        else:
            monthly_trending = pull_monthly_trending_from_hf(selected_month)
            # Only cache successful loads so failures retry on the next rerun.
            if monthly_trending:
                st.session_state[_mt_cache_key] = monthly_trending
        # --- Load papers ---
        _monthly_cache_key = f"papers_monthly_{selected_month}"
        if _monthly_cache_key not in st.session_state:
            with st.spinner(f"Loading papers for {selected_month}..."):
                st.session_state[_monthly_cache_key] = load_papers_for_dates(month_dates)
        monthly_papers = st.session_state[_monthly_cache_key]
        if not monthly_papers:
            st.warning(f"No papers found for {selected_month}")
        else:
            # --- Statistics + histogram ---
            from collections import Counter
            total_papers = len(monthly_papers)
            st.metric("Papers", f"{total_papers:,}")
            # Papers per day; date comes from "_date" or published_at's YYYY-MM-DD prefix.
            date_counts = Counter()
            for p in monthly_papers:
                d = p.get("_date", "") or p.get("published_at", "")[:10]
                if d:
                    date_counts[d] += 1
            if date_counts:
                import pandas as pd
                import altair as alt
                # One bar per day of the month (zero-filled), labelled
                # "DD (Weekday)" and colored by weekday.
                all_days = [
                    (first_day + timedelta(days=i)).strftime("%Y-%m-%d")
                    for i in range((last_day - first_day).days + 1)
                ]
                weekdays = ["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
                labels = [
                    f"{d[-2:]} ({weekdays[datetime.strptime(d, '%Y-%m-%d').weekday()]})"
                    for d in all_days
                ]
                df = pd.DataFrame({
                    "date": all_days,
                    "label": labels,
                    "papers": [date_counts.get(d, 0) for d in all_days],
                })
                weekday_names = [weekdays[datetime.strptime(d, "%Y-%m-%d").weekday()] for d in all_days]
                df["weekday"] = weekday_names
                weekday_colors = alt.Scale(
                    domain=weekdays,
                    range=["#2563eb", "#7c3aed", "#0891b2", "#059669", "#d97706", "#e11d48", "#dc2626"],
                )
                chart = alt.Chart(df).mark_bar(cornerRadiusTopLeft=3, cornerRadiusTopRight=3).encode(
                    x=alt.X("label:N", sort=None, axis=alt.Axis(title=None, labelAngle=-45, labelFontSize=9)),
                    y=alt.Y("papers:Q", axis=alt.Axis(title=None, labels=False, ticks=False)),
                    color=alt.Color("weekday:N", scale=weekday_colors, legend=None),
                    tooltip=["date:N", "papers:Q"],
                ).properties(height=180).configure_bar(
                    discreteBandSize=12,
                )
                st.altair_chart(chart, use_container_width=True)
            # --- Trending insights ---
            # topics / topic_mapping default empty so the filter section below
            # still works when no pre-generated trending exists.
            topics = []
            topic_mapping = {}
            if monthly_trending:
                if lang:
                    # Chinese fields with English fallback.
                    summary_text = monthly_trending.get("trending_summary_zh", "") or monthly_trending.get("trending_summary", "")
                    topics = monthly_trending.get("top_topics_zh", []) or monthly_trending.get("top_topics", [])
                    keywords = monthly_trending.get("keywords_zh", []) or monthly_trending.get("keywords", [])
                    topic_mapping = monthly_trending.get("topic_mapping_zh", {}) or monthly_trending.get("topic_mapping", {})
                else:
                    summary_text = monthly_trending.get("trending_summary", "")
                    topics = monthly_trending.get("top_topics", [])
                    keywords = monthly_trending.get("keywords", [])
                    topic_mapping = monthly_trending.get("topic_mapping", {})
                st.markdown(
                    f"""<div class="stats-bar"><div style="flex:1;min-width:200px;">
<div style="font-size:14px;font-weight:600;color:#656d76;margin-bottom:6px;">
{"🔥 月度趋势" if lang else "🔥 Monthly Insights"}</div>
<div style="font-size:13px;color:#424a53;line-height:1.6;">{summary_text}</div>
</div></div>""",
                    unsafe_allow_html=True,
                )
                if keywords:
                    # Amber keyword pills under the summary.
                    kw_html = " ".join(
                        f'<span style="background:#fff8e1;padding:3px 10px;border-radius:12px;'
                        f'font-size:12px;font-weight:500;color:#9a6700;border:1px solid #f0d060;">{k}</span>'
                        for k in keywords
                    )
                    st.markdown(
                        f'<div style="display:flex;gap:6px;flex-wrap:wrap;margin:8px 0 16px 0;">{kw_html}</div>',
                        unsafe_allow_html=True,
                    )
            # --- Topic Co-occurrence Heatmap ---
            # One topic-list per paper (papers without topics are dropped).
            all_paper_topics = [_get_paper_topics(p, lang) for p in monthly_papers]
            all_paper_topics = [ts for ts in all_paper_topics if ts]
            if all_paper_topics:
                import pandas as pd
                import altair as alt
                from matplotlib.colors import Normalize, LinearSegmentedColormap
                topic_freq = Counter()
                for ts in all_paper_topics:
                    topic_freq.update(ts)
                # Restrict the heatmap to the 40 most frequent topics.
                top_n = 40
                top_cooc_topics = [t for t, _ in topic_freq.most_common(top_n)]
                top_set = set(top_cooc_topics)
                n = len(top_cooc_topics)
                topic_idx = {t: i for i, t in enumerate(top_cooc_topics)}
                # Pairwise co-occurrence counts over unordered topic pairs.
                cooc_counts = Counter()
                for ts in all_paper_topics:
                    filtered = sorted(set(t for t in ts if t in top_set))
                    for i in range(len(filtered)):
                        for j in range(i + 1, len(filtered)):
                            cooc_counts[(filtered[i], filtered[j])] += 1
                if cooc_counts:
                    import numpy as np
                    # Build symmetric co-occurrence matrix
                    matrix = np.zeros((n, n), dtype=int)
                    for (t1, t2), count in cooc_counts.items():
                        i, j = topic_idx[t1], topic_idx[t2]
                        matrix[i, j] = count
                        matrix[j, i] = count
                    # Per-topic paper counts for Jaccard
                    topic_paper_count = Counter()
                    for ts in all_paper_topics:
                        for t in set(ts):
                            if t in topic_idx:
                                topic_paper_count[t] += 1
                    # Jaccard matrix: |A∩B| / |A∪B| per topic pair.
                    jaccard = np.zeros((n, n))
                    for i in range(n):
                        for j in range(n):
                            if i == j:
                                continue
                            intersection = matrix[i, j]
                            union = topic_paper_count[top_cooc_topics[i]] + topic_paper_count[top_cooc_topics[j]] - intersection
                            jaccard[i, j] = intersection / union if union > 0 else 0
                    # Pre-compute hex colors for the split heatmap
                    # (counts in red below the diagonal, Jaccard in blue above).
                    cmap_count = LinearSegmentedColormap.from_list("gray_red", ["#d0d0d0", "#e04040"])
                    cmap_jaccard = LinearSegmentedColormap.from_list("gray_blue", ["#d0d0d0", "#4080e0"])
                    off_diag = matrix[~np.eye(n, dtype=bool)]
                    vmax_count = int(off_diag.max()) if off_diag.size > 0 else 1
                    vmax_jaccard = float(jaccard[~np.eye(n, dtype=bool)].max()) or 1.0
                    norm_count = Normalize(vmin=0, vmax=vmax_count)
                    norm_jaccard = Normalize(vmin=0, vmax=vmax_jaccard)
                    def _rgba_to_hex(rgba):
                        # Convert a matplotlib RGBA tuple (0..1 floats) to "#rrggbb".
                        r, g, b = (int(c * 255) for c in rgba[:3])
                        return f"#{r:02x}{g:02x}{b:02x}"
                    cooc_rows = []
                    for ri, t_row in enumerate(top_cooc_topics):
                        for ci, t_col in enumerate(top_cooc_topics):
                            if ri == ci:
                                cooc_rows.append({"topic_a": t_col, "topic_b": t_row,
                                                  "count": 0, "jaccard": 0.0,
                                                  "metric": "—", "color": "#f6f8fa"})
                            elif ri > ci:  # lower triangle: counts
                                val = int(matrix[ri, ci])
                                cooc_rows.append({"topic_a": t_col, "topic_b": t_row,
                                                  "count": val, "jaccard": 0.0,
                                                  "metric": "count",
                                                  "color": _rgba_to_hex(cmap_count(norm_count(val)))})
                            else:  # upper triangle: jaccard
                                jval = float(jaccard[ri, ci])
                                cooc_rows.append({"topic_a": t_col, "topic_b": t_row,
                                                  "count": 0, "jaccard": round(jval, 4),
                                                  "metric": "jaccard",
                                                  "color": _rgba_to_hex(cmap_jaccard(norm_jaccard(jval)))})
                    cooc_df = pd.DataFrame(cooc_rows)
                    # Colors are pre-baked per cell, so the Altair color scale is disabled.
                    heatmap = alt.Chart(cooc_df).mark_rect(cornerRadius=2).encode(
                        x=alt.X("topic_a:N", sort=top_cooc_topics, title=None,
                                axis=alt.Axis(labelAngle=-45, labelFontSize=9, labelOverlap=False)),
                        y=alt.Y("topic_b:N", sort=top_cooc_topics, title=None,
                                axis=alt.Axis(labelFontSize=9, labelOverlap=False)),
                        color=alt.Color("color:N", scale=None),
                        tooltip=[
                            alt.Tooltip("topic_a:N", title="Topic X"),
                            alt.Tooltip("topic_b:N", title="Topic Y"),
                            alt.Tooltip("count:Q", title="Co-occurrence"),
                            alt.Tooltip("jaccard:Q", title="Jaccard", format=".3f"),
                        ],
                    ).properties(
                        width=alt.Step(25),
                        height=alt.Step(25),
                    )
                    with st.expander(
                        "🔗 " + ("主题共现图" if lang else "Topic Co-occurrence"),
                        expanded=False,
                    ):
                        st.altair_chart(heatmap, use_container_width=False)
            # --- Topic filter ---
            # If trending supplied no clustered topics, collect unique topics
            # from the papers in first-seen order.
            if not topics:
                seen: set[str] = set()
                for p in monthly_papers:
                    for t in _get_paper_topics(p, lang):
                        if t not in seen:
                            seen.add(t)
                            topics.append(t)
            selected_topics: list[str] = []
            if topics:
                selected_topics = st.pills(
                    "🏷️ Filter by topic" if not lang else "🏷️ 按主题筛选",
                    options=topics, selection_mode="multi", default=None,
                    key="topic_filter_monthly",
                )
            # --- Filter papers ---
            if selected_topics:
                if topic_mapping:
                    # Expand clustered labels to the raw labels they cover.
                    match_set: set[str] = set()
                    for sel in selected_topics:
                        match_set.update(topic_mapping.get(sel, [sel]))
                else:
                    match_set = set(selected_topics)
                display_papers = [
                    p for p in monthly_papers
                    if any(t in match_set for t in _get_paper_topics(p, lang))
                ]
            else:
                display_papers = monthly_papers
            display_papers.sort(key=lambda p: p.get("upvotes", 0), reverse=True)
            if selected_topics:
                st.caption(f"Showing {len(display_papers)} of {total_papers} papers")
            # --- Paper card grid (3 columns) ---
            # NOTE: parallels the grid in _render_papers_section.
            NUM_COLS = 3
            for row_start in range(0, len(display_papers), NUM_COLS):
                cols = st.columns(NUM_COLS, gap="medium")
                for col_idx, col in enumerate(cols):
                    paper_idx = row_start + col_idx
                    if paper_idx >= len(display_papers):
                        break
                    with col:
                        render_card(display_papers[paper_idx], rank=paper_idx + 1, tab_key="monthly")
# ---- Lifecycle tab ----
elif active_tab == "Lifecycle":
    # Discover available snapshots once per session; split names look like
    # "snapshot_YYYY_MM_DD" and are shown as "YYYY-MM-DD", newest first.
    _lc_splits_key = "lifecycle_available_snapshots"
    if _lc_splits_key not in st.session_state:
        lc_files = _list_repo_files_cached(HF_LIFECYCLE_REPO)
        st.session_state[_lc_splits_key] = sorted(
            [s.replace("snapshot_", "").replace("_", "-")
             for s in _extract_splits(lc_files, prefix="snapshot_")],
            reverse=True,
        )
    snapshot_options = st.session_state[_lc_splits_key]
    if not snapshot_options:
        st.info("No lifecycle data available yet. Run `uv run python src/lifecycle_retrieve.py --all` to generate.")
    else:
        with hdr[1]:
            selected_snapshot = st.selectbox(
                "Select snapshot", options=snapshot_options,
                label_visibility="collapsed", key="lifecycle_select",
            )
        # Load the snapshot, caching only successful loads so failures retry.
        _lc_cache_key = f"lifecycle_{selected_snapshot}"
        lc_raw = None
        if _lc_cache_key in st.session_state:
            lc_raw = st.session_state[_lc_cache_key]
        else:
            lc_raw = pull_lifecycle_from_hf(selected_snapshot)
            if lc_raw:
                st.session_state[_lc_cache_key] = lc_raw
        if not lc_raw:
            st.warning(f"Could not load lifecycle data for {selected_snapshot}")
        else:
            # Per-topic lifecycle dicts keyed by topic; language-specific variant.
            lc_data = lc_raw["lifecycle_data_zh"] if lang else lc_raw["lifecycle_data"]
            sorted_months = lc_raw["sorted_months"]
            st.metric("Papers", f"{lc_raw['n_papers']:,}")
            if sorted_months:
                st.caption(
                    f"{lc_raw['n_months']} months ({sorted_months[0]} → {sorted_months[-1]})"
                )
            if not lc_data:
                st.warning("Not enough data for lifecycle analysis.")
            else:
                # Hype-cycle figure (matplotlib); close after rendering to free memory.
                fig = _render_hype_cycle(lc_data, lang)
                if fig:
                    import matplotlib.pyplot as plt
                    st.pyplot(fig, use_container_width=True)
                    plt.close(fig)
                # --- Topic selector & time-series chart ---
                # topics_by_month: {month: {topic: count}}; total_by_month: {month: count}
                # — presumably; verify against lifecycle_retrieve.py.
                tbm = lc_raw.get("topics_by_month_zh" if lang else "topics_by_month") or {}
                tbt = lc_raw.get("total_by_month_zh" if lang else "total_by_month") or {}
                if tbm and tbt:
                    # Topics ordered by descending total paper count.
                    all_topic_names = sorted(
                        lc_data.keys(),
                        key=lambda t: -lc_data[t]["total_count"],
                    )
                    selected_topics = st.multiselect(
                        "📊 " + ("选择主题(最多5个)" if lang else "Select topics (max 5)"),
                        options=all_topic_names,
                        default=all_topic_names[:3],
                        max_selections=5,
                        key="lifecycle_topic_select",
                    )
                    if selected_topics:
                        import pandas as pd
                        import altair as alt
                        # Build long-format rows for absolute counts and
                        # month-relative proportions per selected topic.
                        count_rows = []
                        prop_rows = []
                        for m in sorted_months:
                            month_topics = tbm.get(m, {})
                            month_total = tbt.get(m, 0)
                            for t in selected_topics:
                                c = month_topics.get(t, 0)
                                count_rows.append({"Month": m, "Topic": t, "Count": c})
                                prop_rows.append({
                                    "Month": m, "Topic": t,
                                    "Proportion": round(c / month_total, 4) if month_total > 0 else 0,
                                })
                        df_count = pd.DataFrame(count_rows)
                        df_prop = pd.DataFrame(prop_rows)
                        def _alt_line(df, y_field, y_title):
                            # Multi-line chart with hover-nearest points and a
                            # dashed rule at the hovered month.
                            nearest = alt.selection_point(
                                nearest=True, on="pointerover",
                                fields=["Month"], empty=False,
                            )
                            line = alt.Chart(df).mark_line(
                                interpolate="monotone", strokeWidth=2,
                            ).encode(
                                x=alt.X("Month:N", sort=sorted_months, title=None,
                                        axis=alt.Axis(labelAngle=-45, labelFontSize=8)),
                                y=alt.Y(f"{y_field}:Q", title=y_title,
                                        axis=alt.Axis(titleFontSize=10)),
                                color=alt.Color("Topic:N", legend=alt.Legend(
                                    orient="top", title=None, labelFontSize=9)),
                            )
                            points = line.mark_point(size=40).encode(
                                opacity=alt.condition(nearest, alt.value(1), alt.value(0)),
                                tooltip=[
                                    alt.Tooltip("Month:N"),
                                    alt.Tooltip("Topic:N"),
                                    alt.Tooltip(f"{y_field}:Q", title=y_title,
                                                format=".4f" if y_field == "Proportion" else "d"),
                                ],
                            ).add_params(nearest)
                            rule = alt.Chart(df).mark_rule(color="gray", strokeDash=[4, 4]).encode(
                                x="Month:N",
                            ).transform_filter(nearest)
                            return (line + points + rule).properties(height=260)
                        # Counts on the left, proportions on the right.
                        col_a, col_b = st.columns(2)
                        with col_a:
                            chart_c = _alt_line(df_count, "Count",
                                                "论文数量" if lang else "Paper Count")
                            st.altair_chart(chart_c, use_container_width=True)
                        with col_b:
                            chart_p = _alt_line(df_prop, "Proportion",
                                                "占比" if lang else "Proportion")
                            st.altair_chart(chart_p, use_container_width=True)
                # Gartner-style phase labels (Chinese translations) and icons.
                _phase_labels_zh = {
                    "Innovation Trigger": "技术萌芽期",
                    "Peak of Inflated Expectations": "期望膨胀期",
                    "Trough of Disillusionment": "泡沫破裂期",
                    "Slope of Enlightenment": "稳步爬升期",
                    "Plateau of Productivity": "生产成熟期",
                }
                phase_icons = {
                    "Innovation Trigger": "🌱",
                    "Peak of Inflated Expectations": "🔥",
                    "Trough of Disillusionment": "📉",
                    "Slope of Enlightenment": "📈",
                    "Plateau of Productivity": "⚙️",
                }
                # One expander per phase listing its topics by descending count.
                for phase in _PHASES_ORDER:
                    topics_in_phase = sorted(
                        [lc for lc in lc_data.values() if lc["phase"] == phase],
                        key=lambda x: -x["total_count"],
                    )
                    if not topics_in_phase:
                        continue
                    icon = phase_icons[phase]
                    label = _phase_labels_zh[phase] if lang else phase
                    unit = "个主题" if lang else "topics"
                    with st.expander(f"{icon} {label} ({len(topics_in_phase)} {unit})"):
                        for lc in topics_in_phase:
                            st.markdown(
                                f"**{lc['topic']}** — {lc['total_count']} papers, "
                                f"peak: {lc['peak_month']}, trend: {lc['slope']:+.4f}"
                            )