Spaces:

Elfsong
/

Daily_Paper_Reader

Restarting

elfsong committed 4 days ago

Commit

c65838f

1 Parent(s): ba9bdfb

Enhance Streamlit app with new features and UI improvements

- Added padding to the block container for better layout.
- Introduced a new Hugging Face trending repository for enhanced data integration.
- Refactored data handling in push_to_hf_dataset and pull_from_hf_dataset functions for improved readability.
- Updated the fetch_daily_papers function to streamline paper retrieval.
- Added a new TRENDING_SYSTEM_PROMPT for identifying key research trends in papers.

Files changed (1) hide show

src/streamlit_app.py +334 -54

src/streamlit_app.py CHANGED Viewed

@@ -28,6 +28,7 @@ st.markdown(
 /* ---------- global ---------- */
 [data-testid="stAppViewContainer"] { background: #f6f8fa; }
 [data-testid="stHeader"] { background: #f6f8fa; }
 h1, h2, h3, h4 { color: #1f2328 !important; }
 p, li, span, label { color: #424a53; }
@@ -228,10 +229,12 @@ div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] > div > div[d
 # ---------------------------------------------------------------------------
 DATA_DIR = Path(__file__).resolve().parent.parent / "data"
 HF_DATASET_REPO = "Elfsong/hf_paper_summary"
 def _get_hf_token() -> str | None:
     import os
     token = os.getenv("HF_TOKEN", "")
     if token:
         return token
@@ -256,27 +259,34 @@ def _split_to_date(split_name: str) -> str:
 def push_to_hf_dataset(papers: list[dict], date_str: str):
     """Push papers list to HuggingFace dataset as a date split."""
     from datasets import Dataset
     token = _get_hf_token()
     if not token:
         return
     rows = []
     for p in papers:
-        rows.append({
-            "title": p.get("title", ""),
-            "paper_id": p.get("paper_id", ""),
-            "hf_url": p.get("hf_url", ""),
-            "arxiv_url": p.get("arxiv_url", ""),
-            "pdf_url": p.get("pdf_url", ""),
-            "authors": p.get("authors", []),
-            "summary": p.get("summary", ""),
-            "upvotes": p.get("upvotes", 0),
-            "published_at": p.get("published_at", ""),
-            "concise_summary": p.get("concise_summary", ""),
-            "concise_summary_zh": p.get("concise_summary_zh", ""),
-            "detailed_analysis": json.dumps(p.get("detailed_analysis", {}), ensure_ascii=False),
-            "detailed_analysis_zh": json.dumps(p.get("detailed_analysis_zh", {}), ensure_ascii=False),
-        })
     ds = Dataset.from_list(rows)
     split_name = _date_to_split(date_str)
@@ -286,6 +296,7 @@ def push_to_hf_dataset(papers: list[dict], date_str: str):
 def _list_dataset_splits() -> list[str]:
     """List available date splits from the HF dataset repo without loading data."""
     from huggingface_hub import HfApi
     token = _get_hf_token()
     api = HfApi(token=token)
     try:
@@ -307,6 +318,7 @@ def pull_from_hf_dataset(target_date: str | None = None) -> dict[str, list[dict]
     """Load a date split from HF dataset. If target_date is None, load the latest.
     Returns {date_str: papers_list}."""
     from datasets import load_dataset
     token = _get_hf_token()
     splits = _list_dataset_splits()
@@ -331,7 +343,9 @@ def pull_from_hf_dataset(target_date: str | None = None) -> dict[str, list[dict]
     for row in ds:
         paper = dict(row)
         paper["detailed_analysis"] = json.loads(paper.get("detailed_analysis", "{}"))
-        paper["detailed_analysis_zh"] = json.loads(paper.get("detailed_analysis_zh", "{}"))
         papers.append(paper)
     return {date_str: papers}
@@ -373,6 +387,7 @@ def load_papers(source) -> list[dict]:
 SSL_CTX = ssl.create_default_context()
 try:
     import certifi
     SSL_CTX.load_verify_locations(certifi.where())
 except ImportError:
     SSL_CTX.check_hostname = False
@@ -401,6 +416,29 @@ with the same structure: "summary", "pros", "cons".
 Reply with ONLY valid JSON — no markdown fences, no extra text."""
 def fetch_daily_papers(date_str: str) -> list[dict]:
     url = f"{HF_API_URL}?date={date_str}"
@@ -416,23 +454,26 @@ def fetch_daily_papers(date_str: str) -> list[dict]:
         paper = item.get("paper", {})
         paper_id = paper.get("id", "")
         authors = [a.get("name", "") for a in paper.get("authors", [])]
-        papers.append({
-            "title": paper.get("title", ""),
-            "paper_id": paper_id,
-            "hf_url": f"https://huggingface.co/papers/{paper_id}",
-            "arxiv_url": f"https://arxiv.org/abs/{paper_id}",
-            "pdf_url": f"https://arxiv.org/pdf/{paper_id}",
-            "authors": authors,
-            "summary": paper.get("summary", ""),
-            "upvotes": paper.get("upvotes", 0),
-            "published_at": paper.get("publishedAt", ""),
-        })
     papers.sort(key=lambda x: x["upvotes"], reverse=True)
     return papers
 def _get_gemini_key() -> str:
     import os
     api_key = os.getenv("GEMINI_API_KEY", "")
     if api_key:
         return api_key
@@ -441,11 +482,14 @@ def _get_gemini_key() -> str:
         for line in env_path.read_text().splitlines():
             if line.startswith("GEMINI_API_KEY="):
                 return line.split("=", 1)[1].strip()
-    raise RuntimeError("GEMINI_API_KEY not found. Set it as a HF Space secret or in .env")
 def summarize_paper_gemini(title: str, abstract: str) -> dict:
     from google import genai
     api_key = _get_gemini_key()
     client = genai.Client(api_key=api_key)
     resp = client.models.generate_content(
@@ -514,6 +558,189 @@ def crawl_and_summarize(date_str: str) -> Path:
     return output_path
 # ---------------------------------------------------------------------------
 # Summary dialog
 # ---------------------------------------------------------------------------
@@ -539,7 +766,9 @@ def show_summary(paper: dict):
     # TL;DR
     if lang:
-        concise = paper.get("concise_summary_zh", "") or paper.get("concise_summary", "")
     else:
         concise = paper.get("concise_summary", "")
     if concise:
@@ -548,7 +777,9 @@ def show_summary(paper: dict):
     # Detailed Analysis
     if lang:
-        analysis = paper.get("detailed_analysis_zh", {}) or paper.get("detailed_analysis", {})
     else:
         analysis = paper.get("detailed_analysis", {})
     if analysis:
@@ -619,7 +850,11 @@ with col_date:
     available_dates = list_available_dates()
     selected_date = st.date_input(
         "Select date",
-        value=datetime.strptime(available_dates[0], "%Y-%m-%d").date() if available_dates else (datetime.now(timezone.utc) - timedelta(days=1)).date(),
         format="YYYY-MM-DD",
         label_visibility="collapsed",
     )
@@ -630,38 +865,35 @@ with col_lang:
 latest_date = selected_date_str
-# Try HF dataset for selected date
-hf_data = pull_from_hf_dataset(target_date=selected_date_str)
-if hf_data:
-    papers = hf_data[selected_date_str]
-# Fall back to local files
-if not papers:
-    json_files = find_json_files()
-    if selected_date_str in json_files:
-        papers = load_papers(json_files[selected_date_str])
-# Auto-fetch if no data for selected date
-if not papers:
-    st.toast(f"No cached data for {selected_date_str}. Fetching and summarizing...", icon="🔄")
-    result_path = crawl_and_summarize(selected_date_str)
-    if result_path:
-        papers = load_papers(result_path)
 if not papers:
-    st.info("No papers found. Please check back later.")
     st.stop()
 papers.sort(key=lambda p: p.get("upvotes", 0), reverse=True)
 date_label = latest_date
-st.markdown(
-    f"""<div class="stats-bar">
-    <div class="stat-item"><span class="stat-value">{date_label}</span></div>
-    <div class="stat-item"><span class="stat-value">{len(papers)}</span> Papers</div>
-</div>""",
-    unsafe_allow_html=True,
-)
 # --- Render paper grid (3 columns) ---
 NUM_COLS = 3
@@ -673,3 +905,51 @@ for row_start in range(0, len(papers), NUM_COLS):
             break
         with col:
             render_card(papers[paper_idx], rank=paper_idx + 1)

 /* ---------- global ---------- */
 [data-testid="stAppViewContainer"] { background: #f6f8fa; }
 [data-testid="stHeader"] { background: #f6f8fa; }
+.block-container { padding-top: 1rem !important; }
 h1, h2, h3, h4 { color: #1f2328 !important; }
 p, li, span, label { color: #424a53; }
 # ---------------------------------------------------------------------------
 DATA_DIR = Path(__file__).resolve().parent.parent / "data"
 HF_DATASET_REPO = "Elfsong/hf_paper_summary"
+HF_TRENDING_REPO = "Elfsong/hf_paper_trending"
 def _get_hf_token() -> str | None:
     import os
     token = os.getenv("HF_TOKEN", "")
     if token:
         return token
 def push_to_hf_dataset(papers: list[dict], date_str: str):
     """Push papers list to HuggingFace dataset as a date split."""
     from datasets import Dataset
     token = _get_hf_token()
     if not token:
         return
     rows = []
     for p in papers:
+        rows.append(
+            {
+                "title": p.get("title", ""),
+                "paper_id": p.get("paper_id", ""),
+                "hf_url": p.get("hf_url", ""),
+                "arxiv_url": p.get("arxiv_url", ""),
+                "pdf_url": p.get("pdf_url", ""),
+                "authors": p.get("authors", []),
+                "summary": p.get("summary", ""),
+                "upvotes": p.get("upvotes", 0),
+                "published_at": p.get("published_at", ""),
+                "concise_summary": p.get("concise_summary", ""),
+                "concise_summary_zh": p.get("concise_summary_zh", ""),
+                "detailed_analysis": json.dumps(
+                    p.get("detailed_analysis", {}), ensure_ascii=False
+                ),
+                "detailed_analysis_zh": json.dumps(
+                    p.get("detailed_analysis_zh", {}), ensure_ascii=False
+                ),
+            }
+        )
     ds = Dataset.from_list(rows)
     split_name = _date_to_split(date_str)
 def _list_dataset_splits() -> list[str]:
     """List available date splits from the HF dataset repo without loading data."""
     from huggingface_hub import HfApi
     token = _get_hf_token()
     api = HfApi(token=token)
     try:
     """Load a date split from HF dataset. If target_date is None, load the latest.
     Returns {date_str: papers_list}."""
     from datasets import load_dataset
     token = _get_hf_token()
     splits = _list_dataset_splits()
     for row in ds:
         paper = dict(row)
         paper["detailed_analysis"] = json.loads(paper.get("detailed_analysis", "{}"))
+        paper["detailed_analysis_zh"] = json.loads(
+            paper.get("detailed_analysis_zh", "{}")
+        )
         papers.append(paper)
     return {date_str: papers}
 SSL_CTX = ssl.create_default_context()
 try:
     import certifi
     SSL_CTX.load_verify_locations(certifi.where())
 except ImportError:
     SSL_CTX.check_hostname = False
 Reply with ONLY valid JSON — no markdown fences, no extra text."""
+# System instruction for the trending call: generate_trending_summary() passes
+# this to Gemini as `system_instruction`. The six JSON keys requested here are
+# the ones push_trending_to_hf() stores and the trending panel reads back.
+TRENDING_SYSTEM_PROMPT = """\
+You are a senior AI researcher. Given a collection of top papers from the last several days, \
+identify the key research trends and produce a JSON object with exactly six keys:
+1. "trending_summary": A 2-3 sentence English summary of the dominant research trends \
+and themes across these papers. Focus on emerging patterns, hot topics, and notable shifts.
+2. "trending_summary_zh": The same trending summary translated into Chinese (简体中文).
+3. "top_topics": A list of 3-5 short topic labels (e.g. "Multimodal LLMs", "Efficient Fine-tuning") \
+representing the most prominent themes, in English.
+4. "top_topics_zh": The same topic labels translated into Chinese (简体中文).
+5. "keywords": A list of 5-10 specific technical keywords or terms that appear frequently \
+or are central to the papers (e.g. "LoRA", "RLHF", "diffusion", "chain-of-thought", "MoE", \
+"RAG", "MLLM", "DPO"). Use the canonical technical term, not a paraphrase.
+6. "keywords_zh": The same technical keywords translated into Chinese where applicable \
+(keep English acronyms as-is, e.g. "LoRA", "RLHF", "扩散模型", "思维链").
+Reply with ONLY valid JSON — no markdown fences, no extra text."""
 def fetch_daily_papers(date_str: str) -> list[dict]:
     url = f"{HF_API_URL}?date={date_str}"
         paper = item.get("paper", {})
         paper_id = paper.get("id", "")
         authors = [a.get("name", "") for a in paper.get("authors", [])]
+        papers.append(
+            {
+                "title": paper.get("title", ""),
+                "paper_id": paper_id,
+                "hf_url": f"https://huggingface.co/papers/{paper_id}",
+                "arxiv_url": f"https://arxiv.org/abs/{paper_id}",
+                "pdf_url": f"https://arxiv.org/pdf/{paper_id}",
+                "authors": authors,
+                "summary": paper.get("summary", ""),
+                "upvotes": paper.get("upvotes", 0),
+                "published_at": paper.get("publishedAt", ""),
+            }
+        )
     papers.sort(key=lambda x: x["upvotes"], reverse=True)
     return papers
 def _get_gemini_key() -> str:
     import os
     api_key = os.getenv("GEMINI_API_KEY", "")
     if api_key:
         return api_key
         for line in env_path.read_text().splitlines():
             if line.startswith("GEMINI_API_KEY="):
                 return line.split("=", 1)[1].strip()
+    raise RuntimeError(
+        "GEMINI_API_KEY not found. Set it as a HF Space secret or in .env"
+    )
 def summarize_paper_gemini(title: str, abstract: str) -> dict:
     from google import genai
     api_key = _get_gemini_key()
     client = genai.Client(api_key=api_key)
     resp = client.models.generate_content(
     return output_path
+# ---------------------------------------------------------------------------
+# Trending summary
+# ---------------------------------------------------------------------------
+def _load_recent_papers(n_days: int = 5) -> tuple[list[dict], str, str]:
+    """Load papers from the first ``n_days`` dataset splits.
+
+    Takes the first ``n_days`` entries returned by ``_list_dataset_splits()``
+    (presumably newest-first -- confirm against that helper) and loads each
+    one from ``HF_DATASET_REPO``. Every returned paper carries an extra
+    ``"_date"`` key naming the split date it came from.
+
+    Args:
+        n_days: Maximum number of splits to load. Values <= 0 load nothing.
+
+    Returns:
+        ``(papers, earliest_date, latest_date)``. ``papers`` is sorted by
+        ``upvotes`` descending; both dates are ``""`` when no split loaded.
+    """
+    from datasets import load_dataset
+
+    # A negative count would turn the slice below into "all but the last N".
+    if n_days <= 0:
+        return [], "", ""
+    token = _get_hf_token()
+    all_papers = []
+    loaded_dates = []
+    for split in _list_dataset_splits()[:n_days]:
+        try:
+            ds = load_dataset(HF_DATASET_REPO, split=split, token=token)
+            date = _split_to_date(split)
+            # Materialise the whole split first so a failure midway through
+            # cannot leave a partial set of its rows (or its date) behind.
+            rows = [{**dict(row), "_date": date} for row in ds]
+        except Exception:
+            # Best effort: one unreadable split must not block the others.
+            continue
+        loaded_dates.append(date)
+        all_papers.extend(rows)
+    # `or 0` keeps the sort working when a row stores upvotes as None.
+    all_papers.sort(key=lambda p: p.get("upvotes") or 0, reverse=True)
+    earliest = min(loaded_dates) if loaded_dates else ""
+    latest = max(loaded_dates) if loaded_dates else ""
+    return all_papers, earliest, latest
+def _format_paper_for_trending(paper: dict) -> str:
+    """Render one paper as the plain-text entry sent to the trending model."""
+    date = paper.get("_date", "")
+    title = paper.get("title", "")
+    # Trailing `or ""` keeps the final join safe when both fields are None.
+    summary = paper.get("concise_summary") or paper.get("summary") or ""
+    upvotes = paper.get("upvotes", 0)
+    parts = [f"[{date}] (upvotes: {upvotes}) {title}", summary]
+    analysis = paper.get("detailed_analysis", {})
+    if isinstance(analysis, str):
+        # Rows taken straight from the dataset still hold this as JSON text.
+        try:
+            analysis = json.loads(analysis)
+        except ValueError:
+            analysis = {}
+    if not isinstance(analysis, dict):
+        # JSON null, a list, etc. carry nothing usable here.
+        analysis = {}
+    if analysis.get("summary"):
+        parts.append(f"Analysis: {analysis['summary']}")
+    for label, key in (("Strengths", "pros"), ("Limitations", "cons")):
+        items = analysis.get(key) or []
+        if isinstance(items, str):
+            # A bare string would otherwise be joined character by character.
+            items = [items]
+        if items:
+            parts.append(f"{label}: " + "; ".join(str(item) for item in items))
+    return "\n".join(parts)
+def generate_trending_summary(papers: list[dict]) -> dict:
+    """Call Gemini to produce a trending summary from recent papers.
+
+    Each paper contributes its date, upvotes, title, summary and, when
+    present, the summary/pros/cons of its ``detailed_analysis``. The model is
+    instructed by ``TRENDING_SYSTEM_PROMPT`` and asked for a JSON reply.
+
+    Args:
+        papers: Paper dicts; ``detailed_analysis`` may be a dict or JSON text.
+
+    Returns:
+        The decoded JSON object returned by the model.
+
+    Raises:
+        ValueError: If the model returns no text, invalid JSON, or a JSON
+            value that is not an object.
+    """
+    from google import genai
+
+    api_key = _get_gemini_key()
+    client = genai.Client(api_key=api_key)
+    content = "\n\n".join(_format_paper_for_trending(p) for p in papers)
+    resp = client.models.generate_content(
+        model="gemini-2.5-flash",
+        contents=content,
+        config=genai.types.GenerateContentConfig(
+            system_instruction=TRENDING_SYSTEM_PROMPT,
+            temperature=0.3,
+            max_output_tokens=4096 * 6,
+            response_mime_type="application/json",
+        ),
+    )
+    # NOTE(review): resp.text may be None when the reply has no text part
+    # (e.g. a blocked response); fail with a clear message instead of a
+    # TypeError from json.loads.
+    if not resp.text:
+        raise ValueError("Gemini returned an empty trending response.")
+    result = json.loads(resp.text)
+    if not isinstance(result, dict):
+        raise ValueError("Gemini trending response is not a JSON object.")
+    return result
+def push_trending_to_hf(trending: dict, date_str: str):
+    """Upload one trending summary to ``HF_TRENDING_REPO`` as a date split.
+
+    Returns without doing anything when no Hugging Face token is available.
+    The topic and keyword lists are stored as JSON strings; ``date_str`` is
+    recorded as ``generated_date`` and also determines the split name.
+    """
+    from datasets import Dataset
+
+    hf_token = _get_hf_token()
+    if not hf_token:
+        return
+
+    def _as_json(key: str) -> str:
+        # List-valued fields are serialised so the column is a plain string.
+        return json.dumps(trending.get(key, []), ensure_ascii=False)
+
+    record = {
+        "trending_summary": trending.get("trending_summary", ""),
+        "trending_summary_zh": trending.get("trending_summary_zh", ""),
+        "top_topics": _as_json("top_topics"),
+        "top_topics_zh": _as_json("top_topics_zh"),
+        "keywords": _as_json("keywords"),
+        "keywords_zh": _as_json("keywords_zh"),
+        "date_range": trending.get("date_range", ""),
+        "generated_date": date_str,
+    }
+    Dataset.from_list([record]).push_to_hub(
+        HF_TRENDING_REPO, split=_date_to_split(date_str), token=hf_token
+    )
+def _decode_json_list(raw) -> list:
+    """Decode a JSON-encoded list column, falling back to ``[]``."""
+    if isinstance(raw, list):
+        return raw
+    try:
+        value = json.loads(raw)
+    except (TypeError, ValueError):
+        # None / non-text column, or text that is not valid JSON.
+        return []
+    return value if isinstance(value, list) else []
+def pull_trending_from_hf(target_date: str | None = None) -> dict | None:
+    """Load a cached trending summary from ``HF_TRENDING_REPO``.
+
+    Split names are recovered from the repo's file names: the first
+    ``-``-separated part starting with ``date_`` once ``.parquet`` /
+    ``.arrow`` is removed.
+
+    Args:
+        target_date: Date whose split should be loaded. When falsy, the split
+            with the greatest name is loaded instead.
+
+    Returns:
+        The first row of the split with ``top_topics``, ``top_topics_zh``,
+        ``keywords`` and ``keywords_zh`` decoded to lists, or ``None`` when
+        the repo cannot be listed, the split is missing, empty or fails to
+        load.
+    """
+    from huggingface_hub import HfApi
+    from datasets import load_dataset
+
+    token = _get_hf_token()
+    api = HfApi(token=token)
+    try:
+        files = api.list_repo_files(HF_TRENDING_REPO, repo_type="dataset")
+    except Exception:
+        return None
+    splits = set()
+    for f in files:
+        stem = f.split("/")[-1].replace(".parquet", "").replace(".arrow", "")
+        split = next((p for p in stem.split("-") if p.startswith("date_")), None)
+        if split is not None:
+            splits.add(split)
+    if not splits:
+        return None
+    if target_date:
+        split_to_load = _date_to_split(target_date)
+        if split_to_load not in splits:
+            return None
+    else:
+        # Same pick as sorting descending and taking the first element.
+        split_to_load = max(splits)
+    try:
+        ds = load_dataset(HF_TRENDING_REPO, split=split_to_load, token=token)
+    except Exception:
+        return None
+    # An empty split would make ds[0] raise IndexError.
+    if len(ds) == 0:
+        return None
+    row = dict(ds[0])
+    for key in ("top_topics", "top_topics_zh", "keywords", "keywords_zh"):
+        row[key] = _decode_json_list(row.get(key))
+    return row
+def get_or_generate_trending(date_str: str, status=None) -> tuple[dict | None, str]:
+    """Return the trending summary for ``date_str``, generating it on a miss.
+
+    A cached summary from the trending repo is returned as-is. Otherwise the
+    recent papers are loaded, summarised with Gemini, and the result is pushed
+    back to the repo; a failed push is reported but does not discard the
+    freshly generated summary.
+
+    Args:
+        date_str: Date used for the cache lookup and for the pushed split.
+        status: Optional object exposing ``info`` / ``warning`` / ``error``
+            used for progress messages; ignored when falsy.
+
+    Returns:
+        ``(trending_dict, date_range_str)``, or ``(None, "")`` when there are
+        no recent papers or generation fails.
+    """
+
+    def _report(level: str, message: str) -> None:
+        # Progress reporting is optional; skip it when no sink was given.
+        if status:
+            getattr(status, level)(message)
+
+    _report("info", "Checking cached trending summary...")
+    cached = pull_trending_from_hf(target_date=date_str)
+    if cached:
+        return cached, cached.get("date_range", "")
+    # Cache miss: build a fresh summary from the latest splits.
+    _report("info", "Loading recent papers for trending analysis...")
+    recent_papers, earliest, latest = _load_recent_papers(n_days=5)
+    if not recent_papers:
+        _report("warning", "No recent papers available for trending analysis.")
+        return None, ""
+    date_range = ""
+    if earliest and latest:
+        date_range = f"{earliest} ~ {latest}"
+    try:
+        _report("info", "Generating trending summary with Gemini...")
+        trending = generate_trending_summary(recent_papers)
+        trending["date_range"] = date_range
+    except Exception as e:
+        _report("error", f"Trending generation failed: {e}")
+        return None, ""
+    try:
+        _report("info", "Saving trending summary to HuggingFace...")
+        push_trending_to_hf(trending, date_str)
+    except Exception as e:
+        _report("warning", f"HF push failed: {e}")
+    return trending, date_range
 # ---------------------------------------------------------------------------
 # Summary dialog
 # ---------------------------------------------------------------------------
     # TL;DR
     if lang:
+        concise = paper.get("concise_summary_zh", "") or paper.get(
+            "concise_summary", ""
+        )
     else:
         concise = paper.get("concise_summary", "")
     if concise:
     # Detailed Analysis
     if lang:
+        analysis = paper.get("detailed_analysis_zh", {}) or paper.get(
+            "detailed_analysis", {}
+        )
     else:
         analysis = paper.get("detailed_analysis", {})
     if analysis:
     available_dates = list_available_dates()
     selected_date = st.date_input(
         "Select date",
+        value=(
+            datetime.strptime(available_dates[0], "%Y-%m-%d").date()
+            if available_dates
+            else (datetime.now(timezone.utc) - timedelta(days=1)).date()
+        ),
         format="YYYY-MM-DD",
         label_visibility="collapsed",
     )
 latest_date = selected_date_str
+with st.spinner("Loading papers..."):
+    hf_data = pull_from_hf_dataset(target_date=selected_date_str)
+    if hf_data:
+        papers = hf_data[selected_date_str]
+    if not papers:
+        json_files = find_json_files()
+        if selected_date_str in json_files:
+            papers = load_papers(json_files[selected_date_str])
+    if not papers:
+        result_path = crawl_and_summarize(selected_date_str)
+        if result_path:
+            papers = load_papers(result_path)
 if not papers:
+    st.error("No papers found. Please check back later.")
     st.stop()
 papers.sort(key=lambda p: p.get("upvotes", 0), reverse=True)
 date_label = latest_date
+lang = st.session_state.get("global_lang_toggle", False)
+# --- Trending status (spinner under title, filled later) ---
+trending_spinner = st.empty()
+# --- Trending summary placeholder (filled after papers render) ---
+trending_placeholder = st.empty()
 # --- Render paper grid (3 columns) ---
 NUM_COLS = 3
             break
         with col:
             render_card(papers[paper_idx], rank=paper_idx + 1)
+# --- Trending summary (loaded after papers are displayed) ---
+# The summary, topics and keywords are model-generated text. They are escaped
+# before being embedded because the markup below is rendered with
+# unsafe_allow_html=True.
+from html import escape as _html_escape
+with trending_spinner.container():
+    with st.spinner("Loading trending summary..."):
+        trending, trending_date_range = get_or_generate_trending(
+            selected_date_str, status=None
+        )
+trending_spinner.empty()
+if trending:
+    # `or` fallbacks: prefer the Chinese fields when the toggle is on, and
+    # never iterate over None if a field is missing or null.
+    if lang:
+        summary_text = trending.get("trending_summary_zh", "") or trending.get(
+            "trending_summary", ""
+        )
+        topics = trending.get("top_topics_zh", []) or trending.get("top_topics", [])
+        keywords = trending.get("keywords_zh", []) or trending.get("keywords", [])
+    else:
+        summary_text = trending.get("trending_summary", "")
+        topics = trending.get("top_topics", [])
+        keywords = trending.get("keywords", [])
+    summary_text = _html_escape(str(summary_text or ""))
+    topics_html = " ".join(
+        f'<span style="background:#eef1f5;padding:2px 10px;border-radius:12px;'
+        f'font-size:12px;font-weight:600;color:#2563eb;">{_html_escape(str(t))}</span>'
+        for t in (topics or [])
+    )
+    keywords_html = " ".join(
+        f'<span style="background:#fff8e1;padding:2px 10px;border-radius:12px;'
+        f'font-size:11px;font-weight:500;color:#9a6700;border:1px solid #f0d060;">{_html_escape(str(k))}</span>'
+        for k in (keywords or [])
+    )
+    date_range_label = (
+        f'<span style="font-size:12px;color:#9a6700;font-weight:600;">({_html_escape(str(trending_date_range))})</span>'
+        if trending_date_range
+        else ""
+    )
+    trending_placeholder.markdown(
+        f"""<div class="stats-bar">
+        <div style="flex:1;min-width:200px;">
+            <div style="font-size:13px;color:#656d76;margin-bottom:4px;">
+                {"🔥 趋势" if lang else "🔥 Trending"} {date_range_label}
+            </div>
+            <div style="font-size:13px;color:#424a53;line-height:1.5;">{summary_text}</div>
+            <div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{topics_html}</div>
+            <div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{keywords_html}</div>
+        </div>
+    </div>""",
+        unsafe_allow_html=True,
+    )