Spaces:
Restarting
Restarting
elfsong committed on
Commit ·
c65838f
1
Parent(s): ba9bdfb
Enhance Streamlit app with new features and UI improvements
Browse files
- Added padding to the block container for better layout.
- Introduced a new Hugging Face trending repository for enhanced data integration.
- Refactored data handling in push_to_hf_dataset and pull_from_hf_dataset functions for improved readability.
- Updated the fetch_daily_papers function to streamline paper retrieval.
- Added a new TRENDING_SYSTEM_PROMPT for identifying key research trends in papers.
- src/streamlit_app.py +334 -54
src/streamlit_app.py
CHANGED
|
@@ -28,6 +28,7 @@ st.markdown(
|
|
| 28 |
/* ---------- global ---------- */
|
| 29 |
[data-testid="stAppViewContainer"] { background: #f6f8fa; }
|
| 30 |
[data-testid="stHeader"] { background: #f6f8fa; }
|
|
|
|
| 31 |
|
| 32 |
h1, h2, h3, h4 { color: #1f2328 !important; }
|
| 33 |
p, li, span, label { color: #424a53; }
|
|
@@ -228,10 +229,12 @@ div[data-testid="stHorizontalBlock"] > div[data-testid="stColumn"] > div > div[d
|
|
| 228 |
# ---------------------------------------------------------------------------
|
| 229 |
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
| 230 |
HF_DATASET_REPO = "Elfsong/hf_paper_summary"
|
|
|
|
| 231 |
|
| 232 |
|
| 233 |
def _get_hf_token() -> str | None:
|
| 234 |
import os
|
|
|
|
| 235 |
token = os.getenv("HF_TOKEN", "")
|
| 236 |
if token:
|
| 237 |
return token
|
|
@@ -256,27 +259,34 @@ def _split_to_date(split_name: str) -> str:
|
|
| 256 |
def push_to_hf_dataset(papers: list[dict], date_str: str):
|
| 257 |
"""Push papers list to HuggingFace dataset as a date split."""
|
| 258 |
from datasets import Dataset
|
|
|
|
| 259 |
token = _get_hf_token()
|
| 260 |
if not token:
|
| 261 |
return
|
| 262 |
|
| 263 |
rows = []
|
| 264 |
for p in papers:
|
| 265 |
-
rows.append(
|
| 266 |
-
|
| 267 |
-
|
| 268 |
-
|
| 269 |
-
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
|
| 278 |
-
|
| 279 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 280 |
|
| 281 |
ds = Dataset.from_list(rows)
|
| 282 |
split_name = _date_to_split(date_str)
|
|
@@ -286,6 +296,7 @@ def push_to_hf_dataset(papers: list[dict], date_str: str):
|
|
| 286 |
def _list_dataset_splits() -> list[str]:
|
| 287 |
"""List available date splits from the HF dataset repo without loading data."""
|
| 288 |
from huggingface_hub import HfApi
|
|
|
|
| 289 |
token = _get_hf_token()
|
| 290 |
api = HfApi(token=token)
|
| 291 |
try:
|
|
@@ -307,6 +318,7 @@ def pull_from_hf_dataset(target_date: str | None = None) -> dict[str, list[dict]
|
|
| 307 |
"""Load a date split from HF dataset. If target_date is None, load the latest.
|
| 308 |
Returns {date_str: papers_list}."""
|
| 309 |
from datasets import load_dataset
|
|
|
|
| 310 |
token = _get_hf_token()
|
| 311 |
|
| 312 |
splits = _list_dataset_splits()
|
|
@@ -331,7 +343,9 @@ def pull_from_hf_dataset(target_date: str | None = None) -> dict[str, list[dict]
|
|
| 331 |
for row in ds:
|
| 332 |
paper = dict(row)
|
| 333 |
paper["detailed_analysis"] = json.loads(paper.get("detailed_analysis", "{}"))
|
| 334 |
-
paper["detailed_analysis_zh"] = json.loads(
|
|
|
|
|
|
|
| 335 |
papers.append(paper)
|
| 336 |
return {date_str: papers}
|
| 337 |
|
|
@@ -373,6 +387,7 @@ def load_papers(source) -> list[dict]:
|
|
| 373 |
SSL_CTX = ssl.create_default_context()
|
| 374 |
try:
|
| 375 |
import certifi
|
|
|
|
| 376 |
SSL_CTX.load_verify_locations(certifi.where())
|
| 377 |
except ImportError:
|
| 378 |
SSL_CTX.check_hostname = False
|
|
@@ -401,6 +416,29 @@ with the same structure: "summary", "pros", "cons".
|
|
| 401 |
|
| 402 |
Reply with ONLY valid JSON — no markdown fences, no extra text."""
|
| 403 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 404 |
|
| 405 |
def fetch_daily_papers(date_str: str) -> list[dict]:
|
| 406 |
url = f"{HF_API_URL}?date={date_str}"
|
|
@@ -416,23 +454,26 @@ def fetch_daily_papers(date_str: str) -> list[dict]:
|
|
| 416 |
paper = item.get("paper", {})
|
| 417 |
paper_id = paper.get("id", "")
|
| 418 |
authors = [a.get("name", "") for a in paper.get("authors", [])]
|
| 419 |
-
papers.append(
|
| 420 |
-
|
| 421 |
-
|
| 422 |
-
|
| 423 |
-
|
| 424 |
-
|
| 425 |
-
|
| 426 |
-
|
| 427 |
-
|
| 428 |
-
|
| 429 |
-
|
|
|
|
|
|
|
| 430 |
papers.sort(key=lambda x: x["upvotes"], reverse=True)
|
| 431 |
return papers
|
| 432 |
|
| 433 |
|
| 434 |
def _get_gemini_key() -> str:
|
| 435 |
import os
|
|
|
|
| 436 |
api_key = os.getenv("GEMINI_API_KEY", "")
|
| 437 |
if api_key:
|
| 438 |
return api_key
|
|
@@ -441,11 +482,14 @@ def _get_gemini_key() -> str:
|
|
| 441 |
for line in env_path.read_text().splitlines():
|
| 442 |
if line.startswith("GEMINI_API_KEY="):
|
| 443 |
return line.split("=", 1)[1].strip()
|
| 444 |
-
raise RuntimeError(
|
|
|
|
|
|
|
| 445 |
|
| 446 |
|
| 447 |
def summarize_paper_gemini(title: str, abstract: str) -> dict:
|
| 448 |
from google import genai
|
|
|
|
| 449 |
api_key = _get_gemini_key()
|
| 450 |
client = genai.Client(api_key=api_key)
|
| 451 |
resp = client.models.generate_content(
|
|
@@ -514,6 +558,189 @@ def crawl_and_summarize(date_str: str) -> Path:
|
|
| 514 |
return output_path
|
| 515 |
|
| 516 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 517 |
# ---------------------------------------------------------------------------
|
| 518 |
# Summary dialog
|
| 519 |
# ---------------------------------------------------------------------------
|
|
@@ -539,7 +766,9 @@ def show_summary(paper: dict):
|
|
| 539 |
|
| 540 |
# TL;DR
|
| 541 |
if lang:
|
| 542 |
-
concise = paper.get("concise_summary_zh", "") or paper.get(
|
|
|
|
|
|
|
| 543 |
else:
|
| 544 |
concise = paper.get("concise_summary", "")
|
| 545 |
if concise:
|
|
@@ -548,7 +777,9 @@ def show_summary(paper: dict):
|
|
| 548 |
|
| 549 |
# Detailed Analysis
|
| 550 |
if lang:
|
| 551 |
-
analysis = paper.get("detailed_analysis_zh", {}) or paper.get(
|
|
|
|
|
|
|
| 552 |
else:
|
| 553 |
analysis = paper.get("detailed_analysis", {})
|
| 554 |
if analysis:
|
|
@@ -619,7 +850,11 @@ with col_date:
|
|
| 619 |
available_dates = list_available_dates()
|
| 620 |
selected_date = st.date_input(
|
| 621 |
"Select date",
|
| 622 |
-
value=
|
|
|
|
|
|
|
|
|
|
|
|
|
| 623 |
format="YYYY-MM-DD",
|
| 624 |
label_visibility="collapsed",
|
| 625 |
)
|
|
@@ -630,38 +865,35 @@ with col_lang:
|
|
| 630 |
|
| 631 |
latest_date = selected_date_str
|
| 632 |
|
| 633 |
-
|
| 634 |
-
hf_data = pull_from_hf_dataset(target_date=selected_date_str)
|
| 635 |
-
if hf_data:
|
| 636 |
-
|
| 637 |
|
| 638 |
-
|
| 639 |
-
|
| 640 |
-
|
| 641 |
-
|
| 642 |
-
papers = load_papers(json_files[selected_date_str])
|
| 643 |
|
| 644 |
-
|
| 645 |
-
|
| 646 |
-
|
| 647 |
-
|
| 648 |
-
if result_path:
|
| 649 |
-
papers = load_papers(result_path)
|
| 650 |
|
| 651 |
if not papers:
|
| 652 |
-
st.
|
| 653 |
st.stop()
|
| 654 |
|
| 655 |
papers.sort(key=lambda p: p.get("upvotes", 0), reverse=True)
|
| 656 |
|
| 657 |
date_label = latest_date
|
| 658 |
-
st.
|
| 659 |
-
|
| 660 |
-
|
| 661 |
-
|
| 662 |
-
|
| 663 |
-
|
| 664 |
-
)
|
| 665 |
|
| 666 |
# --- Render paper grid (3 columns) ---
|
| 667 |
NUM_COLS = 3
|
|
@@ -673,3 +905,51 @@ for row_start in range(0, len(papers), NUM_COLS):
|
|
| 673 |
break
|
| 674 |
with col:
|
| 675 |
render_card(papers[paper_idx], rank=paper_idx + 1)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 28 |
/* ---------- global ---------- */
|
| 29 |
[data-testid="stAppViewContainer"] { background: #f6f8fa; }
|
| 30 |
[data-testid="stHeader"] { background: #f6f8fa; }
|
| 31 |
+
.block-container { padding-top: 1rem !important; }
|
| 32 |
|
| 33 |
h1, h2, h3, h4 { color: #1f2328 !important; }
|
| 34 |
p, li, span, label { color: #424a53; }
|
|
|
|
| 229 |
# ---------------------------------------------------------------------------
|
| 230 |
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
|
| 231 |
HF_DATASET_REPO = "Elfsong/hf_paper_summary"
|
| 232 |
+
HF_TRENDING_REPO = "Elfsong/hf_paper_trending"
|
| 233 |
|
| 234 |
|
| 235 |
def _get_hf_token() -> str | None:
|
| 236 |
import os
|
| 237 |
+
|
| 238 |
token = os.getenv("HF_TOKEN", "")
|
| 239 |
if token:
|
| 240 |
return token
|
|
|
|
| 259 |
def push_to_hf_dataset(papers: list[dict], date_str: str):
|
| 260 |
"""Push papers list to HuggingFace dataset as a date split."""
|
| 261 |
from datasets import Dataset
|
| 262 |
+
|
| 263 |
token = _get_hf_token()
|
| 264 |
if not token:
|
| 265 |
return
|
| 266 |
|
| 267 |
rows = []
|
| 268 |
for p in papers:
|
| 269 |
+
rows.append(
|
| 270 |
+
{
|
| 271 |
+
"title": p.get("title", ""),
|
| 272 |
+
"paper_id": p.get("paper_id", ""),
|
| 273 |
+
"hf_url": p.get("hf_url", ""),
|
| 274 |
+
"arxiv_url": p.get("arxiv_url", ""),
|
| 275 |
+
"pdf_url": p.get("pdf_url", ""),
|
| 276 |
+
"authors": p.get("authors", []),
|
| 277 |
+
"summary": p.get("summary", ""),
|
| 278 |
+
"upvotes": p.get("upvotes", 0),
|
| 279 |
+
"published_at": p.get("published_at", ""),
|
| 280 |
+
"concise_summary": p.get("concise_summary", ""),
|
| 281 |
+
"concise_summary_zh": p.get("concise_summary_zh", ""),
|
| 282 |
+
"detailed_analysis": json.dumps(
|
| 283 |
+
p.get("detailed_analysis", {}), ensure_ascii=False
|
| 284 |
+
),
|
| 285 |
+
"detailed_analysis_zh": json.dumps(
|
| 286 |
+
p.get("detailed_analysis_zh", {}), ensure_ascii=False
|
| 287 |
+
),
|
| 288 |
+
}
|
| 289 |
+
)
|
| 290 |
|
| 291 |
ds = Dataset.from_list(rows)
|
| 292 |
split_name = _date_to_split(date_str)
|
|
|
|
| 296 |
def _list_dataset_splits() -> list[str]:
|
| 297 |
"""List available date splits from the HF dataset repo without loading data."""
|
| 298 |
from huggingface_hub import HfApi
|
| 299 |
+
|
| 300 |
token = _get_hf_token()
|
| 301 |
api = HfApi(token=token)
|
| 302 |
try:
|
|
|
|
| 318 |
"""Load a date split from HF dataset. If target_date is None, load the latest.
|
| 319 |
Returns {date_str: papers_list}."""
|
| 320 |
from datasets import load_dataset
|
| 321 |
+
|
| 322 |
token = _get_hf_token()
|
| 323 |
|
| 324 |
splits = _list_dataset_splits()
|
|
|
|
| 343 |
for row in ds:
|
| 344 |
paper = dict(row)
|
| 345 |
paper["detailed_analysis"] = json.loads(paper.get("detailed_analysis", "{}"))
|
| 346 |
+
paper["detailed_analysis_zh"] = json.loads(
|
| 347 |
+
paper.get("detailed_analysis_zh", "{}")
|
| 348 |
+
)
|
| 349 |
papers.append(paper)
|
| 350 |
return {date_str: papers}
|
| 351 |
|
|
|
|
| 387 |
SSL_CTX = ssl.create_default_context()
|
| 388 |
try:
|
| 389 |
import certifi
|
| 390 |
+
|
| 391 |
SSL_CTX.load_verify_locations(certifi.where())
|
| 392 |
except ImportError:
|
| 393 |
SSL_CTX.check_hostname = False
|
|
|
|
| 416 |
|
| 417 |
Reply with ONLY valid JSON — no markdown fences, no extra text."""
|
| 418 |
|
| 419 |
+
TRENDING_SYSTEM_PROMPT = """\
|
| 420 |
+
You are a senior AI researcher. Given a collection of top papers from the last several days, \
|
| 421 |
+
identify the key research trends and produce a JSON object with exactly six keys:
|
| 422 |
+
|
| 423 |
+
1. "trending_summary": A 2-3 sentence English summary of the dominant research trends \
|
| 424 |
+
and themes across these papers. Focus on emerging patterns, hot topics, and notable shifts.
|
| 425 |
+
|
| 426 |
+
2. "trending_summary_zh": The same trending summary translated into Chinese (简体中文).
|
| 427 |
+
|
| 428 |
+
3. "top_topics": A list of 3-5 short topic labels (e.g. "Multimodal LLMs", "Efficient Fine-tuning") \
|
| 429 |
+
representing the most prominent themes, in English.
|
| 430 |
+
|
| 431 |
+
4. "top_topics_zh": The same topic labels translated into Chinese (简体中文).
|
| 432 |
+
|
| 433 |
+
5. "keywords": A list of 5-10 specific technical keywords or terms that appear frequently \
|
| 434 |
+
or are central to the papers (e.g. "LoRA", "RLHF", "diffusion", "chain-of-thought", "MoE", \
|
| 435 |
+
"RAG", "MLLM", "DPO"). Use the canonical technical term, not a paraphrase.
|
| 436 |
+
|
| 437 |
+
6. "keywords_zh": The same technical keywords translated into Chinese where applicable \
|
| 438 |
+
(keep English acronyms as-is, e.g. "LoRA", "RLHF", "扩散模型", "思维链").
|
| 439 |
+
|
| 440 |
+
Reply with ONLY valid JSON — no markdown fences, no extra text."""
|
| 441 |
+
|
| 442 |
|
| 443 |
def fetch_daily_papers(date_str: str) -> list[dict]:
|
| 444 |
url = f"{HF_API_URL}?date={date_str}"
|
|
|
|
| 454 |
paper = item.get("paper", {})
|
| 455 |
paper_id = paper.get("id", "")
|
| 456 |
authors = [a.get("name", "") for a in paper.get("authors", [])]
|
| 457 |
+
papers.append(
|
| 458 |
+
{
|
| 459 |
+
"title": paper.get("title", ""),
|
| 460 |
+
"paper_id": paper_id,
|
| 461 |
+
"hf_url": f"https://huggingface.co/papers/{paper_id}",
|
| 462 |
+
"arxiv_url": f"https://arxiv.org/abs/{paper_id}",
|
| 463 |
+
"pdf_url": f"https://arxiv.org/pdf/{paper_id}",
|
| 464 |
+
"authors": authors,
|
| 465 |
+
"summary": paper.get("summary", ""),
|
| 466 |
+
"upvotes": paper.get("upvotes", 0),
|
| 467 |
+
"published_at": paper.get("publishedAt", ""),
|
| 468 |
+
}
|
| 469 |
+
)
|
| 470 |
papers.sort(key=lambda x: x["upvotes"], reverse=True)
|
| 471 |
return papers
|
| 472 |
|
| 473 |
|
| 474 |
def _get_gemini_key() -> str:
|
| 475 |
import os
|
| 476 |
+
|
| 477 |
api_key = os.getenv("GEMINI_API_KEY", "")
|
| 478 |
if api_key:
|
| 479 |
return api_key
|
|
|
|
| 482 |
for line in env_path.read_text().splitlines():
|
| 483 |
if line.startswith("GEMINI_API_KEY="):
|
| 484 |
return line.split("=", 1)[1].strip()
|
| 485 |
+
raise RuntimeError(
|
| 486 |
+
"GEMINI_API_KEY not found. Set it as a HF Space secret or in .env"
|
| 487 |
+
)
|
| 488 |
|
| 489 |
|
| 490 |
def summarize_paper_gemini(title: str, abstract: str) -> dict:
|
| 491 |
from google import genai
|
| 492 |
+
|
| 493 |
api_key = _get_gemini_key()
|
| 494 |
client = genai.Client(api_key=api_key)
|
| 495 |
resp = client.models.generate_content(
|
|
|
|
| 558 |
return output_path
|
| 559 |
|
| 560 |
|
| 561 |
+
# ---------------------------------------------------------------------------
|
| 562 |
+
# Trending summary
|
| 563 |
+
# ---------------------------------------------------------------------------
|
| 564 |
+
def _load_recent_papers(n_days: int = 5) -> tuple[list[dict], str, str]:
    """Load the papers stored in the first ``n_days`` dataset splits.

    The split list comes from ``_list_dataset_splits()`` and only its first
    ``n_days`` entries are used.
    NOTE(review): this presumes that helper returns the newest split first;
    confirm the ordering there.

    Args:
        n_days: Maximum number of splits to load. Values <= 0 load nothing.

    Returns:
        ``(papers, earliest_date, latest_date)``. ``papers`` holds one dict
        per dataset row, each tagged with a ``"_date"`` key, sorted by
        ``"upvotes"`` in descending order. The two dates are the min/max of
        the dates whose split loaded completely, or empty strings when
        nothing could be loaded.
    """
    import logging

    from datasets import load_dataset

    token = _get_hf_token()
    # A negative slice bound would drop entries from the *end* of the list
    # instead of limiting the count, so clamp it at zero.
    splits = _list_dataset_splits()[: max(n_days, 0)]

    all_papers: list[dict] = []
    loaded_dates: list[str] = []
    for split in splits:
        try:
            ds = load_dataset(HF_DATASET_REPO, split=split, token=token)
            date = _split_to_date(split)
            split_papers = [{**dict(row), "_date": date} for row in ds]
        except Exception:
            # Best effort: one unreadable split must not hide the others, but
            # leave a trace instead of swallowing the failure silently.
            logging.getLogger(__name__).warning(
                "Skipping split %s: could not load it", split, exc_info=True
            )
            continue
        # Record the split only once it has been read completely, so a failure
        # halfway through never leaves partial rows or a misleading date.
        all_papers.extend(split_papers)
        loaded_dates.append(date)

    # "upvotes" may be present but null in a row; treat that as 0 so the sort
    # never compares None with an int.
    all_papers.sort(key=lambda p: p.get("upvotes") or 0, reverse=True)
    earliest = min(loaded_dates) if loaded_dates else ""
    latest = max(loaded_dates) if loaded_dates else ""
    return all_papers, earliest, latest
|
| 588 |
+
|
| 589 |
+
|
| 590 |
+
def _as_text_items(value) -> list[str]:
    """Normalize a pros/cons field to a list of non-empty strings."""
    if isinstance(value, str):
        # A bare string would otherwise be joined character by character.
        value = [value]
    elif not isinstance(value, (list, tuple)):
        return []
    return [str(item) for item in value if item]


def _format_paper_for_trending(paper: dict) -> str:
    """Render one paper as the text chunk sent to the model."""
    date = paper.get("_date", "")
    title = paper.get("title", "")
    summary = paper.get("concise_summary", "") or paper.get("summary", "")
    upvotes = paper.get("upvotes", 0)
    # The summary line is always present (possibly empty); a null value is
    # coerced to "" so the join below cannot fail.
    parts = [f"[{date}] (upvotes: {upvotes}) {title}", summary or ""]

    analysis = paper.get("detailed_analysis", {})
    if isinstance(analysis, str):
        # The analysis may arrive as a JSON-encoded string; decode it here.
        try:
            analysis = json.loads(analysis)
        except (ValueError, RecursionError):
            analysis = {}
    if not isinstance(analysis, dict):
        # Valid JSON that is not an object (list, number, null) has no fields.
        analysis = {}

    if analysis.get("summary"):
        parts.append(f"Analysis: {analysis['summary']}")
    pros = _as_text_items(analysis.get("pros"))
    if pros:
        parts.append("Strengths: " + "; ".join(pros))
    cons = _as_text_items(analysis.get("cons"))
    if cons:
        parts.append("Limitations: " + "; ".join(cons))
    return "\n".join(parts)


def generate_trending_summary(papers: list[dict]) -> dict:
    """Call Gemini to produce a trending summary from recent papers.

    Each paper contributes its date, upvotes, title, summary and, when
    available, the summary/pros/cons of its detailed analysis. The chunks are
    separated by blank lines and sent with ``TRENDING_SYSTEM_PROMPT`` as the
    system instruction.

    Args:
        papers: Paper dicts; missing keys fall back to empty values.

    Returns:
        The JSON object decoded from the model response.

    Raises:
        ValueError: If the response has no text, is not valid JSON, or does
            not decode to a JSON object.
    """
    from google import genai

    api_key = _get_gemini_key()
    client = genai.Client(api_key=api_key)

    # Build input: title + concise_summary + detailed analysis for each paper
    content = "\n\n".join(_format_paper_for_trending(p) for p in papers)

    resp = client.models.generate_content(
        model="gemini-2.5-flash",
        contents=content,
        config=genai.types.GenerateContentConfig(
            system_instruction=TRENDING_SYSTEM_PROMPT,
            temperature=0.3,
            max_output_tokens=4096 * 6,
            response_mime_type="application/json",
        ),
    )

    raw = resp.text
    if not raw:
        # Fail with a readable message instead of passing None to json.loads.
        raise ValueError("Gemini returned an empty response for the trending summary")
    result = json.loads(raw)
    if not isinstance(result, dict):
        raise ValueError("Gemini trending response is not a JSON object")
    return result
|
| 634 |
+
|
| 635 |
+
|
| 636 |
+
def push_trending_to_hf(trending: dict, date_str: str):
    """Upload a trending summary to ``HF_TRENDING_REPO`` as a one-row split.

    The split name is derived from ``date_str`` via ``_date_to_split``. List
    fields (topics and keywords) are stored as JSON strings. Nothing is pushed
    when no Hugging Face token is available.
    """
    from datasets import Dataset

    hf_token = _get_hf_token()
    if not hf_token:
        return

    def as_json(key: str) -> str:
        # List-valued fields are serialized so every column is a plain string.
        return json.dumps(trending.get(key, []), ensure_ascii=False)

    # Key order is kept as-is: it determines the column order of the dataset.
    record = {
        "trending_summary": trending.get("trending_summary", ""),
        "trending_summary_zh": trending.get("trending_summary_zh", ""),
        "top_topics": as_json("top_topics"),
        "top_topics_zh": as_json("top_topics_zh"),
        "keywords": as_json("keywords"),
        "keywords_zh": as_json("keywords_zh"),
        "date_range": trending.get("date_range", ""),
        "generated_date": date_str,
    }
    Dataset.from_list([record]).push_to_hub(
        HF_TRENDING_REPO, split=_date_to_split(date_str), token=hf_token
    )
|
| 658 |
+
|
| 659 |
+
|
| 660 |
+
def _decode_json_list(value) -> list:
    """Decode a JSON-encoded list column, falling back to ``[]`` on bad data."""
    if isinstance(value, list):
        return value
    if not isinstance(value, str):
        # Covers a missing column as well as a null cell.
        return []
    try:
        decoded = json.loads(value)
    except ValueError:
        return []
    return decoded if isinstance(decoded, list) else []


def pull_trending_from_hf(target_date: str | None = None) -> dict | None:
    """Load a cached trending summary from ``HF_TRENDING_REPO``.

    Available splits are discovered from the repo's file names: any
    dash-separated part of a file name that starts with ``date_`` is taken as
    a split name.

    Args:
        target_date: Date whose split should be loaded. When falsy, the split
            that sorts last by name is loaded.

    Returns:
        The first row of the split as a dict, with the topic/keyword columns
        decoded from JSON into lists, or ``None`` when the repo cannot be
        listed, the split does not exist, it cannot be loaded, or it is empty.
    """
    from huggingface_hub import HfApi
    from datasets import load_dataset

    token = _get_hf_token()
    api = HfApi(token=token)
    try:
        files = api.list_repo_files(HF_TRENDING_REPO, repo_type="dataset")
    except Exception:
        return None

    split_names = set()
    for f in files:
        stem = f.split("/")[-1].replace(".parquet", "").replace(".arrow", "")
        for part in stem.split("-"):
            if part.startswith("date_"):
                split_names.add(part)
                break
    if not split_names:
        return None

    if target_date:
        split_to_load = _date_to_split(target_date)
        if split_to_load not in split_names:
            return None
    else:
        # Same pick as sorting in reverse and taking the first element.
        split_to_load = max(split_names)

    try:
        ds = load_dataset(HF_TRENDING_REPO, split=split_to_load, token=token)
    except Exception:
        return None

    # An empty split would make ds[0] raise; honor the "dict or None" contract.
    if len(ds) == 0:
        return None

    row = dict(ds[0])
    for key in ("top_topics", "top_topics_zh", "keywords", "keywords_zh"):
        row[key] = _decode_json_list(row.get(key))
    return row
|
| 702 |
+
|
| 703 |
+
|
| 704 |
+
def get_or_generate_trending(date_str: str, status=None) -> tuple[dict | None, str]:
    """Return the trending summary for ``date_str``, generating it if needed.

    The cached summary is looked up first via ``pull_trending_from_hf``. On a
    miss, recent papers are loaded, summarized with Gemini and the result is
    pushed back to the Hub; a failed push is reported but does not discard the
    generated summary.

    NOTE(review): the fresh summary is built from
    ``_load_recent_papers(n_days=5)``, which is not given ``date_str`` —
    confirm this is the intended window when a historical date is requested.

    Args:
        date_str: Date the summary is cached under.
        status: Optional object exposing ``info``/``warning``/``error`` used
            for progress messages; skipped when falsy.

    Returns:
        ``(trending_dict, date_range_str)``, or ``(None, "")`` when no papers
        are available or generation fails.
    """
    if status:
        status.info("Checking cached trending summary...")
    cached = pull_trending_from_hf(target_date=date_str)
    if cached:
        return cached, cached.get("date_range", "")

    # Cache miss: build a fresh summary from the recent papers.
    if status:
        status.info("Loading recent papers for trending analysis...")
    recent_papers, first_day, last_day = _load_recent_papers(n_days=5)
    if not recent_papers:
        if status:
            status.warning("No recent papers available for trending analysis.")
        return None, ""

    span = ""
    if first_day and last_day:
        span = f"{first_day} ~ {last_day}"

    try:
        if status:
            status.info("Generating trending summary with Gemini...")
        summary = generate_trending_summary(recent_papers)
        summary["date_range"] = span
    except Exception as err:
        if status:
            status.error(f"Trending generation failed: {err}")
        return None, ""

    # Persisting is best effort; the summary is still returned if it fails.
    try:
        if status:
            status.info("Saving trending summary to HuggingFace...")
        push_trending_to_hf(summary, date_str)
    except Exception as err:
        if status:
            status.warning(f"HF push failed: {err}")

    return summary, span
|
| 742 |
+
|
| 743 |
+
|
| 744 |
# ---------------------------------------------------------------------------
|
| 745 |
# Summary dialog
|
| 746 |
# ---------------------------------------------------------------------------
|
|
|
|
| 766 |
|
| 767 |
# TL;DR
|
| 768 |
if lang:
|
| 769 |
+
concise = paper.get("concise_summary_zh", "") or paper.get(
|
| 770 |
+
"concise_summary", ""
|
| 771 |
+
)
|
| 772 |
else:
|
| 773 |
concise = paper.get("concise_summary", "")
|
| 774 |
if concise:
|
|
|
|
| 777 |
|
| 778 |
# Detailed Analysis
|
| 779 |
if lang:
|
| 780 |
+
analysis = paper.get("detailed_analysis_zh", {}) or paper.get(
|
| 781 |
+
"detailed_analysis", {}
|
| 782 |
+
)
|
| 783 |
else:
|
| 784 |
analysis = paper.get("detailed_analysis", {})
|
| 785 |
if analysis:
|
|
|
|
| 850 |
available_dates = list_available_dates()
|
| 851 |
selected_date = st.date_input(
|
| 852 |
"Select date",
|
| 853 |
+
value=(
|
| 854 |
+
datetime.strptime(available_dates[0], "%Y-%m-%d").date()
|
| 855 |
+
if available_dates
|
| 856 |
+
else (datetime.now(timezone.utc) - timedelta(days=1)).date()
|
| 857 |
+
),
|
| 858 |
format="YYYY-MM-DD",
|
| 859 |
label_visibility="collapsed",
|
| 860 |
)
|
|
|
|
| 865 |
|
| 866 |
latest_date = selected_date_str
|
| 867 |
|
| 868 |
+
with st.spinner("Loading papers..."):
|
| 869 |
+
hf_data = pull_from_hf_dataset(target_date=selected_date_str)
|
| 870 |
+
if hf_data:
|
| 871 |
+
papers = hf_data[selected_date_str]
|
| 872 |
|
| 873 |
+
if not papers:
|
| 874 |
+
json_files = find_json_files()
|
| 875 |
+
if selected_date_str in json_files:
|
| 876 |
+
papers = load_papers(json_files[selected_date_str])
|
|
|
|
| 877 |
|
| 878 |
+
if not papers:
|
| 879 |
+
result_path = crawl_and_summarize(selected_date_str)
|
| 880 |
+
if result_path:
|
| 881 |
+
papers = load_papers(result_path)
|
|
|
|
|
|
|
| 882 |
|
| 883 |
if not papers:
|
| 884 |
+
st.error("No papers found. Please check back later.")
|
| 885 |
st.stop()
|
| 886 |
|
| 887 |
papers.sort(key=lambda p: p.get("upvotes", 0), reverse=True)
|
| 888 |
|
| 889 |
date_label = latest_date
|
| 890 |
+
lang = st.session_state.get("global_lang_toggle", False)
|
| 891 |
+
|
| 892 |
+
# --- Trending status (spinner under title, filled later) ---
|
| 893 |
+
trending_spinner = st.empty()
|
| 894 |
+
|
| 895 |
+
# --- Trending summary placeholder (filled after papers render) ---
|
| 896 |
+
trending_placeholder = st.empty()
|
| 897 |
|
| 898 |
# --- Render paper grid (3 columns) ---
|
| 899 |
NUM_COLS = 3
|
|
|
|
| 905 |
break
|
| 906 |
with col:
|
| 907 |
render_card(papers[paper_idx], rank=paper_idx + 1)
|
| 908 |
+
|
| 909 |
+
# --- Trending summary (loaded after papers are displayed) ---
# Imported here because this section is the only place that needs it.
import html

with trending_spinner.container():
    with st.spinner("Loading trending summary..."):
        trending, trending_date_range = get_or_generate_trending(
            selected_date_str, status=None
        )
trending_spinner.empty()

if trending:
    # Pick the Chinese fields when the language toggle is on, falling back to
    # the English ones when a translation is missing.
    if lang:
        summary_text = trending.get("trending_summary_zh", "") or trending.get(
            "trending_summary", ""
        )
        topics = trending.get("top_topics_zh", []) or trending.get("top_topics", [])
        keywords = trending.get("keywords_zh", []) or trending.get("keywords", [])
    else:
        summary_text = trending.get("trending_summary", "")
        topics = trending.get("top_topics", [])
        keywords = trending.get("keywords", [])

    # Everything below is model-generated text rendered with
    # unsafe_allow_html=True, so escape it before placing it in the markup.
    summary_html = html.escape(str(summary_text or ""))
    topics_html = " ".join(
        f'<span style="background:#eef1f5;padding:2px 10px;border-radius:12px;'
        f'font-size:12px;font-weight:600;color:#2563eb;">{html.escape(str(t))}</span>'
        for t in (topics or [])
    )
    keywords_html = " ".join(
        f'<span style="background:#fff8e1;padding:2px 10px;border-radius:12px;'
        f'font-size:11px;font-weight:500;color:#9a6700;border:1px solid #f0d060;">'
        f"{html.escape(str(k))}</span>"
        for k in (keywords or [])
    )
    date_range_label = (
        f'<span style="font-size:12px;color:#9a6700;font-weight:600;">'
        f"({html.escape(str(trending_date_range))})</span>"
        if trending_date_range
        else ""
    )
    heading = "🔥 趋势" if lang else "🔥 Trending"
    trending_placeholder.markdown(
        f"""<div class="stats-bar">
<div style="flex:1;min-width:200px;">
<div style="font-size:13px;color:#656d76;margin-bottom:4px;">
{heading} {date_range_label}
</div>
<div style="font-size:13px;color:#424a53;line-height:1.5;">{summary_html}</div>
<div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{topics_html}</div>
<div style="display:flex;gap:6px;flex-wrap:wrap;margin-top:8px;">{keywords_html}</div>
</div>
</div>""",
        unsafe_allow_html=True,
    )
|