File size: 5,970 Bytes
ea972e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
HF Paper Reader — 用 Gemini 为每篇论文生成两种摘要 (单次调用,JSON 输出)
Usage:
    python reader.py                                          # 默认读取最新 json
    python reader.py --input hf_papers_2026-03-10.json        # 指定输入
    python reader.py --input hf_papers_2026-03-10.json --top 10  # 只处理前10篇
"""

import argparse
import json
import os
import time
from pathlib import Path

from dotenv import load_dotenv
from google import genai

load_dotenv(Path(__file__).parent / ".env")

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY not found in .env")

client = genai.Client(api_key=GEMINI_API_KEY)

SYSTEM_PROMPT = """\
You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \
with exactly two keys:

1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \
and WHY it matters. Avoid jargon; end with the key result or takeaway.

2. "detailed_analysis": A longer analysis with your own understanding, structured as:
   - "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \
this paper addresses. What prior work fell short, and why is this research needed now?
   - "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \
and explain how it fits into the broader research landscape.
   - "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.)
   - "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.)

Reply with ONLY valid JSON — no markdown fences, no extra text. English only."""


def summarize_paper(title: str, abstract: str) -> dict:
    """Make a single Gemini call and return a dict with both summary variants.

    The model is constrained (via SYSTEM_PROMPT and the JSON response MIME
    type) to reply with a JSON object; raises if the reply is not valid JSON.
    """
    request_text = f"Title: {title}\n\nAbstract: {abstract}"
    generation_config = genai.types.GenerateContentConfig(
        system_instruction=SYSTEM_PROMPT,
        temperature=0.3,
        max_output_tokens=16384*4,
        response_mime_type="application/json",
    )
    response = client.models.generate_content(
        model="gemini-3-pro-preview",
        contents=request_text,
        config=generation_config,
    )
    return json.loads(response.text)


def _find_input_file():
    """Return the newest hf_papers_*.json next to this script, or None.

    Files already produced by this script (*_summarized.json) are excluded.
    Lexicographic sort is correct here because the filenames embed ISO dates.
    """
    candidates = sorted(
        p for p in Path(__file__).parent.glob("hf_papers_*.json")
        if "_summarized" not in p.name
    )
    return candidates[-1] if candidates else None


def _summarize_papers(papers):
    """Annotate each paper dict in place with summaries from Gemini.

    Always sets both 'concise_summary' and 'detailed_analysis' keys, even on
    skip (empty values) or failure (error message), so downstream consumers
    can rely on the keys existing.
    """
    total = len(papers)
    for i, paper in enumerate(papers, 1):
        title = paper["title"]
        abstract = paper.get("summary", "")
        if not abstract:
            # No abstract to send — record empty results and move on.
            print(f"[{i}/{total}] 跳过(无摘要): {title}")
            paper["concise_summary"] = ""
            paper["detailed_analysis"] = {}
            continue

        print(f"[{i}/{total}] {title}")
        try:
            result = summarize_paper(title, abstract)
            paper["concise_summary"] = result.get("concise_summary", "")
            paper["detailed_analysis"] = result.get("detailed_analysis", {})
            print(f"  [concise] {paper['concise_summary'][:120]}...")
            summary_preview = paper["detailed_analysis"].get("summary", "")[:120]
            print(f"  [detailed] {summary_preview}...\n")
        except Exception as e:
            # Best-effort: record the failure on the paper and keep going.
            print(f"  ✗ 生成失败: {e}\n")
            paper["concise_summary"] = f"ERROR: {e}"
            paper["detailed_analysis"] = {"error": str(e)}

        # Crude throttling between calls to avoid the API rate limit.
        if i < total:
            time.sleep(1)


def _write_txt_report(papers, txt_path):
    """Write a human-readable plain-text rendering of the summaries."""
    with open(txt_path, "w", encoding="utf-8") as f:
        for i, p in enumerate(papers, 1):
            f.write(f"{'='*80}\n")
            f.write(f"[{i}] {p['title']}\n")
            # .get for robustness: a paper record may lack these optional keys.
            f.write(f"    Upvotes: {p.get('upvotes', 0)}  |  {p.get('hf_url', '')}\n")
            f.write(f"{'='*80}\n\n")
            f.write(f"--- Concise Summary ---\n{p.get('concise_summary', 'N/A')}\n\n")
            da = p.get("detailed_analysis", {})
            # Failed papers carry {"error": ...} here; only render full entries.
            if isinstance(da, dict) and "summary" in da:
                f.write(f"--- Detailed Analysis ---\n")
                if da.get("background_and_motivation"):
                    f.write(f"Background & Motivation:\n{da['background_and_motivation']}\n\n")
                f.write(f"Summary:\n{da['summary']}\n\n")
                f.write(f"Pros:\n")
                for pro in da.get("pros", []):
                    f.write(f"  + {pro}\n")
                f.write(f"\nCons:\n")
                for con in da.get("cons", []):
                    f.write(f"  - {con}\n")
            f.write(f"\n\n")


def main():
    """CLI entry point: load papers, summarize each, save JSON + txt report."""
    parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要")
    parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径")
    parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径")
    parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)")
    args = parser.parse_args()

    # Resolve the input: explicit --input wins, else newest hf_papers_*.json.
    if args.input:
        input_path = Path(args.input)
    else:
        input_path = _find_input_file()
        if input_path is None:
            print("未找到 hf_papers_*.json 文件,请用 --input 指定")
            return

    with open(input_path, "r", encoding="utf-8") as f:
        papers = json.load(f)

    if args.top > 0:
        papers = papers[: args.top]

    print(f"读取 {input_path},共 {len(papers)} 篇论文,开始生成摘要...\n")

    _summarize_papers(papers)

    # Save JSON (default name: <input stem>_summarized.json next to the input).
    output_path = Path(args.output) if args.output else input_path.with_name(
        input_path.stem + "_summarized.json"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"\n已保存到 {output_path}")

    # Also emit an easy-to-read text version alongside the JSON.
    txt_path = output_path.with_suffix(".txt")
    _write_txt_report(papers, txt_path)
    print(f"文本版已保存到 {txt_path}")


if __name__ == "__main__":
    main()