File size: 5,970 Bytes
ea972e7
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
"""
HF Paper Reader — 用 Gemini 为每篇论文生成两种摘要 (单次调用,JSON 输出)
Usage:
    python reader.py                                          # 默认读取最新 json
    python reader.py --input hf_papers_2026-03-10.json        # 指定输入
    python reader.py --input hf_papers_2026-03-10.json --top 10  # 只处理前10篇
"""

import argparse
import json
import os
import time
from pathlib import Path

from dotenv import load_dotenv
from google import genai

load_dotenv(Path(__file__).parent / ".env")

GEMINI_API_KEY = os.getenv("GEMINI_API_KEY")
if not GEMINI_API_KEY:
    raise RuntimeError("GEMINI_API_KEY not found in .env")

client = genai.Client(api_key=GEMINI_API_KEY)

SYSTEM_PROMPT = """\
You are a senior AI researcher. Given a paper's title and abstract, produce a JSON object \
with exactly two keys:

1. "concise_summary": A 2-4 sentence plain-language summary explaining WHAT the paper does \
and WHY it matters. Avoid jargon; end with the key result or takeaway.

2. "detailed_analysis": A longer analysis with your own understanding, structured as:
   - "background_and_motivation": 2-3 sentences on what existing problem or gap in the field \
this paper addresses. What prior work fell short, and why is this research needed now?
   - "summary": 4-6 sentences. Go beyond restating the abstract — interpret the approach \
and explain how it fits into the broader research landscape.
   - "pros": A list of 3-4 strengths (novelty, practical impact, methodology, etc.)
   - "cons": A list of 2-3 weaknesses or limitations (scope, assumptions, scalability, etc.)

Reply with ONLY valid JSON — no markdown fences, no extra text. English only."""


def summarize_paper(title: str, abstract: str) -> dict:
    """Make a single Gemini call and return a dict with both summary variants.

    The model is constrained (via SYSTEM_PROMPT and the JSON response MIME
    type) to reply with a JSON object; raises if the reply is not valid JSON.
    """
    request_text = f"Title: {title}\n\nAbstract: {abstract}"
    generation_config = genai.types.GenerateContentConfig(
        system_instruction=SYSTEM_PROMPT,
        temperature=0.3,
        max_output_tokens=16384*4,
        response_mime_type="application/json",
    )
    response = client.models.generate_content(
        model="gemini-3-pro-preview",
        contents=request_text,
        config=generation_config,
    )
    return json.loads(response.text)


def _find_input_file():
    """Return the newest hf_papers_*.json next to this script, or None.

    Files already produced by this script (*_summarized.json) are excluded.
    Lexicographic sort is correct here because the filenames embed ISO dates.
    """
    candidates = sorted(
        p for p in Path(__file__).parent.glob("hf_papers_*.json")
        if "_summarized" not in p.name
    )
    return candidates[-1] if candidates else None


def _summarize_papers(papers):
    """Annotate each paper dict in place with summaries from Gemini.

    Always sets both 'concise_summary' and 'detailed_analysis' keys, even on
    skip (empty values) or failure (error message), so downstream consumers
    can rely on the keys existing.
    """
    total = len(papers)
    for i, paper in enumerate(papers, 1):
        title = paper["title"]
        abstract = paper.get("summary", "")
        if not abstract:
            # No abstract to send — record empty results and move on.
            print(f"[{i}/{total}] 跳过(无摘要): {title}")
            paper["concise_summary"] = ""
            paper["detailed_analysis"] = {}
            continue

        print(f"[{i}/{total}] {title}")
        try:
            result = summarize_paper(title, abstract)
            paper["concise_summary"] = result.get("concise_summary", "")
            paper["detailed_analysis"] = result.get("detailed_analysis", {})
            print(f"  [concise] {paper['concise_summary'][:120]}...")
            summary_preview = paper["detailed_analysis"].get("summary", "")[:120]
            print(f"  [detailed] {summary_preview}...\n")
        except Exception as e:
            # Best-effort: record the failure on the paper and keep going.
            print(f"  ✗ 生成失败: {e}\n")
            paper["concise_summary"] = f"ERROR: {e}"
            paper["detailed_analysis"] = {"error": str(e)}

        # Crude throttling between calls to avoid the API rate limit.
        if i < total:
            time.sleep(1)


def _write_txt_report(papers, txt_path):
    """Write a human-readable plain-text rendering of the summaries."""
    with open(txt_path, "w", encoding="utf-8") as f:
        for i, p in enumerate(papers, 1):
            f.write(f"{'='*80}\n")
            f.write(f"[{i}] {p['title']}\n")
            # .get for robustness: a paper record may lack these optional keys.
            f.write(f"    Upvotes: {p.get('upvotes', 0)}  |  {p.get('hf_url', '')}\n")
            f.write(f"{'='*80}\n\n")
            f.write(f"--- Concise Summary ---\n{p.get('concise_summary', 'N/A')}\n\n")
            da = p.get("detailed_analysis", {})
            # Failed papers carry {"error": ...} here; only render full entries.
            if isinstance(da, dict) and "summary" in da:
                f.write(f"--- Detailed Analysis ---\n")
                if da.get("background_and_motivation"):
                    f.write(f"Background & Motivation:\n{da['background_and_motivation']}\n\n")
                f.write(f"Summary:\n{da['summary']}\n\n")
                f.write(f"Pros:\n")
                for pro in da.get("pros", []):
                    f.write(f"  + {pro}\n")
                f.write(f"\nCons:\n")
                for con in da.get("cons", []):
                    f.write(f"  - {con}\n")
            f.write(f"\n\n")


def main():
    """CLI entry point: load papers, summarize each, save JSON + txt report."""
    parser = argparse.ArgumentParser(description="用 Gemini 为 HF 论文生成摘要")
    parser.add_argument("--input", "-i", type=str, default=None, help="论文 JSON 文件路径")
    parser.add_argument("--output", "-o", type=str, default=None, help="输出 JSON 文件路径")
    parser.add_argument("--top", type=int, default=0, help="只处理前 N 篇 (按 upvotes)")
    args = parser.parse_args()

    # Resolve the input: explicit --input wins, else newest hf_papers_*.json.
    if args.input:
        input_path = Path(args.input)
    else:
        input_path = _find_input_file()
        if input_path is None:
            print("未找到 hf_papers_*.json 文件,请用 --input 指定")
            return

    with open(input_path, "r", encoding="utf-8") as f:
        papers = json.load(f)

    if args.top > 0:
        papers = papers[: args.top]

    print(f"读取 {input_path},共 {len(papers)} 篇论文,开始生成摘要...\n")

    _summarize_papers(papers)

    # Save JSON (default name: <input stem>_summarized.json next to the input).
    output_path = Path(args.output) if args.output else input_path.with_name(
        input_path.stem + "_summarized.json"
    )
    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(papers, f, ensure_ascii=False, indent=2)
    print(f"\n已保存到 {output_path}")

    # Also emit an easy-to-read text version alongside the JSON.
    txt_path = output_path.with_suffix(".txt")
    _write_txt_report(papers, txt_path)
    print(f"文本版已保存到 {txt_path}")


if __name__ == "__main__":
    main()