stardust-coder committed
Commit 5f897a9 · 1 Parent(s): 0a1e821

[add] link to paper

Files changed (2):
  1. README.md +0 -19
  2. src/streamlit_app.py +200 -37
README.md DELETED
@@ -1,19 +0,0 @@
----
-title: Paper Extractor
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
-pinned: false
-short_description: Streamlit template space
----
-
-# Welcome to Streamlit!
-
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
src/streamlit_app.py CHANGED
@@ -1,3 +1,4 @@
+import time
 import re
 import requests
 import streamlit as st
@@ -15,12 +16,11 @@ def get_openai_client():
     return OpenAI(api_key=api_key)
 
 
-def ask_llm(prompt, model="gpt-4.1-mini"):
+def ask_llm(prompt, model="gpt-5-nano"):
    client = get_openai_client()
     res = client.chat.completions.create(
         model=model,
         messages=[{"role": "user", "content": prompt}],
-        temperature=0.2,
     )
     return (res.choices[0].message.content or "").strip()
 
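The hunk above switches the default model from gpt-4.1-mini to gpt-5-nano and drops the pinned temperature=0.2, leaving default sampling. A minimal standalone sketch of the resulting call path (the inline client and the prompt are illustrative stand-ins for the app's get_openai_client helper; assumes the openai package and an OPENAI_API_KEY environment variable):

import os
from openai import OpenAI

def ask_llm(prompt, model="gpt-5-nano"):
    # Same request shape as the app: one user message, default sampling.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    res = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return (res.choices[0].message.content or "").strip()

if __name__ == "__main__":
    print(ask_llm("Say hello in five words."))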
@@ -61,24 +61,114 @@ def deduplicate_papers(papers):
 # arXiv Search
 # =========================
 
+import re
+import xml.etree.ElementTree as ET
+
+
+def normalize_space(text: str) -> str:
+    return re.sub(r"\s+", " ", text or "").strip()
+
+
+def extract_venue_from_arxiv(journal_ref: str, comment: str) -> str:
+    text = f"{journal_ref} {comment}".strip()
+
+    if not text:
+        return ""
+
+    # Common international conference and journal abbreviations
+    venue_patterns = [
+        r"\bNeurIPS\s*\d{4}\b",
+        r"\bNIPS\s*\d{4}\b",
+        r"\bICML\s*\d{4}\b",
+        r"\bICLR\s*\d{4}\b",
+        r"\bACL\s*\d{4}\b",
+        r"\bEMNLP\s*\d{4}\b",
+        r"\bNAACL\s*\d{4}\b",
+        r"\bCOLING\s*\d{4}\b",
+        r"\bCVPR\s*\d{4}\b",
+        r"\bICCV\s*\d{4}\b",
+        r"\bECCV\s*\d{4}\b",
+        r"\bAAAI\s*\d{4}\b",
+        r"\bIJCAI\s*\d{4}\b",
+        r"\bKDD\s*\d{4}\b",
+        r"\bSIGIR\s*\d{4}\b",
+        r"\bWWW\s*\d{4}\b",
+        r"\bTheWebConf\s*\d{4}\b",
+        r"\bCHI\s*\d{4}\b",
+        r"\bUAI\s*\d{4}\b",
+        r"\bAISTATS\s*\d{4}\b",
+        r"\bICRA\s*\d{4}\b",
+        r"\bIROS\s*\d{4}\b",
+    ]
+
+    for pattern in venue_patterns:
+        m = re.search(pattern, text, flags=re.IGNORECASE)
+        if m:
+            return m.group(0)
+
+    # If journal_ref exists, use it as the venue first
+    if journal_ref:
+        return journal_ref
+
+    # If the comment contains Accepted / Published / To appear, treat it as a venue candidate
+    accepted_patterns = [
+        r"(?:Accepted|Accepted at|Accepted to|To appear in|Published in)\s+(.+?)(?:\.|$)",
+        r"(?:Proceedings of)\s+(.+?)(?:\.|$)",
+    ]
+
+    for pattern in accepted_patterns:
+        m = re.search(pattern, comment, flags=re.IGNORECASE)
+        if m:
+            return normalize_space(m.group(1))
+
+    return ""
+
 def parse_arxiv_response(xml_text):
     root = ET.fromstring(xml_text)
     papers = []
 
-    for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
-        title_el = entry.find("{http://www.w3.org/2005/Atom}title")
-        abstract_el = entry.find("{http://www.w3.org/2005/Atom}summary")
-        date_el = entry.find("{http://www.w3.org/2005/Atom}published")
+    ATOM = "{http://www.w3.org/2005/Atom}"
+    ARXIV = "{http://arxiv.org/schemas/atom}"
+
+    for entry in root.findall(f"{ATOM}entry"):
+        title_el = entry.find(f"{ATOM}title")
+        abstract_el = entry.find(f"{ATOM}summary")
+        date_el = entry.find(f"{ATOM}published")
+        id_el = entry.find(f"{ATOM}id")
+
+        journal_ref_el = entry.find(f"{ARXIV}journal_ref")
+        comment_el = entry.find(f"{ARXIV}comment")
 
         authors = []
-        for a in entry.findall("{http://www.w3.org/2005/Atom}author"):
-            name_el = a.find("{http://www.w3.org/2005/Atom}name")
+        for a in entry.findall(f"{ATOM}author"):
+            name_el = a.find(f"{ATOM}name")
             if name_el is not None and name_el.text:
-                authors.append(name_el.text.strip())
+                authors.append(normalize_space(name_el.text))
 
-        title = title_el.text.strip() if title_el is not None and title_el.text else ""
-        abstract = abstract_el.text.strip() if abstract_el is not None and abstract_el.text else ""
-        date = date_el.text.strip() if date_el is not None and date_el.text else ""
+        title = normalize_space(title_el.text) if title_el is not None and title_el.text else ""
+        abstract = normalize_space(abstract_el.text) if abstract_el is not None and abstract_el.text else ""
+        date = normalize_space(date_el.text) if date_el is not None and date_el.text else ""
+        url = normalize_space(id_el.text) if id_el is not None and id_el.text else ""
+
+        journal_ref = (
+            normalize_space(journal_ref_el.text)
+            if journal_ref_el is not None and journal_ref_el.text
+            else ""
+        )
+
+        comment = (
+            normalize_space(comment_el.text)
+            if comment_el is not None and comment_el.text
+            else ""
+        )
+
+        venue = extract_venue_from_arxiv(journal_ref, comment)
+
+        pdf_url = ""
+        for link in entry.findall(f"{ATOM}link"):
+            if link.attrib.get("title") == "pdf":
+                pdf_url = link.attrib.get("href", "")
+                break
 
         if title:
             papers.append(
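The new extract_venue_from_arxiv resolves a venue in three steps: a known conference/journal abbreviation plus year anywhere in journal_ref or comment, then the raw journal_ref, then a phrase captured after Accepted / To appear / Published in. Hypothetical inputs tracing each branch (assumes the definitions in the hunk above):

print(extract_venue_from_arxiv("", "Accepted at NeurIPS 2024 (spotlight)"))
# -> "NeurIPS 2024"                            (abbreviation + year pattern)
print(extract_venue_from_arxiv("J. Mach. Learn. Res. 24 (2023) 1-10", ""))
# -> "J. Mach. Learn. Res. 24 (2023) 1-10"     (journal_ref used verbatim)
print(extract_venue_from_arxiv("", "To appear in Foundations and Trends in ML."))
# -> "Foundations and Trends in ML"            (Accepted/To-appear capture)
print(extract_venue_from_arxiv("", "12 pages, 3 figures"))
# -> ""                                        (no venue signal)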
@@ -88,16 +178,46 @@ def parse_arxiv_response(xml_text):
                     "abstract": abstract,
                     "date": date,
                     "source": "arXiv",
-                    "venue": "",
-                    "url": "",
+                    "venue": venue,
+                    "journal_ref": journal_ref,
+                    "comment": comment,
+                    "url": url,
+                    "pdf_url": pdf_url,
                 }
             )
 
     return papers
 
 
-def search_arxiv_once(search_query, max_results=3):
-    url = "https://export.arxiv.org/api/query"
+
+ARXIV_API_URL = "https://export.arxiv.org/api/query"
+_last_arxiv_request_time = 0
+
+
+def escape_arxiv_phrase(text: str) -> str:
+    """
+    Apply minimal escaping for arXiv phrase searches.
+    """
+    text = text.strip()
+    text = text.replace('"', " ")
+    text = re.sub(r"\s+", " ", text)
+    return text
+
+
+def wait_for_arxiv_rate_limit(min_interval=3.2):
+    """
+    The arXiv API is sensitive to rapid consecutive requests, so leave at least 3 seconds between them.
+    """
+    global _last_arxiv_request_time
+
+    elapsed = time.time() - _last_arxiv_request_time
+    if elapsed < min_interval:
+        time.sleep(min_interval - elapsed)
+
+
+def search_arxiv_once(search_query, max_results=3, retries=3):
+    global _last_arxiv_request_time
+
     params = {
         "search_query": search_query,
         "start": 0,
@@ -106,31 +226,69 @@ def search_arxiv_once(search_query, max_results=3):
         "sortOrder": "descending",
     }
 
-    res = requests.get(
-        url,
-        params=params,
-        timeout=30,
-        headers={"User-Agent": "paper-finder/0.1"},
-    )
-    res.raise_for_status()
-    return parse_arxiv_response(res.text)
+    headers = {
+        "User-Agent": "paper-finder/0.1 contact:your-email@example.com"
+    }
+
+    last_error = None
+
+    for attempt in range(retries):
+        wait_for_arxiv_rate_limit()
+
+        try:
+            res = requests.get(
+                ARXIV_API_URL,
+                params=params,
+                timeout=30,
+                headers=headers,
+            )
 
+            _last_arxiv_request_time = time.time()
+
+            if res.status_code == 429:
+                wait = 5 * (attempt + 1)
+                time.sleep(wait)
+                last_error = RuntimeError("arXiv rate limited: 429")
+                continue
+
+            res.raise_for_status()
+            return parse_arxiv_response(res.text)
+
+        except requests.RequestException as e:
+            last_error = e
+            time.sleep(2 * (attempt + 1))
+
+    raise last_error
 
 def search_arxiv(query, max_results=3, debug=False):
     query = normalize_text(query)
     if not query:
         return []
 
+    query = escape_arxiv_phrase(query)
+
     terms = [t for t in re.split(r"\s+", query) if t]
+
     strategies = []
 
-    # Try strategies from loosest first
-    strategies.append(f'all:{query}')
+    # Phrase search first
     strategies.append(f'all:"{query}"')
+
+    # Title search
     strategies.append(f'ti:"{query}"')
 
+    # Also add abstract search
+    strategies.append(f'abs:"{query}"')
+
+    # Word-level AND search
     if terms:
-        strategies.append(" AND ".join([f'all:{t}' for t in terms]))
+        safe_terms = [escape_arxiv_phrase(t) for t in terms]
+        strategies.append(" AND ".join([f'all:{t}' for t in safe_terms]))
+
+    # Finally, a looser word-level OR search
+    if len(terms) >= 2:
+        safe_terms = [escape_arxiv_phrase(t) for t in terms]
+        strategies.append(" OR ".join([f'all:{t}' for t in safe_terms]))
 
     seen = set()
     all_papers = []
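With the retry/rate-limit wrapper and the widened strategy cascade, one search_arxiv call now fans out over phrase, title, abstract, AND, and OR queries, deduplicating as it goes. A hypothetical usage sketch (requires network access and the definitions above; the query string is illustrative):

papers = search_arxiv("retrieval augmented generation", max_results=3)
for p in papers:
    # Each result dict now also carries venue, journal_ref, comment, url, pdf_url.
    print(p["date"][:10], "|", p.get("venue") or "-", "|", p["title"])
    print("    ", p.get("pdf_url") or p.get("url"))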
@@ -157,7 +315,6 @@ def search_arxiv(query, max_results=3, debug=False):
 
     return all_papers[:max_results]
 
-
 # =========================
 # OpenAlex Search
 # =========================
@@ -408,15 +565,16 @@ st.title("📚 Paper Finder")
 
 st.sidebar.header("Settings")
 
-openai_api_key = st.sidebar.text_input("OpenAI API Key", type="password")
-if openai_api_key:
-    st.session_state["OPENAI_API_KEY"] = openai_api_key
+import os
+openai_api_key = os.getenv("OPENAI_API_KEY")
+st.session_state["OPENAI_API_KEY"] = openai_api_key
 
-model = st.sidebar.selectbox(
-    "Model",
-    ["gpt-4.1-mini", "gpt-4.1", "gpt-4o-mini"],
-    index=0,
-)
+# model = st.sidebar.selectbox(
+#     "Model",
+#     ["gpt-5-nano"],
+#     index=0,
+# )
+model = "gpt-5-nano"
 
 debug_mode = st.sidebar.checkbox("Debug mode", value=True)
 
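The sidebar key field is gone: the key is now read once from the environment (on Hugging Face Spaces, typically a Space secret named OPENAI_API_KEY) and mirrored into st.session_state. A hypothetical fail-fast check one might run at startup, since the committed code stores None silently and a missing key only surfaces on the first request:

import os

if not os.getenv("OPENAI_API_KEY"):
    # Hypothetical guard; not part of the committed code.
    raise RuntimeError("OPENAI_API_KEY is not set (e.g. add it as a Space secret)")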
@@ -527,7 +685,12 @@ if st.button("Search Papers"):
             summary = f"Summary generation failed: {e}"
 
         st.markdown("---")
-        st.subheader(p.get("title", "Untitled"))
+        title = p.get("title", "No title")
+        url = p.get("url")
+        if url:
+            st.markdown(f"### [{title}]({url})")
+        else:
+            st.markdown(f"### {title}")
         st.write("**Explanation:**")
         st.write(summary)
         st.write("**Authors:**", ", ".join(p.get("authors", [])) if p.get("authors") else "-")
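The plain st.subheader becomes a markdown heading that links to the paper's arXiv abstract page whenever a url was parsed. A minimal illustration with a hypothetical paper dict:

p = {"title": "Attention Is All You Need", "url": "http://arxiv.org/abs/1706.03762"}
title = p.get("title", "No title")
url = p.get("url")
line = f"### [{title}]({url})" if url else f"### {title}"
print(line)  # st.markdown(line) renders this as a clickable level-3 heading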
 