Spaces:
Running
Running
Commit ·
5f897a9
1
Parent(s): 0a1e821
[add] link to paper
Browse files- README.md +0 -19
- src/streamlit_app.py +200 -37
README.md
DELETED
|
@@ -1,19 +0,0 @@
|
|
| 1 |
-
---
|
| 2 |
-
title: Paper Extractor
|
| 3 |
-
emoji: 🚀
|
| 4 |
-
colorFrom: red
|
| 5 |
-
colorTo: red
|
| 6 |
-
sdk: docker
|
| 7 |
-
app_port: 8501
|
| 8 |
-
tags:
|
| 9 |
-
- streamlit
|
| 10 |
-
pinned: false
|
| 11 |
-
short_description: Streamlit template space
|
| 12 |
-
---
|
| 13 |
-
|
| 14 |
-
# Welcome to Streamlit!
|
| 15 |
-
|
| 16 |
-
Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
|
| 17 |
-
|
| 18 |
-
If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
|
| 19 |
-
forums](https://discuss.streamlit.io).
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
src/streamlit_app.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
import re
|
| 2 |
import requests
|
| 3 |
import streamlit as st
|
|
@@ -15,12 +16,11 @@ def get_openai_client():
|
|
| 15 |
return OpenAI(api_key=api_key)
|
| 16 |
|
| 17 |
|
| 18 |
-
def ask_llm(prompt, model="gpt-
|
| 19 |
client = get_openai_client()
|
| 20 |
res = client.chat.completions.create(
|
| 21 |
model=model,
|
| 22 |
messages=[{"role": "user", "content": prompt}],
|
| 23 |
-
temperature=0.2,
|
| 24 |
)
|
| 25 |
return (res.choices[0].message.content or "").strip()
|
| 26 |
|
|
@@ -61,24 +61,114 @@ def deduplicate_papers(papers):
|
|
| 61 |
# arXiv Search
|
| 62 |
# =========================
|
| 63 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 64 |
def parse_arxiv_response(xml_text):
|
| 65 |
root = ET.fromstring(xml_text)
|
| 66 |
papers = []
|
| 67 |
|
| 68 |
-
|
| 69 |
-
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 72 |
|
| 73 |
authors = []
|
| 74 |
-
for a in entry.findall("{
|
| 75 |
-
name_el = a.find("{
|
| 76 |
if name_el is not None and name_el.text:
|
| 77 |
-
authors.append(name_el.text
|
| 78 |
|
| 79 |
-
title = title_el.text
|
| 80 |
-
abstract = abstract_el.text
|
| 81 |
-
date = date_el.text
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 82 |
|
| 83 |
if title:
|
| 84 |
papers.append(
|
|
@@ -88,16 +178,46 @@ def parse_arxiv_response(xml_text):
|
|
| 88 |
"abstract": abstract,
|
| 89 |
"date": date,
|
| 90 |
"source": "arXiv",
|
| 91 |
-
"venue":
|
| 92 |
-
"
|
|
|
|
|
|
|
|
|
|
| 93 |
}
|
| 94 |
)
|
| 95 |
|
| 96 |
return papers
|
| 97 |
|
| 98 |
|
| 99 |
-
|
| 100 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 101 |
params = {
|
| 102 |
"search_query": search_query,
|
| 103 |
"start": 0,
|
|
@@ -106,31 +226,69 @@ def search_arxiv_once(search_query, max_results=3):
|
|
| 106 |
"sortOrder": "descending",
|
| 107 |
}
|
| 108 |
|
| 109 |
-
|
| 110 |
-
|
| 111 |
-
|
| 112 |
-
|
| 113 |
-
|
| 114 |
-
|
| 115 |
-
|
| 116 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 117 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 118 |
|
| 119 |
def search_arxiv(query, max_results=3, debug=False):
|
| 120 |
query = normalize_text(query)
|
| 121 |
if not query:
|
| 122 |
return []
|
| 123 |
|
|
|
|
|
|
|
| 124 |
terms = [t for t in re.split(r"\s+", query) if t]
|
|
|
|
| 125 |
strategies = []
|
| 126 |
|
| 127 |
-
#
|
| 128 |
-
strategies.append(f'all:{query}')
|
| 129 |
strategies.append(f'all:"{query}"')
|
|
|
|
|
|
|
| 130 |
strategies.append(f'ti:"{query}"')
|
| 131 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 132 |
if terms:
|
| 133 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 134 |
|
| 135 |
seen = set()
|
| 136 |
all_papers = []
|
|
@@ -157,7 +315,6 @@ def search_arxiv(query, max_results=3, debug=False):
|
|
| 157 |
|
| 158 |
return all_papers[:max_results]
|
| 159 |
|
| 160 |
-
|
| 161 |
# =========================
|
| 162 |
# OpenAlex Search
|
| 163 |
# =========================
|
|
@@ -408,15 +565,16 @@ st.title("📚 Paper Finder")
|
|
| 408 |
|
| 409 |
st.sidebar.header("Settings")
|
| 410 |
|
| 411 |
-
|
| 412 |
-
|
| 413 |
-
|
| 414 |
|
| 415 |
-
model = st.sidebar.selectbox(
|
| 416 |
-
|
| 417 |
-
|
| 418 |
-
|
| 419 |
-
)
|
|
|
|
| 420 |
|
| 421 |
debug_mode = st.sidebar.checkbox("Debug mode", value=True)
|
| 422 |
|
|
@@ -527,7 +685,12 @@ if st.button("Search Papers"):
|
|
| 527 |
summary = f"要約生成に失敗しました: {e}"
|
| 528 |
|
| 529 |
st.markdown("---")
|
| 530 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 531 |
st.write("**Explanation:**")
|
| 532 |
st.write(summary)
|
| 533 |
st.write("**Authors:**", ", ".join(p.get("authors", [])) if p.get("authors") else "-")
|
|
|
|
| 1 |
+
import time
|
| 2 |
import re
|
| 3 |
import requests
|
| 4 |
import streamlit as st
|
|
|
|
| 16 |
return OpenAI(api_key=api_key)
|
| 17 |
|
| 18 |
|
| 19 |
+
def ask_llm(prompt, model="gpt-5-nano"):
    """Send *prompt* as a single user message to the chat completions API.

    Returns the model's reply text with surrounding whitespace stripped,
    or an empty string when the model produced no content.
    """
    client = get_openai_client()
    response = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    content = response.choices[0].message.content
    return (content or "").strip()
|
| 26 |
|
|
|
|
| 61 |
# arXiv Search
|
| 62 |
# =========================
|
| 63 |
|
| 64 |
+
import re
|
| 65 |
+
import xml.etree.ElementTree as ET
|
| 66 |
+
|
| 67 |
+
|
| 68 |
+
def normalize_space(text: str) -> str:
    """Collapse every run of whitespace in *text* to a single space.

    Treats None (and any other falsy value) as the empty string, and strips
    leading/trailing whitespace from the result.
    """
    cleaned = text or ""
    collapsed = re.sub(r"\s+", " ", cleaned)
    return collapsed.strip()
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def extract_venue_from_arxiv(journal_ref: str, comment: str) -> str:
    """Best-effort extraction of a publication venue from arXiv metadata.

    Search order:
      1. Well-known conference/journal abbreviations (with a year) anywhere
         in the combined journal_ref + comment text.
      2. The raw journal_ref, when present.
      3. "Accepted ..." / "Published in ..." style phrases in the comment.

    Returns an empty string when nothing venue-like is found.
    """
    combined = f"{journal_ref} {comment}".strip()
    if not combined:
        return ""

    # Common international conference / journal abbreviations.
    known_venues = (
        r"\bNeurIPS\s*\d{4}\b",
        r"\bNIPS\s*\d{4}\b",
        r"\bICML\s*\d{4}\b",
        r"\bICLR\s*\d{4}\b",
        r"\bACL\s*\d{4}\b",
        r"\bEMNLP\s*\d{4}\b",
        r"\bNAACL\s*\d{4}\b",
        r"\bCOLING\s*\d{4}\b",
        r"\bCVPR\s*\d{4}\b",
        r"\bICCV\s*\d{4}\b",
        r"\bECCV\s*\d{4}\b",
        r"\bAAAI\s*\d{4}\b",
        r"\bIJCAI\s*\d{4}\b",
        r"\bKDD\s*\d{4}\b",
        r"\bSIGIR\s*\d{4}\b",
        r"\bWWW\s*\d{4}\b",
        r"\bTheWebConf\s*\d{4}\b",
        r"\bCHI\s*\d{4}\b",
        r"\bUAI\s*\d{4}\b",
        r"\bAISTATS\s*\d{4}\b",
        r"\bICRA\s*\d{4}\b",
        r"\bIROS\s*\d{4}\b",
    )
    for venue_re in known_venues:
        hit = re.search(venue_re, combined, flags=re.IGNORECASE)
        if hit is not None:
            return hit.group(0)

    # If a journal_ref exists, it is the most authoritative venue string.
    if journal_ref:
        return journal_ref

    # Otherwise mine the comment for Accepted / Published / To appear notes.
    acceptance_res = (
        r"(?:Accepted|Accepted at|Accepted to|To appear in|Published in)\s+(.+?)(?:\.|$)",
        r"(?:Proceedings of)\s+(.+?)(?:\.|$)",
    )
    for acceptance_re in acceptance_res:
        hit = re.search(acceptance_re, comment, flags=re.IGNORECASE)
        if hit is not None:
            return normalize_space(hit.group(1))

    return ""
|
| 125 |
+
|
| 126 |
def parse_arxiv_response(xml_text):
|
| 127 |
root = ET.fromstring(xml_text)
|
| 128 |
papers = []
|
| 129 |
|
| 130 |
+
ATOM = "{http://www.w3.org/2005/Atom}"
|
| 131 |
+
ARXIV = "{http://arxiv.org/schemas/atom}"
|
| 132 |
+
|
| 133 |
+
for entry in root.findall(f"{ATOM}entry"):
|
| 134 |
+
title_el = entry.find(f"{ATOM}title")
|
| 135 |
+
abstract_el = entry.find(f"{ATOM}summary")
|
| 136 |
+
date_el = entry.find(f"{ATOM}published")
|
| 137 |
+
id_el = entry.find(f"{ATOM}id")
|
| 138 |
+
|
| 139 |
+
journal_ref_el = entry.find(f"{ARXIV}journal_ref")
|
| 140 |
+
comment_el = entry.find(f"{ARXIV}comment")
|
| 141 |
|
| 142 |
authors = []
|
| 143 |
+
for a in entry.findall(f"{ATOM}author"):
|
| 144 |
+
name_el = a.find(f"{ATOM}name")
|
| 145 |
if name_el is not None and name_el.text:
|
| 146 |
+
authors.append(normalize_space(name_el.text))
|
| 147 |
|
| 148 |
+
title = normalize_space(title_el.text) if title_el is not None and title_el.text else ""
|
| 149 |
+
abstract = normalize_space(abstract_el.text) if abstract_el is not None and abstract_el.text else ""
|
| 150 |
+
date = normalize_space(date_el.text) if date_el is not None and date_el.text else ""
|
| 151 |
+
url = normalize_space(id_el.text) if id_el is not None and id_el.text else ""
|
| 152 |
+
|
| 153 |
+
journal_ref = (
|
| 154 |
+
normalize_space(journal_ref_el.text)
|
| 155 |
+
if journal_ref_el is not None and journal_ref_el.text
|
| 156 |
+
else ""
|
| 157 |
+
)
|
| 158 |
+
|
| 159 |
+
comment = (
|
| 160 |
+
normalize_space(comment_el.text)
|
| 161 |
+
if comment_el is not None and comment_el.text
|
| 162 |
+
else ""
|
| 163 |
+
)
|
| 164 |
+
|
| 165 |
+
venue = extract_venue_from_arxiv(journal_ref, comment)
|
| 166 |
+
|
| 167 |
+
pdf_url = ""
|
| 168 |
+
for link in entry.findall(f"{ATOM}link"):
|
| 169 |
+
if link.attrib.get("title") == "pdf":
|
| 170 |
+
pdf_url = link.attrib.get("href", "")
|
| 171 |
+
break
|
| 172 |
|
| 173 |
if title:
|
| 174 |
papers.append(
|
|
|
|
| 178 |
"abstract": abstract,
|
| 179 |
"date": date,
|
| 180 |
"source": "arXiv",
|
| 181 |
+
"venue": venue,
|
| 182 |
+
"journal_ref": journal_ref,
|
| 183 |
+
"comment": comment,
|
| 184 |
+
"url": url,
|
| 185 |
+
"pdf_url": pdf_url,
|
| 186 |
}
|
| 187 |
)
|
| 188 |
|
| 189 |
return papers
|
| 190 |
|
| 191 |
|
| 192 |
+
|
| 193 |
+
# arXiv Atom-feed query endpoint used by the search helpers below.
ARXIV_API_URL = "https://export.arxiv.org/api/query"
# time.time() of the most recent arXiv request; read/updated by the request
# helpers to keep a minimum interval between consecutive API calls.
_last_arxiv_request_time = 0
|
| 195 |
+
|
| 196 |
+
|
| 197 |
+
def escape_arxiv_phrase(text: str) -> str:
    """Minimally escape *text* for use inside an arXiv phrase query.

    Embedded double quotes would terminate the quoted phrase in the arXiv
    `search_query` syntax, so they are replaced with spaces; whitespace runs
    are then collapsed to single spaces and the result is stripped.

    Fix: strip AFTER the quote replacement — the original stripped first,
    so an input like '"deep learning"' produced ' deep learning ' with
    stray edge spaces inside the quoted query.
    """
    text = text.replace('"', " ")
    text = re.sub(r"\s+", " ", text)
    return text.strip()
|
| 205 |
+
|
| 206 |
+
|
| 207 |
+
def wait_for_arxiv_rate_limit(min_interval=3.2):
    """Sleep as needed so consecutive arXiv API calls are spaced out.

    The arXiv API dislikes rapid-fire requests, so this enforces a gap of at
    least *min_interval* seconds since the previous request (as recorded in
    the module-level ``_last_arxiv_request_time``).
    """
    global _last_arxiv_request_time

    remaining = min_interval - (time.time() - _last_arxiv_request_time)
    if remaining > 0:
        time.sleep(remaining)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def search_arxiv_once(search_query, max_results=3, retries=3):
|
| 219 |
+
global _last_arxiv_request_time
|
| 220 |
+
|
| 221 |
params = {
|
| 222 |
"search_query": search_query,
|
| 223 |
"start": 0,
|
|
|
|
| 226 |
"sortOrder": "descending",
|
| 227 |
}
|
| 228 |
|
| 229 |
+
headers = {
|
| 230 |
+
"User-Agent": "paper-finder/0.1 contact:your-email@example.com"
|
| 231 |
+
}
|
| 232 |
+
|
| 233 |
+
last_error = None
|
| 234 |
+
|
| 235 |
+
for attempt in range(retries):
|
| 236 |
+
wait_for_arxiv_rate_limit()
|
| 237 |
+
|
| 238 |
+
try:
|
| 239 |
+
res = requests.get(
|
| 240 |
+
ARXIV_API_URL,
|
| 241 |
+
params=params,
|
| 242 |
+
timeout=30,
|
| 243 |
+
headers=headers,
|
| 244 |
+
)
|
| 245 |
|
| 246 |
+
_last_arxiv_request_time = time.time()
|
| 247 |
+
|
| 248 |
+
if res.status_code == 429:
|
| 249 |
+
wait = 5 * (attempt + 1)
|
| 250 |
+
time.sleep(wait)
|
| 251 |
+
last_error = RuntimeError("arXiv rate limited: 429")
|
| 252 |
+
continue
|
| 253 |
+
|
| 254 |
+
res.raise_for_status()
|
| 255 |
+
return parse_arxiv_response(res.text)
|
| 256 |
+
|
| 257 |
+
except requests.RequestException as e:
|
| 258 |
+
last_error = e
|
| 259 |
+
time.sleep(2 * (attempt + 1))
|
| 260 |
+
|
| 261 |
+
raise last_error
|
| 262 |
|
| 263 |
def search_arxiv(query, max_results=3, debug=False):
|
| 264 |
query = normalize_text(query)
|
| 265 |
if not query:
|
| 266 |
return []
|
| 267 |
|
| 268 |
+
query = escape_arxiv_phrase(query)
|
| 269 |
+
|
| 270 |
terms = [t for t in re.split(r"\s+", query) if t]
|
| 271 |
+
|
| 272 |
strategies = []
|
| 273 |
|
| 274 |
+
# まずフレーズ検索
|
|
|
|
| 275 |
strategies.append(f'all:"{query}"')
|
| 276 |
+
|
| 277 |
+
# タイトル検索
|
| 278 |
strategies.append(f'ti:"{query}"')
|
| 279 |
|
| 280 |
+
# abstract検索も追加
|
| 281 |
+
strategies.append(f'abs:"{query}"')
|
| 282 |
+
|
| 283 |
+
# 単語AND検索
|
| 284 |
if terms:
|
| 285 |
+
safe_terms = [escape_arxiv_phrase(t) for t in terms]
|
| 286 |
+
strategies.append(" AND ".join([f'all:{t}' for t in safe_terms]))
|
| 287 |
+
|
| 288 |
+
# 最後に緩めの単語OR検索
|
| 289 |
+
if len(terms) >= 2:
|
| 290 |
+
safe_terms = [escape_arxiv_phrase(t) for t in terms]
|
| 291 |
+
strategies.append(" OR ".join([f'all:{t}' for t in safe_terms]))
|
| 292 |
|
| 293 |
seen = set()
|
| 294 |
all_papers = []
|
|
|
|
| 315 |
|
| 316 |
return all_papers[:max_results]
|
| 317 |
|
|
|
|
| 318 |
# =========================
|
| 319 |
# OpenAlex Search
|
| 320 |
# =========================
|
|
|
|
| 565 |
|
| 566 |
st.sidebar.header("Settings")
|
| 567 |
|
| 568 |
+
import os
|
| 569 |
+
openai_api_key = os.getenv("OPENAI_API_KEY")
|
| 570 |
+
st.session_state["OPENAI_API_KEY"] = openai_api_key
|
| 571 |
|
| 572 |
+
# model = st.sidebar.selectbox(
|
| 573 |
+
# "Model",
|
| 574 |
+
# ["gpt-5-nano"],
|
| 575 |
+
# index=0,
|
| 576 |
+
# )
|
| 577 |
+
model = "gpt-5-nano"
|
| 578 |
|
| 579 |
debug_mode = st.sidebar.checkbox("Debug mode", value=True)
|
| 580 |
|
|
|
|
| 685 |
summary = f"要約生成に失敗しました: {e}"
|
| 686 |
|
| 687 |
st.markdown("---")
|
| 688 |
+
title = p.get("title", "No title")
|
| 689 |
+
url = p.get("url")
|
| 690 |
+
if url:
|
| 691 |
+
st.markdown(f"### [{title}]({url})")
|
| 692 |
+
else:
|
| 693 |
+
st.markdown(f"### {title}")
|
| 694 |
st.write("**Explanation:**")
|
| 695 |
st.write(summary)
|
| 696 |
st.write("**Authors:**", ", ".join(p.get("authors", [])) if p.get("authors") else "-")
|