stardust-coder committed
Commit 5f897a9 · 1 Parent(s): 0a1e821

[add] link to paper

Files changed (2):
  1. README.md +0 -19
  2. src/streamlit_app.py +200 -37
README.md DELETED
@@ -1,19 +0,0 @@
----
-title: Paper Extractor
-emoji: 🚀
-colorFrom: red
-colorTo: red
-sdk: docker
-app_port: 8501
-tags:
-- streamlit
-pinned: false
-short_description: Streamlit template space
----
-
-# Welcome to Streamlit!
-
-Edit `/src/streamlit_app.py` to customize this app to your heart's desire. :heart:
-
-If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
-forums](https://discuss.streamlit.io).
src/streamlit_app.py CHANGED
@@ -1,3 +1,4 @@
+import time
 import re
 import requests
 import streamlit as st
@@ -15,12 +16,11 @@ def get_openai_client():
     return OpenAI(api_key=api_key)
 
 
-def ask_llm(prompt, model="gpt-4.1-mini"):
+def ask_llm(prompt, model="gpt-5-nano"):
    client = get_openai_client()
     res = client.chat.completions.create(
         model=model,
         messages=[{"role": "user", "content": prompt}],
-        temperature=0.2,
     )
     return (res.choices[0].message.content or "").strip()
 
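The hunk above switches the default model from gpt-4.1-mini to gpt-5-nano and drops the pinned temperature=0.2, leaving default sampling. A minimal standalone sketch of the resulting call path (the inline client and the prompt are illustrative stand-ins for the app's get_openai_client helper; assumes the openai package and an OPENAI_API_KEY environment variable):

import os
from openai import OpenAI

def ask_llm(prompt, model="gpt-5-nano"):
    # Same request shape as the app: one user message, default sampling.
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
    res = client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
    )
    return (res.choices[0].message.content or "").strip()

if __name__ == "__main__":
    print(ask_llm("Say hello in five words."))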
@@ -61,24 +61,114 @@ def deduplicate_papers(papers):
 # arXiv Search
 # =========================
 
+import re
+import xml.etree.ElementTree as ET
+
+
+def normalize_space(text: str) -> str:
+    return re.sub(r"\s+", " ", text or "").strip()
+
+
+def extract_venue_from_arxiv(journal_ref: str, comment: str) -> str:
+    text = f"{journal_ref} {comment}".strip()
+
+    if not text:
+        return ""
+
+    # Common international conference and journal abbreviations
+    venue_patterns = [
+        r"\bNeurIPS\s*\d{4}\b",
+        r"\bNIPS\s*\d{4}\b",
+        r"\bICML\s*\d{4}\b",
+        r"\bICLR\s*\d{4}\b",
+        r"\bACL\s*\d{4}\b",
+        r"\bEMNLP\s*\d{4}\b",
+        r"\bNAACL\s*\d{4}\b",
+        r"\bCOLING\s*\d{4}\b",
+        r"\bCVPR\s*\d{4}\b",
+        r"\bICCV\s*\d{4}\b",
+        r"\bECCV\s*\d{4}\b",
+        r"\bAAAI\s*\d{4}\b",
+        r"\bIJCAI\s*\d{4}\b",
+        r"\bKDD\s*\d{4}\b",
+        r"\bSIGIR\s*\d{4}\b",
+        r"\bWWW\s*\d{4}\b",
+        r"\bTheWebConf\s*\d{4}\b",
+        r"\bCHI\s*\d{4}\b",
+        r"\bUAI\s*\d{4}\b",
+        r"\bAISTATS\s*\d{4}\b",
+        r"\bICRA\s*\d{4}\b",
+        r"\bIROS\s*\d{4}\b",
+    ]
+
+    for pattern in venue_patterns:
+        m = re.search(pattern, text, flags=re.IGNORECASE)
+        if m:
+            return m.group(0)
+
+    # If journal_ref exists, use it as the venue first
+    if journal_ref:
+        return journal_ref
+
+    # If the comment contains Accepted / Published / To appear, treat it as a venue candidate
+    accepted_patterns = [
+        r"(?:Accepted|Accepted at|Accepted to|To appear in|Published in)\s+(.+?)(?:\.|$)",
+        r"(?:Proceedings of)\s+(.+?)(?:\.|$)",
+    ]
+
+    for pattern in accepted_patterns:
+        m = re.search(pattern, comment, flags=re.IGNORECASE)
+        if m:
+            return normalize_space(m.group(1))
+
+    return ""
+
 def parse_arxiv_response(xml_text):
     root = ET.fromstring(xml_text)
     papers = []
 
-    for entry in root.findall("{http://www.w3.org/2005/Atom}entry"):
-        title_el = entry.find("{http://www.w3.org/2005/Atom}title")
-        abstract_el = entry.find("{http://www.w3.org/2005/Atom}summary")
-        date_el = entry.find("{http://www.w3.org/2005/Atom}published")
+    ATOM = "{http://www.w3.org/2005/Atom}"
+    ARXIV = "{http://arxiv.org/schemas/atom}"
+
+    for entry in root.findall(f"{ATOM}entry"):
+        title_el = entry.find(f"{ATOM}title")
+        abstract_el = entry.find(f"{ATOM}summary")
+        date_el = entry.find(f"{ATOM}published")
+        id_el = entry.find(f"{ATOM}id")
+
+        journal_ref_el = entry.find(f"{ARXIV}journal_ref")
+        comment_el = entry.find(f"{ARXIV}comment")
 
         authors = []
-        for a in entry.findall("{http://www.w3.org/2005/Atom}author"):
-            name_el = a.find("{http://www.w3.org/2005/Atom}name")
+        for a in entry.findall(f"{ATOM}author"):
+            name_el = a.find(f"{ATOM}name")
             if name_el is not None and name_el.text:
-                authors.append(name_el.text.strip())
+                authors.append(normalize_space(name_el.text))
 
-        title = title_el.text.strip() if title_el is not None and title_el.text else ""
-        abstract = abstract_el.text.strip() if abstract_el is not None and abstract_el.text else ""
-        date = date_el.text.strip() if date_el is not None and date_el.text else ""
+        title = normalize_space(title_el.text) if title_el is not None and title_el.text else ""
+        abstract = normalize_space(abstract_el.text) if abstract_el is not None and abstract_el.text else ""
+        date = normalize_space(date_el.text) if date_el is not None and date_el.text else ""
+        url = normalize_space(id_el.text) if id_el is not None and id_el.text else ""
+
+        journal_ref = (
+            normalize_space(journal_ref_el.text)
+            if journal_ref_el is not None and journal_ref_el.text
+            else ""
+        )
+
+        comment = (
+            normalize_space(comment_el.text)
+            if comment_el is not None and comment_el.text
+            else ""
+        )
+
+        venue = extract_venue_from_arxiv(journal_ref, comment)
+
+        pdf_url = ""
+        for link in entry.findall(f"{ATOM}link"):
+            if link.attrib.get("title") == "pdf":
+                pdf_url = link.attrib.get("href", "")
+                break
 
         if title:
             papers.append(
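The new extract_venue_from_arxiv resolves a venue in three steps: a known conference/journal abbreviation plus year anywhere in journal_ref or comment, then the raw journal_ref, then a phrase captured after Accepted / To appear / Published in. Hypothetical inputs tracing each branch (assumes the definitions in the hunk above):

print(extract_venue_from_arxiv("", "Accepted at NeurIPS 2024 (spotlight)"))
# -> "NeurIPS 2024"                            (abbreviation + year pattern)
print(extract_venue_from_arxiv("J. Mach. Learn. Res. 24 (2023) 1-10", ""))
# -> "J. Mach. Learn. Res. 24 (2023) 1-10"     (journal_ref used verbatim)
print(extract_venue_from_arxiv("", "To appear in Foundations and Trends in ML."))
# -> "Foundations and Trends in ML"            (Accepted/To-appear capture)
print(extract_venue_from_arxiv("", "12 pages, 3 figures"))
# -> ""                                        (no venue signal)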
@@ -88,16 +178,46 @@ def parse_arxiv_response(xml_text):
                     "abstract": abstract,
                     "date": date,
                     "source": "arXiv",
-                    "venue": "",
-                    "url": "",
+                    "venue": venue,
+                    "journal_ref": journal_ref,
+                    "comment": comment,
+                    "url": url,
+                    "pdf_url": pdf_url,
                 }
             )
 
     return papers
 
 
-def search_arxiv_once(search_query, max_results=3):
-    url = "https://export.arxiv.org/api/query"
+
+ARXIV_API_URL = "https://export.arxiv.org/api/query"
+_last_arxiv_request_time = 0
+
+
+def escape_arxiv_phrase(text: str) -> str:
+    """
+    Apply minimal escaping for arXiv phrase searches.
+    """
+    text = text.strip()
+    text = text.replace('"', " ")
+    text = re.sub(r"\s+", " ", text)
+    return text
+
+
+def wait_for_arxiv_rate_limit(min_interval=3.2):
+    """
+    The arXiv API is sensitive to rapid consecutive requests, so leave at least 3 seconds between them.
+    """
+    global _last_arxiv_request_time
+
+    elapsed = time.time() - _last_arxiv_request_time
+    if elapsed < min_interval:
+        time.sleep(min_interval - elapsed)
+
+
+def search_arxiv_once(search_query, max_results=3, retries=3):
+    global _last_arxiv_request_time
+
     params = {
         "search_query": search_query,
         "start": 0,
@@ -106,31 +226,69 @@ def search_arxiv_once(search_query, max_results=3):
         "sortOrder": "descending",
     }
 
-    res = requests.get(
-        url,
-        params=params,
-        timeout=30,
-        headers={"User-Agent": "paper-finder/0.1"},
-    )
-    res.raise_for_status()
-    return parse_arxiv_response(res.text)
+    headers = {
+        "User-Agent": "paper-finder/0.1 contact:your-email@example.com"
+    }
+
+    last_error = None
+
+    for attempt in range(retries):
+        wait_for_arxiv_rate_limit()
+
+        try:
+            res = requests.get(
+                ARXIV_API_URL,
+                params=params,
+                timeout=30,
+                headers=headers,
+            )
 
+            _last_arxiv_request_time = time.time()
+
+            if res.status_code == 429:
+                wait = 5 * (attempt + 1)
+                time.sleep(wait)
+                last_error = RuntimeError("arXiv rate limited: 429")
+                continue
+
+            res.raise_for_status()
+            return parse_arxiv_response(res.text)
+
+        except requests.RequestException as e:
+            last_error = e
+            time.sleep(2 * (attempt + 1))
+
+    raise last_error
 
 def search_arxiv(query, max_results=3, debug=False):
     query = normalize_text(query)
     if not query:
         return []
 
+    query = escape_arxiv_phrase(query)
+
     terms = [t for t in re.split(r"\s+", query) if t]
+
     strategies = []
 
-    # Try strategies from loosest first
-    strategies.append(f'all:{query}')
+    # Phrase search first
     strategies.append(f'all:"{query}"')
+
+    # Title search
     strategies.append(f'ti:"{query}"')
 
+    # Also add abstract search
+    strategies.append(f'abs:"{query}"')
+
+    # Word-level AND search
     if terms:
-        strategies.append(" AND ".join([f'all:{t}' for t in terms]))
+        safe_terms = [escape_arxiv_phrase(t) for t in terms]
+        strategies.append(" AND ".join([f'all:{t}' for t in safe_terms]))
+
+    # Finally, a looser word-level OR search
+    if len(terms) >= 2:
+        safe_terms = [escape_arxiv_phrase(t) for t in terms]
+        strategies.append(" OR ".join([f'all:{t}' for t in safe_terms]))
 
     seen = set()
     all_papers = []
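With the retry/rate-limit wrapper and the widened strategy cascade, one search_arxiv call now fans out over phrase, title, abstract, AND, and OR queries, deduplicating as it goes. A hypothetical usage sketch (requires network access and the definitions above; the query string is illustrative):

papers = search_arxiv("retrieval augmented generation", max_results=3)
for p in papers:
    # Each result dict now also carries venue, journal_ref, comment, url, pdf_url.
    print(p["date"][:10], "|", p.get("venue") or "-", "|", p["title"])
    print("    ", p.get("pdf_url") or p.get("url"))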
@@ -157,7 +315,6 @@ def search_arxiv(query, max_results=3, debug=False):
 
     return all_papers[:max_results]
 
-
 # =========================
 # OpenAlex Search
 # =========================
@@ -408,15 +565,16 @@ st.title("📚 Paper Finder")
 
 st.sidebar.header("Settings")
 
-openai_api_key = st.sidebar.text_input("OpenAI API Key", type="password")
-if openai_api_key:
-    st.session_state["OPENAI_API_KEY"] = openai_api_key
+import os
+openai_api_key = os.getenv("OPENAI_API_KEY")
+st.session_state["OPENAI_API_KEY"] = openai_api_key
 
-model = st.sidebar.selectbox(
-    "Model",
-    ["gpt-4.1-mini", "gpt-4.1", "gpt-4o-mini"],
-    index=0,
-)
+# model = st.sidebar.selectbox(
+#     "Model",
+#     ["gpt-5-nano"],
+#     index=0,
+# )
+model = "gpt-5-nano"
 
 debug_mode = st.sidebar.checkbox("Debug mode", value=True)
 
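The sidebar key field is gone: the key is now read once from the environment (on Hugging Face Spaces, typically a Space secret named OPENAI_API_KEY) and mirrored into st.session_state. A hypothetical fail-fast check one might run at startup, since the committed code stores None silently and a missing key only surfaces on the first request:

import os

if not os.getenv("OPENAI_API_KEY"):
    # Hypothetical guard; not part of the committed code.
    raise RuntimeError("OPENAI_API_KEY is not set (e.g. add it as a Space secret)")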
@@ -527,7 +685,12 @@ if st.button("Search Papers"):
             summary = f"Summary generation failed: {e}"
 
         st.markdown("---")
-        st.subheader(p.get("title", "Untitled"))
+        title = p.get("title", "No title")
+        url = p.get("url")
+        if url:
+            st.markdown(f"### [{title}]({url})")
+        else:
+            st.markdown(f"### {title}")
         st.write("**Explanation:**")
         st.write(summary)
         st.write("**Authors:**", ", ".join(p.get("authors", [])) if p.get("authors") else "-")
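The plain st.subheader becomes a markdown heading that links to the paper's arXiv abstract page whenever a url was parsed. A minimal illustration with a hypothetical paper dict:

p = {"title": "Attention Is All You Need", "url": "http://arxiv.org/abs/1706.03762"}
title = p.get("title", "No title")
url = p.get("url")
line = f"### [{title}]({url})" if url else f"### {title}"
print(line)  # st.markdown(line) renders this as a clickable level-3 heading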
 