stardust-coder committed on
Commit
d3d1949
·
1 Parent(s): 738680d

[add] first commit

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -1
  2. src/streamlit_app.py +119 -38
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  altair
2
  pandas
3
- streamlit
 
 
 
1
  altair
2
  pandas
3
+ streamlit
4
+ requests
5
+ beautifulsoup4
src/streamlit_app.py CHANGED
@@ -1,40 +1,121 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
 
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import json
2
+ import time
3
+
4
+ import requests
5
  import streamlit as st
6
+ from bs4 import BeautifulSoup
7
+
8
+
9
def scrape_listings(soup):
    """Collect every search-result container from a parsed Google Scholar page."""
    # Each hit on a results page lives in a div carrying these three classes.
    results = soup.select("div.gs_r.gs_or.gs_scl")
    return results
12
+
13
+
14
def scrape_scholar_title(listing):
    """Pull the paper title out of one result container.

    Returns a placeholder string when the title link is absent
    (e.g. citation-only entries with no anchor).
    """
    anchor = listing.select_one("h3.gs_rt > a")
    return anchor.text.strip() if anchor else "タイトルなし"
20
+
21
+
22
def scrape_scholar_publication_info(listing):
    """Extract the author/venue byline from one result container.

    Returns a placeholder string when the byline div is missing.
    """
    byline = listing.select_one("div.gs_a")
    return byline.text.strip() if byline else "出版情報なし"
28
+
29
+
30
def scrape_scholar_snippet(listing):
    """Extract the abstract snippet from one result container.

    Tries both snippet class variants; returns a placeholder string
    when neither is present.
    """
    snippet = listing.select_one("div.gs_rs, div.gs_snippet")
    return snippet.text.strip() if snippet else "スニペットなし"
36
+
37
+
38
# Fetch and parse search results from Google Scholar.
def fetch_google_scholar_data(query):
    """Query Google Scholar and return a list of result dicts.

    Args:
        query: Search keywords passed as the ``q`` parameter.

    Returns:
        A list of dicts, each with the keys ``title``,
        ``publication_info`` and ``snippet``.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    url = "https://scholar.google.com/scholar"
    params = {
        "hl": "en",
        "q": query,
    }

    # A browser-like User-Agent reduces the chance of an immediate block.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        )
    }

    # Politeness delay BEFORE hitting the server. The original slept
    # after the response had already arrived, which does nothing to
    # rate-limit the request itself.
    time.sleep(2)

    # FIX: dropped ``verify=False`` — disabling TLS certificate
    # verification exposes the request to man-in-the-middle attacks and
    # emits urllib3 InsecureRequestWarning. Default verification stays on.
    response = requests.get(url, params=params, headers=headers, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    listings = scrape_listings(soup)

    scholar_data = []
    for listing in listings:
        scholar_data.append(
            {
                "title": scrape_scholar_title(listing),
                "publication_info": scrape_scholar_publication_info(listing),
                "snippet": scrape_scholar_snippet(listing),
            }
        )

    return scholar_data
76
+
77
+
78
def main():
    """Streamlit entry point: search form, result listing, JSON export."""
    st.set_page_config(page_title="Google Scholar Scraper", layout="wide")
    st.title("Google Scholar Scraper")
    st.write("Google Scholar の検索結果を取得して表示します。")

    query = st.text_input("検索キーワード", value="biology")

    # Nothing to do until the user presses the search button.
    if not st.button("検索"):
        return

    with st.spinner("Google Scholar からデータ取得中..."):
        try:
            scholar_data = fetch_google_scholar_data(query)

            if not scholar_data:
                st.warning("検索結果が取得できませんでした。")
                return

            st.success(f"{len(scholar_data)} 件の結果を取得しました。")

            # One container per hit: title, byline, snippet, divider.
            for rank, entry in enumerate(scholar_data, start=1):
                with st.container():
                    st.subheader(f"{rank}. {entry['title']}")
                    st.write(f"**Publication Info:** {entry['publication_info']}")
                    st.write(f"**Snippet:** {entry['snippet']}")
                    st.divider()

            # Offer the raw results both as a downloadable file and inline.
            st.download_button(
                label="JSONをダウンロード",
                data=json.dumps(scholar_data, indent=4, ensure_ascii=False),
                file_name="google_scholar_data.json",
                mime="application/json",
            )

            st.json(scholar_data)

        except requests.exceptions.RequestException as e:
            st.error(f"リクエスト中にエラーが発生しました: {e}")
        except Exception as e:
            # Top-level UI boundary: surface anything unexpected to the user.
            st.error(f"予期しないエラーが発生しました: {e}")
118
+
119
 
120
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()