stardust-coder committed on
Commit
d3d1949
·
1 Parent(s): 738680d

[add] first commit

Browse files
Files changed (2) hide show
  1. requirements.txt +3 -1
  2. src/streamlit_app.py +119 -38
requirements.txt CHANGED
@@ -1,3 +1,5 @@
1
  altair
2
  pandas
3
- streamlit
 
 
 
1
  altair
2
  pandas
3
+ streamlit
4
+ requests
5
+ beautifulsoup4
src/streamlit_app.py CHANGED
@@ -1,40 +1,121 @@
1
- import altair as alt
2
- import numpy as np
3
- import pandas as pd
 
4
  import streamlit as st
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
5
 
6
- """
7
- # Welcome to Streamlit!
8
-
9
- Edit `/streamlit_app.py` to customize this app to your heart's desire :heart:.
10
- If you have any questions, checkout our [documentation](https://docs.streamlit.io) and [community
11
- forums](https://discuss.streamlit.io).
12
-
13
- In the meantime, below is an example of what you can do with just a few lines of code:
14
- """
15
-
16
- num_points = st.slider("Number of points in spiral", 1, 10000, 1100)
17
- num_turns = st.slider("Number of turns in spiral", 1, 300, 31)
18
-
19
- indices = np.linspace(0, 1, num_points)
20
- theta = 2 * np.pi * num_turns * indices
21
- radius = indices
22
-
23
- x = radius * np.cos(theta)
24
- y = radius * np.sin(theta)
25
-
26
- df = pd.DataFrame({
27
- "x": x,
28
- "y": y,
29
- "idx": indices,
30
- "rand": np.random.randn(num_points),
31
- })
32
-
33
- st.altair_chart(alt.Chart(df, height=700, width=700)
34
- .mark_point(filled=True)
35
- .encode(
36
- x=alt.X("x", axis=None),
37
- y=alt.Y("y", axis=None),
38
- color=alt.Color("idx", legend=None, scale=alt.Scale()),
39
- size=alt.Size("rand", legend=None, scale=alt.Scale(range=[1, 150])),
40
- ))
 
1
+ import json
2
+ import time
3
+
4
+ import requests
5
  import streamlit as st
6
+ from bs4 import BeautifulSoup
7
+
8
+
9
def scrape_listings(soup):
    """Collect every search-result container from a parsed Google Scholar page."""
    # Each hit on a results page lives in a div carrying these three classes.
    results = soup.select("div.gs_r.gs_or.gs_scl")
    return results
12
+
13
+
14
def scrape_scholar_title(listing):
    """Pull the paper title out of one result container.

    Returns a placeholder string when the title link is absent
    (e.g. citation-only entries with no anchor).
    """
    anchor = listing.select_one("h3.gs_rt > a")
    return anchor.text.strip() if anchor else "タイトルなし"
20
+
21
+
22
def scrape_scholar_publication_info(listing):
    """Extract the author/venue byline from one result container.

    Returns a placeholder string when the byline div is missing.
    """
    byline = listing.select_one("div.gs_a")
    return byline.text.strip() if byline else "出版情報なし"
28
+
29
+
30
def scrape_scholar_snippet(listing):
    """Extract the abstract snippet from one result container.

    Tries both snippet class variants; returns a placeholder string
    when neither is present.
    """
    snippet = listing.select_one("div.gs_rs, div.gs_snippet")
    return snippet.text.strip() if snippet else "スニペットなし"
36
+
37
+
38
# Fetch and parse search results from Google Scholar.
def fetch_google_scholar_data(query):
    """Query Google Scholar and return a list of result dicts.

    Args:
        query: Search keywords passed as the ``q`` parameter.

    Returns:
        A list of dicts, each with the keys ``title``,
        ``publication_info`` and ``snippet``.

    Raises:
        requests.exceptions.RequestException: on network failure or a
            non-2xx HTTP status (via ``raise_for_status``).
    """
    url = "https://scholar.google.com/scholar"
    params = {
        "hl": "en",
        "q": query,
    }

    # A browser-like User-Agent reduces the chance of an immediate block.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/122.0.0.0 Safari/537.36"
        )
    }

    # Politeness delay BEFORE hitting the server. The original slept
    # after the response had already arrived, which does nothing to
    # rate-limit the request itself.
    time.sleep(2)

    # FIX: dropped ``verify=False`` — disabling TLS certificate
    # verification exposes the request to man-in-the-middle attacks and
    # emits urllib3 InsecureRequestWarning. Default verification stays on.
    response = requests.get(url, params=params, headers=headers, timeout=15)
    response.raise_for_status()

    soup = BeautifulSoup(response.text, "html.parser")
    listings = scrape_listings(soup)

    scholar_data = []
    for listing in listings:
        scholar_data.append(
            {
                "title": scrape_scholar_title(listing),
                "publication_info": scrape_scholar_publication_info(listing),
                "snippet": scrape_scholar_snippet(listing),
            }
        )

    return scholar_data
76
+
77
+
78
def main():
    """Streamlit entry point: search form, result listing, JSON export."""
    st.set_page_config(page_title="Google Scholar Scraper", layout="wide")
    st.title("Google Scholar Scraper")
    st.write("Google Scholar の検索結果を取得して表示します。")

    query = st.text_input("検索キーワード", value="biology")

    # Nothing to do until the user presses the search button.
    if not st.button("検索"):
        return

    with st.spinner("Google Scholar からデータ取得中..."):
        try:
            scholar_data = fetch_google_scholar_data(query)

            if not scholar_data:
                st.warning("検索結果が取得できませんでした。")
                return

            st.success(f"{len(scholar_data)} 件の結果を取得しました。")

            # One container per hit: title, byline, snippet, divider.
            for rank, entry in enumerate(scholar_data, start=1):
                with st.container():
                    st.subheader(f"{rank}. {entry['title']}")
                    st.write(f"**Publication Info:** {entry['publication_info']}")
                    st.write(f"**Snippet:** {entry['snippet']}")
                    st.divider()

            # Offer the raw results both as a downloadable file and inline.
            st.download_button(
                label="JSONをダウンロード",
                data=json.dumps(scholar_data, indent=4, ensure_ascii=False),
                file_name="google_scholar_data.json",
                mime="application/json",
            )

            st.json(scholar_data)

        except requests.exceptions.RequestException as e:
            st.error(f"リクエスト中にエラーが発生しました: {e}")
        except Exception as e:
            # Top-level UI boundary: surface anything unexpected to the user.
            st.error(f"予期しないエラーが発生しました: {e}")
118
+
119
 
120
# Script entry point: run the Streamlit app when executed directly.
if __name__ == "__main__":
    main()