| import os |
| import json |
| import dimcli |
| import pandas as pd |
| import plotly.express as px |
| import streamlit as st |
| import scholarpy |
| import leafmap.foliumap as leafmap |
| import datetime |
|
|
# Upper bound for the publication-year sliders on this page.
current_year = datetime.datetime.now().year

# Create the Dimensions DSL client once and keep it across Streamlit reruns.
if "dsl" not in st.session_state:
    st.session_state["dsl"] = scholarpy.Dsl()

# Local folder used to cache CSV exports of query results.
FOLDER_NAME = "data"
if not os.path.exists(FOLDER_NAME):
    os.mkdir(FOLDER_NAME)
|
|
|
|
def save(df, filename_dot_csv):
    """Write *df* as CSV into the module cache folder, dropping the index.

    Args:
        df: pandas DataFrame to persist.
        filename_dot_csv: File name including the ``.csv`` extension.
    """
    # os.path.join is portable, unlike manual "/" concatenation.
    df.to_csv(os.path.join(FOLDER_NAME, filename_dot_csv), index=False)
|
|
|
|
def read(filename_dot_csv):
    """Read a CSV previously written by :func:`save` from the cache folder.

    Args:
        filename_dot_csv: File name including the ``.csv`` extension.

    Returns:
        pandas DataFrame with every column parsed by ``pd.read_csv``.
    """
    # os.path.join is portable, unlike manual "/" concatenation.
    return pd.read_csv(os.path.join(FOLDER_NAME, filename_dot_csv))
|
|
|
|
@st.cache_data
def get_token():
    """Return the Dimensions API token from the DIM_TOKEN env var (None if unset)."""
    return os.getenv("DIM_TOKEN")
|
|
|
|
@st.cache_data
def get_journals():
    """Load the journal-category mapping from ``data/journals.json``.

    Returns:
        dict keyed by category name; each value maps journal title to a
        Dimensions journal id (or None) — see how :func:`app` consumes it.
    """
    # JSON is UTF-8 by spec; pin the encoding so the platform default
    # (e.g. cp1252 on Windows) cannot corrupt the read.
    with open("data/journals.json", encoding="utf-8") as f:
        journals = json.load(f)
    return journals
|
|
|
|
@st.cache_data
def read_excel(sheet_name):
    """Read one sheet of ``data/journals.xlsx`` and index it by journal rank.

    Args:
        sheet_name: Name of the worksheet (one per journal category).

    Returns:
        pandas DataFrame indexed by the "Rank" column.
    """
    sheet = pd.read_excel(
        "data/journals.xlsx",
        sheet_name=sheet_name,
        index_col=False,
        engine="openpyxl",
    )
    sheet.set_index("Rank", inplace=True)
    return sheet
|
|
|
|
def app():
    """Render the "Search Journals" Streamlit page.

    Two modes, chosen by a radio button:

    * "Search by journal title" — query Dimensions for matching journal
      titles, then list (optionally keyword-filtered) publications for a
      selected journal.
    * "List Google Scholar journal categories" — browse curated categories,
      pull up to 1,000 publications for a journal, and show author /
      affiliation / researcher-ID / ORCID statistics.

    Side effects: writes CSV snapshots via :func:`save` and stores the
    extracted ORCID list in ``st.session_state["orcids"]`` for other pages.
    """
    st.title("Search Journals")
    dsl = st.session_state["dsl"]
    search_type = st.radio(
        "Select a search type",
        ["Search by journal title", "List Google Scholar journal categories"],
    )

    if search_type == "Search by journal title":
        row1_col1, row1_col2, row1_col3, _ = st.columns([1, 1, 2, 1])
        with row1_col1:
            name = st.text_input("Enter a journal title")

        with row1_col2:
            exact_match = st.checkbox("Exact match")

        with row1_col3:
            options = [
                "book",
                "book_series",
                "proceeding",
                "journal",
                "preprint_platform",
            ]
            types = st.multiselect(
                "Select journal types", options, ["journal", "book_series"]
            )

        if name:
            result = dsl.search_journal_by_title(name, exact_match=exact_match)
            if result is not None:
                titles = result.as_dataframe()
                # Keep only the requested source types, sorted for display.
                titles = titles[titles["type"].isin(types)]
                titles.sort_values("title", inplace=True)
            else:
                titles = pd.DataFrame()

            if not titles.empty:
                markdown = f"""
                Returned Journals: {len(titles)}

                """
                st.markdown(markdown)

                st.dataframe(titles)
                # Composite label so selectbox entries are unambiguous;
                # the journal id is recovered below via split(" | ").
                titles["uid"] = (
                    titles["id"] + " | " + titles["type"] + " | " + titles["title"]
                )

                row2_col1, row2_col2, row2_col3, row2_col4, row2_col5 = st.columns(
                    [2.4, 1, 0.6, 1, 1]
                )

                with row2_col1:
                    title = st.selectbox(
                        "Select a journal title", titles["uid"].values.tolist()
                    )

                with row2_col2:
                    keyword = st.text_input("Enter a keyword to search for")

                with row2_col3:
                    exact_match = st.checkbox("Exact match", True)

                with row2_col4:
                    scope = st.selectbox(
                        "Select a search scope",
                        [
                            "authors",
                            "concepts",
                            "full_data",
                            "full_data_exact",
                            "title_abstract_only",
                            "title_only",
                        ],
                        index=5,
                    )

                with row2_col5:
                    years = st.slider(
                        "Select the start and end year:",
                        1950,
                        current_year,
                        (1980, current_year),
                    )

                if title:
                    journal_id = title.split(" | ")[0]
                    if keyword:
                        pubs = dsl.search_pubs_by_keyword(
                            keyword, exact_match, scope, years[0], years[1], journal_id
                        )
                    else:
                        pubs = dsl.search_pubs_by_journal_id(
                            journal_id, years[0], years[1]
                        )
                    pubs_df = pubs.as_dataframe()
                    if pubs_df is not None and (not pubs_df.empty):
                        st.write(
                            f"Total number of publications: {pubs.count_total:,}. Display {min(pubs.count_total, 1000)} publications below."
                        )
                        try:
                            st.dataframe(pubs_df)
                        except Exception:
                            # Some results contain nested objects st.dataframe
                            # cannot render; fall back to a flattened frame.
                            st.dataframe(scholarpy.json_to_df(pubs))

                        leafmap.st_download_button(
                            "Download data", pubs_df, csv_sep="\t"
                        )
                    else:
                        st.text("No results found")

    elif search_type == "List Google Scholar journal categories":
        st.markdown(
            """
        The journal categories are adopted from [Google Scholar](https://scholar.google.com/citations?view_op=top_venues&hl=en&inst=9897619243961157265).
        See the list of journals [here](https://docs.google.com/spreadsheets/d/1uCEi3TsJCWl9QEZimvjlM8wjt7hNq3QvMqHGeT44HXQ/edit?usp=sharing).
        """
        )

        # Reset the cross-page ORCID list until a fresh query repopulates it.
        st.session_state["orcids"] = None

        categories = get_journals()

        row1_col1, row1_col2, _, row1_col3 = st.columns([1, 1, 0.05, 1])

        with row1_col1:
            category = st.selectbox("Select a category:", categories.keys())

        if category:
            with row1_col2:
                journal = st.selectbox("Select a journal:", categories[category].keys())

            with row1_col3:
                years = st.slider(
                    "Select the start and end year:",
                    1950,
                    current_year,
                    (1980, current_year),
                )

            if journal:
                # Journal metrics workbook has one sheet per category.
                metrics = read_excel(sheet_name=category)
                with st.expander("Show journal metrics"):
                    st.dataframe(metrics)

                journal_id = categories[category][journal]
                # Prefer the Dimensions journal id ("jour....") when the
                # mapping provides one; otherwise match on the journal title.
                if journal_id is not None and str(journal_id).startswith("jour"):
                    q_template = """search publications where
                        journal.id="{}" and
                        year>={} and
                        year<={}
                        return publications[id+title+doi+year+authors+type+pages+journal+issue+volume+altmetric+times_cited]
                        limit 1000"""
                    q = q_template.format(journal_id, years[0], years[1])
                else:
                    q_template = """search publications where
                        journal.title="{}" and
                        year>={} and
                        year<={}
                        return publications[id+title+doi+year+authors+type+pages+journal+issue+volume+altmetric+times_cited]
                        limit 1000"""
                    q = q_template.format(journal, years[0], years[1])

                pubs = dsl.query(q)
                if pubs.count_total > 0:
                    st.header("Publications")
                    st.write(
                        f"Total number of publications: {pubs.count_total:,}. Display 1,000 publications below."
                    )
                    df_pubs = pubs.as_dataframe()
                    # Round-trip through CSV so nested values become plain
                    # strings that st.dataframe can always render.
                    save(df_pubs, "publications.csv")
                    df_pubs = read("publications.csv")
                    st.dataframe(df_pubs)

                    st.header("Authors")
                    authors = pubs.as_dataframe_authors()
                    st.write(
                        f"Total number of authors of the 1,000 pubs shown above: {authors.shape[0]:,}"
                    )
                    save(authors, "authors.csv")
                    df_authors = read("authors.csv")
                    st.dataframe(df_authors)

                    df_authors_orcid = df_authors[~df_authors["orcid"].isna()]
                    orcids = list(set(df_authors_orcid["orcid"].values.tolist()))
                    # After the CSV round-trip the orcid column appears to be a
                    # stringified list (e.g. "['0000-0002-...']"); [2:21] slices
                    # out the 19-character ORCID — TODO confirm with real data.
                    orcids = [i[2:21] for i in orcids]
                    orcids.sort()
                    # Publish for other pages (e.g. researcher lookups).
                    st.session_state["orcids"] = orcids

                    st.header("Affiliations")
                    affiliations = pubs.as_dataframe_authors_affiliations()
                    st.write(
                        f"Total number of affiliations of the 1,000 pubs shown above: {affiliations.shape[0]:,}"
                    )
                    save(affiliations, "affiliations.csv")
                    df_affiliations = read("affiliations.csv")
                    st.dataframe(df_affiliations)

                    researchers = authors.query("researcher_id!=''")
                    df_researchers = pd.DataFrame(
                        {
                            "measure": [
                                "Authors in total (non unique)",
                                "Authors with a researcher ID",
                                "Authors with a researcher ID (unique)",
                            ],
                            "count": [
                                len(authors),
                                len(researchers),
                                researchers["researcher_id"].nunique(),
                            ],
                        }
                    )
                    fig_researchers = px.bar(
                        df_researchers,
                        x="measure",
                        y="count",
                        title=f"Author Research ID stats for {journal} ({years[0]}-{years[1]})",
                    )

                    orcids = authors.query("orcid!=''")
                    df_orcids = pd.DataFrame(
                        {
                            "measure": [
                                "Authors in total (non unique)",
                                "Authors with a ORCID",
                                "Authors with a ORCID (unique)",
                            ],
                            "count": [
                                len(authors),
                                len(orcids),
                                orcids["orcid"].nunique(),
                            ],
                        }
                    )
                    fig_orcids = px.bar(
                        df_orcids,
                        x="measure",
                        y="count",
                        title=f"Author ORCID stats for {journal} ({years[0]}-{years[1]})",
                    )

                    st.header("Stats")

                    # Fixed: the second column was previously bound to
                    # row1_col2, shadowing the selector column above.
                    row2_col1, row2_col2 = st.columns(2)
                    with row2_col1:
                        st.plotly_chart(fig_researchers)
                    with row2_col2:
                        st.plotly_chart(fig_orcids)

                else:
                    st.warning("No publications found")
|
|