## Packages
import json

import pandas as pd
import requests
import streamlit as st
from tqdm import tqdm

path = "Climate_site/python_scripts/"

# OpenAlex entity URLs look like "https://openalex.org/A1234567890";
# the bare id starts right after this prefix (21 characters).
OPENALEX_PREFIX_LEN = len("https://openalex.org/")
# Polite-pool contact parameter appended to every OpenAlex request.
MAILTO = "mailto=emma_scharfmann@berkeley.edu"


@st.cache_data  # 👈 cache so the TSV is parsed only once per session
def load_data():
    """Load the institutions TSV into a dict keyed by institution id.

    Returns a mapping {institution_id: row-dict}; rows carry at least
    "longitude" and "latitude" (used below to geolocate collaborators).
    """
    url = path + "institutions.tsv"
    dic = pd.read_csv(url, delimiter="\t", index_col=1).to_dict("index")
    return dic


dic_institutions = load_data()

# ISO-3166 alpha-2 codes grouped by continent, used to filter results.
dic_country_codes = {
    'Europe': ['AD', 'AL', 'AT', 'BE', 'BG', 'BY', 'CZ', 'DE', 'DK', 'EE',
               'FI', 'FR', 'GR', 'HU', 'IE', 'IS', 'IT', 'LI', 'LT', 'LU',
               'LV', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RU', 'SE',
               'SI', 'SK', 'SM', 'UA', 'VA', 'BA', 'HR', 'MD', 'MC', 'ME',
               'RS', 'ES', 'CH', 'GB'],
    'Asia': ['AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'CY', 'GE',
             'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KP', 'KR',
             'KW', 'LB', 'MM', 'MN', 'MV', 'MY', 'NP', 'OM', 'PH', 'PK',
             'QA', 'SA', 'SG', 'SY', 'TH', 'TJ', 'TM', 'TR', 'UZ', 'VN',
             'YE', 'KH', 'TL', 'KZ', 'LA', 'LK', 'AE'],
    # NOTE: a duplicated 'GT' entry was removed here (harmless for .isin
    # but a data defect nonetheless).
    'North America': ['AG', 'BB', 'BS', 'BZ', 'CA', 'CR', 'CU', 'DM', 'DO',
                      'GT', 'HN', 'JM', 'MX', 'NI', 'PA', 'TT', 'US', 'SV',
                      'GD', 'KN', 'LC', 'VC'],
    'Africa': ['AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CG', 'CI', 'CM', 'CV',
               'DJ', 'EG', 'ER', 'ET', 'GA', 'GH', 'GM', 'GN', 'GW', 'KE',
               'LR', 'LS', 'LY', 'MG', 'ML', 'MR', 'MU', 'MW', 'MZ', 'NA',
               'NE', 'NG', 'RW', 'SC', 'SD', 'SL', 'SN', 'SO', 'ST', 'TG',
               'TN', 'TZ', 'UG', 'ZM', 'ZW', 'DZ', 'CF', 'TD', 'KM', 'GQ',
               'MA', 'ZA', 'SZ'],
    'South America': ['AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'GY', 'PE', 'PY',
                      'SR', 'UY', 'VE'],
    'Oceania': ['AU', 'FJ', 'KI', 'MH', 'NR', 'NZ', 'PG', 'PW', 'SB', 'TO',
                'TV', 'VU', 'FM', 'WS'],
}


#################### General Functions #############################

def URL(base_URL, entity_type, filters):
    """Concatenate the three pieces of an OpenAlex request URL."""
    return base_URL + entity_type + filters


def get_data(url):
    """GET *url* and return the parsed JSON payload as a Python object."""
    response = requests.get(url)
    return json.loads(response.text)


## selecting the ids we want
def author_id_from_name(name):
    """Search OpenAlex for authors matching *name* (title-cased first).

    Returns a 2-tuple:
      * a DataFrame indexed by OpenAlex author id with columns
        author_name, number_of_works, number_of_citations, field_of_study
        (up to five concepts), last_known_institution, country_code, orcid;
      * a list of (author_id, "name, institution, field, Number of works: N")
        pairs, suitable for a selection widget.
    """
    dic_names = {}
    name = name.title()
    search_url = ("https://api.openalex.org/authors?search=" + name
                  + "&per_page=200&" + MAILTO)
    data = get_data(search_url)["results"]
    for record in data:
        author_id = record["id"][OPENALEX_PREFIX_LEN:]
        entry = {
            "author_name": record["display_name"],
            "number_of_works": record["works_count"],
            "number_of_citations": record["cited_by_count"],
            # keep at most the first five concepts as the field of study
            "field_of_study": ", ".join(
                concept["display_name"]
                for concept in record["x_concepts"][:5]
            ),
            "last_known_institution": None,
            "country_code": None,
        }
        institutions = record["last_known_institutions"]
        if institutions is not None and len(institutions) > 0:
            entry["last_known_institution"] = institutions[0]["display_name"]
            entry["country_code"] = institutions[0]["country_code"]
        entry["orcid"] = record["orcid"]
        dic_names[author_id] = entry
    summaries = [
        (
            k,
            ", ".join([
                elem["author_name"],
                str(elem["last_known_institution"]),
                str(elem["field_of_study"].split(", ")[0]),
                "Number of works: " + str(elem["number_of_works"]),
            ]),
        )
        for k, elem in dic_names.items()
    ]
    return pd.DataFrame(dic_names).T, summaries


def _update_worker(dic_workers, authorship, counter):
    """Create/refresh one collaborator entry from an OpenAlex authorship.

    *counter* is "co_authors" or "citations"; that count is incremented.
    The first non-empty institution seen wins (name / country / id are
    only filled while still None), matching the original accumulation.
    """
    author_id = authorship["author"]["id"][OPENALEX_PREFIX_LEN:]
    if author_id not in dic_workers:
        dic_workers[author_id] = {
            "author_name": authorship["author"]["display_name"],
            "co_authors": 0,
            "citations": 0,
            "institution": None,
            "country_code": None,
            "id": None,
            "longitude": None,
            "latitude": None,
        }
    entry = dic_workers[author_id]
    institutions = authorship["institutions"]
    if (entry["institution"] is None and institutions
            and "display_name" in institutions[0]):
        entry["institution"] = institutions[0]["display_name"]
    if (entry["country_code"] is None and institutions
            and "country_code" in institutions[0]):
        entry["country_code"] = institutions[0]["country_code"]
    if (entry["id"] is None and institutions
            and "id" in institutions[0]
            and institutions[0]["id"] is not None):
        entry["id"] = institutions[0]["id"][OPENALEX_PREFIX_LEN:]
    entry[counter] += 1


def from_author_id(main_author_ids, year, country_code, size):
    """Collect co-authors and citing authors of the given OpenAlex authors.

    Parameters
    ----------
    main_author_ids : iterable of bare OpenAlex author ids (no URL prefix)
    year : only works published strictly after this year are considered
    country_code : False for no filtering; a continent name present in
        dic_country_codes; or a two-letter country code
    size : number of rows kept in each returned DataFrame

    Returns
    -------
    (res, res_geo) : two DataFrames of collaborators sorted by citation
        count; res_geo additionally carries longitude/latitude and is
        restricted to rows with known coordinates.
    """
    dic_main_workers = {}
    for main_author_id in main_author_ids:
        works_url = ("https://api.openalex.org/works?filter=author.id:"
                     + main_author_id + ",publication_year:>" + str(year)
                     + "&per_page=200&" + MAILTO)
        try:
            data = get_data(works_url)["results"]
            for work in tqdm(data):
                work_id = work["id"][OPENALEX_PREFIX_LEN:]
                # co-authors of this work
                for authorship in work["authorships"]:
                    author_id = authorship["author"]["id"][OPENALEX_PREFIX_LEN:]
                    if author_id != main_author_id:
                        _update_worker(dic_main_workers, authorship,
                                       "co_authors")
                # authors of works that cite this work
                citations_url = ("https://api.openalex.org/works?filter="
                                 "referenced_works:" + work_id
                                 + ",publication_year:>" + str(year)
                                 + "&per_page=200&" + MAILTO)
                try:
                    citations_data = get_data(citations_url)["results"]
                    for citing_work in citations_data:
                        for authorship in citing_work["authorships"]:
                            citing_id = (authorship["author"]["id"]
                                         [OPENALEX_PREFIX_LEN:])
                            if citing_id != main_author_id:
                                _update_worker(dic_main_workers, authorship,
                                               "citations")
                except Exception:
                    # best effort: a failed citation lookup must not
                    # abort the whole scan
                    pass
        except Exception:
            # best effort: skip authors whose work list cannot be fetched
            pass

    # Attach coordinates from the local institutions table.
    for entry in dic_main_workers.values():
        institution_id = entry["id"]
        if institution_id is not None and institution_id in dic_institutions:
            geo_data = dic_institutions[institution_id]
            entry["longitude"] = geo_data["longitude"]
            entry["latitude"] = geo_data["latitude"]

    # Build the sorted table once and derive both views from it
    # (the original constructed and sorted the same DataFrame twice).
    res_geo = (pd.DataFrame(dic_main_workers).T
               .sort_values("citations", ascending=False)
               [['author_name', 'co_authors', 'citations', 'institution',
                 'country_code', 'longitude', 'latitude']])
    res = res_geo[['author_name', 'co_authors', 'citations', 'institution',
                   'country_code']]
    res_geo = res_geo[res_geo["longitude"].notnull()]

    # country_code may be the sentinel False (no filter); keep the original
    # `!= False` comparison so None/"" behave exactly as before.
    if country_code != False:
        if country_code in ["Europe", "North America", "Asia",
                            "South America", "Oceania", "Africa"]:
            codes = dic_country_codes[country_code]
            return (res[res["country_code"].isin(codes)].head(size),
                    res_geo[res_geo["country_code"].isin(codes)].head(size))
        return (res[res["country_code"] == country_code].head(size),
                res_geo[res_geo["country_code"] == country_code].head(size))
    return res.head(size), res_geo.head(size)