|
|
|
|
|
from shapely.geometry import Point |
|
|
import pandas as pd |
|
|
from tqdm import tqdm |
|
|
import streamlit as st |
|
|
import numpy as np |
|
|
import json, requests |
|
|
import pandas as pd |
|
|
|
|
|
|
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
from math import radians, cos, sin, asin, sqrt |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
@st.cache_resource
def model_nlp():
    """Return the sentence-embedding model used to compare texts.

    The function is wrapped in ``st.cache_resource``; it builds a
    ``SentenceTransformer`` for the 'all-MiniLM-L6-v2' checkpoint.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
|
import unicodedata |
|
|
|
|
|
from metaphone import doublemetaphone |
|
|
from fuzzywuzzy import fuzz |
|
|
from difflib import SequenceMatcher |
|
|
import re |
|
|
|
|
|
import geopandas as gpd |
|
|
from geopandas import GeoDataFrame |
|
|
|
|
|
# Relative directory prefix prepended to the data file name read by load_data().
path = "Climate_site/python_scripts/"
|
|
|
|
|
|
|
|
@st.cache_data
def load_data():
    """Read the institutions table and return it as a nested dictionary.

    The tab-separated file ``institutions.tsv`` (located under ``path``) is
    indexed by its second column; the result maps each index value to a
    ``{column name: value}`` dictionary for that row.
    """
    institutions_file = path + "institutions.tsv"
    table = pd.read_csv(institutions_file, delimiter="\t", index_col=1)
    return table.to_dict('index')
|
|
|
|
|
# Institution lookup table built once at import time; main_authors() and
# map_authors() read the "longitude" / "latitude" entries from it.
dic_institutions = load_data()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def URL(base_URL, entity_type, filters):
    """Concatenate the three URL parts, in order, into one request URL."""
    return base_URL + entity_type + filters
|
|
|
|
|
|
|
|
def get_data(url):
    """Fetch *url* with an HTTP GET and return the decoded JSON body.

    Args:
        url: Address to request.

    Returns:
        Whatever ``json.loads`` produces for the response text (a dict for
        the OpenAlex endpoints used in this file).

    Raises:
        requests.exceptions.RequestException: if the request fails or does
            not complete within the timeout.
        json.JSONDecodeError: if the response body is not valid JSON.
    """
    # Without a timeout a stalled connection would block the caller forever.
    response = requests.get(url, timeout=30)
    return json.loads(response.text)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def reconstruction_abstract(abstract_inverted_index):
    """Rebuild an abstract from an inverted index.

    Args:
        abstract_inverted_index: Mapping ``word -> list of positions`` at
            which the word occurs, or ``None``.

    Returns:
        The words placed at their positions and joined with single spaces.
        Positions that no word claims are left as empty strings, so a gap
        shows up as consecutive spaces. Returns ``None`` when the index is
        ``None``, empty, or contains no positions at all.
    """
    if not abstract_inverted_index:
        return None

    positions = [
        pos
        for occurrences in abstract_inverted_index.values()
        for pos in occurrences
    ]
    # Every word may carry an empty position list; there is nothing to place.
    if not positions:
        return None

    words = [""] * (max(positions) + 1)
    for word, occurrences in abstract_inverted_index.items():
        for pos in occurrences:
            words[pos] = word

    return " ".join(words)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def norm(vector):
    """Return the Euclidean length of *vector* (square root of the sum of squares)."""
    squared_components = (component * component for component in vector)
    return np.sqrt(sum(squared_components))
|
|
|
|
|
def cosine_similarity2(vec_a, vec_b):
    """Return the cosine of the angle between *vec_a* and *vec_b*.

    The dot product is taken over the pairs produced by ``zip`` and is
    divided by the product of the two vectors' norms.
    """
    dot_product = sum(a * b for a, b in zip(vec_a, vec_b))
    length_a = norm(vec_a)
    length_b = norm(vec_b)
    return dot_product / (length_a * length_b)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def related_papers_own_research(research_key_words, display):
    """Search OpenAlex works for *research_key_words* and summarise each hit.

    Args:
        research_key_words: Search string; spaces are replaced by ``%20``
            before it is placed in the query.
        display: When truthy, print the total hit count, the keywords and
            the request URL (debugging aid).

    Returns:
        A dict keyed by work id. Every entry holds ``title``, ``abstract``
        (rebuilt with ``reconstruction_abstract``), ``concepts``, ``date``,
        ``authorships`` and ``cited_by_count``. Works with at least one
        authorship additionally get ``countries``, ``institutions`` and
        ``authors``; the latter two are ", "-joined strings built from each
        authorship's first listed institution and author display name.
    """
    base_URL_OA = "https://api.openalex.org/"
    filter_works = "works?"
    filter_openalex = (
        "search=" + research_key_words
        + "&per_page=200&mailto=emma_scharfmann@berkeley.edu"
    )
    # Only spaces are escaped; any other character is sent exactly as typed.
    filter_openalex = filter_openalex.replace(" ", "%20")

    url = URL(base_URL_OA, filter_works, filter_openalex)
    data = get_data(url)

    if display:
        # The previous version printed an undefined name here and therefore
        # raised NameError whenever display was enabled.
        print(data["meta"]["count"], research_key_words)
        print(url)

    dic = {}
    for work in data["results"]:
        authorships = work["authorships"]
        entry = {
            "title": work["title"],
            "abstract": reconstruction_abstract(work["abstract_inverted_index"]),
            "concepts": work["concepts"],
            "date": work["publication_date"],
            "authorships": authorships,
            "cited_by_count": work["cited_by_count"],
        }
        dic[work["id"]] = entry

        # Author-derived keys are only added when the work lists authors;
        # callers test for their presence with ``in``.
        if not authorships:
            continue

        first_institutions = authorships[0]["institutions"]
        if first_institutions:
            entry["countries"] = first_institutions[0]["country_code"]
            entry["institutions"] = first_institutions[0]["display_name"]
        else:
            entry["countries"] = ""
            entry["institutions"] = ""
        entry["authors"] = authorships[0]["author"]["display_name"]

        for authorship in authorships[1:]:
            institutions = authorship["institutions"]
            if institutions:
                entry["institutions"] += ", " + institutions[0]["display_name"]
                # NOTE(review): assigned, not appended, so "countries" ends
                # up holding only the last affiliated co-author's country
                # code - confirm whether a joined list was intended.
                entry["countries"] = institutions[0]["country_code"]
            entry["authors"] += ", " + authorship["author"]["display_name"]

    if display:
        print(" ")

    return dic
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def ranking_own_research(research_key_words, details, display):
    """Score the papers found for *research_key_words* against *details*.

    Each paper's title and abstract are embedded with the model returned by
    ``model_nlp`` and compared to the embedding of *details* using
    ``cosine_similarity2``.

    Args:
        research_key_words: Search string passed to
            ``related_papers_own_research``.
        details: Reference text the papers are compared with.
        display: When truthy, print the reference text.

    Returns:
        A tuple ``(dic, dic_scores_papers)``. ``dic`` is the raw result of
        ``related_papers_own_research``. ``dic_scores_papers`` maps the same
        work ids to a dict with ``title comparison``, ``abstract comparison``
        (``None`` when the text is missing), ``title``, ``citations``,
        ``date`` (first four characters of the publication date, or ``None``),
        ``institutions`` (``None`` when absent) and, when the paper has
        authors, ``number of co-authors`` and ``authors``.
    """
    model = model_nlp()
    dic = related_papers_own_research(research_key_words, False)

    if display:
        print("Technology details: ", details)
        print(" ")

    encoded_text = model.encode(
        details, convert_to_tensor=False, show_progress_bar=False
    ).tolist()

    def similarity_to_reference(text):
        # A missing title/abstract cannot be scored.
        if text is None:
            return None
        encoded = model.encode(
            text, convert_to_tensor=False, show_progress_bar=False
        ).tolist()
        return cosine_similarity2(encoded, encoded_text)

    dic_scores_papers = {}
    for ids, paper in dic.items():
        date = paper["date"]
        scores = {
            "title comparison": similarity_to_reference(paper["title"]),
            "abstract comparison": similarity_to_reference(paper["abstract"]),
            "title": paper["title"],
            "citations": paper["cited_by_count"],
            # Guard against a null publication date instead of slicing None.
            "date": date[:4] if date else None,
            "institutions": paper.get("institutions"),
        }
        if "authors" in paper:
            # Count the authorship records themselves: splitting the joined
            # string on "," over-counts names that contain a comma.
            scores["number of co-authors"] = len(paper["authorships"])
            scores["authors"] = paper["authors"]
        dic_scores_papers[ids] = scores

    return dic, dic_scores_papers
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def get_ranking_own_research(research_key_words, details, display, size):
    """Return the *size* papers whose abstracts best match *details*.

    The scores come from ``ranking_own_research``. An empty score dict
    yields the message "No paper found"; a string result is passed through
    unchanged; otherwise the scores are turned into a DataFrame (one row per
    paper) sorted by "abstract comparison" in descending order and cut to
    the first *size* rows.
    """
    _, dic_scores_papers = ranking_own_research(research_key_words, details, display)

    if dic_scores_papers == {}:
        return "No paper found"

    if type(dic_scores_papers) == str:
        return dic_scores_papers

    ranking = pd.DataFrame(dic_scores_papers).T
    ranking = ranking.sort_values(by="abstract comparison", ascending=False)
    return ranking.head(size)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_quantitative_data_paper(work_id):
    """Fetch one OpenAlex work and return its main descriptive fields.

    Args:
        work_id: Identifier appended to the OpenAlex works endpoint
            (converted with ``str``).

    Returns:
        A tuple ``(url_google, title, abstract, date, authors, institutions)``
        where ``url_google`` is the explore.openalex.org link for the work,
        ``abstract`` is rebuilt with ``reconstruction_abstract``, ``authors``
        is the ", "-joined list of author display names and ``institutions``
        is the ", "-joined list of distinct first-listed institutions, in
        order of first appearance.
    """
    work_id = str(work_id)
    url = "https://api.openalex.org/works/" + work_id
    url_google = "https://explore.openalex.org/works/" + work_id

    data = get_data(url)
    date = data["publication_date"]
    title = data["title"]
    abstract = reconstruction_abstract(data["abstract_inverted_index"])
    authors = ", ".join(
        authorship["author"]["display_name"] for authorship in data["authorships"]
    )

    # dict.fromkeys removes duplicates while keeping first-seen order; a set
    # would make the order of the joined string vary between runs.
    first_institutions = dict.fromkeys(
        authorship["institutions"][0]["display_name"]
        for authorship in data["authorships"]
        if len(authorship["institutions"]) > 0
    )
    institutions = ", ".join(first_institutions)

    return url_google, title, abstract, date, authors, institutions
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Lower-case name particles that ln_suff_merge() glues onto the word that
# follows them (e.g. "van der berg" -> "vanderberg").
# The order matters: ln_suff_merge() stops at the first entry that matches.
ln_suff = ['oster',
           'nordre',
           'vaster',
           'aust',
           'vesle',
           'da',
           'van t',
           'af',
           'al',
           'setya',
           'zu',
           'la',
           'na',
           'mic',
           'ofver',
           'el',
           'vetle',
           'van het',
           'dos',
           'ui',
           'vest',
           'ab',
           'vste',
           'nord',
           'van der',
           'bin',
           'ibn',
           'war',
           'fitz',
           'alam',
           'di',
           'erch',
           'fetch',
           'nga',
           'ka',
           'soder',
           'lille',
           'upp',
           'ua',
           'te',
           'ni',
           'bint',
           'von und zu',
           'vast',
           'vestre',
           'over',
           'syd',
           'mac',
           'nin',
           'nic',
           'putri',
           'bet',
           'verch',
           'norr',
           'bath',
           'della',
           'van',
           'ben',
           'du',
           'stor',
           'das',
           'neder',
           'abu',
           'degli',
           'vre',
           'ait',
           'ny',
           'opp',
           'pour',
           'kil',
           'der',
           'oz',
           'von',
           'at',
           'nedre',
           'van den',
           'setia',
           'ap',
           'gil',
           'myljom',
           'van de',
           'stre',
           'dele',
           'mck',
           'de',
           'mellom',
           'mhic',
           'binti',
           'ath',
           'binte',
           'snder',
           'sre',
           'ned',
           'ter',
           'bar',
           'le',
           'mala',
           'ost',
           'syndre',
           'sr',
           'bat',
           'sndre',
           'austre',
           'putra',
           'putera',
           'av',
           'lu',
           'vetch',
           'ver',
           'puteri',
           'mc',
           'tre',
           'st']
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Generational suffixes that name_delete() strips from an author name.
name_del = ['2nd', '3rd', 'Jr', 'Jr.', 'Junior', 'Sr', 'Sr.', 'Senior']
|
|
|
|
|
|
|
|
def name_delete(string, suffixes=None):
    """Remove generational suffixes ("Jr", "Sr.", "3rd", ...) from a name.

    A suffix is removed, together with the space in front of it, only when
    it stands as a whole token: it must be preceded by a space and must not
    be followed by a letter, digit or underscore. This keeps surnames such
    as "Srinivasan" intact, which a plain substring replacement would
    truncate.

    Args:
        string: The name to clean.
        suffixes: Suffixes to remove; defaults to the module-level
            ``name_del`` list.

    Returns:
        The name without the matching suffix tokens, or the input unchanged
        when nothing matches.
    """
    if suffixes is None:
        suffixes = name_del

    # Longest alternatives first so that "Jr." is preferred over "Jr" and no
    # stray "." is left behind.
    alternatives = "|".join(
        re.escape(suffix) for suffix in sorted(suffixes, key=len, reverse=True)
    )
    if not alternatives:
        return string

    return re.sub(rf" (?:{alternatives})(?!\w)", "", string)
|
|
|
|
|
def ln_suff_merge(string, suffixes=None):
    """Glue the first matching name particle onto the word that follows it.

    The particles are tried in list order. The first one that occurs as a
    whole word followed by a space (at the start of the string or after a
    space) is merged with the next word, with any spaces inside the particle
    removed as well, and the result is returned immediately. Only whole-word
    occurrences are rewritten, so a word that merely ends with the particle
    (e.g. "linda" for "da") is left alone.

    Args:
        string: Normalized, lower-case name.
        suffixes: Particles to look for; defaults to the module-level
            ``ln_suff`` list.

    Returns:
        The name with the particle merged, or the input unchanged when no
        particle matches.
    """
    if suffixes is None:
        suffixes = ln_suff

    for suff in suffixes:
        # (?<![^ ]) is true at the start of the string or right after a space.
        pattern = rf"(?<![^ ]){re.escape(suff)} "
        if re.search(pattern, string):
            merged = suff.replace(" ", "")
            # A callable keeps the replacement text literal.
            return re.sub(pattern, lambda _match: merged, string)
    return string
|
|
|
|
|
|
|
|
|
|
|
def normalize(data):
    """Return a simplified, lower-case ASCII form of the name *data*.

    Steps, in order: decompose with NFKD and drop everything that is not
    ASCII; strip generational suffixes with ``name_delete``; lower-case runs
    of three or more capitals; insert a space before a capital that follows
    a word character; lower-case the whole string; turn every run of
    characters other than letters, digits and spaces into a single space;
    collapse repeated spaces; trim; finally merge name particles with
    ``ln_suff_merge``.
    """
    ascii_bytes = unicodedata.normalize('NFKD', data).encode('ASCII', 'ignore')
    text = name_delete(ascii_bytes.decode("utf-8"))

    # "SMITH" -> "smith", so the camel-case split below leaves it in one piece.
    text = re.sub(r"[A-Z]{3,}", lambda x: x.group().lower(), text)
    # "McDonald" -> "Mc Donald"
    text = re.sub(r"(\w)([A-Z])", r"\1 \2", text).lower()

    text = re.sub('[^A-Za-z0-9 ]+', ' ', text)
    text = re.sub(' +', ' ', text).strip()

    return ln_suff_merge(text)
|
|
|
|
|
|
|
|
def _collect_coauthors(dic_papers, paper_ids, occurrence_key):
    """Group the authorships of *paper_ids* by normalized "first last" name."""
    coauthors = {}
    for paper in paper_ids:
        citations = dic_papers[paper]["cited_by_count"]
        for authorship in dic_papers[paper]["authorships"]:
            author = authorship["author"]
            author_name = author["display_name"]
            if not author_name:
                continue

            tokens = normalize(author_name).split()
            # A name with no ASCII letters or digits normalizes to nothing
            # and cannot be used as a grouping key.
            if not tokens:
                continue
            key = tokens[0] + " " + tokens[-1]

            # [21:] drops the leading "https://openalex.org/" from the id URL.
            author_id = author["id"][21:] if author["id"] is not None else ""

            if key not in coauthors:
                coauthors[key] = {
                    "Author's name(s)": author_name,
                    "Author's id(s)": author_id,
                    occurrence_key: 1,
                    "Number of related citations": citations,
                }
                continue

            record = coauthors[key]
            record[occurrence_key] += 1
            if author_name not in record["Author's name(s)"]:
                record["Author's name(s)"] += ", " + author_name
            # Compare whole ids: a substring test would treat an id that is a
            # prefix of an already stored one as a duplicate.
            if author_id and author_id not in record["Author's id(s)"].split(", "):
                record["Author's id(s)"] += ", " + author_id
            record["Number of related citations"] += citations
    return coauthors


def _add_author_profiles(coauthors):
    """Add per-author totals from OpenAlex in place; drop authors with 0 or more than 10000 works."""
    for key in list(coauthors):
        record = coauthors[key]
        work_count = 0
        cited_by_count = 0
        institution_names = []
        institution_id = None

        for author_id in record["Author's id(s)"].split(", "):
            if len(author_id) <= 3:
                continue
            try:
                data = get_data("https://api.openalex.org/people/" + author_id)
                work_count += data["works_count"]
                cited_by_count += data["cited_by_count"]

                last_institution = data.get("last_known_institution")
                if last_institution is None:
                    # NOTE(review): newer OpenAlex author payloads appear to
                    # expose a "last_known_institutions" list instead of the
                    # single object - confirm against the API documentation.
                    candidates = data.get("last_known_institutions") or []
                    last_institution = candidates[0] if candidates else None

                if last_institution is not None:
                    if last_institution["id"] is not None:
                        institution_id = last_institution["id"][21:]
                    if last_institution["display_name"] is not None:
                        institution_names.append(last_institution["display_name"])
            except Exception:
                # Deliberately best effort: one failed or malformed profile
                # must not abort the whole ranking.
                continue

        if work_count == 0 or work_count > 10000:
            coauthors.pop(key)
            continue

        record["Last Known Institution"] = ", ".join(institution_names)
        record["Number of works"] = work_count
        record["Number of citations"] = cited_by_count
        record["Institution_id"] = institution_id


def _coauthors_map_frame(coauthors):
    """Build the map DataFrame for the authors whose institution is in ``dic_institutions``."""
    rows = {}
    for record in coauthors.values():
        institution_id = record["Institution_id"]
        if institution_id not in dic_institutions:
            continue
        location = dic_institutions[institution_id]
        rows[len(rows)] = {
            "longitude": location["longitude"],
            "latitude": location["latitude"],
            "author": record["Author's name(s)"],
            "institution": record["Last Known Institution"],
        }

    map_df = pd.DataFrame(rows).T
    # An empty frame has no coordinate columns to convert (same guard as in
    # map_authors).
    if not rows:
        return map_df

    map_df["longitude"] = map_df["longitude"].astype(float)
    map_df["latitude"] = map_df["latitude"].astype(float)
    return map_df[map_df["latitude"].notnull()]


def main_authors(research_key_words, details, size):
    """List the authors who appear most often among the related papers.

    Args:
        research_key_words: Search string for ``ranking_own_research``.
        details: Reference text for ``ranking_own_research``.
        size: Number of papers taken into account and maximum number of
            authors returned.

    Returns:
        ``"Select another category"`` when no author is found or every
        author is filtered out. Otherwise a tuple ``(table, map_df)``:
        ``table`` is a pandas Styler (index hidden) with one row per author,
        ordered by number of occurrences, and ``map_df`` is a DataFrame with
        ``longitude``, ``latitude``, ``author`` and ``institution`` for the
        authors whose institution is present in ``dic_institutions``
        (empty when there is none).
    """
    dic_papers, dic_papers_ranked = ranking_own_research(research_key_words, details, False)

    occurrence_key = "Number of occurence within the " + str(size) + " most related papers"

    # NOTE(review): the first `size` papers are taken in the order returned
    # by ranking_own_research, which is not sorted by similarity score -
    # confirm this is intended.
    paper_ids = list(dic_papers_ranked.keys())[:size]
    coauthors = _collect_coauthors(dic_papers, paper_ids, occurrence_key)
    if not coauthors:
        return "Select another category"

    _add_author_profiles(coauthors)
    if not coauthors:
        return "Select another category"

    ranked_authors = sorted(
        coauthors.items(),
        key=lambda item: item[1][occurrence_key],
        reverse=True,
    )[:size]
    coauthors = dict(ranked_authors)

    map_df = _coauthors_map_frame(coauthors)

    # The row label must equal the key used while counting; a hard-coded
    # "200" produced an all-NaN column for any other size.
    columns = [
        "Author's name(s)",
        "Author's id(s)",
        occurrence_key,
        "Last Known Institution",
        "Number of works",
        "Number of citations",
        "Number of related citations",
    ]
    table = pd.DataFrame(coauthors, index=columns).T.style.hide(axis="index")
    return table, map_df
|
|
|
|
|
|
|
|
def map_authors(research_key_words, details, size):
    """Return one map row per located authorship of the first *size* papers.

    For every authorship whose first institution has an id found in
    ``dic_institutions``, a row is created with the institution's
    ``longitude`` and ``latitude``, the paper's "abstract comparison" score,
    the author's display name, the institution's display name and the
    paper's date. Coordinates are converted to float and rows without a
    latitude are dropped. An empty DataFrame is returned when nothing can
    be located.
    """
    papers, scores = ranking_own_research(research_key_words, details, False)

    rows = {}
    for paper_id in list(scores.keys())[:size]:
        paper = papers[paper_id]
        for authorship in paper["authorships"]:
            affiliations = authorship["institutions"]
            if affiliations == []:
                continue

            first_affiliation = affiliations[0]
            if "id" not in first_affiliation or first_affiliation["id"] is None:
                continue

            # [21:] drops the leading "https://openalex.org/" from the id URL.
            institution_id = first_affiliation["id"][21:]
            if institution_id not in dic_institutions:
                continue

            location = dic_institutions[institution_id]
            rows[len(rows)] = {
                "longitude": location["longitude"],
                "latitude": location["latitude"],
                "abstract comparison": scores[paper_id]["abstract comparison"],
                "author": authorship["author"]["display_name"],
                "institution": first_affiliation["display_name"],
                "date": paper["date"],
            }

    map_df = pd.DataFrame(rows).T

    # Nothing located: the empty frame has no coordinate columns to convert.
    if rows == {}:
        return map_df

    map_df["longitude"] = map_df["longitude"].astype(float)
    map_df["latitude"] = map_df["latitude"].astype(float)
    located = map_df[map_df["latitude"].notnull()]

    return located
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|