## Packages
import json

import pandas as pd
import requests
import streamlit as st
from tqdm import tqdm

path = "Climate_site/python_scripts/"

# OpenAlex entity URLs look like "https://openalex.org/A1234567890";
# the bare id starts right after this prefix (21 characters).
OPENALEX_PREFIX_LEN = len("https://openalex.org/")
# Polite-pool contact parameter appended to every OpenAlex request.
MAILTO = "mailto=emma_scharfmann@berkeley.edu"


@st.cache_data  # 👈 cache so the TSV is parsed only once per session
def load_data():
    """Load the institutions TSV into a dict keyed by institution id.

    Returns a mapping {institution_id: row-dict}; rows carry at least
    "longitude" and "latitude" (used below to geolocate collaborators).
    """
    url = path + "institutions.tsv"
    dic = pd.read_csv(url, delimiter="\t", index_col=1).to_dict("index")
    return dic


dic_institutions = load_data()

# ISO-3166 alpha-2 codes grouped by continent, used to filter results.
dic_country_codes = {
    'Europe': ['AD', 'AL', 'AT', 'BE', 'BG', 'BY', 'CZ', 'DE', 'DK', 'EE',
               'FI', 'FR', 'GR', 'HU', 'IE', 'IS', 'IT', 'LI', 'LT', 'LU',
               'LV', 'MK', 'MT', 'NL', 'NO', 'PL', 'PT', 'RO', 'RU', 'SE',
               'SI', 'SK', 'SM', 'UA', 'VA', 'BA', 'HR', 'MD', 'MC', 'ME',
               'RS', 'ES', 'CH', 'GB'],
    'Asia': ['AF', 'AM', 'AZ', 'BD', 'BH', 'BN', 'BT', 'CN', 'CY', 'GE',
             'ID', 'IL', 'IN', 'IQ', 'IR', 'JO', 'JP', 'KG', 'KP', 'KR',
             'KW', 'LB', 'MM', 'MN', 'MV', 'MY', 'NP', 'OM', 'PH', 'PK',
             'QA', 'SA', 'SG', 'SY', 'TH', 'TJ', 'TM', 'TR', 'UZ', 'VN',
             'YE', 'KH', 'TL', 'KZ', 'LA', 'LK', 'AE'],
    # NOTE: a duplicated 'GT' entry was removed here (harmless for .isin
    # but a data defect nonetheless).
    'North America': ['AG', 'BB', 'BS', 'BZ', 'CA', 'CR', 'CU', 'DM', 'DO',
                      'GT', 'HN', 'JM', 'MX', 'NI', 'PA', 'TT', 'US', 'SV',
                      'GD', 'KN', 'LC', 'VC'],
    'Africa': ['AO', 'BF', 'BI', 'BJ', 'BW', 'CD', 'CG', 'CI', 'CM', 'CV',
               'DJ', 'EG', 'ER', 'ET', 'GA', 'GH', 'GM', 'GN', 'GW', 'KE',
               'LR', 'LS', 'LY', 'MG', 'ML', 'MR', 'MU', 'MW', 'MZ', 'NA',
               'NE', 'NG', 'RW', 'SC', 'SD', 'SL', 'SN', 'SO', 'ST', 'TG',
               'TN', 'TZ', 'UG', 'ZM', 'ZW', 'DZ', 'CF', 'TD', 'KM', 'GQ',
               'MA', 'ZA', 'SZ'],
    'South America': ['AR', 'BO', 'BR', 'CL', 'CO', 'EC', 'GY', 'PE', 'PY',
                      'SR', 'UY', 'VE'],
    'Oceania': ['AU', 'FJ', 'KI', 'MH', 'NR', 'NZ', 'PG', 'PW', 'SB', 'TO',
                'TV', 'VU', 'FM', 'WS'],
}


#################### General Functions #############################

def URL(base_URL, entity_type, filters):
    """Concatenate the three pieces of an OpenAlex request URL."""
    return base_URL + entity_type + filters


def get_data(url):
    """GET *url* and return the parsed JSON payload as a Python object."""
    response = requests.get(url)
    return json.loads(response.text)


## selecting the ids we want
def author_id_from_name(name):
    """Search OpenAlex for authors matching *name* (title-cased first).

    Returns a 2-tuple:
      * a DataFrame indexed by OpenAlex author id with columns
        author_name, number_of_works, number_of_citations, field_of_study
        (up to five concepts), last_known_institution, country_code, orcid;
      * a list of (author_id, "name, institution, field, Number of works: N")
        pairs, suitable for a selection widget.
    """
    dic_names = {}
    name = name.title()
    search_url = ("https://api.openalex.org/authors?search=" + name
                  + "&per_page=200&" + MAILTO)
    data = get_data(search_url)["results"]
    for record in data:
        author_id = record["id"][OPENALEX_PREFIX_LEN:]
        entry = {
            "author_name": record["display_name"],
            "number_of_works": record["works_count"],
            "number_of_citations": record["cited_by_count"],
            # keep at most the first five concepts as the field of study
            "field_of_study": ", ".join(
                concept["display_name"]
                for concept in record["x_concepts"][:5]
            ),
            "last_known_institution": None,
            "country_code": None,
        }
        institutions = record["last_known_institutions"]
        if institutions is not None and len(institutions) > 0:
            entry["last_known_institution"] = institutions[0]["display_name"]
            entry["country_code"] = institutions[0]["country_code"]
        entry["orcid"] = record["orcid"]
        dic_names[author_id] = entry
    summaries = [
        (
            k,
            ", ".join([
                elem["author_name"],
                str(elem["last_known_institution"]),
                str(elem["field_of_study"].split(", ")[0]),
                "Number of works: " + str(elem["number_of_works"]),
            ]),
        )
        for k, elem in dic_names.items()
    ]
    return pd.DataFrame(dic_names).T, summaries


def _update_worker(dic_workers, authorship, counter):
    """Create/refresh one collaborator entry from an OpenAlex authorship.

    *counter* is "co_authors" or "citations"; that count is incremented.
    The first non-empty institution seen wins (name / country / id are
    only filled while still None), matching the original accumulation.
    """
    author_id = authorship["author"]["id"][OPENALEX_PREFIX_LEN:]
    if author_id not in dic_workers:
        dic_workers[author_id] = {
            "author_name": authorship["author"]["display_name"],
            "co_authors": 0,
            "citations": 0,
            "institution": None,
            "country_code": None,
            "id": None,
            "longitude": None,
            "latitude": None,
        }
    entry = dic_workers[author_id]
    institutions = authorship["institutions"]
    if (entry["institution"] is None and institutions
            and "display_name" in institutions[0]):
        entry["institution"] = institutions[0]["display_name"]
    if (entry["country_code"] is None and institutions
            and "country_code" in institutions[0]):
        entry["country_code"] = institutions[0]["country_code"]
    if (entry["id"] is None and institutions
            and "id" in institutions[0]
            and institutions[0]["id"] is not None):
        entry["id"] = institutions[0]["id"][OPENALEX_PREFIX_LEN:]
    entry[counter] += 1


def from_author_id(main_author_ids, year, country_code, size):
    """Collect co-authors and citing authors of the given OpenAlex authors.

    Parameters
    ----------
    main_author_ids : iterable of bare OpenAlex author ids (no URL prefix)
    year : only works published strictly after this year are considered
    country_code : False for no filtering; a continent name present in
        dic_country_codes; or a two-letter country code
    size : number of rows kept in each returned DataFrame

    Returns
    -------
    (res, res_geo) : two DataFrames of collaborators sorted by citation
        count; res_geo additionally carries longitude/latitude and is
        restricted to rows with known coordinates.
    """
    dic_main_workers = {}
    for main_author_id in main_author_ids:
        works_url = ("https://api.openalex.org/works?filter=author.id:"
                     + main_author_id + ",publication_year:>" + str(year)
                     + "&per_page=200&" + MAILTO)
        try:
            data = get_data(works_url)["results"]
            for work in tqdm(data):
                work_id = work["id"][OPENALEX_PREFIX_LEN:]
                # co-authors of this work
                for authorship in work["authorships"]:
                    author_id = authorship["author"]["id"][OPENALEX_PREFIX_LEN:]
                    if author_id != main_author_id:
                        _update_worker(dic_main_workers, authorship,
                                       "co_authors")
                # authors of works that cite this work
                citations_url = ("https://api.openalex.org/works?filter="
                                 "referenced_works:" + work_id
                                 + ",publication_year:>" + str(year)
                                 + "&per_page=200&" + MAILTO)
                try:
                    citations_data = get_data(citations_url)["results"]
                    for citing_work in citations_data:
                        for authorship in citing_work["authorships"]:
                            citing_id = (authorship["author"]["id"]
                                         [OPENALEX_PREFIX_LEN:])
                            if citing_id != main_author_id:
                                _update_worker(dic_main_workers, authorship,
                                               "citations")
                except Exception:
                    # best effort: a failed citation lookup must not
                    # abort the whole scan
                    pass
        except Exception:
            # best effort: skip authors whose work list cannot be fetched
            pass

    # Attach coordinates from the local institutions table.
    for entry in dic_main_workers.values():
        institution_id = entry["id"]
        if institution_id is not None and institution_id in dic_institutions:
            geo_data = dic_institutions[institution_id]
            entry["longitude"] = geo_data["longitude"]
            entry["latitude"] = geo_data["latitude"]

    # Build the sorted table once and derive both views from it
    # (the original constructed and sorted the same DataFrame twice).
    res_geo = (pd.DataFrame(dic_main_workers).T
               .sort_values("citations", ascending=False)
               [['author_name', 'co_authors', 'citations', 'institution',
                 'country_code', 'longitude', 'latitude']])
    res = res_geo[['author_name', 'co_authors', 'citations', 'institution',
                   'country_code']]
    res_geo = res_geo[res_geo["longitude"].notnull()]

    # country_code may be the sentinel False (no filter); keep the original
    # `!= False` comparison so None/"" behave exactly as before.
    if country_code != False:
        if country_code in ["Europe", "North America", "Asia",
                            "South America", "Oceania", "Africa"]:
            codes = dic_country_codes[country_code]
            return (res[res["country_code"].isin(codes)].head(size),
                    res_geo[res_geo["country_code"].isin(codes)].head(size))
        return (res[res["country_code"] == country_code].head(size),
                res_geo[res_geo["country_code"] == country_code].head(size))
    return res.head(size), res_geo.head(size)