# Spaces: EmmaScharfmannBerkeley/Synapse_project (Sleeping)
# File size: 11,990 Bytes

##packages code

import streamlit as st
from shapely.geometry import Point
import pandas as pd
import re
import json


# Relative directory prefix (with trailing slash) that is concatenated with
# file names such as "iea.txt" when this module opens its data files.
path = 'Climate_site/python_scripts/'

@st.cache_data
def load_dic():
    """Load the related-companies mapping from ``related_companies.json``.

    The file is looked up under the module-level ``path`` prefix and the
    function is wrapped in ``st.cache_data``.

    Returns:
        The object decoded from the JSON file.
    """
    # The context manager closes the handle even when json.load raises; the
    # handle was previously left open.
    with open(path + "related_companies.json", "r") as f:
        return json.load(f)
    
@st.cache_data
def load_data():
    """Read ``preqin_companies_IEA.tsv`` into a DataFrame.

    The tab-separated file is read from under the module-level ``path``
    prefix with its first column as the index, and the
    ``portfolio_company_id`` column is cast to ``str``.

    Returns:
        pandas.DataFrame: the companies table.
    """
    tsv_location = path + "preqin_companies_IEA.tsv"
    companies = pd.read_csv(tsv_location, delimiter="\t", index_col=0)
    return companies.astype({'portfolio_company_id': 'str'})

# Load both data sets at import time; they are read as module-level globals
# by related_VC_deals below.
table_companies = load_data()
dic_companies = load_dic()


#################### General Functions #############################



def print_extracted_text(name_file):
    """Print every line of ``iea.txt`` with surrounding whitespace stripped.

    Args:
        name_file: Unused; kept so existing callers keep working. The file
            read is always ``iea.txt`` under the module-level ``path``.
    """
    # A stray ``iea.txt`` expression used to follow the loop and raised
    # NameError once printing finished; it is removed together with an
    # unused counter. The context manager closes the file on every path.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for line in file:
            print(line.strip())

def details(name_file , display):
    """Collect the "Details" text of each technology entry in ``iea.txt``.

    An entry begins on a header line: a non-empty line whose first character
    is numeric and that contains both ``>`` and a space-tab separator.
    Entries are numbered from 0 in file order. Collection starts after a
    line whose last space-tab separated field is ``Details`` or ``Hide`` and
    stops at the next non-empty line beginning with ``*``. Parsing ends at
    the line ``Close explanation``.

    A section that is still open when parsing ends (or when another
    ``Details``/``Hide`` line restarts collection) is discarded.

    Args:
        name_file: Unused; the file read is always ``iea.txt`` under the
            module-level ``path``.
        display: When truthy, print each entry number and its text as it is
            stored.

    Returns:
        dict: entry number -> collected text, where every collected line is
        stripped and followed by a single space.
    """
    # Read with an explicit utf8 encoding (as the sibling parsers do) and
    # close the handle as soon as the lines are in memory.
    with open(path + "iea.txt", "r", encoding='utf8') as file:
        lines = file.readlines()

    dic_details = {}
    collecting = False
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        if line != "" and line[0].isnumeric() and ">" in line and " \t" in line:
            count += 1

        # A line starting with "*" opens the next sub-section, so the
        # details text collected so far is complete.
        if collecting and line.startswith("*"):
            if display:
                print(count)
                print(text)
                print(" ")
            dic_details[count] = text
            collecting = False

        if collecting:
            text = text + line + " "

        # Checked last so the trigger line itself is never part of the text.
        if line.split(" \t")[-1] in ("Details", "Hide"):
            collecting = True
            text = ""

    return dic_details







def key_initiatives(name_file , display ):
    """Collect the "Key initiatives" text of each technology entry.

    An entry begins on a header line: a non-empty line whose first character
    is numeric and that contains both ``>`` and a space-tab separator.
    Entries are numbered from 0 in file order. Collection starts after the
    line ``*Key initiatives:*`` and stops at the next header line, or at
    ``*Deployment targets:*`` or ``*Announced development targets:*``.
    Parsing ends at the line ``Close explanation``.

    A section that is still open when parsing ends is discarded.

    Args:
        name_file: Unused; the file read is always ``iea.txt`` under the
            module-level ``path``.
        display: When truthy, print each entry number and its text as it is
            stored.

    Returns:
        dict: entry number -> collected text, where every collected line is
        stripped and followed by a single space.
    """
    # The context manager releases the handle; it was previously never closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    section_end_markers = ("*Deployment targets:*", "*Announced development targets:*")

    dic_key_initiatives = {}
    collecting = False
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        is_header = line != "" and line[0].isnumeric() and ">" in line and " \t" in line
        if is_header:
            count += 1

        # Note: when a header closes the section, count has already moved on
        # to the new entry, so the text is stored under the new number.
        if collecting and (is_header or line in section_end_markers):
            if display:
                print(count)
                print(text)
                print(" ")
            dic_key_initiatives[count] = text
            collecting = False

        if collecting:
            text = text + line + " "

        # Checked last so the marker line itself is never part of the text.
        if line == "*Key initiatives:*":
            collecting = True
            text = ""

    return dic_key_initiatives




def deployment_target(name_file , display):
    """Collect the deployment / development target text of each entry.

    An entry begins on a header line: a non-empty line whose first character
    is numeric and that contains both ``>`` and a space-tab separator.
    Entries are numbered from 0 in file order. Collection starts after
    ``*Deployment targets:*`` or ``*Announced development targets:*`` and
    stops at the next header line, at ``*Announced cost reduction targets:*``
    or at ``*Announced development targets:*``. The last marker both closes
    the current section and opens a new one, so a later section stored under
    the same entry number replaces the earlier text. Parsing ends at the
    line ``Close explanation``.

    A section that is still open when parsing ends is discarded.

    Args:
        name_file: Unused; the file read is always ``iea.txt`` under the
            module-level ``path``.
        display: When truthy, print each entry number and its text as it is
            stored.

    Returns:
        dict: entry number -> collected text, where every collected line is
        stripped and followed by a single space.
    """
    # The context manager releases the handle; it was previously never closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    section_start_markers = ("*Deployment targets:*", "*Announced development targets:*")
    section_end_markers = ("*Announced cost reduction targets:*", "*Announced development targets:*")

    dic_target = {}
    collecting = False
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        is_header = line != "" and line[0].isnumeric() and ">" in line and " \t" in line
        if is_header:
            count += 1

        # Note: when a header closes the section, count has already moved on
        # to the new entry, so the text is stored under the new number.
        if collecting and (is_header or line in section_end_markers):
            if display:
                print(count)
                print(text)
                print(" ")
            dic_target[count] = text
            collecting = False

        if collecting:
            text = text + line + " "

        # Checked last so the marker line itself is never part of the text.
        if line in section_start_markers:
            collecting = True
            text = ""

    return dic_target


 
    
def cost_reduction_target(name_file , display):
    """Collect the "Announced cost reduction targets" text of each entry.

    An entry begins on a header line: a non-empty line whose first character
    is numeric and that contains both ``>`` and a space-tab separator.
    Entries are numbered from 0 in file order. Collection starts after the
    line ``*Announced cost reduction targets:*`` and stops only at the next
    header line. Parsing ends at the line ``Close explanation``.

    A section that is still open when parsing ends is discarded.

    Args:
        name_file: Unused; the file read is always ``iea.txt`` under the
            module-level ``path``.
        display: When truthy, print each entry number and its text as it is
            stored.

    Returns:
        dict: entry number -> collected text, where every collected line is
        stripped and followed by a single space.
    """
    # The context manager releases the handle; it was previously never closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    dic_cost = {}
    collecting = False
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        is_header = line != "" and line[0].isnumeric() and ">" in line and " \t" in line
        if is_header:
            count += 1

        # Note: count was already incremented for the header that closes the
        # section, so the text is stored under the new entry's number.
        if collecting and is_header:
            if display:
                print(count)
                print(text)
                print(" ")
            dic_cost[count] = text
            collecting = False

        if collecting:
            text = text + line + " "

        # Checked last so the marker line itself is never part of the text.
        if line == "*Announced cost reduction targets:*":
            collecting = True
            text = ""

    return dic_cost



def key_words(name_file, display ):
    """Extract the ``>``-separated category path from each technology header.

    A header line is a non-empty line whose first character is numeric and
    that contains both ``>`` and a space-tab separator. For every header the
    third space-tab separated field is taken. When the raw header line
    contains neither ``Details`` nor ``Moderate``, following lines are
    appended until the field contains a tab, or a following line starts with
    ``Details`` or ``End-use``. The field is then cleaned (newlines, ``/``
    and ``-`` become spaces), split on ``>``, anything after ``(`` in the
    last part is dropped, and whitespace is normalised.

    Parsing ends at the line ``Close explanation``. A header with fewer than
    three fields is printed and parsing stops there.

    Args:
        name_file: Unused; the file read is always ``iea.txt`` under the
            module-level ``path``.
        display: When truthy, print a heading and the parsed parts for each
            header.

    Returns:
        list: one ``[entry_number, parts]`` item per header, numbered from 0,
        where ``parts`` is a list of strings.
    """
    # The context manager releases the handle; it was previously never closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    list_categories = []
    count = -1
    for index, raw_line in enumerate(lines):
        line = raw_line.strip()

        if line == "Close explanation":
            break

        if not (line != "" and line[0].isnumeric() and ">" in line and " \t" in line):
            continue

        count += 1
        if display:
            print("Technologies"  , count+1 , ":")

        try:
            cell = line.split(" \t")[2]
        except IndexError:
            # Header without a third field: report it and stop parsing.
            print(line)
            break

        if "Details" not in raw_line and "Moderate" not in raw_line:
            # The cell wraps onto the following lines; keep appending them
            # until a tab shows up or the next column's text is reached.
            offset = 0
            while "\t" not in cell:
                offset += 1
                if index + offset >= len(lines):
                    # Stop at end of file instead of raising IndexError.
                    break
                following = lines[index + offset]
                if following[:7] in ("Details", "End-use"):
                    break
                cell = cell + "  " + following

        cell = cell.replace("\n" , " ").replace("/" , " ").replace("-" , " ")
        cell = cell.split(" \t")[0]
        cell = cell.replace("  ", " ")

        parts = cell.split(">")
        # Drop a trailing parenthesised remark from the last part.
        parts[-1] = parts[-1].split("(")[0]
        # Collapse runs of spaces and trim both ends of every part.
        parts = [re.sub(' +', ' ', part).strip() for part in parts]

        if display:
            print(parts)
            print(" ")

        # Only the first empty part is removed.
        if '' in parts:
            parts.remove('')

        list_categories.append([count , parts])

    return list_categories



def technology(name_file, display ):
    """Extract the ``>``-separated path from the second field of each header.

    A header line is a non-empty line whose first character is numeric and
    that contains both ``>`` and a space-tab separator. For every header the
    second space-tab separated field is cleaned (``/`` and ``-`` become
    spaces, runs of spaces collapse), split on ``>``, anything after ``(``
    in the last part is dropped, and each part is trimmed.

    Parsing ends at the line ``Close explanation``.

    Args:
        name_file: Unused; the file read is always ``iea.txt`` under the
            module-level ``path``.
        display: When truthy, print a heading and the parsed parts for each
            header.

    Returns:
        list: one ``[entry_number, parts]`` item per header, numbered from 0,
        where ``parts`` is a list of strings.
    """
    # The context manager releases the handle; it was previously never closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    list_categories = []
    count = -1
    for raw_line in lines:
        line = raw_line.strip()

        if line == "Close explanation":
            break

        if not (line != "" and line[0].isnumeric() and ">" in line and " \t" in line):
            continue

        count += 1
        if display:
            print("Technologies"  , count+1 , ":")

        # A header always contains the space-tab separator, so the split has
        # at least two fields and index 1 cannot fail.
        cell = line.split(" \t")[1]

        cell = cell.replace("\n" , " ").replace("/" , " ").replace("-" , " ")
        cell = re.sub(' +', ' ', cell.strip())
        # The replacements above can create a new space-tab pair; keep only
        # what precedes it.
        cell = cell.split(" \t")[0]

        parts = cell.split(">")
        # Drop a trailing parenthesised remark from the last part.
        parts[-1] = parts[-1].split("(")[0]
        # Collapse runs of spaces and trim both ends of every part.
        parts = [re.sub(' +', ' ', part).strip() for part in parts]

        if display:
            print(parts)
            print(" ")

        list_categories.append([count , parts])

    return list_categories


#################### Companies Functions #############################


def extract_quantitative_data_technology(technologies, number_technology):
    """Return the details, deployment-target and cost-target texts of an entry.

    Each text is looked up by ``number_technology`` in the dictionaries built
    by ``details``, ``deployment_target`` and ``cost_reduction_target``.

    Args:
        technologies: Unused; kept so existing callers keep working.
        number_technology: Entry number used as the key in the three
            dictionaries.

    Returns:
        tuple: ``(reference_text, cost_target_text, cost_text)``. Any text
        that is missing for the entry is ``'No information'``.
    """
    name_file = "iea"
    no_information = 'No information'

    dic_target = deployment_target(name_file , False)
    dic_cost = cost_reduction_target(name_file , False)
    dic_details = details(name_file , False)

    # reference_text previously had no fallback, so an entry without a
    # details section raised UnboundLocalError at the return statement.
    reference_text = dic_details.get(number_technology, no_information)
    cost_target_text = dic_target.get(number_technology, no_information)
    cost_text = dic_cost.get(number_technology, no_information)

    return reference_text, cost_target_text, cost_text


def related_VC_deals(category , number_technology , size):
    """Return the highest-scoring companies related to a technology.

    The scores are read from ``dic_companies[str(number_technology)]``, whose
    keys are matched against the ``portfolio_company_id`` column of
    ``table_companies``. Matching rows are ranked by score, highest first.

    Args:
        category: Unused; kept so existing callers keep working.
        number_technology: Technology number; converted to ``str`` before
            the lookup in ``dic_companies``.
        size: Maximum number of rows to return.

    Returns:
        pandas.DataFrame: indexed by ``portfolio_company_id``, restricted to
        the descriptive columns plus ``score``. It is empty when no company
        matches.

    Raises:
        KeyError: if the technology number is not a key of ``dic_companies``.
    """
    scores = dic_companies[str(number_technology)]

    matches = table_companies[table_companies["portfolio_company_id"].isin(list(scores))]
    table = matches.set_index("portfolio_company_id").copy()

    # Build the whole column at once so that it exists even when nothing
    # matched; the per-row assignment used before left it undefined in that
    # case and the sort below raised KeyError.
    table["score"] = [scores[str(company_id)] for company_id in table.index]

    table = table.sort_values("score" , ascending = False).head(size)

    # Explicit copy so the assignment below modifies an independent frame
    # rather than a slice (avoids SettingWithCopyWarning).
    table = table[['portfolio_company_name', 'year_established','portfolio_company_website','firm_about',
       'portfolio_company_country',  'portfolio_company_state',
        'firm_othernames', 'industry_classification',
       'primary_industry', 'sub_industries', 'score']].copy()

    # Strip commas (as in "1,999") and convert to int where possible;
    # errors='ignore' leaves the column unchanged when the cast fails.
    table["year_established"] = table["year_established"].replace(",", "", regex=True).astype(int, errors='ignore')

    return table




################################### Extracted texts ###############################################################

#@title Which patents are related to the technology?


def finder():
    """Build the lookup structures derived from the parsed ``iea`` headers.

    Returns:
        tuple: ``(dic_technologies, dic_categories, list_categories_tech,
        list_technologies)`` where

        * ``list_technologies`` holds one ``(joined key-word path, position)``
          pair per item returned by ``key_words``;
        * ``dic_technologies`` maps each joined path returned by
          ``technology`` to the ``list_technologies`` pairs found at the
          same positions;
        * ``list_categories_tech`` lists those joined paths in order of
          first appearance;
        * ``dic_categories`` maps each position to three search strings
          built from the last three key-word parts (all of them, all but
          the last, all but the first), each part cut to its first three
          words and with ``CCUS`` spelled out.
    """
    tech_rows = technology("iea", False )
    keyword_rows = key_words("iea" , False)

    list_technologies = [
        (", ".join(parts), position)
        for position, (_, parts) in enumerate(keyword_rows)
    ]

    # Group the key-word pairs under the technology path at the same position.
    dic_technologies = {}
    for position, row in enumerate(tech_rows):
        joined = ", ".join(row[1])
        dic_technologies.setdefault(joined, []).append(list_technologies[position])
    # Dict keys keep insertion order, i.e. order of first appearance.
    list_categories_tech = list(dic_technologies)

    dic_categories = {}
    for joined, position in list_technologies:
        last_parts = [
            part.replace("CCUS" , "carbon capture storage")
            for part in joined.split(",")[-3:]
        ]
        # Keep at most the first three words of every part.
        shortened = [" ".join(part.split()[:3]) for part in last_parts]
        dic_categories[position] = [
            ", ".join(shortened),
            ", ".join(shortened[:-1]),
            ", ".join(shortened[1:]),
        ]

    return dic_technologies, dic_categories, list_categories_tech, list_technologies