# EmmaScharfmannBerkeley's picture
# Update Climate_site/python_scripts/functions_contest.py
# 4b8fd00
##packages code
import streamlit as st
from shapely.geometry import Point
from math import radians, cos, sin, asin, sqrt
import pandas as pd
import re
import json
from sentence_transformers import SentenceTransformer, util
# Relative path prefix prepended to the data file names opened in this module.
path = 'Climate_site/python_scripts/'
@st.cache_resource
def model_nlp():
    """Build the 'all-mpnet-base-v2' SentenceTransformer used for text encoding.

    The function is wrapped in ``st.cache_resource``.

    Returns:
        The ``SentenceTransformer`` instance.
    """
    return SentenceTransformer('all-mpnet-base-v2')
@st.cache_data
def load_data():
    """Read the contest TSV file and return it as a dict of row dicts.

    The file ``big_ideas_contest.tsv`` is read from ``path`` with its first
    column as the index.

    Returns:
        ``{index_value: {column_name: cell_value}}`` as produced by
        ``DataFrame.to_dict('index')``.
    """
    frame = pd.read_csv(path + "big_ideas_contest.tsv", delimiter="\t", index_col=0)
    return frame.to_dict('index')
#################### General Functions #############################
def clean_encoding(encoded_text):
    """Parse a stringified list of numbers such as ``"[0.1, -0.2]"``.

    Args:
        encoded_text: Text whose first and last characters (after trimming
            surrounding whitespace) are the enclosing brackets, with the
            values separated by commas. ``None`` is passed through.

    Returns:
        A list of floats, an empty list for ``"[]"``, or ``None`` when the
        input is ``None``.

    Raises:
        ValueError: If a non-empty item cannot be converted to ``float``.
    """
    if encoded_text is None:
        return None
    # Line breaks are removed (not replaced by a space) before the enclosing
    # bracket characters are dropped.
    inner = encoded_text.replace("\n", "").strip()[1:-1]
    # Splitting on "," alone and letting float() ignore the padding accepts
    # both "1, 2" and "1,2"; empty items (e.g. from "[]") are skipped.
    return [float(item) for item in inner.split(",") if item.strip()]
def norm(vector):
    """Return the Euclidean length of ``vector`` (square root of the sum of squares)."""
    squares = [component * component for component in vector]
    return sqrt(sum(squares))
def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of two numeric vectors.

    Args:
        vec_a: First sequence of numbers.
        vec_b: Second sequence of numbers. The dot product pairs elements
            with ``zip``, so extra trailing elements of the longer vector do
            not contribute to it.

    Returns:
        ``dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b))``, or ``0.0`` when
        either vector has zero length, where the ratio is undefined.
    """
    norm_a = norm(vec_a)
    norm_b = norm(vec_b)
    # Guard against dividing by zero for all-zero (or empty) vectors.
    if norm_a == 0 or norm_b == 0:
        return 0.0
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    return dot / (norm_a * norm_b)
def print_extracted_text(name_file):
    """Print every line of ``iea.txt`` with surrounding whitespace stripped.

    Args:
        name_file: Not used by this function; the file read is always
            ``path + 'iea.txt'``.
    """
    # The context manager closes the file even if printing raises.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for line in file:
            print(line.strip())
# iea.txt
def details(name_file , display):
    """Collect the text that follows each "Details"/"Hide" line in ``iea.txt``.

    A line is treated as a technology header when it starts with a numeric
    character and contains both ">" and " "; each header advances a counter
    that starts at -1. Collection starts after a line whose last
    space-separated token is "Details" or "Hide" and stops at the next
    non-empty line starting with "*". Parsing stops at a line equal to
    "Close explanation".

    Args:
        name_file: Not used by this function; the file read is always
            ``path + "iea.txt"``.
        display: When truthy, print the counter and the collected text each
            time a block is stored.

    Returns:
        dict mapping the header counter to the collected text (stripped
        lines, each followed by one space). A block still open when parsing
        ends is not stored.
    """
    dic_details = {}
    count = -1
    collecting = False
    text = ""
    # Read as UTF-8 like the other parsers of this file; the context manager
    # guarantees the handle is closed.
    with open(path + "iea.txt", "r", encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "Close explanation":
                break
            if line != "" and line[0].isnumeric() and ">" in line and " " in line:
                count += 1
            if collecting and line != "" and line[0] == "*":
                if display:
                    print(count)
                    print(text)
                    print(" ")
                dic_details[count] = text
                collecting = False
            if collecting:
                text += line + " "
            # Checked last so the marker line itself is not part of the text.
            if line.split(" ")[-1] in ("Details", "Hide"):
                collecting = True
                text = ""
    return dic_details
def key_initiatives(name_file , display ):
    """Collect the text that follows each "*Key initiatives:*" line in ``iea.txt``.

    A line is treated as a technology header when it starts with a numeric
    character and contains both ">" and " "; each header advances a counter
    that starts at -1. Collection starts after a line equal to
    "*Key initiatives:*" and stops at the next header line or at a line equal
    to "*Deployment targets:*" or "*Announced development targets:*".
    Parsing stops at a line equal to "Close explanation".

    Args:
        name_file: Not used by this function; the file read is always
            ``path + 'iea.txt'``.
        display: When truthy, print the counter and the collected text each
            time a block is stored.

    Returns:
        dict mapping the header counter to the collected text (stripped
        lines, each followed by one space). A block still open when parsing
        ends is not stored.
    """
    section_ends = ("*Deployment targets:*", "*Announced development targets:*")
    dic_key_initiatives = {}
    count = -1
    collecting = False
    text = ""
    # The context manager guarantees the file handle is closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "Close explanation":
                break
            is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
            if is_header:
                count += 1
            if collecting and (is_header or line in section_ends):
                if display:
                    print(count)
                    print(text)
                    print(" ")
                # NOTE(review): when a header line closes the block, count has
                # already been advanced for that header, so the text is stored
                # under the new counter value - confirm this keying is intended.
                dic_key_initiatives[count] = text
                collecting = False
            if collecting:
                text += line + " "
            # Checked last so the marker line itself is not part of the text.
            if line == "*Key initiatives:*":
                collecting = True
                text = ""
    return dic_key_initiatives
def deployment_target(name_file , display):
    """Collect the deployment / development target text blocks of ``iea.txt``.

    A line is treated as a technology header when it starts with a numeric
    character and contains both ">" and " "; each header advances a counter
    that starts at -1. Collection starts after a line equal to
    "*Deployment targets:*" or "*Announced development targets:*" and stops
    at the next header line or at a line equal to
    "*Announced cost reduction targets:*" or
    "*Announced development targets:*". Parsing stops at a line equal to
    "Close explanation".

    Args:
        name_file: Not used by this function; the file read is always
            ``path + 'iea.txt'``.
        display: When truthy, print the counter and the collected text each
            time a block is stored.

    Returns:
        dict mapping the header counter to the collected text (stripped
        lines, each followed by one space). A later block stored under the
        same counter value replaces the earlier one. A block still open when
        parsing ends is not stored.
    """
    section_starts = ("*Deployment targets:*", "*Announced development targets:*")
    section_ends = ("*Announced cost reduction targets:*", "*Announced development targets:*")
    dic_target = {}
    count = -1
    collecting = False
    text = ""
    # The context manager guarantees the file handle is closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "Close explanation":
                break
            is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
            if is_header:
                count += 1
            if collecting and (is_header or line in section_ends):
                if display:
                    print(count)
                    print(text)
                    print(" ")
                # NOTE(review): when a header line closes the block, count has
                # already been advanced for that header, so the text is stored
                # under the new counter value - confirm this keying is intended.
                dic_target[count] = text
                collecting = False
            if collecting:
                text += line + " "
            # "*Announced development targets:*" both closes the current block
            # (above) and opens a fresh one here.
            if line in section_starts:
                collecting = True
                text = ""
    return dic_target
def cost_reduction_target(name_file , display):
    """Collect the text after each "*Announced cost reduction targets:*" line.

    A line of ``iea.txt`` is treated as a technology header when it starts
    with a numeric character and contains both ">" and " "; each header
    advances a counter that starts at -1. Collection starts after a line
    equal to "*Announced cost reduction targets:*" and stops at the next
    header line. Parsing stops at a line equal to "Close explanation".

    Args:
        name_file: Not used by this function; the file read is always
            ``path + 'iea.txt'``.
        display: When truthy, print the counter and the collected text each
            time a block is stored.

    Returns:
        dict mapping the header counter to the collected text (stripped
        lines, each followed by one space). A block still open when parsing
        ends is not stored.
    """
    dic_cost = {}
    count = -1
    collecting = False
    text = ""
    # The context manager guarantees the file handle is closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "Close explanation":
                break
            is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
            if is_header:
                count += 1
            if collecting and is_header:
                if display:
                    print(count)
                    print(text)
                    print(" ")
                # NOTE(review): only a header line closes this block, and count
                # has already been advanced for that header, so the text is
                # stored under the new counter value - confirm this keying is
                # intended.
                dic_cost[count] = text
                collecting = False
            if collecting:
                text += line + " "
            # Checked last so the marker line itself is not part of the text.
            if line == "*Announced cost reduction targets:*":
                collecting = True
                text = ""
    return dic_cost
def key_words(name_file, display ):
    """Extract the ">"-separated category names from each header line of ``iea.txt``.

    A line is treated as a technology header when it starts with a numeric
    character and contains both ">" and " "; each header advances a counter
    that starts at -1. Parsing stops at a line equal to "Close explanation",
    or at a header that has fewer than three " "-separated fields (that
    header is printed first).

    Args:
        name_file: Not used by this function; the file read is always
            ``path + 'iea.txt'``.
        display: When truthy, print a "Technologies <n> :" banner and the
            extracted parts for every header.

    Returns:
        list of ``[count, parts]`` pairs, one per processed header, where
        ``parts`` is the list of cleaned strings obtained by splitting the
        selected field on ">".
    """
    # NOTE(review): the " " separators below are reproduced exactly as found.
    # As written, replace(" ", " ") is a no-op, which suggests the original
    # literals may have been wider whitespace - confirm against the data file.
    list_categories = []
    count = -1
    # The look-ahead below needs random access, so the whole file is read; the
    # context manager guarantees the handle is closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()
    for index, raw_line in enumerate(lines):
        line = raw_line.strip()
        if line == "Close explanation":
            break
        if not (line != "" and line[0].isnumeric() and ">" in line and " " in line):
            continue
        count += 1
        if display:
            print("Technologies" , count+1 , ":")
        try:
            line = line.split(" ")[2]
        except IndexError:
            # Header without a third field: report it and stop parsing.
            print(line)
            break
        if "Details" not in lines[index] and "Moderate" not in lines[index]:
            # Append following raw lines until the field contains a separator
            # or a line starting with "Details"/"End-use" is reached.
            offset = 0
            while " " not in line:
                offset += 1
                if index + offset >= len(lines):
                    # Ran off the end of the file; keep what was gathered.
                    break
                following = lines[index + offset]
                if following.startswith(("Details", "End-use")):
                    break
                line = line + " " + following
        line = line.replace("\n" , " ")
        line = line.replace("/" , " ")
        line = line.replace("-" , " ")
        line = line.split(" ")[0]
        if " " in line:
            line = line.replace(" ", " ")
        parts = line.split(">")
        # Drop a parenthesised suffix from the last part.
        if "(" in parts[-1]:
            parts[-1] = parts[-1].split("(")[0]
        # Collapse runs of spaces and trim each part.
        parts = [re.sub(' +', ' ', part).strip() for part in parts]
        if display:
            print(parts)
            print(" ")
        # Only the first empty part, if any, is removed.
        if '' in parts:
            parts.remove('')
        list_categories.append([count , parts])
    return list_categories
def technology(name_file, display ):
    """Extract the ">"-separated names from the second field of each header line.

    A line of ``iea.txt`` is treated as a technology header when it starts
    with a numeric character and contains both ">" and " "; each header
    advances a counter that starts at -1. Parsing stops at a line equal to
    "Close explanation", or at a header that has fewer than two
    " "-separated fields (that header is printed first).

    Args:
        name_file: Not used by this function; the file read is always
            ``path + 'iea.txt'``.
        display: When truthy, print a "Technologies <n> :" banner and the
            extracted parts for every header.

    Returns:
        list of ``[count, parts]`` pairs, one per processed header, where
        ``parts`` is the list of cleaned strings obtained by splitting the
        selected field on ">".
    """
    # NOTE(review): the " " separators below are reproduced exactly as found;
    # they may originally have been wider whitespace - confirm against the
    # data file.
    list_categories = []
    count = -1
    # The context manager guarantees the file handle is closed.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "Close explanation":
                break
            if not (line != "" and line[0].isnumeric() and ">" in line and " " in line):
                continue
            count += 1
            if display:
                print("Technologies" , count+1 , ":")
            try:
                line = line.split(" ")[1]
            except IndexError:
                # Header without a second field: report it and stop parsing.
                print(line)
                break
            line = line.replace("\n" , " ")
            line = line.replace("/" , " ")
            line = line.replace("-" , " ")
            line = line.strip()
            line = re.sub(' +', ' ', line)
            line = line.split(" ")[0]
            parts = line.split(">")
            # Drop a parenthesised suffix from the last part.
            if "(" in parts[-1]:
                parts[-1] = parts[-1].split("(")[0]
            # Collapse runs of spaces and trim each part.
            parts = [re.sub(' +', ' ', part).strip() for part in parts]
            if display:
                print(parts)
                print(" ")
            list_categories.append([count , parts])
    return list_categories
#################### Contest Functions #############################
def extract_quantitative_data_technology(technologies, number_technology):
    """Look up the parsed IEA texts for one technology index.

    Args:
        technologies: Not used by this function.
        number_technology: Key looked up in the dicts returned by
            ``details``, ``deployment_target`` and ``cost_reduction_target``.

    Returns:
        Tuple ``(reference_text, cost_target_text, cost_text)``: the details
        text, the deployment-target text and the cost-reduction-target text.
        Each element is ``'No information'`` when the corresponding dict has
        no entry for ``number_technology``.
    """
    name_file = "iea"
    missing = 'No information'
    dic_target = deployment_target(name_file , False)
    dic_cost = cost_reduction_target(name_file , False)
    dic_details = details(name_file , False)
    # reference_text also gets the fallback, so a technology without a details
    # entry no longer raises UnboundLocalError at the return statement.
    reference_text = dic_details.get(number_technology, missing)
    cost_target_text = dic_target.get(number_technology, missing)
    cost_text = dic_cost.get(number_technology, missing)
    return reference_text, cost_target_text, cost_text
def big_ideas_encoding(number_technology):
    """Find contest projects whose stored encoding is close to a technology's details text.

    The details text of ``number_technology`` is encoded with the model from
    ``model_nlp()`` and compared, by cosine similarity, with the
    ``'encoded_description'`` value of every record from ``load_data()``.
    Records with a missing encoding are skipped; records scoring above 0.45
    are kept.

    Args:
        number_technology: Key into the dict returned by ``details``.

    Returns:
        A DataFrame of the matching records sorted by descending "score",
        restricted to the columns "Project_name", "Description",
        "Project_date", "University", "Field", "Team_members" and "score";
        or the string "No related project found" when nothing matches or when
        there is no details text for ``number_technology``.
    """
    no_match = "No related project found"
    model = model_nlp()
    dic_big_ideas = load_data()
    dic_details = details('iea' , False)
    # Without a details text there is nothing to encode; report "no match"
    # instead of raising KeyError.
    if number_technology not in dic_details:
        return no_match
    IEA_encoding = model.encode(dic_details[number_technology] , convert_to_tensor=False , show_progress_bar = False).tolist()
    dic_matches = {}
    for record in dic_big_ideas.values():
        stored_encoding = record['encoded_description']
        if pd.isna(stored_encoding):
            continue
        score = cosine_similarity2(IEA_encoding, clean_encoding(stored_encoding))
        if score > 0.45:
            # Copy the record so the loaded data is not modified in place.
            dic_matches[len(dic_matches)] = {**record, "score": score}
    if not dic_matches:
        return no_match
    columns = ["Project_name" , "Description" ,"Project_date" , "University" , "Field" , "Team_members", "score" ]
    return pd.DataFrame(dic_matches).T.sort_values("score" , ascending = False)[columns]
################################### Extracted texts ###############################################################
#@title Which patents are related to the technology?
def finder():
    """Build the lookup structures that relate technologies to their keywords.

    Uses the ``[count, parts]`` lists returned by ``technology("iea", False)``
    and ``key_words("iea", False)``.

    Returns:
        Tuple ``(dic_technologies, dic_categories, list_categories_tech,
        list_technologies)``:

        * ``dic_technologies``: joined technology name -> list of
          ``(joined keyword name, position)`` tuples sharing that name.
        * ``dic_categories``: position -> three strings built from the last
          three comma-separated keyword pieces (all of them, all but the
          last, all but the first), each piece cut to its first three words
          and with "CCUS" replaced by "carbon capture storage".
        * ``list_categories_tech``: distinct joined technology names in
          first-seen order.
        * ``list_technologies``: ``(joined keyword name, position)`` for
          every keyword entry.
    """
    tech_entries = technology("iea", False )
    keyword_entries = key_words("iea" , False)
    list_technologies = [
        (", ".join(entry[1]), position)
        for position, entry in enumerate(keyword_entries)
    ]

    # Group the keyword labels by technology label, keeping first-seen order.
    list_categories_tech = []
    dic_technologies = {}
    for position, entry in enumerate(tech_entries):
        label = ", ".join(entry[1])
        if label not in list_categories_tech:
            list_categories_tech.append(label)
            dic_technologies[label] = []
        keyword_label = ", ".join(keyword_entries[position][1])
        dic_technologies[label].append((keyword_label, position))

    dic_categories = {}
    for keyword_label, position in list_technologies:
        pieces = keyword_label.split(",")[-3:]
        shortened = [
            " ".join(piece.replace("CCUS" , "carbon capture storage").split()[:3])
            for piece in pieces
        ]
        dic_categories[position] = [
            ", ".join(shortened),
            ", ".join(shortened[:-1]),
            ", ".join(shortened[1:]),
        ]
    return dic_technologies, dic_categories, list_categories_tech, list_technologies