Spaces:

EmmaScharfmannBerkeley
/

Synapse_project

Sleeping

App Files Files Community

Synapse_project / Climate_site /python_scripts /functions_contest.py

EmmaScharfmannBerkeley

Update Climate_site/python_scripts/functions_contest.py

4b8fd00 over 2 years ago

raw

history blame contribute delete

12.7 kB

	##packages code

	import streamlit as st
	from shapely.geometry import Point
	from math import radians, cos, sin, asin, sqrt
	import pandas as pd
	import re
	import json

	from sentence_transformers import SentenceTransformer, util


	# Relative directory prefix for the data files this module reads
	# ("big_ideas_contest.tsv" and "iea.txt"). It is joined to file names by plain
	# string concatenation, so the trailing slash is required.
	path = 'Climate_site/python_scripts/'

	@st.cache_resource
	def model_nlp():
	    """Return the 'all-mpnet-base-v2' SentenceTransformer used for text encoding.

	    The function is wrapped in ``st.cache_resource`` so that Streamlit can
	    reuse the returned model object instead of building it on every call.
	    """
	    return SentenceTransformer('all-mpnet-base-v2')

	@st.cache_data
	def load_data():
	    """Load the contest table and return it as a dict of row dicts.

	    Reads the tab-separated file ``big_ideas_contest.tsv`` located under
	    ``path``, using its first column as the index.

	    Returns:
	        dict mapping each index value to a ``{column name: value}`` dict.
	    """
	    contest_table = pd.read_csv(path + "big_ideas_contest.tsv", sep="\t", index_col=0)
	    return contest_table.to_dict(orient='index')



	#################### General Functions #############################

	def clean_encoding(encoded_text):
	    """Parse a stringified list of numbers (e.g. ``"[0.1, -0.2]"``) into floats.

	    Args:
	        encoded_text: text of the form ``"[v1, v2, ...]"``, possibly containing
	            newlines, or ``None``.

	    Returns:
	        ``None`` when ``encoded_text`` is ``None``; otherwise a list of floats
	        (an empty list for ``"[]"``).

	    Raises:
	        ValueError: if an element cannot be converted to ``float``.
	    """
	    if encoded_text is None:
	        return None

	    # Drop line breaks, then the first and last characters (the brackets).
	    inner = encoded_text.replace("\n", "")[1:-1]
	    if not inner.strip():
	        return []

	    # Split on the bare comma: float() ignores surrounding whitespace, so both
	    # "1,2" and "1, 2" parse, including commas that were followed by a newline.
	    return [float(value) for value in inner.split(",")]


	def norm(vector):
	    """Return the Euclidean (L2) norm of ``vector``, an iterable of numbers."""
	    squared_total = sum(component * component for component in vector)
	    return sqrt(squared_total)

	def cosine_similarity2(vec_a, vec_b):
	    """Return the cosine similarity of two numeric sequences.

	    Args:
	        vec_a: first sequence of numbers.
	        vec_b: second sequence of numbers. Pairs are formed with ``zip``, so
	            extra trailing elements of the longer sequence do not contribute
	            to the dot product.

	    Returns:
	        ``dot(vec_a, vec_b) / (|vec_a| * |vec_b|)``, or ``0.0`` when either
	        vector has zero norm (the similarity is undefined there; returning
	        0.0 avoids a ZeroDivisionError and never passes a positive threshold).
	    """
	    dot = sum(a * b for a, b in zip(vec_a, vec_b))
	    norm_a = sqrt(sum(a * a for a in vec_a))
	    norm_b = sqrt(sum(b * b for b in vec_b))

	    denominator = norm_a * norm_b
	    if denominator == 0:
	        return 0.0
	    return dot / denominator

	def print_extracted_text(name_file):
	    """Print every line of ``iea.txt`` with surrounding whitespace stripped.

	    Args:
	        name_file: accepted for interface compatibility but not used; the file
	            read is always ``path + 'iea.txt'``.
	    """
	    # The context manager closes the file even if printing raises.
	    with open(path + 'iea.txt', "r", encoding='utf8') as file:
	        for line in file:
	            print(line.strip())


	# Data file read by the parsing functions in this module: iea.txt

	def details(name_file, display):
	    """Collect the free-text "details" paragraph of each technology in ``iea.txt``.

	    Lines are read until a line equal to "Close explanation". A header line
	    (non-empty, first character numeric, containing ">" and " ") advances the
	    technology counter. Collection starts after any line whose last
	    " "-separated token is "Details" or "Hide", and stops at the first
	    following non-empty line starting with "*".

	    Args:
	        name_file: accepted for interface compatibility but not used; the file
	            read is always ``path + "iea.txt"``.
	        display: when truthy, print the counter and text of each stored entry.

	    Returns:
	        dict mapping the technology counter (0-based) to the collected text,
	        where each source line is followed by a single space.
	    """
	    dic_details = {}
	    count = -1  # becomes 0 at the first header line
	    collecting = False
	    text = ""

	    # A context manager closes the file even if parsing raises; utf8 matches
	    # the encoding used elsewhere in this module for the same file.
	    with open(path + "iea.txt", "r", encoding='utf8') as file:
	        for raw_line in file:
	            line = raw_line.strip()
	            if line == "Close explanation":
	                break

	            if line != "" and line[0].isnumeric() and ">" in line and " " in line:
	                count += 1

	            # A "*" line closes the paragraph currently being collected.
	            if collecting and line.startswith("*"):
	                if display:
	                    print(count)
	                    print(text)
	                    print(" ")
	                dic_details[count] = text
	                collecting = False

	            if collecting:
	                text = text + line + " "

	            # Checked last so the marker line itself is not part of the text.
	            if line.split(" ")[-1] in ("Details", "Hide"):
	                collecting = True
	                text = ""

	    return dic_details







	def key_initiatives(name_file, display):
	    """Collect the "Key initiatives:" section of each technology in ``iea.txt``.

	    Lines are read until a line equal to "Close explanation". A header line
	    (non-empty, first character numeric, containing ">" and " ") advances the
	    technology counter. A section starts after a line equal to
	    "Key initiatives:" and ends at the next header line, at
	    "Deployment targets:", at "Announced development targets:", or at the end
	    of the input.

	    Each section is stored under the counter value that was current when the
	    section started. The counter is advanced before the end-of-section check,
	    so using the live counter would file a section that ends on a header line
	    under the following technology.

	    Args:
	        name_file: accepted for interface compatibility but not used; the file
	            read is always ``path + 'iea.txt'``.
	        display: when truthy, print the index and text of each stored entry.

	    Returns:
	        dict mapping the technology index (0-based) to the section text, where
	        each source line is followed by a single space.
	    """
	    section_ends = ("Deployment targets:", "Announced development targets:")

	    dic_key_initiatives = {}
	    count = -1  # index of the technology whose lines are being read
	    owner = -1  # index of the technology the open section belongs to
	    collecting = False
	    text = ""

	    # The context manager closes the file even if parsing raises.
	    with open(path + 'iea.txt', "r", encoding='utf8') as file:
	        for raw_line in file:
	            line = raw_line.strip()
	            if line == "Close explanation":
	                break

	            is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
	            if is_header:
	                count += 1

	            if collecting and (is_header or line in section_ends):
	                if display:
	                    print(owner)
	                    print(text)
	                    print(" ")
	                dic_key_initiatives[owner] = text
	                collecting = False

	            if collecting:
	                text = text + line + " "

	            if line == "Key initiatives:":
	                collecting = True
	                owner = count
	                text = ""

	    # A section still open when reading stops belongs to the last technology.
	    if collecting:
	        if display:
	            print(owner)
	            print(text)
	            print(" ")
	        dic_key_initiatives[owner] = text

	    return dic_key_initiatives




	def deployment_target(name_file, display):
	    """Collect the deployment/development-target section of each technology.

	    Lines of ``iea.txt`` are read until a line equal to "Close explanation". A
	    header line (non-empty, first character numeric, containing ">" and " ")
	    advances the technology counter. A section starts after a line equal to
	    "Deployment targets:" or "Announced development targets:" and ends at the
	    next header line, at "Announced cost reduction targets:", at
	    "Announced development targets:", or at the end of the input. An
	    "Announced development targets:" line therefore closes the open section
	    and opens a new one for the same technology, whose text replaces the
	    earlier entry when it is stored.

	    Each section is stored under the counter value that was current when the
	    section started. The counter is advanced before the end-of-section check,
	    so using the live counter would file a section that ends on a header line
	    under the following technology.

	    Args:
	        name_file: accepted for interface compatibility but not used; the file
	            read is always ``path + 'iea.txt'``.
	        display: when truthy, print the index and text of each stored entry.

	    Returns:
	        dict mapping the technology index (0-based) to the section text, where
	        each source line is followed by a single space.
	    """
	    section_starts = ("Deployment targets:", "Announced development targets:")
	    section_ends = ("Announced cost reduction targets:", "Announced development targets:")

	    dic_target = {}
	    count = -1  # index of the technology whose lines are being read
	    owner = -1  # index of the technology the open section belongs to
	    collecting = False
	    text = ""

	    # The context manager closes the file even if parsing raises.
	    with open(path + 'iea.txt', "r", encoding='utf8') as file:
	        for raw_line in file:
	            line = raw_line.strip()
	            if line == "Close explanation":
	                break

	            is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
	            if is_header:
	                count += 1

	            if collecting and (is_header or line in section_ends):
	                if display:
	                    print(owner)
	                    print(text)
	                    print(" ")
	                dic_target[owner] = text
	                collecting = False

	            if collecting:
	                text = text + line + " "

	            # Checked last so the marker line itself is not part of the text.
	            if line in section_starts:
	                collecting = True
	                owner = count
	                text = ""

	    # A section still open when reading stops belongs to the last technology.
	    if collecting:
	        if display:
	            print(owner)
	            print(text)
	            print(" ")
	        dic_target[owner] = text

	    return dic_target




	def cost_reduction_target(name_file, display):
	    """Collect the "Announced cost reduction targets:" section of each technology.

	    Lines of ``iea.txt`` are read until a line equal to "Close explanation". A
	    header line (non-empty, first character numeric, containing ">" and " ")
	    advances the technology counter. A section starts after a line equal to
	    "Announced cost reduction targets:" and ends at the next header line or at
	    the end of the input.

	    Each section is stored under the counter value that was current when the
	    section started. Sections here end on header lines, and the counter is
	    advanced before the end-of-section check, so using the live counter would
	    file every section under the following technology and lose the last one.

	    Args:
	        name_file: accepted for interface compatibility but not used; the file
	            read is always ``path + 'iea.txt'``.
	        display: when truthy, print the index and text of each stored entry.

	    Returns:
	        dict mapping the technology index (0-based) to the section text, where
	        each source line is followed by a single space.
	    """
	    dic_cost = {}
	    count = -1  # index of the technology whose lines are being read
	    owner = -1  # index of the technology the open section belongs to
	    collecting = False
	    text = ""

	    # The context manager closes the file even if parsing raises.
	    with open(path + 'iea.txt', "r", encoding='utf8') as file:
	        for raw_line in file:
	            line = raw_line.strip()
	            if line == "Close explanation":
	                break

	            is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
	            if is_header:
	                count += 1

	            if collecting and is_header:
	                if display:
	                    print(owner)
	                    print(text)
	                    print(" ")
	                dic_cost[owner] = text
	                collecting = False

	            if collecting:
	                text = text + line + " "

	            if line == "Announced cost reduction targets:":
	                collecting = True
	                owner = count
	                text = ""

	    # A section still open when reading stops belongs to the last technology.
	    if collecting:
	        if display:
	            print(owner)
	            print(text)
	            print(" ")
	        dic_cost[owner] = text

	    return dic_cost



	# Extract keyword lists from the technology header lines of iea.txt.
	#
	# name_file: accepted but never read; the file opened is always path + 'iea.txt'.
	# display:   when equal to True, progress/debug output is printed.
	# Returns list_categories, a list of [count, keywords] pairs, where count is the
	# 0-based running index of header lines and keywords is the list obtained by
	# splitting the processed header text on ">".
	#
	# NOTE(review): this copy of the file has lost its indentation, so the nesting of
	# the statements below cannot be confirmed from here. Several string literals
	# also look whitespace-collapsed (the replace(" ", " ") further down is a no-op
	# as written) - TODO compare against the repository before changing any logic.
	def key_words(name_file, display ):

	# NOTE(review): this handle is never closed inside the function; a with-block
	# would release it.
	file = open(path + 'iea.txt', "r", encoding='utf8')

	lines = file.readlines()

	list_categories = []
	count = -1
	for index, line in enumerate(lines):

	line = line.strip()

	# Stop reading at the "Close explanation" marker line.
	if line == "Close explanation":
	break

	# Header line: non-empty, first character numeric, contains ">" and " ".
	if line != "" and (line[0].isnumeric() and ">" in line and " " in line) :
	count += 1

	if display == True:
	print("Technologies" , count+1 , ":")

	if line != "":

	# Same header test as above; the keyword extraction is applied to header lines.
	if line[0].isnumeric() and ">" in line and " " in line:
	# i is the look-ahead offset into lines used by the while loop below.
	i = 0
	# Keep the third " "-separated field of the header.
	# NOTE(review): the bare except also catches unrelated errors; on failure the
	# line is printed and the loop over lines stops.
	try:
	line = line.split(" ")[2]
	except:
	print(line)
	break

	# lines[index] is the raw (unstripped) text of the current header line.
	if "Details" not in lines[index] and "Moderate" not in lines[index]:

	# Append following raw lines until the text contains " ", or until a following
	# line starts with "Details" or "End-use".
	# NOTE(review): lines[index + i] is not bounds-checked.
	while " " not in line:
	i += 1
	if "Details"==lines[index + i][:7] or "End-use"==lines[index + i][:7]:
	break
	else:
	line = line + " " + lines[index + i]

	#if " Production" in line:
	#line = line.replace(" Production" , "")

	# Turn newlines, "/" and "-" into spaces, then keep the text before the first " ".
	line = line.replace("\n" , " ")
	line = line.replace("/" , " ")
	line = line.replace("-" , " ")
	line = line.split(" ")[0]

	if " " in line:
	line = line.replace(" ", " ")
	# From here on, line is a list: the ">"-separated parts of the category path.
	line = line.split(">")


	# Drop everything from the first "(" in the last part.
	if "(" in line[-1]:
	line[-1] = line[-1].split("(")[0]


	for i in range(len(line)):

	# remove multiple spaces
	line[i] = re.sub(' +', ' ', line[i])
	# remove trailing spaces
	line[i] = line[i].strip()



	if display == True:
	print(line)
	print(" ")

	# list.remove deletes only the first empty entry, if there is one.
	if '' in line:
	line.remove('')

	list_categories.append([count , line])

	return list_categories



	def technology(name_file, display):
	    """Extract the category path of every technology header line in ``iea.txt``.

	    Lines are read until a line equal to "Close explanation". For each header
	    line (non-empty, first character numeric, containing ">" and " ") the
	    second " "-separated field is taken, "/" and "-" are replaced by spaces,
	    the text before the first " " is kept and split on ">". Anything from the
	    first "(" in the last part is dropped and every part is whitespace-trimmed.

	    Args:
	        name_file: accepted for interface compatibility but not used; the file
	            read is always ``path + 'iea.txt'``.
	        display: when truthy, print a "Technologies N :" banner and the parsed
	            parts for each header.

	    Returns:
	        list of ``[count, parts]`` pairs, where ``count`` is the 0-based index
	        of the header line and ``parts`` is the list of category names.
	    """
	    list_categories = []
	    count = -1  # becomes 0 at the first header line

	    # The context manager closes the file even if parsing raises.
	    with open(path + 'iea.txt', "r", encoding='utf8') as file:
	        for raw_line in file:
	            line = raw_line.strip()
	            if line == "Close explanation":
	                break

	            # Only header lines contribute; every other line is skipped.
	            if not (line != "" and line[0].isnumeric() and ">" in line and " " in line):
	                continue

	            count += 1
	            if display:
	                print("Technologies", count + 1, ":")

	            try:
	                line = line.split(" ")[1]
	            except IndexError:
	                # Indexing the split result can only fail with IndexError.
	                print(line)
	                break

	            line = line.replace("\n", " ").replace("/", " ").replace("-", " ")
	            line = re.sub(' +', ' ', line.strip())
	            parts = line.split(" ")[0].split(">")

	            if "(" in parts[-1]:
	                parts[-1] = parts[-1].split("(")[0]

	            # Collapse repeated spaces and trim each category name.
	            parts = [re.sub(' +', ' ', part).strip() for part in parts]

	            if display:
	                print(parts)
	                print(" ")

	            list_categories.append([count, parts])

	    return list_categories




	#################### Contest Functions #############################


	def extract_quantitative_data_technology(technologies, number_technology):
	    """Return the description and target texts extracted for one technology.

	    Args:
	        technologies: accepted for interface compatibility; not used here.
	        number_technology: index of the technology, used as the key into the
	            dicts returned by ``details``, ``deployment_target`` and
	            ``cost_reduction_target``.

	    Returns:
	        Tuple ``(reference_text, cost_target_text, cost_text)``. Any element
	        with no entry for ``number_technology`` is ``'No information'``; this
	        now also covers ``reference_text``, which previously was left unbound
	        and raised UnboundLocalError when the details entry was missing.
	    """
	    name_file = "iea"
	    missing = 'No information'

	    dic_target = deployment_target(name_file, False)
	    dic_cost = cost_reduction_target(name_file, False)
	    dic_details = details(name_file, False)

	    reference_text = dic_details.get(number_technology, missing)
	    cost_target_text = dic_target.get(number_technology, missing)
	    cost_text = dic_cost.get(number_technology, missing)

	    return reference_text, cost_target_text, cost_text



	def big_ideas_encoding(number_technology):
	    """Find contest projects whose description is similar to a technology's details.

	    The details text of technology ``number_technology`` is encoded with the
	    model from ``model_nlp()`` and compared, by cosine similarity, with the
	    stored ``encoded_description`` of every contest entry from ``load_data()``
	    that has one. Entries scoring above 0.45 are kept, and their score is
	    written into the entry dict under the key "score".

	    Args:
	        number_technology: key into the dict returned by ``details``.

	    Returns:
	        A DataFrame of the matching entries sorted by descending score, limited
	        to the columns Project_name, Description, Project_date, University,
	        Field, Team_members and score; or the string
	        "No related project found" when nothing matches.
	    """
	    encoder = model_nlp()
	    contest_entries = load_data()
	    technology_text = details('iea', False)[number_technology]

	    reference_vector = encoder.encode(
	        technology_text, convert_to_tensor=False, show_progress_bar=False
	    ).tolist()

	    matches = []
	    for entry in contest_entries.values():
	        stored_vector = entry['encoded_description']
	        if pd.isna(stored_vector):
	            continue
	        score = cosine_similarity2(reference_vector, clean_encoding(stored_vector))
	        if score > 0.45:
	            entry["score"] = score
	            matches.append(entry)

	    if not matches:
	        return "No related project found"

	    wanted_columns = [
	        "Project_name", "Description", "Project_date",
	        "University", "Field", "Team_members", "score",
	    ]
	    # Keys 0..n-1 become the row labels after the transpose.
	    ranked = pd.DataFrame(dict(enumerate(matches))).T.sort_values("score", ascending=False)
	    return ranked[wanted_columns]









	################################### Extracted texts ###############################################################

	#@title Which patents are related to the technology?


	def finder():
	    """Build the lookup structures that relate technologies to keyword strings.

	    Uses the results of ``technology("iea", False)`` and
	    ``key_words("iea", False)``, matching entries by position.

	    Returns:
	        Tuple ``(dic_technologies, dic_categories, list_categories_tech,
	        list_technologies)`` where:
	        - ``list_technologies`` is a list of ``(joined keywords, position)``
	          pairs, one per ``key_words`` entry;
	        - ``list_categories_tech`` lists each distinct joined technology path
	          in order of first appearance;
	        - ``dic_technologies`` maps each such path to the
	          ``(joined keywords, position)`` pairs found at its positions;
	        - ``dic_categories`` maps each position to three strings built from
	          the last three comma-separated keyword chunks (each cut to its first
	          three words, "CCUS" replaced by "carbon capture storage"): all of
	          them, all but the last, and all but the first.
	    """
	    tech_rows = technology("iea", False)
	    keyword_rows = key_words("iea", False)

	    list_technologies = [
	        (", ".join(row[1]), position) for position, row in enumerate(keyword_rows)
	    ]

	    dic_technologies = {}
	    list_categories_tech = []
	    for position, tech_row in enumerate(tech_rows):
	        label = ", ".join(tech_row[1])
	        if label not in dic_technologies:
	            list_categories_tech.append(label)
	            dic_technologies[label] = []
	        dic_technologies[label].append((", ".join(keyword_rows[position][1]), position))

	    def shorten(chunks):
	        # Keep at most the first three words of every chunk.
	        return ", ".join(" ".join(chunk.split()[:3]) for chunk in chunks)

	    dic_categories = {}
	    for joined_keywords, position in list_technologies:
	        tail = [
	            chunk.replace("CCUS", "carbon capture storage")
	            for chunk in joined_keywords.split(",")[-3:]
	        ]
	        dic_categories[position] = [shorten(tail), shorten(tail[:-1]), shorten(tail[1:])]

	    return dic_technologies, dic_categories, list_categories_tech, list_technologies