|
|
|
|
|
|
|
|
import streamlit as st |
|
|
from shapely.geometry import Point |
|
|
from math import radians, cos, sin, asin, sqrt |
|
|
import pandas as pd |
|
|
import re |
|
|
import json |
|
|
|
|
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
|
|
|
# Relative directory prefix that is prepended to the data file names
# ("big_ideas_contest.tsv", "iea.txt") opened by the functions in this module.
path = 'Climate_site/python_scripts/'
|
|
|
|
|
@st.cache_resource
def model_nlp():
    """Build the sentence-embedding model.

    The function is decorated with ``st.cache_resource``.

    Returns:
        The ``SentenceTransformer`` instance created for the
        ``'all-mpnet-base-v2'`` model name.
    """
    return SentenceTransformer('all-mpnet-base-v2')
|
|
|
|
|
@st.cache_data
def load_data():
    """Read ``big_ideas_contest.tsv`` into a dictionary of row dictionaries.

    The tab-separated file is looked up under the module-level ``path`` and
    parsed with its first column as the index.

    Returns:
        dict: ``{index_value: {column_name: cell_value, ...}, ...}`` as
        produced by ``DataFrame.to_dict('index')``.
    """
    tsv_location = path + "big_ideas_contest.tsv"
    table = pd.read_csv(tsv_location, sep="\t", index_col=0)
    return table.to_dict(orient='index')
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def clean_encoding(encoded_text):
    """Parse a stringified list of numbers back into a list of floats.

    The input is expected to look like ``"[0.1, -0.2, 3.0]"``: the first and
    last characters (the brackets) are dropped, embedded line breaks are
    removed, and the comma-separated items are converted with ``float``.

    Args:
        encoded_text: The textual list, or ``None``.

    Returns:
        ``None`` when ``encoded_text`` is ``None``; otherwise a list of
        floats (empty for ``"[]"``).

    Raises:
        ValueError: If an item cannot be converted to ``float``.
    """
    if encoded_text is None:
        return None

    # Remove line breaks first, then strip the surrounding brackets.
    inner = encoded_text.replace("\n", "")[1:-1]
    if not inner.strip():
        return []

    # float() ignores surrounding whitespace, so splitting on "," accepts
    # both "1, 2" and "1,2".
    return [float(item) for item in inner.split(",")]
|
|
|
|
|
|
|
|
def norm(vector):
    """Return the Euclidean length of ``vector`` (square root of the sum of squares)."""
    squared_total = 0
    for component in vector:
        squared_total += component * component
    return sqrt(squared_total)
|
|
|
|
|
def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of two numeric sequences.

    The dot product is taken over paired elements (``zip``), and each norm is
    the square root of the sum of squares of the whole sequence.

    Args:
        vec_a: First sequence of numbers.
        vec_b: Second sequence of numbers.

    Returns:
        ``dot(vec_a, vec_b) / (|vec_a| * |vec_b|)``, or ``0.0`` when either
        vector has zero length (the ratio is undefined there, and dividing
        would raise ``ZeroDivisionError``).
    """
    norm_a = sqrt(sum(a * a for a in vec_a))
    norm_b = sqrt(sum(b * b for b in vec_b))
    denominator = norm_a * norm_b
    if denominator == 0:
        return 0.0

    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    return dot / denominator
|
|
|
|
|
def print_extracted_text(name_file):
    """Print every line of ``iea.txt`` with surrounding whitespace stripped.

    Args:
        name_file: Not used; the file read is always ``path + 'iea.txt'``.
    """
    # The context manager closes the file even if printing fails.
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for line in file:
            print(line.strip())
|
|
|
|
|
|
|
|
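# iea.txt  -- NOTE(review): stray text here in the source; presumably the name of the input file parsed by the functions below.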
|
|
|
|
|
def details(name_file, display):
    """Collect the "details" paragraph of each numbered entry in ``iea.txt``.

    A *header* is a non-empty line whose first character is numeric and which
    contains both ``">"`` and a space; each header increments the entry
    index (starting at 0). Collection starts after any line whose last
    space-separated word is ``"Details"`` or ``"Hide"`` and stops at the next
    non-empty line starting with ``"*"``. Parsing stops at the line
    ``"Close explanation"``.

    Args:
        name_file: Not used; the file read is always ``path + "iea.txt"``.
        display: When truthy, print the index and text of every stored entry.

    Returns:
        dict: ``{entry_index: collected_text}`` where the text is the
        stripped lines joined with (and followed by) a single space.
    """
    # Read as utf8 like the other parsers of this file, and close the handle
    # as soon as the lines are in memory.
    with open(path + "iea.txt", "r", encoding='utf8') as file:
        lines = file.readlines()

    dic_details = {}
    collecting = False
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        if line != "" and line[0].isnumeric() and ">" in line and " " in line:
            count += 1

        # A "*..." line closes the paragraph currently being collected.
        if collecting and line != "" and line[0] == "*":
            if display:
                print(count)
                print(text)
                print(" ")
            dic_details[count] = text
            collecting = False

        if collecting:
            text = text + line + " "

        # The paragraph starts on the line after the "Details"/"Hide" marker.
        if line.split(" ")[-1] in ("Details", "Hide"):
            collecting = True
            text = ""

    return dic_details
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def key_initiatives(name_file, display):
    """Collect the "Key initiatives" section of each numbered entry in ``iea.txt``.

    A *header* is a non-empty line whose first character is numeric and which
    contains both ``">"`` and a space; each header increments the entry
    index (starting at 0). A section starts after the line
    ``"*Key initiatives:*"`` and ends at the next header, at
    ``"*Deployment targets:*"`` or at ``"*Announced development targets:*"``.
    Parsing stops at the line ``"Close explanation"``.

    Args:
        name_file: Not used; the file read is always ``path + 'iea.txt'``.
        display: When truthy, print the index and text of every stored entry.

    Returns:
        dict: ``{entry_index: section_text}`` where the text is the stripped
        lines joined with (and followed by) a single space. The index is the
        entry under which the section started.
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    section_end_markers = ("*Deployment targets:*", "*Announced development targets:*")
    dic_key_initiatives = {}

    def store(entry_index, section_text):
        if display:
            print(entry_index)
            print(section_text)
            print(" ")
        dic_key_initiatives[entry_index] = section_text

    collecting = False
    owner = -1
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
        if is_header:
            count += 1

        if collecting and (is_header or line in section_end_markers):
            # Store under the entry the section started in: when the section
            # is closed by the next header, ``count`` already points at the
            # following entry.
            store(owner, text)
            collecting = False

        if collecting:
            text = text + line + " "

        if line == "*Key initiatives:*":
            collecting = True
            owner = count
            text = ""

    # A section still open at the end of the parsed region belongs to the
    # last entry and would otherwise be lost.
    if collecting:
        store(owner, text)

    return dic_key_initiatives
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def deployment_target(name_file, display):
    """Collect the deployment/development target section of each entry in ``iea.txt``.

    A *header* is a non-empty line whose first character is numeric and which
    contains both ``">"`` and a space; each header increments the entry
    index (starting at 0). A section starts after ``"*Deployment targets:*"``
    or ``"*Announced development targets:*"`` and ends at the next header, at
    ``"*Announced cost reduction targets:*"`` or at
    ``"*Announced development targets:*"`` (which also starts a new section,
    replacing the text stored for the same entry). Parsing stops at the line
    ``"Close explanation"``.

    Args:
        name_file: Not used; the file read is always ``path + 'iea.txt'``.
        display: When truthy, print the index and text of every stored entry.

    Returns:
        dict: ``{entry_index: section_text}`` where the text is the stripped
        lines joined with (and followed by) a single space. The index is the
        entry under which the section started.
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    section_start_markers = ("*Deployment targets:*", "*Announced development targets:*")
    section_end_markers = ("*Announced cost reduction targets:*", "*Announced development targets:*")
    dic_target = {}

    def store(entry_index, section_text):
        if display:
            print(entry_index)
            print(section_text)
            print(" ")
        dic_target[entry_index] = section_text

    collecting = False
    owner = -1
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
        if is_header:
            count += 1

        if collecting and (is_header or line in section_end_markers):
            # Store under the entry the section started in: when the section
            # is closed by the next header, ``count`` already points at the
            # following entry.
            store(owner, text)
            collecting = False

        if collecting:
            text = text + line + " "

        if line in section_start_markers:
            collecting = True
            owner = count
            text = ""

    # A section still open at the end of the parsed region belongs to the
    # last entry and would otherwise be lost.
    if collecting:
        store(owner, text)

    return dic_target
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def cost_reduction_target(name_file, display):
    """Collect the "Announced cost reduction targets" section of each entry in ``iea.txt``.

    A *header* is a non-empty line whose first character is numeric and which
    contains both ``">"`` and a space; each header increments the entry
    index (starting at 0). A section starts after the line
    ``"*Announced cost reduction targets:*"`` and ends at the next header.
    Parsing stops at the line ``"Close explanation"``.

    Args:
        name_file: Not used; the file read is always ``path + 'iea.txt'``.
        display: When truthy, print the index and text of every stored entry.

    Returns:
        dict: ``{entry_index: section_text}`` where the text is the stripped
        lines joined with (and followed by) a single space. The index is the
        entry under which the section started.
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    dic_cost = {}

    def store(entry_index, section_text):
        if display:
            print(entry_index)
            print(section_text)
            print(" ")
        dic_cost[entry_index] = section_text

    collecting = False
    owner = -1
    text = ""
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        is_header = line != "" and line[0].isnumeric() and ">" in line and " " in line
        if is_header:
            count += 1

        if collecting and is_header:
            # The next header is the only terminator, and by now ``count``
            # points at that next entry, so store under the entry the
            # section started in.
            store(owner, text)
            collecting = False

        if collecting:
            text = text + line + " "

        if line == "*Announced cost reduction targets:*":
            collecting = True
            owner = count
            text = ""

    # The last entry has no following header to close its section.
    if collecting:
        store(owner, text)

    return dic_cost
|
|
|
|
|
|
|
|
|
|
|
def key_words(name_file, display):
    """Extract the ``">"``-separated category names of each header in ``iea.txt``.

    A *header* is a non-empty line whose first character is numeric and which
    contains both ``">"`` and a space; each header increments the entry
    index (starting at 0). For every header the third space-separated field
    is taken, optionally extended with the following line, cleaned up and
    split on ``">"``. Parsing stops at the line ``"Close explanation"``.

    NOTE(review): the separator literals (``" "``) are reproduced exactly as
    they appear in this file; confirm they match the separator actually used
    in ``iea.txt`` (they look like they may have been whitespace-normalized).

    Args:
        name_file: Not used; the file read is always ``path + 'iea.txt'``.
        display: When truthy, print progress and the parsed names.

    Returns:
        list: ``[[entry_index, [name, ...]], ...]`` in file order.
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    list_categories = []
    count = -1
    for index, raw_line in enumerate(lines):
        line = raw_line.strip()
        if line == "Close explanation":
            break

        if not (line != "" and line[0].isnumeric() and ">" in line and " " in line):
            continue

        count += 1
        if display:
            print("Technologies", count + 1, ":")

        try:
            line = line.split(" ")[2]
        except IndexError:
            # Header with fewer than three fields: report it and stop parsing.
            print(line)
            break

        if "Details" not in raw_line and "Moderate" not in raw_line:
            # The field may continue on the following line(s) until a line
            # starting with "Details" or "End-use" is reached.
            offset = 0
            while " " not in line:
                offset += 1
                if index + offset >= len(lines):
                    # Header is at the end of the file: nothing to append.
                    break
                following = lines[index + offset]
                if following[:7] in ("Details", "End-use"):
                    break
                line = line + " " + following

        line = line.replace("\n", " ")
        line = line.replace("/", " ")
        line = line.replace("-", " ")
        line = line.split(" ")[0]

        # NOTE(review): as written this replacement maps a space to a space;
        # kept verbatim, confirm the intended literals.
        if " " in line:
            line = line.replace(" ", " ")

        names = line.split(">")

        # Drop a trailing parenthesised remark from the last name.
        if "(" in names[-1]:
            names[-1] = names[-1].split("(")[0]

        names = [re.sub(' +', ' ', name).strip() for name in names]

        if display:
            print(names)
            print(" ")

        # Only the first empty name is removed.
        if '' in names:
            names.remove('')

        list_categories.append([count, names])

    return list_categories
|
|
|
|
|
|
|
|
|
|
|
def technology(name_file, display):
    """Extract the ``">"``-separated names from the second field of each header in ``iea.txt``.

    A *header* is a non-empty line whose first character is numeric and which
    contains both ``">"`` and a space; each header increments the entry
    index (starting at 0). For every header the second space-separated field
    is taken, cleaned up and split on ``">"``. Parsing stops at the line
    ``"Close explanation"``.

    NOTE(review): the separator literals (``" "``) are reproduced exactly as
    they appear in this file; confirm they match the separator actually used
    in ``iea.txt`` (they look like they may have been whitespace-normalized).

    Args:
        name_file: Not used; the file read is always ``path + 'iea.txt'``.
        display: When truthy, print progress and the parsed names.

    Returns:
        list: ``[[entry_index, [name, ...]], ...]`` in file order.
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    list_categories = []
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break

        if not (line != "" and line[0].isnumeric() and ">" in line and " " in line):
            continue

        count += 1
        if display:
            print("Technologies", count + 1, ":")

        try:
            line = line.split(" ")[1]
        except IndexError:
            # Header with fewer than two fields: report it and stop parsing.
            print(line)
            break

        line = line.replace("\n", " ")
        line = line.replace("/", " ")
        line = line.replace("-", " ")
        line = line.strip()
        line = re.sub(' +', ' ', line)
        line = line.split(" ")[0]
        names = line.split(">")

        # Drop a trailing parenthesised remark from the last name.
        if "(" in names[-1]:
            names[-1] = names[-1].split("(")[0]

        names = [re.sub(' +', ' ', name).strip() for name in names]

        if display:
            print(names)
            print(" ")

        list_categories.append([count, names])

    return list_categories
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def extract_quantitative_data_technology(technologies, number_technology):
    """Return the details, deployment-target and cost-target texts of one entry.

    The three texts are looked up in the dictionaries produced by
    ``details``, ``deployment_target`` and ``cost_reduction_target``.

    Args:
        technologies: Not used by this function.
        number_technology: Entry index used as the key in all three lookups.

    Returns:
        tuple: ``(reference_text, cost_target_text, cost_text)``; any text
        that is missing for the entry is ``'No information'``.
    """
    name_file = "iea"
    dic_target = deployment_target(name_file, False)
    dic_cost = cost_reduction_target(name_file, False)
    dic_details = details(name_file, False)

    missing = 'No information'
    # Every value gets a default so an entry without a details paragraph no
    # longer fails with an unbound ``reference_text``.
    reference_text = dic_details.get(number_technology, missing)
    cost_target_text = dic_target.get(number_technology, missing)
    cost_text = dic_cost.get(number_technology, missing)

    return reference_text, cost_target_text, cost_text
|
|
|
|
|
|
|
|
|
|
|
def big_ideas_encoding(number_technology, threshold=0.45):
    """Find contest projects whose description is similar to one entry's details text.

    The details text of entry ``number_technology`` is encoded with the model
    from ``model_nlp`` and compared (cosine similarity) with the stored
    ``encoded_description`` of every record returned by ``load_data``.
    Records without an ``encoded_description`` are skipped.

    Args:
        number_technology: Entry index into the dictionary returned by
            ``details``.
        threshold: A record is kept when its score is strictly greater than
            this value. Defaults to 0.45.

    Returns:
        A ``pandas.DataFrame`` of the kept records sorted by descending
        ``score`` with the columns ``Project_name``, ``Description``,
        ``Project_date``, ``University``, ``Field``, ``Team_members`` and
        ``score``; or the string ``"No related project found"`` when no
        record is kept.

    Raises:
        KeyError: If ``number_technology`` has no details text.
    """
    model = model_nlp()
    dic_big_ideas = load_data()
    dic_details = details('iea', False)

    IEA_encoding = model.encode(
        dic_details[number_technology],
        convert_to_tensor=False,
        show_progress_bar=False,
    ).tolist()

    dic_matches = {}
    for record in dic_big_ideas.values():
        encoded_description = record['encoded_description']
        if pd.isna(encoded_description):
            continue

        score = cosine_similarity2(IEA_encoding, clean_encoding(encoded_description))
        if score > threshold:
            # Copy the record so the loaded data is not modified in place.
            dic_matches[len(dic_matches)] = {**record, "score": score}

    if not dic_matches:
        return "No related project found"

    columns = ["Project_name", "Description", "Project_date", "University", "Field", "Team_members", "score"]
    return pd.DataFrame(dic_matches).T.sort_values("score", ascending=False)[columns]
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def finder():
    """Build the lookup structures derived from ``technology`` and ``key_words``.

    Returns:
        tuple: ``(dic_technologies, dic_categories, list_categories_tech,
        list_technologies)`` where

        * ``list_technologies`` is ``[(joined_key_words, position), ...]``
          with the names of each ``key_words`` entry joined by ``", "``;
        * ``list_categories_tech`` lists the distinct joined ``technology``
          names in first-seen order;
        * ``dic_technologies`` maps each of those joined names to the
          ``(joined_key_words, position)`` tuples found at the same
          positions;
        * ``dic_categories`` maps each position to three strings built from
          the last three comma-separated parts of the joined key words (all
          of them, all but the last, all but the first), each part cut to
          its first three words and with ``"CCUS"`` replaced by
          ``"carbon capture storage"``.
    """
    res = technology("iea", False)
    list_categories = key_words("iea", False)

    list_technologies = [
        (", ".join(entry[1]), position)
        for position, entry in enumerate(list_categories)
    ]

    list_categories_tech = []
    dic_technologies = {}
    for position, tech_entry in enumerate(res):
        group_label = ", ".join(tech_entry[1])
        if group_label not in list_categories_tech:
            list_categories_tech.append(group_label)
            dic_technologies[group_label] = []
        dic_technologies[group_label].append(
            (", ".join(list_categories[position][1]), position)
        )

    def first_three_words(parts):
        # Keep at most three words of every part, then re-join the parts.
        return ", ".join(" ".join(part.split()[:3]) for part in parts)

    dic_categories = {}
    for label, position in list_technologies:
        tail = [
            part.replace("CCUS", "carbon capture storage")
            for part in label.split(",")[-3:]
        ]
        dic_categories[position] = [
            first_three_words(tail),
            first_three_words(tail[:-1]),
            first_three_words(tail[1:]),
        ]

    return dic_technologies, dic_categories, list_categories_tech, list_technologies
|
|
|