##packages code
import streamlit as st
from shapely.geometry import Point
from math import radians, cos, sin, asin, sqrt
import pandas as pd
import re
import json
from sentence_transformers import SentenceTransformer, util

path = 'Climate_site/python_scripts/'


@st.cache_resource
def model_nlp():
    """Return the 'all-mpnet-base-v2' SentenceTransformer model.

    Decorated with ``st.cache_resource``.
    """
    model = SentenceTransformer('all-mpnet-base-v2')
    return model


@st.cache_data
def load_data():
    """Load ``big_ideas_contest.tsv`` from ``path``.

    Returns:
        A dict keyed by the first column of the TSV; each value is a dict
        mapping column name to the cell value of that row.
    """
    url = path + "big_ideas_contest.tsv"
    dic = pd.read_csv(url, delimiter="\t", index_col=0).to_dict('index')
    return dic


#################### General Functions #############################

def clean_encoding(encoded_text):
    """Parse a stringified vector such as ``"[0.1, 0.2]"`` into floats.

    Newlines are removed, the first and last characters are dropped
    (presumably the enclosing brackets) and the rest is split on ``", "``.

    Args:
        encoded_text: The stringified vector, or ``None``.

    Returns:
        ``None`` when ``encoded_text`` is ``None``; an empty list when nothing
        is left between the first and last characters; otherwise the list of
        floats.

    Raises:
        ValueError: If an element cannot be converted to ``float``.
    """
    if encoded_text is None:
        return None
    body = encoded_text.replace("\n", "")[1:-1]
    if not body.strip():
        # An empty encoding ("[]") used to crash on float("").
        return []
    return list(map(float, body.split(", ")))


def norm(vector):
    """Return the Euclidean norm of ``vector``."""
    return sqrt(sum(x * x for x in vector))


def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of two numeric sequences.

    Only the overlapping prefix contributes to the dot product (``zip``
    stops at the shorter input). Returns ``0.0`` when the product of the
    norms is zero, instead of raising ``ZeroDivisionError``.
    """
    denominator = norm(vec_a) * norm(vec_b)
    if denominator == 0:
        return 0.0
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    return dot / denominator


def print_extracted_text(name_file):
    """Print every line of ``iea.txt`` with surrounding whitespace stripped.

    Args:
        name_file: Currently ignored; the file name is hard-coded.
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for line in file:
            print(line.strip())


# iea.txt

# NOTE(review): several whitespace literals in the parsers below are a single
# space in this source. If iea.txt separates fields with tabs or runs of
# spaces, these literals must match that separator - confirm against the file.

def _is_technology_header(line):
    """Return True for a non-empty line starting with a numeric character
    and containing both ">" and a space."""
    return line != "" and line[0].isnumeric() and ">" in line and " " in line


def _collect_sections(starts_section, ends_section, display):
    """Scan ``iea.txt`` and collect text sections keyed by a header counter.

    The counter starts at -1 and is incremented on every technology header
    line. Scanning stops at the line "Close explanation".

    Args:
        starts_section: Callable(line) -> bool; a matching line opens a
            section and resets the accumulated text.
        ends_section: Callable(line) -> bool; only called on non-empty lines
            while a section is open. A matching line stores the accumulated
            text under the current counter.
        display: When truthy, print the counter and text of each stored
            section.

    Returns:
        Dict mapping counter value to the accumulated text (stripped lines
        joined with a trailing space each).
    """
    sections = {}
    collecting = False
    text = ""
    count = -1
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        for raw_line in file:
            line = raw_line.strip()
            if line == "Close explanation":
                break
            # NOTE(review): the counter advances before an open section is
            # closed, so a section closed by the next header line is stored
            # under that next header's index, and a section still open at
            # "Close explanation" / end of file is never stored. Kept as in
            # the original - confirm this is intended.
            if _is_technology_header(line):
                count += 1
            if collecting and line != "" and ends_section(line):
                if display:
                    print(count)
                    print(text)
                    print(" ")
                sections[count] = text
                collecting = False
            if collecting:
                text = text + line + " "
            if starts_section(line):
                collecting = True
                text = ""
    return sections


def details(name_file, display):
    """Collect the text that follows a line ending in "Details" or "Hide".

    A section ends at the next non-empty line that starts with "*".

    Args:
        name_file: Currently ignored; ``iea.txt`` is hard-coded.
        display: When truthy, print each stored section.

    Returns:
        Dict mapping the technology counter to the section text.
    """
    return _collect_sections(
        lambda line: line.split(" ")[-1] in ("Details", "Hide"),
        lambda line: line[0] == "*",
        display,
    )


def key_initiatives(name_file, display):
    """Collect the text that follows each "*Key initiatives:*" line.

    A section ends at a technology header line, "*Deployment targets:*" or
    "*Announced development targets:*".

    Args:
        name_file: Currently ignored; ``iea.txt`` is hard-coded.
        display: When truthy, print each stored section.

    Returns:
        Dict mapping the technology counter to the section text.
    """
    stop_markers = ("*Deployment targets:*", "*Announced development targets:*")
    return _collect_sections(
        lambda line: line == "*Key initiatives:*",
        lambda line: _is_technology_header(line) or line in stop_markers,
        display,
    )


def deployment_target(name_file, display):
    """Collect the text following "*Deployment targets:*" or
    "*Announced development targets:*".

    A section ends at a technology header line,
    "*Announced cost reduction targets:*" or
    "*Announced development targets:*" (the latter also opens a new section,
    which may overwrite the entry stored for the same counter).

    Args:
        name_file: Currently ignored; ``iea.txt`` is hard-coded.
        display: When truthy, print each stored section.

    Returns:
        Dict mapping the technology counter to the section text.
    """
    start_markers = ("*Deployment targets:*", "*Announced development targets:*")
    stop_markers = (
        "*Announced cost reduction targets:*",
        "*Announced development targets:*",
    )
    return _collect_sections(
        lambda line: line in start_markers,
        lambda line: _is_technology_header(line) or line in stop_markers,
        display,
    )


def cost_reduction_target(name_file, display):
    """Collect the text following "*Announced cost reduction targets:*".

    A section ends only at the next technology header line.

    Args:
        name_file: Currently ignored; ``iea.txt`` is hard-coded.
        display: When truthy, print each stored section.

    Returns:
        Dict mapping the technology counter to the section text.
    """
    return _collect_sections(
        lambda line: line == "*Announced cost reduction targets:*",
        _is_technology_header,
        display,
    )


def _split_category_path(text):
    """Split ``text`` on ">", drop anything from "(" onward in the last
    part, then collapse repeated spaces and strip every part."""
    parts = text.split(">")
    if "(" in parts[-1]:
        parts[-1] = parts[-1].split("(")[0]
    return [re.sub(' +', ' ', part).strip() for part in parts]


def key_words(name_file, display):
    """Extract a list of category words for every technology header line.

    Args:
        name_file: Currently ignored; ``iea.txt`` is hard-coded.
        display: When truthy, print progress and each extracted list.

    Returns:
        List of ``[counter, parts]`` pairs, one per technology header, where
        ``parts`` is the ">"-separated field with empty strings removed
        (first occurrence only, as in the original).
    """
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    list_categories = []
    count = -1
    for index, raw_line in enumerate(lines):
        line = raw_line.strip()
        if line == "Close explanation":
            break
        if not _is_technology_header(line):
            continue
        count += 1
        if display:
            print("Technologies", count + 1, ":")
        try:
            line = line.split(" ")[2]
        except IndexError:
            # Header has fewer than three fields: report it and stop parsing.
            print(line)
            break
        if "Details" not in raw_line and "Moderate" not in raw_line:
            # The header appears to continue on following lines: append them
            # until the separator shows up or a "Details"/"End-use" line
            # is reached.
            offset = 0
            while " " not in line:
                offset += 1
                if index + offset >= len(lines):
                    # Ran off the end of the file; previously an IndexError.
                    break
                following = lines[index + offset]
                if "Details" == following[:7] or "End-use" == following[:7]:
                    break
                line = line + " " + following
        #if " Production" in line:
            #line = line.replace(" Production" , "")
        line = line.replace("\n", " ")
        line = line.replace("/", " ")
        line = line.replace("-", " ")
        line = line.split(" ")[0]
        if " " in line:
            line = line.replace(" ", " ")
        parts = _split_category_path(line)
        if display:
            print(parts)
            print(" ")
        if '' in parts:
            parts.remove('')
        list_categories.append([count, parts])
    return list_categories


def technology(name_file, display):
    """Extract the technology path for every technology header line.

    Args:
        name_file: Currently ignored; ``iea.txt`` is hard-coded.
        display: When truthy, print progress and each extracted list.

    Returns:
        List of ``[counter, parts]`` pairs, one per technology header, where
        ``parts`` is the ">"-separated second field of the header.
    """
    # Filepath too specific, need to change to relative path
    with open(path + 'iea.txt', "r", encoding='utf8') as file:
        lines = file.readlines()

    list_categories = []
    count = -1
    for raw_line in lines:
        line = raw_line.strip()
        if line == "Close explanation":
            break
        if not _is_technology_header(line):
            continue
        count += 1
        if display:
            print("Technologies", count + 1, ":")
        try:
            line = line.split(" ")[1]
        except IndexError:
            print(line)
            break
        line = line.replace("\n", " ")
        line = line.replace("/", " ")
        line = line.replace("-", " ")
        line = line.strip()
        line = re.sub(' +', ' ', line)
        line = line.split(" ")[0]
        parts = _split_category_path(line)
        if display:
            print(parts)
            print(" ")
        list_categories.append([count, parts])
    return list_categories


#################### Contest Functions #############################

def extract_quantitative_data_technology(technologies, number_technology):
    """Return the details, deployment-target and cost-target texts of one
    technology.

    Args:
        technologies: Unused.
        number_technology: Technology counter used as key into the parsed
            dictionaries.

    Returns:
        Tuple ``(reference_text, cost_target_text, cost_text)``; each element
        is ``'No information'`` when the corresponding dictionary has no
        entry for ``number_technology``.
    """
    name_file = "iea"
    dic_target = deployment_target(name_file, False)
    dic_cost = cost_reduction_target(name_file, False)
    dic_details = details(name_file, False)

    missing = 'No information'
    # reference_text previously had no default and raised UnboundLocalError
    # when the technology had no details entry.
    reference_text = dic_details.get(number_technology, missing)
    cost_target_text = dic_target.get(number_technology, missing)
    cost_text = dic_cost.get(number_technology, missing)
    return reference_text, cost_target_text, cost_text


def big_ideas_encoding(number_technology):
    """Find contest projects whose description resembles a technology.

    The details text of ``number_technology`` is encoded with the model from
    ``model_nlp`` and compared, by cosine similarity, with the
    ``'encoded_description'`` of every row from ``load_data``; rows scoring
    above 0.45 are kept.

    Args:
        number_technology: Technology counter used as key into ``details``.

    Returns:
        A DataFrame of the matches sorted by descending ``score`` with the
        columns Project_name, Description, Project_date, University, Field,
        Team_members and score, or the string ``"No related project found"``
        when nothing matches.

    Raises:
        KeyError: If ``details`` has no entry for ``number_technology``.
    """
    model = model_nlp()
    dic_big_ideas = load_data()
    dic_details = details('iea', False)

    IEA_encoding = model.encode(
        dic_details[number_technology],
        convert_to_tensor=False,
        show_progress_bar=False,
    ).tolist()

    dic_matches = {}
    for project in dic_big_ideas.values():
        encoded = project['encoded_description']
        if pd.isna(encoded):
            continue
        score = cosine_similarity2(IEA_encoding, clean_encoding(encoded))
        if score > 0.45:
            # Copy the row so the dict returned by load_data is not mutated.
            dic_matches[len(dic_matches)] = {**project, "score": score}

    if not dic_matches:
        return "No related project found"
    columns = ["Project_name", "Description", "Project_date", "University",
               "Field", "Team_members", "score"]
    return pd.DataFrame(dic_matches).T.sort_values("score", ascending=False)[columns]
################################### Extracted texts ###############################################################
#@title Which patents are related to the technology?

def finder():
    """Build the lookup structures that relate technologies to key words.

    Calls ``technology("iea", False)`` and ``key_words("iea", False)`` and
    pairs their results by position.

    Returns:
        Tuple ``(dic_technologies, dic_categories, list_categories_tech,
        list_technologies)`` where:

        * ``list_technologies`` is a list of ``(label, index)`` tuples, the
          label being the ``", "``-joined key words of entry ``index``;
        * ``list_categories_tech`` lists the distinct ``", "``-joined
          technology labels in first-seen order;
        * ``dic_technologies`` maps each technology label to the
          ``(label, index)`` tuples of the entries sharing it;
        * ``dic_categories`` maps each index to three query strings built
          from the last three comma-separated key words (all of them, all
          but the last, all but the first), each key word cut to its first
          three whitespace-separated tokens and with "CCUS" replaced by
          "carbon capture storage".

    Raises:
        IndexError: If ``key_words`` returns fewer entries than
            ``technology``.
    """
    tech_rows = technology("iea", False)
    keyword_rows = key_words("iea", False)

    list_technologies = [
        (", ".join(words), position)
        for position, (_, words) in enumerate(keyword_rows)
    ]

    dic_technologies = {}
    list_categories_tech = []
    for position, (_, names) in enumerate(tech_rows):
        label = ", ".join(names)
        if label not in dic_technologies:
            list_categories_tech.append(label)
            dic_technologies[label] = []
        dic_technologies[label].append(list_technologies[position])

    def shorten(words):
        # Keep at most the first three tokens of every key word.
        return ", ".join(" ".join(word.split()[:3]) for word in words)

    dic_categories = {}
    for label, position in list_technologies:
        words = [
            word.replace("CCUS", "carbon capture storage")
            for word in label.split(",")[-3:]
        ]
        dic_categories[position] = [
            shorten(words),
            shorten(words[:-1]),
            shorten(words[1:]),
        ]

    return dic_technologies, dic_categories, list_categories_tech, list_technologies