## packages
import json
import re
from math import radians, cos, sin, asin, sqrt

import numpy as np
import pandas as pd
import streamlit as st
from sentence_transformers import SentenceTransformer, util

# Prefix prepended to every data file name opened by this module.
path = 'Climate_site/python_scripts/'


@st.cache_resource
def model_nlp():
    """Return the 'all-MiniLM-L6-v2' SentenceTransformer model.

    Decorated with ``st.cache_resource`` so the model object is created once
    and the same instance is handed back on later calls.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')


# Module-level handle kept for backward compatibility with code that reads
# ``model`` from this module. It reuses the cached loader so the weights are
# not loaded a second time outside the cache.
model = model_nlp()


@st.cache_data
def load_dic():
    """Load and return the parsed content of the encoded deals JSON file.

    ``get_similar_company`` indexes the result by field name and treats each
    entry as a mapping of company id -> numeric vector (presumably
    pre-computed description embeddings; confirm against the file producer).
    """
    # The context manager guarantees the handle is closed even if parsing
    # fails; JSON text is UTF-8 by specification.
    with open(
        path + "preqin_venturedealsdetails_encoded.json", "r", encoding="utf-8"
    ) as f:
        return json.load(f)


@st.cache_data
def load_data():
    """Load the companies table from its tab-separated file.

    The first column of the file is used as the index, and
    ``portfolio_company_id`` is cast to ``str`` so it can be matched against
    the string keys used as the left index in ``get_similar_company``.
    """
    url = path + "preqin_companies_IEA.tsv"
    table = pd.read_csv(url, delimiter="\t", index_col=0)
    return table.astype({'portfolio_company_id': 'str'})


table_companies = load_data()
dic_companies = load_dic()


def norm(vector):
    """Return the Euclidean (L2) norm of *vector*."""
    return sqrt(sum(x * x for x in vector))


def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of *vec_a* and *vec_b*.

    Returns 0.0 when either vector has zero norm (empty or all zeros), where
    the cosine is undefined, instead of dividing by zero.
    """
    norm_a = norm(vec_a)
    norm_b = norm(vec_b)
    if norm_a == 0 or norm_b == 0:
        return 0.0
    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    return dot / (norm_a * norm_b)


def get_similar_company(field, description, size):
    """Rank the companies of *field* against a free-text description.

    Args:
        field: Key into ``dic_companies`` selecting the company -> vector
            mapping to search.
        description: Text passed to the sentence model's ``encode``.
        size: Maximum number of top-scoring companies to return.

    Returns:
        A DataFrame with a ``score`` column holding the *size* highest
        scores in descending order, left-joined with ``table_companies`` on
        ``portfolio_company_id``.

    Raises:
        KeyError: If *field* is not a key of ``dic_companies``.
    """
    encoder = model_nlp()
    companies = dic_companies[field]
    query_vector = np.array(encoder.encode(description))

    # One dot product per company in a single matrix-vector product.
    # NOTE(review): this equals cosine similarity only if both the stored
    # vectors and the query vector are unit-normalised - confirm.
    company_matrix = np.array(list(companies.values()))
    scores = np.dot(company_matrix, query_vector)

    # Index by the stringified company id so the merge below can match it
    # against the string-typed ``portfolio_company_id`` column.
    ranked = pd.DataFrame(
        {"score": scores}, index=[str(key) for key in companies]
    )
    top = ranked.sort_values("score", ascending=False).head(size)
    return top.merge(
        table_companies,
        left_index=True,
        right_on="portfolio_company_id",
        how="left",
    )