Synapse_project / Climate_site /python_scripts /companies_function_own_details.py
EmmaScharfmannBerkeley's picture
Update Climate_site/python_scripts/companies_function_own_details.py
5f11732
##packages code
import streamlit as st
import pandas as pd
import re
import json
from math import radians, cos, sin, asin, sqrt
import numpy as np
from sentence_transformers import SentenceTransformer
# NOTE(review): this instantiates the sentence-transformer eagerly at import
# time, outside any Streamlit cache. model_nlp() below builds the same
# 'all-MiniLM-L6-v2' model under @st.cache_resource, and get_similar_company()
# uses that one -- confirm whether anything still needs this global.
model = SentenceTransformer('all-MiniLM-L6-v2')
# Directory prefix prepended to the data file names opened by the loaders below.
path = 'Climate_site/python_scripts/'
from sentence_transformers import SentenceTransformer, util
@st.cache_resource
def model_nlp():
    """Create the 'all-MiniLM-L6-v2' SentenceTransformer used to embed text.

    The function is wrapped in ``st.cache_resource`` so Streamlit can hand
    back the same instance instead of rebuilding it on every call.

    Returns:
        The SentenceTransformer instance.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
@st.cache_data
def load_dic():
    """Load the pre-encoded venture-deal details from disk.

    Reads ``preqin_venturedealsdetails_encoded.json`` from the directory
    given by the module-level ``path``.

    Returns:
        The decoded JSON document. get_similar_company() indexes it by field
        name and treats each entry as a mapping whose values can be stacked
        into a numeric array.

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the file is not valid JSON.
    """
    # The context manager guarantees the handle is closed even when parsing
    # fails; JSON text is UTF-8, so do not depend on the platform default.
    with open(path + "preqin_venturedealsdetails_encoded.json", "r", encoding="utf-8") as f:
        return json.load(f)
@st.cache_data
def load_data():
    """Read the companies table from ``preqin_companies_IEA.tsv``.

    The file is looked up under the module-level ``path``, parsed as
    tab-separated text with its first column as the index, and the
    ``portfolio_company_id`` column is cast to ``str``.

    Returns:
        The resulting pandas DataFrame.
    """
    source = path + "preqin_companies_IEA.tsv"
    return pd.read_csv(source, delimiter="\t", index_col=0).astype(
        {'portfolio_company_id': 'str'}
    )
# Module-level data shared by the functions below, loaded at import time
# through the st.cache_data-wrapped loaders defined above.
# Companies table; 'portfolio_company_id' is a str column (see load_data).
table_companies = load_data()
# Decoded JSON document; get_similar_company() looks entries up by field.
dic_companies = load_dic()
def norm(vector):
    """Return the Euclidean (L2) norm of ``vector``.

    Args:
        vector: Iterable of numbers.

    Returns:
        The square root of the sum of the squared elements (0.0 for an
        empty iterable).
    """
    squares = (component * component for component in vector)
    sum_of_squares = sum(squares)
    return sqrt(sum_of_squares)
def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity of ``vec_a`` and ``vec_b``.

    The result is the dot product of the two vectors divided by the product
    of their Euclidean norms (computed with ``norm``).

    Args:
        vec_a: First sequence of numbers.
        vec_b: Second sequence of numbers.

    Returns:
        dot(vec_a, vec_b) / (norm(vec_a) * norm(vec_b)).

    Note:
        The dot product pairs elements with ``zip``, so trailing elements of
        a longer input do not contribute to it (they still count in that
        input's norm). There is no guard for a zero norm: the division then
        raises or yields a non-finite value depending on the element types.
    """
    denominator = norm(vec_a) * norm(vec_b)
    numerator = sum(a * b for a, b in zip(vec_a, vec_b))
    return numerator / denominator
def get_similar_company(field, description, size):
    """Return the ``size`` best-scoring companies for a text description.

    The description is embedded with the model from ``model_nlp()`` and
    scored against every stored vector of ``dic_companies[field]`` with a dot
    product. The top rows are then joined with ``table_companies``.

    Args:
        field: Key into the module-level ``dic_companies``. The entry is used
            as a mapping of id -> vector (presumably company id -> embedding;
            confirm against the JSON file).
        description: Text passed to the model's ``encode`` method.
        size: Number of highest-scoring rows to keep.

    Returns:
        A DataFrame ordered by descending ``score``, left-joined to
        ``table_companies`` on ``portfolio_company_id`` so that ids without a
        match in the table are still returned.

    Raises:
        KeyError: if ``field`` is not a key of ``dic_companies``.
    """
    encoder = model_nlp()
    companies = dic_companies[field]
    query_vector = np.array(encoder.encode(description))
    embedding_matrix = np.array(list(companies.values()))
    scores = np.dot(embedding_matrix, query_vector)

    # Build the score column directly, indexed by the stringified ids, rather
    # than creating a one-row frame with one column per company and
    # transposing it.
    ranked = pd.DataFrame(
        {"score": scores},
        index=[str(company_id) for company_id in companies],
    )
    top = ranked.sort_values("score", ascending=False).head(size)
    return top.merge(
        table_companies,
        left_index=True,
        right_on="portfolio_company_id",
        how="left",
    )