|
|
|
|
|
|
|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import re |
|
|
import json |
|
|
from math import radians, cos, sin, asin, sqrt |
|
|
import numpy as np |
|
|
|
|
|
from sentence_transformers import SentenceTransformer |
|
|
|
|
|
# Sentence-embedding model built eagerly when the module is imported.
# NOTE(review): model_nlp() further down builds the same model behind
# st.cache_resource, so this eager copy looks redundant — confirm nothing
# else reads the module-level `model` before removing it.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Directory prefix (relative path) prepended to every data file name
# opened by the loaders in this module.
path = 'Climate_site/python_scripts/'
|
|
|
|
|
|
|
|
from sentence_transformers import SentenceTransformer, util |
|
|
|
|
|
@st.cache_resource
def model_nlp():
    """Return the 'all-MiniLM-L6-v2' SentenceTransformer instance.

    The function is wrapped in ``st.cache_resource`` so that Streamlit can
    hand back the already-built object instead of constructing the model
    again on each call.
    """
    return SentenceTransformer('all-MiniLM-L6-v2')
|
|
|
|
|
|
|
|
@st.cache_data
def load_dic():
    """Load the pre-encoded company dictionary from its JSON file.

    Reads ``preqin_venturedealsdetails_encoded.json`` from the module-level
    ``path`` directory and returns the parsed object.

    NOTE(review): get_similar_company() indexes the result by field and then
    treats the inner mapping as {company id: vector} — confirm that schema
    against the file itself.

    Returns:
        The object decoded by ``json.load``.
    """
    # The context manager closes the handle even if parsing fails; the
    # previous version never closed it. JSON text is UTF-8, so pin the
    # encoding rather than relying on the platform default.
    with open(path + "preqin_venturedealsdetails_encoded.json", "r", encoding="utf-8") as f:
        return json.load(f)
|
|
|
|
|
@st.cache_data
def load_data():
    """Read the companies table from ``preqin_companies_IEA.tsv``.

    The file is parsed as tab-separated text with its first column used as
    the index, and the ``portfolio_company_id`` column is cast to ``str``.

    Returns:
        pandas.DataFrame: the table with string company ids.
    """
    companies = pd.read_csv(
        path + "preqin_companies_IEA.tsv",
        delimiter="\t",
        index_col=0,
    )
    # get_similar_company() joins on this column using string keys, so the
    # ids have to be strings here as well.
    return companies.astype({'portfolio_company_id': 'str'})
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Module-level datasets shared by the functions below. Both loaders are
# decorated with st.cache_data.

# Companies table, with `portfolio_company_id` stored as strings.
table_companies = load_data()

# Parsed content of the pre-encoded JSON file.
dic_companies = load_dic()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def norm(vector):
    """Return the Euclidean (L2) norm of *vector*.

    Args:
        vector: iterable of numbers.

    Returns:
        The square root of the sum of the squared components; ``0.0`` for
        an empty iterable.
    """
    squares = [component * component for component in vector]
    return sqrt(sum(squares))
|
|
|
|
|
def cosine_similarity2(vec_a, vec_b):
    """Return the cosine similarity between two numeric vectors.

    Args:
        vec_a: re-iterable sequence of numbers (it is traversed twice).
        vec_b: re-iterable sequence of numbers (it is traversed twice).

    Returns:
        ``dot(vec_a, vec_b) / (|vec_a| * |vec_b|)``. When either vector has
        zero length or zero magnitude the similarity is undefined and
        ``0.0`` is returned instead of dividing by zero.

    Note:
        The dot product pairs components with ``zip``, so if the vectors
        differ in length the extra components of the longer one are ignored
        in the dot product (but still count towards its norm).
    """
    norm_a = sqrt(sum(a * a for a in vec_a))
    norm_b = sqrt(sum(b * b for b in vec_b))
    denominator = norm_a * norm_b

    # Guard the division: an empty or all-zero vector has no direction.
    if denominator == 0:
        return 0.0

    dot = sum(a * b for a, b in zip(vec_a, vec_b))
    return dot / denominator
|
|
|
|
|
|
|
|
|
|
|
def get_similar_company(field, description, size):
    """Rank the stored companies of one field against a text description.

    The description is encoded with the model returned by ``model_nlp()``
    and scored against every vector stored under ``dic_companies[field]``
    using a dot product.

    NOTE(review): a dot product equals cosine similarity only when both the
    stored vectors and the encoded description are unit-normalised — confirm
    that holds for the stored data.

    Args:
        field: key into ``dic_companies``; an unknown key raises ``KeyError``.
        description: text passed to ``model.encode``.
        size: number of best-scoring companies to keep.

    Returns:
        pandas.DataFrame: the ``size`` highest-scoring companies (column
        ``score``, descending), left-joined with ``table_companies`` on
        ``portfolio_company_id``.
    """
    model = model_nlp()
    companies = dic_companies[field]

    encoded_description = model.encode(description)

    # One matrix-vector product gives the score of every company at once.
    scores = np.dot(np.array(list(companies.values())), np.array(encoded_description))

    # Build the single-column frame directly. The earlier dict-of-dicts plus
    # transpose produced the same frame, but only after materialising one
    # column per company.
    ranked = pd.DataFrame(
        {"score": scores},
        index=[str(company_id) for company_id in companies],
    )
    best = ranked.sort_values("score", ascending=False).head(size)

    # The frame's index holds the company ids as strings, matching the
    # string-typed `portfolio_company_id` column of the companies table.
    return best.merge(
        table_companies,
        left_index=True,
        right_on="portfolio_company_id",
        how="left",
    )
|
|
|
|
|
|
|
|
|