Spaces:

harshithakr
/

mapping_bert_topic

Runtime error

File size: 2,622 Bytes

114e5db
b994c8d
114e5db
2cad3f7
114e5db
 
87a3999
241a7ed
bf78592
114e5db
 
2cad3f7
114e5db
 
 
87a3999
114e5db
 
 
241a7ed
 
 
 
375f9e5
241a7ed
 
114e5db
 
 
 
 
 
 
 
 
 
 
 
 
241a7ed
 
 
 
 
 
114e5db
241a7ed
 
 
 
 
 
 
114e5db
241a7ed

import pickle
import pandas as pd
from sentence_transformers import SentenceTransformer, util
from preprocess_function import preprocess_text
# Sentence encoder used by get_mapping to embed the incoming text.
model_sent = SentenceTransformer("all-mpnet-base-v2")


def _load_pickle(path):
    """Unpickle and return the object stored at *path*, closing the file afterwards."""
    # NOTE: pickle.load can execute arbitrary code embedded in the file;
    # only ever point this at model files shipped with the application.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Pre-fitted models; get_mapping queries each of them through `.kneighbors`.
sector_model = _load_pickle('sector_knn.sav')
indus_model = _load_pickle('indus_knn.sav')
other_indus_model = _load_pickle('other_industries_knn.sav')

def _clean_sector_names(names):
    """Return the cleaned form of a ``name`` column from ``sect_other.xlsx``.

    Applied in order: drop '&', strip surrounding whitespace, expand 'IT' to
    'information technology', drop commas, lower-case.
    """
    # regex=False: every pattern is a literal, and the default value of
    # `regex` differs between pandas versions.
    # NOTE(review): 'IT' is matched as a plain substring, so it would also be
    # expanded inside a longer upper-case word - confirm against the sheet data.
    return (
        names.str.replace('&', '', regex=False)
        .str.strip()
        .str.replace('IT', 'information technology', regex=False)
        .str.replace(',', '', regex=False)
        .str.lower()
    )


def _clean_other_industry_names(names):
    """Return the cleaned form of the ``Industry`` column of ``other_industries.csv``.

    Each of '-', '/', '(', ')' and '&' becomes a space; the result is then
    stripped and lower-cased.
    """
    cleaned = names
    for symbol in ('-', '/', '(', ')', '&'):
        # Must be `.str.replace`: plain `Series.replace` only matches cells
        # that are exactly equal to the symbol, not occurrences inside a name.
        cleaned = cleaned.str.replace(symbol, ' ', regex=False)
    return cleaned.str.strip().str.lower()


def _nearest_labels(model, embedding, labels, k):
    """Query *model* for the *k* nearest neighbours of *embedding*.

    Returns ``(names, distances)``: the entries of *labels* at the returned
    indices, and the matching row of distances as returned by ``kneighbors``.
    """
    distances, indices = model.kneighbors([embedding], n_neighbors=k)
    return [labels[i] for i in indices[0]], distances[0]


def get_mapping(prep_text):
    """Map a text to its nearest sector, industry and "other industry" labels.

    The text is passed through ``preprocess_text``, embedded with
    ``model_sent`` and looked up in the three module-level neighbour models.
    Label names are read from ``sect_other.xlsx`` (sheets ``sectors`` and
    ``other_indus``) and ``other_industries.csv`` on every call.

    Args:
        prep_text: The text to classify.

    Returns:
        A 6-tuple ``(topic_name, distance_name, topic_name_indus,
        distance_name_indus, topic_name_other, distance_name_other)``:

        * ``topic_name`` / ``distance_name``: ``str()`` of the list of the 2
          nearest sector names and ``str()`` of their distances.
        * ``topic_name_indus`` / ``distance_name_indus``: the single nearest
          industry name and its distance (not stringified).
        * ``topic_name_other`` / ``distance_name_other``: ``str()`` of the
          list of the 3 nearest "other industry" names and ``str()`` of their
          distances.
    """
    embeddings = model_sent.encode([preprocess_text(prep_text)],
                                   batch_size=250,
                                   show_progress_bar=True)
    event_embedd = embeddings[0]

    # Read both sheets in a single pass instead of opening the workbook twice.
    sheets = pd.read_excel('sect_other.xlsx',
                           sheet_name=['sectors', 'other_indus'])
    # Build each label list once rather than calling .tolist() per neighbour.
    sector_labels = _clean_sector_names(sheets['sectors']['name']).tolist()
    industry_labels = _clean_sector_names(sheets['other_indus']['name']).tolist()
    other_labels = _clean_other_industry_names(
        pd.read_csv('other_industries.csv')['Industry']
    ).tolist()

    # Sectors: two nearest neighbours, reported as strings.
    sector_names, sector_distances = _nearest_labels(
        sector_model, event_embedd, sector_labels, 2)
    topic_name = str(sector_names)
    distance_name = str(sector_distances)

    # Industries: only the single nearest neighbour, reported as raw values.
    industry_names, industry_distances = _nearest_labels(
        indus_model, event_embedd, industry_labels, 1)
    topic_name_indus = industry_names[0]
    distance_name_indus = industry_distances[0]

    # Other industries: three nearest neighbours, reported as strings.
    other_names, other_distances = _nearest_labels(
        other_indus_model, event_embedd, other_labels, 3)
    topic_name_other = str(other_names)
    distance_name_other = str(other_distances)

    return (topic_name, distance_name, topic_name_indus, distance_name_indus,
            topic_name_other, distance_name_other)