mapping_bert_topic_copy

Runtime error

File size: 2,205 Bytes

4767334
 
 
 
ba59cb9
b15874e
4767334
 
 
 
 
 
ba59cb9
8cd76cf
ba59cb9
467bca2
 
ba59cb9
467bca2
 
 
 
 
b15874e
467bca2
 
 
ba59cb9
467bca2
 
 
b15874e
467bca2
 
 
 
 
 
 
 
 
 
ca2a6a2
467bca2
 
 
 
 
 
 
 
 
b15874e
4767334
467bca2
c7a0b11

import functools
import pickle

import pandas as pd
from sentence_transformers import SentenceTransformer, util

from preprocess_function import preprocess_text
from topics_extraction import classify

# Sentence encoder used to embed the extracted tags before the neighbour lookup.
model_sent = SentenceTransformer("all-mpnet-base-v2")

# Pre-fitted neighbour models (presumably k-NN estimators, judging by the file
# names) for the sector and industry lookups.
# NOTE(security): unpickling executes arbitrary code embedded in the file, so
# these .sav files must only ever come from a trusted source.
# The files are opened in a `with` block so the handles are closed right after
# loading instead of being left for the garbage collector.
with open('sector_knn.sav', 'rb') as sector_file:
    sector_model = pickle.load(sector_file)
with open('indus_knn.sav', 'rb') as indus_file:
    indus_model = pickle.load(indus_file)

@functools.lru_cache(maxsize=None)
def _load_clean_names(sheet_name, path='sect_other.xlsx'):
    """Return the cleaned ``name`` column of one workbook sheet as a tuple.

    Cleaning removes ``&`` and ``,``, strips surrounding whitespace, expands
    the literal substring ``IT`` to ``information technology`` and lowercases
    the result. The outcome is cached per ``(sheet_name, path)`` so the
    workbook is parsed once per process rather than on every lookup; a tuple
    is returned so the shared cached value cannot be mutated by callers.

    NOTE(review): the ``IT`` replacement is a case-sensitive substring match,
    so it would also rewrite ``IT`` inside a longer upper-case word — confirm
    that no name in the workbook is affected.
    """
    names = pd.read_excel(path, sheet_name=sheet_name)['name']
    # regex=False: every pattern here is a plain literal, and being explicit
    # keeps the result independent of the pandas version's default.
    cleaned = (
        names.str.replace('&', '', regex=False)
        .str.strip()
        .str.replace('IT', 'information technology', regex=False)
        .str.replace(',', '', regex=False)
        .str.lower()
    )
    return tuple(cleaned.tolist())


def get_mapping(prep_text):
    """Map a text to sector and industry names via its extracted tags.

    The tags returned by ``classify`` are joined into one string, embedded
    with ``model_sent`` and passed to ``sector_model`` / ``indus_model`` to
    look up neighbours. Neighbour indices are translated to names by position
    in the ``sectors`` and ``other_indus`` sheets of ``sect_other.xlsx``
    (assumes the sheet row order matches the order the models were fitted
    on — TODO confirm).

    Args:
        prep_text: Text handed to ``classify``; presumably already
            preprocessed by the caller.

    Returns:
        A 5-tuple ``(topic_name, distance_name, topic_name_indus,
        distance_name_indus, tags_list)``:

        * ``topic_name`` - list with the cleaned names of the two sector
          neighbours.
        * ``distance_name`` - ``str()`` of the array of the two sector
          distances.
        * ``topic_name_indus`` - cleaned name of the single industry
          neighbour.
        * ``distance_name_indus`` - distance to that industry neighbour.
        * ``tags_list`` - the tags returned by ``classify``.

        When no tags are found the result is
        ``('no tags identified', None, None, None, tags_list)``.
    """
    n_sector_neighbors = 2
    n_industry_neighbors = 1

    tags_list = classify(prep_text)['tags']

    # Guard clause: an empty (or missing) tag list cannot be embedded.
    if not tags_list:
        return 'no tags identified', None, None, None, tags_list

    # All tags are embedded together as a single sentence.
    event_embedd = model_sent.encode([' '.join(tags_list)],
                                     batch_size=250,
                                     show_progress_bar=True)[0]

    sector_names = _load_clean_names('sectors')
    industry_names = _load_clean_names('other_indus')

    # Sectors: keep both neighbours and report their distances as a string.
    distances, indices = sector_model.kneighbors(
        [event_embedd], n_neighbors=n_sector_neighbors)
    topic_name = [sector_names[row] for row in indices[0]]
    distance_name = str(distances[0])

    # Industries: only the single closest neighbour is reported.
    distances_indus, indices_indus = indus_model.kneighbors(
        [event_embedd], n_neighbors=n_industry_neighbors)
    topic_name_indus = industry_names[indices_indus[0][0]]
    distance_name_indus = distances_indus[0][0]

    return topic_name, distance_name, topic_name_indus, distance_name_indus, tags_list