harshithakr committed on
Commit
467bca2
·
1 Parent(s): ba59cb9

Update mapping.py

Browse files
Files changed (1) hide show
  1. mapping.py +48 -43
mapping.py CHANGED
@@ -13,49 +13,54 @@ def get_mapping(prep_text):
13
 
14
  tags_list = classify(event_info)
15
  tags_list = tags_list['tags']
 
 
16
 
17
- event_discr_embeddings = model_sent.encode([' '.join(tags_list)],
18
- batch_size=250,
19
- show_progress_bar=True)
 
 
 
 
 
20
 
21
- event_embedd = event_discr_embeddings[0]
22
- sectors = pd.read_excel('sect_other.xlsx', sheet_name = 'sectors')
23
- sectors['name_clean'] = sectors['name'].str.replace('&','').str.strip()
24
- sectors['name_clean'] = sectors['name_clean'].str.replace('IT','information technology').str.replace(',','').str.lower()
25
-
26
- industries = pd.read_excel('sect_other.xlsx', sheet_name = 'other_indus')
27
- industries['industries_name_clean'] = industries['name'].str.replace('&','').str.strip()
28
- industries['industries_name_clean'] = industries['industries_name_clean'].str.replace('IT','information technology').str.replace(',','').str.lower()
29
-
30
- other_industries = pd.read_csv('other_industries.csv')
31
- other_industries['other_industries_name_clean'] = other_industries['Industry'].str.replace('-',' ').str.replace('/',' ').str.replace('(',' ').str.replace(')',' ').replace('&',' ').str.strip().str.lower()
32
-
33
- n_neighbors = 1
34
- threshold = 0.40
35
-
36
- #sectors
37
- distances, indices = sector_model.kneighbors([event_embedd], n_neighbors=2)
38
- name_index = indices[0]
39
- distance_name = str(distances[0])
40
- topic_name = []
41
- for index_i in name_index:
42
- topic_name.append(sectors['name_clean'].tolist()[index_i])
43
- topic_name = str(topic_name)
44
-
45
-
46
- #industries
47
- distances_indus, indices_indus = indus_model.kneighbors([event_embedd], n_neighbors=n_neighbors)
48
- name_index_indus = indices_indus[0][0]
49
- distance_name_indus = distances_indus[0][0]
50
- topic_name_indus = industries['industries_name_clean'].tolist()[name_index_indus]
51
-
52
- #other_industries
53
- distances_other, indices_other = other_indus_model.kneighbors([event_embedd], n_neighbors=3)
54
- name_index_other = indices_other[0]
55
- distance_name_other = str(distances_other[0])
56
- topic_name_other = []
57
- for index_o in name_index_other:
58
- topic_name_other.append(other_industries['other_industries_name_clean'].tolist()[index_o])
59
- topic_name_other = str(topic_name_other)
60
 
61
- return topic_name, distance_name, topic_name_indus, distance_name_indus,topic_name_other,distance_name_other
 
 
13
 
14
  tags_list = classify(event_info)
15
  tags_list = tags_list['tags']
16
+
17
+ if tags_list!=[]:
18
 
19
+ event_discr_embeddings = model_sent.encode([' '.join(tags_list)],
20
+ batch_size=250,
21
+ show_progress_bar=True)
22
+
23
+ event_embedd = event_discr_embeddings[0]
24
+ sectors = pd.read_excel('sect_other.xlsx', sheet_name = 'sectors')
25
+ sectors['name_clean'] = sectors['name'].str.replace('&','').str.strip()
26
+ sectors['name_clean'] = sectors['name_clean'].str.replace('IT','information technology').str.replace(',','').str.lower()
27
 
28
+ industries = pd.read_excel('sect_other.xlsx', sheet_name = 'other_indus')
29
+ industries['industries_name_clean'] = industries['name'].str.replace('&','').str.strip()
30
+ industries['industries_name_clean'] = industries['industries_name_clean'].str.replace('IT','information technology').str.replace(',','').str.lower()
31
+
32
+ other_industries = pd.read_csv('other_industries.csv')
33
+ other_industries['other_industries_name_clean'] = other_industries['Industry'].str.replace('-',' ').str.replace('/',' ').str.replace('(',' ').str.replace(')',' ').replace('&',' ').str.strip().str.lower()
34
+
35
+ n_neighbors = 1
36
+ threshold = 0.40
37
+
38
+ #sectors
39
+ distances, indices = sector_model.kneighbors([event_embedd], n_neighbors=2)
40
+ name_index = indices[0]
41
+ distance_name = str(distances[0])
42
+ topic_name = []
43
+ for index_i in name_index:
44
+ topic_name.append(sectors['name_clean'].tolist()[index_i])
45
+ topic_name = str(topic_name)
46
+
47
+
48
+ #industries
49
+ distances_indus, indices_indus = indus_model.kneighbors([event_embedd], n_neighbors=n_neighbors)
50
+ name_index_indus = indices_indus[0][0]
51
+ distance_name_indus = distances_indus[0][0]
52
+ topic_name_indus = industries['industries_name_clean'].tolist()[name_index_indus]
53
+
54
+ #other_industries
55
+ distances_other, indices_other = other_indus_model.kneighbors([event_embedd], n_neighbors=3)
56
+ name_index_other = indices_other[0]
57
+ distance_name_other = str(distances_other[0])
58
+ topic_name_other = []
59
+ for index_o in name_index_other:
60
+ topic_name_other.append(other_industries['other_industries_name_clean'].tolist()[index_o])
61
+ topic_name_other = str(topic_name_other)
62
+
63
+ return topic_name, distance_name, topic_name_indus, distance_name_indus,topic_name_other,distance_name_other
 
 
 
64
 
65
+ else:
66
+ return 'no tags identified', None, None, None,None,None