| import numpy as np |
| from datetime import date, datetime |
| from sklearn.metrics.pairwise import cosine_similarity |
|
|
|
|
|
|
def get_intent(txt, db):
    """
    Identify the intent of *txt* using a keyword-based approach.

    A predefined set of keywords (grouped per intent id) is searched for in
    the given string, and the intent id of the first matching keyword is
    returned. Keywords are manually picked and ordered so that the correct
    intent is matched first. Works on smaller datasets.

    Parameters
    ----------
    txt : str
        Input text to classify.
    db : dict
        Maps str(intent_id) -> {'keywords': [...]}; intent ids are assumed
        to be the consecutive integers 0 .. len(db)-1 (keys are strings).

    Returns
    -------
    int
        The first matching intent id, or -1 when no keyword matches.
        (-1 is the same "no match" sentinel used by the BERT-based
        siblings; previously this function fell through and returned
        None implicitly.)
    """
    for intent in range(len(db)):
        # Substring containment, exactly as before; first hit wins.
        if any(keyword in txt for keyword in db[str(intent)]['keywords']):
            return intent
    return -1
|
|
|
|
def get_intent_bert(txt, db, model):
    """
    Identify intent via sentence-similarity over BERT embeddings.

    The given model provides sentence embeddings (``model.encode``); cosine
    similarity scores the query against a database of example questions.
    Each intent label has multiple example questions; the intent of the
    single highest-scoring question is returned.

    Parameters
    ----------
    txt : str
        Input text to classify.
    db : dict
        Maps str(intent_id) -> {'0': [question, ...]}; intent ids are
        assumed to be the consecutive integers 0 .. len(db)-1.
    model : object
        Sentence-embedding model exposing ``encode(text) -> 1-D vector``.

    Returns
    -------
    tuple[int, float]
        (best intent id, its cosine-similarity score). Returns (-1, -1.0)
        when db is empty or has no questions. The score is now a plain
        float; previously a 1x1 ndarray leaked out of sklearn.
    """
    # Encode the query once and hoist its norm out of the candidate loop.
    txt_emb = np.asarray(model.encode(txt), dtype=float).ravel()
    txt_norm = np.linalg.norm(txt_emb)

    best_intent = -1
    best_score = -1.0
    for intent in range(len(db)):
        for candidate in db[str(intent)]['0']:
            can_emb = np.asarray(model.encode(candidate), dtype=float).ravel()
            denom = txt_norm * np.linalg.norm(can_emb)
            # Cosine similarity of a single vector pair; a zero-norm
            # embedding gets score 0.0 instead of a division error.
            score = float(txt_emb @ can_emb / denom) if denom else 0.0
            if score > best_score:
                best_score = score
                best_intent = intent

    return best_intent, best_score
|
|
|
|
def get_intent_bert_avg(txt, db, model):
    """
    Identify intent via average sentence-similarity over BERT embeddings.

    The given model provides sentence embeddings (``model.encode``); cosine
    similarity scores the query against each intent's example questions.
    Per intent, the scores are averaged, and the intent with the highest
    average is returned.

    Parameters
    ----------
    txt : str
        Input text to classify.
    db : dict
        Maps str(intent_id) -> {'0': [question, ...]}; intent ids are
        assumed to be the consecutive integers 0 .. len(db)-1.
    model : object
        Sentence-embedding model exposing ``encode(text) -> 1-D vector``.

    Returns
    -------
    tuple[int, list[float]]
        (intent id with the highest average score, per-intent average
        scores). Returns (-1, []) for an empty db instead of letting
        np.argmax raise; an intent with no questions scores -1.0 instead
        of producing NaN from np.mean([]).
    """
    # Encode the query once and hoist its norm out of the candidate loop.
    txt_emb = np.asarray(model.encode(txt), dtype=float).ravel()
    txt_norm = np.linalg.norm(txt_emb)

    scores_all = []
    for intent in range(len(db)):
        scores = []
        for candidate in db[str(intent)]['0']:
            can_emb = np.asarray(model.encode(candidate), dtype=float).ravel()
            denom = txt_norm * np.linalg.norm(can_emb)
            # Cosine similarity of a single vector pair; a zero-norm
            # embedding gets score 0.0 instead of a division error.
            scores.append(float(txt_emb @ can_emb / denom) if denom else 0.0)
        # -1.0 sentinel keeps an empty question list from yielding NaN.
        scores_all.append(float(np.mean(scores)) if scores else -1.0)

    if not scores_all:
        return -1, scores_all
    return int(np.argmax(scores_all)), scores_all
|
|