Spaces:
Runtime error
Runtime error
| import numpy as np | |
| from sentence_transformers import SentenceTransformer, util | |
| from bertopic.representation import KeyBERTInspired, MaximalMarginalRelevance | |
| import nltk | |
| from nltk.corpus import stopwords | |
| from nltk.tokenize import word_tokenize | |
| import string | |
| import re | |
# Fetch the NLTK data this module relies on at import time.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases load the Punkt tokenizer behind word_tokenize from the
# 'punkt_tab' resource instead of 'punkt'; requesting both keeps tokenization
# working regardless of the installed NLTK version.
nltk.download('punkt_tab')

# English stop words, used by cleanup_tokens to filter tokens.
STOP_WORDS = list(stopwords.words('english'))
# Names of the BERTopic representation models that
# get_bertopic_representation knows how to build.
BERTOPIC_REPRESENTATIONS = [
    "KeyBERTInspired",
    "MaximalMarginalRelevance",
]

# Sentence-transformer model names offered for embedding.
TRANSFORMERS = ["all-mpnet-base-v2", "multi-qa-mpnet-base-dot-v1"]

# Human-readable description of each entry of TRANSFORMERS, in the same order.
TRANSFORMERS_INFO = [
    "all-mpnet-base-v2: All-round model tuned for many use-cases. "
    "Trained on a large and diverse dataset of over 1 billion training pairs",
    "multi-qa-mpnet-base-dot-v1: This model was tuned for semantic search: Given a query/question, "
    "it can find relevant passages. "
    "It was trained on a large and diverse set of (question, answer) pairs.",
]
def get_bertopic_representation(representation: str):
    """
    build the BERTopic representation model that matches the given name
    :param representation: name to look up, compared against the entries of BERTOPIC_REPRESENTATIONS
    :return: a new KeyBERTInspired or MaximalMarginalRelevance instance, None for any other name
    """
    model_cls = None
    if representation == BERTOPIC_REPRESENTATIONS[0]:
        model_cls = KeyBERTInspired
    elif representation == BERTOPIC_REPRESENTATIONS[1]:
        model_cls = MaximalMarginalRelevance
    # unknown names fall through with no class selected
    return model_cls() if model_cls is not None else None
def tokenize_explode(df, col):
    """
    tokenize a text column and spread the tokens over rows
    :param df: dataframe holding the text; a 'tokenized' column with the token lists is added to it in place
    :param col: name of the column to run word_tokenize on
    :return: exploded copy of the dataframe whose 'tokenized' column holds the stripped, lower-cased tokens
    """
    token_col = 'tokenized'
    df[token_col] = df[col].apply(word_tokenize)
    exploded = df.explode(token_col)
    # normalise each token: trim surrounding whitespace, then lower-case
    exploded[token_col] = exploded[token_col].str.strip().str.lower()
    return exploded
def cleanup_tokens(df, col):
    """
    drop the rows whose token in the given column is not worth keeping
    a row is removed when its token is missing or not a string, has two
    characters or fewer, is a plain number (e.g. "12" or "3.5"), is a
    punctuation character, or is listed in STOP_WORDS
    :param df: dataframe with one token per row
    :param col: name of the column holding the tokens
    :return: filtered dataframe; the input dataframe is not modified
    """
    # .str.len() gives NaN for missing / non-string values, so such rows
    # compare False and are dropped here instead of raising like len() would
    df = df[df[col].str.len() > 2]
    # no capture group in the pattern: pandas warns when str.contains is
    # given one; na=False keeps the mask strictly boolean
    df = df[~df[col].str.contains(r'^\d+\.?\d*$', regex=True, na=False)]
    df = df[~df[col].isin(list(string.punctuation))]
    df = df[~df[col].isin(STOP_WORDS)]
    return df
def get_embedding_model(transformer=TRANSFORMERS[0]) -> SentenceTransformer:
    """
    get given sentence transformer model
    :param transformer: model name handed to SentenceTransformer; defaults to the first entry of TRANSFORMERS
    :return: the SentenceTransformer built for that name
    """
    # The default was previously BERTOPIC_REPRESENTATIONS[0] ("KeyBERTInspired"),
    # which names a BERTopic representation rather than an embedding model.
    return SentenceTransformer(transformer)
def str_to_vector_list(text_list, sentence_model, replace_dict=None):
    """
    embedding for the given text list using provided embedding model
    every item is converted to str and stripped of punctuation (anything that
    is neither a word character nor whitespace) before the replacements in
    replace_dict are applied, so replace_dict keys are matched against the
    punctuation-free text
    :param text_list: iterable of values to embed
    :param sentence_model: object whose encode() method turns the list of strings into embeddings
    :param replace_dict: any values in the string that we may need to replace, as {old: new}
    :return: the embeddings converted with tolist()
    """
    # str.replace matches its argument literally, so the punctuation pattern
    # has to go through re.sub to have any effect
    cleaned = [re.sub(r'[^\w\s]', '', str(item)) for item in text_list]
    if replace_dict:
        for old, new in replace_dict.items():
            cleaned = [text.replace(old, new) for text in cleaned]
    embeddings = sentence_model.encode(cleaned, show_progress_bar=True, batch_size=1000)
    return embeddings.tolist()
def remove_unnecessary_tokens_from_df(df, columns, extra_stopwords=None) -> None:
    """
    removes unnecessary token from the given columns of the dataframe
    punctuation (anything that is neither a word character nor whitespace) is
    deleted, then every occurrence of each extra stop word is replaced by a
    single space; the dataframe is modified in place
    :param df: dataframe to clean; the given columns must hold strings
    :param columns: list of column names, or a single column name
    :param extra_stopwords: optional iterable of substrings, matched literally
    :return: None
    """
    # a lone column name is wrapped so df[cols] is always a DataFrame and
    # apply() hands whole columns (Series) to the lambdas below
    cols = [columns] if isinstance(columns, str) else columns
    # regex=True is spelled out: since pandas 2.0 Series.str.replace matches
    # literally by default, which silently turns this pattern into a no-op
    df[cols] = df[cols].apply(lambda x: x.str.replace(r'[^\w\s]', '', regex=True))
    if extra_stopwords:
        for stp in extra_stopwords:
            df[cols] = df[cols].apply(lambda x: x.str.replace(stp, ' ', regex=False))
def cosine_sim_matrix(embeddings_a, embeddings_b) -> np.ndarray:
    """
    finds out cosine similarity matrix for the given embeddings
    :param embeddings_a: first set of embeddings, passed to util.pytorch_cos_sim
    :param embeddings_b: second set of embeddings, passed to util.pytorch_cos_sim
    :return: numpy array showing the cosine similarity matrix
    """
    similarity = util.pytorch_cos_sim(embeddings_a, embeddings_b)
    # detach and move to CPU first: numpy cannot convert a tensor that
    # tracks gradients or lives on an accelerator
    return np.array(similarity.detach().cpu())