| import pandas as pd | |
| import numpy as np | |
| from sentence_transformers import SentenceTransformer | |
| from sklearn.metrics.pairwise import cosine_similarity | |
| from scipy.signal import argrelextrema | |
| import math | |
def rev_sigmoid(x: float) -> float:
    """Reversed sigmoid: 1 / (1 + exp(0.5 * x)).

    Decreases monotonically from ~1 (large negative x) to ~0 (large
    positive x), with rev_sigmoid(0) == 0.5.

    Uses the numerically stable branch for x >= 0 so that math.exp never
    overflows for large |x| (the naive form raises OverflowError once
    0.5 * x exceeds ~709).

    Args:
        x: input value.

    Returns:
        float in (0, 1).
    """
    if x >= 0:
        # exp(-0.5 * x) <= 1 here, so no overflow is possible.
        z = math.exp(-0.5 * x)
        return z / (1 + z)
    return 1 / (1 + math.exp(0.5 * x))
def activate_similarities(similarities: np.ndarray, p_size=10) -> np.ndarray:
    """Return the weighted sum of activated sentence similarities.

    Off-diagonal k of the similarity matrix holds the similarities of
    sentence pairs that are k positions apart. Nearby sentences receive
    larger weights (a reversed sigmoid sampled over [-10, 10]); distant
    diagonals get zero weight. The weighted diagonals are summed per
    sentence position.

    Args:
        similarities (numpy array): square matrix where entry (i, j) is the
            cosine similarity between sentence i and sentence j.
        p_size (int): number of following sentences included in the
            weighted sum. Clamped to the matrix size so short texts work.

    Returns:
        numpy array: one activated similarity score per sentence position.
    """
    n = similarities.shape[0]
    # Guard: without this, np.pad below receives a negative pad width and
    # raises whenever the text has fewer sentences than p_size.
    p_size = min(p_size, n)
    # Sample the reversed sigmoid; weights fall from ~1 down to ~0.
    x = np.linspace(-10, 10, p_size)
    y = np.vectorize(rev_sigmoid)
    # Zero-pad so every diagonal has a weight (distant diagonals get 0).
    activation_weights = np.pad(y(x), (0, n - p_size))
    # Diagonal k: similarities of sentence pairs k apart; pad each to length n.
    diagonals = [similarities.diagonal(k) for k in range(0, n)]
    diagonals = [np.pad(d, (0, n - len(d))) for d in diagonals]
    diagonals = np.stack(diagonals)
    # Weight each diagonal by its distance-based activation, then sum per position.
    diagonals = diagonals * activation_weights.reshape(-1, 1)
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities
def split_into_paragraphs(text):
    """Split *text* into paragraphs at points of low semantic similarity.

    Sentences are embedded with a SentenceTransformer, pairwise cosine
    similarities are activated with a distance-based weighting, and local
    minima of the activated curve are treated as paragraph breaks.

    Args:
        text (str): input text; sentences are assumed to be separated
            by '. '.

    Returns:
        str: the text re-joined, with a paragraph break ('\\n\\n ')
        inserted before each sentence that starts a new paragraph.
        Note every sentence is re-suffixed with '. ', including the last.
    """
    # Loading the transformer model is expensive; cache it across calls.
    model = getattr(split_into_paragraphs, '_model', None)
    if model is None:
        model = SentenceTransformer('all-mpnet-base-v2')
        split_into_paragraphs._model = model
    sentences = text.split('. ')
    embeddings = model.encode(sentences)
    similarities = cosine_similarity(embeddings)
    activated_similarities = activate_similarities(similarities, p_size=5)
    # Local minima of the activated curve mark weak semantic links.
    minima = argrelextrema(activated_similarities, np.less, order=2)
    split_points = set(minima[0])  # set gives O(1) membership in the loop
    # Build the result with join instead of quadratic '+=' concatenation.
    parts = []
    for num, sentence in enumerate(sentences):
        prefix = '\n\n ' if num in split_points else ''
        parts.append(f'{prefix}{sentence}. ')
    return ''.join(parts)