import math

import numpy as np
from scipy.signal import argrelextrema
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


def rev_sigmoid(x: float) -> float:
    """Reversed sigmoid: near 1 for large negative x, near 0 for large positive x."""
    return 1 / (1 + math.exp(0.5 * x))


def activate_similarities(similarities: np.ndarray, p_size: int = 10) -> np.ndarray:
    """Return a weighted sum of activated sentence similarities.

    Args:
        similarities: square matrix of pairwise cosine similarities between sentences.
        p_size: number of following sentences included in each weighted sum.

    Returns:
        1-D array with one activated similarity score per sentence.
    """
    # Clamp p_size so short texts don't produce a negative pad width below.
    p_size = min(p_size, similarities.shape[0])
    # Sample the reversed sigmoid so nearby sentences get the largest weights,
    # then zero-pad the weights to match the matrix size.
    x = np.linspace(-10, 10, p_size)
    y = np.vectorize(rev_sigmoid)
    activation_weights = np.pad(y(x), (0, similarities.shape[0] - p_size))
    # The k-th diagonal holds each sentence's similarity to the sentence k positions ahead.
    diagonals = [similarities.diagonal(each) for each in range(similarities.shape[0])]
    # Pad every diagonal to full length so they can be stacked into one matrix.
    diagonals = [np.pad(each, (0, similarities.shape[0] - len(each))) for each in diagonals]
    diagonals = np.stack(diagonals)
    # Weight each diagonal by its activation and sum across offsets.
    diagonals = diagonals * activation_weights.reshape(-1, 1)
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities


def split_into_paragraphs(text: str) -> str:
    model = SentenceTransformer('all-mpnet-base-v2')
    # Naive sentence split on '. '; a proper sentence tokenizer would be more robust.
    sentences = text.split('. ')
    embeddings = model.encode(sentences)
    similarities = cosine_similarity(embeddings)
    activated_similarities = activate_similarities(similarities, p_size=5)
    # Local minima in activated similarity mark topic shifts, i.e. paragraph breaks.
    minima = argrelextrema(activated_similarities, np.less, order=2)
    split_points = list(minima[0])
    result = ''
    for num, sentence in enumerate(sentences):
        if num in split_points:
            result += f'\n\n{sentence}. '
        else:
            result += f'{sentence}. '
    return result
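
# A minimal usage sketch, assuming the 'all-mpnet-base-v2' weights can be
# downloaded on first run; the sample text below is hypothetical and purely
# illustrative. With very few sentences, argrelextrema may find no local
# minima, in which case the text comes back as a single paragraph.
if __name__ == '__main__':
    sample = (
        "Cats are popular pets. They sleep most of the day. "
        "Dogs need daily walks. They enjoy playing fetch. "
        "Python is a programming language. It is widely used for data science."
    )
    print(split_into_paragraphs(sample))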