import math

import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema


def rev_sigmoid(x: float) -> float:
    # Reversed logistic curve: close to 1 for negative x, decaying toward 0
    # for large positive x, so nearer sentences receive higher weights.
    return 1 / (1 + math.exp(0.5 * x))


def activate_similarities(similarities: np.ndarray, p_size: int = 10) -> np.ndarray:
    """Return a weighted sum of activated similarities for each sentence.

    Args:
        similarities (np.ndarray): square matrix where entry (i, j) is the
            cosine similarity between sentences i and j
        p_size (int): number of following sentences used in each weighted sum

    Returns:
        np.ndarray: one weighted similarity score per sentence
    """
    # Sample the reverse sigmoid over [-10, 10] to get p_size decaying
    # weights, then zero-pad the vector to match the number of sentences.
    x = np.linspace(-10, 10, p_size)
    y = np.vectorize(rev_sigmoid)
    activation_weights = np.pad(y(x), (0, similarities.shape[0] - p_size))
    # The k-th superdiagonal holds each sentence's similarity to the sentence
    # k positions ahead; pad every diagonal to full length and stack them.
    diagonals = [similarities.diagonal(each) for each in range(similarities.shape[0])]
    diagonals = [np.pad(each, (0, similarities.shape[0] - len(each))) for each in diagonals]
    diagonals = np.stack(diagonals)
    # Weight each diagonal by its activation and sum over offsets, yielding a
    # single contextual similarity score per sentence.
    diagonals = diagonals * activation_weights.reshape(-1, 1)
    activated_similarities = np.sum(diagonals, axis=0)
    return activated_similarities
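

# Illustrative sketch (not part of the original file): on an all-ones toy
# matrix the weighted sums are easy to verify by hand. Note that p_size must
# not exceed the number of sentences, otherwise np.pad receives a negative
# pad width and raises a ValueError.
# >>> toy = np.ones((6, 6))
# >>> activate_similarities(toy, p_size=3).shape
# (6,)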


def split_into_paragraphs(text):
    # Embed every sentence, score each one's similarity to the sentences that
    # follow it, and start a new paragraph at local minima of that score.
    model = SentenceTransformer('all-mpnet-base-v2')
    sentences = text.split('. ')  # naive split; assumes '. ' ends a sentence
    embeddings = model.encode(sentences)
    similarities = cosine_similarity(embeddings)
    activated_similarities = activate_similarities(similarities, p_size=5)
    # argrelextrema finds local minima; order=2 requires each minimum to be
    # lower than its two neighbours on either side.
    minima = argrelextrema(activated_similarities, np.less, order=2)
    split_points = list(minima[0])
    result = ''
    for num, sentence in enumerate(sentences):
        # Open a new paragraph before each sentence at a split point.
        if num in split_points:
            result += f'\n\n {sentence}. '
        else:
            result += f'{sentence}. '
    return result
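

if __name__ == '__main__':
    # Minimal usage sketch. The sample text below is illustrative, not from
    # the original file; any '. '-separated string with enough sentences
    # (at least p_size=5) works.
    sample = (
        "Cats are small domesticated carnivores. They are valued for "
        "companionship. Many households keep at least one cat. The solar "
        "system formed roughly 4.6 billion years ago. It contains eight "
        "planets. Most of its mass is in the Sun. Gravity keeps the planets "
        "in orbit. Astronomers still discover new minor bodies."
    )
    print(split_into_paragraphs(sample))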