# skryba / text_splitter.py
# trojkat's picture
# Update text_splitter.py
# 54cc7a5 verified
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema
import math
def rev_sigmoid(x: float) -> float:
    """Return the reversed (decreasing) sigmoid ``1 / (1 + e^(0.5 * x))``.

    The value is close to 1 for very negative ``x``, exactly 0.5 at ``x == 0``
    and approaches 0 as ``x`` grows.

    Args:
        x (float): point at which the function is evaluated.
    Returns:
        float: value in the closed interval [0, 1].
    """
    try:
        return 1 / (1 + math.exp(0.5 * x))
    except OverflowError:
        # math.exp raises once its result no longer fits in a float; the
        # function's limit there is 0, so return that instead of failing.
        return 0.0
def activate_similarities(similarities: np.ndarray, p_size: int = 10) -> np.ndarray:
    """Return, for every row, a weighted sum of its similarities to the rows that follow.

    Entry ``i`` of the result is ``sum_k w[k] * similarities[i, i + k]`` over
    the offsets ``k`` that stay inside the matrix. The weights ``w`` are a
    reversed sigmoid ``1 / (1 + e^(0.5 * x))`` sampled at ``p_size`` evenly
    spaced points of ``[-10, 10]``, so the nearest neighbours count the most.

    Args:
        similarities (numpy array): it should be a square matrix where entry
            ``[i, j]`` is the cosine similarity of sentence ``i`` and sentence ``j``.
        p_size (int): number of weights, i.e. how many consecutive sentences
            (starting with the sentence itself) enter each weighted sum.
    Returns:
        numpy array: one weighted sum per row of ``similarities``.
    """
    n = similarities.shape[0]
    weights = 1.0 / (1.0 + np.exp(0.5 * np.linspace(-10, 10, p_size)))
    # Offsets >= p_size have zero weight and offsets >= n do not exist, so
    # only the first min(p_size, n) diagonals are needed. Clamping here also
    # keeps matrices smaller than p_size from failing on a negative pad width.
    window = min(p_size, n)
    diagonals = np.zeros((window, n))
    for offset in range(window):
        diagonal = similarities.diagonal(offset)
        # Shorter diagonals are left zero-padded on the right.
        diagonals[offset, :diagonal.size] = diagonal
    return np.sum(diagonals * weights[:window].reshape(-1, 1), axis=0)
# Loaded sentence-embedding models, keyed by model name, so the model is
# read from disk once per process instead of on every call.
_SENTENCE_MODELS = {}


def _get_sentence_model(name='all-mpnet-base-v2'):
    """Return the SentenceTransformer called ``name``, loading it on first use."""
    if name not in _SENTENCE_MODELS:
        _SENTENCE_MODELS[name] = SentenceTransformer(name)
    return _SENTENCE_MODELS[name]


def split_into_paragraphs(text):
    """Insert paragraph breaks into ``text`` where consecutive sentences drift apart.

    The text is split on ``'. '``, every sentence is embedded, and the pairwise
    cosine similarities are reduced to one weighted score per sentence by
    ``activate_similarities``. Each local minimum of that score starts a new
    paragraph, marked by ``'\\n\\n '`` in front of the sentence.

    Args:
        text (str): text to split.
    Returns:
        str: ``text`` with paragraph breaks inserted and nothing else changed.
        No extra ``'. '`` is appended after the last sentence.
    """
    sentences = text.split('. ')
    if len(sentences) < 2:
        # Nothing to separate; also avoids loading the model for trivial input.
        return text
    embeddings = _get_sentence_model().encode(sentences)
    similarities = cosine_similarity(embeddings)
    # The weighting window cannot be wider than the number of sentences.
    activated_similarities = activate_similarities(
        similarities, p_size=min(5, len(sentences))
    )
    minima = argrelextrema(activated_similarities, np.less, order=2)[0]
    split_points = set(minima.tolist())
    pieces = [
        f'\n\n {sentence}' if index in split_points else sentence
        for index, sentence in enumerate(sentences)
    ]
    # Joining (rather than appending '. ' to each piece) restores exactly the
    # separators that split() removed, so a final period is not doubled.
    return '. '.join(pieces)