File size: 1,820 Bytes
bf11ea6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
54cc7a5
 
 
 
bf11ea6
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import pandas as pd
import numpy as np
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.signal import argrelextrema
import math

def rev_sigmoid(x: float) -> float:
    """Return the reversed (decreasing) sigmoid ``1 / (1 + exp(0.5 * x))``.

    The value tends to 1 for very negative ``x``, equals 0.5 at ``x == 0``
    and tends to 0 for very positive ``x``.

    Args:
        x (float): point at which the function is evaluated.
    Returns:
        float: value in the closed interval [0, 1].
    """
    try:
        return 1 / (1 + math.exp(0.5 * x))
    except OverflowError:
        # math.exp overflows once 0.5 * x exceeds ~709; the mathematical
        # limit of the expression there is 0, so return it instead of raising.
        return 0.0

def activate_similarities(similarities: np.ndarray, p_size=10) -> np.ndarray:
    """ Function returns weighted sums of activated sentence similarities

    For every sentence ``i`` the similarities to the following sentences
    ``i, i+1, ..., i+p_size-1`` (the first ``p_size`` upper diagonals of the
    matrix) are combined into one value, using reversed-sigmoid weights
    ``1 / (1 + exp(0.5 * x))`` sampled on ``linspace(-10, 10, p_size)``, so
    close neighbours count almost fully and far ones almost not at all.

    Args:
        similarities (numpy array): expected to be a square matrix where each sentence corresponds to another with cosine similarity
        p_size (int): number of sentences used to calculate the weighted sum.
            When the matrix has fewer rows than ``p_size`` only the first
            ``similarities.shape[0]`` weights are used.
    Returns:
        numpy array: 1-D float array with one weighted sum per row of
        ``similarities`` (empty when the matrix is empty).
    """
    n = similarities.shape[0]
    # Diagonals at offset >= n do not exist, so the window can never be
    # larger than the matrix itself.
    window = min(p_size, n)
    if window <= 0:
        return np.zeros(n)

    # Reversed sigmoid evaluated directly with NumPy; only the weights that
    # fall inside the window are kept.
    x = np.linspace(-10, 10, p_size)[:window]
    activation_weights = 1.0 / (1.0 + np.exp(0.5 * x))

    # Accumulate only the diagonals that carry a non-zero weight instead of
    # materialising every diagonal of the matrix.
    activated_similarities = np.zeros(n, dtype=float)
    for offset in range(window):
        diagonal = similarities.diagonal(offset)
        # A diagonal at this offset is shorter than n; the missing tail
        # contributes nothing (equivalent to zero padding on the right).
        activated_similarities[:len(diagonal)] += activation_weights[offset] * diagonal

    return activated_similarities

# Loaded SentenceTransformer instances keyed by model name, so the weights
# are read once per process instead of once per call.
_sentence_model_cache = {}


def _get_sentence_model(name='all-mpnet-base-v2'):
    """Return a cached SentenceTransformer for ``name``, loading it on first use."""
    if name not in _sentence_model_cache:
        _sentence_model_cache[name] = SentenceTransformer(name)
    return _sentence_model_cache[name]


def split_into_paragraphs(text):
    """Insert paragraph breaks into ``text`` where the topic appears to change.

    The text is split into sentences on the literal separator ``'. '``, each
    sentence is embedded, and a paragraph break (``'\\n\\n '``) is inserted
    before every sentence whose activated similarity is a local minimum.

    Args:
        text (str): text to split; sentences are assumed to be separated by ``'. '``.
    Returns:
        str: the original text with ``'\\n\\n '`` inserted before each sentence
        that starts a new paragraph. Text with fewer than three sentences is
        returned unchanged.
    """
    sentences = text.split('. ')
    # The first and last positions can never be a strict local minimum, so
    # with fewer than three sentences there is nothing to split; returning
    # early also avoids loading the model.
    if len(sentences) < 3:
        return text

    model = _get_sentence_model()
    embeddings = model.encode(sentences)
    similarities = cosine_similarity(embeddings)
    # The weighting window must not exceed the number of sentences.
    window = min(5, len(sentences))
    activated_similarities = activate_similarities(similarities, p_size=window)
    minima = argrelextrema(activated_similarities, np.less, order=2)[0]
    split_points = set(minima.tolist())

    pieces = [
        f'\n\n {sentence}' if index in split_points else sentence
        for index, sentence in enumerate(sentences)
    ]
    # Re-join with the same separator that was used for splitting, so the
    # final sentence does not get an extra '. ' appended.
    return '. '.join(pieces)