File size: 3,382 Bytes
89b579b
 
 
fd2f778
89b579b
 
fd2f778
edef549
fd2f778
5432670
fd2f778
 
 
 
 
 
 
 
 
 
89b579b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import attr
import pandas as pd
import numpy as np
import spacy
from nltk.tokenize.texttiling import TextTilingTokenizer
from sklearn.metrics.pairwise import cosine_similarity
from functools import lru_cache
from sentence_transformers import SentenceTransformer

def load_sentence_transformer(model_name='all-MiniLM-L6-v2'):
    """
    all_MiniLM_L6_v2 - offline
    all-MiniLM-L6-v2 - Online
    """
    model = SentenceTransformer(model_name)
    return model

def load_spacy():
    return spacy.load('en_core_web_sm')


model = load_sentence_transformer()
nlp = load_spacy()


@attr.s
class SemanticTextSegmentation:

    """
    Segment a call transcript based on topics discussed in the call using
    TextTilling with Sentence Similarity via sentence transformer.

    Paramters
    ---------
    data: pd.Dataframe
        Pass the trascript in the dataframe format

    utterance: str
        pass the column name which represent utterance in transcript dataframe

    """

    data = attr.ib()
    utterance = attr.ib(default='utterance')

    def __attrs_post_init__(self):
        columns = self.data.columns.tolist()

    def get_segments(self, threshold=0.7):
        """
        returns the transcript segments computed with texttiling and sentence-transformer.

        Paramters
        ---------
        threshold: float
            sentence similarity threshold. (used to merge the sentences into coherant segments)

        Return
        ------
        new_segments: list
            list of segments        
        """
        segments = self._text_tilling()
        merge_index = self._merge_segments(segments, threshold)
        new_segments = []
        for i in merge_index:
            seg = ' '.join([segments[_] for _ in i])
            new_segments.append(seg)
        return new_segments

    def _merge_segments(self, segments, threshold):
        segment_map = [0]
        for index, (text1, text2) in enumerate(zip(segments[:-1], segments[1:])):
            sim = self._get_similarity(text1, text2)
            if sim >= threshold:
                segment_map.append(0)
            else:
                segment_map.append(1)
        return self._index_mapping(segment_map)

    def _index_mapping(self, segment_map):
        index_list = []
        temp = []
        for index, i in enumerate(segment_map):
            if i == 1:
                index_list.append(temp)
                temp = [index]
            else:
                temp.append(index)
        index_list.append(temp)
        return index_list

    def _get_similarity(self, text1, text2):
        sentence_1 = [i.text.strip()
                      for i in nlp(text1).sents if len(i.text.split(' ')) > 1]
        sentence_2 = [i.text.strip()
                      for i in nlp(text2).sents if len(i.text.split(' ')) > 2]
        embeding_1 = model.encode(sentence_1)
        embeding_2 = model.encode(sentence_2)
        embeding_1 = np.mean(embeding_1, axis=0).reshape(1, -1)
        embeding_2 = np.mean(embeding_2, axis=0).reshape(1, -1)
        sim = cosine_similarity(embeding_1, embeding_2)
        return sim

    def _text_tilling(self):
        tt = TextTilingTokenizer(w=15, k=10)
        text = '\n\n\t'.join(self.data[self.utterance].tolist())
        segment = tt.tokenize(text)
        segment = [i.replace("\n\n\t", ' ') for i in segment]
        return segment