File size: 3,338 Bytes
246df79
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
from sklearn.feature_extraction.text import CountVectorizer

from Pinpoint_Internal.Logger import *

# Shared module-level vectorizer, re-fit from scratch on every call to
# n_gram_aggregator._get_all_ngrams() — state from a previous corpus does not
# leak, but concurrent use would race on vocabulary_.
# NOTE(review): ngram_range=(1, 5) also produces 4- and 5-grams, which
# _get_all_ngrams() silently discards; (1, 3) looks sufficient — confirm no
# other module imports and relies on c_vec before narrowing it.
c_vec = CountVectorizer(ngram_range=(1, 5))


class n_gram_aggregator():
    """
    Retrieves the most popular n-grams (uni, bi, and tri) for a given dataset corpus.

    "Popular" means the n-gram's occurrence count is at or above the average
    count for n-grams of the same length.
    """

    def _get_average_ngram_count(self, n_grams_dict):
        """
        Computes the average count (weighting) across all n-grams in the dict.

        :param n_grams_dict: mapping of n-gram text -> occurrence count
        :return: the mean count as a float, or 0.0 for an empty dict
        """
        # Guard the empty-corpus case: the original division would raise
        # ZeroDivisionError when no n-grams were extracted.
        if not n_grams_dict:
            return 0.0
        return sum(n_grams_dict.values()) / len(n_grams_dict)

    def _get_all_ngrams(self, data):
        """
        Returns all ngrams (tri, bi, and uni) for a given piece of text.

        NOTE: the module-level vectorizer is configured with ngram_range=(1, 5),
        so 4- and 5-grams are computed but deliberately discarded here.

        :param data: a string, or a list of strings, to vectorize
        :return: tuple (uni_grams, bi_grams, tri_grams) of dicts mapping
                 n-gram text -> total count across the corpus
        """
        # input to fit_transform() should be an iterable with strings
        if not isinstance(data, list):
            data = [data]

        ngrams = c_vec.fit_transform(data)

        # vocabulary_ is only populated after fit_transform()
        vocab = c_vec.vocabulary_

        # Total occurrences of each n-gram summed over all documents.
        count_values = ngrams.toarray().sum(axis=0)

        # output n-grams, bucketed by token length
        uni_grams = {}
        bi_grams = {}
        tri_grams = {}

        # Sort (count, text) pairs descending so ties iterate deterministically;
        # bucket each n-gram by its whitespace-token length.
        for ng_count, ng_text in sorted([(count_values[i], k) for k, i in vocab.items()], reverse=True):
            token_count = len(ng_text.split(" "))

            if token_count == 3:
                tri_grams[ng_text] = ng_count
            elif token_count == 2:
                bi_grams[ng_text] = ng_count
            elif token_count == 1:
                uni_grams[ng_text] = ng_count
            # 4- and 5-grams fall through and are intentionally ignored

        return uni_grams, bi_grams, tri_grams

    def _get_popular_ngrams(self, ngrams_dict):
        """
        Returns the n-grams that are the most popular, i.e. whose count is at
        or above the average n-gram count for the dict.

        :param ngrams_dict: mapping of n-gram text -> occurrence count
        :return: dict of the popular n-grams (count >= average); empty dict in
                 gives empty dict out
        """
        average_count = self._get_average_ngram_count(ngrams_dict)
        return {n_gram: count
                for n_gram, count in ngrams_dict.items()
                if count >= average_count}

    def get_ngrams(self, data=None, file_name_to_read=None):
        """
        Wrapper function for returning uni, bi, and tri grams that are the most
        popular (above the average weighting in a given piece of text).

        :param data: a string (or list of strings) to analyse; takes precedence
                     over file_name_to_read when both are given
        :param file_name_to_read: path of a text file to read when data is None
        :return: tuple (popular_uni_grams, popular_bi_grams, popular_tri_grams),
                 each a list of n-gram strings
        :raises ValueError: if neither data nor file_name_to_read is supplied
        """
        logger().print_message("Getting Ngrams")

        if data is None and file_name_to_read is None:
            # ValueError is more precise than the original bare Exception and
            # remains backward compatible for callers catching Exception.
            raise ValueError("No data supplied to retrieve n_grams")

        if data is None:
            # NOTE(review): no encoding was specified originally, so this keeps
            # the platform default — confirm whether corpora are UTF-8.
            with open(file_name_to_read, 'r') as file_to_read:
                data = file_to_read.read()

        uni_grams, bi_grams, tri_grams = self._get_all_ngrams(data)

        popular_uni_grams = list(self._get_popular_ngrams(uni_grams).keys())
        popular_bi_grams = list(self._get_popular_ngrams(bi_grams).keys())
        popular_tri_grams = list(self._get_popular_ngrams(tri_grams).keys())

        return popular_uni_grams, popular_bi_grams, popular_tri_grams