File size: 4,279 Bytes
3061dc8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
import numpy as np
import json
  
# Character substitution table consulted by get_minimal_text: every character
# that appears as a key is replaced by its value, all other characters pass
# through unchanged. The pairs fold similar-looking/sounding Devanagari
# characters onto a single form (presumably so spelling variants of a word
# compare equal -- confirm against callers).
swap_dict = {"ई":"इ","श":"स","ष":"स","ू":"ु","ी":"ि","ं":"ँ"}


def get_valid_chars(file_path):
    '''Read the set of valid characters from a JSON file.

    file_path(string): path of a UTF-8 encoded JSON file whose top-level
                       value is an object; only its keys are used.
    -> valid_characters(list): the keys of that object, in the order they
                               appear in the file.
    '''
    # Use a context manager so the file handle is closed even when the JSON
    # is malformed and json.load raises.
    with open(file_path,'r',encoding="utf-8") as char_file:
        char_table = json.load(char_file)
    valid_characters = list(char_table.keys())
    return valid_characters


def get_word_arr_from_text(text):
    '''text(string) -> words(list)

    Cuts the text at every single space character. Leading, trailing or
    repeated spaces therefore show up as empty strings in the result.
    '''
    space = " "
    words = text.split(space)
    return words

def get_minimal_text(text):
    '''text(string) -> minimal text(string)

    Rebuilds the text one character at a time, substituting every character
    that is a key of the module-level swap_dict with its mapped value and
    copying all other characters unchanged.
    '''
    return ''.join(
        swap_dict[symbol] if symbol in swap_dict.keys() else symbol
        for symbol in text
    )

def add_purnabiram(text,kriyapad,samyajoak):
    '''Insert the sentence terminator '। ' (purnabiram) into unpunctuated text.

    text(string): space separated words.
    kriyapad: collection of words in their get_minimal_text form; a word whose
              minimal form is in it can end a sentence (presumably verbs --
              confirm against caller).
    samyajoak: collection of words that, when they directly follow such a
               word, suppress the terminator (presumably conjunctions --
               confirm against caller).
    -> text(string): every word followed by one space, with '। ' added after a
       kriyapad word unless the next word is in samyajoak or is itself a
       kriyapad word. The last word is always followed by '। '.
    '''
    # Splitting on a single space produces '' for repeated or trailing spaces.
    # Drop those first so that "the next word" is always a real word; otherwise
    # an empty token bypasses the samyajoak/kriyapad check and yields a
    # spurious (or doubled) terminator.
    words = [word for word in get_word_arr_from_text(text) if len(word) > 0]
    if not words:
        # Text with no words still yields a bare terminator, as before.
        return '। '

    pieces = []
    for ind,word in enumerate(words[:-1]):
        pieces.append(word + ' ')
        if get_minimal_text(word) in kriyapad:
            following = words[ind+1]
            if following not in samyajoak and get_minimal_text(following) not in kriyapad:
                pieces.append('। ')
    # The final word always closes a sentence.
    pieces.append(words[-1] + '। ')
    return ''.join(pieces)

def remove_useless_characters(text,valid_characters):
    '''text(string), valid_characters(list)  -> filtered text(string)

    Keeps, in their original order, only the characters of text that are
    contained in valid_characters; everything else is dropped.
    '''
    kept = [symbol for symbol in text if symbol in valid_characters]
    return ''.join(kept)


def get_sentences_as_arr(text):
    '''text(string)  -> sentences (1d-array)

    Cuts the text at every '।' character. The separator itself is removed;
    surrounding whitespace is kept and a trailing '।' leaves a final empty
    string in the result.
    '''
    purnabiram = '।'
    return text.split(purnabiram)

def remove_stop_words_and_filter_word_arr(word_arr,word_endings,stop_words):
    '''Strip known word endings and drop stop words from tokenised sentences.

    word_arr (2d-array): list of sentences, each a list of word strings.
    word_endings(string): newline separated suffixes. For each word the first
                          suffix (in list order) that the word ends with is
                          removed; at most one suffix is removed per word.
    stop_words(string): newline separated words. A word equal to one of them
                        after suffix removal is left out of the result.
    -> new_word_arr (2d-array): same number of sentences as word_arr, each
       holding the remaining (suffix-stripped) words in their original order.
    '''
    # Blank lines (e.g. a trailing newline) are ignored in both lists. An
    # empty ending would match every word and word[:-0] would erase it.
    stop_word_set = {entry for entry in stop_words.split('\n') if entry}
    endings = [entry for entry in word_endings.split("\n") if entry]

    new_word_arr = []
    for sentence in word_arr:
        new_sentence = []
        for word in sentence:
            for ending in endings:
                if word.endswith(ending):
                    word = word[:-len(ending)]
                    break
            # Actually skip stop words; previously the match was detected but
            # the word was appended regardless.
            if word in stop_word_set:
                continue
            new_sentence.append(word)
        new_word_arr.append(new_sentence)
    return new_word_arr


def get_words_as_arr(sentence_arr):
    ''' sentence_arr(1d-array) -> word_arr(2d-array)

    Produces one list of words per sentence by cutting each sentence at
    every single space; repeated spaces give empty-string entries.
    '''
    return [sentence.split(" ") for sentence in sentence_arr]

def remove_repeating_sentences(sentence_arr):
    ''' sentence_arr(1d-array) -> new_sentence_arr(1d-array)

    Returns the sentences in first-seen order with duplicates removed.
    Sentences of length 0 or 1 are discarded as well.
    '''
    unique_sentences = []
    for candidate in sentence_arr:
        if len(candidate) <= 1:
            continue
        if candidate in unique_sentences:
            continue
        unique_sentences.append(candidate)
    return unique_sentences

def remove_empty_sentences(sentences,words_arr):
    ''' sentences(1d-array), words_arr(2d-array)  -> sentences (1d-array), words_arr(2d-array)

    Walks both inputs in parallel. Empty strings are removed from every word
    list, and a sentence is kept (together with its cleaned word list) only
    when more than one word remains.
    '''
    kept_sentences = []
    kept_words = []
    for sentence, tokens in zip(sentences,words_arr):
        non_empty = [token for token in tokens if len(token) > 0]
        # Single-word sentences are dropped too: a lonely word has no
        # association with other words. Compare against 0 to keep them.
        if len(non_empty) <= 1:
            continue
        kept_sentences.append(sentence)
        kept_words.append(non_empty)
    return kept_sentences,kept_words

def search_and_get_index(arr,val):
    ''' arr(iterable), val -> index(int)

    Position of the first item in arr that compares equal to val,
    or -1 when no item matches.
    '''
    matches = (position for position, item in enumerate(arr) if item == val)
    return next(matches, -1)

def tokenize(word_arr):
    '''Replace every word by an integer id and build the matching vocabulary.

    word_arr (2d-array): list of sentences, each a list of words. Words must
                         be hashable (e.g. strings).
    -> tokenized_sentence (2d-array): same shape as word_arr, each word
       replaced by its id.
       word_list (1d-array): distinct words in first-seen order, so that
       word_list[id] is the word for that id.
    '''
    word_list = []
    # word -> id lookup; avoids scanning word_list once per word, which made
    # the function quadratic in the vocabulary size.
    word_to_id = {}
    tokenized_sentence = []
    for sentence in word_arr:
        token_words = []
        for word in sentence:
            ind = word_to_id.get(word)
            if ind is None:
                # New word: its id is its position in word_list.
                ind = len(word_list)
                word_to_id[word] = ind
                word_list.append(word)
            token_words.append(ind)
        tokenized_sentence.append(token_words)
    return tokenized_sentence, word_list