# File size: 4,279 Bytes (snapshot 3061dc8)
import numpy as np
import json
# Character substitution table: get_minimal_text replaces every character
# that appears as a key here with the corresponding value.
swap_dict = {
    "ई": "इ",
    "श": "स",
    "ष": "स",
    "ू": "ु",
    "ी": "ि",
    "ं": "ँ",
}
def get_valid_chars(file_path):
    '''file_path(string) -> valid_characters(list)

    Parse the JSON file at file_path (read as UTF-8) and return its
    top-level keys as a list, in the order they appear in the file.
    '''
    # The context manager closes the handle as soon as parsing finishes (or
    # fails); a bare open() call would leave that to the garbage collector.
    with open(file_path, 'r', encoding="utf-8") as char_file:
        char_map = json.load(char_file)
    return list(char_map.keys())
def get_word_arr_from_text(text):
    '''text(string) -> words (1d-array)

    Split text at every single space character. Consecutive spaces are not
    merged, so they produce empty-string entries in the result.
    '''
    separator = " "
    words = text.split(separator)
    return words
def get_minimal_text(text):
    '''text(string) -> new_text(string)

    Rebuild text character by character: a character that is a key of
    swap_dict is replaced by its mapped value, every other character is
    copied unchanged.
    '''
    return ''.join(swap_dict.get(symbol, symbol) for symbol in text)
def add_purnabiram(text,kriyapad,samyajoak):
    '''text(string), kriyapad, samyajoak -> new_text(string)

    Re-emit the space-separated words of text, each followed by a space,
    and insert a purnabiram ('। ') after a word when all of these hold:
      * the word's get_minimal_text() form is in kriyapad,
      * the next entry of the split text is not in samyajoak,
      * the next entry's get_minimal_text() form is not in kriyapad.
    The last entry is always appended followed by '। '.

    kriyapad and samyajoak are only used with the `in` operator here
    (presumably collections of verb forms and conjunctions - confirm
    against the caller).
    '''
    tokens = get_word_arr_from_text(text)
    pieces = []
    # Walk every entry except the last one together with its successor.
    for current, following in zip(tokens, tokens[1:]):
        if not current:
            # Empty entries (from repeated spaces) are not emitted.
            continue
        pieces.append(current + ' ')
        if get_minimal_text(current) not in kriyapad:
            continue
        # NOTE(review): `following` is the raw next entry, which can be an
        # empty string when the text contains consecutive spaces.
        if following in samyajoak:
            continue
        if get_minimal_text(following) in kriyapad:
            continue
        pieces.append('। ')
    pieces.append(tokens[-1] + '। ')
    return ''.join(pieces)
def remove_useless_characters(text,valid_characters):
    '''text(string), valid_characters(list) -> valid_text(string)

    Return text with every character that is not listed in
    valid_characters removed. The kept characters stay in their original
    order.
    '''
    # Build the lookup once: a set answers membership in constant time,
    # whereas scanning the list for every character of text costs
    # len(text) * len(valid_characters) comparisons.
    allowed = frozenset(valid_characters)
    return ''.join(symbol for symbol in text if symbol in allowed)
def get_sentences_as_arr(text):
    '''text(string) -> sentences (1d-array)

    Cut text at every purnabiram ('।'). The separator itself is dropped;
    surrounding whitespace and empty pieces are kept as they are.
    '''
    purnabiram = '।'
    return text.split(purnabiram)
def remove_stop_words_and_filter_word_arr(word_arr,word_endings,stop_words):
    '''word_arr (2d-array), word_endings(string), stop_words(string) -> new_word_arr (2d-array)

    word_endings and stop_words are newline-separated lists. For every word
    of every sentence:
      1. the first listed ending that the word ends with is stripped
         (at most one ending per word);
      2. the word is dropped if what remains equals a stop word.
    The result has one (possibly empty) word list per input sentence.
    '''
    # Skip blank lines (e.g. produced by a trailing newline): every string
    # "ends with" the empty string, and word[:-0] would erase the whole word.
    endings = [ending for ending in word_endings.split("\n") if ending]
    # A set gives constant-time stop-word checks.
    stop_set = set(stop_words.split('\n'))
    new_word_arr = []
    for sentence in word_arr:
        kept_words = []
        for word in sentence:
            for ending in endings:
                if word.endswith(ending):
                    word = word[:-len(ending)]
                    break  # only the first matching ending is removed
            # The stop-word scan used to break out of its loop and then
            # append the word regardless, so stop words were never removed.
            if word not in stop_set:
                kept_words.append(word)
        new_word_arr.append(kept_words)
    return new_word_arr
def get_words_as_arr(sentence_arr):
    ''' sentence_arr(1d-array) -> word_arr(2d-array)

    Split every sentence on single spaces and return one word list per
    sentence, in the same order as the input.
    '''
    return [sentence.split(" ") for sentence in sentence_arr]
def remove_repeating_sentences(sentence_arr):
    '''sentence_arr(1d-array) -> new_sentence_arr (1d-array)

    Keep the first occurrence of every sentence, preserving order.
    Sentences whose length is 0 or 1 are discarded.
    '''
    unique_sentences = []
    for candidate in sentence_arr:
        if len(candidate) <= 1:
            continue
        if candidate in unique_sentences:
            continue
        unique_sentences.append(candidate)
    return unique_sentences
def remove_empty_sentences(sentences,words_arr):
    ''' sentences(1d-array), words_arr(2d-array) -> sentences (1d-array), words_arr(2d-array)

    Walk sentences and words_arr in parallel. Empty words are removed from
    each word list, and a sentence is kept, together with its cleaned word
    list, only when more than one word is left.
    '''
    kept_sentences = []
    kept_words = []
    for sentence, raw_words in zip(sentences, words_arr):
        words = [word for word in raw_words if len(word) > 0]
        # Single-word sentences are dropped because a lonely word has no
        # association with other words. Compare against 0 instead of 1 to
        # keep them.
        if len(words) <= 1:
            continue
        kept_sentences.append(sentence)
        kept_words.append(words)
    return kept_sentences, kept_words
def search_and_get_index(arr,val):
    '''arr(1d-array), val -> index(int)

    Return the position of the first item of arr that compares equal to
    val, or -1 when no item does.
    '''
    matches = (position for position, item in enumerate(arr) if item == val)
    return next(matches, -1)
def tokenize(word_arr):
    '''word_arr(2d-array) -> tokenized_sentence (2d-array), word_list (1d-array)

    Replace every word by an integer token. Tokens are handed out in order
    of first appearance starting at 0, so word_list[token] is the word the
    token stands for. tokenized_sentence mirrors the shape of word_arr.
    '''
    word_list = []
    # Maps each word to its token; a dict lookup replaces the linear scan of
    # word_list that made tokenizing quadratic in the vocabulary size.
    token_of_word = {}
    tokenized_sentence = []
    for sentence in word_arr:
        token_words = []
        for word in sentence:
            token = token_of_word.get(word)
            if token is None:
                # First time this word is seen: its token is its position
                # in word_list.
                token = len(word_list)
                token_of_word[word] = token
                word_list.append(word)
            token_words.append(token)
        tokenized_sentence.append(token_words)
    return tokenized_sentence, word_list
|