# NOTE(review): the lines that were here ("Spaces:", "Build error", the file
# size, a commit hash, and a line-number gutter) were artifacts of the web
# code viewer this file was copied from, not Python source. They are replaced
# by this comment so the module can parse.
from src.apis.config.constances import DEFAULT_TEXT_ANNOTATION_FILE, DEFAULT_DESTINATIONS
import json
import underthesea
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from src.utils.dictionary import (
number_dict,
translate_dict,
mispelling_dict,
wordform2vnese_dict,
emotion2wordform_dict,
)
# --- Module-level setup: runs at import time and requires the annotation JSON
# and the destinations spreadsheet to exist on disk. ---
with open(DEFAULT_TEXT_ANNOTATION_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)

# Prepare sentences and labels.
# Each annotation item appears to be (sentence, {"entities": [...]}) — spaCy-style;
# TODO confirm against the annotation tool that produced the file.
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]["entities"] for item in data["annotations"]]

# Define tags (entity class names declared in the annotation file).
tags = data["classes"]
# tags = ['<pad>'] + tags

# Convert tags to indices.
# NOTE(review): despite its name, tag2idx maps each tag to its OCCURRENCE
# COUNT, not an index — it is used below purely as a frequency table.
tag2idx = {tag: 0 for idx, tag in enumerate(tags)}
for label in labels:
    for entity in label:
        # entity[1] is assumed to be the tag name of the span — TODO confirm.
        tag2idx[entity[1]] = tag2idx[entity[1]] + 1

# Sort the dictionary by values (tag frequency, descending).
sorted_tags_dict = dict(sorted(tag2idx.items(), key=lambda item: item[1], reverse=True))
# Drop tags that never occur, then prepend a '<pad>' entry with count 0.
sorted_tags = {key: value for key, value in sorted_tags_dict.items() if value != 0}
new_tag = {'<pad>': 0}
sorted_tags = {**new_tag, **sorted_tags}

# Vectorize each destination's tag string into a bag-of-words row.
destinations = pd.read_excel(DEFAULT_DESTINATIONS)
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(
    destinations["tags"].values.astype("U")
).toarray()
# Drop the first row — presumably a header/placeholder destination; the same
# [1:] offset is applied when iterating destinations below. TODO confirm.
tags_vector = tags_vector[1:]
feature_names = vectorizer.get_feature_names_out()
# 10 Remove stopwords
def remove_stopwords(input_text, stopwords_file="Datasets/Query/stopword.txt"):
    """Drop every word of *input_text* listed in the stop-word file.

    The stop-word file is read on every call (one stop word per line).
    Matching is case-insensitive; surviving words keep their original casing
    and are re-joined with single spaces.
    """
    with open(stopwords_file, "r", encoding="utf-8") as handle:
        blocked = {line.strip() for line in handle}
    kept = [token for token in input_text.split() if token.lower() not in blocked]
    return " ".join(kept)
# 9 word segmentation
def word_segment(text):
    """Segment Vietnamese *text* into words via underthesea.

    With format="text" the tokenizer returns a single string; presumably
    multi-syllable compound words come back joined by underscores — verify
    against the underthesea documentation.
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented
# 8 Remove numbers
def remove_numbers(input_string):
    """Return *input_string* with every digit character removed.

    Uses str.isdigit(), so Unicode digits are removed as well, not just
    ASCII 0-9; letters, punctuation and whitespace are preserved.
    (Fix: the previous inline comment wrongly claimed isalpha() was used.)
    """
    return "".join(char for char in input_string if not char.isdigit())
# 7 Collapse whitespace
def remove_extra_whitespace(input_string):
    """Collapse every run of whitespace in *input_string* to a single space.

    Leading/trailing whitespace is stripped as a side effect of str.split().
    """
    return " ".join(input_string.split())
# 6 Transform numerals into their text form (e.g. "8" -> "tám")
def number2text(sentence):
    """Replace each whitespace-delimited token found in number_dict with its
    textual form; tokens absent from the dictionary pass through unchanged."""
    out = []
    for token in sentence.split():
        out.append(number_dict.get(token, token))
    return " ".join(out)
# 5 Normalize misspellings/acronyms (includes translating English words)
def translate2word(sentence, dictionary=translate_dict):
    """Substring-replace every variant listed in *dictionary* with its
    canonical key.

    The sentence is padded with one leading and one trailing space so variants
    written with surrounding spaces can match at the edges; note that the
    padding is NOT stripped from the returned value.
    """
    padded = " " + sentence.strip() + " "
    for canonical, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, canonical)
    return padded
def mispell2word(sentence, dictionary=mispelling_dict):
    """Replace every misspelled variant listed in *dictionary* with its
    canonical key via substring replacement.

    Pads the sentence with a space on each side so space-delimited variants
    match at the boundaries; the padding is left on the returned string.
    """
    padded = " " + sentence.strip() + " "
    for canonical, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, canonical)
    return padded
# 4 Transform word forms into Vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Map each token through wordform2vnese_dict (emoticon word-forms back
    to Vietnamese words); unknown tokens are kept as-is."""
    mapped = (wordform2vnese_dict.get(tok, tok) for tok in sentence.split())
    return " ".join(mapped)
# 3 Remove punctuation
def remove_punctuation(input_string):
    """Strip every ASCII punctuation character (string.punctuation) from
    *input_string* in a single translate() pass."""
    return input_string.translate(str.maketrans("", "", string.punctuation))
# 2 emoticon to word form ( :) - colonsmile )
def emoticon2word(sentence):
    """Replace each emoticon token with its word-form name per
    emotion2wordform_dict; non-emoticon tokens pass through unchanged."""
    return " ".join(
        emotion2wordform_dict.get(tok, tok) for tok in sentence.split()
    )
# 1 lower case
def lower_case(text):
    """Return *text* converted to lowercase (str.lower, not casefold)."""
    return str.lower(text)
def data_preprocessing(text):
    """Run the full cleaning pipeline over *text*.

    Order: lowercase -> strip punctuation -> fix misspellings -> numerals to
    text -> collapse whitespace -> Vietnamese word segmentation -> drop
    stopwords.  (emoticon2word / word_form2Vnese / translate2word /
    remove_numbers are defined above but are not part of this chain.)
    """
    steps = (
        lower_case,
        remove_punctuation,
        mispell2word,
        number2text,
        remove_extra_whitespace,
        word_segment,
        remove_stopwords,
    )
    for step in steps:
        text = step(text)
    return text
def read_input(input):  # final entry point for reading and preprocessing an input sentence
    # NOTE(review): the parameter name shadows the builtin `input`; kept
    # unchanged for caller compatibility.
    return data_preprocessing(input)
def create_bias_weights():
    """
    Create a weights vector for bias based on the given tags and weights.

    The function initializes a weights vector to zero, then maps the weights
    from the weights_tags_vector to the appropriate positions in the
    weights_vector based on the tags present in the destinations.  The matrix
    is saved to Datasets/Weights/weights_bias.npy; nothing is returned.

    Relies on module-level globals: destinations, tags_vector, feature_names.
    NOTE(review): each inner weight list is assumed to be aligned, in order,
    with the tags of the corresponding row of destinations["tags"][1:] —
    TODO confirm.  If a list is shorter than the row's tag count, zip
    truncates silently; if a tag is missing from the vectorizer vocabulary,
    np.where(...)[0][0] raises IndexError.
    """
    # Hand-tuned weights: one list per destination (rows aligned with
    # destinations["tags"][1:]).  Values preserved verbatim.
    weights_tags_vector = [
        [15, 15, 0.9, 15, 15, 10, 1, 5, 0.6, 0.9, 0.9, 0.8, 10, 10, 1, 15],
        [15, 15, 0.9, 15, 15, 10, 15, 1, 10, 0.6, 0.9, 0.9, 0.8, 10, 10, 15, 0.8, 15],
        [15, 0.9, 0.8, 15, 15, 1, 10, 10, 0.6, 0.9, 0.9, 0.8, 5, 5, 1, 15],
        [15, 15, 0.9, 15, 0.7, 15, 15, 15, 1, 10, 10, 1, 0.9, 0.9, 0.9, 5, 5, 15, 0.8, 15],
        [10, 10, 15, 15, 0.8, 0.9, 15, 15, 15, 1, 10, 10, 0.6, 0.5, 0.9, 0.9, 0.8, 0.7, 15, 15, 15, 15, 15],
        [0.8, 0.9, 15, 0.8, 15, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.8, 15, 10, 1, 15],
        [0.9, 0.8, 5, 1, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.9, 0.8, 15, 1, 1, 15],
        [0.8, 0.9, 5, 1, 15, 15, 0.9, 0.9, 0.9, 0.8, 15, 1, 15],
        [0.8, 0.7, 15, 15, 1, 10, 0.7, 0.7, 0.6, 5, 5, 15],
        [0.8, 5, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.9, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 1, 10, 15],
        [10, 0.9, 0.8, 1, 15, 15, 15, 0.8, 10, 15],
        [0.8, 15, 1, 15, 15, 0.8, 10, 15],
        [10, 0.8, 1, 15, 1, 0.9, 0.8, 5, 0.8],
        [0.8, 15, 1, 5, 0.9, 0.8, 0.7, 0.7],
        [0.9, 0.8, 15, 1, 15, 0.7, 0.8, 0.7, 0.7, 5, 5, 15],
        [0.8, 0.7, 1, 5, 0.9, 10, 10, 15],
        [0.8, 1, 15, 15, 1, 0.9, 0.8, 0.8, 15],
        [0.8, 1, 10, 5, 5, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [10, 10, 10, 1, 10, 0.8, 1, 5, 10, 10, 10, 10, 1, 0.9, 1, 1, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [0.8, 0.7, 1, 10, 10, 0.8, 0.9, 15],
        [10, 0.8, 0.7, 15, 15, 1, 15, 15, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [5, 0.8, 0.7, 5, 5, 1, 10, 10, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [0.8, 0.7, 15, 5, 1, 10, 10, 10, 0.8, 0.7, 0.7, 5, 5, 5, 10, 15],
        [5, 5, 10, 15, 15, 15, 15, 0.9, 0.8, 0.7, 0.7, 1, 15],
        [10, 10, 15, 15, 10, 5, 1, 15, 15, 15, 15, 0.7, 5, 5, 0.8, 1, 15],
        [10, 15, 15, 15, 10, 10, 1, 1, 1, 15, 15, 5, 5],
        [0.8, 0.7, 0.6, 0.8, 1, 1, 1, 0.9, 0.8, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.8, 0.8, 0.7, 0.9, 5, 5, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.7, 0.6, 0.8, 0.8, 0.8, 0.7, 5, 5, 1, 0.7, 0.6, 15],
        [0.9, 0.7, 1, 1, 0.8, 0.7, 0.8, 0.8, 0.7, 1, 1, 1, 1, 15],
    ]
    # Create a weights vector initialized to zero (same shape as the
    # bag-of-words tags_vector built at module level).
    weights_vector = np.zeros(tags_vector.shape)
    # Map weights to the appropriate positions in the weights_vector.
    for i, row in enumerate(destinations["tags"][1:].values):
        # NOTE: this local `tags` shadows the module-level `tags` list.
        tags = row.split()
        for tag, weight in zip(tags, weights_tags_vector[i]):
            # Locate the tag's column in the vectorizer vocabulary.
            index = np.where(feature_names == tag.lower())[0][0]
            weights_vector[i][index] = weight
    np.save("Datasets/Weights/weights_bias.npy", weights_vector)
def create_freq_weights():
    """
    Create a weights vector of frequency-based weights for the destination tags.

    For every tag of every destination, the weight is the tag's annotation
    frequency divided by the maximum frequency over all tags, rounded to two
    decimals.  The matrix (same shape as tags_vector) is saved to
    Datasets/Weights/weights_freq.npy; nothing is returned.

    Relies on module-level globals: tags_vector, sorted_tags_dict,
    feature_names, destinations.
    NOTE(review): the frequency lookup uses tag.replace('_', ' ') against
    sorted_tags_dict but tag.lower() against feature_names — a tag missing
    from either raises KeyError / IndexError.  TODO confirm both spellings
    always exist.
    """
    # Create a weights vector initialized to zero.
    weights_vector = np.zeros(tags_vector.shape)
    max_freq = max(sorted_tags_dict.values())
    # Map weights to the appropriate positions in the weights_vector.
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()  # NOTE: shadows the module-level `tags` list
        for tag in tags:
            index = np.where(feature_names == tag.lower())[0][0]
            # Fix: round numerically instead of formatting the ratio into an
            # f-string and relying on numpy to coerce the string back to float.
            weights_vector[i][index] = round(
                sorted_tags_dict[tag.replace("_", " ")] / max_freq, 2
            )
    np.save("Datasets/Weights/weights_freq.npy", weights_vector)
# Build and persist both weight matrices at import time, then reload them
# from disk for use by the rest of the application.
create_bias_weights()
create_freq_weights()
weights_bias_vector = np.load("Datasets/Weights/weights_bias.npy")
weights_freq = np.load("Datasets/Weights/weights_freq.npy")
weighted_tags_vector = weights_bias_vector |