# Triventure-AI — src/utils/preprocessing.py
from src.apis.config.constances import DEFAULT_TEXT_ANNOTATION_FILE, DEFAULT_DESTINATIONS
import json
import underthesea
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from src.utils.dictionary import (
number_dict,
translate_dict,
mispelling_dict,
wordform2vnese_dict,
emotion2wordform_dict,
)
# Load the NER annotation file at import time (side effect: reads from disk).
with open(DEFAULT_TEXT_ANNOTATION_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)
# Prepare sentences and labels
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]["entities"] for item in data["annotations"]]
# Define tags
tags = data["classes"]
# tags = ['<pad>'] + tags
# Convert tags to indices
# NOTE(review): despite its name, tag2idx holds per-tag OCCURRENCE COUNTS,
# not indices - every tag starts at 0 and is incremented in the loop below.
# The enumerate index `idx` is unused.
tag2idx = {tag: 0 for idx, tag in enumerate(tags)}
for label in labels:
    for entity in label:
        # assumes entity[1] is the tag label of the annotated span
        # (otherwise this lookup would KeyError) - TODO confirm the
        # annotation tuple layout against DEFAULT_TEXT_ANNOTATION_FILE.
        tag2idx[entity[1]] = tag2idx[entity[1]] + 1
# Sort the dictionary by values
# Tags ordered by descending frequency; used by create_freq_weights() below.
sorted_tags_dict = dict(sorted(tag2idx.items(), key=lambda item: item[1],reverse=True))
# Keep only tags that actually occur, then prepend the '<pad>' sentinel.
sorted_tags = {key: value for key, value in sorted_tags_dict.items() if value != 0}
new_tag = {'<pad>': 0}
sorted_tags = {**new_tag, **sorted_tags}
# Destination catalogue (side effect: reads Excel at import time); assumed to
# contain a space-separated "tags" column - TODO confirm sheet schema.
destinations = pd.read_excel(DEFAULT_DESTINATIONS)
# Bag-of-words over the destination tag strings.
# NOTE(review): stop_words="english" is applied even though the surrounding
# pipeline is Vietnamese - confirm the tag vocabulary really is English,
# otherwise no tokens are removed (or wrong ones are).
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(
    destinations["tags"].values.astype("U")
).toarray()
# Drop the first destination row; the weight builders below iterate
# destinations["tags"][1:] to stay aligned with this slice.
tags_vector = tags_vector[1:]
feature_names = vectorizer.get_feature_names_out()
# 10 Remove stopwords
# Cache of stop-word sets keyed by file path, so the file is read at most
# once per path instead of on every call.
_stopwords_cache = {}


def _load_stopwords(stopwords_file):
    """Read a stop-word file (one word per line) and cache the set by path.

    NOTE: edits to the file after the first read are not picked up.
    """
    if stopwords_file not in _stopwords_cache:
        with open(stopwords_file, "r", encoding="utf-8") as file:
            _stopwords_cache[stopwords_file] = set(line.strip() for line in file)
    return _stopwords_cache[stopwords_file]


def remove_stopwords(input_text, stopwords_file="Datasets/Query/stopword.txt"):
    """Remove stop words (matched case-insensitively) from *input_text*.

    Parameters:
        input_text: whitespace-separated text to clean.
        stopwords_file: path to a file with one stop word per line.

    Returns:
        The remaining words re-joined by single spaces.
    """
    stopwords = _load_stopwords(stopwords_file)
    return " ".join(
        word for word in input_text.split() if word.lower() not in stopwords
    )
# 9 word segmentation
def word_segment(text):
    """Segment Vietnamese text into words with underthesea.

    Returns the tokenized sentence as a single string (``format="text"``).
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented
# 8 Remove numbers
def remove_numbers(input_string):
    """Return *input_string* with every digit character removed."""
    kept = []
    for char in input_string:
        if not char.isdigit():
            kept.append(char)
    return "".join(kept)
# 7
def remove_extra_whitespace(input_string):
    """Collapse runs of whitespace into single spaces and trim the ends."""
    return " ".join(input_string.split())
# 6 Tranform Number to text (8 - tám)
def number2text(sentence):
    """Replace digit tokens with their text form (e.g. "8" -> "tám").

    Tokens found in ``number_dict`` are swapped; all other tokens pass
    through unchanged.
    """
    out = []
    for token in sentence.split():
        out.append(number_dict.get(token, token))
    return " ".join(out)
# 5 Transform mispelling words, acronyms, .....(include translate english words)
def translate2word(sentence, dictionary=translate_dict):
    """Replace every known variant substring with its canonical key.

    The sentence is padded with a space on both sides, presumably so
    variants stored with surrounding spaces can match at the boundaries -
    note the padding is NOT stripped from the returned string (downstream
    split-based steps discard it anyway).
    """
    padded = " " + sentence.strip() + " "
    for canonical, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, canonical)
    return padded
def mispell2word(sentence, dictionary=mispelling_dict):
    """Replace misspelled/acronym variants with their canonical spelling.

    Same padding trick as ``translate2word``: a leading and trailing space
    is added (and not removed) so boundary-anchored variants can match.
    """
    padded = " " + sentence.strip() + " "
    for correct, misspellings in dictionary.items():
        for wrong in misspellings:
            padded = padded.replace(wrong, correct)
    return padded
# 4 Transform word from into vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Map word-form placeholders back to Vietnamese (e.g. "colonsmile" -> "cười").

    Tokens found in ``wordform2vnese_dict`` are replaced; unknown tokens
    are kept as-is.
    """
    return " ".join(
        wordform2vnese_dict.get(token, token) for token in sentence.split()
    )
# 3 f
def remove_punctuation(input_string):
    """Strip every ASCII punctuation character from *input_string*."""
    # Map each punctuation character to None so translate() deletes it.
    table = str.maketrans({mark: None for mark in string.punctuation})
    return input_string.translate(table)
# 2 emoticon to word form ( :) - colonsmile )
def emoticon2word(sentence):
    """Turn emoticon tokens into their word-form codes (e.g. ":)" -> "colonsmile").

    Tokens found in ``emotion2wordform_dict`` are replaced; everything
    else passes through unchanged.
    """
    mapped = [emotion2wordform_dict.get(tok, tok) for tok in sentence.split()]
    return " ".join(mapped)
# 1 lower case
def lower_case(text):
    """Return *text* converted to lower case."""
    return str.lower(text)
def data_preprocessing(text):
    """Run the full text-cleaning pipeline on *text*.

    Steps, in order: lower-casing, punctuation removal, misspelling
    normalization, number-to-text conversion, whitespace collapsing,
    word segmentation, stop-word removal.
    """
    pipeline = (
        lower_case,
        remove_punctuation,
        mispell2word,
        number2text,
        remove_extra_whitespace,
        word_segment,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text
def read_input(input):  # NOTE: parameter name shadows the builtin `input`
    """Final entry point: clean a raw user sentence through the pipeline."""
    return data_preprocessing(input)
def create_bias_weights():
    """
    Create and persist a hand-tuned bias-weight matrix for the destinations.

    Each inner list of ``weights_tags_vector`` holds, in tag order, the
    manual weight for every tag of the corresponding destination.  Rows pair
    positionally with ``destinations["tags"][1:]`` (the first destination is
    skipped, matching the ``tags_vector = tags_vector[1:]`` slice above).
    The weights are scattered into a dense matrix shaped like
    ``tags_vector`` and saved to ``Datasets/Weights/weights_bias.npy``.

    Side effect: writes to disk; nothing is returned.  Relies on the module
    globals ``tags_vector``, ``feature_names`` and ``destinations``.
    """
    # One weight list per destination; each entry pairs positionally with the
    # tokens of that destination's space-separated "tags" cell.  NOTE(review):
    # if a list is shorter/longer than the tag list, zip() silently truncates
    # - confirm the counts stay in sync with the Excel sheet.
    weights_tags_vector = [
        [15, 15, 0.9, 15, 15, 10, 1, 5, 0.6, 0.9, 0.9, 0.8, 10, 10, 1, 15],
        [15, 15, 0.9, 15, 15, 10, 15, 1, 10, 0.6, 0.9, 0.9, 0.8, 10, 10, 15, 0.8, 15],
        [15, 0.9, 0.8, 15, 15, 1, 10, 10, 0.6, 0.9, 0.9, 0.8, 5, 5, 1, 15],
        [15, 15, 0.9, 15, 0.7, 15, 15, 15, 1, 10, 10, 1, 0.9, 0.9, 0.9, 5, 5, 15, 0.8, 15],
        [10, 10, 15, 15, 0.8, 0.9, 15, 15, 15, 1, 10, 10, 0.6, 0.5, 0.9, 0.9, 0.8, 0.7, 15, 15, 15, 15, 15],
        [0.8, 0.9, 15, 0.8, 15, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.8, 15, 10, 1, 15],
        [0.9, 0.8, 5, 1, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.9, 0.8, 15, 1, 1, 15],
        [0.8, 0.9, 5, 1, 15, 15, 0.9, 0.9, 0.9, 0.8, 15, 1, 15],
        [0.8, 0.7, 15, 15, 1, 10, 0.7, 0.7, 0.6, 5, 5, 15],
        [0.8, 5, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.9, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 1, 10, 15],
        [10, 0.9, 0.8, 1, 15, 15, 15, 0.8, 10, 15],
        [0.8, 15, 1, 15, 15, 0.8, 10, 15],
        [10, 0.8, 1, 15, 1, 0.9, 0.8, 5, 0.8],
        [0.8, 15, 1, 5, 0.9, 0.8, 0.7, 0.7],
        [0.9, 0.8, 15, 1, 15, 0.7, 0.8, 0.7, 0.7, 5, 5, 15],
        [0.8, 0.7, 1, 5, 0.9, 10, 10, 15],
        [0.8, 1, 15, 15, 1, 0.9, 0.8, 0.8, 15],
        [0.8, 1, 10, 5, 5, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [10, 10, 10, 1, 10, 0.8, 1, 5, 10, 10, 10, 10, 1, 0.9, 1, 1, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [0.8, 0.7, 1, 10, 10, 0.8, 0.9, 15],
        [10, 0.8, 0.7, 15, 15, 1, 15, 15, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [5, 0.8, 0.7, 5, 5, 1, 10, 10, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [0.8, 0.7, 15, 5, 1, 10, 10, 10, 0.8, 0.7, 0.7, 5, 5, 5, 10, 15],
        [5, 5, 10, 15, 15, 15, 15, 0.9, 0.8, 0.7, 0.7, 1, 15],
        [10, 10, 15, 15, 10, 5, 1, 15, 15, 15, 15, 0.7, 5, 5, 0.8, 1, 15],
        [10, 15, 15, 15, 10, 10, 1, 1, 1, 15, 15, 5, 5],
        [0.8, 0.7, 0.6, 0.8, 1, 1, 1, 0.9, 0.8, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.8, 0.8, 0.7, 0.9, 5, 5, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.7, 0.6, 0.8, 0.8, 0.8, 0.7, 5, 5, 1, 0.7, 0.6, 15],
        [0.9, 0.7, 1, 1, 0.8, 0.7, 0.8, 0.8, 0.7, 1, 1, 1, 1, 15],
    ]
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)
    # Map weights to the appropriate positions in the weights_vector
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()
        for tag, weight in zip(tags, weights_tags_vector[i]):
            # NOTE(review): raises IndexError if a tag is missing from the
            # vectorizer vocabulary (pruned by max_features / stop_words).
            index = np.where(feature_names == tag.lower())[0][0]
            weights_vector[i][index] = weight
    np.save("Datasets/Weights/weights_bias.npy", weights_vector)
def create_freq_weights():
    """
    Create and persist a frequency-based weight matrix for the destinations.

    For every tag of every destination (rows pair with
    ``destinations["tags"][1:]``, matching the ``tags_vector`` slice above),
    the weight is the tag's annotation frequency divided by the maximum
    frequency over all tags, rounded to two decimals.  The matrix has the
    same shape as ``tags_vector`` and is saved to
    ``Datasets/Weights/weights_freq.npy``.

    Side effect: writes to disk; nothing is returned.  Relies on the module
    globals ``tags_vector``, ``sorted_tags_dict``, ``feature_names`` and
    ``destinations``.
    """
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)
    max_freq = max(sorted_tags_dict.values())
    # Map weights to the appropriate positions in the weights_vector
    for i, row in enumerate(destinations["tags"][1:].values):
        for tag in row.split():
            # NOTE(review): raises IndexError if a tag was pruned from the
            # vectorizer vocabulary (max_features / stop_words).
            index = np.where(feature_names == tag.lower())[0][0]
            # Vectorizer tokens use "_" where the annotation classes use
            # spaces, so map back before the frequency lookup.  Assign the
            # ratio as a float rounded to two decimals; the original routed
            # it through an f-string and relied on numpy coercing the string
            # back to float on assignment.
            weights_vector[i][index] = round(
                sorted_tags_dict[tag.replace("_", " ")] / max_freq, 2
            )
    np.save("Datasets/Weights/weights_freq.npy", weights_vector)
# Build and persist both weight matrices at import time (side effect:
# writes .npy files to Datasets/Weights/).
create_bias_weights()
create_freq_weights()
# Reload the freshly written matrices from disk.
weights_bias_vector = np.load("Datasets/Weights/weights_bias.npy")
weights_freq = np.load("Datasets/Weights/weights_freq.npy")
# Only the bias weights are exposed as the final weighted vector here;
# weights_freq is loaded but not combined in this module.
weighted_tags_vector = weights_bias_vector