from src.apis.config.constances import DEFAULT_TEXT_ANNOTATION_FILE, DEFAULT_DESTINATIONS

import json
import string

import numpy as np
import pandas as pd
import underthesea
from sklearn.feature_extraction.text import CountVectorizer

from src.utils.dictionary import (
    number_dict,
    translate_dict,
    mispelling_dict,
    wordform2vnese_dict,
    emotion2wordform_dict,
)

with open(DEFAULT_TEXT_ANNOTATION_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)

# Prepare sentences and labels
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]["entities"] for item in data["annotations"]]

# Define tags
tags = data["classes"]
# tags = [''] + tags

# Count the frequency of each tag across all annotations
tag2idx = {tag: 0 for tag in tags}
for label in labels:
    for entity in label:
        tag2idx[entity[1]] = tag2idx[entity[1]] + 1

# Sort the dictionary by frequency (descending) and drop unused tags
sorted_tags_dict = dict(sorted(tag2idx.items(), key=lambda item: item[1], reverse=True))
sorted_tags = {key: value for key, value in sorted_tags_dict.items() if value != 0}
new_tag = {"": 0}
sorted_tags = {**new_tag, **sorted_tags}

destinations = pd.read_excel(DEFAULT_DESTINATIONS)

vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(
    destinations["tags"].values.astype("U")
).toarray()
tags_vector = tags_vector[1:]  # drop the first row, to align with destinations["tags"][1:] below
feature_names = vectorizer.get_feature_names_out()


# 10 Remove stopwords
def remove_stopwords(input_text, stopwords_file="Datasets/Query/stopword.txt"):
    # Read the custom stop words from the file
    with open(stopwords_file, "r", encoding="utf-8") as file:
        stopwords = set(line.strip() for line in file)
    cleaned_words = [
        word for word in input_text.split() if word.lower() not in stopwords
    ]
    cleaned_text = " ".join(cleaned_words)
    return cleaned_text


# 9 Word segmentation
def word_segment(text):
    return underthesea.word_tokenize(text, format="text")


# 8 Remove numbers
def remove_numbers(input_string):
    # Use the isdigit() method to filter out numeric characters
    cleaned_string = "".join(char for char in input_string if not char.isdigit())
    return cleaned_string


# 7 Collapse repeated whitespace
def remove_extra_whitespace(input_string):
    words = input_string.split()
    return " ".join(words)


# 6 Transform numbers to text (8 -> tám)
def number2text(sentence):
    words = sentence.split()
    converted_words = [number_dict.get(word, word) for word in words]
    converted_sentence = " ".join(converted_words)
    return converted_sentence


# 5 Normalize misspelled words and acronyms (including translating English words)
def translate2word(sentence, dictionary=translate_dict):
    sentence = " " + sentence.strip() + " "
    for key, value_list in dictionary.items():
        for value in value_list:
            sentence = sentence.replace(value, key)
    return sentence


def mispell2word(sentence, dictionary=mispelling_dict):
    sentence = " " + sentence.strip() + " "
    for key, value_list in dictionary.items():
        for value in value_list:
            sentence = sentence.replace(value, key)
    return sentence


# 4 Transform word forms into Vietnamese (colonsmile -> cười)
def word_form2Vnese(sentence):
    words = sentence.split()
    converted_words = [wordform2vnese_dict.get(word, word) for word in words]
    converted_sentence = " ".join(converted_words)
    return converted_sentence


# 3 Remove punctuation
def remove_punctuation(input_string):
    # Create a translation table that removes all punctuation characters
    translator = str.maketrans("", "", string.punctuation)
    # Use the translate method to remove punctuation
    cleaned_string = input_string.translate(translator)
    return cleaned_string
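
# --- Illustrative sketch (not part of the original pipeline) ---
# Shows how a few of the cleaning helpers above compose on a hypothetical
# input. The sample sentence is an assumption for demonstration only; what
# number2text produces depends on the contents of number_dict.
def _demo_cleaning_helpers():
    sample = "Tôi muốn đi Đà Lạt 2 ngày,   giá rẻ!!!"
    step = remove_punctuation(sample)     # strips the comma and exclamation marks
    step = number2text(step)              # "2" -> its word form, if present in number_dict
    step = remove_extra_whitespace(step)  # collapses the repeated spaces
    return step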
# 2 Emoticon to word form ( :) -> colonsmile )
def emoticon2word(sentence):
    words = sentence.split()
    converted_words = [emotion2wordform_dict.get(word, word) for word in words]
    converted_sentence = " ".join(converted_words)
    return converted_sentence


# 1 Lower case
def lower_case(text):
    return text.lower()


def data_preprocessing(text):
    return remove_stopwords(
        word_segment(
            remove_extra_whitespace(
                number2text(mispell2word(remove_punctuation(lower_case(text))))
            )
        )
    )


def read_input(input):
    # Final step when reading and processing an input sentence
    return data_preprocessing(input)


def create_bias_weights():
    """
    Create a weights vector for bias based on the given tags and weights.

    The function initializes a weights vector to zero, then maps the weights
    from weights_tags_vector to the appropriate positions in weights_vector
    based on the tags present in the destinations.
    """
    weights_tags_vector = [
        [15, 15, 0.9, 15, 15, 10, 1, 5, 0.6, 0.9, 0.9, 0.8, 10, 10, 1, 15],
        [15, 15, 0.9, 15, 15, 10, 15, 1, 10, 0.6, 0.9, 0.9, 0.8, 10, 10, 15, 0.8, 15],
        [15, 0.9, 0.8, 15, 15, 1, 10, 10, 0.6, 0.9, 0.9, 0.8, 5, 5, 1, 15],
        [15, 15, 0.9, 15, 0.7, 15, 15, 15, 1, 10, 10, 1, 0.9, 0.9, 0.9, 5, 5, 15, 0.8, 15],
        [10, 10, 15, 15, 0.8, 0.9, 15, 15, 15, 1, 10, 10, 0.6, 0.5, 0.9, 0.9, 0.8, 0.7, 15, 15, 15, 15, 15],
        [0.8, 0.9, 15, 0.8, 15, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.8, 15, 10, 1, 15],
        [0.9, 0.8, 5, 1, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.9, 0.8, 15, 1, 1, 15],
        [0.8, 0.9, 5, 1, 15, 15, 0.9, 0.9, 0.9, 0.8, 15, 1, 15],
        [0.8, 0.7, 15, 15, 1, 10, 0.7, 0.7, 0.6, 5, 5, 15],
        [0.8, 5, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.9, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 1, 10, 15],
        [10, 0.9, 0.8, 1, 15, 15, 15, 0.8, 10, 15],
        [0.8, 15, 1, 15, 15, 0.8, 10, 15],
        [10, 0.8, 1, 15, 1, 0.9, 0.8, 5, 0.8],
        [0.8, 15, 1, 5, 0.9, 0.8, 0.7, 0.7],
        [0.9, 0.8, 15, 1, 15, 0.7, 0.8, 0.7, 0.7, 5, 5, 15],
        [0.8, 0.7, 1, 5, 0.9, 10, 10, 15],
        [0.8, 1, 15, 15, 1, 0.9, 0.8, 0.8, 15],
        [0.8, 1, 10, 5, 5, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [10, 10, 10, 1, 10, 0.8, 1, 5, 10, 10, 10, 10, 1, 0.9, 1, 1, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [0.8, 0.7, 1, 10, 10, 0.8, 0.9, 15],
        [10, 0.8, 0.7, 15, 15, 1, 15, 15, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [5, 0.8, 0.7, 5, 5, 1, 10, 10, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [0.8, 0.7, 15, 5, 1, 10, 10, 10, 0.8, 0.7, 0.7, 5, 5, 5, 10, 15],
        [5, 5, 10, 15, 15, 15, 15, 0.9, 0.8, 0.7, 0.7, 1, 15],
        [10, 10, 15, 15, 10, 5, 1, 15, 15, 15, 15, 0.7, 5, 5, 0.8, 1, 15],
        [10, 15, 15, 15, 10, 10, 1, 1, 1, 15, 15, 5, 5],
        [0.8, 0.7, 0.6, 0.8, 1, 1, 1, 0.9, 0.8, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.8, 0.8, 0.7, 0.9, 5, 5, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.7, 0.6, 0.8, 0.8, 0.8, 0.7, 5, 5, 1, 0.7, 0.6, 15],
        [0.9, 0.7, 1, 1, 0.8, 0.7, 0.8, 0.8, 0.7, 1, 1, 1, 1, 15],
    ]

    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)

    # Map weights to the appropriate positions in the weights_vector
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()
        for tag, weight in zip(tags, weights_tags_vector[i]):
            index = np.where(feature_names == tag.lower())[0][0]
            weights_vector[i][index] = weight

    np.save("Datasets/Weights/weights_bias.npy", weights_vector)
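
# --- Illustrative sketch (assumption: toy data, not the project's destinations) ---
# Demonstrates the mapping step used in create_bias_weights: look up a tag's
# column in the CountVectorizer vocabulary, then write a weight at that column.
def _demo_weight_mapping():
    toy_vectorizer = CountVectorizer()
    toy_matrix = toy_vectorizer.fit_transform(["beach food", "museum food"]).toarray()
    toy_names = toy_vectorizer.get_feature_names_out()
    toy_weights = np.zeros(toy_matrix.shape, dtype=float)
    # Same lookup pattern as above: the column index of the tag "food".
    index = np.where(toy_names == "food")[0][0]
    toy_weights[0][index] = 15  # bias weight for "food" in the first toy row
    return toy_weights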
def create_freq_weights():
    """
    Create a weights vector for frequency-based weights from the given tags
    and their frequencies.

    The function initializes a weights vector to zero, then maps the weights
    from sorted_tags_dict to the appropriate positions in weights_vector
    based on the tags present in the destinations. Each weight is the ratio
    of a tag's frequency to the maximum frequency among all tags.

    Module-level inputs:
        tags_vector (numpy.ndarray): 2D array where each row corresponds to a
            destination and each column to a tag; the value at each position
            is 1 if the tag is present in the destination, and 0 otherwise.
        sorted_tags_dict (dict): Maps tags to their frequencies.
        feature_names (numpy.ndarray): 1D array of feature (tag) names.
        destinations (pandas.DataFrame): Destinations data, including the
            tags column.

    Side effect:
        Saves the weights vector to Datasets/Weights/weights_freq.npy. Each
        row corresponds to a destination, each column to a tag, and each
        value is the weight of that tag for that destination.
    """
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)
    max_freq = max(sorted_tags_dict.values())

    # Map weights to the appropriate positions in the weights_vector
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()
        for tag in tags:
            index = np.where(feature_names == tag.lower())[0][0]
            # Frequency normalized by the maximum frequency, rounded to two decimals
            weights_vector[i][index] = round(
                sorted_tags_dict[tag.replace("_", " ")] / max_freq, 2
            )

    np.save("Datasets/Weights/weights_freq.npy", weights_vector)


create_bias_weights()
create_freq_weights()

weights_bias_vector = np.load("Datasets/Weights/weights_bias.npy")
weights_freq = np.load("Datasets/Weights/weights_freq.npy")
weighted_tags_vector = weights_bias_vector
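
# --- Illustrative usage sketch (assumption) ---
# score_query is a hypothetical helper, not part of the original code: it
# shows one way the weighted tag vector built above might be consumed. A
# query is preprocessed with read_input, vectorized with the same
# vocabulary, and scored against each destination's weighted tag row.
def score_query(query):
    cleaned = read_input(query)
    query_vector = vectorizer.transform([cleaned]).toarray()[0]
    # Dot product of query term counts with each destination's weighted tags
    return weighted_tags_vector @ query_vector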