# Triventure-AI — src/utils/preprocessing.py
from src.apis.config.constances import DEFAULT_TEXT_ANNOTATION_FILE, DEFAULT_DESTINATIONS
import json
import underthesea
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from src.utils.dictionary import (
number_dict,
translate_dict,
mispelling_dict,
wordform2vnese_dict,
emotion2wordform_dict,
)
# Load the NER annotation file at import time (side effect: reads from disk).
with open(DEFAULT_TEXT_ANNOTATION_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)
# Prepare sentences and labels
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]["entities"] for item in data["annotations"]]
# Define tags
tags = data["classes"]
# tags = ['<pad>'] + tags
# Convert tags to indices
# NOTE(review): despite its name, tag2idx holds per-tag OCCURRENCE COUNTS,
# not indices - every tag starts at 0 and is incremented in the loop below.
# The enumerate index `idx` is unused.
tag2idx = {tag: 0 for idx, tag in enumerate(tags)}
for label in labels:
    for entity in label:
        # assumes entity[1] is the tag label of the annotated span
        # (otherwise this lookup would KeyError) - TODO confirm the
        # annotation tuple layout against DEFAULT_TEXT_ANNOTATION_FILE.
        tag2idx[entity[1]] = tag2idx[entity[1]] + 1
# Sort the dictionary by values
# Tags ordered by descending frequency; used by create_freq_weights() below.
sorted_tags_dict = dict(sorted(tag2idx.items(), key=lambda item: item[1],reverse=True))
# Keep only tags that actually occur, then prepend the '<pad>' sentinel.
sorted_tags = {key: value for key, value in sorted_tags_dict.items() if value != 0}
new_tag = {'<pad>': 0}
sorted_tags = {**new_tag, **sorted_tags}
# Destination catalogue (side effect: reads Excel at import time); assumed to
# contain a space-separated "tags" column - TODO confirm sheet schema.
destinations = pd.read_excel(DEFAULT_DESTINATIONS)
# Bag-of-words over the destination tag strings.
# NOTE(review): stop_words="english" is applied even though the surrounding
# pipeline is Vietnamese - confirm the tag vocabulary really is English,
# otherwise no tokens are removed (or wrong ones are).
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(
    destinations["tags"].values.astype("U")
).toarray()
# Drop the first destination row; the weight builders below iterate
# destinations["tags"][1:] to stay aligned with this slice.
tags_vector = tags_vector[1:]
feature_names = vectorizer.get_feature_names_out()
# 10 Remove stopwords
# Cache of stop-word sets keyed by file path, so the file is read at most
# once per path instead of on every call.
_stopwords_cache = {}


def _load_stopwords(stopwords_file):
    """Read a stop-word file (one word per line) and cache the set by path.

    NOTE: edits to the file after the first read are not picked up.
    """
    if stopwords_file not in _stopwords_cache:
        with open(stopwords_file, "r", encoding="utf-8") as file:
            _stopwords_cache[stopwords_file] = set(line.strip() for line in file)
    return _stopwords_cache[stopwords_file]


def remove_stopwords(input_text, stopwords_file="Datasets/Query/stopword.txt"):
    """Remove stop words (matched case-insensitively) from *input_text*.

    Parameters:
        input_text: whitespace-separated text to clean.
        stopwords_file: path to a file with one stop word per line.

    Returns:
        The remaining words re-joined by single spaces.
    """
    stopwords = _load_stopwords(stopwords_file)
    return " ".join(
        word for word in input_text.split() if word.lower() not in stopwords
    )
# 9 word segmentation
def word_segment(text):
    """Segment Vietnamese text into words with underthesea.

    Returns the tokenized sentence as a single string (``format="text"``).
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented
# 8 Remove numbers
def remove_numbers(input_string):
    """Return *input_string* with every digit character removed."""
    kept = []
    for char in input_string:
        if not char.isdigit():
            kept.append(char)
    return "".join(kept)
# 7
def remove_extra_whitespace(input_string):
    """Collapse runs of whitespace into single spaces and trim the ends."""
    return " ".join(input_string.split())
# 6 Tranform Number to text (8 - tám)
def number2text(sentence):
    """Replace digit tokens with their text form (e.g. "8" -> "tám").

    Tokens found in ``number_dict`` are swapped; all other tokens pass
    through unchanged.
    """
    out = []
    for token in sentence.split():
        out.append(number_dict.get(token, token))
    return " ".join(out)
# 5 Transform mispelling words, acronyms, .....(include translate english words)
def translate2word(sentence, dictionary=translate_dict):
    """Replace every known variant substring with its canonical key.

    The sentence is padded with a space on both sides, presumably so
    variants stored with surrounding spaces can match at the boundaries -
    note the padding is NOT stripped from the returned string (downstream
    split-based steps discard it anyway).
    """
    padded = " " + sentence.strip() + " "
    for canonical, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, canonical)
    return padded
def mispell2word(sentence, dictionary=mispelling_dict):
    """Replace misspelled/acronym variants with their canonical spelling.

    Same padding trick as ``translate2word``: a leading and trailing space
    is added (and not removed) so boundary-anchored variants can match.
    """
    padded = " " + sentence.strip() + " "
    for correct, misspellings in dictionary.items():
        for wrong in misspellings:
            padded = padded.replace(wrong, correct)
    return padded
# 4 Transform word from into vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Map word-form placeholders back to Vietnamese (e.g. "colonsmile" -> "cười").

    Tokens found in ``wordform2vnese_dict`` are replaced; unknown tokens
    are kept as-is.
    """
    return " ".join(
        wordform2vnese_dict.get(token, token) for token in sentence.split()
    )
# 3 f
def remove_punctuation(input_string):
    """Strip every ASCII punctuation character from *input_string*."""
    # Map each punctuation character to None so translate() deletes it.
    table = str.maketrans({mark: None for mark in string.punctuation})
    return input_string.translate(table)
# 2 emoticon to word form ( :) - colonsmile )
def emoticon2word(sentence):
    """Turn emoticon tokens into their word-form codes (e.g. ":)" -> "colonsmile").

    Tokens found in ``emotion2wordform_dict`` are replaced; everything
    else passes through unchanged.
    """
    mapped = [emotion2wordform_dict.get(tok, tok) for tok in sentence.split()]
    return " ".join(mapped)
# 1 lower case
def lower_case(text):
    """Return *text* converted to lower case."""
    return str.lower(text)
def data_preprocessing(text):
    """Run the full text-cleaning pipeline on *text*.

    Steps, in order: lower-casing, punctuation removal, misspelling
    normalization, number-to-text conversion, whitespace collapsing,
    word segmentation, stop-word removal.
    """
    pipeline = (
        lower_case,
        remove_punctuation,
        mispell2word,
        number2text,
        remove_extra_whitespace,
        word_segment,
        remove_stopwords,
    )
    for step in pipeline:
        text = step(text)
    return text
def read_input(input):  # NOTE: parameter name shadows the builtin `input`
    """Final entry point: clean a raw user sentence through the pipeline."""
    return data_preprocessing(input)
def create_bias_weights():
    """
    Create and persist a hand-tuned bias-weight matrix for the destinations.

    Each inner list of ``weights_tags_vector`` holds, in tag order, the
    manual weight for every tag of the corresponding destination.  Rows pair
    positionally with ``destinations["tags"][1:]`` (the first destination is
    skipped, matching the ``tags_vector = tags_vector[1:]`` slice above).
    The weights are scattered into a dense matrix shaped like
    ``tags_vector`` and saved to ``Datasets/Weights/weights_bias.npy``.

    Side effect: writes to disk; nothing is returned.  Relies on the module
    globals ``tags_vector``, ``feature_names`` and ``destinations``.
    """
    # One weight list per destination; each entry pairs positionally with the
    # tokens of that destination's space-separated "tags" cell.  NOTE(review):
    # if a list is shorter/longer than the tag list, zip() silently truncates
    # - confirm the counts stay in sync with the Excel sheet.
    weights_tags_vector = [
        [15, 15, 0.9, 15, 15, 10, 1, 5, 0.6, 0.9, 0.9, 0.8, 10, 10, 1, 15],
        [15, 15, 0.9, 15, 15, 10, 15, 1, 10, 0.6, 0.9, 0.9, 0.8, 10, 10, 15, 0.8, 15],
        [15, 0.9, 0.8, 15, 15, 1, 10, 10, 0.6, 0.9, 0.9, 0.8, 5, 5, 1, 15],
        [15, 15, 0.9, 15, 0.7, 15, 15, 15, 1, 10, 10, 1, 0.9, 0.9, 0.9, 5, 5, 15, 0.8, 15],
        [10, 10, 15, 15, 0.8, 0.9, 15, 15, 15, 1, 10, 10, 0.6, 0.5, 0.9, 0.9, 0.8, 0.7, 15, 15, 15, 15, 15],
        [0.8, 0.9, 15, 0.8, 15, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.8, 15, 10, 1, 15],
        [0.9, 0.8, 5, 1, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.9, 0.8, 15, 1, 1, 15],
        [0.8, 0.9, 5, 1, 15, 15, 0.9, 0.9, 0.9, 0.8, 15, 1, 15],
        [0.8, 0.7, 15, 15, 1, 10, 0.7, 0.7, 0.6, 5, 5, 15],
        [0.8, 5, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.9, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 1, 10, 15],
        [10, 0.9, 0.8, 1, 15, 15, 15, 0.8, 10, 15],
        [0.8, 15, 1, 15, 15, 0.8, 10, 15],
        [10, 0.8, 1, 15, 1, 0.9, 0.8, 5, 0.8],
        [0.8, 15, 1, 5, 0.9, 0.8, 0.7, 0.7],
        [0.9, 0.8, 15, 1, 15, 0.7, 0.8, 0.7, 0.7, 5, 5, 15],
        [0.8, 0.7, 1, 5, 0.9, 10, 10, 15],
        [0.8, 1, 15, 15, 1, 0.9, 0.8, 0.8, 15],
        [0.8, 1, 10, 5, 5, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [10, 10, 10, 1, 10, 0.8, 1, 5, 10, 10, 10, 10, 1, 0.9, 1, 1, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [0.8, 0.7, 1, 10, 10, 0.8, 0.9, 15],
        [10, 0.8, 0.7, 15, 15, 1, 15, 15, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [5, 0.8, 0.7, 5, 5, 1, 10, 10, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [0.8, 0.7, 15, 5, 1, 10, 10, 10, 0.8, 0.7, 0.7, 5, 5, 5, 10, 15],
        [5, 5, 10, 15, 15, 15, 15, 0.9, 0.8, 0.7, 0.7, 1, 15],
        [10, 10, 15, 15, 10, 5, 1, 15, 15, 15, 15, 0.7, 5, 5, 0.8, 1, 15],
        [10, 15, 15, 15, 10, 10, 1, 1, 1, 15, 15, 5, 5],
        [0.8, 0.7, 0.6, 0.8, 1, 1, 1, 0.9, 0.8, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.8, 0.8, 0.7, 0.9, 5, 5, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.7, 0.6, 0.8, 0.8, 0.8, 0.7, 5, 5, 1, 0.7, 0.6, 15],
        [0.9, 0.7, 1, 1, 0.8, 0.7, 0.8, 0.8, 0.7, 1, 1, 1, 1, 15],
    ]
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)
    # Map weights to the appropriate positions in the weights_vector
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()
        for tag, weight in zip(tags, weights_tags_vector[i]):
            # NOTE(review): raises IndexError if a tag is missing from the
            # vectorizer vocabulary (pruned by max_features / stop_words).
            index = np.where(feature_names == tag.lower())[0][0]
            weights_vector[i][index] = weight
    np.save("Datasets/Weights/weights_bias.npy", weights_vector)
def create_freq_weights():
    """
    Create and persist a frequency-based weight matrix for the destinations.

    For every tag of every destination (rows pair with
    ``destinations["tags"][1:]``, matching the ``tags_vector`` slice above),
    the weight is the tag's annotation frequency divided by the maximum
    frequency over all tags, rounded to two decimals.  The matrix has the
    same shape as ``tags_vector`` and is saved to
    ``Datasets/Weights/weights_freq.npy``.

    Side effect: writes to disk; nothing is returned.  Relies on the module
    globals ``tags_vector``, ``sorted_tags_dict``, ``feature_names`` and
    ``destinations``.
    """
    # Create a weights vector initialized to zero
    weights_vector = np.zeros(tags_vector.shape)
    max_freq = max(sorted_tags_dict.values())
    # Map weights to the appropriate positions in the weights_vector
    for i, row in enumerate(destinations["tags"][1:].values):
        for tag in row.split():
            # NOTE(review): raises IndexError if a tag was pruned from the
            # vectorizer vocabulary (max_features / stop_words).
            index = np.where(feature_names == tag.lower())[0][0]
            # Vectorizer tokens use "_" where the annotation classes use
            # spaces, so map back before the frequency lookup.  Assign the
            # ratio as a float rounded to two decimals; the original routed
            # it through an f-string and relied on numpy coercing the string
            # back to float on assignment.
            weights_vector[i][index] = round(
                sorted_tags_dict[tag.replace("_", " ")] / max_freq, 2
            )
    np.save("Datasets/Weights/weights_freq.npy", weights_vector)
# Build and persist both weight matrices at import time (side effect:
# writes .npy files to Datasets/Weights/).
create_bias_weights()
create_freq_weights()
# Reload the freshly written matrices from disk.
weights_bias_vector = np.load("Datasets/Weights/weights_bias.npy")
weights_freq = np.load("Datasets/Weights/weights_freq.npy")
# Only the bias weights are exposed as the final weighted vector here;
# weights_freq is loaded but not combined in this module.
weighted_tags_vector = weights_bias_vector