# NOTE(review): the lines that were here ("Spaces:", "Build error", the file
# size, a commit hash, and a line-number gutter) were artifacts of the web
# code viewer this file was copied from, not Python source. They are replaced
# by this comment so the module can parse.
from src.apis.config.constances import DEFAULT_TEXT_ANNOTATION_FILE, DEFAULT_DESTINATIONS
import json
import underthesea
import string
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from src.utils.dictionary import (
number_dict,
translate_dict,
mispelling_dict,
wordform2vnese_dict,
emotion2wordform_dict,
)
# --- Module-level setup: runs at import time and requires the annotation JSON
# and the destinations spreadsheet to exist on disk. ---
with open(DEFAULT_TEXT_ANNOTATION_FILE, "r", encoding="utf-8") as file:
    data = json.load(file)

# Prepare sentences and labels.
# Each annotation item appears to be (sentence, {"entities": [...]}) — spaCy-style;
# TODO confirm against the annotation tool that produced the file.
sentences = [item[0] for item in data["annotations"]]
labels = [item[1]["entities"] for item in data["annotations"]]

# Define tags (entity class names declared in the annotation file).
tags = data["classes"]
# tags = ['<pad>'] + tags

# Convert tags to indices.
# NOTE(review): despite its name, tag2idx maps each tag to its OCCURRENCE
# COUNT, not an index — it is used below purely as a frequency table.
tag2idx = {tag: 0 for idx, tag in enumerate(tags)}
for label in labels:
    for entity in label:
        # entity[1] is assumed to be the tag name of the span — TODO confirm.
        tag2idx[entity[1]] = tag2idx[entity[1]] + 1

# Sort the dictionary by values (tag frequency, descending).
sorted_tags_dict = dict(sorted(tag2idx.items(), key=lambda item: item[1], reverse=True))
# Drop tags that never occur, then prepend a '<pad>' entry with count 0.
sorted_tags = {key: value for key, value in sorted_tags_dict.items() if value != 0}
new_tag = {'<pad>': 0}
sorted_tags = {**new_tag, **sorted_tags}

# Vectorize each destination's tag string into a bag-of-words row.
destinations = pd.read_excel(DEFAULT_DESTINATIONS)
vectorizer = CountVectorizer(max_features=10000, stop_words="english")
tags_vector = vectorizer.fit_transform(
    destinations["tags"].values.astype("U")
).toarray()
# Drop the first row — presumably a header/placeholder destination; the same
# [1:] offset is applied when iterating destinations below. TODO confirm.
tags_vector = tags_vector[1:]
feature_names = vectorizer.get_feature_names_out()
# 10 Remove stopwords
def remove_stopwords(input_text, stopwords_file="Datasets/Query/stopword.txt"):
    """Drop every word of *input_text* listed in the stop-word file.

    The stop-word file is read on every call (one stop word per line).
    Matching is case-insensitive; surviving words keep their original casing
    and are re-joined with single spaces.
    """
    with open(stopwords_file, "r", encoding="utf-8") as handle:
        blocked = {line.strip() for line in handle}
    kept = [token for token in input_text.split() if token.lower() not in blocked]
    return " ".join(kept)
# 9 word segmentation
def word_segment(text):
    """Segment Vietnamese *text* into words via underthesea.

    With format="text" the tokenizer returns a single string; presumably
    multi-syllable compound words come back joined by underscores — verify
    against the underthesea documentation.
    """
    segmented = underthesea.word_tokenize(text, format="text")
    return segmented
# 8 Remove numbers
def remove_numbers(input_string):
    """Return *input_string* with every digit character removed.

    Uses str.isdigit(), so Unicode digits are removed as well, not just
    ASCII 0-9; letters, punctuation and whitespace are preserved.
    (Fix: the previous inline comment wrongly claimed isalpha() was used.)
    """
    return "".join(char for char in input_string if not char.isdigit())
# 7 Collapse whitespace
def remove_extra_whitespace(input_string):
    """Collapse every run of whitespace in *input_string* to a single space.

    Leading/trailing whitespace is stripped as a side effect of str.split().
    """
    return " ".join(input_string.split())
# 6 Transform numerals into their text form (e.g. "8" -> "tám")
def number2text(sentence):
    """Replace each whitespace-delimited token found in number_dict with its
    textual form; tokens absent from the dictionary pass through unchanged."""
    out = []
    for token in sentence.split():
        out.append(number_dict.get(token, token))
    return " ".join(out)
# 5 Normalize misspellings/acronyms (includes translating English words)
def translate2word(sentence, dictionary=translate_dict):
    """Substring-replace every variant listed in *dictionary* with its
    canonical key.

    The sentence is padded with one leading and one trailing space so variants
    written with surrounding spaces can match at the edges; note that the
    padding is NOT stripped from the returned value.
    """
    padded = " " + sentence.strip() + " "
    for canonical, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, canonical)
    return padded
def mispell2word(sentence, dictionary=mispelling_dict):
    """Replace every misspelled variant listed in *dictionary* with its
    canonical key via substring replacement.

    Pads the sentence with a space on each side so space-delimited variants
    match at the boundaries; the padding is left on the returned string.
    """
    padded = " " + sentence.strip() + " "
    for canonical, variants in dictionary.items():
        for variant in variants:
            padded = padded.replace(variant, canonical)
    return padded
# 4 Transform word forms into Vietnamese (colonsmile - cười)
def word_form2Vnese(sentence):
    """Map each token through wordform2vnese_dict (emoticon word-forms back
    to Vietnamese words); unknown tokens are kept as-is."""
    mapped = (wordform2vnese_dict.get(tok, tok) for tok in sentence.split())
    return " ".join(mapped)
# 3 Remove punctuation
def remove_punctuation(input_string):
    """Strip every ASCII punctuation character (string.punctuation) from
    *input_string* in a single translate() pass."""
    return input_string.translate(str.maketrans("", "", string.punctuation))
# 2 emoticon to word form ( :) - colonsmile )
def emoticon2word(sentence):
    """Replace each emoticon token with its word-form name per
    emotion2wordform_dict; non-emoticon tokens pass through unchanged."""
    return " ".join(
        emotion2wordform_dict.get(tok, tok) for tok in sentence.split()
    )
# 1 lower case
def lower_case(text):
    """Return *text* converted to lowercase (str.lower, not casefold)."""
    return str.lower(text)
def data_preprocessing(text):
    """Run the full cleaning pipeline over *text*.

    Order: lowercase -> strip punctuation -> fix misspellings -> numerals to
    text -> collapse whitespace -> Vietnamese word segmentation -> drop
    stopwords.  (emoticon2word / word_form2Vnese / translate2word /
    remove_numbers are defined above but are not part of this chain.)
    """
    steps = (
        lower_case,
        remove_punctuation,
        mispell2word,
        number2text,
        remove_extra_whitespace,
        word_segment,
        remove_stopwords,
    )
    for step in steps:
        text = step(text)
    return text
def read_input(input):  # final entry point for reading and preprocessing an input sentence
    # NOTE(review): the parameter name shadows the builtin `input`; kept
    # unchanged for caller compatibility.
    return data_preprocessing(input)
def create_bias_weights():
    """
    Create a weights vector for bias based on the given tags and weights.

    The function initializes a weights vector to zero, then maps the weights
    from the weights_tags_vector to the appropriate positions in the
    weights_vector based on the tags present in the destinations.  The matrix
    is saved to Datasets/Weights/weights_bias.npy; nothing is returned.

    Relies on module-level globals: destinations, tags_vector, feature_names.
    NOTE(review): each inner weight list is assumed to be aligned, in order,
    with the tags of the corresponding row of destinations["tags"][1:] —
    TODO confirm.  If a list is shorter than the row's tag count, zip
    truncates silently; if a tag is missing from the vectorizer vocabulary,
    np.where(...)[0][0] raises IndexError.
    """
    # Hand-tuned weights: one list per destination (rows aligned with
    # destinations["tags"][1:]).  Values preserved verbatim.
    weights_tags_vector = [
        [15, 15, 0.9, 15, 15, 10, 1, 5, 0.6, 0.9, 0.9, 0.8, 10, 10, 1, 15],
        [15, 15, 0.9, 15, 15, 10, 15, 1, 10, 0.6, 0.9, 0.9, 0.8, 10, 10, 15, 0.8, 15],
        [15, 0.9, 0.8, 15, 15, 1, 10, 10, 0.6, 0.9, 0.9, 0.8, 5, 5, 1, 15],
        [15, 15, 0.9, 15, 0.7, 15, 15, 15, 1, 10, 10, 1, 0.9, 0.9, 0.9, 5, 5, 15, 0.8, 15],
        [10, 10, 15, 15, 0.8, 0.9, 15, 15, 15, 1, 10, 10, 0.6, 0.5, 0.9, 0.9, 0.8, 0.7, 15, 15, 15, 15, 15],
        [0.8, 0.9, 15, 0.8, 15, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.8, 15, 10, 1, 15],
        [0.9, 0.8, 5, 1, 0.9, 10, 15, 0.9, 0.9, 0.9, 0.9, 0.8, 15, 1, 1, 15],
        [0.8, 0.9, 5, 1, 15, 15, 0.9, 0.9, 0.9, 0.8, 15, 1, 15],
        [0.8, 0.7, 15, 15, 1, 10, 0.7, 0.7, 0.6, 5, 5, 15],
        [0.8, 5, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.9, 15],
        [0.8, 0.7, 1, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 0.7, 0.7, 15],
        [0.8, 0.7, 1, 15, 15, 15, 1, 10, 15],
        [10, 0.9, 0.8, 1, 15, 15, 15, 0.8, 10, 15],
        [0.8, 15, 1, 15, 15, 0.8, 10, 15],
        [10, 0.8, 1, 15, 1, 0.9, 0.8, 5, 0.8],
        [0.8, 15, 1, 5, 0.9, 0.8, 0.7, 0.7],
        [0.9, 0.8, 15, 1, 15, 0.7, 0.8, 0.7, 0.7, 5, 5, 15],
        [0.8, 0.7, 1, 5, 0.9, 10, 10, 15],
        [0.8, 1, 15, 15, 1, 0.9, 0.8, 0.8, 15],
        [0.8, 1, 10, 5, 5, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [10, 10, 10, 1, 10, 0.8, 1, 5, 10, 10, 10, 10, 1, 0.9, 1, 1, 15],
        [0.8, 0.7, 1, 15, 15, 0.8, 0.9, 15],
        [0.8, 0.7, 1, 10, 10, 0.8, 0.9, 15],
        [10, 0.8, 0.7, 15, 15, 1, 15, 15, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [5, 0.8, 0.7, 5, 5, 1, 10, 10, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [0.8, 0.7, 15, 5, 1, 10, 10, 10, 0.8, 0.7, 0.7, 5, 5, 5, 10, 15],
        [5, 5, 10, 15, 15, 15, 15, 0.9, 0.8, 0.7, 0.7, 1, 15],
        [10, 10, 15, 15, 10, 5, 1, 15, 15, 15, 15, 0.7, 5, 5, 0.8, 1, 15],
        [10, 15, 15, 15, 10, 10, 1, 1, 1, 15, 15, 5, 5],
        [0.8, 0.7, 0.6, 0.8, 1, 1, 1, 0.9, 0.8, 0.7, 0.7, 0.6, 5, 5, 1, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.8, 0.8, 0.7, 0.9, 5, 5, 15],
        [1, 0.8, 0.9, 0.7, 0.6, 1, 0.9, 0.8, 1, 1, 0.9, 0.7, 0.6, 0.8, 0.8, 0.8, 0.7, 5, 5, 1, 0.7, 0.6, 15],
        [0.9, 0.7, 1, 1, 0.8, 0.7, 0.8, 0.8, 0.7, 1, 1, 1, 1, 15],
    ]
    # Create a weights vector initialized to zero (same shape as the
    # bag-of-words tags_vector built at module level).
    weights_vector = np.zeros(tags_vector.shape)
    # Map weights to the appropriate positions in the weights_vector.
    for i, row in enumerate(destinations["tags"][1:].values):
        # NOTE: this local `tags` shadows the module-level `tags` list.
        tags = row.split()
        for tag, weight in zip(tags, weights_tags_vector[i]):
            # Locate the tag's column in the vectorizer vocabulary.
            index = np.where(feature_names == tag.lower())[0][0]
            weights_vector[i][index] = weight
    np.save("Datasets/Weights/weights_bias.npy", weights_vector)
def create_freq_weights():
    """
    Create a weights vector of frequency-based weights for the destination tags.

    For every tag of every destination, the weight is the tag's annotation
    frequency divided by the maximum frequency over all tags, rounded to two
    decimals.  The matrix (same shape as tags_vector) is saved to
    Datasets/Weights/weights_freq.npy; nothing is returned.

    Relies on module-level globals: tags_vector, sorted_tags_dict,
    feature_names, destinations.
    NOTE(review): the frequency lookup uses tag.replace('_', ' ') against
    sorted_tags_dict but tag.lower() against feature_names — a tag missing
    from either raises KeyError / IndexError.  TODO confirm both spellings
    always exist.
    """
    # Create a weights vector initialized to zero.
    weights_vector = np.zeros(tags_vector.shape)
    max_freq = max(sorted_tags_dict.values())
    # Map weights to the appropriate positions in the weights_vector.
    for i, row in enumerate(destinations["tags"][1:].values):
        tags = row.split()  # NOTE: shadows the module-level `tags` list
        for tag in tags:
            index = np.where(feature_names == tag.lower())[0][0]
            # Fix: round numerically instead of formatting the ratio into an
            # f-string and relying on numpy to coerce the string back to float.
            weights_vector[i][index] = round(
                sorted_tags_dict[tag.replace("_", " ")] / max_freq, 2
            )
    np.save("Datasets/Weights/weights_freq.npy", weights_vector)
# Build and persist both weight matrices at import time, then reload them
# from disk for use by the rest of the application.
create_bias_weights()
create_freq_weights()
weights_bias_vector = np.load("Datasets/Weights/weights_bias.npy")
weights_freq = np.load("Datasets/Weights/weights_freq.npy")
weighted_tags_vector = weights_bias_vector |