File size: 6,726 Bytes
8302e64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from datasets import load_dataset, Dataset, DatasetDict
import random
random.seed(42)
import py_vncorenlp
py_vncorenlp.download_model(save_dir='./')
from args import args
import re
import json 
import os
from collections import defaultdict

def normalize_example(example):
    """Normalize the text fields of one example in place.

    Collapses runs of whitespace to a single space, strips leading/trailing
    spaces, and deletes every '.' and ',' from both 'sentence' and
    'sentence_annotation'. Returns the (mutated) example.
    """
    for field in ('sentence', 'sentence_annotation'):
        text = re.sub(r'\s+', ' ', example[field]).strip()
        example[field] = text.replace('.', '').replace(',', '')
    return example

def word_segmentation(example):
    """Word-segment the sentence and every entity filler in place.

    Relies on the module-global ``rdrsegmenter`` (VnCoreNLP) created in
    ``__main__``. The sentence keeps only the first segmented sentence;
    fillers join all segmented pieces with spaces. Returns the mutated example.
    """
    example['sentence'] = rdrsegmenter.word_segment(example['sentence'])[0]
    for ent in example['entities']:
        ent['filler'] = ' '.join(rdrsegmenter.word_segment(ent['filler']))
    return example

# Vietnamese intent phrase -> English slug label. Hoisted to module level so
# the dict is built once instead of on every call (this runs once per sample
# in dataset.map).
_INTENT_SLUGS = {
    'bật thiết bị': 'turn_on_device',
    'giảm mức độ của thiết bị': 'decrease_device_level',
    'giảm nhiệt độ của thiết bị': 'decrease_device_temperature',
    'giảm âm lượng của thiết bị': 'decrease_device_volume',
    'giảm độ sáng của thiết bị': 'decrease_device_brightness',
    'hủy hoạt cảnh': 'cancel_scene',
    'kiểm tra tình trạng thiết bị': 'check_condition_device',
    'kích hoạt cảnh': 'activate_scene',
    'mở thiết bị': 'open_device',
    'tăng mức độ của thiết bị': 'increase_device_level',
    'tăng nhiệt độ của thiết bị': 'increase_device_temperature',
    'tăng âm lượng của thiết bị': 'increase_device_volume',
    'tăng độ sáng của thiết bị': 'increase_device_brightness',
    'tắt thiết bị': 'turn_off_device',
    'đóng thiết bị': 'close_device',
}


def vietnamese_intent_to_slug(intent: str) -> str:
    """Map a Vietnamese intent phrase to its English slug label.

    Returns 'unknown_intent' for any phrase not in the mapping.
    """
    return _INTENT_SLUGS.get(intent, 'unknown_intent')

def intent_mapping(example):
    """Replace the example's Vietnamese intent phrase with its English slug."""
    slug = vietnamese_intent_to_slug(example['intent'])
    example['intent'] = slug
    return example

def generate_bio_labels(example):
    """Build BIO slot tags aligned with the whitespace tokens of the sentence.

    For each entity, the FIRST contiguous token-span match of its filler in
    the sentence is tagged 'B-<type>' / 'I-<type>' (spaces in the type are
    replaced by underscores); all other tokens stay 'O'. Entities whose
    filler is empty or never found are skipped silently.

    Returns:
        dict with a single key 'labels': the space-joined tag sequence.
    """
    tokens = example['sentence'].split()
    labels = ['O'] * len(tokens)

    for entity in example['entities']:
        entity_tokens = entity['filler'].split()
        if not entity_tokens:
            # Guard: an empty filler would "match" the empty slice at index 0
            # and wrongly tag the first token.
            continue

        # Hoist the tag computation out of the labelling loop.
        tag = entity['type'].replace(" ", "_")
        span = len(entity_tokens)

        # Locate the first occurrence of the entity's token span.
        start_idx = next(
            (i for i in range(len(tokens) - span + 1)
             if tokens[i:i + span] == entity_tokens),
            None,
        )

        if start_idx is not None:
            labels[start_idx] = 'B-' + tag
            for j in range(1, span):
                labels[start_idx + j] = 'I-' + tag

    return {'labels': ' '.join(labels)}


def write_column_to_txt(dataset, column_name, output_file):
    """
    Write the contents of a specific column from a Huggingface dataset to a txt file,
    one value per line (values are stringified with str()).

    Args:
    - dataset (Dataset): The loaded Huggingface dataset (any mapping from
      column name to an iterable of values also works).
    - column_name (str): The name of the column whose contents are written out.
    - output_file (str): The path of the output txt file.
    """
    # writelines batches the writes instead of one write() call per row.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(str(sample) + '\n' for sample in dataset[column_name])

if __name__ == "__main__":
    # Load the raw JSON file; load_dataset puts everything into a 'train' split.
    data = load_dataset("json", data_files=args.data_raw_path)['train']

    # Stratified train/val split: group by intent so each intent contributes
    # ~val_ratio of its samples to the validation set.
    grouped_by_intent = defaultdict(list)
    for sample in data:
        grouped_by_intent[sample['intent']].append(sample)

    train_samples = []
    val_samples = []

    val_ratio = 0.08  # ~8% of each intent's samples go to validation
    for intent, samples in grouped_by_intent.items():
        # random is seeded with 42 at module import, so the split is reproducible.
        random.shuffle(samples)

        num_val_samples = int(len(samples) * val_ratio)

        val_samples.extend(samples[:num_val_samples])
        train_samples.extend(samples[num_val_samples:])

    # Rebuild columnar Datasets from the lists of dicts; column names are taken
    # from the first sample (assumes all samples share the same keys).
    train_dataset = Dataset.from_dict({k: [dic[k] for dic in train_samples] for k in train_samples[0]})
    val_dataset = Dataset.from_dict({k: [dic[k] for dic in val_samples] for k in val_samples[0]})

    # Create a dataset dict (the validation split is stored under the key 'test').
    data = DatasetDict({
        'train': train_dataset,
        'test': val_dataset
    })

    # Normalize text (collapse whitespace, drop '.' and ',').
    data_normalized = data.map(normalize_example)
    # Word segmentation — the segmenter must be a module-level global because
    # word_segmentation() references it by name.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = data_normalized.map(word_segmentation)
    # Map Vietnamese intent phrases to English slug labels.
    data_mapped = data_segmented.map(intent_mapping)
    # Generate the BIO slot-label column from the entity spans.
    data_labeled = data_mapped.map(generate_bio_labels)
    
    final_data = data_labeled.remove_columns(["sentence_annotation"])
    # Split back out for file writing.
    final_data_train = final_data['train']
    final_data_dev = final_data['test']
    
    # Prepare the output directory layout:
    #   <data_processed_path>/word-level/{train,dev}/
    # NOTE(review): the chdir calls only run when data_processed_path does NOT
    # already exist — on a second run the cwd is never changed, so the relative
    # paths below write into the original working directory instead. Verify /
    # consider always chdir-ing (or building absolute paths) here.
    if not os.path.exists(args.data_processed_path):
        os.mkdir(args.data_processed_path)
        os.chdir(args.data_processed_path)
        os.mkdir("word-level")
        os.chdir("word-level")
        os.mkdir("train")
        os.mkdir("dev")
        
    # Three parallel files per split: input tokens, BIO tags, intent label —
    # one line per example (the JointBERT-style layout).
    write_column_to_txt(final_data_train, column_name="sentence", output_file= "./train/seq.in")
    write_column_to_txt(final_data_train, column_name="labels", output_file="./train/seq.out")
    write_column_to_txt(final_data_train, column_name="intent", output_file="./train/label")

    write_column_to_txt(final_data_dev, column_name="sentence", output_file="./dev/seq.in")
    write_column_to_txt(final_data_dev, column_name="labels", output_file="./dev/seq.out")
    write_column_to_txt(final_data_dev, column_name="intent", output_file="./dev/label")

    # intent_label.txt: the intent vocabulary (train split only) plus 'UNK'.
    # [:] materializes the whole split as a dict of columns.
    intent_list = list(set(final_data['train'][:]['intent']))
    intent_list.append('UNK')

    # Write to a .txt file, sorted for a deterministic order.
    with open("intent_label.txt", "w") as file:
        for intent in sorted(intent_list):
            file.write(str(intent) + "\n")

    # slot_label.txt: every BIO tag seen in the train split, plus 'UNK'/'PAD'.
    def get_unique_labels(data):
        # Collect the set of distinct whitespace-separated tags across entries.
        unique_labels = set()
        for entry in data:
            labels = entry['labels'].split()
            unique_labels.update(labels)
        return list(unique_labels)

    slot_list = get_unique_labels(final_data['train'])
    slot_list.append('UNK')
    slot_list.append('PAD')

    # Write to a .txt file.
    # NOTE(review): unlike intent_label.txt this is NOT sorted, and set order
    # varies across runs (string hash randomization) — the file (and thus the
    # label->id mapping) is nondeterministic. Confirm whether downstream
    # training relies on a stable order.
    with open("slot_label.txt", "w") as file:
        for slot in slot_list:
            file.write(str(slot) + "\n")