File size: 6,726 Bytes
8302e64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
from datasets import load_dataset, Dataset, DatasetDict
import random
random.seed(42)
import py_vncorenlp
py_vncorenlp.download_model(save_dir='./')
from args import args
import re
import json 
import os
from collections import defaultdict

def normalize_example(example):
    """Normalize the text fields of one example in place.

    Collapses runs of whitespace to a single space, strips leading/trailing
    spaces, and deletes every '.' and ',' from both 'sentence' and
    'sentence_annotation'. Returns the (mutated) example.
    """
    for field in ('sentence', 'sentence_annotation'):
        text = re.sub(r'\s+', ' ', example[field]).strip()
        example[field] = text.replace('.', '').replace(',', '')
    return example

def word_segmentation(example):
    """Word-segment the sentence and every entity filler in place.

    Relies on the module-global ``rdrsegmenter`` (VnCoreNLP) created in
    ``__main__``. The sentence keeps only the first segmented sentence;
    fillers join all segmented pieces with spaces. Returns the mutated example.
    """
    example['sentence'] = rdrsegmenter.word_segment(example['sentence'])[0]
    for ent in example['entities']:
        ent['filler'] = ' '.join(rdrsegmenter.word_segment(ent['filler']))
    return example

# Vietnamese intent phrase -> English slug label. Hoisted to module level so
# the dict is built once instead of on every call (this runs once per sample
# in dataset.map).
_INTENT_SLUGS = {
    'bật thiết bị': 'turn_on_device',
    'giảm mức độ của thiết bị': 'decrease_device_level',
    'giảm nhiệt độ của thiết bị': 'decrease_device_temperature',
    'giảm âm lượng của thiết bị': 'decrease_device_volume',
    'giảm độ sáng của thiết bị': 'decrease_device_brightness',
    'hủy hoạt cảnh': 'cancel_scene',
    'kiểm tra tình trạng thiết bị': 'check_condition_device',
    'kích hoạt cảnh': 'activate_scene',
    'mở thiết bị': 'open_device',
    'tăng mức độ của thiết bị': 'increase_device_level',
    'tăng nhiệt độ của thiết bị': 'increase_device_temperature',
    'tăng âm lượng của thiết bị': 'increase_device_volume',
    'tăng độ sáng của thiết bị': 'increase_device_brightness',
    'tắt thiết bị': 'turn_off_device',
    'đóng thiết bị': 'close_device',
}


def vietnamese_intent_to_slug(intent: str) -> str:
    """Map a Vietnamese intent phrase to its English slug label.

    Returns 'unknown_intent' for any phrase not in the mapping.
    """
    return _INTENT_SLUGS.get(intent, 'unknown_intent')

def intent_mapping(example):
    """Replace the example's Vietnamese intent phrase with its English slug."""
    slug = vietnamese_intent_to_slug(example['intent'])
    example['intent'] = slug
    return example

def generate_bio_labels(example):
    """Build BIO slot tags aligned with the whitespace tokens of the sentence.

    For each entity, the FIRST contiguous token-span match of its filler in
    the sentence is tagged 'B-<type>' / 'I-<type>' (spaces in the type are
    replaced by underscores); all other tokens stay 'O'. Entities whose
    filler is empty or never found are skipped silently.

    Returns:
        dict with a single key 'labels': the space-joined tag sequence.
    """
    tokens = example['sentence'].split()
    labels = ['O'] * len(tokens)

    for entity in example['entities']:
        entity_tokens = entity['filler'].split()
        if not entity_tokens:
            # Guard: an empty filler would "match" the empty slice at index 0
            # and wrongly tag the first token.
            continue

        # Hoist the tag computation out of the labelling loop.
        tag = entity['type'].replace(" ", "_")
        span = len(entity_tokens)

        # Locate the first occurrence of the entity's token span.
        start_idx = next(
            (i for i in range(len(tokens) - span + 1)
             if tokens[i:i + span] == entity_tokens),
            None,
        )

        if start_idx is not None:
            labels[start_idx] = 'B-' + tag
            for j in range(1, span):
                labels[start_idx + j] = 'I-' + tag

    return {'labels': ' '.join(labels)}


def write_column_to_txt(dataset, column_name, output_file):
    """
    Write the contents of a specific column from a Huggingface dataset to a txt file,
    one value per line (values are stringified with str()).

    Args:
    - dataset (Dataset): The loaded Huggingface dataset (any mapping from
      column name to an iterable of values also works).
    - column_name (str): The name of the column whose contents are written out.
    - output_file (str): The path of the output txt file.
    """
    # writelines batches the writes instead of one write() call per row.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(str(sample) + '\n' for sample in dataset[column_name])

if __name__ == "__main__":
    # Load the raw JSON file; load_dataset puts everything into a 'train' split.
    data = load_dataset("json", data_files=args.data_raw_path)['train']

    # Stratified train/val split: group by intent so each intent contributes
    # ~val_ratio of its samples to the validation set.
    grouped_by_intent = defaultdict(list)
    for sample in data:
        grouped_by_intent[sample['intent']].append(sample)

    train_samples = []
    val_samples = []

    val_ratio = 0.08  # ~8% of each intent's samples go to validation
    for intent, samples in grouped_by_intent.items():
        # random is seeded with 42 at module import, so the split is reproducible.
        random.shuffle(samples)

        num_val_samples = int(len(samples) * val_ratio)

        val_samples.extend(samples[:num_val_samples])
        train_samples.extend(samples[num_val_samples:])

    # Rebuild columnar Datasets from the lists of dicts; column names are taken
    # from the first sample (assumes all samples share the same keys).
    train_dataset = Dataset.from_dict({k: [dic[k] for dic in train_samples] for k in train_samples[0]})
    val_dataset = Dataset.from_dict({k: [dic[k] for dic in val_samples] for k in val_samples[0]})

    # Create a dataset dict (the validation split is stored under the key 'test').
    data = DatasetDict({
        'train': train_dataset,
        'test': val_dataset
    })

    # Normalize text (collapse whitespace, drop '.' and ',').
    data_normalized = data.map(normalize_example)
    # Word segmentation — the segmenter must be a module-level global because
    # word_segmentation() references it by name.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = data_normalized.map(word_segmentation)
    # Map Vietnamese intent phrases to English slug labels.
    data_mapped = data_segmented.map(intent_mapping)
    # Generate the BIO slot-label column from the entity spans.
    data_labeled = data_mapped.map(generate_bio_labels)
    
    final_data = data_labeled.remove_columns(["sentence_annotation"])
    # Split back out for file writing.
    final_data_train = final_data['train']
    final_data_dev = final_data['test']
    
    # Prepare the output directory layout:
    #   <data_processed_path>/word-level/{train,dev}/
    # NOTE(review): the chdir calls only run when data_processed_path does NOT
    # already exist — on a second run the cwd is never changed, so the relative
    # paths below write into the original working directory instead. Verify /
    # consider always chdir-ing (or building absolute paths) here.
    if not os.path.exists(args.data_processed_path):
        os.mkdir(args.data_processed_path)
        os.chdir(args.data_processed_path)
        os.mkdir("word-level")
        os.chdir("word-level")
        os.mkdir("train")
        os.mkdir("dev")
        
    # Three parallel files per split: input tokens, BIO tags, intent label —
    # one line per example (the JointBERT-style layout).
    write_column_to_txt(final_data_train, column_name="sentence", output_file= "./train/seq.in")
    write_column_to_txt(final_data_train, column_name="labels", output_file="./train/seq.out")
    write_column_to_txt(final_data_train, column_name="intent", output_file="./train/label")

    write_column_to_txt(final_data_dev, column_name="sentence", output_file="./dev/seq.in")
    write_column_to_txt(final_data_dev, column_name="labels", output_file="./dev/seq.out")
    write_column_to_txt(final_data_dev, column_name="intent", output_file="./dev/label")

    # intent_label.txt: the intent vocabulary (train split only) plus 'UNK'.
    # [:] materializes the whole split as a dict of columns.
    intent_list = list(set(final_data['train'][:]['intent']))
    intent_list.append('UNK')

    # Write to a .txt file, sorted for a deterministic order.
    with open("intent_label.txt", "w") as file:
        for intent in sorted(intent_list):
            file.write(str(intent) + "\n")

    # slot_label.txt: every BIO tag seen in the train split, plus 'UNK'/'PAD'.
    def get_unique_labels(data):
        # Collect the set of distinct whitespace-separated tags across entries.
        unique_labels = set()
        for entry in data:
            labels = entry['labels'].split()
            unique_labels.update(labels)
        return list(unique_labels)

    slot_list = get_unique_labels(final_data['train'])
    slot_list.append('UNK')
    slot_list.append('PAD')

    # Write to a .txt file.
    # NOTE(review): unlike intent_label.txt this is NOT sorted, and set order
    # varies across runs (string hash randomization) — the file (and thus the
    # label->id mapping) is nondeterministic. Confirm whether downstream
    # training relies on a stable order.
    with open("slot_label.txt", "w") as file:
        for slot in slot_list:
            file.write(str(slot) + "\n")