File size: 6,726 Bytes
8302e64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 |
from datasets import load_dataset, Dataset, DatasetDict
import random
random.seed(42)  # fixed seed so the per-intent shuffle / split below is reproducible
import py_vncorenlp
# Side effect at import time: downloads the VnCoreNLP model into the working
# directory (network access on first run).
py_vncorenlp.download_model(save_dir='./')
from args import args  # project-local argument/config object
import re
import json
import os
from collections import defaultdict
def normalize_example(example):
    """Normalize the text fields of one dataset example in place.

    Collapses runs of whitespace to single spaces, trims the ends, and
    strips every '.' and ',' from both 'sentence' and 'sentence_annotation'.
    Returns the (mutated) example, as required by ``Dataset.map``.
    """
    for field in ('sentence', 'sentence_annotation'):
        text = re.sub(r'\s+', ' ', example[field]).strip()
        example[field] = text.replace('.', '').replace(',', '')
    return example
def word_segmentation(example):
    """Word-segment the sentence and every entity filler with VnCoreNLP.

    Reads the module-level global ``rdrsegmenter`` (created in ``__main__``).
    Only the first segmented sentence is kept — the input is assumed to be a
    single sentence. Mutates and returns the example for ``Dataset.map``.
    """
    segmented = rdrsegmenter.word_segment(example['sentence'])
    example['sentence'] = segmented[0]
    for ent in example['entities']:
        ent['filler'] = ' '.join(rdrsegmenter.word_segment(ent['filler']))
    return example
# Vietnamese intent phrase -> English slug. Hoisted to module level so the
# dict is built once, not on every call; the original local variable name
# (`intent_mapping`) also shadowed the sibling `intent_mapping` function.
_INTENT_TO_SLUG = {
    'bật thiết bị': 'turn_on_device',
    'giảm mức độ của thiết bị': 'decrease_device_level',
    'giảm nhiệt độ của thiết bị': 'decrease_device_temperature',
    'giảm âm lượng của thiết bị': 'decrease_device_volume',
    'giảm độ sáng của thiết bị': 'decrease_device_brightness',
    'hủy hoạt cảnh': 'cancel_scene',
    'kiểm tra tình trạng thiết bị': 'check_condition_device',
    'kích hoạt cảnh': 'activate_scene',
    'mở thiết bị': 'open_device',
    'tăng mức độ của thiết bị': 'increase_device_level',
    'tăng nhiệt độ của thiết bị': 'increase_device_temperature',
    'tăng âm lượng của thiết bị': 'increase_device_volume',
    'tăng độ sáng của thiết bị': 'increase_device_brightness',
    'tắt thiết bị': 'turn_off_device',
    'đóng thiết bị': 'close_device',
}


def vietnamese_intent_to_slug(intent: str) -> str:
    """Map a Vietnamese intent phrase to its English slug.

    Args:
        intent: Raw Vietnamese intent string as found in the dataset.

    Returns:
        The English slug, or ``'unknown_intent'`` for unrecognized phrases.
    """
    return _INTENT_TO_SLUG.get(intent, 'unknown_intent')
def intent_mapping(example):
    """Replace the example's Vietnamese intent with its English slug (in place).

    Thin ``Dataset.map`` adapter around :func:`vietnamese_intent_to_slug`.
    """
    slug = vietnamese_intent_to_slug(example['intent'])
    example['intent'] = slug
    return example
def generate_bio_labels(example):
    """Build a space-joined BIO tag sequence aligned to the sentence tokens.

    For each entity, the first window of sentence tokens that exactly equals
    the entity's (whitespace-split) filler is tagged ``B-<type>`` /
    ``I-<type>`` (spaces in the type become underscores). Tokens outside any
    matched entity stay ``'O'``; an entity whose filler never matches is
    silently skipped.

    Returns:
        ``{'labels': <space-joined tags>}`` for ``Dataset.map``.
    """
    tokens = example['sentence'].split()
    labels = ['O'] * len(tokens)
    for entity in example['entities']:
        span = entity['filler'].split()
        tag = entity['type'].replace(' ', '_')
        width = len(span)
        # First index where the token window equals the entity span, if any.
        start = next(
            (i for i in range(len(tokens) - width + 1)
             if tokens[i:i + width] == span),
            None,
        )
        if start is None:
            continue
        labels[start] = 'B-' + tag
        labels[start + 1:start + width] = ['I-' + tag] * (width - 1)
    return {'labels': ' '.join(labels)}
def write_column_to_txt(dataset, column_name, output_file):
    """
    Write the contents of a specific column from a Huggingface dataset to a
    txt file, one value per line (values are stringified with ``str``).

    Args:
    - dataset (Dataset): The loaded Huggingface dataset (any mapping whose
      ``dataset[column_name]`` yields the column values works).
    - column_name (str): The name of the column whose contents need to be
      written to the txt file.
    - output_file (str): The path and name of the output txt file.
      (The original docstring called this ``output_file_name``, which did not
      match the actual parameter.)
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        # Batch the writes instead of one file.write per row.
        file.writelines(str(sample) + '\n' for sample in dataset[column_name])
if __name__ == "__main__":
    # Load the raw JSON data; load_dataset places everything under 'train'.
    data = load_dataset("json", data_files=args.data_raw_path)['train']
    # Stratified train/validation split: group by intent so every intent
    # keeps roughly the same train/val proportion.
    grouped_by_intent = defaultdict(list)
    for sample in data:
        grouped_by_intent[sample['intent']].append(sample)
    train_samples = []
    val_samples = []
    val_ratio = 0.08  # ~8% of each intent's samples go to validation
    for intent, samples in grouped_by_intent.items():
        random.shuffle(samples)  # deterministic: random.seed(42) set at import
        num_val_samples = int(len(samples) * val_ratio)
        val_samples.extend(samples[:num_val_samples])
        train_samples.extend(samples[num_val_samples:])
    # Re-assemble the row dicts into columnar form for Dataset.from_dict.
    # NOTE(review): assumes both splits are non-empty and every row has the
    # same keys as the first one — confirm against the raw data.
    train_dataset = Dataset.from_dict({k: [dic[k] for dic in train_samples] for k in train_samples[0]})
    val_dataset = Dataset.from_dict({k: [dic[k] for dic in val_samples] for k in val_samples[0]})
    # Create a dataset dict; the validation split is stored under 'test'.
    data = DatasetDict({
        'train': train_dataset,
        'test': val_dataset
    })
    # Normalize text (collapse whitespace, drop '.' and ',').
    data_normalized = data.map(normalize_example)
    # Word segmentation with VnCoreNLP. `rdrsegmenter` is a module-level
    # global that word_segmentation() reads.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = data_normalized.map(word_segmentation)
    # Intent mapping: Vietnamese intent phrases -> English slugs.
    data_mapped = data_segmented.map(intent_mapping)
    # Generate space-joined BIO slot labels aligned with the segmented text.
    data_labeled = data_mapped.map(generate_bio_labels)
    final_data = data_labeled.remove_columns(["sentence_annotation"])
    # Save to file.
    final_data_train = final_data['train']
    final_data_dev = final_data['test']
    # Prepare 3 text files per split (seq.in / seq.out / label) under
    # <data_processed_path>/word-level/{train,dev}.
    if not os.path.exists(args.data_processed_path):
        os.mkdir(args.data_processed_path)
    # NOTE(review): all relative paths below depend on these chdir calls;
    # the mkdir calls fail if the layout already exists from a previous run.
    os.chdir(args.data_processed_path)
    os.mkdir("word-level")
    os.chdir("word-level")
    os.mkdir("train")
    os.mkdir("dev")
    write_column_to_txt(final_data_train, column_name="sentence", output_file= "./train/seq.in")
    write_column_to_txt(final_data_train, column_name="labels", output_file="./train/seq.out")
    write_column_to_txt(final_data_train, column_name="intent", output_file="./train/label")
    write_column_to_txt(final_data_dev, column_name="sentence", output_file="./dev/seq.in")
    write_column_to_txt(final_data_dev, column_name="labels", output_file="./dev/seq.out")
    write_column_to_txt(final_data_dev, column_name="intent", output_file="./dev/label")
    # intent_label.txt: unique train-split intents plus 'UNK', sorted.
    intent_list = list(set(final_data['train'][:]['intent']))
    intent_list.append('UNK')
    # Write to a .txt file
    with open("intent_label.txt", "w") as file:
        for intent in sorted(intent_list):
            file.write(str(intent) + "\n")
    # slot_label.txt: every BIO tag seen in the train split plus UNK/PAD.
    def get_unique_labels(data):
        # Collect the distinct tags across all 'labels' strings.
        unique_labels = set()
        for entry in data:
            labels = entry['labels'].split()
            unique_labels.update(labels)
        return list(unique_labels)
    slot_list = get_unique_labels(final_data['train'])
    slot_list.append('UNK')
    slot_list.append('PAD')
    # Write to a .txt file. NOTE(review): order comes from set iteration and
    # is not sorted, so the file ordering may vary between runs — confirm
    # whether downstream training relies on a stable label order.
    with open("slot_label.txt", "w") as file:
        for slot in slot_list:
            file.write(str(slot) + "\n")