# nlu_fix_0.8 / data_preprocess / preprocess_train_data.py
# Uploaded to the Hugging Face Hub by linhqyy (commit 8302e64).
from datasets import load_dataset, Dataset, DatasetDict
import random
random.seed(42)
import py_vncorenlp
py_vncorenlp.download_model(save_dir='./')
from args import args
import re
import json
import os
from collections import defaultdict
def normalize_example(example):
    """Normalize the text fields of a sample in place.

    Collapses runs of whitespace to a single space, strips leading/trailing
    spaces, and removes all dots and commas from both 'sentence' and
    'sentence_annotation'. Returns the (mutated) example.
    """
    for field in ('sentence', 'sentence_annotation'):
        text = re.sub(r'\s+', ' ', example[field]).strip()
        example[field] = text.replace('.', '').replace(',', '')
    return example
def word_segmentation(example):
    """Apply VnCoreNLP word segmentation to a sample in place.

    Segments 'sentence' (keeping only the first returned segment) and each
    entity 'filler' (joining all returned segments with spaces).

    NOTE: relies on the module-global ``rdrsegmenter`` created in __main__
    before this function is mapped over the dataset.
    """
    segments = rdrsegmenter.word_segment(example['sentence'])
    example['sentence'] = segments[0]
    for ent in example['entities']:
        pieces = rdrsegmenter.word_segment(ent['filler'])
        ent['filler'] = ' '.join(pieces)
    return example
# Vietnamese intent phrase -> canonical English slug. Hoisted to module
# level so the lookup table is built once at import time instead of being
# re-created on every call (it was a local literal inside the function,
# which also shadowed the sibling `intent_mapping` function's name).
_INTENT_TO_SLUG = {
    'bật thiết bị': 'turn_on_device',
    'giảm mức độ của thiết bị': 'decrease_device_level',
    'giảm nhiệt độ của thiết bị': 'decrease_device_temperature',
    'giảm âm lượng của thiết bị': 'decrease_device_volume',
    'giảm độ sáng của thiết bị': 'decrease_device_brightness',
    'hủy hoạt cảnh': 'cancel_scene',
    'kiểm tra tình trạng thiết bị': 'check_condition_device',
    'kích hoạt cảnh': 'activate_scene',
    'mở thiết bị': 'open_device',
    'tăng mức độ của thiết bị': 'increase_device_level',
    'tăng nhiệt độ của thiết bị': 'increase_device_temperature',
    'tăng âm lượng của thiết bị': 'increase_device_volume',
    'tăng độ sáng của thiết bị': 'increase_device_brightness',
    'tắt thiết bị': 'turn_off_device',
    'đóng thiết bị': 'close_device'
}


def vietnamese_intent_to_slug(intent: str) -> str:
    """Map a Vietnamese intent phrase to its English slug.

    Args:
        intent: The Vietnamese intent label exactly as it appears in the data.

    Returns:
        The English slug, or 'unknown_intent' for any unmapped phrase.
    """
    return _INTENT_TO_SLUG.get(intent, 'unknown_intent')
def intent_mapping(example):
    """Replace the sample's Vietnamese intent label with its English slug in place."""
    slug = vietnamese_intent_to_slug(example['intent'])
    example['intent'] = slug
    return example
def generate_bio_labels(example):
    """Build a space-joined BIO tag string aligned with the sentence tokens.

    For each entity, the filler's token sequence is searched in the sentence
    tokens; the first match is tagged 'B-<type>' followed by 'I-<type>'
    (spaces in the entity type become underscores). Tokens not covered by
    any matched entity stay 'O'. Entities whose filler is not found are
    silently skipped.
    """
    tokens = example['sentence'].split()
    tags = ['O'] * len(tokens)
    for entity in example['entities']:
        span = entity['filler'].split()
        suffix = entity['type'].replace(" ", "_")
        # First index where the filler tokens match the sentence tokens.
        match_at = next(
            (i for i in range(len(tokens) - len(span) + 1)
             if tokens[i:i + len(span)] == span),
            None,
        )
        if match_at is None:
            continue
        tags[match_at] = 'B-' + suffix
        for offset in range(1, len(span)):
            tags[match_at + offset] = 'I-' + suffix
    return {'labels': ' '.join(tags)}
def write_column_to_txt(dataset, column_name, output_file):
    """Write one value per line from a dataset column to a UTF-8 text file.

    Fixes the docstring, which documented a nonexistent ``output_file_name``
    parameter; the actual parameter is ``output_file``. Writes are batched
    with ``writelines`` instead of one ``write`` call per sample.

    Args:
        dataset: Mapping-style dataset (e.g. a Huggingface ``Dataset``) where
            ``dataset[column_name]`` yields the column's values.
        column_name (str): Name of the column whose values are written.
        output_file (str): Path of the output txt file (overwritten if it exists).
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(str(sample) + '\n' for sample in dataset[column_name])
if __name__ == "__main__":
    # Load the raw JSON data; load_dataset puts a single file under 'train'.
    data = load_dataset("json", data_files=args.data_raw_path)['train']
    # Stratified train/val split: group samples by intent so each intent
    # contributes roughly the same proportion to both splits.
    grouped_by_intent = defaultdict(list)
    for sample in data:
        grouped_by_intent[sample['intent']].append(sample)
    train_samples = []
    val_samples = []
    val_ratio = 0.08  # ~8% of each intent group goes to validation
    for intent, samples in grouped_by_intent.items():
        random.shuffle(samples)
        num_val_samples = int(len(samples) * val_ratio)
        val_samples.extend(samples[:num_val_samples])
        train_samples.extend(samples[num_val_samples:])
    # Rebuild HF Datasets from lists of dicts: column -> list of values,
    # with the column set taken from the first sample's keys.
    train_dataset = Dataset.from_dict({k: [dic[k] for dic in train_samples] for k in train_samples[0]})
    val_dataset = Dataset.from_dict({k: [dic[k] for dic in val_samples] for k in val_samples[0]})
    # Create a dataset dict; the validation split is stored under 'test'.
    data = DatasetDict({
        'train': train_dataset,
        'test': val_dataset
    })
    # Normalize text (collapse whitespace, drop dots/commas).
    data_normalized = data.map(normalize_example)
    # Word segmentation with VnCoreNLP (the model was downloaded at import time).
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = data_normalized.map(word_segmentation)
    # Map Vietnamese intent phrases to English slugs.
    data_mapped = data_segmented.map(intent_mapping)
    # Generate token-level BIO labels from the entity annotations.
    data_labeled = data_mapped.map(generate_bio_labels)
    final_data = data_labeled.remove_columns(["sentence_annotation"])
    # Save splits to file.
    final_data_train = final_data['train']
    final_data_dev = final_data['test']
    # Prepare the seq.in / seq.out / label text files for training.
    # NOTE(review): os.mkdir raises if 'word-level'/'train'/'dev' already
    # exist under data_processed_path — presumably this is a one-shot run;
    # confirm before re-running on an existing output directory.
    if not os.path.exists(args.data_processed_path):
        os.mkdir(args.data_processed_path)
    os.chdir(args.data_processed_path)
    os.mkdir("word-level")
    os.chdir("word-level")
    os.mkdir("train")
    os.mkdir("dev")
    write_column_to_txt(final_data_train, column_name="sentence", output_file= "./train/seq.in")
    write_column_to_txt(final_data_train, column_name="labels", output_file="./train/seq.out")
    write_column_to_txt(final_data_train, column_name="intent", output_file="./train/label")
    write_column_to_txt(final_data_dev, column_name="sentence", output_file="./dev/seq.in")
    write_column_to_txt(final_data_dev, column_name="labels", output_file="./dev/seq.out")
    write_column_to_txt(final_data_dev, column_name="intent", output_file="./dev/label")
    # intent_label.txt: unique train-split intents plus an 'UNK' fallback, sorted.
    intent_list = list(set(final_data['train'][:]['intent']))
    intent_list.append('UNK')
    # Write to a .txt file
    with open("intent_label.txt", "w") as file:
        for intent in sorted(intent_list):
            file.write(str(intent) + "\n")
    # slot_label.txt: unique BIO tags plus 'UNK' and 'PAD' (unsorted, set order).
    def get_unique_labels(data):
        # Collect every distinct BIO tag seen in the 'labels' column.
        unique_labels = set()
        for entry in data:
            labels = entry['labels'].split()
            unique_labels.update(labels)
        return list(unique_labels)
    slot_list = get_unique_labels(final_data['train'])
    slot_list.append('UNK')
    slot_list.append('PAD')
    # Write to a .txt file
    with open("slot_label.txt", "w") as file:
        for slot in slot_list:
            file.write(str(slot) + "\n")