# nlu_fix_0.8 / data_preprocess / preprocess_train_data.py
# Uploaded to the Hugging Face Hub by linhqyy (commit 8302e64).
from datasets import load_dataset, Dataset, DatasetDict
import random
random.seed(42)
import py_vncorenlp
py_vncorenlp.download_model(save_dir='./')
from args import args
import re
import json
import os
from collections import defaultdict
def normalize_example(example):
    """Normalize the text fields of a sample in place.

    Collapses runs of whitespace to a single space, strips leading/trailing
    spaces, and removes all dots and commas from both 'sentence' and
    'sentence_annotation'. Returns the (mutated) example.
    """
    for field in ('sentence', 'sentence_annotation'):
        text = re.sub(r'\s+', ' ', example[field]).strip()
        example[field] = text.replace('.', '').replace(',', '')
    return example
def word_segmentation(example):
    """Apply VnCoreNLP word segmentation to a sample in place.

    Segments 'sentence' (keeping only the first returned segment) and each
    entity 'filler' (joining all returned segments with spaces).

    NOTE: relies on the module-global ``rdrsegmenter`` created in __main__
    before this function is mapped over the dataset.
    """
    segments = rdrsegmenter.word_segment(example['sentence'])
    example['sentence'] = segments[0]
    for ent in example['entities']:
        pieces = rdrsegmenter.word_segment(ent['filler'])
        ent['filler'] = ' '.join(pieces)
    return example
# Vietnamese intent phrase -> canonical English slug. Hoisted to module
# level so the lookup table is built once at import time instead of being
# re-created on every call (it was a local literal inside the function,
# which also shadowed the sibling `intent_mapping` function's name).
_INTENT_TO_SLUG = {
    'bật thiết bị': 'turn_on_device',
    'giảm mức độ của thiết bị': 'decrease_device_level',
    'giảm nhiệt độ của thiết bị': 'decrease_device_temperature',
    'giảm âm lượng của thiết bị': 'decrease_device_volume',
    'giảm độ sáng của thiết bị': 'decrease_device_brightness',
    'hủy hoạt cảnh': 'cancel_scene',
    'kiểm tra tình trạng thiết bị': 'check_condition_device',
    'kích hoạt cảnh': 'activate_scene',
    'mở thiết bị': 'open_device',
    'tăng mức độ của thiết bị': 'increase_device_level',
    'tăng nhiệt độ của thiết bị': 'increase_device_temperature',
    'tăng âm lượng của thiết bị': 'increase_device_volume',
    'tăng độ sáng của thiết bị': 'increase_device_brightness',
    'tắt thiết bị': 'turn_off_device',
    'đóng thiết bị': 'close_device'
}


def vietnamese_intent_to_slug(intent: str) -> str:
    """Map a Vietnamese intent phrase to its English slug.

    Args:
        intent: The Vietnamese intent label exactly as it appears in the data.

    Returns:
        The English slug, or 'unknown_intent' for any unmapped phrase.
    """
    return _INTENT_TO_SLUG.get(intent, 'unknown_intent')
def intent_mapping(example):
    """Replace the sample's Vietnamese intent label with its English slug in place."""
    slug = vietnamese_intent_to_slug(example['intent'])
    example['intent'] = slug
    return example
def generate_bio_labels(example):
    """Build a space-joined BIO tag string aligned with the sentence tokens.

    For each entity, the filler's token sequence is searched in the sentence
    tokens; the first match is tagged 'B-<type>' followed by 'I-<type>'
    (spaces in the entity type become underscores). Tokens not covered by
    any matched entity stay 'O'. Entities whose filler is not found are
    silently skipped.
    """
    tokens = example['sentence'].split()
    tags = ['O'] * len(tokens)
    for entity in example['entities']:
        span = entity['filler'].split()
        suffix = entity['type'].replace(" ", "_")
        # First index where the filler tokens match the sentence tokens.
        match_at = next(
            (i for i in range(len(tokens) - len(span) + 1)
             if tokens[i:i + len(span)] == span),
            None,
        )
        if match_at is None:
            continue
        tags[match_at] = 'B-' + suffix
        for offset in range(1, len(span)):
            tags[match_at + offset] = 'I-' + suffix
    return {'labels': ' '.join(tags)}
def write_column_to_txt(dataset, column_name, output_file):
    """Write one value per line from a dataset column to a UTF-8 text file.

    Fixes the docstring, which documented a nonexistent ``output_file_name``
    parameter; the actual parameter is ``output_file``. Writes are batched
    with ``writelines`` instead of one ``write`` call per sample.

    Args:
        dataset: Mapping-style dataset (e.g. a Huggingface ``Dataset``) where
            ``dataset[column_name]`` yields the column's values.
        column_name (str): Name of the column whose values are written.
        output_file (str): Path of the output txt file (overwritten if it exists).
    """
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(str(sample) + '\n' for sample in dataset[column_name])
if __name__ == "__main__":
    # Load the raw JSON data; load_dataset puts a single file under 'train'.
    data = load_dataset("json", data_files=args.data_raw_path)['train']
    # Stratified train/val split: group samples by intent so each intent
    # contributes roughly the same proportion to both splits.
    grouped_by_intent = defaultdict(list)
    for sample in data:
        grouped_by_intent[sample['intent']].append(sample)
    train_samples = []
    val_samples = []
    val_ratio = 0.08  # ~8% of each intent group goes to validation
    for intent, samples in grouped_by_intent.items():
        random.shuffle(samples)
        num_val_samples = int(len(samples) * val_ratio)
        val_samples.extend(samples[:num_val_samples])
        train_samples.extend(samples[num_val_samples:])
    # Rebuild HF Datasets from lists of dicts: column -> list of values,
    # with the column set taken from the first sample's keys.
    train_dataset = Dataset.from_dict({k: [dic[k] for dic in train_samples] for k in train_samples[0]})
    val_dataset = Dataset.from_dict({k: [dic[k] for dic in val_samples] for k in val_samples[0]})
    # Create a dataset dict; the validation split is stored under 'test'.
    data = DatasetDict({
        'train': train_dataset,
        'test': val_dataset
    })
    # Normalize text (collapse whitespace, drop dots/commas).
    data_normalized = data.map(normalize_example)
    # Word segmentation with VnCoreNLP (the model was downloaded at import time).
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = data_normalized.map(word_segmentation)
    # Map Vietnamese intent phrases to English slugs.
    data_mapped = data_segmented.map(intent_mapping)
    # Generate token-level BIO labels from the entity annotations.
    data_labeled = data_mapped.map(generate_bio_labels)
    final_data = data_labeled.remove_columns(["sentence_annotation"])
    # Save splits to file.
    final_data_train = final_data['train']
    final_data_dev = final_data['test']
    # Prepare the seq.in / seq.out / label text files for training.
    # NOTE(review): os.mkdir raises if 'word-level'/'train'/'dev' already
    # exist under data_processed_path — presumably this is a one-shot run;
    # confirm before re-running on an existing output directory.
    if not os.path.exists(args.data_processed_path):
        os.mkdir(args.data_processed_path)
    os.chdir(args.data_processed_path)
    os.mkdir("word-level")
    os.chdir("word-level")
    os.mkdir("train")
    os.mkdir("dev")
    write_column_to_txt(final_data_train, column_name="sentence", output_file= "./train/seq.in")
    write_column_to_txt(final_data_train, column_name="labels", output_file="./train/seq.out")
    write_column_to_txt(final_data_train, column_name="intent", output_file="./train/label")
    write_column_to_txt(final_data_dev, column_name="sentence", output_file="./dev/seq.in")
    write_column_to_txt(final_data_dev, column_name="labels", output_file="./dev/seq.out")
    write_column_to_txt(final_data_dev, column_name="intent", output_file="./dev/label")
    # intent_label.txt: unique train-split intents plus an 'UNK' fallback, sorted.
    intent_list = list(set(final_data['train'][:]['intent']))
    intent_list.append('UNK')
    # Write to a .txt file
    with open("intent_label.txt", "w") as file:
        for intent in sorted(intent_list):
            file.write(str(intent) + "\n")
    # slot_label.txt: unique BIO tags plus 'UNK' and 'PAD' (unsorted, set order).
    def get_unique_labels(data):
        # Collect every distinct BIO tag seen in the 'labels' column.
        unique_labels = set()
        for entry in data:
            labels = entry['labels'].split()
            unique_labels.update(labels)
        return list(unique_labels)
    slot_list = get_unique_labels(final_data['train'])
    slot_list.append('UNK')
    slot_list.append('PAD')
    # Write to a .txt file
    with open("slot_label.txt", "w") as file:
        for slot in slot_list:
            file.write(str(slot) + "\n")