# nlu_fix_0.8 / data_preprocess / prep_data_infer.py
# linhqyy's picture
# Upload folder using huggingface_hub
# 8302e64
import py_vncorenlp
py_vncorenlp.download_model(save_dir='./')
from datasets import load_dataset
import pandas as pd
from args import args
def word_segmentation(example):
    """Word-segment the text column of one dataset example in place.

    Relies on two module-level names: ``args`` (for ``args.text_column``) and
    ``rdrsegmenter``, which must be bound before this function is called
    (the script entry point creates it).

    Per the py_vncorenlp documentation, ``word_segment`` returns a list with
    one segmented string per detected sentence. All of them are joined with
    a single space so that no part of a multi-sentence input is dropped, and
    an input for which no sentence is returned yields an empty string
    instead of raising ``IndexError``.

    Args:
        example: Mapping for a single row; must contain ``args.text_column``.

    Returns:
        The same ``example`` object, with its text column replaced by the
        segmented text.
    """
    sentences = rdrsegmenter.word_segment(example[args.text_column])
    # Keep every sentence: indexing [0] would silently discard the rest.
    example[args.text_column] = " ".join(sentences)
    return example
def _write_lines(path, lines):
    """Write each string in *lines* to *path*, one per line, as UTF-8."""
    # Explicit UTF-8: the platform default encoding may not be able to
    # represent Vietnamese text.
    with open(path, "w", encoding="utf-8") as f:
        f.writelines(line + "\n" for line in lines)


def prepare_files(data):
    """Dump the text and id columns of *data* to ``input.txt`` / ``ids.txt``.

    Each file gets one value per line, in dataset order, so line ``k`` of
    ``ids.txt`` is the id of the text on line ``k`` of ``input.txt``.
    Column names come from the module-level ``args`` (``args.text_column``
    and ``args.id_column``). ``args.output_column`` is no longer read: it
    only labelled an in-memory frame and never affected the written files.

    Args:
        data: Object for which ``data[:]`` yields a mapping from column name
            to a sequence of values (e.g. a Hugging Face ``Dataset``). Text
            values must be strings; ids may be of any type and are written
            via ``str()``.
    """
    # Materialise the columns once instead of once per output file.
    columns = data[:]
    _write_lines("input.txt", columns[args.text_column])
    # Ids are converted explicitly so that integer ids do not raise TypeError.
    _write_lines("ids.txt", map(str, columns[args.id_column]))
if __name__ == "__main__":
    # Load the dataset and work only on the split used for inference, so
    # other splits (possibly with a different schema) are never touched.
    test_data = load_dataset(args.data_path)
    split = test_data[args.split_name]

    # Fail early, with a readable message, if a required column is absent.
    wanted = {args.text_column, args.id_column}
    missing = sorted(wanted.difference(split.column_names))
    if missing:
        raise ValueError(
            f"Column(s) {missing} not found in split {args.split_name!r}; "
            f"available columns: {split.column_names}"
        )

    # Remove unnecessary columns: keep only the text and id columns.
    cols_to_remove = [c for c in split.column_names if c not in wanted]
    split = split.remove_columns(cols_to_remove)

    # Word segmentation. `rdrsegmenter` must stay a module-level name with
    # exactly this spelling: word_segmentation() looks it up as a global.
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = split.map(word_segmentation)

    # Write input.txt and ids.txt for the inference step.
    prepare_files(data_segmented)