File size: 1,281 Bytes
8302e64 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import py_vncorenlp
py_vncorenlp.download_model(save_dir='./')
from datasets import load_dataset
import pandas as pd
from args import args
def word_segmentation(example):
example[args.text_column] = rdrsegmenter.word_segment(example[args.text_column])[0]
return example
def prepare_files(data):
df_test = pd.DataFrame(data[:][args.text_column], columns=[args.output_column])
with open("input.txt", "w") as f:
for i in range(len(df_test)):
f.write(df_test[args.output_column][i] + "\n")
ids = pd.DataFrame(data[:][args.id_column], columns=[args.id_column])
with open("ids.txt", "w") as f:
for i in range(len(ids)):
f.write(ids[args.id_column][i]+ "\n")
if __name__ == "__main__":
# load data
test_data = load_dataset(args.data_path)
# remove unnecessary columns
cols_to_remove = test_data[args.split_name].column_names
cols_to_remove.remove(args.text_column)
cols_to_remove.remove(args.id_column)
test_data = test_data.remove_columns(cols_to_remove)
# word segmentation
rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
data_segmented = test_data[args.split_name].map(word_segmentation)
# prepare files
prepare_files(data_segmented)
|