File size: 1,281 Bytes
8302e64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import py_vncorenlp
# NOTE: import-time side effect — downloads the VnCoreNLP model jar/files into
# the current directory on every run (no-op if already present).
py_vncorenlp.download_model(save_dir='./')
from datasets import load_dataset
import pandas as pd
# Project-local CLI/config namespace; supplies column names, paths, split name.
from args import args

def word_segmentation(example):
    """Replace the text column of *example* with its word-segmented form.

    Intended for use with ``datasets.Dataset.map``; relies on the
    module-level ``rdrsegmenter`` created in the ``__main__`` block.

    NOTE(review): ``word_segment`` returns a list of segmented sentences
    and only element ``[0]`` is kept — multi-sentence inputs lose all but
    the first sentence. Confirm this is intended.
    """
    raw_text = example[args.text_column]
    segmented_sentences = rdrsegmenter.word_segment(raw_text)
    example[args.text_column] = segmented_sentences[0]
    return example

def prepare_files(data):
    """Write the segmented texts and their ids to ``input.txt`` / ``ids.txt``.

    Args:
        data: a ``datasets.Dataset`` (or mapping) supporting ``data[:][col]``,
            with the columns named by ``args.text_column`` and ``args.id_column``.

    Side effects:
        Creates/overwrites ``input.txt`` (one text per line) and ``ids.txt``
        (one id per line) in the current working directory.
    """
    df_test = pd.DataFrame(data[:][args.text_column], columns=[args.output_column])

    # encoding="utf-8" is required: the texts are Vietnamese and the platform
    # default encoding may not be able to represent them.
    with open("input.txt", "w", encoding="utf-8") as f:
        # Iterate the column directly instead of positional indexing per row.
        f.writelines(str(text) + "\n" for text in df_test[args.output_column])

    ids = pd.DataFrame(data[:][args.id_column], columns=[args.id_column])
    with open("ids.txt", "w", encoding="utf-8") as f:
        # str() guards against non-string ids (e.g. integer ids would have
        # raised TypeError on the original `id + "\n"` concatenation).
        f.writelines(str(i) + "\n" for i in ids[args.id_column])

if __name__ == "__main__":
    # load data
    test_data = load_dataset(args.data_path)

    # remove unnecessary columns
    cols_to_remove = test_data[args.split_name].column_names
    cols_to_remove.remove(args.text_column)
    cols_to_remove.remove(args.id_column)
    test_data = test_data.remove_columns(cols_to_remove)

    # word segmentation
    rdrsegmenter = py_vncorenlp.VnCoreNLP(annotators=["wseg"], save_dir='./')
    data_segmented = test_data[args.split_name].map(word_segmentation)

    # prepare files
    prepare_files(data_segmented)