File size: 856 Bytes
95f644c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
import json

# Maps each input JSONL path to the text file written for it.
FILES = {
    "dataset/train.jsonl": "fasttext_train.txt",
    "dataset/validation.jsonl": "fasttext_validation.txt",
    "dataset/test.jsonl": "fasttext_test.txt",
}


def format_fasttext_line(row: dict) -> str:
    """Return one ``__label__<label> <text>`` output line for *row*.

    *row* must provide the keys ``"label"`` and ``"content"``; both values
    are converted with ``str()``. The returned string ends with a newline.

    Raises:
        KeyError: if ``"label"`` or ``"content"`` is missing from *row*.
    """
    # The output format is whitespace-delimited, so a label with internal
    # whitespace would spill into the text field. Join its parts with "_";
    # this also drops leading/trailing whitespace like the former strip().
    label = "_".join(str(row["label"]).split())

    text = str(row["content"])

    # Neutralise the label prefix inside the text so it cannot be read back
    # as an additional label.
    text = text.replace("__label__", "__lbl__")

    # Collapse every whitespace run (including newlines) to a single space
    # so each sample occupies exactly one output line.
    text = " ".join(text.split())

    return f"__label__{label} {text}\n"


def convert_lines(lines):
    """Yield one formatted output line per non-blank JSON line in *lines*.

    *lines* is any iterable of strings, each holding one JSON object.
    Blank or whitespace-only lines are skipped instead of being handed to
    ``json.loads``, which would raise on them.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a row lacks ``"label"`` or ``"content"``.
    """
    for line in lines:
        if not line.strip():
            continue
        yield format_fasttext_line(json.loads(line))


def convert_file(input_file: str, output_file: str) -> int:
    """Convert the JSONL file *input_file* into *output_file*.

    Both files are opened as UTF-8 text; *output_file* is overwritten.
    Returns the number of samples written.
    """
    count = 0

    with open(input_file, "r", encoding="utf-8") as fin, \
         open(output_file, "w", encoding="utf-8") as fout:
        for sample in convert_lines(fin):
            fout.write(sample)
            count += 1

    return count


def main() -> None:
    """Convert every entry in ``FILES`` and report progress on stdout."""
    for input_file, output_file in FILES.items():
        print(f"Converting {input_file} -> {output_file}")
        count = convert_file(input_file, output_file)
        print(f"Saved {count:,} samples")

    print("\nDone.")


# Run the conversion only when executed as a script, so importing this
# module (e.g. to reuse or test the helpers) does not touch the filesystem.
if __name__ == "__main__":
    main()