Code-Lang-Classifier / FastText /convert-to-fast-text-format.py
kaushik-harsh-99's picture
initial-upload
95f644c
import json
# Input JSONL split -> fastText-format output file. The dict keeps
# insertion order, which is the order the splits are converted in.
FILES = dict(
    (
        ("dataset/train.jsonl", "fasttext_train.txt"),
        ("dataset/validation.jsonl", "fasttext_validation.txt"),
        ("dataset/test.jsonl", "fasttext_test.txt"),
    )
)
def to_fasttext_line(row):
    """Format one dataset record as a single fastText training line.

    Args:
        row: Mapping with a "label" entry and a "content" entry. Both are
            passed through ``str()`` before use.

    Returns:
        The string ``"__label__<label> <text>\\n"``, where ``<text>`` is the
        content with every run of whitespace collapsed to one space.

    Raises:
        KeyError: If "label" or "content" is missing from ``row``.
    """
    # fastText reads the label as one whitespace-delimited token, so a label
    # such as "Objective C" would be cut short and its tail would leak into
    # the text. Join the pieces with "_" to keep the label in one token.
    label = "_".join(str(row["label"]).split())

    # Neutralise the label prefix inside the sample text so that no token of
    # the content can be mistaken for an additional label.
    text = str(row["content"]).replace("__label__", "__lbl__")

    # One sample per output line: newlines, tabs and repeated spaces all
    # become a single space.
    text = " ".join(text.split())

    return f"__label__{label} {text}\n"


def convert_file(input_file, output_file):
    """Convert one JSONL file into a fastText-format text file.

    Each non-blank line of ``input_file`` is parsed as a JSON object and
    written to ``output_file`` as one line produced by ``to_fasttext_line``.
    An existing ``output_file`` is overwritten.

    Args:
        input_file: Path of the UTF-8 JSONL file to read.
        output_file: Path of the UTF-8 text file to write.

    Returns:
        The number of samples written.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        KeyError: If a record lacks "label" or "content".
        OSError: If either file cannot be opened.
    """
    count = 0
    with open(input_file, "r", encoding="utf-8") as fin, \
            open(output_file, "w", encoding="utf-8") as fout:
        for line in fin:
            # Blank lines (typically a trailing one at the end of the file)
            # are not records; json.loads would raise on them.
            if not line.strip():
                continue
            fout.write(to_fasttext_line(json.loads(line)))
            count += 1
    return count


def main():
    """Convert every split listed in FILES, reporting progress on stdout."""
    for input_file, output_file in FILES.items():
        print(f"Converting {input_file} -> {output_file}")
        count = convert_file(input_file, output_file)
        print(f"Saved {count:,} samples")
    print("\nDone.")


# Running the file as a script performs the conversion exactly as before;
# the guard only keeps the work from starting when the code is imported.
if __name__ == "__main__":
    main()