File size: 1,403 Bytes
41404b8 5a9888f 41404b8 5a9888f 41404b8 5a9888f 41404b8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 |
from datasets import load_dataset
import pandas as pd
# text_src_jsonl = '/home/zychen/hwproject/my_modeling_phase_1/mytest/text_src.jsonl'
# dataset = load_dataset("json", data_files=text_src_jsonl)["train"]
# print(f"Number of examples: {len(dataset)}")
# text_src_df = dataset.to_pandas()
# decoding_res = '/home/zychen/hwproject/my_modeling_phase_1/mytest_3600_test5k/decoding_res.json'
decoding_res = '/home/zychen/hwproject/my_modeling_phase_1/mytest_from56k+64k/decoding_res.json'
dataset2 = load_dataset("json", data_files=decoding_res)["train"]
print(f"Number of examples: {len(dataset2)}")
decoding_df = dataset2.to_pandas()
# df_merged = pd.concat([text_src_df, decoding_df], axis=1)
df_merged = decoding_df
print(df_merged.columns.tolist(), df_merged.iloc[4500])
def clean(sentence):
return ''.join(sentence.split())
df = df_merged
with open('text_src.txt', 'w', encoding='utf-8') as f:
for text in df['text_src']:
# cleaned_text = clean(text)
f.write(text + '\n')
# 将trans_res_seg列的内容写入hyp.txt
with open('hyp.txt', 'w', encoding='utf-8') as f:
for text in df['trans_res_seg']:
cleaned_text = clean(text)
f.write(cleaned_text + '\n')
# 将gt_seg列的内容写入ref.txt
with open('ref.txt', 'w', encoding='utf-8') as f:
for text in df['gt_seg']:
cleaned_text = clean(text)
f.write(cleaned_text + '\n')
|