|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def _split_by_category(sheet, n_categories, category_size, train_size, seed):
    """Split one sheet into train and test frames, one category at a time.

    Each consecutive block of ``category_size`` rows is treated as one
    category. From every block ``train_size`` rows are sampled for training
    and the remaining rows of that block go to the test set.

    Args:
        sheet: DataFrame holding the rows of a single worksheet.
        n_categories: Number of consecutive row blocks to process.
        category_size: Number of rows in each block.
        train_size: Number of rows sampled from each block for training.
        seed: ``random_state`` passed to ``DataFrame.sample`` for every block.

    Returns:
        A ``(train, test)`` tuple of DataFrames, each the concatenation of
        the per-category pieces in category order.
    """
    import pandas as pd

    train_parts = []
    test_parts = []
    for category in range(n_categories):
        start = category * category_size
        # Positional slice: rows are selected by position, not by label.
        category_rows = sheet.iloc[start:start + category_size]
        # The same seed is reused for every block, so the sampled positions
        # within a block repeat across categories and across sheets.
        train = category_rows.sample(n=train_size, random_state=seed)
        train_parts.append(train)
        # Whatever was not sampled for training becomes test data.
        test_parts.append(category_rows.drop(train.index))
    return pd.concat(train_parts), pd.concat(test_parts)


def split_data(data_path, output_path, *, n_categories=5, category_size=40,
               train_size=20, seed=42):
    """Split the 'Yes or No' and 'Factoid' sheets into train/test workbooks.

    Reads both sheets from ``data_path``, splits each one per category (see
    ``_split_by_category``) and writes four sheets to ``output_path``:
    'Yes or No Train', 'Yes or No Test', 'Factoid Train', 'Factoid Test'.

    Args:
        data_path: Path of the input Excel workbook.
        output_path: Path of the Excel workbook to create.
        n_categories: Number of consecutive row blocks per sheet.
        category_size: Number of rows in each block.
        train_size: Number of rows per block that go to the training set.
        seed: Seed used for the sampling (and for the ``random`` module).

    Raises:
        ValueError: If the split sizes are inconsistent, or (from pandas) if
            a block holds fewer than ``train_size`` rows.
    """
    import random

    import pandas as pd

    if n_categories < 1 or not 0 <= train_size <= category_size:
        raise ValueError(
            "expected n_categories >= 1 and 0 <= train_size <= category_size, "
            f"got n_categories={n_categories}, train_size={train_size}, "
            f"category_size={category_size}"
        )

    data_yes_no = pd.read_excel(data_path, sheet_name='Yes or No')
    data_factoid = pd.read_excel(data_path, sheet_name='Factoid')

    # The sampling below is driven by ``random_state``; this call is kept only
    # so the global ``random`` state after the call matches earlier versions.
    random.seed(seed)

    train_data_yes_no, test_data_yes_no = _split_by_category(
        data_yes_no, n_categories, category_size, train_size, seed
    )
    train_data_factoid, test_data_factoid = _split_by_category(
        data_factoid, n_categories, category_size, train_size, seed
    )

    # The context manager closes the workbook even when a write fails.
    with pd.ExcelWriter(output_path, engine='xlsxwriter') as writer:
        train_data_yes_no.to_excel(writer, sheet_name='Yes or No Train', index=False)
        test_data_yes_no.to_excel(writer, sheet_name='Yes or No Test', index=False)
        train_data_factoid.to_excel(writer, sheet_name='Factoid Train', index=False)
        test_data_factoid.to_excel(writer, sheet_name='Factoid Test', index=False)

    print(f"数据已保存到 {output_path}")
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: runs the split on the default workbook locations.
    import pandas as pd
    import random

    # Source workbook first, destination workbook second.
    data_path, output_path = './data/Task2.xlsx', './data/train_test_data.xlsx'

    split_data(data_path, output_path)
|
|
|
|