import os

import pandas as pd
from sklearn.model_selection import train_test_split

# Annotated Excel workbook to read. This is an absolute path on one specific
# machine; adjust it for the environment the script runs in.
EXCEL_FILE_PATH = "/home/hsichen/part_time/BERT_finetune/标注数据_更正后.xlsx"

# Directory that receives train.csv / validation.csv / test.csv.
OUTPUT_DIR = "./processed_data_task2_fixed"

# Fraction of the filtered rows held out as the test set.
TEST_SIZE = 0.2

# Seed forwarded to both train_test_split calls so the split is reproducible.
RANDOM_SEED = 42


def preprocess_data(
    excel_path: str,
    output_dir: str,
    test_size: float,
    random_seed: int,
    val_size: float = 0.1,
) -> None:
    """Clean the annotated Excel sheet and write train/validation/test CSVs.

    Rows with ``Envir == 1`` are kept, the five tag columns are packed into a
    single ``labels`` list per row, and the ``sentence`` column is renamed to
    ``text``. The rows are then split twice, stratified on the exact label
    combination (not on the number of labels), and the three subsets are
    written to ``train.csv``, ``validation.csv`` and ``test.csv`` inside
    *output_dir*.

    Args:
        excel_path: Path of the Excel workbook to read.
        output_dir: Directory that receives the CSV files; created if missing.
        test_size: Fraction of the filtered rows reserved for the test set.
        random_seed: Seed passed to both splits.
        val_size: Fraction of the filtered rows reserved for the validation
            set. Defaults to 0.1, the value that used to be hard-coded.

    Returns:
        None. A workbook that cannot be read is reported on stdout and the
        function returns early without writing anything.

    Raises:
        ValueError: If ``test_size``/``val_size`` are not fractions in (0, 1)
            whose sum is below 1. scikit-learn may also raise it when the
            stratified split cannot be satisfied (for example a label
            combination that occurs only once).
        KeyError: If the workbook lacks one of the required columns.
    """
    # Fail fast on fractions that would otherwise surface as a division by
    # zero or an opaque scikit-learn error only after the workbook was loaded.
    if not (0 < test_size < 1 and 0 < val_size < 1 and test_size + val_size < 1):
        raise ValueError(
            "test_size 和 val_size 必须位于 (0, 1) 区间且二者之和小于 1,"
            f"当前为 test_size={test_size}, val_size={val_size}"
        )

    print(f"--- 1. 读取数据: {excel_path} ---")
    try:
        df = pd.read_excel(excel_path)
    except FileNotFoundError:
        print(f"错误:文件未找到在路径: {excel_path}")
        return
    except Exception as e:
        # Best-effort boundary: any other read problem (bad format, missing
        # Excel engine, ...) is reported and aborts the run.
        print(f"读取Excel文件时发生错误: {e}")
        return

    tag_cols = ['Data', 'Action', 'Gain', 'Regu', 'Vague']

    # Report every missing column at once instead of failing on the first
    # bare KeyError raised by a column lookup further down.
    missing = [col for col in ['Envir', *tag_cols] if col not in df.columns]
    # A sheet whose sentence column is already named 'text' is accepted as-is.
    if 'sentence' not in df.columns and 'text' not in df.columns:
        missing.append('sentence')
    if missing:
        raise KeyError(f"Excel 文件缺少必需的列: {missing}")

    # Keep only the rows flagged Envir == 1.
    df = df[df['Envir'] == 1].copy()
    print(f"筛选 Envir=1 后数据条数: {len(df)}")

    # Pack the tag columns into one list per row and normalise the text column name.
    df['labels'] = df[tag_cols].values.tolist()
    df = df.rename(columns={'sentence': 'text'})

    print("--- 3. 标签组合类型统计 ---")
    # Lists are unhashable, so the label combination is turned into the string
    # form of a tuple; that string is the stratification key.
    df['stratify_col'] = df['labels'].apply(tuple).astype(str)
    # Show how often each combination occurs; very rare combinations are the
    # usual reason a stratified split is rejected.
    print(df['stratify_col'].value_counts().to_string())
    print("-" * 30)

    df = df[['text', 'labels', 'stratify_col']].copy()

    print(
        f"--- 划分数据集 (训练集:{1 - test_size - val_size:g}, "
        f"验证集:{val_size:g}, 测试集:{test_size}) ---"
    )

    # First split: carve the test set off the full data.
    train_val_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=random_seed,
        stratify=df['stratify_col'],
    )

    # Second split: val_size is a fraction of the whole data set, so rescale
    # it to a fraction of what remains after the test set was removed.
    val_size_from_train = val_size / (1 - test_size)
    train_df, val_df = train_test_split(
        train_val_df,
        test_size=val_size_from_train,
        random_state=random_seed,
        stratify=train_val_df['stratify_col'],
    )

    os.makedirs(output_dir, exist_ok=True)

    # (display name, subset, destination) for each output file.
    outputs = [
        ("训练集", train_df, os.path.join(output_dir, 'train.csv')),
        ("验证集", val_df, os.path.join(output_dir, 'validation.csv')),
        ("测试集", test_df, os.path.join(output_dir, 'test.csv')),
    ]

    # The stratification helper column is not part of the saved data.
    for _, subset, csv_path in outputs:
        subset[['text', 'labels']].to_csv(csv_path, index=False)

    print("--- 结果保存成功 ---")
    for display_name, subset, csv_path in outputs:
        print(f"{display_name}条数: {len(subset)}. 保存至: {csv_path}")


if __name__ == "__main__":
    # Run the pipeline with the module-level defaults when executed as a script.
    preprocess_data(EXCEL_FILE_PATH, OUTPUT_DIR, TEST_SIZE, RANDOM_SEED)