File size: 8,076 Bytes
af9853e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224

import os
import sys
import torch
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from datasets import load_dataset, concatenate_datasets
from transformers import (
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    TrainingArguments, 
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ==========================================
# 1. Configuration
# ==========================================
class Config:
    """Central configuration: model choice, directory layout, label space,
    and training hyper-parameters for the sentiment fine-tuning run."""

    # Pretrained checkpoint to fine-tune.
    BASE_MODEL = "google-bert/bert-base-chinese"

    # Directory layout, rooted at the current working directory
    # (as requested: data/, checkpoints/, results/, docs/).
    BASE_DIR = os.getcwd()
    DATA_DIR = os.path.join(BASE_DIR, "data")
    CHECKPOINT_DIR = os.path.join(BASE_DIR, "checkpoints")
    RESULTS_DIR = os.path.join(BASE_DIR, "results")
    DOCS_DIR = os.path.join(BASE_DIR, "docs")

    # Three-way sentiment label space; the id mapping is derived from
    # LABEL2ID so the two dicts can never drift apart.
    NUM_LABELS = 3
    LABEL2ID = {'negative': 0, 'neutral': 1, 'positive': 2}
    ID2LABEL = {idx: name for name, idx in LABEL2ID.items()}

    # Training hyper-parameters.
    MAX_LENGTH = 128        # tokenizer truncation/padding length
    BATCH_SIZE = 32
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 3
    WARMUP_RATIO = 0.1
    SAVE_STEPS = 500        # also used as eval_steps
    LOGGING_STEPS = 100

# ==========================================
# 2. Utility Functions
# ==========================================
def ensure_directories():
    """Create every project directory (data/, checkpoints/, results/, docs/)
    if it does not already exist.

    Uses ``os.makedirs(..., exist_ok=True)`` instead of an exists-then-create
    pair, so a concurrent creation between check and call cannot raise.
    """
    for path in (Config.DATA_DIR, Config.CHECKPOINT_DIR, Config.RESULTS_DIR, Config.DOCS_DIR):
        if not os.path.isdir(path):
            # exist_ok guards against a race with another process/run.
            os.makedirs(path, exist_ok=True)
            print(f">>> Created directory: {path}")

def plot_training_history(log_history, save_path):
    """Plot the loss and accuracy curves from a HF ``Trainer`` log history
    and save the figure as an image.

    Args:
        log_history: ``trainer.state.log_history`` — a list of heterogeneous
            dicts; training steps carry ``loss``, evaluation steps carry
            ``eval_loss`` / ``eval_accuracy``.
        save_path: destination file path for the rendered figure.

    Any failure (missing fonts, malformed history) is caught and reported as
    a warning so plotting never aborts the training run.
    """
    try:
        # Try common CJK-capable fonts; cloud images may lack them, in which
        # case matplotlib falls back along the list (labels are English anyway).
        plt.rcParams['font.sans-serif'] = ['SimHei', 'Arial Unicode MS', 'DejaVu Sans']
        plt.rcParams['axes.unicode_minus'] = False

        df = pd.DataFrame(log_history)
        # FIX: the columns are only present if at least one log entry carried
        # them — indexing a missing column used to raise KeyError and silently
        # disable plotting via the except below.
        if 'loss' not in df.columns:
            return
        train_loss = df[df['loss'].notna()]
        if train_loss.empty:
            return
        if 'eval_accuracy' in df.columns:
            eval_acc = df[df['eval_accuracy'].notna()]
        else:
            eval_acc = pd.DataFrame()

        plt.figure(figsize=(12, 5))

        # Left panel: train (and, when logged, validation) loss.
        plt.subplot(1, 2, 1)
        plt.plot(train_loss['epoch'], train_loss['loss'], label='Train Loss', color='#FF6B6B')
        if 'eval_loss' in df.columns:
            eval_loss = df[df['eval_loss'].notna()]
            plt.plot(eval_loss['epoch'], eval_loss['eval_loss'], label='Val Loss', color='#4ECDC4')
        plt.title('Loss Curve')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.legend()
        plt.grid(True, alpha=0.3)

        # Right panel: validation accuracy, only if any eval ran.
        if not eval_acc.empty:
            plt.subplot(1, 2, 2)
            plt.plot(eval_acc['epoch'], eval_acc['eval_accuracy'], label='Val Accuracy', color='#6BCB77', marker='o')
            plt.title('Accuracy Curve')
            plt.xlabel('Epoch')
            plt.ylabel('Accuracy')
            plt.legend()
            plt.grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(save_path)
        print(f">>> Plot saved to {save_path}")
        plt.close()
    except Exception as e:
        # Plotting is best-effort; never let it crash the run.
        print(f"Warning: Plotting failed ({e})")

# ==========================================
# 3. Data Processing
# ==========================================
class DataProcessor:
    """Loads the two sentiment datasets, cleans and label-normalizes them,
    and produces a tokenized train/test split ready for the HF ``Trainer``."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

    def clean_data(self, example):
        """Filter predicate: keep only usable review texts.

        Drops ``None`` texts, the platform's "user left no review"
        placeholder, and texts shorter than 2 non-whitespace characters.
        """
        text = example['text']
        if text is None:
            return False
        if "此用户未填写评价内容" in text:
            return False
        if len(text.strip()) < 2:
            return False
        return True

    def unify_labels(self, example):
        """Normalize heterogeneous label encodings (full names, abbreviations,
        numeric strings, ints) to the canonical ids in ``Config.LABEL2ID``.

        BUG FIX: 'pos' was previously mapped to 0 (negative) and 'neg' to 2
        (positive); the abbreviations now match their full label names.
        """
        label = example['label']
        if isinstance(label, str):
            label = label.lower()
            if label in ('negative', 'neg', '0'):
                return {'label': 0}
            if label in ('neutral', 'neu', '1'):
                return {'label': 1}
            if label in ('positive', 'pos', '2'):
                return {'label': 2}
        return {'label': int(label)}

    def tokenize_function(self, examples):
        """Batched tokenization with fixed-length padding/truncation."""
        return self.tokenizer(examples['text'], padding="max_length", truncation=True, max_length=Config.MAX_LENGTH)

    def get_dataset(self):
        """Load, merge, clean, and tokenize both datasets.

        Returns:
            A ``DatasetDict`` with 'train'/'test' splits (90/10).
        """
        print(">>> Loading Datasets...")
        # Cache downloads under the project data directory.
        ds_clap = load_dataset("clapAI/MultiLingualSentiment", split="train", trust_remote_code=True, cache_dir=Config.DATA_DIR)
        ds_med = load_dataset("OpenModels/Chinese-Herbal-Medicine-Sentiment", split="train", trust_remote_code=True, cache_dir=Config.DATA_DIR)

        # Align column names across the two datasets.
        if 'review_text' in ds_med.column_names:
            ds_med = ds_med.rename_column('review_text', 'text')
        if 'sentiment_label' in ds_med.column_names:
            ds_med = ds_med.rename_column('sentiment_label', 'label')
        if 'language' in ds_clap.column_names:
            ds_clap = ds_clap.filter(lambda x: x['language'] == 'zh')

        common_cols = ['text', 'label']
        combined = concatenate_datasets([ds_clap.select_columns(common_cols), ds_med.select_columns(common_cols)])

        # Clean, normalize labels, then tokenize.
        combined = combined.filter(self.clean_data).map(self.unify_labels)
        # BUG FIX: keep the 'label' column — it was previously removed along
        # with 'text', leaving the Trainer no targets to compute the loss.
        tokenized = combined.map(self.tokenize_function, batched=True, remove_columns=['text'])

        return tokenized.train_test_split(test_size=0.1)

# ==========================================
# 4. Metrics
# ==========================================
def compute_metrics(pred):
    """Compute evaluation metrics for the HF ``Trainer``.

    Args:
        pred: An ``EvalPrediction`` with ``predictions`` (logits) and
            ``label_ids``.

    Returns:
        dict with 'accuracy', weighted 'f1', 'precision', and 'recall'.
        ('f1' must stay present: it is the ``metric_for_best_model``.)
    """
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    # zero_division=0 silences the undefined-metric warning when a class is
    # absent from an eval batch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average='weighted', zero_division=0
    )
    acc = accuracy_score(labels, preds)
    # precision/recall were previously computed and then discarded; report them.
    return {'accuracy': acc, 'f1': f1, 'precision': precision, 'recall': recall}

# ==========================================
# 5. Main Pipeline
# ==========================================
def main():
    """End-to-end fine-tuning pipeline: set up directories, load and tokenize
    data, build the classifier, train, save the final model, and plot curves."""
    print("=== Cloud Training Script ===")
    ensure_directories()
    
    if torch.cuda.is_available():
        print(f"✅ CUDA Enabled: {torch.cuda.get_device_name(0)}")
    else:
        print("⚠️ Running on CPU")

    tokenizer = AutoTokenizer.from_pretrained(Config.BASE_MODEL)
    processor = DataProcessor(tokenizer)
    dataset = processor.get_dataset()
    
    # Sequence-classification head sized and labeled per Config.
    model = AutoModelForSequenceClassification.from_pretrained(
        Config.BASE_MODEL, 
        num_labels=Config.NUM_LABELS,
        id2label=Config.ID2LABEL,
        label2id=Config.LABEL2ID
    )
    
    training_args = TrainingArguments(
        output_dir=Config.CHECKPOINT_DIR,    # checkpoints are written here
        num_train_epochs=Config.NUM_EPOCHS,
        per_device_train_batch_size=Config.BATCH_SIZE,
        per_device_eval_batch_size=Config.BATCH_SIZE,
        learning_rate=Config.LEARNING_RATE,
        warmup_ratio=Config.WARMUP_RATIO,
        logging_dir=os.path.join(Config.RESULTS_DIR, 'logs'), # logs go under results/
        logging_steps=Config.LOGGING_STEPS,
        # eval and save cadence must match for load_best_model_at_end to work.
        eval_strategy="steps",
        eval_steps=Config.SAVE_STEPS,
        save_steps=Config.SAVE_STEPS,
        save_total_limit=2,
        load_best_model_at_end=True,
        metric_for_best_model="f1",   # key must exist in compute_metrics output
        fp16=torch.cuda.is_available(),  # mixed precision only on GPU
    )
    
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset['train'],
        eval_dataset=dataset['test'],
        # NOTE(review): `processing_class` replaces the deprecated `tokenizer`
        # argument in recent transformers releases — confirm installed version.
        processing_class=tokenizer,
        compute_metrics=compute_metrics,
    )
    
    print(">>> Starting Training...")
    trainer.train()
    
    # Save the final (best, given load_best_model_at_end) model and tokenizer
    # to checkpoints/final_model.
    final_path = os.path.join(Config.CHECKPOINT_DIR, "final_model")
    print(f">>> Saving Final Model to {final_path}...")
    trainer.save_model(final_path)
    tokenizer.save_pretrained(final_path)
    
    # Render training curves into results/.
    print(">>> Generating Plots...")
    plot_path = os.path.join(Config.RESULTS_DIR, "training_curves_cloud.png")
    plot_training_history(trainer.state.log_history, plot_path)
    
    print(">>> All Done!")

# Entry point: run the training pipeline only when executed as a script.
if __name__ == "__main__":
    main()