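# Train baseline "Model A": TF-IDF features + logistic regression.
#
# Reads the training and validation CSVs, fits a TfidfVectorizer and a
# LogisticRegression classifier on the training text, prints a classification
# report for the validation set, and saves both fitted objects with joblib.
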
import os
import time

import joblib
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report


# --- Configuration ---------------------------------------------------------
# Training / validation CSVs; both are read for their 'text' and 'label' columns.
TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"

# Directory and file paths the fitted vectorizer and classifier are written to.
MODEL_A_SAVE_DIR = "./final_model_tfidf"
VECTORIZER_PATH = os.path.join(MODEL_A_SAVE_DIR, "vectorizer.joblib")
MODEL_A_PATH = os.path.join(MODEL_A_SAVE_DIR, "model_A.joblib")

# Raw string label -> integer class id used for training and reporting.
label_map = {"real": 0, "fake": 1}


def prepare_split(df, split_name):
    """Encode the labels of one data split and return its texts and labels.

    The 'label' column of *df* is replaced in place by its integer encoding
    (via ``label_map``), matching what the script has always done.

    Args:
        df: DataFrame with a 'text' column and a 'label' column.
        split_name: Human-readable name of the split, used in error messages.

    Returns:
        A ``(texts, labels)`` pair of Series taken from *df*.

    Raises:
        ValueError: If any label is not a key of ``label_map``. ``Series.map``
            would otherwise turn such labels into NaN silently and the failure
            would only surface later, inside model fitting or evaluation.
    """
    encoded = df['label'].map(label_map)
    unmapped = encoded.isna()
    if unmapped.any():
        offenders = sorted({repr(value) for value in df.loc[unmapped, 'label']})
        raise ValueError(
            f"{split_name} split: {int(unmapped.sum())} row(s) have labels outside "
            f"{sorted(label_map)}: {', '.join(offenders)}"
        )
    df['label'] = encoded

    # Empty CSV cells are read as NaN, which TfidfVectorizer rejects as a
    # document; treat them as empty strings so the rows stay aligned with
    # their labels. Real strings pass through unchanged.
    texts = df['text'].fillna("").astype(str)
    return texts, df['label']


def fit_vectorizer(texts):
    """Fit the TF-IDF vectorizer on *texts*; return it with the transformed matrix."""
    print("\n--- 正在训练 TF-IDF Vectorizer... ---")
    started = time.perf_counter()  # monotonic clock: safe for measuring durations
    vectorizer = TfidfVectorizer(
        max_features=25000,
        stop_words='english',
        ngram_range=(1, 2),
    )
    features = vectorizer.fit_transform(texts)
    print(f"TF-IDF 训练完毕. 耗时: {time.perf_counter() - started:.2f} 秒")
    return vectorizer, features


def fit_classifier(features, labels):
    """Fit the logistic-regression classifier (Model A) and return it."""
    print("--- 正在训练逻辑回归 (Model A)... ---")
    started = time.perf_counter()
    model = LogisticRegression(
        class_weight='balanced',
        solver='liblinear',
        C=1.0,
        random_state=42,
    )
    model.fit(features, labels)
    print(f"模型 A 训练完毕. 耗时: {time.perf_counter() - started:.2f} 秒")
    return model


def report_validation(vectorizer, model, texts, labels):
    """Print the classification report of *model* on the validation split."""
    print("\n--- 【模型 A】在验证集上的独立性能 ---")
    predictions = model.predict(vectorizer.transform(texts))
    # Pin the label set explicitly: without `labels`, the report infers the
    # classes from the data and raises when only one of the two classes is
    # present, because it no longer matches the two target names.
    print(classification_report(
        labels,
        predictions,
        labels=[0, 1],
        target_names=['real (0)', 'fake (1)'],
        digits=4,
    ))


def save_artifacts(vectorizer, model):
    """Write the fitted vectorizer and classifier to their configured paths."""
    print(f"--- 正在保存模型 A 到 {MODEL_A_SAVE_DIR} ---")
    joblib.dump(vectorizer, VECTORIZER_PATH)
    joblib.dump(model, MODEL_A_PATH)
    print("模型 A (TF-IDF + LR) 保存完毕!")


def main():
    """Train, evaluate and save Model A.

    Returns:
        The fitted ``(vectorizer, model)`` pair.
    """
    # Create the output directory first so an unusable path fails before
    # any time is spent on training.
    os.makedirs(MODEL_A_SAVE_DIR, exist_ok=True)

    print("--- 正在训练【基础模型 A】 (TF-IDF + LR) ---")
    print("加载数据...")
    train_df = pd.read_csv(TRAIN_FILE_PATH)
    eval_df = pd.read_csv(VALID_FILE_PATH)

    X_train_text, y_train = prepare_split(train_df, "train")
    X_eval_text, y_eval = prepare_split(eval_df, "validation")

    print(f"训练集大小: {len(train_df)}")
    print(f"验证集大小: {len(eval_df)}")
    print(f"训练集标签分布:\n{y_train.value_counts(normalize=True)}")

    vectorizer, X_train_tfidf = fit_vectorizer(X_train_text)
    model = fit_classifier(X_train_tfidf, y_train)
    report_validation(vectorizer, model, X_eval_text, y_eval)
    save_artifacts(vectorizer, model)
    return vectorizer, model


if __name__ == "__main__":
    # Keep the fitted objects bound at module level, as the flat script did.
    vectorizer, model_A = main()