# bert_remark / method2 / extract_features.py
# Uploaded by BaltimoreCA68 via the upload-large-folder tool (commit 027ce51).
# ==============================================================================
# Script 1: extract_features.py (VADER, final revised version)
# Purpose: use the VADER library to extract professional short-text sentiment
# scores, to serve as stacking meta-features.
# ==============================================================================
import pandas as pd
import numpy as np
import os
import re
import time
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from collections import defaultdict
# 不需要 word_tokenize,VADER 内部有处理
# --- 1. Define file paths ---
# NOTE(review): an earlier comment claimed the data files live in
# /root/project/, but the paths below point at /tmp/home/wzh/file/ — confirm
# which location is current. Output CSVs are written to the working directory.
TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"
VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"
OUTPUT_TRAIN_FILE = "train_features.csv"
OUTPUT_EVAL_FILE = "val_features.csv"
# --- 2. VADER initialization and NLTK data-path repair ---
def initialize_vader_analyzer():
    """Create a SentimentIntensityAnalyzer, repairing the NLTK data path if needed.

    On a LookupError (missing vader_lexicon), a local './sentiment' directory
    (the result of running ``nltk.downloader -d .``) is appended to
    ``nltk.data.path`` and initialization is retried once. If the directory is
    missing or the retry also fails, a diagnostic is printed and the
    LookupError is re-raised.

    Returns:
        A ready-to-use SentimentIntensityAnalyzer instance.
    """
    try:
        # Fast path: lexicon already discoverable by NLTK.
        return SentimentIntensityAnalyzer()
    except LookupError:
        print("--- 警告: 正在尝试自动修复 NLTK LookupError ---")
        # Expect the lexicon under ./sentiment in the current working directory.
        local_dir = os.path.join(os.getcwd(), 'sentiment')

        if not os.path.isdir(local_dir):
            # Guard clause: nothing to retry with — report and propagate.
            print(f"!!! 致命错误: 找不到本地 VADER 文件夹 [{local_dir}]。请确保下载成功。!!!")
            raise

        nltk.data.path.append(local_dir)
        print(f"已将本地路径 [{local_dir}] 添加到 NLTK 搜索路径。")
        try:
            # Second (and last) attempt with the augmented search path.
            return SentimentIntensityAnalyzer()
        except LookupError:
            print("!!! 致命错误: 无法在本地路径找到 VADER 词典。请检查 'sentiment' 文件夹结构。!!!")
            raise
# --- 3. 增强特征提取函数 ---
def extract_all_features(df, analyzer):
    """Extract VADER sentiment scores plus hand-crafted text features per review.

    Args:
        df: DataFrame with at least 'id', 'text' and 'label' columns. The new
            feature columns are also added to this frame in place (same side
            effect as before).
        analyzer: object exposing ``polarity_scores(text) -> dict`` containing
            'compound', 'neg' and 'pos' keys (e.g. NLTK's
            SentimentIntensityAnalyzer).

    Returns:
        A copy of df restricted to id/text/label plus five feature columns:
          - vader_compound_score: overall polarity in [-1, 1]
          - vader_neg_score / vader_pos_score: negative / positive intensity
          - lengthening_ratio: fraction of characters inside runs of one
            character repeated 3+ times (e.g. "sooo"); note the regex matches
            ANY repeated character, not only letters
          - extreme_punctuation: count of runs of 3+ '!' or '?'
    """
    # A run of the same character repeated 3 or more times ("soooo", "!!!").
    lengthening_pattern = re.compile(r'(.)\1{2,}')
    # Runs of 3 or more consecutive exclamation or question marks.
    punct_pattern = re.compile(r'(!{3,})|(\?{3,})')

    texts = df['text'].astype(str).tolist()
    print(f"开始提取 {len(texts)} 条评论的 VADER 增强特征...")

    # Accumulate per-row values in plain lists and assign whole columns once at
    # the end: one vectorized assignment per column instead of O(n) scalar
    # df.loc writes, which are very slow on large frames.
    compound, neg, pos, lengthening, extreme = [], [], [], [], []
    for text in texts:
        # A. VADER sentiment scores.
        scores = analyzer.polarity_scores(text)
        compound.append(scores['compound'])
        neg.append(scores['neg'])
        pos.append(scores['pos'])

        # B. Fraction of the text covered by character-lengthening runs.
        total_len = len(text)
        if total_len > 0:
            lengthened_chars = sum(
                len(m.group(0)) for m in lengthening_pattern.finditer(text)
            )
            lengthening.append(lengthened_chars / total_len)
        else:
            lengthening.append(0.0)  # empty string: keep the 0.0 default

        # C. Extreme-punctuation count (findall returns [] -> 0 when absent).
        extreme.append(len(punct_pattern.findall(text)))

    df['vader_compound_score'] = compound
    df['vader_neg_score'] = neg
    df['vader_pos_score'] = pos
    df['lengthening_ratio'] = lengthening
    df['extreme_punctuation'] = extreme
    print("特征提取完毕。")

    # Return only the columns downstream stacking consumes.
    return df[['id', 'text', 'label', 'vader_compound_score', 'vader_neg_score',
               'vader_pos_score', 'lengthening_ratio', 'extreme_punctuation']].copy()
# --- 4. 主执行逻辑 ---
# --- 4. Main execution logic ---
if __name__ == '__main__':
    start_time = time.time()
    print("--- 方案二:特征增强 (VADER 最终版) ---")

    # 1. Initialize the VADER analyzer; abort with a non-zero status on failure.
    try:
        VADER_ANALYZER = initialize_vader_analyzer()
    except Exception:
        print("脚本因 VADER 初始化失败而终止。请确保已运行 'python -m nltk.downloader -d . vader_lexicon'")
        # raise SystemExit instead of exit(): exit() is a site-module helper
        # meant for the REPL and is absent under `python -S` / frozen builds;
        # a non-zero code also signals failure to calling shells.
        raise SystemExit(1)

    # 2+3. Load each split, map string labels to ints, extract features, save.
    # A single loop replaces the previous copy-pasted train/val code paths.
    for in_path, out_path, split_name in (
        (TRAIN_FILE_PATH, OUTPUT_TRAIN_FILE, "训练集"),
        (VALID_FILE_PATH, OUTPUT_EVAL_FILE, "验证集"),
    ):
        split_df = pd.read_csv(in_path)
        # "real" -> 0, "fake" -> 1; any other label becomes NaN (unchanged).
        split_df['label'] = split_df['label'].map({"real": 0, "fake": 1})
        split_df = split_df.reset_index(drop=True)

        features_df = extract_all_features(split_df, VADER_ANALYZER)
        features_df.to_csv(out_path, index=False)
        print(f"增强特征后的{split_name}已保存到: {out_path}")

    print(f"\n--- 脚本 extract_features.py 运行结束。总耗时: {time.time() - start_time:.2f} 秒 ---")