|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import os |
|
|
import re |
|
|
import time |
|
|
import nltk |
|
|
from nltk.sentiment.vader import SentimentIntensityAnalyzer |
|
|
from collections import defaultdict |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# --- Input data: CSV files expected to contain at least 'id', 'text', 'label' columns,
# with 'label' holding the strings "real" / "fake" (mapped to 0 / 1 in __main__). ---
TRAIN_FILE_PATH = "/tmp/home/wzh/file/train_data.csv"


VALID_FILE_PATH = "/tmp/home/wzh/file/val_data.csv"


# --- Output: feature-augmented copies written to the current working directory. ---
OUTPUT_TRAIN_FILE = "train_features.csv"


OUTPUT_EVAL_FILE = "val_features.csv"
|
|
|
|
|
|
|
|
|
|
|
def initialize_vader_analyzer():
    """Build and return an NLTK VADER ``SentimentIntensityAnalyzer``.

    If the vader_lexicon resource is missing (``LookupError``), fall back to a
    local ``./sentiment`` directory: append it to ``nltk.data.path`` and retry
    once. Re-raises the original ``LookupError`` when the fallback directory
    does not exist or still lacks the lexicon.

    Returns:
        SentimentIntensityAnalyzer: a ready-to-use analyzer instance.

    Raises:
        LookupError: when the VADER lexicon cannot be located anywhere.
    """
    try:
        # Happy path: lexicon already discoverable on the default NLTK paths.
        return SentimentIntensityAnalyzer()
    except LookupError:
        print("--- 警告: 正在尝试自动修复 NLTK LookupError ---")

        # Fallback: a locally downloaded lexicon under ./sentiment
        # (e.g. produced by `python -m nltk.downloader -d . vader_lexicon`).
        data_path = os.path.join(os.getcwd(), 'sentiment')

        # Guard clause: no local folder -> nothing more we can do, re-raise.
        if not os.path.isdir(data_path):
            print(f"!!! 致命错误: 找不到本地 VADER 文件夹 [{data_path}]。请确保下载成功。!!!")
            raise

        nltk.data.path.append(data_path)
        print(f"已将本地路径 [{data_path}] 添加到 NLTK 搜索路径。")

        try:
            return SentimentIntensityAnalyzer()
        except LookupError:
            # Folder exists but does not contain a usable lexicon layout.
            print("!!! 致命错误: 无法在本地路径找到 VADER 词典。请检查 'sentiment' 文件夹结构。!!!")
            raise
|
|
|
|
|
|
|
|
def extract_all_features(df, analyzer):
    """Extract hand-crafted features for every review in *df*.

    For each row's ``text``, computes:
      - VADER sentiment scores (``compound``, ``neg``, ``pos``);
      - ``lengthening_ratio``: fraction of characters belonging to runs of
        3+ identical characters (e.g. "soooo", "!!!!");
      - ``extreme_punctuation``: number of runs of 3+ '!' or 3+ '?'.

    Args:
        df: DataFrame with at least 'id', 'text', 'label' columns. The
            feature columns are also added to *df* in place.
        analyzer: object exposing ``polarity_scores(text) -> dict`` with
            'compound', 'neg', 'pos' keys (e.g. a VADER analyzer).

    Returns:
        A copy of *df* restricted to the id/text/label columns plus the
        five feature columns.
    """
    # Compile once outside the loop; both patterns are applied per row.
    lengthening_pattern = re.compile(r'(.)\1{2,}')      # runs of 3+ identical chars
    punct_pattern = re.compile(r'(!{3,})|(\?{3,})')     # runs of 3+ '!' or 3+ '?'

    texts = df['text'].astype(str).tolist()

    print(f"开始提取 {len(texts)} 条评论的 VADER 增强特征...")

    # Accumulate per-row values and assign whole columns at the end.
    # This replaces the previous per-cell `df.loc[i, col] = ...` writes,
    # which were quadratic on large frames and used the enumerate position
    # as an index *label* (wrong whenever the index is not 0..n-1).
    compound_scores = []
    neg_scores = []
    pos_scores = []
    lengthening_ratios = []
    extreme_punct_counts = []

    for text in texts:
        scores = analyzer.polarity_scores(text)
        compound_scores.append(scores['compound'])
        neg_scores.append(scores['neg'])
        pos_scores.append(scores['pos'])

        total_len = len(text)
        if total_len > 0:
            lengthened_chars = sum(len(match.group(0))
                                   for match in lengthening_pattern.finditer(text))
            lengthening_ratios.append(lengthened_chars / total_len)
        else:
            # Empty text: ratio is defined as 0.0.
            lengthening_ratios.append(0.0)

        # findall alone suffices: zero matches yields 0, matching the old
        # default, so the extra search() pre-check was redundant.
        extreme_punct_counts.append(len(punct_pattern.findall(text)))

    # Bulk, position-based column assignment (one write per column).
    df['vader_compound_score'] = compound_scores
    df['vader_neg_score'] = neg_scores
    df['vader_pos_score'] = pos_scores
    df['lengthening_ratio'] = lengthening_ratios
    df['extreme_punctuation'] = extreme_punct_counts

    print("特征提取完毕。")

    return df[['id', 'text', 'label', 'vader_compound_score', 'vader_neg_score',
               'vader_pos_score', 'lengthening_ratio', 'extreme_punctuation']].copy()
|
|
|
|
|
|
|
|
|
|
|
if __name__ == '__main__':

    start_time = time.time()

    print("--- 方案二:特征增强 (VADER 最终版) ---")

    # Initialize VADER. On failure, abort with a NONZERO exit status:
    # the previous bare exit() reported success (status 0) to the shell,
    # which would let a calling pipeline continue as if nothing failed.
    try:
        VADER_ANALYZER = initialize_vader_analyzer()
    except Exception:
        print("脚本因 VADER 初始化失败而终止。请确保已运行 'python -m nltk.downloader -d . vader_lexicon'")
        raise SystemExit(1)

    # Load the datasets and map string labels to ints (real -> 0, fake -> 1).
    # reset_index(drop=True) guarantees a clean 0..n-1 RangeIndex downstream.
    train_df = pd.read_csv(TRAIN_FILE_PATH)
    train_df['label'] = train_df['label'].map({"real": 0, "fake": 1})
    train_df = train_df.reset_index(drop=True)

    eval_df = pd.read_csv(VALID_FILE_PATH)
    eval_df['label'] = eval_df['label'].map({"real": 0, "fake": 1})
    eval_df = eval_df.reset_index(drop=True)

    # Extract features for the training split and persist.
    train_features_df = extract_all_features(train_df, VADER_ANALYZER)
    train_features_df.to_csv(OUTPUT_TRAIN_FILE, index=False)
    print(f"增强特征后的训练集已保存到: {OUTPUT_TRAIN_FILE}")

    # Same for the validation split.
    eval_features_df = extract_all_features(eval_df, VADER_ANALYZER)
    eval_features_df.to_csv(OUTPUT_EVAL_FILE, index=False)
    print(f"增强特征后的验证集已保存到: {OUTPUT_EVAL_FILE}")

    print(f"\n--- 脚本 extract_features.py 运行结束。总耗时: {time.time() - start_time:.2f} 秒 ---")