jennyyu009 committed
Commit 23a8f6d · verified · 1 Parent(s): 0349031

Upload 5 files

FPB FinBERT and Roberta.py ADDED
@@ -0,0 +1,132 @@
+ import os
+ import argparse
+ from typing import List
+ 
+ import numpy as np
+ import pandas as pd
+ import torch
+ from transformers import AutoTokenizer, AutoModelForSequenceClassification
+ 
+ # ---------------- Models ----------------
+ FINBERT_ID = "ProsusAI/finbert"  # 0=negative, 1=neutral, 2=positive
+ ROBERTA_ID = "cardiffnlp/twitter-roberta-base-sentiment"  # 0=negative, 1=neutral, 2=positive
+ CLASS_NAMES = ["negative", "neutral", "positive"]
+ 
+ # ---------------- I/O helpers ----------------
+ def read_fpb_txt(file_path: str) -> pd.DataFrame:
+     """Reads FPB *.txt file with lines like 'sentence@label' -> columns: text, label (optional)."""
+     rows = []
+     with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
+         for line in f:
+             line = line.rstrip("\n")
+             if not line:
+                 continue
+             if "@" in line:
+                 sentence, label = line.split("@", 1)
+                 rows.append({"text": sentence.strip(), "label": label.strip()})
+             else:
+                 rows.append({"text": line.strip(), "label": ""})
+     df = pd.DataFrame(rows)
+     df["text"] = df["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
+     return df
+ 
+ def read_input(file_path: str) -> pd.DataFrame:
+     """Reads either FPB .txt or CSV file."""
+     ext = os.path.splitext(file_path)[1].lower()
+     if ext == ".txt":
+         return read_fpb_txt(file_path)
+     elif ext == ".csv":
+         df = pd.read_csv(file_path)
+         if "text" not in df.columns:
+             raise ValueError("CSV must contain a 'text' column.")
+         df["text"] = df["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
+         return df
+     else:
+         raise ValueError(f"Unsupported file type: {ext}. Use .txt or .csv")
+ 
+ # ---------------- Model inference ----------------
+ def load_model(model_id: str):
+     tok = AutoTokenizer.from_pretrained(model_id)
+     mdl = AutoModelForSequenceClassification.from_pretrained(model_id)
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+     mdl.to(device).eval()
+     return tok, mdl, device
+ 
+ @torch.no_grad()
+ def predict_probs(texts: List[str], tokenizer, model, device, batch_size=32, max_length=128) -> np.ndarray:
+     """Returns an (N,3) array of probabilities [p_neg, p_neu, p_pos]."""
+     out = []
+     for i in range(0, len(texts), batch_size):
+         batch = texts[i:i+batch_size]
+         enc = tokenizer(batch, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
+         enc = {k: v.to(device) for k, v in enc.items()}
+         logits = model(**enc).logits
+         probs = torch.softmax(logits, dim=-1).cpu().numpy()
+         out.append(probs)
+     if not out:
+         return np.zeros((0, 3), dtype=float)
+     probs = np.vstack(out)
+ 
+     # Safety: ensure models align to [neg, neu, pos]
+     # ProsusAI/finbert and cardiffnlp/twitter-roberta-base-sentiment both map 0,1,2 to neg,neu,pos.
+     # If you swap models in future, adjust mapping here.
+     return probs
+ 
+ def add_prob_columns(df: pd.DataFrame, probs: np.ndarray, prefix: str) -> pd.DataFrame:
+     """Adds p_neg/p_neu/p_pos, {prefix}_label, {prefix}_score columns for one model."""
+     result = df.copy()
+     result[f"{prefix}_p_neg"] = probs[:, 0]
+     result[f"{prefix}_p_neu"] = probs[:, 1]
+     result[f"{prefix}_p_pos"] = probs[:, 2]
+     top = probs.argmax(axis=1)
+     result[f"{prefix}_label"] = [CLASS_NAMES[i] for i in top]
+     result[f"{prefix}_score"] = probs[np.arange(len(probs)), top]
+     return result
+ 
+ # ---------------- Main ----------------
+ def parse_args():
+     p = argparse.ArgumentParser(description="Apply FinBERT + RoBERTa to FPB subset.")
+     p.add_argument("--input", required=True, help="Input FPB file (.txt with 'sentence@label' or .csv with 'text')")
+     p.add_argument("--dataset", required=True, help="Dataset tag for the output file (e.g., 50Agree, 66Agree, 75Agree, AllAgree)")
+     p.add_argument("--batch_size", type=int, default=32)
+     p.add_argument("--max_length", type=int, default=128)
+     p.add_argument("--out_dir", default="outputs", help="Directory to save results")
+     p.add_argument("--out_subdir", default=None, help="Optional subdirectory under out_dir (e.g. 'FinBERT and RoBERTa raw probs')")
+     return p.parse_args()
+ 
+ def main():
+     args = parse_args()
+ 
+     # Load data
+     df = read_input(args.input)
+     if "doc_id" not in df.columns:
+         df.insert(0, "doc_id", np.arange(len(df), dtype=int))
+     print(f"Loaded {len(df)} rows from {args.input}")
+ 
+     # FinBERT
+     print("Running FinBERT (ProsusAI/finbert)...")
+     fin_tok, fin_mdl, fin_dev = load_model(FINBERT_ID)
+     fin_probs = predict_probs(df["text"].tolist(), fin_tok, fin_mdl, fin_dev, args.batch_size, args.max_length)
+     df = add_prob_columns(df, fin_probs, "fin")
+ 
+     # RoBERTa
+     print("Running RoBERTa (cardiffnlp/twitter-roberta-base-sentiment)...")
+     rob_tok, rob_mdl, rob_dev = load_model(ROBERTA_ID)
+     rob_probs = predict_probs(df["text"].tolist(), rob_tok, rob_mdl, rob_dev, args.batch_size, args.max_length)
+     df = add_prob_columns(df, rob_probs, "rob")
+ 
+     # Save (optionally into a named subdirectory)
+     save_dir = args.out_dir
+     if args.out_subdir:
+         # create a subdirectory under out_dir
+         save_dir = os.path.join(args.out_dir, args.out_subdir)
+     os.makedirs(save_dir, exist_ok=True)
+     out_path = os.path.join(save_dir, f"FinSent_{args.dataset}_raw_probs.csv")
+     df.to_csv(out_path, index=False)
+     print(f"Saved to: {out_path}")
+     print("Columns:")
+     print(" fin_p_neg/neu/pos, fin_label, fin_score")
+     print(" rob_p_neg/neu/pos, rob_label, rob_score")
+ 
+ if __name__ == "__main__":
+     main()
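
Example usage (a minimal sketch; the input path Sentences_50Agree.txt is hypothetical, only --input and --dataset are required):

    python "FPB FinBERT and Roberta.py" --input Sentences_50Agree.txt --dataset 50Agree --out_subdir "FinBERT and RoBERTa raw probs"

With the default --out_dir this writes outputs/FinBERT and RoBERTa raw probs/FinSent_50Agree_raw_probs.csv, containing per-model class probabilities, argmax labels, and confidence scores.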
FPB Meta Classifier.py ADDED
@@ -0,0 +1,820 @@
+ """Meta-classifier for Financial PhraseBank (FPB) datasets.
+ 
+ The script expects the following pre-computed artifacts inside ``outputs/``
+ (or custom paths can be supplied):
+ 
+ * ``FinSent_<split>_raw_probs_prob_features.csv`` – base probabilities and
+   probability-derived features for FinBERT/RoBERTa
+ * ``FPB_MultiLLM_<split>.csv`` – expert-signal metrics (KL, L1, agreement)
+ * ``Sentences_<split>_semantics.csv`` – structured semantics flags
+ 
+ Example command (50Agree subset)::
+ 
+     python "FPB Meta Classifier.py" \\
+         --dataset 50Agree \\
+         --folds 5 \\
+         --models logreg xgboost \\
+         --artifact_prefix outputs/FinSent_50Agree_meta \\
+         --save_predictions --save_models --verbose
+ 
+ """
+ 
+ from __future__ import annotations
+ 
+ import argparse
+ import os
+ from dataclasses import dataclass
+ from typing import Dict, Iterable, List, Optional
+ 
+ import joblib
+ import numpy as np
+ import pandas as pd
+ from sklearn.compose import ColumnTransformer
+ from sklearn.linear_model import LogisticRegression
+ from sklearn.metrics import classification_report, confusion_matrix
+ from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_validate
+ from sklearn.pipeline import Pipeline
+ from sklearn.preprocessing import OneHotEncoder, StandardScaler
+ 
+ # tqdm is used for progress-bar display
+ try:
+     from tqdm import tqdm
+ except ImportError:
+     tqdm = None
+ 
+ try:
+     from xgboost import XGBClassifier
+ except ImportError:  # pragma: no cover - handled at runtime
+     XGBClassifier = None  # type: ignore
+ 
+ try:
+     import torch
+     from transformers import AutoTokenizer, AutoModelForSequenceClassification
+     from scipy.stats import entropy
+     TRANSFORMERS_AVAILABLE = True
+ except ImportError:
+     TRANSFORMERS_AVAILABLE = False
+     print("[!] transformers or torch not available. FinSentLLM feature engineering will be disabled.")
+ 
+ from sklearn.base import BaseEstimator, TransformerMixin
+ 
+ 
+ # ---------------------------------------------------------------------------
+ # Data loading
+ # ---------------------------------------------------------------------------
+ 
+ @dataclass
+ class DatasetPaths:
+     dataset: str
+     prob_features_csv: str
+     multi_llm_csv: str
+     semantics_csv: str
+ 
+ 
+ def infer_paths(dataset: str, base_dir: str = "outputs") -> DatasetPaths:
+     dtag = dataset.strip()
+     prob_csv = os.path.join(base_dir, "prob features", f"FinSent_{dtag}_raw_probs_prob_features.csv")
+     multi_csv = os.path.join(base_dir, "MultiLLM", f"FPB_MultiLLM_{dtag}.csv")
+     # Corrected semantics path - the actual files live under the "Structures Financial Semantics" subdirectory
+     sem_csv = os.path.join(base_dir, "Structures Financial Semantics", f"Sentences_{dtag}_semantics.csv")
+     return DatasetPaths(dataset=dtag, prob_features_csv=prob_csv, multi_llm_csv=multi_csv, semantics_csv=sem_csv)
+ 
+ 
+ def _merge_features(left: pd.DataFrame, right: pd.DataFrame, key: str = "doc_id") -> pd.DataFrame:
+     """Merge two DataFrames on ``doc_id`` while dropping duplicate feature columns."""
+     overlap = [c for c in right.columns if c in left.columns and c != key]
+     right_clean = right.drop(columns=overlap, errors="ignore")
+     merged = left.merge(right_clean, on=key, how="left", validate="one_to_one")
+     return merged
+ 
+ 
+ def load_feature_table(paths: DatasetPaths) -> pd.DataFrame:
+     if not os.path.exists(paths.multi_llm_csv):
+         raise FileNotFoundError(f"Missing Multi-LLM feature CSV: {paths.multi_llm_csv}")
+     base = pd.read_csv(paths.multi_llm_csv)
+ 
+     # Ensure `doc_id` present for alignment.
+     if "doc_id" not in base.columns:
+         raise KeyError("Expected 'doc_id' column in Multi-LLM CSV. Re-run stage 3 feature extraction.")
+ 
+     # Merge optional probability features if available (guards against missing engineered columns).
+     if os.path.exists(paths.prob_features_csv):
+         prob = pd.read_csv(paths.prob_features_csv)
+         if "doc_id" not in prob.columns:
+             raise KeyError("Probability features CSV must contain 'doc_id'.")
+         base = _merge_features(base, prob, key="doc_id")
+     else:
+         print(f"[!] Probability feature CSV not found ({paths.prob_features_csv}); proceeding without extra columns.")
+ 
+     # Merge structured semantics.
+     if not os.path.exists(paths.semantics_csv):
+         raise FileNotFoundError(f"Missing semantics CSV: {paths.semantics_csv}")
+     sem = pd.read_csv(paths.semantics_csv)
+     if "doc_id" not in sem.columns:
+         if "id" in sem.columns:
+             sem = sem.rename(columns={"id": "doc_id"})
+         else:
+             raise KeyError("Semantics CSV must contain 'doc_id' or 'id' column.")
+ 
+     sem = sem.drop(columns=[c for c in ["label", "sentence", "text"] if c in sem.columns], errors="ignore")
+     merged = _merge_features(base, sem, key="doc_id")
+ 
+     # Check for missing semantics flags.
+     sem_cols = [c for c in merged.columns if c.startswith("sem_")]
+     if sem_cols:
+         missing_sem = merged[sem_cols].isna().any(axis=1)
+         if missing_sem.any():
+             raise ValueError(
+                 f"{int(missing_sem.sum())} rows lack structured semantics after merging. Make sure the semantics file"
+                 " matches the dataset split."
+             )
+ 
+     return merged
+ 
+ 
+ def load_best_iterations(results_dir: str = "results") -> Dict[str, int]:
+     """Load previously computed best iterations for XGBoost models.
+ 
+     Returns:
+         Dictionary mapping dataset names to best iteration counts.
+         Returns empty dict if file not found.
+     """
+     best_iters_file = os.path.join(results_dir, "xgb_meta_best_iterations.csv")
+ 
+     if not os.path.exists(best_iters_file):
+         print(f"[!] Best iterations file not found: {best_iters_file}")
+         return {}
+ 
+     try:
+         df = pd.read_csv(best_iters_file)
+         # Create mapping from dataset name to best iteration
+         best_iters = {}
+         for _, row in df.iterrows():
+             dataset = row["meta"]  # e.g., "50Agree"
+             best_iter = int(row["best_iteration"])
+             best_iters[dataset] = best_iter
+ 
+         print(f"[✓] Loaded best iterations for {len(best_iters)} datasets:")
+         for dataset, iter_count in best_iters.items():
+             print(f" {dataset}: {iter_count} iterations")
+ 
+         return best_iters
+     except Exception as e:
+         print(f"[!] Error loading best iterations: {e}")
+         return {}
+ 
+ 
+ # ---------------------------------------------------------------------------
+ # FinSentLLM Feature Engineering Pipeline
+ # ---------------------------------------------------------------------------
+ 
+ class FinSentLLMFeatureEngineering(BaseEstimator, TransformerMixin):
+     """
+     End-to-end feature-engineering transformer that converts raw text into the
+     36 FinSentLLM features, covering FinBERT/RoBERTa inference, probability
+     engineering, MultiLLM features, and semantic features.
+     """
+ 
+     def __init__(self,
+                  finbert_model_id="ProsusAI/finbert",
+                  roberta_model_id="cardiffnlp/twitter-roberta-base-sentiment",
+                  batch_size=16,
+                  max_length=128,
+                  device=None):
+         self.finbert_model_id = finbert_model_id
+         self.roberta_model_id = roberta_model_id
+         self.batch_size = batch_size
+         self.max_length = max_length
+         self.device = device
+         self.class_names = ["negative", "neutral", "positive"]
+ 
+         # Model components are initialized in fit()
+         self.finbert_tokenizer = None
+         self.finbert_model = None
+         self.roberta_tokenizer = None
+         self.roberta_model = None
+         self._device = None
+ 
+     def _load_models(self):
+         """Load the FinBERT and RoBERTa models."""
+         if not TRANSFORMERS_AVAILABLE:
+             raise ImportError("transformers and torch are required for FinSentLLM feature engineering")
+ 
+         print("[📥] Loading FinBERT and RoBERTa models...")
+ 
+         # Select the device
+         if self.device is None:
+             self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+         else:
+             self._device = torch.device(self.device)
+ 
+         # Load FinBERT
+         self.finbert_tokenizer = AutoTokenizer.from_pretrained(self.finbert_model_id)
+         self.finbert_model = AutoModelForSequenceClassification.from_pretrained(self.finbert_model_id)
+         self.finbert_model.to(self._device).eval()
+ 
+         # Load RoBERTa
+         self.roberta_tokenizer = AutoTokenizer.from_pretrained(self.roberta_model_id)
+         self.roberta_model = AutoModelForSequenceClassification.from_pretrained(self.roberta_model_id)
+         self.roberta_model.to(self._device).eval()
+ 
+         print(f"[✅] Models loaded on {self._device}")
+ 
+     @torch.no_grad()
+     def _get_probabilities(self, texts, tokenizer, model):
+         """Get probability predictions from a model, with a tqdm progress bar."""
+         all_probs = []
+         total = len(texts)
+         batch_iter = range(0, total, self.batch_size)
+         use_tqdm = tqdm is not None and total > self.batch_size
+         iterator = tqdm(batch_iter, desc="[tqdm] Encoding & inference", unit="batch") if use_tqdm else batch_iter
+         for i in iterator:
+             batch = texts[i:i + self.batch_size]
+             # Tokenize
+             encoding = tokenizer(
+                 batch,
+                 return_tensors="pt",
+                 truncation=True,
+                 padding=True,
+                 max_length=self.max_length
+             )
+             # Move to the device
+             encoding = {k: v.to(self._device) for k, v in encoding.items()}
+             # Inference
+             logits = model(**encoding).logits
+             probs = torch.softmax(logits, dim=-1).cpu().numpy()
+             all_probs.append(probs)
+         return np.vstack(all_probs)
+ 
+     def _build_features(self, finbert_probs, roberta_probs):
+         """Build the full set of 36 features."""
+         eps = 1e-12
+         features = {}
+ 
+         # 1. Base probability features (8)
+         for i, cls in enumerate(self.class_names):
+             features[f"fin_p_{cls[:3]}"] = finbert_probs[:, i]
+             features[f"rob_p_{cls[:3]}"] = roberta_probs[:, i]
+ 
+         features["fin_score"] = finbert_probs.max(axis=1)
+         features["rob_score"] = roberta_probs.max(axis=1)
+ 
+         # 2. Label features (2)
+         features["fin_label"] = finbert_probs.argmax(axis=1)
+         features["rob_label"] = roberta_probs.argmax(axis=1)
+ 
+         # 3. Engineered probability features (12)
+         # Logits
+         for i, cls in enumerate(self.class_names):
+             features[f"fin_logit_{cls[:3]}"] = np.log((finbert_probs[:, i] + eps) / (1 - finbert_probs[:, i] + eps))
+             features[f"rob_logit_{cls[:3]}"] = np.log((roberta_probs[:, i] + eps) / (1 - roberta_probs[:, i] + eps))
+ 
+         # Max probability
+         features["fin_max_prob"] = finbert_probs.max(axis=1)
+         features["rob_max_prob"] = roberta_probs.max(axis=1)
+ 
+         # Margin (top-1 minus top-2 probability)
+         fin_sorted = np.sort(finbert_probs, axis=1)
+         rob_sorted = np.sort(roberta_probs, axis=1)
+         features["fin_margin"] = fin_sorted[:, -1] - fin_sorted[:, -2]
+         features["rob_margin"] = rob_sorted[:, -1] - rob_sorted[:, -2]
+ 
+         # Entropy
+         features["fin_entropy"] = entropy(finbert_probs.T)
+         features["rob_entropy"] = entropy(roberta_probs.T)
+ 
+         # 4. MultiLLM features (5)
+         # L1 distance and similarity
+         l1_dist = np.abs(finbert_probs - roberta_probs).sum(axis=1)
+         features["MultiLLM_L1_distance"] = l1_dist
+         features["MultiLLM_L1_similarity"] = 1 / (1 + l1_dist)
+ 
+         # KL divergence
+         features["MultiLLM_KL_F_to_R"] = entropy(finbert_probs.T, roberta_probs.T)
+         features["MultiLLM_KL_R_to_F"] = entropy(roberta_probs.T, finbert_probs.T)
+ 
+         # Agreement
+         fin_pred = finbert_probs.argmax(axis=1)
+         rob_pred = roberta_probs.argmax(axis=1)
+         features["MultiLLM_agree"] = (fin_pred == rob_pred).astype(int)
+ 
+         # 5. Structured semantic features (9) - simplified version; in real use these should be NLP-rule based
+         # Here we use probability-based heuristics
+         features["sem_compared"] = ((finbert_probs[:, 1] > 0.4) & (roberta_probs[:, 1] > 0.4)).astype(int)
+         features["sem_loss_improve"] = ((finbert_probs[:, 2] > 0.6) & (roberta_probs[:, 2] > 0.5)).astype(int)
+         features["sem_loss_worsen"] = ((finbert_probs[:, 0] > 0.6) & (roberta_probs[:, 0] > 0.5)).astype(int)
+         features["sem_profit_up"] = ((finbert_probs[:, 2] > 0.7) & (l1_dist < 0.3)).astype(int)
+         features["sem_cost_down"] = ((finbert_probs[:, 2] > 0.5) & (features["MultiLLM_agree"] == 1)).astype(int)
+         features["sem_contract_fin"] = ((finbert_probs[:, 1] > 0.8)).astype(int)
+         features["sem_uncertainty"] = ((features["fin_entropy"] > 1.0) | (features["rob_entropy"] > 1.0)).astype(int)
+         features["sem_stable_guidance"] = ((l1_dist < 0.2) & (finbert_probs[:, 1] > 0.5)).astype(int)
+         features["sem_operational"] = ((finbert_probs[:, 1] > 0.3) & (roberta_probs[:, 1] > 0.3)).astype(int)
+ 
+         return pd.DataFrame(features)
+ 
+     def fit(self, X, y=None):
+         """Fit stage - load the models."""
+         self._load_models()
+         return self
+ 
+     def transform(self, X):
+         """Transform stage - convert texts into features."""
+         if self.finbert_model is None:
+             raise RuntimeError("Models not loaded. Call fit() first.")
+ 
+         # Handle the input
+         if isinstance(X, pd.DataFrame):
+             if 'text' in X.columns:
+                 texts = X['text'].tolist()
+             elif len(X.columns) == 1:
+                 texts = X.iloc[:, 0].tolist()
+             else:
+                 raise ValueError("DataFrame must have 'text' column or single column")
+         elif isinstance(X, (list, np.ndarray)):
+             texts = list(X)
+         else:
+             raise ValueError("X must be DataFrame, list, or array")
+ 
+         print(f"[🔮] Processing {len(texts)} texts...")
+ 
+         # Get probabilities
+         finbert_probs = self._get_probabilities(texts, self.finbert_tokenizer, self.finbert_model)
+         roberta_probs = self._get_probabilities(texts, self.roberta_tokenizer, self.roberta_model)
+ 
+         # Build features
+         features_df = self._build_features(finbert_probs, roberta_probs)
+ 
+         print(f"[✅] Generated {len(features_df.columns)} features")
+         return features_df
+ 
+ 
+ # ---------------------------------------------------------------------------
+ # Modeling utilities
+ # ---------------------------------------------------------------------------
+ 
+ def build_preprocessor(numeric_cols: List[str], categorical_cols: List[str]) -> ColumnTransformer:
+     transformers = []
+     if numeric_cols:
+         transformers.append(("num", StandardScaler(), numeric_cols))
+     if categorical_cols:
+         transformers.append(("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols))
+     if not transformers:
+         raise ValueError("No feature columns selected – check your dataset.")
+     return ColumnTransformer(transformers=transformers, remainder="drop")
+ 
+ 
+ def build_pipelines(
+     numeric_cols: List[str],
+     categorical_cols: List[str],
+     num_classes: int,
+     random_state: int,
+     models_requested: Iterable[str],
+     dataset: str = "",
+     best_iterations: Dict[str, int] = None,
+     include_feature_engineering: bool = False,
+ ) -> Dict[str, Pipeline]:
+     pipelines: Dict[str, Pipeline] = {}
+ 
+     # Define the end-to-end feature names
+     end2end_categorical_features = ["fin_label", "rob_label"]
+     end2end_numeric_features = [
+         'fin_p_neg', 'fin_p_neu', 'fin_p_pos', 'fin_score',
+         'rob_p_neg', 'rob_p_neu', 'rob_p_pos', 'rob_score',
+         'fin_logit_neg', 'fin_logit_neu', 'fin_logit_pos',
+         'rob_logit_neg', 'rob_logit_neu', 'rob_logit_pos',
+         'fin_max_prob', 'rob_max_prob', 'fin_margin', 'rob_margin',
+         'fin_entropy', 'rob_entropy',
+         'MultiLLM_L1_distance', 'MultiLLM_L1_similarity',
+         'MultiLLM_KL_F_to_R', 'MultiLLM_KL_R_to_F', 'MultiLLM_agree',
+         'sem_compared', 'sem_loss_improve', 'sem_loss_worsen',
+         'sem_profit_up', 'sem_cost_down', 'sem_contract_fin',
+         'sem_uncertainty', 'sem_stable_guidance', 'sem_operational'
+     ]
+ 
+     if "logreg" in models_requested:
+         logreg = LogisticRegression(max_iter=1000, solver="lbfgs")
+         if include_feature_engineering:
+             preprocessor = build_preprocessor(end2end_numeric_features, end2end_categorical_features)
+             pipelines["logreg"] = Pipeline([
+                 ("feature_engineering", FinSentLLMFeatureEngineering()),
+                 ("preprocess", preprocessor),
+                 ("clf", logreg),
+             ])
+         else:
+             preprocessor = build_preprocessor(numeric_cols, categorical_cols)
+             pipelines["logreg"] = Pipeline([
+                 ("preprocess", preprocessor),
+                 ("clf", logreg),
+             ])
+ 
+     if "xgboost" in models_requested:
+         if XGBClassifier is None:
+             raise ImportError(
+                 "xgboost is not installed. Install it with 'pip install xgboost' or remove 'xgboost' from --models."
+             )
+         # Use the pre-computed best number of boosting rounds, or fall back to a default.
+         if best_iterations and dataset in best_iterations:
+             n_estimators = best_iterations[dataset]
+             print(f"[✓] Using pre-computed best iterations for {dataset}: {n_estimators}")
+         else:
+             n_estimators = 1000  # default
+             print(f"[!] No pre-computed iterations found for {dataset}, using default: {n_estimators}")
+         xgb = XGBClassifier(
+             objective="multi:softprob",
+             num_class=num_classes,
+             learning_rate=0.05,
+             max_depth=6,
+             subsample=0.8,
+             colsample_bytree=0.8,
+             n_estimators=n_estimators,
+             min_child_weight=2,
+             reg_lambda=1.0,
+             reg_alpha=0.0,
+             tree_method="hist",
+             eval_metric="mlogloss",
+             random_state=random_state,
+             n_jobs=0,
+             verbosity=0,
+         )
+         if include_feature_engineering:
+             feature_preprocessor = build_preprocessor(end2end_numeric_features, end2end_categorical_features)
+             pipelines["xgboost"] = Pipeline([
+                 ("feature_engineering", FinSentLLMFeatureEngineering()),
+                 ("preprocess", feature_preprocessor),
+                 ("clf", xgb),
+             ])
+             print("[🤖] Created end-to-end XGBoost pipeline with feature engineering")
+         else:
+             preprocessor = build_preprocessor(numeric_cols, categorical_cols)
+             pipelines["xgboost"] = Pipeline([
+                 ("preprocess", preprocessor),
+                 ("clf", xgb),
+             ])
+ 
+     return pipelines
+ 
+ 
+ def evaluate_model(
+     name: str,
+     pipeline: Pipeline,
+     X: pd.DataFrame,
+     y_train: pd.Series,
+     y_eval: pd.Series,
+     cv: StratifiedKFold,
+     class_labels: List[str],
+     label_decoder: Optional[Dict[int, str]] = None,
+ ) -> Dict[str, object]:
+     scoring = {"accuracy": "accuracy", "macro_f1": "f1_macro"}
+ 
+     scores = cross_validate(
+         pipeline,
+         X,
+         y_train,
+         scoring=scoring,
+         cv=cv,
+         n_jobs=None,
+         return_estimator=False,
+     )
+ 
+     preds = cross_val_predict(pipeline, X, y_train, cv=cv, method="predict")
+     probas = cross_val_predict(pipeline, X, y_train, cv=cv, method="predict_proba")
+ 
+     # Train the final model directly (the pre-computed best iteration count is already set).
+     fitted = pipeline.fit(X, y_train)
+ 
+     clf_raw_classes = list(fitted.named_steps["clf"].classes_)
+ 
+     if label_decoder:
+         preds_decoded = np.array([label_decoder[int(p)] for p in preds])
+         proba_labels = [label_decoder[int(c)] for c in clf_raw_classes]
+     else:
+         preds_decoded = preds
+         proba_labels = [str(c) for c in clf_raw_classes]
+ 
+     if proba_labels != class_labels:
+         reorder_idx = [proba_labels.index(lbl) for lbl in class_labels]
+         probas = probas[:, reorder_idx]
+         proba_labels = class_labels
+ 
+     y_eval_array = y_eval.to_numpy()
+ 
+     report = classification_report(y_eval_array, preds_decoded, labels=class_labels, digits=4)
+     cm = confusion_matrix(y_eval_array, preds_decoded, labels=class_labels)
+ 
+     metrics = {
+         "name": name,
+         "accuracy_mean": float(np.mean(scores["test_accuracy"])),
+         "accuracy_std": float(np.std(scores["test_accuracy"])),
+         "macro_f1_mean": float(np.mean(scores["test_macro_f1"])),
+         "macro_f1_std": float(np.std(scores["test_macro_f1"])),
+         "classification_report": report,
+         "confusion_matrix": cm,
+         "classes": class_labels,
+         "preds": preds_decoded,
+         "probas": probas,
+         "final_model": fitted,
+     }
+ 
+     # Record best_iteration information for XGBoost
+     if name == "xgboost":
+         if hasattr(fitted.named_steps["clf"], "best_iteration"):
+             metrics["best_iteration"] = fitted.named_steps["clf"].best_iteration
+         elif hasattr(fitted.named_steps["clf"], "n_estimators"):
+             metrics["best_iteration"] = fitted.named_steps["clf"].n_estimators
+         metrics["best_ntree_limit"] = metrics.get("best_iteration", 0) + 1
+ 
+     return metrics
+ 
+ 
+ def print_metrics(metrics: Dict[str, object], verbose: bool = False) -> None:
+     name = metrics["name"]
+     print(f"\n=== {name.upper()} meta-classifier ===")
+     print(
+         f"Accuracy: {metrics['accuracy_mean']*100:.2f}% ± {metrics['accuracy_std']*100:.2f}%\n"
+         f"Macro-F1: {metrics['macro_f1_mean']*100:.2f}% ± {metrics['macro_f1_std']*100:.2f}%"
+     )
+     if verbose:
+         print("\nClassification report:\n", metrics["classification_report"], sep="")
+         print("Confusion matrix (rows=true, cols=pred):")
+         classes = metrics["classes"]
+         header = " " + " ".join(f"{c[:7]:>7}" for c in classes)
+         print(header)
+         for c, row in zip(classes, metrics["confusion_matrix"]):
+             row_fmt = " ".join(f"{int(v):>7}" for v in row)
+             print(f"{c[:7]:>4} {row_fmt}")
+ 
+ 
+ def save_predictions(base: pd.DataFrame, metrics: Dict[str, object], path: str) -> None:
+     pred_df = base[["doc_id"]].copy()
+     pred_df["true_label"] = base["label"]
+     pred_df["meta_pred"] = metrics["preds"]
+     for idx, cls in enumerate(metrics["classes"]):
+         pred_df[f"meta_proba_{cls}"] = metrics["probas"][:, idx]
+     pred_df.to_csv(path, index=False)
+     print(f"Saved predictions: {path}")
+ 
+ 
+ # ---------------------------------------------------------------------------
+ # CLI
+ # ---------------------------------------------------------------------------
+ 
+ def parse_args() -> argparse.Namespace:
+     parser = argparse.ArgumentParser(description="Train FinSentLLM meta-classifiers (LogReg/XGBoost).")
+     parser.add_argument("--dataset", required=True, help="Dataset tag, e.g. 50Agree | 66Agree | 75Agree | AllAgree")
+     parser.add_argument("--prob_features_csv", help="Override path to probability feature CSV")
+     parser.add_argument("--multi_llm_csv", help="Override path to Multi-LLM feature CSV")
+     parser.add_argument("--semantics_csv", help="Override path to structured semantics CSV")
+     parser.add_argument("--folds", type=int, default=5, help="Number of stratified CV folds (default: 5)")
+     parser.add_argument("--seed", type=int, default=7, help="Random seed for CV shuffling (default: 7)")
+     parser.add_argument(
+         "--models",
+         nargs="+",
+         default=["logreg", "xgboost"],
+         choices=["logreg", "xgboost"],
+         help="Which meta-models to evaluate (default: both)",
+     )
+     parser.add_argument("--artifact_prefix", help="If set, saves artifacts using this filepath prefix")
+     parser.add_argument("--out_dir", default="outputs", help="Base output directory")
+     parser.add_argument("--meta_xgb_dir", default="Meta-Classifier_XG_boost_es_optimized", help="Subdir for xgboost artifacts")
+     parser.add_argument("--meta_logreg_dir", default="Meta-Classifier-log_regression", help="Subdir for logreg artifacts")
+     parser.add_argument("--save_predictions", action="store_true", help="Write out-of-fold predictions per model")
+     parser.add_argument("--save_models", action="store_true", help="Persist fitted pipelines per model")
+     parser.add_argument("--verbose", action="store_true", help="Print full reports and confusion matrices")
+     # By default, skip end-to-end mode and use the precomputed features directly, which is fast.
+     parser.add_argument("--end_to_end", action="store_true", default=False, help="[slow] Re-generate features with the large models (not recommended unless you need full end-to-end inference)")
+     return parser.parse_args()
+ 
+ 
+ def main() -> None:
+ 
+     args = parse_args()
+ 
+     # Load the pre-computed best iteration counts
+     best_iterations = load_best_iterations()
+ 
+     # Warn if the user passed --end_to_end together with feature file paths
+     if args.end_to_end and (args.prob_features_csv or args.multi_llm_csv or args.semantics_csv):
+         print("[Warning] In --end_to_end mode all precomputed feature files are ignored and everything is re-inferred, which is extremely slow!")
+ 
+     if args.end_to_end:
+         print("[🤖] Creating end-to-end pipelines with feature engineering... (extremely slow; only for full-pipeline inference)")
+         if not TRANSFORMERS_AVAILABLE:
+             raise ImportError("transformers and torch are required for end-to-end feature engineering. Install with: pip install transformers torch")
+ 
+         # For the end-to-end pipeline we need the raw text data
+         paths = infer_paths(args.dataset)
+         data = load_feature_table(paths)
+ 
+         # Check whether a text column is present
+         text_col = None
+         for col in ['text', 'sentence', 'content']:
+             if col in data.columns:
+                 text_col = col
+                 break
+ 
+         if text_col is None:
+             raise ValueError("End-to-end mode requires text data, but no text column found in dataset")
+ 
+         X_text = data[[text_col]]  # raw text
+         target_col = "label"
+         if target_col not in data.columns:
+             raise KeyError("Target column 'label' not found after merging.")
+ 
+         y = data[target_col].astype(str)
+ 
+         default_order = ["negative", "neutral", "positive"]
+         observed = list(pd.unique(y))
+         class_labels = [lbl for lbl in default_order if lbl in observed]
+         class_labels += [lbl for lbl in observed if lbl not in class_labels]
+ 
+         label_to_int = {lbl: idx for idx, lbl in enumerate(class_labels)}
+         int_to_label = {idx: lbl for lbl, idx in label_to_int.items()}
+         y_encoded = y.map(label_to_int).astype(int)
+ 
+         pipelines = build_pipelines(
+             numeric_cols=[],
+             categorical_cols=[],
+             num_classes=len(class_labels),
+             random_state=args.seed,
+             models_requested=args.models,
+             dataset=args.dataset,
+             best_iterations=best_iterations,
+             include_feature_engineering=True,
+         )
+         X = X_text
+     else:
+         # Recommended default: use the precomputed features directly, which is fast
+         paths = infer_paths(args.dataset)
+         if args.prob_features_csv:
+             paths.prob_features_csv = args.prob_features_csv
+         if args.multi_llm_csv:
+             paths.multi_llm_csv = args.multi_llm_csv
+         if args.semantics_csv:
+             paths.semantics_csv = args.semantics_csv
+ 
+         data = load_feature_table(paths)
+ 
+         target_col = "label"
+         if target_col not in data.columns:
+             raise KeyError("Target column 'label' not found after merging.")
+ 
+         categorical_cols = [c for c in ["fin_label", "rob_label"] if c in data.columns]
+         numeric_cols = [
+             c for c in data.select_dtypes(include=[np.number]).columns
+             if c not in {"doc_id"}
+         ]
+ 
+         X = data[numeric_cols + categorical_cols]
+         y = data[target_col].astype(str)
+ 
+         default_order = ["negative", "neutral", "positive"]
+         observed = list(pd.unique(y))
+         class_labels = [lbl for lbl in default_order if lbl in observed]
+         class_labels += [lbl for lbl in observed if lbl not in class_labels]
+ 
+         label_to_int = {lbl: idx for idx, lbl in enumerate(class_labels)}
+         int_to_label = {idx: lbl for lbl, idx in label_to_int.items()}
+         y_encoded = y.map(label_to_int).astype(int)
+ 
+         pipelines = build_pipelines(
+             numeric_cols=numeric_cols,
+             categorical_cols=categorical_cols,
+             num_classes=len(class_labels),
+             random_state=args.seed,
+             models_requested=args.models,
+             dataset=args.dataset,
+             best_iterations=best_iterations,
+             include_feature_engineering=False,
+         )
+ 
+     cv = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.seed)
+ 
+     results = {}
+     for name, pipeline in pipelines.items():
+         if name == "xgboost":
+             metrics = evaluate_model(
+                 name,
+                 pipeline,
+                 X,
+                 y_encoded,
+                 y,
+                 cv=cv,
+                 class_labels=class_labels,
+                 label_decoder=int_to_label,
+             )
+         else:
+             metrics = evaluate_model(
+                 name,
+                 pipeline,
+                 X,
+                 y,
+                 y,
+                 cv=cv,
+                 class_labels=class_labels,
+             )
+         print_metrics(metrics, verbose=args.verbose)
+         results[name] = metrics
+ 
+         if args.artifact_prefix and args.save_predictions:
+             pred_path = f"{args.artifact_prefix}_{name}_predictions.csv"
+             save_predictions(data, metrics, pred_path)
+ 
+         if args.artifact_prefix and args.save_models:
+             model_path = f"{args.artifact_prefix}_{name}_model.joblib"
+             # Save the full model dictionary for XGBoost
+             if name == "xgboost":
+                 model_dict = {
+                     "pipeline": metrics["final_model"],
+                     "feature_columns": list(X.columns),
+                     "label_map": label_to_int,
+                     "labels": class_labels,
+                     "best_iteration": metrics.get("best_iteration", 0),
+                     "best_ntree_limit": metrics.get("best_ntree_limit", 1),
+                 }
+                 joblib.dump(model_dict, model_path)
+             else:
+                 joblib.dump(metrics["final_model"], model_path)
+             print(f"Saved model: {model_path}")
+ 
+         # If no artifact_prefix is provided but the user asked to save, route to the default meta subfolders
+         if not args.artifact_prefix and args.save_predictions:
+             if name == "xgboost":
+                 save_dir = os.path.join(args.out_dir, args.meta_xgb_dir)
+             else:
+                 save_dir = os.path.join(args.out_dir, args.meta_logreg_dir)
+             os.makedirs(save_dir, exist_ok=True)
+             pred_path = os.path.join(save_dir, f"FinSent_{args.dataset}_meta_{name}_predictions.csv")
+             save_predictions(data, metrics, pred_path)
+ 
+         if not args.artifact_prefix and args.save_models:
+             if name == "xgboost":
+                 save_dir = os.path.join(args.out_dir, args.meta_xgb_dir)
+             else:
+                 save_dir = os.path.join(args.out_dir, args.meta_logreg_dir)
+             os.makedirs(save_dir, exist_ok=True)
+             model_path = os.path.join(save_dir, f"FinSent_{args.dataset}_meta_{name}_model.joblib")
+             # Save the full model dictionary for XGBoost
+             if name == "xgboost":
+                 model_dict = {
+                     "pipeline": metrics["final_model"],
+                     "feature_columns": list(X.columns),
+                     "label_map": label_to_int,
+                     "labels": class_labels,
+                     "best_iteration": metrics.get("best_iteration", 0),
+                     "best_ntree_limit": metrics.get("best_ntree_limit", 1),
+                 }
+                 joblib.dump(model_dict, model_path)
+             else:
+                 joblib.dump(metrics["final_model"], model_path)
+             print(f"Saved model: {model_path}")
+ 
+ 
+ if __name__ == "__main__":
+     main()
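
A minimal sketch of reloading a saved XGBoost bundle (assumes --save_models was used without --artifact_prefix, so the artifact landed in the default subdirectory; the merged feature CSV name below is hypothetical):

    import joblib
    import pandas as pd

    bundle = joblib.load(
        "outputs/Meta-Classifier_XG_boost_es_optimized/FinSent_50Agree_meta_xgboost_model.joblib"
    )
    pipe = bundle["pipeline"]           # fitted Pipeline: preprocess + XGBClassifier
    cols = bundle["feature_columns"]    # column order the pipeline was trained on

    # Any DataFrame containing all of `cols` works here; the merged table produced
    # by load_feature_table() for the same split is the natural choice (assumption).
    feats = pd.read_csv("FinSent_50Agree_features.csv")  # hypothetical merged CSV
    probs = pipe.predict_proba(feats[cols])
    labels = [bundle["labels"][int(i)] for i in probs.argmax(axis=1)]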
FPB Multi-LLM .py ADDED
@@ -0,0 +1,70 @@
+ # FPB Multi-LLM .py
+ # Compute expert-signal features from multiple LLMs (FinBERT, RoBERTa)
+ import argparse
+ import os
+ import numpy as np
+ import pandas as pd
+ from scipy.special import rel_entr
+ 
+ EPS = 1e-12
+ 
+ def row_normalize(a: np.ndarray) -> np.ndarray:
+     s = a.sum(axis=1, keepdims=True)
+     s[s == 0.0] = 1.0
+     return a / s
+ 
+ def kl_divergence(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
+     P = np.clip(P, EPS, 1.0)
+     Q = np.clip(Q, EPS, 1.0)
+     return np.sum(rel_entr(P, Q), axis=1)  # Σ p * log(p/q)
+ 
+ def l1_distance(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
+     return 0.5 * np.sum(np.abs(P - Q), axis=1)  # ∈ [0,1]
+ 
+ def load_probs(df: pd.DataFrame, cols: list, label: str) -> np.ndarray:
+     missing = [c for c in cols if c not in df.columns]
+     if missing:
+         raise ValueError(f"[{label}] Missing columns: {missing}")
+     arr = df[cols].to_numpy(dtype=float)
+     return row_normalize(arr)
+ 
+ def main():
+     ap = argparse.ArgumentParser(description="Compute Multi-LLM expert signals (Stage 3 only).")
+     ap.add_argument("--input", required=True, help="CSV with per-model probabilities")
+     ap.add_argument("--dataset", required=True, help="Dataset tag, e.g., 50Agree | AllAgree")
+     ap.add_argument("--out_dir", default=".", help="Output directory")
+     ap.add_argument("--out_subdir", default="MultiLLM", help="Subdirectory under out_dir to save multi-llm features")
+     # Column names: default to common names; override if your headers differ
+     ap.add_argument("--fin_cols", nargs=3, default=["fin_p_neg","fin_p_neu","fin_p_pos"],
+                     help="FinBERT prob columns [neg neu pos]")
+     ap.add_argument("--rob_cols", nargs=3, default=["rob_p_neg","rob_p_neu","rob_p_pos"],
+                     help="RoBERTa prob columns [neg neu pos]")
+     args = ap.parse_args()
+ 
+     df = pd.read_csv(args.input)
+ 
+     # Load & normalize probabilities
+     P = load_probs(df, args.fin_cols, "FinBERT")
+     Q = load_probs(df, args.rob_cols, "RoBERTa")
+ 
+     # Expert-signal features (paper)
+     df["MultiLLM_L1_distance"] = l1_distance(P, Q)
+     df["MultiLLM_L1_similarity"] = 1.0 - df["MultiLLM_L1_distance"]
+     df["MultiLLM_KL_F_to_R"] = kl_divergence(P, Q)
+     df["MultiLLM_KL_R_to_F"] = kl_divergence(Q, P)
+ 
+     # Optional: simple agreement flag (same argmax class)
+     df["MultiLLM_agree"] = (np.argmax(P, axis=1) == np.argmax(Q, axis=1)).astype(int)
+ 
+     save_dir = args.out_dir
+     if args.out_subdir:
+         save_dir = os.path.join(args.out_dir, args.out_subdir)
+     os.makedirs(save_dir, exist_ok=True)
+     out_path = os.path.join(save_dir, f"FPB_MultiLLM_{args.dataset}.csv")
+     df.to_csv(out_path, index=False)
+     print(f"✅ Saved expert-signal features to: {out_path}")
+     print(" Added columns: MultiLLM_L1_distance, MultiLLM_L1_similarity, "
+           "MultiLLM_KL_F_to_R, MultiLLM_KL_R_to_F, MultiLLM_agree")
+ 
+ if __name__ == "__main__":
+     main()
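
Example usage (a minimal sketch; the paths assume the stage-1 defaults above):

    python "FPB Multi-LLM .py" --input "outputs/FinBERT and RoBERTa raw probs/FinSent_50Agree_raw_probs.csv" --dataset 50Agree --out_dir outputs

This appends the five MultiLLM_* columns and saves outputs/MultiLLM/FPB_MultiLLM_50Agree.csv, the path the meta-classifier's infer_paths() expects. For intuition: with P = [0.7, 0.2, 0.1] and Q = [0.5, 0.3, 0.2], l1_distance = 0.5·(0.2 + 0.1 + 0.1) = 0.2 and kl_divergence(P, Q) = Σ p·log(p/q) ≈ 0.085; note KL is asymmetric, hence the two F_to_R and R_to_F columns.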
FPB Prob Features.py ADDED
@@ -0,0 +1,84 @@
+ #!/usr/bin/env python3
+ import argparse
+ import os
+ import numpy as np
+ import pandas as pd
+ 
+ EPS = 1e-12
+ 
+ # ---------- Helper functions ----------
+ def logit(p: np.ndarray) -> np.ndarray:
+     """Compute logit(p) = log(p / (1 - p))."""
+     p = np.clip(p, EPS, 1.0 - EPS)
+     return np.log(p / (1.0 - p))
+ 
+ def entropy(p: np.ndarray) -> np.ndarray:
+     """Shannon entropy: H(p) = -Σ p log p."""
+     p = np.clip(p, EPS, 1.0)
+     return -np.sum(p * np.log(p), axis=1)
+ 
+ def top2_margin(p: np.ndarray) -> np.ndarray:
+     """Margin = top1(p) - top2(p)."""
+     s = np.sort(p, axis=1)
+     return s[:, -1] - s[:, -2]
+ 
+ # ---------- Main ----------
+ def main():
+     ap = argparse.ArgumentParser(description="Compute probability-derived features (logit, max prob, margin, entropy).")
+     ap.add_argument("--input", required=True,
+                     help="Path to FinSent_*_raw_probs.csv from FinBERT/RoBERTa step.")
+     ap.add_argument("--out_file", default=None,
+                     help="Output CSV (default: adds _prob_features to filename).")
+     ap.add_argument("--out_dir", default="outputs", help="Base output directory")
+     ap.add_argument("--out_subdir", default="prob features", help="Subdirectory under out_dir to save prob features")
+     args = ap.parse_args()
+ 
+     df = pd.read_csv(args.input)
+ 
+     # Check probability columns
+     req = [
+         "fin_p_neg","fin_p_neu","fin_p_pos",
+         "rob_p_neg","rob_p_neu","rob_p_pos"
+     ]
+     missing = [c for c in req if c not in df.columns]
+     if missing:
+         raise ValueError(f"Missing columns: {missing}")
+ 
+     # Prepare arrays
+     p_fin = df[["fin_p_neg","fin_p_neu","fin_p_pos"]].to_numpy(dtype=float)
+     p_rob = df[["rob_p_neg","rob_p_neu","rob_p_pos"]].to_numpy(dtype=float)
+ 
+     # ---- FinBERT features ----
+     fin_logit = logit(p_fin)
+     for i, cls in enumerate(["neg","neu","pos"]):
+         df[f"fin_logit_{cls}"] = fin_logit[:, i]
+     df["fin_max_prob"] = np.max(p_fin, axis=1)
+     df["fin_margin"] = top2_margin(p_fin)
+     df["fin_entropy"] = entropy(p_fin)
+ 
+     # ---- RoBERTa features ----
+     rob_logit = logit(p_rob)
+     for i, cls in enumerate(["neg","neu","pos"]):
+         df[f"rob_logit_{cls}"] = rob_logit[:, i]
+     df["rob_max_prob"] = np.max(p_rob, axis=1)
+     df["rob_margin"] = top2_margin(p_rob)
+     df["rob_entropy"] = entropy(p_rob)
+ 
+     # Save
+     root, ext = os.path.splitext(args.input)
+     if args.out_file:
+         out_path = args.out_file
+     else:
+         # save into outputs/<out_subdir>/ by default
+         save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
+         os.makedirs(save_dir, exist_ok=True)
+         base = os.path.basename(root)
+         out_path = os.path.join(save_dir, f"{base}_prob_features.csv")
+     df.to_csv(out_path, index=False)
+     print(f"[✓] Saved probability-derived features to: {out_path}")
+     print("Added columns: fin/rob logits, max_prob, margin, entropy")
+ 
+ if __name__ == "__main__":
+     main()
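
Example usage (a minimal sketch; the input path assumes the stage-1 defaults):

    python "FPB Prob Features.py" --input "outputs/FinBERT and RoBERTa raw probs/FinSent_50Agree_raw_probs.csv"

This writes outputs/prob features/FinSent_50Agree_raw_probs_prob_features.csv, matching what infer_paths() in the meta-classifier expects. For intuition on the helpers: with p = [0.7, 0.2, 0.1], logit(0.7) = log(0.7/0.3) ≈ 0.847, margin = 0.7 − 0.2 = 0.5, and entropy ≈ 0.802 nats.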
FPB_Structured_Financial_Semantics.py ADDED
@@ -0,0 +1,120 @@
+ import re
+ from typing import Dict, List
+ 
+ def _compile(patterns: List[str], flags=re.IGNORECASE):
+     return [re.compile(p, flags=flags) for p in patterns]
+ 
+ def _any_match(text: str, regs) -> bool:
+     return any(r.search(text) for r in regs)
+ 
+ # Operators per FinSentLLM Table 1
+ _COMPARATIVE = _compile([
+     r"\bcompared\s+to\b",
+     r"\bcompared\s+with\b",
+     r"\bversus\b",
+     r"\bvs\.?\b",
+     r"\bfrom\s+[-+]?\d+(?:\.\d+)?\s*(?:%|percent|percentage|[A-Za-z]+)?\s+to\s+[-+]?\d+(?:\.\d+)?\s*(?:%|percent|percentage|[A-Za-z]+)?\b",
+     r"\bfrom\s+[A-Za-z0-9\.,%-]+\s+to\s+[A-Za-z0-9\.,%-]+\b",
+ ])
+ 
+ _LOSS_IMPROVE = _compile([
+     r"\bloss(?:es)?\s+(?:narrowed|shr[aou]nk|decreased|fell|reduced)\b",
+     r"\bturn(?:ed)?\s+to\s+(?:profit|black)\b",
+ ])
+ _LOSS_WORSEN = _compile([
+     r"\bloss(?:es)?\s+(?:widened|grew|increased|rose|deepened)\b",
+     r"\bturn(?:ed)?\s+to\s+(?:loss|red)\b",
+ ])
+ 
+ _PROFIT_UP = _compile([
+     r"\b(profit|profits|net\s+income|earnings|ebit|ebitda|eps|roe|roi|return(?:s)?(?:\s+on\s+equity)?)\b.*\b(rose|grew|increased|up|higher|improved|jumped|surged|soared)\b",
+     r"\b(rose|grew|increased|up|higher|improved|jumped|surged|soared)\b.*\b(profit|profits|net\s+income|earnings|ebit|ebitda|eps|roe|roi|return(?:s)?(?:\s+on\s+equity)?)\b",
+ ])
+ 
+ _COST_DOWN = _compile([
+     r"\b(cost|costs|expenses|opex|operating\s+expense(?:s)?)\b.*\b(fell|declined|decreased|lower|reduced|down)\b",
+     r"\b(fell|declined|decreased|lower|reduced|down)\b.*\b(cost|costs|expenses|opex|operating\s+expense(?:s)?)\b",
+ ])
+ 
+ _CONTRACT_FIN = _compile([
+     r"\b(agreement|deal|contract|order|purchase\s+order|framework\s+agreement)\b",
+     r"\b(bond|notes?|debenture|convertible|placement|issuance|issue|offering|ipo|follow-?on)\b",
+     r"\b(loan|credit\s+facility|credit\s+line|revolver|revolving\s+credit|financing)\b",
+ ])
+ 
+ _UNCERTAIN = _compile([
+     r"\b(uncertain|uncertainty|cannot\s+be\s+determined|not\s+clear|unknown|unpredictable)\b",
+     r"\b(impairment|write-?down|one-?off|exceptional\s+(?:item|charge)|non-?recurring)\b",
+     r"\b(outlook\s+(?:uncertain|cloudy|cautious))\b",
+ ])
+ 
+ _STABLE_GUIDE = _compile([
+     r"\b(expects?|expected|expects\s+to|guidance|forecast|outlook)\b.*\b(remain(?:s|ed|ing)?\s+(?:stable|unchanged)|in[-\s]?line)\b",
+     r"\b(reiterated|maintained)\s+(?:its\s+)?(guidance|forecast|outlook)\b",
+ ])
+ 
+ _OPERATIONAL = _compile([
+     r"\b(restructuring|reorganization|spin-?off|divest(?:iture)?|asset\s+sale)\b",
+     r"\b(ban|suspension|halted|blocked|prohibited)\b",
+     r"\b(recall|probe|investigation|lawsuit|litigation|settlement)\b",
+     r"\b(layoffs?|headcount\s+reduction|cut\s+jobs|hiring\s+freeze)\b",
+ ])
+ 
+ def extract_semantic_flags(text: str) -> Dict[str, int]:
+     t = text.strip().lower()
+ 
+     flags = {
+         "sem_compared":        int(_any_match(t, _COMPARATIVE)),
+         "sem_loss_improve":    int(_any_match(t, _LOSS_IMPROVE)),
+         "sem_loss_worsen":     int(_any_match(t, _LOSS_WORSEN)),
+         "sem_profit_up":       int(_any_match(t, _PROFIT_UP)),
+         "sem_cost_down":       int(_any_match(t, _COST_DOWN)),
+         "sem_contract_fin":    int(_any_match(t, _CONTRACT_FIN)),
+         "sem_uncertainty":     int(_any_match(t, _UNCERTAIN)),
+         "sem_stable_guidance": int(_any_match(t, _STABLE_GUIDE)),
+         "sem_operational":     int(_any_match(t, _OPERATIONAL)),
+     }
+     return flags
+ 
+ # ============================================================
+ # Run directly from terminal
+ # ============================================================
+ if __name__ == "__main__":
+     import argparse, pandas as pd
+     from pathlib import Path
+ 
+     parser = argparse.ArgumentParser(description="Extract Structured Financial Semantics from FPB text file.")
+     parser.add_argument("--input", required=True, help="Path to Sentences_*.txt or a CSV with text column.")
+     parser.add_argument("--out", required=True, help="Output CSV path.")
+     parser.add_argument("--text_col", default="sentence", help="Column name if input is CSV.")
+     args = parser.parse_args()
+ 
+     def parse_fpb_line(line):
+         if "@positive" in line:
+             return line.rsplit("@positive", 1)[0].strip(), "positive"
+         elif "@negative" in line:
+             return line.rsplit("@negative", 1)[0].strip(), "negative"
+         elif "@neutral" in line:
+             return line.rsplit("@neutral", 1)[0].strip(), "neutral"
+         else:
+             return line.strip(), ""
+ 
+     path = Path(args.input)
+     rows = []
+     if path.suffix.lower() == ".txt":
+         with open(path, "r", encoding="utf-8", errors="ignore") as f:
+             for i, line in enumerate(f):
+                 text, label = parse_fpb_line(line)
+                 if text:
+                     rows.append({"id": i, args.text_col: text, "label": label})
+         df = pd.DataFrame(rows)
+     else:
+         df = pd.read_csv(path)
+ 
+     # Apply semantic extraction
+     df_feats = df[args.text_col].astype(str).apply(extract_semantic_flags).apply(pd.Series)
+     df_out = pd.concat([df, df_feats], axis=1)
+     df_out.to_csv(args.out, index=False)
+     print(f"Saved structured semantics to: {args.out}")
+     print("Columns:", [c for c in df_out.columns if c.startswith('sem_')])