Upload 5 files
Browse files- FPB FinBERT and Roberta.py +132 -0
- FPB Meta Classifier.py +820 -0
- FPB Multi-LLM .py +70 -0
- FPB Prob Features.py +84 -0
- FPB_Structured_Financial_Semantics.py +120 -0
FPB FinBERT and Roberta.py
ADDED
|
@@ -0,0 +1,132 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import argparse
|
| 3 |
+
from typing import List
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
import torch
|
| 8 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 9 |
+
|
| 10 |
+
# ---------------- Models ----------------
|
| 11 |
+
FINBERT_ID = "ProsusAI/finbert" # 0=negative, 1=neutral, 2=positive
|
| 12 |
+
ROBERTA_ID = "cardiffnlp/twitter-roberta-base-sentiment" # 0=negative, 1=neutral, 2=positive
|
| 13 |
+
CLASS_NAMES = ["negative", "neutral", "positive"]
|
| 14 |
+
|
| 15 |
+
# ---------------- I/O helpers ----------------
|
| 16 |
+
def read_fpb_txt(file_path: str) -> pd.DataFrame:
    """Read a Financial PhraseBank ``*.txt`` file into a DataFrame.

    Each line has the form ``sentence@label``; lines without a separator
    are kept with an empty label.

    Args:
        file_path: Path to the FPB text file.

    Returns:
        DataFrame with columns ``text`` and ``label`` (``label`` may be "").
    """
    rows = []
    with open(file_path, "r", encoding="utf-8", errors="ignore") as f:
        for line in f:
            line = line.rstrip("\n")
            if not line:
                continue
            if "@" in line:
                # Split on the LAST '@': the FPB label is always the final
                # field, and sentences may themselves contain '@' (e.g.
                # e-mail addresses), which split("@", 1) would truncate.
                sentence, label = line.rsplit("@", 1)
                rows.append({"text": sentence.strip(), "label": label.strip()})
            else:
                rows.append({"text": line.strip(), "label": ""})
    df = pd.DataFrame(rows)
    if df.empty:
        # An empty input file yields a frame with no columns; accessing
        # df["text"] below would raise KeyError, so return the schema early.
        return pd.DataFrame(columns=["text", "label"])
    df["text"] = df["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
    return df
|
| 32 |
+
|
| 33 |
+
def read_input(file_path: str) -> pd.DataFrame:
    """Load either an FPB ``.txt`` file or a CSV that has a ``text`` column.

    Whitespace in ``text`` is collapsed to single spaces and trimmed.

    Raises:
        ValueError: If the extension is unsupported or a CSV lacks ``text``.
    """
    extension = os.path.splitext(file_path)[1].lower()
    if extension == ".txt":
        return read_fpb_txt(file_path)
    if extension == ".csv":
        frame = pd.read_csv(file_path)
        if "text" not in frame.columns:
            raise ValueError("CSV must contain a 'text' column.")
        frame["text"] = (
            frame["text"].astype(str).str.replace(r"\s+", " ", regex=True).str.strip()
        )
        return frame
    raise ValueError(f"Unsupported file type: {extension}. Use .txt or .csv")
|
| 46 |
+
|
| 47 |
+
# ---------------- Model inference ----------------
|
| 48 |
+
def load_model(model_id: str):
    """Fetch a tokenizer + sequence-classification model for ``model_id``.

    The model is moved to CUDA when available (CPU otherwise) and switched
    to eval mode before being returned.

    Returns:
        Tuple of (tokenizer, model, device).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    classifier = AutoModelForSequenceClassification.from_pretrained(model_id)
    target = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    classifier.to(target).eval()
    return tokenizer, classifier, target
|
| 54 |
+
|
| 55 |
+
@torch.no_grad()
def predict_probs(texts: List[str], tokenizer, model, device, batch_size=32, max_length=128) -> np.ndarray:
    """Run batched inference and return an (N, 3) array [p_neg, p_neu, p_pos]."""
    chunks = []
    total = len(texts)
    for start in range(0, total, batch_size):
        piece = texts[start:start + batch_size]
        encoded = tokenizer(piece, return_tensors="pt", truncation=True, padding=True, max_length=max_length)
        encoded = {name: tensor.to(device) for name, tensor in encoded.items()}
        scores = model(**encoded).logits
        chunks.append(torch.softmax(scores, dim=-1).cpu().numpy())
    if not chunks:
        # No input rows: keep downstream column arithmetic working on (0, 3).
        return np.zeros((0, 3), dtype=float)
    stacked = np.vstack(chunks)

    # Safety: ensure models align to [neg, neu, pos]
    # ProsusAI/finbert and cardiffnlp/twitter-roberta-base-sentiment both map 0,1,2 to neg,neu,pos.
    # If you swap models in future, adjust mapping here.
    return stacked
|
| 74 |
+
|
| 75 |
+
def add_prob_columns(
    df: pd.DataFrame,
    probs: np.ndarray,
    prefix: str,
    class_names=("negative", "neutral", "positive"),
) -> pd.DataFrame:
    """Add per-class probability, argmax label and top-score columns.

    Generalized over ``class_names`` (previously a hard-coded module global)
    so the helper works with any label ordering; the default reproduces the
    original FPB [negative, neutral, positive] behavior exactly.

    Args:
        df: Input frame; not mutated — a copy is returned.
        probs: (N, K) probability array aligned with ``class_names``.
        prefix: Column-name prefix, e.g. "fin" or "rob".
        class_names: Class labels in column order.

    Returns:
        Copy of ``df`` with ``{prefix}_p_<cls>``, ``{prefix}_label`` and
        ``{prefix}_score`` columns appended.
    """
    result = df.copy()
    for idx, name in enumerate(class_names):
        # Column suffix keeps the historical 3-letter form: p_neg/p_neu/p_pos.
        result[f"{prefix}_p_{name[:3]}"] = probs[:, idx]
    top = probs.argmax(axis=1)
    result[f"{prefix}_label"] = [class_names[i] for i in top]
    result[f"{prefix}_score"] = probs[np.arange(len(probs)), top]
    return result
|
| 85 |
+
|
| 86 |
+
# ---------------- Main ----------------
|
| 87 |
+
def parse_args():
    """Define and parse CLI arguments for the FinBERT + RoBERTa scoring run."""
    parser = argparse.ArgumentParser(description="Apply FinBERT + RoBERTa to FPB subset.")
    parser.add_argument("--input", required=True, help="Input FPB file (.txt with 'sentence@label' or .csv with 'text')")
    parser.add_argument("--dataset", required=True, help="Dataset tag for the output file (e.g., 50Agree, 66Agree, 75Agree, AllAgree)")
    parser.add_argument("--batch_size", type=int, default=32)
    parser.add_argument("--max_length", type=int, default=128)
    parser.add_argument("--out_dir", default="outputs", help="Directory to save results")
    parser.add_argument("--out_subdir", default=None, help="Optional subdirectory under out_dir (e.g. 'FinBERT and RoBERTa raw probs')")
    return parser.parse_args()
|
| 96 |
+
|
| 97 |
+
def main():
    """Score an FPB subset with FinBERT and RoBERTa; save raw probabilities."""
    args = parse_args()

    # Load data
    df = read_input(args.input)
    if "doc_id" not in df.columns:
        # Stable row ids let downstream pipeline stages merge feature tables.
        df.insert(0, "doc_id", np.arange(len(df), dtype=int))
    print(f"Loaded {len(df)} rows from {args.input}")

    # FinBERT
    print("Running FinBERT (ProsusAI/finbert)...")
    fin_tok, fin_mdl, fin_dev = load_model(FINBERT_ID)
    fin_probs = predict_probs(df["text"].tolist(), fin_tok, fin_mdl, fin_dev, args.batch_size, args.max_length)
    df = add_prob_columns(df, fin_probs, "fin")

    # RoBERTa
    print("Running RoBERTa (cardiffnlp/twitter-roberta-base-sentiment)...")
    rob_tok, rob_mdl, rob_dev = load_model(ROBERTA_ID)
    rob_probs = predict_probs(df["text"].tolist(), rob_tok, rob_mdl, rob_dev, args.batch_size, args.max_length)
    df = add_prob_columns(df, rob_probs, "rob")

    # Save (optionally into a named subdirectory)
    save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, f"FinSent_{args.dataset}_raw_probs.csv")
    df.to_csv(out_path, index=False)
    print(f"Saved to: {out_path}")
    print("Columns:")
    print(" fin_p_neg/neu/pos, fin_label, fin_score")
    print(" rob_p_neg/neu/pos, rob_label, rob_score")


if __name__ == "__main__":
    main()
|
FPB Meta Classifier.py
ADDED
|
@@ -0,0 +1,820 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""Meta-classifier for Financial PhraseBank (FPB) datasets.
|
| 2 |
+
|
| 3 |
+
The script expects the following pre-computed artifacts inside ``outputs/``
|
| 4 |
+
(or custom paths can be supplied):
|
| 5 |
+
|
| 6 |
+
* ``FinSent_<split>_raw_probs_prob_features.csv`` – base probabilities and
|
| 7 |
+
probability-derived features for FinBERT/RoBERTa
|
| 8 |
+
* ``FPB_MultiLLM_<split>.csv`` – expert-signal metrics (KL, L1, agreement)
|
| 9 |
+
* ``Sentences_<split>_semantics.csv`` – structured semantics flags
|
| 10 |
+
|
| 11 |
+
Example command (50Agree subset)::
|
| 12 |
+
|
| 13 |
+
python "FPB Meta Classifier.py" \\
|
| 14 |
+
--dataset 50Agree \\
|
| 15 |
+
--folds 5 \\
|
| 16 |
+
--models logreg xgboost \\
|
| 17 |
+
--artifact_prefix outputs/FinSent_50Agree_meta \\
|
| 18 |
+
--save_predictions --save_models --verbose
|
| 19 |
+
|
| 20 |
+
"""
|
| 21 |
+
|
| 22 |
+
from __future__ import annotations
|
| 23 |
+
|
| 24 |
+
import argparse
|
| 25 |
+
import os
|
| 26 |
+
from dataclasses import dataclass
|
| 27 |
+
from typing import Dict, Iterable, List, Optional
|
| 28 |
+
|
| 29 |
+
import joblib
|
| 30 |
+
import numpy as np
|
| 31 |
+
import pandas as pd
|
| 32 |
+
from sklearn.compose import ColumnTransformer
|
| 33 |
+
from sklearn.linear_model import LogisticRegression
|
| 34 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
| 35 |
+
from sklearn.model_selection import StratifiedKFold, cross_val_predict, cross_validate
|
| 36 |
+
from sklearn.pipeline import Pipeline
|
| 37 |
+
from sklearn.preprocessing import OneHotEncoder, StandardScaler
|
| 38 |
+
|
| 39 |
+
# tqdm用于进度条显示
|
| 40 |
+
try:
|
| 41 |
+
from tqdm import tqdm
|
| 42 |
+
except ImportError:
|
| 43 |
+
tqdm = None
|
| 44 |
+
|
| 45 |
+
try:
|
| 46 |
+
from xgboost import XGBClassifier
|
| 47 |
+
except ImportError: # pragma: no cover - handled at runtime
|
| 48 |
+
XGBClassifier = None # type: ignore
|
| 49 |
+
|
| 50 |
+
try:
|
| 51 |
+
import torch
|
| 52 |
+
from transformers import AutoTokenizer, AutoModelForSequenceClassification
|
| 53 |
+
from scipy.stats import entropy
|
| 54 |
+
TRANSFORMERS_AVAILABLE = True
|
| 55 |
+
except ImportError:
|
| 56 |
+
TRANSFORMERS_AVAILABLE = False
|
| 57 |
+
print("[!] transformers or torch not available. FinSentLLM feature engineering will be disabled.")
|
| 58 |
+
|
| 59 |
+
from sklearn.base import BaseEstimator, TransformerMixin
|
| 60 |
+
|
| 61 |
+
|
| 62 |
+
# ---------------------------------------------------------------------------
|
| 63 |
+
# Data loading
|
| 64 |
+
# ---------------------------------------------------------------------------
|
| 65 |
+
|
| 66 |
+
@dataclass
class DatasetPaths:
    """Locations of the three pre-computed feature artifacts for one FPB split."""
    dataset: str
    prob_features_csv: str
    multi_llm_csv: str
    semantics_csv: str


def infer_paths(dataset: str, base_dir: str = "outputs") -> DatasetPaths:
    """Build the expected artifact paths for an FPB split under ``base_dir``.

    The split tag is whitespace-trimmed before being interpolated into the
    conventional file names.
    """
    tag = dataset.strip()
    prob_path = os.path.join(base_dir, "prob features", f"FinSent_{tag}_raw_probs_prob_features.csv")
    multi_path = os.path.join(base_dir, "MultiLLM", f"FPB_MultiLLM_{tag}.csv")
    # Semantics files live in the "Structures Financial Semantics" subfolder.
    sem_path = os.path.join(base_dir, "Structures Financial Semantics", f"Sentences_{tag}_semantics.csv")
    return DatasetPaths(dataset=tag, prob_features_csv=prob_path, multi_llm_csv=multi_path, semantics_csv=sem_path)
|
| 81 |
+
|
| 82 |
+
|
| 83 |
+
def _merge_features(left: pd.DataFrame, right: pd.DataFrame, key: str = "doc_id") -> pd.DataFrame:
|
| 84 |
+
"""Merge two DataFrames on ``doc_id`` while dropping duplicate feature columns."""
|
| 85 |
+
overlap = [c for c in right.columns if c in left.columns and c != key]
|
| 86 |
+
right_clean = right.drop(columns=overlap, errors="ignore")
|
| 87 |
+
merged = left.merge(right_clean, on=key, how="left", validate="one_to_one")
|
| 88 |
+
return merged
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
def load_feature_table(paths: DatasetPaths) -> pd.DataFrame:
    """Assemble the meta-classifier feature table for one FPB split.

    Starts from the mandatory Multi-LLM CSV, then left-joins the optional
    probability features and the mandatory structured-semantics flags on
    ``doc_id``.

    Raises:
        FileNotFoundError: If a required CSV is missing.
        KeyError: If an input CSV lacks its alignment key column.
        ValueError: If any row ends up without semantics flags after merging.
    """
    if not os.path.exists(paths.multi_llm_csv):
        raise FileNotFoundError(f"Missing Multi-LLM feature CSV: {paths.multi_llm_csv}")
    table = pd.read_csv(paths.multi_llm_csv)

    # `doc_id` is the alignment key across all artifact files.
    if "doc_id" not in table.columns:
        raise KeyError("Expected 'doc_id' column in Multi-LLM CSV. Re-run stage 3 feature extraction.")

    # Probability features are optional: warn and continue when absent.
    if os.path.exists(paths.prob_features_csv):
        prob_frame = pd.read_csv(paths.prob_features_csv)
        if "doc_id" not in prob_frame.columns:
            raise KeyError("Probability features CSV must contain 'doc_id'.")
        table = _merge_features(table, prob_frame, key="doc_id")
    else:
        print(f"[!] Probability feature CSV not found ({paths.prob_features_csv}); proceeding without extra columns.")

    # Structured semantics are mandatory.
    if not os.path.exists(paths.semantics_csv):
        raise FileNotFoundError(f"Missing semantics CSV: {paths.semantics_csv}")
    sem_frame = pd.read_csv(paths.semantics_csv)
    if "doc_id" not in sem_frame.columns:
        if "id" in sem_frame.columns:
            sem_frame = sem_frame.rename(columns={"id": "doc_id"})
        else:
            raise KeyError("Semantics CSV must contain 'doc_id' or 'id' column.")

    # Drop text/label duplicates so the merge only brings in the sem_ flags.
    sem_frame = sem_frame.drop(columns=[c for c in ["label", "sentence", "text"] if c in sem_frame.columns], errors="ignore")
    merged = _merge_features(table, sem_frame, key="doc_id")

    # Every row must carry its semantics flags after the join.
    sem_cols = [c for c in merged.columns if c.startswith("sem_")]
    if sem_cols:
        missing_sem = merged[sem_cols].isna().any(axis=1)
        if missing_sem.any():
            raise ValueError(
                f"{int(missing_sem.sum())} rows lack structured semantics after merging. Make sure the semantics file"
                " matches the dataset split."
            )

    return merged
|
| 133 |
+
|
| 134 |
+
|
| 135 |
+
def load_best_iterations(results_dir: str = "results") -> Dict[str, int]:
    """Load previously computed best iterations for XGBoost models.

    Reads ``xgb_meta_best_iterations.csv`` from ``results_dir`` and maps the
    ``meta`` column (dataset tag, e.g. "50Agree") to ``best_iteration``.

    Returns:
        Dictionary mapping dataset names to best iteration counts; empty
        dict when the file is missing or unreadable.
    """
    path = os.path.join(results_dir, "xgb_meta_best_iterations.csv")

    if not os.path.exists(path):
        print(f"[!] Best iterations file not found: {path}")
        return {}

    try:
        table = pd.read_csv(path)
        # One row per dataset split; index by the 'meta' tag.
        mapping = {row["meta"]: int(row["best_iteration"]) for _, row in table.iterrows()}

        print(f"[✓] Loaded best iterations for {len(mapping)} datasets:")
        for tag, count in mapping.items():
            print(f" {tag}: {count} iterations")

        return mapping
    except Exception as e:
        # Best iterations are an optimization hint, not a hard requirement,
        # so a malformed file degrades to the empty mapping.
        print(f"[!] Error loading best iterations: {e}")
        return {}
|
| 165 |
+
|
| 166 |
+
|
| 167 |
+
# ---------------------------------------------------------------------------
|
| 168 |
+
# FinSentLLM Feature Engineering Pipeline
|
| 169 |
+
# ---------------------------------------------------------------------------
|
| 170 |
+
|
| 171 |
+
class FinSentLLMFeatureEngineering(BaseEstimator, TransformerMixin):
    """
    End-to-end feature-engineering transformer that converts raw text into
    the 36 FinSentLLM features: FinBERT/RoBERTa inference, probability
    engineering, MultiLLM agreement metrics, and heuristic semantic flags.
    """

    def __init__(self,
                 finbert_model_id="ProsusAI/finbert",
                 roberta_model_id="cardiffnlp/twitter-roberta-base-sentiment",
                 batch_size=16,
                 max_length=128,
                 device=None):
        # Hugging Face model ids for the two base sentiment models.
        self.finbert_model_id = finbert_model_id
        self.roberta_model_id = roberta_model_id
        self.batch_size = batch_size
        self.max_length = max_length
        # Explicit device string (e.g. "cuda"/"cpu"); None = auto-detect in fit().
        self.device = device
        # Both models map indices 0,1,2 to these classes.
        self.class_names = ["negative", "neutral", "positive"]

        # Model components are initialized lazily in fit().
        self.finbert_tokenizer = None
        self.finbert_model = None
        self.roberta_tokenizer = None
        self.roberta_model = None
        self._device = None

    def _load_models(self):
        """Load the FinBERT and RoBERTa models and move them to the device."""
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError("transformers and torch are required for FinSentLLM feature engineering")

        print("[📥] Loading FinBERT and RoBERTa models...")

        # Resolve the device: auto-detect CUDA unless one was pinned.
        if self.device is None:
            self._device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        else:
            self._device = torch.device(self.device)

        # Load FinBERT.
        self.finbert_tokenizer = AutoTokenizer.from_pretrained(self.finbert_model_id)
        self.finbert_model = AutoModelForSequenceClassification.from_pretrained(self.finbert_model_id)
        self.finbert_model.to(self._device).eval()

        # Load RoBERTa.
        self.roberta_tokenizer = AutoTokenizer.from_pretrained(self.roberta_model_id)
        self.roberta_model = AutoModelForSequenceClassification.from_pretrained(self.roberta_model_id)
        self.roberta_model.to(self._device).eval()

        print(f"[✅] Models loaded on {self._device}")

    @torch.no_grad()
    def _get_probabilities(self, texts, tokenizer, model):
        """Run batched inference and return softmax probabilities, with an
        optional tqdm progress bar when more than one batch is needed."""
        all_probs = []
        total = len(texts)
        batch_iter = range(0, total, self.batch_size)
        use_tqdm = tqdm is not None and total > self.batch_size
        iterator = tqdm(batch_iter, desc="[tqdm] Encoding & inference", unit="batch") if use_tqdm else batch_iter
        for i in iterator:
            batch = texts[i:i + self.batch_size]
            # Encode the batch (padded/truncated to max_length).
            encoding = tokenizer(
                batch,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=self.max_length
            )
            # Move tensors to the inference device.
            encoding = {k: v.to(self._device) for k, v in encoding.items()}
            # Forward pass → class probabilities on CPU.
            logits = model(**encoding).logits
            probs = torch.softmax(logits, dim=-1).cpu().numpy()
            all_probs.append(probs)
        # NOTE(review): with empty `texts` this np.vstack raises on an empty
        # list — callers are presumed to pass at least one text; verify.
        return np.vstack(all_probs)

    def _build_features(self, finbert_probs, roberta_probs):
        """Build the full set of 36 features from the two (N, 3) prob arrays."""
        eps = 1e-12
        features = {}
        # NOTE(review): n_samples is unused; kept for parity with original.
        n_samples = len(finbert_probs)

        # 1. Basic probability features (8)
        for i, cls in enumerate(self.class_names):
            features[f"fin_p_{cls[:3]}"] = finbert_probs[:, i]
            features[f"rob_p_{cls[:3]}"] = roberta_probs[:, i]

        features["fin_score"] = finbert_probs.max(axis=1)
        features["rob_score"] = roberta_probs.max(axis=1)

        # 2. Label features (2) — argmax class index per model.
        features["fin_label"] = finbert_probs.argmax(axis=1)
        features["rob_label"] = roberta_probs.argmax(axis=1)

        # 3. Engineered probability features (12)
        # Logits (log-odds of each class probability, eps-stabilized).
        for i, cls in enumerate(self.class_names):
            features[f"fin_logit_{cls[:3]}"] = np.log((finbert_probs[:, i] + eps) / (1 - finbert_probs[:, i] + eps))
            features[f"rob_logit_{cls[:3]}"] = np.log((roberta_probs[:, i] + eps) / (1 - roberta_probs[:, i] + eps))

        # Maximum probability (confidence of the top class).
        features["fin_max_prob"] = finbert_probs.max(axis=1)
        features["rob_max_prob"] = roberta_probs.max(axis=1)

        # Margin (top probability minus second-highest).
        fin_sorted = np.sort(finbert_probs, axis=1)
        rob_sorted = np.sort(roberta_probs, axis=1)
        features["fin_margin"] = fin_sorted[:, -1] - fin_sorted[:, -2]
        features["rob_margin"] = rob_sorted[:, -1] - rob_sorted[:, -2]

        # Entropy of each model's probability distribution.
        features["fin_entropy"] = entropy(finbert_probs.T)
        features["rob_entropy"] = entropy(roberta_probs.T)

        # 4. MultiLLM features (5)
        # L1 distance between the two distributions and its similarity form.
        l1_dist = np.abs(finbert_probs - roberta_probs).sum(axis=1)
        features["MultiLLM_L1_distance"] = l1_dist
        features["MultiLLM_L1_similarity"] = 1 / (1 + l1_dist)

        # KL divergence in both directions (scipy entropy with qk argument).
        features["MultiLLM_KL_F_to_R"] = entropy(finbert_probs.T, roberta_probs.T)
        features["MultiLLM_KL_R_to_F"] = entropy(roberta_probs.T, finbert_probs.T)

        # Agreement flag: both models pick the same argmax class.
        fin_pred = finbert_probs.argmax(axis=1)
        rob_pred = roberta_probs.argmax(axis=1)
        features["MultiLLM_agree"] = (fin_pred == rob_pred).astype(int)

        # 5. Structured semantic features (9) — simplified version; a real
        # deployment should derive these from NLP rules over the text.
        # Here they are probability-based heuristics only.
        features["sem_compared"] = ((finbert_probs[:, 1] > 0.4) & (roberta_probs[:, 1] > 0.4)).astype(int)
        features["sem_loss_improve"] = ((finbert_probs[:, 2] > 0.6) & (roberta_probs[:, 2] > 0.5)).astype(int)
        features["sem_loss_worsen"] = ((finbert_probs[:, 0] > 0.6) & (roberta_probs[:, 0] > 0.5)).astype(int)
        features["sem_profit_up"] = ((finbert_probs[:, 2] > 0.7) & (l1_dist < 0.3)).astype(int)
        features["sem_cost_down"] = ((finbert_probs[:, 2] > 0.5) & (features["MultiLLM_agree"] == 1)).astype(int)
        features["sem_contract_fin"] = ((finbert_probs[:, 1] > 0.8)).astype(int)
        features["sem_uncertainty"] = ((features["fin_entropy"] > 1.0) | (features["rob_entropy"] > 1.0)).astype(int)
        features["sem_stable_guidance"] = ((l1_dist < 0.2) & (finbert_probs[:, 1] > 0.5)).astype(int)
        features["sem_operational"] = ((finbert_probs[:, 1] > 0.3) & (roberta_probs[:, 1] > 0.3)).astype(int)

        return pd.DataFrame(features)

    def fit(self, X, y=None):
        """Fit stage — loads the base models; no statistics are learned."""
        self._load_models()
        return self

    def transform(self, X):
        """Transform stage — converts texts into the engineered feature frame.

        Accepts a DataFrame (with a 'text' column, or a single column),
        a list, or an array of strings.
        """
        if self.finbert_model is None:
            raise RuntimeError("Models not loaded. Call fit() first.")

        # Normalize the input into a plain list of texts.
        if isinstance(X, pd.DataFrame):
            if 'text' in X.columns:
                texts = X['text'].tolist()
            elif len(X.columns) == 1:
                texts = X.iloc[:, 0].tolist()
            else:
                raise ValueError("DataFrame must have 'text' column or single column")
        elif isinstance(X, (list, np.ndarray)):
            texts = list(X)
        else:
            raise ValueError("X must be DataFrame, list, or array")

        print(f"[🔮] Processing {len(texts)} texts...")

        # Base-model probabilities.
        finbert_probs = self._get_probabilities(texts, self.finbert_tokenizer, self.finbert_model)
        roberta_probs = self._get_probabilities(texts, self.roberta_tokenizer, self.roberta_model)

        # Engineered feature frame.
        features_df = self._build_features(finbert_probs, roberta_probs)

        print(f"[✅] Generated {len(features_df.columns)} features")
        return features_df
|
| 349 |
+
|
| 350 |
+
|
| 351 |
+
# ---------------------------------------------------------------------------
|
| 352 |
+
# Modeling utilities
|
| 353 |
+
# ---------------------------------------------------------------------------
|
| 354 |
+
|
| 355 |
+
def build_preprocessor(numeric_cols: List[str], categorical_cols: List[str]) -> ColumnTransformer:
    """Standard-scale numeric columns and one-hot encode categorical ones.

    Unknown categories at transform time are ignored; any column not listed
    in either group is dropped.

    Raises:
        ValueError: If both column lists are empty.
    """
    steps = []
    if numeric_cols:
        steps.append(("num", StandardScaler(), numeric_cols))
    if categorical_cols:
        steps.append(("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols))
    if not steps:
        raise ValueError("No feature columns selected – check your dataset.")
    return ColumnTransformer(transformers=steps, remainder="drop")
|
| 364 |
+
|
| 365 |
+
|
| 366 |
+
def build_pipelines(
    numeric_cols: List[str],
    categorical_cols: List[str],
    num_classes: int,
    random_state: int,
    models_requested: Iterable[str],
    dataset: str = "",
    best_iterations: Dict[str, int] = None,
    include_feature_engineering: bool = False,
) -> Dict[str, Pipeline]:
    """Build the requested meta-classifier pipelines.

    Args:
        numeric_cols: Numeric feature columns (pre-computed feature mode).
        categorical_cols: Categorical feature columns (pre-computed feature mode).
        num_classes: Number of sentiment classes (forwarded to XGBoost).
        random_state: Seed for the XGBoost classifier.
        models_requested: Subset of {"logreg", "xgboost"} to build.
        dataset: Dataset tag used to look up pre-computed XGBoost iterations.
        best_iterations: Optional mapping dataset -> best n_estimators.
        include_feature_engineering: If True, prepend the end-to-end
            FinSentLLM feature-engineering step (pipeline then expects raw text).

    Returns:
        Mapping of model name -> sklearn Pipeline.

    Raises:
        ImportError: if "xgboost" is requested but the package is not installed.
    """
    pipelines: Dict[str, Pipeline] = {}

    # Feature names emitted by FinSentLLMFeatureEngineering in end-to-end mode.
    end2end_categorical_features = ["fin_label", "rob_label"]
    end2end_numeric_features = [
        'fin_p_neg', 'fin_p_neu', 'fin_p_pos', 'fin_score',
        'rob_p_neg', 'rob_p_neu', 'rob_p_pos', 'rob_score',
        'fin_logit_neg', 'fin_logit_neu', 'fin_logit_pos',
        'rob_logit_neg', 'rob_logit_neu', 'rob_logit_pos',
        'fin_max_prob', 'rob_max_prob', 'fin_margin', 'rob_margin',
        'fin_entropy', 'rob_entropy',
        'MultiLLM_L1_distance', 'MultiLLM_L1_similarity',
        'MultiLLM_KL_F_to_R', 'MultiLLM_KL_R_to_F', 'MultiLLM_agree',
        'sem_compared', 'sem_loss_improve', 'sem_loss_worsen',
        'sem_profit_up', 'sem_cost_down', 'sem_contract_fin',
        'sem_uncertainty', 'sem_stable_guidance', 'sem_operational'
    ]

    if "logreg" in models_requested:
        logreg = LogisticRegression(max_iter=1000, solver="lbfgs")
        if include_feature_engineering:
            preprocessor = build_preprocessor(end2end_numeric_features, end2end_categorical_features)
            pipelines["logreg"] = Pipeline([
                ("feature_engineering", FinSentLLMFeatureEngineering()),
                ("preprocess", preprocessor),
                ("clf", logreg),
            ])
        else:
            preprocessor = build_preprocessor(numeric_cols, categorical_cols)
            pipelines["logreg"] = Pipeline([
                ("preprocess", preprocessor),
                ("clf", logreg),
            ])

    if "xgboost" in models_requested:
        if XGBClassifier is None:
            raise ImportError(
                "xgboost is not installed. Install it with 'pip install xgboost' or remove 'xgboost' from --models."
            )
        # Use the pre-computed best boosting-round count for this dataset when available.
        if best_iterations and dataset in best_iterations:
            n_estimators = best_iterations[dataset]
            print(f"[✓] Using pre-computed best iterations for {dataset}: {n_estimators}")
        else:
            n_estimators = 1000
            print(f"[!] No pre-computed iterations found for {dataset}, using default: {n_estimators}")
        xgb = XGBClassifier(
            objective="multi:softprob",
            num_class=num_classes,
            learning_rate=0.05,
            max_depth=6,
            subsample=0.8,
            colsample_bytree=0.8,
            n_estimators=n_estimators,
            min_child_weight=2,
            reg_lambda=1.0,
            reg_alpha=0.0,
            tree_method="hist",
            eval_metric="mlogloss",
            random_state=random_state,
            n_jobs=0,
            verbosity=0,
        )
        if include_feature_engineering:
            feature_preprocessor = build_preprocessor(end2end_numeric_features, end2end_categorical_features)
            pipelines["xgboost"] = Pipeline([
                ("feature_engineering", FinSentLLMFeatureEngineering()),
                ("preprocess", feature_preprocessor),
                ("clf", xgb),
            ])
            print(f"[🤖] Created end-to-end XGBoost pipeline with feature engineering")
        else:
            preprocessor = build_preprocessor(numeric_cols, categorical_cols)
            pipelines["xgboost"] = Pipeline([
                ("preprocess", preprocessor),
                ("clf", xgb),
            ])

    # NOTE: a duplicated re-implementation of both branches used to follow the
    # return below; it was unreachable dead code and has been removed.
    return pipelines
|
| 502 |
+
|
| 503 |
+
|
| 504 |
+
def evaluate_model(
    name: str,
    pipeline: Pipeline,
    X: pd.DataFrame,
    y_train: pd.Series,
    y_eval: pd.Series,
    cv: StratifiedKFold,
    class_labels: List[str],
    label_decoder: Optional[Dict[int, str]] = None,
) -> Dict[str, object]:
    """Cross-validate *pipeline*, refit it on all data, and collect metrics.

    *y_train* is the fitting target (may be integer-encoded, e.g. for
    XGBoost); *y_eval* carries the string labels used for reporting.
    *label_decoder* maps encoded integer classes back to string labels.
    Returns a dict with CV score statistics, out-of-fold predictions and
    probabilities, the classification report / confusion matrix, and the
    refitted pipeline under "final_model".
    """
    scoring = {"accuracy": "accuracy", "macro_f1": "f1_macro"}

    # Stratified CV scores (accuracy + macro-F1) on the training target.
    scores = cross_validate(
        pipeline,
        X,
        y_train,
        scoring=scoring,
        cv=cv,
        n_jobs=None,
        return_estimator=False,
    )

    # Out-of-fold predictions and class probabilities from the same folds.
    preds = cross_val_predict(pipeline, X, y_train, cv=cv, method="predict")
    probas = cross_val_predict(pipeline, X, y_train, cv=cv, method="predict_proba")

    # Refit on the full data (n_estimators already uses the pre-computed best iterations).
    fitted = pipeline.fit(X, y_train)

    # Raw class order as seen by the final classifier step.
    clf_raw_classes = list(fitted.named_steps["clf"].classes_)

    # Decode integer-encoded predictions/classes back to string labels if needed.
    if label_decoder:
        preds_decoded = np.array([label_decoder[int(p)] for p in preds])
        proba_labels = [label_decoder[int(c)] for c in clf_raw_classes]
    else:
        preds_decoded = preds
        proba_labels = [str(c) for c in clf_raw_classes]

    # Reorder probability columns to match the canonical class-label order.
    if proba_labels != class_labels:
        reorder_idx = [proba_labels.index(lbl) for lbl in class_labels]
        probas = probas[:, reorder_idx]
        proba_labels = class_labels

    y_eval_array = y_eval.to_numpy()

    report = classification_report(y_eval_array, preds_decoded, labels=class_labels, digits=4)
    cm = confusion_matrix(y_eval_array, preds_decoded, labels=class_labels)

    metrics = {
        "name": name,
        "accuracy_mean": float(np.mean(scores["test_accuracy"])),
        "accuracy_std": float(np.std(scores["test_accuracy"])),
        "macro_f1_mean": float(np.mean(scores["test_macro_f1"])),
        "macro_f1_std": float(np.std(scores["test_macro_f1"])),
        "classification_report": report,
        "confusion_matrix": cm,
        "classes": class_labels,
        "preds": preds_decoded,
        "probas": probas,
        "final_model": fitted,
    }

    # For XGBoost, record the boosting-round count actually used by the refit.
    if name == "xgboost":
        if hasattr(fitted.named_steps["clf"], "best_iteration"):
            metrics["best_iteration"] = fitted.named_steps["clf"].best_iteration
        elif hasattr(fitted.named_steps["clf"], "n_estimators"):
            metrics["best_iteration"] = fitted.named_steps["clf"].n_estimators
        metrics["best_ntree_limit"] = metrics.get("best_iteration", 0) + 1

    return metrics
|
| 574 |
+
|
| 575 |
+
|
| 576 |
+
def print_metrics(metrics: Dict[str, object], verbose: bool = False) -> None:
    """Pretty-print cross-validation metrics for one meta-classifier.

    With verbose=True also dumps the classification report and a formatted
    confusion matrix (rows = true labels, columns = predicted labels).
    """
    model_name = metrics["name"]
    print(f"\n=== {model_name.upper()} meta-classifier ===")
    acc = metrics['accuracy_mean'] * 100
    acc_sd = metrics['accuracy_std'] * 100
    f1 = metrics['macro_f1_mean'] * 100
    f1_sd = metrics['macro_f1_std'] * 100
    print(f"Accuracy: {acc:.2f}% ± {acc_sd:.2f}%\nMacro-F1: {f1:.2f}% ± {f1_sd:.2f}%")
    if not verbose:
        return
    print("\nClassification report:\n", metrics["classification_report"], sep="")
    print("Confusion matrix (rows=true, cols=pred):")
    classes = metrics["classes"]
    print("    " + " ".join(f"{c[:7]:>7}" for c in classes))
    for cls, counts in zip(classes, metrics["confusion_matrix"]):
        print(f"{cls[:7]:>4} " + " ".join(f"{int(v):>7}" for v in counts))
|
| 592 |
+
|
| 593 |
+
|
| 594 |
+
def save_predictions(base: pd.DataFrame, metrics: Dict[str, object], path: str) -> None:
    """Write out-of-fold predictions and per-class probabilities to a CSV.

    Output columns: doc_id, true_label, meta_pred, and one
    meta_proba_<class> column per class in metrics["classes"].
    """
    out = base[["doc_id"]].copy()
    out["true_label"] = base["label"]
    out["meta_pred"] = metrics["preds"]
    probas = metrics["probas"]
    for col_idx, cls_name in enumerate(metrics["classes"]):
        out[f"meta_proba_{cls_name}"] = probas[:, col_idx]
    out.to_csv(path, index=False)
    print(f"Saved predictions: {path}")
|
| 602 |
+
|
| 603 |
+
|
| 604 |
+
# ---------------------------------------------------------------------------
|
| 605 |
+
# CLI
|
| 606 |
+
# ---------------------------------------------------------------------------
|
| 607 |
+
|
| 608 |
+
def parse_args() -> argparse.Namespace:
    """Parse command-line options for the meta-classifier trainer."""
    ap = argparse.ArgumentParser(description="Train FinSentLLM meta-classifiers (LogReg/XGBoost).")
    ap.add_argument("--dataset", required=True, help="Dataset tag, e.g. 50Agree | 66Agree | 75Agree | AllAgree")
    ap.add_argument("--prob_features_csv", help="Override path to probability feature CSV")
    ap.add_argument("--multi_llm_csv", help="Override path to Multi-LLM feature CSV")
    ap.add_argument("--semantics_csv", help="Override path to structured semantics CSV")
    ap.add_argument("--folds", type=int, default=5, help="Number of stratified CV folds (default: 5)")
    ap.add_argument("--seed", type=int, default=7, help="Random seed for CV shuffling (default: 7)")
    ap.add_argument(
        "--models",
        nargs="+",
        default=["logreg", "xgboost"],
        choices=["logreg", "xgboost"],
        help="Which meta-models to evaluate (default: both)",
    )
    ap.add_argument("--artifact_prefix", help="If set, saves artifacts using this filepath prefix")
    ap.add_argument("--out_dir", default="outputs", help="Base output directory")
    ap.add_argument("--meta_xgb_dir", default="Meta-Classifier_XG_boost_es_optimized", help="Subdir for xgboost artifacts")
    ap.add_argument("--meta_logreg_dir", default="Meta-Classifier-log_regression", help="Subdir for logreg artifacts")
    ap.add_argument("--save_predictions", action="store_true", help="Write out-of-fold predictions per model")
    ap.add_argument("--save_models", action="store_true", help="Persist fitted pipelines per model")
    ap.add_argument("--verbose", action="store_true", help="Print full reports and confusion matrices")
    # End-to-end mode re-runs LLM feature extraction (very slow); off by default.
    ap.add_argument("--end_to_end", action="store_true", default=False, help="[慢] 用大模型重新生成特征 (不建议,除非你需要全流程推理)")
    return ap.parse_args()
|
| 633 |
+
|
| 634 |
+
|
| 635 |
+
def main() -> None:
    """CLI entry point: load features, train/evaluate meta-classifiers, save artifacts."""

    args = parse_args()

    # Load the pre-computed best XGBoost boosting-round counts per dataset.
    best_iterations = load_best_iterations()

    # Warn if --end_to_end is combined with feature-file overrides: those files are ignored.
    if args.end_to_end and (args.prob_features_csv or args.multi_llm_csv or args.semantics_csv):
        print("[警告] --end_to_end 模式下会忽略所有预处理特征文件,全部重新推理,速度极慢!")

    if args.end_to_end:
        print("[🤖] Creating end-to-end pipelines with feature engineering... (速度极慢,仅用于全流程推理)")
        if not TRANSFORMERS_AVAILABLE:
            raise ImportError("transformers and torch are required for end-to-end feature engineering. Install with: pip install transformers torch")

        # End-to-end pipelines need the raw text, so load the merged table first.
        paths = infer_paths(args.dataset)
        data = load_feature_table(paths)

        # Locate a text column under a few common names.
        text_col = None
        for col in ['text', 'sentence', 'content']:
            if col in data.columns:
                text_col = col
                break

        if text_col is None:
            raise ValueError("End-to-end mode requires text data, but no text column found in dataset")

        X_text = data[[text_col]]  # raw text input for the feature-engineering step
        target_col = "label"
        if target_col not in data.columns:
            raise KeyError("Target column 'label' not found after merging.")

        y = data[target_col].astype(str)

        # Canonical class order first, then any unexpected labels appended.
        default_order = ["negative", "neutral", "positive"]
        observed = list(pd.unique(y))
        class_labels = [lbl for lbl in default_order if lbl in observed]
        class_labels += [lbl for lbl in observed if lbl not in class_labels]

        # Integer encoding (needed by XGBoost) plus the inverse mapping for decoding.
        label_to_int = {lbl: idx for idx, lbl in enumerate(class_labels)}
        int_to_label = {idx: lbl for lbl, idx in label_to_int.items()}
        y_encoded = y.map(label_to_int).astype(int)

        pipelines = build_pipelines(
            numeric_cols=[],
            categorical_cols=[],
            num_classes=len(class_labels),
            random_state=args.seed,
            models_requested=args.models,
            dataset=args.dataset,
            best_iterations=best_iterations,
            include_feature_engineering=True,
        )
        X = X_text
    else:
        # Default (recommended) path: use pre-computed feature CSVs — fast.
        paths = infer_paths(args.dataset)
        if args.prob_features_csv:
            paths.prob_features_csv = args.prob_features_csv
        if args.multi_llm_csv:
            paths.multi_llm_csv = args.multi_llm_csv
        if args.semantics_csv:
            paths.semantics_csv = args.semantics_csv

        data = load_feature_table(paths)

        target_col = "label"
        if target_col not in data.columns:
            raise KeyError("Target column 'label' not found after merging.")

        # All numeric columns except doc_id are features; two known categoricals.
        categorical_cols = [c for c in ["fin_label", "rob_label"] if c in data.columns]
        numeric_cols = [
            c for c in data.select_dtypes(include=[np.number]).columns
            if c not in {"doc_id"}
        ]

        X = data[numeric_cols + categorical_cols]
        y = data[target_col].astype(str)

        # Canonical class order first, then any unexpected labels appended.
        default_order = ["negative", "neutral", "positive"]
        observed = list(pd.unique(y))
        class_labels = [lbl for lbl in default_order if lbl in observed]
        class_labels += [lbl for lbl in observed if lbl not in class_labels]

        label_to_int = {lbl: idx for idx, lbl in enumerate(class_labels)}
        int_to_label = {idx: lbl for lbl, idx in label_to_int.items()}
        y_encoded = y.map(label_to_int).astype(int)

        pipelines = build_pipelines(
            numeric_cols=numeric_cols,
            categorical_cols=categorical_cols,
            num_classes=len(class_labels),
            random_state=args.seed,
            models_requested=args.models,
            dataset=args.dataset,
            best_iterations=best_iterations,
            include_feature_engineering=False,
        )

    cv = StratifiedKFold(n_splits=args.folds, shuffle=True, random_state=args.seed)

    results = {}
    for name, pipeline in pipelines.items():
        # XGBoost trains on the integer-encoded target and decodes predictions;
        # logistic regression works directly on the string labels.
        if name == "xgboost":
            metrics = evaluate_model(
                name,
                pipeline,
                X,
                y_encoded,
                y,
                cv=cv,
                class_labels=class_labels,
                label_decoder=int_to_label,
            )
        else:
            metrics = evaluate_model(
                name,
                pipeline,
                X,
                y,
                y,
                cv=cv,
                class_labels=class_labels,
            )
        print_metrics(metrics, verbose=args.verbose)
        results[name] = metrics

        # Explicit artifact prefix: save next to that prefix.
        if args.artifact_prefix and args.save_predictions:
            pred_path = f"{args.artifact_prefix}_{name}_predictions.csv"
            save_predictions(data, metrics, pred_path)

        if args.artifact_prefix and args.save_models:
            model_path = f"{args.artifact_prefix}_{name}_model.joblib"
            # For XGBoost persist a dict bundling the pipeline with its metadata.
            if name == "xgboost":
                model_dict = {
                    "pipeline": metrics["final_model"],
                    "feature_columns": list(X.columns),
                    "label_map": label_to_int,
                    "labels": class_labels,
                    "best_iteration": metrics.get("best_iteration", 0),
                    "best_ntree_limit": metrics.get("best_ntree_limit", 1),
                }
                joblib.dump(model_dict, model_path)
            else:
                joblib.dump(metrics["final_model"], model_path)
            print(f"Saved model: {model_path}")

        # If no artifact_prefix is provided but user asked to save, route to default meta subfolders
        if not args.artifact_prefix and args.save_predictions:
            if name == "xgboost":
                save_dir = os.path.join(args.out_dir, args.meta_xgb_dir)
            else:
                save_dir = os.path.join(args.out_dir, args.meta_logreg_dir)
            os.makedirs(save_dir, exist_ok=True)
            pred_path = os.path.join(save_dir, f"FinSent_{args.dataset}_meta_{name}_predictions.csv")
            save_predictions(data, metrics, pred_path)

        if not args.artifact_prefix and args.save_models:
            if name == "xgboost":
                save_dir = os.path.join(args.out_dir, args.meta_xgb_dir)
            else:
                save_dir = os.path.join(args.out_dir, args.meta_logreg_dir)
            os.makedirs(save_dir, exist_ok=True)
            model_path = os.path.join(save_dir, f"FinSent_{args.dataset}_meta_{name}_model.joblib")
            # For XGBoost persist a dict bundling the pipeline with its metadata.
            if name == "xgboost":
                model_dict = {
                    "pipeline": metrics["final_model"],
                    "feature_columns": list(X.columns),
                    "label_map": label_to_int,
                    "labels": class_labels,
                    "best_iteration": metrics.get("best_iteration", 0),
                    "best_ntree_limit": metrics.get("best_ntree_limit", 1),
                }
                joblib.dump(model_dict, model_path)
            else:
                joblib.dump(metrics["final_model"], model_path)
            print(f"Saved model: {model_path}")
|
| 817 |
+
|
| 818 |
+
|
| 819 |
+
if __name__ == "__main__":
|
| 820 |
+
main()
|
FPB Multi-LLM .py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# FPB Multi-LLM .py
|
| 2 |
+
# Compute expert-signal features from multiple LLMs (FinBERT, RoBERT
|
| 3 |
+
import argparse
|
| 4 |
+
import os
|
| 5 |
+
import numpy as np
|
| 6 |
+
import pandas as pd
|
| 7 |
+
from scipy.special import rel_entr
|
| 8 |
+
|
| 9 |
+
EPS = 1e-12
|
| 10 |
+
|
| 11 |
+
def row_normalize(a: np.ndarray) -> np.ndarray:
    """Normalize each row of *a* to sum to 1; all-zero rows are returned unchanged."""
    row_sums = a.sum(axis=1, keepdims=True)
    row_sums[row_sums == 0.0] = 1.0  # avoid division by zero for empty rows
    return a / row_sums
|
| 15 |
+
|
| 16 |
+
def kl_divergence(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Row-wise KL divergence D(P || Q) = Σ p·log(p/q), after clipping both to [EPS, 1]."""
    p_safe = np.clip(P, EPS, 1.0)
    q_safe = np.clip(Q, EPS, 1.0)
    return rel_entr(p_safe, q_safe).sum(axis=1)
|
| 20 |
+
|
| 21 |
+
def l1_distance(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Half the row-wise L1 (total-variation) distance between distributions; in [0, 1]."""
    abs_diff = np.abs(P - Q)
    return 0.5 * abs_diff.sum(axis=1)
|
| 23 |
+
|
| 24 |
+
def load_probs(df: pd.DataFrame, cols: list, label: str) -> np.ndarray:
    """Extract probability columns *cols* from *df* and row-normalize them.

    Raises:
        ValueError: if any requested column is absent (message tagged with *label*).
    """
    absent = [name for name in cols if name not in df.columns]
    if absent:
        raise ValueError(f"[{label}] Missing columns: {absent}")
    return row_normalize(df[cols].to_numpy(dtype=float))
|
| 30 |
+
|
| 31 |
+
def main():
    """CLI: read per-model probabilities, add Multi-LLM expert-signal features, save CSV."""
    ap = argparse.ArgumentParser(description="Compute Multi-LLM expert signals (Stage 3 only).")
    ap.add_argument("--input", required=True, help="CSV with per-model probabilities")
    ap.add_argument("--dataset", required=True, help="Dataset tag, e.g., 50Agree | AllAgree")
    ap.add_argument("--out_dir", default=".", help="Output directory")
    ap.add_argument("--out_subdir", default="MultiLLM", help="Subdirectory under out_dir to save multi-llm features")
    # Column names: default to common names; override if your headers differ
    ap.add_argument("--fin_cols", nargs=3, default=["fin_p_neg","fin_p_neu","fin_p_pos"],
                    help="FinBERT prob columns [neg neu pos]")
    ap.add_argument("--rob_cols", nargs=3, default=["rob_p_neg","rob_p_neu","rob_p_pos"],
                    help="RoBERTa prob columns [neg neu pos]")
    args = ap.parse_args()

    df = pd.read_csv(args.input)

    # Load & normalize each model's class-probability rows.
    P = load_probs(df, args.fin_cols, "FinBERT")
    Q = load_probs(df, args.rob_cols, "RoBERTa")

    # Expert-signal features (distance/divergence between the two models).
    df["MultiLLM_L1_distance"] = l1_distance(P, Q)
    df["MultiLLM_L1_similarity"] = 1.0 - df["MultiLLM_L1_distance"]
    df["MultiLLM_KL_F_to_R"] = kl_divergence(P, Q)
    df["MultiLLM_KL_R_to_F"] = kl_divergence(Q, P)

    # Optional: simple agreement flag (same argmax class for both models).
    df["MultiLLM_agree"] = (np.argmax(P, axis=1) == np.argmax(Q, axis=1)).astype(int)

    # Resolve output directory (out_dir[/out_subdir]) and write the enriched CSV.
    save_dir = args.out_dir
    if args.out_subdir:
        save_dir = os.path.join(args.out_dir, args.out_subdir)
    os.makedirs(save_dir, exist_ok=True)
    out_path = os.path.join(save_dir, f"FPB_MultiLLM_{args.dataset}.csv")
    df.to_csv(out_path, index=False)
    print(f"✅ Saved expert-signal features to: {out_path}")
    print("   Added columns: MultiLLM_L1_distance, MultiLLM_L1_similarity, "
          "MultiLLM_KL_F_to_R, MultiLLM_KL_R_to_F, MultiLLM_agree")
|
FPB Prob Features.py
ADDED
|
@@ -0,0 +1,84 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import argparse
|
| 2 |
+
import os
|
| 3 |
+
#!/usr/bin/env python3
|
| 4 |
+
import argparse
|
| 5 |
+
import os
|
| 6 |
+
import numpy as np
|
| 7 |
+
import pandas as pd
|
| 8 |
+
|
| 9 |
+
EPS = 1e-12
|
| 10 |
+
|
| 11 |
+
# ---------- Helper functions ----------
|
| 12 |
+
def logit(p: np.ndarray) -> np.ndarray:
    """Compute logit(p) = log(p / (1 - p)), clipping p into [EPS, 1 - EPS] first."""
    clipped = np.clip(p, EPS, 1.0 - EPS)
    return np.log(clipped / (1.0 - clipped))
|
| 16 |
+
|
| 17 |
+
def entropy(p: np.ndarray) -> np.ndarray:
    """Row-wise Shannon entropy H(p) = -Σ p·log(p), with p clipped into [EPS, 1]."""
    safe = np.clip(p, EPS, 1.0)
    return -(safe * np.log(safe)).sum(axis=1)
|
| 21 |
+
|
| 22 |
+
def top2_margin(p: np.ndarray) -> np.ndarray:
    """Row-wise confidence margin: largest probability minus the second largest."""
    ordered = np.sort(p, axis=1)
    return ordered[:, -1] - ordered[:, -2]
|
| 26 |
+
|
| 27 |
+
# ---------- Main ----------
|
| 28 |
+
def main():
    """CLI: derive logit/max-prob/margin/entropy features from raw probability CSV."""
    ap = argparse.ArgumentParser(description="Compute probability-derived features (logit, max prob, margin, entropy).")
    ap.add_argument("--input", required=True,
                    help="Path to FinSent_*_raw_probs.csv from FinBERT/RoBERTa step.")
    ap.add_argument("--out_file", default=None,
                    help="Output CSV (default: adds _prob_features to filename).")
    ap.add_argument("--out_dir", default="outputs", help="Base output directory")
    ap.add_argument("--out_subdir", default="prob features", help="Subdirectory under out_dir to save prob features")
    args = ap.parse_args()

    df = pd.read_csv(args.input)

    # Verify the six required probability columns are present.
    req = [
        "fin_p_neg","fin_p_neu","fin_p_pos",
        "rob_p_neg","rob_p_neu","rob_p_pos"
    ]
    missing = [c for c in req if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # Per-model probability matrices in [neg, neu, pos] order.
    p_fin = df[["fin_p_neg","fin_p_neu","fin_p_pos"]].to_numpy(dtype=float)
    p_rob = df[["rob_p_neg","rob_p_neu","rob_p_pos"]].to_numpy(dtype=float)

    # ---- FinBERT features ----
    fin_logit = logit(p_fin)
    for i, cls in enumerate(["neg","neu","pos"]):
        df[f"fin_logit_{cls}"] = fin_logit[:, i]
    df["fin_max_prob"] = np.max(p_fin, axis=1)
    df["fin_margin"] = top2_margin(p_fin)
    df["fin_entropy"] = entropy(p_fin)

    # ---- RoBERTa features ----
    rob_logit = logit(p_rob)
    for i, cls in enumerate(["neg","neu","pos"]):
        df[f"rob_logit_{cls}"] = rob_logit[:, i]
    df["rob_max_prob"] = np.max(p_rob, axis=1)
    df["rob_margin"] = top2_margin(p_rob)
    df["rob_entropy"] = entropy(p_rob)

    # Save: explicit --out_file wins; otherwise derive a name under out_dir/out_subdir.
    root, ext = os.path.splitext(args.input)
    if args.out_file:
        out_path = args.out_file
    else:
        # save into outputs/<out_subdir>/ by default
        save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
        os.makedirs(save_dir, exist_ok=True)
        base = os.path.basename(root)
        out_path = os.path.join(save_dir, f"{base}_prob_features.csv")
    df.to_csv(out_path, index=False)
    print(f"[✓] Saved probability-derived features to: {out_path}")
    print("Added columns: fin/rob logits, max_prob, margin, entropy")
|
FPB_Structured_Financial_Semantics.py
ADDED
|
@@ -0,0 +1,120 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
|
| 2 |
+
import re
|
| 3 |
+
from typing import Dict, List
|
| 4 |
+
|
| 5 |
+
def _compile(patterns: List[str], flags=re.IGNORECASE):
|
| 6 |
+
return [re.compile(p, flags=flags) for p in patterns]
|
| 7 |
+
|
| 8 |
+
def _any_match(text: str, regs) -> bool:
|
| 9 |
+
return any(r.search(text) for r in regs)
|
| 10 |
+
|
| 11 |
+
# Operators per FinSentLLM Table 1
# Each table below is a list of pre-compiled, case-insensitive regexes; a text
# triggers the corresponding flag if ANY pattern in its table matches.

# Comparative statements: "compared to/with", "versus", and "from X to Y" ranges.
_COMPARATIVE = _compile([
    r"\bcompared\s+to\b",
    r"\bcompared\s+with\b",
    r"\bversus\b",
    r"\bvs\.?\b",
    r"\bfrom\s+[-+]?\d+(?:\.\d+)?\s*(?:%|percent|percentage|[A-Za-z]+)?\s+to\s+[-+]?\d+(?:\.\d+)?\s*(?:%|percent|percentage|[A-Za-z]+)?\b",
    r"\bfrom\s+[A-Za-z0-9\.,%-]+\s+to\s+[A-Za-z0-9\.,%-]+\b",
])

# Losses shrinking or a swing back to profit.
_LOSS_IMPROVE = _compile([
    r"\bloss(?:es)?\s+(?:narrowed|shr[aou]nk|decreased|fell|reduced)\b",
    r"\bturn(?:ed)?\s+to\s+(?:profit|black)\b",
])
# Losses growing or a swing into the red.
_LOSS_WORSEN = _compile([
    r"\bloss(?:es)?\s+(?:widened|grew|increased|rose|deepened)\b",
    r"\bturn(?:ed)?\s+to\s+(?:loss|red)\b",
])

# Profit metrics rising (both "metric ... verb" and "verb ... metric" orders).
_PROFIT_UP = _compile([
    r"\b(profit|profits|net\s+income|earnings|ebit|ebitda|eps|roe|roi|return(?:s)?(?:\s+on\s+equity)?)\b.*\b(rose|grew|increased|up|higher|improved|jumped|surged|soared)\b",
    r"\b(rose|grew|increased|up|higher|improved|jumped|surged|soared)\b.*\b(profit|profits|net\s+income|earnings|ebit|ebitda|eps|roe|roi|return(?:s)?(?:\s+on\s+equity)?)\b",
])

# Costs/expenses falling (both word orders).
_COST_DOWN = _compile([
    r"\b(cost|costs|expenses|opex|operating\s+expense(?:s)?)\b.*\b(fell|declined|decreased|lower|reduced|down)\b",
    r"\b(fell|declined|decreased|lower|reduced|down)\b.*\b(cost|costs|expenses|opex|operating\s+expense(?:s)?)\b",
])

# Contracts, securities issuance, and debt-financing vocabulary.
_CONTRACT_FIN = _compile([
    r"\b(agreement|deal|contract|order|purchase\s+order|framework\s+agreement)\b",
    r"\b(bond|notes?|debenture|convertible|placement|issuance|issue|offering|ipo|follow-?on)\b",
    r"\b(loan|credit\s+facility|credit\s+line|revolver|revolving\s+credit|financing)\b",
])

# Uncertainty language, one-off charges, and cautious outlook phrasing.
_UNCERTAIN = _compile([
    r"\b(uncertain|uncertainty|cannot\s+be\s+determined|not\s+clear|unknown|unpredictable)\b",
    r"\b(impairment|write-?down|one-?off|exceptional\s+(?:item|charge)|non-?recurring)\b",
    r"\b(outlook\s+(?:uncertain|cloudy|cautious))\b",
])

# Guidance reaffirmed / expectations of stability.
_STABLE_GUIDE = _compile([
    r"\b(expects?|expected|expects\s+to|guidance|forecast|outlook)\b.*\b(remain(?:s|ed|ing)?\s+(?:stable|unchanged)|in[-\s]?line)\b",
    r"\b(reiterated|maintained)\s+(?:its\s+)?(guidance|forecast|outlook)\b",
])

# Operational/corporate events: restructuring, bans, legal actions, layoffs.
_OPERATIONAL = _compile([
    r"\b(restructuring|reorganization|spin-?off|divest(?:iture)?|asset\s+sale)\b",
    r"\b(ban|suspension|halted|blocked|prohibited)\b",
    r"\b(recall|probe|investigation|lawsuit|litigation|settlement)\b",
    r"\b(layoffs?|headcount\s+reduction|cut\s+jobs|hiring\s+freeze)\b",
])
|
| 63 |
+
|
| 64 |
+
def extract_semantic_flags(text: str) -> Dict[str, int]:
    """Return binary (0/1) structured-semantics flags for one sentence.

    The text is stripped and lower-cased once; each flag is 1 when any
    pattern in its regex group matches, otherwise 0.
    """
    lowered = text.strip().lower()
    # Flag name -> compiled pattern group, in output-column order.
    groups = [
        ("sem_compared", _COMPARATIVE),
        ("sem_loss_improve", _LOSS_IMPROVE),
        ("sem_loss_worsen", _LOSS_WORSEN),
        ("sem_profit_up", _PROFIT_UP),
        ("sem_cost_down", _COST_DOWN),
        ("sem_contract_fin", _CONTRACT_FIN),
        ("sem_uncertainty", _UNCERTAIN),
        ("sem_stable_guidance", _STABLE_GUIDE),
        ("sem_operational", _OPERATIONAL),
    ]
    return {name: int(_any_match(lowered, patterns)) for name, patterns in groups}

# ============================================================
# Run directly from terminal
# ============================================================
if __name__ == "__main__":
    import argparse
    import pandas as pd
    from pathlib import Path

    # CLI: read FPB sentences (txt or CSV), emit one row per sentence with
    # the sem_* flag columns appended.
    parser = argparse.ArgumentParser(description="Extract Structured Financial Semantics from FPB text file.")
    parser.add_argument("--input", required=True, help="Path to Sentences_*.txt or a CSV with text column.")
    parser.add_argument("--out", required=True, help="Output CSV path.")
    parser.add_argument("--text_col", default="sentence", help="Column name if input is CSV.")
    args = parser.parse_args()

    def parse_fpb_line(line):
        """Split an FPB 'sentence@label' line into (sentence, label)."""
        # Same precedence as the original if/elif chain: positive first.
        for tag in ("@positive", "@negative", "@neutral"):
            if tag in line:
                return line.rsplit(tag, 1)[0].strip(), tag[1:]
        return line.strip(), ""

    path = Path(args.input)
    if path.suffix.lower() == ".txt":
        rows = []
        with open(path, "r", encoding="utf-8", errors="ignore") as f:
            for idx, raw_line in enumerate(f):
                sentence, label = parse_fpb_line(raw_line)
                if sentence:
                    rows.append({"id": idx, args.text_col: sentence, "label": label})
        df = pd.DataFrame(rows)
    else:
        df = pd.read_csv(path)

    # Expand each sentence's flag dict into one binary column per flag.
    df_feats = df[args.text_col].astype(str).apply(extract_semantic_flags).apply(pd.Series)
    df_out = pd.concat([df, df_feats], axis=1)
    df_out.to_csv(args.out, index=False)
    print(f"Saved structured semantics to: {args.out}")
    print("Columns:", [c for c in df_out.columns if c.startswith('sem_')])