# FinSentLLM / FPB Multi-LLM .py
# Uploaded by jennyyu009 ("Upload 5 files", commit 23a8f6d, verified)
# FPB Multi-LLM .py
# Compute expert-signal features from multiple LLMs (FinBERT, RoBERTa).
import argparse
import os
import numpy as np
import pandas as pd
from scipy.special import rel_entr
# Lower bound used when clipping probabilities in kl_divergence, so that
# zero entries never reach the log / ratio computation.
EPS = 1e-12
def row_normalize(a: np.ndarray) -> np.ndarray:
    """Rescale every row of ``a`` so that it sums to one.

    Rows whose sum is exactly zero are divided by 1 instead of 0, so they
    come back unchanged rather than triggering a division by zero. The
    input array itself is not modified.
    """
    row_totals = a.sum(axis=1, keepdims=True)
    # Swap zero totals for 1 in the freshly created totals array.
    np.putmask(row_totals, row_totals == 0.0, 1.0)
    return a / row_totals
def kl_divergence(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Row-wise Kullback-Leibler divergence KL(P || Q) = Σ p * log(p / q).

    Both inputs are first clipped into ``[EPS, 1.0]`` so that zero entries
    cannot reach the logarithm or the ratio. The per-element terms are
    then summed along axis 1, giving one value per row.
    """
    p_safe = np.clip(P, EPS, 1.0)
    q_safe = np.clip(Q, EPS, 1.0)
    per_element = rel_entr(p_safe, q_safe)
    return per_element.sum(axis=1)
def l1_distance(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Half of the row-wise L1 distance between ``P`` and ``Q``.

    When each row of both inputs is a probability distribution, the result
    lies in [0, 1] (0 for identical rows, 1 for disjoint support).
    """
    abs_gap = np.abs(P - Q)
    return abs_gap.sum(axis=1) * 0.5
def load_probs(df: pd.DataFrame, cols: list, label: str) -> np.ndarray:
    """Extract the columns ``cols`` from ``df`` as a row-normalized float array.

    Args:
        df: Table holding the probability columns.
        cols: Column names to extract, in the order they should appear.
        label: Tag used as the prefix of the error message.

    Returns:
        The selected columns converted to float and passed through
        ``row_normalize``.

    Raises:
        ValueError: If any name in ``cols`` is not a column of ``df``.
    """
    absent = [name for name in cols if name not in df.columns]
    if absent:
        raise ValueError(f"[{label}] Missing columns: {absent}")

    raw_values = df[cols].to_numpy(dtype=float)
    return row_normalize(raw_values)
def main():
    """Command-line entry point: append Multi-LLM expert-signal columns to a CSV.

    Reads the ``--input`` CSV, loads the FinBERT and RoBERTa probability
    columns, computes the L1 distance and similarity, the KL divergence in
    both directions and an argmax agreement flag, then writes the augmented
    table to ``<out_dir>/<out_subdir>/FPB_MultiLLM_<dataset>.csv``.
    """
    parser = argparse.ArgumentParser(
        description="Compute Multi-LLM expert signals (Stage 3 only)."
    )
    parser.add_argument("--input", required=True, help="CSV with per-model probabilities")
    parser.add_argument("--dataset", required=True, help="Dataset tag, e.g., 50Agree | AllAgree")
    parser.add_argument("--out_dir", default=".", help="Output directory")
    parser.add_argument(
        "--out_subdir",
        default="MultiLLM",
        help="Subdirectory under out_dir to save multi-llm features",
    )
    # Probability column names; the defaults can be overridden when the CSV
    # headers differ.
    parser.add_argument(
        "--fin_cols",
        nargs=3,
        default=["fin_p_neg", "fin_p_neu", "fin_p_pos"],
        help="FinBERT prob columns [neg neu pos]",
    )
    parser.add_argument(
        "--rob_cols",
        nargs=3,
        default=["rob_p_neg", "rob_p_neu", "rob_p_pos"],
        help="RoBERTa prob columns [neg neu pos]",
    )
    cli = parser.parse_args()

    frame = pd.read_csv(cli.input)

    # Row-normalized probability matrices, one per model.
    fin_probs = load_probs(frame, cli.fin_cols, "FinBERT")
    rob_probs = load_probs(frame, cli.rob_cols, "RoBERTa")

    # Expert-signal features, listed in the order they are appended.
    distance = l1_distance(fin_probs, rob_probs)
    same_top_class = np.argmax(fin_probs, axis=1) == np.argmax(rob_probs, axis=1)
    features = {
        "MultiLLM_L1_distance": distance,
        "MultiLLM_L1_similarity": 1.0 - distance,
        "MultiLLM_KL_F_to_R": kl_divergence(fin_probs, rob_probs),
        "MultiLLM_KL_R_to_F": kl_divergence(rob_probs, fin_probs),
        # 1 when both models pick the same argmax class, else 0.
        "MultiLLM_agree": same_top_class.astype(int),
    }
    for column, values in features.items():
        frame[column] = values

    # An empty --out_subdir writes straight into --out_dir.
    save_dir = (
        os.path.join(cli.out_dir, cli.out_subdir) if cli.out_subdir else cli.out_dir
    )
    os.makedirs(save_dir, exist_ok=True)

    out_path = os.path.join(save_dir, f"FPB_MultiLLM_{cli.dataset}.csv")
    frame.to_csv(out_path, index=False)

    print(f"✅ Saved expert-signal features to: {out_path}")
    print(" Added columns: MultiLLM_L1_distance, MultiLLM_L1_similarity, "
          "MultiLLM_KL_F_to_R, MultiLLM_KL_R_to_F, MultiLLM_agree")
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()