# FPB Multi-LLM .py
# Compute expert-signal features from multiple LLMs (FinBERT, RoBERTa)
import argparse
import os

import numpy as np
import pandas as pd
from scipy.special import rel_entr

# Lower clip bound applied to probabilities in kl_divergence so that the
# ratio p/q never has a zero numerator or denominator.
EPS = 1e-12


def row_normalize(a: np.ndarray) -> np.ndarray:
    """Divide each row of *a* by its row sum.

    Rows whose sum is exactly 0 are divided by 1 instead, so they are
    returned unchanged rather than producing a division by zero.

    Args:
        a: 2-D array; the sum is taken along axis 1.

    Returns:
        A new array of the same shape as *a*.
    """
    totals = a.sum(axis=1, keepdims=True)
    # Swap zero sums for 1.0 so those rows pass through untouched.
    safe_totals = np.where(totals == 0.0, 1.0, totals)
    return a / safe_totals


def kl_divergence(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Return the row-wise KL divergence KL(P || Q).

    Both inputs are clipped to [EPS, 1.0] before the computation. The
    clipped rows are not renormalized afterwards.

    Args:
        P: 2-D array, one distribution per row.
        Q: 2-D array with the same shape as *P*.

    Returns:
        1-D array with one divergence value per row.
    """
    P = np.clip(P, EPS, 1.0)
    Q = np.clip(Q, EPS, 1.0)
    return np.sum(rel_entr(P, Q), axis=1)  # Σ p * log(p/q)


def l1_distance(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Return half the row-wise L1 distance between *P* and *Q*.

    Args:
        P: 2-D array, one distribution per row.
        Q: 2-D array with the same shape as *P*.

    Returns:
        1-D array with one value per row; the value lies in [0, 1] when
        both rows are probability distributions (non-negative, sum to 1).
    """
    return 0.5 * np.sum(np.abs(P - Q), axis=1)  # ∈ [0,1]


def load_probs(df: pd.DataFrame, cols: list, label: str) -> np.ndarray:
    """Extract the columns *cols* from *df* as a row-normalized float array.

    Args:
        df: Source table.
        cols: Column names to extract, in the order they should appear.
        label: Name used as a prefix in the error message.

    Returns:
        Array of shape (len(df), len(cols)) passed through row_normalize.

    Raises:
        ValueError: If any name in *cols* is not a column of *df*.
    """
    missing = [c for c in cols if c not in df.columns]
    if missing:
        raise ValueError(f"[{label}] Missing columns: {missing}")
    arr = df[cols].to_numpy(dtype=float)
    return row_normalize(arr)


def main(argv=None):
    """Command-line entry point: read a CSV, add the MultiLLM_* columns, save.

    Reads the CSV given by ``--input``, computes the L1 distance/similarity,
    both KL directions and an argmax-agreement flag between the two models'
    probability columns, and writes the augmented table to
    ``<out_dir>/<out_subdir>/FPB_MultiLLM_<dataset>.csv``.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.
    """
    ap = argparse.ArgumentParser(
        description="Compute Multi-LLM expert signals (Stage 3 only)."
    )
    ap.add_argument("--input", required=True,
                    help="CSV with per-model probabilities")
    ap.add_argument("--dataset", required=True,
                    help="Dataset tag, e.g., 50Agree | AllAgree")
    ap.add_argument("--out_dir", default=".", help="Output directory")
    ap.add_argument("--out_subdir", default="MultiLLM",
                    help="Subdirectory under out_dir to save multi-llm features")

    # Column names: default to common names; override if your headers differ
    ap.add_argument("--fin_cols", nargs=3,
                    default=["fin_p_neg", "fin_p_neu", "fin_p_pos"],
                    help="FinBERT prob columns [neg neu pos]")
    ap.add_argument("--rob_cols", nargs=3,
                    default=["rob_p_neg", "rob_p_neu", "rob_p_pos"],
                    help="RoBERTa prob columns [neg neu pos]")
    args = ap.parse_args(argv)

    df = pd.read_csv(args.input)

    # Load & normalize probabilities
    P = load_probs(df, args.fin_cols, "FinBERT")
    Q = load_probs(df, args.rob_cols, "RoBERTa")

    # Expert-signal features (paper)
    df["MultiLLM_L1_distance"] = l1_distance(P, Q)
    df["MultiLLM_L1_similarity"] = 1.0 - df["MultiLLM_L1_distance"]
    df["MultiLLM_KL_F_to_R"] = kl_divergence(P, Q)
    df["MultiLLM_KL_R_to_F"] = kl_divergence(Q, P)

    # Optional: simple agreement flag (same argmax class)
    df["MultiLLM_agree"] = (
        np.argmax(P, axis=1) == np.argmax(Q, axis=1)
    ).astype(int)

    save_dir = args.out_dir
    if args.out_subdir:
        save_dir = os.path.join(args.out_dir, args.out_subdir)
    # Create the target directory whether or not a subdirectory was given;
    # an empty path means the current directory, which makedirs rejects.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    out_path = os.path.join(save_dir, f"FPB_MultiLLM_{args.dataset}.csv")
    df.to_csv(out_path, index=False)

    print(f"✅ Saved expert-signal features to: {out_path}")
    print(" Added columns: MultiLLM_L1_distance, MultiLLM_L1_similarity, "
          "MultiLLM_KL_F_to_R, MultiLLM_KL_R_to_F, MultiLLM_agree")


if __name__ == "__main__":
    main()