|
|
|
|
|
|
|
|
import argparse |
|
|
import os |
|
|
import numpy as np |
|
|
import pandas as pd |
|
|
from scipy.special import rel_entr |
|
|
|
|
|
# Lower bound that probabilities are clipped to before the KL computation,
# so that zero entries never reach the logarithm.
EPS = 1e-12
|
|
|
|
|
def row_normalize(a: np.ndarray) -> np.ndarray:
    """Scale every row of ``a`` so that it sums to one.

    Rows whose sum is exactly zero are divided by one instead, which leaves
    them unchanged rather than producing a division by zero.

    Args:
        a: Array with at least two dimensions; sums are taken along axis 1.

    Returns:
        A new array of the same shape holding the rescaled rows.
    """
    totals = a.sum(axis=1, keepdims=True)
    # Swap zero totals for 1.0 so the division below is always defined.
    divisors = np.where(totals == 0.0, 1.0, totals)
    return a / divisors
|
|
|
|
|
def kl_divergence(P: np.ndarray, Q: np.ndarray, eps: "float | None" = None) -> np.ndarray:
    """Return the row-wise Kullback-Leibler divergence KL(P || Q).

    Both inputs are clipped into ``[eps, 1]`` so that zero entries cannot
    produce infinite terms, then each row is renormalized to sum to one
    before the divergence is evaluated.

    Args:
        P: Array with one distribution per row (sums are taken along axis 1).
        Q: Array of reference distributions, broadcast-compatible with ``P``.
        eps: Positive lower clipping bound. When omitted, the module-level
            ``EPS`` constant is used.

    Returns:
        A 1-D array holding one non-negative divergence per row.
    """
    floor = EPS if eps is None else eps
    p = np.clip(P, floor, 1.0)
    q = np.clip(Q, floor, 1.0)

    # Clipping lifts zero entries, so the rows no longer sum to one. Without
    # renormalizing, the sum below is not a true divergence and can come out
    # slightly negative.
    p = p / p.sum(axis=1, keepdims=True)
    q = q / q.sum(axis=1, keepdims=True)

    # Floating-point rounding may still leave a value a few ulps below zero;
    # KL divergence is non-negative by definition, so clamp that noise away.
    return np.maximum(np.sum(rel_entr(p, q), axis=1), 0.0)
|
|
|
|
|
def l1_distance(P: np.ndarray, Q: np.ndarray) -> np.ndarray:
    """Return half of the row-wise L1 distance between ``P`` and ``Q``.

    Args:
        P: Array whose rows are compared against the rows of ``Q``.
        Q: Array broadcast-compatible with ``P``.

    Returns:
        A 1-D array with ``0.5 * sum(|P - Q|)`` computed along axis 1.
    """
    gaps = np.absolute(np.subtract(P, Q))
    return 0.5 * gaps.sum(axis=1)
|
|
|
|
|
def load_probs(df: pd.DataFrame, cols: list, label: str) -> np.ndarray:
    """Extract the probability columns of one model as a row-normalized array.

    Args:
        df: Frame holding the per-model probability columns.
        cols: Names of the columns to extract, in the order they should
            appear in the result.
        label: Model name, used only to prefix the error message.

    Returns:
        Float array with one row per frame row and one column per entry of
        ``cols``, passed through ``row_normalize``.

    Raises:
        ValueError: If any of ``cols`` is absent from ``df``.
    """
    absent = [name for name in cols if name not in df.columns]
    if absent:
        raise ValueError(f"[{label}] Missing columns: {absent}")

    return row_normalize(df[cols].to_numpy(dtype=float))
|
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main``."""
    parser = argparse.ArgumentParser(
        description="Compute Multi-LLM expert signals (Stage 3 only)."
    )
    parser.add_argument("--input", required=True, help="CSV with per-model probabilities")
    parser.add_argument("--dataset", required=True, help="Dataset tag, e.g., 50Agree | AllAgree")
    parser.add_argument("--out_dir", default=".", help="Output directory")
    parser.add_argument(
        "--out_subdir",
        default="MultiLLM",
        help="Subdirectory under out_dir to save multi-llm features",
    )
    parser.add_argument(
        "--fin_cols",
        nargs=3,
        default=["fin_p_neg", "fin_p_neu", "fin_p_pos"],
        help="FinBERT prob columns [neg neu pos]",
    )
    parser.add_argument(
        "--rob_cols",
        nargs=3,
        default=["rob_p_neg", "rob_p_neu", "rob_p_pos"],
        help="RoBERTa prob columns [neg neu pos]",
    )
    return parser


def main():
    """Read the input CSV, append the multi-model features, and save the result.

    The FinBERT and RoBERTa probability columns are loaded, then five columns
    are added to the frame: the halved L1 distance, its complement, the KL
    divergence in both directions, and a 0/1 flag marking rows where both
    models have the same argmax column. The frame is written to
    ``FPB_MultiLLM_<dataset>.csv`` under the chosen output directory.
    """
    args = _build_arg_parser().parse_args()
    frame = pd.read_csv(args.input)

    fin = load_probs(frame, args.fin_cols, "FinBERT")
    rob = load_probs(frame, args.rob_cols, "RoBERTa")

    # Distance-based signals; the similarity is the complement of the distance.
    distance = l1_distance(fin, rob)
    frame["MultiLLM_L1_distance"] = distance
    frame["MultiLLM_L1_similarity"] = 1.0 - distance

    # KL divergence is asymmetric, so both directions are stored.
    frame["MultiLLM_KL_F_to_R"] = kl_divergence(fin, rob)
    frame["MultiLLM_KL_R_to_F"] = kl_divergence(rob, fin)

    # 1 where both models put their largest probability in the same column.
    same_top = fin.argmax(axis=1) == rob.argmax(axis=1)
    frame["MultiLLM_agree"] = same_top.astype(int)

    # An empty --out_subdir means "write directly into --out_dir".
    save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
    os.makedirs(save_dir, exist_ok=True)

    out_path = os.path.join(save_dir, f"FPB_MultiLLM_{args.dataset}.csv")
    frame.to_csv(out_path, index=False)

    print(f"✅ Saved expert-signal features to: {out_path}")
    print(" Added columns: MultiLLM_L1_distance, MultiLLM_L1_similarity, "
          "MultiLLM_KL_F_to_R, MultiLLM_KL_R_to_F, MultiLLM_agree")
|
|
|
|
|
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|
|