#!/usr/bin/env python3
"""Compute probability-derived features from FinBERT / RoBERTa probabilities.

Reads a CSV holding per-class probabilities for two models (columns
``fin_p_{neg,neu,pos}`` and ``rob_p_{neg,neu,pos}``), appends per-class
logits, the maximum probability, the top-1/top-2 margin and the Shannon
entropy for each model, and writes the enriched table to a new CSV.
"""
import argparse
import os

import numpy as np
import pandas as pd

EPS = 1e-12

# Class suffixes and model prefixes used to build the input/output column names.
_CLASSES = ("neg", "neu", "pos")
_MODEL_PREFIXES = ("fin", "rob")


# ---------- Helper functions ----------
def logit(p: np.ndarray) -> np.ndarray:
    """Compute logit(p) = log(p / (1 - p)) element-wise.

    Values are clipped to ``[EPS, 1 - EPS]`` first so that p == 0 and p == 1
    yield large finite numbers instead of -inf / +inf.
    """
    p = np.clip(p, EPS, 1.0 - EPS)
    return np.log(p / (1.0 - p))


def entropy(p: np.ndarray) -> np.ndarray:
    """Shannon entropy H(p) = -Σ p log p of each row (natural log).

    ``p`` must be 2-D; the sum runs over axis 1. Values are clipped to
    ``[EPS, 1]`` so that zero probabilities do not produce ``log(0)``.
    """
    p = np.clip(p, EPS, 1.0)
    return -np.sum(p * np.log(p), axis=1)


def top2_margin(p: np.ndarray) -> np.ndarray:
    """Return, for each row, the largest value minus the second largest.

    ``p`` must be 2-D with at least two columns.
    """
    s = np.sort(p, axis=1)  # ascending, so the top two sit in the last columns
    return s[:, -1] - s[:, -2]


def _add_model_features(df: pd.DataFrame, prefix: str) -> None:
    """Append logit / max_prob / margin / entropy columns for one model, in place."""
    probs = df[[f"{prefix}_p_{cls}" for cls in _CLASSES]].to_numpy(dtype=float)

    logits = logit(probs)
    for i, cls in enumerate(_CLASSES):
        df[f"{prefix}_logit_{cls}"] = logits[:, i]

    df[f"{prefix}_max_prob"] = np.max(probs, axis=1)
    df[f"{prefix}_margin"] = top2_margin(probs)
    df[f"{prefix}_entropy"] = entropy(probs)


# ---------- Main ----------
def main(argv=None):
    """Command-line entry point.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) makes
            argparse read ``sys.argv[1:]``, which is the original behaviour.

    Raises:
        ValueError: If any required probability column is missing from the
            input CSV.
    """
    ap = argparse.ArgumentParser(description="Compute probability-derived features (logit, max prob, margin, entropy).")
    ap.add_argument("--input", required=True, help="Path to FinSent_*_raw_probs.csv from FinBERT/RoBERTa step.")
    ap.add_argument("--out_file", default=None, help="Output CSV (default: adds _prob_features to filename).")
    ap.add_argument("--out_dir", default="outputs", help="Base output directory")
    ap.add_argument("--out_subdir", default="prob features", help="Subdirectory under out_dir to save prob features")
    args = ap.parse_args(argv)

    df = pd.read_csv(args.input)

    # Check probability columns
    req = [f"{prefix}_p_{cls}" for prefix in _MODEL_PREFIXES for cls in _CLASSES]
    missing = [c for c in req if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # FinBERT features first, then RoBERTa, so the output column order is stable.
    for prefix in _MODEL_PREFIXES:
        _add_model_features(df, prefix)

    # Save
    if args.out_file:
        out_path = args.out_file
        # Create the parent directory so an explicit --out_file pointing into a
        # not-yet-existing folder works just like the default location does.
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    else:
        # save into <out_dir>/<out_subdir>/ by default
        save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
        os.makedirs(save_dir, exist_ok=True)
        base = os.path.basename(os.path.splitext(args.input)[0])
        out_path = os.path.join(save_dir, f"{base}_prob_features.csv")

    df.to_csv(out_path, index=False)
    print(f"[✓] Saved probability-derived features to: {out_path}")
    print("Added columns: fin/rob logits, max_prob, margin, entropy")


if __name__ == "__main__":
    main()