# FinSentLLM / FPB Prob Features.py
# jennyyu009 - Upload 5 files - 23a8f6d (verified)
import argparse
import os
#!/usr/bin/env python3
import argparse
import os
import numpy as np
import pandas as pd
# Clamp bound used by the helpers below to keep log() and division away
# from exact 0 and 1.
EPS = 1e-12


# ---------- Helper functions ----------
def logit(p: np.ndarray) -> np.ndarray:
    """Return the element-wise log-odds ``log(p / (1 - p))`` of ``p``.

    Values are first clamped to ``[EPS, 1 - EPS]`` so that inputs of exactly
    0 or 1 yield large finite numbers instead of ``-inf`` / ``+inf``.
    """
    clipped = np.clip(p, EPS, 1.0 - EPS)
    odds = clipped / (1.0 - clipped)
    return np.log(odds)
def entropy(p: np.ndarray) -> np.ndarray:
    """Shannon entropy in nats: H(p) = -Σ p log p.

    The sum runs over the last axis, so ``p`` may be a single distribution of
    shape ``(n_classes,)`` or a batch of shape ``(..., n_classes)``.

    Args:
        p: Array-like of probabilities; each vector along the last axis is
            treated as one distribution.

    Returns:
        The entropy of every distribution, i.e. ``p`` with its last axis
        reduced away (a scalar for 1-D input).
    """
    # Work on a plain ndarray so the negative axis below is always handled
    # by NumPy itself, whatever array-like the caller passed in.
    p = np.asarray(p)
    # Raise zeros to EPS so log(0) never yields -inf / NaN; a clamped entry
    # contributes only about EPS * log(EPS) to the sum.
    p = np.clip(p, EPS, 1.0)
    return -np.sum(p * np.log(p), axis=-1)
def top2_margin(p: np.ndarray) -> np.ndarray:
    """Margin between the two largest entries: top1(p) - top2(p).

    The margin is taken along the last axis, so ``p`` may be a single vector
    of shape ``(n_classes,)`` or a batch of shape ``(..., n_classes)``.

    Args:
        p: Array-like of scores or probabilities.

    Returns:
        The largest minus the second-largest value of every vector, i.e.
        ``p`` with its last axis reduced away (a scalar for 1-D input).

    Raises:
        IndexError: If the last axis holds fewer than two entries.
    """
    # Ascending sort puts the top-1 and top-2 values in the last two slots.
    s = np.sort(p, axis=-1)
    return s[..., -1] - s[..., -2]
# Class suffixes and model prefixes that make up the "<model>_p_<class>"
# probability column names expected in the input CSV.
_CLASSES = ("neg", "neu", "pos")
_MODEL_PREFIXES = ("fin", "rob")


def _add_prob_features(df: pd.DataFrame, prefix: str) -> None:
    """Add logit, max-prob, margin and entropy columns for one model, in place."""
    probs = df[[f"{prefix}_p_{cls}" for cls in _CLASSES]].to_numpy(dtype=float)
    logits = logit(probs)
    for i, cls in enumerate(_CLASSES):
        df[f"{prefix}_logit_{cls}"] = logits[:, i]
    df[f"{prefix}_max_prob"] = np.max(probs, axis=1)
    df[f"{prefix}_margin"] = top2_margin(probs)
    df[f"{prefix}_entropy"] = entropy(probs)


# ---------- Main ----------
def main(argv=None):
    """Read a raw-probabilities CSV, append derived features, and save it.

    For each model prefix (``fin`` and ``rob``) the per-class logits, the
    maximum probability, the top-2 margin and the entropy are added as new
    columns; the original columns are kept.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the default)
            makes argparse read ``sys.argv[1:]``.

    Raises:
        ValueError: If any required probability column is missing from the
            input CSV.
    """
    ap = argparse.ArgumentParser(description="Compute probability-derived features (logit, max prob, margin, entropy).")
    ap.add_argument("--input", required=True,
                    help="Path to FinSent_*_raw_probs.csv from FinBERT/RoBERTa step.")
    ap.add_argument("--out_file", default=None,
                    help="Output CSV (default: adds _prob_features to filename).")
    ap.add_argument("--out_dir", default="outputs", help="Base output directory")
    ap.add_argument("--out_subdir", default="prob features", help="Subdirectory under out_dir to save prob features")
    args = ap.parse_args(argv)

    df = pd.read_csv(args.input)

    # Check probability columns
    req = [f"{prefix}_p_{cls}" for prefix in _MODEL_PREFIXES for cls in _CLASSES]
    missing = [c for c in req if c not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    # FinBERT features first, then RoBERTa, so the output column order is stable.
    for prefix in _MODEL_PREFIXES:
        _add_prob_features(df, prefix)

    # Save
    if args.out_file:
        out_path = args.out_file
        save_dir = os.path.dirname(out_path)
    else:
        # save into outputs/<out_subdir>/ by default
        save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
        base = os.path.basename(os.path.splitext(args.input)[0])
        out_path = os.path.join(save_dir, f"{base}_prob_features.csv")
    # An empty save_dir means the current directory; os.makedirs("") would raise.
    if save_dir:
        os.makedirs(save_dir, exist_ok=True)

    df.to_csv(out_path, index=False)
    print(f"[✓] Saved probability-derived features to: {out_path}")
    print("Added columns: fin/rob logits, max_prob, margin, entropy")


if __name__ == "__main__":
    main()