"""Derive probability-based features from FinBERT/RoBERTa class probabilities.

For each model the script adds per-class logits, the maximum probability, the
top-1/top-2 margin and the Shannon entropy to the rows of an input CSV.
"""

import argparse
import os

import numpy as np
import pandas as pd

# Clipping bound applied to probabilities before taking logarithms: it keeps
# values strictly positive (and, for the logit, strictly below 1) so the
# results stay finite.
EPS = 1e-12


def logit(p: np.ndarray, eps: "float | None" = None) -> np.ndarray:
    """Compute the element-wise log-odds ``log(p / (1 - p))``.

    Probabilities are clipped to ``[eps, 1 - eps]`` first, so exact 0 and 1
    map to large finite values instead of ``-inf`` / ``+inf``.

    Args:
        p: Array of probabilities (any shape).
        eps: Clipping bound. ``None`` (the default) uses the module-level
            ``EPS`` constant.

    Returns:
        Array with the same shape as ``p`` holding the logits.
    """
    if eps is None:
        eps = EPS
    clipped = np.clip(p, eps, 1.0 - eps)
    return np.log(clipped / (1.0 - clipped))


def entropy(p: np.ndarray, eps: "float | None" = None) -> np.ndarray:
    """Shannon entropy of each row, in nats: H(p) = -Σ p log p.

    Values are clipped to ``[eps, 1]`` before the logarithm so that zero
    probabilities contribute a negligible finite term rather than ``nan``.

    Args:
        p: 2-D array of probabilities; the sum runs over axis 1.
        eps: Lower clipping bound. ``None`` (the default) uses the
            module-level ``EPS`` constant.

    Returns:
        1-D array with one entropy value per row of ``p``.
    """
    if eps is None:
        eps = EPS
    clipped = np.clip(p, eps, 1.0)
    return -np.sum(clipped * np.log(clipped), axis=1)


def top2_margin(p: np.ndarray) -> np.ndarray:
    """Return the per-row gap between the largest and second-largest value.

    Args:
        p: 2-D array with one row per sample and at least two columns.

    Returns:
        1-D array holding ``top1 - top2`` for every row.
    """
    # Ascending sort along the class axis puts the two largest values last.
    ranked = np.sort(p, axis=1)
    return ranked[:, -1] - ranked[:, -2]


# Class suffixes and model prefixes of the probability columns in the input CSV.
_CLASSES = ("neg", "neu", "pos")
_MODEL_PREFIXES = ("fin", "rob")


def _add_prob_features(df: pd.DataFrame, prefix: str) -> None:
    """Add logit, max-prob, margin and entropy columns for one model, in place."""
    probs = df[[f"{prefix}_p_{cls}" for cls in _CLASSES]].to_numpy(dtype=float)

    logits = logit(probs)
    for idx, cls in enumerate(_CLASSES):
        df[f"{prefix}_logit_{cls}"] = logits[:, idx]

    df[f"{prefix}_max_prob"] = np.max(probs, axis=1)
    df[f"{prefix}_margin"] = top2_margin(probs)
    df[f"{prefix}_entropy"] = entropy(probs)


def main(argv=None):
    """Command-line entry point: read a probabilities CSV and write features.

    Reads the CSV given by ``--input``, checks that the six
    ``{fin,rob}_p_{neg,neu,pos}`` columns exist, appends the derived feature
    columns for both models and writes the result as CSV.

    Args:
        argv: Optional list of command-line arguments. ``None`` (the
            default) makes argparse read ``sys.argv[1:]``.

    Raises:
        ValueError: If any required probability column is missing.
    """
    ap = argparse.ArgumentParser(
        description="Compute probability-derived features (logit, max prob, margin, entropy)."
    )
    ap.add_argument("--input", required=True,
                    help="Path to FinSent_*_raw_probs.csv from FinBERT/RoBERTa step.")
    ap.add_argument("--out_file", default=None,
                    help="Output CSV (default: adds _prob_features to filename).")
    ap.add_argument("--out_dir", default="outputs", help="Base output directory")
    ap.add_argument("--out_subdir", default="prob features",
                    help="Subdirectory under out_dir to save prob features")
    args = ap.parse_args(argv)

    df = pd.read_csv(args.input)

    # Fail early with the full list of absent columns.
    required = [f"{prefix}_p_{cls}" for prefix in _MODEL_PREFIXES for cls in _CLASSES]
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise ValueError(f"Missing columns: {missing}")

    for prefix in _MODEL_PREFIXES:
        _add_prob_features(df, prefix)

    if args.out_file:
        out_path = args.out_file
        # Create the target directory here too, as the default branch does,
        # so a fresh path does not make to_csv fail.
        parent = os.path.dirname(out_path)
        if parent:
            os.makedirs(parent, exist_ok=True)
    else:
        save_dir = os.path.join(args.out_dir, args.out_subdir) if args.out_subdir else args.out_dir
        os.makedirs(save_dir, exist_ok=True)
        # Output name is the input file's base name without its extension.
        base = os.path.basename(os.path.splitext(args.input)[0])
        out_path = os.path.join(save_dir, f"{base}_prob_features.csv")

    df.to_csv(out_path, index=False)
    print(f"[✓] Saved probability-derived features to: {out_path}")
    print("Added columns: fin/rob logits, max_prob, margin, entropy")


# Run the CLI only when the file is executed as a script, not when imported.
if __name__ == "__main__":
    main()