SignalMod / src /features /metadata_features.py
Mirae Kang
feat: implement new models and improve UI, #23
46cc63a
raw
history blame
1.73 kB
"""
Numeric metadata features for hybrid LR (dual-input pipeline).
"""
from __future__ import annotations
import pandas as pd
# Columns returned by ``extract_metadata_features``, in output order.
DEFAULT_METADATA_COLUMNS = [
    "char_length",
    "word_count",
    "exclamation_ratio",
    "question_ratio",
    "caps_ratio",
]

# Columns that may be copied from precomputed stats instead of being rebuilt.
_REUSABLE_STAT_COLUMNS = ("char_length", "word_count", "n_labels")


def _count_labels(df: pd.DataFrame) -> pd.Series | int:
    """Return the per-row sum of ``Is*`` flag columns (``IsToxic`` as fallback)."""
    # Non-string column names cannot be label flags; skipping them avoids an
    # AttributeError from ``startswith``.
    label_cols = [
        c
        for c in df.columns
        if isinstance(c, str) and c.startswith("Is") and c != "IsToxic"
    ]
    if label_cols:
        return df[label_cols].astype(int).sum(axis=1)
    if "IsToxic" in df.columns:
        return df["IsToxic"].astype(int)
    return 0


def extract_metadata_features(
    df: pd.DataFrame,
    *,
    text_column: str = "Text",
    existing_stats: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """
    Build the numeric metadata features used for LR fusion.

    Only the columns listed in ``DEFAULT_METADATA_COLUMNS`` are returned, so
    only those are computed. In particular ``n_labels`` is neither copied nor
    derived from the ``Is*`` label columns unless it is listed there; building
    it unconditionally made the call fail on frames whose label columns hold
    missing / non-numeric values or whose column names are not strings, even
    though the result was discarded.

    Args:
        df: Input frame; ``df[text_column]`` holds the raw text. Missing
            values are treated as empty strings.
        text_column: Name of the text column in ``df``.
        existing_stats: Optional precomputed stats. Its ``char_length`` and
            ``word_count`` columns are reused when present. Rows are matched
            to ``df`` by position, not by index label.

    Returns:
        A float DataFrame indexed like ``df`` with the columns of
        ``DEFAULT_METADATA_COLUMNS`` in that order.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
        ValueError: If a reused ``existing_stats`` column does not have as
            many rows as ``df``.
    """
    text = df[text_column].fillna("").astype(str)
    out = pd.DataFrame(index=df.index)
    wanted = set(DEFAULT_METADATA_COLUMNS)

    if existing_stats is not None:
        for col in _REUSABLE_STAT_COLUMNS:
            if col in wanted and col in existing_stats.columns:
                # ``.values`` drops the stats index: alignment is positional.
                out[col] = existing_stats[col].values

    char_length = text.str.len()
    if "char_length" not in out.columns:
        out["char_length"] = char_length
    if "word_count" not in out.columns:
        out["word_count"] = text.str.split().str.len()
    if "n_labels" in wanted and "n_labels" not in out.columns:
        out["n_labels"] = _count_labels(df)

    # Clip to 1 so empty texts yield a ratio of 0 instead of dividing by zero.
    denom = char_length.clip(lower=1)
    out["exclamation_ratio"] = text.str.count("!") / denom
    out["question_ratio"] = text.str.count(r"\?") / denom
    # ``str.isupper`` is applied per character so non-ASCII capitals count too.
    upper_counts = pd.Series(
        [sum(map(str.isupper, s)) for s in text],
        index=text.index,
        dtype="int64",
    )
    out["caps_ratio"] = upper_counts / denom

    return out[DEFAULT_METADATA_COLUMNS].astype(float)