| """ |
| Numeric metadata features for hybrid LR (dual-input pipeline). |
| """ |
|
|
| from __future__ import annotations |
|
|
| import pandas as pd |
|
|
|
|
# Names and order of the columns returned by ``extract_metadata_features``.
DEFAULT_METADATA_COLUMNS = [
    "char_length",
    "word_count",
    "exclamation_ratio",
    "question_ratio",
    "caps_ratio",
]


def extract_metadata_features(
    df: pd.DataFrame,
    *,
    text_column: str = "Text",
    existing_stats: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """
    Build the numeric metadata features listed in ``DEFAULT_METADATA_COLUMNS``.

    ``char_length`` and ``word_count`` are copied from ``existing_stats`` when
    that frame provides them; otherwise they are computed from the text. The
    three ratio features are always computed from the text.

    Parameters
    ----------
    df:
        Frame holding the text column. Missing text values are treated as the
        empty string.
    text_column:
        Name of the column in ``df`` that holds the text.
    existing_stats:
        Optional frame of precomputed statistics. Its rows are matched to
        ``df`` by position (not by index label), so it must have the same
        number of rows as ``df``.

    Returns
    -------
    pd.DataFrame
        Float frame indexed like ``df`` with exactly the columns in
        ``DEFAULT_METADATA_COLUMNS``, in that order.

    Raises
    ------
    KeyError
        If ``text_column`` is not a column of ``df``.
    ValueError
        If a column taken from ``existing_stats`` has a different length
        than ``df``.
    """
    text = df[text_column].fillna("").astype(str)
    char_count = text.str.len()
    out = pd.DataFrame(index=df.index)

    # Only the statistics that are actually returned are taken over. A label
    # count is deliberately not derived here: it is not part of the returned
    # columns, and computing it required casting arbitrary ``Is*`` columns of
    # ``df`` to int, which could fail on unrelated data.
    if existing_stats is not None:
        for col in ("char_length", "word_count"):
            if col in existing_stats.columns:
                # ``.values`` drops the index so rows are aligned by position.
                out[col] = existing_stats[col].values

    if "char_length" not in out.columns:
        out["char_length"] = char_count
    if "word_count" not in out.columns:
        out["word_count"] = text.str.split().str.len()

    # Clip the denominator to 1 so empty texts yield a ratio of 0 instead of
    # a division by zero.
    denominator = char_count.clip(lower=1)
    out["exclamation_ratio"] = text.str.count("!") / denominator
    out["question_ratio"] = text.str.count(r"\?") / denominator
    # ``str.isupper`` is Unicode-aware, so non-ASCII capitals count as well.
    out["caps_ratio"] = text.apply(
        lambda s: sum(map(str.isupper, s)) / max(len(s), 1)
    )

    return out[DEFAULT_METADATA_COLUMNS].astype(float)
|
|