File size: 1,730 Bytes
46cc63a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
"""
Numeric metadata features for hybrid LR (dual-input pipeline).
"""

from __future__ import annotations

import pandas as pd


# Columns returned by ``extract_metadata_features``, in output order.
DEFAULT_METADATA_COLUMNS = [
    "char_length",
    "word_count",
    "exclamation_ratio",
    "question_ratio",
    "caps_ratio",
]


def extract_metadata_features(
    df: pd.DataFrame,
    *,
    text_column: str = "Text",
    existing_stats: pd.DataFrame | None = None,
) -> pd.DataFrame:
    """
    Build the numeric features listed in ``DEFAULT_METADATA_COLUMNS`` for LR fusion.

    ``char_length`` and ``word_count`` are taken from ``existing_stats`` (e.g. the
    ``comments_with_stats`` table) when it provides them; otherwise they are
    computed from the text. The three ratio features are always computed from
    the text.

    Only ``text_column`` is read from ``df``. Label columns (``Is*``) are never
    inspected, so missing or non-numeric labels and non-string column names
    cannot make feature extraction fail.

    Args:
        df: Frame containing ``text_column``. Missing text is treated as "".
        text_column: Name of the column holding the raw text.
        existing_stats: Optional frame of precomputed statistics. Its values are
            matched to ``df`` by row position (not by index), so it must have
            the same number of rows, in the same order, as ``df``.

    Returns:
        A float DataFrame indexed like ``df`` with exactly the columns in
        ``DEFAULT_METADATA_COLUMNS``.

    Raises:
        KeyError: If ``text_column`` is not a column of ``df``.
        ValueError: If a column used from ``existing_stats`` has a different
            number of rows than ``df``.
    """
    text = df[text_column].fillna("").astype(str)
    out = pd.DataFrame(index=df.index)

    if existing_stats is not None:
        for col in ("char_length", "word_count"):
            if col in existing_stats.columns:
                # ``.values`` drops the stats index: rows are paired by position.
                out[col] = existing_stats[col].values

    if "char_length" not in out.columns:
        out["char_length"] = text.str.len()
    if "word_count" not in out.columns:
        # Whitespace-delimited tokens; an empty string yields 0.
        out["word_count"] = text.str.split().str.len()

    # Clamp the denominator to 1 so empty strings give a ratio of 0, not NaN.
    length = text.str.len().clip(lower=1)
    out["exclamation_ratio"] = text.str.count("!") / length
    out["question_ratio"] = text.str.count(r"\?") / length
    # ``str.isupper`` per character keeps non-ASCII uppercase letters counted.
    out["caps_ratio"] = text.apply(
        lambda s: sum(1 for c in s if c.isupper()) / max(len(s), 1)
    )

    return out[DEFAULT_METADATA_COLUMNS].astype(float)