Spaces:

devrup404
/

SignalMod

Running

SignalMod / src /features /metadata_features.py

Mirae Kang

feat: implement new models and improve UI, #23

46cc63a 4 days ago

1.73 kB

	"""
	Numeric metadata features for hybrid LR (dual-input pipeline).
	"""

	from __future__ import annotations

	import pandas as pd


	# Names and order of the columns that ``extract_metadata_features`` returns;
	# that function selects exactly these columns from everything it computes.
	DEFAULT_METADATA_COLUMNS = [
	    # Size of the text.
	    "char_length",
	    "word_count",
	    # Counts divided by the text length in characters.
	    "exclamation_ratio",
	    "question_ratio",
	    "caps_ratio",
	]


	def extract_metadata_features(
	    df: pd.DataFrame,
	    *,
	    text_column: str = "Text",
	    existing_stats: pd.DataFrame | None = None,
	) -> pd.DataFrame:
	    """
	    Build the numeric metadata features for LR fusion.

	    ``char_length``, ``word_count`` and ``n_labels`` are taken from
	    ``existing_stats`` (e.g. ``comments_with_stats``) when it provides them;
	    anything missing is computed from the text. The punctuation and
	    capitalisation ratios are always computed from the text.

	    Args:
	        df: Frame holding the text column (and, optionally, ``Is*`` label
	            columns that are only consulted when ``n_labels`` is requested).
	        text_column: Name of the column in ``df`` that contains the text.
	            Missing values are treated as empty strings.
	        existing_stats: Optional frame with precomputed statistics. Its
	            values are copied by position, not by index, so it must have the
	            same number of rows as ``df`` in the same order.

	    Returns:
	        A float DataFrame indexed like ``df`` whose columns are exactly
	        ``DEFAULT_METADATA_COLUMNS``, in that order.

	    Raises:
	        KeyError: If ``text_column`` is not a column of ``df``.
	    """
	    text = df[text_column].fillna("").astype(str)
	    out = pd.DataFrame(index=df.index)

	    # Prefer precomputed statistics. ``.values`` drops the index on purpose so
	    # the numbers are attached row-by-row even if the two indexes differ.
	    if existing_stats is not None:
	        for col in ("char_length", "word_count", "n_labels"):
	            if col in existing_stats.columns:
	                out[col] = existing_stats[col].values

	    # Computed once: used as a feature and as the ratio denominator below.
	    char_length = text.str.len()

	    if "char_length" not in out.columns:
	        out["char_length"] = char_length
	    if "word_count" not in out.columns:
	        out["word_count"] = text.str.split().str.len()
	    # ``n_labels`` only matters when the returned column list asks for it.
	    # Skipping it otherwise avoids failing on label columns that cannot be
	    # cast to int, for a value that would be discarded anyway.
	    if "n_labels" in DEFAULT_METADATA_COLUMNS and "n_labels" not in out.columns:
	        out["n_labels"] = _count_labels(df)

	    # Clip to 1 so empty strings give a ratio of 0 instead of dividing by zero.
	    length = char_length.clip(lower=1)
	    out["exclamation_ratio"] = text.str.count("!") / length
	    out["question_ratio"] = text.str.count(r"\?") / length
	    out["caps_ratio"] = text.apply(_uppercase_ratio)

	    return out[DEFAULT_METADATA_COLUMNS].astype(float)


	def _count_labels(df: pd.DataFrame) -> pd.Series:
	    """Return the per-row label count derived from the ``Is*`` columns of ``df``."""
	    # Column labels are not guaranteed to be strings, so guard ``startswith``.
	    label_cols = [
	        c
	        for c in df.columns
	        if isinstance(c, str) and c.startswith("Is") and c != "IsToxic"
	    ]
	    if label_cols:
	        return df[label_cols].astype(int).sum(axis=1)
	    if "IsToxic" in df.columns:
	        # No other ``Is*`` columns: fall back to the ``IsToxic`` flag itself.
	        return df["IsToxic"].astype(int)
	    return pd.Series(0, index=df.index)


	def _uppercase_ratio(s: str) -> float:
	    """Return the fraction of characters in ``s`` that are uppercase (0 for "")."""
	    return sum(map(str.isupper, s)) / max(len(s), 1)