lynn-twinkl committed on
Commit ·
7e21a53
1
Parent(s): df474fc
added clearer docstring
Browse files- functions/column_detection.py +89 -54
functions/column_detection.py
CHANGED
|
@@ -1,62 +1,97 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
import re
|
| 4 |
import string
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
continue
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
unique_ratio = series.nunique() / total if total else 0
|
| 25 |
-
|
| 26 |
-
# Weighted composite
|
| 27 |
-
weight_length = 0.4
|
| 28 |
-
weight_punct = 0.3
|
| 29 |
-
weight_unique = 0.3
|
| 30 |
-
norm_factor = 1e-9 # avoid dividing by 0
|
| 31 |
-
scores[col] = {
|
| 32 |
-
'avg_len': avg_len,
|
| 33 |
-
'avg_punct': avg_punct,
|
| 34 |
-
'unique_ratio': unique_ratio,
|
| 35 |
}
|
| 36 |
|
| 37 |
-
if not
|
| 38 |
-
return None
|
| 39 |
-
|
| 40 |
-
#
|
| 41 |
-
max_len
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
composite
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
column_detect.py ── tiny heuristics for finding ID and free‑text columns
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations # harmless on 3.11+, useful on 3.7‑3.10
|
| 5 |
import re
|
| 6 |
import string
|
| 7 |
+
from typing import Sequence, Dict, Tuple, Optional
|
| 8 |
+
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# --------- HELPER FUNCTIONS --------
|
| 13 |
+
|
| 14 |
+
def _max_or_eps(values, eps: float = 1e-9) -> float:
|
| 15 |
+
"""Avoid divide‑by‑zero during normalisation."""
|
| 16 |
+
return max(values) or eps
|
| 17 |
|
| 18 |
+
|
| 19 |
+
def _normalise(value: float, max_value: float) -> float:
|
| 20 |
+
return value / max_value if max_value else 0.0
|
| 21 |
+
|
| 22 |
+
## -------- DETECT FREEFORM COL FUNCTION ------------
|
| 23 |
+
|
| 24 |
+
def detect_freeform_col(
    df: pd.DataFrame,
    *,
    length_weight: float = 0.4,
    punct_weight: float = 0.3,
    unique_weight: float = 0.3,
    low_uniqueness_penalty: float = 0.4,
    name_boosts: dict[str, float] | None = None,
    min_score: float = 0.50,
    return_scores: bool = False,
) -> str | None | Tuple[str | None, Dict[str, float]]:
    """
    Guess which *object* column contains free-text answers or comments.

    A good free-text column tends to be longish, rich in punctuation,
    and fairly unique row-to-row.  Each object column receives a composite
    score (roughly in [0, 1] before boosts); the best-scoring column is
    returned when its score reaches ``min_score``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; only ``object``-dtype columns are considered.
    length_weight, punct_weight, unique_weight : float
        Weights of the three normalised metrics in the composite score.
    low_uniqueness_penalty : float
        Columns whose unique-row ratio falls below this threshold have
        their score halved (they are likely categorical, not free text).
    name_boosts : dict[str, float] | None
        e.g. ``{"additional_comment": 3.1, "usage_reason": 0.5}``
        Multiplicative factors applied if the token appears in the header.
        Matching is case-insensitive on both the token and the header.
    min_score : float
        Minimum composite score the winner must reach, else ``None``.
    return_scores : bool
        When true, also return the per-column score mapping.

    Returns
    -------
    str | None, or ``(str | None, dict[str, float])`` when ``return_scores``.
    """
    name_boosts = name_boosts or {}
    obj_cols = df.select_dtypes(include=["object"]).columns

    # quick exit — nothing to score
    if not obj_cols.size:
        return (None, {}) if return_scores else None

    # Character-class regex matching any single punctuation char, hoisted so
    # the punctuation count below runs vectorised (`Series.str.count`)
    # instead of a per-cell Python lambda.
    punct_re = f"[{re.escape(string.punctuation)}]"

    # pre-compute raw metrics
    raw: Dict[str, dict[str, float]] = {}
    for col in obj_cols:
        ser = df[col].dropna().astype(str)
        if ser.empty:
            continue
        raw[col] = {
            "avg_len": ser.str.len().mean(),
            "avg_punct": ser.str.count(punct_re).mean(),
            "unique_ratio": ser.nunique() / len(ser),
        }

    # every candidate column was all-NaN
    if not raw:
        return (None, {}) if return_scores else None

    # normalisers (eps-guarded so a zero maximum cannot divide by zero)
    max_len = _max_or_eps([m["avg_len"] for m in raw.values()])
    max_punc = _max_or_eps([m["avg_punct"] for m in raw.values()])

    # composite scores
    scores: Dict[str, float] = {}
    for col, m in raw.items():
        score = (
            length_weight * _normalise(m["avg_len"], max_len)
            + punct_weight * _normalise(m["avg_punct"], max_punc)
            + unique_weight * m["unique_ratio"]
        )

        # Header boosts / penalties.  Lowercase the token too — previously
        # an uppercase token could never match the lowercased header.
        col_lower = col.lower()
        for token, factor in name_boosts.items():
            if token.lower() in col_lower:
                score *= factor

        # penalise low uniqueness — repetitive columns are rarely free text
        if m["unique_ratio"] < low_uniqueness_penalty:
            score *= 0.5

        scores[col] = score

    best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
    passed = best_score >= min_score

    if return_scores:
        return (best_col if passed else None, scores)
    return best_col if passed else None
|
| 97 |
+
|