lynn-twinkl committed on
Commit ·
7e21a53
1
Parent(s): df474fc
added clearer docstring
Browse files- functions/column_detection.py +89 -54
functions/column_detection.py
CHANGED
|
@@ -1,62 +1,97 @@
|
|
| 1 |
-
|
| 2 |
-
|
|
|
|
|
|
|
| 3 |
import re
|
| 4 |
import string
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 5 |
|
| 6 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
"""
|
| 8 |
-
|
| 9 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 10 |
"""
|
| 11 |
-
|
| 12 |
-
|
| 13 |
-
|
| 14 |
-
|
| 15 |
-
|
| 16 |
-
|
| 17 |
-
|
| 18 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 19 |
continue
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
|
| 23 |
-
|
| 24 |
-
unique_ratio = series.nunique() / total if total else 0
|
| 25 |
-
|
| 26 |
-
# Weighted composite
|
| 27 |
-
weight_length = 0.4
|
| 28 |
-
weight_punct = 0.3
|
| 29 |
-
weight_unique = 0.3
|
| 30 |
-
norm_factor = 1e-9 # avoid dividing by 0
|
| 31 |
-
scores[col] = {
|
| 32 |
-
'avg_len': avg_len,
|
| 33 |
-
'avg_punct': avg_punct,
|
| 34 |
-
'unique_ratio': unique_ratio,
|
| 35 |
}
|
| 36 |
|
| 37 |
-
if not
|
| 38 |
-
return None
|
| 39 |
-
|
| 40 |
-
#
|
| 41 |
-
max_len
|
| 42 |
-
|
| 43 |
-
|
| 44 |
-
composite
|
| 45 |
-
|
| 46 |
-
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
|
| 52 |
-
|
| 53 |
-
|
| 54 |
-
|
| 55 |
-
|
| 56 |
-
|
| 57 |
-
|
| 58 |
-
|
| 59 |
-
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
column_detect.py ── tiny heuristics for finding ID and free‑text columns
|
| 3 |
+
"""
|
| 4 |
+
from __future__ import annotations # harmless on 3.11+, useful on 3.7‑3.10
|
| 5 |
import re
|
| 6 |
import string
|
| 7 |
+
from typing import Sequence, Dict, Tuple, Optional
|
| 8 |
+
|
| 9 |
+
import pandas as pd
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# --------- HELPER FUNCTIONS --------
|
| 13 |
+
|
| 14 |
+
def _max_or_eps(values, eps: float = 1e-9) -> float:
|
| 15 |
+
"""Avoid divide‑by‑zero during normalisation."""
|
| 16 |
+
return max(values) or eps
|
| 17 |
|
| 18 |
+
|
| 19 |
+
def _normalise(value: float, max_value: float) -> float:
|
| 20 |
+
return value / max_value if max_value else 0.0
|
| 21 |
+
|
| 22 |
+
## -------- DETECT FREEFORM COL FUNCTION ------------
|
| 23 |
+
|
| 24 |
+
def detect_freeform_col(
    df: pd.DataFrame,
    *,
    length_weight: float = 0.4,
    punct_weight: float = 0.3,
    unique_weight: float = 0.3,
    low_uniqueness_penalty: float = 0.4,
    name_boosts: dict[str, float] | None = None,
    min_score: float = 0.50,
    return_scores: bool = False,
) -> str | None | Tuple[str | None, Dict[str, float]]:
    """
    Guess which *object* column contains free-text answers or comments.

    A good free-text column tends to be longish, rich in punctuation,
    and fairly unique row-to-row.  Each object column receives a composite
    score (roughly in [0, 1] before boosts); the best-scoring column is
    returned when its score reaches ``min_score``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to inspect; only ``object``-dtype columns are considered.
    length_weight, punct_weight, unique_weight : float
        Weights of the three normalised metrics in the composite score.
    low_uniqueness_penalty : float
        Columns whose unique-row ratio falls below this threshold have
        their score halved (they are likely categorical, not free text).
    name_boosts : dict[str, float] | None
        e.g. ``{"additional_comment": 3.1, "usage_reason": 0.5}``
        Multiplicative factors applied if the token appears in the header.
        Matching is case-insensitive on both the token and the header.
    min_score : float
        Minimum composite score the winner must reach, else ``None``.
    return_scores : bool
        When true, also return the per-column score mapping.

    Returns
    -------
    str | None, or ``(str | None, dict[str, float])`` when ``return_scores``.
    """
    name_boosts = name_boosts or {}
    obj_cols = df.select_dtypes(include=["object"]).columns

    # quick exit — nothing to score
    if not obj_cols.size:
        return (None, {}) if return_scores else None

    # Character-class regex matching any single punctuation char, hoisted so
    # the punctuation count below runs vectorised (`Series.str.count`)
    # instead of a per-cell Python lambda.
    punct_re = f"[{re.escape(string.punctuation)}]"

    # pre-compute raw metrics
    raw: Dict[str, dict[str, float]] = {}
    for col in obj_cols:
        ser = df[col].dropna().astype(str)
        if ser.empty:
            continue
        raw[col] = {
            "avg_len": ser.str.len().mean(),
            "avg_punct": ser.str.count(punct_re).mean(),
            "unique_ratio": ser.nunique() / len(ser),
        }

    # every candidate column was all-NaN
    if not raw:
        return (None, {}) if return_scores else None

    # normalisers (eps-guarded so a zero maximum cannot divide by zero)
    max_len = _max_or_eps([m["avg_len"] for m in raw.values()])
    max_punc = _max_or_eps([m["avg_punct"] for m in raw.values()])

    # composite scores
    scores: Dict[str, float] = {}
    for col, m in raw.items():
        score = (
            length_weight * _normalise(m["avg_len"], max_len)
            + punct_weight * _normalise(m["avg_punct"], max_punc)
            + unique_weight * m["unique_ratio"]
        )

        # Header boosts / penalties.  Lowercase the token too — previously
        # an uppercase token could never match the lowercased header.
        col_lower = col.lower()
        for token, factor in name_boosts.items():
            if token.lower() in col_lower:
                score *= factor

        # penalise low uniqueness — repetitive columns are rarely free text
        if m["unique_ratio"] < low_uniqueness_penalty:
            score *= 0.5

        scores[col] = score

    best_col, best_score = max(scores.items(), key=lambda kv: kv[1])
    passed = best_score >= min_score

    if return_scores:
        return (best_col if passed else None, scores)
    return best_col if passed else None
|
| 97 |
+
|