lynn-twinkl committed on
Commit
7e21a53
·
1 Parent(s): df474fc

added clearer docstring

Browse files
Files changed (1) hide show
  1. functions/column_detection.py +89 -54
functions/column_detection.py CHANGED
@@ -1,62 +1,97 @@
1
- import pandas as pd
2
- import numpy as np
 
 
3
  import re
4
  import string
 
 
 
 
 
 
 
 
 
 
5
 
6
def detect_freeform_answer_col(df, penalty_for_low_uniqueness=0.4):
    """
    Detect the 'freeform_answer' column using simple heuristics.

    A free-text answer column tends to have long values, plenty of
    punctuation, and a high ratio of unique entries.  Every object-dtype
    column is scored on those three signals (normalised across columns)
    and the best-scoring column name is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.
    penalty_for_low_uniqueness : float, default 0.4
        Columns whose unique-value ratio falls below this threshold have
        their composite score halved.

    Returns
    -------
    str or None
        Name of the most likely free-text column, or None when the frame
        has no non-empty object columns.
    """
    text_cols = df.select_dtypes(include=['object']).columns.tolist()
    if not text_cols:
        return None

    # Composite weights: length is the strongest free-text signal.
    # (Previously these were defined but never used; the composite below
    # duplicated them as hard-coded literals.)
    weight_length = 0.4
    weight_punct = 0.3
    weight_unique = 0.3

    # Pass 1: raw per-column metrics.
    scores = {}
    for col in text_cols:
        series = df[col].dropna().astype(str)
        if series.empty:
            continue
        avg_len = series.str.len().mean()
        avg_punct = series.apply(
            lambda x: sum(1 for char in x if char in string.punctuation)
        ).mean()
        total = len(series)
        scores[col] = {
            'avg_len': avg_len,
            'avg_punct': avg_punct,
            'unique_ratio': series.nunique() / total if total else 0,
        }

    if not scores:
        return None

    # Pass 2: normalise length/punctuation across columns.
    # `or 1e-9` keeps the divisor non-zero when every column's metric is 0.
    max_len = max(s['avg_len'] for s in scores.values()) or 1e-9
    max_punct = max(s['avg_punct'] for s in scores.values()) or 1e-9

    composite = {}
    for col, s in scores.items():
        comp_score = (
            weight_length * (s['avg_len'] / max_len)
            + weight_punct * (s['avg_punct'] / max_punct)
            + weight_unique * s['unique_ratio']
        )

        # Bonus/penalty based on the column header.
        if "additional_comment" in col.lower():
            comp_score *= 3.1
        if "usage_reason" in col.lower():
            comp_score *= 0.5

        # Penalise columns that mostly repeat the same values.
        if s['unique_ratio'] < penalty_for_low_uniqueness:
            comp_score *= 0.5

        composite[col] = comp_score

    return max(composite, key=composite.get)
 
 
 
 
 
 
 
 
 
1
+ """
2
+ column_detection.py ── tiny heuristics for finding ID and free‑text columns
3
+ """
4
+ from __future__ import annotations # harmless on 3.11+, useful on 3.7‑3.10
5
  import re
6
  import string
7
+ from typing import Sequence, Dict, Tuple, Optional
8
+
9
+ import pandas as pd
10
+
11
+
12
+ # --------- HELPER FUNCTIONS --------
13
+
14
+ def _max_or_eps(values, eps: float = 1e-9) -> float:
15
+ """Avoid divide‑by‑zero during normalisation."""
16
+ return max(values) or eps
17
 
18
+
19
+ def _normalise(value: float, max_value: float) -> float:
20
+ return value / max_value if max_value else 0.0
21
+
22
+ ## -------- DETECT FREEFORM COL FUNCTION ------------
23
+
24
def detect_freeform_col(
    df: pd.DataFrame,
    *,
    length_weight: float = 0.4,
    punct_weight: float = 0.3,
    unique_weight: float = 0.3,
    low_uniqueness_penalty: float = 0.4,
    name_boosts: dict[str, float] | None = None,
    min_score: float = 0.50,
    return_scores: bool = False,
) -> str | None | Tuple[str | None, Dict[str, float]]:
    """
    Guess which *object* column contains free‑text answers or comments.

    A good free‑text column tends to be longish, rich in punctuation,
    and fairly unique row‑to‑row.  Each object column is scored on those
    three normalised signals, weighted by ``length_weight``,
    ``punct_weight`` and ``unique_weight``.

    name_boosts
        e.g. ``{"additional_comment": 3.1, "usage_reason": 0.5}``
        Multiplicative factors applied if the token appears in the header.

    Returns the winning column name when its score reaches ``min_score``
    (``None`` otherwise); with ``return_scores=True`` a
    ``(column, scores)`` tuple is returned instead.
    """
    boosts = name_boosts or {}
    candidates = df.select_dtypes(include=["object"]).columns

    # No object columns at all — nothing to score.
    if candidates.size == 0:
        return (None, {}) if return_scores else None

    # Pass 1: raw metrics for every non-empty object column.
    metrics: Dict[str, dict[str, float]] = {}
    for column in candidates:
        cleaned = df[column].dropna().astype(str)
        if cleaned.empty:
            continue
        punct_per_row = cleaned.apply(
            lambda text: sum(ch in string.punctuation for ch in text)
        )
        metrics[column] = {
            "avg_len": cleaned.str.len().mean(),
            "avg_punct": punct_per_row.mean(),
            "unique_ratio": cleaned.nunique() / len(cleaned),
        }

    if not metrics:
        return (None, {}) if return_scores else None

    # Pass 2: normalise and fold into one composite score per column.
    len_ceiling = _max_or_eps([m["avg_len"] for m in metrics.values()])
    punct_ceiling = _max_or_eps([m["avg_punct"] for m in metrics.values()])

    scores: Dict[str, float] = {}
    for column, m in metrics.items():
        combined = (
            length_weight * _normalise(m["avg_len"], len_ceiling)
            + punct_weight * _normalise(m["avg_punct"], punct_ceiling)
            + unique_weight * m["unique_ratio"]
        )

        # Header-token bonuses / penalties.
        for token, factor in boosts.items():
            if token in column.lower():
                combined *= factor

        # Repetitive columns are unlikely to hold free text.
        if m["unique_ratio"] < low_uniqueness_penalty:
            combined *= 0.5

        scores[column] = combined

    winner, top_score = max(scores.items(), key=lambda kv: kv[1])
    chosen = winner if top_score >= min_score else None
    return (chosen, scores) if return_scores else chosen