lynn-twinkl committed on
Commit
f5235b7
·
1 Parent(s): 6b72c99

add: column detection for careers. also added a module test script

Browse files
Files changed (1) hide show
  1. src/column_detection.py +132 -6
src/column_detection.py CHANGED
@@ -1,11 +1,7 @@
1
- """
2
- column_detect.py ── tiny heuristics for finding ID and free‑text columns
3
- """
4
  from __future__ import annotations # harmless on 3.11+, useful on 3.7‑3.10
5
  import re
6
  import string
7
  from typing import Sequence, Dict, Tuple, Optional
8
-
9
  import pandas as pd
10
 
11
 
@@ -19,7 +15,7 @@ def _max_or_eps(values, eps: float = 1e-9) -> float:
19
  def _normalise(value: float, max_value: float) -> float:
20
  return value / max_value if max_value else 0.0
21
 
22
- # ========== DETECT FREEFORM COL FUNCTION ============
23
 
24
  def detect_freeform_col(
25
  df: pd.DataFrame,
@@ -96,7 +92,7 @@ def detect_freeform_col(
96
  return best_col if passed else None
97
 
98
 
99
- # ========= DETECT ID COLUMN =========
100
 
101
  def detect_id_col(df: pd.DataFrame) -> str | None:
102
  n_rows = len(df)
@@ -127,3 +123,133 @@ def detect_id_col(df: pd.DataFrame) -> str | None:
127
  # Fallback: return the first candidate
128
  return candidates[0]
129
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  from __future__ import annotations # harmless on 3.11+, useful on 3.7‑3.10
2
  import re
3
  import string
4
  from typing import Sequence, Dict, Tuple, Optional
 
5
  import pandas as pd
6
 
7
 
 
15
  def _normalise(value: float, max_value: float) -> float:
16
  return value / max_value if max_value else 0.0
17
 
18
+ # =================== FREEFORM COL =====================
19
 
20
  def detect_freeform_col(
21
  df: pd.DataFrame,
 
92
  return best_col if passed else None
93
 
94
 
95
+ # ================= ID COLUMN =================
96
 
97
  def detect_id_col(df: pd.DataFrame) -> str | None:
98
  n_rows = len(df)
 
123
  # Fallback: return the first candidate
124
  return candidates[0]
125
 
126
+
127
+ # ============== CAREER COLUMN =============
128
+
129
+ def detect_career_col(
130
+ df: pd.DataFrame,
131
+ *,
132
+ uniqueness_weight: float = 0.5,
133
+ length_weight: float = 0.3,
134
+ punct_weight: float = 0.2,
135
+ name_boosts: dict[str, float] | None = None,
136
+ min_score: float = 0.40,
137
+ high_uniqueness_penalty: float = 0.95,
138
+ return_scores: bool = False,
139
+ ) -> str | None | Tuple[str | None, Dict[str, float]]:
140
+ """
141
+ Analyzes a DataFrame to find the column that most likely represents a 'career' or 'role'.
142
+
143
+ The function operates on heuristics based on common characteristics of a career column:
144
+ 1. **Low Uniqueness**: Values are often repeated (e.g., 'teacher', 'ks1').
145
+ 2. **Short Text**: Entries are typically brief.
146
+ 3. **Minimal Punctuation**: Values are clean strings, not sentences.
147
+ 4. **Header Keywords**: The column name itself is a strong indicator (e.g., 'Career', 'Job').
148
+
149
+ Args:
150
+ df: The DataFrame to analyze.
151
+ uniqueness_weight: The importance of having low uniqueness (many repeated values).
152
+ length_weight: The importance of having short text values.
153
+ punct_weight: The importance of having little to no punctuation.
154
+ name_boosts: Multiplicative factors for keyword matches in the column header.
155
+ Defaults to boosts for 'career', 'job', 'role', and 'position'.
156
+ min_score: The minimum score for a column to be considered a match.
157
+ high_uniqueness_penalty: A uniqueness ratio (e.g., 0.95) above which a column's
158
+ score is heavily penalized, as it is unlikely to be
159
+ a categorical role column.
160
+ return_scores: If True, returns a tuple containing the best column name and a
161
+ dictionary of scores for all candidate columns.
162
+
163
+ Returns:
164
+ The name of the detected career column, or None if no suitable column is found.
165
+ If return_scores is True, it returns a tuple of (column_name, scores_dict).
166
+ """
167
+
168
+ if name_boosts is None:
169
+ name_boosts = {'career': 3.0, 'job': 2.5, 'role': 2.5, 'position': 2.0}
170
+
171
+ obj_cols = df.select_dtypes(include=["object"]).columns
172
+ if not obj_cols.size:
173
+ return (None, {}) if return_scores else None
174
+
175
+ # Pre-compute raw metrics for each object column
176
+ raw_metrics: Dict[str, dict[str, float]] = {}
177
+ for col in obj_cols:
178
+ # Drop temporary NA's to not skew metrics, then convert to string
179
+ ser = df[col].dropna().astype(str)
180
+ if ser.empty:
181
+ continue
182
+ raw_metrics[col] = {
183
+ "avg_len": ser.str.len().mean(),
184
+ "avg_punct": ser.apply(lambda s: sum(c in string.punctuation for c in s)).mean(),
185
+ "unique_ratio": ser.nunique() / len(ser) if len(ser) > 0 else 0.0,
186
+ }
187
+
188
+ if not raw_metrics:
189
+ return (None, {}) if return_scores else None
190
+
191
+ # Get max values for normalization across all columns
192
+ max_len = _max_or_eps([m["avg_len"] for m in raw_metrics.values()])
193
+ max_punc = _max_or_eps([m["avg_punct"] for m in raw_metrics.values()])
194
+
195
+ # Calculate a final score for each column
196
+ scores: Dict[str, float] = {}
197
+ for col, metrics in raw_metrics.items():
198
+ len_score = 1 - _normalise(metrics["avg_len"], max_len)
199
+ punc_score = 1 - _normalise(metrics["avg_punct"], max_punc)
200
+ uniq_score = 1 - metrics["unique_ratio"]
201
+
202
+ score = (
203
+ length_weight * len_score
204
+ + punct_weight * punc_score
205
+ + uniqueness_weight * uniq_score
206
+ )
207
+
208
+ # Apply boosts for matching header keywords
209
+ for token, factor in name_boosts.items():
210
+ if token in col.lower().strip():
211
+ score *= factor
212
+
213
+ # Apply penalty for columns that are almost entirely unique
214
+ if metrics["unique_ratio"] > high_uniqueness_penalty:
215
+ score *= 0.1 # Heavy penalty
216
+
217
+ scores[col] = score
218
+
219
+ if not scores:
220
+ return (None, {}) if return_scores else None
221
+
222
+ best_col, best_score = max(scores.items(), key=lambda item: item[1])
223
+ passed = best_score >= min_score
224
+
225
+ if return_scores:
226
+ return (best_col if passed else None, scores)
227
+ return best_col if passed else None
228
+
229
+ # =========== USAGE ============
230
+
231
+ def main():
232
+
233
+ df = pd.read_csv('data/raw/new-application-format-data.csv')
234
+ df.columns = df.columns.str.strip()
235
+
236
+ print("--- Testing Column Detection Functions ---")
237
+
238
+ id_col = detect_id_col(df)
239
+ freeform_col, freeform_scores = detect_freeform_col(df, return_scores=True)
240
+ career_col, career_scores = detect_career_col(df, return_scores=True)
241
+
242
+ print(f"\nDetected ID Column: '{id_col}'")
243
+ print(f"Detected Free-Form Column: '{freeform_col}'")
244
+ print(f"Detected Career Column: '{career_col}'")
245
+
246
+ print("\n--- Career Column Scores (Higher is better) ---")
247
+ if career_scores:
248
+ sorted_scores = sorted(career_scores.items(), key=lambda item: item[1], reverse=True)
249
+ for col, score in sorted_scores:
250
+ print(f" - {col:<25}: {score:.4f}")
251
+ else:
252
+ print("No object columns found to score for career.")
253
+
254
+ if __name__ == '__main__':
255
+ main()