lynn-twinkl committed on
Commit
7da8ace
·
1 Parent(s): d4c9bcf

expanded logs with freeform col scores

Browse files
Files changed (1) hide show
  1. src/column_detection.py +58 -42
src/column_detection.py CHANGED
@@ -124,49 +124,42 @@ def detect_id_col(df: pd.DataFrame) -> str | None:
124
  return candidates[0]
125
 
126
 
127
- # ============== CAREER COLUMN =============
128
 
129
- def detect_career_col(
130
  df: pd.DataFrame,
131
  *,
132
- uniqueness_weight: float = 0.5,
133
- length_weight: float = 0.3,
134
- punct_weight: float = 0.2,
 
135
  name_boosts: dict[str, float] | None = None,
 
136
  min_score: float = 0.40,
137
  high_uniqueness_penalty: float = 0.95,
138
  return_scores: bool = False,
139
  ) -> str | None | Tuple[str | None, Dict[str, float]]:
140
  """
141
- Analyzes a DataFrame to find the column that most likely represents a 'career' or 'role'.
142
-
143
- The function operates on heuristics based on common characteristics of a career column:
144
- 1. **Low Uniqueness**: Values are often repeated (e.g., 'teacher', 'ks1').
145
- 2. **Short Text**: Entries are typically brief.
146
- 3. **Minimal Punctuation**: Values are clean strings, not sentences.
147
- 4. **Header Keywords**: The column name itself is a strong indicator (e.g., 'Career', 'Job').
148
-
149
- Args:
150
- df: The DataFrame to analyze.
151
- uniqueness_weight: The importance of having low uniqueness (many repeated values).
152
- length_weight: The importance of having short text values.
153
- punct_weight: The importance of having little to no punctuation.
154
- name_boosts: Multiplicative factors for keyword matches in the column header.
155
- Defaults to boosts for 'career', 'job', 'role', and 'position'.
156
- min_score: The minimum score for a column to be considered a match.
157
- high_uniqueness_penalty: A uniqueness ratio (e.g., 0.95) above which a column's
158
- score is heavily penalized, as it is unlikely to be
159
- a categorical role column.
160
- return_scores: If True, returns a tuple containing the best column name and a
161
- dictionary of scores for all candidate columns.
162
-
163
- Returns:
164
- The name of the detected career column, or None if no suitable column is found.
165
- If return_scores is True, it returns a tuple of (column_name, scores_dict).
166
  """
167
-
168
  if name_boosts is None:
169
- name_boosts = {'career': 3.0, 'job': 2.5, 'role': 2.5, 'position': 2.0}
 
 
 
 
 
 
 
 
170
 
171
  obj_cols = df.select_dtypes(include=["object"]).columns
172
  if not obj_cols.size:
@@ -175,20 +168,33 @@ def detect_career_col(
175
  # Pre-compute raw metrics for each object column
176
  raw_metrics: Dict[str, dict[str, float]] = {}
177
  for col in obj_cols:
178
- # Drop temporary NA's to not skew metrics, then convert to string
179
  ser = df[col].dropna().astype(str)
180
  if ser.empty:
181
  continue
 
 
 
 
 
 
 
 
 
 
 
 
 
182
  raw_metrics[col] = {
183
  "avg_len": ser.str.len().mean(),
184
  "avg_punct": ser.apply(lambda s: sum(c in string.punctuation for c in s)).mean(),
185
  "unique_ratio": ser.nunique() / len(ser) if len(ser) > 0 else 0.0,
 
186
  }
187
 
188
  if not raw_metrics:
189
  return (None, {}) if return_scores else None
190
 
191
- # Get max values for normalization across all columns
192
  max_len = _max_or_eps([m["avg_len"] for m in raw_metrics.values()])
193
  max_punc = _max_or_eps([m["avg_punct"] for m in raw_metrics.values()])
194
 
@@ -199,10 +205,12 @@ def detect_career_col(
199
  punc_score = 1 - _normalise(metrics["avg_punct"], max_punc)
200
  uniq_score = 1 - metrics["unique_ratio"]
201
 
 
202
  score = (
203
- length_weight * len_score
204
- + punct_weight * punc_score
205
  + uniqueness_weight * uniq_score
 
 
206
  )
207
 
208
  # Apply boosts for matching header keywords
@@ -225,7 +233,6 @@ def detect_career_col(
225
  if return_scores:
226
  return (best_col if passed else None, scores)
227
  return best_col if passed else None
228
-
229
  # =========== USAGE ============
230
 
231
  def main():
@@ -237,15 +244,24 @@ def main():
237
 
238
  id_col = detect_id_col(df)
239
  freeform_col, freeform_scores = detect_freeform_col(df, return_scores=True)
240
- career_col, career_scores = detect_career_col(df, return_scores=True)
241
 
242
  print(f"\nDetected ID Column: '{id_col}'")
243
  print(f"Detected Free-Form Column: '{freeform_col}'")
244
- print(f"Detected Career Column: '{career_col}'")
 
 
 
 
 
 
 
 
 
245
 
246
- print("\n--- Career Column Scores (Higher is better) ---")
247
- if career_scores:
248
- sorted_scores = sorted(career_scores.items(), key=lambda item: item[1], reverse=True)
249
  for col, score in sorted_scores:
250
  print(f" - {col:<25}: {score:.4f}")
251
  else:
 
124
  return candidates[0]
125
 
126
 
127
+ # ============== SCHOOL TYPE COLUMN =============
128
 
129
+ def detect_school_type_col(
130
  df: pd.DataFrame,
131
  *,
132
+ uniqueness_weight: float = 0.3,
133
+ content_match_weight: float = 0.4, # <-- New weight for content
134
+ length_weight: float = 0.2,
135
+ punct_weight: float = 0.1,
136
  name_boosts: dict[str, float] | None = None,
137
+ value_keywords: set[str] | None = None, # <-- New parameter for keywords
138
  min_score: float = 0.40,
139
  high_uniqueness_penalty: float = 0.95,
140
  return_scores: bool = False,
141
  ) -> str | None | Tuple[str | None, Dict[str, float]]:
142
  """
143
+ Analyzes a DataFrame to find the column that most likely represents a 'school type'.
144
+
145
+ The function operates on heuristics based on common characteristics of a school-type col:
146
+ 1. **Content Match**: A significant portion of values match known school types (the strongest signal).
147
+ 2. **Low Uniqueness**: Values are often repeated (e.g., 'Primary', 'All-through').
148
+ 3. **Short Text**: Entries are typically brief.
149
+ 4. **Minimal Punctuation**: Values are clean strings, not sentences.
150
+ 5. **Header Keywords**: The column name itself is a strong indicator (e.g., 'School Type').
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  """
152
+ # More robust default name boosts
153
  if name_boosts is None:
154
+ name_boosts = {'school': 3.0, 'type': 2.0}
155
+
156
+ # Default set of keywords to search for within the column's values
157
+ if value_keywords is None:
158
+ value_keywords = {
159
+ 'nursery', 'primary', 'secondary', 'infant', 'junior',
160
+ 'college', 'academy', 'independent', 'special', 'pru',
161
+ 'all-through', 'middle', 'state', 'educator', 'home'
162
+ }
163
 
164
  obj_cols = df.select_dtypes(include=["object"]).columns
165
  if not obj_cols.size:
 
168
  # Pre-compute raw metrics for each object column
169
  raw_metrics: Dict[str, dict[str, float]] = {}
170
  for col in obj_cols:
 
171
  ser = df[col].dropna().astype(str)
172
  if ser.empty:
173
  continue
174
+
175
+ # --- New Content Match Calculation ---
176
+ unique_values = ser.unique()
177
+ content_match_score = 0.0
178
+ if len(unique_values) > 0:
179
+ match_count = 0
180
+ for val in unique_values:
181
+ # Check if any keyword is a substring of the lowercase value
182
+ if any(keyword in val.lower() for keyword in value_keywords):
183
+ match_count += 1
184
+ content_match_score = match_count / len(unique_values)
185
+ # --- End of New Calculation ---
186
+
187
  raw_metrics[col] = {
188
  "avg_len": ser.str.len().mean(),
189
  "avg_punct": ser.apply(lambda s: sum(c in string.punctuation for c in s)).mean(),
190
  "unique_ratio": ser.nunique() / len(ser) if len(ser) > 0 else 0.0,
191
+ "content_match": content_match_score # Store the new score
192
  }
193
 
194
  if not raw_metrics:
195
  return (None, {}) if return_scores else None
196
 
197
+ # Get max values for normalization
198
  max_len = _max_or_eps([m["avg_len"] for m in raw_metrics.values()])
199
  max_punc = _max_or_eps([m["avg_punct"] for m in raw_metrics.values()])
200
 
 
205
  punc_score = 1 - _normalise(metrics["avg_punct"], max_punc)
206
  uniq_score = 1 - metrics["unique_ratio"]
207
 
208
+ # --- Updated Final Scoring Formula ---
209
  score = (
210
+ content_match_weight * metrics["content_match"] # Use the new score directly
 
211
  + uniqueness_weight * uniq_score
212
+ + length_weight * len_score
213
+ + punct_weight * punc_score
214
  )
215
 
216
  # Apply boosts for matching header keywords
 
233
  if return_scores:
234
  return (best_col if passed else None, scores)
235
  return best_col if passed else None
 
236
  # =========== USAGE ============
237
 
238
  def main():
 
244
 
245
  id_col = detect_id_col(df)
246
  freeform_col, freeform_scores = detect_freeform_col(df, return_scores=True)
247
+ school_type_col, school_type_scores = detect_school_type_col(df, return_scores=True)
248
 
249
  print(f"\nDetected ID Column: '{id_col}'")
250
  print(f"Detected Free-Form Column: '{freeform_col}'")
251
+ print(f"Detected School Type Column: '{school_type_col}'")
252
+ print()
253
+ print("\n--- Free-form Column Scores (Higher is better) ---")
254
+ if freeform_scores:
255
+ sorted_scores = sorted(freeform_scores.items(), key=lambda item: item[1], reverse=True)
256
+ for col, score in sorted_scores:
257
+ print(f" - {col:<25}: {score:.4f}")
258
+ else:
259
+ print("No object columns found to score for freeform col...")
260
+
261
 
262
+ print("\n--- School Type Column Scores (Higher is better) ---")
263
+ if school_type_scores:
264
+ sorted_scores = sorted(school_type_scores.items(), key=lambda item: item[1], reverse=True)
265
  for col, score in sorted_scores:
266
  print(f" - {col:<25}: {score:.4f}")
267
  else: