lynn-twinkl committed on
Commit ·
7da8ace
1
Parent(s): d4c9bcf
expanded logs with freeform col scores
Browse files- src/column_detection.py +58 -42
src/column_detection.py
CHANGED
|
@@ -124,49 +124,42 @@ def detect_id_col(df: pd.DataFrame) -> str | None:
|
|
| 124 |
return candidates[0]
|
| 125 |
|
| 126 |
|
| 127 |
-
# ==============
|
| 128 |
|
| 129 |
-
def
|
| 130 |
df: pd.DataFrame,
|
| 131 |
*,
|
| 132 |
-
uniqueness_weight: float = 0.
|
| 133 |
-
|
| 134 |
-
|
|
|
|
| 135 |
name_boosts: dict[str, float] | None = None,
|
|
|
|
| 136 |
min_score: float = 0.40,
|
| 137 |
high_uniqueness_penalty: float = 0.95,
|
| 138 |
return_scores: bool = False,
|
| 139 |
) -> str | None | Tuple[str | None, Dict[str, float]]:
|
| 140 |
"""
|
| 141 |
-
Analyzes a DataFrame to find the column that most likely represents a '
|
| 142 |
-
|
| 143 |
-
The function operates on heuristics based on common characteristics of a
|
| 144 |
-
1. **
|
| 145 |
-
2. **
|
| 146 |
-
3. **
|
| 147 |
-
4. **
|
| 148 |
-
|
| 149 |
-
Args:
|
| 150 |
-
df: The DataFrame to analyze.
|
| 151 |
-
uniqueness_weight: The importance of having low uniqueness (many repeated values).
|
| 152 |
-
length_weight: The importance of having short text values.
|
| 153 |
-
punct_weight: The importance of having little to no punctuation.
|
| 154 |
-
name_boosts: Multiplicative factors for keyword matches in the column header.
|
| 155 |
-
Defaults to boosts for 'career', 'job', 'role', and 'position'.
|
| 156 |
-
min_score: The minimum score for a column to be considered a match.
|
| 157 |
-
high_uniqueness_penalty: A uniqueness ratio (e.g., 0.95) above which a column's
|
| 158 |
-
score is heavily penalized, as it is unlikely to be
|
| 159 |
-
a categorical role column.
|
| 160 |
-
return_scores: If True, returns a tuple containing the best column name and a
|
| 161 |
-
dictionary of scores for all candidate columns.
|
| 162 |
-
|
| 163 |
-
Returns:
|
| 164 |
-
The name of the detected career column, or None if no suitable column is found.
|
| 165 |
-
If return_scores is True, it returns a tuple of (column_name, scores_dict).
|
| 166 |
"""
|
| 167 |
-
|
| 168 |
if name_boosts is None:
|
| 169 |
-
name_boosts = {'
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 170 |
|
| 171 |
obj_cols = df.select_dtypes(include=["object"]).columns
|
| 172 |
if not obj_cols.size:
|
|
@@ -175,20 +168,33 @@ def detect_career_col(
|
|
| 175 |
# Pre-compute raw metrics for each object column
|
| 176 |
raw_metrics: Dict[str, dict[str, float]] = {}
|
| 177 |
for col in obj_cols:
|
| 178 |
-
# Drop temporary NA's to not skew metrics, then convert to string
|
| 179 |
ser = df[col].dropna().astype(str)
|
| 180 |
if ser.empty:
|
| 181 |
continue
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 182 |
raw_metrics[col] = {
|
| 183 |
"avg_len": ser.str.len().mean(),
|
| 184 |
"avg_punct": ser.apply(lambda s: sum(c in string.punctuation for c in s)).mean(),
|
| 185 |
"unique_ratio": ser.nunique() / len(ser) if len(ser) > 0 else 0.0,
|
|
|
|
| 186 |
}
|
| 187 |
|
| 188 |
if not raw_metrics:
|
| 189 |
return (None, {}) if return_scores else None
|
| 190 |
|
| 191 |
-
# Get max values for normalization
|
| 192 |
max_len = _max_or_eps([m["avg_len"] for m in raw_metrics.values()])
|
| 193 |
max_punc = _max_or_eps([m["avg_punct"] for m in raw_metrics.values()])
|
| 194 |
|
|
@@ -199,10 +205,12 @@ def detect_career_col(
|
|
| 199 |
punc_score = 1 - _normalise(metrics["avg_punct"], max_punc)
|
| 200 |
uniq_score = 1 - metrics["unique_ratio"]
|
| 201 |
|
|
|
|
| 202 |
score = (
|
| 203 |
-
|
| 204 |
-
+ punct_weight * punc_score
|
| 205 |
+ uniqueness_weight * uniq_score
|
|
|
|
|
|
|
| 206 |
)
|
| 207 |
|
| 208 |
# Apply boosts for matching header keywords
|
|
@@ -225,7 +233,6 @@ def detect_career_col(
|
|
| 225 |
if return_scores:
|
| 226 |
return (best_col if passed else None, scores)
|
| 227 |
return best_col if passed else None
|
| 228 |
-
|
| 229 |
# =========== USAGE ============
|
| 230 |
|
| 231 |
def main():
|
|
@@ -237,15 +244,24 @@ def main():
|
|
| 237 |
|
| 238 |
id_col = detect_id_col(df)
|
| 239 |
freeform_col, freeform_scores = detect_freeform_col(df, return_scores=True)
|
| 240 |
-
|
| 241 |
|
| 242 |
print(f"\nDetected ID Column: '{id_col}'")
|
| 243 |
print(f"Detected Free-Form Column: '{freeform_col}'")
|
| 244 |
-
print(f"Detected
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 245 |
|
| 246 |
-
print("\n---
|
| 247 |
-
if
|
| 248 |
-
sorted_scores = sorted(
|
| 249 |
for col, score in sorted_scores:
|
| 250 |
print(f" - {col:<25}: {score:.4f}")
|
| 251 |
else:
|
|
|
|
| 124 |
return candidates[0]
|
| 125 |
|
| 126 |
|
| 127 |
+
# ============== SCHOOL TYPE COLUMN =============
|
| 128 |
|
| 129 |
+
def detect_school_type_col(
|
| 130 |
df: pd.DataFrame,
|
| 131 |
*,
|
| 132 |
+
uniqueness_weight: float = 0.3,
|
| 133 |
+
content_match_weight: float = 0.4, # <-- New weight for content
|
| 134 |
+
length_weight: float = 0.2,
|
| 135 |
+
punct_weight: float = 0.1,
|
| 136 |
name_boosts: dict[str, float] | None = None,
|
| 137 |
+
value_keywords: set[str] | None = None, # <-- New parameter for keywords
|
| 138 |
min_score: float = 0.40,
|
| 139 |
high_uniqueness_penalty: float = 0.95,
|
| 140 |
return_scores: bool = False,
|
| 141 |
) -> str | None | Tuple[str | None, Dict[str, float]]:
|
| 142 |
"""
|
| 143 |
+
Analyzes a DataFrame to find the column that most likely represents a 'school type'.
|
| 144 |
+
|
| 145 |
+
The function operates on heuristics based on common characteristics of a school-type col:
|
| 146 |
+
1. **Content Match**: A significant portion of values match known school types (the strongest signal).
|
| 147 |
+
2. **Low Uniqueness**: Values are often repeated (e.g., 'Primary', 'All-through').
|
| 148 |
+
3. **Short Text**: Entries are typically brief.
|
| 149 |
+
4. **Minimal Punctuation**: Values are clean strings, not sentences.
|
| 150 |
+
5. **Header Keywords**: The column name itself is a strong indicator (e.g., 'School Type').
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 151 |
"""
|
| 152 |
+
# More robust default name boosts
|
| 153 |
if name_boosts is None:
|
| 154 |
+
name_boosts = {'school': 3.0, 'type': 2.0}
|
| 155 |
+
|
| 156 |
+
# Default set of keywords to search for within the column's values
|
| 157 |
+
if value_keywords is None:
|
| 158 |
+
value_keywords = {
|
| 159 |
+
'nursery', 'primary', 'secondary', 'infant', 'junior',
|
| 160 |
+
'college', 'academy', 'independent', 'special', 'pru',
|
| 161 |
+
'all-through', 'middle', 'state', 'educator', 'home'
|
| 162 |
+
}
|
| 163 |
|
| 164 |
obj_cols = df.select_dtypes(include=["object"]).columns
|
| 165 |
if not obj_cols.size:
|
|
|
|
| 168 |
# Pre-compute raw metrics for each object column
|
| 169 |
raw_metrics: Dict[str, dict[str, float]] = {}
|
| 170 |
for col in obj_cols:
|
|
|
|
| 171 |
ser = df[col].dropna().astype(str)
|
| 172 |
if ser.empty:
|
| 173 |
continue
|
| 174 |
+
|
| 175 |
+
# --- New Content Match Calculation ---
|
| 176 |
+
unique_values = ser.unique()
|
| 177 |
+
content_match_score = 0.0
|
| 178 |
+
if len(unique_values) > 0:
|
| 179 |
+
match_count = 0
|
| 180 |
+
for val in unique_values:
|
| 181 |
+
# Check if any keyword is a substring of the lowercase value
|
| 182 |
+
if any(keyword in val.lower() for keyword in value_keywords):
|
| 183 |
+
match_count += 1
|
| 184 |
+
content_match_score = match_count / len(unique_values)
|
| 185 |
+
# --- End of New Calculation ---
|
| 186 |
+
|
| 187 |
raw_metrics[col] = {
|
| 188 |
"avg_len": ser.str.len().mean(),
|
| 189 |
"avg_punct": ser.apply(lambda s: sum(c in string.punctuation for c in s)).mean(),
|
| 190 |
"unique_ratio": ser.nunique() / len(ser) if len(ser) > 0 else 0.0,
|
| 191 |
+
"content_match": content_match_score # Store the new score
|
| 192 |
}
|
| 193 |
|
| 194 |
if not raw_metrics:
|
| 195 |
return (None, {}) if return_scores else None
|
| 196 |
|
| 197 |
+
# Get max values for normalization
|
| 198 |
max_len = _max_or_eps([m["avg_len"] for m in raw_metrics.values()])
|
| 199 |
max_punc = _max_or_eps([m["avg_punct"] for m in raw_metrics.values()])
|
| 200 |
|
|
|
|
| 205 |
punc_score = 1 - _normalise(metrics["avg_punct"], max_punc)
|
| 206 |
uniq_score = 1 - metrics["unique_ratio"]
|
| 207 |
|
| 208 |
+
# --- Updated Final Scoring Formula ---
|
| 209 |
score = (
|
| 210 |
+
content_match_weight * metrics["content_match"] # Use the new score directly
|
|
|
|
| 211 |
+ uniqueness_weight * uniq_score
|
| 212 |
+
+ length_weight * len_score
|
| 213 |
+
+ punct_weight * punc_score
|
| 214 |
)
|
| 215 |
|
| 216 |
# Apply boosts for matching header keywords
|
|
|
|
| 233 |
if return_scores:
|
| 234 |
return (best_col if passed else None, scores)
|
| 235 |
return best_col if passed else None
|
|
|
|
| 236 |
# =========== USAGE ============
|
| 237 |
|
| 238 |
def main():
|
|
|
|
| 244 |
|
| 245 |
id_col = detect_id_col(df)
|
| 246 |
freeform_col, freeform_scores = detect_freeform_col(df, return_scores=True)
|
| 247 |
+
school_type_col, school_type_scores = detect_school_type_col(df, return_scores=True)
|
| 248 |
|
| 249 |
print(f"\nDetected ID Column: '{id_col}'")
|
| 250 |
print(f"Detected Free-Form Column: '{freeform_col}'")
|
| 251 |
+
print(f"Detected School Type Column: '{school_type_col}'")
|
| 252 |
+
print()
|
| 253 |
+
print("\n--- Free-form Column Scores (Higher is better) ---")
|
| 254 |
+
if freeform_scores:
|
| 255 |
+
sorted_scores = sorted(freeform_scores.items(), key=lambda item: item[1], reverse=True)
|
| 256 |
+
for col, score in sorted_scores:
|
| 257 |
+
print(f" - {col:<25}: {score:.4f}")
|
| 258 |
+
else:
|
| 259 |
+
print("No object columns found to score for freeform col...")
|
| 260 |
+
|
| 261 |
|
| 262 |
+
print("\n--- School Type Column Scores (Higher is better) ---")
|
| 263 |
+
if school_type_scores:
|
| 264 |
+
sorted_scores = sorted(school_type_scores.items(), key=lambda item: item[1], reverse=True)
|
| 265 |
for col, score in sorted_scores:
|
| 266 |
print(f" - {col:<25}: {score:.4f}")
|
| 267 |
else:
|