lynn-twinkl
committed on
Commit
·
76ee39e
1
Parent(s):
fd62f9c
id column detection
Browse files- src/column_detection.py +34 -2
src/column_detection.py
CHANGED
|
@@ -9,7 +9,7 @@ from typing import Sequence, Dict, Tuple, Optional
|
|
| 9 |
import pandas as pd
|
| 10 |
|
| 11 |
|
| 12 |
-
#
|
| 13 |
|
| 14 |
def _max_or_eps(values, eps: float = 1e-9) -> float:
|
| 15 |
"""Avoid divide‑by‑zero during normalisation."""
|
|
@@ -19,7 +19,7 @@ def _max_or_eps(values, eps: float = 1e-9) -> float:
|
|
| 19 |
def _normalise(value: float, max_value: float) -> float:
|
| 20 |
return value / max_value if max_value else 0.0
|
| 21 |
|
| 22 |
-
|
| 23 |
|
| 24 |
def detect_freeform_col(
|
| 25 |
df: pd.DataFrame,
|
|
@@ -95,3 +95,35 @@ def detect_freeform_col(
|
|
| 95 |
return (best_col if passed else None, scores)
|
| 96 |
return best_col if passed else None
|
| 97 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 9 |
import pandas as pd
|
| 10 |
|
| 11 |
|
| 12 |
+
# ========= HELPER FUNCTIONS ========
|
| 13 |
|
| 14 |
def _max_or_eps(values, eps: float = 1e-9) -> float:
|
| 15 |
"""Avoid divide‑by‑zero during normalisation."""
|
|
|
|
| 19 |
def _normalise(value: float, max_value: float) -> float:
|
| 20 |
return value / max_value if max_value else 0.0
|
| 21 |
|
| 22 |
+
# ========== DETECT FREEFORM COL FUNCTION ============
|
| 23 |
|
| 24 |
def detect_freeform_col(
|
| 25 |
df: pd.DataFrame,
|
|
|
|
| 95 |
return (best_col if passed else None, scores)
|
| 96 |
return best_col if passed else None
|
| 97 |
|
| 98 |
+
|
| 99 |
+
# ========= DETECT ID COLUMN =========
|
| 100 |
+
|
| 101 |
+
def detect_id_col(df: pd.DataFrame) -> Optional[str]:
    """Guess which column of *df* holds row identifiers.

    The checks run in order and the first hit wins:

    1. Name: the first column whose label contains ``id``, ``identifier`` or
       ``key`` as a whole token (case-insensitive). Any non-alphanumeric
       character, including ``_``, delimits a token, so ``user_id`` and
       ``User ID`` match while ``video`` and ``monkey`` do not.
    2. Uniqueness: columns in which every row holds a distinct value
       (a missing value counts as a value).
    3. Among the unique columns, those whose label does not start with
       ``"Unnamed"`` are preferred; the ``"Unnamed"`` ones are only used
       when nothing else is unique.
    4. Among the remaining candidates the first integer-typed column is
       returned, otherwise the first candidate.

    Args:
        df: Frame to inspect. Column labels may be of any type; they are
            converted to text only for the name-based checks.

    Returns:
        The label of the detected column, or ``None`` when no label matches
        by name and no column is fully unique (which includes an empty frame).
    """
    # Imported locally so the function does not depend on the module-level
    # import block providing ``re``.
    import re

    # 1) Name-based detection. ``\b`` is not used because ``_`` is a word
    #    character and would hide the token in labels such as ``user_id``;
    #    the lookarounds treat every non-alphanumeric character as a delimiter.
    name_pattern = re.compile(
        r"(?<![^\W_])(?:id|identifier|key)(?![^\W_])",
        re.IGNORECASE,
    )
    for col in df.columns:
        # Labels are not guaranteed to be strings (e.g. default integer labels).
        if name_pattern.search(str(col)):
            return col

    n_rows = len(df)
    if n_rows == 0:
        # With no rows every column is trivially "unique" (0 == 0), which
        # would make the uniqueness check pick an arbitrary column.
        return None

    # 2) Uniqueness detection: columns where every row is unique.
    unique_cols = [
        col for col in df.columns
        if df[col].nunique(dropna=False) == n_rows
    ]
    if not unique_cols:
        return None

    # 3) Prefer properly named columns over "Unnamed*" ones, falling back to
    #    the full list when every unique column is unnamed.
    non_unnamed = [c for c in unique_cols if not str(c).startswith("Unnamed")]
    candidates = non_unnamed or unique_cols

    # 4) Prefer integer dtypes among the candidates.
    for col in candidates:
        if pd.api.types.is_integer_dtype(df[col]):
            return col

    # Fallback: the first candidate in column order.
    return candidates[0]
|
| 129 |
+
|