lynn-twinkl committed on
Commit
76ee39e
·
1 Parent(s): fd62f9c

id column detection

Browse files
Files changed (1) hide show
  1. src/column_detection.py +34 -2
src/column_detection.py CHANGED
@@ -9,7 +9,7 @@ from typing import Sequence, Dict, Tuple, Optional
9
  import pandas as pd
10
 
11
 
12
- # --------- HELPER FUNCTIONS --------
13
 
14
  def _max_or_eps(values, eps: float = 1e-9) -> float:
15
  """Avoid divide‑by‑zero during normalisation."""
@@ -19,7 +19,7 @@ def _max_or_eps(values, eps: float = 1e-9) -> float:
19
  def _normalise(value: float, max_value: float) -> float:
20
  return value / max_value if max_value else 0.0
21
 
22
- ## -------- DETECT FREEFORM COL FUNCTION ------------
23
 
24
  def detect_freeform_col(
25
  df: pd.DataFrame,
@@ -95,3 +95,35 @@ def detect_freeform_col(
95
  return (best_col if passed else None, scores)
96
  return best_col if passed else None
97
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9
  import pandas as pd
10
 
11
 
12
+ # ========= HELPER FUNCTIONS ========
13
 
14
  def _max_or_eps(values, eps: float = 1e-9) -> float:
15
  """Avoid divide‑by‑zero during normalisation."""
 
19
  def _normalise(value: float, max_value: float) -> float:
20
  return value / max_value if max_value else 0.0
21
 
22
+ # ========== DETECT FREEFORM COL FUNCTION ============
23
 
24
  def detect_freeform_col(
25
  df: pd.DataFrame,
 
95
  return (best_col if passed else None, scores)
96
  return best_col if passed else None
97
 
98
+
99
+ # ========= DETECT ID COLUMN =========
100
+
101
def detect_id_col(df: pd.DataFrame) -> Optional[str]:
    """Guess which column of *df* holds a per-row identifier.

    Heuristics are applied in order and the first hit wins:

    1. Name-based: the first column whose label contains ``id``,
       ``identifier`` or ``key`` as a standalone token (case-insensitive).
       Underscores count as separators, so ``user_id`` and ``ID`` match
       while ``paid`` and ``keyboard`` do not.
    2. Uniqueness: columns in which every row holds a distinct value
       (NaN is counted as a value).
    3. Labels starting with ``"Unnamed"`` (e.g. the ``Unnamed: 0`` label
       pandas assigns to a header-less CSV column) are set aside whenever
       another unique column exists.
    4. Among the remaining candidates, the first integer-dtype column is
       preferred; otherwise the first candidate is returned.

    Args:
        df: The frame to inspect. Column labels may be of any type; they
            are stringified for the name checks only.

    Returns:
        The label of the detected column exactly as it appears in
        ``df.columns``, or ``None`` when no heuristic applies.
    """
    # Local import: the module's import block is not guaranteed to provide re.
    import re

    # 1) Name-based detection.
    # `\b` would treat "_" as a word character and miss labels such as
    # "user_id"; these lookarounds reject only adjacent letters/digits.
    name_pattern = re.compile(
        r'(?<![^\W_])(id|identifier|key)(?![^\W_])', re.IGNORECASE
    )
    for col in df.columns:
        # str() keeps integer or other non-string labels from raising.
        if name_pattern.search(str(col)):
            return col

    n_rows = len(df)
    if n_rows == 0:
        # With no rows every column is vacuously "unique"; that is no evidence.
        return None

    # 2) Uniqueness detection: columns where every row is unique.
    unique_cols = [
        col for col in df.columns
        if df[col].nunique(dropna=False) == n_rows
    ]
    if not unique_cols:
        return None

    # 3) Prefer properly named columns over auto-generated "Unnamed" ones.
    non_unnamed = [c for c in unique_cols if not str(c).startswith("Unnamed")]
    candidates = non_unnamed or unique_cols

    # 4) Prefer integer dtypes among candidates.
    for col in candidates:
        if pd.api.types.is_integer_dtype(df[col]):
            return col

    # Fallback: return the first candidate.
    return candidates[0]
129
+