Spaces:

TwinklData
/

Community_Collections_App

Sleeping

lynn-twinkl committed on May 24, 2025

Commit

76ee39e

1 Parent(s): fd62f9c

id column detection

Files changed (1) hide show

src/column_detection.py CHANGED Viewed

@@ -9,7 +9,7 @@ from typing import Sequence, Dict, Tuple, Optional
 import pandas as pd
-# --------- HELPER FUNCTIONS --------
 def _max_or_eps(values, eps: float = 1e-9) -> float:
     """Avoid divide‑by‑zero during normalisation."""
@@ -19,7 +19,7 @@ def _max_or_eps(values, eps: float = 1e-9) -> float:
 def _normalise(value: float, max_value: float) -> float:
     return value / max_value if max_value else 0.0
-## -------- DETECT FREEFORM COL FUNCTION ------------
 def detect_freeform_col(
     df: pd.DataFrame,
@@ -95,3 +95,35 @@ def detect_freeform_col(
         return (best_col if passed else None, scores)
     return best_col if passed else None

 import pandas as pd
+# ========= HELPER FUNCTIONS ========
 def _max_or_eps(values, eps: float = 1e-9) -> float:
     """Avoid divide‑by‑zero during normalisation."""
 def _normalise(value: float, max_value: float) -> float:
     return value / max_value if max_value else 0.0
+# ========== DETECT FREEFORM COL FUNCTION ============
 def detect_freeform_col(
     df: pd.DataFrame,
         return (best_col if passed else None, scores)
     return best_col if passed else None
+# ========= DETECT ID COLUMN =========
def detect_id_col(df: pd.DataFrame) -> Optional[str]:
    """Heuristically pick the column of *df* that acts as a row identifier.

    Detection order:

    1. The first column whose label contains ``id``, ``identifier`` or
       ``key`` as a whole word (case-insensitive).
    2. Otherwise, among the columns whose values are all distinct (a missing
       value counts as a value), labels starting with ``"Unnamed"`` are
       ignored when any other candidate exists; the first integer-typed
       candidate wins, else the first candidate.

    Args:
        df: Table to inspect. Column labels may be of any type.

    Returns:
        The label of the detected column, or ``None`` when no column
        qualifies. An empty frame can only match by name, because
        uniqueness is meaningless without rows.
    """
    import re  # function-scope import keeps this block self-contained

    # 1) Name-based detection.
    # NOTE: ``\b`` treats "_" as a word character, so "User ID" matches but
    # "user_id" does not; such columns fall through to the uniqueness check.
    name_pattern = re.compile(r'\b(id|identifier|key)\b', re.IGNORECASE)
    for col in df.columns:
        # Labels are not guaranteed to be strings (e.g. integer labels), so
        # coerce before matching instead of raising TypeError.
        if name_pattern.search(str(col)):
            return col

    n_rows = len(df)
    if n_rows == 0:
        # With zero rows every column trivially has nunique == n_rows;
        # reporting one of them as an id would be arbitrary.
        return None

    # 2) Uniqueness detection: columns where every row holds a distinct value.
    unique_cols = [
        col for col in df.columns
        if df[col].nunique(dropna=False) == n_rows
    ]
    if not unique_cols:
        return None

    # 3) Drop auto-generated "Unnamed..." labels (such as pandas' "Unnamed: 0")
    #    unless they are the only unique columns available.
    non_unnamed = [c for c in unique_cols if not str(c).startswith("Unnamed")]
    candidates = non_unnamed or unique_cols

    # 4) Prefer integer dtypes among the remaining candidates.
    for col in candidates:
        if pd.api.types.is_integer_dtype(df[col]):
            return col

    # Fallback: the first candidate in column order.
    return candidates[0]