Rifqi Hafizuddin committed on
Commit
9b59334
·
1 Parent(s): 7f3bb97

[KM-441] add mean and median

Browse files
src/pipeline/db_pipeline/extractor.py CHANGED
@@ -94,6 +94,12 @@ def profile_column(
94
  if is_numeric:
95
  select_cols.append(f"MIN({qc}) AS min_val")
96
  select_cols.append(f"MAX({qc}) AS max_val")
 
 
 
 
 
 
97
  stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
98
 
99
  null_count = int(stats.iloc[0]["nulls"])
@@ -109,6 +115,8 @@ def profile_column(
109
  if is_numeric:
110
  profile["min"] = stats.iloc[0]["min_val"]
111
  profile["max"] = stats.iloc[0]["max_val"]
 
 
112
 
113
  if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
114
  top = pd.read_sql(
@@ -170,6 +178,7 @@ def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str
170
  text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
171
  if "min" in profile:
172
  text += f"Min: {profile['min']}, Max: {profile['max']}\n"
 
173
  if "top_values" in profile:
174
  top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
175
  text += f"Top values: {top_str}\n"
 
94
  if is_numeric:
95
  select_cols.append(f"MIN({qc}) AS min_val")
96
  select_cols.append(f"MAX({qc}) AS max_val")
97
+ select_cols.append(f"AVG({qc}) AS mean_val")
98
+ # PERCENTILE_CONT(...) WITHIN GROUP as a plain aggregate works on Postgres. SQL Server
99
+ # only offers it as a window function (requires OVER) and MySQL lacks it; both need a dialect-specific fallback.
100
+ select_cols.append(
101
+ f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
102
+ )
103
  stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
104
 
105
  null_count = int(stats.iloc[0]["nulls"])
 
115
  if is_numeric:
116
  profile["min"] = stats.iloc[0]["min_val"]
117
  profile["max"] = stats.iloc[0]["max_val"]
118
+ profile["mean"] = stats.iloc[0]["mean_val"]
119
+ profile["median"] = stats.iloc[0]["median_val"]
120
 
121
  if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
122
  top = pd.read_sql(
 
178
  text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
179
  if "min" in profile:
180
  text += f"Min: {profile['min']}, Max: {profile['max']}\n"
181
+ text += f"Mean: {profile['mean']}, Median: {profile['median']}\n"
182
  if "top_values" in profile:
183
  top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
184
  text += f"Top values: {top_str}\n"