Rifqi Hafizuddin committed on
Commit ·
9b59334
1
Parent(s): 7f3bb97
[KM-441] add mean and median
Browse files
src/pipeline/db_pipeline/extractor.py
CHANGED
|
@@ -94,6 +94,12 @@ def profile_column(
|
|
| 94 |
if is_numeric:
|
| 95 |
select_cols.append(f"MIN({qc}) AS min_val")
|
| 96 |
select_cols.append(f"MAX({qc}) AS max_val")
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 97 |
stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
|
| 98 |
|
| 99 |
null_count = int(stats.iloc[0]["nulls"])
|
|
@@ -109,6 +115,8 @@ def profile_column(
|
|
| 109 |
if is_numeric:
|
| 110 |
profile["min"] = stats.iloc[0]["min_val"]
|
| 111 |
profile["max"] = stats.iloc[0]["max_val"]
|
|
|
|
|
|
|
| 112 |
|
| 113 |
if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
|
| 114 |
top = pd.read_sql(
|
|
@@ -170,6 +178,7 @@ def build_text(table_name: str, row_count: int, col: dict, profile: dict) -> str
|
|
| 170 |
text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
|
| 171 |
if "min" in profile:
|
| 172 |
text += f"Min: {profile['min']}, Max: {profile['max']}\n"
|
|
|
|
| 173 |
if "top_values" in profile:
|
| 174 |
top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
|
| 175 |
text += f"Top values: {top_str}\n"
|
|
|
|
| 94 |
if is_numeric:
|
| 95 |
select_cols.append(f"MIN({qc}) AS min_val")
|
| 96 |
select_cols.append(f"MAX({qc}) AS max_val")
|
| 97 |
+
select_cols.append(f"AVG({qc}) AS mean_val")
|
| 98 |
+
# PERCENTILE_CONT is supported by Postgres and SQL Server; MySQL would need
|
| 99 |
+
# a dialect-specific fallback when that connector is added.
|
| 100 |
+
select_cols.append(
|
| 101 |
+
f"PERCENTILE_CONT(0.5) WITHIN GROUP (ORDER BY {qc}) AS median_val"
|
| 102 |
+
)
|
| 103 |
stats = pd.read_sql(f"SELECT {', '.join(select_cols)} FROM {qt}", engine)
|
| 104 |
|
| 105 |
null_count = int(stats.iloc[0]["nulls"])
|
|
|
|
| 115 |
if is_numeric:
|
| 116 |
profile["min"] = stats.iloc[0]["min_val"]
|
| 117 |
profile["max"] = stats.iloc[0]["max_val"]
|
| 118 |
+
profile["mean"] = stats.iloc[0]["mean_val"]
|
| 119 |
+
profile["median"] = stats.iloc[0]["median_val"]
|
| 120 |
|
| 121 |
if 0 < distinct_ratio <= TOP_VALUES_THRESHOLD:
|
| 122 |
top = pd.read_sql(
|
|
|
|
| 178 |
text += f"Distinct count: {profile['distinct_count']} ({profile['distinct_ratio']:.1%})\n"
|
| 179 |
if "min" in profile:
|
| 180 |
text += f"Min: {profile['min']}, Max: {profile['max']}\n"
|
| 181 |
+
text += f"Mean: {profile['mean']}, Median: {profile['median']}\n"
|
| 182 |
if "top_values" in profile:
|
| 183 |
top_str = ", ".join(f"{v} ({c})" for v, c in profile["top_values"])
|
| 184 |
text += f"Top values: {top_str}\n"
|