Spaces:
Sleeping
Sleeping
File size: 2,761 Bytes
f73646a | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 | import pandas as pd
# BASIC INFO
def get_shape(df):
    """Report the dimensions of a DataFrame.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A dict with the number of rows under ``"rows"`` and the number of
        columns under ``"columns"``.
    """
    dims = df.shape
    return dict(rows=dims[0], columns=dims[1])
def get_columns(df):
    """Return the DataFrame's column labels as a plain Python list.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A list of the column labels, in column order.
    """
    labels = df.columns
    return labels.tolist()
def get_dtypes(df):
    """Map each column label to the string name of its dtype.

    Args:
        df: The DataFrame to inspect.

    Returns:
        A dict of ``{column label: dtype name}``, e.g. ``{"a": "int64"}``.
    """
    return {label: str(dtype) for label, dtype in df.dtypes.items()}
# COUNT UNIQUE VALUES
def count_unique(df, column):
    """Count the distinct values in one column.

    Args:
        df: The DataFrame to inspect.
        column: Label of the column to count.

    Returns:
        The number of distinct values as an ``int`` (using pandas'
        ``nunique`` defaults), or an error message string when ``column``
        is not present in ``df``.
    """
    if column in df.columns:
        distinct = df[column].nunique()
        return int(distinct)
    return f"Column '{column}' not found."
# VALUE COUNTS
def get_value_counts(df, column, top_n=10):
    """Return the most frequent values of a column with their counts.

    Args:
        df: The DataFrame to inspect.
        column: Label of the column to tally.
        top_n: How many entries to keep from the head of the tally.

    Returns:
        A dict of ``{value: count}`` holding the first ``top_n`` entries of
        ``value_counts()``, or an error message string when ``column`` is
        not present in ``df``.
    """
    if column in df.columns:
        frequencies = df[column].value_counts()
        most_common = frequencies.head(top_n)
        return most_common.to_dict()
    return f"Column '{column}' not found."
# column mean
def get_mean(df, column):
    """Compute the arithmetic mean of a numeric column.

    Args:
        df: The DataFrame to inspect.
        column: Label of the column to average.

    Returns:
        The mean as a ``float``, or an error message string when ``column``
        is missing from ``df`` or does not have a numeric dtype.
    """
    if column not in df.columns:
        return f"Column '{column}' not found."

    series = df[column]
    if pd.api.types.is_numeric_dtype(series):
        return float(series.mean())
    return f"Column '{column}' is not numeric."
# groupby mean
def groupby_mean(
    df,
    group_col,
    value_col,
    ascending=False,
    top_n=10,
):
    """Average a numeric column per group and return the leading groups.

    Args:
        df: The DataFrame to aggregate.
        group_col: Label of the column whose values define the groups.
        value_col: Label of the numeric column to average within each group.
        ascending: Sort order of the group means; ``False`` (the default)
            puts the largest mean first.
        top_n: How many groups to keep after sorting. Defaults to 10, which
            was previously a fixed limit.

    Returns:
        A dict of ``{group value: mean}`` with at most ``top_n`` entries in
        sorted order, or an error message string when either column is
        missing or ``value_col`` does not have a numeric dtype.
    """
    if group_col not in df.columns:
        return f"Column '{group_col}' not found."
    if value_col not in df.columns:
        return f"Column '{value_col}' not found."
    if not pd.api.types.is_numeric_dtype(df[value_col]):
        return f"Column '{value_col}' is not numeric."

    group_means = (
        df.groupby(group_col)[value_col]
        .mean()
        .sort_values(ascending=ascending)
    )
    return group_means.head(top_n).to_dict()
# correlation
def calculate_correlation(
    df,
    col1,
    col2
):
    """Compute the correlation between two numeric columns.

    Args:
        df: The DataFrame to inspect.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        The result of ``Series.corr`` (default method) as a ``float``, or
        an error message string when either column is missing or does not
        have a numeric dtype. Both existence checks run before either
        dtype check.
    """
    pair = (col1, col2)

    for label in pair:
        if label not in df.columns:
            return f"Column '{label}' not found."

    for label in pair:
        if not pd.api.types.is_numeric_dtype(df[label]):
            return f"Column '{label}' is not numeric."

    first, second = df[col1], df[col2]
    return float(first.corr(second))
# max value row
def get_max_row(df, column):
    """Return the row holding the largest value of a numeric column.

    Args:
        df: The DataFrame to search.
        column: Label of the numeric column to maximize.

    Returns:
        A dict of ``{column label: value}`` for the first row where
        ``column`` reaches its maximum (NaN values are skipped), or an
        error message string when ``column`` is missing, is not numeric,
        or contains no non-NaN values.
    """
    if column not in df.columns:
        return f"Column '{column}' not found."
    if not pd.api.types.is_numeric_dtype(df[column]):
        return f"Column '{column}' is not numeric."

    # Work positionally: with duplicate index labels, a label lookup via
    # ``df.loc`` would return several rows (a DataFrame) instead of one.
    values = df[column].reset_index(drop=True)

    # ``idxmax`` cannot pick a row from an empty or all-NaN column; report it
    # in the same message style as the other guards instead of crashing.
    if values.isna().all():
        return f"Column '{column}' has no valid values."

    position = values.idxmax()
    return df.iloc[position].to_dict()
# min value row
def get_min_row(df, column):
    """Return the row holding the smallest value of a numeric column.

    Args:
        df: The DataFrame to search.
        column: Label of the numeric column to minimize.

    Returns:
        A dict of ``{column label: value}`` for the first row where
        ``column`` reaches its minimum (NaN values are skipped), or an
        error message string when ``column`` is missing, is not numeric,
        or contains no non-NaN values.
    """
    if column not in df.columns:
        return f"Column '{column}' not found."
    if not pd.api.types.is_numeric_dtype(df[column]):
        return f"Column '{column}' is not numeric."

    # Work positionally: with duplicate index labels, a label lookup via
    # ``df.loc`` would return several rows (a DataFrame) instead of one.
    values = df[column].reset_index(drop=True)

    # ``idxmin`` cannot pick a row from an empty or all-NaN column; report it
    # in the same message style as the other guards instead of crashing.
    if values.isna().all():
        return f"Column '{column}' has no valid values."

    position = values.idxmin()
    return df.iloc[position].to_dict()