data_analysis_agent / utils /data_tools.py
Shrouk04's picture
Upload 36 files
f73646a verified
Raw
History Blame Contribute Delete
2.76 kB
import pandas as pd
# BASIC INFO
def get_shape(df):
    """Report the dimensions of a DataFrame.

    Args:
        df: Object exposing a two-element ``shape`` (rows, columns).

    Returns:
        A dict with the keys ``"rows"`` and ``"columns"``.
    """
    dims = df.shape
    return dict(rows=dims[0], columns=dims[1])
def get_columns(df):
    """Return the column labels of ``df`` as a plain list."""
    labels = df.columns
    return labels.tolist()
def get_dtypes(df):
    """Map each column label of ``df`` to the string form of its dtype.

    Returns:
        A dict ``{column_label: dtype_name}`` built from ``df.dtypes``.
    """
    return {label: str(dtype) for label, dtype in df.dtypes.items()}
# COUNT UNIQUE VALUES
def count_unique(df, column):
    """Count the distinct values in one column.

    Args:
        df: DataFrame to inspect.
        column: Label of the column to count.

    Returns:
        The result of ``Series.nunique()`` (default settings) as an ``int``,
        or an error string when ``column`` is not a column of ``df``.
    """
    if column in df.columns:
        distinct = df[column].nunique()
        return int(distinct)
    return f"Column '{column}' not found."
# VALUE COUNTS
def get_value_counts(df, column, top_n=10):
    """Return the frequency of values in one column.

    Args:
        df: DataFrame to inspect.
        column: Label of the column to tally.
        top_n: Number of leading entries of the ``value_counts()`` result
            to keep.

    Returns:
        A dict ``{value: count}`` holding the first ``top_n`` entries of
        ``df[column].value_counts()``, or an error string when ``column``
        is not a column of ``df``.
    """
    if column in df.columns:
        frequencies = df[column].value_counts()
        leading = frequencies.head(top_n)
        return leading.to_dict()
    return f"Column '{column}' not found."
# column mean
def get_mean(df, column):
    """Compute the mean of a numeric column.

    Args:
        df: DataFrame to inspect.
        column: Label of the column to average.

    Returns:
        The column mean as a ``float``. An error string is returned instead
        when the column is absent or its dtype is not numeric.
    """
    if column in df.columns:
        values = df[column]
        if pd.api.types.is_numeric_dtype(values):
            return float(values.mean())
        return f"Column '{column}' is not numeric."
    return f"Column '{column}' not found."
# groupby mean
def groupby_mean(
    df,
    group_col,
    value_col,
    ascending=False,
    top_n=10,
):
    """Average ``value_col`` within each group of ``group_col``.

    Args:
        df: DataFrame to aggregate.
        group_col: Label of the column whose values define the groups.
        value_col: Label of the numeric column to average.
        ascending: Sort order of the group means; ``False`` (the default)
            puts the largest means first.
        top_n: Maximum number of groups to return after sorting. Defaults
            to 10, the limit that was previously hard-coded, and mirrors
            the ``top_n`` parameter of ``get_value_counts``.

    Returns:
        A dict ``{group: mean}`` with at most ``top_n`` entries in sorted
        order. An error string is returned instead when either column is
        absent or ``value_col`` is not numeric.
    """
    # Validate both labels before touching the data so the caller gets a
    # readable message rather than a KeyError.
    for label in (group_col, value_col):
        if label not in df.columns:
            return f"Column '{label}' not found."
    if not pd.api.types.is_numeric_dtype(df[value_col]):
        return f"Column '{value_col}' is not numeric."

    group_means = df.groupby(group_col)[value_col].mean()
    ordered = group_means.sort_values(ascending=ascending)
    return ordered.head(top_n).to_dict()
# correlation
def calculate_correlation(
    df,
    col1,
    col2
):
    """Compute the correlation between two numeric columns.

    Args:
        df: DataFrame to inspect.
        col1: Label of the first column.
        col2: Label of the second column.

    Returns:
        ``df[col1].corr(df[col2])`` converted to ``float``. An error string
        is returned instead when a column is absent or not numeric.
    """
    pair = (col1, col2)

    # Existence is checked for both columns before either dtype is
    # inspected, which fixes the order in which error messages can appear.
    for label in pair:
        if label not in df.columns:
            return f"Column '{label}' not found."
    for label in pair:
        if not pd.api.types.is_numeric_dtype(df[label]):
            return f"Column '{label}' is not numeric."

    first, second = df[col1], df[col2]
    return float(first.corr(second))
# max value row
def get_max_row(df, column):
    """Return the row holding the largest value of a numeric column.

    Args:
        df: DataFrame to inspect.
        column: Label of the numeric column to maximise.

    Returns:
        A dict ``{column_label: value}`` for the first row at which the
        column reaches its maximum (missing values are skipped). An error
        string is returned instead when the column is absent, is not
        numeric, or contains no non-null values.
    """
    if column not in df.columns:
        return f"Column '{column}' not found."

    values = df[column]
    if not pd.api.types.is_numeric_dtype(values):
        return f"Column '{column}' is not numeric."

    # An empty or all-NaN column has no maximum; report that in the same
    # string style as the other checks instead of raising.
    if not values.notna().any():
        return f"Column '{column}' has no non-null values."

    # Select by position rather than by index label: with duplicated index
    # labels a label lookup returns several rows (a DataFrame), which would
    # turn the result into a nested dict.
    position = int(values.argmax())
    return df.iloc[position].to_dict()
# min value row
def get_min_row(df, column):
    """Return the row holding the smallest value of a numeric column.

    Args:
        df: DataFrame to inspect.
        column: Label of the numeric column to minimise.

    Returns:
        A dict ``{column_label: value}`` for the first row at which the
        column reaches its minimum (missing values are skipped). An error
        string is returned instead when the column is absent, is not
        numeric, or contains no non-null values.
    """
    if column not in df.columns:
        return f"Column '{column}' not found."

    values = df[column]
    if not pd.api.types.is_numeric_dtype(values):
        return f"Column '{column}' is not numeric."

    # An empty or all-NaN column has no minimum; report that in the same
    # string style as the other checks instead of raising.
    if not values.notna().any():
        return f"Column '{column}' has no non-null values."

    # Select by position rather than by index label: with duplicated index
    # labels a label lookup returns several rows (a DataFrame), which would
    # turn the result into a nested dict.
    position = int(values.argmin())
    return df.iloc[position].to_dict()