import pandas as pd import numpy as np # column matching def find_column(question, columns): q = question.lower() best = None score = 0 for col in columns: c = col.lower().replace("_", " ") s = 0 if c in q: s += 10 for word in q.split(): if word in c: s += 1 if s > score: score = s best = col return best ## query executer def execute_query(df, question): q = question.lower() numeric_cols = df.select_dtypes( include=np.number ).columns.tolist() categorical_cols = df.select_dtypes( include=["object", "category"] ).columns.tolist() ## gighest if any(w in q for w in [ "highest", "max", "most", "top", "best" ]): value_col = find_column( q, numeric_cols ) group_col = find_column( q, categorical_cols ) if value_col is None: if len(numeric_cols) > 0: value_col = numeric_cols[0] if group_col is None: if len(categorical_cols) > 0: group_col = categorical_cols[0] if not value_col or not group_col: return { "success": False, "message": "Could not identify columns" } grouped = ( df.groupby(group_col)[value_col] .mean() .sort_values(ascending=False) ) top_name = grouped.index[0] top_value = grouped.iloc[0] return { "success": True, "type": "highest", "group_column": group_col, "value_column": value_col, "name": str(top_name), "value": float(top_value) } ## lowest if any(w in q for w in [ "lowest", "minimum", "least", "worst" ]): value_col = find_column( q, numeric_cols ) group_col = find_column( q, categorical_cols ) if value_col is None: if len(numeric_cols) > 0: value_col = numeric_cols[0] if group_col is None: if len(categorical_cols) > 0: group_col = categorical_cols[0] grouped = ( df.groupby(group_col)[value_col] .mean() .sort_values(ascending=True) ) top_name = grouped.index[0] top_value = grouped.iloc[0] return { "success": True, "type": "lowest", "group_column": group_col, "value_column": value_col, "name": str(top_name), "value": float(top_value) } ## avg if any(w in q for w in [ "average", "mean", "avg" ]): col = find_column(q, numeric_cols) if col is None: col = numeric_cols[0] value = df[col].mean() return { "success": True, "type": "average", "column": col, "value": float(value) } ## correlation if any(w in q for w in [ "correlation", "relationship", "impact", "efffect", "affected" ]): if len(numeric_cols) < 2: return { "success": False, "message": "Need 2 numeric columns" } best_corr = 0 best_pair = None for i in range(len(numeric_cols)): for j in range(i + 1, len(numeric_cols)): c1 = numeric_cols[i] c2 = numeric_cols[j] corr = df[c1].corr(df[c2]) if abs(corr) > abs(best_corr): best_corr = corr best_pair = (c1, c2) return { "success": True, "type": "correlation", "col1": best_pair[0], "col2": best_pair[1], "correlation": float(best_corr) } return { "success": False, "message": "Could not understand question" }