Spaces:
Sleeping
Sleeping
| import pandas as pd | |
| import numpy as np | |
| # column matching | |
| def find_column(question, columns): | |
| q = question.lower() | |
| best = None | |
| score = 0 | |
| for col in columns: | |
| c = col.lower().replace("_", " ") | |
| s = 0 | |
| if c in q: | |
| s += 10 | |
| for word in q.split(): | |
| if word in c: | |
| s += 1 | |
| if s > score: | |
| score = s | |
| best = col | |
| return best | |
| ## query executer | |
| def execute_query(df, question): | |
| q = question.lower() | |
| numeric_cols = df.select_dtypes( | |
| include=np.number | |
| ).columns.tolist() | |
| categorical_cols = df.select_dtypes( | |
| include=["object", "category"] | |
| ).columns.tolist() | |
| ## gighest | |
| if any(w in q for w in [ | |
| "highest", | |
| "max", | |
| "most", | |
| "top", | |
| "best" | |
| ]): | |
| value_col = find_column( | |
| q, | |
| numeric_cols | |
| ) | |
| group_col = find_column( | |
| q, | |
| categorical_cols | |
| ) | |
| if value_col is None: | |
| if len(numeric_cols) > 0: | |
| value_col = numeric_cols[0] | |
| if group_col is None: | |
| if len(categorical_cols) > 0: | |
| group_col = categorical_cols[0] | |
| if not value_col or not group_col: | |
| return { | |
| "success": False, | |
| "message": "Could not identify columns" | |
| } | |
| grouped = ( | |
| df.groupby(group_col)[value_col] | |
| .mean() | |
| .sort_values(ascending=False) | |
| ) | |
| top_name = grouped.index[0] | |
| top_value = grouped.iloc[0] | |
| return { | |
| "success": True, | |
| "type": "highest", | |
| "group_column": group_col, | |
| "value_column": value_col, | |
| "name": str(top_name), | |
| "value": float(top_value) | |
| } | |
| ## lowest | |
| if any(w in q for w in [ | |
| "lowest", | |
| "minimum", | |
| "least", | |
| "worst" | |
| ]): | |
| value_col = find_column( | |
| q, | |
| numeric_cols | |
| ) | |
| group_col = find_column( | |
| q, | |
| categorical_cols | |
| ) | |
| if value_col is None: | |
| if len(numeric_cols) > 0: | |
| value_col = numeric_cols[0] | |
| if group_col is None: | |
| if len(categorical_cols) > 0: | |
| group_col = categorical_cols[0] | |
| grouped = ( | |
| df.groupby(group_col)[value_col] | |
| .mean() | |
| .sort_values(ascending=True) | |
| ) | |
| top_name = grouped.index[0] | |
| top_value = grouped.iloc[0] | |
| return { | |
| "success": True, | |
| "type": "lowest", | |
| "group_column": group_col, | |
| "value_column": value_col, | |
| "name": str(top_name), | |
| "value": float(top_value) | |
| } | |
| ## avg | |
| if any(w in q for w in [ | |
| "average", | |
| "mean", | |
| "avg" | |
| ]): | |
| col = find_column(q, numeric_cols) | |
| if col is None: | |
| col = numeric_cols[0] | |
| value = df[col].mean() | |
| return { | |
| "success": True, | |
| "type": "average", | |
| "column": col, | |
| "value": float(value) | |
| } | |
| ## correlation | |
| if any(w in q for w in [ | |
| "correlation", | |
| "relationship", | |
| "impact", | |
| "efffect", | |
| "affected" | |
| ]): | |
| if len(numeric_cols) < 2: | |
| return { | |
| "success": False, | |
| "message": "Need 2 numeric columns" | |
| } | |
| best_corr = 0 | |
| best_pair = None | |
| for i in range(len(numeric_cols)): | |
| for j in range(i + 1, len(numeric_cols)): | |
| c1 = numeric_cols[i] | |
| c2 = numeric_cols[j] | |
| corr = df[c1].corr(df[c2]) | |
| if abs(corr) > abs(best_corr): | |
| best_corr = corr | |
| best_pair = (c1, c2) | |
| return { | |
| "success": True, | |
| "type": "correlation", | |
| "col1": best_pair[0], | |
| "col2": best_pair[1], | |
| "correlation": float(best_corr) | |
| } | |
| return { | |
| "success": False, | |
| "message": "Could not understand question" | |
| } |