Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -5,18 +5,13 @@ import io
|
|
| 5 |
import re
|
| 6 |
import gc
|
| 7 |
import os
|
| 8 |
-
from typing import Tuple, Optional
|
| 9 |
|
| 10 |
-
# ----------
|
| 11 |
def read_uploaded_file(file):
|
| 12 |
-
"""
|
| 13 |
-
Try multiple ways to get bytes from Gradio upload objects.
|
| 14 |
-
Returns (bytes_content, filename or None, error_message)
|
| 15 |
-
"""
|
| 16 |
if file is None:
|
| 17 |
return None, None, "No file uploaded."
|
| 18 |
-
|
| 19 |
-
# 1) If object has read() (file-like), use it
|
| 20 |
try:
|
| 21 |
if hasattr(file, "read"):
|
| 22 |
content = file.read()
|
|
@@ -24,8 +19,6 @@ def read_uploaded_file(file):
|
|
| 24 |
return content, name, None
|
| 25 |
except Exception:
|
| 26 |
pass
|
| 27 |
-
|
| 28 |
-
# 2) If it's a path-like string, open it
|
| 29 |
try:
|
| 30 |
if isinstance(file, (str, os.PathLike)):
|
| 31 |
path = str(file)
|
|
@@ -35,25 +28,18 @@ def read_uploaded_file(file):
|
|
| 35 |
return content, os.path.basename(path), None
|
| 36 |
except Exception:
|
| 37 |
pass
|
| 38 |
-
|
| 39 |
-
# 3) If it's a dict-like object (some envs), try common keys
|
| 40 |
try:
|
| 41 |
if isinstance(file, dict):
|
| 42 |
-
# Many times keys could be 'name' and 'data' or 'content'
|
| 43 |
name = file.get("name") or file.get("filename")
|
| 44 |
data = file.get("data") or file.get("content") or file.get("bytes")
|
| 45 |
if isinstance(data, (bytes, bytearray)):
|
| 46 |
return data, name, None
|
| 47 |
-
if isinstance(data, str):
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
|
| 51 |
-
content = f.read()
|
| 52 |
-
return content, name or os.path.basename(data), None
|
| 53 |
except Exception:
|
| 54 |
pass
|
| 55 |
-
|
| 56 |
-
# 4) Fallback: try to get .name and open that path
|
| 57 |
try:
|
| 58 |
name = getattr(file, "name", None)
|
| 59 |
if name and isinstance(name, str) and os.path.exists(name):
|
|
@@ -62,22 +48,16 @@ def read_uploaded_file(file):
|
|
| 62 |
return content, os.path.basename(name), None
|
| 63 |
except Exception:
|
| 64 |
pass
|
| 65 |
-
|
| 66 |
return None, None, "Uploaded file format not supported by this server environment."
|
| 67 |
|
| 68 |
def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
|
| 69 |
-
"""
|
| 70 |
-
Read uploaded file bytes into a pandas DataFrame WITHOUT saving to disk.
|
| 71 |
-
Returns (df, error_message).
|
| 72 |
-
"""
|
| 73 |
content, name, err = read_uploaded_file(file)
|
| 74 |
if err:
|
| 75 |
return None, f"Error reading file: {err}"
|
| 76 |
if content is None:
|
| 77 |
return None, "No content read from uploaded file."
|
| 78 |
-
|
| 79 |
try:
|
| 80 |
-
# Basic heuristic for CSV vs Excel
|
| 81 |
fname = (name or "").lower()
|
| 82 |
if fname.endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
|
| 83 |
df = pd.read_csv(io.BytesIO(content))
|
|
@@ -86,7 +66,6 @@ def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
|
|
| 86 |
except Exception as e:
|
| 87 |
return None, f"Error reading file: {e}"
|
| 88 |
finally:
|
| 89 |
-
# remove raw bytes quickly
|
| 90 |
try:
|
| 91 |
del content
|
| 92 |
except Exception:
|
|
@@ -94,18 +73,89 @@ def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
|
|
| 94 |
gc.collect()
|
| 95 |
return df, None
|
| 96 |
|
| 97 |
-
# ----------
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 98 |
def simple_nl_to_action(df: pd.DataFrame, query: str):
|
| 99 |
-
"""NL parser that also supports 'wise head count' groupby counting."""
|
| 100 |
q = (query or "").strip().lower()
|
| 101 |
if q == "":
|
| 102 |
-
return None, "Please type a question like: '
|
|
|
|
|
|
|
|
|
|
| 103 |
|
| 104 |
-
#
|
| 105 |
if "columns" in q or "show columns" in q or "list columns" in q:
|
| 106 |
-
return pd.DataFrame({"columns":
|
|
|
|
|
|
|
|
|
|
|
|
|
| 107 |
|
| 108 |
-
#
|
| 109 |
m = re.search(r"(first|head)\s*(\d+)?", q)
|
| 110 |
if "head" in q or "first" in q:
|
| 111 |
n = 5
|
|
@@ -113,88 +163,90 @@ def simple_nl_to_action(df: pd.DataFrame, query: str):
|
|
| 113 |
n = int(m.group(2))
|
| 114 |
return df.head(n), None
|
| 115 |
|
| 116 |
-
#
|
| 117 |
if "describe" in q or "summary" in q or "statistics" in q:
|
| 118 |
return df.describe(include='all').reset_index(), None
|
| 119 |
|
| 120 |
-
#
|
| 121 |
-
|
| 122 |
-
|
| 123 |
-
|
| 124 |
-
|
| 125 |
-
|
| 126 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 127 |
else:
|
| 128 |
-
|
|
|
|
| 129 |
|
| 130 |
-
#
|
| 131 |
-
|
| 132 |
-
|
| 133 |
-
|
| 134 |
-
|
| 135 |
-
|
| 136 |
-
|
| 137 |
-
|
| 138 |
-
|
| 139 |
-
|
| 140 |
-
|
| 141 |
-
|
| 142 |
-
|
| 143 |
-
if
|
| 144 |
-
|
| 145 |
-
|
| 146 |
-
|
| 147 |
-
elif op == "<":
|
| 148 |
-
res = df[df[colname] < val_num]
|
| 149 |
-
elif op == ">=":
|
| 150 |
-
res = df[df[colname] >= val_num]
|
| 151 |
-
elif op == "<=":
|
| 152 |
-
res = df[df[colname] <= val_num]
|
| 153 |
-
return res.head(200), None
|
| 154 |
-
else:
|
| 155 |
-
if op == "=":
|
| 156 |
-
res = df[df[colname].astype(str).str.lower() == val.lower()]
|
| 157 |
-
return res.head(200), None
|
| 158 |
-
else:
|
| 159 |
-
return None, f"Operator {op} not supported for non-numeric column '{colname}'."
|
| 160 |
-
except Exception as e:
|
| 161 |
-
return None, f"Error applying filter: {e}"
|
| 162 |
-
|
| 163 |
-
# 6) department-wise / <column> wise head count / count by column
|
| 164 |
-
# Examples caught: "department wise head count", "count by Department", "headcount by department"
|
| 165 |
-
if re.search(r"(head[\s-]?count|headcount|count)", q) and (("wise" in q) or (" by " in q) or ("by " in q)):
|
| 166 |
-
# try "X wise head count" pattern
|
| 167 |
-
m = re.search(r"([a-z0-9_ ]+?)\s*(?:wise|by)\s*(?:head[\s-]?count|headcount|count)", q)
|
| 168 |
-
col_candidate = None
|
| 169 |
-
if m:
|
| 170 |
-
col_candidate = m.group(1).strip()
|
| 171 |
else:
|
| 172 |
-
|
| 173 |
-
|
| 174 |
-
|
| 175 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 176 |
|
| 177 |
-
|
| 178 |
-
|
| 179 |
-
|
| 180 |
-
|
| 181 |
-
|
| 182 |
-
|
| 183 |
-
|
| 184 |
-
|
| 185 |
-
else:
|
| 186 |
-
return None, f"Column '{col_candidate}' not found. Use 'show columns' to see exact names."
|
| 187 |
else:
|
| 188 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 189 |
|
| 190 |
-
|
| 191 |
-
|
| 192 |
-
|
| 193 |
-
|
| 194 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 195 |
|
| 196 |
-
# fallback: first 10 rows
|
| 197 |
-
return df.head(10), "Couldn't parse exact request — showing first 10 rows. Try: 'show columns', '
|
| 198 |
|
| 199 |
# ---------- Processing wrapper ----------
|
| 200 |
def process(file, query):
|
|
@@ -229,11 +281,8 @@ def process(file, query):
|
|
| 229 |
else:
|
| 230 |
return None, (msg or "No result")
|
| 231 |
|
| 232 |
-
# ---------- Clear / reset
|
| 233 |
def clear_all():
|
| 234 |
-
"""
|
| 235 |
-
Returns Gradio update objects that clear inputs and outputs.
|
| 236 |
-
"""
|
| 237 |
return (
|
| 238 |
gr.File.update(value=None),
|
| 239 |
gr.Textbox.update(value=""),
|
|
@@ -243,10 +292,10 @@ def clear_all():
|
|
| 243 |
|
| 244 |
# ---------- Gradio UI ----------
|
| 245 |
with gr.Blocks() as demo:
|
| 246 |
-
gr.Markdown("# Chat-with-CSV —
|
| 247 |
with gr.Row():
|
| 248 |
file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
|
| 249 |
-
query_input = gr.Textbox(label="Ask a question (
|
| 250 |
with gr.Row():
|
| 251 |
submit = gr.Button("Run query")
|
| 252 |
clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")
|
|
|
|
| 5 |
import re
|
| 6 |
import gc
|
| 7 |
import os
|
| 8 |
+
from typing import Tuple, Optional, List
|
| 9 |
|
| 10 |
+
# ---------- Helpers for uploaded file reading ----------
|
| 11 |
def read_uploaded_file(file):
|
| 12 |
+
"""Try multiple ways to get bytes from Gradio upload objects."""
|
|
|
|
|
|
|
|
|
|
| 13 |
if file is None:
|
| 14 |
return None, None, "No file uploaded."
|
|
|
|
|
|
|
| 15 |
try:
|
| 16 |
if hasattr(file, "read"):
|
| 17 |
content = file.read()
|
|
|
|
| 19 |
return content, name, None
|
| 20 |
except Exception:
|
| 21 |
pass
|
|
|
|
|
|
|
| 22 |
try:
|
| 23 |
if isinstance(file, (str, os.PathLike)):
|
| 24 |
path = str(file)
|
|
|
|
| 28 |
return content, os.path.basename(path), None
|
| 29 |
except Exception:
|
| 30 |
pass
|
|
|
|
|
|
|
| 31 |
try:
|
| 32 |
if isinstance(file, dict):
|
|
|
|
| 33 |
name = file.get("name") or file.get("filename")
|
| 34 |
data = file.get("data") or file.get("content") or file.get("bytes")
|
| 35 |
if isinstance(data, (bytes, bytearray)):
|
| 36 |
return data, name, None
|
| 37 |
+
if isinstance(data, str) and os.path.exists(data):
|
| 38 |
+
with open(data, "rb") as f:
|
| 39 |
+
content = f.read()
|
| 40 |
+
return content, name or os.path.basename(data), None
|
|
|
|
|
|
|
| 41 |
except Exception:
|
| 42 |
pass
|
|
|
|
|
|
|
| 43 |
try:
|
| 44 |
name = getattr(file, "name", None)
|
| 45 |
if name and isinstance(name, str) and os.path.exists(name):
|
|
|
|
| 48 |
return content, os.path.basename(name), None
|
| 49 |
except Exception:
|
| 50 |
pass
|
|
|
|
| 51 |
return None, None, "Uploaded file format not supported by this server environment."
|
| 52 |
|
| 53 |
def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
|
| 54 |
+
"""Read bytes and convert to DataFrame (no disk writes)."""
|
|
|
|
|
|
|
|
|
|
| 55 |
content, name, err = read_uploaded_file(file)
|
| 56 |
if err:
|
| 57 |
return None, f"Error reading file: {err}"
|
| 58 |
if content is None:
|
| 59 |
return None, "No content read from uploaded file."
|
|
|
|
| 60 |
try:
|
|
|
|
| 61 |
fname = (name or "").lower()
|
| 62 |
if fname.endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
|
| 63 |
df = pd.read_csv(io.BytesIO(content))
|
|
|
|
| 66 |
except Exception as e:
|
| 67 |
return None, f"Error reading file: {e}"
|
| 68 |
finally:
|
|
|
|
| 69 |
try:
|
| 70 |
del content
|
| 71 |
except Exception:
|
|
|
|
| 73 |
gc.collect()
|
| 74 |
return df, None
|
| 75 |
|
| 76 |
+
# ---------- Column matching in queries ----------
|
| 77 |
+
def find_columns_in_query(columns: List[str], query: str, max_matches: int = 3) -> List[str]:
    """Return the column names that the query appears to mention.

    Matching runs in three passes, and results keep the order of ``columns``
    within each pass:

    1. the lower-cased column name occurs in the query as whole word(s);
    2. the column name and the query share at least one ``[a-z0-9_]+`` token;
    3. only if the query mentions "department": any column whose name
       contains "department" (catches e.g. ``department_name``).

    Args:
        columns: Candidate column labels (typically ``list(df.columns)``).
        query: Free-text user query; ``None`` is treated as an empty string.
        max_matches: Stop and return as soon as this many columns are found.

    Returns:
        The matched column labels, as they appear in ``columns``.
    """
    q = (query or "").lower()
    found: List[str] = []

    # Pass 1: the whole column name appears in the query on word boundaries.
    for col in columns:
        # str() keeps non-string labels (e.g. integer headers) from crashing.
        name = str(col).lower()
        if re.search(r"\b" + re.escape(name) + r"\b", q):
            found.append(col)
            if len(found) >= max_matches:
                return found

    # Pass 2: any token shared between the query and the column name.
    q_tokens = set(re.findall(r"[a-z0-9_]+", q))
    for col in columns:
        if col in found:
            continue
        col_tokens = set(re.findall(r"[a-z0-9_]+", str(col).lower()))
        if q_tokens & col_tokens:
            found.append(col)
            if len(found) >= max_matches:
                return found

    # Pass 3: "department" fallback. Guarded on the query actually mentioning
    # it; without the guard every query picked up department-like columns,
    # which made callers treat an unrelated column as the grouping column.
    if "department" in q:
        for col in columns:
            if "department" in str(col).lower() and col not in found:
                found.append(col)
                if len(found) >= max_matches:
                    return found

    return found
|
| 107 |
+
|
| 108 |
+
# ---------- Aggregation helpers ----------
|
| 109 |
+
def group_count(df: pd.DataFrame, group_col: str, top_n: Optional[int] = None):
    """Count rows per distinct value of ``group_col``, largest group first.

    Args:
        df: Source table.
        group_col: Column to group by.
        top_n: If truthy, keep only the first ``top_n`` rows of the result;
            ``None`` (or 0) returns every group.

    Returns:
        A DataFrame with columns ``[group_col, "count"]``, sorted by
        ``count`` descending, with a fresh 0-based index.
    """
    sizes = df.groupby(group_col).size().reset_index(name="count")
    ranked = sizes.sort_values("count", ascending=False).reset_index(drop=True)
    # A falsy top_n means "no limit".
    return ranked.head(top_n) if top_n else ranked
|
| 114 |
+
|
| 115 |
+
def group_agg(df: pd.DataFrame, group_col: str, value_col: str, agg: str):
    """Aggregate ``value_col`` per ``group_col`` and sort by the result.

    Args:
        df: Source table.
        group_col: Column to group by.
        value_col: Column whose values are aggregated.
        agg: Aggregation name. ``"mean"``, ``"avg"`` and ``"average"`` all
            compute the mean; ``"sum"``, ``"max"`` and ``"min"`` are handled
            directly; anything else is passed through to pandas ``agg``.

    Returns:
        A DataFrame with columns ``[group_col, <label>]`` sorted by the
        aggregated column descending, with a fresh 0-based index. The label
        is ``"average"`` for the mean aliases and the ``agg`` value itself
        otherwise.
    """
    # Alias -> (pandas aggregation, output column label).
    known = {
        "mean": ("mean", "average"),
        "avg": ("mean", "average"),
        "average": ("mean", "average"),
        "sum": ("sum", "sum"),
        "max": ("max", "max"),
        "min": ("min", "min"),
    }
    func, label = known.get(agg, (agg, agg))

    res = (
        df.groupby(group_col)[value_col]
        .agg(func)
        .reset_index()
        .rename(columns={value_col: label})
    )
    # The aggregated column is always the last one after reset_index().
    return res.sort_values(res.columns[-1], ascending=False).reset_index(drop=True)
|
| 127 |
+
|
| 128 |
+
def compute_percentage_counts(df: pd.DataFrame, group_col: str):
    """Return per-group row counts plus each group's share of the total.

    Args:
        df: Source table.
        group_col: Column to group by.

    Returns:
        The ``group_count`` result for ``group_col`` with an extra
        ``"percentage"`` column: ``count / sum(count) * 100`` rounded to two
        decimals.
    """
    counts = group_count(df, group_col)
    total = counts["count"].sum()
    counts["percentage"] = (counts["count"] / total * 100).round(2)
    return counts
|
| 133 |
+
|
| 134 |
+
def compute_percentage_of_value(df: pd.DataFrame, group_col: str, value_col: str):
    """Return each group's sum of ``value_col`` and its share of the grand total.

    Args:
        df: Source table.
        group_col: Column to group by.
        value_col: Column that is summed per group.

    Returns:
        A DataFrame with columns ``[group_col, "sum", "percentage"]`` sorted
        by ``sum`` descending, with a fresh 0-based index. ``percentage`` is
        ``sum / grand_total * 100`` rounded to two decimals.
    """
    sums = (
        df.groupby(group_col)[value_col]
        .sum()
        .reset_index()
        .rename(columns={value_col: "sum"})
    )
    grand_total = sums["sum"].sum()
    sums["percentage"] = (sums["sum"] / grand_total * 100).round(2)
    return sums.sort_values("sum", ascending=False).reset_index(drop=True)
|
| 140 |
+
|
| 141 |
+
# ---------- Natural language parser & action ----------
|
| 142 |
def simple_nl_to_action(df: pd.DataFrame, query: str):
|
|
|
|
| 143 |
q = (query or "").strip().lower()
|
| 144 |
if q == "":
|
| 145 |
+
return None, "Please type a question like: 'department wise head count', 'percentage of employees by department', 'average salary by department', or 'show columns'."
|
| 146 |
+
|
| 147 |
+
cols = list(df.columns)
|
| 148 |
+
matched = find_columns_in_query(cols, q, max_matches=3) # up to 3 column matches
|
| 149 |
|
| 150 |
+
# direct commands
|
| 151 |
if "columns" in q or "show columns" in q or "list columns" in q:
|
| 152 |
+
return pd.DataFrame({"columns": cols}), None
|
| 153 |
+
|
| 154 |
+
# overall totals
|
| 155 |
+
if re.search(r"\b(total|how many|count of rows|number of rows|total employees|total employee)\b", q):
|
| 156 |
+
return pd.DataFrame({"total_rows": [len(df)]}), None
|
| 157 |
|
| 158 |
+
# show first N rows
|
| 159 |
m = re.search(r"(first|head)\s*(\d+)?", q)
|
| 160 |
if "head" in q or "first" in q:
|
| 161 |
n = 5
|
|
|
|
| 163 |
n = int(m.group(2))
|
| 164 |
return df.head(n), None
|
| 165 |
|
| 166 |
+
# describe / summary
|
| 167 |
if "describe" in q or "summary" in q or "statistics" in q:
|
| 168 |
return df.describe(include='all').reset_index(), None
|
| 169 |
|
| 170 |
+
# HEADCOUNT / COUNT requests (department wise head count etc.)
|
| 171 |
+
if any(w in q for w in ["headcount", "head count", "head-count", "headcounts", "head count", "number of employees", "how many", "count by", "count of", "count"]):
|
| 172 |
+
# If a grouping column is mentioned, use it
|
| 173 |
+
if matched:
|
| 174 |
+
group_col = matched[0]
|
| 175 |
+
# if user mentions percentage as well
|
| 176 |
+
if "%" in q or "percentage" in q or "percent" in q or "share" in q:
|
| 177 |
+
return compute_percentage_counts(df, group_col), None
|
| 178 |
+
# If they asked which has maximum
|
| 179 |
+
if any(w in q for w in ["most", "maximum", "max", "highest", "where max", "to which"]):
|
| 180 |
+
counts = group_count(df, group_col)
|
| 181 |
+
top = counts.head(1)
|
| 182 |
+
# also show full counts for context
|
| 183 |
+
summary = counts
|
| 184 |
+
# build a small output that includes top and summary (we'll return summary; top is first row)
|
| 185 |
+
return summary, f"Top: {top.iloc[0,0]} with {top.iloc[0,1]} (rows)."
|
| 186 |
+
# just return counts
|
| 187 |
+
return group_count(df, group_col), None
|
| 188 |
else:
|
| 189 |
+
# no group column mentioned: return total rows
|
| 190 |
+
return pd.DataFrame({"total_rows": [len(df)]}), None
|
| 191 |
|
| 192 |
+
# AGGREGATION requests (average, mean, sum, max/min of a numeric column grouped by another)
|
| 193 |
+
if any(w in q for w in ["average", "mean", "avg", "sum", "total", "maximum", "minimum", "max", "min"]):
|
| 194 |
+
# try to detect grouping and value column
|
| 195 |
+
if len(matched) >= 2:
|
| 196 |
+
group_col = matched[0]
|
| 197 |
+
value_col = matched[1]
|
| 198 |
+
elif len(matched) == 1:
|
| 199 |
+
# ambiguous: user mentioned one column. If that's numeric, perhaps they want overall average
|
| 200 |
+
cand = matched[0]
|
| 201 |
+
if pd.api.types.is_numeric_dtype(df[cand]):
|
| 202 |
+
# overall stat
|
| 203 |
+
if any(w in q for w in ["average", "mean", "avg"]):
|
| 204 |
+
return pd.DataFrame({f"overall_{cand}_average": [df[cand].mean()]}), None
|
| 205 |
+
if "sum" in q or "total" in q:
|
| 206 |
+
return pd.DataFrame({f"overall_{cand}_sum": [df[cand].sum()]}), None
|
| 207 |
+
# else ask for more clarity
|
| 208 |
+
return None, "I found one column but couldn't tell grouping vs value column. Please ask like 'average Salary by Department' or 'sum Sales by Region'."
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 209 |
else:
|
| 210 |
+
return None, "Please mention columns. Example: 'average Salary by Department' or 'sum Sales by Region'."
|
| 211 |
+
# determine aggregation type
|
| 212 |
+
if any(w in q for w in ["average", "mean", "avg"]):
|
| 213 |
+
return group_agg(df, group_col, value_col, "mean"), None
|
| 214 |
+
if any(w in q for w in ["sum", "total"]):
|
| 215 |
+
return group_agg(df, group_col, value_col, "sum"), None
|
| 216 |
+
if any(w in q for w in ["max", "maximum", "highest"]):
|
| 217 |
+
return group_agg(df, group_col, value_col, "max"), None
|
| 218 |
+
if any(w in q for w in ["min", "minimum", "lowest"]):
|
| 219 |
+
return group_agg(df, group_col, value_col, "min"), None
|
| 220 |
|
| 221 |
+
# PERCENTAGE requests for a numeric column per group
|
| 222 |
+
if any(w in q for w in ["percentage", "%", "percent", "share"]):
|
| 223 |
+
# if two columns mentioned, assume first is group, second is numeric value
|
| 224 |
+
if len(matched) >= 2:
|
| 225 |
+
group_col = matched[0]
|
| 226 |
+
value_col = matched[1]
|
| 227 |
+
if pd.api.types.is_numeric_dtype(df[value_col]):
|
| 228 |
+
return compute_percentage_of_value(df, group_col, value_col), None
|
|
|
|
|
|
|
| 229 |
else:
|
| 230 |
+
return None, f"Column '{value_col}' is not numeric; cannot compute percentage of values."
|
| 231 |
+
elif len(matched) == 1:
|
| 232 |
+
group_col = matched[0]
|
| 233 |
+
# percent of counts
|
| 234 |
+
return compute_percentage_counts(df, group_col), None
|
| 235 |
+
else:
|
| 236 |
+
return None, "Please mention the group column (and optionally a numeric column). Example: 'percentage of Salary by Department' or 'percentage of employees by Department'."
|
| 237 |
|
| 238 |
+
# SHOW specific columns (e.g., 'show Department and Salary')
|
| 239 |
+
m = re.search(r"show (.+)", q)
|
| 240 |
+
if m:
|
| 241 |
+
# try to extract column names from matched list
|
| 242 |
+
if matched:
|
| 243 |
+
# if user asked show with two columns, return them
|
| 244 |
+
return df[matched].head(200), None
|
| 245 |
+
else:
|
| 246 |
+
return None, "Couldn't identify columns to show. Use 'show columns' to view exact names."
|
| 247 |
|
| 248 |
+
# fallback: return first 10 rows with suggestion
|
| 249 |
+
return df.head(10), "Couldn't parse exact request — showing first 10 rows. Try: 'show columns', 'department wise head count', 'percentage of employees by department', or 'average Salary by Department'."
|
| 250 |
|
| 251 |
# ---------- Processing wrapper ----------
|
| 252 |
def process(file, query):
|
|
|
|
| 281 |
else:
|
| 282 |
return None, (msg or "No result")
|
| 283 |
|
| 284 |
+
# ---------- Clear / reset ----------
|
| 285 |
def clear_all():
|
|
|
|
|
|
|
|
|
|
| 286 |
return (
|
| 287 |
gr.File.update(value=None),
|
| 288 |
gr.Textbox.update(value=""),
|
|
|
|
| 292 |
|
| 293 |
# ---------- Gradio UI ----------
|
| 294 |
with gr.Blocks() as demo:
|
| 295 |
+
gr.Markdown("# Chat-with-CSV — enhanced analysis (ephemeral uploads)")
|
| 296 |
with gr.Row():
|
| 297 |
file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
|
| 298 |
+
query_input = gr.Textbox(label="Ask a question (examples: 'department wise head count', 'percentage of Salary by Department', 'average Salary by Department')", placeholder="Type your question here")
|
| 299 |
with gr.Row():
|
| 300 |
submit = gr.Button("Run query")
|
| 301 |
clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")
|