rithwikreal committed on
Commit
f33de66
·
verified ·
1 Parent(s): a36f327

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +162 -113
app.py CHANGED
@@ -5,18 +5,13 @@ import io
5
  import re
6
  import gc
7
  import os
8
- from typing import Tuple, Optional
9
 
10
- # ---------- Helper functions ----------
11
  def read_uploaded_file(file):
12
- """
13
- Try multiple ways to get bytes from Gradio upload objects.
14
- Returns (bytes_content, filename or None, error_message)
15
- """
16
  if file is None:
17
  return None, None, "No file uploaded."
18
-
19
- # 1) If object has read() (file-like), use it
20
  try:
21
  if hasattr(file, "read"):
22
  content = file.read()
@@ -24,8 +19,6 @@ def read_uploaded_file(file):
24
  return content, name, None
25
  except Exception:
26
  pass
27
-
28
- # 2) If it's a path-like string, open it
29
  try:
30
  if isinstance(file, (str, os.PathLike)):
31
  path = str(file)
@@ -35,25 +28,18 @@ def read_uploaded_file(file):
35
  return content, os.path.basename(path), None
36
  except Exception:
37
  pass
38
-
39
- # 3) If it's a dict-like object (some envs), try common keys
40
  try:
41
  if isinstance(file, dict):
42
- # Many times keys could be 'name' and 'data' or 'content'
43
  name = file.get("name") or file.get("filename")
44
  data = file.get("data") or file.get("content") or file.get("bytes")
45
  if isinstance(data, (bytes, bytearray)):
46
  return data, name, None
47
- if isinstance(data, str):
48
- # maybe base64 or path — try open
49
- if os.path.exists(data):
50
- with open(data, "rb") as f:
51
- content = f.read()
52
- return content, name or os.path.basename(data), None
53
  except Exception:
54
  pass
55
-
56
- # 4) Fallback: try to get .name and open that path
57
  try:
58
  name = getattr(file, "name", None)
59
  if name and isinstance(name, str) and os.path.exists(name):
@@ -62,22 +48,16 @@ def read_uploaded_file(file):
62
  return content, os.path.basename(name), None
63
  except Exception:
64
  pass
65
-
66
  return None, None, "Uploaded file format not supported by this server environment."
67
 
68
  def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
69
- """
70
- Read uploaded file bytes into a pandas DataFrame WITHOUT saving to disk.
71
- Returns (df, error_message).
72
- """
73
  content, name, err = read_uploaded_file(file)
74
  if err:
75
  return None, f"Error reading file: {err}"
76
  if content is None:
77
  return None, "No content read from uploaded file."
78
-
79
  try:
80
- # Basic heuristic for CSV vs Excel
81
  fname = (name or "").lower()
82
  if fname.endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
83
  df = pd.read_csv(io.BytesIO(content))
@@ -86,7 +66,6 @@ def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
86
  except Exception as e:
87
  return None, f"Error reading file: {e}"
88
  finally:
89
- # remove raw bytes quickly
90
  try:
91
  del content
92
  except Exception:
@@ -94,18 +73,89 @@ def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
94
  gc.collect()
95
  return df, None
96
 
97
- # ---------- Natural language to action ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  def simple_nl_to_action(df: pd.DataFrame, query: str):
99
- """NL parser that also supports 'wise head count' groupby counting."""
100
  q = (query or "").strip().lower()
101
  if q == "":
102
- return None, "Please type a question like: 'show columns', 'show first 5 rows', 'describe', 'department wise head count', or 'filter where Region = India'."
 
 
 
103
 
104
- # 1) show columns
105
  if "columns" in q or "show columns" in q or "list columns" in q:
106
- return pd.DataFrame({"columns": df.columns}), None
 
 
 
 
107
 
108
- # 2) head / first N rows
109
  m = re.search(r"(first|head)\s*(\d+)?", q)
110
  if "head" in q or "first" in q:
111
  n = 5
@@ -113,88 +163,90 @@ def simple_nl_to_action(df: pd.DataFrame, query: str):
113
  n = int(m.group(2))
114
  return df.head(n), None
115
 
116
- # 3) describe / summary stats
117
  if "describe" in q or "summary" in q or "statistics" in q:
118
  return df.describe(include='all').reset_index(), None
119
 
120
- # 4) show column X
121
- m = re.search(r"show (?:column )?([a-z0-9_ ]+)", q)
122
- if m:
123
- col = m.group(1).strip()
124
- matches = [c for c in df.columns if c.lower() == col.lower()]
125
- if matches:
126
- return df[[matches[0]]].head(100), None
 
 
 
 
 
 
 
 
 
 
 
127
  else:
128
- return None, f"Column '{col}' not found. Try 'show columns' to see exact names."
 
129
 
130
- # 5) filter pattern
131
- m = re.search(r"filter where ([a-z0-9_ ]+?)\s*(=|>|<|>=|<=)\s*'?(?P<val>[^']+?)'?$", q)
132
- if m:
133
- col_text = m.group(1).strip()
134
- op = m.group(2)
135
- val = m.group('val').strip()
136
- matches = [c for c in df.columns if c.lower() == col_text.lower()]
137
- if not matches:
138
- return None, f"Column '{col_text}' not found. Use 'show columns' to check names."
139
- colname = matches[0]
140
- try:
141
- if pd.api.types.is_numeric_dtype(df[colname]):
142
- val_num = float(val)
143
- if op == "=":
144
- res = df[df[colname] == val_num]
145
- elif op == ">":
146
- res = df[df[colname] > val_num]
147
- elif op == "<":
148
- res = df[df[colname] < val_num]
149
- elif op == ">=":
150
- res = df[df[colname] >= val_num]
151
- elif op == "<=":
152
- res = df[df[colname] <= val_num]
153
- return res.head(200), None
154
- else:
155
- if op == "=":
156
- res = df[df[colname].astype(str).str.lower() == val.lower()]
157
- return res.head(200), None
158
- else:
159
- return None, f"Operator {op} not supported for non-numeric column '{colname}'."
160
- except Exception as e:
161
- return None, f"Error applying filter: {e}"
162
-
163
- # 6) department-wise / <column> wise head count / count by column
164
- # Examples caught: "department wise head count", "count by Department", "headcount by department"
165
- if re.search(r"(head[\s-]?count|headcount|count)", q) and (("wise" in q) or (" by " in q) or ("by " in q)):
166
- # try "X wise head count" pattern
167
- m = re.search(r"([a-z0-9_ ]+?)\s*(?:wise|by)\s*(?:head[\s-]?count|headcount|count)", q)
168
- col_candidate = None
169
- if m:
170
- col_candidate = m.group(1).strip()
171
  else:
172
- # try "count by X"
173
- m2 = re.search(r"(?:head[\s-]?count|headcount|count)\s*(?:by\s*)([a-z0-9_ ]+)", q)
174
- if m2:
175
- col_candidate = m2.group(1).strip()
 
 
 
 
 
 
176
 
177
- if col_candidate:
178
- # match to actual column name
179
- matches = [c for c in df.columns if c.lower() == col_candidate.lower()]
180
- if not matches:
181
- # try partial match (contains)
182
- partials = [c for c in df.columns if col_candidate.lower() in c.lower()]
183
- if partials:
184
- colname = partials[0]
185
- else:
186
- return None, f"Column '{col_candidate}' not found. Use 'show columns' to see exact names."
187
  else:
188
- colname = matches[0]
 
 
 
 
 
 
189
 
190
- try:
191
- counts = df.groupby(colname).size().reset_index(name="count").sort_values("count", ascending=False)
192
- return counts.reset_index(drop=True), None
193
- except Exception as e:
194
- return None, f"Error computing counts: {e}"
 
 
 
 
195
 
196
- # fallback: first 10 rows w/ message
197
- return df.head(10), "Couldn't parse exact request — showing first 10 rows. Try: 'show columns', 'describe', 'department wise head count', or 'filter where Column = Value'."
198
 
199
  # ---------- Processing wrapper ----------
200
  def process(file, query):
@@ -229,11 +281,8 @@ def process(file, query):
229
  else:
230
  return None, (msg or "No result")
231
 
232
- # ---------- Clear / reset function ----------
233
  def clear_all():
234
- """
235
- Returns Gradio update objects that clear inputs and outputs.
236
- """
237
  return (
238
  gr.File.update(value=None),
239
  gr.Textbox.update(value=""),
@@ -243,10 +292,10 @@ def clear_all():
243
 
244
  # ---------- Gradio UI ----------
245
  with gr.Blocks() as demo:
246
- gr.Markdown("# Chat-with-CSV — private ephemeral uploads (cleared on Reset)")
247
  with gr.Row():
248
  file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
249
- query_input = gr.Textbox(label="Ask a question (example: 'department wise head count' or 'filter where Country = India')", placeholder="Type your question here")
250
  with gr.Row():
251
  submit = gr.Button("Run query")
252
  clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")
 
5
  import re
6
  import gc
7
  import os
8
+ from typing import Tuple, Optional, List
9
 
10
+ # ---------- Helpers for uploaded file reading ----------
11
  def read_uploaded_file(file):
12
+ """Try multiple ways to get bytes from Gradio upload objects."""
 
 
 
13
  if file is None:
14
  return None, None, "No file uploaded."
 
 
15
  try:
16
  if hasattr(file, "read"):
17
  content = file.read()
 
19
  return content, name, None
20
  except Exception:
21
  pass
 
 
22
  try:
23
  if isinstance(file, (str, os.PathLike)):
24
  path = str(file)
 
28
  return content, os.path.basename(path), None
29
  except Exception:
30
  pass
 
 
31
  try:
32
  if isinstance(file, dict):
 
33
  name = file.get("name") or file.get("filename")
34
  data = file.get("data") or file.get("content") or file.get("bytes")
35
  if isinstance(data, (bytes, bytearray)):
36
  return data, name, None
37
+ if isinstance(data, str) and os.path.exists(data):
38
+ with open(data, "rb") as f:
39
+ content = f.read()
40
+ return content, name or os.path.basename(data), None
 
 
41
  except Exception:
42
  pass
 
 
43
  try:
44
  name = getattr(file, "name", None)
45
  if name and isinstance(name, str) and os.path.exists(name):
 
48
  return content, os.path.basename(name), None
49
  except Exception:
50
  pass
 
51
  return None, None, "Uploaded file format not supported by this server environment."
52
 
53
  def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
54
+ """Read bytes and convert to DataFrame (no disk writes)."""
 
 
 
55
  content, name, err = read_uploaded_file(file)
56
  if err:
57
  return None, f"Error reading file: {err}"
58
  if content is None:
59
  return None, "No content read from uploaded file."
 
60
  try:
 
61
  fname = (name or "").lower()
62
  if fname.endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
63
  df = pd.read_csv(io.BytesIO(content))
 
66
  except Exception as e:
67
  return None, f"Error reading file: {e}"
68
  finally:
 
69
  try:
70
  del content
71
  except Exception:
 
73
  gc.collect()
74
  return df, None
75
 
76
+ # ---------- Column matching in queries ----------
77
def find_columns_in_query(columns: List[str], query: str, max_matches: int = 3) -> List[str]:
    """Return the column labels that the query appears to mention.

    Matching is case-insensitive and runs in three passes. Each pass scans
    ``columns`` in the order given, so the result follows column order
    within a pass, not the order of words in the query:

    1. The full column name occurs in the query as a standalone term.
    2. The column name shares at least one ``[a-z0-9_]+`` token with the query.
    3. The query mentions "department" and the column name contains it.

    Args:
        columns: Candidate column labels. Non-string labels are compared by
            their ``str()`` form but returned unchanged.
        query: Free-text user query; ``None`` is treated as empty.
        max_matches: Stop as soon as this many columns have been collected.

    Returns:
        The matched labels, possibly empty.
    """
    q = (query or "").lower()
    found = []

    # Pass 1: whole column name present as a standalone term.
    # Lookarounds are used instead of \b so that names starting or ending
    # with punctuation (e.g. "Salary ($)") can still match; \b needs a word
    # character on one side and never fires next to ")" followed by a space.
    for col in columns:
        cl = str(col).lower()
        # An empty label would match every query, so it is skipped here.
        if cl and re.search(r"(?<!\w)" + re.escape(cl) + r"(?!\w)", q):
            found.append(col)
            if len(found) >= max_matches:
                return found

    # Pass 2: any token shared between the query and the column name.
    q_tokens = set(re.findall(r"[a-z0-9_]+", q))
    for col in columns:
        if col in found:
            continue
        col_tokens = set(re.findall(r"[a-z0-9_]+", str(col).lower()))
        if q_tokens & col_tokens:
            found.append(col)
            if len(found) >= max_matches:
                return found

    # Pass 3: the query says "department" but no column matched it exactly;
    # accept columns whose name merely contains the word (e.g. "Department_ID").
    # Gated on the query so unrelated questions do not pick up a department
    # column as a spurious match.
    if "department" in q:
        for col in columns:
            if col not in found and "department" in str(col).lower():
                found.append(col)
                if len(found) >= max_matches:
                    return found
    return found
107
+
108
+ # ---------- Aggregation helpers ----------
109
def group_count(df: pd.DataFrame, group_col: str, top_n: Optional[int] = None):
    """Count rows per distinct value of ``group_col``, largest group first.

    Args:
        df: Source table.
        group_col: Column whose values define the groups.
        top_n: If given, keep only the first ``top_n`` groups; ``None``
            (the default) returns every group.

    Returns:
        A DataFrame with columns ``group_col`` and ``"count"``, sorted by
        count descending with a fresh 0..n-1 index. Rows whose group key is
        missing are left out under pandas' default ``groupby`` behaviour.
    """
    counts = (
        df.groupby(group_col)
        .size()
        .reset_index(name="count")
        # A stable sort keeps tied groups in a reproducible order.
        .sort_values("count", ascending=False, kind="mergesort")
        .reset_index(drop=True)
    )
    # Compare against None explicitly: a plain truthiness test would treat
    # top_n=0 as "no limit" and return every group.
    if top_n is not None:
        return counts.head(top_n)
    return counts
114
+
115
def group_agg(df: pd.DataFrame, group_col: str, value_col: str, agg: str):
    """Aggregate ``value_col`` within each ``group_col`` group.

    ``agg`` may be "mean"/"avg"/"average", "sum", "max" or "min"; any other
    value is handed to pandas' ``agg`` unchanged. The aggregated column is
    renamed to "average", "sum", "max", "min", or to ``agg`` itself for the
    pass-through case.

    Returns:
        A DataFrame sorted descending by its last column, with a fresh index.
    """
    grouped = df.groupby(group_col)[value_col]

    # (accepted spellings, pandas method name, output column label)
    known = (
        (("mean", "avg", "average"), "mean", "average"),
        (("sum",), "sum", "sum"),
        (("max",), "max", "max"),
        (("min",), "min", "min"),
    )
    for aliases, method, label in known:
        if agg in aliases:
            series = getattr(grouped, method)()
            break
    else:
        # Unrecognised spelling: let pandas resolve it.
        series, label = grouped.agg(agg), agg

    table = series.reset_index().rename(columns={value_col: label})
    ordered = table.sort_values(table.columns[-1], ascending=False)
    return ordered.reset_index(drop=True)
127
+
128
def compute_percentage_counts(df: pd.DataFrame, group_col: str):
    """Return per-group row counts plus each group's share of all counted rows.

    The table comes from ``group_count``; a ``"percentage"`` column is added
    holding ``count / sum(count) * 100`` rounded to two decimals.
    """
    table = group_count(df, group_col)
    grand_total = table["count"].sum()
    share = table["count"] / grand_total * 100
    table["percentage"] = share.round(2)
    return table
133
+
134
def compute_percentage_of_value(df: pd.DataFrame, group_col: str, value_col: str):
    """Return each group's summed ``value_col`` and its share of the grand total.

    Args:
        df: Source table.
        group_col: Column whose values define the groups.
        value_col: Column to sum within each group.

    Returns:
        A DataFrame with columns ``group_col``, ``"sum"`` and
        ``"percentage"`` (two-decimal percent of the grand total), sorted by
        ``"sum"`` descending with a fresh index.
    """
    sums = (
        df.groupby(group_col)[value_col]
        .sum()
        .reset_index()
        .rename(columns={value_col: "sum"})
    )
    total = sums["sum"].sum()
    if total == 0:
        # A share of a zero total is undefined; dividing would fill the
        # column with NaN/inf, so report 0.0 for every group instead.
        sums["percentage"] = 0.0
    else:
        sums["percentage"] = (sums["sum"] / total * 100).round(2)
    return sums.sort_values("sum", ascending=False).reset_index(drop=True)
140
+
141
+ # ---------- Natural language parser & action ----------
142
  def simple_nl_to_action(df: pd.DataFrame, query: str):
 
143
  q = (query or "").strip().lower()
144
  if q == "":
145
+ return None, "Please type a question like: 'department wise head count', 'percentage of employees by department', 'average salary by department', or 'show columns'."
146
+
147
+ cols = list(df.columns)
148
+ matched = find_columns_in_query(cols, q, max_matches=3) # up to 3 column matches
149
 
150
+ # direct commands
151
  if "columns" in q or "show columns" in q or "list columns" in q:
152
+ return pd.DataFrame({"columns": cols}), None
153
+
154
+ # overall totals
155
+ if re.search(r"\b(total|how many|count of rows|number of rows|total employees|total employee)\b", q):
156
+ return pd.DataFrame({"total_rows": [len(df)]}), None
157
 
158
+ # show first N rows
159
  m = re.search(r"(first|head)\s*(\d+)?", q)
160
  if "head" in q or "first" in q:
161
  n = 5
 
163
  n = int(m.group(2))
164
  return df.head(n), None
165
 
166
+ # describe / summary
167
  if "describe" in q or "summary" in q or "statistics" in q:
168
  return df.describe(include='all').reset_index(), None
169
 
170
+ # HEADCOUNT / COUNT requests (department wise head count etc.)
171
+ if any(w in q for w in ["headcount", "head count", "head-count", "headcounts", "head count", "number of employees", "how many", "count by", "count of", "count"]):
172
+ # If a grouping column is mentioned, use it
173
+ if matched:
174
+ group_col = matched[0]
175
+ # if user mentions percentage as well
176
+ if "%" in q or "percentage" in q or "percent" in q or "share" in q:
177
+ return compute_percentage_counts(df, group_col), None
178
+ # If they asked which has maximum
179
+ if any(w in q for w in ["most", "maximum", "max", "highest", "where max", "to which"]):
180
+ counts = group_count(df, group_col)
181
+ top = counts.head(1)
182
+ # also show full counts for context
183
+ summary = counts
184
+ # build a small output that includes top and summary (we'll return summary; top is first row)
185
+ return summary, f"Top: {top.iloc[0,0]} with {top.iloc[0,1]} (rows)."
186
+ # just return counts
187
+ return group_count(df, group_col), None
188
  else:
189
+ # no group column mentioned: return total rows
190
+ return pd.DataFrame({"total_rows": [len(df)]}), None
191
 
192
+ # AGGREGATION requests (average, mean, sum, max/min of a numeric column grouped by another)
193
+ if any(w in q for w in ["average", "mean", "avg", "sum", "total", "maximum", "minimum", "max", "min"]):
194
+ # try to detect grouping and value column
195
+ if len(matched) >= 2:
196
+ group_col = matched[0]
197
+ value_col = matched[1]
198
+ elif len(matched) == 1:
199
+ # ambiguous: user mentioned one column. If that's numeric, perhaps they want overall average
200
+ cand = matched[0]
201
+ if pd.api.types.is_numeric_dtype(df[cand]):
202
+ # overall stat
203
+ if any(w in q for w in ["average", "mean", "avg"]):
204
+ return pd.DataFrame({f"overall_{cand}_average": [df[cand].mean()]}), None
205
+ if "sum" in q or "total" in q:
206
+ return pd.DataFrame({f"overall_{cand}_sum": [df[cand].sum()]}), None
207
+ # else ask for more clarity
208
+ return None, "I found one column but couldn't tell grouping vs value column. Please ask like 'average Salary by Department' or 'sum Sales by Region'."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
209
  else:
210
+ return None, "Please mention columns. Example: 'average Salary by Department' or 'sum Sales by Region'."
211
+ # determine aggregation type
212
+ if any(w in q for w in ["average", "mean", "avg"]):
213
+ return group_agg(df, group_col, value_col, "mean"), None
214
+ if any(w in q for w in ["sum", "total"]):
215
+ return group_agg(df, group_col, value_col, "sum"), None
216
+ if any(w in q for w in ["max", "maximum", "highest"]):
217
+ return group_agg(df, group_col, value_col, "max"), None
218
+ if any(w in q for w in ["min", "minimum", "lowest"]):
219
+ return group_agg(df, group_col, value_col, "min"), None
220
 
221
+ # PERCENTAGE requests for a numeric column per group
222
+ if any(w in q for w in ["percentage", "%", "percent", "share"]):
223
+ # if two columns mentioned, assume first is group, second is numeric value
224
+ if len(matched) >= 2:
225
+ group_col = matched[0]
226
+ value_col = matched[1]
227
+ if pd.api.types.is_numeric_dtype(df[value_col]):
228
+ return compute_percentage_of_value(df, group_col, value_col), None
 
 
229
  else:
230
+ return None, f"Column '{value_col}' is not numeric; cannot compute percentage of values."
231
+ elif len(matched) == 1:
232
+ group_col = matched[0]
233
+ # percent of counts
234
+ return compute_percentage_counts(df, group_col), None
235
+ else:
236
+ return None, "Please mention the group column (and optionally a numeric column). Example: 'percentage of Salary by Department' or 'percentage of employees by Department'."
237
 
238
+ # SHOW specific columns (e.g., 'show Department and Salary')
239
+ m = re.search(r"show (.+)", q)
240
+ if m:
241
+ # try to extract column names from matched list
242
+ if matched:
243
+ # if user asked show with two columns, return them
244
+ return df[matched].head(200), None
245
+ else:
246
+ return None, "Couldn't identify columns to show. Use 'show columns' to view exact names."
247
 
248
+ # fallback: return first 10 rows with suggestion
249
+ return df.head(10), "Couldn't parse exact request — showing first 10 rows. Try: 'show columns', 'department wise head count', 'percentage of employees by department', or 'average Salary by Department'."
250
 
251
  # ---------- Processing wrapper ----------
252
  def process(file, query):
 
281
  else:
282
  return None, (msg or "No result")
283
 
284
+ # ---------- Clear / reset ----------
285
  def clear_all():
 
 
 
286
  return (
287
  gr.File.update(value=None),
288
  gr.Textbox.update(value=""),
 
292
 
293
  # ---------- Gradio UI ----------
294
  with gr.Blocks() as demo:
295
+ gr.Markdown("# Chat-with-CSV — enhanced analysis (ephemeral uploads)")
296
  with gr.Row():
297
  file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
298
+ query_input = gr.Textbox(label="Ask a question (examples: 'department wise head count', 'percentage of Salary by Department', 'average Salary by Department')", placeholder="Type your question here")
299
  with gr.Row():
300
  submit = gr.Button("Run query")
301
  clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")