rithwikreal committed on
Commit
a36f327
·
verified ·
1 Parent(s): 2568c2d

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +114 -20
app.py CHANGED
@@ -4,31 +4,89 @@ import pandas as pd
4
  import io
5
  import re
6
  import gc
 
7
  from typing import Tuple, Optional
8
 
9
  # ---------- Helper functions ----------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
  def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
11
  """
12
- Read uploaded file bytes into a DataFrame WITHOUT saving to disk.
13
  Returns (df, error_message).
14
  """
15
- if file is None:
16
- return None, "No file uploaded."
 
 
 
 
17
  try:
18
- # file is a TemporaryFile-like object in Gradio; read bytes once
19
- content = file.read()
20
- # use BytesIO to load into pandas
21
- # decide by filename or by sniffing bytes
22
- name = getattr(file, "name", "") or ""
23
  # Basic heuristic for CSV vs Excel
24
- if name.lower().endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
 
25
  df = pd.read_csv(io.BytesIO(content))
26
  else:
27
  df = pd.read_excel(io.BytesIO(content))
28
  except Exception as e:
29
  return None, f"Error reading file: {e}"
30
  finally:
31
- # Immediately try to remove raw bytes if present to minimize memory lifetime
32
  try:
33
  del content
34
  except Exception:
@@ -36,15 +94,18 @@ def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
36
  gc.collect()
37
  return df, None
38
 
 
39
  def simple_nl_to_action(df: pd.DataFrame, query: str):
40
- """Very small NL parser that recognizes a few common requests and returns a DataFrame or (None, message)."""
41
  q = (query or "").strip().lower()
42
  if q == "":
43
- return None, "Please type a question like: 'show columns', 'show first 5 rows', 'describe column sales', or 'filter where Region = India and Year >= 2021'."
44
 
 
45
  if "columns" in q or "show columns" in q or "list columns" in q:
46
  return pd.DataFrame({"columns": df.columns}), None
47
 
 
48
  m = re.search(r"(first|head)\s*(\d+)?", q)
49
  if "head" in q or "first" in q:
50
  n = 5
@@ -52,18 +113,21 @@ def simple_nl_to_action(df: pd.DataFrame, query: str):
52
  n = int(m.group(2))
53
  return df.head(n), None
54
 
 
55
  if "describe" in q or "summary" in q or "statistics" in q:
56
  return df.describe(include='all').reset_index(), None
57
 
58
- m = re.search(r"show (column )?([a-z0-9_ ]+)", q)
 
59
  if m:
60
- col = m.group(2).strip()
61
  matches = [c for c in df.columns if c.lower() == col.lower()]
62
  if matches:
63
  return df[[matches[0]]].head(100), None
64
  else:
65
  return None, f"Column '{col}' not found. Try 'show columns' to see exact names."
66
 
 
67
  m = re.search(r"filter where ([a-z0-9_ ]+?)\s*(=|>|<|>=|<=)\s*'?(?P<val>[^']+?)'?$", q)
68
  if m:
69
  col_text = m.group(1).strip()
@@ -96,11 +160,44 @@ def simple_nl_to_action(df: pd.DataFrame, query: str):
96
  except Exception as e:
97
  return None, f"Error applying filter: {e}"
98
 
99
- return df.head(10), "Couldn't parse exact request showing first 10 rows. Try: 'show columns', 'show first 5 rows', 'describe', or 'filter where Column = Value'."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  # ---------- Processing wrapper ----------
102
  def process(file, query):
103
- # Load into memory-only DataFrame
104
  df, err = load_file_bytes_to_df(file)
105
  if err:
106
  try:
@@ -110,7 +207,6 @@ def process(file, query):
110
  gc.collect()
111
  return None, err
112
 
113
- # Run the NLP-to-action
114
  try:
115
  res, msg = simple_nl_to_action(df, query)
116
  if isinstance(res, pd.DataFrame):
@@ -121,7 +217,6 @@ def process(file, query):
121
  out_df = None
122
  msg = f"Error while processing: {e}"
123
 
124
- # Remove references to large objects immediately
125
  try:
126
  del df
127
  del file
@@ -151,11 +246,10 @@ with gr.Blocks() as demo:
151
  gr.Markdown("# Chat-with-CSV — private ephemeral uploads (cleared on Reset)")
152
  with gr.Row():
153
  file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
154
- query_input = gr.Textbox(label="Ask a question (example: 'show columns' or 'filter where Country = India')", placeholder="Type your question here")
155
  with gr.Row():
156
  submit = gr.Button("Run query")
157
  clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")
158
- # Use headers=None to be compatible with the Gradio version in Spaces
159
  output_table = gr.Dataframe(headers=None, label="Result table")
160
  status = gr.Textbox(label="Status / Messages", interactive=False)
161
 
 
4
  import io
5
  import re
6
  import gc
7
+ import os
8
  from typing import Tuple, Optional
9
 
10
  # ---------- Helper functions ----------
11
def read_uploaded_file(file):
    """
    Try multiple ways to get bytes from Gradio upload objects.

    Strategies, tried in order (a strategy that fails falls through to the next):
      1. call ``read()`` on file-like objects;
      2. open the value itself when it is a ``str`` / ``os.PathLike`` path;
      3. for dicts, look under the keys 'data' / 'content' / 'bytes';
      4. open the path stored in the object's ``name`` attribute.

    Returns (bytes_content, filename or None, error_message).
    On success error_message is None; on failure the first two items are None.
    """
    if file is None:
        return None, None, "No file uploaded."

    def _read_path(path):
        # Always binary mode so every strategy hands back bytes.
        with open(path, "rb") as handle:
            return handle.read()

    # 1) If object has read() (file-like), use it
    if hasattr(file, "read"):
        try:
            content = file.read()
        except Exception:
            # Best effort: the upload wrapper type is unknown, so any failure
            # here (closed handle, decode error, ...) just moves on to the
            # path-based strategies below.
            content = None
        if content is not None:
            if isinstance(content, str):
                # Text-mode handles yield str; normalise to bytes so the
                # result can be wrapped in a bytes buffer.
                content = content.encode("utf-8")
            name = getattr(file, "name", None)
            # Some handles expose a non-string name (e.g. an int file
            # descriptor); report that as "no filename".
            return content, name if isinstance(name, str) else None, None

    # 2) If it's a path-like string, open it
    if isinstance(file, (str, os.PathLike)):
        path = str(file)
        try:
            if os.path.exists(path):
                return _read_path(path), os.path.basename(path), None
        except (OSError, ValueError):
            pass

    # 3) If it's a dict-like object (some envs), try common keys
    if isinstance(file, dict):
        # Many times keys could be 'name' and 'data' or 'content'
        name = file.get("name") or file.get("filename")
        values = [file.get(key) for key in ("data", "content", "bytes")]
        data = next((value for value in values if value), None)
        if data is None:
            # Every candidate was empty or missing: still accept an explicitly
            # empty bytes payload instead of reporting "unsupported format".
            data = next(
                (value for value in values if isinstance(value, (bytes, bytearray))),
                None,
            )
        if isinstance(data, (bytes, bytearray)):
            return data, name, None
        if isinstance(data, str):
            # maybe base64 or path — only the path case is handled
            try:
                if os.path.exists(data):
                    return _read_path(data), name or os.path.basename(data), None
            except (OSError, ValueError):
                pass

    # 4) Fallback: try to get .name and open that path
    name = getattr(file, "name", None)
    if name and isinstance(name, str):
        try:
            if os.path.exists(name):
                return _read_path(name), os.path.basename(name), None
        except (OSError, ValueError):
            pass

    return None, None, "Uploaded file format not supported by this server environment."
67
+
68
  def load_file_bytes_to_df(file) -> Tuple[Optional[pd.DataFrame], Optional[str]]:
69
  """
70
+ Read uploaded file bytes into a pandas DataFrame WITHOUT saving to disk.
71
  Returns (df, error_message).
72
  """
73
+ content, name, err = read_uploaded_file(file)
74
+ if err:
75
+ return None, f"Error reading file: {err}"
76
+ if content is None:
77
+ return None, "No content read from uploaded file."
78
+
79
  try:
 
 
 
 
 
80
  # Basic heuristic for CSV vs Excel
81
+ fname = (name or "").lower()
82
+ if fname.endswith(".csv") or (isinstance(content, (bytes, bytearray)) and b"," in content[:200]):
83
  df = pd.read_csv(io.BytesIO(content))
84
  else:
85
  df = pd.read_excel(io.BytesIO(content))
86
  except Exception as e:
87
  return None, f"Error reading file: {e}"
88
  finally:
89
+ # remove raw bytes quickly
90
  try:
91
  del content
92
  except Exception:
 
94
  gc.collect()
95
  return df, None
96
 
97
+ # ---------- Natural language to action ----------
98
  def simple_nl_to_action(df: pd.DataFrame, query: str):
99
+ """NL parser that also supports 'wise head count' groupby counting."""
100
  q = (query or "").strip().lower()
101
  if q == "":
102
+ return None, "Please type a question like: 'show columns', 'show first 5 rows', 'describe', 'department wise head count', or 'filter where Region = India'."
103
 
104
+ # 1) show columns
105
  if "columns" in q or "show columns" in q or "list columns" in q:
106
  return pd.DataFrame({"columns": df.columns}), None
107
 
108
+ # 2) head / first N rows
109
  m = re.search(r"(first|head)\s*(\d+)?", q)
110
  if "head" in q or "first" in q:
111
  n = 5
 
113
  n = int(m.group(2))
114
  return df.head(n), None
115
 
116
+ # 3) describe / summary stats
117
  if "describe" in q or "summary" in q or "statistics" in q:
118
  return df.describe(include='all').reset_index(), None
119
 
120
+ # 4) show column X
121
+ m = re.search(r"show (?:column )?([a-z0-9_ ]+)", q)
122
  if m:
123
+ col = m.group(1).strip()
124
  matches = [c for c in df.columns if c.lower() == col.lower()]
125
  if matches:
126
  return df[[matches[0]]].head(100), None
127
  else:
128
  return None, f"Column '{col}' not found. Try 'show columns' to see exact names."
129
 
130
+ # 5) filter pattern
131
  m = re.search(r"filter where ([a-z0-9_ ]+?)\s*(=|>|<|>=|<=)\s*'?(?P<val>[^']+?)'?$", q)
132
  if m:
133
  col_text = m.group(1).strip()
 
160
  except Exception as e:
161
  return None, f"Error applying filter: {e}"
162
 
163
+ # 6) department-wise / <column> wise head count / count by column
164
+ # Examples caught: "department wise head count", "count by Department", "headcount by department"
165
+ if re.search(r"(head[\s-]?count|headcount|count)", q) and (("wise" in q) or (" by " in q) or ("by " in q)):
166
+ # try "X wise head count" pattern
167
+ m = re.search(r"([a-z0-9_ ]+?)\s*(?:wise|by)\s*(?:head[\s-]?count|headcount|count)", q)
168
+ col_candidate = None
169
+ if m:
170
+ col_candidate = m.group(1).strip()
171
+ else:
172
+ # try "count by X"
173
+ m2 = re.search(r"(?:head[\s-]?count|headcount|count)\s*(?:by\s*)([a-z0-9_ ]+)", q)
174
+ if m2:
175
+ col_candidate = m2.group(1).strip()
176
+
177
+ if col_candidate:
178
+ # match to actual column name
179
+ matches = [c for c in df.columns if c.lower() == col_candidate.lower()]
180
+ if not matches:
181
+ # try partial match (contains)
182
+ partials = [c for c in df.columns if col_candidate.lower() in c.lower()]
183
+ if partials:
184
+ colname = partials[0]
185
+ else:
186
+ return None, f"Column '{col_candidate}' not found. Use 'show columns' to see exact names."
187
+ else:
188
+ colname = matches[0]
189
+
190
+ try:
191
+ counts = df.groupby(colname).size().reset_index(name="count").sort_values("count", ascending=False)
192
+ return counts.reset_index(drop=True), None
193
+ except Exception as e:
194
+ return None, f"Error computing counts: {e}"
195
+
196
+ # fallback: first 10 rows w/ message
197
+ return df.head(10), "Couldn't parse exact request — showing first 10 rows. Try: 'show columns', 'describe', 'department wise head count', or 'filter where Column = Value'."
198
 
199
  # ---------- Processing wrapper ----------
200
  def process(file, query):
 
201
  df, err = load_file_bytes_to_df(file)
202
  if err:
203
  try:
 
207
  gc.collect()
208
  return None, err
209
 
 
210
  try:
211
  res, msg = simple_nl_to_action(df, query)
212
  if isinstance(res, pd.DataFrame):
 
217
  out_df = None
218
  msg = f"Error while processing: {e}"
219
 
 
220
  try:
221
  del df
222
  del file
 
246
  gr.Markdown("# Chat-with-CSV — private ephemeral uploads (cleared on Reset)")
247
  with gr.Row():
248
  file_input = gr.File(label="Upload CSV or XLSX (will not be saved)", file_count="single")
249
+ query_input = gr.Textbox(label="Ask a question (example: 'department wise head count' or 'filter where Country = India')", placeholder="Type your question here")
250
  with gr.Row():
251
  submit = gr.Button("Run query")
252
  clear_btn = gr.Button("Clear / Reset (remove uploaded file & results)")
 
253
  output_table = gr.Dataframe(headers=None, label="Result table")
254
  status = gr.Textbox(label="Status / Messages", interactive=False)
255