rairo committed on
Commit
6e37da8
·
verified ·
1 Parent(s): 462ec2d

Update main.py

Browse files
Files changed (1) hide show
  1. main.py +79 -65
main.py CHANGED
@@ -1,4 +1,5 @@
1
- # app.py Drop-in refactor to contain PandasAI errors and guarantee analyst fallback
 
2
  from langchain_google_genai import ChatGoogleGenerativeAI
3
  import pandas as pd
4
  import os
@@ -82,19 +83,17 @@ user_defined_path = os.path.join("/exports/charts", str(uuid.uuid4()))
82
  logger.info(f"Chart export path set to: {user_defined_path}")
83
 
84
  # -----------------------------------------------------------------------------
85
- # Utilities: Temporal awareness + PandasAI response guards
86
  # -----------------------------------------------------------------------------
87
- TZ = "Africa/Harare" # single source of truth for business dates
88
 
89
  def now_harare() -> pd.Timestamp:
90
- # Use pandas Timestamp to avoid datetime collisions entirely
91
  return pd.Timestamp.now(tz=TZ)
92
 
93
  def week_bounds_from(ts: pd.Timestamp):
94
- # Monday..Sunday window containing ts
95
- monday = ts.normalize() - pd.Timedelta(days=ts.weekday())
96
- sunday = monday + pd.Timedelta(days=6)
97
- return monday, sunday
98
 
99
  def next_week_bounds(ts: pd.Timestamp):
100
  this_mon, _ = week_bounds_from(ts)
@@ -110,7 +109,6 @@ def last_month_bounds(ts: pd.Timestamp):
110
 
111
  def this_month_bounds(ts: pd.Timestamp):
112
  first_this = ts.normalize().replace(day=1)
113
- # next month first
114
  if first_this.month == 12:
115
  first_next = first_this.replace(year=first_this.year + 1, month=1)
116
  else:
@@ -144,7 +142,6 @@ _TEMP_WINDOWS = [
144
  ]
145
 
146
  def extract_numeric_window(question: str):
147
- """Detect 'last N days' / 'past N days' → (start, end)"""
148
  m = re.search(r"(last|past)\s+(\d{1,3})\s+days", question.lower())
149
  if m:
150
  n = int(m.group(2))
@@ -154,21 +151,15 @@ def extract_numeric_window(question: str):
154
  return None
155
 
156
  def temporal_hints(question: str) -> str:
157
- """
158
- Build a short natural-language preface with explicit date windows.
159
- Example: "next week" => '2025-09-29 to 2025-10-05'
160
- """
161
  base = now_harare()
162
  hints = {}
163
  ql = question.lower()
164
 
165
- # Pre-defined windows
166
  for key, fn in _TEMP_WINDOWS:
167
  if key in ql:
168
  s, e = fn(base)
169
  hints[key] = (s.date().isoformat(), e.date().isoformat())
170
 
171
- # Numeric windows
172
  rng = extract_numeric_window(question)
173
  if rng:
174
  s, e = rng
@@ -177,67 +168,101 @@ def temporal_hints(question: str) -> str:
177
  if not hints:
178
  return (
179
  f"Temporal context: Today is {base.date().isoformat()} ({TZ}). "
180
- f"Week is Monday–Sunday. Use pd.Timestamp.now(tz='{TZ}') and pd.Timedelta."
 
181
  )
182
 
183
  parts = [f"Temporal context: Today is {base.date().isoformat()} ({TZ})."]
184
  for k, (s, e) in hints.items():
185
  parts.append(f"Interpret \"{k}\" as {s} to {e}.")
186
- parts.append(f"Always prefer pd.Timestamp.now(tz='{TZ}') + pd.Timedelta over 'datetime'.")
 
 
187
  return " ".join(parts)
188
 
189
- _ERROR_PATTERNS = [
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
190
  "traceback", "exception", "keyerror", "nameerror", "syntaxerror",
191
  "modulenotfounderror", "importerror", "pipeline failed", "execution failed",
192
- "__import__", "failed with error", "attributeerror", "method_descriptor"
 
 
 
 
193
  ]
194
 
195
- def looks_like_error(ans) -> bool:
 
 
 
 
 
 
 
 
 
 
196
  """
197
- Heuristics to detect PandasAI bad outputs that shouldn't reach users.
 
 
 
198
  """
199
- if ans is None:
200
- return True
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
201
  if isinstance(ans, (pd.DataFrame, plt.Figure)):
202
  return False
203
- s = str(ans).strip()
204
  if not s:
205
  return True
206
- sl = s.lower()
207
- if any(p in sl for p in _ERROR_PATTERNS):
208
  return True
209
- # crude detection of stack trace text
210
- if "file \"" in sl and "line " in sl and "error" in sl:
211
  return True
212
  return False
213
 
214
  def sanitize_answer(ans) -> str:
215
- """
216
- Strip code-fences / raw logs; return plain, user-safe content.
217
- """
218
- s = str(ans)
219
- # Remove common code fences to avoid dumping code unintentionally
220
- s = re.sub(r"```+(\w+)?", "", s)
221
- # Truncate any accidental traceback after the first line if present
222
  if "Traceback (most recent call last):" in s:
223
  s = s.split("Traceback (most recent call last):")[0].strip()
 
 
 
224
  return s.strip()
225
 
226
- def guardrails_preamble() -> str:
227
- """
228
- Instruction prefix to reduce PandasAI failure rates around datetime.
229
- """
230
- return (
231
- "Rules for code you generate:\n"
232
- "1) DO NOT use 'from datetime import datetime' or 'datetime.date.today()'.\n"
233
- "2) Use pandas time APIs only: pd.Timestamp.now(tz='Africa/Harare'), pd.Timedelta, dt.floor/ceil.\n"
234
- "3) If a 'Time' column exists, combine Date + Time and localize to 'Africa/Harare'.\n"
235
- "4) Ensure numeric conversion with errors='coerce' for amounts.\n"
236
- "5) Never print stack traces; always return a concise answer or a plot/dataframe."
237
- )
238
-
239
  # -----------------------------------------------------------------------------
240
- # Analyst KPI layer (preserved with small safety tweaks)
241
  # -----------------------------------------------------------------------------
242
  class IrisReportEngine:
243
  def __init__(self, transactions_data: list, llm_instance):
@@ -250,12 +275,10 @@ class IrisReportEngine:
250
  return pd.DataFrame()
251
  df = pd.DataFrame(transactions)
252
 
253
- numeric_cols = ["Units_Sold", "Unit_Cost_Price", "Amount"]
254
- for col in numeric_cols:
255
  if col in df.columns:
256
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
257
 
258
- # Build datetime safely and localize
259
  if "Time" in df.columns:
260
  dt_series = pd.to_datetime(
261
  df["Date"].astype(str) + " " + df["Time"].astype(str),
@@ -270,7 +293,6 @@ class IrisReportEngine:
270
  else:
271
  dt_series = dt_series.dt.tz_convert(TZ)
272
  except Exception:
273
- # keep naive if localization fails
274
  pass
275
 
276
  df["datetime"] = dt_series
@@ -279,7 +301,6 @@ class IrisReportEngine:
279
  df["DayOfWeek"] = df["datetime"].dt.day_name()
280
  df["HourOfDay"] = df["datetime"].dt.hour
281
 
282
- # sales-only view & basic profitability
283
  if "Transaction_Type" in df.columns:
284
  sales_df = df[df["Transaction_Type"].astype(str).str.lower() == "sale"].copy()
285
  else:
@@ -389,7 +410,7 @@ class IrisReportEngine:
389
  fallback_prompt = f"""
390
  You are Iris, an expert business data analyst. Answer the user's question using the business data below.
391
 
392
- If their question is specific (e.g., “sales yesterday”, “top product”), answer directly.
393
  If the request can't be answered precisely, provide a helpful business briefing.
394
 
395
  Use clear markdown with short headings and bullets. Keep it concise.
@@ -424,9 +445,8 @@ def bot():
424
  )
425
  resp.raise_for_status()
426
  transactions = (resp.json() or {}).get("transactions") or []
427
- except Exception as e:
428
  logger.exception("Transaction API error")
429
- # Contained message (no stack to user)
430
  return jsonify({"answer": "I couldn't reach the transactions service. Please try again shortly."})
431
 
432
  if not transactions:
@@ -437,18 +457,15 @@ def bot():
437
  logger.info("Attempting Tier 1 (PandasAI)...")
438
  df = pd.DataFrame(transactions)
439
 
440
- # PandasAI config; ResponseParser ensures plots/dfs are serialized safely
441
  pandas_agent = SmartDataframe(df, config={
442
  "llm": llm,
443
  "response_parser": FlaskResponse,
444
- # keep your settings; security = "none" as you had
445
  "security": "none",
446
  "save_charts_path": user_defined_path,
447
  "save_charts": False,
448
  "enable_cache": False,
449
  "conversational": True,
450
  "enable_logging": False,
451
- # keep deps list; we'll still hard-guard with prompt + fallback
452
  "custom_whitelisted_dependencies": [
453
  "os","io","sys","chr","glob","b64decoder","collections",
454
  "geopy","geopandas","wordcloud","builtins","datetime",
@@ -457,7 +474,6 @@ def bot():
457
  ],
458
  })
459
 
460
- # Prepend guardrails + temporal hints
461
  combined_prompt = f"{guardrails_preamble()}\n\n{temporal_hints(user_question)}\n\nQuestion: {user_question}"
462
  answer = pandas_agent.chat(combined_prompt)
463
 
@@ -476,8 +492,7 @@ def bot():
476
 
477
  return jsonify({"answer": sanitize_answer(answer), "meta": {"source": "pandasai"}})
478
 
479
- except Exception as e:
480
- # Log *everything*, return nothing noisy to user
481
  logger.exception("Tier 1 (PandasAI) failed; moving to analyst layer.")
482
 
483
  # Tier 2 — Analyst KPI fallback (guaranteed)
@@ -488,7 +503,6 @@ def bot():
488
 
489
  except Exception:
490
  logger.exception("Critical unexpected error in /chat")
491
- # Final safety message (no tracebacks to user)
492
  return jsonify({"answer": "Something went wrong on our side. Please try again."})
493
 
494
  # -----------------------------------------------------------------------------
 
1
+ # main.py — Hardened: never leak PandasAI errors; always fall back cleanly
2
+
3
  from langchain_google_genai import ChatGoogleGenerativeAI
4
  import pandas as pd
5
  import os
 
83
  logger.info(f"Chart export path set to: {user_defined_path}")
84
 
85
  # -----------------------------------------------------------------------------
86
+ # Temporal helpers + guardrails
87
  # -----------------------------------------------------------------------------
88
+ TZ = "Africa/Harare"
89
 
90
  def now_harare() -> pd.Timestamp:
 
91
  return pd.Timestamp.now(tz=TZ)
92
 
93
  def week_bounds_from(ts: pd.Timestamp):
94
+ mon = ts.normalize() - pd.Timedelta(days=ts.weekday())
95
+ sun = mon + pd.Timedelta(days=6)
96
+ return mon, sun
 
97
 
98
  def next_week_bounds(ts: pd.Timestamp):
99
  this_mon, _ = week_bounds_from(ts)
 
109
 
110
  def this_month_bounds(ts: pd.Timestamp):
111
  first_this = ts.normalize().replace(day=1)
 
112
  if first_this.month == 12:
113
  first_next = first_this.replace(year=first_this.year + 1, month=1)
114
  else:
 
142
  ]
143
 
144
  def extract_numeric_window(question: str):
 
145
  m = re.search(r"(last|past)\s+(\d{1,3})\s+days", question.lower())
146
  if m:
147
  n = int(m.group(2))
 
151
  return None
152
 
153
  def temporal_hints(question: str) -> str:
 
 
 
 
154
  base = now_harare()
155
  hints = {}
156
  ql = question.lower()
157
 
 
158
  for key, fn in _TEMP_WINDOWS:
159
  if key in ql:
160
  s, e = fn(base)
161
  hints[key] = (s.date().isoformat(), e.date().isoformat())
162
 
 
163
  rng = extract_numeric_window(question)
164
  if rng:
165
  s, e = rng
 
168
  if not hints:
169
  return (
170
  f"Temporal context: Today is {base.date().isoformat()} ({TZ}). "
171
+ f"Week is Monday–Sunday. Use pd.Timestamp.now(tz='{TZ}') and pd.Timedelta. "
172
+ f"Avoid month/week rolling windows; use groupby with dt.to_period('M') or resample('D')."
173
  )
174
 
175
  parts = [f"Temporal context: Today is {base.date().isoformat()} ({TZ})."]
176
  for k, (s, e) in hints.items():
177
  parts.append(f"Interpret \"{k}\" as {s} to {e}.")
178
+ parts.append(
179
+ "Avoid non-fixed frequencies (MonthBegin/MonthEnd/Week) for rolling; prefer daily resample or groupby Periods."
180
+ )
181
  return " ".join(parts)
182
 
183
def guardrails_preamble() -> str:
    """Return the rule list that is placed at the start of the PandasAI prompt.

    The rules ask the model to use pandas time APIs in the business time zone,
    coerce amount fields to numeric, avoid non-fixed rolling/resample
    frequencies, and never print stack traces.

    Returns:
        A multi-line string of six numbered rules.
    """
    # The zone is interpolated from TZ rather than repeated as a literal so the
    # rules always name the same zone that temporal_hints() reports in the
    # same prompt.
    return (
        "Rules for any code you generate:\n"
        "1) Do NOT use 'from datetime import datetime' or 'datetime.date.today()'.\n"
        f"2) Use pandas time APIs only: pd.Timestamp.now(tz='{TZ}'), pd.Timedelta, dt accessors.\n"
        f"3) If a Time column exists, combine Date + Time and localize to {TZ}.\n"
        "4) Ensure numeric conversion with errors='coerce' for amount fields.\n"
        "5) Avoid non-fixed rolling/resample frequencies (e.g., MonthBegin/MonthEnd/Week). "
        "   For monthly, use df['datetime'].dt.to_period('M') then groupby.\n"
        "6) Never print stack traces; return a concise answer or a plot/dataframe."
    )
194
+
195
+ # -----------------------------------------------------------------------------
196
+ # Error detection & sanitization — blocks all PandasAI leakages
197
+ # -----------------------------------------------------------------------------
198
# Lowercase substrings whose presence marks a PandasAI response as a failure.
# looks_like_error() tests them against the lowercased answer text, so every
# entry must itself be lowercase.
ERROR_PATTERNS = [
    "traceback", "exception", "keyerror", "nameerror", "syntaxerror",
    "modulenotfounderror", "importerror", "pipeline failed", "execution failed",
    "__import__", "failed with error", "attributeerror", "method_descriptor",
    # PandasAI canonical failure banner:
    "unfortunately, i was not able to answer your question",
    # Pandas non-fixed frequency class of errors. The Week offset is matched in
    # its repr form ("<Week: weekday=6>"); a bare "week:" would also match
    # ordinary answers such as "sales this week: 1200" and force a needless
    # fallback.
    "non-fixed frequency", "monthbegin", "monthend", "<week:", "weekday=",
]
207
 
208
+ def _stringify(obj) -> str:
209
+ try:
210
+ if isinstance(obj, (pd.DataFrame, plt.Figure)):
211
+ return ""
212
+ if isinstance(obj, (bytes, bytearray)):
213
+ return obj.decode("utf-8", errors="ignore")
214
+ return str(obj)
215
+ except Exception:
216
+ return ""
217
+
218
def _extract_text_like(ans):
    """Return the text of an answer that should be inspected or shown.

    Lookup order:
      - dict: the 'value' entry (some parsers use
        {'type': 'string', 'value': '...'}), else the first of
        'message' / 'text' / 'content', else the whole dict
      - pandas time scalars: the scalar itself
      - any other object with a ``value`` attribute: that attribute
      - everything else (plain string/number): the object itself

    The selected object is converted to text with _stringify().
    """
    if isinstance(ans, dict):
        for key in ("value", "message", "text", "content"):
            if key in ans:
                return _stringify(ans[key])
        return _stringify(ans)

    # pd.Timestamp / pd.Timedelta / pd.NaT expose `.value` as raw integer
    # nanoseconds; unwrapping it would turn a date answer into a large integer.
    if ans is pd.NaT or isinstance(ans, (pd.Timestamp, pd.Timedelta)):
        return _stringify(ans)

    if hasattr(ans, "value"):
        try:
            return _stringify(ans.value)
        except Exception:
            # Fall through and stringify the object itself.
            pass
    return _stringify(ans)
239
+
240
def looks_like_error(ans) -> bool:
    """Return True when a PandasAI answer must not be shown to the user.

    Args:
        ans: The raw value returned by the PandasAI chat call.

    Returns:
        True for a missing answer, empty text, text containing any entry of
        ERROR_PATTERNS, or text that resembles a stack trace; False for a
        DataFrame, a Figure, or any other text.
    """
    # str(None) is the non-empty text "None", so a missing answer (bare or
    # wrapped as {'value': None}) has to be rejected before text extraction.
    if ans is None:
        return True
    if isinstance(ans, dict) and "value" in ans and ans["value"] is None:
        return True

    # Early accept for DataFrame/Figure
    if isinstance(ans, (pd.DataFrame, plt.Figure)):
        return False

    s = _extract_text_like(ans).strip().lower()
    if not s:
        return True
    if any(p in s for p in ERROR_PATTERNS):
        return True
    # crude stack trace glimpse
    if ("file \"" in s and "line " in s and "error" in s) or ("valueerror:" in s):
        return True
    return False
253
 
254
  def sanitize_answer(ans) -> str:
255
+ s = _extract_text_like(ans)
256
+ s = re.sub(r"```+(\w+)?", "", s) # strip fences
 
 
 
 
 
257
  if "Traceback (most recent call last):" in s:
258
  s = s.split("Traceback (most recent call last):")[0].strip()
259
+ # if PandasAI banner leaked, nuke it
260
+ if "Unfortunately, I was not able to answer your question" in s:
261
+ s = ""
262
  return s.strip()
263
 
 
 
 
 
 
 
 
 
 
 
 
 
 
264
  # -----------------------------------------------------------------------------
265
+ # Analyst KPI layer (unchanged logic, small safety tweaks)
266
  # -----------------------------------------------------------------------------
267
  class IrisReportEngine:
268
  def __init__(self, transactions_data: list, llm_instance):
 
275
  return pd.DataFrame()
276
  df = pd.DataFrame(transactions)
277
 
278
+ for col in ["Units_Sold", "Unit_Cost_Price", "Amount"]:
 
279
  if col in df.columns:
280
  df[col] = pd.to_numeric(df[col], errors="coerce").fillna(0)
281
 
 
282
  if "Time" in df.columns:
283
  dt_series = pd.to_datetime(
284
  df["Date"].astype(str) + " " + df["Time"].astype(str),
 
293
  else:
294
  dt_series = dt_series.dt.tz_convert(TZ)
295
  except Exception:
 
296
  pass
297
 
298
  df["datetime"] = dt_series
 
301
  df["DayOfWeek"] = df["datetime"].dt.day_name()
302
  df["HourOfDay"] = df["datetime"].dt.hour
303
 
 
304
  if "Transaction_Type" in df.columns:
305
  sales_df = df[df["Transaction_Type"].astype(str).str.lower() == "sale"].copy()
306
  else:
 
410
  fallback_prompt = f"""
411
  You are Iris, an expert business data analyst. Answer the user's question using the business data below.
412
 
413
+ If their question is specific (e.g., “sales yesterday”, “top product”), answer directly.
414
  If the request can't be answered precisely, provide a helpful business briefing.
415
 
416
  Use clear markdown with short headings and bullets. Keep it concise.
 
445
  )
446
  resp.raise_for_status()
447
  transactions = (resp.json() or {}).get("transactions") or []
448
+ except Exception:
449
  logger.exception("Transaction API error")
 
450
  return jsonify({"answer": "I couldn't reach the transactions service. Please try again shortly."})
451
 
452
  if not transactions:
 
457
  logger.info("Attempting Tier 1 (PandasAI)...")
458
  df = pd.DataFrame(transactions)
459
 
 
460
  pandas_agent = SmartDataframe(df, config={
461
  "llm": llm,
462
  "response_parser": FlaskResponse,
 
463
  "security": "none",
464
  "save_charts_path": user_defined_path,
465
  "save_charts": False,
466
  "enable_cache": False,
467
  "conversational": True,
468
  "enable_logging": False,
 
469
  "custom_whitelisted_dependencies": [
470
  "os","io","sys","chr","glob","b64decoder","collections",
471
  "geopy","geopandas","wordcloud","builtins","datetime",
 
474
  ],
475
  })
476
 
 
477
  combined_prompt = f"{guardrails_preamble()}\n\n{temporal_hints(user_question)}\n\nQuestion: {user_question}"
478
  answer = pandas_agent.chat(combined_prompt)
479
 
 
492
 
493
  return jsonify({"answer": sanitize_answer(answer), "meta": {"source": "pandasai"}})
494
 
495
+ except Exception:
 
496
  logger.exception("Tier 1 (PandasAI) failed; moving to analyst layer.")
497
 
498
  # Tier 2 — Analyst KPI fallback (guaranteed)
 
503
 
504
  except Exception:
505
  logger.exception("Critical unexpected error in /chat")
 
506
  return jsonify({"answer": "Something went wrong on our side. Please try again."})
507
 
508
  # -----------------------------------------------------------------------------