VEDAGI1 committed on
Commit
da409af
·
verified ·
1 Parent(s): d900f6e

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +417 -172
app.py CHANGED
@@ -1,28 +1,47 @@
1
  # app.py
2
- # Universal AI Data Analyst – FINAL FIXED VERSION (Nov 2025)
 
 
 
 
 
 
3
  from __future__ import annotations
 
4
  import io
5
  import json
6
  import os
7
  import traceback
8
- import re
9
  from contextlib import redirect_stdout
10
  from datetime import datetime
11
  from typing import Any, Dict, List
 
12
  import gradio as gr
13
  import pandas as pd
14
  import regex as re2
 
 
15
  from langchain_cohere import ChatCohere # noqa: F401
 
16
  from settings import (
17
  GENERAL_CONVERSATION_PROMPT,
18
  COHERE_MODEL_PRIMARY,
19
- COHERE_TIMEOUT_S, # noqa: F401
20
- USE_OPEN_FALLBACKS # noqa: F401
21
  )
 
 
 
22
 
23
- # Optional HIPAA settings with safe defaults
24
  try:
25
- from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
 
 
 
 
 
 
26
  except Exception:
27
  PHI_MODE = False
28
  PERSIST_HISTORY = True
@@ -30,21 +49,8 @@ except Exception:
30
  REDACT_BEFORE_LLM = False
31
  ALLOW_EXTERNAL_PHI = True
32
 
33
- from audit_log import log_event
34
- from privacy import safety_filter, refusal_reply
35
- from llm_router import cohere_chat, _co_client, cohere_embed
36
-
37
-
38
- # ———————— PERMANENT FIX: Safe .item() for floats & pandas scalars ————————
39
- def safe_item(x):
40
- """Safely extract scalar from pandas/numpy objects OR plain Python types"""
41
- try:
42
- return x.item() if hasattr(x, "item") else x
43
- except:
44
- return x
45
- # —————————————————————————————————————————————————————————————————————
46
-
47
 
 
48
  def load_markdown_text(filepath: str) -> str:
49
  try:
50
  with open(filepath, "r", encoding="utf-8") as f:
@@ -52,11 +58,15 @@ def load_markdown_text(filepath: str) -> str:
52
  except FileNotFoundError:
53
  return f"**Error:** Document `{os.path.basename(filepath)}` not found."
54
 
 
55
  def _sanitize_text(s: str) -> str:
56
  if not isinstance(s, str):
57
  return s
 
58
  return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
59
 
 
 
60
  PHI_PATTERNS = [
61
  (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
62
  (re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
@@ -67,6 +77,7 @@ PHI_PATTERNS = [
67
  (re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
68
  ]
69
 
 
70
  def redact_phi(text: str) -> str:
71
  if not isinstance(text, str):
72
  return text
@@ -75,101 +86,197 @@ def redact_phi(text: str) -> str:
75
  t = pat.sub(repl, t)
76
  return t
77
 
 
78
  def safe_log(event_name: str, meta: dict | None = None):
 
79
  try:
80
  meta = (meta or {}).copy()
81
  meta.pop("raw", None)
82
  log_event(event_name, None, meta)
83
  except Exception:
 
84
  pass
85
 
86
- # ———————— Rest of your unchanged logic (kept 100% identical) ————————
87
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
88
- EXPERT_ANALYTICAL_GUIDELINES = """
89
- --- EXPERT ANALYTICAL GUIDELINES ---
90
- When writing your script, you MUST follow these expert business rules:
91
- 1. **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
92
- you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
93
- and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
94
- 2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
95
- to create a multi-factor risk score.
96
- 3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
97
- 4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
98
- """
99
  prompt_for_coder = f"""\
100
- You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
101
- You have dataframes in a list `dfs`.
102
- {EXPERT_ANALYTICAL_GUIDELINES}
 
 
 
103
  --- DATA SCHEMA ---
104
  {schema_context}
105
  --- END DATA SCHEMA ---
106
- CRITICAL RULES:
107
- 1. **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
108
- 2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
109
- 3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
110
- 4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `safe_item()` for single values or `.tolist()` for lists.
111
- --- USER'S SCENARIO ---
112
- {user_scenario}
113
 
114
- --- PYTHON SCRIPT ---
115
- Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
116
- ```python
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
117
  """
118
  generated_text = cohere_chat(prompt_for_coder)
119
- match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
 
120
  if match:
121
  return match.group(1).strip()
122
- return "print(json.dumps({'error': 'Failed to generate a valid Python script.'}))"
 
 
 
 
 
123
 
124
  def _generate_long_report(prompt: str) -> str:
125
  try:
126
  client = _co_client()
127
  if not client:
128
  return "Error: Cohere client not initialized."
129
- response = client.chat(model=COHERE_MODEL_PRIMARY, message=prompt, max_tokens=4096)
 
 
 
 
130
  return response.text
131
  except Exception as e:
132
  safe_log("cohere_chat_error", {"err": str(e)})
133
  return f"Error during final report generation: {e}"
134
 
 
135
  def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
 
 
 
 
 
136
  prompt_for_writer = f"""\
137
- You are an expert management consultant and data analyst.
138
- A data science script has run to extract key findings. You have the user's original request and the raw JSON data.
139
- Your task is to synthesize these raw findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.
140
- --- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
141
  {user_scenario}
142
  --- END SCENARIO ---
 
143
  --- RAW DATA FINDINGS (JSON) ---
144
  {raw_data_json}
145
  --- END RAW DATA ---
146
- Now, write the final, polished report. The report MUST:
147
- 1. Follow the "Expected Output Format" requested by the user.
148
- 2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
149
- 3. Synthesize the raw data into actionable insights. Do not just copy the raw numbers; interpret them.
150
- 4. Ensure you fully address ALL evaluation questions, especially the final recommendations.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
151
  """
152
  return _generate_long_report(prompt_for_writer)
153
 
 
154
  def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
155
  return (h or []) + [{"role": r, "content": c}]
156
 
 
157
  def ping_cohere() -> str:
158
  try:
159
  cli = _co_client()
160
  if not cli:
161
  return "Cohere client not initialized."
162
  vecs = cohere_embed(["hello", "world"])
163
- return f"Cohere OK (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
164
  except Exception as e:
165
  return f"Cohere ping failed: {e}"
166
 
 
167
  def handle(user_msg: str, files: list, yield_update) -> str:
168
  try:
 
169
  safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
170
  if blocked_in:
171
  return refusal_reply(reason_in)
172
 
 
173
  redacted_in = safe_in
174
  if PHI_MODE and REDACT_BEFORE_LLM:
175
  redacted_in = redact_phi(safe_in)
@@ -177,6 +284,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
177
  file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
178
 
179
  if file_paths:
 
180
  dataframes, schema_parts = [], []
181
  for i, p in enumerate(file_paths):
182
  if p.endswith(".csv"):
@@ -185,93 +293,92 @@ def handle(user_msg: str, files: list, yield_update) -> str:
185
  except UnicodeDecodeError:
186
  df = pd.read_csv(p, encoding="latin1")
187
  dataframes.append(df)
188
- schema_parts.append(f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
189
 
190
  if not dataframes:
191
  return "Please upload at least one CSV file."
192
 
193
  schema_context = "\n".join(schema_parts)
194
- prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
195
 
196
- yield_update("```\nGenerating aligned analysis script...\n```")
 
 
 
 
197
  analysis_script = _create_python_script(prompt_for_code, schema_context)
198
- yield_update("```\nExecuting script to extract raw data...\n```")
199
-
200
- # ←←← INJECT safe_item INTO SCRIPT NAMESPACE ←←←
201
- execution_namespace = {
202
- "dfs": dataframes,
203
- "pd": pd,
204
- "re": re,
205
- "json": json,
206
- "safe_item": safe_item
207
- }
208
 
 
 
209
  output_buffer = io.StringIO()
 
210
  try:
211
  with redirect_stdout(output_buffer):
212
  exec(analysis_script, execution_namespace)
213
  raw_data_output = output_buffer.getvalue()
214
-
215
- # Robust JSON extraction
216
- try:
217
- raw_data = json.loads(raw_data_output)
218
- except json.JSONDecodeError:
219
- json_match = re.search(r'\{.*\}', raw_data_output, re.DOTALL)
220
- raw_data = json.loads(json_match.group(0)) if json_match else {}
221
-
222
- # Final safety net – convert any lingering pandas types
223
- def convert(obj):
224
- return safe_item(obj) if not isinstance(obj, (dict, list)) else obj
225
- def deep_convert(o):
226
- if isinstance(o, dict):
227
- return {k: deep_convert(v) for k, v in o.items()}
228
- elif isinstance(o, list):
229
- return [deep_convert(i) for i in o]
230
- else:
231
- return convert(o)
232
- raw_data = deep_convert(raw_data)
233
- raw_data_json = json.dumps(raw_data)
234
-
235
  except Exception as e:
236
- error_detail = f"Script execution failed: {e}\n\nGenerated script:\n```python\n{analysis_script}\n```"
237
- return error_detail if not PHI_MODE else "A critical error occurred."
238
-
239
- yield_update("```\nSynthesizing final comprehensive report...\n```")
240
- writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
241
- final_report = _generate_final_report(writer_input, raw_data_json)
 
 
 
 
242
  return _sanitize_text(final_report)
243
-
244
  else:
245
- # Pure chat mode
246
- chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
 
 
247
  prompt = f"{GENERAL_CONVERSATION_PROMPT}\n\nUser: {chat_input}\nAssistant:"
248
  return _sanitize_text(cohere_chat(prompt) or "How can I help further?")
249
 
250
  except Exception as e:
251
  tb = traceback.format_exc()
252
  safe_log("app_error", {"err": str(e)})
253
- return "A critical error occurred. Please contact your administrator." if PHI_MODE else f"Error: {e}"
 
254
 
255
  PRIVACY_POLICY_TEXT = load_markdown_text("privacy_policy.md")
256
  TERMS_OF_SERVICE_TEXT = load_markdown_text("terms_of_service.md")
257
 
258
- # ———————— FINAL WORKING CSS (Nov 2025 – Gradio 4+) ————————
 
259
  SLEEK_CSS = """
260
- /* Full-bleed layout */
261
- :root, body, #root, .gradio-container { height: 100%; margin:0; padding:0; }
262
  .gradio-container { padding: 0 !important; }
 
263
 
264
  /* Header */
265
  .header {
266
  padding: 20px 28px;
267
  background: linear-gradient(135deg, #0e1726, #1d2a44 60%, #243a5e);
268
  color: #fff;
269
- display: flex; align-items: center; justify-content: space-between; gap: 16px;
 
270
  }
271
- .header h1 { margin:0; font-size:22px; font-weight:600; letter-spacing:0.3px; }
272
- .header .badge { font-size:12px; background:#ffffff22; padding:6px 10px; border-radius:999px; }
273
 
274
- /* Main grid */
275
  .main {
276
  display: grid;
277
  grid-template-columns: 420px 1fr;
@@ -289,106 +396,244 @@ SLEEK_CSS = """
289
  .left { padding: 16px; display: flex; flex-direction: column; gap: 12px; }
290
  .right { padding: 0; display: flex; flex-direction: column; }
291
 
292
- /* Make chatbot fill entire right panel – WORKS IN 2025 */
293
- #chatbot_container {
294
- flex: 1 !important;
295
- min-height: 0;
296
- display: flex !important;
297
- flex-direction: column !important;
298
- }
299
- #chatbot_container .svelte-1cea1s5 {
300
- flex: 1 !important;
301
- min-height: 0 !important;
302
- display: flex !important;
303
- flex-direction: column !important;
304
- }
305
- #chatbot_container .messages {
306
- flex: 1 !important;
307
- overflow-y: auto !important;
308
- overflow-x: hidden !important;
309
- padding: 28px !important;
310
- min-height: 0 !important;
311
- }
312
- #chatbot_container .gr-chatbot,
313
- #chatbot_container .svelte-1cea1s5,
314
- #chatbot_container .messages { max-height: none !important; }
315
 
316
- /* Scrollbars */
317
- #chatbot_container .messages::-webkit-scrollbar {
318
- width: 8px;
319
- }
320
- #chatbot_container .messages::-webkit-scrollbar-track { background: transparent; }
321
- #chatbot_container .messages::-webkit-scrollbar-thumb {
322
- background: rgba(100,120,160,0.4);
323
- border-radius: 4px;
324
  }
325
- #chatbot_container .messages::-webkit-scrollbar-thumb:hover { background: rgba(100,120,160,0.7); }
 
 
 
 
 
 
326
 
327
- /* Code blocks */
328
- #chatbot_container pre {
329
- background: #0f1629 !important;
330
- border: 1px solid #2a3755 !important;
331
- border-radius: 8px !important;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
332
  }
 
333
  """
334
 
335
- VOICE_STT_HTML = """...""" # (your existing voice script – unchanged)
336
 
 
337
  with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
 
338
  assessment_history = gr.State([])
339
 
 
340
  with gr.Row(elem_classes=["header"]):
341
  gr.Markdown("<h1>Clarity Ops Augmented Decision Support</h1>")
342
- pill = "PHI Mode ON · history off" if (PHI_MODE and not PERSIST_HISTORY) else "PHI Mode ON" if PHI_MODE else "PHI Mode OFF"
343
  gr.Markdown(f"<span class='badge'>{pill}</span>")
344
 
 
345
  with gr.Row(elem_classes=["main"]):
 
346
  with gr.Column(elem_classes=["left"]):
347
  gr.Markdown("<div class='panel-title'>New Assessment</div>")
348
  gr.Markdown("<div class='helper'>Upload CSVs for analysis, or enter a prompt. Voice works in modern browsers.</div>")
349
- files_input = gr.Files(label="Upload Data Files (.csv)", file_count="multiple", type="filepath", file_types=[".csv"])
350
- prompt_input = gr.Textbox(label="Prompt", placeholder="Paste your scenario or question here...", lines=12, elem_id="prompt_box", autofocus=True)
351
-
 
 
 
 
 
 
 
 
 
 
352
  with gr.Row(elem_classes=["actions"]):
353
- gr.Button("Run Analysis", variant="primary")
354
- gr.Button("Clear")
355
- gr.Button("Voice")
356
 
357
  gr.Markdown("<div class='voice-hint'>Click Voice to start/stop dictation into the prompt box.</div>")
358
- gr.Button("Ping Cohere") .click(ping_cohere, outputs=gr.Markdown())
 
359
  gr.Markdown("<div class='hr'></div>")
360
-
361
  if PHI_MODE:
362
- gr.Markdown("PHI Mode: History persistence is disabled by default. Avoid unnecessary identifiers.")
363
-
 
364
  with gr.Accordion("Privacy & Terms", open=False):
365
  gr.Markdown(PRIVACY_POLICY_TEXT)
366
  gr.Markdown("<div class='hr'></div>")
367
  gr.Markdown(TERMS_OF_SERVICE_TEXT)
368
-
369
  with gr.Column(elem_classes=["right"]):
370
  with gr.Tabs(elem_classes=["tabs"]):
371
- with gr.TabItem("Current Assessment", id=0):
372
  with gr.Column(elem_id="chatbot_container"):
373
  chat_history_output = gr.Chatbot(
374
- label="Analysis Output",
375
- type="messages",
376
- container=False,
377
- autoscroll=True,
378
- elem_id="chatbot_root",
379
- height=None # Let CSS control height
380
  )
381
- with gr.TabItem("Assessment History", id=1):
382
  gr.Markdown("### Review Past Assessments")
383
- history_dropdown = gr.Dropdown(label="Select an assessment", choices=[])
384
- history_display = gr.Markdown()
 
 
385
 
 
386
  gr.HTML(VOICE_STT_HTML)
387
 
388
- # (Your event wiring stays exactly the same – unchanged)
389
- # ... (rest of your code unchanged)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
390
 
391
  if __name__ == "__main__":
392
  if not os.getenv("COHERE_API_KEY"):
393
- print("COHERE_API_KEY not set")
 
 
394
  demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
 
1
  # app.py
2
+ #
3
+ # Universal AI Data Analyst with:
4
+ # - IMPROVED: "Plan-and-Execute" logic for high-accuracy analysis.
5
+ # - IMPROVED: Professional, structured report generation.
6
+ # - IMPROVED: Enriched schema context for the AI analyst.
7
+ # - Unchanged UI, event wiring, and core infrastructure.
8
+
9
  from __future__ import annotations
10
+
11
  import io
12
  import json
13
  import os
14
  import traceback
 
15
  from contextlib import redirect_stdout
16
  from datetime import datetime
17
  from typing import Any, Dict, List
18
+
19
  import gradio as gr
20
  import pandas as pd
21
  import regex as re2
22
+ import re
23
+
24
  from langchain_cohere import ChatCohere # noqa: F401
25
+
26
  from settings import (
27
  GENERAL_CONVERSATION_PROMPT,
28
  COHERE_MODEL_PRIMARY,
29
+ COHERE_TIMEOUT_S,
30
+ USE_OPEN_FALLBACKS,
31
  )
32
+ from audit_log import log_event
33
+ from privacy import safety_filter, refusal_reply
34
+ from llm_router import cohere_chat, _co_client, cohere_embed
35
 
36
+ # Try to import optional HIPAA flags; fall back to safe defaults if not defined.
37
  try:
38
+ from settings import (
39
+ PHI_MODE,
40
+ PERSIST_HISTORY,
41
+ HISTORY_TTL_DAYS,
42
+ REDACT_BEFORE_LLM,
43
+ ALLOW_EXTERNAL_PHI,
44
+ )
45
  except Exception:
46
  PHI_MODE = False
47
  PERSIST_HISTORY = True
 
49
  REDACT_BEFORE_LLM = False
50
  ALLOW_EXTERNAL_PHI = True
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
 
53
+ # ---------------------- Helpers (analysis logic selectively improved) ----------------------
54
  def load_markdown_text(filepath: str) -> str:
55
  try:
56
  with open(filepath, "r", encoding="utf-8") as f:
 
58
  except FileNotFoundError:
59
  return f"**Error:** Document `{os.path.basename(filepath)}` not found."
60
 
61
+
62
  def _sanitize_text(s: str) -> str:
63
  if not isinstance(s, str):
64
  return s
65
+ # Remove control characters (except newline and tab)
66
  return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
67
 
68
+
69
+ # Conservative PHI redaction patterns (only applied if PHI_MODE & REDACT_BEFORE_LLM are enabled)
70
  PHI_PATTERNS = [
71
  (re.compile(r"\b\d{3}-\d{2}-\d{4}\b"), "[REDACTED_SSN]"),
72
  (re.compile(r"\b\d{9}\b"), "[REDACTED_MRN]"),
 
77
  (re.compile(r"\b\d{5}(-\d{4})?\b"), "[REDACTED_ZIP]"),
78
  ]
79
 
80
+
81
  def redact_phi(text: str) -> str:
82
  if not isinstance(text, str):
83
  return text
 
86
  t = pat.sub(repl, t)
87
  return t
88
 
89
+
90
  def safe_log(event_name: str, meta: dict | None = None):
91
+ # Avoid logging raw PHI or payloads
92
  try:
93
  meta = (meta or {}).copy()
94
  meta.pop("raw", None)
95
  log_event(event_name, None, meta)
96
  except Exception:
97
+ # Never raise from logging
98
  pass
99
 
100
+
101
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
102
+ """
103
+ IMPROVED: Generates a Python script using a universal "Map, Plan, Execute" approach.
104
+ The AI first maps user concepts to data columns, then plans and executes the analysis.
105
+ This ensures the logic is robust, dynamic, and not hardcoded to a specific dataset.
106
+ """
 
 
 
 
 
 
107
  prompt_for_coder = f"""\
108
+ You are an expert-level, universal Python data scientist. Your task is to dynamically analyze any provided dataset(s) to answer a user's business request.
109
+
110
+ --- USER'S SCENARIO ---
111
+ {user_scenario}
112
+ --- END SCENARIO ---
113
+
114
  --- DATA SCHEMA ---
115
  {schema_context}
116
  --- END DATA SCHEMA ---
 
 
 
 
 
 
 
117
 
118
+ You must follow a rigorous three-step "Map, Plan, Execute" process:
119
+
120
+ **Step 1: Map Concepts to Data.**
121
+ First, analyze the user's scenario and the provided data schemas. Identify the key business concepts (e.g., "hospitals", "sales", "regions") and metrics (e.g., "wait times", "revenue", "population"). Then, create a logical mapping from these concepts to the actual column names in the provided DataFrames. State these mappings clearly. This is the most critical step to ensure your analysis is relevant.
122
+
123
+ **Step 2: Create a Detailed Analysis Plan.**
124
+ Based on your mapping, formulate a step-by-step plan. Describe the data cleaning, merging, grouping, and aggregation steps needed to answer the user's request using the columns you identified.
125
+
126
+ **Step 3: Write the Python Script.**
127
+ Based on your plan, write a complete Python script.
128
+
129
+ CRITICAL SCRIPTING RULES:
130
+ 1. **DYNAMIC DATAFRAME IDENTIFICATION:** Your script MUST identify the correct DataFrame by checking for the presence of the columns you mapped in Step 1. Do NOT use hardcoded indices like `dfs[0]`.
131
+ 2. **ROBUST SUCCESS CHECK (MOST IMPORTANT TO PREVENT AMBIGUITY ERROR):** After attempting to find a DataFrame, you MUST check for success by comparing the result to `None`. Do NOT use `if not my_dataframe:` as this is ambiguous.
132
+ ```python
133
+ # Good, robust code
134
+ def find_df_by_cols(dfs, required_cols):
135
+ for df in dfs:
136
+ if all(col in df.columns for col in required_cols):
137
+ return df
138
+ return None
139
+
140
+ primary_df = find_df_by_cols(dfs, ['user_id', 'transaction_amount'])
141
+
142
+ # This is the correct way to check for failure
143
+ if primary_df is None:
144
+ raise ValueError("Could not find the primary dataframe based on its columns.")
145
+ ```
146
+ 3. **VERIFY COLUMN EXISTENCE:** Only use columns that you have explicitly identified and mapped.
147
+ 4. **NO FILE READING:** The data is already in the `dfs` list.
148
+ 5. **STRICTLY JSON OUTPUT:** The script's ONLY output must be a single JSON object.
149
+ 6. **ROBUST & GENERIC:** Write robust code that can handle potential missing data (`errors='coerce'`, checking for `None`).
150
+
151
+ Now, provide your response in the following format:
152
+
153
+ **ANALYSIS PLAN:**
154
+ ```text
155
+ **1. Concept-to-Column Mapping:**
156
+ - Concept: [e.g., 'Hospitals'] -> Mapped Column: [e.g., `Facility`]
157
+ - Concept: [e.g., 'Surgical Wait Time'] -> Mapped Column: [e.g., `Surgery_Median`]
158
+
159
+ **2. Step-by-Step Analysis:**
160
+ 1. **Data Identification:** [e.g., "Define a helper function to find dataframes by checking for key columns..."]
161
+ 2. **Data Cleaning:** [e.g., "Convert metric columns to numeric..."]
162
+ 3. **Analysis Step A:** [e.g., "Group the primary dataframe by the 'Facility' column and calculate the mean of the 'Surgery_Median' column..."]
163
+ 4. ...
164
+
165
+ the final JSON object]
166
+
167
+ # Your complete Python script starts here
168
+ import pandas as pd
169
+ import json
170
+ import re
171
+
172
+ # Main analysis logic...
173
+ # ...
174
+ # Final print statement
175
+ print(json.dumps(final_data_structure, indent=4))```
176
  """
177
  generated_text = cohere_chat(prompt_for_coder)
178
+ # This regex is more robust for extracting the final code block
179
+ match = re2.search(r"PYTHON SCRIPT:\s*```python\n(.*?)```", generated_text, re2.DOTALL)
180
  if match:
181
  return match.group(1).strip()
182
+ # Fallback if the structured format fails
183
+ fallback_match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
184
+ if fallback_match:
185
+ return fallback_match.group(1).strip()
186
+ return "print(json.dumps({'error': 'Failed to generate a valid Python script from the plan.'}))"
187
+
188
 
189
  def _generate_long_report(prompt: str) -> str:
190
  try:
191
  client = _co_client()
192
  if not client:
193
  return "Error: Cohere client not initialized."
194
+ response = client.chat(
195
+ model=COHERE_MODEL_PRIMARY,
196
+ message=prompt,
197
+ max_tokens=4096,
198
+ )
199
  return response.text
200
  except Exception as e:
201
  safe_log("cohere_chat_error", {"err": str(e)})
202
  return f"Error during final report generation: {e}"
203
 
204
+
205
  def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
206
+ """
207
+ IMPROVED: Generates a professional, structured report from the JSON data.
208
+ The prompt guides the AI to synthesize insights in a standard consulting format,
209
+ ensuring a high level of detail and actionable recommendations.
210
+ """
211
  prompt_for_writer = f"""\
212
+ You are an expert management consultant specializing in data-driven strategy. A Python script has been executed to extract key data points based on a user's request. Your task is to synthesize this raw data into a polished, comprehensive, and actionable report.
213
+
214
+ --- USER'S ORIGINAL SCENARIO ---
 
215
  {user_scenario}
216
  --- END SCENARIO ---
217
+
218
  --- RAW DATA FINDINGS (JSON) ---
219
  {raw_data_json}
220
  --- END RAW DATA ---
221
+
222
+ CRITICAL INSTRUCTIONS:
223
+ You must write a final report that follows this exact structure:
224
+
225
+ **### Executive Summary**
226
+ - Start with a brief paragraph summarizing the core problem, key findings, and top recommendations. This should be a high-level overview for a leadership audience.
227
+
228
+ **### 1. [First Key Finding, e.g., Hospitals with the Longest Wait Times]**
229
+ - Present the relevant data in a Markdown table.
230
+ - Write a short narrative interpreting the data. What does it mean? Are there any outliers? Why might these facilities have long waits (e.g., specialized care, rural location, capacity issues)?
231
+
232
+ **### 2. [Second Key Finding, e.g., Specialties with the Longest Wait Times]**
233
+ - Present the relevant data in a Markdown table.
234
+ - Interpret the findings. Why are these specialties facing delays (e.g., specialist shortages, equipment needs)?
235
+
236
+ **### 3. [Third Key Finding, e.g., Zone-Level Performance]**
237
+ - Present the data in a table, including a comparison to a relevant average or baseline.
238
+ - Analyze the geographic or systemic issues this data reveals.
239
+
240
+ **### 4. [Fourth Key Finding, if applicable, e.g., Geographic Distribution]**
241
+ - Synthesize location data with the wait-time findings.
242
+ - Discuss the implications for patient equity, travel burdens, and access to care.
243
+
244
+ **### 5. Recommendations for Resource Allocation**
245
+ - Provide specific, actionable, and justified recommendations.
246
+ - Structure them by category (e.g., by facility, by specialty, by zone).
247
+ - For each recommendation, provide a clear rationale directly linked to the data findings above (e.g., "Allocate additional resources to Glace Bay Hospital because it is a rural facility in a high-wait zone, suggesting a capacity bottleneck.").
248
+
249
+ **### Data Limitations**
250
+ - Briefly mention any potential limitations of the analysis (e.g., missing data, use of proxies, case severity not included). This adds credibility to the report.
251
+
252
+ Do not just repeat the JSON data. Your value is in interpreting the numbers, connecting the dots between different findings, and providing clear, data-backed strategic advice.
253
  """
254
  return _generate_long_report(prompt_for_writer)
255
 
256
+
257
  def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
258
  return (h or []) + [{"role": r, "content": c}]
259
 
260
+
261
  def ping_cohere() -> str:
262
  try:
263
  cli = _co_client()
264
  if not cli:
265
  return "Cohere client not initialized."
266
  vecs = cohere_embed(["hello", "world"])
267
+ return f"Cohere OK (model={COHERE_MODEL_PRIMARY})" if vecs else "Cohere reachable."
268
  except Exception as e:
269
  return f"Cohere ping failed: {e}"
270
 
271
+
272
  def handle(user_msg: str, files: list, yield_update) -> str:
273
  try:
274
+ # Safety filter on incoming message
275
  safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
276
  if blocked_in:
277
  return refusal_reply(reason_in)
278
 
279
+ # Optional PHI redaction for prompts sent to an external LLM
280
  redacted_in = safe_in
281
  if PHI_MODE and REDACT_BEFORE_LLM:
282
  redacted_in = redact_phi(safe_in)
 
284
  file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
285
 
286
  if file_paths:
287
+ # CSV analysis path
288
  dataframes, schema_parts = [], []
289
  for i, p in enumerate(file_paths):
290
  if p.endswith(".csv"):
 
293
  except UnicodeDecodeError:
294
  df = pd.read_csv(p, encoding="latin1")
295
  dataframes.append(df)
296
+ # --- IMPROVEMENT: ENRICHED SCHEMA CONTEXT ---
297
+ schema_buffer = io.StringIO()
298
+ df.info(buf=schema_buffer)
299
+ schema_info = schema_buffer.getvalue()
300
+ schema_parts.append(
301
+ f"""DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):
302
+ ### Head
303
+ {df.head().to_markdown()}
304
+
305
+ ### Schema and Data Types
306
+ {schema_info}
307
+
308
+ ### Summary Statistics
309
+ {df.describe(include='all').to_markdown()}
310
+ """
311
+ )
312
 
313
  if not dataframes:
314
  return "Please upload at least one CSV file."
315
 
316
  schema_context = "\n".join(schema_parts)
 
317
 
318
+ # If external PHI is not allowed, use redacted prompt; otherwise use original
319
+ prompt_for_code = (
320
+ redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
321
+ )
322
+ yield_update("```\n🧠 Generating aligned analysis script...\n```")
323
  analysis_script = _create_python_script(prompt_for_code, schema_context)
 
 
 
 
 
 
 
 
 
 
324
 
325
+ yield_update("```\n⚙️ Executing script to extract raw data...\n```")
326
+ execution_namespace = {"dfs": dataframes, "pd": pd, "re": re, "json": json}
327
  output_buffer = io.StringIO()
328
+
329
  try:
330
  with redirect_stdout(output_buffer):
331
  exec(analysis_script, execution_namespace)
332
  raw_data_output = output_buffer.getvalue()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
333
  except Exception as e:
334
+ return (
335
+ f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
336
+ f"```python\n{analysis_script}\n```"
337
+ )
338
+
339
+ yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
340
+ writer_input = (
341
+ redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
342
+ )
343
+ final_report = _generate_final_report(writer_input, raw_data_output)
344
  return _sanitize_text(final_report)
 
345
  else:
346
+ # Pure chat path
347
+ chat_input = (
348
+ redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
349
+ )
350
  prompt = f"{GENERAL_CONVERSATION_PROMPT}\n\nUser: {chat_input}\nAssistant:"
351
  return _sanitize_text(cohere_chat(prompt) or "How can I help further?")
352
 
353
  except Exception as e:
354
  tb = traceback.format_exc()
355
  safe_log("app_error", {"err": str(e)})
356
+ return ("A critical error occurred. Please contact your administrator." if PHI_MODE else f"A critical error occurred: {e}")
357
+
358
 
359
  PRIVACY_POLICY_TEXT = load_markdown_text("privacy_policy.md")
360
  TERMS_OF_SERVICE_TEXT = load_markdown_text("terms_of_service.md")
361
 
362
+
363
+ # ---------------------- Sleek UI assets (CSS/JS only) ----------------------
364
  SLEEK_CSS = """
365
+ /* Full-bleed, modern look */
366
+ :root, body, #root, .gradio-container { height: 100%; }
367
  .gradio-container { padding: 0 !important; }
368
+ .block { padding: 0 !important; }
369
 
370
  /* Header */
371
  .header {
372
  padding: 20px 28px;
373
  background: linear-gradient(135deg, #0e1726, #1d2a44 60%, #243a5e);
374
  color: #fff;
375
+ display: flex; align-items: center; justify-content: space-between;
376
+ gap: 16px;
377
  }
378
+ .header h1 { margin: 0; font-size: 22px; letter-spacing: 0.3px; font-weight: 600; }
379
+ .header .badge { font-size: 12px; opacity: 0.9; background:#ffffff22; padding:6px 10px; border-radius: 999px; }
380
 
381
+ /* Main layout */
382
  .main {
383
  display: grid;
384
  grid-template-columns: 420px 1fr;
 
396
  .left { padding: 16px; display: flex; flex-direction: column; gap: 12px; }
397
  .right { padding: 0; display: flex; flex-direction: column; }
398
 
399
+ /* Panels */
400
+ .panel-title { font-size: 14px; font-weight: 600; color: #aeb8cc; margin-bottom: 6px; }
401
+ .helper { font-size: 12px; color: #97a3bb; margin-bottom: 8px; }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
402
 
403
+ /* Sticky actions */
404
+ .actions {
405
+ display: flex; gap: 8px; align-items: center; justify-content: stretch;
 
 
 
 
 
406
  }
407
+ .actions .gr-button { flex: 1; }
408
+
409
+ /* Tabs full height */
410
+ .right .tabs { height: 100%; display: flex; flex-direction: column; }
411
+ .right .tabitem { flex: 1; display: flex; flex-direction: column; }
412
+ #chatbot_container { flex: 1; }
413
+ #chatbot_container .gr-chatbot { height: 100%; }
414
 
415
+ /* Tiny separators */
416
+ .hr { height: 1px; background: #16203b; margin: 10px 0; }
417
+
418
+ /* Voice hint */
419
+ .voice-hint { font-size: 12px; color:#9fb0cc; margin-top: 4px; }
420
+ """
421
+
422
# Browser-side dictation helper (Web Speech API). Defines the global
# `rs_toggle_stt(elemId)`, which starts or stops speech recognition and writes
# the transcript into the <textarea> inside the element with id `elemId`.
VOICE_STT_HTML = """
<script>
let __rs_rec = null;
function rs_toggle_stt(elemId){
  const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!SpeechRecognition){
    alert("This browser does not support Speech Recognition. Try Chrome or Edge.");
    return;
  }
  // Second click while listening: stop and reset.
  if (__rs_rec){ __rs_rec.stop(); __rs_rec = null; return; }

  // Resolve the target first so a failed lookup cannot leave a
  // never-started recognizer behind in the toggle state.
  const box = document.querySelector(`#${elemId} textarea`);
  if (!box){ alert("Prompt box not found."); return; }
  const base = box.value || "";

  const rec = new SpeechRecognition();
  rec.lang = "en-US";
  rec.interimResults = true;
  rec.continuous = true;

  rec.onresult = (ev) => {
    // The results list holds every result of the session, so rebuild the
    // transcript from index 0. Starting at the changed-result index would
    // drop phrases that were already finalized.
    let t = "";
    for (let i = 0; i < ev.results.length; i++){
      t += ev.results[i][0].transcript;
    }
    box.value = (base + " " + t).trim();
    // Notify the framework that the textarea value changed.
    box.dispatchEvent(new Event("input", { bubbles: true }));
  };
  // Only clear the toggle state if this recognizer is still the active one;
  // a late "end" from a stopped session must not reset a newer session.
  rec.onend = () => { if (__rs_rec === rec){ __rs_rec = null; } };

  __rs_rec = rec;
  rec.start();
}
</script>
"""
454
 
 
455
 
456
+ # ---------------------- Sleek UI (with fixed State wiring) ----------------------
457
# The voice helper is passed through `head=` so the browser actually executes
# the <script>; markup rendered through an HTML component is inserted as inert
# HTML, which would leave `rs_toggle_stt` undefined when the Voice button runs.
with gr.Blocks(
    theme=gr.themes.Soft(),
    css=SLEEK_CSS,
    fill_width=True,
    head=VOICE_STT_HTML,
) as demo:
    # Persistent in-memory history component (fixes list/_id error)
    assessment_history = gr.State([])

    # Header
    with gr.Row(elem_classes=["header"]):
        gr.Markdown("<h1>Clarity Ops Augmented Decision Support</h1>")
        # Badge text reflects the PHI configuration read from settings.
        if PHI_MODE and not PERSIST_HISTORY:
            pill = "PHI Mode ON · history off"
        elif PHI_MODE:
            pill = "PHI Mode ON"
        else:
            pill = "PHI Mode OFF"
        gr.Markdown(f"<span class='badge'>{pill}</span>")

    # Main layout
    with gr.Row(elem_classes=["main"]):
        # Left panel: inputs and actions
        with gr.Column(elem_classes=["left"]):
            gr.Markdown("<div class='panel-title'>New Assessment</div>")
            gr.Markdown("<div class='helper'>Upload CSVs for analysis, or enter a prompt. Voice works in modern browsers.</div>")
            files_input = gr.Files(
                label="Upload Data Files (.csv)",
                file_count="multiple",
                type="filepath",
                file_types=[".csv"],
            )
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Paste your scenario or question here...",
                lines=12,
                # The voice helper locates the textarea through this id.
                elem_id="prompt_box",
                autofocus=True,
            )
            with gr.Row(elem_classes=["actions"]):
                send_btn = gr.Button("▶️ Run Analysis", variant="primary")
                clear_btn = gr.Button("🧹 Clear")
                voice_btn = gr.Button("🎙️ Voice")

            gr.Markdown("<div class='voice-hint'>Click Voice to start/stop dictation into the prompt box.</div>")
            ping_btn = gr.Button("🔌 Ping Cohere")
            ping_out = gr.Markdown()
            gr.Markdown("<div class='hr'></div>")

            if PHI_MODE:
                gr.Markdown(
                    "⚠️ **PHI Mode:** History persistence is disabled by default. Avoid unnecessary identifiers."
                )
            with gr.Accordion("Privacy & Terms", open=False):
                gr.Markdown(PRIVACY_POLICY_TEXT)
                gr.Markdown("<div class='hr'></div>")
                gr.Markdown(TERMS_OF_SERVICE_TEXT)

        # Right panel: current output and history review
        with gr.Column(elem_classes=["right"]):
            with gr.Tabs(elem_classes=["tabs"]):
                with gr.TabItem("Current Assessment", id=0, elem_classes=["tabitem"]):
                    with gr.Column(elem_id="chatbot_container"):
                        chat_history_output = gr.Chatbot(
                            label="Analysis Output", type="messages"
                        )
                with gr.TabItem("Assessment History", id=1, elem_classes=["tabitem"]):
                    gr.Markdown("### Review Past Assessments")
                    history_dropdown = gr.Dropdown(
                        label="Select an assessment to review", choices=[]
                    )
                    history_display = gr.Markdown(label="Selected Assessment Details")
520
 
521
+ # --------- Event logic (unchanged analysis flow) ----------
522
def run_analysis_wrapper(prompt, files, chat_history_list, history_state_list):
    """Drive one analysis run for the UI, yielding chat/history/dropdown updates.

    This is a generator. It yields ``(chat, history, dropdown_update)`` tuples:
    first a placeholder "please wait" chat, then the final chat together with
    the (possibly extended) history list and refreshed dropdown choices. When
    ``prompt`` is empty it warns, yields the inputs unchanged, and stops.

    Args:
        prompt: Text from the prompt box.
        files: Uploaded files (paths or objects exposing ``.name``), or None.
        chat_history_list: Current chatbot messages.
        history_state_list: Saved assessment entries, or None.
    """
    if not prompt:
        gr.Warning("Please enter a prompt.")
        yield chat_history_list, history_state_list, gr.update()
        return

    conversation = _append_msg(chat_history_list, "user", prompt)

    # Optional progress callback (not streaming in this UI)
    def dummy_update(message: str):
        return None

    placeholder_chat = _append_msg(
        conversation,
        "assistant",
        "```\n🧠 Generating and executing analysis... Please wait.\n```",
    )
    yield placeholder_chat, history_state_list, gr.update()

    ai_response_text = handle(prompt, files, dummy_update)

    # The final chat replaces the placeholder: it is built from the
    # user-only conversation, not from placeholder_chat.
    final_chat = _append_msg(conversation, "assistant", ai_response_text)
    entry_id = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

    uploaded_names: List[str] = (
        [os.path.basename(getattr(f, "name", f)) for f in files] if files else []
    )

    new_entry = {
        "id": entry_id,
        "prompt": prompt,
        "files": uploaded_names,
        "response": ai_response_text,
        "chat_history": final_chat,
    }

    # History is kept only when persistence is enabled and, in PHI mode,
    # only when a positive retention window is configured.
    previous_entries: List[Dict[str, Any]] = history_state_list or []
    keep_entry = PERSIST_HISTORY and (not PHI_MODE or HISTORY_TTL_DAYS > 0)
    if keep_entry:
        updated_history = previous_entries + [new_entry]
    else:
        updated_history = previous_entries

    history_labels = []
    for saved in updated_history:
        history_labels.append(f"{saved['id']} - {saved['prompt'][:40]}...")

    yield final_chat, updated_history, gr.update(choices=history_labels)
575
+
576
def view_history(selection: str, history_state_list: List[Dict[str, Any]]) -> str:
    """Render one stored assessment as Markdown for the history tab.

    Args:
        selection: Dropdown label of the form ``"<id> - <prompt preview>..."``.
        history_state_list: Saved assessment entries; each is a dict with
            ``id``, ``prompt``, ``files``, ``response`` and ``chat_history``.

    Returns:
        The Markdown report for the matching entry, ``""`` when nothing is
        selected or no history exists, or a fixed "not found" message when
        no entry has the selected id.
    """
    if not selection or not history_state_list:
        return ""

    # The id is everything before the first " - "; partition() hands back the
    # whole label when the separator is absent, so no exception handling is
    # needed here.
    selected_id = selection.partition(" - ")[0]

    selected_assessment = next(
        (item for item in history_state_list if item.get("id") == selected_id),
        None,
    )
    if not selected_assessment:
        return "Could not find the selected assessment."

    file_list = selected_assessment.get("files", [])
    file_list_md = "\n- ".join(file_list) if file_list else "*(no files uploaded)*"

    chat_md = "\n\n".join(
        f"**{msg.get('role', '').capitalize()}:** {msg.get('content', '')}"
        for msg in selected_assessment.get("chat_history", [])
    )

    # Quote every line so a multi-line prompt stays inside the blockquote.
    prompt_lines = str(selected_assessment.get("prompt", "")).splitlines() or [""]
    quoted_prompt = "\n".join(f"> {line}" for line in prompt_lines)

    response_md = selected_assessment.get("response", "")

    # Sections are separated by blank lines. A "---" placed directly under a
    # line of text is read by Markdown as a heading underline, which would
    # turn the last paragraph of the response into a heading instead of
    # drawing a rule.
    sections = [
        f"### Assessment from: {selected_assessment['id']}",
        f"**Files Used:**\n- {file_list_md}",
        "---",
        f"**Original Prompt:**\n{quoted_prompt}",
        "---",
        f"**AI Generated Response:**\n\n{response_md}",
        "---",
        f"**Chat Transcript:**\n\n{chat_md}",
    ]
    return "\n\n".join(sections) + "\n"
614
+
615
+ # Wire events (using proper gr.State component for history)
616
# Run an analysis: reads the prompt, files, current chat and saved history;
# updates the chat, the history state and the history dropdown choices.
send_btn.click(
    run_analysis_wrapper,
    inputs=[prompt_input, files_input, chat_history_output, assessment_history],
    outputs=[chat_history_output, assessment_history, history_dropdown],
)
# Show the details of whichever saved assessment is picked in the dropdown.
history_dropdown.change(
    view_history,
    inputs=[history_dropdown, assessment_history],
    outputs=[history_display],
)
# Reset prompt, uploads and the visible chat; saved history is left untouched.
clear_btn.click(
    lambda: (None, None, []),
    outputs=[prompt_input, files_input, chat_history_output],
)
ping_btn.click(ping_cohere, outputs=[ping_out])
# `js` must be a JavaScript function: the frontend invokes the given value as
# a function, so a bare call expression would run once and then fail when its
# (undefined) result is called.
voice_btn.click(None, [], [], js="() => rs_toggle_stt('prompt_box')")
632
+
633
 
634
if __name__ == "__main__":
    # Warn (but do not abort) when the Cohere credential is missing or empty.
    cohere_key = os.environ.get("COHERE_API_KEY")
    if not cohere_key:
        print(
            "🔴 COHERE_API_KEY environment variable not set. Application may not function correctly."
        )
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    listen_port = int(os.environ.get("PORT", "7860"))
    demo.launch(server_name="0.0.0.0", server_port=listen_port)