VEDAGI1 committed on
Commit
b1a7c72
·
verified ·
1 Parent(s): 5c93af4

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +828 -11
app.py CHANGED
@@ -83,6 +83,805 @@ def safe_log(event_name: str, meta: dict | None = None):
83
  # Never raise from logging
84
  pass
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
87
  EXPERT_ANALYTICAL_GUIDELINES = """
88
  --- EXPERT ANALYTICAL GUIDELINES ---
@@ -110,6 +909,8 @@ CRITICAL RULES:
110
  2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
111
  3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
112
  4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
 
 
113
 
114
  --- USER'S SCENARIO ---
115
  {user_scenario}
@@ -141,25 +942,25 @@ def _generate_long_report(prompt: str) -> str:
141
  return f"Error during final report generation: {e}"
142
 
143
 
144
- def _generate_final_report(user_scenario: str, raw_data_json: str) -> str:
145
  prompt_for_writer = f"""\
146
  You are an expert management consultant and data analyst.
147
- A data science script has run to extract key findings. You have the user's original request and the raw JSON data.
148
 
149
- Your task is to synthesize these raw findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.
150
 
151
  --- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
152
  {user_scenario}
153
  --- END SCENARIO ---
154
 
155
- --- RAW DATA FINDINGS (JSON) ---
156
- {raw_data_json}
157
- --- END RAW DATA ---
158
 
159
  Now, write the final, polished report. The report MUST:
160
  1. Follow the "Expected Output Format" requested by the user.
161
  2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
162
- 3. Synthesize the raw data into actionable insights. Do not just copy the raw numbers; interpret them.
163
  4. Ensure you fully address ALL evaluation questions, especially the final recommendations.
164
  """
165
  return _generate_long_report(prompt_for_writer)
@@ -195,7 +996,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
195
  file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
196
 
197
  if file_paths:
198
- # CSV analysis path (unchanged)
199
  dataframes, schema_parts = [], []
200
  for i, p in enumerate(file_paths):
201
  if p.endswith(".csv"):
@@ -237,10 +1038,26 @@ def handle(user_msg: str, files: list, yield_update) -> str:
237
  f"```python\n{analysis_script}\n```"
238
  )
239
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
  yield_update("""```
241
- ✍️ Synthesizing final comprehensive report...```""")
 
242
  writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
243
- final_report = _generate_final_report(writer_input, raw_data_output)
244
  return _sanitize_text(final_report)
245
  else:
246
  # Pure chat path
@@ -481,7 +1298,7 @@ with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
481
  if not selection or not history_state_list:
482
  return ""
483
  try:
484
- selected_id = selection.split(" - ", 1)
485
  except Exception:
486
  selected_id = selection
487
 
 
83
  # Never raise from logging
84
  pass
85
 
86
+
87
+ # ---------------------- JSON Validation ----------------------
88
+
89
class JSONValidationError(Exception):
    """Signal that the analysis script's stdout was rejected by JSON validation."""
92
+
93
+
94
def validate_json_output(raw_output: str) -> Dict[str, Any]:
    """
    Validate and parse the JSON printed by the analysis script.

    The function:
    1. Strips whitespace and rejects empty output.
    2. Scans the output for JSON objects and keeps the last one that
       actually decodes, so debug prints before (or after) the final
       object do not break validation.
    3. Requires the result to be a dictionary (not an array or primitive).
    4. Rejects payloads carrying an "error" key.
    5. Rejects an empty dictionary.

    Args:
        raw_output: Raw string captured from the script's stdout.

    Returns:
        The validated dictionary of analysis findings.

    Raises:
        JSONValidationError: If the output is empty, is not valid JSON,
            is not a JSON object, reports an error, or is an empty object.
    """
    cleaned_output = raw_output.strip()

    if not cleaned_output:
        raise JSONValidationError(
            "Analysis script produced no output. The script must print a JSON object to stdout."
        )

    # Let the real JSON decoder find object boundaries instead of counting
    # braces by hand: counting is fooled by braces inside string values and
    # by stray "}" characters in debug output.
    decoder = json.JSONDecoder()
    parsed: Any = None
    found_object = False
    search_from = 0
    while True:
        start = cleaned_output.find("{", search_from)
        if start == -1:
            break
        try:
            candidate, end = decoder.raw_decode(cleaned_output, start)
        except json.JSONDecodeError:
            # This "{" does not open a valid object; try the next one.
            search_from = start + 1
            continue
        # Keep the most recent top-level object and resume after it so that
        # objects nested inside it are not mistaken for separate outputs.
        parsed, found_object = candidate, True
        search_from = end

    if not found_object:
        # No decodable object anywhere: parse the whole output so the caller
        # gets either a precise parse error or the "not an object" error.
        try:
            parsed = json.loads(cleaned_output)
        except json.JSONDecodeError as e:
            error_context = cleaned_output[:500] + ("..." if len(cleaned_output) > 500 else "")
            raise JSONValidationError(
                f"Analysis script produced invalid JSON. Parse error: {e.msg} at position {e.pos}.\n\n"
                f"Raw output (first 500 chars):\n```\n{error_context}\n```"
            ) from e

    if not isinstance(parsed, dict):
        raise JSONValidationError(
            f"Analysis output must be a JSON object (dictionary), not {type(parsed).__name__}. "
            f"Ensure your script prints a dictionary with json.dumps()."
        )

    # The generated script signals its own failures through an "error" key.
    if "error" in parsed:
        error_msg = parsed.get("error", "Unknown error")
        raise JSONValidationError(
            f"Analysis script reported an error: {error_msg}"
        )

    if not parsed:
        raise JSONValidationError(
            "Analysis script produced an empty JSON object. "
            "Ensure your script populates the output dictionary with findings."
        )

    # Log only key names and their count, never the values.
    safe_log("json_validation_success", {"keys": list(parsed.keys()), "key_count": len(parsed)})

    return parsed
184
+
185
+
186
def format_validated_json_for_report(validated_data: Dict[str, Any]) -> str:
    """
    Render validated findings as an indented JSON string for the report prompt.

    Values the encoder cannot handle natively are stringified via
    ``default=str``. If serialization still fails, the failure is logged and
    the ``str()`` of the whole dictionary is wrapped under a ``raw_data`` key.

    Args:
        validated_data: Dictionary returned by validate_json_output().

    Returns:
        A JSON string indented by two spaces.
    """
    try:
        rendered = json.dumps(
            validated_data,
            ensure_ascii=False,
            default=str,
            indent=2,
        )
    except (TypeError, ValueError) as exc:
        safe_log("json_format_warning", {"error": str(exc)})
        fallback_payload = {"raw_data": str(validated_data)}
        rendered = json.dumps(fallback_payload, indent=2)
    return rendered
206
+
207
+
208
+ # ---------------------- Analysis Script Generation ----------------------
209
+
210
def _create_python_script(user_scenario: str, schema_context: str) -> str:
    """
    Ask the LLM for an analysis script and return its Python source.

    Builds a prompt from fixed analytical guidelines, the dataframe schema and
    the user's scenario, sends it through ``cohere_chat`` and extracts the
    first fenced ``python`` block from the reply.

    Args:
        user_scenario: The user's request text, embedded verbatim in the prompt.
        schema_context: Description of the dataframes available as ``dfs``.

    Returns:
        The extracted script, or a one-line script that prints a JSON object
        with an "error" key when no fenced block could be found.
    """
    EXPERT_ANALYTICAL_GUIDELINES = """
--- EXPERT ANALYTICAL GUIDELINES ---
When writing your script, you MUST follow these expert business rules:
1. **Linking Datasets Rule:** If you need to connect facilities to health zones when the 'zone' column is not in the facility list,
you must first identify the high-priority zone from the beds data, then find the major city (by facility count) in the facility list,
and *then* assess that city's capacity. Do not try to filter the facility list by a 'zone' column if it does not exist in the schema.
2. **Prioritization Rule:** To prioritize locations, you MUST combine the most recent population data with specific high-risk health indicators
to create a multi-factor risk score.
3. **Capacity Calculation Rule:** For capacity over a 3-month window, assume **60 working days**.
4. **Cost Calculation Rule:** Sum 'Startup cost' and 'Ongoing cost' per person before multiplying.
"""
    prompt_for_coder = f"""\
You are an expert Python data scientist. Your job is to write a script to extract the data needed to answer the user's request.
You have dataframes in a list `dfs`.

{EXPERT_ANALYTICAL_GUIDELINES}

--- DATA SCHEMA ---
{schema_context}
--- END DATA SCHEMA ---

CRITICAL RULES:
1. **DO NOT READ FILES:** You MUST NOT include `pd.read_csv`. The data is ALREADY loaded in the `dfs` variable. You MUST use this variable. Failure to do so will cause a fatal error.
2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
5. **SINGLE JSON OUTPUT:** Print exactly ONE JSON object at the end of your script. Do not print debug statements or multiple JSON objects.
6. **VALID JSON STRUCTURE:** The output MUST be a dictionary/object, not an array or primitive value.

--- USER'S SCENARIO ---
{user_scenario}

--- PYTHON SCRIPT ---
Now, write the complete Python script that performs the analysis and prints a single, serializable JSON object.
```python
"""
    # The chat path in handle() guards cohere_chat() with `or`, so a falsy
    # reply is possible; coerce it to "" so the regex search cannot raise
    # TypeError and the error-script fallback below is used instead.
    generated_text = cohere_chat(prompt_for_coder) or ""
    match = re2.search(r"```python\n(.*?)```", generated_text, re2.DOTALL)
    if match:
        return match.group(1).strip()
    # The "error" key is what validate_json_output() turns into a user-facing failure.
    return "print(json.dumps({'error': 'Failed to generate a valid Python script.'}))"
252
+
253
+
254
def _generate_long_report(prompt: str) -> str:
    """
    Send ``prompt`` to the primary Cohere model and return the reply text.

    Never raises: a missing client or any exception is reported as an
    error string, and exceptions are also recorded through safe_log().
    """
    try:
        co = _co_client()
        if co:
            reply = co.chat(
                model=COHERE_MODEL_PRIMARY,
                message=prompt,
                max_tokens=4096,
            )
            return reply.text
        return "Error: Cohere client not initialized."
    except Exception as exc:
        safe_log("cohere_chat_error", {"err": str(exc)})
        return f"Error during final report generation: {exc}"
268
+
269
+
270
def _generate_final_report(user_scenario: str, validated_json_str: str) -> str:
    """
    Build the report-writer prompt and return the generated report.

    Args:
        user_scenario: The user's original request, embedded verbatim.
        validated_json_str: JSON findings already validated and formatted.

    Returns:
        Whatever _generate_long_report() returns for the assembled prompt.
    """
    return _generate_long_report(f"""\
You are an expert management consultant and data analyst.
A data science script has run to extract key findings. You have the user's original request and the validated JSON data.

Your task is to synthesize these validated findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.

--- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
{user_scenario}
--- END SCENARIO ---

--- VALIDATED DATA FINDINGS (JSON) ---
{validated_json_str}
--- END VALIDATED DATA ---

Now, write the final, polished report. The report MUST:
1. Follow the "Expected Output Format" requested by the user.
2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
3. Synthesize the validated data into actionable insights. Do not just copy the raw numbers; interpret them.
4. Ensure you fully address ALL evaluation questions, especially the final recommendations.
""")
292
+
293
+
294
+ def _append_msg(h: List[Dict[str, str]], r: str, c: str) -> List[Dict[str, str]]:
295
+ return (h or []) + [{"role": r, "content": c}]
296
+
297
+
298
def ping_cohere() -> str:
    """
    Probe the Cohere backend and return a one-line status message.

    Requests embeddings for two short strings; any exception is turned into
    a "ping failed" message rather than propagated.
    """
    try:
        if not _co_client():
            return "Cohere client not initialized."
        embeddings = cohere_embed(["hello", "world"])
        if embeddings:
            return f"Cohere OK ✅ (model={COHERE_MODEL_PRIMARY})"
        return "Cohere reachable."
    except Exception as exc:
        return f"Cohere ping failed: {exc}"
307
+
308
+
309
def handle(user_msg: str, files: list, yield_update) -> str:
    """
    Answer one user request, via CSV analysis when files are given, else chat.

    CSV path: load each CSV into a dataframe, ask the LLM for an analysis
    script, execute it with stdout captured, validate the printed JSON, then
    ask the LLM to write the final report from the validated findings.
    Chat path: send the (optionally redacted) message to ``cohere_chat``.

    Args:
        user_msg: The user's prompt.
        files: Uploaded files; each item is a path or an object with ``.name``.
        yield_update: Callable invoked with a progress message at each stage.

    Returns:
        The response text. Failures are returned as messages, never raised.
    """
    try:
        # Safety filter on incoming message
        safe_in, blocked_in, reason_in = safety_filter(user_msg, mode="input")
        if blocked_in:
            return refusal_reply(reason_in)

        # Optional PHI redaction for prompts sent to an external LLM
        redacted_in = safe_in
        if PHI_MODE and REDACT_BEFORE_LLM:
            redacted_in = redact_phi(safe_in)

        file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]

        if file_paths:
            # CSV analysis path
            dataframes, schema_parts = [], []
            for p in file_paths:
                # Accept ".CSV" as well as ".csv".
                if not p.lower().endswith(".csv"):
                    continue
                try:
                    df = pd.read_csv(p)
                except UnicodeDecodeError:
                    df = pd.read_csv(p, encoding="latin1")
                # Label with the dataframe's position in `dfs`, not the file's
                # position in the upload list: the two differ as soon as a
                # non-CSV entry is skipped, and the script indexes `dfs`.
                schema_parts.append(
                    f"DataFrame `dfs[{len(dataframes)}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
                )
                dataframes.append(df)

            if not dataframes:
                return "Please upload at least one CSV file."

            schema_context = "\n".join(schema_parts)

            # If external PHI is not allowed, use redacted prompt; otherwise use original
            prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in

            yield_update("""```
🧠 Generating aligned analysis script...
```""")
            analysis_script = _create_python_script(prompt_for_code, schema_context)

            yield_update("""```
⚙️ Executing script to extract raw data...
```""")
            execution_namespace = {"dfs": dataframes, "pd": pd, "re": re, "json": json}
            output_buffer = io.StringIO()

            try:
                with redirect_stdout(output_buffer):
                    # SECURITY NOTE(review): this executes LLM-generated code
                    # in-process with full interpreter privileges; the
                    # namespace dict is not a sandbox. Consider isolating it.
                    exec(analysis_script, execution_namespace)
                raw_data_output = output_buffer.getvalue()
            except Exception as e:
                return (
                    f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
                    f"```python\n{analysis_script}\n```"
                )

            # JSON Validation - creates hard boundary between calculation and communication
            yield_update("""```
🔍 Validating JSON output...
```""")
            try:
                validated_data = validate_json_output(raw_data_output)
                validated_json_str = format_validated_json_for_report(validated_data)
                safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
            except JSONValidationError as e:
                safe_log("json_validation_failed", {"error": str(e)})
                return (
                    f"**JSON Validation Failed**\n\n{e}\n\n"
                    f"Generated Script:\n```python\n{analysis_script}\n```"
                )

            yield_update("""```
✍️ Synthesizing final comprehensive report...
```""")
            writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
            final_report = _generate_final_report(writer_input, validated_json_str)
            return _sanitize_text(final_report)
        else:
            # Pure chat path
            chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
            prompt = f"{GENERAL_CONVERSATION_PROMPT}\n\nUser: {chat_input}\nAssistant:"
            return _sanitize_text(cohere_chat(prompt) or "How can I help further?")

    except Exception as e:
        # Top-level boundary: log, then hide exception details in PHI mode.
        safe_log("app_error", {"err": str(e)})
        return "A critical error occurred. Please contact your administrator." if PHI_MODE else f"A critical error occurred: {e}"
397
+
398
+
399
# Legal documents shown in the "Privacy & Terms" accordion, read once at import time.
PRIVACY_POLICY_TEXT, TERMS_OF_SERVICE_TEXT = (
    load_markdown_text(doc_name)
    for doc_name in ("privacy_policy.md", "terms_of_service.md")
)
401
+
402
+
403
+ # ---------------------- Sleek UI assets (CSS/JS only) ----------------------
404
+
405
# Stylesheet passed to gr.Blocks(css=...). The selectors correspond to the
# elem_classes / elem_id values used in the UI layout (header, main, left,
# right, actions, tabs, tabitem, chatbot_container, hr, voice-hint).
SLEEK_CSS = """
/* Full-bleed, modern look */
:root, body, #root, .gradio-container { height: 100%; }
.gradio-container { padding: 0 !important; }
.block { padding: 0 !important; }

/* Header */
.header {
padding: 20px 28px;
background: linear-gradient(135deg, #0e1726, #1d2a44 60%, #243a5e);
color: #fff;
display: flex; align-items: center; justify-content: space-between;
gap: 16px;
}
.header h1 { margin: 0; font-size: 22px; letter-spacing: 0.3px; font-weight: 600; }
.header .badge { font-size: 12px; opacity: 0.9; background:#ffffff22; padding:6px 10px; border-radius: 999px; }

/* Main layout */
.main {
display: grid;
grid-template-columns: 420px 1fr;
gap: 16px;
padding: 16px;
height: calc(100vh - 72px);
box-sizing: border-box;
}
.left, .right {
background: #0b1020;
color: #e9edf3;
border-radius: 16px;
border: 1px solid #1c2642;
}
.left { padding: 16px; display: flex; flex-direction: column; gap: 12px; }
.right { padding: 0; display: flex; flex-direction: column; }

/* Panels */
.panel-title { font-size: 14px; font-weight: 600; color: #aeb8cc; margin-bottom: 6px; }
.helper { font-size: 12px; color: #97a3bb; margin-bottom: 8px; }

/* Sticky actions */
.actions {
display: flex; gap: 8px; align-items: center; justify-content: stretch;
}
.actions .gr-button { flex: 1; }

/* Tabs full height */
.right .tabs { height: 100%; display: flex; flex-direction: column; }
.right .tabitem { flex: 1; display: flex; flex-direction: column; }
#chatbot_container { flex: 1; }
#chatbot_container .gr-chatbot { height: 100%; }

/* Tiny separators */
.hr { height: 1px; background: #16203b; margin: 10px 0; }

/* Voice hint */
.voice-hint { font-size: 12px; color:#9fb0cc; margin-top: 4px; }
"""
462
+
463
# Browser-side dictation helper injected with gr.HTML(). rs_toggle_stt(elemId)
# starts or stops speech recognition and writes the transcript into the
# <textarea> inside the element with that id.
#
# Fixes relative to the previous script:
#   * the transcript lives on the first alternative (results[i][0]), not on
#     the result itself, so the old code appended "undefined";
#   * the transcript is rebuilt from result 0 on every event, because starting
#     at ev.resultIndex dropped segments that had already been finalized;
#   * the textarea is looked up before the recognizer is created, so an early
#     return no longer leaves a never-started recognizer in __rs_rec;
#   * onend only clears __rs_rec if it still refers to the recognizer that
#     ended, so a quick stop/start does not null the new session.
VOICE_STT_HTML = """
<script>
let __rs_rec = null;
function rs_toggle_stt(elemId){
  const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  if (!SpeechRecognition){
    alert("This browser does not support Speech Recognition. Try Chrome or Edge.");
    return;
  }
  if (__rs_rec){ __rs_rec.stop(); __rs_rec = null; return; }

  const box = document.querySelector(`#${elemId} textarea`);
  if (!box){ alert("Prompt box not found."); return; }
  const base = box.value || "";

  const rec = new SpeechRecognition();
  rec.lang = "en-US";
  rec.interimResults = true;
  rec.continuous = true;

  rec.onresult = (ev) => {
    let t = "";
    for (let i = 0; i < ev.results.length; i++){
      t += ev.results[i][0].transcript;
    }
    box.value = (base + " " + t).trim();
    box.dispatchEvent(new Event("input", { bubbles: true }));
  };
  rec.onend = () => { if (__rs_rec === rec){ __rs_rec = null; } };
  __rs_rec = rec;
  rec.start();
}
</script>
"""
495
+
496
+
497
+ # ---------------------- Sleek UI (with fixed State wiring) ----------------------
498
+
499
# Gradio UI: header, left input panel, right output/history tabs, and the
# event wiring between them. NOTE(review): SOURCE has lost its indentation, so
# the nesting of components below is reconstructed from the layout comments
# and CSS classes; confirm against the running app.
with gr.Blocks(theme=gr.themes.Soft(), css=SLEEK_CSS, fill_width=True) as demo:
    # Persistent in-memory history component (fixes list/_id error)
    assessment_history = gr.State([])

    # Header
    with gr.Row(elem_classes=["header"]):
        gr.Markdown("<h1>Clarity Ops Augmented Decision Support</h1>")
        pill = "PHI Mode ON · history off" if (PHI_MODE and not PERSIST_HISTORY) else \
            "PHI Mode ON" if PHI_MODE else "PHI Mode OFF"
        gr.Markdown(f"<span class='badge'>{pill}</span>")

    # Main layout
    with gr.Row(elem_classes=["main"]):
        # Left panel
        with gr.Column(elem_classes=["left"]):
            gr.Markdown("<div class='panel-title'>New Assessment</div>")
            gr.Markdown("<div class='helper'>Upload CSVs for analysis, or enter a prompt. Voice works in modern browsers.</div>")
            files_input = gr.Files(
                label="Upload Data Files (.csv)",
                file_count="multiple",
                type="filepath",
                file_types=[".csv"],
            )
            prompt_input = gr.Textbox(
                label="Prompt",
                placeholder="Paste your scenario or question here...",
                lines=12,
                elem_id="prompt_box",
                autofocus=True,
            )

            with gr.Row(elem_classes=["actions"]):
                send_btn = gr.Button("▶️ Run Analysis", variant="primary")
                clear_btn = gr.Button("🧹 Clear")
                voice_btn = gr.Button("🎙️ Voice")

            gr.Markdown("<div class='voice-hint'>Click Voice to start/stop dictation into the prompt box.</div>")
            ping_btn = gr.Button("🔌 Ping Cohere")
            ping_out = gr.Markdown()

            gr.Markdown("<div class='hr'></div>")
            if PHI_MODE:
                gr.Markdown(
                    "⚠️ **PHI Mode:** History persistence is disabled by default. Avoid unnecessary identifiers."
                )

            with gr.Accordion("Privacy & Terms", open=False):
                gr.Markdown(PRIVACY_POLICY_TEXT)
                gr.Markdown("<div class='hr'></div>")
                gr.Markdown(TERMS_OF_SERVICE_TEXT)

        # Right panel
        with gr.Column(elem_classes=["right"]):
            with gr.Tabs(elem_classes=["tabs"]):
                with gr.TabItem("Current Assessment", id=0, elem_classes=["tabitem"]):
                    with gr.Column(elem_id="chatbot_container"):
                        chat_history_output = gr.Chatbot(label="Analysis Output", type="messages", container=False, autoscroll=True)
                with gr.TabItem("Assessment History", id=1, elem_classes=["tabitem"]):
                    gr.Markdown("### Review Past Assessments")
                    history_dropdown = gr.Dropdown(label="Select an assessment to review", choices=[])
                    history_display = gr.Markdown(label="Selected Assessment Details")

    # Inject voice-to-text helper
    gr.HTML(VOICE_STT_HTML)

    # --------- Event logic (unchanged analysis flow) ----------

    def run_analysis_wrapper(prompt, files, chat_history_list, history_state_list):
        """
        Generator callback for the Run button.

        Yields (chat messages, history state, dropdown update) twice: first
        with a "thinking" placeholder, then with the final response and the
        refreshed history choices. With an empty prompt it warns and yields
        the inputs unchanged.
        """
        if not prompt:
            gr.Warning("Please enter a prompt.")
            yield chat_history_list, history_state_list, gr.update()
            return

        # Append user's message
        chat_with_user_msg = _append_msg(chat_history_list, "user", prompt)

        # Optional progress callback (not streaming in this UI)
        def dummy_update(message: str):
            pass

        # Thinking bubble
        thinking_message = _append_msg(
            chat_with_user_msg,
            "assistant",
            """```
🧠 Generating and executing analysis... Please wait.
```""",
        )
        yield thinking_message, history_state_list, gr.update()

        # Run analysis/chat
        ai_response_text = handle(prompt, files, dummy_update)

        # Append final assistant response (replaces the thinking bubble)
        final_chat = _append_msg(chat_with_user_msg, "assistant", ai_response_text)
        timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

        # Capture filenames (if any)
        file_names: List[str] = []
        if files:
            file_names = [
                os.path.basename(f.name if hasattr(f, "name") else f) for f in files
            ]

        # Build history record; the timestamp doubles as the record id that
        # view_history() parses back out of the dropdown label.
        new_entry = {
            "id": timestamp,
            "prompt": prompt,
            "files": file_names,
            "response": ai_response_text,
            "chat_history": final_chat,
        }

        # Respect PHI/history flags: in PHI mode history is kept only when a
        # positive TTL is configured.
        if PERSIST_HISTORY and (not PHI_MODE or HISTORY_TTL_DAYS > 0):
            updated_history: List[Dict[str, Any]] = (history_state_list or []) + [new_entry]
        else:
            updated_history = history_state_list or []

        history_labels = [f"{item['id']} - {item['prompt'][:40]}..." for item in updated_history]

        yield final_chat, updated_history, gr.update(choices=history_labels)

    def view_history(selection: str, history_state_list: List[Dict[str, Any]]) -> str:
        """
        Render the history record chosen in the dropdown as Markdown.

        Returns "" when nothing is selected or no history exists, and a
        not-found message when the label's id matches no record.
        """
        if not selection or not history_state_list:
            return ""
        try:
            # Labels look like "<id> - <prompt prefix>..."; keep the id part.
            selected_id = selection.split(" - ", 1)[0]
        except Exception:
            selected_id = selection

        selected_assessment = next(
            (item for item in history_state_list if item.get("id") == selected_id), None
        )
        if not selected_assessment:
            return "Could not find the selected assessment."

        file_list = selected_assessment.get("files", [])
        file_list_md = "\n- ".join(file_list) if file_list else "*(no files uploaded)*"

        chat_entries = selected_assessment.get("chat_history", [])
        chat_md_lines = []
        for msg in chat_entries:
            role = msg.get("role", "").capitalize()
            content = msg.get("content", "")
            chat_md_lines.append(f"**{role}:** {content}")
        chat_md = "\n\n".join(chat_md_lines)

        return f"""### Assessment from: {selected_assessment['id']}
**Files Used:**
- {file_list_md}
---
**Original Prompt:**
> {selected_assessment['prompt']}
---
**AI Generated Response:**
{selected_assessment['response']}
---
**Chat Transcript:**
{chat_md}
"""

    # Wire events (using proper gr.State component for history)
    send_btn.click(
        run_analysis_wrapper,
        inputs=[prompt_input, files_input, chat_history_output, assessment_history],
        outputs=[chat_history_output, assessment_history, history_dropdown],
    )
    history_dropdown.change(
        view_history,
        inputs=[history_dropdown, assessment_history],
        outputs=[history_display],
    )
    clear_btn.click(
        lambda: (None, None, []),
        outputs=[prompt_input, files_input, chat_history_output],
    )
    ping_btn.click(ping_cohere, outputs=[ping_out])
    voice_btn.click(None, [], [], js="rs_toggle_stt('prompt_box')")
678
+
679
+
680
if __name__ == "__main__":
    # Warn (but still start) when the API key is missing.
    if not os.getenv("COHERE_API_KEY"):
        print("🔴 COHERE_API_KEY environment variable not set. Application may not function correctly.")
    # Listen on all interfaces; the port comes from PORT, defaulting to 7860.
    demo.launch(server_name="0.0.0.0", server_port=int(os.getenv("PORT", "7860")))
    # NOTE(review): a `from __future__ import annotations` was fused onto the
    # end of the launch call above. That is a SyntaxError, and a __future__
    # import is only legal at the very top of a module, so it was dropped.
    # The text after this guard looks like a second pasted copy of the module
    # and should be deleted.
684
+
685
+ import io
686
+ import json
687
+ import os
688
+ import traceback
689
+ from contextlib import redirect_stdout
690
+ from datetime import datetime
691
+ from typing import Any, Dict, List
692
+
693
+ import gradio as gr
694
+ import pandas as pd
695
+ import regex as re2
696
+ import re
697
+ from langchain_cohere import ChatCohere # noqa: F401
698
+ from settings import (
699
+ GENERAL_CONVERSATION_PROMPT,
700
+ COHERE_MODEL_PRIMARY,
701
+ COHERE_TIMEOUT_S, # noqa: F401
702
+ USE_OPEN_FALLBACKS # noqa: F401
703
+ )
704
+ # Try to import optional HIPAA flags; fall back to safe defaults if not defined.
705
+ try:
706
+ from settings import PHI_MODE, PERSIST_HISTORY, HISTORY_TTL_DAYS, REDACT_BEFORE_LLM, ALLOW_EXTERNAL_PHI
707
+ except Exception:
708
+ PHI_MODE = False
709
+ PERSIST_HISTORY = True
710
+ HISTORY_TTL_DAYS = 365
711
+ REDACT_BEFORE_LLM = False
712
+ ALLOW_EXTERNAL_PHI = True
713
+
714
+ from audit_log import log_event
715
+ from privacy import safety_filter, refusal_reply
716
+ from llm_router import cohere_chat, _co_client, cohere_embed
717
+
718
+ # ---------------------- Helpers (analysis logic unchanged) ----------------------
719
def load_markdown_text(filepath: str) -> str:
    """
    Read a UTF-8 text file and return its contents.

    When the file does not exist, a Markdown error message naming the file's
    basename is returned instead of raising.
    """
    try:
        with open(filepath, "r", encoding="utf-8") as md_file:
            contents = md_file.read()
    except FileNotFoundError:
        missing_name = os.path.basename(filepath)
        return f"**Error:** Document `{missing_name}` not found."
    return contents
725
+
726
+ def _sanitize_text(s: str) -> str:
727
+ if not isinstance(s, str):
728
+ return s
729
+ # Remove control characters (except newline and tab)
730
+ return re2.sub(r"[\p{C}--[\n\t]]+", "", s)
731
+
732
+ # Conservative PHI redaction patterns (only applied if PHI_MODE & REDACT_BEFORE_LLM are enabled)
733
# (compiled pattern, placeholder) pairs, applied in this order by redact_phi().
PHI_PATTERNS = [
    (re.compile(pattern_src), placeholder)
    for pattern_src, placeholder in (
        (r"\b\d{3}-\d{2}-\d{4}\b", "[REDACTED_SSN]"),
        (r"\b\d{9}\b", "[REDACTED_MRN]"),
        (r"\b\d{3}[-.\s]?\d{3}[-.\s]?\d{4}\b", "[REDACTED_PHONE]"),
        (r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", "[REDACTED_EMAIL]"),
        (r"\b(19|20)\d{2}-\d{2}-\d{2}\b", "[REDACTED_DOB]"),
        (r"\b\d{2}/\d{2}/(19|20)\d{2}\b", "[REDACTED_DOB]"),
        (r"\b\d{5}(-\d{4})?\b", "[REDACTED_ZIP]"),
    )
]


def redact_phi(text: str) -> str:
    """
    Replace every PHI_PATTERNS match in ``text`` with its placeholder.

    Patterns are applied sequentially, each on the output of the previous
    one. Non-string input is returned unchanged.
    """
    if isinstance(text, str):
        for pattern, placeholder in PHI_PATTERNS:
            text = pattern.sub(placeholder, text)
    return text
750
+
751
def safe_log(event_name: str, meta: dict | None = None):
    """
    Best-effort audit logging that never raises.

    Works on a copy of ``meta`` with any "raw" entry removed, then forwards
    it to ``log_event``. Every exception is deliberately swallowed so that
    logging cannot break the caller.
    """
    try:
        payload = (meta or {}).copy()
        # Avoid logging raw PHI or payloads
        payload.pop("raw", None)
        log_event(event_name, None, payload)
    except Exception:
        # Never raise from logging
        pass
760
+
761
+
762
+ # ---------------------- JSON Validation ----------------------
763
+
764
class JSONValidationError(Exception):
    """Signal that the analysis script's stdout was rejected by JSON validation."""
767
+
768
+
769
+ def validate_json_output(raw_output: str) -> Dict[str, Any]:
770
+ """
771
+ Validates and parses JSON output from the analysis script.
772
+
773
+ This creates the "hard boundary" between calculation and communication
774
+ as described in the ClarityOps architecture. The function:
775
+ 1. Strips whitespace and handles empty output
776
+ 2. Attempts to parse as JSON
777
+ 3. Validates the structure is a dictionary (not array or primitive)
778
+ 4. Checks for error indicators in the output
779
+ 5. Returns validated Python dict for report generation
780
+
781
+ Args:
782
+ raw_output: Raw string captured from script stdout
783
+
784
+ Returns:
785
+ Validated dictionary containing analysis findings
786
+
787
+ Raises:
788
+ JSONValidationError: If output is empty, malformed, or contains errors
789
+ """
790
+ # Strip whitespace
791
+ cleaned_output = raw_output.strip()
792
+
793
+ # Check for empty output
794
+ if not cleaned_output:
795
+ raise JSONValidationError(
796
+ "Analysis script produced no output. The script must print a JSON object to stdout."
797
+ )
798
+
799
+ # Handle multiple JSON objects (take the last complete one)
800
+ # This handles cases where debug prints precede the final JSON
801
+ json_candidates = []
802
+ brace_count = 0
803
+ current_start = None
804
+
805
+ for i, char in enumerate(cleaned_output):
806
+ if char == '{':
807
+ if brace_count == 0:
808
+ current_start = i
809
+ brace_count += 1
810
+ elif char == '}':
811
+ brace_count -= 1
812
+ if brace_count == 0 and current_start is not None:
813
+ json_candidates.append(cleaned_output[current_start:i+1])
814
+ current_start = None
815
+
816
+ # If no valid JSON structure found, try parsing the whole output
817
+ if not json_candidates:
818
+ json_to_parse = cleaned_output
819
+ else:
820
+ # Use the last JSON object (most likely the final output)
821
+ json_to_parse = json_candidates[-1]
822
+
823
+ # Attempt JSON parsing
824
+ try:
825
+ parsed = json.loads(json_to_parse)
826
+ except json.JSONDecodeError as e:
827
+ # Provide helpful error message with context
828
+ error_context = cleaned_output[:500] + ("..." if len(cleaned_output) > 500 else "")
829
+ raise JSONValidationError(
830
+ f"Analysis script produced invalid JSON. Parse error: {e.msg} at position {e.pos}.\n\n"
831
+ f"Raw output (first 500 chars):\n```\n{error_context}\n```"
832
+ )
833
+
834
+ # Validate structure is a dictionary
835
+ if not isinstance(parsed, dict):
836
+ raise JSONValidationError(
837
+ f"Analysis output must be a JSON object (dictionary), not {type(parsed).__name__}. "
838
+ f"Ensure your script prints a dictionary with json.dumps()."
839
+ )
840
+
841
+ # Check for error indicators in the output
842
+ if "error" in parsed:
843
+ error_msg = parsed.get("error", "Unknown error")
844
+ raise JSONValidationError(
845
+ f"Analysis script reported an error: {error_msg}"
846
+ )
847
+
848
+ # Validate output is not empty dict
849
+ if not parsed:
850
+ raise JSONValidationError(
851
+ "Analysis script produced an empty JSON object. "
852
+ "Ensure your script populates the output dictionary with findings."
853
+ )
854
+
855
+ # Log successful validation (without sensitive data)
856
+ safe_log("json_validation_success", {"keys": list(parsed.keys()), "key_count": len(parsed)})
857
+
858
+ return parsed
859
+
860
+
861
def format_validated_json_for_report(validated_data: Dict[str, Any]) -> str:
    """
    Render validated findings as an indented JSON string for the report writer.

    Serialization uses a two-space indent, keeps non-ASCII characters as-is,
    and stringifies any value the JSON encoder does not natively support.
    If serialization still fails, the failure is logged and the ``str()``
    form of the data is returned wrapped under a ``"raw_data"`` key.

    Args:
        validated_data: Validated dictionary from validate_json_output()

    Returns:
        Formatted JSON string ready for report generation
    """
    try:
        rendered = json.dumps(
            validated_data,
            ensure_ascii=False,
            default=str,
            indent=2,
        )
    except (TypeError, ValueError) as exc:
        # Last-resort fallback so report generation can still proceed.
        safe_log("json_format_warning", {"error": str(exc)})
        fallback_payload = {"raw_data": str(validated_data)}
        rendered = json.dumps(fallback_payload, indent=2)
    return rendered
881
+
882
+
883
+ # ---------------------- Analysis Script Generation ----------------------
884
+
885
  def _create_python_script(user_scenario: str, schema_context: str) -> str:
886
  EXPERT_ANALYTICAL_GUIDELINES = """
887
  --- EXPERT ANALYTICAL GUIDELINES ---
 
909
  2. **JSON OUTPUT ONLY:** Your script's ONLY output must be a single JSON object printed to stdout containing the raw data findings.
910
  3. **BE PRECISE:** Use the exact, case-sensitive column names from the schema and robustly clean strings (`re.sub()`) before converting to numbers.
911
  4. **JSON SERIALIZATION:** Before adding data to your final dictionary for JSON conversion, you MUST convert any pandas-specific types (like `int64`) to standard Python types using `.item()` for single values or `.tolist()` for lists.
912
+ 5. **SINGLE JSON OUTPUT:** Print exactly ONE JSON object at the end of your script. Do not print debug statements or multiple JSON objects.
913
+ 6. **VALID JSON STRUCTURE:** The output MUST be a dictionary/object, not an array or primitive value.
914
 
915
  --- USER'S SCENARIO ---
916
  {user_scenario}
 
942
  return f"Error during final report generation: {e}"
943
 
944
 
945
def _generate_final_report(user_scenario: str, validated_json_str: str) -> str:
    """Build the report-writer prompt and delegate to ``_generate_long_report``.

    The prompt embeds the user's scenario and the validated JSON findings
    between labelled delimiters, followed by the formatting requirements
    for the final report.

    Args:
        user_scenario: The user's original request text.
        validated_json_str: JSON string of validated analysis findings.

    Returns:
        Whatever ``_generate_long_report`` returns for the assembled prompt.
    """
    writer_prompt = f"""\
You are an expert management consultant and data analyst.
A data science script has run to extract key findings. You have the user's original request and the validated JSON data.

Your task is to synthesize these validated findings into a single, comprehensive, and professional report that directly answers all of the user's questions with detailed justifications.

--- USER'S ORIGINAL SCENARIO & DELIVERABLES ---
{user_scenario}
--- END SCENARIO ---

--- VALIDATED DATA FINDINGS (JSON) ---
{validated_json_str}
--- END VALIDATED DATA ---

Now, write the final, polished report. The report MUST:
1. Follow the "Expected Output Format" requested by the user.
2. Use tables, bullet points, and DETAILED narrative justifications for each recommendation.
3. Synthesize the validated data into actionable insights. Do not just copy the raw numbers; interpret them.
4. Ensure you fully address ALL evaluation questions, especially the final recommendations.
"""
    return _generate_long_report(writer_prompt)
 
996
  file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
997
 
998
  if file_paths:
999
+ # CSV analysis path
1000
  dataframes, schema_parts = [], []
1001
  for i, p in enumerate(file_paths):
1002
  if p.endswith(".csv"):
 
1038
  f"```python\n{analysis_script}\n```"
1039
  )
1040
 
1041
+ # JSON Validation - creates hard boundary between calculation and communication
1042
+ yield_update("""```
1043
+ 🔍 Validating JSON output...
1044
+ ```""")
1045
+ try:
1046
+ validated_data = validate_json_output(raw_data_output)
1047
+ validated_json_str = format_validated_json_for_report(validated_data)
1048
+ safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
1049
+ except JSONValidationError as e:
1050
+ safe_log("json_validation_failed", {"error": str(e)})
1051
+ return (
1052
+ f"**JSON Validation Failed**\n\n{e}\n\n"
1053
+ f"Generated Script:\n```python\n{analysis_script}\n```"
1054
+ )
1055
+
1056
  yield_update("""```
1057
+ ✍️ Synthesizing final comprehensive report...
1058
+ ```""")
1059
  writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
1060
+ final_report = _generate_final_report(writer_input, validated_json_str)
1061
  return _sanitize_text(final_report)
1062
  else:
1063
  # Pure chat path
 
1298
  if not selection or not history_state_list:
1299
  return ""
1300
  try:
1301
+ selected_id = selection.split(" - ", 1)[0]
1302
  except Exception:
1303
  selected_id = selection
1304