VEDAGI1 committed on
Commit
c78a9de
·
verified ·
1 Parent(s): 1e4ca95

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +161 -1
app.py CHANGED
@@ -80,6 +80,141 @@ def safe_log(event_name: str, meta: dict | None = None):
80
  pass
81
 
82
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
  # ---------------------- JSON Validation ----------------------
84
 
85
  class JSONValidationError(Exception):
@@ -663,21 +798,32 @@ def handle(user_msg: str, files: list, yield_update) -> str:
663
  # Schema Validation - examines column names, data types, and value ranges
664
  yield_update("```\n🔎 Validating input schema...\n```")
665
  try:
666
- validate_all_dataframes(dataframes, filenames)
667
  except SchemaValidationError as e:
668
  safe_log("schema_validation_failed", {"error": str(e)})
669
  return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
670
 
 
 
 
 
 
671
  schema_context = "\n".join(schema_parts)
672
  prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
673
 
674
  yield_update("```\n🧠 Generating aligned analysis script...\n```")
675
  analysis_script = _create_python_script(prompt_for_code, schema_context)
676
 
 
 
 
677
  yield_update("```\n⚙️ Executing script in sandbox...\n```")
678
  try:
679
  raw_data_output = execute_in_sandbox(analysis_script, dataframes)
 
680
  except SandboxViolationError as e:
 
 
681
  safe_log("sandbox_violation", {"error": str(e)})
682
  return (
683
  f"**Security Violation Detected**\n\n{e}\n\n"
@@ -686,6 +832,8 @@ def handle(user_msg: str, files: list, yield_update) -> str:
686
  f"Generated Script:\n```python\n{analysis_script}\n```"
687
  )
688
  except Exception as e:
 
 
689
  return (
690
  f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
691
  f"```python\n{analysis_script}\n```"
@@ -698,6 +846,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
698
  validated_json_str = format_validated_json_for_report(validated_data)
699
  safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
700
  except JSONValidationError as e:
 
701
  safe_log("json_validation_failed", {"error": str(e)})
702
  return (
703
  f"**JSON Validation Failed**\n\n{e}\n\n"
@@ -707,6 +856,17 @@ def handle(user_msg: str, files: list, yield_update) -> str:
707
  yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
708
  writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
709
  final_report = _generate_final_report(writer_input, validated_json_str)
 
 
 
 
 
 
 
 
 
 
 
710
  return _sanitize_text(final_report)
711
  else:
712
  chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
 
80
  pass
81
 
82
 
83
+ # ---------------------- Audit Trail ----------------------
84
+
85
+ import hashlib
86
+ from datetime import datetime as dt
87
+
88
+
89
+ def _hash_content(content: str) -> str:
90
+ """Generate a short hash for content identification without storing full content."""
91
+ return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
92
+
93
+
94
+ def _safe_truncate(text: str, max_length: int = 500) -> str:
95
+ """Safely truncate text for logging without exposing sensitive data."""
96
+ if not text or len(text) <= max_length:
97
+ return text
98
+ return text[:max_length] + f"... [truncated, {len(text)} total chars]"
99
+
100
+
101
def log_analysis_start(user_prompt: str, filenames: List[str], schema_summary: List[Dict[str, Any]]) -> str:
    """
    Record the beginning of an analysis session and return its session id.

    The session id is the current local time formatted as
    ``YYYYMMDD_HHMMSS_`` followed by the first 8 characters of the prompt
    hash. The prompt itself is never logged; only its hash and length are.
    For each entry of ``schema_summary`` only the ``filename``, ``rows``,
    ``columns``, ``column_names`` and ``dtypes`` keys are copied into the log
    (missing keys are logged as ``None``).

    Returns:
        The session id to pass to the other ``log_*`` helpers so their
        entries can be correlated.
    """
    prompt_digest = _hash_content(user_prompt)
    session_id = dt.now().strftime("%Y%m%d_%H%M%S_") + prompt_digest[:8]

    # Project each schema onto a fixed set of metadata keys so that nothing
    # else present in the schema dict reaches the log.
    logged_fields = ("filename", "rows", "columns", "column_names", "dtypes")
    schema_log = [
        {field: entry.get(field) for field in logged_fields}
        for entry in schema_summary
    ]

    event = {
        "session_id": session_id,
        "prompt_hash": prompt_digest,
        "prompt_length": len(user_prompt),
        "file_count": len(filenames),
        "filenames": filenames,
        "schemas": schema_log,
        "timestamp": dt.now().isoformat(),
    }
    safe_log("analysis_session_start", event)

    return session_id
132
+
133
+
134
def log_code_generation(session_id: str, generated_code: str) -> None:
    """
    Record metadata about the generated analysis script.

    The script text itself is not logged. The entry carries its hash, its
    character length, its line count (newline count + 1) and a list of
    operation labels detected in it.

    Detection is plain substring matching on the script text, so a label
    means "this token appears somewhere in the code", not that the
    corresponding pandas operation is actually performed.
    """
    # (label, substrings) pairs, in the order labels are appended to the log.
    # "aggregate" is already covered by "agg"; it is kept for readability.
    detectors = (
        ("groupby", ("groupby",)),
        ("merge/join", ("merge", "join")),
        ("pivot", ("pivot",)),
        ("aggregate", ("agg", "aggregate")),
        ("sort", ("sort",)),
        ("filter", ("filter", ".loc[", ".query(")),
        ("statistics", ("mean(", "sum(", "count(")),
    )
    code_operations = [
        label
        for label, needles in detectors
        if any(needle in generated_code for needle in needles)
    ]

    event = {
        "session_id": session_id,
        "code_hash": _hash_content(generated_code),
        "code_length": len(generated_code),
        "code_lines": generated_code.count('\n') + 1,
        "operations_detected": code_operations,
        "timestamp": dt.now().isoformat(),
    }
    safe_log("code_generation", event)
166
+
167
+
168
def log_code_execution(session_id: str, success: bool, output_size: int, error: str = None) -> None:
    """
    Record the outcome of running the generated script.

    Args:
        session_id: Id returned by ``log_analysis_start``.
        success: Whether execution finished without raising.
        output_size: Size of the execution output, logged under
            ``output_size_bytes``.
        error: Optional error text. A falsy value (``None`` or ``""``) is
            logged as ``None``; anything else is truncated for logging.
    """
    logged_error = None
    if error:
        logged_error = _safe_truncate(error)

    event = {
        "session_id": session_id,
        "success": success,
        "output_size_bytes": output_size,
        "error": logged_error,
        "timestamp": dt.now().isoformat(),
    }
    safe_log("code_execution", event)
181
+
182
+
183
def log_analysis_complete(
    session_id: str,
    validated_output_keys: List[str],
    report_length: int,
    total_duration_ms: float = None
) -> None:
    """
    Record that an analysis session finished successfully.

    Args:
        session_id: Id returned by ``log_analysis_start``.
        validated_output_keys: Keys of the validated output; logged together
            with their count.
        report_length: Length of the final report.
        total_duration_ms: Optional wall-clock duration of the session in
            milliseconds; logged as given (``None`` when not supplied).
    """
    event = {
        "session_id": session_id,
        "output_keys": validated_output_keys,
        "output_key_count": len(validated_output_keys),
        "report_length": report_length,
        "duration_ms": total_duration_ms,
        "timestamp": dt.now().isoformat(),
    }
    safe_log("analysis_session_complete", event)
202
+
203
+
204
def log_analysis_error(session_id: str, error_type: str, error_message: str) -> None:
    """
    Record that an analysis session failed.

    Args:
        session_id: Id returned by ``log_analysis_start``.
        error_type: Short category label for the failure.
        error_message: Error text; truncated for logging before being
            written. The retained part is logged verbatim.
    """
    event = {
        "session_id": session_id,
        "error_type": error_type,
        "error_message": _safe_truncate(error_message),
        "timestamp": dt.now().isoformat(),
    }
    safe_log("analysis_session_error", event)
216
+
217
+
218
  # ---------------------- JSON Validation ----------------------
219
 
220
  class JSONValidationError(Exception):
 
798
  # Schema Validation - examines column names, data types, and value ranges
799
  yield_update("```\n🔎 Validating input schema...\n```")
800
  try:
801
+ schema_infos = validate_all_dataframes(dataframes, filenames)
802
  except SchemaValidationError as e:
803
  safe_log("schema_validation_failed", {"error": str(e)})
804
  return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
805
 
806
+ # Start audit trail session
807
+ import time as _time
808
+ _start_time = _time.time()
809
+ session_id = log_analysis_start(safe_in, filenames, schema_infos)
810
+
811
  schema_context = "\n".join(schema_parts)
812
  prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
813
 
814
  yield_update("```\n🧠 Generating aligned analysis script...\n```")
815
  analysis_script = _create_python_script(prompt_for_code, schema_context)
816
 
817
+ # Log generated code
818
+ log_code_generation(session_id, analysis_script)
819
+
820
  yield_update("```\n⚙️ Executing script in sandbox...\n```")
821
  try:
822
  raw_data_output = execute_in_sandbox(analysis_script, dataframes)
823
+ log_code_execution(session_id, success=True, output_size=len(raw_data_output))
824
  except SandboxViolationError as e:
825
+ log_code_execution(session_id, success=False, output_size=0, error=str(e))
826
+ log_analysis_error(session_id, "sandbox_violation", str(e))
827
  safe_log("sandbox_violation", {"error": str(e)})
828
  return (
829
  f"**Security Violation Detected**\n\n{e}\n\n"
 
832
  f"Generated Script:\n```python\n{analysis_script}\n```"
833
  )
834
  except Exception as e:
835
+ log_code_execution(session_id, success=False, output_size=0, error=str(e))
836
+ log_analysis_error(session_id, "execution_error", str(e))
837
  return (
838
  f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
839
  f"```python\n{analysis_script}\n```"
 
846
  validated_json_str = format_validated_json_for_report(validated_data)
847
  safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
848
  except JSONValidationError as e:
849
+ log_analysis_error(session_id, "json_validation_error", str(e))
850
  safe_log("json_validation_failed", {"error": str(e)})
851
  return (
852
  f"**JSON Validation Failed**\n\n{e}\n\n"
 
856
  yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
857
  writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
858
  final_report = _generate_final_report(writer_input, validated_json_str)
859
+
860
+ # Log successful completion
861
+ _end_time = _time.time()
862
+ _duration_ms = (_end_time - _start_time) * 1000
863
+ log_analysis_complete(
864
+ session_id,
865
+ validated_output_keys=list(validated_data.keys()),
866
+ report_length=len(final_report),
867
+ total_duration_ms=_duration_ms
868
+ )
869
+
870
  return _sanitize_text(final_report)
871
  else:
872
  chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in