Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -80,6 +80,141 @@ def safe_log(event_name: str, meta: dict | None = None):
|
|
| 80 |
pass
|
| 81 |
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
# ---------------------- JSON Validation ----------------------
|
| 84 |
|
| 85 |
class JSONValidationError(Exception):
|
|
@@ -663,21 +798,32 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 663 |
# Schema Validation - examines column names, data types, and value ranges
|
| 664 |
yield_update("```\n🔎 Validating input schema...\n```")
|
| 665 |
try:
|
| 666 |
-
validate_all_dataframes(dataframes, filenames)
|
| 667 |
except SchemaValidationError as e:
|
| 668 |
safe_log("schema_validation_failed", {"error": str(e)})
|
| 669 |
return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
|
| 670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
schema_context = "\n".join(schema_parts)
|
| 672 |
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 673 |
|
| 674 |
yield_update("```\n🧠 Generating aligned analysis script...\n```")
|
| 675 |
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 676 |
|
|
|
|
|
|
|
|
|
|
| 677 |
yield_update("```\n⚙️ Executing script in sandbox...\n```")
|
| 678 |
try:
|
| 679 |
raw_data_output = execute_in_sandbox(analysis_script, dataframes)
|
|
|
|
| 680 |
except SandboxViolationError as e:
|
|
|
|
|
|
|
| 681 |
safe_log("sandbox_violation", {"error": str(e)})
|
| 682 |
return (
|
| 683 |
f"**Security Violation Detected**\n\n{e}\n\n"
|
|
@@ -686,6 +832,8 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 686 |
f"Generated Script:\n```python\n{analysis_script}\n```"
|
| 687 |
)
|
| 688 |
except Exception as e:
|
|
|
|
|
|
|
| 689 |
return (
|
| 690 |
f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
|
| 691 |
f"```python\n{analysis_script}\n```"
|
|
@@ -698,6 +846,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 698 |
validated_json_str = format_validated_json_for_report(validated_data)
|
| 699 |
safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
|
| 700 |
except JSONValidationError as e:
|
|
|
|
| 701 |
safe_log("json_validation_failed", {"error": str(e)})
|
| 702 |
return (
|
| 703 |
f"**JSON Validation Failed**\n\n{e}\n\n"
|
|
@@ -707,6 +856,17 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 707 |
yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
|
| 708 |
writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 709 |
final_report = _generate_final_report(writer_input, validated_json_str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
return _sanitize_text(final_report)
|
| 711 |
else:
|
| 712 |
chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
|
|
|
| 80 |
pass
|
| 81 |
|
| 82 |
|
| 83 |
+
# ---------------------- Audit Trail ----------------------
|
| 84 |
+
|
| 85 |
+
import hashlib
|
| 86 |
+
from datetime import datetime as dt
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _hash_content(content: str) -> str:
|
| 90 |
+
"""Generate a short hash for content identification without storing full content."""
|
| 91 |
+
return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _safe_truncate(text: str, max_length: int = 500) -> str:
|
| 95 |
+
"""Safely truncate text for logging without exposing sensitive data."""
|
| 96 |
+
if not text or len(text) <= max_length:
|
| 97 |
+
return text
|
| 98 |
+
return text[:max_length] + f"... [truncated, {len(text)} total chars]"
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def log_analysis_start(user_prompt: str, filenames: List[str], schema_summary: List[Dict[str, Any]]) -> str:
    """
    Log the start of an analysis session.

    Captures data lineage: what files were uploaded and their schemas.
    Returns a session_id for correlating subsequent log entries.
    """
    # Timestamp + short prompt-hash suffix keeps ids unique yet content-linked,
    # without storing the prompt itself.
    session_id = dt.now().strftime("%Y%m%d_%H%M%S_") + _hash_content(user_prompt)[:8]

    # Record structural metadata only — never cell values.
    structural_fields = ("filename", "rows", "columns", "column_names", "dtypes")
    schema_log = [
        {field: schema.get(field) for field in structural_fields}
        for schema in schema_summary
    ]

    safe_log("analysis_session_start", {
        "session_id": session_id,
        "prompt_hash": _hash_content(user_prompt),
        "prompt_length": len(user_prompt),
        "file_count": len(filenames),
        "filenames": filenames,
        "schemas": schema_log,
        "timestamp": dt.now().isoformat(),
    })

    return session_id
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def log_code_generation(session_id: str, generated_code: str) -> None:
    """
    Log the generated analysis code.

    Captures code execution logs for traceability.
    Every finding can be traced back to specific lines of generated Python code.
    """
    # Operation label -> substrings whose presence signals it. Detection is a
    # purely textual heuristic; order here fixes the order of the logged list.
    op_markers = [
        ("groupby", ("groupby",)),
        ("merge/join", ("merge", "join")),
        ("pivot", ("pivot",)),
        ("aggregate", ("agg", "aggregate")),
        ("sort", ("sort",)),
        ("filter", ("filter", ".loc[", ".query(")),
        ("statistics", ("mean(", "sum(", "count(")),
    ]
    code_operations = [
        label
        for label, markers in op_markers
        if any(marker in generated_code for marker in markers)
    ]

    safe_log("code_generation", {
        "session_id": session_id,
        # Hash + size metadata identify the exact script without storing it.
        "code_hash": _hash_content(generated_code),
        "code_length": len(generated_code),
        "code_lines": generated_code.count("\n") + 1,
        "operations_detected": code_operations,
        "timestamp": dt.now().isoformat(),
    })
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def log_code_execution(session_id: str, success: bool, output_size: int, error: str | None = None) -> None:
    """
    Log the result of code execution.

    Captures execution status and output metadata.

    Args:
        session_id: Correlation id returned by log_analysis_start().
        success: Whether the sandboxed script completed without raising.
        output_size: Size of the captured output in bytes (0 on failure).
        error: Optional error text; truncated before logging so large or
            sensitive tracebacks are never stored in full.
    """
    safe_log("code_execution", {
        "session_id": session_id,
        "success": success,
        "output_size_bytes": output_size,
        # Truncate defensively: error text can embed user data or long traces.
        "error": _safe_truncate(error) if error else None,
        "timestamp": dt.now().isoformat(),
    })
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def log_analysis_complete(
    session_id: str,
    validated_output_keys: List[str],
    report_length: int,
    total_duration_ms: float | None = None
) -> None:
    """
    Log successful completion of analysis.

    Captures analytical provenance: what was produced and output structure.

    Args:
        session_id: Correlation id returned by log_analysis_start().
        validated_output_keys: Top-level keys of the validated JSON output.
        report_length: Length in characters of the final synthesized report.
        total_duration_ms: Optional wall-clock duration of the whole session,
            in milliseconds; None when the caller did not time the run.
    """
    safe_log("analysis_session_complete", {
        "session_id": session_id,
        "output_keys": validated_output_keys,
        "output_key_count": len(validated_output_keys),
        "report_length": report_length,
        "duration_ms": total_duration_ms,
        "timestamp": dt.now().isoformat(),
    })
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def log_analysis_error(session_id: str, error_type: str, error_message: str) -> None:
    """
    Log analysis failure.

    Captures error information for debugging without exposing sensitive data.
    """
    payload = {
        "session_id": session_id,
        "error_type": error_type,
        # Bounded message length keeps sensitive details out of the audit log.
        "error_message": _safe_truncate(error_message),
        "timestamp": dt.now().isoformat(),
    }
    safe_log("analysis_session_error", payload)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
# ---------------------- JSON Validation ----------------------
|
| 219 |
|
| 220 |
class JSONValidationError(Exception):
|
|
|
|
| 798 |
# Schema Validation - examines column names, data types, and value ranges
|
| 799 |
yield_update("```\n🔎 Validating input schema...\n```")
|
| 800 |
try:
|
| 801 |
+
schema_infos = validate_all_dataframes(dataframes, filenames)
|
| 802 |
except SchemaValidationError as e:
|
| 803 |
safe_log("schema_validation_failed", {"error": str(e)})
|
| 804 |
return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
|
| 805 |
|
| 806 |
+
# Start audit trail session
|
| 807 |
+
import time as _time
|
| 808 |
+
_start_time = _time.time()
|
| 809 |
+
session_id = log_analysis_start(safe_in, filenames, schema_infos)
|
| 810 |
+
|
| 811 |
schema_context = "\n".join(schema_parts)
|
| 812 |
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 813 |
|
| 814 |
yield_update("```\n🧠 Generating aligned analysis script...\n```")
|
| 815 |
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 816 |
|
| 817 |
+
# Log generated code
|
| 818 |
+
log_code_generation(session_id, analysis_script)
|
| 819 |
+
|
| 820 |
yield_update("```\n⚙️ Executing script in sandbox...\n```")
|
| 821 |
try:
|
| 822 |
raw_data_output = execute_in_sandbox(analysis_script, dataframes)
|
| 823 |
+
log_code_execution(session_id, success=True, output_size=len(raw_data_output))
|
| 824 |
except SandboxViolationError as e:
|
| 825 |
+
log_code_execution(session_id, success=False, output_size=0, error=str(e))
|
| 826 |
+
log_analysis_error(session_id, "sandbox_violation", str(e))
|
| 827 |
safe_log("sandbox_violation", {"error": str(e)})
|
| 828 |
return (
|
| 829 |
f"**Security Violation Detected**\n\n{e}\n\n"
|
|
|
|
| 832 |
f"Generated Script:\n```python\n{analysis_script}\n```"
|
| 833 |
)
|
| 834 |
except Exception as e:
|
| 835 |
+
log_code_execution(session_id, success=False, output_size=0, error=str(e))
|
| 836 |
+
log_analysis_error(session_id, "execution_error", str(e))
|
| 837 |
return (
|
| 838 |
f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
|
| 839 |
f"```python\n{analysis_script}\n```"
|
|
|
|
| 846 |
validated_json_str = format_validated_json_for_report(validated_data)
|
| 847 |
safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
|
| 848 |
except JSONValidationError as e:
|
| 849 |
+
log_analysis_error(session_id, "json_validation_error", str(e))
|
| 850 |
safe_log("json_validation_failed", {"error": str(e)})
|
| 851 |
return (
|
| 852 |
f"**JSON Validation Failed**\n\n{e}\n\n"
|
|
|
|
| 856 |
yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
|
| 857 |
writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 858 |
final_report = _generate_final_report(writer_input, validated_json_str)
|
| 859 |
+
|
| 860 |
+
# Log successful completion
|
| 861 |
+
_end_time = _time.time()
|
| 862 |
+
_duration_ms = (_end_time - _start_time) * 1000
|
| 863 |
+
log_analysis_complete(
|
| 864 |
+
session_id,
|
| 865 |
+
validated_output_keys=list(validated_data.keys()),
|
| 866 |
+
report_length=len(final_report),
|
| 867 |
+
total_duration_ms=_duration_ms
|
| 868 |
+
)
|
| 869 |
+
|
| 870 |
return _sanitize_text(final_report)
|
| 871 |
else:
|
| 872 |
chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|