Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -80,6 +80,141 @@ def safe_log(event_name: str, meta: dict | None = None):
|
|
| 80 |
pass
|
| 81 |
|
| 82 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 83 |
# ---------------------- JSON Validation ----------------------
|
| 84 |
|
| 85 |
class JSONValidationError(Exception):
|
|
@@ -663,21 +798,32 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 663 |
# Schema Validation - examines column names, data types, and value ranges
|
| 664 |
yield_update("```\n🔎 Validating input schema...\n```")
|
| 665 |
try:
|
| 666 |
-
validate_all_dataframes(dataframes, filenames)
|
| 667 |
except SchemaValidationError as e:
|
| 668 |
safe_log("schema_validation_failed", {"error": str(e)})
|
| 669 |
return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
|
| 670 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 671 |
schema_context = "\n".join(schema_parts)
|
| 672 |
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 673 |
|
| 674 |
yield_update("```\n🧠 Generating aligned analysis script...\n```")
|
| 675 |
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 676 |
|
|
|
|
|
|
|
|
|
|
| 677 |
yield_update("```\n⚙️ Executing script in sandbox...\n```")
|
| 678 |
try:
|
| 679 |
raw_data_output = execute_in_sandbox(analysis_script, dataframes)
|
|
|
|
| 680 |
except SandboxViolationError as e:
|
|
|
|
|
|
|
| 681 |
safe_log("sandbox_violation", {"error": str(e)})
|
| 682 |
return (
|
| 683 |
f"**Security Violation Detected**\n\n{e}\n\n"
|
|
@@ -686,6 +832,8 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 686 |
f"Generated Script:\n```python\n{analysis_script}\n```"
|
| 687 |
)
|
| 688 |
except Exception as e:
|
|
|
|
|
|
|
| 689 |
return (
|
| 690 |
f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
|
| 691 |
f"```python\n{analysis_script}\n```"
|
|
@@ -698,6 +846,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 698 |
validated_json_str = format_validated_json_for_report(validated_data)
|
| 699 |
safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
|
| 700 |
except JSONValidationError as e:
|
|
|
|
| 701 |
safe_log("json_validation_failed", {"error": str(e)})
|
| 702 |
return (
|
| 703 |
f"**JSON Validation Failed**\n\n{e}\n\n"
|
|
@@ -707,6 +856,17 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 707 |
yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
|
| 708 |
writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 709 |
final_report = _generate_final_report(writer_input, validated_json_str)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 710 |
return _sanitize_text(final_report)
|
| 711 |
else:
|
| 712 |
chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
|
|
|
| 80 |
pass
|
| 81 |
|
| 82 |
|
| 83 |
+
# ---------------------- Audit Trail ----------------------
|
| 84 |
+
|
| 85 |
+
import hashlib
|
| 86 |
+
from datetime import datetime as dt
|
| 87 |
+
|
| 88 |
+
|
| 89 |
+
def _hash_content(content: str) -> str:
|
| 90 |
+
"""Generate a short hash for content identification without storing full content."""
|
| 91 |
+
return hashlib.sha256(content.encode('utf-8')).hexdigest()[:16]
|
| 92 |
+
|
| 93 |
+
|
| 94 |
+
def _safe_truncate(text: str, max_length: int = 500) -> str:
|
| 95 |
+
"""Safely truncate text for logging without exposing sensitive data."""
|
| 96 |
+
if not text or len(text) <= max_length:
|
| 97 |
+
return text
|
| 98 |
+
return text[:max_length] + f"... [truncated, {len(text)} total chars]"
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
def log_analysis_start(user_prompt: str, filenames: List[str], schema_summary: List[Dict[str, Any]]) -> str:
    """
    Log the start of an analysis session.

    Captures data lineage: what files were uploaded and their schemas.
    Returns a session_id for correlating subsequent log entries.
    """
    # Timestamp + short prompt-hash suffix keeps ids unique yet content-linked,
    # without storing the prompt itself.
    session_id = dt.now().strftime("%Y%m%d_%H%M%S_") + _hash_content(user_prompt)[:8]

    # Record structural metadata only — never cell values.
    structural_fields = ("filename", "rows", "columns", "column_names", "dtypes")
    schema_log = [
        {field: schema.get(field) for field in structural_fields}
        for schema in schema_summary
    ]

    safe_log("analysis_session_start", {
        "session_id": session_id,
        "prompt_hash": _hash_content(user_prompt),
        "prompt_length": len(user_prompt),
        "file_count": len(filenames),
        "filenames": filenames,
        "schemas": schema_log,
        "timestamp": dt.now().isoformat(),
    })

    return session_id
|
| 132 |
+
|
| 133 |
+
|
| 134 |
+
def log_code_generation(session_id: str, generated_code: str) -> None:
    """
    Log the generated analysis code.

    Captures code execution logs for traceability.
    Every finding can be traced back to specific lines of generated Python code.
    """
    # Operation label -> substrings whose presence signals it. Detection is a
    # purely textual heuristic; order here fixes the order of the logged list.
    op_markers = [
        ("groupby", ("groupby",)),
        ("merge/join", ("merge", "join")),
        ("pivot", ("pivot",)),
        ("aggregate", ("agg", "aggregate")),
        ("sort", ("sort",)),
        ("filter", ("filter", ".loc[", ".query(")),
        ("statistics", ("mean(", "sum(", "count(")),
    ]
    code_operations = [
        label
        for label, markers in op_markers
        if any(marker in generated_code for marker in markers)
    ]

    safe_log("code_generation", {
        "session_id": session_id,
        # Hash + size metadata identify the exact script without storing it.
        "code_hash": _hash_content(generated_code),
        "code_length": len(generated_code),
        "code_lines": generated_code.count("\n") + 1,
        "operations_detected": code_operations,
        "timestamp": dt.now().isoformat(),
    })
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def log_code_execution(session_id: str, success: bool, output_size: int, error: str | None = None) -> None:
    """
    Log the result of code execution.

    Captures execution status and output metadata.

    Args:
        session_id: Correlation id returned by log_analysis_start().
        success: Whether the sandboxed script completed without raising.
        output_size: Size of the captured output in bytes (0 on failure).
        error: Optional error text; truncated before logging so large or
            sensitive tracebacks are never stored in full.
    """
    safe_log("code_execution", {
        "session_id": session_id,
        "success": success,
        "output_size_bytes": output_size,
        # Truncate defensively: error text can embed user data or long traces.
        "error": _safe_truncate(error) if error else None,
        "timestamp": dt.now().isoformat(),
    })
|
| 181 |
+
|
| 182 |
+
|
| 183 |
+
def log_analysis_complete(
    session_id: str,
    validated_output_keys: List[str],
    report_length: int,
    total_duration_ms: float | None = None
) -> None:
    """
    Log successful completion of analysis.

    Captures analytical provenance: what was produced and output structure.

    Args:
        session_id: Correlation id returned by log_analysis_start().
        validated_output_keys: Top-level keys of the validated JSON output.
        report_length: Length in characters of the final synthesized report.
        total_duration_ms: Optional wall-clock duration of the whole session,
            in milliseconds; None when the caller did not time the run.
    """
    safe_log("analysis_session_complete", {
        "session_id": session_id,
        "output_keys": validated_output_keys,
        "output_key_count": len(validated_output_keys),
        "report_length": report_length,
        "duration_ms": total_duration_ms,
        "timestamp": dt.now().isoformat(),
    })
|
| 202 |
+
|
| 203 |
+
|
| 204 |
+
def log_analysis_error(session_id: str, error_type: str, error_message: str) -> None:
    """
    Log analysis failure.

    Captures error information for debugging without exposing sensitive data.
    """
    payload = {
        "session_id": session_id,
        "error_type": error_type,
        # Bounded message length keeps sensitive details out of the audit log.
        "error_message": _safe_truncate(error_message),
        "timestamp": dt.now().isoformat(),
    }
    safe_log("analysis_session_error", payload)
|
| 216 |
+
|
| 217 |
+
|
| 218 |
# ---------------------- JSON Validation ----------------------
|
| 219 |
|
| 220 |
class JSONValidationError(Exception):
|
|
|
|
| 798 |
# Schema Validation - examines column names, data types, and value ranges
|
| 799 |
yield_update("```\n🔎 Validating input schema...\n```")
|
| 800 |
try:
|
| 801 |
+
schema_infos = validate_all_dataframes(dataframes, filenames)
|
| 802 |
except SchemaValidationError as e:
|
| 803 |
safe_log("schema_validation_failed", {"error": str(e)})
|
| 804 |
return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
|
| 805 |
|
| 806 |
+
# Start audit trail session
|
| 807 |
+
import time as _time
|
| 808 |
+
_start_time = _time.time()
|
| 809 |
+
session_id = log_analysis_start(safe_in, filenames, schema_infos)
|
| 810 |
+
|
| 811 |
schema_context = "\n".join(schema_parts)
|
| 812 |
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 813 |
|
| 814 |
yield_update("```\n🧠 Generating aligned analysis script...\n```")
|
| 815 |
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 816 |
|
| 817 |
+
# Log generated code
|
| 818 |
+
log_code_generation(session_id, analysis_script)
|
| 819 |
+
|
| 820 |
yield_update("```\n⚙️ Executing script in sandbox...\n```")
|
| 821 |
try:
|
| 822 |
raw_data_output = execute_in_sandbox(analysis_script, dataframes)
|
| 823 |
+
log_code_execution(session_id, success=True, output_size=len(raw_data_output))
|
| 824 |
except SandboxViolationError as e:
|
| 825 |
+
log_code_execution(session_id, success=False, output_size=0, error=str(e))
|
| 826 |
+
log_analysis_error(session_id, "sandbox_violation", str(e))
|
| 827 |
safe_log("sandbox_violation", {"error": str(e)})
|
| 828 |
return (
|
| 829 |
f"**Security Violation Detected**\n\n{e}\n\n"
|
|
|
|
| 832 |
f"Generated Script:\n```python\n{analysis_script}\n```"
|
| 833 |
)
|
| 834 |
except Exception as e:
|
| 835 |
+
log_code_execution(session_id, success=False, output_size=0, error=str(e))
|
| 836 |
+
log_analysis_error(session_id, "execution_error", str(e))
|
| 837 |
return (
|
| 838 |
f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
|
| 839 |
f"```python\n{analysis_script}\n```"
|
|
|
|
| 846 |
validated_json_str = format_validated_json_for_report(validated_data)
|
| 847 |
safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
|
| 848 |
except JSONValidationError as e:
|
| 849 |
+
log_analysis_error(session_id, "json_validation_error", str(e))
|
| 850 |
safe_log("json_validation_failed", {"error": str(e)})
|
| 851 |
return (
|
| 852 |
f"**JSON Validation Failed**\n\n{e}\n\n"
|
|
|
|
| 856 |
yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
|
| 857 |
writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 858 |
final_report = _generate_final_report(writer_input, validated_json_str)
|
| 859 |
+
|
| 860 |
+
# Log successful completion
|
| 861 |
+
_end_time = _time.time()
|
| 862 |
+
_duration_ms = (_end_time - _start_time) * 1000
|
| 863 |
+
log_analysis_complete(
|
| 864 |
+
session_id,
|
| 865 |
+
validated_output_keys=list(validated_data.keys()),
|
| 866 |
+
report_length=len(final_report),
|
| 867 |
+
total_duration_ms=_duration_ms
|
| 868 |
+
)
|
| 869 |
+
|
| 870 |
return _sanitize_text(final_report)
|
| 871 |
else:
|
| 872 |
chat_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|