Update app.py

app.py CHANGED
@@ -71,6 +71,264 @@ def redact_phi(text: str) -> str:
     return t
 
 
+# ---------------------- Date Shifting Protocol ----------------------
+
+import random
+from datetime import timedelta
+
+# Session-consistent date shift (random offset of 30-365 days, constant within a session)
+_DATE_SHIFT_OFFSET: int | None = None
+
+
+def _get_date_shift_offset() -> int:
+    """
+    Returns a consistent date shift offset for the current session.
+    This ensures all dates in a single analysis are shifted by the same amount,
+    preserving relative time relationships while protecting absolute dates.
+    """
+    global _DATE_SHIFT_OFFSET
+    if _DATE_SHIFT_OFFSET is None:
+        _DATE_SHIFT_OFFSET = random.randint(30, 365)  # Shift by roughly 1-12 months
+    return _DATE_SHIFT_OFFSET
+
+
+def reset_date_shift_offset() -> None:
+    """Reset the date shift offset for a new session."""
+    global _DATE_SHIFT_OFFSET
+    _DATE_SHIFT_OFFSET = None
+
+
+def shift_date(date_obj, offset_days: int | None = None):
+    """
+    Shift a date backward by the session offset to protect PHI.
+    Preserves relative relationships; day-of-week is preserved only if the offset is a multiple of 7.
+
+    Args:
+        date_obj: datetime.date or datetime.datetime object
+        offset_days: Optional specific offset; uses the session offset if None
+
+    Returns:
+        Shifted date object of the same type
+    """
+    if offset_days is None:
+        offset_days = _get_date_shift_offset()
+    return date_obj - timedelta(days=offset_days)
+
+
+def shift_dates_in_dataframe(df: pd.DataFrame, date_columns: List[str] | None = None) -> pd.DataFrame:
+    """
+    Apply date shifting to specified columns in a DataFrame.
+
+    Implements the ClarityOps date-shifting protocol:
+    - Shifts all dates by a consistent offset within a session
+    - Preserves relative time relationships between records
+    - Protects absolute date PHI while maintaining analytical validity
+
+    Args:
+        df: DataFrame to process
+        date_columns: List of column names containing dates. If None, auto-detects.
+
+    Returns:
+        DataFrame with shifted dates
+    """
+    df_copy = df.copy()
+    offset = _get_date_shift_offset()
+
+    # Auto-detect date columns if not specified
+    if date_columns is None:
+        date_columns = []
+        for col in df_copy.columns:
+            if df_copy[col].dtype == 'datetime64[ns]':
+                date_columns.append(col)
+            elif df_copy[col].dtype == 'object':
+                # Check whether the column contains date-like strings
+                sample = df_copy[col].dropna().head(10)
+                if len(sample) > 0:
+                    try:
+                        pd.to_datetime(sample, errors='raise')
+                        date_columns.append(col)
+                    except (ValueError, TypeError):
+                        pass
+
+    # Apply date shifting
+    for col in date_columns:
+        if col in df_copy.columns:
+            try:
+                # Convert to datetime if needed
+                if df_copy[col].dtype != 'datetime64[ns]':
+                    df_copy[col] = pd.to_datetime(df_copy[col], errors='coerce')
+                # Shift dates
+                df_copy[col] = df_copy[col] - pd.Timedelta(days=offset)
+            except Exception:
+                pass  # Skip columns that can't be converted
+
+    return df_copy
+
+
+# ---------------------- Minimum Cell Size Enforcement ----------------------
+
+# Minimum cell size threshold for aggregated outputs (prevents re-identification)
+MIN_CELL_SIZE = 5  # Common privacy threshold: suppress cells with fewer than 5 records
+
+
+class CellSizeSuppressor:
+    """
+    Enforces minimum cell sizes in aggregated outputs to prevent re-identification.
+
+    Implements the ClarityOps privacy protection:
+    - Suppresses aggregate values derived from fewer than MIN_CELL_SIZE records
+    - Replaces suppressed values with "[SUPPRESSED: n<5]"
+    - Logs suppression events for the audit trail
+    """
+
+    def __init__(self, min_size: int = MIN_CELL_SIZE):
+        self.min_size = min_size
+        self.suppression_count = 0
+        self.suppressed_cells = []
+
+    def check_and_suppress(self, value: Any, count: int, context: str = "") -> Any:
+        """
+        Check whether a value should be suppressed based on its underlying count.
+
+        Args:
+            value: The aggregate value (mean, sum, etc.)
+            count: Number of records underlying this value
+            context: Description of what this value represents (for logging)
+
+        Returns:
+            The original value if count >= min_size, otherwise a suppression marker
+        """
+        if count < self.min_size:
+            self.suppression_count += 1
+            self.suppressed_cells.append({
+                "context": context,
+                "count": count,
+                "threshold": self.min_size
+            })
+            return f"[SUPPRESSED: n<{self.min_size}]"
+        return value
+
+    def suppress_small_groups_in_dataframe(
+        self,
+        df: pd.DataFrame,
+        count_column: str,
+        value_columns: List[str],
+        group_description: str = "group"
+    ) -> pd.DataFrame:
+        """
+        Suppress values in a DataFrame where the count column is below the threshold.
+
+        Args:
+            df: DataFrame with aggregated data
+            count_column: Name of the column containing record counts
+            value_columns: Columns whose values should be suppressed if the count is low
+            group_description: Description for logging
+
+        Returns:
+            DataFrame with small-cell values suppressed
+        """
+        df_copy = df.copy()
+
+        for idx, row in df_copy.iterrows():
+            count = row.get(count_column, self.min_size)
+            if pd.notna(count) and count < self.min_size:
+                for col in value_columns:
+                    if col in df_copy.columns:
+                        # The original value is intentionally not recorded (it may itself be PHI)
+                        df_copy.at[idx, col] = f"[SUPPRESSED: n<{self.min_size}]"
+                        self.suppression_count += 1
+                        self.suppressed_cells.append({
+                            "context": f"{group_description} at index {idx}, column {col}",
+                            "original_count": count,
+                            "threshold": self.min_size
+                        })
+
+        return df_copy
+
+    def get_suppression_report(self) -> Dict[str, Any]:
+        """Generate a report of all suppressions applied."""
+        return {
+            "total_suppressions": self.suppression_count,
+            "min_cell_size_threshold": self.min_size,
+            "suppressed_cells": self.suppressed_cells
+        }
+
+
+def enforce_minimum_cell_size(
+    output_data: Dict[str, Any],
+    count_key_patterns: List[str] | None = None
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Scan output data for small cell sizes and suppress as needed.
+
+    This is applied to the validated JSON output before report generation
+    to ensure no re-identifiable small-group statistics are exposed.
+
+    Args:
+        output_data: The validated JSON output dictionary
+        count_key_patterns: Keys that indicate count fields (default: common patterns)
+
+    Returns:
+        Tuple of (processed_data, suppression_report)
+    """
+    if count_key_patterns is None:
+        count_key_patterns = ['count', 'n', 'num', 'total', 'records', 'sample_size', 'volume']
+
+    suppressor = CellSizeSuppressor(MIN_CELL_SIZE)
+    processed = _recursive_cell_size_check(output_data, count_key_patterns, suppressor)
+
+    report = suppressor.get_suppression_report()
+    if report["total_suppressions"] > 0:
+        safe_log("cell_size_suppression", report)
+
+    return processed, report
+
+
+def _recursive_cell_size_check(
+    data: Any,
+    count_patterns: List[str],
+    suppressor: CellSizeSuppressor,
+    path: str = ""
+) -> Any:
+    """Recursively check and suppress small cells in nested data structures."""
+
+    if isinstance(data, dict):
+        # Look for count fields in this dict ('n' must match a key exactly; longer patterns match substrings)
+        count_value = None
+        for key in data.keys():
+            if any(key.lower() == p or (len(p) > 1 and p in key.lower()) for p in count_patterns):
+                try:
+                    count_value = int(data[key])
+                    break
+                except (ValueError, TypeError):
+                    pass
+
+        # If we found a small count, suppress numeric values in this dict
+        result = {}
+        for key, value in data.items():
+            new_path = f"{path}.{key}" if path else key
+
+            if count_value is not None and count_value < MIN_CELL_SIZE:
+                # Suppress numeric values (but not the count itself or identifiers)
+                if isinstance(value, (int, float)) and not any(key.lower() == p or (len(p) > 1 and p in key.lower()) for p in count_patterns):
+                    result[key] = suppressor.check_and_suppress(value, count_value, new_path)
+                else:
+                    result[key] = _recursive_cell_size_check(value, count_patterns, suppressor, new_path)
+            else:
+                result[key] = _recursive_cell_size_check(value, count_patterns, suppressor, new_path)
+
+        return result
+
+    elif isinstance(data, list):
+        return [
+            _recursive_cell_size_check(item, count_patterns, suppressor, f"{path}[{i}]")
+            for i, item in enumerate(data)
+        ]
+
+    else:
+        return data
+
+
 def safe_log(event_name: str, meta: dict | None = None):
     try:
         meta = (meta or {}).copy()
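Review note on the hunk above: a minimal usage sketch of the date-shifting helpers, not part of this commit. The `from app import ...` path and the column names are assumptions for illustration; app.py is expected to already import pandas and the typing names the new code uses.

```python
# Illustrative sketch only (assumed import path and column names).
import pandas as pd
from app import reset_date_shift_offset, shift_dates_in_dataframe

reset_date_shift_offset()  # new analysis session -> a fresh random offset

df = pd.DataFrame({
    "admit_date": pd.to_datetime(["2024-01-10", "2024-01-15"]),
    "discharge_date": pd.to_datetime(["2024-01-12", "2024-01-20"]),
})
shifted = shift_dates_in_dataframe(df)

# Absolute dates move, but intervals survive: length-of-stay is unchanged.
los_before = (df["discharge_date"] - df["admit_date"]).dt.days
los_after = (shifted["discharge_date"] - shifted["admit_date"]).dt.days
assert los_before.equals(los_after)
```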
@@ -867,6 +1125,16 @@ def handle(user_msg: str, files: list, yield_update) -> str:
         safe_log("schema_validation_failed", {"error": str(e)})
         return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
 
+    # PHI Protection: Apply date shifting if PHI mode is enabled
+    if PHI_MODE:
+        yield_update("```\n🔒 Applying PHI protections (date shifting)...\n```")
+        reset_date_shift_offset()  # New session = new offset
+        dataframes = [shift_dates_in_dataframe(df) for df in dataframes]
+        safe_log("date_shifting_applied", {
+            "offset_days": _get_date_shift_offset(),
+            "dataframes_processed": len(dataframes)
+        })
+
     # Start audit trail session
     import time as _time
     _start_time = _time.time()
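The `reset_date_shift_offset()` call before shifting is what scopes the offset to one session: the offset is drawn lazily on first use and then held constant, so every uploaded dataframe in the session shifts by the same amount. A sketch of those semantics (illustrative only; the import path is assumed, and `_get_date_shift_offset` is a private helper):

```python
from app import reset_date_shift_offset, _get_date_shift_offset

reset_date_shift_offset()                  # start of a new session
first = _get_date_shift_offset()           # offset drawn once, lazily...
assert first == _get_date_shift_offset()   # ...then stable for the session

reset_date_shift_offset()                  # the next session may draw a different offset
```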
@@ -907,7 +1175,6 @@ def handle(user_msg: str, files: list, yield_update) -> str:
     yield_update("```\n🔍 Validating JSON output...\n```")
     try:
         validated_data = validate_json_output(raw_data_output)
-        validated_json_str = format_validated_json_for_report(validated_data)
         safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
     except JSONValidationError as e:
         log_analysis_error(session_id, "json_validation_error", str(e))
@@ -917,6 +1184,16 @@ def handle(user_msg: str, files: list, yield_update) -> str:
         f"Generated Script:\n```python\n{analysis_script}\n```"
     )
 
+    # PHI Protection: Enforce minimum cell sizes to prevent re-identification
+    suppression_report = None
+    if PHI_MODE:
+        yield_update("```\n🔒 Enforcing minimum cell sizes...\n```")
+        validated_data, suppression_report = enforce_minimum_cell_size(validated_data)
+        if suppression_report and suppression_report.get("total_suppressions", 0) > 0:
+            safe_log("cell_size_enforcement", suppression_report)
+
+    validated_json_str = format_validated_json_for_report(validated_data)
+
     yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
     writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
     final_report = _generate_final_report(writer_input, validated_json_str)