Update app.py

app.py CHANGED
@@ -71,6 +71,264 @@ def redact_phi(text: str) -> str:
     return t
 
 
+# ---------------------- Date Shifting Protocol ----------------------
+
+import random
+from datetime import timedelta
+
+# Session-consistent date shift (random offset of 30-365 days, constant within a session)
+_DATE_SHIFT_OFFSET: int | None = None
+
+
+def _get_date_shift_offset() -> int:
+    """
+    Returns a consistent date shift offset for the current session.
+    This ensures all dates in a single analysis are shifted by the same amount,
+    preserving relative time relationships while protecting absolute dates.
+    """
+    global _DATE_SHIFT_OFFSET
+    if _DATE_SHIFT_OFFSET is None:
+        _DATE_SHIFT_OFFSET = random.randint(30, 365)  # Shift by roughly 1-12 months
+    return _DATE_SHIFT_OFFSET
+
+
+def reset_date_shift_offset() -> None:
+    """Reset the date shift offset for a new session."""
+    global _DATE_SHIFT_OFFSET
+    _DATE_SHIFT_OFFSET = None
+
+
+def shift_date(date_obj, offset_days: int | None = None):
+    """
+    Shift a date backward by the session offset to protect PHI.
+    Preserves relative relationships; day-of-week is preserved only if the offset is a multiple of 7.
+
+    Args:
+        date_obj: datetime.date or datetime.datetime object
+        offset_days: Optional specific offset; uses the session offset if None
+
+    Returns:
+        Shifted date object of the same type
+    """
+    if offset_days is None:
+        offset_days = _get_date_shift_offset()
+    return date_obj - timedelta(days=offset_days)
+
+
+def shift_dates_in_dataframe(df: pd.DataFrame, date_columns: List[str] | None = None) -> pd.DataFrame:
+    """
+    Apply date shifting to specified columns in a DataFrame.
+
+    Implements the ClarityOps date-shifting protocol:
+    - Shifts all dates by a consistent offset within a session
+    - Preserves relative time relationships between records
+    - Protects absolute date PHI while maintaining analytical validity
+
+    Args:
+        df: DataFrame to process
+        date_columns: List of column names containing dates. If None, auto-detects.
+
+    Returns:
+        DataFrame with shifted dates
+    """
+    df_copy = df.copy()
+    offset = _get_date_shift_offset()
+
+    # Auto-detect date columns if not specified
+    if date_columns is None:
+        date_columns = []
+        for col in df_copy.columns:
+            if df_copy[col].dtype == 'datetime64[ns]':
+                date_columns.append(col)
+            elif df_copy[col].dtype == 'object':
+                # Check whether the column contains date-like strings
+                sample = df_copy[col].dropna().head(10)
+                if len(sample) > 0:
+                    try:
+                        pd.to_datetime(sample, errors='raise')
+                        date_columns.append(col)
+                    except (ValueError, TypeError):
+                        pass
+
+    # Apply date shifting
+    for col in date_columns:
+        if col in df_copy.columns:
+            try:
+                # Convert to datetime if needed
+                if df_copy[col].dtype != 'datetime64[ns]':
+                    df_copy[col] = pd.to_datetime(df_copy[col], errors='coerce')
+                # Shift dates
+                df_copy[col] = df_copy[col] - pd.Timedelta(days=offset)
+            except Exception:
+                pass  # Skip columns that can't be converted
+
+    return df_copy
+
+
+# ---------------------- Minimum Cell Size Enforcement ----------------------
+
+# Minimum cell size threshold for aggregated outputs (prevents re-identification)
+MIN_CELL_SIZE = 5  # Common privacy threshold: suppress cells with fewer than 5 records
+
+
+class CellSizeSuppressor:
+    """
+    Enforces minimum cell sizes in aggregated outputs to prevent re-identification.
+
+    Implements the ClarityOps privacy protection:
+    - Suppresses aggregate values derived from fewer than MIN_CELL_SIZE records
+    - Replaces suppressed values with "[SUPPRESSED: n<5]"
+    - Logs suppression events for the audit trail
+    """
+
+    def __init__(self, min_size: int = MIN_CELL_SIZE):
+        self.min_size = min_size
+        self.suppression_count = 0
+        self.suppressed_cells = []
+
+    def check_and_suppress(self, value: Any, count: int, context: str = "") -> Any:
+        """
+        Check whether a value should be suppressed based on its underlying count.
+
+        Args:
+            value: The aggregate value (mean, sum, etc.)
+            count: Number of records underlying this value
+            context: Description of what this value represents (for logging)
+
+        Returns:
+            The original value if count >= min_size, otherwise a suppression marker
+        """
+        if count < self.min_size:
+            self.suppression_count += 1
+            self.suppressed_cells.append({
+                "context": context,
+                "count": count,
+                "threshold": self.min_size
+            })
+            return f"[SUPPRESSED: n<{self.min_size}]"
+        return value
+
+    def suppress_small_groups_in_dataframe(
+        self,
+        df: pd.DataFrame,
+        count_column: str,
+        value_columns: List[str],
+        group_description: str = "group"
+    ) -> pd.DataFrame:
+        """
+        Suppress values in a DataFrame where the count column is below the threshold.
+
+        Args:
+            df: DataFrame with aggregated data
+            count_column: Name of the column containing record counts
+            value_columns: Columns whose values should be suppressed if the count is low
+            group_description: Description for logging
+
+        Returns:
+            DataFrame with small-cell values suppressed
+        """
+        df_copy = df.copy()
+
+        for idx, row in df_copy.iterrows():
+            count = row.get(count_column, self.min_size)
+            if pd.notna(count) and count < self.min_size:
+                for col in value_columns:
+                    if col in df_copy.columns:
+                        # The original value is intentionally not recorded (it may itself be PHI)
+                        df_copy.at[idx, col] = f"[SUPPRESSED: n<{self.min_size}]"
+                        self.suppression_count += 1
+                        self.suppressed_cells.append({
+                            "context": f"{group_description} at index {idx}, column {col}",
+                            "original_count": count,
+                            "threshold": self.min_size
+                        })
+
+        return df_copy
+
+    def get_suppression_report(self) -> Dict[str, Any]:
+        """Generate a report of all suppressions applied."""
+        return {
+            "total_suppressions": self.suppression_count,
+            "min_cell_size_threshold": self.min_size,
+            "suppressed_cells": self.suppressed_cells
+        }
+
+
+def enforce_minimum_cell_size(
+    output_data: Dict[str, Any],
+    count_key_patterns: List[str] | None = None
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Scan output data for small cell sizes and suppress as needed.
+
+    This is applied to the validated JSON output before report generation
+    to ensure no re-identifiable small-group statistics are exposed.
+
+    Args:
+        output_data: The validated JSON output dictionary
+        count_key_patterns: Keys that indicate count fields (default: common patterns)
+
+    Returns:
+        Tuple of (processed_data, suppression_report)
+    """
+    if count_key_patterns is None:
+        count_key_patterns = ['count', 'n', 'num', 'total', 'records', 'sample_size', 'volume']
+
+    suppressor = CellSizeSuppressor(MIN_CELL_SIZE)
+    processed = _recursive_cell_size_check(output_data, count_key_patterns, suppressor)
+
+    report = suppressor.get_suppression_report()
+    if report["total_suppressions"] > 0:
+        safe_log("cell_size_suppression", report)
+
+    return processed, report
+
+
+def _recursive_cell_size_check(
+    data: Any,
+    count_patterns: List[str],
+    suppressor: CellSizeSuppressor,
+    path: str = ""
+) -> Any:
+    """Recursively check and suppress small cells in nested data structures."""
+
+    if isinstance(data, dict):
+        # Look for count fields in this dict ('n' must match a key exactly; longer patterns match substrings)
+        count_value = None
+        for key in data.keys():
+            if any(key.lower() == p or (len(p) > 1 and p in key.lower()) for p in count_patterns):
+                try:
+                    count_value = int(data[key])
+                    break
+                except (ValueError, TypeError):
+                    pass
+
+        # If we found a small count, suppress numeric values in this dict
+        result = {}
+        for key, value in data.items():
+            new_path = f"{path}.{key}" if path else key
+
+            if count_value is not None and count_value < MIN_CELL_SIZE:
+                # Suppress numeric values (but not the count itself or identifiers)
+                if isinstance(value, (int, float)) and not any(key.lower() == p or (len(p) > 1 and p in key.lower()) for p in count_patterns):
+                    result[key] = suppressor.check_and_suppress(value, count_value, new_path)
+                else:
+                    result[key] = _recursive_cell_size_check(value, count_patterns, suppressor, new_path)
+            else:
+                result[key] = _recursive_cell_size_check(value, count_patterns, suppressor, new_path)
+
+        return result
+
+    elif isinstance(data, list):
+        return [
+            _recursive_cell_size_check(item, count_patterns, suppressor, f"{path}[{i}]")
+            for i, item in enumerate(data)
+        ]
+
+    else:
+        return data
+
+
 def safe_log(event_name: str, meta: dict | None = None):
     try:
         meta = (meta or {}).copy()
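Review note on the hunk above: a minimal usage sketch of the date-shifting helpers, not part of this commit. The `from app import ...` path and the column names are assumptions for illustration; app.py is expected to already import pandas and the typing names the new code uses.

```python
# Illustrative sketch only (assumed import path and column names).
import pandas as pd
from app import reset_date_shift_offset, shift_dates_in_dataframe

reset_date_shift_offset()  # new analysis session -> a fresh random offset

df = pd.DataFrame({
    "admit_date": pd.to_datetime(["2024-01-10", "2024-01-15"]),
    "discharge_date": pd.to_datetime(["2024-01-12", "2024-01-20"]),
})
shifted = shift_dates_in_dataframe(df)

# Absolute dates move, but intervals survive: length-of-stay is unchanged.
los_before = (df["discharge_date"] - df["admit_date"]).dt.days
los_after = (shifted["discharge_date"] - shifted["admit_date"]).dt.days
assert los_before.equals(los_after)
```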
@@ -867,6 +1125,16 @@ def handle(user_msg: str, files: list, yield_update) -> str:
         safe_log("schema_validation_failed", {"error": str(e)})
         return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
 
+    # PHI Protection: Apply date shifting if PHI mode is enabled
+    if PHI_MODE:
+        yield_update("```\n🔒 Applying PHI protections (date shifting)...\n```")
+        reset_date_shift_offset()  # New session = new offset
+        dataframes = [shift_dates_in_dataframe(df) for df in dataframes]
+        safe_log("date_shifting_applied", {
+            "offset_days": _get_date_shift_offset(),
+            "dataframes_processed": len(dataframes)
+        })
+
     # Start audit trail session
     import time as _time
     _start_time = _time.time()
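The `reset_date_shift_offset()` call before shifting is what scopes the offset to one session: the offset is drawn lazily on first use and then held constant, so every uploaded dataframe in the session shifts by the same amount. A sketch of those semantics (illustrative only; the import path is assumed, and `_get_date_shift_offset` is a private helper):

```python
from app import reset_date_shift_offset, _get_date_shift_offset

reset_date_shift_offset()                  # start of a new session
first = _get_date_shift_offset()           # offset drawn once, lazily...
assert first == _get_date_shift_offset()   # ...then stable for the session

reset_date_shift_offset()                  # the next session may draw a different offset
```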
@@ -907,7 +1175,6 @@ def handle(user_msg: str, files: list, yield_update) -> str:
     yield_update("```\n🔍 Validating JSON output...\n```")
     try:
         validated_data = validate_json_output(raw_data_output)
-        validated_json_str = format_validated_json_for_report(validated_data)
         safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
     except JSONValidationError as e:
         log_analysis_error(session_id, "json_validation_error", str(e))
@@ -917,6 +1184,16 @@ def handle(user_msg: str, files: list, yield_update) -> str:
         f"Generated Script:\n```python\n{analysis_script}\n```"
     )
 
+    # PHI Protection: Enforce minimum cell sizes to prevent re-identification
+    suppression_report = None
+    if PHI_MODE:
+        yield_update("```\n🔒 Enforcing minimum cell sizes...\n```")
+        validated_data, suppression_report = enforce_minimum_cell_size(validated_data)
+        if suppression_report and suppression_report.get("total_suppressions", 0) > 0:
+            safe_log("cell_size_enforcement", suppression_report)
+
+    validated_json_str = format_validated_json_for_report(validated_data)
+
     yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
     writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
     final_report = _generate_final_report(writer_input, validated_json_str)