VEDAGI1 committed on
Commit 2f1bf78 · verified · 1 Parent(s): 3ce7830

Update app.py

Files changed (1)
  1. app.py +278 -1
app.py CHANGED
@@ -71,6 +71,264 @@ def redact_phi(text: str) -> str:
     return t
 
 
+# ---------------------- Date Shifting Protocol ----------------------
+
+import random
+from datetime import timedelta
+
+# Session-consistent date shift (random offset between 30 and 365 days, consistent within a session)
+_DATE_SHIFT_OFFSET: int | None = None
+
+
+def _get_date_shift_offset() -> int:
+    """
+    Return a consistent date-shift offset for the current session.
+    This ensures all dates in a single analysis are shifted by the same amount,
+    preserving relative time relationships while protecting absolute dates.
+    """
+    global _DATE_SHIFT_OFFSET
+    if _DATE_SHIFT_OFFSET is None:
+        _DATE_SHIFT_OFFSET = random.randint(30, 365)  # shift by roughly 1-12 months
+    return _DATE_SHIFT_OFFSET
+
+
+def reset_date_shift_offset() -> None:
+    """Reset the date shift offset for a new session."""
+    global _DATE_SHIFT_OFFSET
+    _DATE_SHIFT_OFFSET = None
+
+
+def shift_date(date_obj, offset_days: int | None = None):
+    """
+    Shift a date backwards by the session offset to protect PHI.
+    Preserves relative relationships between dates shifted in the same session.
+
+    Args:
+        date_obj: datetime.date or datetime.datetime object
+        offset_days: Optional specific offset; uses the session offset if None
+
+    Returns:
+        Shifted date object of the same type
+    """
+    if offset_days is None:
+        offset_days = _get_date_shift_offset()
+    return date_obj - timedelta(days=offset_days)
+
+
+def shift_dates_in_dataframe(df: pd.DataFrame, date_columns: List[str] | None = None) -> pd.DataFrame:
+    """
+    Apply date shifting to specified columns in a DataFrame.
+
+    Implements the ClarityOps date-shifting protocol:
+    - Shifts all dates by a consistent offset within a session
+    - Preserves relative time relationships between records
+    - Protects absolute date PHI while maintaining analytical validity
+
+    Args:
+        df: DataFrame to process
+        date_columns: List of column names containing dates. If None, auto-detects.
+
+    Returns:
+        DataFrame with shifted dates
+    """
+    df_copy = df.copy()
+    offset = _get_date_shift_offset()
+
+    # Auto-detect date columns if not specified
+    if date_columns is None:
+        date_columns = []
+        for col in df_copy.columns:
+            if df_copy[col].dtype == 'datetime64[ns]':
+                date_columns.append(col)
+            elif df_copy[col].dtype == 'object':
+                # Check whether the column contains date-like strings
+                sample = df_copy[col].dropna().head(10)
+                if len(sample) > 0:
+                    try:
+                        pd.to_datetime(sample, errors='raise')
+                        date_columns.append(col)
+                    except (ValueError, TypeError):
+                        pass
+
+    # Apply date shifting
+    for col in date_columns:
+        if col in df_copy.columns:
+            try:
+                # Convert to datetime if needed
+                if df_copy[col].dtype != 'datetime64[ns]':
+                    df_copy[col] = pd.to_datetime(df_copy[col], errors='coerce')
+                # Shift dates
+                df_copy[col] = df_copy[col] - pd.Timedelta(days=offset)
+            except Exception:
+                pass  # Skip columns that cannot be converted
+
+    return df_copy
+
+
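A minimal sketch of how the session-consistent shift behaves; the toy frame and its column names are hypothetical, and the helpers are assumed importable from app.py:

```python
import pandas as pd

# Toy stand-in for an uploaded dataset (hypothetical columns).
df = pd.DataFrame({
    "patient_id": [1, 2],
    "admit_date": pd.to_datetime(["2024-03-01", "2024-03-05"]),
})

reset_date_shift_offset()               # new session -> new random offset
shifted = shift_dates_in_dataframe(df)  # auto-detects the datetime column

# Both rows move by the same offset, so the 4-day gap is preserved.
assert (shifted["admit_date"].iloc[1] - shifted["admit_date"].iloc[0]).days == 4
```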
+# ---------------------- Minimum Cell Size Enforcement ----------------------
+
+# Minimum cell size threshold for aggregated outputs (prevents re-identification)
+MIN_CELL_SIZE = 5  # Industry standard: suppress cells with fewer than 5 records
+
+
+class CellSizeSuppressor:
+    """
+    Enforces minimum cell sizes in aggregated outputs to prevent re-identification.
+
+    Implements the ClarityOps privacy protection:
+    - Suppresses aggregate values derived from fewer than MIN_CELL_SIZE records
+    - Replaces suppressed values with "[SUPPRESSED: n<5]"
+    - Logs suppression events for the audit trail
+    """
+
+    def __init__(self, min_size: int = MIN_CELL_SIZE):
+        self.min_size = min_size
+        self.suppression_count = 0
+        self.suppressed_cells = []
+
+    def check_and_suppress(self, value: Any, count: int, context: str = "") -> Any:
+        """
+        Check whether a value should be suppressed based on its underlying count.
+
+        Args:
+            value: The aggregate value (mean, sum, etc.)
+            count: Number of records underlying this value
+            context: Description of what this value represents (for logging)
+
+        Returns:
+            Original value if count >= min_size, otherwise a suppression marker
+        """
+        if count < self.min_size:
+            self.suppression_count += 1
+            self.suppressed_cells.append({
+                "context": context,
+                "count": count,
+                "threshold": self.min_size
+            })
+            return f"[SUPPRESSED: n<{self.min_size}]"
+        return value
+
+    def suppress_small_groups_in_dataframe(
+        self,
+        df: pd.DataFrame,
+        count_column: str,
+        value_columns: List[str],
+        group_description: str = "group"
+    ) -> pd.DataFrame:
+        """
+        Suppress values in a DataFrame where the count column is below threshold.
+
+        Args:
+            df: DataFrame with aggregated data
+            count_column: Name of the column containing record counts
+            value_columns: Columns whose values should be suppressed if the count is low
+            group_description: Description for logging
+
+        Returns:
+            DataFrame with small-cell values suppressed
+        """
+        df_copy = df.copy()
+
+        for idx, row in df_copy.iterrows():
+            count = row.get(count_column, self.min_size)
+            if pd.notna(count) and count < self.min_size:
+                for col in value_columns:
+                    if col in df_copy.columns:
+                        df_copy.at[idx, col] = f"[SUPPRESSED: n<{self.min_size}]"
+                        self.suppression_count += 1
+                        self.suppressed_cells.append({
+                            "context": f"{group_description} at index {idx}, column {col}",
+                            "original_count": count,
+                            "threshold": self.min_size
+                        })
+
+        return df_copy
+
+    def get_suppression_report(self) -> Dict[str, Any]:
+        """Generate a report of all suppressions applied."""
+        return {
+            "total_suppressions": self.suppression_count,
+            "min_cell_size_threshold": self.min_size,
+            "suppressed_cells": self.suppressed_cells
+        }
+
+
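A quick sketch of the DataFrame-level path; the aggregate table below is hypothetical:

```python
import pandas as pd

# Hypothetical aggregate: mean length-of-stay per unit.
agg = pd.DataFrame({
    "unit": ["ICU", "ER", "Peds"],
    "n": [12, 3, 40],
    "mean_los": [4.2, 9.1, 1.7],
})

suppressor = CellSizeSuppressor()  # min_size defaults to MIN_CELL_SIZE (5)
safe = suppressor.suppress_small_groups_in_dataframe(
    agg, count_column="n", value_columns=["mean_los"], group_description="unit"
)
# The ER row (n=3) now reads "[SUPPRESSED: n<5]" in mean_los, and
# suppressor.get_suppression_report() records the event for the audit trail.
```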
+import re  # used for whole-token key matching below
+
+
+def _is_count_key(key: str, patterns: List[str]) -> bool:
+    """Whole-token match, so the 'n' pattern matches 'n' or 'n_records' but not 'mean_score'."""
+    key_l = key.lower()
+    return key_l in patterns or any(tok in patterns for tok in re.split(r"[^a-z0-9]+", key_l))
+
+
+def enforce_minimum_cell_size(
+    output_data: Dict[str, Any],
+    count_key_patterns: List[str] | None = None
+) -> tuple[Dict[str, Any], Dict[str, Any]]:
+    """
+    Scan output data for small cell sizes and suppress as needed.
+
+    This is applied to the validated JSON output before report generation
+    to ensure no re-identifiable small-group statistics are exposed.
+
+    Args:
+        output_data: The validated JSON output dictionary
+        count_key_patterns: Keys that indicate count fields (default: common patterns)
+
+    Returns:
+        Tuple of (processed_data, suppression_report)
+    """
+    if count_key_patterns is None:
+        count_key_patterns = ['count', 'n', 'num', 'total', 'records', 'sample_size', 'volume']
+
+    suppressor = CellSizeSuppressor(MIN_CELL_SIZE)
+    processed = _recursive_cell_size_check(output_data, count_key_patterns, suppressor)
+
+    report = suppressor.get_suppression_report()
+    if report["total_suppressions"] > 0:
+        safe_log("cell_size_suppression", report)
+
+    return processed, report
+
+
+def _recursive_cell_size_check(
+    data: Any,
+    count_patterns: List[str],
+    suppressor: CellSizeSuppressor,
+    path: str = ""
+) -> Any:
+    """Recursively check and suppress small cells in nested data structures."""
+
+    if isinstance(data, dict):
+        # Look for a count field in this dict
+        count_value = None
+        for key in data.keys():
+            if _is_count_key(key, count_patterns):
+                try:
+                    count_value = int(data[key])
+                    break
+                except (ValueError, TypeError):
+                    pass
+
+        # If we found a small count, suppress numeric values in this dict
+        result = {}
+        for key, value in data.items():
+            new_path = f"{path}.{key}" if path else key
+
+            if count_value is not None and count_value < MIN_CELL_SIZE:
+                # Suppress numeric values (but not the count itself or identifiers)
+                if isinstance(value, (int, float)) and not _is_count_key(key, count_patterns):
+                    result[key] = suppressor.check_and_suppress(value, count_value, new_path)
+                else:
+                    result[key] = _recursive_cell_size_check(value, count_patterns, suppressor, new_path)
+            else:
+                result[key] = _recursive_cell_size_check(value, count_patterns, suppressor, new_path)
+
+        return result
+
+    elif isinstance(data, list):
+        return [
+            _recursive_cell_size_check(item, count_patterns, suppressor, f"{path}[{i}]")
+            for i, item in enumerate(data)
+        ]
+
+    else:
+        return data
+
+
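To see the recursion on a nested payload (hypothetical keys; record_count matches the 'count' pattern):

```python
output = {
    "by_site": [
        {"site": "A", "record_count": 3, "mean_score": 7.5},
        {"site": "B", "record_count": 20, "mean_score": 6.1},
    ]
}

processed, report = enforce_minimum_cell_size(output)
# Site A's mean_score is replaced with "[SUPPRESSED: n<5]" because its
# record_count (3) is below MIN_CELL_SIZE; site B passes through unchanged.
assert report["total_suppressions"] == 1
```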
 def safe_log(event_name: str, meta: dict | None = None):
     try:
         meta = (meta or {}).copy()
 
@@ -867,6 +1125,16 @@ def handle(user_msg: str, files: list, yield_update) -> str:
         safe_log("schema_validation_failed", {"error": str(e)})
         return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
 
+    # PHI Protection: Apply date shifting if PHI mode is enabled
+    if PHI_MODE:
+        yield_update("```\n🔒 Applying PHI protections (date shifting)...\n```")
+        reset_date_shift_offset()  # New session = new offset
+        dataframes = [shift_dates_in_dataframe(df) for df in dataframes]
+        safe_log("date_shifting_applied", {
+            "offset_days": _get_date_shift_offset(),
+            "dataframes_processed": len(dataframes)
+        })
+
     # Start audit trail session
     import time as _time
     _start_time = _time.time()
 
@@ -907,7 +1175,6 @@ def handle(user_msg: str, files: list, yield_update) -> str:
     yield_update("```\n🔍 Validating JSON output...\n```")
     try:
         validated_data = validate_json_output(raw_data_output)
-        validated_json_str = format_validated_json_for_report(validated_data)
         safe_log("json_validation_passed", {"output_keys": list(validated_data.keys())})
     except JSONValidationError as e:
         log_analysis_error(session_id, "json_validation_error", str(e))
@@ -917,6 +1184,16 @@ def handle(user_msg: str, files: list, yield_update) -> str:
             f"Generated Script:\n```python\n{analysis_script}\n```"
         )
 
+    # PHI Protection: Enforce minimum cell sizes to prevent re-identification
+    suppression_report = None
+    if PHI_MODE:
+        yield_update("```\n🔒 Enforcing minimum cell sizes...\n```")
+        validated_data, suppression_report = enforce_minimum_cell_size(validated_data)
+        if suppression_report and suppression_report.get("total_suppressions", 0) > 0:
+            safe_log("cell_size_enforcement", suppression_report)
+
+    validated_json_str = format_validated_json_for_report(validated_data)
+
     yield_update("```\n✍️ Synthesizing final comprehensive report...\n```")
     writer_input = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
     final_report = _generate_final_report(writer_input, validated_json_str)
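Taken together, the two hooks give handle() a shift-then-suppress pipeline. A condensed sketch of that flow, with hypothetical toy inputs and PHI_MODE assumed on:

```python
import pandas as pd

reset_date_shift_offset()  # one random offset per analysis session

# 1) Date shifting on uploaded frames (toy frame, hypothetical column)
frames = [pd.DataFrame({"visit_date": pd.to_datetime(["2024-01-10", "2024-01-12"])})]
frames = [shift_dates_in_dataframe(df) for df in frames]

# 2) Cell-size enforcement on validated output before report writing
summary = {"cohort": {"n": 2, "mean_los": 3.5}}
summary, report = enforce_minimum_cell_size(summary)
# mean_los becomes "[SUPPRESSED: n<5]" since n=2 < MIN_CELL_SIZE, so the
# small cohort never reaches the final report.
```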