VEDAGI1 committed on
Commit
1e4ca95
·
verified ·
1 Parent(s): a37596a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -1
app.py CHANGED
@@ -158,6 +158,148 @@ def format_validated_json_for_report(validated_data: Dict[str, Any]) -> str:
158
  return json.dumps({"raw_data": str(validated_data)}, indent=2)
159
 
160
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
161
  # ---------------------- Sandbox Execution ----------------------
162
 
163
  class SandboxViolationError(Exception):
@@ -502,7 +644,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
502
  file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
503
 
504
  if file_paths:
505
- dataframes, schema_parts = [], []
506
  for i, p in enumerate(file_paths):
507
  if p.endswith(".csv"):
508
  try:
@@ -510,6 +652,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
510
  except UnicodeDecodeError:
511
  df = pd.read_csv(p, encoding="latin1")
512
  dataframes.append(df)
 
513
  schema_parts.append(
514
  f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
515
  )
@@ -517,6 +660,14 @@ def handle(user_msg: str, files: list, yield_update) -> str:
517
  if not dataframes:
518
  return "Please upload at least one CSV file."
519
 
 
 
 
 
 
 
 
 
520
  schema_context = "\n".join(schema_parts)
521
  prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
522
 
 
158
  return json.dumps({"raw_data": str(validated_data)}, indent=2)
159
 
160
 
161
# ---------------------- Schema Validation ----------------------


class SchemaValidationError(Exception):
    """Raised when uploaded data fails pre-analysis schema validation."""
168
def validate_dataframe_schema(df: pd.DataFrame, filename: str) -> Dict[str, Any]:
    """
    Validates a DataFrame's schema before analysis.

    Implements the ClarityOps requirement:
    "Schema validation examines column names, data types, and value ranges
    before analysis begins. The system rejects malformed inputs."

    Args:
        df: The DataFrame to validate
        filename: Original filename, interpolated into error messages and logs

    Returns:
        Dict containing schema metadata for logging (row/column counts,
        dtypes, per-column null counts, and any non-fatal warnings)

    Raises:
        SchemaValidationError: If the DataFrame fails validation
    """
    errors = []
    warnings = []

    # Check 1: DataFrame is not empty.
    # BUGFIX: error messages previously hard-coded the literal text
    # "(unknown)" instead of interpolating the `filename` parameter.
    if df.empty:
        raise SchemaValidationError(f"File '{filename}' contains no data (empty DataFrame).")

    # Check 2: Has at least one column
    if len(df.columns) == 0:
        raise SchemaValidationError(f"File '{filename}' has no columns.")

    # Check 3: Column names are valid (not empty, no duplicates)
    col_names = list(df.columns)

    # Check for empty (or NaN) column names; str(c).strip() alone would
    # render NaN as "nan", so pd.isna is checked as well.
    empty_cols = [i for i, c in enumerate(col_names) if str(c).strip() == "" or pd.isna(c)]
    if empty_cols:
        errors.append(f"Empty column names at positions: {empty_cols}")

    # Check for duplicate column names (compared as strings, matching how
    # they are reported to the user).
    seen = set()
    duplicates = []
    for col in col_names:
        col_str = str(col)
        if col_str in seen:
            duplicates.append(col_str)
        seen.add(col_str)
    if duplicates:
        errors.append(f"Duplicate column names: {list(set(duplicates))}")

    # Duplicate labels make df[col] return a DataFrame instead of a Series,
    # so the per-column checks below are skipped when duplicates exist.
    has_duplicates = len(duplicates) > 0

    # Check 4: Data types are recognizable (skip if duplicates found)
    if not has_duplicates:
        for col in df.columns:
            dtype = df[col].dtype
            if dtype == object:
                # Check if object column has mixed types that could cause issues;
                # only a 100-row sample is inspected to keep validation cheap.
                sample = df[col].dropna().head(100)
                if len(sample) > 0:
                    types_in_col = set(type(x).__name__ for x in sample)
                    if len(types_in_col) > 2:  # Allow str + one other type
                        warnings.append(f"Column '{col}' has mixed types: {types_in_col}")

    # Check 5: Reasonable row count (warn if very large)
    if len(df) > 1_000_000:
        warnings.append(f"Large dataset ({len(df):,} rows) may impact performance.")

    # Check 6: Check for completely null columns (skip if duplicates found)
    if not has_duplicates:
        null_cols = [col for col in df.columns if df[col].isna().all()]
        if null_cols:
            warnings.append(f"Columns with all null values: {null_cols}")

    # Check 7: Validate numeric columns have reasonable ranges (skip if duplicates found)
    if not has_duplicates:
        import numpy as np  # local import: numpy only needed for this check
        for col in df.select_dtypes(include=['number']).columns:
            col_data = df[col].dropna()
            if len(col_data) > 0:
                if np.isinf(col_data).any():
                    errors.append(f"Column '{col}' contains infinite values.")

    # If there are critical errors, reject the input
    if errors:
        error_msg = f"Schema validation failed for '{filename}':\n" + "\n".join(f"  - {e}" for e in errors)
        raise SchemaValidationError(error_msg)

    # Build schema metadata
    schema_info = {
        "filename": filename,
        "rows": len(df),
        "columns": len(df.columns),
        "column_names": col_names,
        "dtypes": {str(col): str(df[col].dtype) for col in df.columns},
        "null_counts": {str(col): int(df[col].isna().sum()) for col in df.columns},
        "warnings": warnings,
    }

    # Log warnings but don't fail
    if warnings:
        safe_log("schema_validation_warnings", {"filename": filename, "warnings": warnings})

    safe_log("schema_validation_passed", {"filename": filename, "rows": len(df), "columns": len(df.columns)})

    return schema_info
273
def validate_all_dataframes(dataframes: List[pd.DataFrame], filenames: List[str]) -> List[Dict[str, Any]]:
    """
    Run schema validation over every uploaded DataFrame.

    Args:
        dataframes: DataFrames to validate, in upload order
        filenames: Filenames matching ``dataframes`` positionally

    Returns:
        One schema-metadata dict per DataFrame

    Raises:
        SchemaValidationError: Aggregated message if any DataFrame fails
    """
    results: List[Dict[str, Any]] = []
    failures: List[str] = []

    # Validate every frame before reporting, so the user sees all
    # problems at once rather than fixing them one upload at a time.
    for frame, name in zip(dataframes, filenames):
        try:
            results.append(validate_dataframe_schema(frame, name))
        except SchemaValidationError as exc:
            failures.append(str(exc))

    if failures:
        raise SchemaValidationError("\n\n".join(failures))

    return results
302
+
303
  # ---------------------- Sandbox Execution ----------------------
304
 
305
  class SandboxViolationError(Exception):
 
644
  file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
645
 
646
  if file_paths:
647
+ dataframes, schema_parts, filenames = [], [], []
648
  for i, p in enumerate(file_paths):
649
  if p.endswith(".csv"):
650
  try:
 
652
  except UnicodeDecodeError:
653
  df = pd.read_csv(p, encoding="latin1")
654
  dataframes.append(df)
655
+ filenames.append(os.path.basename(p))
656
  schema_parts.append(
657
  f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
658
  )
 
660
  if not dataframes:
661
  return "Please upload at least one CSV file."
662
 
663
+ # Schema Validation - examines column names, data types, and value ranges
664
+ yield_update("```\n🔎 Validating input schema...\n```")
665
+ try:
666
+ validate_all_dataframes(dataframes, filenames)
667
+ except SchemaValidationError as e:
668
+ safe_log("schema_validation_failed", {"error": str(e)})
669
+ return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
670
+
671
  schema_context = "\n".join(schema_parts)
672
  prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
673