Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -158,6 +158,148 @@ def format_validated_json_for_report(validated_data: Dict[str, Any]) -> str:
|
|
| 158 |
return json.dumps({"raw_data": str(validated_data)}, indent=2)
|
| 159 |
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
# ---------------------- Sandbox Execution ----------------------
|
| 162 |
|
| 163 |
class SandboxViolationError(Exception):
|
|
@@ -502,7 +644,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 502 |
file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
|
| 503 |
|
| 504 |
if file_paths:
|
| 505 |
-
dataframes, schema_parts = [], []
|
| 506 |
for i, p in enumerate(file_paths):
|
| 507 |
if p.endswith(".csv"):
|
| 508 |
try:
|
|
@@ -510,6 +652,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 510 |
except UnicodeDecodeError:
|
| 511 |
df = pd.read_csv(p, encoding="latin1")
|
| 512 |
dataframes.append(df)
|
|
|
|
| 513 |
schema_parts.append(
|
| 514 |
f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
|
| 515 |
)
|
|
@@ -517,6 +660,14 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 517 |
if not dataframes:
|
| 518 |
return "Please upload at least one CSV file."
|
| 519 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 520 |
schema_context = "\n".join(schema_parts)
|
| 521 |
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 522 |
|
|
|
|
| 158 |
return json.dumps({"raw_data": str(validated_data)}, indent=2)
|
| 159 |
|
| 160 |
|
| 161 |
+
# ---------------------- Schema Validation ----------------------
|
| 162 |
+
|
| 163 |
+
class SchemaValidationError(Exception):
    """Raised when input data fails schema validation.

    The exception message carries the human-readable list of problems.
    """
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
def validate_dataframe_schema(df: pd.DataFrame, filename: str) -> Dict[str, Any]:
    """
    Validate a DataFrame's schema before analysis.

    Implements the ClarityOps requirement:
    "Schema validation examines column names, data types, and value ranges
    before analysis begins. The system rejects malformed inputs."

    Rejected outright: a frame with no columns or no rows, blank/NaN column
    names, duplicate column names (compared by their ``str()`` form) and
    infinite values in numeric columns. Mixed-type object columns, all-null
    columns and very large row counts only produce warnings.

    Args:
        df: The DataFrame to validate
        filename: Original filename for error messages

    Returns:
        Dict containing schema metadata for logging, with keys ``filename``,
        ``rows``, ``columns``, ``column_names``, ``dtypes``, ``null_counts``
        and ``warnings``.

    Raises:
        SchemaValidationError: If the DataFrame fails validation
    """
    errors: List[str] = []
    warnings: List[str] = []

    # Check 1: Has at least one column.
    # This must run before the emptiness check: ``df.empty`` is also True for
    # a zero-column frame, which would otherwise hide this more specific message.
    if len(df.columns) == 0:
        raise SchemaValidationError(f"File '{filename}' has no columns.")

    # Check 2: DataFrame is not empty (columns present but no rows)
    if df.empty:
        raise SchemaValidationError(f"File '{filename}' contains no data (empty DataFrame).")

    # Check 3: Column names are valid (not empty, no duplicates)
    col_names = list(df.columns)

    empty_cols = [i for i, c in enumerate(col_names) if str(c).strip() == "" or pd.isna(c)]
    if empty_cols:
        errors.append(f"Empty column names at positions: {empty_cols}")

    # Collect each duplicated name once, in first-seen order, so the error
    # text is deterministic from run to run.
    seen: set = set()
    duplicates: List[str] = []
    for col in col_names:
        col_str = str(col)
        if col_str in seen and col_str not in duplicates:
            duplicates.append(col_str)
        seen.add(col_str)
    if duplicates:
        errors.append(f"Duplicate column names: {duplicates}")
        # Duplicates are fatal, and ``df[col]`` stops yielding a single Series
        # when labels repeat, so the per-column checks below cannot run.
        raise SchemaValidationError(_format_schema_errors(filename, errors))

    # Check 4: Data types are recognizable
    for col in df.columns:
        if df[col].dtype == object:
            # Only a sample is inspected to keep the check cheap on big files.
            sample = df[col].dropna().head(100)
            type_names = sorted({type(x).__name__ for x in sample})
            if len(type_names) > 2:  # Allow str + one other type
                warnings.append(f"Column '{col}' has mixed types: {type_names}")

    # Check 5: Reasonable row count (warn if very large)
    row_count = len(df)
    if row_count > 1_000_000:
        warnings.append(f"Large dataset ({row_count:,} rows) may impact performance.")

    # Check 6: Completely null columns. The per-column null counts are computed
    # once and reused for the metadata below; the frame is known to be
    # non-empty here, so "all null" is the same as "null count == row count".
    null_counts = df.isna().sum()
    null_cols = [col for col, n_null in null_counts.items() if n_null == row_count]
    if null_cols:
        warnings.append(f"Columns with all null values: {null_cols}")

    # Check 7: Numeric columns must not contain infinite values
    import numpy as np  # local import: the rest of the file may not import numpy

    for col in df.select_dtypes(include=["number"]).columns:
        col_data = df[col].dropna()
        if len(col_data) > 0 and np.isinf(col_data).any():
            errors.append(f"Column '{col}' contains infinite values.")

    # If there are critical errors, reject the input
    if errors:
        raise SchemaValidationError(_format_schema_errors(filename, errors))

    # Build schema metadata
    schema_info = {
        "filename": filename,
        "rows": row_count,
        "columns": len(df.columns),
        "column_names": col_names,
        "dtypes": {str(col): str(dtype) for col, dtype in df.dtypes.items()},
        "null_counts": {str(col): int(n_null) for col, n_null in null_counts.items()},
        "warnings": warnings,
    }

    # Log warnings but don't fail
    if warnings:
        safe_log("schema_validation_warnings", {"filename": filename, "warnings": warnings})

    safe_log("schema_validation_passed", {"filename": filename, "rows": row_count, "columns": len(df.columns)})

    return schema_info


def _format_schema_errors(filename: str, errors: List[str]) -> str:
    """Render the collected validation errors for one file as a bulleted message."""
    return f"Schema validation failed for '{filename}':\n" + "\n".join(f" - {e}" for e in errors)
|
| 271 |
+
|
| 272 |
+
|
| 273 |
+
def validate_all_dataframes(dataframes: List[pd.DataFrame], filenames: List[str]) -> List[Dict[str, Any]]:
    """
    Validate all uploaded DataFrames.

    Every frame is validated even when an earlier one fails, so the caller
    receives the problems of all files in a single error.

    Args:
        dataframes: List of DataFrames to validate
        filenames: Corresponding filenames, one per DataFrame and in the same order

    Returns:
        List of schema metadata dicts, one per DataFrame

    Raises:
        ValueError: If ``dataframes`` and ``filenames`` differ in length
        SchemaValidationError: If any DataFrame fails validation; the message
            joins the individual failures with blank lines
    """
    # zip() stops at the shorter input, which would let surplus frames slip
    # through this gate unvalidated; treat a mismatch as a caller bug instead.
    if len(dataframes) != len(filenames):
        raise ValueError(
            f"Got {len(dataframes)} DataFrame(s) but {len(filenames)} filename(s); "
            "each DataFrame needs exactly one filename."
        )

    schema_infos: List[Dict[str, Any]] = []
    failures: List[str] = []

    for df, filename in zip(dataframes, filenames):
        try:
            schema_info = validate_dataframe_schema(df, filename)
        except SchemaValidationError as exc:
            failures.append(str(exc))
        else:
            schema_infos.append(schema_info)

    if failures:
        raise SchemaValidationError("\n\n".join(failures))

    return schema_infos
|
| 301 |
+
|
| 302 |
+
|
| 303 |
# ---------------------- Sandbox Execution ----------------------
|
| 304 |
|
| 305 |
class SandboxViolationError(Exception):
|
|
|
|
| 644 |
file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
|
| 645 |
|
| 646 |
if file_paths:
|
| 647 |
+
dataframes, schema_parts, filenames = [], [], []
|
| 648 |
for i, p in enumerate(file_paths):
|
| 649 |
if p.endswith(".csv"):
|
| 650 |
try:
|
|
|
|
| 652 |
except UnicodeDecodeError:
|
| 653 |
df = pd.read_csv(p, encoding="latin1")
|
| 654 |
dataframes.append(df)
|
| 655 |
+
filenames.append(os.path.basename(p))
|
| 656 |
schema_parts.append(
|
| 657 |
f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
|
| 658 |
)
|
|
|
|
| 660 |
if not dataframes:
|
| 661 |
return "Please upload at least one CSV file."
|
| 662 |
|
| 663 |
+
# Schema Validation - examines column names, data types, and value ranges
|
| 664 |
+
yield_update("```\n🔎 Validating input schema...\n```")
|
| 665 |
+
try:
|
| 666 |
+
validate_all_dataframes(dataframes, filenames)
|
| 667 |
+
except SchemaValidationError as e:
|
| 668 |
+
safe_log("schema_validation_failed", {"error": str(e)})
|
| 669 |
+
return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
|
| 670 |
+
|
| 671 |
schema_context = "\n".join(schema_parts)
|
| 672 |
prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in
|
| 673 |
|