Spaces:

VEDAGI1
/

Medica_DecisionSupportAI

Sleeping

App Files Files Community

VEDAGI1 committed 5 days ago

Commit

1e4ca95

verified ·

1 Parent(s): a37596a

Update app.py

Browse files

Files changed (1) hide show

app.py +152 -1

app.py CHANGED Viewed

@@ -158,6 +158,148 @@ def format_validated_json_for_report(validated_data: Dict[str, Any]) -> str:
         return json.dumps({"raw_data": str(validated_data)}, indent=2)
 # ---------------------- Sandbox Execution ----------------------
 class SandboxViolationError(Exception):
@@ -502,7 +644,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
         file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
         if file_paths:
-            dataframes, schema_parts = [], []
             for i, p in enumerate(file_paths):
                 if p.endswith(".csv"):
                     try:
@@ -510,6 +652,7 @@ def handle(user_msg: str, files: list, yield_update) -> str:
                     except UnicodeDecodeError:
                         df = pd.read_csv(p, encoding="latin1")
                     dataframes.append(df)
                     schema_parts.append(
                         f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
                     )
@@ -517,6 +660,14 @@ def handle(user_msg: str, files: list, yield_update) -> str:
             if not dataframes:
                 return "Please upload at least one CSV file."
             schema_context = "\n".join(schema_parts)
             prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in

         return json.dumps({"raw_data": str(validated_data)}, indent=2)
+# ---------------------- Schema Validation ----------------------
+class SchemaValidationError(Exception):
+    """Error signalling that input data was rejected by schema validation."""
+def validate_dataframe_schema(df: pd.DataFrame, filename: str) -> Dict[str, Any]:
+    """
+    Validate a DataFrame's schema before analysis and return its metadata.
+    Implements the ClarityOps requirement:
+    "Schema validation examines column names, data types, and value ranges
+    before analysis begins. The system rejects malformed inputs."
+    Rejected outright: a frame with no columns, a frame with no rows, blank
+    column names, column names that collide once converted to ``str``, and
+    numeric columns containing infinite values.
+    Reported as warnings only (returned in the metadata and logged through
+    ``safe_log``): object columns whose first 100 non-null values span more
+    than two Python types, frames above 1,000,000 rows, and all-null columns.
+    Args:
+        df: The DataFrame to validate
+        filename: Original filename, used in error and log messages
+    Returns:
+        Dict with keys ``filename``, ``rows``, ``columns``, ``column_names``,
+        ``dtypes``, ``null_counts`` and ``warnings``
+    Raises:
+        SchemaValidationError: If the DataFrame fails validation
+    """
+    from collections import Counter
+    import numpy as np
+    errors: List[str] = []
+    warnings: List[str] = []
+    # Check 1: Has at least one column.
+    # This must run before the emptiness check: DataFrame.empty is also True
+    # for a zero-column frame, which previously made this branch unreachable.
+    if len(df.columns) == 0:
+        raise SchemaValidationError(f"File '{filename}' has no columns.")
+    # Check 2: DataFrame is not empty (columns exist, so this means no rows)
+    if df.empty:
+        raise SchemaValidationError(f"File '{filename}' contains no data (empty DataFrame).")
+    # Check 3: Column names are valid (not empty, no duplicates)
+    col_names = list(df.columns)
+    empty_cols = [i for i, c in enumerate(col_names) if str(c).strip() == "" or pd.isna(c)]
+    if empty_cols:
+        errors.append(f"Empty column names at positions: {empty_cols}")
+    # Names are compared by their string form. Counter keeps first-seen order,
+    # so the reported list is deterministic (a set would not be).
+    name_counts = Counter(str(col) for col in col_names)
+    duplicates = [name for name, count in name_counts.items() if count > 1]
+    if duplicates:
+        errors.append(f"Duplicate column names: {duplicates}")
+    # Per-column checks index the frame by label, so they are skipped whenever
+    # duplicate names were found.
+    has_duplicates = bool(duplicates)
+    # Check 4: Data types are recognizable
+    if not has_duplicates:
+        for col in df.columns:
+            if df[col].dtype == object:
+                # Only the first 100 non-null values are sampled for mixed types
+                sample = df[col].dropna().head(100)
+                if len(sample) > 0:
+                    types_in_col = {type(x).__name__ for x in sample}
+                    if len(types_in_col) > 2:  # Allow str + one other type
+                        warnings.append(f"Column '{col}' has mixed types: {types_in_col}")
+    # Check 5: Reasonable row count (warn if very large)
+    if len(df) > 1_000_000:
+        warnings.append(f"Large dataset ({len(df):,} rows) may impact performance.")
+    # Check 6: Completely null columns
+    if not has_duplicates:
+        null_cols = [col for col in df.columns if df[col].isna().all()]
+        if null_cols:
+            warnings.append(f"Columns with all null values: {null_cols}")
+    # Check 7: Numeric columns must not contain infinite values
+    if not has_duplicates:
+        for col in df.select_dtypes(include=['number']).columns:
+            col_data = df[col].dropna()
+            if len(col_data) > 0 and np.isinf(col_data).any():
+                errors.append(f"Column '{col}' contains infinite values.")
+    # If there are critical errors, reject the input with one combined message
+    if errors:
+        error_msg = f"Schema validation failed for '{filename}':\n" + "\n".join(f"  - {e}" for e in errors)
+        raise SchemaValidationError(error_msg)
+    # Build schema metadata (column labels are unique here, so dict keys cannot collide)
+    schema_info = {
+        "filename": filename,
+        "rows": len(df),
+        "columns": len(df.columns),
+        "column_names": col_names,
+        "dtypes": {str(col): str(dtype) for col, dtype in df.dtypes.items()},
+        "null_counts": {str(col): int(count) for col, count in df.isna().sum().items()},
+        "warnings": warnings,
+    }
+    # Log warnings but don't fail
+    if warnings:
+        safe_log("schema_validation_warnings", {"filename": filename, "warnings": warnings})
+    safe_log("schema_validation_passed", {"filename": filename, "rows": len(df), "columns": len(df.columns)})
+    return schema_info
+def validate_all_dataframes(dataframes: List[pd.DataFrame], filenames: List[str]) -> List[Dict[str, Any]]:
+    """
+    Run schema validation over every uploaded DataFrame.
+    Frames and filenames are paired positionally with ``zip``, so pairing
+    stops at the shorter of the two lists. Every pair is validated even when
+    an earlier one fails, so the raised error lists all failing files.
+    Args:
+        dataframes: DataFrames to validate
+        filenames: Filename for each DataFrame, in the same order
+    Returns:
+        One schema metadata dict per DataFrame, in input order
+    Raises:
+        SchemaValidationError: If any DataFrame fails validation; the message
+            joins the individual failure messages with blank lines
+    """
+    collected: List[Dict[str, Any]] = []
+    failures: List[str] = []
+    for frame, name in zip(dataframes, filenames):
+        try:
+            collected.append(validate_dataframe_schema(frame, name))
+        except SchemaValidationError as exc:
+            # Keep going so the user sees every problem in one pass
+            failures.append(str(exc))
+    if not failures:
+        return collected
+    raise SchemaValidationError("\n\n".join(failures))
 # ---------------------- Sandbox Execution ----------------------
 class SandboxViolationError(Exception):
         file_paths: List[str] = [getattr(f, "name", None) or f for f in (files or [])]
         if file_paths:
+            dataframes, schema_parts, filenames = [], [], []
             for i, p in enumerate(file_paths):
                 if p.endswith(".csv"):
                     try:
                     except UnicodeDecodeError:
                         df = pd.read_csv(p, encoding="latin1")
                     dataframes.append(df)
+                    filenames.append(os.path.basename(p))
                     schema_parts.append(
                         f"DataFrame `dfs[{i}]` (`{os.path.basename(p)}`):\n{df.head().to_markdown()}\n"
                     )
             if not dataframes:
                 return "Please upload at least one CSV file."
+            # Schema Validation - examines column names, data types, and value ranges
+            yield_update("```\n🔎 Validating input schema...\n```")
+            try:
+                validate_all_dataframes(dataframes, filenames)
+            except SchemaValidationError as e:
+                safe_log("schema_validation_failed", {"error": str(e)})
+                return f"**Schema Validation Failed**\n\n{e}\n\nPlease fix the data issues and re-upload."
             schema_context = "\n".join(schema_parts)
             prompt_for_code = redacted_in if (PHI_MODE and not ALLOW_EXTERNAL_PHI) else safe_in