Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -158,6 +158,236 @@ def format_validated_json_for_report(validated_data: Dict[str, Any]) -> str:
|
|
| 158 |
return json.dumps({"raw_data": str(validated_data)}, indent=2)
|
| 159 |
|
| 160 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 161 |
# ---------------------- Analysis Script Generation ----------------------
|
| 162 |
|
| 163 |
def _create_python_script(user_scenario: str, schema_context: str) -> str:
|
|
@@ -293,14 +523,17 @@ def handle(user_msg: str, files: list, yield_update) -> str:
|
|
| 293 |
yield_update("```\n🧠 Generating aligned analysis script...\n```")
|
| 294 |
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 295 |
|
| 296 |
-
yield_update("```\n⚙️ Executing script
|
| 297 |
-
execution_namespace = {"dfs": dataframes, "pd": pd, "re": re, "json": json}
|
| 298 |
-
output_buffer = io.StringIO()
|
| 299 |
-
|
| 300 |
try:
|
| 301 |
-
|
| 302 |
-
|
| 303 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 304 |
except Exception as e:
|
| 305 |
return (
|
| 306 |
f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
|
|
|
|
| 158 |
return json.dumps({"raw_data": str(validated_data)}, indent=2)
|
| 159 |
|
| 160 |
|
| 161 |
+
# ---------------------- Sandbox Execution ----------------------
|
| 162 |
+
|
| 163 |
+
class SandboxViolationError(Exception):
    """Signal that generated code tried to perform a forbidden operation."""
|
| 166 |
+
|
| 167 |
+
|
| 168 |
+
# Import policy consulted by the sandbox import hook: a requested module is
# rejected when it (or its top-level package) is block-listed, and otherwise
# accepted only when it (or its top-level package) is allow-listed.
_ALLOWED_MODULES = frozenset(
    (
        # Standard-library helpers
        "json",
        "math",
        "statistics",
        "collections",
        "itertools",
        "functools",
        "operator",
        "string",
        "re",
        "datetime",
        "decimal",
        "fractions",
        "random",
        "copy",
        "types",
        "typing",
        "dataclasses",
        "enum",
        # Data-analysis libraries
        "numpy",
        "pandas",
        "scipy.stats",
    )
)

_BLOCKED_MODULES = frozenset(
    (
        # Filesystem and process control
        "os",
        "sys",
        "subprocess",
        "shutil",
        "pathlib",
        "glob",
        # Networking
        "socket",
        "http",
        "urllib",
        "requests",
        "ftplib",
        "smtplib",
        # Serialization and import machinery
        "pickle",
        "shelve",
        "marshal",
        "importlib",
        "builtins",
        # Native code and concurrency
        "ctypes",
        "multiprocessing",
        "threading",
        "asyncio",
        # Builtin names listed alongside the modules
        "eval",
        "exec",
        "compile",
        "open",
        "file",
        "input",
        # Interpreter, terminal and OS facilities
        "code",
        "codeop",
        "pty",
        "tty",
        "termios",
        "resource",
        "signal",
        "mmap",
        "sysconfig",
        "platform",
    )
)
|
| 185 |
+
|
| 186 |
+
|
| 187 |
+
def _safe_import(name: str, globals_dict=None, locals_dict=None, fromlist=(), level=0):
    """Import hook that enforces the sandbox module allow/block lists.

    The signature mirrors ``builtins.__import__`` so the function can be
    installed under the ``__import__`` key of a restricted builtins mapping.

    Args:
        name: Dotted module name requested by the ``import`` statement.
        globals_dict: Globals of the importing code, forwarded unchanged.
        locals_dict: Locals of the importing code, forwarded unchanged.
        fromlist: Names requested by ``from name import ...``, forwarded unchanged.
        level: Relative-import level, forwarded unchanged.

    Returns:
        Whatever ``builtins.__import__`` returns for the request.

    Raises:
        SandboxViolationError: If ``name`` or its top-level package is in
            ``_BLOCKED_MODULES``, or if neither is in ``_ALLOWED_MODULES``.
    """
    # Function-scope import so the hook does not depend on how the enclosing
    # module exposes ``__builtins__``.
    import builtins

    base_module = name.split(".")[0]

    # The block-list is checked first so a block-listed name gets the shorter,
    # more specific error message.
    if base_module in _BLOCKED_MODULES or name in _BLOCKED_MODULES:
        raise SandboxViolationError(f"Import of '{name}' is not allowed in sandbox environment.")

    if base_module not in _ALLOWED_MODULES and name not in _ALLOWED_MODULES:
        raise SandboxViolationError(f"Import of '{name}' is not allowed. Allowed modules: {', '.join(sorted(_ALLOWED_MODULES))}")

    # ``__builtins__`` is a CPython implementation detail: it is the builtins
    # *module* when this file runs as ``__main__`` and a plain dict when the
    # file is imported, so ``__builtins__["__import__"]`` raises TypeError in
    # the former case. The ``builtins`` module works in both.
    return builtins.__import__(name, globals_dict, locals_dict, fromlist, level)
|
| 198 |
+
|
| 199 |
+
|
| 200 |
+
def _create_sandbox_builtins() -> Dict[str, Any]:
    """Build the restricted ``__builtins__`` mapping for sandboxed scripts.

    Most entries are forwarded unchanged from the real ``builtins`` module.
    ``globals`` is replaced by a stub that returns an empty dict, and
    ``__import__`` is replaced by ``_safe_import``.

    Returns:
        A new dict mapping builtin names to the objects scripts may use.
    """
    import builtins

    # Forwarded as-is; listed in the order they appear in the resulting dict.
    forwarded_names = (
        # Types and constructors
        "bool", "int", "float", "str", "list", "dict", "tuple", "set",
        "frozenset", "bytes", "bytearray", "complex", "slice", "type", "object",
        # Iteration and sequences
        "range", "enumerate", "zip", "map", "filter", "reversed", "sorted",
        "iter", "next", "len",
        # Math and comparison
        "abs", "min", "max", "sum", "pow", "round", "divmod",
        # Logic and identity
        "all", "any", "isinstance", "issubclass", "id", "hash",
        # String and representation
        "repr", "ascii", "chr", "ord", "format", "print",
        # Attribute access
        # NOTE(review): getattr/setattr/type/object stay reachable from
        # scripts; confirm this is acceptable for the threat model.
        "getattr", "setattr", "hasattr", "delattr",
        # Introspection helpers
        "callable", "dir", "vars", "locals",
    )
    # Exception classes scripts may raise or catch.
    exception_names = (
        "Exception", "ValueError", "TypeError", "KeyError", "IndexError",
        "AttributeError", "ZeroDivisionError", "StopIteration", "RuntimeError",
    )

    table: Dict[str, Any] = {
        builtin_name: getattr(builtins, builtin_name)
        for builtin_name in forwarded_names
    }

    # Stub: scripts asking for globals() receive a fresh empty dict.
    table["globals"] = lambda: {}

    for exc_name in exception_names:
        table[exc_name] = getattr(builtins, exc_name)

    # Constants, the restricted import hook and module-like metadata.
    table.update(
        {
            "None": None,
            "True": True,
            "False": False,
            "Ellipsis": builtins.Ellipsis,
            "NotImplemented": builtins.NotImplemented,
            "__import__": _safe_import,
            "__name__": "__sandbox__",
            "__doc__": None,
        }
    )
    return table
|
| 301 |
+
|
| 302 |
+
|
| 303 |
+
def _create_sandbox_namespace(dataframes: List[Any]) -> Dict[str, Any]:
    """Assemble the globals dict under which an analysis script is executed.

    The namespace carries the restricted builtins from
    ``_create_sandbox_builtins`` plus the pre-loaded names scripts can use:
    ``dfs``, ``pd``, ``np``, ``re``, ``json``, ``DataFrame``, ``Series``,
    ``NaN`` and ``nan``.

    It is meant to support the ClarityOps security model (memory-only
    execution, no network access, no system calls, only data-analysis
    libraries). This function only decides which names are visible; it does
    not enforce those properties by itself.

    Args:
        dataframes: Objects exposed to the script as ``dfs``.

    Returns:
        A new dict suitable as the globals argument of ``exec``.
    """
    import numpy as np

    scope: Dict[str, Any] = {"__builtins__": _create_sandbox_builtins()}

    # Pre-loaded data and modules.
    scope.update(dfs=dataframes, pd=pd, np=np, re=re, json=json)

    # Convenience aliases for common pandas/numpy objects.
    not_a_number = np.nan
    scope.update(
        DataFrame=pd.DataFrame,
        Series=pd.Series,
        NaN=not_a_number,
        nan=not_a_number,
    )
    return scope
|
| 333 |
+
|
| 334 |
+
|
| 335 |
+
def execute_in_sandbox(script: str, dataframes: List[Any]) -> str:
    """Run an analysis script in the restricted namespace and return its stdout.

    The script text is first scanned with a list of regular expressions for
    forbidden constructs. The scan is purely textual, so it also matches
    inside comments and string literals. If nothing matches, the script is
    executed with ``exec`` under the namespace built by
    ``_create_sandbox_namespace`` while stdout is captured.

    Args:
        script: Python source code to execute.
        dataframes: Objects exposed to the script as ``dfs``.

    Returns:
        Everything the script wrote to stdout.

    Raises:
        SandboxViolationError: If the text scan finds a forbidden construct,
            or if the script triggers one at run time (for example a
            disallowed import).
        RuntimeError: For any other exception raised by the script; the
            original exception is attached as ``__cause__``.
    """
    # Pre-execution safety checks on the script text.
    forbidden_patterns = [
        (r'\bopen\s*\(', "File operations (open) are not allowed"),
        (r'\bos\s*\.', "OS module access is not allowed"),
        (r'\bsys\s*\.', "Sys module access is not allowed"),
        (r'\bsubprocess', "Subprocess module is not allowed"),
        (r'\bsocket\s*\.', "Network operations are not allowed"),
        (r'\burllib', "Network operations are not allowed"),
        (r'\brequests\s*\.', "Network operations are not allowed"),
        (r'\bhttp\s*\.', "Network operations are not allowed"),
        (r'\beval\s*\(', "eval() is not allowed"),
        (r'\bexec\s*\(', "exec() is not allowed"),
        (r'\bcompile\s*\(', "compile() is not allowed"),
        (r'\b__import__\s*\(', "Direct __import__ calls are not allowed"),
        (r'\bimportlib', "importlib is not allowed"),
        (r'\bpickle', "pickle module is not allowed"),
        (r'\bshutil', "shutil module is not allowed"),
        (r'\bglobals\s*\(\s*\)', "globals() access is restricted"),
        (r'\.to_csv\s*\(', "Writing files (to_csv) is not allowed"),
        (r'\.to_excel\s*\(', "Writing files (to_excel) is not allowed"),
        (r'\.to_parquet\s*\(', "Writing files (to_parquet) is not allowed"),
        (r'\.to_sql\s*\(', "Database operations (to_sql) are not allowed"),
        (r'pd\.read_', "Reading files is not allowed - use the provided dfs variable"),
    ]

    # The first matching pattern decides the error message.
    for pattern, message in forbidden_patterns:
        if re.search(pattern, script):
            raise SandboxViolationError(f"Security violation: {message}")

    # Create sandboxed namespace
    namespace = _create_sandbox_namespace(dataframes)

    # Capture stdout
    output_buffer = io.StringIO()

    try:
        with redirect_stdout(output_buffer):
            # The same dict serves as globals and locals so names defined at
            # the script's top level are visible inside its functions.
            exec(script, namespace, namespace)
        return output_buffer.getvalue()
    except SandboxViolationError:
        # Raised by the import hook at run time; pass through untouched so
        # callers can tell policy violations from ordinary script errors.
        raise
    except Exception as e:
        # Normalize every other failure to RuntimeError, keeping the original
        # exception as the explicit cause for debugging.
        raise RuntimeError(f"Script execution error: {type(e).__name__}: {e}") from e
|
| 389 |
+
|
| 390 |
+
|
| 391 |
# ---------------------- Analysis Script Generation ----------------------
|
| 392 |
|
| 393 |
def _create_python_script(user_scenario: str, schema_context: str) -> str:
|
|
|
|
| 523 |
yield_update("```\n🧠 Generating aligned analysis script...\n```")
|
| 524 |
analysis_script = _create_python_script(prompt_for_code, schema_context)
|
| 525 |
|
| 526 |
+
yield_update("```\n⚙️ Executing script in sandbox...\n```")
|
|
|
|
|
|
|
|
|
|
| 527 |
try:
|
| 528 |
+
raw_data_output = execute_in_sandbox(analysis_script, dataframes)
|
| 529 |
+
except SandboxViolationError as e:
|
| 530 |
+
safe_log("sandbox_violation", {"error": str(e)})
|
| 531 |
+
return (
|
| 532 |
+
f"**Security Violation Detected**\n\n{e}\n\n"
|
| 533 |
+
f"The generated script attempted a forbidden operation. "
|
| 534 |
+
f"Please rephrase your request.\n\n"
|
| 535 |
+
f"Generated Script:\n```python\n{analysis_script}\n```"
|
| 536 |
+
)
|
| 537 |
except Exception as e:
|
| 538 |
return (
|
| 539 |
f"An error occurred executing the script: {e}\n\nGenerated Script:\n"
|