Spaces:
Running
Refactor: Move workflow context out of LLM prompts into structured state
PHASE 1: Foundation for token reduction (architectural improvement)
NEW FILES:
- src/workflow_state.py: WorkflowState class stores intermediate results
- src/utils/schema_extraction.py: Local schema extraction (NO LLM calls)
KEY CHANGES in orchestrator.py:
1. Local schema extraction BEFORE first LLM call
- Extract columns, types, row counts, basic stats locally (Polars)
- NO raw CSV or large previews sent to LLM
- Saves ~2-3K tokens on first prompt
2. WorkflowState integration
- Stores profiling, quality, cleaning, feature engineering, modeling results
- State persists across steps in Python dict (not LLM memory)
- _update_workflow_state() called after each tool execution
3. Minimal context in prompts
- User message includes schema summary, not raw data
- Only 8 column names shown (truncated)
- Numeric/categorical counts instead of full lists
BENEFITS:
- Reduces first prompt from ~8-12K to ~3-5K tokens
- State stored in Python, not LLM context window
- Prepares for step-scoped prompting (Phase 2)
- Maintains backward compatibility (all existing tools work)
NEXT PHASE:
- Refactor prompts to only include state slice for current step
- Further reduce conversation history sent to LLM
- src/orchestrator.py +104 -1
- src/utils/schema_extraction.py +114 -0
- src/workflow_state.py +154 -0
|
@@ -19,6 +19,8 @@ from .cache.cache_manager import CacheManager
|
|
| 19 |
from .tools.tools_registry import TOOLS, get_all_tool_names, get_tools_by_category
|
| 20 |
from .session_memory import SessionMemory
|
| 21 |
from .session_store import SessionStore
|
|
|
|
|
|
|
| 22 |
from .tools import (
|
| 23 |
# Basic Tools (13) - UPDATED: Added get_smart_summary + 3 wrangling tools
|
| 24 |
profile_dataset,
|
|
@@ -263,6 +265,9 @@ class DataScienceCopilot:
|
|
| 263 |
# Rate limiting for Gemini (10 RPM free tier)
|
| 264 |
self.last_api_call_time = 0
|
| 265 |
|
|
|
|
|
|
|
|
|
|
| 266 |
# Ensure output directories exist
|
| 267 |
Path("./outputs").mkdir(exist_ok=True)
|
| 268 |
Path("./outputs/models").mkdir(exist_ok=True)
|
|
@@ -1422,6 +1427,67 @@ You are a DOER. Complete workflows based on user intent."""
|
|
| 1422 |
|
| 1423 |
return gemini_tools
|
| 1424 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1425 |
def analyze(self, file_path: str, task_description: str,
|
| 1426 |
target_col: Optional[str] = None,
|
| 1427 |
use_cache: bool = True,
|
|
@@ -1443,6 +1509,26 @@ You are a DOER. Complete workflows based on user intent."""
|
|
| 1443 |
"""
|
| 1444 |
start_time = time.time()
|
| 1445 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1446 |
# Check cache
|
| 1447 |
if use_cache:
|
| 1448 |
cache_key = self._generate_cache_key(file_path, task_description, target_col)
|
|
@@ -1571,11 +1657,25 @@ You are a DOER. Complete workflows based on user intent."""
|
|
| 1571 |
# Default full workflow
|
| 1572 |
workflow_guidance = "\n\n🎯 **WORKFLOW**: Complete Analysis\nExecute: profile → clean → encode → train → report"
|
| 1573 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1574 |
user_message = f"""Please analyze the dataset and complete the following task:
|
| 1575 |
|
| 1576 |
**Dataset**: {file_path}
|
| 1577 |
**Task**: {task_description}
|
| 1578 |
-
**Target Column**: {target_col if target_col else 'Not specified - please infer from data'}{workflow_guidance}"""
|
| 1579 |
|
| 1580 |
#🧠 Store file path in session memory for follow-up requests
|
| 1581 |
if self.session and file_path:
|
|
@@ -2299,6 +2399,9 @@ You are a DOER. Complete workflows based on user intent."""
|
|
| 2299 |
"result": tool_result
|
| 2300 |
})
|
| 2301 |
|
|
|
|
|
|
|
|
|
|
| 2302 |
# ⚡ CRITICAL FIX: Add tool result back to messages so LLM sees it in next iteration!
|
| 2303 |
if self.provider == "groq":
|
| 2304 |
# For Groq, add tool message with the result
|
|
|
|
| 19 |
from .tools.tools_registry import TOOLS, get_all_tool_names, get_tools_by_category
|
| 20 |
from .session_memory import SessionMemory
|
| 21 |
from .session_store import SessionStore
|
| 22 |
+
from .workflow_state import WorkflowState
|
| 23 |
+
from .utils.schema_extraction import extract_schema_local, infer_task_type
|
| 24 |
from .tools import (
|
| 25 |
# Basic Tools (13) - UPDATED: Added get_smart_summary + 3 wrangling tools
|
| 26 |
profile_dataset,
|
|
|
|
| 265 |
# Rate limiting for Gemini (10 RPM free tier)
|
| 266 |
self.last_api_call_time = 0
|
| 267 |
|
| 268 |
+
# Workflow state for context management (reduces token usage)
|
| 269 |
+
self.workflow_state = WorkflowState()
|
| 270 |
+
|
| 271 |
# Ensure output directories exist
|
| 272 |
Path("./outputs").mkdir(exist_ok=True)
|
| 273 |
Path("./outputs/models").mkdir(exist_ok=True)
|
|
|
|
| 1427 |
|
| 1428 |
return gemini_tools
|
| 1429 |
|
| 1430 |
+
def _update_workflow_state(self, tool_name: str, tool_result: Dict[str, Any]):
|
| 1431 |
+
"""
|
| 1432 |
+
Update workflow state based on tool execution.
|
| 1433 |
+
This reduces the need to keep full tool results in LLM context.
|
| 1434 |
+
"""
|
| 1435 |
+
if not tool_result.get("success", True):
|
| 1436 |
+
return # Don't update state on failures
|
| 1437 |
+
|
| 1438 |
+
result_data = tool_result.get("result", {})
|
| 1439 |
+
|
| 1440 |
+
# Profile dataset
|
| 1441 |
+
if tool_name == "profile_dataset":
|
| 1442 |
+
self.workflow_state.update_profiling({
|
| 1443 |
+
"num_rows": result_data.get("num_rows"),
|
| 1444 |
+
"num_columns": result_data.get("num_columns"),
|
| 1445 |
+
"missing_percentage": result_data.get("missing_percentage"),
|
| 1446 |
+
"numeric_columns": result_data.get("numeric_columns", []),
|
| 1447 |
+
"categorical_columns": result_data.get("categorical_columns", [])
|
| 1448 |
+
})
|
| 1449 |
+
|
| 1450 |
+
# Quality check
|
| 1451 |
+
elif tool_name == "detect_data_quality_issues":
|
| 1452 |
+
self.workflow_state.update_quality({
|
| 1453 |
+
"total_issues": result_data.get("total_issues", 0),
|
| 1454 |
+
"has_missing": result_data.get("has_missing", False),
|
| 1455 |
+
"has_outliers": result_data.get("has_outliers", False),
|
| 1456 |
+
"has_duplicates": result_data.get("has_duplicates", False)
|
| 1457 |
+
})
|
| 1458 |
+
|
| 1459 |
+
# Cleaning tools
|
| 1460 |
+
elif tool_name in ["clean_missing_values", "handle_outliers", "encode_categorical"]:
|
| 1461 |
+
self.workflow_state.update_cleaning({
|
| 1462 |
+
"output_file": result_data.get("output_file") or result_data.get("output_path"),
|
| 1463 |
+
"rows_processed": result_data.get("rows_after") or result_data.get("num_rows"),
|
| 1464 |
+
"tool": tool_name
|
| 1465 |
+
})
|
| 1466 |
+
|
| 1467 |
+
# Feature engineering
|
| 1468 |
+
elif tool_name in ["create_time_features", "create_interaction_features", "auto_feature_engineering"]:
|
| 1469 |
+
self.workflow_state.update_features({
|
| 1470 |
+
"output_file": result_data.get("output_file") or result_data.get("output_path"),
|
| 1471 |
+
"new_features": result_data.get("new_columns", []),
|
| 1472 |
+
"tool": tool_name
|
| 1473 |
+
})
|
| 1474 |
+
|
| 1475 |
+
# Model training
|
| 1476 |
+
elif tool_name == "train_baseline_models":
|
| 1477 |
+
models = result_data.get("models", [])
|
| 1478 |
+
best_model = None
|
| 1479 |
+
if models and isinstance(models, list):
|
| 1480 |
+
valid_models = [m for m in models if isinstance(m, dict) and "test_score" in m]
|
| 1481 |
+
if valid_models:
|
| 1482 |
+
best_model = max(valid_models, key=lambda m: m.get("test_score", 0))
|
| 1483 |
+
|
| 1484 |
+
self.workflow_state.update_modeling({
|
| 1485 |
+
"best_model": best_model.get("model") if best_model else None,
|
| 1486 |
+
"best_score": best_model.get("test_score") if best_model else None,
|
| 1487 |
+
"models_trained": len(valid_models) if best_model else 0,
|
| 1488 |
+
"task_type": result_data.get("task_type")
|
| 1489 |
+
})
|
| 1490 |
+
|
| 1491 |
def analyze(self, file_path: str, task_description: str,
|
| 1492 |
target_col: Optional[str] = None,
|
| 1493 |
use_cache: bool = True,
|
|
|
|
| 1509 |
"""
|
| 1510 |
start_time = time.time()
|
| 1511 |
|
| 1512 |
+
# 🚀 LOCAL SCHEMA EXTRACTION (NO LLM) - Extract metadata before any LLM calls
|
| 1513 |
+
print("🔍 Extracting dataset schema locally (no LLM)...")
|
| 1514 |
+
schema_info = extract_schema_local(file_path, sample_rows=3)
|
| 1515 |
+
|
| 1516 |
+
if 'error' not in schema_info:
|
| 1517 |
+
# Update workflow state with schema
|
| 1518 |
+
self.workflow_state.update_dataset_info(schema_info)
|
| 1519 |
+
print(f"✅ Schema extracted: {schema_info['num_rows']} rows × {schema_info['num_columns']} cols")
|
| 1520 |
+
print(f" File size: {schema_info['file_size_mb']} MB")
|
| 1521 |
+
|
| 1522 |
+
# Infer task type if target column provided
|
| 1523 |
+
if target_col and target_col in schema_info['columns']:
|
| 1524 |
+
inferred_task = infer_task_type(target_col, schema_info)
|
| 1525 |
+
if inferred_task:
|
| 1526 |
+
self.workflow_state.task_type = inferred_task
|
| 1527 |
+
self.workflow_state.target_column = target_col
|
| 1528 |
+
print(f" Task type inferred: {inferred_task}")
|
| 1529 |
+
else:
|
| 1530 |
+
print(f"⚠️ Schema extraction failed: {schema_info.get('error')}")
|
| 1531 |
+
|
| 1532 |
# Check cache
|
| 1533 |
if use_cache:
|
| 1534 |
cache_key = self._generate_cache_key(file_path, task_description, target_col)
|
|
|
|
| 1657 |
# Default full workflow
|
| 1658 |
workflow_guidance = "\n\n🎯 **WORKFLOW**: Complete Analysis\nExecute: profile → clean → encode → train → report"
|
| 1659 |
|
| 1660 |
+
# Build user message with workflow state context (minimal, not full history)
|
| 1661 |
+
state_context = ""
|
| 1662 |
+
if self.workflow_state.dataset_info:
|
| 1663 |
+
# Include schema summary instead of raw data
|
| 1664 |
+
info = self.workflow_state.dataset_info
|
| 1665 |
+
state_context = f"""
|
| 1666 |
+
**Dataset Schema** (extracted locally):
|
| 1667 |
+
- Rows: {info['num_rows']:,} | Columns: {info['num_columns']}
|
| 1668 |
+
- Size: {info['file_size_mb']} MB
|
| 1669 |
+
- Numeric columns: {len(info['numeric_columns'])}
|
| 1670 |
+
- Categorical columns: {len(info['categorical_columns'])}
|
| 1671 |
+
- Sample columns: {', '.join(list(info['columns'].keys())[:8])}{'...' if len(info['columns']) > 8 else ''}
|
| 1672 |
+
"""
|
| 1673 |
+
|
| 1674 |
user_message = f"""Please analyze the dataset and complete the following task:
|
| 1675 |
|
| 1676 |
**Dataset**: {file_path}
|
| 1677 |
**Task**: {task_description}
|
| 1678 |
+
**Target Column**: {target_col if target_col else 'Not specified - please infer from data'}{state_context}{workflow_guidance}"""
|
| 1679 |
|
| 1680 |
#🧠 Store file path in session memory for follow-up requests
|
| 1681 |
if self.session and file_path:
|
|
|
|
| 2399 |
"result": tool_result
|
| 2400 |
})
|
| 2401 |
|
| 2402 |
+
# 🗂️ UPDATE WORKFLOW STATE (reduces need to send full history to LLM)
|
| 2403 |
+
self._update_workflow_state(tool_name, tool_result)
|
| 2404 |
+
|
| 2405 |
# ⚡ CRITICAL FIX: Add tool result back to messages so LLM sees it in next iteration!
|
| 2406 |
if self.provider == "groq":
|
| 2407 |
# For Groq, add tool message with the result
|
|
@@ -0,0 +1,114 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Local Schema Extraction (No LLM)
|
| 3 |
+
Fast, cheap extraction of dataset metadata without sending to LLM.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import polars as pl
|
| 7 |
+
from pathlib import Path
|
| 8 |
+
from typing import Dict, Any, Optional
|
| 9 |
+
|
| 10 |
+
|
| 11 |
+
def extract_schema_local(file_path: str, sample_rows: int = 5) -> Dict[str, Any]:
    """
    Extract dataset schema and basic stats locally, without any LLM calls.

    Args:
        file_path: Path to a .csv or .parquet file; any other extension is
            read as CSV via pandas as a best-effort fallback.
        sample_rows: Number of leading rows to include as a small sample.

    Returns:
        Dict with file metadata (size, shape), per-column info (dtype,
        missing counts/percentage, unique count, lightweight numeric stats),
        a small row sample, and columns grouped into numeric / categorical /
        datetime lists. On failure, returns {'error': ..., 'file_path': ...}.
    """
    try:
        # Read with Polars (faster than pandas)
        if file_path.endswith('.csv'):
            df = pl.read_csv(file_path)
        elif file_path.endswith('.parquet'):
            df = pl.read_parquet(file_path)
        else:
            # Fallback to pandas — assumes CSV-like content for unknown extensions
            import pandas as pd
            pdf = pd.read_csv(file_path)
            df = pl.from_pandas(pdf)

        # Basic metadata
        schema_info = {
            'file_path': file_path,
            'file_size_mb': round(Path(file_path).stat().st_size / (1024 * 1024), 2),
            'num_rows': df.shape[0],
            'num_columns': df.shape[1],
            'columns': {}
        }

        # Per-column metadata
        for col in df.columns:
            col_series = df[col]
            dtype_str = str(col_series.dtype)
            total = len(col_series)

            col_info = {
                'dtype': dtype_str,
                'missing_count': col_series.null_count(),
                # BUG FIX: guard against a zero-row frame (ZeroDivisionError).
                'missing_pct': round(col_series.null_count() / total * 100, 2) if total else 0.0,
                # Unique counting is skipped (None) for huge datasets.
                'unique_count': col_series.n_unique() if total < 100000 else None
            }

            # Lightweight numeric stats. BUG FIX: substring match covers all
            # integer/float widths (Int8..Int64, UInt*, Float32/64) and stays
            # consistent with the numeric_columns grouping below; the old
            # fixed list missed narrow types.
            if 'Int' in dtype_str or 'Float' in dtype_str:
                try:
                    col_info['min'] = float(col_series.min())
                    col_info['max'] = float(col_series.max())
                    col_info['mean'] = float(col_series.mean())
                except Exception:  # BUG FIX: was a bare except
                    pass  # all-null columns etc. — stats simply omitted

            schema_info['columns'][col] = col_info

        # Small sample for LLM context (only first few rows)
        schema_info['sample_rows'] = df.head(sample_rows).to_dicts()

        # Categorize columns
        schema_info['numeric_columns'] = [
            col for col, info in schema_info['columns'].items()
            if 'Int' in info['dtype'] or 'Float' in info['dtype']
        ]
        # BUG FIX: unique_count may be None for large datasets; the previous
        # `info.get('unique_count', 999999) < 50` compared None with an int
        # and raised TypeError (the key exists, so the default never applied).
        schema_info['categorical_columns'] = [
            col for col, info in schema_info['columns'].items()
            if info['dtype'] in ['Utf8', 'String']
            or (info.get('unique_count') is not None
                and info['unique_count'] < 50
                and col not in schema_info['numeric_columns'])
        ]
        schema_info['datetime_columns'] = [
            col for col, info in schema_info['columns'].items()
            if 'Date' in info['dtype'] or 'Time' in info['dtype']
        ]

        return schema_info

    except Exception as e:
        return {
            'error': f"Failed to extract schema: {str(e)}",
            'file_path': file_path
        }
|
| 91 |
+
|
| 92 |
+
|
| 93 |
+
def infer_task_type(target_column: str, schema_info: Dict[str, Any]) -> Optional[str]:
    """
    Infer the ML task type ('classification' or 'regression') from the
    target column's dtype and cardinality, without an LLM.

    Args:
        target_column: Name of the prediction target.
        schema_info: Output of extract_schema_local().

    Returns:
        'regression', 'classification', or None when the target is unknown
        or the cardinality is unavailable/ambiguous.
    """
    if not target_column or target_column not in schema_info.get('columns', {}):
        return None

    target_info = schema_info['columns'][target_column]
    # May be None: extract_schema_local skips unique counting on huge datasets.
    unique_count = target_info.get('unique_count')

    # Numeric targets: many unique values → regression, few → classification.
    if target_info['dtype'] in ['Int64', 'Float64', 'Int32', 'Float32']:
        if unique_count and unique_count > 20:
            return 'regression'
        elif unique_count and unique_count <= 10:
            return 'classification'

    # String/categorical dtype or known-low cardinality → classification.
    # BUG FIX: the old code did `target_info.get('unique_count', 0) <= 20`,
    # which raised TypeError when unique_count was stored as None (the key
    # exists, so the default never applied) — and that exception propagated
    # to the caller. Unknown cardinality now falls through to None.
    if target_info['dtype'] in ['Utf8', 'String'] or (
        unique_count is not None and unique_count <= 20
    ):
        return 'classification'

    return None
|
|
@@ -0,0 +1,154 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
Workflow State Management
|
| 3 |
+
Stores intermediate results and metadata between steps to minimize LLM context.
|
| 4 |
+
"""
|
| 5 |
+
|
| 6 |
+
import json
|
| 7 |
+
from typing import Dict, Any, List, Optional
|
| 8 |
+
from pathlib import Path
|
| 9 |
+
from datetime import datetime
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
class WorkflowState:
    """
    Structured state object that holds workflow context between steps.

    Replaces storing everything in the LLM conversation history: each
    pipeline stage (profiling, quality check, cleaning, feature
    engineering, modeling) writes a compact summary here, and prompts pull
    only the slice they need via get_context_for_step().
    """

    def __init__(self):
        # Local import keeps this fix self-contained within the class.
        from datetime import timezone

        self.dataset_info: Optional[Dict[str, Any]] = None
        self.profiling_summary: Optional[Dict[str, Any]] = None
        self.quality_issues: Optional[Dict[str, Any]] = None
        self.cleaning_results: Optional[Dict[str, Any]] = None
        self.feature_engineering: Optional[Dict[str, Any]] = None
        self.modeling_results: Optional[Dict[str, Any]] = None
        self.visualization_paths: List[str] = []
        self.current_file: Optional[str] = None
        self.target_column: Optional[str] = None
        self.task_type: Optional[str] = None  # 'classification', 'regression', etc.
        self.steps_completed: List[str] = []
        # BUG FIX: datetime.utcnow() returns a naive timestamp and is
        # deprecated since Python 3.12; record a timezone-aware UTC time.
        self.created_at = datetime.now(timezone.utc).isoformat()

    def update_dataset_info(self, info: Dict[str, Any]):
        """Store basic dataset metadata (schema, shape, etc.)."""
        self.dataset_info = info
        self.current_file = info.get('file_path')
        self.steps_completed.append('dataset_loaded')

    def update_profiling(self, summary: Dict[str, Any]):
        """Store profiling results summary."""
        self.profiling_summary = summary
        self.steps_completed.append('profiling_complete')

    def update_quality(self, issues: Dict[str, Any]):
        """Store data quality assessment."""
        self.quality_issues = issues
        self.steps_completed.append('quality_checked')

    def update_cleaning(self, results: Dict[str, Any]):
        """Store cleaning/preprocessing results; track the new working file."""
        self.cleaning_results = results
        if results.get('output_file'):
            self.current_file = results['output_file']
        self.steps_completed.append('data_cleaned')

    def update_features(self, results: Dict[str, Any]):
        """Store feature engineering results; track the new working file."""
        self.feature_engineering = results
        if results.get('output_file'):
            self.current_file = results['output_file']
        self.steps_completed.append('features_engineered')

    def update_modeling(self, results: Dict[str, Any]):
        """Store model training results."""
        self.modeling_results = results
        self.steps_completed.append('model_trained')

    def add_visualization(self, path: str):
        """Track a generated visualization artifact path."""
        self.visualization_paths.append(path)

    def get_context_for_step(self, step_name: str) -> Dict[str, Any]:
        """
        Get the minimal context needed for a specific step.

        Every step sees the shared keys (current file, target, task type,
        steps completed) plus only its relevant upstream results. This
        replaces sending the full conversation history to the LLM.
        """
        context = {
            'current_file': self.current_file,
            'target_column': self.target_column,
            'task_type': self.task_type,
            'steps_completed': self.steps_completed
        }

        # Step-specific context slicing
        if step_name == 'profiling':
            context['dataset_info'] = self.dataset_info

        elif step_name == 'quality_check':
            context['dataset_info'] = self.dataset_info
            context['profiling'] = self.profiling_summary

        elif step_name == 'cleaning':
            context['quality_issues'] = self.quality_issues
            context['profiling'] = self.profiling_summary

        elif step_name == 'feature_engineering':
            context['cleaning_results'] = self.cleaning_results
            context['dataset_info'] = self.dataset_info

        elif step_name == 'modeling':
            context['feature_engineering'] = self.feature_engineering
            context['cleaning_results'] = self.cleaning_results
            context['target_column'] = self.target_column
            context['task_type'] = self.task_type

        elif step_name == 'visualization':
            context['modeling_results'] = self.modeling_results
            context['dataset_info'] = self.dataset_info

        return context

    def to_dict(self) -> Dict[str, Any]:
        """Serialize the full state for storage/debugging."""
        return {
            'dataset_info': self.dataset_info,
            'profiling_summary': self.profiling_summary,
            'quality_issues': self.quality_issues,
            'cleaning_results': self.cleaning_results,
            'feature_engineering': self.feature_engineering,
            'modeling_results': self.modeling_results,
            'visualization_paths': self.visualization_paths,
            'current_file': self.current_file,
            'target_column': self.target_column,
            'task_type': self.task_type,
            'steps_completed': self.steps_completed,
            'created_at': self.created_at
        }

    def save_to_file(self, path: str):
        """Save state to a JSON file, creating parent directories as needed."""
        Path(path).parent.mkdir(parents=True, exist_ok=True)
        with open(path, 'w') as f:
            json.dump(self.to_dict(), f, indent=2)

    @classmethod
    def load_from_file(cls, path: str) -> 'WorkflowState':
        """Load state from a JSON file previously written by save_to_file()."""
        with open(path, 'r') as f:
            data = json.load(f)

        state = cls()
        state.dataset_info = data.get('dataset_info')
        state.profiling_summary = data.get('profiling_summary')
        state.quality_issues = data.get('quality_issues')
        state.cleaning_results = data.get('cleaning_results')
        state.feature_engineering = data.get('feature_engineering')
        state.modeling_results = data.get('modeling_results')
        state.visualization_paths = data.get('visualization_paths', [])
        state.current_file = data.get('current_file')
        state.target_column = data.get('target_column')
        state.task_type = data.get('task_type')
        state.steps_completed = data.get('steps_completed', [])
        # BUG FIX: don't clobber the fresh timestamp with None when the
        # saved payload predates the created_at field.
        state.created_at = data.get('created_at') or state.created_at

        return state
|