File size: 4,882 Bytes
f35ddc4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
Production-grade tool-result compression for models with small context windows.
Add this function to orchestrator.py, before the _parse_text_tool_calls method.
Its signature uses `Dict` and `Any`, so orchestrator.py must import both from `typing`.
"""

def _compress_tool_result(self, tool_name: str, result: Dict[str, Any]) -> Dict[str, Any]:
    """
    Compress a tool result into a small, decision-oriented summary.

    The compressed form keeps only:
    - Status (success/failure)
    - A handful of key metrics
    - File paths created
    - Next action hints

    Failed results (``result["success"]`` is falsy) are returned unchanged so
    that the full error information is preserved. A missing ``"success"`` key
    is treated as success.

    The function is defensive about the payload: a missing/``None``/non-dict
    ``result["result"]``, ``None``-valued metrics, or an unexpected container
    type never raise here; the affected field degrades to ``None``/``0``/``[]``
    instead.

    Args:
        tool_name: Name of the tool executed; selects the compression rule.
        result: Full tool result dict. The tool payload is read from
            ``result["result"]``.

    Returns:
        ``result`` itself for failures; otherwise a new dict with the keys
        ``"success"``, ``"tool"``, ``"summary"`` and ``"next_steps"``.
    """
    if not result.get("success", True):
        # Keep full error info (critical for debugging)
        return result

    def _rounded(value, digits):
        # Round numeric values; pass anything un-roundable (e.g. None) through
        # unchanged rather than raising.
        try:
            return round(value, digits)
        except TypeError:
            return value

    def _size(value):
        # len() for sized containers, 0 for None or other unsized values.
        try:
            return len(value)
        except TypeError:
            return 0

    def _score_key(model):
        # Sort key for picking the best model. A missing score counts as 0
        # (original behaviour); a non-numeric score (e.g. None) ranks last.
        try:
            return float(model.get("test_score", 0))
        except (TypeError, ValueError):
            return float("-inf")

    raw_payload = result.get("result", {})
    # Tool-specific rules only ever read keys, so a non-dict payload is
    # treated as "no data" for them. The generic rule uses raw_payload.
    r = raw_payload if isinstance(raw_payload, dict) else {}

    compressed = {
        "success": True,
        "tool": tool_name,
    }

    # Tool-specific compression rules
    if tool_name == "profile_dataset":
        columns = r.get("columns", {})
        if isinstance(columns, (dict, list, tuple)):
            # list(dict) yields its keys, so this covers both shapes.
            key_columns = list(columns)[:5]  # First 5 columns only
        else:
            key_columns = []
        compressed["summary"] = {
            "rows": r.get("num_rows"),
            "cols": r.get("num_columns"),
            "missing_pct": r.get("missing_percentage"),
            "numeric_cols": _size(r.get("numeric_columns", [])),
            "categorical_cols": _size(r.get("categorical_columns", [])),
            "file_size_mb": _rounded(r.get("memory_usage_mb", 0), 1),
            "key_columns": key_columns,
        }
        compressed["next_steps"] = ["clean_missing_values", "detect_data_quality_issues"]

    elif tool_name == "detect_data_quality_issues":
        compressed["summary"] = {
            "total_issues": r.get("total_issues", 0),
            "critical_issues": r.get("critical_issues", 0),
            "missing_data": r.get("has_missing"),
            "outliers": r.get("has_outliers"),
            "duplicates": r.get("has_duplicates"),
        }
        compressed["next_steps"] = ["clean_missing_values", "handle_outliers"]

    elif tool_name in ("clean_missing_values", "handle_outliers", "encode_categorical"):
        compressed["summary"] = {
            "output_file": r.get("output_file", r.get("output_path")),
            "rows_processed": r.get("rows_after", r.get("num_rows")),
            "changes_made": bool(r.get("changes", {}) or r.get("imputed_columns")),
        }
        compressed["next_steps"] = ["Use this file for next step"]

    elif tool_name == "train_baseline_models":
        models = r.get("models", [])
        if isinstance(models, dict):
            # Tolerate a {model_name: metrics} mapping by flattening it into
            # the list-of-dicts shape used below; an explicit "model" entry
            # inside the metrics wins over the mapping key.
            models = [
                {"model": name, **metrics}
                for name, metrics in models.items()
                if isinstance(metrics, dict)
            ]
        elif isinstance(models, (list, tuple)):
            # Skip malformed entries instead of failing on .get().
            models = [m for m in models if isinstance(m, dict)]
        else:
            models = []

        if models:
            best = max(models, key=_score_key)
            compressed["summary"] = {
                "best_model": best.get("model"),
                "test_score": _rounded(best.get("test_score", 0), 4),
                "train_score": _rounded(best.get("train_score", 0), 4),
                "task_type": r.get("task_type"),
                "models_trained": len(models),
            }
        else:
            # Always emit a summary so every success result has the same keys.
            compressed["summary"] = {
                "task_type": r.get("task_type"),
                "models_trained": 0,
            }
        compressed["next_steps"] = ["hyperparameter_tuning", "generate_combined_eda_report"]

    elif tool_name in (
        "generate_plotly_dashboard",
        "generate_ydata_profiling_report",
        "generate_combined_eda_report",
    ):
        compressed["summary"] = {
            "report_path": r.get("report_path", r.get("output_path")),
            "report_type": tool_name,
            "success": True,
        }
        compressed["next_steps"] = ["Report ready for viewing"]

    elif tool_name == "hyperparameter_tuning":
        compressed["summary"] = {
            "best_params": r.get("best_params", {}),
            "best_score": _rounded(r.get("best_score", 0), 4),
            "model_type": r.get("model_type"),
            "trials_completed": r.get("n_trials"),
        }
        compressed["next_steps"] = ["perform_cross_validation", "generate_model_performance_plots"]

    else:
        # Generic compression: keep only commonly useful fields
        if isinstance(raw_payload, dict):
            wanted = ("output_path", "output_file", "status", "message", "success")
            key_fields = {key: raw_payload[key] for key in wanted if key in raw_payload}
            compressed["summary"] = key_fields or {"result": "completed"}
        else:
            # Non-dict payloads are stringified and truncated to 200 chars.
            compressed["summary"] = {
                "result": str(raw_payload)[:200] if raw_payload else "completed"
            }
        compressed["next_steps"] = ["Continue workflow"]

    return compressed