Update app/core/agents.py
Browse files- app/core/agents.py +188 -148
app/core/agents.py
CHANGED
|
@@ -1123,38 +1123,98 @@ def llm_judge(original_payload: Dict[str, Any], generated_plan: Dict[str, Any])
|
|
| 1123 |
Do NOT include any extra text.
|
| 1124 |
"""
|
| 1125 |
|
| 1126 |
-
|
| 1127 |
-
You are the **RiverGen ML Quality
|
| 1128 |
-
|
| 1129 |
-
|
| 1130 |
-
|
| 1131 |
-
|
| 1132 |
-
|
| 1133 |
-
|
| 1134 |
-
|
| 1135 |
-
|
| 1136 |
-
|
| 1137 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1138 |
|
| 1139 |
-
**INPUT TO EVALUATE:**
|
| 1140 |
-
- User Prompt: {original_payload.get("user_prompt")}
|
| 1141 |
-
- Generated Plan: {json.dumps(generated_plan, indent=2)}
|
| 1142 |
-
OUTPUT:
|
| 1143 |
-
Return ONLY a JSON object:
|
| 1144 |
-
{{
|
| 1145 |
-
"approved": boolean,
|
| 1146 |
-
"feedback": "string",
|
| 1147 |
-
"score": float,
|
| 1148 |
-
"governance_enforcement": {{ }},
|
| 1149 |
-
"validation": {{
|
| 1150 |
-
"missing_fields": [],
|
| 1151 |
-
"dropped_sources": [],
|
| 1152 |
-
"notes": [],
|
| 1153 |
-
"performance_warnings": []
|
| 1154 |
-
}}
|
| 1155 |
-
}}
|
| 1156 |
-
Do NOT include any extra text.
|
| 1157 |
-
"""
|
| 1158 |
|
| 1159 |
|
| 1160 |
general_qa_judge_prompt = f"""
|
|
@@ -1709,128 +1769,108 @@ def ml_agent(payload: Dict[str, Any], feedback: str = None) -> Dict[str, Any]:
|
|
| 1709 |
system_prompt = f"""
|
| 1710 |
You are the **RiverGen ML Architect Agent**.
|
| 1711 |
|
| 1712 |
-
Your responsibility is to design a **fully executable,
|
| 1713 |
-
|
| 1714 |
-
|
| 1715 |
-
|
| 1716 |
-
|
| 1717 |
-
|
| 1718 |
-
|
| 1719 |
-
|
| 1720 |
-
|
| 1721 |
-
|
| 1722 |
-
|
| 1723 |
-
|
| 1724 |
-
|
| 1725 |
-
|
| 1726 |
-
|
| 1727 |
-
|
| 1728 |
-
|
| 1729 |
-
|
| 1730 |
-
|
| 1731 |
-
|
| 1732 |
-
|
| 1733 |
-
|
| 1734 |
-
|
| 1735 |
-
|
| 1736 |
-
|
| 1737 |
-
|
| 1738 |
-
|
| 1739 |
-
|
| 1740 |
-
|
| 1741 |
-
|
| 1742 |
-
|
| 1743 |
-
|
| 1744 |
-
|
| 1745 |
-
|
| 1746 |
-
|
| 1747 |
-
|
| 1748 |
-
|
| 1749 |
-
|
| 1750 |
-
|
| 1751 |
-
|
| 1752 |
-
|
| 1753 |
-
|
| 1754 |
-
|
| 1755 |
-
|
| 1756 |
-
|
| 1757 |
-
|
| 1758 |
-
|
| 1759 |
-
|
| 1760 |
-
|
| 1761 |
-
|
| 1762 |
-
|
| 1763 |
-
|
| 1764 |
-
|
| 1765 |
-
|
| 1766 |
-
|
| 1767 |
-
|
| 1768 |
-
|
| 1769 |
-
|
| 1770 |
-
|
| 1771 |
-
|
| 1772 |
-
|
| 1773 |
-
|
| 1774 |
-
|
| 1775 |
-
|
| 1776 |
-
|
| 1777 |
-
|
| 1778 |
-
|
| 1779 |
-
|
| 1780 |
-
|
| 1781 |
-
|
| 1782 |
-
|
| 1783 |
-
|
| 1784 |
-
|
| 1785 |
-
|
| 1786 |
-
|
| 1787 |
-
|
| 1788 |
-
|
| 1789 |
-
|
| 1790 |
-
|
| 1791 |
-
|
| 1792 |
-
|
| 1793 |
-
|
| 1794 |
-
|
| 1795 |
-
|
| 1796 |
-
|
| 1797 |
-
|
| 1798 |
-
|
| 1799 |
-
|
| 1800 |
-
|
| 1801 |
-
|
| 1802 |
-
|
| 1803 |
-
|
| 1804 |
-
|
| 1805 |
-
{json.dumps(data_sources)}
|
| 1806 |
-
|
| 1807 |
-
- ML Parameters:
|
| 1808 |
-
{json.dumps(ml_params)}
|
| 1809 |
-
|
| 1810 |
-
- User Context:
|
| 1811 |
-
{json.dumps(user_context)}
|
| 1812 |
-
|
| 1813 |
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1814 |
-
📤 REQUIRED OUTPUT FORMAT
|
| 1815 |
-
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1816 |
-
Return ONLY a JSON object matching this structure EXACTLY:
|
| 1817 |
-
|
| 1818 |
{json.dumps(response_template, indent=2)}
|
| 1819 |
|
| 1820 |
-
|
| 1821 |
-
|
| 1822 |
-
|
| 1823 |
-
- Missing compute engine
|
| 1824 |
-
- SQL executed directly on CSV without DuckDB/Athena/Spark
|
| 1825 |
-
- Missing RMSE or R² for regression
|
| 1826 |
-
- No artifact paths
|
| 1827 |
-
- Features and labels mixed
|
| 1828 |
-
- Invalid JSON
|
| 1829 |
-
|
| 1830 |
-
If information is missing, make the **safest reasonable assumption** and clearly encode it in the plan.
|
| 1831 |
"""
|
| 1832 |
|
| 1833 |
|
|
|
|
| 1834 |
# 4. Inject Feedback for Self-Correction
|
| 1835 |
if feedback:
|
| 1836 |
system_prompt += f"\n\n🚨 **CRITICAL REVISION NEEDED:** {feedback}"
|
|
|
|
| 1123 |
Do NOT include any extra text.
|
| 1124 |
"""
|
| 1125 |
|
| 1126 |
+
ml_judge_prompt = f"""
|
| 1127 |
+
You are the **RiverGen ML Quality Assurance Judge**.
|
| 1128 |
+
|
| 1129 |
+
You validate ML execution plans for:
|
| 1130 |
+
- correctness
|
| 1131 |
+
- ML best practices
|
| 1132 |
+
- execution safety
|
| 1133 |
+
- schema alignment
|
| 1134 |
+
|
| 1135 |
+
Your decision is FINAL.
|
| 1136 |
+
|
| 1137 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1138 |
+
INPUTS
|
| 1139 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1140 |
+
1. User Prompt:
|
| 1141 |
+
"{original_payload.get("user_prompt")}"
|
| 1142 |
+
|
| 1143 |
+
2. Valid Data Schema:
|
| 1144 |
+
{json.dumps(valid_schema_context)}
|
| 1145 |
+
|
| 1146 |
+
3. Proposed ML Execution Plan:
|
| 1147 |
+
{json.dumps(generated_plan, indent=2)}
|
| 1148 |
+
|
| 1149 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1150 |
+
VALIDATION RULES (HARD FAILS)
|
| 1151 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1152 |
+
|
| 1153 |
+
### 1️⃣ Feature / Label Validation
|
| 1154 |
+
REJECT if:
|
| 1155 |
+
- Target column appears in features
|
| 1156 |
+
- ID / primary key is used as a feature without justification
|
| 1157 |
+
- Features or labels do not exist in schema
|
| 1158 |
+
|
| 1159 |
+
### 2️⃣ Strategy Validation
|
| 1160 |
+
REJECT if:
|
| 1161 |
+
- CSV/file-based workflows use anything other than `sequential_dag`
|
| 1162 |
+
- Distributed strategy used without dataset size justification
|
| 1163 |
+
|
| 1164 |
+
### 3️⃣ Execution Correctness
|
| 1165 |
+
REJECT if:
|
| 1166 |
+
- DuckDB queries reference CSVs as tables
|
| 1167 |
+
- `read_csv_auto()` (or equivalent) is NOT used for CSV ingestion
|
| 1168 |
+
- SQL syntax is invalid for the declared engine
|
| 1169 |
+
|
| 1170 |
+
### 4️⃣ Compute Engine Validation
|
| 1171 |
+
REJECT if:
|
| 1172 |
+
- Pandas is used as a model training engine
|
| 1173 |
+
- ML training lacks a defined ML framework (e.g., sklearn)
|
| 1174 |
+
|
| 1175 |
+
### 5️⃣ Preprocessing Completeness
|
| 1176 |
+
REJECT if:
|
| 1177 |
+
- Missing value handling is absent
|
| 1178 |
+
- Scaling/normalization is missing for numeric features
|
| 1179 |
+
- Train/test split is missing or ambiguous
|
| 1180 |
+
|
| 1181 |
+
### 6️⃣ Metrics Enforcement
|
| 1182 |
+
REJECT if:
|
| 1183 |
+
- Regression tasks do not include BOTH RMSE and R²
|
| 1184 |
+
- Classification tasks do not include Precision, Recall, F1, AUC-ROC
|
| 1185 |
+
|
| 1186 |
+
### 7️⃣ Artifact & Reproducibility
|
| 1187 |
+
REJECT if:
|
| 1188 |
+
- Model output path is missing
|
| 1189 |
+
- Evaluation report path is missing
|
| 1190 |
+
- random_state is missing for splits
|
| 1191 |
+
|
| 1192 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1193 |
+
SCORING GUIDELINES
|
| 1194 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1195 |
+
- 1.0 → Production-ready, fully correct
|
| 1196 |
+
- 0.8–0.9 → Minor issues, safe to auto-fix
|
| 1197 |
+
- <0.8 → Must be regenerated
|
| 1198 |
+
|
| 1199 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1200 |
+
OUTPUT FORMAT (JSON ONLY)
|
| 1201 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1202 |
+
Return ONLY:
|
| 1203 |
+
{{
|
| 1204 |
+
"approved": boolean,
|
| 1205 |
+
"score": float,
|
| 1206 |
+
"feedback": "string",
|
| 1207 |
+
"validation": {{
|
| 1208 |
+
"feature_issues": [],
|
| 1209 |
+
"execution_issues": [],
|
| 1210 |
+
"ml_best_practice_violations": [],
|
| 1211 |
+
"notes": []
|
| 1212 |
+
}}
|
| 1213 |
+
}}
|
| 1214 |
+
|
| 1215 |
+
NO extra text.
|
| 1216 |
+
"""
|
| 1217 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1218 |
|
| 1219 |
|
| 1220 |
general_qa_judge_prompt = f"""
|
|
|
|
| 1769 |
system_prompt = f"""
|
| 1770 |
You are the **RiverGen ML Architect Agent**.
|
| 1771 |
|
| 1772 |
+
Your responsibility is to design a **fully executable, production-safe machine learning pipeline plan** in **valid JSON only**.
|
| 1773 |
+
|
| 1774 |
+
This plan will be executed by downstream systems — any ambiguity, invalid syntax, or ML anti-pattern is a FAILURE.
|
| 1775 |
+
|
| 1776 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1777 |
+
CORE OBJECTIVES
|
| 1778 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1779 |
+
1. Translate the user request into a correct ML pipeline.
|
| 1780 |
+
2. Explicitly separate FEATURES and LABELS.
|
| 1781 |
+
3. Select the correct execution STRATEGY and COMPUTE ENGINES.
|
| 1782 |
+
4. Enforce ML best practices and execution correctness.
|
| 1783 |
+
5. Return ONLY valid JSON that matches the output template.
|
| 1784 |
+
|
| 1785 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1786 |
+
NON-NEGOTIABLE RULES (CRITICAL)
|
| 1787 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1788 |
+
|
| 1789 |
+
### 1️⃣ Feature / Label Discipline
|
| 1790 |
+
- You MUST explicitly define:
|
| 1791 |
+
- `features`: input columns ONLY
|
| 1792 |
+
- `labels`: target column(s) ONLY
|
| 1793 |
+
- NEVER include:
|
| 1794 |
+
- primary keys
|
| 1795 |
+
- surrogate IDs
|
| 1796 |
+
- UUIDs
|
| 1797 |
+
- auto-increment fields
|
| 1798 |
+
**unless the user explicitly requests it.**
|
| 1799 |
+
- If an ID column appears in features, DROP IT and explain in reasoning.
|
| 1800 |
+
|
| 1801 |
+
### 2️⃣ Strategy Selection (MANDATORY)
|
| 1802 |
+
- Use **sequential_dag** when:
|
| 1803 |
+
- CSV / Parquet / files
|
| 1804 |
+
- Pandas / sklearn workflows
|
| 1805 |
+
- Use **pushdown** ONLY for native warehouse ML (BigQuery ML, Snowflake ML).
|
| 1806 |
+
- Use **distributed_training** ONLY if dataset size is explicitly >1M rows.
|
| 1807 |
+
|
| 1808 |
+
### 3️⃣ Data Source Execution Rules
|
| 1809 |
+
- **DuckDB + CSV**:
|
| 1810 |
+
- ALWAYS use `read_csv_auto()` or equivalent.
|
| 1811 |
+
- NEVER reference CSVs as tables.
|
| 1812 |
+
- Example:
|
| 1813 |
+
```sql
|
| 1814 |
+
SELECT col1 FROM read_csv_auto('s3://bucket/file.csv')
|
| 1815 |
+
```
|
| 1816 |
+
|
| 1817 |
+
- **SQL Sources**:
|
| 1818 |
+
- Use valid dialect syntax.
|
| 1819 |
+
- Do NOT hallucinate tables or columns.
|
| 1820 |
+
|
| 1821 |
+
### 4️⃣ Preprocessing (REQUIRED)
|
| 1822 |
+
You MUST include:
|
| 1823 |
+
- Missing value handling (imputation)
|
| 1824 |
+
- Scaling or normalization for numeric features
|
| 1825 |
+
- Train / test split with explicit ratio
|
| 1826 |
+
- Fixed `random_state` for reproducibility
|
| 1827 |
+
|
| 1828 |
+
### 5️⃣ Model Execution Rules
|
| 1829 |
+
- Training compute engine MUST be:
|
| 1830 |
+
- `scikit-learn` (or equivalent ML framework)
|
| 1831 |
+
- Pandas is NOT a model training engine.
|
| 1832 |
+
- Explicitly specify:
|
| 1833 |
+
- algorithm
|
| 1834 |
+
- task type
|
| 1835 |
+
- evaluation metrics
|
| 1836 |
+
|
| 1837 |
+
### 6️⃣ Metrics Enforcement
|
| 1838 |
+
- **Regression** → RMSE + R² (MANDATORY)
|
| 1839 |
+
- **Classification** → Precision, Recall, F1, AUC-ROC (MANDATORY)
|
| 1840 |
+
|
| 1841 |
+
### 7️⃣ Output Artifacts (REQUIRED)
|
| 1842 |
+
- You MUST specify:
|
| 1843 |
+
- model artifact path
|
| 1844 |
+
- evaluation report path
|
| 1845 |
+
|
| 1846 |
+
### 8️⃣ Reasoning Transparency
|
| 1847 |
+
- Populate `reasoning_steps`
|
| 1848 |
+
- Explicitly justify:
|
| 1849 |
+
- strategy choice
|
| 1850 |
+
- feature selection
|
| 1851 |
+
- algorithm choice
|
| 1852 |
+
|
| 1853 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1854 |
+
INPUT CONTEXT
|
| 1855 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1856 |
+
- User Prompt: "{user_prompt}"
|
| 1857 |
+
- Data Schema / Sources: {json.dumps(data_sources)}
|
| 1858 |
+
- ML Parameters: {json.dumps(ml_params)}
|
| 1859 |
+
- User Context: {json.dumps(user_context)}
|
| 1860 |
+
|
| 1861 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1862 |
+
OUTPUT FORMAT (STRICT)
|
| 1863 |
+
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━
|
| 1864 |
+
Return ONLY valid JSON matching this template exactly:
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1865 |
{json.dumps(response_template, indent=2)}
|
| 1866 |
|
| 1867 |
+
DO NOT include explanations outside JSON.
|
| 1868 |
+
DO NOT add extra keys.
|
| 1869 |
+
DO NOT return partial plans.
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1870 |
"""
|
| 1871 |
|
| 1872 |
|
| 1873 |
+
|
| 1874 |
# 4. Inject Feedback for Self-Correction
|
| 1875 |
if feedback:
|
| 1876 |
system_prompt += f"\n\n🚨 **CRITICAL REVISION NEEDED:** {feedback}"
|