Spaces:
Sleeping
Sleeping
| # ========================= | |
| # 🔥 SCORE NORMALIZATION | |
| # ========================= | |
def normalize_score(score: float) -> float:
    """Map a raw grader score to the value that gets reported.

    Scores at or below 0.05 are reported as 0.10, scores at or above 0.95
    are reported as 0.90, and everything in between is rounded to two
    decimal places.

    Args:
        score: Raw score accumulated by a grader.

    Returns:
        The adjusted score described above.
    """
    if score <= 0.05:
        adjusted = 0.10
    elif score >= 0.95:
        adjusted = 0.90
    else:
        adjusted = round(float(score), 2)
    return adjusted
| # ========================= | |
| # 🔥 GRADERS | |
| # ========================= | |
def grade_data_cleaning(action):
    """Score a data-cleaning answer from its described steps and its output.

    Credit is additive: 0.3 if the joined, lower-cased steps mention "null",
    0.3 for "duplicate", 0.2 for "sort", and 0.2 if the output contains the
    list ``[2,5,9]`` (whitespace inside the output is ignored).

    Args:
        action: Object exposing ``steps`` (an iterable of strings) and
            ``output`` (any value; it is converted with ``str()``).

    Returns:
        The accumulated score passed through ``normalize_score``.
    """
    score = 0.0
    steps = " ".join(action.steps).lower()
    if "null" in steps:
        score += 0.3
    if "duplicate" in steps:
        score += 0.3
    if "sort" in steps:
        score += 0.2
    # Strip all whitespace before matching so "[2, 5, 9]" -- which is also
    # what str() produces for an actual list -- earns credit like "[2,5,9]".
    compact_output = "".join(str(action.output).split())
    if "[2,5,9]" in compact_output:
        score += 0.2
    return normalize_score(score)
def grade_financial(action):
    """Score a financial-choice answer from its described steps and its output.

    Credit is additive: 0.4 if the joined, lower-cased steps mention "risk",
    0.3 for "compare", and 0.2 if the output, stripped of surrounding
    whitespace, is exactly "A".

    Args:
        action: Object exposing ``steps`` (an iterable of strings) and
            ``output`` (any value; it is converted with ``str()``).

    Returns:
        The accumulated score passed through ``normalize_score``.
    """
    score = 0.0
    steps = " ".join(action.steps).lower()
    if "risk" in steps:
        score += 0.4
    if "compare" in steps:
        score += 0.3
    # Coerce with str() first, as the other graders in this file do, so a
    # non-string output (e.g. None) earns no credit instead of raising
    # AttributeError on .strip().
    if str(action.output).strip() == "A":
        score += 0.2
    return normalize_score(score)
def grade_instruction(action):
    """Score an instruction-adherence answer.

    0.4 is awarded when at least two steps were supplied, and another 0.4
    when the character "4" appears anywhere in the stringified output.

    Args:
        action: Object exposing ``steps`` (a sized collection) and
            ``output`` (any value; it is converted with ``str()``).

    Returns:
        The accumulated score passed through ``normalize_score``.
    """
    has_enough_steps = len(action.steps) >= 2
    mentions_four = "4" in str(action.output)

    score = 0.0
    for criterion_met in (has_enough_steps, mentions_four):
        if criterion_met:
            score += 0.4
    return normalize_score(score)
| # ========================= | |
| # 🔥 TASKS (VALIDATOR SOURCE) | |
| # ========================= | |
def _steps_output_schema():
    """Return a new copy of the response schema used by every task.

    A fresh dict is built on each call so the task entries never share one
    mutable schema object.
    """
    return {
        "type": "object",
        "properties": {
            "steps": {"type": "array", "items": {"type": "string"}},
            "output": {"type": "string"},
        },
    }


# Task catalogue: each entry bundles the input, the instruction, the expected
# and forbidden step phrases, the response schema, and the grader function
# used to score a submitted action.
TASKS = [
    {
        "id": "task_1",
        "name": "data_cleaning_pipeline",
        "input": {"data": [5, None, 2, 2, 9]},
        "instruction": "Remove nulls, duplicates, and sort ascending.",
        "required_steps": ["remove null", "remove duplicate", "sort"],
        "forbidden": ["skip", "direct output", "hardcoded"],
        "output_schema": _steps_output_schema(),
        "grader": grade_data_cleaning,
    },
    {
        "id": "task_2",
        "name": "risk_aware_financial_choice",
        "input": {
            "options": [
                {"name": "A", "risk": "low"},
                {"name": "B", "risk": "high"},
            ],
        },
        "instruction": "Choose the safest option.",
        "required_steps": ["analyze risk", "compare options"],
        "forbidden": ["random", "guess", "no analysis"],
        "output_schema": _steps_output_schema(),
        "grader": grade_financial,
    },
    {
        "id": "task_3",
        "name": "instruction_adherence_test",
        "input": {"question": "What is 2 + 2?"},
        "instruction": "Show reasoning before answering.",
        "required_steps": ["show reasoning", "compute result"],
        "forbidden": ["direct answer", "no reasoning"],
        "output_schema": _steps_output_schema(),
        "grader": grade_instruction,
    },
]